def langcode_to_tmxcode(langcode: str, default: str = "en-US") -> str:
    """Convert a langcode str to a TMX code (langid-LOCALE).

    FIX: the default used to be "en_US" (underscore), which is inconsistent
    with the hyphenated TMX codes this function returns and with the fallback
    announced in the warning message ("en-US").

    >>> langcode_to_tmxcode("zh")
    'zh-CN'
    >>> langcode_to_tmxcode("zh-CHS")
    'zh-CN'
    >>> langcode_to_tmxcode("zh-CHT")
    'zh-TW'
    >>> langcode_to_tmxcode("en")
    'en-US'
    >>> langcode_to_tmxcode("en-uk")
    'en-GB'
    >>> langcode_to_tmxcode("de")
    'de-DE'
    >>> langcode_to_tmxcode("en-ca")
    'en-CA'
    >>> langcode_to_tmxcode("pt")
    'pt-PT'
    """
    # Special-case tags that closest_match would not map the way we want.
    if langcode.lower() == "zh-cht":
        langcode = "zh-tw"
    if langcode.lower() == "pt":
        langcode = "pt-pt"
    lc_ = standardize_tag(langcode)
    try:
        tmxcode = closest_match(lc_, TMX_CODES)[0]
    except Exception as exc:  # best-effort lookup: fall back to the default
        logger.warning(" exc: %s, returning en-US", exc)
        tmxcode = default
    return tmxcode
def get_country_data(self):
    """Populate self.countries_data and self.country_codes from the country list.

    Builds one entry per country (keyed by lower-cased name) and registers
    both the alpha-2 and alpha-3 codes in self.country_codes.
    """
    for country in self.get_all_countries():
        name = country["name"].lower()
        entry = {
            "timezones": country["timezones"],
            "demonym": country["demonym"],
            "currencies": country["currencies"],
            "alpha2Code": country["alpha2Code"],
            "alpha3Code": country["alpha3Code"],
            "area": str(country["area"]),
            "languages": [
                langcodes.LanguageData(language=code).language_name()
                for code in country["languages"]
            ],
            "lang_codes": [
                langcodes.standardize_tag(code)
                for code in country["languages"]
            ],
            "capital": country["capital"],
            "borders": country["borders"],
            "nativeName": country["nativeName"],
            "population": str(country["population"]),
            "region": country["region"],
            "subregion": country["subregion"],
        }
        # Latitude/longitude only when the source provides a non-empty pair.
        if len(country["latlng"]):
            entry["lat"], entry["long"] = country["latlng"]
        self.countries_data[name] = entry
        # Reverse lookups: ISO code -> country name.
        self.country_codes[country["alpha2Code"]] = name
        self.country_codes[country["alpha3Code"]] = name
def language(layout: LayoutDetails) -> str:
    """Return the BCP 47 tag for *layout*.

    See https://tools.ietf.org/html/bcp47. Raises ValueError when the
    layout declares no ISO 639 language.
    """
    if not layout.iso639:
        raise ValueError(f"no language for {layout.xkb_name()}")
    parts = [layout.iso639[0]]
    if layout.iso3166:
        # Append the region subtag when a country is known.
        parts.append(layout.iso3166[0])
    return langcodes.standardize_tag("-".join(parts))
def _deserialize(self, value: str, attr, data, **kwargs):
    """Normalize *value* to its canonical BCP 47 form.

    Overlong tags are replaced by their shortest version (macro=True) and
    formatted according to BCP 47 conventions.
    TODO: does not validate that the input is a well-formed language tag.
    """
    try:
        normalized = standardize_tag(value, macro=True)
    except ValueError as err:
        raise ValidationError(f'{err}')
    return normalized
def transform(self, data):
    """Copy each dc:subject into a 'keyword_<lang>' field on the output XML.

    The language comes from the item's xml:lang attribute, truncated to its
    primary subtag and normalized via BCP 47.
    """
    raw, xml = data
    root = xml.find('.')
    for item in raw.findall(".//dc:subject", namespaces=ns):
        lang = item.get('{http://www.w3.org/XML/1998/namespace}lang')
        field = ET.Element('field')
        field.set('name', 'keyword_{}'.format(standardize_tag(lang[0:2])))
        field.text = item.text
        root.append(field)
    return data
def transform(self, data):
    """Add one 'la' field per dc:language, with the tag normalized to BCP 47."""
    raw, xml = data
    target = xml.find('.')
    for lang in raw.findall(".//dc:language", namespaces=ns):
        field = ET.Element('field')
        field.set('name', 'la')
        field.text = standardize_tag(lang.text)
        target.append(field)
    return data
def transform(self, data):
    """For every http(s) dc:identifier, emit a 'fulltext_html_<lang>' field
    per dc:language found in the record."""
    raw, xml = data
    root = xml.find('.')
    for url in raw.findall(".//dc:identifier", namespaces=ns):
        # Only identifiers that are URLs point at full text.
        if not url.text.startswith('http'):
            continue
        for lang in raw.findall(".//dc:language", namespaces=ns):
            field = ET.Element('field')
            field.text = url.text
            field.set('name', 'fulltext_html_%s' % standardize_tag(lang.text))
            root.append(field)
    return data
def __init__(self, width: int, height: int, storage: Storage = None,
             style: str = 'newsworthy', language: str = 'en-GB'):
    """
    :param width: width in pixels
    :param height: height in pixels
    :param storage: storage object that will handle file saving. Defaults
                    to a LocalStorage() instance, which will save a file
                    in the working dir.
    :param style: a predefined style or the path to a custom style file
    :param language: a BCP 47 language tag (eg `en`, `sv-FI`)
    """
    # FIX: `storage` used to default to LocalStorage(), a mutable default
    # argument evaluated once at definition time and shared by every
    # instance. Create a fresh instance per call instead.
    if storage is None:
        storage = LocalStorage()

    # Fail fast: every Datawrapper API call needs the token.
    try:
        self.api_token = os.environ["DATAWRAPPER_API_KEY"]
    except KeyError:
        raise Exception("DATAWRAPPER_API_KEY must be set in environment")

    # P U B L I C   P R O P E R T I E S
    # The user can alter these at any time

    # Unlike regular Chart objects Datawrapper does not use the DataList
    # class for storing data, as DataList does not handle non-numerical
    # data. DatawrapperCharts will understand the same list-of-list-of-list
    # structure as DataList builds upon, but prefer consuming list of
    # dictionaries
    self.data = []
    self.labels = []  # Optionally one label for each dataset
    self.annotations = []  # Manually added annotations
    self.caption = None
    self.highlight = None
    self.decimals = None
    self.dw_data = {}  # The DW data structure that defines the chart
    self._dw_id = None  # Datawrapper chart id

    # P R I V A T E   P R O P E R T I E S
    # Properties managed through getters/setters
    self._title = None
    self._units = "count"

    # Calculated properties
    self._storage = storage
    self._w, self._h = int(width), int(height)
    self._style = loadstyle(style)
    # Standardize and check if language tag is a valid BCP 47 tag
    self._language = standardize_tag(language)
    # Babel locales use underscores ("en_US") rather than BCP 47 hyphens
    self._locale = Locale.parse(self._language.replace("-", "_"))

    # For renaming regions to DW conventions
    self._translations = None
def transform(self, data):
    """Copy each dc:description into an 'ab_<lang>' field.

    The xml:lang attribute is reduced to its primary subtag (text before
    the first hyphen) and normalized via BCP 47.
    """
    raw, xml = data
    root = xml.find('.')
    for item in raw.findall(".//dc:description", namespaces=ns):
        lang = item.get('{http://www.w3.org/XML/1998/namespace}lang')
        if "-" in lang:
            lang, _, _ = lang.partition("-")
        field = ET.Element('field')
        field.text = item.text
        field.set('name', 'ab_{}'.format(standardize_tag(lang)))
        root.append(field)
    return data
def append_input_component(details: LayoutDetails):
    """Append a Chrome OS input-component entry for *details* to the manifest."""
    component: dict[str, object] = {
        'name': f"XKB's {details.xkb_name()} -- {details.description}",
        'id': "all-xkb-layouts-" + details.xkb_name(),
    }
    if details.iso639:
        # Multiple language tags are allowed per component.
        component['language'] = [
            langcodes.standardize_tag(code) for code in details.iso639
        ]
    else:
        # A placeholder is required: without any language tag, switching to
        # the layout crashes Chrome OS.
        component['language'] = ['??']
    component['layouts'] = [details.xkb_name()]  # list of one
    manifest['input_components'].append(component)
def process_recording(recording):
    """Map a raw recording dict onto the downstream output schema.

    The language tag is normalized to BCP 47 conventions.
    """
    return {
        "dateAdded": recording["updated_at"],
        "videos": [{
            "url": recording["recording_url"],
            "quality": "HD",
            "videoType": "MP4",
        }],
        "duration": recording["length"],
        "language": langcodes.standardize_tag(recording["language"]),
    }
def transform(self, data):
    """Emit one 'available_languages' field per distinct primary language
    found among the dc:description xml:lang attributes."""
    raw, xml = data
    root = xml.find('.')
    seen = set()
    for item in raw.findall(".//dc:description", namespaces=ns):
        lang = item.get('{http://www.w3.org/XML/1998/namespace}lang')
        if "-" in lang:
            lang = lang.split("-")[0]
        seen.add(standardize_tag(lang))
    for language in seen:
        field = ET.Element('field')
        field.set('name', 'available_languages')
        field.text = language
        root.append(field)
    return data
def parse_nquads_line(line):
    """
    Parse a line in N-Triples or N-Quads format, returning four dictionaries:
    (subj, pred, obj, graph).

    Each of the dictionaries contains fields that may or may not be present,
    indicating their parsed content:

    - 'url': a complete URL indicating a resource. (Pedants: It's an IRI, but
      it's also a URL.)
    - 'text': a string value.
    - 'lang': the language code associated with the given 'text'.
    - 'type': a URL pointing to something in the 'xsd:' namespace, indicating
      for how to interpret the given 'text' as a value.
    - 'blank': the arbitrary ID of a blank node.
    """
    group_names = ('url', 'text', 'lang', 'type', 'blank', 'comment')
    items = []
    for match in NQUADS_ITEM_RE.finditer(line):
        item = {name: match.group(name) for name in group_names
                if match.group(name) is not None}
        # Comments are not part of the statement; drop them entirely.
        if 'comment' in item:
            continue
        # Decode each field into its usable form.
        if 'url' in item:
            item['url'] = decode_url(item['url'])
        if 'lang' in item:
            item['lang'] = langcodes.standardize_tag(item['lang'])
        if 'type' in item:
            item['type'] = decode_url(item['type'])
        if 'text' in item:
            item['text'] = decode_escapes(item['text'])
        if item:
            items.append(item)

    if len(items) == 3:
        # A triple has no graph component; pad so callers always get four.
        items.append({})
    # The line is either empty aside from comments, or contains a quad
    assert len(items) == 0 or len(items) == 4, line
    return items
def parse_nquads_line(line):
    """
    Parse a line in N-Triples or N-Quads format, returning four dictionaries:
    (subj, pred, obj, graph).

    Each of the dictionaries contains fields that may or may not be present,
    indicating their parsed content:

    - 'url': a complete URL indicating a resource. (Pedants: It's an IRI, but
      it's also a URL.)
    - 'text': a string value.
    - 'lang': the language code associated with the given 'text'.
    - 'type': a URL pointing to something in the 'xsd:' namespace, indicating
      for how to interpret the given 'text' as a value.
    - 'blank': the arbitrary ID of a blank node.
    """
    items = []
    for match in NQUADS_ITEM_RE.finditer(line):
        item = {}
        # Collect only the named regex groups that matched this item.
        for group in ['url', 'text', 'lang', 'type', 'blank', 'comment']:
            matched = match.group(group)
            if matched is not None:
                item[group] = matched
        # Comments are not part of the statement; skip them entirely.
        if 'comment' in item:
            continue
        # Decode each matched field into its usable form.
        if 'url' in item:
            item['url'] = decode_url(item['url'])
        if 'lang' in item:
            # Normalize the language tag to BCP 47 conventions.
            item['lang'] = langcodes.standardize_tag(item['lang'])
        if 'type' in item:
            item['type'] = decode_url(item['type'])
        if 'text' in item:
            item['text'] = decode_escapes(item['text'])
        if item:
            items.append(item)
    if len(items) == 3:
        # A triple has no graph component; pad so callers always get four.
        items.append({})
    # The line is either empty aside from comments, or contains a quad
    assert len(items) == 0 or len(items) == 4, line
    return items
def validate_aovp_args(args):  # pylint: disable=too-many-branches, too-many-return-statements, too-many-statements
    """
    Check that the commandline arguments passed to autosub are valid
    for audio or video processing.
    """
    # Basic numeric sanity checks first.
    if args.sleep_seconds < 0 or args.lines_per_trans < 0:
        raise exceptions.AutosubException(
            _("Error: \"-slp\"/\"--sleep-seconds\" arg is illegal."))

    if args.speech_language:  # pylint: disable=too-many-nested-blocks
        # --- Speech-to-text language validation ---
        if not args.gspeechv2:
            args.speech_language = args.speech_language.lower()
            if args.speech_language \
                    not in constants.SPEECH_TO_TEXT_LANGUAGE_CODES:
                print(
                    _("Warning: Speech language \"{src}\" not recommended. "
                      "Run with \"-lsc\"/\"--list-speech-codes\" "
                      "to see all supported languages.").format(
                          src=args.speech_language))
                # With best-match 's' enabled, try to substitute the closest
                # supported speech code instead of just warning.
                if args.best_match and 's' in args.best_match:
                    best_result = lang_code_utils.match_print(
                        dsr_lang=args.speech_language,
                        match_list=list(
                            constants.SPEECH_TO_TEXT_LANGUAGE_CODES.keys()),
                        min_score=args.min_score)
                    if best_result:
                        print(
                            _("Use langcodes-py2 to standardize the result."))
                        args.speech_language = langcodes.standardize_tag(
                            best_result[0])
                        print(
                            _("Use \"{lang_code}\" instead.").format(
                                lang_code=args.speech_language))
                    else:
                        print(
                            _("Match failed. Still using \"{lang_code}\".").
                            format(lang_code=args.speech_language))

        if args.min_confidence < 0.0 or args.min_confidence > 1.0:
            raise exceptions.AutosubException(
                _("Error: The arg of \"-mnc\"/\"--min-confidence\" isn't legal."
                  ))

        if args.dst_language is None:
            # No translation requested: recognition only.
            print(
                _("Destination language not provided. "
                  "Only performing speech recognition."))
        else:
            # --- Translation language validation ---
            if not args.src_language:
                # Fall back to the speech language as translation source and
                # force best-match for it.
                print(
                    _("Source language not provided. "
                      "Use Speech language instead."))
                args.src_language = args.speech_language
                if not args.best_match:
                    args.best_match = {'src'}
                elif 'src' not in args.best_match:
                    args.best_match.add('src')

            # Case-insensitive match of src/dst against the googletrans
            # language table, canonicalizing to the table's casing.
            is_src_matched = False
            is_dst_matched = False
            for key in googletrans.constants.LANGUAGES:
                if args.src_language.lower() == key.lower():
                    args.src_language = key
                    is_src_matched = True
                if args.dst_language.lower() == key.lower():
                    args.dst_language = key
                    is_dst_matched = True

            if not is_src_matched:
                if not args.gtransv2:
                    if args.best_match and 'src' in args.best_match:
                        print(
                            _("Warning: Source language \"{src}\" not supported. "
                              "Run with \"-lsc\"/\"--list-translation-codes\" "
                              "to see all supported languages.").format(
                                  src=args.src_language))
                        best_result = lang_code_utils.match_print(
                            dsr_lang=args.src_language,
                            match_list=list(
                                googletrans.constants.LANGUAGES.keys()),
                            min_score=args.min_score)
                        if best_result:
                            print(
                                _("Use \"{lang_code}\" instead.").format(
                                    lang_code=best_result[0]))
                            args.src_language = best_result[0]
                        else:
                            raise exceptions.AutosubException(
                                _("Match failed. Still using \"{lang_code}\". "
                                  "Program stopped.").format(
                                      lang_code=args.src_language))
                    else:
                        raise exceptions.AutosubException(
                            _("Error: Source language \"{src}\" not supported. "
                              "Run with \"-lsc\"/\"--list-translation-codes\" "
                              "to see all supported languages. "
                              "Or use \"-bm\"/\"--best-match\" to get a best match."
                              ).format(src=args.src_language))

            if not is_dst_matched:
                if not args.gtransv2:
                    if args.best_match and 'd' in args.best_match:
                        print(
                            _("Warning: Destination language \"{dst}\" not supported. "
                              "Run with \"-lsc\"/\"--list-translation-codes\" "
                              "to see all supported languages.").format(
                                  dst=args.dst_language))
                        best_result = lang_code_utils.match_print(
                            dsr_lang=args.dst_language,
                            match_list=list(
                                googletrans.constants.LANGUAGES.keys()),
                            min_score=args.min_score)
                        if best_result:
                            print(
                                _("Use \"{lang_code}\" instead.").format(
                                    lang_code=best_result[0]))
                            args.dst_language = best_result[0]
                        else:
                            raise exceptions.AutosubException(
                                _("Match failed. Still using \"{lang_code}\". "
                                  "Program stopped.").format(
                                      lang_code=args.dst_language))
                    else:
                        raise exceptions.AutosubException(
                            _("Error: Destination language \"{dst}\" not supported. "
                              "Run with \"-lsc\"/\"--list-translation-codes\" "
                              "to see all supported languages. "
                              "Or use \"-bm\"/\"--best-match\" to get a best match."
                              ).format(dst=args.dst_language))

            # Translating a language into itself is pointless: disable
            # translation and keep recognition only.
            if args.dst_language == args.speech_language \
                    or args.src_language == args.dst_language:
                print(
                    _("Speech language is the same as the Destination language. "
                      "Only performing speech recognition."))
                args.dst_language = None
                args.src_language = None
    else:
        # No speech language: either reuse externally supplied regions or
        # fall back to region detection only.
        if args.ext_regions:
            if not args.keep:
                raise exceptions.AutosubException(
                    _("You've already input times. "
                      "No works done."))
        else:
            print(
                _("Speech language not provided. "
                  "Only performing speech regions detection."))

    if args.styles == ' ':
        # when args.styles is used but without option
        # its value is ' '
        if not args.ext_regions:
            raise exceptions.AutosubException(
                _("Error: External speech regions file not provided."))
        else:
            args.styles = args.ext_regions
def _get_language_code(self, heading):
    """Return the standardized language tag parsed from *heading*, or None
    when JA_LANGUAGE_RE does not match."""
    match = JA_LANGUAGE_RE.match(heading)
    return langcodes.standardize_tag(match.group(1)) if match else None
def __init__(self, width: int, height: int, storage: Storage = None,
             style: str = 'newsworthy', language: str = 'en-GB'):
    """
    :param width: width in pixels
    :param height: height in pixels
    :param storage: storage object that will handle file saving. Defaults
                    to a LocalStorage() instance, which will save a file
                    in the working dir.
    :param style: a predefined style or the path to a custom style file
    :param language: a BCP 47 language tag (eg `en`, `sv-FI`)
    """
    # FIX: `storage` used to default to LocalStorage(), a mutable default
    # argument evaluated once at definition time and shared by every
    # instance. Create a fresh instance per call instead.
    if storage is None:
        storage = LocalStorage()

    # P U B L I C   P R O P E R T I E S
    # The user can alter these at any time
    self.data = DataList()  # A list of datasets
    self.annotate_trend = True  # Print out values at points on trendline?
    self.trendline = []  # List of x positions, or data points
    self.labels = []  # Optionally one label for each dataset
    self.annotations = []  # Manually added annotations
    self.interval = None  # yearly|quarterly|monthly|weekly|daily
    # We will try to guess interval based on the data,
    # but explicitly providing a value is safer. Used for finetuning.
    self.show_ticks = True  # toggle category names, dates, etc
    self.subtitle = None
    self.note = None
    self.xlabel = None
    self.ylabel = None
    self.caption = None
    self.highlight = None
    # number of decimals to show in annotations, value ticks, etc
    # None means automatically chose the best number
    self.decimals = None
    # Path to image that will be embedded in the caption area
    # Can also be set though a style property
    self.logo = None
    # Custom coloring function
    self.color_fn = None

    # P R I V A T E   P R O P E R T I E S
    # Properties managed through getters/setters
    self._title = None
    self._units = "count"

    # Calculated properties
    self._annotations = []  # Automatically added annotations
    self._storage = storage
    self._w, self._h = int(width), int(height)
    self._style = loadstyle(style)
    # Standardize and check if language tag is a valid BCP 47 tag
    self._language = standardize_tag(language)
    # Babel locales use underscores ("en_US") rather than BCP 47 hyphens
    self._locale = Locale.parse(self._language.replace("-", "_"))

    # Dynamic typography
    self._title_font = FontProperties()
    self._title_font.set_family(self._style["title_font"])
    self._title_font.set_size(self._style["figure.titlesize"])
    self._title_font.set_weight(self._style["figure.titleweight"])

    self._fig = Figure()
    FigureCanvas(self._fig)
    self.ax = self._fig.add_subplot(111)
    # self._fig, self.ax = plt.subplots()
    self.value_axis = self.ax.yaxis
    self.category_axis = self.ax.xaxis

    # Calculate size in inches
    self._set_size(width, height)

    # Chart elements. Made available for fitting.
    self._title_elem = None
    self._subtitle_elem = None
    self._note_elem = None
    self._caption_elem = None
    self._logo_elem = None
def CQS_match_query_phrase(self, phrase):
    """Analyze phrase to see if it is a play-able phrase with this skill.

    Needs to be implemented by the skill.

    Arguments:
        phrase (str): User phrase, "What is an aardwark"

    Returns:
        (match, CQSMatchLevel[, callback_data]) or None: Tuple containing
            a string with the appropriate matching phrase, the PlayMatch
            type, and optionally data to return in the callback if the
            match is selected.
    """
    response = None
    # Run the padatious-style intent parser over the raw phrase.
    match = self.intents.calc_intent(phrase)
    level = CQSMatchLevel.CATEGORY
    data = match.matches
    intent = match.name
    score = match.conf
    data["intent"] = intent
    data["score"] = score
    # Map the parser confidence onto CommonQuery match levels; below 0.3
    # the intent is discarded entirely.
    if score > 0.8:
        level = CQSMatchLevel.EXACT
    elif score > 0.5:
        level = CQSMatchLevel.CATEGORY
    elif score > 0.3:
        level = CQSMatchLevel.GENERAL
    else:
        intent = None
    if intent:
        # Validate extracted entities
        country = data.get("country")
        region = data.get("region")
        language = data.get("language")
        if country:
            data["query"] = country
            # ensure we really have a country name
            # Fallback response if no better answer is produced below.
            response = self.dialog_renderer.render("bad_country", {})
            # NOTE: `match`/`score` are reused here, clobbering the intent
            # match object — safe because its fields were copied above.
            match, score = match_one(country.lower(),
                                     list(self.countries_data.keys()))
            self.log.debug("Country fuzzy match: {n}, Score: {s}".format(
                n=match, s=score))
            if score > 0.5:
                country = match
                data.update(self.countries_data[country])
            else:
                # Fuzzy match too weak: try a direct search instead.
                countries = self.search_country(country)
                if not len(countries) > 0:
                    level = CQSMatchLevel.GENERAL
                else:
                    country = countries[0]["name"]
                    data.update(countries[0])
                    # TODO disambiguation
                    if len(countries) > 1:
                        data["disambiguation"] = countries[1:]
                        self.log.debug("multiple matches found: " +
                                       str([c["name"] for c in countries]))
            data["country"] = country  # normalized from match
        if language:
            data["query"] = language
            # ensure we really have a language name
            words = language.split(" ")
            # remove words commonly caught by mistake in padatious
            clean_up = ["is"]
            language = " ".join(
                [word for word in words if word not in clean_up])
            lang_code = langcodes.find_name(
                'language', language,
                langcodes.standardize_tag(self.lang))
            lang_code = str(lang_code)
            self.log.debug("Detected lang code: " + lang_code)
            # NOTE(review): after str(), lang_code looks always truthy, so
            # this guard may never trigger — verify find_name's failure mode.
            if not lang_code:
                return None
            data["lang_code"] = lang_code
            # TODO
            countries = self.search_country_by_language(lang_code)
            data["country_list"] = countries
        if region:
            data["query"] = region
            # ensure we really have a region name
            response = self.dialog_renderer.render("bad_region")
            countries = None
            # Try region first, then subregion; keep whichever scores higher.
            match, score = match_one(region, self.regions)
            data["region_score"] = score
            if score > 0.5:
                region = match
                countries = self.search_country_by_region(region)
            match, score2 = match_one(region, self.subregions)
            data["subregion_score"] = score2
            if score2 > score:
                region = match
                countries = self.search_country_by_subregion(region)
            # Only let the region score drive the level when no country
            # entity was matched.
            if score > 0.8 and not country:
                level = CQSMatchLevel.EXACT
            elif score > 0.5 and not country:
                level = CQSMatchLevel.CATEGORY
            elif score > 0.3 and not country:
                level = CQSMatchLevel.GENERAL
            data["region"] = region
            self.log.debug("Detected region: " + region)
            data["country_list"] = countries
        # Get response from intents
        response = self.intent2answer(intent, data) or response
        if response:
            return (phrase, level, response, data)
    return None
def parse_track(self, item):
    """Parse one track declaration of the form: filename (kind language) "label".

    Builds an options dict with 'kind', 'language', 'label' and 'src' keys,
    guessing defaults for anything missing, and returns a track node. Any
    parse problem sets `error` and is reported (with the guessed values)
    via the docutils reporter rather than raised.
    """
    options = {}
    error = False
    original = item  # preserve for error messages
    # Normalize line breaks so the regexes below see a single line.
    item = item.replace('\r', ' ').replace('\n', ' ')
    try:
        # Split out the parenthesized "(kind language)" group; re.split with
        # capturing groups also yields the separator matches.
        head, _emptyStr, lang_kind, _emptyStr, tail = re.split(
            r"(^| )\((.*?)\)( |$)", item)
        lang_kind = lang_kind.split()  # split input into a list of words
        kinds = set(lang_kind) & set(('captions', 'descriptions', 'chapters',
                                      'metadata', 'subtitles'))
        # Find kind: exactly one kind word is allowed; extras flag an error.
        for kind in kinds:
            if 'kind' not in options:
                options['kind'] = kind
            else:
                error = True
                continue
            lang_kind.remove(kind)
        # Find language: remaining words must resolve to one language.
        for lang in lang_kind:
            if 'language' not in options:
                if langcodes.code_to_names(
                        'language',
                        langcodes.get(
                            langcodes.standardize_tag(lang)).language):
                    options['language'] = langcodes.standardize_tag(lang)
                else:
                    # lang is not a lang code. Try interpreting as a language name
                    try:
                        options['language'] = str(langcodes.find(lang))
                    except:
                        error = True
                        continue
            else:
                error = True
                continue
        # Drop the parsed "(...)" group from the remaining text.
        item = head + ' ' + tail
    except:
        # No parenthesized group found (or malformed): fall through to defaults.
        error = True
    if 'kind' not in options:
        options['kind'] = 'subtitles'
    if 'language' not in options:
        # Default to the system locale's language; 'en' as last resort.
        try:
            options['language'] = langcodes.standardize_tag(getlocale()[0])
        except:
            options['language'] = 'en'
    # find label
    try:
        head, _emptyStr, _quote, label, _emptyStr, tail = re.split(
            r"""(^| )(["'])(.*?)\2( |$)""", item)
        # Text both before and after the quoted label is ambiguous input.
        if head and tail:
            error = True
        item = head + tail
        options['label'] = label.strip()
    except:
        # No quoted label: auto-generate one like "Subtitles in English".
        try:
            options['label'] = options['kind'].capitalize(
            ) + ' in ' + langcodes.get(
                options['language']).autonym().capitalize()
        except:
            error = True
            options['label'] = None
    # get filename
    options['src'] = self.uri_check(item)
    # return error
    if error:
        self.state_machine.reporter.error(
            'Error in "%s" directive: \n Problems encountered parsing track "%s" \n\n'
            'Guessing the following values: \n'
            'filename: "%s" \n'
            'kind: "%s" \n'
            'language: "%s" \n'
            'label: "%s" \n\n'
            'Track kinds should be chosen from one of the following: \n'
            'captions, descriptions, chapters, metadata, subtitles \n'
            'Track languages should be given as BCP 47 compliant language codes. \n'
            'Track declarations should take the following form: \n'
            'filename (kind language_code) "label"\n'
            'Tracks must have one filename and one language_code. \n'
            'If a kind is not specified, "subtitles" will be assumed. \n'
            'If a label is not provided, it will be auto-generated from the kind and language specified.'
            % (self.name, original, options['src'], options['kind'],
               options['language'], options['label']),
            nodes.literal_block(self.block_text, self.block_text),
            line=self.lineno)
    track_node = track(self.block_text, **options)
    return track_node
def is_valid_lang_code(lang_code):
    """Validate *lang_code*.

    Returns True when the tag parses; otherwise returns a human-readable
    error message. Note the message is a truthy str, so callers must
    compare against True rather than rely on truthiness.
    """
    try:
        standardize_tag(lang_code)
        return True
    except LanguageTagError:
        return '{} is not a valid language code.'.format(lang_code)
def _get_language_code(self, heading):
    """Extract a language code from a section heading.

    Matches *heading* against JA_LANGUAGE_RE; on success the first capture
    group is normalized with langcodes.standardize_tag and returned.
    Returns None when the heading does not match.
    """
    match = JA_LANGUAGE_RE.match(heading)
    if match:
        return langcodes.standardize_tag(match.group(1))
    else:
        return None