def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Load the raw cBpack data for `lang`, returned as a list of lists
    (see `read_cBpack` for what this represents).

    Language codes are matched leniently via the `langcodes` module:
    'pt-BR', 'pt_br', 'PT_BR', or the alternate code 'por' all resolve
    to the same 'pt' (Portuguese) list.
    """
    lists = available_languages(wordlist)
    matched, match_score = langcodes.best_match(
        lang, list(lists), min_score=match_cutoff
    )
    # A score of zero means no language came close enough to the request.
    if match_score == 0:
        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))
    if matched != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, matched, langcodes.get(matched).language_name('en'))
        )
    return read_cBpack(lists[matched])
def get_frequency_list(lang, wordlist='best', match_cutoff=None):
    """
    Load the raw cBpack data for `lang` as a list of lists (see
    `read_cBpack` for the format).

    Language codes are matched leniently via `langcodes`: asking for
    'pt-BR', 'pt_br', 'PT_BR', or the alternate code 'por' all yields
    the 'pt' (Portuguese) list.

    `match_cutoff` no longer has any effect and is deprecated.
    """
    if match_cutoff is not None:
        warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
    lists = available_languages(wordlist)
    # TODO: tighten max_distance. It is this large only so that a test where
    # 'yue' matches 'zh' passes; the distance between those may be high
    # precisely because they shouldn't match.
    matched, _distance = langcodes.closest_match(lang, list(lists), max_distance=70)
    # closest_match returns 'und' (undetermined) when nothing is close enough.
    if matched == 'und':
        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))
    if matched != lang:
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)."
            % (lang, matched, langcodes.get(matched).language_name('en')))
    return read_cBpack(lists[matched])
def get_frequency_list(lang, wordlist='best', match_cutoff=30):
    """
    Read the raw data from a wordlist file and return it as a list of lists
    (the cBpack representation; see `read_cBpack`).

    Thanks to the `langcodes` module, slight variations of a language code
    ('pt-BR', 'pt_br', 'PT_BR', or the alternate code 'por') all resolve to
    the same 'pt' (Portuguese) list.
    """
    codes_to_files = available_languages(wordlist)
    best, score = langcodes.best_match(
        lang, list(codes_to_files), min_score=match_cutoff)
    if score == 0:
        # Nothing matched at all for this wordlist.
        raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))
    if best != lang:
        english_name = langcodes.get(best).language_name('en')
        logger.warning(
            "You asked for word frequencies in language %r. Using the "
            "nearest match, which is %r (%s)." % (lang, best, english_name))
    return read_cBpack(codes_to_files[best])
def valid_language(code):
    """Return True if `code` looks like a language we can handle."""
    # Reject empty values, the "undetermined" tag, and proto-languages.
    if not code or code == 'und' or '-pro' in code:
        return False
    # A bare alpha-3 code is accepted as-is.
    if ALPHA3_RE.match(code):
        return True
    try:
        lcode = langcodes.get(code)
    except LanguageTagError:
        return False
    return lcode.language is not None and len(lcode.language) <= 3
def get_population_data():
    """
    Parse CLDR's supplementalData.xml into per-language population estimates.

    Returns a pair of defaultdict(int)s keyed by string language tags:
    (speaking population, writing population).
    """
    import langcodes
    filename = data_filename("supplementalData.xml")
    # NOTE(review): the file handle is never closed explicitly; relies on GC.
    root = ET.fromstring(open(filename).read())
    territories = root.findall("./territoryInfo/territory")
    language_population = defaultdict(int)
    language_writing_population = defaultdict(int)
    for territory in territories:
        t_code = territory.attrib['type']
        t_population = float(territory.attrib['population'])
        # CLDR percentages are 0-100; convert to 0-1 proportions.
        t_literacy_rate = float(territory.attrib['literacyPercent']) / 100
        for language in territory:
            attrs = language.attrib
            # CLDR uses underscores in tags; langcodes expects hyphens.
            l_code = attrs['type'].replace('_', '-')
            l_proportion = float(attrs.get('populationPercent', 0)) / 100
            # Prefer language-specific writing data, then language-specific
            # literacy, then the territory-wide literacy rate as a fallback.
            if 'writingPercent' in attrs:
                writing_prop = float(attrs['writingPercent']) / 100
            elif 'literacyPercent' in attrs:
                writing_prop = float(attrs['literacyPercent']) / 100
            else:
                writing_prop = t_literacy_rate
            l_population = t_population * l_proportion
            l_writing = t_population * l_proportion * writing_prop
            # Distinguish data in different territories, and also in different
            # scripts when necessary, while also accumulating more general data.
            # We need to use maximize() on the bare language code, not just
            # assume_script(), because assumed defaults like 'zh-Hans' are unwritten
            # in the data. We need this if we want to count the relative use of
            # Simplified vs. Traditional Chinese, for example.
            written_ls = (
                langcodes.get(l_code).maximize()._filter_attributes(['language', 'script'])
            )
            written_lst = written_ls.update_dict({'territory': t_code})
            spoken_lt = written_lst._filter_attributes(['language', 'territory'])
            spoken_l = written_lst._filter_attributes(['language'])
            written_lt = written_lst._filter_attributes(['language', 'territory'])
            written_l = written_lst._filter_attributes(['language'])
            # Use sets so tags that collapse to the same value are counted once.
            for lang in set([spoken_lt, spoken_l]):
                language_population[str(lang)] += int(round(l_population))
            for lang in set([written_lst, written_lt, written_ls, written_l]):
                language_writing_population[str(lang)] += int(round(l_writing))
    return language_population, language_writing_population
def update_names(names_fwd, names_rev, name_quads):
    """
    Fold a batch of (name_language, referent, name, priority) quads into the
    forward and reverse name indexes, updating both dicts in place.
    """
    for name_language, referent, name, priority in name_quads:
        # Bucket on the bare language, not the territory or script.
        short_language = langcodes.get(name_language).language
        normalized = normalize_name(name)
        # Every name goes into the catch-all 'und' bucket as well as the
        # bucket for its own language.
        for bucket_key in ('und', short_language):
            bucket = names_rev.setdefault(bucket_key, {})
            bucket.setdefault(normalized, []).append(
                (name_language, referent, priority))
        # The forward index keeps only the first name seen for each
        # referent/language pair.
        fwd_key = '{}@{}'.format(referent.lower(), name_language)
        names_fwd.setdefault(fwd_key, name)
def get_language_name(code):
    """
    Get the English name of a language ConceptNet supports.
    """
    if code == 'mul':
        return 'Multilingual'
    elif code == 'roa-opt':
        # langcodes just has no way to produce a name for an extlang code that
        # Wiktionary came up with, but it's in a lot of etymological data, so
        # we need to describe it correctly
        return 'Old Portuguese'
    else:
        if code in LCODE_ALIASES:
            # Bug fix: this used to read LCODE_ALIASES['code'] -- the literal
            # string 'code' -- so aliases were never actually applied (it
            # raised KeyError, or substituted whatever the 'code' entry was).
            code = LCODE_ALIASES[code]
        return langcodes.get(code, normalize=False).display_name()
def valid_language(code):
    """
    Check whether a language code refers to a language we could represent in
    ConceptNet.
    """
    # Rejects None, empty strings, the "undetermined" tag, and proto-languages.
    if code is None or not code or code == 'und' or '-pro' in code:
        return False
    # Bare alpha-3 codes are accepted without further parsing.
    if ALPHA3_RE.match(code):
        return True
    try:
        parsed = langcodes.get(code)
    except LanguageTagError:
        return False
    return parsed.language is not None and len(parsed.language) <= 3
def test_alpha2_to_alpha3():
    """
    Test that each valid alpha2 code has a corresponding, unique alpha3 code.
    """
    seen = set()
    # These four 2-letter codes exist only as aliases, and don't have their
    # own unique 3-letter codes. All other 2-letter codes should uniquely
    # map to 3-letter codes.
    alias_only = {'in', 'iw', 'ji', 'jw'}
    for letter1 in string.ascii_lowercase:
        for letter2 in string.ascii_lowercase:
            code = letter1 + letter2
            language = langcodes.get(code, normalize=False)
            if not language.is_valid():
                continue
            alpha3 = language.to_alpha3()
            if code not in alias_only:
                assert alpha3 not in seen
            seen.add(alpha3)
def language(self, value):
    """
    Set the language from `value`, which may be a language name (resolved
    with `langcodes.find`) or a language tag (resolved with `langcodes.get`).
    Falls back to the empty string, with a warning, when nothing matches.
    """
    invalid = False
    try:
        # First try interpreting the value as a language *name*...
        self._language = langcodes.find(value)
    except LookupError:
        # ...then fall back to interpreting it as a language tag.
        try:
            self._language = langcodes.get(value)
        except Exception:
            # Fix: was a bare `except:`, which would also swallow
            # KeyboardInterrupt and SystemExit. Narrowed to Exception.
            invalid = True
    except Exception:
        # Fix: likewise narrowed from a bare `except:`.
        invalid = True
    if invalid:
        self._language = ''
        if value:
            self._logger.warning('Unknown language {}'.format(value))
def cld2_detect_language(text):
    """
    Uses CLD2 to detect the language.
    """
    # pycld2.detect returns:
    #   (is result confident: bool,
    #    number of bytes of text: int,
    #    triples of detected languages in order of certainty:
    #      (language name: str, language code: str,
    #       percent of text in this language: float,
    #       confidence score: float))
    cleaned = CLD2_BAD_CHARS_RE.sub('', text)
    detected = pycld2.detect(cleaned)[2][0][1]
    # Normalize the language code: 'iw' becomes 'he', and 'zh-Hant'
    # becomes 'zh'.
    return langcodes.get(detected).language
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet URI.

    We do this by finding the part of the URL that names the object, and
    using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.

    Returns None for URLs that don't look like per-language DBPedia hosts.
    """
    parsed = parse_url(url)
    domain = parsed.netloc
    if domain == "dbpedia.org":
        # Handle old DBPedia URLs that had no language code
        lang = "en"
    else:
        domain_parts = domain.split(".", 1)
        # Robustness fix: a netloc with no dot produced a one-element list
        # here, so domain_parts[1] raised IndexError. Now it returns None.
        if len(domain_parts) == 2 and domain_parts[1] == "dbpedia.org":
            lang = domain_parts[0]
            # If we can't name this language in English, it's probably
            # not really a language.
            if langcodes.get(lang).language_name("en") == lang:
                return None
        else:
            return None
    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet URI.

    The part of the URL that names the object is used as surface text for
    ConceptNet. This is, in some ways, abusing a naming convention in the
    Semantic Web: the URL of an object doesn't have to mean anything at all,
    and the human-readable name is supposed to come from the "name" relation.

    But the "name" relation is not unique in either direction -- a URL can
    have many names, the same name can refer to many URLs, and some names
    are parsing glitches. The URL itself is the stable thing we can build a
    ConceptNet URI from.
    """
    netloc = parse_url(url).netloc
    if netloc == 'dbpedia.org':
        # Old DBPedia URLs carried no language code; they were English.
        lang = 'en'
    else:
        subdomain_and_rest = netloc.split('.', 1)
        if subdomain_and_rest[1] != 'dbpedia.org':
            return None
        lang = subdomain_and_rest[0]
        if langcodes.get(lang).language_name('en') == lang:
            # If we can't name this language in English, it's probably
            # not really a language.
            return None
    # Some Semantic Web URLs are camel-cased; ConceptNet URIs use
    # underscores between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
def reduce_concept(concept):
    """
    Strip a concept URI down to something that can be matched against
    surface text: drop the part of speech and disambiguation (if present),
    and reduce the language tag to a bare language.

    Simplifying the language mainly removes region tags from Chinese
    assertions, so they count as assertions about Chinese regardless of
    whether it's Traditional or Simplified Chinese; where they overlap,
    this makes the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if langtag != '[':
        bare_language = langcodes.get(langtag).language
        if bare_language:
            parts[1] = bare_language
    # Keep only the scheme, language, and term.
    return join_uri(*parts[:3])
import string
import langcodes

# Iterate through all 2- and 3-letter language codes. For every language
# with enough data to represent its own name, print:
#
# - The original code
# - The code after normalization
# - The language's name in English
# - The language's name in that language (its autonym)
alphabet = string.ascii_lowercase
for first in alphabet:
    for second in alphabet:
        # The third position may be empty (2-letter codes) or any letter.
        for third in ('',) + tuple(alphabet):
            code = first + second + third
            lcode = langcodes.get(code)
            autonym = lcode.autonym()
            name = lcode.language_name()
            if autonym != lcode.language:
                print('%-3s %-3s %-30s %s' % (code, lcode.language, name, autonym))
# Build the Dict records for every (language, code) row from the CSV reader
# defined earlier in this script.
for row in reader:
    # Only the last comma-separated code for a language is kept.
    d = Dict(lang=row['language'].strip(), code=row['code'].split(',')[-1].strip())
    Langs.append(d)

# for l in Langs:
#     lcode = langcodes.get(l.code)
#     print(l.code, lcode.display_name(), lcode.autonym())
#
#Langs = ["en_GB", "en_US", "fr_CA", "fr_FR", "de_DE", "ru_RU", "es_MX", "es_ES"]
#Langs.sort()

# Emit a JSON-ish list of { "key": ..., "name": ... } entries, where the name
# is wrapped in an app.tr() translation call.
ltxt = '[\n'
for L in Langs:
    lng = L.code
    lcode = langcodes.get(lng)
    autonym = lcode.autonym()
    name = lcode.display_name()
    #ui = '"%1 / {auto}".arg(app.tr("{name}"))'.format(name=name, auto=autonym)
    ui = 'app.tr("{name}")'.format(name=name, auto=autonym)
    # fill all translations
    check_translations(lcode, name)
    # When langcodes can't do better than echoing the code back as the
    # autonym, we know too little about the language to list it.
    if autonym == lng:
        print("Skipping since we don't know much about it: " + lng)
        continue
    ltxt += ' { "key": "%s", "name": %s },\n' % (lng, ui)
    print(lng, '/', autonym, '/', name, '/', L.lang)
def _process(self, resource, text, ratio=True):
    """
    Post-process the HTML of `resource`: fix up <img> tags (sizes, srcset,
    SVG/PDF/video substitution, lazy loading) and wrap standalone images in
    a responsive figure structure. Returns the modified PyQuery document.
    """
    d = pq(text, parser='html')
    for img in d.items('img'):
        width = img.attr.width
        height = img.attr.height
        src = img.attr.src
        src = urllib.parse.unquote(src)
        if src is None:
            self.logger.warn("[%s] has an img tag without src attribute" % resource)
            continue
        # Fill in missing dimensions by probing the image file.
        if width is None or height is None:
            wh = self._size(resource, src, width, height)
            if wh is not None:
                width, height = wh
            else:
                width, height = None, None
        if width is not None:
            width, height = int(width), int(height)
        # Adapt width/height if this is a scaled image (a "@Nx" suffix in the
        # filename, e.g. photo@2x.jpg).
        mo = re.match(r'.*@(\d+)x\.[^.]*$', src)
        if mo and width is not None:
            factor = int(mo.group(1))
            width //= factor
            height //= factor
            # Generate the lower-density variants and build a srcset.
            srcset = ['{} {}x'.format(src, factor)]
            for f in reversed(range(1, factor)):
                tname = src.replace('@{}x.'.format(factor), '@{}x.'.format(f))
                self._resize(src, os.path.basename(tname), float(f) / factor)
                srcset.append('{} {}x'.format(tname, f))
            # The 1x variant becomes the plain src; it is dropped from srcset.
            srcset = srcset[:-1]
            img.attr.src = tname
            img.attr.srcset = ','.join(srcset)
        # Put new width/height
        if width is not None:
            img.attr.width = '{}'.format(width)
            img.attr.height = '{}'.format(height)
        # If image is a SVG in /obj/, turn it into an <object>
        if "/obj/" in src and src.endswith(".svg"):
            img[0].tag = 'object'
            img.attr("type", "image/svg+xml")
            img.attr("data", src)
            img.text('🖼 {}'.format(img.attr.alt or ""))
            del img.attr.src
            del img.attr.alt
        # PDF files become embedded <object> viewers
        elif src.endswith('.pdf'):
            img[0].tag = 'object'
            img.attr("type", "application/pdf")
            options = "&".join([
                f"{k}={v}" for k, v in dict(
                    toolbar=0,
                    navpanes=0,
                    scrollbar=0,
                    view="Fit",
                    # pdf.js in Firefox
                    zoom="page-fit",
                    pagemode="none").items()
            ])
            img.attr("data", f"{src}#{options}")
            # Plain link fallback for browsers without a PDF plugin.
            fallback = pq('<a />')
            fallback.attr("href", src)
            fallback.text(img.attr.alt or "PDF")
            img.append(fallback)
            del img.attr.src
            del img.attr.alt
        # On-demand videos (should be in /videos): HLS + progressive sources
        elif src.endswith('.m3u8'):
            id = os.path.splitext(os.path.basename(src))[0]
            img[0].tag = 'video'
            img[0].set("controls", None)
            img.attr("preload", "none")
            img.attr("crossorigin", "anonymous")
            img.attr(
                "poster",
                self.site.media_url('images/posters/{}.jpg'.format(id)))
            del img.attr.src
            del img.attr.alt
            # Add sources: HLS playlist first, then a progressive MP4 fallback
            m3u8 = pq('<source>')
            m3u8.attr.src = self.site.media_url('videos/{}.m3u8'.format(id))
            m3u8.attr.type = 'application/vnd.apple.mpegurl'
            img.append(m3u8)
            progressive = pq('<source>')
            progressive.attr.src = self.site.media_url(
                'videos/{}/progressive.mp4'.format(id))
            progressive.attr.type = 'video/mp4; codecs="mp4a.40.2,avc1.4d401f"'
            img.append(progressive)
            # Add subtitle tracks if any (VTT files named "<id>.<lang>.vtt")
            vtts = [
                v for v in self.site.content.node_from_relative_path(
                    "media/videos").walk_resources()
                if v.name.endswith('.vtt') and v.name.startswith('{}.'.format(id))
            ]
            for vtt in vtts:
                # Language code is between the video id and the ".vtt" suffix.
                code = vtt.name[len(id) + 1:-4]
                track = pq('<track>')
                track.attr.src = self.site.media_url(vtt.relative_path[6:])
                track.attr.kind = 'subtitles'
                track.attr.srclang = code
                # The track matching the page language is on by default.
                if resource.meta.language == code:
                    track[0].set("default", None)
                if '-' not in code:
                    # Bare language tag: label with its autonym.
                    track.attr.label = langcodes.get(code).autonym()
                else:
                    # Tag with extra subtags: label as "Language (details)".
                    details = langcodes.get(code).describe(code)
                    lang = details['language']
                    del details['language']
                    track.attr.label = u"{} ({})".format(
                        lang, u", ".join(details.values()))
                img.append(track)
        # If image is a video not in /videos, turn into a simple
        # video tag like an animated GIF.
        elif src.endswith(".mp4") or src.endswith(".ogv"):
            img[0].tag = 'video'
            for attr in 'muted loop autoplay playsinline controls'.split():
                img[0].set(attr, None)
            del img.attr.alt
        # Lazy load images far enough down the article body
        if img[0].tag == "img" and width:
            lftext = img.parents('#lf-text')
            if lftext:
                parents = img.parents()
                rootEl = pq(parents[parents.index(lftext[0]) + 1])
                # Only lazy-load when at least a few blocks precede the image.
                if len(rootEl.prev_all()) > 3:
                    img.attr.loading = "lazy"
                    img.attr.decoding = "async"
        # If image is contained in a paragraph, enclose into a
        # responsive structure.
        parent = None
        parents = [p.tag for p in img.parents()]
        if parents[-1] == 'p':
            parent = img.parent()
        elif parents[-2:] == ['p', 'a']:
            parent = img.parent().parent()
        if parent and parent.contents().length == 1:
            img.addClass('lf-media')
            inner = pq('<span />')
            outer = pq('<div />')
            inner.addClass('lf-media-inner')
            outer.addClass('lf-media-outer')
            if width is not None and ratio:
                # Padding-bottom trick preserves the aspect ratio while
                # the image loads.
                inner.css.padding_bottom = '{:.3f}%'.format(
                    float(height) * 100. / width)
                outer.css.width = '{}px'.format(width)
            outer.append(inner)
            # Check opacity; opaque images get the LQIP placeholder background
            if src in self.cache:
                opaque = self.cache[src]['opaque']
                if opaque:
                    img.addClass('lf-opaque')
                try:
                    bg = "url({})".format(self.cache[src]["lqip"])
                    img.css("background-image", bg)
                except KeyError:
                    # No low-quality placeholder available for this image.
                    pass
            # If we have a title, also enclose in a figure with a caption
            figure = pq('<figure />')
            if img.attr.title:
                figcaption = pq('<figcaption />')
                figcaption.html(img.attr.title)
                del img.attr.title
                figure.append(outer)
                figure.append(figcaption)
            else:
                figure.append(outer)
            # Put image (or its wrapping link) in the inner tag
            if img.parent()[0].tag == 'a':
                inner.append(img.parent())
            else:
                inner.append(img)
            # Replace parent with our enclosure
            parent.replace_with(
                lxml.html.tostring(figure[0], encoding='unicode'))
    return d
def get_sorted_languages():
    """Return (code, autonym) pairs for every supported language code."""
    pairs = []
    for code in SUPPORTED_LANGUAGE_CODES:
        pairs.append((code, langcodes.get(code).autonym()))
    return pairs
def parse_track(self, item):
    """
    Parse one track declaration of the form:

        filename (kind language_code) "label"

    into an options dict with 'kind', 'language', 'label', and 'src', and
    return a track node. Missing pieces are guessed (kind defaults to
    'subtitles', language to the locale, label is auto-generated); any
    parse problem is reported through the state machine but still yields
    a best-effort node.
    """
    options = {}
    error = False
    original = item  # preserve for error messages
    # Normalize newlines so the regexes below only deal with spaces.
    item = item.replace('\r', ' ').replace('\n', ' ')
    try:
        # Split out the parenthesized "(kind language)" group.
        head, _emptyStr, lang_kind, _emptyStr, tail = re.split(
            r"(^| )\((.*?)\)( |$)", item)
        lang_kind = lang_kind.split()  # split input into a list of words
        kinds = set(lang_kind) & set(('captions', 'descriptions', 'chapters',
                                      'metadata', 'subtitles'))
        # Find kind: exactly one of the recognized kind keywords is allowed.
        for kind in kinds:
            if 'kind' not in options:
                options['kind'] = kind
            else:
                error = True
                continue
            lang_kind.remove(kind)
        # Find language: exactly one remaining word should be a language.
        for lang in lang_kind:
            if 'language' not in options:
                if langcodes.code_to_names(
                        'language',
                        langcodes.get(
                            langcodes.standardize_tag(lang)).language):
                    options['language'] = langcodes.standardize_tag(lang)
                else:
                    # lang is not a lang code. Try interpreting as a language name
                    try:
                        options['language'] = str(langcodes.find(lang))
                    except:
                        # NOTE(review): bare except keeps parsing best-effort,
                        # but also hides unexpected failures.
                        error = True
                        continue
            else:
                error = True
                continue
        # Remove the parsed group from the remaining text.
        item = head + ' ' + tail
    except:
        # No parenthesized group found (or it was malformed).
        error = True
    if 'kind' not in options:
        options['kind'] = 'subtitles'
    if 'language' not in options:
        try:
            # Fall back to the system locale's language.
            options['language'] = langcodes.standardize_tag(getlocale()[0])
        except:
            options['language'] = 'en'
    # find label: a quoted string anywhere in the remaining text.
    try:
        head, _emptyStr, _quote, label, _emptyStr, tail = re.split(
            r"""(^| )(["'])(.*?)\2( |$)""", item)
        if head and tail:
            # Text both before and after the label is ambiguous.
            error = True
        item = head + tail
        options['label'] = label.strip()
    except:
        # No quoted label; auto-generate one like "Subtitles in English".
        try:
            options['label'] = options['kind'].capitalize(
            ) + ' in ' + langcodes.get(
                options['language']).autonym().capitalize()
        except:
            error = True
            options['label'] = None
    # get filename: whatever text is left over.
    options['src'] = self.uri_check(item)
    # return error
    if error:
        self.state_machine.reporter.error(
            'Error in "%s" directive: \n Problems encountered parsing track "%s" \n\n'
            'Guessing the following values: \n'
            'filename: "%s" \n'
            'kind: "%s" \n'
            'language: "%s" \n'
            'label: "%s" \n\n'
            'Track kinds should be chosen from one of the following: \n'
            'captions, descriptions, chapters, metadata, subtitles \n'
            'Track languages should be given as BCP 47 compliant language codes. \n'
            'Track declarations should take the following form: \n'
            'filename (kind language_code) "label"\n'
            'Tracks must have one filename and one language_code. \n'
            'If a kind is not specified, "subtitles" will be assumed. \n'
            'If a label is not provided, it will be auto-generated from the kind and language specified.'
            % (self.name, original, options['src'], options['kind'],
               options['language'], options['label']),
            nodes.literal_block(self.block_text, self.block_text),
            line=self.lineno)
    track_node = track(self.block_text, **options)
    return track_node
def convert_lang_code(code):
    """
    Map a language code to the canonical one that ConceptNet 5 uses, using
    the 'langcodes' library.
    """
    canonical = langcodes.get(code)
    return str(canonical)
def tokenize(text, lang, include_punctuation=False, external_wordlist=False): """ Tokenize this text in a way that's relatively simple but appropriate for the language. Strings that are looked up in wordfreq will be run through this function first, so that they can be expected to match the data. The text will be run through a number of pre-processing steps that vary by language; see the docstring of `wordfreq.preprocess.preprocess_text`. If `include_punctuation` is True, punctuation will be included as separate tokens. Otherwise, punctuation will be omitted in the output. CJK scripts ----------- In the CJK languages, word boundaries can't usually be identified by a regular expression. Instead, there needs to be some language-specific handling. In Chinese, we use the Jieba tokenizer, with a custom word list to match the words whose frequencies we can look up. In Japanese and Korean, we use the MeCab tokenizer. The `external_wordlist` option only affects Chinese tokenization. If it's True, then wordfreq will not use its own Chinese wordlist for tokenization. Instead, it will use the large wordlist packaged with the Jieba tokenizer, and it will leave Traditional Chinese characters as is. This will probably give more accurate tokenization, but the resulting tokens won't necessarily have word frequencies that can be looked up. If you end up seeing tokens that are entire phrases or sentences glued together, that probably means you passed in CJK text with the wrong language code. 
""" # Use globals to load CJK tokenizers on demand, so that we can still run # in environments that lack the CJK dependencies global _mecab_tokenize, _jieba_tokenize language = langcodes.get(lang) info = get_language_info(language) text = preprocess_text(text, language) if info['tokenizer'] == 'mecab': from wordfreq.mecab import mecab_tokenize as _mecab_tokenize # Get just the language code out of the Language object, so we can # use it to select a MeCab dictionary tokens = _mecab_tokenize(text, language.language) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] elif info['tokenizer'] == 'jieba': from wordfreq.chinese import jieba_tokenize as _jieba_tokenize tokens = _jieba_tokenize(text, external_wordlist=external_wordlist) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] else: # This is the default case where we use the regex tokenizer. First # let's complain a bit if we ended up here because we don't have an # appropriate tokenizer. if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES: logger.warning( "The language '{}' is in the '{}' script, which we don't " "have a tokenizer for. The results will be bad." .format(lang, info['script']) ) _WARNED_LANGUAGES.add(lang) tokens = simple_tokenize(text, include_punctuation=include_punctuation) return tokens
def upload(self, file, lang):
    """
    Call the youtube API and push the file to youtube

    :param file: file to upload
    :param lang: language of the file
    :return: the YouTube video id of the uploaded file
    :raises YouTubeException: when video creation or upload fails
    """
    # todo split up event creation and upload
    # todo change function name
    # todo add the license properly
    title = self._build_title(lang)
    if self.t.subtitle:
        subtitle = self.t.subtitle
    else:
        subtitle = ''
    if self.t.abstract:
        abstract = self.strip_tags(self.t.abstract)
    else:
        abstract = ''
    if self.t.description:
        description = self.strip_tags(self.t.description)
    else:
        description = ''
    if self.t.url:
        # Protocol-relative URLs are forced to https.
        if self.t.url.startswith('//'):
            url = 'https:' + self.t.url
        else:
            url = self.t.url
    else:
        url = ''
    # Hashtags built from acronym and track, with spaces stripped.
    topline = [
        "#" + x.replace(' ', '') for x in [self.t.acronym, self.t.track] if x
    ]
    # Add a per-language hashtag for translated versions.
    if self.t.acronym and lang and lang != self.t.languages[0]:
        topline.append(("#" + self.t.acronym + lang).replace(' ', ''))
    description = '\n\n'.join([
        subtitle, abstract, description, ' '.join(self.t.people), url,
        ' '.join(topline)
    ])
    description = self.strip_tags(description)
    if self.t.voctoweb_url:
        description = os.path.join(self.t.voctoweb_url,
                                   self.t.slug) + '\n\n' + description
    if self.t.youtube_privacy:
        privacy = self.t.youtube_privacy
    else:
        privacy = 'private'
    license = self.t.get_raw_property('Meta.License')
    if license and 'https://creativecommons.org/licenses/by' in license:
        license = 'creativeCommon'
    else:
        license = 'youtube'
    metadata = {
        'snippet': {
            'title': title,
            # YouTube does not allow <> in description -> escape them
            # NOTE(review): these replace() calls look like extraction-garbled
            # HTML entities (probably '&lt;' / '&gt;'); as written they are
            # no-ops -- verify against the project's history.
            'description': description.replace('<', '<').replace('>', '>'),
            'channelId': self.channelId,
            'tags': self._select_tags(lang),
            'defaultLanguage': langcodes.get(self.t.languages[0]).language,
            'defaultAudioLanguage': langcodes.get(
                lang or self.t.languages[0]).language,
        },
        'status': {
            'privacyStatus': privacy,
            'embeddable': True,
            'publicStatsViewable': True,
            'license': license,
        },
        'recordingDetails': {
            'recordingDate': self.t.date,
        },
    }
    # limit title length to 100 (YouTube api conformity)
    metadata['snippet']['title'] = metadata['snippet']['title'][:100]
    # limit Description length to 5000 (YouTube api conformity)
    metadata['snippet']['description'] = metadata['snippet'][
        'description'][:5000]
    if self.t.youtube_category:
        metadata['snippet']['categoryId'] = int(self.t.youtube_category)
    (mimetype, encoding) = mimetypes.guess_type(file)
    size = os.stat(file).st_size
    logging.debug(
        'guessed mime type for file %s as %s and its size as %u bytes' %
        (file, mimetype, size))
    # Start a resumable upload session:
    # https://developers.google.com/youtube/v3/docs/videos#resource
    r = requests.post(
        'https://www.googleapis.com/upload/youtube/v3/videos',
        params={
            'uploadType': 'resumable',
            'part': 'snippet,status,recordingDetails'
        },
        headers={
            'Authorization': 'Bearer ' + self.accessToken,
            'Content-Type': 'application/json; charset=UTF-8',
            'X-Upload-Content-Type': mimetype,
            'X-Upload-Content-Length': str(size),
        },
        data=json.dumps(metadata))
    if 200 != r.status_code:
        if 400 == r.status_code:
            raise YouTubeException(r.json()['error']['message'] + '\n' +
                                   r.text + '\n\n' +
                                   json.dumps(metadata, indent=2))
        else:
            raise YouTubeException(
                'Video creation failed with error-code %u: %s' %
                (r.status_code, r.text))
    if 'location' not in r.headers:
        raise YouTubeException(
            'Video creation did not return a location-header to upload to: %s'
            % (r.headers, ))
    logging.info(
        'successfully created video and received upload-url from %s' %
        (r.headers['server'] if 'server' in r.headers else '-'))
    logging.debug('uploading video-data to %s' % r.headers['location'])
    # Stream the file body to the upload URL returned above.
    with open(file, 'rb') as fp:
        upload = requests.put(
            r.headers['location'],
            headers={
                'Authorization': 'Bearer ' + self.accessToken,
                'Content-Type': mimetype,
            },
            data=fp)
    if 200 != upload.status_code and 201 != upload.status_code:
        # NOTE(review): this reports r.status_code / r.text (the creation
        # response) rather than upload.* -- probably a copy-paste slip.
        raise YouTubeException(
            'uploading video failed with error-code %u: %s' %
            (r.status_code, r.text))
    video = upload.json()
    # Re-encode the thumbnail to a plain JPEG that YouTube accepts.
    outjpg = os.path.join(self.t.publishing_path,
                          self.t.local_filename_base + '_youtube.jpg')
    try:
        r = subprocess.check_output(
            'ffmpeg -loglevel error -i ' + self.thumbnail.path +
            ' -f image2 -vcodec mjpeg -pix_fmt yuv420p -q:v 0 -y ' + outjpg,
            shell=True)
        logging.info("thumbnails reformatted for youtube")
    except Exception as e_:
        # NOTE(review): if check_output raised, `r` still holds the earlier
        # requests response, and .decode() will fail; also note the
        # inconsistent exception name (YoutubeException vs YouTubeException)
        # -- verify both are defined.
        raise YoutubeException("Could not scale thumbnail: " +
                               r.decode('utf-8')) from e_
    YoutubeAPI.update_thumbnail(self.accessToken, video['id'], outjpg)
    youtube_url = 'https://www.youtube.com/watch?v=' + video['id']
    logging.info('successfully uploaded video as %s', youtube_url)
    return video['id']
def tokenize(text, lang, include_punctuation=False, external_wordlist=False): """ Tokenize this text in a way that's relatively simple but appropriate for the language. Strings that are looked up in wordfreq will be run through this function first, so that they can be expected to match the data. The text will be run through a number of pre-processing steps that vary by language; see the docstring of `wordfreq.preprocess.preprocess_text`. If `include_punctuation` is True, punctuation will be included as separate tokens. Otherwise, punctuation will be omitted in the output. CJK scripts ----------- In the CJK languages, word boundaries can't usually be identified by a regular expression. Instead, there needs to be some language-specific handling. In Chinese, we use the Jieba tokenizer, with a custom word list to match the words whose frequencies we can look up. In Japanese and Korean, we use the MeCab tokenizer. The `external_wordlist` option only affects Chinese tokenization. If it's True, then wordfreq will not use its own Chinese wordlist for tokenization. Instead, it will use the large wordlist packaged with the Jieba tokenizer, and it will leave Traditional Chinese characters as is. This will probably give more accurate tokenization, but the resulting tokens won't necessarily have word frequencies that can be looked up. If you end up seeing tokens that are entire phrases or sentences glued together, that probably means you passed in CJK text with the wrong language code. 
""" # Use globals to load CJK tokenizers on demand, so that we can still run # in environments that lack the CJK dependencies global _mecab_tokenize, _jieba_tokenize language = langcodes.get(lang) info = get_language_info(language) text = preprocess_text(text, language) if info['tokenizer'] == 'mecab': from wordfreq.mecab import mecab_tokenize as _mecab_tokenize # Get just the language code out of the Language object, so we can # use it to select a MeCab dictionary tokens = _mecab_tokenize(text, language.language) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] elif info['tokenizer'] == 'jieba': from wordfreq.chinese import jieba_tokenize as _jieba_tokenize tokens = _jieba_tokenize(text, external_wordlist=external_wordlist) if not include_punctuation: tokens = [token for token in tokens if not PUNCT_RE.match(token)] else: # This is the default case where we use the regex tokenizer. First # let's complain a bit if we ended up here because we don't have an # appropriate tokenizer. if info['tokenizer'] != 'regex' and lang not in _WARNED_LANGUAGES: logger.warning( "The language '{}' is in the '{}' script, which we don't " "have a tokenizer for. The results will be bad.".format( lang, info['script'])) _WARNED_LANGUAGES.add(lang) tokens = simple_tokenize(text, include_punctuation=include_punctuation) return tokens
def test_updated_iana():
    """The updated IANA registry should know the language 'aqk' (Aninka)."""
    language = langcodes.get('aqk')
    assert language.language_name('en') == 'Aninka'
def _language_code_to_name(code):
    """Return the English name for the language identified by `code`."""
    language = langcodes.get(code)
    return language.language_name('en')
# NOTE(review): the first lines continue a widget call whose opening is above
# this chunk -- they are keyword arguments to a (multi)select over the known
# multilinguality categories; verify against the enclosing call.
    valid_set=list(known_multilingualities.keys()),
    format_func=lambda m: f"{m} : {known_multilingualities[m]}",
)
# When "other" is chosen, ask for a free-form suffix and register it as
# "other-<description>" in place of the bare "other" entry.
if "other" in state["multilinguality"]:
    other_multilinguality = leftcol.text_input(
        "You selected 'other' type of multilinguality. Please enter a short hyphen-separated description:",
        value="my-multilinguality",
    )
    leftcol.write(f"Registering other-{other_multilinguality} multilinguality")
    state["multilinguality"][state["multilinguality"].index("other")] = f"other-{other_multilinguality}"
# Validate each language tag with langcodes (imported as `lc`); tags that
# fail to parse are collected separately and reported below.
valid_values, invalid_values = list(), list()
for langtag in state["languages"]:
    try:
        lc.get(langtag)
        valid_values.append(langtag)
    except:
        # NOTE(review): bare except -- any parse failure marks the tag invalid.
        invalid_values.append(langtag)
leftcol.markdown("#### Languages")
if len(invalid_values) > 0:
    leftcol.markdown("Found the following invalid values:")
    leftcol.error(invalid_values)
# Re-prompt with only the valid tags pre-filled, then store the result.
langtags = leftcol.text_area(
    "What languages are represented in the dataset? expected format is BCP47 tags separated for ';' e.g. 'en-US;fr-FR'",
    value=";".join(valid_values),
)
state["languages"] = langtags.strip().split(";") if langtags.strip() != "" else []
def subtitles(subtitles, wikifilename, username, statuscallback=None, errorcallback=None):
    """Convert and upload subtitles to corresponding TimedText pages.

    For each (language code, file) pair in `subtitles`: normalize the
    language tag with langcodes, probe the file with ffprobe, convert it to
    SRT with ffmpeg if necessary, then save the subtitle text to the page
    'TimedText:<wikifilename>.<langcode>.srt' on Wikimedia Commons, unless
    that page already exists.

    NOTE(review): `wikifilename.decode('utf-8')` and
    `subtitletext.decode(...)` operate on byte strings, and u''-prefixed
    literals are used throughout — this looks like Python 2 code; confirm
    before running it under Python 3.

    :param subtitles: dict mapping language codes to subtitle file paths
    :param wikifilename: target file name on Commons (byte string)
    :param username: user name passed to pywikibot.Site
    :param statuscallback: optional callable(text, percent) for progress;
        errors are also reported through it
    :param errorcallback: optional callable(text); defaulted to a no-op but
        not otherwise used in this function
    """
    # Default both callbacks to no-ops so they can be called unconditionally.
    statuscallback = statuscallback or (lambda text, percent: None)
    errorcallback = errorcallback or (lambda text: None)
    percent = 0
    c = Converter(ffmpeg_path='/usr/bin/ffmpeg', ffprobe_path='/usr/bin/ffprobe')
    for langcode, filename in subtitles.items():
        try:
            # Normalize the language tag (e.g. 'PT_BR' -> 'pt-br') and build
            # a human-readable name such as 'Portuguese (Brazil)'.
            lang = langcodes.get(langcode)
            langcode = str(lang).lower()
            langdesc = lang.describe()
            langname = langdesc['language']
            del langdesc['language']
            if langdesc:
                # Append the remaining description parts (script, region, ...).
                langname += u' (%s)' % ', '.join(langdesc.values())
            statuscallback(u'Loading subtitle in ' + langname, int(percent))
            subtitletext = ''
            # Skip anything that is not a single-stream subtitle file.
            info = c.probe(filename)
            if not info:
                continue
            if len(info.streams) != 1:
                continue
            if info.streams[0].type != 'subtitle':
                continue
            format = info.streams[0].codec
            if format.lower() != 'srt':
                # Not SRT yet: convert with ffmpeg, writing the result next
                # to the original file, and continue with the converted file.
                target = filename + '.srt'
                cmd = ['/usr/bin/ffmpeg', '-i', filename, '-f', 'srt', target]
                statuscallback("Running cmd: %s" % cmd, None)
                subprocess.check_call(cmd, stderr=None)
                filename = target
            f = open(filename)
            subtitletext = f.read()
            f.close()
            # Guess the byte encoding with chardet and decode to text.
            subtitletext = subtitletext.decode(
                chardet.detect(subtitletext)['encoding'])
            percent += 50.0 / len(subtitles)
            statuscallback('Uploading subtitle in ' + langname, int(percent))
            # ENSURE PYWIKIBOT OAUTH PROPERLY CONFIGURED!
            site = pywikibot.Site('commons', 'commons', user=username)
            page = pywikibot.Page(
                site, u'TimedText:' + wikifilename.decode('utf-8') + u'.' +
                langcode.lower() + u'.srt')
            page.text = subtitletext
            # Never overwrite an existing TimedText page.
            if not page.exists():
                page.save(summary=u'Import ' + langname + u' subtitles for ' +
                          '[[:File:' + wikifilename.decode('utf-8') + ']]',
                          minor=False)
            percent += 50.0 / len(subtitles)
            statuscallback('Finished processing subtitle in ' + langname, int(percent))
        except TaskAbort:
            # Deliberate aborts must propagate to the caller.
            raise
        except Exception as e:
            # Best-effort per-language processing: report the failure via the
            # status callback and move on to the next subtitle.
            statuscallback(type(e).__name__ + ": " + str(e), None)
            pass
def tokenize(text, lang, include_punctuation=False, external_wordlist=False,
             combine_numbers=False):
    """
    Tokenize this text in a way that's relatively simple but appropriate for
    the language. Strings that are looked up in wordfreq will be run through
    this function first, so that they can be expected to match the data.

    Some processing is language-specific (such as Chinese), but what broadly
    happens depends on the general writing system the language uses:

    - Alphabetic scripts (Latin, Cyrillic, Greek — English, Spanish, Russian,
      etc.): the text is normalized to NFC form, passed through a regular
      expression implementing the Word Segmentation section of Unicode
      Annex #29, then case-folded to lowercase. The effect is mostly to
      split on spaces and punctuation. Apostrophes inside words are split
      only when they occur before a vowel ("Hasn't" is one token, but
      "l'enfant" is two). For Turkish, case-folding maps capital I and İ to
      ı and i respectively.

    - Abjad scripts (Arabic, Hebrew, Persian, Urdu, etc.): these languages
      are written with optional vowel marks and sometimes other decorative
      markings and ligatures. The text is NFKC-normalized (a stronger,
      lossier form than NFC that reduces ligatures to simpler characters),
      marks and the Arabic tatweel are removed, then the alphabetic process
      above applies.

    - CJK scripts (Chinese, Japanese, Korean): word boundaries can't usually
      be found by a regular expression, so language-specific handling is
      needed. Chinese text is first converted to a canonical representation
      we call "Oversimplified Chinese", where every character is replaced by
      its Simplified Chinese form no matter what, even when this misspells a
      word or a name; that representation is tokenized with the Jieba
      tokenizer, trained on the list of Chinese words that can be looked up
      in wordfreq. Japanese and Korean are NFKC-normalized, then tokenized
      with MeCab using dictionary files included in this package.

    - Brahmic scripts and other languages go through the same tokenizer as
      alphabetic languages, tweaked for Indic languages such as Hindi,
      Tamil, and Telugu where the default Unicode algorithm wouldn't quite
      work. Southeast Asian languages written without spaces (Thai, Khmer,
      Lao, Myanmar) are not supported yet: splitting on spaces and
      punctuation gives tokens that are far too long.

    The options:

    - `include_punctuation` preserves punctuation as tokens, instead of
      removing it.
    - `external_wordlist` affects only Chinese: use the large wordlist
      packaged with the Jieba tokenizer instead of wordfreq's wordlist, and
      leave Traditional Chinese characters as is. This probably tokenizes
      more accurately, but the resulting tokens won't necessarily have word
      frequencies that can be looked up.
    - `combine_numbers` replaces multi-digit numbers with strings of zeroes.

    If you end up seeing tokens that are entire phrases or sentences glued
    together, that probably means you passed in CJK text with the wrong
    language code.
    """
    # Reduce whatever code was passed in to a bare language subtag,
    # preferring the macrolanguage (e.g. 'cmn' becomes 'zh').
    lang = langcodes.get(lang).prefer_macrolanguage().language

    if lang in ('ja', 'ko'):
        tokens = tokenize_mecab_language(text, lang, include_punctuation)
    elif lang in ('zh', 'yue'):
        tokens = chinese_tokenize(text, include_punctuation, external_wordlist)
    elif lang == 'tr':
        tokens = simple_tokenize(preprocess_turkish(text), include_punctuation)
    elif lang == 'ro':
        tokens = simple_tokenize(preprocess_romanian(text), include_punctuation)
    elif lang == 'sr':
        tokens = simple_tokenize(preprocess_serbian(text), include_punctuation)
    elif lang in ABJAD_LANGUAGES:
        # Strip vowel marks and the tatweel after NFKC normalization.
        stripped = remove_marks(unicodedata.normalize('NFKC', text))
        tokens = simple_tokenize(stripped, include_punctuation)
    else:
        tokens = simple_tokenize(text, include_punctuation)

    if combine_numbers:
        tokens = [smash_numbers(token) for token in tokens]
    return tokens
def get_sorted_languages():
    """Pair each supported language code with its autonym."""
    pairs = []
    for code in SUPPORTED_LANGUAGE_CODES:
        pairs.append((code, langcodes.get(code).autonym()))
    return pairs
def test_cldr_v40():
    """CLDR v40 data should give English's name in Lower Sorbian."""
    english = langcodes.get('en')
    name_in_dsb = english.language_name('dsb')
    assert name_in_dsb == 'engelšćina'
import string
import langcodes

# Walk every possible 2- and 3-letter language code and, for each language
# that has enough data to represent its own name, show:
#
# - The original code
# - The code after normalization
# - The language's name in English
# - The language's name in that language (its autonym)
en = langcodes.get('en')  # NOTE(review): unused below; kept for parity

letters = string.ascii_lowercase
for first in letters:
    for second in letters:
        for third in [''] + list(letters):
            candidate = first + second + third
            language = langcodes.get(candidate)
            if not language.has_name_data():
                continue
            name = language.language_name()
            autonym = language.autonym()
            print('%-3s %-3s %-30s %s' % (candidate, language.language, name, autonym))
def _process(self, resource, text, ratio=True):
    """Rewrite <img> tags in an HTML fragment for richer media handling.

    For every <img> in `text`:

    - fill in missing width/height via ``self._size``;
    - for scaled images named ``[email protected]``, divide the dimensions by the
      factor and generate the lower-resolution variants plus a ``srcset``;
    - turn SVGs under ``/obj/`` into <object> elements;
    - turn ``.m3u8`` sources into full <video> elements with a progressive
      MP4 fallback and any matching ``.vtt`` subtitle tracks;
    - turn ``.mp4``/``.ogv`` sources into simple inline <video> elements
      (animated-GIF style);
    - mark sufficiently large remaining images as lazy-loading;
    - wrap images contained in a paragraph in a responsive
      figure/outer/inner structure.

    :param resource: the resource being processed (used in the log message
        and to pick the default subtitle track language)
    :param text: HTML text to transform
    :param ratio: when True, reserve vertical space on the wrapper with a
        percentage padding-bottom computed from the image's aspect ratio
    :return: the transformed PyQuery document
    """
    d = pq(text, parser='html')
    for img in d.items('img'):
        width = img.attr.width
        height = img.attr.height
        src = img.attr.src
        # BUG FIX: check for a missing src *before* unquoting. The original
        # code called urllib.parse.unquote(src) first, which raises
        # TypeError on None, so the warning branch below was unreachable
        # and an <img> without src crashed the whole pass.
        if src is None:
            # logger.warn is a deprecated alias; use warning().
            self.logger.warning(
                "[%s] has an img tag without src attribute" % resource)
            continue
        src = urllib.parse.unquote(src)
        if width is None or height is None:
            wh = self._size(resource, src, width, height)
            if wh is not None:
                width, height = wh
            else:
                width, height = None, None
        if width is not None:
            width, height = int(width), int(height)

        # Adapt width/height if this is a scaled image ([email protected])
        mo = re.match(r'.*@(\d+)x\.[^.]*$', src)
        if mo and width is not None:
            factor = int(mo.group(1))
            width //= factor
            height //= factor
            srcset = ['{} {}x'.format(src, factor)]
            for f in reversed(range(1, factor)):
                tname = src.replace('@{}x.'.format(factor),
                                    '@{}x.'.format(f))
                self._resize(src, os.path.basename(tname), float(f)/factor)
                srcset.append('{} {}x'.format(tname, f))
            # The 1x variant (last generated) becomes the plain src, so it
            # is deliberately dropped from the srcset list.
            srcset = srcset[:-1]
            img.attr.src = tname
            img.attr.srcset = ','.join(srcset)

        # Put new width/height
        if width is not None:
            img.attr.width = '{}'.format(width)
            img.attr.height = '{}'.format(height)

        # If image is a SVG in /obj/, turns into an object
        if "/obj/" in src and src.endswith(".svg"):
            img[0].tag = 'object'
            img.attr("type", "image/svg+xml")
            img.attr("data", src)
            del img.attr.src
            img.text('🖼 {}'.format(img.attr.alt or ""))
        # On-demand videos (should be in /videos)
        elif src.endswith('.m3u8'):
            id = os.path.splitext(os.path.basename(src))[0]
            img[0].tag = 'video'
            img[0].set("controls", None)
            img.attr("preload", "none")
            img.attr("crossorigin", "anonymous")
            img.attr("poster", self.site.media_url(
                'images/posters/{}.jpg'.format(id)))
            del img.attr.src
            del img.attr.alt
            # Add sources: HLS playlist first, then progressive MP4 fallback.
            m3u8 = pq('<source>')
            m3u8.attr.src = self.site.media_url(
                'videos/{}.m3u8'.format(id))
            m3u8.attr.type = 'application/vnd.apple.mpegurl'
            img.append(m3u8)
            progressive = pq('<source>')
            progressive.attr.src = self.site.media_url(
                'videos/{}/progressive.mp4'.format(id))
            progressive.attr.type = 'video/mp4; codecs="avc1.4d401f, mp4a.40.2"'
            img.append(progressive)
            # Add subtitle tracks if any: media/videos/<id>.<code>.vtt
            vtts = [v for v in self.site.content.node_from_relative_path(
                        "media/videos").walk_resources()
                    if v.name.endswith('.vtt')
                    and v.name.startswith('{}.'.format(id))]
            for vtt in vtts:
                # Language code is the part between "<id>." and ".vtt".
                code = vtt.name[len(id)+1:-4]
                track = pq('<track>')
                track.attr.src = self.site.media_url(vtt.relative_path[6:])
                track.attr.kind = 'subtitles'
                track.attr.srclang = code
                # Default to the track matching the page's own language.
                if resource.meta.language == code:
                    track[0].set("default", None)
                if '-' not in code:
                    track.attr.label = langcodes.get(code).autonym()
                else:
                    # Regional variants: label as "Language (Region, ...)"
                    # described in the track's own language.
                    details = langcodes.get(code).describe(code)
                    lang = details['language']
                    del details['language']
                    track.attr.label = u"{} ({})".format(
                        lang, u", ".join(details.values()))
                img.append(track)
        # If image is a video not in /videos turn into a simple
        # video tag like an animated GIF.
        elif src.endswith(".mp4") or src.endswith(".ogv"):
            img[0].tag = 'video'
            for attr in {'muted', 'loop', 'autoplay', 'playsinline',
                         'controls'}:
                img[0].set(attr, None)
            del img.attr.alt

        # Lazy load if we have a large enough image
        if img[0].tag == "img" and width and width*height > 20000:
            img.attr.loading = "lazy"

        # If image is contained in a paragraph, enclose into a
        # responsive structure.
        parent = None
        parents = [p.tag for p in img.parents()]
        if parents[-1] == 'p':
            parent = img.parent()
        elif parents[-2:] == ['p', 'a']:
            parent = img.parent().parent()
        if parent:
            img.addClass('lf-media')
            inner = pq('<span />')
            outer = pq('<div />')
            inner.addClass('lf-media-inner')
            outer.addClass('lf-media-outer')
            if width is not None and ratio:
                # Reserve the image's aspect ratio as percentage padding so
                # the layout doesn't jump while the media loads.
                inner.css.padding_bottom = '{:.3f}%'.format(
                    float(height)*100./width)
                outer.css.width = '{}px'.format(width)
            outer.append(inner)
            # If we have a title, also enclose in a figure
            figure = pq('<figure />')
            if img.attr.title:
                figcaption = pq('<figcaption />')
                figcaption.html(img.attr.title)
                del img.attr.title
                figure.append(outer)
                figure.append(figcaption)
            else:
                figure.append(outer)
            # Put image in inner tag (keep a wrapping <a>, if any, intact)
            if img.parent()[0].tag == 'a':
                inner.append(img.parent())
            else:
                inner.append(img)
            # Replace parent with our enclosure
            parent.replace_with(lxml.html.tostring(figure[0],
                                                   encoding='unicode'))
    return d