def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by underscores.

    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'
    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'
    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'
    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'
    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'
    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'
    >>> preprocess_and_tokenize_text('de', ' u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> preprocess_and_tokenize_text('en', '_')
    ''
    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)
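# The doctests above double as a quick self-check. A minimal sketch of
# running them directly, assuming preprocess_text and simple_tokenize are
# importable in this module as written:
if __name__ == '__main__':
    import doctest

    # Report any docstring example whose actual output doesn't match.
    doctest.testmod()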
def test_transliteration():
    # "Well, there's a lot of things you do not understand."
    # (from somewhere in OpenSubtitles)
    assert (
        tokenize("Па, има ту много ствари које не схваташ.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )
    assert (
        tokenize("Pa, ima tu mnogo stvari koje ne shvataš.", 'sr')
        == ['pa', 'ima', 'tu', 'mnogo', 'stvari', 'koje', 'ne', 'shvataš']
    )

    # I don't have examples of complete sentences in Azerbaijani that are
    # naturally in Cyrillic, because it turns out everyone writes Azerbaijani
    # in Latin letters on the Internet, _except_ sometimes for Wiktionary.
    # So here are some individual words.

    # 'library' in Azerbaijani Cyrillic
    assert preprocess_text('китабхана', 'az') == 'kitabxana'
    assert preprocess_text('КИТАБХАНА', 'az') == 'kitabxana'
    assert preprocess_text('KİTABXANA', 'az') == 'kitabxana'

    # 'scream' in Azerbaijani Cyrillic
    assert preprocess_text('бағырты', 'az') == 'bağırtı'
    assert preprocess_text('БАҒЫРТЫ', 'az') == 'bağırtı'
    assert preprocess_text('BAĞIRTI', 'az') == 'bağırtı'
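# This test can be selected on its own under pytest; a minimal sketch,
# assuming pytest is installed (equivalent to running
# `pytest -k test_transliteration` from the command line):
if __name__ == '__main__':
    import pytest

    # Run only the tests whose names match the keyword expression.
    pytest.main(["-k", "test_transliteration"])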
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language,
    including removing English stopwords, normalizing the text in a way
    appropriate to that language (using the text normalization from
    wordfreq), and joining its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it transliterates
    Serbian written in the Cyrillic alphabet to the Latin alphabet so that it
    can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for example,
    they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None

    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)

    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))

    return concept_uri(lang, norm_text, *more_text)
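# concept_uri is defined elsewhere in this repo. A hypothetical stand-in,
# inferred purely from the doctest outputs above rather than from the actual
# implementation, to show the shape of URI being produced:
def concept_uri_sketch(lang, text, *more_text):
    # Join the language code and the normalized pieces under the /c/
    # namespace, one path segment per piece.
    return '/'.join(['/c', lang, text] + list(more_text))

# concept_uri_sketch('en', 'this_is_test', 'n', 'example_phrase')
# -> '/c/en/this_is_test/n/example_phrase'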
def freq(word):
    # Return the frequency of a Finnish word, but report 0 for anything
    # that isn't already in normalized form, is a multi-digit string, or
    # has a leading or trailing hyphen (an affix fragment, not a word).
    if (
        word != preprocess_text(word, "fi")
        or MULTI_DIGIT_RE.fullmatch(word)
        or word.startswith("-")
        or word.endswith("-")
    ):
        return 0
    return wordfreq.word_frequency(word, "fi")
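# freq depends on names not defined in this excerpt; a minimal sketch of
# plausible definitions. The regex pattern is an assumption (it is meant to
# reject tokens of two or more digits), and the preprocess_text import path
# is likewise assumed:
import re

import wordfreq
# from wordfreq.preprocess import preprocess_text  # assumed import path

# Assumption: matches tokens made of two or more digits, so e.g. "1234" is
# filtered out while ordinary words pass through.
MULTI_DIGIT_RE = re.compile(r"[0-9][0-9]+")

# Expected behavior under these assumptions: a common Finnish word gets a
# nonzero frequency, while filtered inputs get 0.
# freq("kissa")   -> > 0  ('cat')
# freq("-lainen") -> 0    (leading hyphen)
# freq("1234")    -> 0    (multi-digit string)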