def test_languages():
    # Make sure we get all the languages when looking for the default
    # 'best' wordlist
    avail = available_languages()
    assert len(avail) >= 34

    # 'small' covers the same languages, but with some different lists
    avail_small = available_languages('small')
    assert len(avail_small) == len(avail)
    assert avail_small != avail

    # 'combined' is the same as 'small'
    avail_old_name = available_languages('combined')
    assert avail_old_name == avail_small

    # 'large' covers fewer languages
    avail_large = available_languages('large')
    assert len(avail_large) >= 14
    assert len(avail) > len(avail_large)

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
        assert word_frequency('2', lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency('2', new_lang_code) > 0
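A minimal standalone sketch of the wordlist variants this test exercises, assuming only that wordfreq is installed and provides available_languages and word_frequency as used above:

from wordfreq import available_languages, word_frequency

# 'best' (the default) prefers a language's 'large' list when one exists
# and falls back to 'small' otherwise; 'combined' is an older alias
# for 'small'.
for wordlist in ('small', 'large', 'best'):
    langs = available_languages(wordlist)
    print(wordlist, len(langs))

# The same word can get different frequencies from different lists.
print(word_frequency('2', 'en', wordlist='small'))
print(word_frequency('2', 'en', wordlist='large'))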
def test_twitter():
    avail = available_languages('twitter')
    assert_greater(len(avail), 14)
    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 14)

    # Laughter is the universal language
    for lang in avail:
        if lang not in {"zh", "ja"}:
            # we do not have enough Chinese data
            # Japanese people do not lol
            assert_greater(word_frequency("lol", lang), 0)

            # Make up a weirdly verbose language code and make sure
            # we still get it
            new_lang_code = "%s-001-x-fake-extension" % lang.upper()
            assert_greater(word_frequency("lol", new_lang_code), 0)
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Laughter is the universal language. Look up either 'lol' or '笑' in each
    # language and make sure it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0)
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Look up a word representing laughter in each language, and make sure
    # it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        elif lang == 'ar':
            text = 'ههههه'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0,
                       (text, new_lang_code))
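A sketch of the language-tag fallback the last assertion relies on: wordfreq matches language codes approximately (via the langcodes package), so an overly specific BCP 47 tag resolves to the closest supported language. Assumes an installed wordfreq with an English wordlist:

from wordfreq import word_frequency

# Both lookups should hit the same English wordlist; the verbose tag
# is matched back to plain 'en'.
print(word_frequency('lol', 'en'))
print(word_frequency('lol', 'EN-001-x-fake-extension'))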
import sqlite3
import wordfreq
from conceptnet5.util import get_data_filename
from conceptnet5.uri import split_uri, join_uri

WORDFREQ_LANGUAGES = set(wordfreq.available_languages())

# These are the languages for which we are reasonably confident in Wiktionary's
# lemmatization.
LEMMATIZED_LANGUAGES = {
    # Languages supported by wordfreq:
    'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru', 'sv',
    'tr',

    # Other languages:
    'ast', 'gl', 'ca', 'oc', 'nrf',
def test_twitter():
    avail = available_languages("twitter")
    assert_greater(len(avail), 12)
    for lang in avail:
        assert_greater(word_frequency("rt", lang, "twitter"),
                       word_frequency("rt", lang, "combined"))
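The comparison this test makes can be reproduced for a single language. A sketch, assuming a wordfreq version that still ships separate 'twitter' and 'combined' wordlists:

from wordfreq import word_frequency

# 'rt' (retweet) should be far more frequent in the Twitter-derived
# list than in the combined list.
print(word_frequency('rt', 'en', 'twitter'))
print(word_frequency('rt', 'en', 'combined'))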
import sqlite3
import wordfreq
from conceptnet5.util import get_data_filename
from conceptnet5.uri import split_uri, join_uri

WORDFREQ_LANGUAGES = set(wordfreq.available_languages())
WORDFREQ_LANGUAGES_LARGE = set(wordfreq.available_languages('large'))

# These are the languages for which we are reasonably confident in Wiktionary's
# lemmatization.
LEMMATIZED_LANGUAGES = {
    # Languages supported by wordfreq:
    'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru', 'sv',
    'tr',

    # Other languages:
    'ast', 'gl', 'ca', 'oc', 'nrf', 'no', 'da', 'af', 'ga', 'gd', 'is', 'fo',
    'gv', 'cy', 'hsb', 'cs', 'sh', 'sk', 'sl', 'lv', 'fi', 'hu', 'ro', 'bg',
    'hy', 'ka', 'rup', 'uk', 'la', 'ang', 'grc', 'xcl', 'fro', 'non', 'eo',
    'io', 'vo',

    # Languages left out:
    # - No lemmatizer needed: zh, ms
    # - Not enough information in:
    #   ja, se, fa, sq, mg, he, mk, sa, nv, hi, fil, eu
    # - Script problems: sh
}
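One way these sets might be consumed downstream, as a hypothetical sketch (should_lemmatize and has_wordfreq_support are illustrative names, not part of the module above):

def should_lemmatize(lang):
    # Hypothetical policy: only trust Wiktionary lemmatization for
    # languages in the curated set above.
    return lang in LEMMATIZED_LANGUAGES


def has_wordfreq_support(lang, large=False):
    # Hypothetical check: does wordfreq supply frequencies for this
    # language, optionally requiring the 'large' wordlist?
    needed = WORDFREQ_LANGUAGES_LARGE if large else WORDFREQ_LANGUAGES
    return lang in needed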
""" A quick script to output the top N words (1000 for now) in each language. You can send the output to a file and diff it to see changes between wordfreq versions. """ import wordfreq N = 1000 if __name__ == '__main__': for lang in sorted(wordfreq.available_languages()): for word in wordfreq.top_n_list(lang, 1000): print('{}\t{}'.format(lang, word))
""" A quick script to output the top N words (500 for now) in each language. You can send the output to a file and diff it to see changes between wordfreq versions. """ import wordfreq N = 500 if __name__ == '__main__': for lang in sorted(wordfreq.available_languages()): for word in wordfreq.top_n_list(lang, N): print('{}\t{}'.format(lang, word))