def test_languages():
    """Check language coverage of the 'best', 'small', 'combined', and
    'large' wordlists, and that over-specified language tags still match."""
    # The default 'best' wordlist should cover every supported language.
    best = available_languages()
    assert len(best) >= 34

    # 'small' spans the same set of languages, though some of its lists
    # differ from the 'best' ones.
    small = available_languages('small')
    assert len(small) == len(best)
    assert small != best

    # 'combined' is a legacy alias for 'small'.
    assert available_languages('combined') == small

    # 'large' is only available for a strict subset of the languages.
    large = available_languages('large')
    assert len(large) >= 14
    assert len(large) < len(best)

    # The digit '2' should appear in every language's main wordlist.
    for lang in best:
        assert word_frequency('2', lang) > 0

        # A weirdly verbose language code should still resolve to the
        # same underlying wordlist.
        verbose_code = f'{lang.upper()}-001-x-fake-extension'
        assert word_frequency('2', verbose_code) > 0
Beispiel #2
0
def test_twitter():
    """The 'twitter' wordlist covers many languages, and 'rt' (retweet)
    is more frequent there than in general combined text."""
    avail = available_languages('twitter')
    # `assert_greater` comes from the unmaintained nose package; plain
    # asserts are the pytest-native equivalent and match the other tests.
    assert len(avail) > 14

    for lang in avail:
        # 'rt' should be characteristic of Twitter-sourced text.
        assert (word_frequency('rt', lang, 'twitter')
                > word_frequency('rt', lang, 'combined'))
Beispiel #3
0
def test_languages():
    """The number of available languages shouldn't decrease between
    releases, and 'lol' should be attested in most of them."""
    avail = available_languages()
    # `assert_greater` comes from the unmaintained nose package; plain
    # asserts are the pytest-native equivalent.
    assert len(avail) > 14

    # Laughter is the universal language
    for lang in avail:
        if lang in {"zh", "ja"}:
            # we do not have enough Chinese data;
            # Japanese people do not lol
            continue
        assert word_frequency("lol", lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = "%s-001-x-fake-extension" % lang.upper()
        assert word_frequency("lol", new_lang_code) > 0
Beispiel #4
0
def test_languages():
    """The number of available languages shouldn't decrease, and a word
    for laughter should have non-zero frequency in each one."""
    avail = available_languages()
    # `assert_greater` comes from the unmaintained nose package; plain
    # asserts are the pytest-native equivalent.
    assert len(avail) > 15

    # Laughter is the universal language. Look up either 'lol' or '笑' in
    # each language and make sure it has a non-zero frequency.
    for lang in avail:
        text = '笑' if lang in {'zh', 'ja'} else 'lol'
        assert word_frequency(text, lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency(text, new_lang_code) > 0
Beispiel #5
0
def test_languages():
    """The number of available languages shouldn't decrease, and a word
    representing laughter should be attested in each one."""
    avail = available_languages()
    # `assert_greater` comes from the unmaintained nose package; plain
    # asserts are the pytest-native equivalent.
    assert len(avail) > 15

    # Look up a word representing laughter in each language, and make sure
    # it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        elif lang == 'ar':
            text = 'ههههه'
        else:
            text = 'lol'
        assert word_frequency(text, lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it. Include (text, code) in the failure message,
        # matching the original assert_greater's msg argument.
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency(text, new_lang_code) > 0, (text, new_lang_code)
Beispiel #6
0
import sqlite3
import wordfreq
from conceptnet5.util import get_data_filename
from conceptnet5.uri import split_uri, join_uri

WORDFREQ_LANGUAGES = set(wordfreq.available_languages())

# These are the languages for which we are reasonably confident in Wiktionary's
# lemmatization.
LEMMATIZED_LANGUAGES = {
    # Languages supported by wordfreq:
    'ar',
    'de',
    'el',
    'en',
    'es',
    'fr',
    'it',
    'nl',
    'pl',
    'pt',
    'ru',
    'sv',
    'tr',

    # Other languages:
    'ast',
    'gl',
    'ca',
    'oc',
    'nrf',
Beispiel #7
0
def test_twitter():
    """The 'twitter' wordlist covers many languages, and 'rt' (retweet)
    is more frequent there than in general combined text."""
    avail = available_languages("twitter")
    # `assert_greater` comes from the unmaintained nose package; plain
    # asserts are the pytest-native equivalent.
    assert len(avail) > 12

    for lang in avail:
        assert (word_frequency("rt", lang, "twitter")
                > word_frequency("rt", lang, "combined"))
Beispiel #8
0
import sqlite3
import wordfreq
from conceptnet5.util import get_data_filename
from conceptnet5.uri import split_uri, join_uri


WORDFREQ_LANGUAGES = set(wordfreq.available_languages())
WORDFREQ_LANGUAGES_LARGE = set(wordfreq.available_languages('large'))

# These are the languages for which we are reasonably confident in Wiktionary's
# lemmatization.
LEMMATIZED_LANGUAGES = {
    # Languages supported by wordfreq:
    'ar', 'de', 'el', 'en', 'es', 'fr', 'it', 'nl', 'pl', 'pt', 'ru',
    'sv', 'tr',

    # Other languages:
    'ast', 'gl', 'ca', 'oc', 'nrf',
    'no', 'da', 'af',
    'ga', 'gd', 'is', 'fo', 'gv', 'cy',
    'hsb', 'cs', 'sh', 'sk', 'sl', 'lv',
    'fi', 'hu', 'ro', 'bg',
    'hy', 'ka', 'rup', 'uk',
    'la', 'ang', 'grc', 'xcl', 'fro', 'non',
    'eo', 'io', 'vo',

    # Languages left out:
    #   - No lemmatizer needed: zh, ms
    #   - Not enough information in:
    #     ja, se, fa, sq, mg, he, mk, sa, nv, hi, fil, eu
    #   - Script problems: sh
Beispiel #9
0
"""
A quick script to output the top N words (1000 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq


# Number of top words to print per language.
N = 1000

if __name__ == '__main__':
    # Print one "<lang>\t<word>" line per top word, with languages in
    # sorted order so the output is stable and diffable across versions.
    for lang in sorted(wordfreq.available_languages()):
        # Bug fix: the loop previously hard-coded 1000 instead of using
        # the N constant defined above, so changing N had no effect.
        for word in wordfreq.top_n_list(lang, N):
            print('{}\t{}'.format(lang, word))
Beispiel #10
0
"""
A quick script to output the top N words (500 for now) in each language.
You can send the output to a file and diff it to see changes between wordfreq
versions.
"""
import wordfreq

# How many of the most frequent words to emit for each language.
N = 500

if __name__ == '__main__':
    # Emit one "<lang>\t<word>" line per top word; iterating languages
    # in sorted order keeps the output stable and diffable.
    for language in sorted(wordfreq.available_languages()):
        top_words = wordfreq.top_n_list(language, N)
        for w in top_words:
            print('{}\t{}'.format(language, w))