Example 1
def test_twitter():
    avail = available_languages('twitter')
    assert_greater(len(avail), 14)

    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
Example 2
def test_phrase_freq():
    ff = word_frequency("flip-flop", 'en')
    assert_greater(ff, 0)
    assert_almost_equal(
        1.0 / ff,
        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )
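Note: the assertion above encodes how wordfreq combines the tokens of a phrase: the reciprocal of the phrase frequency is the sum of the reciprocals of the token frequencies, so a phrase is never scored as more frequent than its rarest token. A minimal sketch of that rule as a standalone helper (hypothetical name; it assumes every token has a non-zero frequency):

from wordfreq import word_frequency

def combined_phrase_frequency(tokens, lang='en'):
    # 1 / f(phrase) == sum of 1 / f(token), as the test above asserts.
    reciprocal_sum = sum(1.0 / word_frequency(t, lang) for t in tokens)
    return 1.0 / reciprocal_sum

# 'flip-flop' tokenizes into 'flip' and 'flop', so this should land close to
# word_frequency('flip-flop', 'en').
print(combined_phrase_frequency(['flip', 'flop']))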
Example 3
def match_questions_with_categories(questions, clusters):
    """A simple matching algorithm that places questions into a pre-created cluster if:
        1. The question's lemmatized form contains the cluster's keyword
        2. The question contains no rarer English words that are also cluster keywords

        Parameters:
            questions (list[dict]): A list of dictionaries with an id and question (text) field
            clusters (list[string]): A list of pre-created keywords
    """
    cluster_additions = { "uncategorized": [] }
    for question in questions:
        clean_question = clean_text(question["question"].replace("\n", ""))
        cluster_options = set()
        for token in nlp(clean_question):
            if token.lemma_ in clusters:
                cluster_options.add(token.lemma_)
        if len(cluster_options) == 0:
            cluster_additions["uncategorized"].append(question["id"])
            continue
        best_keyword = None
        rarest_freq = 1
        for keyword in cluster_options:
            if word_frequency(keyword, "en") < rarest_freq:
                rarest_freq = word_frequency(keyword, "en")
                best_keyword = keyword
        if best_keyword in cluster_additions:
            cluster_additions[best_keyword].append(question["id"])
        else:
            cluster_additions[best_keyword] = [question["id"]]
    return cluster_additions
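A minimal, hypothetical driver for the function above; it assumes the module also provides the spaCy pipeline `nlp` and the `clean_text` helper that the function calls, and the ids, questions, and keywords are made up:

questions = [
    {"id": 1, "question": "How do I reset my password?"},
    {"id": 2, "question": "Where can I download my invoice?"},
    {"id": 3, "question": "What is the meaning of life?"},
]
clusters = ["password", "invoice"]

assignments = match_questions_with_categories(questions, clusters)
# Expected shape: {"uncategorized": [3], "password": [1], "invoice": [2]}
print(assignments)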
Example 4
def test_languages():
    # Make sure we get all the languages when looking for the default
    # 'best' wordlist
    avail = available_languages()
    assert len(avail) >= 34

    # 'small' covers the same languages, but with some different lists
    avail_small = available_languages('small')
    assert len(avail_small) == len(avail)
    assert avail_small != avail

    # 'combined' is the same as 'small'
    avail_old_name = available_languages('combined')
    assert avail_old_name == avail_small

    # 'large' covers fewer languages
    avail_large = available_languages('large')
    assert len(avail_large) >= 14
    assert len(avail) > len(avail_large)

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
        assert word_frequency('2', lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency('2', new_lang_code) > 0
Example 5
def test_phrase_freq():
    ff = word_frequency("flip-flop", 'en')
    assert_greater(ff, 0)
    assert_almost_equal(
        1.0 / ff,
        1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    )
Example 6
def choose_small_vocabulary(big_frame, concepts_filename, language):
    """
    Choose the vocabulary of the small frame, by eliminating the terms which:
     - contain more than one word
     - are not in ConceptNet
     - are not frequent
    """
    concepts = set(line.strip() for line in open(concepts_filename))
    vocab = []
    for term in big_frame.index:
        if '_' not in term and term in concepts:
            try:
                frequency = word_frequency(uri_to_label(term),
                                           language,
                                           wordlist='large')
            except LookupError:
                frequency = word_frequency(uri_to_label(term),
                                           language,
                                           wordlist='combined')
            vocab.append((term, frequency))
    small_vocab = [
        term for term, frequency in sorted(
            vocab, key=lambda x: x[1], reverse=True)[:50000]
    ]
    return small_vocab
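The try/except above handles languages that have no 'large' wordlist: wordfreq raises LookupError in that case, and the code falls back to the older 'combined' list. The same fallback, isolated into a hypothetical helper (assuming a wordfreq version that still ships the 'combined' list, as this example does):

from wordfreq import word_frequency

def frequency_with_fallback(word, language):
    # Prefer the 'large' wordlist; fall back to 'combined' when the language
    # has no large list and wordfreq raises LookupError.
    try:
        return word_frequency(word, language, wordlist='large')
    except LookupError:
        return word_frequency(word, language, wordlist='combined')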
Example 7
def makeKeyWords(amazonNameSplited, ebayNameSplited):
    # Makes a priority list for Amazon
    amazonNameSplited_Priority = []
    keyWords_Amazon = []
    for word in amazonNameSplited:
        wordFrequency = word_frequency(word, 'en')
        amazonNameSplited_Priority.append(wordFrequency)
    # Gets the minimum value
    returnParams = getMinmumValue(amazonNameSplited,
                                  amazonNameSplited_Priority)
    keyWords_Amazon.append(returnParams[0])
    amazonNameSplited_Priority = returnParams[1]
    returnParams = getMinmumValue(amazonNameSplited,
                                  amazonNameSplited_Priority)
    keyWords_Amazon.append(returnParams[0])
    # Makes a priority list for Ebay
    ebayNameSplited_Priority = []
    keyWords_Ebay = []
    for word in ebayNameSplited:
        wordFrequency = word_frequency(word, 'en')
        ebayNameSplited_Priority.append(wordFrequency)
    # Gets the minimum value
    returnParams = getMinmumValue(ebayNameSplited, ebayNameSplited_Priority)
    keyWords_Ebay.append(returnParams[0])
    ebayNameSplited_Priority = returnParams[1]
    returnParams = getMinmumValue(ebayNameSplited, ebayNameSplited_Priority)
    keyWords_Ebay.append(returnParams[0])
    return keyWords_Amazon, keyWords_Ebay
Example 8
def test_freq_examples():
    # Stopwords are most common in the correct language
    assert_greater(word_frequency('the', 'en'),
                   word_frequency('de', 'en'))

    assert_greater(word_frequency('de', 'es'),
                   word_frequency('the', 'es'))
Example 9
def test_twitter():
    avail = available_languages('twitter')
    assert_greater(len(avail), 12)

    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
Example 10
def test_languages():
    # Make sure we get all the languages when looking for the default
    # 'best' wordlist
    avail = available_languages()
    assert len(avail) >= 34

    # 'small' covers the same languages, but with some different lists
    avail_small = available_languages('small')
    assert len(avail_small) == len(avail)
    assert avail_small != avail

    # 'combined' is the same as 'small'
    avail_old_name = available_languages('combined')
    assert avail_old_name == avail_small

    # 'large' covers fewer languages
    avail_large = available_languages('large')
    assert len(avail_large) >= 14
    assert len(avail) > len(avail_large)

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
        assert word_frequency('2', lang) > 0

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency('2', new_lang_code) > 0
Example 11
def test_freq_examples():
    # Stopwords are most common in the correct language
    assert_greater(word_frequency('the', 'en'),
                   word_frequency('de', 'en'))

    assert_greater(word_frequency('de', 'es'),
                   word_frequency('the', 'es'))
Example 12
def term_freq(term):
    _c, lang, term = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(term, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(term, lang)
    else:
        return 0.
Example 13
def sort_by_rarity(word_list: List[str]) -> List[str]:
    if len(word_list) <= 1:
        return word_list
    return sort_by_rarity(
        [word for word in word_list[1:] if word_frequency(word, 'en') < word_frequency(word_list[0], 'en')]
    ) + [word_list[0]] + \
           sort_by_rarity(
               [word for word in word_list[1:] if word_frequency(word, 'en') >= word_frequency(word_list[0], 'en')])
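The recursion above is a quicksort keyed on English word frequency, so the result is ordered rarest-first. A simpler, non-recursive sketch with the same rarest-first ordering (ties may come out in a different order):

from typing import List
from wordfreq import word_frequency

def sort_by_rarity_simple(word_list: List[str]) -> List[str]:
    # Sort ascending by frequency in English: rarest words first.
    return sorted(word_list, key=lambda word: word_frequency(word, 'en'))

# e.g. sort_by_rarity_simple(['the', 'sesquipedalian', 'apple'])
# -> ['sesquipedalian', 'apple', 'the']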
Example 14
def getPhrasePoints(phrase, content):
    if phrase.lower() not in commonWords and len(phrase) > 2:
        points = content.lower().count(
            phrase.lower()) * len(phrase) / (word_frequency(
                phrase, 'en') if word_frequency(phrase, 'en') != 0 else 1)
        return points
    else:
        return 0
Example 15
def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    assert_almost_equal(word_frequency('おはようおはよう', 'ja'), ohayou_freq / 2)
    assert_almost_equal(1.0 / word_frequency('おはようございます', 'ja'),
                        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq)
Example 16
def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert word_frequency('감사감사', 'ko') == pytest.approx(gamsa_freq / 2, rel=0.01)
    assert (
        1.0 / word_frequency('감사합니다', 'ko') ==
        pytest.approx(1.0 / gamsa_freq + 1.0 / habnida_freq, rel=0.01)
    )
Example 17
def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2,
                                                             rel=0.01)

    assert (1.0 / word_frequency('おはようございます', 'ja') == pytest.approx(
        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01))
Example 18
def test_twitter():
    avail = available_languages('twitter')
    assert_greater(len(avail), 15)

    for lang in avail:
        assert_greater(word_frequency('rt', lang, 'twitter'),
                       word_frequency('rt', lang, 'combined'))
        text = LAUGHTER_WORDS.get(lang, 'haha')
        assert_greater(word_frequency(text, lang, wordlist='twitter'), 0,
                       (text, lang))
Example 19
def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou_freq / 2, rel=0.01)
    
    assert (
        1.0 / word_frequency('おはようございます', 'ja') ==
        pytest.approx(1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq, rel=0.01)
    )
Example 20
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (
        lossy_tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', include_punctuation=True)
        == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']
    )
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
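These assertions show the digit-smashing behaviour: lossy_tokenize keeps a number's shape ('0.00', '00000') but discards its digits, so numbers with the same pattern share one entry in the wordlist, which is why the test expects '24601' and '90210' to report the same frequency. A small sketch of the same idea (exact values depend on the installed wordfreq data):

from wordfreq import lossy_tokenize, word_frequency

assert lossy_tokenize('90210', 'en') == ['00000']   # same bucket as '24601'
assert lossy_tokenize('1', 'en') == ['1']           # single digits are kept
print(word_frequency('90210', 'en'))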
Example 21
def test_combination():
    gamsa_freq = word_frequency('감사', 'ko')
    habnida_freq = word_frequency('합니다', 'ko')

    assert_almost_equal(
        word_frequency('감사감사', 'ko'),
        gamsa_freq / 2
    )
    assert_almost_equal(
        1.0 / word_frequency('감사합니다', 'ko'),
        1.0 / gamsa_freq + 1.0 / habnida_freq
    )
Example 22
def test_combination():
    ohayou_freq = word_frequency('おはよう', 'ja')
    gozai_freq = word_frequency('ござい', 'ja')
    masu_freq = word_frequency('ます', 'ja')

    assert_almost_equal(
        word_frequency('おはようおはよう', 'ja'),
        ohayou_freq / 2
    )
    assert_almost_equal(
        1.0 / word_frequency('おはようございます', 'ja'),
        1.0 / ohayou_freq + 1.0 / gozai_freq + 1.0 / masu_freq
    )
Example 23
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 26)

    # Look up the digit '2' in the main word list for each language
    for lang in avail:
        assert_greater(word_frequency('2', lang), 0, lang)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency('2', new_lang_code), 0, new_lang_code)
Example 24
def test_number_smashing():
    assert tokenize('"715 - CRΣΣKS" by Bon Iver',
                    'en') == ['715', 'crσσks', 'by', 'bon', 'iver']
    assert lossy_tokenize('"715 - CRΣΣKS" by Bon Iver',
                          'en') == ['000', 'crσσks', 'by', 'bon', 'iver']
    assert (lossy_tokenize('"715 - CRΣΣKS" by Bon Iver',
                           'en',
                           include_punctuation=True) == [
                               '"', '000', '-', 'crσσks', '"', 'by', 'bon',
                               'iver'
                           ])
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
Example 25
def test_number_smashing():
    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en'),
        ['715', 'crσσks', 'by', 'bon', 'iver'])
    eq_(tokenize('"715 - CRΣΣKS" by Bon Iver', 'en', combine_numbers=True),
        ['000', 'crσσks', 'by', 'bon', 'iver'])
    eq_(
        tokenize('"715 - CRΣΣKS" by Bon Iver',
                 'en',
                 combine_numbers=True,
                 include_punctuation=True),
        ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])
    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])
    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
Example 26
    def lookup(self, language, word, pos=None):
        if self.db is None:
            self.db = sqlite3.connect(self.filename)
        if language not in LEMMATIZED_LANGUAGES:
            return word, ''
        exceptions = EXCEPTIONS.get(language, {})
        if word in exceptions:
            return exceptions[word]
        exceptions_fixed = EXCEPTIONS_FIXED.get(language, set())
        if word in exceptions_fixed:
            return word, ''

        cursor = self.db.cursor()
        if pos:
            cursor.execute(QUERY + ' AND pos=?', (language, word, pos))
        else:
            cursor.execute(QUERY, (language, word))

        rows = list(cursor.fetchall())
        if len(rows) == 0:
            return word, ''
        elif len(rows) == 1:
            root, form, pos = rows[0]
            return root, form
        else:
            possibilities = []
            for row in rows:
                root, form, pos = row
                if language in WORDFREQ_LANGUAGES_LARGE:
                    goodness = wordfreq.word_frequency(root, language, 'large')
                elif language in WORDFREQ_LANGUAGES:
                    goodness = wordfreq.word_frequency(root, language)
                else:
                    goodness = 0.
                if pos == 'n':
                    goodness += 1.
                if form == 'positiv' or form == 'singular' and root != word:
                    goodness -= 2.
                if goodness >= 0:
                    possibilities.append((-goodness, root, form))
            possibilities.sort()
            if not possibilities:
                return word, ''
            _, root, form = possibilities[0]

            if root == word:
                form = ''
            return root, form
Example 27
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 14)

    # Laughter is the universal language
    for lang in avail:
        if lang not in {'zh', 'ja'}:
            # we do not have enough Chinese data
            # Japanese people do not lol
            assert_greater(word_frequency('lol', lang), 0)

            # Make up a weirdly verbose language code and make sure
            # we still get it
            new_lang_code = '%s-001-x-fake-extension' % lang.upper()
            assert_greater(word_frequency('lol', new_lang_code), 0)
Example 28
    def add_vocab(self, vocab, tags=None):
        srs_notes = srs_api.find_notes(simplified=vocab)

        if len(srs_notes) == 0:
            db_v = zh.Vocab.get_or_none(simplified=vocab)
            if db_v:
                data = dict(db_v)
            else:
                data = {'simplified': vocab}

            data['frequency'] = word_frequency(vocab, 'zh') * 10**6

            srs_note = srs_api.create_note(model=self.v_model, data=data)

            for srs_card in srs_note.cards:
                if srs_card.template.name == '中英':
                    level = self.h_level[vocab]
                    label = self.LABELS[(int(level) - 1) // 10]
                else:
                    level = self.v_level[vocab]
                    label = self.LABELS[(int(level) - 1) // 10]

                srs_card.add_deck(f'ZhLevel::'
                                  f'Vocab::'
                                  f'{srs_card.template.name}::'
                                  f'{label}::'
                                  f'Level {int(level):02d}')

        if tags:
            srs_api.notes_add_tags(srs_notes, tags)

        return srs_notes
Example 29
def __find_next_cluster(keyword_clusters, min_cluster_size, max_cluster_size):
    """Chooses the best available cluster larger than min_cluster_size and smaller than
    max_cluster_size, where "best" means having the keyword that is rarest in the English language
    (according to wordfreq's corpus) among the options.

    Args:
        keyword_clusters (dict): set of documents for each keyword
        min_cluster_size (int)
        max_cluster_size (int)
    """
    lemma_dfs = [(k, len(v)) for k, v in keyword_clusters.items()]
    lemma_dfs.sort(key=lambda a: a[1])
    possible_keywords = set()
    for keyword, count in lemma_dfs:
        if count < min_cluster_size or keyword in STOP_WORDS or keyword == "-PRON-":
            continue
        if count > max_cluster_size:
            break
        possible_keywords.add(keyword)
    if len(possible_keywords) == 0:
        return None, None
    rarest_freq = 1
    rarest_keyword = None
    for keyword in possible_keywords:
        freq = word_frequency(keyword, "en")
        if 0.0 < freq < rarest_freq:
            rarest_keyword = keyword
            rarest_freq = freq
    return rarest_keyword, keyword_clusters[rarest_keyword]
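A hypothetical call to the function above; the document sets are made up, and it assumes the module-level STOP_WORDS set used inside the function is available:

keyword_clusters = {
    "password": {1, 4, 9},
    "invoice": {2, 7},
    "the": {1, 2, 3, 4, 5, 6, 7, 8, 9},   # too large and a stop word
}

keyword, docs = __find_next_cluster(keyword_clusters,
                                    min_cluster_size=2,
                                    max_cluster_size=5)
# Picks whichever acceptable keyword is rarest in English, e.g. "invoice"
# if word_frequency("invoice", "en") < word_frequency("password", "en").
print(keyword, docs)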
Example 30
def pre_sif_mean(mat, refs, lang, dtype=None):
    return pre_sif_mean_inner(
        mat,
        (wordfreq.word_frequency(get_wf(ref), lang) for ref in refs),
        1e-3,
        dtype=dtype,
    )
Example 31
def calc_SIP(wordTuple):
    word = wordTuple[0]
    freq = word_frequency(word, 'en')
    if freq == 0:
        freq = float(.00001)
    SIPscore = wordTuple[1] / freq
    return (word, SIPscore)
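calc_SIP divides a word's observed weight by its general-English frequency, so words that occur far more often than English usage would predict get high scores (SIP presumably stands for "statistically improbable phrase"); the 1e-5 fallback keeps unknown words from causing a division by zero. A hedged variant that leans on wordfreq's own `minimum` floor instead of the manual check:

from wordfreq import word_frequency

def calc_SIP_with_minimum(word_tuple):
    # Same idea as calc_SIP above, but let word_frequency apply the floor.
    word, observed = word_tuple
    freq = word_frequency(word, 'en', minimum=1e-5)
    return (word, observed / freq)

# e.g. calc_SIP_with_minimum(('hobbit', 0.004)) scores far higher than
# calc_SIP_with_minimum(('house', 0.004)), because 'hobbit' is much rarer.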
Example 32
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 14)

    # Laughter is the universal language
    for lang in avail:
        if lang not in {"zh", "ja"}:
            # we do not have enough Chinese data
            # Japanese people do not lol
            assert_greater(word_frequency("lol", lang), 0)

            # Make up a weirdly verbose language code and make sure
            # we still get it
            new_lang_code = "%s-001-x-fake-extension" % lang.upper()
            assert_greater(word_frequency("lol", new_lang_code), 0)
Example 33
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified  = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    eq_(
        tokenize(hobart, 'zh'),
        ['加', '勒', '特', '霍', '巴特']
    )

    eq_(
        tokenize(fact_simplified, 'zh'),
        [
         # he / is / in history / #6 / counter for people
         '他', '是',  '历史上', '第六', '位',
         # during / term of office / in / die
         '在', '任期', '内', '去世',
         # of / U.S. / deputy / president
         '的', '美国', '副', '总统'
        ]
    )

    # You match the same tokens if you look it up in Traditional Chinese.
    eq_(tokenize(fact_simplified, 'zh'), tokenize(fact_traditional, 'zh'))
    assert_greater(word_frequency(fact_traditional, 'zh'), 0)
Example 34
def get_least_frequent_words(sentence, n):
    """
	Extracts and returns the n least frequent words of a given sentence
	"""
    freq_list = []
    for index, word in enumerate(sentence):
        if is_a_website(word):
            continue
        if word in ['•', '’', '”', '“', ')', '–', '»'] or word in string.punctuation:
            continue
        # make sure frequencies are in there (hardcoded)
        if 'ghz' in word:
            freq_list.append((index, word, 0.0))
        else:
            freq_list.append((index, word, word_frequency(word, 'en')))

    # sort words in least frequency
    sorted_on_freq = [
        (x[0], x[1])
        for x in set(sorted(freq_list, key=lambda tup: tup[2])[0:n])
    ]

    # return list of words in logical order
    return [x[1] for x in sorted(sorted_on_freq, key=lambda tup: tup[0])]
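A hypothetical call; it assumes `is_a_website` from the same module and a sentence that is already split into words. Tokens containing 'ghz' are forced to frequency 0, so spec-like terms always count as rare:

sentence = ['the', 'new', 'laptop', 'ships', 'with', 'a', '3.2ghz', 'octa-core', 'cpu']
print(get_least_frequent_words(sentence, 3))
# -> the three rarest tokens, returned in their original order,
#    likely something like ['3.2ghz', 'octa-core', 'cpu']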
Example 35
def clean_text(text):
    result = []
    words = text.split(" ")
    words = [a.strip(',.!?:; ') for a in words]

    words = list(set(words))
    words = [
        word for word in words
        if not word.isalpha() or word.lower() in different_words
    ]

    for word in set(words):
        # Maybe unkify?
        result += [
            re.sub(r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]',
                   ' potato ', " " + text.lower() + " ").strip()
        ]

    tokenizer = RegexpTokenizer(r'\w+')
    all_words = tokenizer.tokenize(text)
    # logging.info("all_words "+str(all_words))
    # Try removing all unknown words
    for word in set(all_words):
        if word.lower() not in counter and word_frequency(
                word.lower(), "en") == 0 and len(word) > 2:
            text = text.replace(word, '')

    result += [text]
    return result
Example 36
def uncommon_words(n1, n2):
    uncommon_words_found = [[], []]
    for idx, n in enumerate([n1, n2]):
        words = tokenize(n['value'], funcs_word=[lower])

        # Filter out words based on their lengths and if they do not contain any letter
        filtered_words = []
        for w in words:
            if len(w) > 3 and re.search("[a-zA-Z]", w):
                filtered_words.append(w)

        words_freqs = {}
        for w in filtered_words:
            if w not in words_freqs:
                probability = wordfreq.word_frequency(w,
                                                      'en',
                                                      wordlist='large')
                words_freqs[w] = probability

        res = [key for key in words_freqs.keys() if words_freqs[key] < thr]
        uncommon_words_found[idx] = res

    shared_uncommon_words = set(uncommon_words_found[0]) & set(
        uncommon_words_found[1])
    if len(shared_uncommon_words) > 0:
        return {
            'outcome': True,
            'words': sorted(list(shared_uncommon_words), key=len,
                            reverse=True),
        }
    else:
        return {'outcome': False}
Example 37
def remove_SE_comment(text, model, features, tf_idf_counter):
    t = time.time()
    words = text.split(" ")
    words = [a.strip(',.!?:; ') for a in words]

    words = list(set(words))
    words = [
        word for word in words
        if not word.isalpha() or word.lower() in different_words
    ]

    for word in set(words):
        # Maybe unkify?
        new_sentence = re.sub(
            r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]',
            ' potato ', text.lower())
        new_features = rescore(new_sentence, features, tf_idf_counter)

        if model.predict([new_features])[0] == 0:
            return 1

    tokenizer = RegexpTokenizer(r'\w+')
    all_words = tokenizer.tokenize(text)
    # Try removing all unknown words
    for word in set(all_words):
        if word.lower() not in counter and word_frequency(
                word.lower(), "en") == 0 and len(word) > 2:
            text = text.replace(word, '')

    # Re-score the text after unknown words have been stripped
    new_features = rescore(text, features, tf_idf_counter)
    if model.predict([new_features])[0] == 0:
        return 1

    return 0
Example 38
    def __init__(self):
        global different_words
        global counter

        self.features = []
        self.nice_features = []
        self.parameter_names = []
        self.hyper_parameters_lists = []
        self.last_time = time.time()
        self.tf_idf_counter = 0
        self.use_filters = True
        self.counter = pickle.load(open("pickles/github_words.p", "rb"))
        counter = self.counter
        self.our_words = dict([(i, word_frequency(i, "en") * 10**9)
                               for i in self.counter])
        self.different_words = log_odds(defaultdict(int, self.counter),
                                        defaultdict(int, self.our_words))
        different_words = self.different_words
        self.anger_classifier = pickle.load(open("pickles/anger.p", "rb"))
        self.all_words = pickle.load(open("pickles/all_words.p", "rb"))
        self.m = sum(self.counter.values())
        self.all_false = {word: False for word in self.all_words}

        start_time = time.time()
        self.alpha = 0.1

        self.all_train_data = None
        self.test_data = None
        self.train_data = None
        self.model_function = None
Example 39
def word_list(filename):
    num_words = 0
    word_occurrence_map = {}
    with open(filename) as file:
        for line in file.readlines():
            formatted = ''.join(char for char in line if char not in EXCLUDE).strip('\n').lower()
            words = formatted.split(' ')
            for word in words:
                if word == '' or word == "'":
                    continue
                num_words += 1
                if word in word_occurrence_map.keys():
                    word_occurrence_map[word] += 1
                else:
                    word_occurrence_map[word] = 1

    # for word_occurrence in sorted(word_occurrence_map.items(), key=lambda x: x[1], reverse=True):
    #     print(f'{word_occurrence[0]}: {word_occurrence[1]}')

    results = {}
    for key in word_occurrence_map.keys():
        word_freq = word_occurrence_map[key]/num_words * 100  # Percent in text
        word_freq_control = word_frequency(key, 'en', wordlist='small') * 100  # Percent in english
        # print(key, word_freq, word_freq_control)
        results[key] = float(word_freq - CONTROL_MULTIPLY*word_freq_control)

    results = sorted(results.items(), key=lambda x: x[1], reverse=True)

    # print(results)

    return '\n\n' + filename + '\n' + ', '.join([item[0] for item in results][:30]) + '\n\n'
Example 40
def term_freq(term):
    """
    Get an estimate of the frequency of this term from the 'wordfreq' library.
    When miniaturizing, we use this as a cutoff for which words to include
    in the vocabulary.

    Because we have the most data for English, we allow lower word frequencies
    in English (by reading in the 'large' list, whose frequencies can go
    below 1e-6).
    """
    _c, lang, term = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(term, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(term, lang)
    else:
        return 0.
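As the docstring says, English terms are looked up in the 'large' list, whose frequencies can drop below 1e-6, while other core languages use the default list and everything else scores 0. A small sketch of the difference the list makes for a rare English word (exact values depend on the installed wordfreq data):

import wordfreq

rare = 'infrequency'
print(wordfreq.word_frequency(rare, 'en', 'large'))   # small but non-zero
print(wordfreq.word_frequency(rare, 'en', 'small'))   # may round down to 0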
Example 41
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Laughter is the universal language. Look up either 'lol' or '笑' in each
    # language and make sure it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0)
Example 42
def test_language_matching():
    freq = word_frequency('的', 'zh')
    eq_(word_frequency('的', 'zh-TW'), freq)
    eq_(word_frequency('的', 'zh-CN'), freq)
    eq_(word_frequency('的', 'zh-Hant'), freq)
    eq_(word_frequency('的', 'zh-Hans'), freq)
    eq_(word_frequency('的', 'yue-HK'), freq)
    eq_(word_frequency('的', 'cmn'), freq)
Example 43
def test_language_matching():
    freq = word_frequency("的", "zh")
    eq_(word_frequency("的", "zh-TW"), freq)
    eq_(word_frequency("的", "zh-CN"), freq)
    eq_(word_frequency("的", "zh-Hant"), freq)
    eq_(word_frequency("的", "zh-Hans"), freq)
    eq_(word_frequency("的", "yue-HK"), freq)
    eq_(word_frequency("的", "cmn"), freq)
Example 44
def test_language_matching():
    freq = word_frequency('的', 'zh')
    assert word_frequency('的', 'zh-TW') == freq
    assert word_frequency('的', 'zh-CN') == freq
    assert word_frequency('的', 'zh-Hant') == freq
    assert word_frequency('的', 'zh-Hans') == freq
    assert word_frequency('的', 'yue-HK') == freq
    assert word_frequency('的', 'cmn') == freq
Example 45
def test_languages():
    # Make sure the number of available languages doesn't decrease
    avail = available_languages()
    assert_greater(len(avail), 15)

    # Look up a word representing laughter in each language, and make sure
    # it has a non-zero frequency.
    for lang in avail:
        if lang in {'zh', 'ja'}:
            text = '笑'
        elif lang == 'ar':
            text = 'ههههه'
        else:
            text = 'lol'
        assert_greater(word_frequency(text, lang), 0)

        # Make up a weirdly verbose language code and make sure
        # we still get it
        new_lang_code = '%s-001-x-fake-extension' % lang.upper()
        assert_greater(word_frequency(text, new_lang_code), 0, (text, new_lang_code))
Example 46
def test_tokens():
    # Let's test on some Chinese text that has unusual combinations of
    # syllables, because it is about an American vice-president.
    #
    # (He was the Chinese Wikipedia's featured article of the day when I
    # wrote this test.)

    hobart = '加勒特·霍巴特'  # Garret Hobart, or "jiā lè tè huò bā tè".

    # He was the sixth American vice president to die in office.
    fact_simplified  = '他是历史上第六位在任期内去世的美国副总统。'
    fact_traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # His name breaks into five pieces, with the only piece staying together
    # being the one that means 'Bart'. The dot is not included as a token.
    assert tokenize(hobart, 'zh') == ['加', '勒', '特', '霍', '巴特']

    assert tokenize(fact_simplified, 'zh') == [
        # he / is / history / in / #6 / counter for people
        '他', '是',  '历史', '上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Jieba's original tokenizer knows a lot of names, it seems.
    assert tokenize(hobart, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']

    # We get almost the same tokens from the sentence using Jieba's own
    # wordlist, but it tokenizes "in history" as two words and
    # "sixth person" as one.
    assert tokenize(fact_simplified, 'zh', external_wordlist=True) == [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统'
    ]

    # Check that Traditional Chinese works at all
    assert word_frequency(fact_traditional, 'zh') > 0

    # You get the same token lengths if you look it up in Traditional Chinese,
    # but the words are different
    simp_tokens = tokenize(fact_simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(fact_traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == fact_simplified
    assert ''.join(trad_tokens) == fact_traditional
    simp_lengths = [len(token) for token in simp_tokens]
    trad_lengths = [len(token) for token in trad_tokens]
    assert simp_lengths == trad_lengths
Example 47
    def fuzz(self):

        """
        Compute an arbitrarily-scaled "fuzziness" score for the query tokens,
        where low is focused and high is fuzzy.

        Returns: float
        """

        freqs = [
            word_frequency(t, 'en', minimum=1e-6)
            for t in self.hash_tokens
        ]

        return reduce(lambda x, y: x*y, freqs)*1e10
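The score is the product of the query tokens' English frequencies, floored at 1e-6 so unknown tokens do not zero it out, then rescaled by 1e10: common, generic tokens drive it up (fuzzy), while rare, specific tokens keep it low (focused). The same scoring as a standalone sketch with a hypothetical name:

from functools import reduce
from wordfreq import word_frequency

def fuzziness(tokens):
    # Product of token frequencies with a 1e-6 floor, scaled for readability.
    freqs = [word_frequency(t, 'en', minimum=1e-6) for t in tokens]
    return reduce(lambda x, y: x * y, freqs) * 1e10

print(fuzziness(['the', 'of', 'and']))                  # high: fuzzy query
print(fuzziness(['mitochondrial', 'phylogenetics']))    # low: focused query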
Example 48
def test_freq_examples():
    assert_almost_equal(
        word_frequency('normalization', 'en', 'google-books'),
        1.767e-6, places=9
    )
    assert_almost_equal(
        word_frequency('normalization', 'en', 'google-books', 1e-6),
        2.767e-6, places=9
    )
    assert_almost_equal(
        word_frequency('normalisation', 'fr', 'leeds-internet'),
        4.162e-6, places=9
    )
    assert_greater(
        word_frequency('lol', 'xx', 'twitter'),
        word_frequency('lol', 'en', 'google-books')
    )
    eq_(
        word_frequency('totallyfakeword', 'en', 'multi', .5),
        .5
    )
Example 49
def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
    assert_almost_equal(
        word_frequency('谢谢谢谢', 'zh'),
        xiexie_freq / 20
    )
Example 50
def test_freq_examples():
    # Stopwords are most common in the correct language
    assert word_frequency('the', 'en') > word_frequency('de', 'en')
    assert word_frequency('de', 'es') > word_frequency('the', 'es')
    # We get word frequencies from the 'large' list when available
    assert word_frequency('infrequency', 'en') > 0.
Example 51
def test_minimums():
    assert word_frequency('esquivalience', 'en') == 0
    assert word_frequency('esquivalience', 'en', minimum=1e-6) == 1e-6
    assert word_frequency('the', 'en', minimum=1) == 1
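The `minimum` argument is a floor on the returned value, which is what several snippets above rely on (for example the fuzz score) to avoid zeros. A short illustration reusing the same words the test checks:

from wordfreq import word_frequency

for floor in (0.0, 1e-8, 1e-6):
    # 'esquivalience' is a fictitious dictionary word with a true frequency
    # of 0, so the result is simply clamped to the floor.
    print(floor, word_frequency('esquivalience', 'en', minimum=floor))

# A common word is unaffected unless the floor exceeds its real frequency.
print(word_frequency('the', 'en', minimum=1))   # -> 1, as the test asserts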
Example 52
def test_phrase_freq():
    ff = word_frequency("flip-flop", 'en')
    assert ff > 0
    phrase_freq = 1.0 / word_frequency('flip', 'en') + 1.0 / word_frequency('flop', 'en')
    assert 1.0 / ff == pytest.approx(phrase_freq, rel=0.01)
Example 53
def test_phrase_freq():
    ff = word_frequency("flip-flop", "en")
    assert_greater(ff, 0)
    assert_almost_equal(1.0 / ff, 1.0 / word_frequency("flip", "en") + 1.0 / word_frequency("flop", "en"))
Example 54
def test_freq_examples():
    # Stopwords are most common in the correct language
    assert_greater(word_frequency("the", "en"), word_frequency("de", "en"))

    assert_greater(word_frequency("de", "es"), word_frequency("the", "es"))
Example 55
def test_twitter():
    avail = available_languages("twitter")
    assert_greater(len(avail), 12)

    for lang in avail:
        assert_greater(word_frequency("rt", lang, "twitter"), word_frequency("rt", lang, "combined"))
Example 56
def test_minimums():
    eq_(word_frequency("esquivalience", "en"), 0)
    eq_(word_frequency("esquivalience", "en", minimum=1e-6), 1e-6)
    eq_(word_frequency("the", "en", minimum=1), 1)
Example 57
def test_at_in_corpus():
    # We have a word frequency for "l@s"
    assert word_frequency('l@s', 'es') > 0

    # It's not just treated as a word break
    assert word_frequency('l@s', 'es') < word_frequency('l s', 'es')
Example 58
def test_combination():
    xiexie_freq = word_frequency('谢谢', 'zh')   # "Thanks"
    assert word_frequency('谢谢谢谢', 'zh') == pytest.approx(xiexie_freq / 20, rel=0.01)