def test_twitter():
    """Check Twitter wordlist coverage and that 'rt' is more frequent there."""
    twitter_langs = available_languages('twitter')
    assert_greater(len(twitter_langs), 14)

    for code in twitter_langs:
        # 'rt' must be more frequent in the Twitter list than in 'combined'.
        on_twitter = word_frequency('rt', code, 'twitter')
        overall = word_frequency('rt', code, 'combined')
        assert_greater(on_twitter, overall)
def test_phrase_freq():
    """The reciprocal of a phrase's frequency is the sum of its parts' reciprocals."""
    phrase = word_frequency("flip-flop", 'en')
    assert_greater(phrase, 0)

    flip = word_frequency('flip', 'en')
    flop = word_frequency('flop', 'en')
    assert_almost_equal(1.0 / phrase, 1.0 / flip + 1.0 / flop)
def match_questions_with_categories(questions, clusters):
    """Assign each question to the pre-created cluster whose keyword it mentions.

    A question joins a cluster when one of its lemmatized tokens equals the
    cluster keyword. When several keywords match, the keyword with the lowest
    English word frequency wins. Questions that match no keyword are filed
    under "uncategorized".

    Parameters:
        questions (list[dict]): Dictionaries with an id and question (text) field
        clusters (list[string]): A list of pre-created keywords

    Returns:
        dict: Maps each chosen keyword (plus "uncategorized") to a list of
        question ids.
    """
    assignments = {"uncategorized": []}

    for entry in questions:
        text = clean_text(entry["question"].replace("\n", ""))
        matched = {tok.lemma_ for tok in nlp(text) if tok.lemma_ in clusters}

        if not matched:
            assignments["uncategorized"].append(entry["id"])
            continue

        # Keep the rarest matching keyword; on ties the first one seen wins.
        chosen = None
        lowest = 1
        for candidate in matched:
            freq = word_frequency(candidate, "en")
            if freq < lowest:
                lowest = freq
                chosen = candidate

        assignments.setdefault(chosen, []).append(entry["id"])

    return assignments
def test_languages():
    """Compare language coverage of the default, 'small', 'combined' and 'large' lists."""
    # The default lookup uses the 'best' wordlist and should list every language.
    best = available_languages()
    assert len(best) >= 34

    # 'small' has the same number of languages but is not an identical mapping.
    small = available_languages('small')
    assert len(small) == len(best)
    assert small != best

    # 'combined' is the old name for 'small'.
    legacy = available_languages('combined')
    assert legacy == small

    # 'large' exists for fewer languages.
    large = available_languages('large')
    assert len(large) >= 14
    assert len(best) > len(large)

    for lang in best:
        # The digit '2' should be known in every language's main list.
        assert word_frequency('2', lang) > 0

        # An over-specified language code must still resolve.
        verbose_code = '%s-001-x-fake-extension' % lang.upper()
        assert word_frequency('2', verbose_code) > 0
def choose_small_vocabulary(big_frame, concepts_filename, language):
    """
    Choose the vocabulary of the small frame, by eliminating the terms which:
     - contain more than one word (their label contains an underscore)
     - are not in ConceptNet (not listed in `concepts_filename`)
     - are not frequent (outside the 50000 most frequent remaining terms)

    Parameters:
        big_frame: object whose `.index` iterates over the candidate terms.
        concepts_filename: path of a text file with one concept per line.
        language: language code passed to `word_frequency`.

    Returns:
        list: up to 50000 terms, ordered from most to least frequent.
    """
    # Read the concept list inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open(concepts_filename) as concepts_file:
        concepts = {line.strip() for line in concepts_file}

    vocab = []
    for term in big_frame.index:
        if '_' not in term and term in concepts:
            try:
                frequency = word_frequency(uri_to_label(term), language, wordlist='large')
            except LookupError:
                # No 'large' list for this language: fall back to 'combined'.
                frequency = word_frequency(uri_to_label(term), language, wordlist='combined')
            vocab.append((term, frequency))

    ranked = sorted(vocab, key=lambda pair: pair[1], reverse=True)
    return [term for term, _frequency in ranked[:50000]]
def makeKeyWords(amazonNameSplited, ebayNameSplited):
    """Pick two keywords from each tokenized product name.

    Every word is scored with its English word frequency, then
    `getMinmumValue` is called twice per name. The priority list returned by
    the first call is fed into the second call.
    NOTE(review): presumably `getMinmumValue` returns (word, updated
    priorities) — confirm against its definition.

    Returns:
        tuple: (Amazon keywords, eBay keywords), each a two-element list.
    """

    def pick_two(words):
        priorities = [word_frequency(word, 'en') for word in words]

        outcome = getMinmumValue(words, priorities)
        picked = [outcome[0]]

        # The second pass works on the priorities handed back by the first.
        outcome = getMinmumValue(words, outcome[1])
        picked.append(outcome[0])
        return picked

    keyWords_Amazon = pick_two(amazonNameSplited)
    keyWords_Ebay = pick_two(ebayNameSplited)
    return keyWords_Amazon, keyWords_Ebay
def test_freq_examples():
    """Each stopword is more frequent in its own language than the foreign one."""
    english_the = word_frequency('the', 'en')
    english_de = word_frequency('de', 'en')
    assert_greater(english_the, english_de)

    spanish_de = word_frequency('de', 'es')
    spanish_the = word_frequency('the', 'es')
    assert_greater(spanish_de, spanish_the)
def test_twitter():
    """More than 12 Twitter languages exist, and 'rt' is more frequent on Twitter."""
    languages = available_languages('twitter')
    assert_greater(len(languages), 12)

    for language in languages:
        twitter_rt = word_frequency('rt', language, 'twitter')
        combined_rt = word_frequency('rt', language, 'combined')
        assert_greater(twitter_rt, combined_rt)
def term_freq(term):
    """Return the wordfreq frequency for a term URI.

    English terms use the 'large' wordlist, other languages in
    CORE_LANGUAGES use the default list, and everything else gets 0.
    """
    _prefix, lang, text = split_uri(term)[:3]

    if lang == 'en':
        return wordfreq.word_frequency(text, 'en', 'large')
    if lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(text, lang)
    return 0.
def sort_by_rarity(word_list: List[str]) -> List[str]:
    """Return the words ordered from rarest to most common in English.

    Words are ranked by ascending `word_frequency(word, 'en')`. Words with
    equal frequency keep their original relative order, as the previous
    first-pivot quicksort also did.

    The earlier recursive implementation recursed once per element on input
    that was already ordered, so a list of about a thousand words could raise
    RecursionError. It also repeated frequency lookups on every partition
    pass. The built-in stable sort avoids both and looks each word up once.

    Args:
        word_list: words to order.

    Returns:
        The input itself when it has at most one element (unchanged from the
        previous behaviour), otherwise a new sorted list.
    """
    if len(word_list) <= 1:
        return word_list
    return sorted(word_list, key=lambda word: word_frequency(word, 'en'))
def getPhrasePoints(phrase, content):
    """Score a phrase by its occurrences in content, its length and its rarity.

    Returns 0 for phrases listed in commonWords (case-insensitively) or of
    two characters or fewer. Otherwise returns occurrence count * phrase
    length divided by the phrase's English frequency, dividing by 1 when the
    frequency is zero.
    """
    lowered = phrase.lower()
    if lowered in commonWords or len(phrase) <= 2:
        return 0

    occurrences = content.lower().count(lowered)
    frequency = word_frequency(phrase, 'en')
    divisor = frequency if frequency != 0 else 1
    return occurrences * len(phrase) / divisor
def test_combination():
    """Frequencies of multi-token Japanese strings combine via reciprocals."""
    ohayou = word_frequency('おはよう', 'ja')
    gozai = word_frequency('ござい', 'ja')
    masu = word_frequency('ます', 'ja')

    # A word repeated twice gets half the single-word frequency.
    doubled = word_frequency('おはようおはよう', 'ja')
    assert_almost_equal(doubled, ohayou / 2)

    # Three different tokens: reciprocals add up.
    greeting_inverse = 1.0 / word_frequency('おはようございます', 'ja')
    assert_almost_equal(greeting_inverse, 1.0 / ohayou + 1.0 / gozai + 1.0 / masu)
def test_combination():
    """Frequencies of multi-token Korean strings combine via reciprocals (within 1%)."""
    gamsa = word_frequency('감사', 'ko')
    habnida = word_frequency('합니다', 'ko')

    doubled = word_frequency('감사감사', 'ko')
    assert doubled == pytest.approx(gamsa / 2, rel=0.01)

    thanks_inverse = 1.0 / word_frequency('감사합니다', 'ko')
    assert thanks_inverse == pytest.approx(1.0 / gamsa + 1.0 / habnida, rel=0.01)
def test_combination():
    """Frequencies of multi-token Japanese strings combine via reciprocals (within 1%)."""
    ohayou = word_frequency('おはよう', 'ja')
    gozai = word_frequency('ござい', 'ja')
    masu = word_frequency('ます', 'ja')

    repeated = word_frequency('おはようおはよう', 'ja')
    assert repeated == pytest.approx(ohayou / 2, rel=0.01)

    expected_inverse = 1.0 / ohayou + 1.0 / gozai + 1.0 / masu
    actual_inverse = 1.0 / word_frequency('おはようございます', 'ja')
    assert actual_inverse == pytest.approx(expected_inverse, rel=0.01)
def test_twitter():
    """Twitter lists: coverage, 'rt' more frequent than in 'combined', laughter words present."""
    twitter_langs = available_languages('twitter')
    assert_greater(len(twitter_langs), 15)

    for code in twitter_langs:
        rt_twitter = word_frequency('rt', code, 'twitter')
        rt_combined = word_frequency('rt', code, 'combined')
        assert_greater(rt_twitter, rt_combined)

        # Fall back to 'haha' when no language-specific laughter word is listed.
        laugh = LAUGHTER_WORDS.get(code, 'haha')
        laugh_freq = word_frequency(laugh, code, wordlist='twitter')
        assert_greater(laugh_freq, 0, (laugh, code))
def test_combination():
    """Check how Japanese token frequencies combine, allowing 1% relative error."""
    part_freqs = [word_frequency(part, 'ja') for part in ('おはよう', 'ござい', 'ます')]
    ohayou, gozai, masu = part_freqs

    # Repeating one word halves its frequency.
    assert word_frequency('おはようおはよう', 'ja') == pytest.approx(ohayou / 2, rel=0.01)

    # For distinct tokens the reciprocals are summed.
    inverse_sum = 1.0 / ohayou + 1.0 / gozai + 1.0 / masu
    assert (
        1.0 / word_frequency('おはようございます', 'ja')
        == pytest.approx(inverse_sum, rel=0.01)
    )
def test_number_smashing():
    """lossy_tokenize replaces multi-digit numbers with zeros; tokenize does not."""
    title = '"715 - CRΣΣKS" by Bon Iver'
    words = ['crσσks', 'by', 'bon', 'iver']

    assert tokenize(title, 'en') == ['715'] + words
    assert lossy_tokenize(title, 'en') == ['000'] + words

    with_punctuation = lossy_tokenize(title, 'en', include_punctuation=True)
    assert with_punctuation == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']

    # A single digit is kept; longer numbers keep only their shape.
    assert lossy_tokenize('1', 'en') == ['1']
    assert lossy_tokenize('3.14', 'en') == ['0.00']
    assert lossy_tokenize('24601', 'en') == ['00000']

    # Two different five-digit numbers get the same frequency.
    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_combination():
    """Frequencies of multi-token Korean strings combine via reciprocals."""
    gamsa = word_frequency('감사', 'ko')
    habnida = word_frequency('합니다', 'ko')

    # The same word twice: half the frequency.
    repeated = word_frequency('감사감사', 'ko')
    assert_almost_equal(repeated, gamsa / 2)

    # Two different tokens: reciprocals add up.
    combined_inverse = 1.0 / word_frequency('감사합니다', 'ko')
    assert_almost_equal(combined_inverse, 1.0 / gamsa + 1.0 / habnida)
def test_combination():
    """Check how the frequencies of Japanese tokens are combined."""
    ohayou, gozai, masu = (
        word_frequency('おはよう', 'ja'),
        word_frequency('ござい', 'ja'),
        word_frequency('ます', 'ja'),
    )

    assert_almost_equal(word_frequency('おはようおはよう', 'ja'), ohayou / 2)

    inverse_total = 1.0 / ohayou + 1.0 / gozai + 1.0 / masu
    assert_almost_equal(1.0 / word_frequency('おはようございます', 'ja'), inverse_total)
def test_languages():
    """More than 26 languages are available and each knows the digit '2'."""
    # Guard against the language count shrinking.
    languages = available_languages()
    assert_greater(len(languages), 26)

    for code in languages:
        plain_freq = word_frequency('2', code)
        assert_greater(plain_freq, 0, code)

        # An overly verbose language code must resolve to the same list.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        verbose_freq = word_frequency('2', verbose_code)
        assert_greater(verbose_freq, 0, verbose_code)
def test_number_smashing():
    """Multi-digit numbers collapse to zeros under lossy_tokenize only."""
    text = '"715 - CRΣΣKS" by Bon Iver'

    exact = tokenize(text, 'en')
    assert exact == ['715', 'crσσks', 'by', 'bon', 'iver']

    lossy = lossy_tokenize(text, 'en')
    assert lossy == ['000', 'crσσks', 'by', 'bon', 'iver']

    lossy_punct = lossy_tokenize(text, 'en', include_punctuation=True)
    assert lossy_punct == ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver']

    # Table of (input, expected lossy tokens) for bare numbers.
    for number, expected in (('1', ['1']), ('3.14', ['0.00']), ('24601', ['00000'])):
        assert lossy_tokenize(number, 'en') == expected

    assert word_frequency('24601', 'en') == word_frequency('90210', 'en')
def test_number_smashing():
    """With combine_numbers=True, multi-digit numbers are replaced by zeros."""
    title = '"715 - CRΣΣKS" by Bon Iver'

    eq_(tokenize(title, 'en'), ['715', 'crσσks', 'by', 'bon', 'iver'])

    smashed = tokenize(title, 'en', combine_numbers=True)
    eq_(smashed, ['000', 'crσσks', 'by', 'bon', 'iver'])

    smashed_punct = tokenize(title, 'en', combine_numbers=True,
                             include_punctuation=True)
    eq_(smashed_punct, ['"', '000', '-', 'crσσks', '"', 'by', 'bon', 'iver'])

    # Single digits survive; longer numbers keep only their shape.
    eq_(tokenize('1', 'en', combine_numbers=True), ['1'])
    eq_(tokenize('3.14', 'en', combine_numbers=True), ['0.00'])
    eq_(tokenize('24601', 'en', combine_numbers=True), ['00000'])

    eq_(word_frequency('24601', 'en'), word_frequency('90210', 'en'))
def lookup(self, language, word, pos=None):
    """Return a (root, form) pair for `word` in `language`.

    The word itself with an empty form is returned when the language is not
    in LEMMATIZED_LANGUAGES, when the word is listed in EXCEPTIONS_FIXED, or
    when the database has no usable match. Entries in EXCEPTIONS are
    returned as stored. When several rows match, candidates are scored by
    the root's word frequency (plus 1 for pos 'n', minus 2 for some forms)
    and the best non-negative score wins.

    Parameters:
        language: language code of the word.
        word: the surface form to look up.
        pos: optional part of speech used to narrow the query.
    """
    # The database connection is opened lazily on first use.
    if self.db is None:
        self.db = sqlite3.connect(self.filename)

    unchanged = (word, '')
    if language not in LEMMATIZED_LANGUAGES:
        return unchanged

    known = EXCEPTIONS.get(language, {})
    if word in known:
        return known[word]

    if word in EXCEPTIONS_FIXED.get(language, set()):
        return unchanged

    cursor = self.db.cursor()
    if pos:
        cursor.execute(QUERY + ' AND pos=?', (language, word, pos))
    else:
        cursor.execute(QUERY, (language, word))
    matches = list(cursor.fetchall())

    if not matches:
        return unchanged
    if len(matches) == 1:
        root, form, _row_pos = matches[0]
        return root, form

    ranked = []
    for root, form, row_pos in matches:
        if language in WORDFREQ_LANGUAGES_LARGE:
            score = wordfreq.word_frequency(root, language, 'large')
        elif language in WORDFREQ_LANGUAGES:
            score = wordfreq.word_frequency(root, language)
        else:
            score = 0.

        if row_pos == 'n':
            score += 1.
        # `and` binds tighter than `or`: 'positiv' is always penalised,
        # 'singular' only when the root differs from the word.
        if form == 'positiv' or (form == 'singular' and root != word):
            score -= 2.

        if score >= 0:
            # Negate so that an ascending sort puts the best score first.
            ranked.append((-score, root, form))

    ranked.sort()
    if not ranked:
        return unchanged

    _, root, form = ranked[0]
    if root == word:
        form = ''
    return root, form
def test_languages():
    """More than 14 languages are available; 'lol' is known outside zh and ja."""
    # Guard against the language count shrinking.
    languages = available_languages()
    assert_greater(len(languages), 14)

    for code in languages:
        # Chinese and Japanese are skipped for the 'lol' check.
        if code in {'zh', 'ja'}:
            continue

        assert_greater(word_frequency('lol', code), 0)

        # An overly verbose language code must still resolve.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        assert_greater(word_frequency('lol', verbose_code), 0)
def add_vocab(self, vocab, tags=None):
    """Create an SRS note for `vocab` if none exists, then tag the found notes.

    When `srs_api.find_notes` returns nothing, a note is created from the
    database entry (or just the simplified form), given a frequency field,
    and each of its cards is filed into a level-based deck.

    Parameters:
        vocab: the simplified form of the vocabulary item.
        tags: optional tags applied to the notes returned by `find_notes`.

    Returns:
        The notes returned by `srs_api.find_notes` (not the newly created one).
    """
    srs_notes = srs_api.find_notes(simplified=vocab)

    if len(srs_notes) == 0:
        db_v = zh.Vocab.get_or_none(simplified=vocab)
        data = dict(db_v) if db_v else {'simplified': vocab}
        data['frequency'] = word_frequency(vocab, 'zh') * 10**6

        srs_note = srs_api.create_note(model=self.v_model, data=data)

        for card in srs_note.cards:
            template_name = card.template.name
            # The '中英' template takes its level from h_level, others from v_level.
            levels = self.h_level if template_name == '中英' else self.v_level
            level = levels[vocab]
            label = self.LABELS[(int(level) - 1) // 10]

            deck_name = (f'ZhLevel::'
                         f'Vocab::'
                         f'{template_name}::'
                         f'{label}::'
                         f'Level {int(level):02d}')
            card.add_deck(deck_name)

    if tags:
        srs_api.notes_add_tags(srs_notes, tags)

    return srs_notes
def __find_next_cluster(keyword_clusters, min_cluster_size, max_cluster_size):
    """Chooses the best available cluster whose size lies between
    min_cluster_size and max_cluster_size (inclusive), where "best" means
    having the keyword that is rarest in the English language (according to
    wordfreq's corpus) among the options.

    Stop words and the "-PRON-" lemma are never chosen. Keywords whose
    frequency is 0 are ignored.

    Args:
        keyword_clusters (dict): set of documents for each keyword
        min_cluster_size (int)
        max_cluster_size (int)

    Returns:
        tuple: (keyword, documents) for the chosen cluster, or (None, None)
        when no keyword qualifies.
    """
    candidates = [
        keyword
        for keyword, documents in keyword_clusters.items()
        if min_cluster_size <= len(documents) <= max_cluster_size
        and keyword not in STOP_WORDS
        and keyword != "-PRON-"
    ]

    rarest_keyword = None
    rarest_freq = 1
    for keyword in candidates:
        freq = word_frequency(keyword, "en")
        if 0.0 < freq < rarest_freq:
            rarest_keyword = keyword
            rarest_freq = freq

    # Every candidate may be unknown to wordfreq (frequency 0). Previously
    # this fell through to keyword_clusters[None] and raised KeyError.
    if rarest_keyword is None:
        return None, None
    return rarest_keyword, keyword_clusters[rarest_keyword]
def pre_sif_mean(mat, refs, lang, dtype=None):
    """Call pre_sif_mean_inner with the word frequency of each reference.

    Frequencies are produced lazily, one per item of `refs`, by looking up
    `get_wf(ref)` in `lang`. The constant 1e-3 is passed as the third
    argument of `pre_sif_mean_inner`.
    """
    frequencies = (wordfreq.word_frequency(get_wf(ref), lang) for ref in refs)
    return pre_sif_mean_inner(mat, frequencies, 1e-3, dtype=dtype)
def calc_SIP(wordTuple):
    """Return (word, score), where score is wordTuple[1] over the word's English frequency.

    A frequency of zero is replaced by 0.00001 to avoid dividing by zero.
    """
    word = wordTuple[0]
    frequency = word_frequency(word, 'en')
    if frequency == 0:
        frequency = float(.00001)
    return (word, wordTuple[1] / frequency)
def test_languages():
    """More than 14 languages are available; 'lol' is known outside zh and ja."""
    all_langs = available_languages()
    # The number of available languages must not decrease.
    assert_greater(len(all_langs), 14)

    # Chinese and Japanese are excluded from the 'lol' check.
    checked = [lang for lang in all_langs if lang not in {"zh", "ja"}]
    for lang in checked:
        assert_greater(word_frequency("lol", lang), 0)

        # A weirdly verbose language code should still be matched.
        long_code = "%s-001-x-fake-extension" % lang.upper()
        assert_greater(word_frequency("lol", long_code), 0)
def test_tokens():
    """Tokenize Chinese text with unusual syllable combinations (a foreign name)."""
    # Garret Hobart, or "jiā lè tè huò bā tè".
    name = '加勒特·霍巴特'

    # "He was the sixth American vice president to die in office."
    simplified = '他是历史上第六位在任期内去世的美国副总统。'
    traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # The name splits into five tokens; the dot is dropped.
    name_tokens = tokenize(name, 'zh')
    eq_(name_tokens, ['加', '勒', '特', '霍', '巴特'])

    expected_fact = [
        # he / is / in history / #6 / counter for people
        '他', '是', '历史上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统',
    ]
    eq_(tokenize(simplified, 'zh'), expected_fact)

    # Traditional Chinese input yields the same tokens as Simplified.
    eq_(tokenize(simplified, 'zh'), tokenize(traditional, 'zh'))

    assert_greater(word_frequency(traditional, 'zh'), 0)
def get_least_frequent_words(sentence, n):
    """Return the n least frequent words of a sentence, in sentence order.

    Websites, listed typographic symbols and anything found in
    string.punctuation are skipped. Words containing 'ghz' are given
    frequency 0.0 so they always rank as rarest.

    Parameters:
        sentence: iterable of word strings.
        n: number of words to keep.
    """
    ignored_symbols = ('•', '’', '”', '“', ')', '–', '»', '“')

    scored = []
    for position, word in enumerate(sentence):
        if is_a_website(word):
            continue
        if word in ignored_symbols or word in string.punctuation:
            continue
        # Hardcoded: anything mentioning 'ghz' is forced to the front.
        frequency = 0.0 if 'ghz' in word else word_frequency(word, 'en')
        scored.append((position, word, frequency))

    # Take the n rarest entries (stable sort on frequency) ...
    rarest = sorted(scored, key=lambda entry: entry[2])[0:n]

    # ... then restore the order in which the words appeared.
    in_order = sorted(((pos, word) for pos, word, _freq in rarest),
                      key=lambda pair: pair[0])
    return [word for _pos, word in in_order]
def clean_text(text):
    """Build altered variants of `text`.

    One variant is produced per flagged word (a word that is not purely
    alphabetic, or whose lowercase form is in `different_words`): the
    lowercased text with that word replaced by ' potato '. The last element
    is the text with every unknown word removed. A word is unknown when it is
    longer than two characters, absent from `counter` and has zero English
    frequency.

    Returns:
        list: the masked variants followed by the pruned text.
    """
    variants = []

    stripped = [piece.strip(',.!?:; ') for piece in text.split(" ")]
    unique = list(set(stripped))
    flagged = [
        candidate for candidate in unique
        if not candidate.isalpha() or candidate.lower() in different_words
    ]

    padded_lower = " " + text.lower() + " "
    for candidate in set(flagged):
        # Maybe unkify?
        pattern = (r'[^a-zA-Z0-9]' + re.escape(candidate.lower())
                   + r'[^a-zA-Z0-9]')
        variants.append(re.sub(pattern, ' potato ', padded_lower).strip())

    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    # Try removing all unknown words
    for token in set(tokens):
        lowered = token.lower()
        if (lowered not in counter
                and word_frequency(lowered, "en") == 0
                and len(token) > 2):
            text = text.replace(token, '')

    variants.append(text)
    return variants
def uncommon_words(n1, n2):
    """Report the uncommon words that the 'value' fields of n1 and n2 share.

    A word counts as uncommon when it is longer than three characters,
    contains at least one ASCII letter, and its English frequency in the
    'large' wordlist is below the module-level threshold `thr`.

    Returns:
        dict: {'outcome': True, 'words': [...]} with the shared words sorted
        longest first, or {'outcome': False} when there are none.
    """
    per_input = []
    for node in (n1, n2):
        tokens = tokenize(node['value'], funcs_word=[lower])

        # Keep words that are long enough and contain a letter.
        candidates = [
            token for token in tokens
            if len(token) > 3 and re.search("[a-zA-Z]", token)
        ]

        # Look each distinct word up once.
        frequencies = {}
        for token in candidates:
            if token not in frequencies:
                frequencies[token] = wordfreq.word_frequency(
                    token, 'en', wordlist='large')

        per_input.append(
            [token for token, freq in frequencies.items() if freq < thr])

    shared = set(per_input[0]) & set(per_input[1])
    if len(shared) == 0:
        return {'outcome': False}

    return {
        'outcome': True,
        'words': sorted(list(shared), key=len, reverse=True),
    }
def remove_SE_comment(text, model, features, tf_idf_counter):
    """Report whether altering suspicious words flips the model to class 0.

    Two strategies are tried in order:
      1. For each flagged word (not purely alphabetic, or listed in
         `different_words`), mask it with ' potato ' in the lowercased text
         and re-score.
      2. Remove every unknown word (longer than two characters, absent from
         `counter`, zero English frequency) and re-score the pruned text.

    Parameters:
        text: the comment text.
        model: object whose `predict` takes a list of feature vectors.
        features: passed through to `rescore`.
        tf_idf_counter: passed through to `rescore`.

    Returns:
        int: 1 if any altered text is predicted as class 0, otherwise 0.
    """
    stripped = [piece.strip(',.!?:; ') for piece in text.split(" ")]
    unique = list(set(stripped))
    flagged = [
        word for word in unique
        if not word.isalpha() or word.lower() in different_words
    ]

    for word in set(flagged):
        # Maybe unkify?
        masked = re.sub(
            r'[^a-zA-Z0-9]' + re.escape(word.lower()) + r'[^a-zA-Z0-9]',
            ' potato ', text.lower())
        masked_features = rescore(masked, features, tf_idf_counter)
        if model.predict([masked_features])[0] == 0:
            return 1

    tokenizer = RegexpTokenizer(r'\w+')

    # Try removing all unknown words
    for word in set(tokenizer.tokenize(text)):
        lowered = word.lower()
        if (lowered not in counter
                and word_frequency(lowered, "en") == 0
                and len(word) > 2):
            text = text.replace(word, '')

    # Score the pruned text itself. The earlier code reused the feature
    # vector left over from the masking loop, so it never looked at the
    # pruned text and raised NameError when no word had been flagged.
    pruned_features = rescore(text, features, tf_idf_counter)
    if model.predict([pruned_features])[0] == 0:
        return 1
    return 0
def __init__(self):
    """Load the pickled resources and initialise the classifier state.

    Besides setting instance attributes, this publishes `counter` and
    `different_words` as module-level globals, which other functions in this
    module read.

    NOTE: the pickle files are loaded with `pickle.load`, which can execute
    arbitrary code. Only use pickles from a trusted source.
    """
    global different_words
    global counter

    self.features = []
    self.nice_features = []
    self.parameter_names = []
    self.hyper_parameters_lists = []
    self.last_time = time.time()
    self.tf_idf_counter = 0
    self.use_filters = True

    # Each file is opened in a context manager so its handle is closed
    # as soon as the pickle has been read.
    with open("pickles/github_words.p", "rb") as handle:
        self.counter = pickle.load(handle)
    counter = self.counter

    # English frequency of every counted word, scaled by 10**9.
    self.our_words = {
        word: word_frequency(word, "en") * 10**9 for word in self.counter
    }
    self.different_words = log_odds(defaultdict(int, self.counter),
                                    defaultdict(int, self.our_words))
    different_words = self.different_words

    with open("pickles/anger.p", "rb") as handle:
        self.anger_classifier = pickle.load(handle)
    with open("pickles/all_words.p", "rb") as handle:
        self.all_words = pickle.load(handle)

    self.m = sum(self.counter.values())
    self.all_false = {word: False for word in self.all_words}

    self.alpha = 0.1
    self.all_train_data = None
    self.test_data = None
    self.train_data = None
    self.model_function = None
def word_list(filename):
    """Return the 30 words most over-represented in a file versus general English.

    Characters in EXCLUDE are dropped and text is lowercased before counting
    space-separated words. Each word's score is its percentage in the file
    minus CONTROL_MULTIPLY times its percentage in English (wordfreq 'small'
    list).

    Returns:
        str: two newlines, the filename, a newline, the top words joined by
        ', ', then two newlines.
    """
    total = 0
    occurrences = {}

    with open(filename) as handle:
        for line in handle:
            kept = ''.join(ch for ch in line if ch not in EXCLUDE)
            for word in kept.strip('\n').lower().split(' '):
                if word in ('', "'"):
                    continue
                total += 1
                occurrences[word] = occurrences.get(word, 0) + 1

    scored = []
    for word, count in occurrences.items():
        in_text = count / total * 100  # Percent in text
        in_english = word_frequency(word, 'en', wordlist='small') * 100  # Percent in english
        scored.append((word, float(in_text - CONTROL_MULTIPLY * in_english)))

    # Highest score first; the sort is stable, so ties keep first-seen order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    top_words = [word for word, _score in scored][:30]

    return '\n\n' + filename + '\n' + ', '.join(top_words) + '\n\n'
def term_freq(term):
    """
    Estimate how frequent a term is, using the 'wordfreq' library.

    This value serves as the cutoff that decides which words make it into
    the vocabulary when miniaturizing. English has the most data, so English
    terms are looked up in the 'large' list, whose frequencies can go below
    1e-6. Other core languages use the default list, and terms in any other
    language get a frequency of 0.
    """
    _prefix, language, text = split_uri(term)[:3]

    if language == 'en':
        frequency = wordfreq.word_frequency(text, 'en', 'large')
    elif language in CORE_LANGUAGES:
        frequency = wordfreq.word_frequency(text, language)
    else:
        frequency = 0.
    return frequency
def test_languages():
    """More than 15 languages are available and each knows a laughter word."""
    # Guard against the language count shrinking.
    languages = available_languages()
    assert_greater(len(languages), 15)

    for code in languages:
        # '笑' stands in for 'lol' in Chinese and Japanese.
        laugh = '笑' if code in {'zh', 'ja'} else 'lol'
        assert_greater(word_frequency(laugh, code), 0)

        # An overly verbose language code must still resolve.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        assert_greater(word_frequency(laugh, verbose_code), 0)
def test_language_matching():
    """Several Chinese language codes give the same frequency as plain 'zh'."""
    baseline = word_frequency('的', 'zh')

    variants = ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn')
    for code in variants:
        eq_(word_frequency('的', code), baseline)
def test_language_matching():
    """Regional, script and related codes match the 'zh' wordlist."""
    expected = word_frequency("的", "zh")

    # Region codes, script codes, then Cantonese and Mandarin.
    for language_code in ["zh-TW", "zh-CN", "zh-Hant", "zh-Hans", "yue-HK", "cmn"]:
        actual = word_frequency("的", language_code)
        eq_(actual, expected)
def test_language_matching():
    """Each listed Chinese language code gives the frequency of plain 'zh'."""
    reference = word_frequency('的', 'zh')

    for code in ('zh-TW', 'zh-CN', 'zh-Hant', 'zh-Hans', 'yue-HK', 'cmn'):
        assert word_frequency('的', code) == reference
def test_languages():
    """More than 15 languages are available and each knows a laughter word."""
    # Guard against the language count shrinking.
    languages = available_languages()
    assert_greater(len(languages), 15)

    # Laughter words for languages where 'lol' is not used.
    special_laughter = {'zh': '笑', 'ja': '笑', 'ar': 'ههههه'}

    for code in languages:
        laugh = special_laughter.get(code, 'lol')
        assert_greater(word_frequency(laugh, code), 0)

        # An overly verbose language code must still resolve.
        verbose_code = '%s-001-x-fake-extension' % code.upper()
        verbose_freq = word_frequency(laugh, verbose_code)
        assert_greater(verbose_freq, 0, (laugh, verbose_code))
def test_tokens():
    """Tokenize Chinese text with the built-in and with Jieba's own wordlist."""
    # Garret Hobart, or "jiā lè tè huò bā tè" — a name with unusual
    # syllable combinations.
    name = '加勒特·霍巴特'

    # "He was the sixth American vice president to die in office."
    simplified = '他是历史上第六位在任期内去世的美国副总统。'
    traditional = '他是歷史上第六位在任期內去世的美國副總統。'

    # With the default wordlist the name splits into five tokens and the
    # dot is dropped.
    assert tokenize(name, 'zh') == ['加', '勒', '特', '霍', '巴特']

    default_tokens = [
        # he / is / history / in / #6 / counter for people
        '他', '是', '历史', '上', '第六', '位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统',
    ]
    assert tokenize(simplified, 'zh') == default_tokens

    # Jieba's own wordlist keeps each half of the name together.
    assert tokenize(name, 'zh', external_wordlist=True) == ['加勒特', '霍巴特']

    # On the sentence it differs only in treating "sixth person" as one token.
    external_tokens = [
        # he / is / history / in / sixth person
        '他', '是', '历史', '上', '第六位',
        # during / term of office / in / die
        '在', '任期', '内', '去世',
        # of / U.S. / deputy / president
        '的', '美国', '副', '总统',
    ]
    assert tokenize(simplified, 'zh', external_wordlist=True) == external_tokens

    # Traditional Chinese input is supported at all.
    assert word_frequency(traditional, 'zh') > 0

    # Both scripts split at the same positions, with different characters.
    simp_tokens = tokenize(simplified, 'zh', include_punctuation=True)
    trad_tokens = tokenize(traditional, 'zh', include_punctuation=True)
    assert ''.join(simp_tokens) == simplified
    assert ''.join(trad_tokens) == traditional
    assert list(map(len, simp_tokens)) == list(map(len, trad_tokens))
def fuzz(self):
    """
    Compute an arbitrarily-scaled "fuzziness" score for the query tokens,
    where low is focused and high is fuzzy.

    The score is the product of each hash token's English word frequency
    (floored at 1e-6), multiplied by 1e10. With no tokens the product is the
    empty product 1, so the score is 1e10.

    Returns: float
    """
    freqs = [
        word_frequency(t, 'en', minimum=1e-6)
        for t in self.hash_tokens
    ]

    # Seed the fold with 1 so an empty token list yields the empty product
    # instead of raising TypeError. Multiplying by 1 is exact, so non-empty
    # results are unchanged.
    return reduce(lambda x, y: x*y, freqs, 1)*1e10
def test_freq_examples():
    """Spot-check specific frequencies from several named wordlists."""
    # (positional arguments for word_frequency, expected value)
    # NOTE(review): the meaning of the fourth positional argument is not
    # visible here — confirm against the word_frequency signature in use.
    exact_cases = [
        (('normalization', 'en', 'google-books'), 1.767e-6),
        (('normalization', 'en', 'google-books', 1e-6), 2.767e-6),
        (('normalisation', 'fr', 'leeds-internet'), 4.162e-6),
    ]
    for args, expected in exact_cases:
        assert_almost_equal(word_frequency(*args), expected, places=9)

    twitter_lol = word_frequency('lol', 'xx', 'twitter')
    books_lol = word_frequency('lol', 'en', 'google-books')
    assert_greater(twitter_lol, books_lol)

    eq_(word_frequency('totallyfakeword', 'en', 'multi', .5), .5)
def test_combination():
    """A doubled Chinese word has one twentieth of the single word's frequency."""
    single = word_frequency('谢谢', 'zh')  # "Thanks"
    doubled = word_frequency('谢谢谢谢', 'zh')
    assert_almost_equal(doubled, single / 20)
def test_freq_examples():
    """Stopwords rank highest in their own language; rare words are still found."""
    # 'the' beats 'de' in English ...
    the_en, de_en = word_frequency('the', 'en'), word_frequency('de', 'en')
    assert the_en > de_en

    # ... and 'de' beats 'the' in Spanish.
    de_es, the_es = word_frequency('de', 'es'), word_frequency('the', 'es')
    assert de_es > the_es

    # The 'large' list is consulted when available, so this rare word is found.
    assert word_frequency('infrequency', 'en') > 0.
def test_minimums():
    """The `minimum` argument puts a floor under the returned frequency."""
    # An unknown word has frequency 0 unless a minimum is given.
    unknown = 'esquivalience'
    assert word_frequency(unknown, 'en') == 0
    assert word_frequency(unknown, 'en', minimum=1e-6) == 1e-6

    # The floor also applies to known words.
    assert word_frequency('the', 'en', minimum=1) == 1
def test_phrase_freq():
    """The reciprocal of a phrase's frequency is the sum of its parts' reciprocals (within 1%)."""
    phrase = word_frequency("flip-flop", 'en')
    assert phrase > 0

    flip_inverse = 1.0 / word_frequency('flip', 'en')
    flop_inverse = 1.0 / word_frequency('flop', 'en')
    assert 1.0 / phrase == pytest.approx(flip_inverse + flop_inverse, rel=0.01)
def test_phrase_freq():
    """A two-word phrase's inverse frequency is the sum of its words' inverses."""
    whole = word_frequency("flip-flop", "en")
    assert_greater(whole, 0)

    parts = [word_frequency(part, "en") for part in ("flip", "flop")]
    expected_inverse = 1.0 / parts[0] + 1.0 / parts[1]
    assert_almost_equal(1.0 / whole, expected_inverse)
def test_freq_examples():
    """Stopwords are most common in the language they belong to."""
    # (language, native stopword, foreign stopword)
    for language, native, foreign in (("en", "the", "de"), ("es", "de", "the")):
        native_freq = word_frequency(native, language)
        foreign_freq = word_frequency(foreign, language)
        assert_greater(native_freq, foreign_freq)
def test_twitter():
    """More than 12 Twitter languages exist, and 'rt' is more frequent on Twitter."""
    twitter_languages = available_languages("twitter")
    language_count = len(twitter_languages)
    assert_greater(language_count, 12)

    for lang_code in twitter_languages:
        rt_on_twitter = word_frequency("rt", lang_code, "twitter")
        rt_overall = word_frequency("rt", lang_code, "combined")
        assert_greater(rt_on_twitter, rt_overall)
def test_minimums():
    """`minimum` acts as a lower bound on the reported frequency."""
    made_up = "esquivalience"

    # Without a floor an unknown word scores 0.
    eq_(word_frequency(made_up, "en"), 0)

    # With a floor, the floor is returned.
    eq_(word_frequency(made_up, "en", minimum=1e-6), 1e-6)

    # A floor above the real frequency wins for known words too.
    eq_(word_frequency("the", "en", minimum=1), 1)
def test_at_in_corpus():
    """'l@s' is a word in the Spanish list, not a split at the '@' sign."""
    # The token with '@' has its own frequency.
    assert word_frequency('l@s', 'es') > 0

    # If '@' were a word break, this would equal the frequency of "l s".
    joined = word_frequency('l@s', 'es')
    split = word_frequency('l s', 'es')
    assert joined < split
def test_combination():
    """A doubled Chinese word has 1/20 of the single word's frequency (within 1%)."""
    single = word_frequency('谢谢', 'zh')  # "Thanks"
    doubled = word_frequency('谢谢谢谢', 'zh')
    assert doubled == pytest.approx(single / 20, rel=0.01)