def detect_freqwords_method(text):
    words = set()
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word not in punctuation:
                words.add(word)
    if len(words.intersection(top_n_list('de', 30))) > len(words.intersection(top_n_list('es', 30))):
        return "German"
    else:
        return "Spanish"

def eval_analogies(frame):
    filename = get_support_data_filename('google-analogies/questions-words.txt')
    quads = read_google_analogies(filename)
    vocab = [
        standardized_uri('en', word)
        for word in wordfreq.top_n_list('en', 200000)
    ]
    wrap = VectorSpaceWrapper(frame=frame)
    vecs = np.vstack([wrap.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        vector = analogy_func(frame, *prompt)
        similar = similar_to_vec(tframe, vector)
        result = None
        for match in similar.index:
            if match not in prompt:
                result = match
                break
        if result == answer:
            correct += 1
        else:
            if result not in seen_mistakes:
                print(
                    "%s : %s :: %s : [%s] (should be %s)" %
                    (quad[0], quad[1], quad[2], result, answer)
                )
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    return pd.Series([correct / total, low, high], index=['acc', 'low', 'high'])

def build_tokens(self):
    """
    Get a set of whitelisted tokens.

    Returns: set
    """
    tokens = top_n_list('en', self['token_depth'], ascii_only=True)
    return set(tokens)

def remove_stop_words_hebrew_extended(text):
    from wordfreq import top_n_list

    words = text.split(' ')
    # Build the Hebrew vocabulary once, instead of regenerating the top-100000
    # list for every (word, stop word) pair inside the loops below.
    hebrew_vocab = set(top_n_list('he', 100000))
    newword = []
    for word in words:
        for sw in hebrew_stopwords_ex:
            # Strip a prefixed stop word when the remainder is a known Hebrew word.
            if word.startswith(sw) and word[len(sw):] in hebrew_vocab:
                word = word[len(sw):]
                break
        if word not in hebrew_stopwords_ex:
            newword += [word]
    return " ".join(newword)

def export_conceptnet_to_hyperwords(table, matrix_filename, vocab_filename, nrows):
    vecs = []
    labels = []
    english_labels = [
        standardized_uri('en', item)
        for item in wordfreq.top_n_list('en', nrows * 2, 'large')
    ]
    count = 0
    for label in english_labels:
        if label in table.index:
            labels.append(label.split('/')[-1])
            vecs.append(get_vector(table, label))
            count += 1
            if count >= nrows:
                break
    np.save(matrix_filename, np.vstack(vecs))
    save_index_as_labels(labels, vocab_filename)

def main():
    top_words = top_n_list('en', 50000)
    dic = {}
    words_skipped = []
    counter = 0
    for word in tqdm(top_words):
        # Periodically checkpoint progress to disk.
        if counter % 50 == 0:
            with open('data_chinese.json', 'w') as fp:
                json.dump(dic, fp)
            with open('missing_words_chinese.json', 'w') as wp:
                json.dump(words_skipped, wp)
        try:
            dic[word] = '#' + word_to_color(word)
        except:
            print("skip word '{}'".format(word))
            time.sleep(5)
            words_skipped.append(word)
        counter += 1

def make_clozes(sentence_file: str, language: str, nwords: int,
                max_sentences_per_word: int, max_translations_per_sentence: int,
                max_characters: int) -> None:
    sentence_map = OrderedDict()
    word_map = {}
    with open(sentence_file) as fh:
        while True:
            line = fh.readline()
            if line == "":
                break
            sentence_l1, sentence_l2 = line.strip().split("\t")
            if sentence_l2 not in sentence_map:
                sentence_map[sentence_l2] = []
            sentence_map[sentence_l2].append(sentence_l1)
            words = sentence_to_words(sentence_l2)
            for word in words:
                if word not in word_map:
                    # We use an OrderedDict as an ordered set, since the
                    # Python standard library lacks the latter.
                    word_map[word] = OrderedDict()
                if sentence_l2 not in word_map[word]:
                    word_map[word][sentence_l2] = None
    # We request the top n*2 words, to be sure of having n words left
    # after filtering out the undesirable ones.
    top_n_words = filter_word_list(wordfreq.top_n_list(language, nwords * 2), nwords)
    writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
    for word in top_n_words:
        if word in word_map:
            sentences_l2 = list(word_map[word])[:max_sentences_per_word]
            for sentence_l2 in sentences_l2:
                translation_list = \
                    sentence_map[sentence_l2][:max_translations_per_sentence]
                cloze = make_anki_cloze(sentence_l2, word)
                translations = " / ".join(translation_list)
                if len(cloze) <= max_characters and \
                        len(translations) <= max_characters:
                    writer.writerow(
                        ["<p>%s</p><p>%s</p>" % (cloze, translations)])

def choose_vocab(quads, vocab_size):
    """
    Google and Bats analogies are not multiple-choice; instead, you're
    supposed to pick the best match out of your vector space's entire
    vocabulary, excluding the three words used in the prompt. The vocabulary
    size can matter a lot: Set it too high and you'll get low-frequency words
    that the data set wasn't looking for as answers. Set it too low and the
    correct answers won't be in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    return vocab

""" Module with some text processing functions used by the wrappers. """ import re from typing import Tuple from wordfreq import top_n_list TOKENS_REG = re.compile(r"(?u)\b\w+\b") STOP_WORDS = set(top_n_list("en", 50, wordlist='best')) # 50 most common stop words def locate_short_forms(note: str, short_form_list: list) -> Tuple[list, list, list]: """ Find if token in the short forms list, store token, span and location. If the token happen to be a short form and a stop word, it won't get counted. """ locations: list = [] short_forms_intext: list = [] span: list = [] for i, token in enumerate(TOKENS_REG.finditer(note)): if token.group() in short_form_list and token.group( ) not in STOP_WORDS: locations.append(i) short_forms_intext.append(token.group()) span.append(token.span()) return short_forms_intext, span, locations
import os
import bz2
import json

from wordfreq import top_n_list
from difflib import SequenceMatcher
from collections import namedtuple

from quotes.utils import clean_text


Token = namedtuple('Tuple', [
    'token',
    'char1',
    'char2',
])

blacklist = set(top_n_list('en', 200))


class Text:

    @classmethod
    def from_stacks(cls, path: str):
        """Read from a Stacks JSON file.
        """
        with bz2.open(path, 'rt') as fh:
            metadata = json.loads(fh.read())

        text = metadata.pop('plain_text')

    'ms',  # Malay
    'nb',  # Norwegian
    'fa',  # Persian
    'pl',  # Polish
    'pt',  # Portuguese
    'ro',  # Romanian
    'ru',  # Russian
    'sr',  # Serbian
    'es',  # Spanish
    'sv',  # Swedish
    'tr',  # Turkish
    'uk',  # Ukrainian
]

for language_code in language_codes:
    print('Processing %s' % language_code)

    # Full frequency list (the huge n effectively means "all available words").
    file_name = 'frequency-lists/%s-freq.txt' % language_code
    top_n = top_n_list(language_code, 1000000000)
    with open(file_name, 'w') as file:
        for word in top_n:
            file.write('%s\n' % word)

    # Top-2000 list.
    file_name_2000 = 'frequency-lists-2000/%s-freq-2000.txt' % language_code
    top_2000 = top_n_list(language_code, 2000)
    with open(file_name_2000, 'w') as file_2000:
        for word in top_2000:
            file_2000.write('%s\n' % word)

def __iter__(self):
    words = top_n_list(lang='en', n=self._num_examples)
    for w in words:
        image = self.create_image(w)
        yield image, w

""" A quick script to output the top N words (1000 for now) in each language. You can send the output to a file and diff it to see changes between wordfreq versions. """ import wordfreq N = 1000 if __name__ == '__main__': for lang in sorted(wordfreq.available_languages()): for word in wordfreq.top_n_list(lang, 1000): print('{}\t{}'.format(lang, word))
def __init__(self, nWords):
    self.__nWords = nWords
    self.__words = top_n_list('en', self.__nWords, wordlist='large')
    # sys.stdout.buffer.write(str(self.__words).encode('utf-8'))
    self.__frequencies = numpy.array(
        [word_frequency(w, 'en') for w in self.__words])

import json

from wordfreq import zipf_frequency, top_n_list

jsondict = {
    word: zipf_frequency(word, "en")
    for word in top_n_list('en', 20000)
}

with open('english_zipf.json', 'w') as outfile:
    json.dump(jsondict, outfile)

""" A quick script to output the top N words (1000 for now) in each language. You can send the output to a file and diff it to see changes between wordfreq versions. """ import wordfreq N = 1000 for lang in sorted(wordfreq.available_languages()): for word in wordfreq.top_n_list(lang, 1000): print('{}\t{}'.format(lang, word))
import os

import pandas as pd
from wordfreq import top_n_list

# data_path = '/Users/chenfish/Desktop/Thesis/Project/data/mt_pe/dev/'
data_path = '/Users/yuwen/Desktop/Thesis/Project/data/ht_pe/all_no_split/mtht/'

print(data_path)
print('We are working on 5000 word rank.')

for i in os.listdir(data_path):
    if i[-2:] == 'en':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('en', 5000)
    elif i[-2:] == 'de':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('de', 5000)
    elif i[-2:] == 'ru':
        data = pd.read_pickle(data_path + i)
        print('Now we are working on', i)
        top_rank = top_n_list('ru', 5000)

import json
import numpy as np
import re
from collections import Counter
import pickle
import torch
import torchtext.vocab as vocab

glove = vocab.GloVe(name='840B', dim=300)

from wordfreq import word_frequency, top_n_list
import time

# print(real_word, get_word(real_word))
top_words = top_n_list('en', 200000)


def get_word(word):
    return glove.vectors[glove.stoi[word]].numpy()


def word2vec(pkl_file):
    _file = open(pkl_file, "rb")
    data = pickle.load(_file)
    zero_embed = np.zeros(300)
    for d in data:
        print(d)
        json_file = d.replace('json', 'pkl')
        tmp_file = open(json_file, "rb")
        json_words = pickle.load(tmp_file)
        t1 = time.time()
        embeddings = []
        for real_word in json_words:

def eval_google_analogies(vectors, subset='semantic', vocab_size=200000,
                          verbose=False):
    """
    Evaluate the Google Research analogies, released by Mikolov et al. along
    with word2vec.

    These analogies come in two flavors: semantic and syntactic. Numberbatch
    is intended to be a semantic space, so we focus on semantic analogies.
    The syntactic analogies are about whether you can inflect or conjugate a
    particular word. The semantic analogies are about whether you can sort
    words by their gender, and about geographic trivia.

    I (Rob) think this data set is not very representative, but evaluating
    against it is all the rage.

    These analogies are not multiple-choice; instead, you're supposed to pick
    the best match out of your vector space's entire vocabulary, excluding
    the three words used in the prompt. The vocabulary size can matter a lot:
    Set it too high and you'll get low-frequency words that the data set
    wasn't looking for as answers. Set it too low and the correct answers
    won't be in the vocabulary.

    Set vocab_size='cheat' to see the results for an unrealistically optimal
    vocabulary (the vocabulary of the set of answer words).
    """
    filename = get_support_data_filename(
        'google-analogies/{}-words.txt'.format(subset))
    quads = read_google_analogies(filename)
    if vocab_size == 'cheat':
        vocab = [
            standardized_uri('en', word)
            for word in sorted(set([quad[3] for quad in quads]))
        ]
    else:
        vocab = [
            standardized_uri('en', word)
            for word in wordfreq.top_n_list('en', vocab_size)
        ]
    vecs = np.vstack([vectors.get_vector(word) for word in vocab])
    tframe = pd.DataFrame(vecs, index=vocab)
    total = 0
    correct = 0
    seen_mistakes = set()
    for quad in quads:
        prompt = quad[:3]
        answer = quad[3]
        result = best_analogy_3cosmul(vectors, tframe, *prompt)
        if result == answer:
            correct += 1
        else:
            if verbose and result not in seen_mistakes:
                print("%s : %s :: %s : [%s] (should be %s)" %
                      (quad[0], quad[1], quad[2], result, answer))
                seen_mistakes.add(result)
        total += 1
    low, high = proportion_confint(correct, total)
    result = pd.Series([correct / total, low, high],
                       index=['acc', 'low', 'high'])
    if verbose:
        print(result)
    return result

import string

import wordfreq

VALID_LETTERS = string.ascii_uppercase + "'"

for word in wordfreq.top_n_list('en', 1000000, 'best'):
    word = word.upper()
    if all(ch in VALID_LETTERS for ch in word):
        freq = int(wordfreq.word_frequency(word, 'en', 'best') * 1e9) - 9
        if freq > 0:
            print("{},{}".format(word, freq))

def get_most_common(lang):
    """
    Return the single most common word in the language.
    """
    return top_n_list(lang, 1)[0]

""" A quick script to output the top N words (500 for now) in each language. You can send the output to a file and diff it to see changes between wordfreq versions. """ import wordfreq N = 500 if __name__ == '__main__': for lang in sorted(wordfreq.available_languages()): for word in wordfreq.top_n_list(lang, N): print('{}\t{}'.format(lang, word))
def get_top_n(lang, start=0, end=100):
    if language_supported(lang):
        top_n = top_n_list(CODES[lang], end, wordlist='best')
        top_n = top_n[start:end]
        return top_n

import string

import wordfreq

VALID_LETTERS = string.ascii_uppercase + "'"

for word in wordfreq.top_n_list('en', 1000000, 'large'):
    word = word.upper()
    if all(ch in VALID_LETTERS for ch in word):
        freq = int(wordfreq.word_frequency(word, 'en', 'large') * 1e9)
        print("{},{}".format(word, freq))

from django.core import serializers
from .models import Question, Answer, Score
from .serializers import QuestionSerializer, UserSerializer, AnswerSerializer
from rest_framework.decorators import api_view
from rest_framework.response import Response
from rest_framework.reverse import reverse
from rest_framework import generics, permissions, renderers
import json
from django.db.models import F, Func
from django.db.models import Q
from .tokenIDHelper import *
from wordfreq import top_n_list
from .sentencesVerif import calc_distance

freq_list = top_n_list('en', 10000, wordlist='best')

THRESHOLD = 3.6  # threshold for the GateKeeper classifier - important!


@api_view(['GET'])
def api_root(request, format=None):
    """
    This function defines the root API of the project.
    """
    return Response({
        'users': reverse('Users-list', request=request, format=format),
        'Questions': reverse('Questions-list', request=request, format=format)
    })

def get_vocab(language):
    words = list(wordfreq.top_n_list(language, 100000))
    return set(words[100:])

import wordfreq
import math
import json

wordlist = [(w, round(math.log(wordfreq.word_frequency(w, 'en')), 4))
            for w in wordfreq.top_n_list("en", 25000)]

with open("wordfreq-en-25000-log.json", "w") as fh:
    json.dump(wordlist, fh, indent=2)

# uses wordfreq==2.3.2
import argparse

import wordfreq

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("languages", nargs="+")
    parser.add_argument("--out-pattern", type=str)
    parser.add_argument("--top_n", type=int, default=1_000)
    args = parser.parse_args()

    for language in args.languages:
        with open(args.out_pattern.format(language), "w") as f:
            # wordfreq lists lowercase all words
            for word in wordfreq.top_n_list(language, args.top_n):
                f.write(word + "\n")
                f.write(word.title() + "\n")

import click
import re

from wordfreq import top_n_list

from twitter.utils import get_spark


whitelist = set(top_n_list('en', 10000))


def tokenize_tweet(text):
    """Tokenize tweet text.
    """
    # Remove URLs.
    text = re.sub(r'http\S+', '', text)

    return re.findall(r'[a-z0-9#@]+', text.lower())


def count_tokens(tweet):
    """Generate (token, minute) keys.
    """
    for token in tokenize_tweet(tweet.text):
        if token in whitelist:
            yield ((tweet.key, token), 1)


@click.command()
@click.option('--src', default='data/states.parquet')
@click.option('--dest', default='data/state-word-counts.json')
def main(src, dest):

def parse_swedish():
    czech = read_languages_file('czech.txt')
    english = read_languages_file('english.txt')
    french = read_languages_file('french.txt')
    italian = read_languages_file('italian.txt')
    spanish = read_languages_file('spanish.txt')

    requested_word_count = 5000
    swedish_unfiltered = top_n_list('sv', requested_word_count)

    min_word_length = 3
    max_word_length = 8
    good_length = lambda w: min_word_length <= len(w) <= max_word_length
    used_by = lambda words_set, word: word in words_set
    # used_by_other_languages = lambda w: used_by(czech, w) or used_by(english, w) or used_by(french, w) or used_by(italian, w) or used_by(spanish, w)

    swedish = []
    # list(filter(lambda w: good_length(w) and not used_by_other_languages(w), swedish))
    for word in swedish_unfiltered:
        if not word.isalpha():
            print("🔠 skipped word: '{}', not alpha".format(word))
            continue
        if not good_length(word):
            continue
        if used_by(czech, word):
            print("🇨🇿 skipped word: '{}', used by Czech".format(word))
            continue
        if used_by(english, word):
            print("🇬🇧 skipped word: '{}', used by English".format(word))
            continue
        if used_by(french, word):
            print("🇫🇷 skipped word: '{}', used by French".format(word))
            continue
        if used_by(italian, word):
            print("🇮🇹 skipped word: '{}', used by Italian".format(word))
            continue
        if used_by(spanish, word):
            print("🇪🇸 skipped word: '{}', used by Spanish".format(word))
            continue
        swedish.append(word)

    with open("output_swedish.txt", "w") as text_file:
        print(f"{swedish}", file=text_file)

    print("🇸🇪 Outputted #{} words, after having filtered out #{}".format(
        len(swedish), requested_word_count - len(swedish)))

import logging
from string import punctuation

import daiquiri
from docopt import docopt
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
from lxml import html
from lxml.html.clean import clean_html
from sklearn import svm
from sklearn import preprocessing
import snowballstemmer
from wordfreq import top_n_list

daiquiri.setup(level=logging.INFO, outputs=("stderr",))
log = logging.getLogger(__name__)

stem = snowballstemmer.stemmer("english").stemWord
GARBAGE_TO_SPACE = dict.fromkeys((ord(x) for x in punctuation), " ")
STOP_WORDS = set(top_n_list("en", 800))
WORD_MIN_LENGTH = 2
WORD_MAX_LENGTH = 64  # sha2 length


def sane(word):
    return WORD_MIN_LENGTH <= len(word) <= WORD_MAX_LENGTH


def string2words(string):
    """Converts a string to a list of words.

    Removes punctuation, lowercases, and drops words strictly smaller than 2
    and strictly bigger than 64 characters.
    """

# Written by Bernardo Rodrigues ([email protected])
# Based on Luminoso Insight's wordfreq module (https://github.com/LuminosoInsight/wordfreq/)

from wordfreq import word_frequency
from wordfreq import zipf_frequency
from wordfreq import top_n_list
# import matplotlib.pyplot as plt

dest = "./wordlists/"

ar = top_n_list('ar', 1e5, wordlist='large')
de = top_n_list('de', 1e5, wordlist='large')
en = top_n_list('en', 1e5, wordlist='large')
es = top_n_list('es', 1e5, wordlist='large')
fi = top_n_list('fi', 1e5)
fr = top_n_list('fr', 1e5, wordlist='large')
hi = top_n_list('hi', 1e5)
it = top_n_list('it', 1e5, wordlist='large')
ja = top_n_list('ja', 1e5)
nl = top_n_list('nl', 1e5, wordlist='large')
sv = top_n_list('sv', 1e5)
pt = top_n_list('pt', 1e5, wordlist='large')
zh = top_n_list('zh', 1e5)

# ---------------------------------------------------------------

arPopular = open(dest + '/arPopular.txt', 'w')
arLongTail = open(dest + '/arLongTail.txt', 'w')

integral100 = 0
for i in range(len(ar)):
    integral100 += word_frequency(ar[i], 'ar')