def __init__(self, params={}):
    exclude = params.get("exclude_words", "exclude.txt")
    include = params.get("include_words", "gulordava_data/vocab.txt")
    dict = wordfreq.get_frequency_dict('en')
    keys = dict.keys()
    self.words = []
    exclusions = []
    if exclude is not None:
        with open(exclude, "r", encoding="utf-8") as f:
            for line in f:
                word = line.strip()
                exclusions.append(word)
    inclusions = []
    if include is not None:
        with open(include, "r", encoding="utf-8") as f:
            for line in f:
                word = line.strip()
                inclusions.append(word)
        words = list(set(inclusions) & set(keys) - set(exclusions))
    else:
        words = list(set(keys) - set(exclusions))
    for word in words:
        if re.match("^[a-z]*$", word):
            freq = math.log(
                dict[word] * 10**9
            )  # we canonically calculate frequency as log occurrences/1 billion words
            self.words.append(distractor(word, freq))
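# The freq computed above is the natural log of occurrences per billion words.
# wordfreq's own zipf_frequency gives the base-10 analogue directly; a minimal
# sketch of the correspondence (the word 'example' is only an illustration):
import math
import wordfreq

per_billion = wordfreq.get_frequency_dict('en')['example'] * 10**9
print(math.log(per_billion))                     # natural-log scale, as in __init__ above
print(math.log10(per_billion))                   # base-10 scale
print(wordfreq.zipf_frequency('example', 'en'))  # ~ log10(per_billion), rounded to 2 decimals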
def __init__(self, params={}):
    exclude = params.get("exclude_words", "exclude.txt")
    include = params.get("include_words", "french_data/frwac_vocab.txt")  # list of the model's vocab
    dict = wordfreq.get_frequency_dict('fr')
    keys = dict.keys()
    self.words = []
    exclusions = []
    if exclude is not None:
        with open(exclude, "r", encoding="utf-8") as f:
            for line in f:
                word = line.strip()
                exclusions.append(word)
    inclusions = []
    if include is not None:
        with open(include, "r", encoding="utf-8") as f:
            for line in f:
                word = line.strip()
                inclusions.append(word)
        words = list(set(inclusions) & set(keys) - set(exclusions))
    else:
        words = list(set(keys) - set(exclusions))
    for word in words:
        if re.match("^[a-zçéâêîôûàèùëïü]*$", word):  # what I believe to be a complete set of French characters
            freq = math.log(
                dict[word] * 10**9
            )  # we canonically calculate frequency as log occurrences/1 billion words
            self.words.append(distractor(word, freq))
def main():
    letters = Counter()
    _2gram = Counter()
    _3gram = Counter()
    for word, freq in wordfreq.get_frequency_dict('en', wordlist='best').items():
        sofar = '…'  # we include a boundary marker at the beginning of the word
        for letter in word.upper():
            if letter not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
                sofar = ''
                continue
            letters[letter] += freq / (len(word))
            sofar += letter
            if len(sofar) > 1:
                _2gram[sofar[-2:]] += freq / (len(word) + 1)
            if len(sofar) > 2:
                _3gram[sofar[-3:]] += freq / (len(word))
        sofar += '…'
        if len(sofar) > 1:
            _2gram[sofar[-2:]] += freq / (len(word) + 1)
        if len(sofar) > 2:
            _3gram[sofar[-3:]] += freq / (len(word))
    with open("1grams.txt", "w") as out:
        for gram, count in letters.most_common():
            out.write(gram + ' ' + str(count) + '\n')
    with open("2grams.txt", "w") as out:
        for gram, count in _2gram.most_common():
            out.write(gram + ' ' + str(count) + '\n')
    with open("3grams.txt", "w") as out:
        for gram, count in _3gram.most_common(6000):
            out.write(gram + ' ' + str(count) + '\n')
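# The script above writes frequency-weighted n-gram counts to plain text files.
# A small sketch of reading one of them back (assumes the "2grams.txt" layout
# written above: the gram, a space, then the count):
from collections import Counter

bigrams = Counter()
with open("2grams.txt") as f:
    for line in f:
        gram, count = line.rstrip('\n').rsplit(' ', 1)
        bigrams[gram] = float(count)

print(bigrams.most_common(5))  # most frequent letter pairs; '…' marks a word boundary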
def add_freq_data():
    """ Add table of frequencies to DB """
    session = get_session()
    metadata.create_all(session().get_bind().engine)
    with click.progressbar(
            wordfreq.get_frequency_dict("fi").items(),
            label="Inserting frequencies") as name_freqs:
        for name, freq in name_freqs:
            insert(session, freqs, name=name, freq=freq)
    session.commit()
def _precompute_sif_weights(self, wv, alpha=1e-3, no_frequency=False, lang="en"):
    """Precompute the weights used in the vector summation

    Parameters
    ----------
    wv : `~gensim.models.keyedvectors.BaseKeyedVectors`
        A gensim keyedvectors child that contains the word vectors and the vocabulary
    alpha : float, optional
        Parameter which is used to weigh each individual word based on its probability p(w).
        If alpha = 0, the model computes the average sentence embedding. Common values range
        from 1e-5 to 1e-1. For more information, see the original paper.
    no_frequency : bool, optional
        Use the commonly available frequency table if the Gensim model does not contain
        information about the frequency of the words (see model.wv.vocab.count).
    lang : str, optional
        Determines the language of the frequency table used to compute the weights.

    Returns
    -------
    numpy.ndarray
        The vector of weights for all words in the model vocabulary

    """
    logger.info("pre-computing SIF weights")

    if no_frequency:
        logger.info("no frequency mode: using wordfreq for estimation (lang=%s)", lang)
        freq_dict = get_frequency_dict(str(lang), wordlist='best')

        for w in wv.index2word:
            if w in freq_dict:
                wv.vocab[w].count = int(freq_dict[w] * (2 ** 31 - 1))
            else:
                wv.vocab[w].count = 1

    if alpha > 0:
        corpus_size = 0
        # Set the dtype correct for cython estimation
        sif = zeros(shape=len(wv.vocab), dtype=REAL)

        for k in wv.index2word:
            # Compute normalization constant
            corpus_size += wv.vocab[k].count

        for idx, k in enumerate(wv.index2word):
            pw = wv.vocab[k].count / corpus_size
            sif[idx] = alpha / (alpha + pw)
    else:
        sif = ones(shape=len(wv.vocab), dtype=REAL)

    return sif
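# The weighting above is the smooth inverse frequency alpha / (alpha + p(w)) described
# in the docstring. A self-contained sketch of the same formula driven directly by
# wordfreq (here p(w) comes from wordfreq's global distribution rather than the model
# vocabulary, so the values will differ slightly from the method above):
import numpy as np
from wordfreq import get_frequency_dict

def sif_weights(words, alpha=1e-3, lang="en"):
    freq = get_frequency_dict(lang, wordlist="best")
    probs = np.array([freq.get(w, 1e-9) for w in words])  # unseen words get a tiny probability
    return alpha / (alpha + probs)

print(sif_weights(["the", "sentence", "embedding"]))  # very frequent words get weights close to 0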
def _induce_frequencies(self, domain: int = 2**31 - 1):
    """ Induce frequencies for a pretrained model, as not all pretrained models come with frequencies.

    Parameters
    ----------
    domain : int
        The cumulative count of the vocabulary.

    """
    freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
    for word in self.wv.index2word:
        if word in freq_dict:
            self.wv.vocab[word].count = int(freq_dict[word] * domain)
        else:
            self.wv.vocab[word].count = int(1e-8 * domain)
import pickle
import gzip

import wordfreq as wf

# store
with gzip.open('freq_en.pickle.gz', 'wb') as f:
    pickle.dump(wf.get_frequency_dict('en', wordlist='large'), f, protocol=2)  # Python 2.x compatible

# load
with gzip.open('freq_en.pickle.gz', 'rb') as f:
    freq_en = pickle.load(f)

print(freq_en['the'])  # should be 0.03890451449942807
def test_model_w_language(self):
    se = BaseSentence2VecModel(W2V, lang_freq="en")
    freq = int((2**31 - 1) * get_frequency_dict("en", wordlist="best")["help"])
    self.assertEqual(freq, se.wv.vocab["help"].count)
    self.assertEqual(21, se.wv.vocab["79"].count)
def build_vocab(present_words, *dicts):
    v = init_vocab()
    for d in dicts:
        append_vocab(d, v, present_words)
    return (v)


import wordfreq
from stop_words import get_stop_words

STOP_WORDS = get_stop_words('en')

wf = list(
    sorted(wordfreq.get_frequency_dict('en').items(), key=lambda x: -x[1]))
present_words, _ = zip(*wf[:15000])
common_present_words = set(x.upper() for x in present_words)

with open(cmudict_file) as cmu_dict_file_desc:
    full_vocabulary = build_vocab(common_present_words, cmu_dict_file_desc)


def pronounce(string, pronounciations):
    ps = [[]]
    for w in string.upper().split():
        pn = []
        for p in ps:
            if p:
                p.append(' ')
            for pi in pronounciations[w]:
'''
https://pypi.org/project/wordfreq/#description
'''
from wordfreq import zipf_frequency, get_frequency_dict

'''
f = zipf_frequency('frequency', 'en')
print('word: "{0}"\tfrequency: {1}'.format('frequency', f))
'''

d = get_frequency_dict('en', wordlist='best')

'''
f = open('e-5 4e-6_.word', 'w')  # frequency > 0.0001, words = 1068
cnt = 0
for w in d:
    if d[w] > 0.000004 and d[w] < 0.00001:
        cnt += 1
        f.write('{0} {1}\n'.format(w, d[w]))
f.close()
print("Written {} lines into freq.word.".format(cnt))
'''


def get_wordfreq(frange):
    fname, whigh, wlow = frange
    f = open('{}.freq.word'.format(fname), 'w')
    cnt = 0
    for w in d:
        # loop body reconstructed from the commented-out block above:
        # keep words whose frequency falls inside (wlow, whigh)
        if d[w] > wlow and d[w] < whigh:
            cnt += 1
            f.write('{0} {1}\n'.format(w, d[w]))
    f.close()
    print("Written {} lines into {}.freq.word.".format(cnt, fname))
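# A possible call, assuming the (fname, whigh, wlow) tuple layout unpacked above
# (the file name and bounds here are only an illustration):
get_wordfreq(('1e-5_4e-6', 1e-5, 4e-6))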
def compute_wordfreq_score(masked_word, lang):
    freqs = wordfreq.get_frequency_dict(lang)
    return freqs[masked_word]
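# freqs[masked_word] above raises a KeyError when the word is not in wordfreq's table.
# A minimal alternative sketch using wordfreq.word_frequency, which returns 0.0 for
# out-of-vocabulary words and avoids rebuilding the full dict on every call:
import wordfreq

def compute_wordfreq_score_safe(masked_word, lang):
    return wordfreq.word_frequency(masked_word, lang)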