Example 1
    def __init__(self, params={}):
        exclude = params.get("exclude_words", "exclude.txt")
        include = params.get("include_words", "gulordava_data/vocab.txt")
        freq_dict = wordfreq.get_frequency_dict('en')  # renamed so it doesn't shadow the builtin `dict`
        keys = freq_dict.keys()
        self.words = []
        exclusions = []

        if exclude is not None:
            with open(exclude, "r", encoding="utf-8") as f:
                for line in f:
                    word = line.strip()
                    exclusions.append(word)
        inclusions = []
        if include is not None:
            with open(include, "r", encoding="utf-8") as f:
                for line in f:
                    word = line.strip()
                    inclusions.append(word)
            words = list((set(inclusions) & set(keys)) - set(exclusions))
        else:
            words = list(set(keys) - set(exclusions))
        for word in words:
            if re.match("^[a-z]*$", word):  # keep only all-lowercase ASCII words
                freq = math.log(
                    freq_dict[word] * 10**9
                )  # we canonically calculate frequency as log occurrences per 1 billion words
                self.words.append(distractor(word, freq))
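For context: wordfreq frequencies are per-word probabilities, so multiplying by 10**9 gives occurrences per billion words, and wordfreq exposes the log10 of that quantity directly as zipf_frequency. A quick sanity check (not part of the source):

import math

import wordfreq

freq = wordfreq.get_frequency_dict('en')['the']
# zipf_frequency is log10(occurrences per billion words), rounded to 2 decimals
assert abs(math.log10(freq * 10**9) - wordfreq.zipf_frequency('the', 'en')) < 0.05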
Example 2
    def __init__(self, params={}):
        exclude = params.get("exclude_words", "exclude.txt")
        include = params.get(
            "include_words",
            "french_data/frwac_vocab.txt")  #list of model's vocab
        freq_dict = wordfreq.get_frequency_dict('fr')  # renamed so it doesn't shadow the builtin `dict`
        keys = freq_dict.keys()
        self.words = []
        exclusions = []

        if exclude is not None:
            with open(exclude, "r", encoding="utf-8") as f:
                for line in f:
                    word = line.strip()
                    exclusions.append(word)
        inclusions = []
        if include is not None:
            with open(include, "r", encoding="utf-8") as f:
                for line in f:
                    word = line.strip()
                    inclusions.append(word)
            words = list((set(inclusions) & set(keys)) - set(exclusions))
        else:
            words = list(set(keys) - set(exclusions))
        for word in words:
            if re.match(
                    "^[a-zçéâêîôûàèùëïü]*$", word
            ):  # intended to cover the full set of French accented characters
                freq = math.log(
                    freq_dict[word] * 10**9
                )  # we canonically calculate frequency as log occurrences per 1 billion words
                self.words.append(distractor(word, freq))
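The hand-enumerated accent list above can miss letters such as 'œ'; a locale-agnostic alternative (a sketch, not what the source does) filters with str.isalpha() and str.islower():

words = ["été", "cœur", "Paris", "l'eau"]
# isalpha/islower accept any all-lowercase alphabetic token, including 'œ',
# but also admit non-Latin scripts that the regex above excludes
filtered = [w for w in words if w.isalpha() and w.islower()]
print(filtered)  # ['été', 'cœur']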
Example 3
from collections import Counter

import wordfreq


def main():
    letters = Counter()
    _2gram = Counter()
    _3gram = Counter()

    for word, freq in wordfreq.get_frequency_dict('en', wordlist='best').items():
        sofar = '…'  # '…' marks the word boundary at the start of the word
        for letter in word.upper():
            if letter not in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
                sofar = ''
                continue
            letters[letter] += freq / len(word)
            sofar += letter
            if len(sofar) > 1:
                # with both boundary markers a word yields len(word) + 1 bigrams
                _2gram[sofar[-2:]] += freq / (len(word) + 1)
            if len(sofar) > 2:
                # ...and len(word) trigrams, hence the differing denominators
                _3gram[sofar[-3:]] += freq / len(word)

        sofar += '…'
        if len(sofar) > 1:
            _2gram[sofar[-2:]] += freq / (len(word) + 1)
        if len(sofar) > 2:
            _3gram[sofar[-3:]] += freq / len(word)

    with open("1grams.txt", "w") as out:
        for gram, count in letters.most_common():
            out.write(gram + ' ' + str(count) + '\n')

    with open("2grams.txt", "w") as out:
        for gram, count in _2gram.most_common():
            out.write(gram + ' ' + str(count) + '\n')

    with open("3grams.txt", "w") as out:
        for gram, count in _3gram.most_common(6000):
            out.write(gram + ' ' + str(count) + '\n')
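A tiny illustration (not part of the source) of the boundary-marker scheme: padding a word with '…' on both sides yields len(word) + 1 bigrams and len(word) trigrams, which is exactly what the denominators above normalize by.

word = "CAT"
padded = '…' + word + '…'
bigrams = [padded[i:i + 2] for i in range(len(padded) - 1)]
trigrams = [padded[i:i + 3] for i in range(len(padded) - 2)]
print(bigrams)   # ['…C', 'CA', 'AT', 'T…']  -> len(word) + 1 of them
print(trigrams)  # ['…CA', 'CAT', 'AT…']     -> len(word) of them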
Example 4
def add_freq_data():
    """
    Add table of frequencies to DB
    """
    session = get_session()  # project helper returning a session factory (not shown here)
    metadata.create_all(session().get_bind().engine)
    with click.progressbar(wordfreq.get_frequency_dict("fi").items(), label="Inserting frequencies") as name_freqs:
        for name, freq in name_freqs:
            insert(session, freqs, name=name, freq=freq)  # insert() and the freqs table are project-level helpers
    session.commit()
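Since get_session, metadata, freqs, and insert are all project-level helpers the snippet doesn't show, here is a minimal self-contained equivalent using the standard library's sqlite3 (a sketch, not the project's actual schema):

import sqlite3

import wordfreq

conn = sqlite3.connect("freqs.db")
conn.execute("CREATE TABLE IF NOT EXISTS freqs (name TEXT PRIMARY KEY, freq REAL)")
conn.executemany(
    "INSERT OR REPLACE INTO freqs (name, freq) VALUES (?, ?)",
    wordfreq.get_frequency_dict("fi").items(),
)
conn.commit()
conn.close()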
Example 5
    def _precompute_sif_weights(self, wv, alpha=1e-3, no_frequency=False, lang="en"):
        """Precompute the weights used in the vector summation

        Parameters
        ----------
        wv : `~gensim.models.keyedvectors.BaseKeyedVectors`
            A gensim keyedvectors child that contains the word vectors and the vocabulary
        alpha : float, optional
            Parameter which is used to weigh each individual word based on its probability p(w).
            If alpha = 0, the model computes the average sentence embedding. Common values range from 1e-5 to 1e-1.
            For more information, see the original paper.
        no_frequency : bool, optional
            Use the commonly available frequency table if the Gensim model does not contain information about
            the frequency of the words (see model.wv.vocab.count).
        lang : str, optional
            Determines the language of the frequency table used to compute the weights.

        Returns
        -------
        numpy.ndarray
            The vector of weights for all words in the model vocabulary

        """
        logger.info("pre-computing SIF weights")

        if no_frequency:
            logger.info("no frequency mode: using wordfreq for estimation (lang=%s)", lang)
            freq_dict = get_frequency_dict(str(lang), wordlist='best')

            for w in wv.index2word:
                if w in freq_dict:
                    wv.vocab[w].count = int(freq_dict[w] * (2 ** 31 - 1))
                else:
                    wv.vocab[w].count = 1

        if alpha > 0:
            corpus_size = 0
            # Set the correct dtype for the Cython estimation
            sif = zeros(shape=len(wv.vocab), dtype=REAL)

            for k in wv.index2word:
                # Compute normalization constant
                corpus_size += wv.vocab[k].count

            for idx, k in enumerate(wv.index2word):
                pw = wv.vocab[k].count / corpus_size
                sif[idx] = alpha / (alpha + pw)
        else:
            sif = ones(shape=len(wv.vocab), dtype=REAL)

        return sif
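A standalone illustration of the SIF weighting formula used above, weight(w) = alpha / (alpha + p(w)): frequent words are pushed toward 0 while rare words keep weights near 1. The counts here are hypothetical:

counts = {"the": 50000, "help": 400, "aardvark": 2}  # hypothetical corpus counts
total = sum(counts.values())
alpha = 1e-3
weights = {w: alpha / (alpha + c / total) for w, c in counts.items()}
print(weights)  # p("the") ~ 0.99 -> weight ~ 0.001; p("aardvark") ~ 4e-5 -> weight ~ 0.96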
Example 6
    def _induce_frequencies(self, domain: int = 2**31 - 1):
        """ Induce frequencies for a pretrained model, as not all pretrained models come with frequencies.
        
        Parameters
        ----------
        domain : int
            The cumulative count of the vocabulary.

        """
        freq_dict = get_frequency_dict(self.lang_freq, wordlist="best")
        for word in self.wv.index2word:
            if word in freq_dict:
                self.wv.vocab[word].count = int(freq_dict[word] * domain)
            else:
                self.wv.vocab[word].count = int(1e-8 * domain)  # floor count for out-of-vocabulary words
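With the default domain of 2**31 - 1, that out-of-vocabulary floor works out to 21, which matches the count asserted for the OOV token "79" in Example 8 below:

domain = 2**31 - 1
print(int(1e-8 * domain))  # 21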
Example 7
import pickle
import gzip
import wordfreq as wf

# store
with gzip.open('freq_en.pickle.gz', 'wb') as f:
    pickle.dump(wf.get_frequency_dict('en', wordlist='large'), f,
                protocol=2)  # Python 2.x compatible

# load
with gzip.open('freq_en.pickle.gz', 'rb') as f:
    freq_en = pickle.load(f)
    print(freq_en['the'])  # should be 0.03890451449942807
Example 8
    def test_model_w_language(self):
        se = BaseSentence2VecModel(W2V, lang_freq="en")
        freq = int(
            (2**31 - 1) * get_frequency_dict("en", wordlist="best")["help"])
        self.assertEqual(freq, se.wv.vocab["help"].count)
        self.assertEqual(21, se.wv.vocab["79"].count)
Example 9

import wordfreq
from stop_words import get_stop_words


def build_vocab(present_words, *dicts):
    v = init_vocab()  # init_vocab/append_vocab are project helpers not shown in this excerpt
    for d in dicts:
        append_vocab(d, v, present_words)
    return v


STOP_WORDS = get_stop_words('en')

wf = list(
    sorted(wordfreq.get_frequency_dict('en').items(), key=lambda x: -x[1]))
present_words, _ = zip(*wf[:15000])
common_present_words = set(x.upper() for x in present_words)

with open(cmudict_file) as cmu_dict_file_desc:
    full_vocabulary = build_vocab(common_present_words, cmu_dict_file_desc)


def pronounce(string, pronounciations):
    ps = [[]]
    for w in string.upper().split():
        pn = []
        for p in ps:
            if p:
                p.append(' ')
            for pi in pronounciations[w]:
Example 10
'''
https://pypi.org/project/wordfreq/#description
'''

from wordfreq import zipf_frequency, get_frequency_dict
'''
f = zipf_frequency('frequency', 'en')
print('word: "{0}"\tfrequency: {1}'.format('frequency', f))
'''

d = get_frequency_dict('en', wordlist='best')
'''
f = open('e-5 4e-6_.word', 'w')
# frequency > 0.0001, words = 1068
cnt = 0
for w in d:
    if d[w] > 0.000004 and d[w] < 0.00001:
        cnt += 1
        f.write('{0} {1}\n'.format(w, d[w]))

f.close()

print("Writen {} lines into freq.word.".format(cnt))
'''


def get_wordfreq(frange):
    fname, whigh, wlow = frange
    f = open('{}.freq.word'.format(fname), 'w')
    cnt = 0
    for w in d:
        # loop body reconstructed from the commented-out template above
        if d[w] > wlow and d[w] < whigh:
            cnt += 1
            f.write('{0} {1}\n'.format(w, d[w]))
    f.close()
Example 11
import wordfreq


def compute_wordfreq_score(masked_word, lang):
    freqs = wordfreq.get_frequency_dict(lang)
    return freqs[masked_word]
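The version above rebuilds the entire frequency dict on every call and raises KeyError for out-of-vocabulary words. wordfreq's word_frequency does a single-word lookup and returns 0 for unknown words, so a lighter-weight variant (a sketch) is:

import wordfreq


def compute_wordfreq_score(masked_word, lang):
    # direct lookup; returns 0.0 for words not in the wordlist
    return wordfreq.word_frequency(masked_word, lang)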