Beispiel #1
0
def _read_csv_basic(filename):
    """
    Read a UTF-8 encoded file whose lines have the form `word,count` and
    return a dictionary mapping each standardized word to its count.

    Each line is split on its *last* comma, so the word part may itself
    contain commas. Counts are converted to floats. If two lines produce
    the same key after `standardize_word`, the later line wins.

    Empty lines are skipped. A non-empty line without a comma, or with a
    count that cannot be parsed as a float, raises ValueError.
    """
    counts = {}
    # The context manager guarantees the file is closed, even when a
    # malformed line raises partway through the loop.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            line = line.rstrip(u'\n')
            if not line:
                # Tolerate blank lines (commonly a trailing one at EOF)
                # instead of failing to unpack the split below.
                continue
            word, count = line.rsplit(u',', 1)
            counts[standardize_word(word)] = float(count)
    return counts
Beispiel #2
0
def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Look up the frequency of `word` for the language code `lang` in the
    given `wordlist`.

    The word is passed through `standardize_word` before the lookup. The
    `offset` is added to every result, so a word with no matching row
    yields exactly `offset`. This accounts for the fact that not every
    possible word has been observed.
    """
    cursor = CONN.cursor()
    query = "SELECT freq from words where word=? and lang=? and wordlist=?"
    params = (standardize_word(word), lang, wordlist)
    cursor.execute(query, params)

    match = cursor.fetchone()
    # No matching row means the word was never observed: report only the offset.
    return offset if match is None else match[0] + offset
Beispiel #3
0
def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    Only lines made of exactly three space-separated fields
    (`rank freq token`) whose first field matches `NUMBER_RE` are used;
    every other line (headers, blank lines) is ignored. Each token is
    cleaned with `ftfy` and `standardize_word`, and the frequencies of
    tokens that end up identical are summed. The accumulated totals are
    returned after being passed through `_scale_freqs`.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    counts = defaultdict(float)
    # The context manager guarantees the file is closed, even when a
    # malformed frequency raises partway through the loop.
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            # Split once and validate by field count, rather than splitting
            # the line twice and counting spaces separately.
            fields = line.rstrip().split(u' ')
            if len(fields) != 3 or not NUMBER_RE.match(fields[0]):
                continue
            _, freq, token = fields
            counts[standardize_word(ftfy(token))] += float(freq)

    return _scale_freqs(counts)
Beispiel #4
0
def read_multilingual_csv(filename):
    """
    Load per-language word frequencies from a comma-separated file whose
    lines look like:

        word|lang,freq

    The file is parsed by `_read_csv_basic`; each resulting key is then
    split on its last `|` into a word and a language code, and the word is
    standardized. Each language's frequencies are passed through
    `_scale_freqs` independently (so that they add up to 1.0 *within each
    language*).

    Returns a dictionary from language -> (word -> freq).
    """
    by_language = defaultdict(dict)
    for entry, freq in _read_csv_basic(filename).items():
        term, language = entry.rsplit('|', 1)
        term = standardize_word(term)
        by_language[language][term] = freq

    # Scale every language separately so one language cannot skew another.
    return {
        language: _scale_freqs(freqs)
        for language, freqs in by_language.items()
    }