def _read_csv_basic(filename):
    """
    Read a CSV file of `word,count` lines and return a dictionary from
    standardized word to count. The line is split on its *last* comma,
    so words that themselves contain commas are preserved.
    """
    infile = codecs.open(filename, encoding='utf-8')
    counts = {}
    for line in infile:
        line = line.rstrip(u'\n')
        word, count = line.rsplit(u',', 1)
        count = float(count)
        counts[standardize_word(word)] = count
    return counts
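
# Illustrative sketch (not part of the module's API): a tiny demo of the
# input format _read_csv_basic expects, one "word,count" pair per line.
# Because rsplit(u',', 1) splits on the last comma only, a word such as
# "1,000,000" keeps its internal commas. The file contents and counts
# below are hypothetical.

def _demo_read_csv_basic():
    import os
    import tempfile
    with tempfile.NamedTemporaryFile(mode='w', suffix='.csv',
                                     encoding='utf-8', delete=False) as f:
        f.write(u'the,23135851162\n')
        f.write(u'1,000,000,3014\n')  # a word that itself contains commas
        path = f.name
    try:
        # 'the' maps to its raw count; '1,000,000' survives intact.
        counts = _read_csv_basic(path)
    finally:
        os.remove(path)
    return counts
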
def word_frequency(word, lang, wordlist='multi', offset=0.):
    """
    Get the frequency of `word` in the language with code `lang`, from the
    specified `wordlist`.

    The offset gets added to all values, to monotonically account for the
    fact that we have not observed all possible words.
    """
    c = CONN.cursor()
    c.execute(
        "SELECT freq from words where word=? and lang=? and wordlist=?",
        (standardize_word(word), lang, wordlist)
    )
    row = c.fetchone()
    if row is None:
        return offset
    else:
        return row[0] + offset
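
# Hypothetical setup sketch: the SELECT above implies a `words` table with
# columns (word, lang, wordlist, freq) behind the module-level CONN, which
# is created elsewhere in the module. The schema and sample row here are
# assumptions for illustration only, not the module's real schema.

def _demo_word_frequency_setup():
    import sqlite3
    conn = sqlite3.connect(':memory:')
    conn.execute('CREATE TABLE words (word TEXT, lang TEXT, '
                 'wordlist TEXT, freq REAL)')
    conn.execute("INSERT INTO words VALUES ('the', 'en', 'multi', 0.0561)")
    conn.commit()
    return conn

# With CONN bound to such a connection:
#     word_frequency('the', 'en')                -> 0.0561
#     word_frequency('zzzz', 'en')               -> 0.0   (unseen: offset)
#     word_frequency('zzzz', 'en', offset=1e-9)  -> 1e-9
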
def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')
    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            rank = line.split(u' ')[0]
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq
    return _scale_freqs(counts)
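
# The checks above (first field numeric, exactly two spaces) imply that a
# valid Leeds corpus line has the shape "rank frequency token". The sample
# line and parse below are a hypothetical illustration of that shape;
# NUMBER_RE itself is defined elsewhere in the module.

def _demo_leeds_line():
    # A representative line: rank, per-million frequency, then the token.
    line = u'1 43466.85 the'
    rank, freq, token = line.split(u' ')
    return float(freq), token  # -> (43466.85, u'the')
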
def read_multilingual_csv(filename):
    """
    Load word frequencies from a file of comma-separated values, where
    each line is of the form:

        word|lang,freq

    Scale the frequencies so they add up to 1.0 *for each language*, and
    return a dictionary from language -> (word -> freq).
    """
    unscaled = defaultdict(dict)
    raw_freqs = _read_csv_basic(filename)
    for wordlang in raw_freqs:
        word, lang = wordlang.rsplit('|', 1)
        word = standardize_word(word)
        unscaled[lang][word] = raw_freqs[wordlang]

    scaled = {}
    for key in unscaled:
        scaled[key] = _scale_freqs(unscaled[key])
    return scaled
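
# _scale_freqs is defined elsewhere in the module. Based on the docstring
# above ("scale the frequencies so they add up to 1.0"), a minimal sketch
# of its behavior would look like the following; this is an assumption
# about the real helper, not its actual implementation.

def _scale_freqs_sketch(counts):
    """Return the same words with values normalized to sum to 1.0."""
    total = sum(counts.values())
    return {word: count / total for word, count in counts.items()}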