def sub(tokens, author):
    """sub :: ([(word, POS)], String) -> [word]

    Replace the word of every token with a randomly drawn substitute.

    For each ``(word, POS)`` pair in ``tokens`` the candidates returned by
    ``synonym_freq.query`` are sampled by cumulative weight (roulette-wheel
    selection).  The POS tag is ignored here.

    The word bank is read on first use and the per-author frequency table
    is read the first time a given ``author`` is seen; both are cached in
    module-level globals so later calls do not touch the disk again.

    Parameters:
        tokens: iterable of ``(word, POS)`` pairs.
        author: author name; selects ``../wordfreq/<author>/sum.txt``.

    Returns:
        A list with one chosen word per input token, in input order.

    Raises:
        ValueError: if a word has no candidate with a positive weight.
    """
    # Only the names that get rebound need a ``global`` declaration;
    # ``_loaded_freq_dick`` is mutated in place.
    global _dick, _invdick

    if _dick is None:
        _dick, _invdick = synonyms.read_wordbank("../data/wordbank.txt")
    if author not in _loaded_freq_dick:
        _loaded_freq_dick[author] = word_freq.load_file(
            "../wordfreq/%s/sum.txt" % (author))
    # tmp
    freq_dick = _loaded_freq_dick[author]

    # Sentinel meaning "no usable candidate seen yet"; a plain None could in
    # principle collide with a real candidate value.
    no_candidate = object()

    def subsub(word):
        """Draw one weighted-random candidate for ``word``."""
        candidates = synonym_freq.query(word, _dick, _invdick, freq_dick)
        threshold = random.random()
        acc = 0.0
        fallback = no_candidate
        for candidate, weight in candidates:
            acc += weight
            if acc > threshold:
                return candidate
            if weight > 0:
                fallback = candidate
        # NOTE(review): assumes the weights are meant to sum to 1 -- confirm
        # against synonym_freq.query.  Floating-point rounding can leave the
        # total marginally below the draw; that sliver of the range belongs
        # to the last positively weighted candidate, so return it instead of
        # failing at random.
        if fallback is not no_candidate:
            return fallback
        raise ValueError(
            "no candidate with positive weight for word %r" % (word,))

    return [subsub(word) for word, _ in tokens]
"""Returns [(syn, prob.)] where `syn` is a synonym of `word` and `prob.` is its weight, i.e. how much we prefer it.""" global freq_offset # sim for similarities syn_sims = reweight(synonyms.query_word(word, dick, invdick)) # we should give any word a chance res_freqs = reweight({key: weight_func(val, freqdick.get(key, 0)) for key, val in syn_sims.iteritems()}) #return sorted(res_freqs.items(), lambda x: x[1], reverse=True) return res_freqs.items() # comment prev. line if we don't need to sort the result if __name__ == '__main__': (dick, invdick) = synonyms.read_wordbank('../data/wordbank.txt') syn_query_word = lambda word: synonyms.query_word(word, dick, invdick) word_freq_prefix = '../wordfreq/' wordfreq_files = [word_freq_prefix + fname for fname in enum_file_names()] words = dick.keys() def count(file_name, output_file = sys.stdout): freq_dick = word_freq.load_file(file_name) syn_freq_dick = {} for word in words: if word in freq_dick: syn_freq_dick[word] = freq_dick[word]