コード例 #1
0
def compare_words(filename, word_bank):
    """Return the keys found in one file's histogram but not in another's.

    Both arguments are passed to ``make_histogram.make_histogram_from_file``
    (presumably file paths -- confirm against that helper).

    Args:
        filename: Source whose histogram keys are checked.
        word_bank: Source whose histogram acts as the reference vocabulary.

    Returns:
        A list of the keys of the ``filename`` histogram that are absent
        from the ``word_bank`` histogram, in the iteration order of the
        ``filename`` histogram.
    """
    text_hist = make_histogram.make_histogram_from_file(filename)
    bank_hist = make_histogram.make_histogram_from_file(word_bank)
    return [word for word in text_hist if word not in bank_hist]
コード例 #2
0
def zipf_freq(filename):
    """Plot log(position) against log(first field) of the sorted histogram.

    Builds a histogram from ``filename``, sorts it with
    ``sort_histogram.sort_histogram`` and hands two equally long lists to
    ``simple_plot.simple_plot``. Nothing is returned.

    NOTE(review): this assumes each entry of the by-values ordering carries
    the word count at index 0 and that the ordering is by frequency, so the
    1-based position acts as the Zipf rank -- confirm against sort_histogram.

    Args:
        filename: Passed straight to ``make_histogram_from_file``.
    """
    word_hist = make_histogram.make_histogram_from_file(filename)
    # Only the by-values ordering is used; the by-keys one is discarded.
    _, ranked = sort_histogram.sort_histogram(word_hist)
    # y is built before x, as in the original, so any error raised by log()
    # on an entry surfaces first.
    log_counts = [log(entry[0]) for entry in ranked]
    # Positions are 1-based so the first one maps to log(1) == 0.
    log_positions = [log(position) for position in range(1, len(ranked) + 1)]
    simple_plot.simple_plot(log_positions, log_counts)
コード例 #3
0
def compare_word(filename, word_bank):
    """Return the set of histogram keys unique to ``filename``.

    Both arguments are passed to ``make_histogram.make_histogram_from_file``
    (presumably file paths -- confirm against that helper).

    Args:
        filename: Source whose histogram keys are checked.
        word_bank: Source whose histogram acts as the reference vocabulary.

    Returns:
        A ``set`` holding every key of the ``filename`` histogram that is
        not a key of the ``word_bank`` histogram.
    """
    text_hist = make_histogram.make_histogram_from_file(filename)
    bank_hist = make_histogram.make_histogram_from_file(word_bank)
    unseen = set(text_hist)
    return unseen.difference(bank_hist)
コード例 #4
0
__author__ = 'QHe'

import make_histogram
import random
import bisect

def choose_from_hist(hist):
    """Pick a random word from a histogram, weighted by its count.

    Every word is repeated ``count`` times in a temporary list and one
    element of that list is drawn uniformly, so the list grows with the
    total of all counts.

    Args:
        hist: Dictionary mapping each word to its count.

    Returns:
        The randomly chosen word.

    Raises:
        IndexError: If the expanded list is empty (raised by
            ``random.choice``).
    """
    pool = []
    for word in hist:
        pool += [word] * hist[word]
    return random.choice(pool)

def choose_from_cdf(hist):
    """Pick a random word from a histogram, weighted by its count.

    Builds a running total of the counts, draws a uniform integer below the
    grand total and bisects the running totals to find the word whose
    interval contains the draw.

    Args:
        hist: Dictionary mapping each word to its integer count.

    Returns:
        The randomly chosen word.

    Raises:
        ValueError: If the counts do not add up to a positive total (for
            example when ``hist`` is empty).
    """
    # Collect words and running totals in one pass so both lists share the
    # same order. A real list is required: on Python 3 ``hist.keys()`` is a
    # view and cannot be indexed.
    word_list = []
    cdf = []
    counter = 0
    for word, count in hist.items():
        counter += count
        word_list.append(word)
        cdf.append(counter)
    if counter <= 0:
        # randint(0, counter - 1) would raise the same exception type, but
        # with a message about an empty range instead of the real cause.
        raise ValueError('histogram has no words to choose from')
    random_number = random.randint(0, counter - 1)
    # bisect (bisect_right) returns the first position whose running total
    # exceeds the draw, which also skips words with a zero count.
    index = bisect.bisect(cdf, random_number)
    return word_list[index]

if __name__ == '__main__':
    # Build the histogram once and reuse it for both samplers instead of
    # parsing words.txt twice.
    word_hist = make_histogram.make_histogram_from_file('words.txt')
    # A parenthesized single-argument print is valid both as the Python 2
    # statement and as the Python 3 function.
    print(choose_from_hist(word_hist))
    print(choose_from_cdf(word_hist))