def test_sampler(histogram):
    """Tests the sampler by drawing 10,000 words and printing how often each was drawn.
    Uncomment the test call in the __main__ block to run it."""
    test_words = []
    for _ in range(10000):
        test_words.append(sample(histogram))
    test_hist = histogram_dictionary(test_words)
    for item in test_hist.keys():
        print(f"{item}: {test_hist[item]}")
def sentence_creator(histogram, num_words):
    """Builds a 'sentence' by sampling num_words words from the histogram.
    Note: the sampled words are passed back through histogram_dictionary, so
    repeated samples appear only once in the returned sentence."""
    words = []
    sentence = []
    for _ in range(num_words):
        words.append(sample(histogram))
    hist = histogram_dictionary(words)
    for item in hist.keys():
        sentence.append(item)
    sentence_str = " ".join(sentence)
    return sentence_str
import random


def random_word(histo):
    """Returns a word chosen uniformly at random from the histogram's keys."""
    words = list(histo.keys())
    word_index = random.randint(0, len(words) - 1)
    word_list = words[word_index]
    return word_list


def probability(histo):
    """
    Equation for probability = (num of occurrences) / sample total.
    Take the count of a word. Divide it by the total number of words counted
    in the histogram. Multiply by 100 to get a percentage.
    Be able to show all of the words and their percentages in a dictionary.
    """
    total_count = sum(histo.values())  # total number of word occurrences in the histogram
    word_percents = {}
    for key in histo:
        # divide the word's count by the total, multiply by 100,
        # and round(_, 2) to keep two decimal places
        word_percents[key] = str(round(histo[key] / total_count * 100, 2)) + "%"
    return word_percents


if __name__ == '__main__':
    with open("sample.txt", "r") as data:
        words = data.read().split()
    histo = histogram_dictionary(words)
    print(random_word(histo))
    print(probability(histo))
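# Illustrative example only (not part of the original script): a tiny, made-up
# histogram run through probability() to show the shape of the output. With four
# total occurrences, each percentage is count / 4 * 100.
#
#     toy_histo = {"one": 1, "fish": 2, "red": 1}
#     probability(toy_histo)
#     # -> {'one': '25.0%', 'fish': '50.0%', 'red': '25.0%'}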
if __name__ == "__main__":
    # file = sys.argv[1]
    # num_words = sys.argv[2]
    num_words = 8
    with open('harry_potterb1.txt') as f:
        words = f.read().split()
    hist = histogram_dictionary(words)
    print(sentence_creator(hist, num_words))
    # Tests
    # test_sampler(hist)
    # print(word_sampler(hist))
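# The script above calls sample() and histogram_dictionary() without defining them,
# so they are assumed to live elsewhere in the project. A minimal sketch of what
# they are taken to do, for reference only; the project's real implementations may
# differ.

import random


def histogram_dictionary(words):
    """Map each word in a list of words to the number of times it appears."""
    hist = {}
    for word in words:
        hist[word] = hist.get(word, 0) + 1
    return hist


def sample(histogram):
    """Return one word drawn uniformly at random from the histogram's keys."""
    return random.choice(list(histogram.keys()))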
import sys
import time

import histogram


def random_check(word_list):
    """
    param: word_list, a list of sampled words
    return: a histogram

    Histogram creation function meant to check if words look like they are
    correctly sampled.
    """
    random_word_histogram = {}
    for word in word_list:
        if word in random_word_histogram:
            random_word_histogram[word] += 1
        else:
            random_word_histogram[word] = 1
    return random_word_histogram


if __name__ == "__main__":
    text_document = sys.argv[1]
    histo = histogram.histogram_dictionary(text_document)

    random_words = [random_sampling(histo) for _ in range(10000)]
    # print(random_check(random_words))

    start = time.time()
    random_freq_words = [weighted_dict_sampling(histo) for _ in range(100000)]
    finish = time.time()
    print(finish - start)
    print(random_check(random_freq_words))

    for _ in range(10):
        print(weighted_dict_sampling(histo))
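# Optional sanity check (a sketch, not part of the original script): convert the raw
# counts from random_check() into proportions so they can be compared against the
# source histogram's relative frequencies. Assumes the `histo` and `random_freq_words`
# names built in the __main__ block above.
#
#     check = random_check(random_freq_words)
#     total_source = sum(histo.values())
#     total_sampled = len(random_freq_words)
#     for word in sorted(histo, key=histo.get, reverse=True)[:10]:
#         expected = histo[word] / total_source
#         observed = check.get(word, 0) / total_sampled
#         print(f"{word}: expected {expected:.3f}, observed {observed:.3f}")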
# Tail of a weighted sampler that walks `histogram_list` (apparently a list of
# [count, [word, ...]] buckets): it accumulates counts until it reaches the target
# bucket, then picks a random word type from that bucket.
            histogram_list[index + 1][1])
        count += to_add
        index += 1
    type_index = random.randrange(len(histogram_list[index][1]))
    return histogram_list[index][1][type_index]


if __name__ == "__main__":
    file_name = "notess.txt"
    # print(sample_lists_of_lists(file_name))
    text = histogram.load_words(file_name)
    text_length = len(text)
    histogram_dict = histogram.histogram_dictionary(text)
    print(get_probablities(histogram_dict, text_length))

    '''current = time.time()
    words = list()
    print(time.time() - current)
    samples = histogram.histogram_count_lists_try_catch(words)
    print(samples)
    # text = histogram.load_words(file_name)
    histogram_list2 = histogram.histogram_dictionary(text)
    current2 = time.time()
    words = list()
    for i in range(10000):
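# The sampler above is cut off mid-function, so the full loop is not shown. Below is
# a small self-contained sketch of the general cumulative-count technique it appears
# to build on: walk a list of [word, count] pairs (a hypothetical layout, not
# necessarily the one `histogram_list` uses) until a random target is reached.

import random


def weighted_sample_from_pairs(pairs):
    """Pick a word from [[word, count], ...] with probability proportional to count."""
    total = sum(count for _, count in pairs)
    target = random.randint(1, total)
    running = 0
    for word, count in pairs:
        running += count
        if running >= target:
            return word


# Example: "fish" should come back roughly twice as often as either other word.
# weighted_sample_from_pairs([["one", 1], ["fish", 2], ["red", 1]])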
import histogram
import sys
import random


# Helper for getting the total number of word occurrences (tokens) in the histogram
def get_total(histogram):
    return sum(histogram.values())


# Pick a random position between 1 and the total token count, then walk the words
# subtracting each word's count; the word that drives the running position to zero
# or below is returned, so words are selected in proportion to their frequency.
def sample_by_frequency(histogram):
    # Get the combined "weight" of every word
    tokens = get_total(histogram)
    # Get the lucky index
    random_index = random.randint(1, tokens)
    # Iterate through all the words
    for word in histogram:
        # Subtract the word's count from the index
        random_index = random_index - histogram[word]
        if random_index <= 0:
            return word


if __name__ == "__main__":
    source_text = 'harry_potterb1.txt'
    histo = histogram.histogram_dictionary(source_text)
    print(sample_by_frequency(histo))
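# Optional cross-check (a sketch, not part of the original script): the standard
# library's random.choices() performs the same frequency-weighted draw, so drawing a
# large batch from both and comparing counts of a few common words is a quick way to
# sanity-check sample_by_frequency(). Assumes the `histo` name from the block above.
#
#     words, counts = zip(*histo.items())
#     builtin_draws = random.choices(words, weights=counts, k=10000)
#     custom_draws = [sample_by_frequency(histo) for _ in range(10000)]
#     for word in sorted(histo, key=histo.get, reverse=True)[:5]:
#         print(word, builtin_draws.count(word), custom_draws.count(word))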
def main():
    book = 'sherlock_no_title_chapters.txt'
    word_list = histogram.generate_word_list(book)
    histo = histogram.histogram_dictionary(word_list)
    print(sample_by_frequency(histo))
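# The excerpt ends with main() defined but never called; presumably the module runs
# it under the usual guard:
#
#     if __name__ == "__main__":
#         main()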