def higher_markov(corpus):
    """Build a second-order Markov chain keyed by consecutive token pairs.

    Each key is a ``(first, second)`` tuple of adjacent tokens and each value
    is a ``Dictogram`` counting the tokens seen immediately after that pair.
    Pairs whose second token is ``'<STOP>'`` are skipped.

    Args:
        corpus: Indexable sequence of tokens.

    Returns:
        dict mapping ``(first, second)`` tuples to ``Dictogram`` instances.
    """
    markov_dict = {}
    # Stop two short of the end so corpus[i + 2] always exists. The previous
    # bound of len(corpus) - 1 raised IndexError whenever the corpus did not
    # end with '<STOP>'.
    for i in range(len(corpus) - 2):
        second = corpus[i + 1]
        if second == '<STOP>':
            continue
        key = (corpus[i], second)
        if key not in markov_dict:
            markov_dict[key] = Dictogram()
        markov_dict[key].add_count(corpus[i + 2])
    return markov_dict
def create_markov_dict(text):
    """Map each pair of adjacent words in *text* to a Dictogram of followers.

    The text is split on single spaces. Every two-word window becomes a tuple
    key, and its Dictogram counts the word that comes right after the window.
    The finished mapping is pretty-printed before it is returned.
    """
    tokens = text.split(" ")
    chain = {}
    for start in range(len(tokens) - 2):
        pair = tuple(tokens[start:start + 2])
        follower = tokens[start + 2]
        if pair not in chain:
            # First sighting of this pair: seed its histogram
            chain[pair] = Dictogram([follower])
        else:
            chain[pair].add_count(follower)
    pprint(chain)
    return chain
def test_add_count(self):
    """Check that add_count updates frequencies, types and tokens."""
    histogram = Dictogram(self.fish_words)
    # Add more words to update frequency counts
    increments = (('two', 2), ('blue', 3), ('fish', 4), ('food', 5))
    for word, amount in increments:
        histogram.add_count(word, amount)
    # Verify updated frequency count of all words
    expected_frequencies = (
        ('one', 1),
        ('two', 3),
        ('red', 1),
        ('blue', 4),
        ('fish', 8),
        ('food', 5),
    )
    for word, expected in expected_frequencies:
        assert histogram.frequency(word) == expected
    # Verify count of distinct word types
    assert histogram.types == 6
    # Verify total count of all word tokens
    assert histogram.tokens == 8 + 14
def nth_order_markov_chain(self, order, text_list):
    """Build an nth-order Markov chain from a list of words.

    Every run of ``order`` consecutive words becomes a tuple key. Its value
    is a Dictogram recording the word that follows that run.
    """
    chain = {}
    for start in range(len(text_list) - order):
        state = tuple(text_list[start:start + order])
        follower = text_list[start + order]
        if state not in chain:
            # New state: create its histogram seeded with the follower
            chain[state] = Dictogram([follower])
            continue
        # NOTE(review): the follower is passed to add_count wrapped in a
        # list, unlike the Dictogram constructor call above, which expects a
        # list of words. Confirm add_count accepts a list here.
        chain[state].add_count([follower])
    return chain
def build_markov(self, midi_data):
    """Wrap *midi_data* in a single-entry chain keyed by its first element.

    Args:
        midi_data: Non-empty indexable sequence whose first element is
            hashable. NOTE(review): the earlier version named elements 0, 1
            and 2 "current note", "next note" and "time"; confirm the
            intended layout with the caller.

    Returns:
        dict with one entry: ``midi_data[0]`` mapped to
        ``Dictogram(midi_data)``.

    Raises:
        IndexError: if *midi_data* is empty.
    """
    # The chain is created empty inside this call, so the old
    # "key already present" branch could never run. That branch and the
    # locals only it used have been removed. As a side effect, inputs with
    # one or two elements no longer raise IndexError.
    return {midi_data[0]: Dictogram(midi_data)}
def build_markov(self):
    """Build a first-order Markov chain from ``holmes-text.txt``.

    The file is read from the current working directory and split on
    whitespace. Each word is mapped to a Dictogram counting the words that
    directly follow it.

    Returns:
        dict mapping each word (except a final word that has no follower and
        no earlier occurrence) to a ``Dictogram`` of its followers.

    Raises:
        OSError: if ``holmes-text.txt`` cannot be opened.
    """
    with open('holmes-text.txt') as file:
        corpus = file.read().split()
    # Local names no longer shadow the builtins ``dict`` and ``list``.
    chain = {}
    for word, next_word in zip(corpus, corpus[1:]):
        if word in chain:
            chain[word].add_count(next_word)
        else:
            chain[word] = Dictogram([next_word])
    return chain
def order_sample(word_list, order=2):
    """Sample up to *order* words, each drawn from the previous word's chain.

    The first word is sampled from a Dictogram of the whole list. Each later
    word is sampled from ``new_chain(word_list, previous_word)``, provided
    that chain is non-empty. The sampled words are returned joined by single
    spaces.
    """
    current = Dictogram(word_list).sample()
    picked = [current]
    # Words that can follow the word just picked
    followers = new_chain(word_list, current)
    for _ in range(order - 1):
        if len(followers) == 0:
            # Nothing follows this word; leave the result as it is
            continue
        current = followers.sample()
        picked.append(current)
        followers = new_chain(word_list, current)
    return " ".join(picked)
def markov_dictograms(text_string):
    """Build a first-order Markov chain from a string of text.

    argument: a string of text, split on whitespace
    return: a dictionary mapping each word to a Dictogram of the words that
    directly follow it
    """
    words = text_string.split()
    chain = {}
    for current, follower in zip(words, words[1:]):
        if current not in chain:
            chain[current] = Dictogram([follower])
        else:
            chain[current].add_count(follower)
    return chain
def build_markov(self, word_list):
    """Build a first-order Markov chain mapping each word to its followers.

    Each key is a word from *word_list*. Each value is a Dictogram whose
    ``dictionary_histogram`` counts the words seen right after that key.
    """
    chain = {}
    for position in range(1, len(word_list)):
        current = word_list[position - 1]
        follower = word_list[position]
        if current not in chain:
            # first entry for this word
            chain[current] = Dictogram([follower])
            continue
        # NOTE(review): the count is written straight into
        # dictionary_histogram. If Dictogram tracks totals separately they
        # are not updated here; confirm.
        counts = chain[current].dictionary_histogram
        counts[follower] = counts.get(follower, 0) + 1
    return chain
def make_chain(self, word_list):
    """Create and return a markov chain from a given list of words.

    The first ``self.n`` words fill a Queue. After that, each new word slides
    the queue forward by one, and the string form of the queue before the
    slide is mapped to the string form after it. Each key's collected
    followers are then wrapped in a Dictogram.
    """
    window = Queue()
    followers = {}
    for position, word in enumerate(word_list):
        if position < self.n:
            # Still filling the initial window
            window.enqueue(word)
            continue
        state = str(window)
        window.dequeue()
        window.enqueue(word)
        followers.setdefault(state, []).append(str(window))
    return {state: Dictogram(seen) for state, seen in followers.items()}
def walk(self, num_words):
    """Generate a sentence by walking the stored Markov chain.

    Starts from a random key of ``self.markov_chain`` and repeatedly samples
    the next word from the current word's histogram.

    Args:
        num_words: Number of words to generate.

    Returns:
        The sampled words joined by single spaces ("" when num_words is 0).

    Raises:
        IndexError: if ``self.markov_chain`` is empty.
    """
    # The earlier version built a throwaway Dictogram([current_word]) in a
    # local dict that shadowed self.markov_chain, so it never consulted the
    # real chain.
    # NOTE(review): assumes every value in self.markov_chain exposes
    # sample_by_frequency(), as the Dictogram used previously did; confirm.
    chain = self.markov_chain
    states = list(chain.keys())
    current_word = random.choice(states)
    sentence = []
    for _ in range(num_words):
        if current_word not in chain:
            # Dead end (a word with no recorded followers): restart from a
            # random state instead of raising KeyError.
            current_word = random.choice(states)
        current_word = chain[current_word].sample_by_frequency()
        sentence.append(current_word)
    return " ".join(sentence)
def build_markov(self, word_list):
    """Build a first-order Markov chain mapping each word to its followers.

    Each value is a Dictogram counting the words that appear directly after
    the key in *word_list*.
    """
    chain = {}
    for position in range(1, len(word_list)):
        current = word_list[position - 1]
        follower = word_list[position]
        if current not in chain:
            # first entry for this word
            chain[current] = Dictogram([follower])
        else:
            # add_count ensures that tokens/total is updated
            chain[current].add_count(follower)
    return chain
def markov_chain_nth_order(token_list, order=4):
    """Build an nth-order Markov chain from a list of tokens.

    Every run of ``order`` consecutive tokens becomes a tuple key. Its value
    is a Dictogram counting the tokens that appear directly after that run.

    Ex (order=2, Dictograms shown as plain counts):
    ['one', 'fish', 'two', 'fish'] ->
    {('one', 'fish'): {'two': 1}, ('fish', 'two'): {'fish': 1}}

    Args:
        token_list: Sequence of tokens.
        order: Number of tokens in each key (default 4).

    Returns:
        dict mapping token tuples to Dictogram instances. Empty when the list
        has no more than ``order`` tokens.
    """
    nest_dict = {}
    for i in range(len(token_list) - order):
        state = tuple(token_list[i:i + order])
        if state not in nest_dict:
            nest_dict[state] = Dictogram()
        nest_dict[state].add_count(token_list[i + order])
    return nest_dict
def markov_chain(words_list):
    """Map every word to a Dictogram of the words that directly follow it."""
    chain = {}
    for position in range(1, len(words_list)):
        current = words_list[position - 1]
        follower = words_list[position]
        if current not in chain:
            # First time this word is seen: start its Dictogram
            chain[current] = Dictogram([follower])
        else:
            # Known word: record one more occurrence of the follower
            chain[current].add_count(follower)
    return chain
def _create_chain(self, word_list): for i in range(0, len(word_list)): # The current word in the iteration word = word_list[i] try: # The word 1 index ahead of the current word, aka the next word in the sentence # has to be in try except because of index out of range exception on end of list next_word = word_list[i + 1] # If index error then no new words in list to add so break out except IndexError: break # The word dictogram if it exists word_dicto = self.get(word, None) # If word exists then add a count of 1, otherwise create a new dictogram with the next word if word_dicto: # Add a new word entry to the dictogram if the dictogram already exists word_dicto.add_count(next_word, 1) else: # Create a new dictogram for the word self[word] = Dictogram([next_word])
def populate_chain(self):
    """Construct a dictionary of MarkovChain state transitions.

    For every index below ``len(self.words_list) - self.order``,
    ``self.form_states`` supplies a state and the state that follows it.
    Each state maps to a Dictogram counting its following states.
    """
    chain = {}
    # Stopping ``order`` short of the end avoids an IndexError
    for index in range(len(self.words_list) - self.order):
        state, state_after = self.form_states(index)
        if state in chain:
            # Known state: add the token and count
            chain[state].add_count(state_after)
        else:
            # New state: start its frequency dict
            chain[state] = Dictogram([state_after])
    return chain
def second_order(file):
    """Build a second-order Markov chain from the text file at *file*.

    Args:
        file: Path of the text file to read.

    Returns:
        dict mapping each ``(word, next_word)`` tuple to a Dictogram counting
        the words that directly follow that pair.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(file) as f:
        # split() with no argument splits on any run of whitespace and drops
        # empty strings. The previous split('\n') / split(' ') combination
        # produced '' tokens for blank lines, trailing newlines and repeated
        # spaces, which then showed up as "words" in the chain.
        word_array = f.read().split()
    markov_dict = {}
    for index in range(len(word_array) - 2):
        # Named ``pair`` so the builtin ``tuple`` is not shadowed
        pair = (word_array[index], word_array[index + 1])
        follower = word_array[index + 2]
        if pair in markov_dict:
            markov_dict[pair].add_count(follower)
        else:
            markov_dict[pair] = Dictogram([follower])
    return markov_dict
def build_sentence(self, sentence):
    """Record every second-order transition of *sentence* on this mapping.

    The sentence is treated as if two ``'$START$'`` markers preceded it.
    Each pair of adjacent words is then mapped to a Dictogram counting the
    word that follows the pair.

    Args:
        sentence: List of words. It is no longer modified: the earlier
            version inserted the markers into the caller's list, so passing
            the same list twice accumulated extra markers.

    Returns:
        This mapping (``self``), to allow chaining.
    """
    padded = ['$START$', '$START$'] + list(sentence)
    for index in range(len(padded) - 2):
        pair = (padded[index], padded[index + 1])
        if pair not in self:
            self[pair] = Dictogram()
        self[pair].add_count(padded[index + 2])
    return self
def nth_markov_dictograms(text_string, nth_order):
    """Build an nth-order Markov chain from a string of text.

    argument: a string of text and the order of the markov chain
    return: a dictionary mapping each tuple of nth_order consecutive words
    to a Dictogram of the single words that follow it
    """
    words = text_string.split()
    chain = {}
    for start in range(len(words) - nth_order):
        state = tuple(words[start:start + nth_order])
        follower = words[start + nth_order]
        if state not in chain:
            chain[state] = Dictogram([follower])
        else:
            chain[state].add_count(follower)
    return chain
def build_markov(self, word_list):
    """Build a first-order Markov chain, printing each histogram as it goes.

    Each word of *word_list* maps to a Dictogram counting the words that
    follow it. After every pair is processed, the current word's histogram
    is printed followed by "|".
    """
    chain = {}
    for position in range(1, len(word_list)):
        current = word_list[position - 1]
        follower = word_list[position]
        if current not in chain:
            # first entry in the chain for this word
            chain[current] = Dictogram([follower])
        else:
            # add to the follower's weight (its chance of coming next)
            chain[current].add_count(follower)
        print(chain[current], "|")
    return chain
def build_markov(self):
    """Build a first-order Markov chain from the file named in ``sys.argv[1]``.

    The file is split on whitespace. Each word is mapped to a Dictogram
    counting the words that directly follow it.

    Returns:
        dict mapping words to ``Dictogram`` instances of their followers.

    Raises:
        IndexError: if no command-line argument was given.
        OSError: if the file cannot be opened.
    """
    with open(sys.argv[1]) as f:
        content = f.read().split()
    # Local names no longer shadow the builtins ``dict`` and ``list``.
    chain = {}
    for word, next_word in zip(content, content[1:]):
        if word in chain:
            chain[word].add_count(next_word)
        else:
            chain[word] = Dictogram([next_word])
    return chain
def second_order(self):
    """Build a second-order Markov chain from the file named in ``sys.argv[1]``.

    The file is split on whitespace. Each pair of adjacent words is mapped to
    a Dictogram counting the words that directly follow the pair.

    Returns:
        dict mapping ``(word, next_word)`` tuples to ``Dictogram`` instances.

    Raises:
        IndexError: if no command-line argument was given.
        OSError: if the file cannot be opened.
    """
    with open(sys.argv[1]) as f:
        content = f.read().split()
    # Local names no longer shadow the builtins ``dict`` and ``list``.
    chain = {}
    for i in range(len(content) - 2):
        word_tuple = (content[i], content[i + 1])
        next_word = content[i + 2]
        if word_tuple in chain:
            chain[word_tuple].add_count(next_word)
        else:
            chain[word_tuple] = Dictogram([next_word])
    return chain
def second_order_markov_histo(word_list):
    """Create a second-order markov chain for a list of words.

    Each pair of adjacent words maps to a Dictogram built from the list of
    words that were seen right after that pair.
    """
    followers = {}
    # Only pairs that actually have a following word are recorded
    for index in range(len(word_list) - 2):
        pair = (word_list[index], word_list[index + 1])
        followers.setdefault(pair, []).append(word_list[index + 2])
    return {pair: Dictogram(seen) for pair, seen in followers.items()}
def markov_chain(text):
    """Create a chain mapping each word to a Dictogram of its followers.

    The finished chain is printed before it is returned.
    """
    chain = {}
    for position in range(1, len(text)):
        current = text[position - 1]
        follower = text[position]
        if current not in chain:
            chain[current] = Dictogram()
        chain[current].add_count(follower)
    print(chain)
    return chain
def main():
    """Start main process.

    Builds a Dictogram from the input returned by ``grab_file()``, asks
    ``probability_gen`` for a list of words, and returns the rendered
    ``main.html`` template with ``sentence_print(finished_list)`` as its
    ``output`` value.

    The number of words comes from the ``num`` request argument and defaults
    to 10 when that argument is missing or empty. ``int()`` raises ValueError
    if it is present but not a valid integer.

    NOTE(review): ``request`` and ``render_template`` look like Flask's;
    confirm against the file's imports.
    """
    start_time = time.time()
    file_input = grab_file()
    # Grabs how long you want string to be for later; defaults to 10
    output_len = request.args.get('num', '')
    if output_len == '':
        word_amt = 10
    else:
        word_amt = int(output_len)
    # Grab the input, make into dictionary/list of words + occurrences
    # Changed into a class for more functionality; trades speed though
    input_histo = Dictogram(file_input)
    input_len = input_histo.tokens
    # List based on probability
    # Grabs input, length of input, and desired string length
    finished_list = probability_gen(input_histo, input_len, word_amt)
    # The return values of these two calls are discarded here
    if word_amt == 1:
        word_print(finished_list)
    else:
        sentence_print(finished_list)
    """Below are the three alternate functions not needed for the tweetgen."""
    # Word you want to search up for frequency; change as needed
    word = "the"
    histogram(input_histo)
    # unique words... was replaced by dictogram
    # unique_words(input_histo.tokens)
    # Bare attribute access: the value is read and discarded
    input_histo.tokens
    # word frequency... also replaced; the result is discarded
    input_histo.count(word)
    # Elapsed wall-clock time since the start of this call
    print("--- %s seconds --- end after frequency \n" % (time.time() - start_time))
    # sentence_print is called again here, this time for the template output
    return render_template('main.html', output=sentence_print(finished_list))
def table_generator(corpus_text, order):
    """Make the actual markov table.

    It's a hashtable keyed by each window's items, first holding the list of
    words that follow the window and finally a Dictogram built from that
    list.

    Args:
        corpus_text: Indexable sequence of words.
        order: Number of words in each window.

    Returns:
        HashTable mapping each window's items to a Dictogram of followers.

    Raises:
        IndexError: if *corpus_text* has fewer than ``order + 1`` words.

    NOTE(review): relies on ``LinkedList.move()`` dropping the oldest item of
    the window, as the inline comment below states; confirm.
    """
    # Corpus in linkedlist form
    corpus_ll = LinkedList(corpus_text)
    # Window and the table
    window_queue = LinkedList()
    current_table = HashTable()
    # Current window for iterating through corpus; order changes size
    for i in range(order):
        window_queue.append(corpus_text[i])
    # The first window covers corpus_text[0:order], so the word after it is
    # corpus_text[order]. The earlier code read corpus_text[order + 1], which
    # skipped one word and disagreed with the loop below.
    current_table.set(window_queue.items(), [corpus_text[order]])
    # For the rest
    for i in range(corpus_ll.length() - (order + 1)):
        # Dequeue window, add next to window
        window_queue.move()
        window_queue.append(corpus_text[i + order])
        # Word after window
        next_word = corpus_text[i + order + 1]
        # Append to the window's existing list, or start a new list for it
        if current_table.contains(window_queue.items()):
            currentvalues = current_table.get(window_queue.items())
            currentvalues.append(next_word)
            current_table.set(window_queue.items(), currentvalues)
        else:
            current_table.set(window_queue.items(), [next_word])
    # Turn the second element (list) into a dictionary
    for key, value in current_table.items():
        current_table.set(key, Dictogram(value))
    return current_table
def build_markov_queue(self, word_list):
    """Build a chain from each word pair to the pair one step later.

    Keys are ``(word[i - 1], word[i])`` tuples. Each value is a Dictogram
    whose ``dictionary_histogram`` counts the ``(word[i], word[i + 1])``
    tuples that follow the key.
    """
    chain = {}
    for position in range(1, len(word_list) - 1):
        current_pair = (word_list[position - 1], word_list[position])
        next_pair = (word_list[position], word_list[position + 1])
        if current_pair not in chain:
            chain[current_pair] = Dictogram([next_pair])
            continue
        counts = chain[current_pair].dictionary_histogram
        counts[next_pair] = counts.get(next_pair, 0) + 1
    return chain
def higher_order(self, new_words):
    """Collect what follows each occurrence of *new_words* in the word list.

    *new_words* is split on whitespace and compared against every window of
    ``self.order`` words in ``self.word_list``. For each match, the window
    shifted one word forward is joined into a string. The result is a
    one-entry dict mapping *new_words* to a Dictogram of those strings.
    """
    target = new_words.split()
    matches = []
    for start in range(len(self.word_list) - 1):
        # Windows that would run past the end are treated as empty
        if start < (len(self.word_list) - self.order):
            window = [self.word_list[start + offset]
                      for offset in range(self.order)]
        else:
            window = []
        if window != target:
            continue
        shifted = [self.word_list[start + (offset + 1)]
                   for offset in range(self.order)]
        matches.append(' '.join(shifted))
    return {new_words: Dictogram(matches)}
def generate_random_sentence(self, length, start_sequence):
    """
    Generate a sentence

    :param length: number of words to generate
    :type length: int
    :param start_sequence: beginning of the sentence, as space-separated
        integers
    :type start_sequence: str
    :return: the start sequence followed by the generated words
    :rtype: list
    """
    window = tuple(int(token) for token in start_sequence.split())
    sentence = list(window)
    for _ in range(length):
        lookup_key = ' '.join(str(item) for item in window)
        histogram = Dictogram(self.get_sequence(lookup_key))
        sampled = histogram.return_weighted_random_word()
        # Slide the window: drop the oldest item, add the sampled one
        window = window[1:] + (sampled,)
        sentence.append(sampled)
    return sentence
def check_key(self, key, value):
    """Count *value* under *key*, creating the Dictogram on first sight."""
    if key not in self:
        self[key] = Dictogram([value])
        return
    self[key].add_count(value)