def walk(self, num_words):
    """Return a space-separated string of up to ``num_words`` sampled words.

    Every word is drawn from a ``Dictogram`` built over ``self.word_list``.
    ``self.markov_chain`` is only consulted to decide whether words beyond
    the first are drawn at all: when it is empty the result is one word.

    Args:
        num_words: Number of words requested.

    Returns:
        The sampled words joined by single spaces, or ``''`` when
        ``num_words`` is zero or negative.
    """
    # Previously a request for zero (or fewer) words still produced one word.
    if num_words <= 0:
        return ''

    histogram = Dictogram(self.word_list)
    words = [histogram.sample()]

    # The chain's size does not change while sampling, so test it once
    # instead of on every iteration.
    # NOTE(review): follow-up words are sampled from the overall histogram,
    # not from the chain itself -- confirm that is intended.
    if len(self.markov_chain) > 0:
        words.extend(histogram.sample() for _ in range(num_words - 1))

    return ' '.join(words)
def walk(word_list, length):
    """Build a list of at most ``length`` words by walking word-to-word chains.

    The first word is sampled from a ``Dictogram`` of ``word_list``; each
    following word is sampled from ``new_chain(word_list, previous_word)``.

    Args:
        word_list: The corpus, as a sequence of words.
        length: Maximum number of words to generate.

    Returns:
        A list of words. It is empty when ``length`` is zero or negative and
        may be shorter than ``length`` if a word with an empty chain is hit.
    """
    # Previously length <= 0 still returned a one-word list.
    if length <= 0:
        return []

    next_word = Dictogram(word_list).sample()
    sentence = [next_word]

    for _ in range(length - 1):
        chain = new_chain(word_list, next_word)
        if len(chain) == 0:
            # Dead end: the current word cannot change any more, so
            # rebuilding the same chain again would add nothing.
            # (Assumes new_chain is deterministic for identical arguments.)
            break
        next_word = chain.sample()
        sentence.append(next_word)

    return sentence
class MarkovChain():
    """First-order Markov chain over a list of words.

    ``self.nodes`` maps each distinct word to a ``Node`` that counts the
    words seen directly after it; ``self.heads`` is a ``Dictogram`` of the
    whole text used to pick the first word of a sentence.
    """

    def __init__(self, text):
        """Build the node table and the head histogram from ``text``."""
        self.nodes = self.generate_nodes(text)
        self.heads = Dictogram(text)

    def generate_nodes(self, text):
        """Return a dict mapping each word in ``text`` to its ``Node``.

        Every occurrence of a word that is followed by another word adds one
        count of that follower to the word's node. The final word of the
        text has no follower, so it contributes a node but no count.
        """
        nodes = {}
        last_index = len(text) - 1
        for index, this_word in enumerate(text):
            if this_word not in nodes:
                nodes[this_word] = Node(this_word)
            if index < last_index:  # a next word exists
                nodes[this_word].add_count(text[index + 1])
        return nodes

    def generate_sentence(self, num_words):
        """Generate a sentence of at most ``num_words`` words.

        The walk starts at a word sampled from ``self.heads`` and stops early
        when it reaches a node whose ``types`` is 0 (no recorded followers).

        Returns:
            The words joined by single spaces, with no trailing space.
        """
        words = []
        this_word = self.heads.sample()
        for _ in range(num_words):
            words.append(this_word)
            node = self.nodes[this_word]
            if node.types == 0:  # end node: nothing can follow
                break
            this_word = node.walk()
        # Joining once avoids the stray trailing space the old
        # ``if not i == num_words`` check (always true) used to leave behind.
        return ' '.join(words)
def random_walk(word_list, length):
    """Generate a list of at most ``length`` words with a random chain walk.

    The sentence starts with a word sampled from the histogram of
    ``word_list``. Each later word is sampled from the chain that
    ``new_chain`` builds for the word before it.

    Args:
        word_list: The corpus, as a sequence of words.
        length: Maximum number of words to produce.

    Returns:
        A list of words; empty when ``length`` is zero or negative, and
        shorter than ``length`` when a word with an empty chain is reached.
    """
    # Previously a non-positive length still yielded one word.
    if length <= 0:
        return []

    histogram = Dictogram(word_list)
    next_word = histogram.sample()
    sentence = [next_word]

    for _ in range(length - 1):
        chain = new_chain(word_list, next_word)
        if len(chain) == 0:
            # Nothing follows this word; further iterations would rebuild
            # the same empty chain (assuming new_chain is deterministic).
            break
        next_word = chain.sample()
        sentence.append(next_word)

    return sentence
def sample(self):
    """Return up to ``self.order`` chained words joined by single spaces.

    The first word is sampled from a ``Dictogram`` of ``self.word_list``;
    each further word is sampled from ``self.next_chain`` of the word before
    it. Collection stops early once a word has an empty chain.
    """
    picked = Dictogram(self.word_list).sample()
    collected = [picked]
    followers = self.next_chain(picked)

    for _ in range(self.order - 1):
        if len(followers) == 0:
            # An empty chain is never replaced, so no later pass could
            # add a word either.
            break
        picked = followers.sample()
        collected.append(picked)
        followers = self.next_chain(picked)

    return " ".join(collected)
class Markov_Chain(dict):
    """nth-order Markov chain stored as a dict of state -> ``Dictogram``.

    A state is a tuple of ``nth_order`` consecutive words. Each state maps
    to a ``Dictogram`` counting the states that directly follow it (the
    window shifted right by one word).
    """

    def __init__(self, word_list, nth_order=1):
        """Store the corpus and order; the chain itself is built lazily.

        Args:
            word_list: Raw input handed to ``create_list`` to obtain the words.
            nth_order: Number of words per state.
        """
        super().__init__()
        self.word_list = create_list(word_list)
        self.dictionary_histogram = Dictogram(self.word_list)
        self.nth_order = nth_order
        # Histogram of all states, used to pick a starting state.
        # None until create_chain() has found at least one transition.
        self.start_histogram = None

    def create_chain(self):
        """(Re)build the state-transition table from ``self.word_list``.

        Safe to call more than once: existing entries are discarded first so
        counts are never doubled.

        Raises:
            ValueError: If ``self.nth_order`` is smaller than 1.
        """
        order = self.nth_order  # was the undefined bare name ``nth_order``
        if order < 1:
            raise ValueError("nth_order must be at least 1")

        self.clear()
        words = self.word_list
        states = []
        # The last usable window starts at len(words) - order - 1, so the
        # shifted window below never runs past the end of the list.
        for index in range(len(words) - order):
            state = tuple(words[index:index + order])
            next_state = tuple(words[index + 1:index + order + 1])
            states.append(state)
            if state not in self:
                self[state] = Dictogram([next_state])
            else:
                self[state].add_count(next_state)

        self.start_histogram = Dictogram(states) if states else None

    def creating_sentence(self, length=10):
        """Generate up to ``length`` words by walking the chain.

        Builds the chain first if that has not happened yet. The walk starts
        from a sampled state and stops early at a state with no recorded
        successor.

        Returns:
            The words, each followed by a single space (so the result ends
            with a space), or ``""`` when ``length`` is zero or negative.
        """
        if length <= 0:
            return ""

        if self.start_histogram is None:
            self.create_chain()

        if self.start_histogram is None:
            # Corpus too short to contain a single transition: fall back to
            # one word sampled from the plain word histogram.
            words = [self.dictionary_histogram.sample()]
        else:
            state = self.start_histogram.sample()
            words = list(state[:length])
            while len(words) < length and state in self:
                state = self[state].sample()
                words.append(state[-1])  # only the newest word is new text

        return "".join(word + " " for word in words)
def walk(word_list, amount):
    '''Create a list of at most ``amount`` words by walking word chains.

    Starts off the sentence with a word sampled from the histogram of
    ``word_list``, then keeps sampling from the chain that ``next_chain``
    builds for the most recent word.

    word_list = list
    amount = int

    Returns a list of words. It is empty when ``amount`` is zero or
    negative, and shorter than ``amount`` when a word with an empty chain
    is reached.
    '''
    # Previously a non-positive amount still returned one word.
    if amount <= 0:
        return []

    main_histogram = Dictogram(word_list)
    next_word = main_histogram.sample()
    sentence = [next_word]

    for _ in range(amount - 1):
        chain = next_chain(word_list, next_word)
        if len(chain) == 0:
            # Dead end: the word cannot change, so asking for the same
            # chain again would not help (assumes next_chain is
            # deterministic for identical arguments).
            break
        next_word = chain.sample()
        sentence.append(next_word)

    return sentence
def test_sample():
    """Check that ``Dictogram.sample`` draws words in proportion to their counts.

    Draws 10,000 samples from a histogram of ``fish_words`` and asserts that
    each word's sampled frequency lies within 10% of its observed frequency.
    """
    source_hist = Dictogram(fish_words)

    # Histogram of 10,000 draws taken from the source histogram.
    draws = Dictogram([source_hist.sample() for _ in range(10000)])

    for word, count in source_hist.dictionary_histogram.items():
        expected = count / source_hist.tokens
        actual = draws.frequency(word) / draws.tokens
        # Accept anything from 90% to 110% of the observed frequency.
        assert expected * 0.9 <= actual <= expected * 1.1
class Markov_Chain(dict):
    """First-order Markov chain stored as a dict of word -> ``Dictogram``.

    Each key is a word that has at least one successor in the corpus; its
    value counts the words seen directly after it.
    """

    def __init__(self, word_list):
        """Record the corpus and build the word -> followers table.

        Args:
            word_list: Sequence of words (must support slicing).
        """
        super().__init__()
        self.word_list = word_list
        self.dictionary_histogram = Dictogram(self.word_list)

        # Pair every word with its successor. The last word has none, so it
        # only becomes a key if it also appears earlier in the corpus.
        for word, next_word in zip(word_list, word_list[1:]):
            if word not in self:
                self[word] = Dictogram([next_word])
            else:
                self[word].add_count(next_word)

    def creating_sentence(self, length=10):
        """Generate up to ``length`` words by walking the chain.

        The first word is sampled from the overall word histogram. The walk
        stops early when it reaches a word with no recorded successor
        (previously this raised ``KeyError``).

        Returns:
            The words, each followed by a single space (so the result ends
            with a space), or ``""`` when ``length`` is zero or negative.
        """
        if length <= 0:
            return ""

        current = self.dictionary_histogram.sample()
        words = [current]
        while len(words) < length and current in self:
            current = self[current].sample()
            words.append(current)

        return "".join(word + " " for word in words)
def order_sample(word_list, order=2):
    """Return up to ``order`` chained words from ``word_list`` as one string.

    A first word is sampled from the histogram of ``word_list``; every
    further word is sampled from ``new_chain`` of the word before it. The
    words are joined by single spaces. Fewer than ``order`` words are
    returned if a word with an empty chain is reached.
    """
    histogram = Dictogram(word_list)
    picked = []

    # Seed word, plus the chain of words that can follow it.
    current = histogram.sample()
    followers = new_chain(word_list, current)
    picked.append(current)

    for _ in range(order - 1):
        if len(followers) == 0:
            # The chain is only replaced after a successful sample, so an
            # empty chain stays empty for every remaining pass.
            break
        current = followers.sample()
        picked.append(current)
        followers = new_chain(word_list, current)

    return " ".join(picked)
class MarkovChain():
    """nth-order Markov chain over phrases delimited by start/stop tokens.

    A phrase is a tuple of ``order`` consecutive tokens. ``self.nodes`` maps
    each phrase to a ``Node`` counting the phrases that follow it, and
    ``self.starttokens`` counts the phrases that open a sentence.
    """

    def __init__(self, order=2, starttoken='!START', stoptoken='!STOP'):
        self.order = order  # number of tokens per phrase
        self.nodes = dict()  # phrase tuple -> Node
        self.starttokens = Dictogram()  # phrases that begin with STARTTOKEN
        self.stoptokens = Dictogram()  # not populated by any code in this class
        self.STARTTOKEN = starttoken
        self.STOPTOKEN = stoptoken

    def get_phrase(self, text_q):
        """Return the first ``self.order`` tokens of ``text_q`` as a tuple.

        The tokens are dequeued from a ``copy.copy`` of the queue rather than
        from ``text_q`` itself. A phrase that begins with the start token is
        also counted in ``self.starttokens``.
        """
        this_q = copy.copy(text_q)
        phrase = tuple(this_q.dequeue() for _ in range(self.order))
        # Only a phrase that *begins* with the start token can open a
        # sentence. A window such as (STOP, START) merely contains it, and
        # starting from one would end the sentence immediately.
        if len(phrase) > 0 and phrase[0] == self.STARTTOKEN:
            self.starttokens.add_count(phrase)
        return phrase

    def gen_nodes(self, text):
        '''Iterate across the tokens of ``text`` creating or updating nodes.'''
        text_q = Queue()
        for token in text:
            text_q.enqueue(token)

        if not text_q.length() > self.order:
            return  # too short to hold a phrase plus a follower

        # Each window is read exactly once and carried over to the next
        # pass, so start phrases are no longer counted twice.
        this_phrase = self.get_phrase(text_q)
        while text_q.length() > self.order:
            text_q.dequeue()
            next_phrase = self.get_phrase(text_q)
            if this_phrase not in self.nodes:
                self.nodes[this_phrase] = Node(this_phrase)
            if next_phrase:
                self.nodes[this_phrase].add_count(next_phrase)
            this_phrase = next_phrase

    def get_start(self):
        """Return the phrase a new sentence starts from.

        For order 1 this is a phrase walked from the start-token node;
        otherwise it is sampled from ``self.starttokens``.
        """
        if self.order == 1:
            return self.nodes[(self.STARTTOKEN,)].walk()
        return self.starttokens.sample()

    def gen_sentence(self):
        '''Generate a sentence, walking from a start phrase to a stop token.

        Returns the generated words, each followed by a single space, or an
        empty string when the start phrase already contains the stop token.
        '''
        pieces = []
        this_phrase = self.get_start()
        is_first = True
        while self.STOPTOKEN not in this_phrase:
            if is_first and len(this_phrase) > 0 and this_phrase[0] == self.STARTTOKEN:
                # Opening phrase: emit everything after the start token so
                # no words are lost when order > 2.
                fresh = this_phrase[1:]
            else:
                # Later phrases overlap the previous one; only the last
                # token is new text.
                fresh = this_phrase[self.order - 1:]
            is_first = False
            pieces.append(' '.join(fresh) + ' ')
            this_phrase = self.nodes[this_phrase].walk()
        # The old trailing join over ``this_phrase[slice:1]`` was always
        # empty, and raised UnboundLocalError when the loop never ran, so it
        # has been removed.
        return ''.join(pieces)