def histogram_as_dict(source_text_as_list):
    """Return histogram as a dict: {'one': 1, 'fish': 4}..."""
    histogram = Dictogram()
    for token in source_text_as_list:
        histogram.add_count(token)
    return histogram
def test_types(self):
    """Distinct word types stay fixed when known words are re-added."""
    histogram = Dictogram(self.fish_words)
    # The sample corpus holds exactly five distinct words.
    assert len(set(self.fish_words)) == 5
    assert histogram.types == 5
    # Counting every word a second time must not create new types.
    for token in self.fish_words:
        histogram.add_count(token)
    assert histogram.types == 5
def test_tokens(self):
    """Token total counts every occurrence and doubles on re-adding."""
    histogram = Dictogram(self.fish_words)
    # Eight word occurrences in the sample corpus.
    assert len(self.fish_words) == 8
    assert histogram.tokens == 8
    # Adding each word once more doubles the token total.
    for token in self.fish_words:
        histogram.add_count(token)
    assert histogram.tokens == 8 * 2
def test_types():
    """Distinct type count is stable under repeated add_count calls."""
    words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(words)
    # Five distinct words appear in the list.
    assert len(set(words)) == 5
    assert histogram.types == 5
    # Re-adding existing words must not change the type count.
    for token in words:
        histogram.add_count(token)
    assert histogram.types == 5
def test_tokens():
    """Total token count reflects every occurrence, including repeats."""
    words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(words)
    # Eight occurrences in the list.
    assert len(words) == 8
    assert histogram.tokens == 8
    # A second pass over the same words doubles the total.
    for token in words:
        histogram.add_count(token)
    assert histogram.tokens == 8 * 2
def add_count(self, word, previous_word, count=1):
    """Increase frequency count of given word by given count amount.

    When previous_word is non-empty, the word is tallied in a nested
    Dictogram stored under previous_word; seeing a previous_word for the
    first time creates that nested Dictogram and bumps self.types.
    The overall token total always grows by count.
    """
    if previous_word != '':
        try:
            inner = self[previous_word]
        except KeyError:
            # First sighting of this previous_word: fresh nested histogram.
            inner = Dictogram()
            self.types += 1
        inner.add_count(word)
        self[previous_word] = inner
    self.tokens += count
def markov_chain(list_of_values):
    """Build a third-order Markov chain from a list of words.

    Each key is a tuple of three consecutive words; its value is a
    Dictogram counting the words observed to follow that triple.
    """
    chain = Dictogram()
    # Last usable window starts at len - 4 (the follower sits at i + 3).
    for index in range(len(list_of_values) - 3):
        window = (list_of_values[index],
                  list_of_values[index + 1],
                  list_of_values[index + 2])
        follower = list_of_values[index + 3]
        if window in chain:
            # Known triple: bump the follower's count.
            chain[window].add_count(follower)
        else:
            # New triple: start a fresh follower histogram.
            histogram = Dictogram()
            histogram.add_count(follower)
            chain[window] = histogram
    return chain
class MarkovChain(Dictogram):
    """First-order Markov chain built from a word list.

    Keys are lowercased words; each value is a Dictogram of the words
    observed to follow that key.  ``start_tokens`` tallies words that
    begin sentences (the first word, and any capitalized word preceded
    by punctuation) and ``stop_tokens`` tallies words that end with
    punctuation.
    """

    def __init__(self, word_list):
        super().__init__()
        self.start_tokens = Dictogram()  # sentence-starting words
        self.stop_tokens = Dictogram()   # sentence-ending words
        # The very first word always starts a sentence.
        word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
        self.start_tokens.add_count(word_list[0].lower(), 1)
        # A capitalized word whose predecessor ends in punctuation
        # starts a sentence.
        for i in range(1, len(word_list) - 1):
            if (word_list[i][0].isupper()
                    and word_list[i - 1][-1] in string.punctuation):
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                self.start_tokens.add_count(word_list[i].lower(), 1)
        # A word ending in punctuation ends a sentence (stored stripped
        # of non-letters, un-lowercased, as in the original).
        for i in range(len(word_list)):
            if word_list[i][-1] in string.punctuation:
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                self.stop_tokens.add_count(word_list[i], 1)
        # Record every word -> next-word transition.
        for i in range(len(word_list) - 1):
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            word_list[i + 1] = re.sub("[^a-zA-Z]", '', word_list[i + 1])
            key = word_list[i].lower()
            # BUG FIX: membership was tested with the un-lowercased word
            # while keys are stored lowercased, so entries reached via a
            # capitalized word were clobbered with a fresh Dictogram,
            # discarding previously accumulated counts.
            if key in self:
                self[key].add_count(word_list[i + 1].lower(), 1)
            else:
                self[key] = Dictogram([word_list[i + 1].lower()])

    def random_walk(self, length=10):
        """Return a sentence of roughly ``length`` words sampled from
        the chain, capped with an end word and a period."""
        word = self.start_word()
        sentence = word + " "
        word = word.lower()
        for _ in range(length - 1):
            word = self[word].sample()
            sentence += word + " "
        return sentence + self.end_word() + ". "

    def start_word(self):
        """Return a sentence-starting word, capitalized, sampled with
        probability proportional to its observed count.

        BUG FIX: the dart was drawn from randint(0, len(start_tokens)+1)
        (number of distinct keys, not total tokens) and the fence summed
        *every* key's count for each candidate via a nested loop, so the
        first key was returned almost always.  Now the dart is drawn
        from the total token count and the fence accumulates one key at
        a time.  start_tokens is never empty (the first word is always
        recorded), so randrange's total is >= 1.
        """
        total = sum(self.start_tokens[key] for key in self.start_tokens)
        dart = random.randrange(total)
        fence = 0
        for word in self.start_tokens:
            fence += self.start_tokens[word]
            if fence > dart:
                return word.capitalize()

    def end_word(self):
        """Return a sentence-ending word sampled with probability
        proportional to its observed count.

        BUG FIX: same broken dart/fence scheme as start_word, wrapped in
        a ``while 1`` that could spin forever when the dart exceeded the
        reachable fence.  Replaced with a single correctly weighted pass.
        """
        total = sum(self.stop_tokens[key] for key in self.stop_tokens)
        dart = random.randrange(total)
        fence = 0
        for word in self.stop_tokens:
            fence += self.stop_tokens[word]
            if fence > dart:
                return word
def test_add_count(self):
    """add_count with explicit counts updates frequencies, types, tokens."""
    histogram = Dictogram(self.fish_words)
    # Bump three existing words and introduce one new word ('food').
    for word, count in [('two', 2), ('blue', 3), ('fish', 4), ('food', 5)]:
        histogram.add_count(word, count)
    # Every word's frequency reflects its original plus added counts.
    expected = {'one': 1, 'two': 3, 'red': 1,
                'blue': 4, 'fish': 8, 'food': 5}
    for word, frequency in expected.items():
        assert histogram.frequency(word) == frequency
    # Exactly one new distinct type was introduced.
    assert histogram.types == 6
    # Tokens grew by 2 + 3 + 4 + 5 = 14 on top of the original 8.
    assert histogram.tokens == 8 + 14
class MarkovChain(Dictogram):
    """Nth-order Markov chain over a word list.

    For order > 1 keys are tuples of ``order`` consecutive lowercased
    words; for order 1 keys are single lowercased words.  Each value is
    a Dictogram of observed follower words.  ``start_tokens`` tallies
    phrases that begin sentences and ``stop_tokens`` tallies words that
    end with punctuation.

    NOTE(review): ``word_list`` is mutated in place (re.sub strips
    non-letters as a side effect) — callers should not reuse the list.
    """

    def __init__(self, word_list, order):
        # NOTE(review): super().__init__() is commented out, so any
        # state the Dictogram base normally sets up is skipped — confirm
        # this is intentional.
        # super().__init__()
        self.order = order                  # chain order (words per key)
        self.start_tokens = Dictogram()     # sentence-starting phrases
        self.stop_tokens = Dictogram()      # sentence-ending words
        ##### for first order MarkovChain
        # word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
        # self.start_tokens.add_count(word_list[0].lower(), 1)
        # Pass 1: a capitalized word following punctuation starts a
        # sentence; record the ``order``-word phrase beginning there.
        for i in range(1, len(word_list) - 1, 1):
            try:
                if ((word_list[i][0].isupper()) and word_list[i - 1][len(word_list[i - 1]) - 1] in string.punctuation):
                    # word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                    if self.order > 1:
                        temp = list()
                        for j in range(self.order):
                            word_list[i + j] = re.sub("[^a-zA-Z]", '', word_list[i + j])
                            temp.append(word_list[i + j].lower())
                        if len(temp) > 1:
                            temp = tuple(temp)
                    else:
                        word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                        temp = word_list[i].lower()
                    self.start_tokens.add_count(temp, 1)
            # NOTE(review): bare except silently swallows IndexError from
            # empty strings and out-of-range i + j — confirm acceptable.
            except:
                pass
        # Pass 2: a word ending in punctuation ends a sentence.
        for i in range(len(word_list)):
            try:
                if (word_list[i][len(word_list[i]) - 1] in string.punctuation):
                    word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                    # word_list[i] = word_list[i][:len(word_list[i])-1]
                    self.stop_tokens.add_count(word_list[i], 1)
            except:
                pass
        # Pass 3: record each order-word phrase -> following word.
        for i in range(len(word_list) - self.order):
            if self.order > 1:
                temp = list()
                for j in range(self.order):
                    word_list[i + j] = re.sub("[^a-zA-Z]", '', word_list[i + j])
                    temp.append(word_list[i + j].lower())
                if len(temp) > 1:
                    temp = tuple(temp)
            else:
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                temp = word_list[i].lower()
            if temp in self:
                self[temp].add_count(word_list[i + self.order].lower(), 1)
            else:
                self[temp] = Dictogram([word_list[i + self.order].lower()])

    def random_walk(self, length=10):
        """Generate a sentence of roughly ``length`` words: a sampled
        start phrase, then repeated follower samples, then an end word.
        """
        sentence = ""
        # print("55")
        # print(self.start_word())
        if self.order > 1:
            # str() renders the space-joined phrase built by start_word.
            sentence = str(self.start_word()).capitalize() + " "
        else:
            sentence = self.start_word().capitalize() + " "
        for i in range(length - self.order - 1):
            next_word = self.sample(sentence)
            sentence += next_word + " "
        sentence += self.end_word() + "."
        return sentence

    def start_word(self):
        """Sample a sentence-starting phrase weighted by its count.

        Returns a space-joined string of the phrase's words when
        order > 1, otherwise the single start word.
        """
        # Total token count across all start phrases.
        tokens = 0
        for elm in self.start_tokens:
            tokens += self.start_tokens[elm]
        dart = random.randrange(0, tokens)
        # Walk the cumulative fence until it passes the dart.
        fence = 0
        for elm in self.start_tokens:
            fence += self.start_tokens[elm]
            if fence > dart:
                if self.order > 1:
                    sentence = ""
                    for i in range(len(elm)):
                        sentence += elm[i] + " "
                    return sentence
                else:
                    return elm

    def end_word(self):
        """Sample a sentence-ending word weighted by its count."""
        tokens = 0
        for elm in self.stop_tokens:
            tokens += self.stop_tokens[elm]
        dart = random.randrange(0, tokens)
        fence = 0
        for elm in self.stop_tokens:
            fence += self.stop_tokens[elm]
            if fence > dart:
                return elm

    def sample(self, sentence):
        """Sample the next word given the tail of ``sentence``.

        NOTE(review): for order > 1 the key is built by iterating the
        sentence tail BACKWARDS, producing a reversed tuple relative to
        the forward-order keys stored in __init__ — looks like a bug;
        confirm against real output.
        NOTE(review): for order > 1 this returns key[len(key) - 1] (the
        last word of the lookup key) instead of the sampled follower
        ``elm`` — also looks like a bug; confirm intent.
        """
        sentence = sentence.split()
        sentence[0] = sentence[0].lower()
        if self.order > 1:
            key = list()
            for i in range(
                    len(sentence) - 1, len(sentence) - 1 - self.order, -1):
                key.append(sentence[i])
            key = tuple(key)
        else:
            key = sentence[len(sentence) - 1]
        tokens = 0
        for elm in self[key]:
            tokens += self[key][elm]
        # NOTE(review): randint is inclusive of ``tokens`` here, unlike
        # the randrange calls above — confirm the off-by-one is wanted.
        dart = random.randint(0, tokens)
        fence = 0
        for elm in self[key]:
            fence += self[key][elm]
            if fence >= dart:
                if (self.order > 1):
                    return key[len(key) - 1]
                else:
                    return elm
class MarkovChain(dict):
    """Markov chain mapping word keys to Dictograms of follower words.

    The constructor compiles every sentence in second order: keys are
    (word, word) tuples and the sentinel '###' marks end-of-sentence.
    ``sentenceStarters`` tallies the first two words of each sentence.
    """

    def __init__(self, sentences, order=1):
        super(MarkovChain, self).__init__()
        self.sentenceStarters = Dictogram()  # (first, second) word pairs
        for s in sentences:
            self.compile2(s)

    # First Order
    def compile(self, sentence):
        """Add first-order (word -> next word) transitions for sentence."""
        words = sentence.split(' ')
        for i in range(len(words) - 1):
            if words[i] not in self:
                self[words[i]] = Dictogram()
            self[words[i]].add_count(words[i + 1])

    # Second Order
    def compile2(self, sentence):
        """Add second-order ((w1, w2) -> w3) transitions for sentence."""
        # Split sentence by spaces, remove empty strings, add '###' to the end
        words = list(filter(lambda w: w != '', sentence.split(' '))) + ['###']
        # Ensures this sentence can be evaluated in second order
        if len(words) > 2:
            # Add tuple of first two words to our sentence starters dictogram
            self.sentenceStarters.add_count((words[0], words[1]))
            # Loop through indices of words and create 2nd order markov chain
            for i in range(len(words) - 2):
                newKey = (words[i], words[i + 1])
                if newKey not in self:
                    self[newKey] = Dictogram()
                self[newKey].add_count(words[i + 2])

    # Nth Order
    def compileN(self, sentence, n):
        """Add nth-order (n-word tuple -> next word) transitions."""
        words = list(filter(lambda w: w != '', sentence.split(' ')))
        if len(words) > n:
            for i in range(len(words) - n):
                newKey = tuple(words[i:i + n])
                # Only keep keys that do not straddle a sentence boundary.
                if len([w for w in newKey if w[-1] not in '!?.']) == n:
                    if newKey not in self:
                        self[newKey] = Dictogram()
                    self[newKey].add_count(words[i + n])

    def probableWordFrom(self, dictogram):
        '''Pick a random word from a histogram, weighted by its counts.

        BUG FIX: the dart was previously random.randint(0, accumulator),
        which spans accumulator + 1 outcomes and lets 0 always land on
        the first word, over-representing it.  Drawing from
        [1, accumulator] gives each word probability weight / total.
        Assumes at least one entry with a positive total weight.
        '''
        words, weights = zip(*dictogram.items())
        # Cumulative weights act as separators between the words:
        # word i owns the interval (separators[i-1], separators[i]].
        accumulator, separators = 0, []
        for weight in weights:
            accumulator += weight
            separators.append(accumulator)
        rand = random.randint(1, accumulator)
        for index, separator in enumerate(separators):
            if rand <= separator:
                return words[index]

    def makeSentence(self):
        """Generate one sentence by walking the second-order chain from a
        sampled starter pair until the '###' stop marker appears."""
        words = list(self.probableWordFrom(self.sentenceStarters))
        newWord = self.probableWordFrom(self[(words[-2], words[-1])])
        while newWord != '###':
            words.append(newWord)
            newWord = self.probableWordFrom(self[(words[-2], words[-1])])
        return ' '.join(words)
class MarkovChain():
    """Order-n Markov chain over phrases, backed by Queue and Node helpers.

    ``nodes`` maps each ``order``-word phrase tuple to a Node whose
    ``walk()`` samples a following phrase.  ``starttokens`` tallies
    phrases containing the start sentinel.

    NOTE(review): ``stoptokens`` is initialized but never written in the
    code visible here — confirm it is populated elsewhere or dead.
    """

    def __init__(self, order=2, starttoken='!START', stoptoken='!STOP'):
        self.order = order  # number of orders to generate the chain with
        self.nodes = dict()               # phrase tuple -> Node
        self.starttokens = Dictogram()    # phrases containing STARTTOKEN
        self.stoptokens = Dictogram()     # see class NOTE: never filled here
        self.STARTTOKEN = starttoken      # sentinel marking sentence start
        self.STOPTOKEN = stoptoken        # sentinel marking sentence end

    def get_phrase(self, text_q):
        """Read the next ``order`` words from a copy of the queue and
        return them as a tuple.

        Side effect: phrases containing STARTTOKEN are tallied into
        ``starttokens``.  NOTE(review): copy.copy is a shallow copy —
        whether dequeuing the copy leaves ``text_q`` untouched depends
        on Queue's internal structure; confirm.
        """
        phrase = ()  # the n words gathered so far
        this_q = copy.copy(text_q)
        for i in range(self.order):
            # generates the 'phrase' based off of the order which dictates the number of words we look at
            this_word = (this_q.dequeue(),)  # stores the word we're currently looking at
            phrase += this_word
        if self.STARTTOKEN in phrase:
            self.starttokens.add_count(phrase)
        return phrase

    def gen_nodes(self, text):
        '''iterates across list of words creating or modifying nodes'''
        text_q = Queue()
        for token in text:
            text_q.enqueue(token)
        # Slide a one-word step across the text: current phrase, then the
        # phrase starting one word later.
        while text_q.length() > self.order:
            # for each first word in the text we're analysing
            this_phrase = self.get_phrase(text_q)
            text_q.dequeue()
            next_phrase = self.get_phrase(text_q)
            if this_phrase in self.nodes.keys():
                # if the phrase has already been added as a key
                if next_phrase:
                    self.nodes[this_phrase].add_count(next_phrase)  # add a token of the next phrase
            else:
                self.nodes[this_phrase] = Node(this_phrase)  # if not we create a new node
                if next_phrase:
                    self.nodes[this_phrase].add_count(next_phrase)  # add a token of the next phrase

    def get_start(self):
        """Return an initial phrase: for order 1, sample the STARTTOKEN
        node directly; otherwise sample a recorded start phrase."""
        if self.order == 1:
            return self.nodes[(self.STARTTOKEN),].walk()
        return self.starttokens.sample()

    def gen_sentence(self):
        '''generates a sentence starting with a start token'''
        sentence = str()
        this_phrase = self.get_start()  # start with the start token
        while not self.STOPTOKEN in this_phrase:  # while we don't run into a stop token
            # NOTE(review): ``slice`` shadows the builtin and is only
            # bound inside this loop — if the first phrase already
            # contains STOPTOKEN, the tail below raises NameError for
            # order != 1; confirm intended.
            slice = self.order - 1
            sentence += ' '.join(this_phrase[slice:]) + ' '  # joins phrase (excluding the first word) into a string
            this_phrase = self.nodes[this_phrase].walk()  # samples the current node for the next word
        if not self.order == 1:
            # NOTE(review): the comment says "excluding the last word",
            # but this_phrase[slice:1] is empty for order >= 2
            # (e.g. [1:1]) — possible bug; confirm intent.
            sentence += ' '.join(this_phrase[slice:1])  # joins phrase (excluding the last word) into a string
        return sentence