def generate_random_sentence(self, length, start_sequence):
    """Generate a sentence from the Markov model.

    :param length: number of end-of-sentence symbols to emit before stopping
        (the loop counts occurrences of ``self.tokenizer.end_symbol``, not
        plain words)
    :type length: int
    :param start_sequence: space-separated integer token ids seeding the chain
    :type start_sequence: str
    :return: generated token ids, seed included
    :rtype: list
    """
    # Seed window: parse the space-separated ids into an int tuple.
    current_word_sequence = tuple(map(int, start_sequence.split()))
    sentence = list(current_word_sequence)
    sentence_num = 0
    while sentence_num < length:
        # Histogram of followers for the current n-gram window.
        current_dictogram = Dictogram(
            self.get_sequence(' '.join(map(str, current_word_sequence))))
        random_weighted_word = current_dictogram.return_weighted_random_word(
        )
        # Slide the window: drop the oldest token, append the sampled one.
        current_word_sequence = current_word_sequence[1:] + tuple(
            [random_weighted_word])
        sentence.append(current_word_sequence[-1])
        # Only completed sentences count toward `length`.
        if current_word_sequence[-1] == self.tokenizer.end_symbol:
            sentence_num += 1
    return sentence
def populate_chain(self, nth_order):
    """Build ``self.chain`` mapping each nth-order word window (as a
    space-joined string) to a Dictogram of the window that follows it.

    Routes every word through ``self.transition`` (a FIFO queue) on its way
    into the key/value strings; the queue is drained within each inner
    iteration, so it ends each loop pass empty.
    """
    self.chain = Dictogram()
    # Number of windows that fit in the corpus.
    Nth_len = self.total_words - nth_order
    # print("Total words: {}, {} words for Nth_order: {} word pairs".format(self.total_words, Nth_len, nth_order))
    for i in range(Nth_len):
        """Look through available words in the corpus relative to the window size"""
        cur = ""
        nxt = ""
        for pos in range(1, nth_order + 1):
            # Enqueue the current-window word then the next-window word;
            # FIFO order means the two get() calls below retrieve them in
            # the same order.
            self.transition.put(self.corpus[i + (pos - 1)])
            self.transition.put(self.corpus[i + pos])
            if pos > 1:
                cur += " "
                nxt += " "
            cur += self.transition.get()
            nxt += self.transition.get()
        # print("Current pair: {} -->\t Next pair: {}\n".format(cur, nxt))
        if self.chain.get(cur, None) is None:
            # print("\n--Adding NEW--")
            # First sighting of this window: start its follower histogram.
            self.chain[cur] = Dictogram([nxt])
        else:
            # print("\n--Adding to Count--")
            self.chain[cur].add_count(nxt)
    # print(self.chain)
    return self.chain
def create_markov_chain(self, data_list):
    """Build a mixed-order chain over ``data_list``.

    Keys are: the literal strings 'START'/'STOP' for sentinel tokens, a
    2-tuple right after a 'START', and a 3-tuple otherwise; each key maps
    to a Dictogram of the word that follows the keyed position.
    Note that ``word`` lags the loop by design: on iteration ``index`` the
    key describes words up to ``data_list[index]`` and the counted follower
    is ``data_list[index + 1]``.
    """
    markov_chain = {}
    markov_chain['START'] = Dictogram()
    # Seed: the very first token always follows 'START'.
    markov_chain['START'].add_count(data_list[0])
    word = None
    middle_word = None
    for index in range(0, len(data_list) - 1):
        touple_key = None
        # Shift the 3-word window one step.
        last_word = middle_word
        middle_word = word
        word = data_list[index]
        if word == 'START' or word == 'STOP':
            # Sentinels are keyed by themselves, order-0.
            touple_key = word
        else:
            if middle_word == "START":
                # Only one real word is available after a sentence start.
                touple_key = (middle_word, word)
            else:
                touple_key = (last_word, middle_word, word)
        if touple_key is None:
            pass
        else:
            if touple_key not in markov_chain:
                markov_chain[touple_key] = Dictogram()
            markov_chain[touple_key].add_count(data_list[index + 1])
    return markov_chain
def __init__(self, word_list, nth_order = 1):
    """Initialize the chain: tokenize the input, build its histogram, and
    remember the requested order."""
    self.word_list = create_list(word_list)
    self.dictionary_histogram = Dictogram(self.word_list)
    self.nth_order = nth_order

""" Creating the Markov Chain """
def create_chain(self):
    """Populate self with (word, next_word) -> Dictogram of
    (next_word, word_after_next) transitions.

    Fixes vs. the original:
    - removed a stray ``pass`` statement that preceded the loop (dead code);
    - the loop bound referenced the unbound local ``nth_order`` (NameError);
      since the body always looks ahead exactly two words, the correct bound
      is ``len(self.word_list) - 2`` regardless of ``self.nth_order`` —
      this also stops the IndexError the old bound caused for order 1.
    """
    for index in range(len(self.word_list) - 2):
        word = self.word_list[index]
        next_word = self.word_list[index + 1]
        word_after_next = self.word_list[index + 2]
        key = (word, next_word)
        if key not in self:
            # First sighting of this bigram: start its follower histogram.
            self[key] = Dictogram([(next_word, word_after_next)])
        else:
            self[key].add_count((next_word, word_after_next))
def create_histograms(self, word_list):
    """Build the second-order Markov structure from ``word_list``.

    Each (prev_word, current_word) window maps to a Dictogram of the words
    observed immediately after it; the special 'END' key holds the
    sentence-start histogram supplied by ``self.generate_start``.
    """
    histograms = {'END': Dictogram(self.generate_start(word_list))}
    # Walk every position that still has two words ahead of it.
    for start in range(len(word_list) - 2):
        window = (word_list[start], word_list[start + 1])
        follower = word_list[start + 2]
        if window in histograms:
            # Known window: bump the follower's count.
            histograms[window].add_count(follower)
        else:
            # New window: seed its histogram with this follower.
            histograms[window] = Dictogram([follower])
    return histograms
def __init__(self, word_list):
    """Build a first-order chain plus start/stop histograms.

    NOTE: mutates ``word_list`` in place (punctuation is stripped as the
    passes run), so the caller's list is altered.
    """
    super().__init__()
    self.start_tokens = Dictogram()  # words that can begin a sentence
    self.stop_tokens = Dictogram()   # words that can end a sentence
    ##### for first order MarkovChain
    # The very first word always counts as a sentence starter.
    word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
    self.start_tokens.add_count(word_list[0].lower(), 1)
    for i in range(1, len(word_list)-1, 1):
        # A capitalized word right after punctuation starts a sentence.
        if((word_list[i][0].isupper()) and word_list[i-1][len(word_list[i-1])-1] in string.punctuation):
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            self.start_tokens.add_count(word_list[i].lower(), 1)
    for i in range(len(word_list)):
        # A word that ends with punctuation ends a sentence.
        if(word_list[i][len(word_list[i])-1] in string.punctuation):
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            # word_list[i] = word_list[i][:len(word_list[i])-1]
            self.stop_tokens.add_count(word_list[i], 1)
    for i in range(len(word_list)-1):
        word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
        word_list[i+1] = re.sub("[^a-zA-Z]", '', word_list[i+1])
        # NOTE(review): membership is tested on the unlowered word while
        # keys are stored lowercased — capitalized repeats may re-create
        # the Dictogram instead of adding a count; confirm intent.
        if word_list[i] in self:
            self[word_list[i].lower()].add_count(word_list[i+1].lower(), 1)
        else:
            self[word_list[i].lower()] = Dictogram([word_list[i+1].lower()])
def __init__(self, word_list=None):
    """Build a second-order Markov chain keyed by (word, word) bigrams.

    Each bigram key maps to a Dictogram whose entries count the words seen
    immediately after that bigram; ``tokens`` tracks the running total.
    """
    super(Markov, self).__init__()
    if word_list is not None:
        # Prime the window with the first two words.
        tuple_key = (word_list[0], word_list[1])
        last_word = word_list[1]
        self[tuple_key] = Dictogram()
        # Consume the remainder of the list, one follower at a time.
        for index, word in enumerate(word_list[2:]):
            histogram = self[tuple_key]
            if word in histogram:
                histogram[word] += 1
            else:
                histogram[word] = 1
            histogram.tokens += 1
            # Advance the bigram window and make sure its slot exists.
            new_key = (last_word, word)
            if new_key not in self:
                self[new_key] = Dictogram()
            tuple_key = new_key
            last_word = word
def dictogram(words):
    """Map each word to a Dictogram counting the words that follow it."""
    hist = Dictogram()
    # Pair every word with its immediate successor.
    for current, follower in zip(words, words[1:]):
        if current not in hist:
            hist[current] = Dictogram()
        hist[current].add_count(follower)
    return hist
def __init__(self, words_list=None):
    """Create the chain, optionally populating it from ``words_list``."""
    super(MarkovChain, self).__init__()
    if words_list is not None:
        self.create_markov_chain(words_list)
    # NOTE(review): these seeds overwrite any 'start'/'end' entries the
    # chain build may have produced — confirm that fixed defaults
    # ('the' / '.') are the intent.
    self['start'] = Dictogram(['the'])
    self['end'] = Dictogram(['.'])
def histogram_as_dict(source_text_as_list):
    """Count word frequencies, e.g. {'one': 1, 'fish': 4}."""
    histogram = Dictogram()
    for token in source_text_as_list:
        histogram.add_count(token)
    return histogram
def test_frequency():
    """frequency() reports the exact count for every distinct word."""
    dictogram = Dictogram(fish_words)
    expected = {'one': 1, 'two': 1, 'red': 1, 'blue': 1, 'fish': 4}
    for word, count in expected.items():
        assert dictogram.frequency(word) == count
def __init__(self, order=2, starttoken='!START', stoptoken='!STOP'):
    """Set up an empty chain of the given order with start/stop sentinels."""
    self.STARTTOKEN = starttoken   # sentinel marking sentence starts
    self.STOPTOKEN = stoptoken     # sentinel marking sentence ends
    self.order = order             # window size used to build the chain
    self.nodes = dict()            # state -> follower histogram
    self.starttokens = Dictogram() # histogram of sentence-starting words
    self.stoptokens = Dictogram()  # histogram of sentence-ending words
def test_types(self):
    """types counts distinct words and is unchanged by re-adding them."""
    histogram = Dictogram(self.fish_words)
    distinct = len(set(self.fish_words))
    assert distinct == 5
    assert histogram.types == distinct
    # Re-adding existing words must not create new types.
    for word in self.fish_words:
        histogram.add_count(word)
    assert histogram.types == distinct
def test_tokens(self):
    """tokens counts every occurrence and doubles when words are re-added."""
    histogram = Dictogram(self.fish_words)
    total = len(self.fish_words)
    assert total == 8
    assert histogram.tokens == total
    # Adding each word once more doubles the token count.
    for word in self.fish_words:
        histogram.add_count(word)
    assert histogram.tokens == total * 2
def test_entries(self):
    """The histogram matches both its dict and item-list representations."""
    dictogram = Dictogram(self.fish_words)
    # As a dictionary of {word: count} entries, order-insensitive.
    assert len(dictogram) == 5
    self.assertCountEqual(dictogram, self.fish_dict)
    # As a list of (word, count) pairs, order-insensitive.
    entries = dictogram.items()
    assert len(entries) == 5
    self.assertCountEqual(entries, self.fish_list)
def create_markov_chain(self, data_list):
    """First-order chain: each word maps to a Dictogram of its followers;
    the 'START' key records the first word of the list."""
    markov_chain = {'START': Dictogram()}
    markov_chain['START'].add_count(data_list[0])
    # Pair every word with its immediate successor.
    for current, follower in zip(data_list, data_list[1:]):
        if current not in markov_chain:
            markov_chain[current] = Dictogram()
        markov_chain[current].add_count(follower)
    return markov_chain
def test_types():
    """types counts distinct words and is unchanged by re-adding them."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(fish_words)
    distinct = len(set(fish_words))
    assert distinct == 5
    assert histogram.types == distinct
    # Re-adding existing words must not create new types.
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.types == distinct
def build_state_histogram(self, words_list):
    """Slide an n-word window over ``words_list`` and record, for each
    window state (as a tuple), a Dictogram of the word that follows it;
    the final window gets a '**STOP**' sentinel.

    Fixes vs. the original:
    - membership was tested as ``tokens not in self`` — that checks the
      Queue object itself, never the tuple key, so every iteration
      re-created the Dictogram and wiped the accumulated counts;
    - the bare ``except`` is narrowed to IndexError (the only expected
      failure: ``index + 1`` running past the end of the list).
    """
    tokens = Queue(words_list[0:self.n])
    for index in range(self.n - 1, len(words_list)):
        state = tuple(tokens)
        if state not in self:
            self[state] = Dictogram()
        try:
            self[state].add_count(words_list[index + 1])
            tokens.enqueue(words_list[index + 1])
        except IndexError:
            # Last window: mark it as a terminal state.
            self[state] = Dictogram(['**STOP**'])
        tokens.dequeue()
def add_count(self, word, previous_word, count=1):
    """Increase frequency count of given word by given count amount."""
    if previous_word != '':
        # Create the follower histogram on first sight of previous_word;
        # types counts the distinct predecessor keys.
        if previous_word not in self:
            self[previous_word] = Dictogram()
            self.types += 1
        self[previous_word].add_count(word)
    self.tokens += count
def __init__(self, word_list, order):
    """Build an nth-order chain (keys are single words for order 1, tuples
    of ``order`` words otherwise) plus start/stop histograms.

    NOTE: mutates ``word_list`` in place (punctuation stripped as passes
    run); the bare ``except`` clauses swallow index errors near the list
    boundaries — presumably IndexError only, but anything else is silently
    dropped too (TODO confirm and narrow).
    """
    # super().__init__()
    self.order = order
    self.start_tokens = Dictogram()  # words/tuples that can start a sentence
    self.stop_tokens = Dictogram()   # words that can end a sentence
    ##### for first order MarkovChain
    # word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
    # self.start_tokens.add_count(word_list[0].lower(), 1)
    for i in range(1, len(word_list) - 1, 1):
        try:
            # A capitalized word right after punctuation starts a sentence.
            if ((word_list[i][0].isupper()) and word_list[i - 1][len(word_list[i - 1]) - 1] in string.punctuation):
                # word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                if self.order > 1:
                    # Collect the next `order` words into a tuple key.
                    temp = list()
                    for j in range(self.order):
                        word_list[i + j] = re.sub("[^a-zA-Z]", '', word_list[i + j])
                        temp.append(word_list[i + j].lower())
                    if len(temp) > 1:
                        temp = tuple(temp)
                else:
                    word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                    temp = word_list[i].lower()
                self.start_tokens.add_count(temp, 1)
        except:
            pass
    for i in range(len(word_list)):
        try:
            # A word ending with punctuation ends a sentence.
            if (word_list[i][len(word_list[i]) - 1] in string.punctuation):
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                # word_list[i] = word_list[i][:len(word_list[i])-1]
                self.stop_tokens.add_count(word_list[i], 1)
        except:
            pass
    for i in range(len(word_list) - self.order):
        if self.order > 1:
            # Key is the tuple of the next `order` lowered words.
            temp = list()
            for j in range(self.order):
                word_list[i + j] = re.sub("[^a-zA-Z]", '', word_list[i + j])
                temp.append(word_list[i + j].lower())
            if len(temp) > 1:
                temp = tuple(temp)
        else:
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            temp = word_list[i].lower()
        # Count the word that follows the window.
        if temp in self:
            self[temp].add_count(word_list[i + self.order].lower(), 1)
        else:
            self[temp] = Dictogram([word_list[i + self.order].lower()])
def test_tokens():
    """tokens counts every occurrence and doubles when words are re-added."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(fish_words)
    total = len(fish_words)
    assert total == 8
    assert histogram.tokens == total
    # Adding each word once more doubles the token count.
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.tokens == total * 2
def add_token(self, current_type, next_type):
    """Record that ``next_type`` was observed right after ``current_type``."""
    if self.empty:
        # First token pair ever: clear the flag and seed the histogram.
        self.empty = False
        self[current_type] = Dictogram([next_type])
    elif current_type in self:
        # Known state: bump the follower's count.
        self[current_type].add_count(next_type)
    else:
        # New state: start its follower histogram.
        self[current_type] = Dictogram([next_type])
def walk(word_list, length):
    """Generate up to ``length`` words by repeatedly sampling the chain
    built off the previously sampled word."""
    seed = Dictogram(word_list)
    current = seed.sample()
    sentence = [current]
    for _ in range(length - 1):
        chain = new_chain(word_list, current)
        # Dead-end states (empty chain) simply stop contributing words.
        if len(chain) > 0:
            current = chain.sample()
            sentence.append(current)
    return sentence
def walk(self, num_words):
    """Sample ``num_words`` words and join them into a sentence string."""
    words = []
    histogram = Dictogram(self.word_list)
    next_word = histogram.sample()
    words.append(next_word)
    for i in range(num_words - 1):
        if len(self.markov_chain) > 0:
            # NOTE(review): this samples the flat word histogram every
            # time and only *checks* self.markov_chain — it never follows
            # the chain, so output is order-0. Looks like it should sample
            # from the chain keyed by next_word; confirm intent.
            next_word = histogram.sample()
            words.append(next_word)
    sentence = ' '.join(words)
    return sentence
def test_entries():
    """The histogram matches both its dict and item-list representations."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    fish_list = [('one', 1), ('fish', 4), ('two', 1), ('red', 1), ('blue', 1)]
    fish_dict = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}
    case = unittest.TestCase()
    dictogram = Dictogram(fish_words)
    # As a dictionary of {word: count} entries, order-insensitive.
    assert len(dictogram) == 5
    case.assertCountEqual(dictogram, fish_dict)
    # As a list of (word, count) pairs, order-insensitive.
    entries = dictogram.items()
    assert len(entries) == 5
    case.assertCountEqual(entries, fish_list)
class MarkovChain(Dictogram):
    """First-order Markov chain over a word list, with weighted start/stop
    word histograms.

    Fixes vs. the original: ``start_word``/``end_word`` drew the dart from
    ``len(tokens) + 1`` (the number of *distinct* words) instead of the
    total occurrence count, and accumulated the fence inside a redundant
    nested loop — so the selection was effectively arbitrary, ``start_word``
    could return None even on non-empty input, and ``end_word`` could spin
    forever. Both now do a standard cumulative-count weighted sample.
    """

    def __init__(self, word_list):
        """Build the chain in place. NOTE: mutates ``word_list``."""
        super().__init__()
        self.start_tokens = Dictogram()  # words that can begin a sentence
        self.stop_tokens = Dictogram()   # words that can end a sentence
        ##### for first order MarkovChain
        # The very first word always counts as a sentence starter.
        word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
        self.start_tokens.add_count(word_list[0].lower(), 1)
        for i in range(1, len(word_list)-1, 1):
            # A capitalized word right after punctuation starts a sentence.
            if((word_list[i][0].isupper()) and word_list[i-1][len(word_list[i-1])-1] in string.punctuation):
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                self.start_tokens.add_count(word_list[i].lower(), 1)
        for i in range(len(word_list)):
            # A word that ends with punctuation ends a sentence.
            if(word_list[i][len(word_list[i])-1] in string.punctuation):
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                # word_list[i] = word_list[i][:len(word_list[i])-1]
                self.stop_tokens.add_count(word_list[i], 1)
        for i in range(len(word_list)-1):
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            word_list[i+1] = re.sub("[^a-zA-Z]", '', word_list[i+1])
            if word_list[i] in self:
                self[word_list[i].lower()].add_count(word_list[i+1].lower(), 1)
            else:
                self[word_list[i].lower()] = Dictogram([word_list[i+1].lower()])

    def random_walk(self, length=10):
        """Generate a sentence of ``length`` sampled words plus a stop word."""
        sentence = ""
        keys = list(self.keys())
        word = self.start_word()
        sentence += word + " "
        word = word.lower()
        for i in range(length-1):
            # NOTE(review): raises KeyError if `word` has no followers.
            word = self[word].sample()
            sentence += word + " "
        sentence = sentence + self.end_word() + ". "
        return sentence

    def start_word(self):
        """Weighted-random sentence starter (capitalized), or None if no
        start tokens were recorded."""
        total = sum(self.start_tokens.values())
        if total == 0:
            return None
        dart = random.randint(1, total)
        fence = 0
        for word, count in self.start_tokens.items():
            fence += count
            if fence >= dart:
                return word.capitalize()

    def end_word(self):
        """Weighted-random sentence-ending word, or None if no stop tokens
        were recorded."""
        total = sum(self.stop_tokens.values())
        if total == 0:
            return None
        dart = random.randint(1, total)
        fence = 0
        for word, count in self.stop_tokens.items():
            fence += count
            if fence >= dart:
                return word
def test_update_2(self):
    """update() accumulates counts across repeated and partial sequences.

    Fix: ``assertEquals`` is a deprecated alias (removed in Python 3.12);
    use ``assertEqual``.
    """
    dsg = Dictogram(self.words)
    dsg.update(self.words)
    self.assertEqual({'a': 2, 'b': 2, 'c': 2}, dsg)
    dsg.update(('c', 'b',))
    self.assertEqual({'a': 2, 'b': 3, 'c': 3}, dsg)
    dsg.update(('b',))
    self.assertEqual({'a': 2, 'b': 4, 'c': 3}, dsg)
def random_walk(word_list, length):
    """Start sentence with sample word from histogram, and then sample each
    new histogram chain to get the next word, add then to sentence.
    """
    seed = Dictogram(word_list)
    current = seed.sample()
    sentence = [current]
    for _ in range(length - 1):
        chain = new_chain(word_list, current)
        # Dead-end states (empty chain) simply stop contributing words.
        if len(chain) > 0:
            current = chain.sample()
            sentence.append(current)
    return sentence
def add_count(self, word_1, word_2):
    """Record word_2 as a follower of word_1; tokens counts every call,
    types counts distinct word_1 keys."""
    self.tokens += 1
    if word_1 in self:
        # Known predecessor: bump its follower histogram.
        # self[self.index(word_1)][1].add_count(word_2)
        self[word_1].add_count(word_2)
    else:
        # New predecessor (also covers the empty-chain case).
        self[word_1] = Dictogram([word_2])
        self.types += 1
def create_nth_chain(self, words_list):
    """Build an nth-order chain whose keys are space-joined windows of
    ``self.order`` words, each mapping to a Dictogram of the single word
    that follows the window.
    """
    # Point to start slicing
    start = 0
    # point to stop slicing; slice excludes the end point
    end = self.order
    while end <= len(words_list):
        # take a slice: the current window as one string key
        state = ' '.join(words_list[start:end])
        # check if it is in histogram already
        if self.get(state) == None:
            # not in histogram so add it
            self[state] = Dictogram()
            # check if token should go in start state
            # checks for capitalization
            if re.match('[A-Z]', state) is not None:
                # NOTE(review): assumes a 'start' Dictogram was seeded
                # elsewhere — self.get('start') returns None and raises
                # AttributeError otherwise; confirm.
                self.get('start').add_count(state)
        # increment state
        start += 1
        end += 1
        # bounds check
        if end <= len(words_list):
            # look at next state — words_list[end-1:end] is just the single
            # word at end-1, i.e. the word immediately after the old window
            next_state = ' '.join(words_list[end-1:end])
            # add next state to current state
            self.get(state).add_count(next_state)