def __init__(self, markov_dict, priority_list=None, not_found_list=None, neighbor_dict=None):
    """Store the markov dictionary and derive per-corpus state.

    Tokenizes the corpus text on whitespace, counts word frequencies
    (case preserved), and builds the priority / not-found lists from the
    corpus when the caller does not supply them.

    markov_dict should be a MarkovDict().api dictionary.
    """
    self.markov_dict = markov_dict
    self.gtype = markov_dict['gtype']
    self.stop_words = set(stopwords.words('english'))
    self.neighbor_dict = neighbor_dict
    self.tokenizer = WhitespaceTokenizer()
    self.word_list = self.tokenizer.tokenize(markov_dict['corpus_txt'])
    self.lower_word_list = [word.lower() for word in self.word_list]
    # Frequency of each token, case preserved
    self.word_dict_count = Counter(self.word_list)
    self.truecaser = TrueCase(markov_dict['fname'])
    # Fall back to corpus-derived lists when the caller gave none
    if not priority_list:
        self._make_priority()
    else:
        self.priority_list = priority_list
    if not not_found_list:
        self._make_not_found()
    else:
        self.not_found_list = not_found_list
class MarkovChain(object):
    '''Create a MarkovChain from the given dictionary and parameters,
    run() returns a sentence given a seed.

    markov_dict should be a MarkovDict().api dictionary'''

    def __init__(self, markov_dict, priority_list=None, not_found_list=None,
                 neighbor_dict=None):
        self.markov_dict = markov_dict
        self.gtype = self.markov_dict['gtype']
        self.stop_words = set(stopwords.words('english'))
        self.neighbor_dict = neighbor_dict
        self.tokenizer = WhitespaceTokenizer()
        self.word_list = self.tokenizer.tokenize(self.markov_dict['corpus_txt'])
        self.lower_word_list = [w.lower() for w in self.word_list]
        # Membership set built once so _in_text() is O(1) per lookup instead
        # of rebuilding a set from the whole corpus on every call
        self.lower_word_set = set(self.lower_word_list)
        # Count of word freq, maintaining case
        self.word_dict_count = Counter(self.word_list)
        self.truecaser = TrueCase(self.markov_dict['fname'])
        # Create priority and not_found_list if none were entered
        if priority_list:
            self.priority_list = priority_list
        else:
            self._make_priority()
        if not_found_list:
            self.not_found_list = not_found_list
        else:
            self._make_not_found()

    def _make_priority(self, n=10):
        '''Set self.priority_list to the n most common non-stop-words.'''
        # Remove stop_words
        content = [w for w in self.lower_word_list if w not in self.stop_words]
        # Remove words that are only punctuation
        content_no_punc = [w for w in content
                           if any(ch not in punctuation for ch in w)]
        priority_dict = Counter(content_no_punc)
        self.priority_list = [key for key, _ in priority_dict.most_common(n)]

    def _make_not_found(self, n=15):
        '''Set self.not_found_list to the n most common sentences in the corpus.

        Used as stock replies when no seed can be derived from the input.'''
        not_found_dict = Counter(sent_tokenize(self.markov_dict['corpus_txt']))
        common_sent = [key for key, _ in not_found_dict.most_common(n)]
        # Might fill with small stuff, don't let that happen
        self.not_found_list = [sent for sent in common_sent if len(sent) > 5]

    def _get_input(self, input_phrase):
        '''Pick a seed word from the raw user input.

        Preference order: priority words, then non-stop-words, then any
        word.  Returns None when the input has no usable words, or when
        the chosen seed is absent from the corpus and no in-corpus
        neighbor exists.'''
        # Lowercase and strip common sentence punctuation in a single pass
        # (was four separate re.subs with unescaped '\?' / '\.' patterns)
        input_phrase = re.sub(r'[?.,!]', '', input_phrase.lower())
        # List of words from a potential input phrase
        word_list = input_phrase.split()
        if not word_list:
            # Empty / punctuation-only input used to crash in
            # np.random.choice([]); run() maps None to a stock reply.
            return None
        # Make a list of words that are in priority_list
        priority_words = [w for w in word_list if w in self.priority_list]
        # If no priority words, look for non stop words
        content = [w for w in word_list if w not in self.stop_words]
        # Look for priority words first, content second, and finally random
        if priority_words:
            seed = np.random.choice(priority_words)
        elif content:
            seed = np.random.choice(content)
        else:
            # Final option is a random word
            seed = np.random.choice(word_list)
        # If the word is not in the corpus, fall back to a neighbor
        if not self._in_text(seed):
            seed = self._get_neighbor(seed)
        return seed

    def _in_text(self, word):
        '''Return True if word occurs (case-insensitively) in the corpus.'''
        return word.lower() in self.lower_word_set

    def _get_neighbor(self, seed):
        '''Return a random in-corpus neighbor of seed, or None.'''
        if not self.neighbor_dict:
            return None
        # .get keeps an unknown seed from raising KeyError
        neighbors = self.neighbor_dict.get(seed, [])
        # Only pick a neighbor if it appears in the corpus
        good_neighbors = [w for w in neighbors if self._in_text(w)]
        if good_neighbors:
            return np.random.choice(good_neighbors)
        return None

    def _generate_key(self, seed, dir_dict):
        '''Return a random key from dir_dict whose tail element matches seed.

        NOTE(review): `seed in key[-self.key_gram_size]` tests membership /
        substring against a single key element, not against the slice
        key[-self.key_gram_size:].  Preserved as-is because non-naive grams
        appear to store (word, tag) tuples (see _get_sentence_str) -- confirm
        intent against MarkovDict.'''
        key_list = [key for key in dir_dict
                    if seed in key[-self.key_gram_size]]
        if not key_list:
            # Seed is in the corpus but never at the matched key position:
            # the original indexed into an empty list and raised.  Fall back
            # to any key so run() can still produce a sentence.
            key_list = list(dir_dict)
        return key_list[np.random.choice(len(key_list))]

    def _run_chain(self, seed, dir_dict):
        '''Return a list of words generated from seed.

        Iterates through dir_dict until an end-of-sentence marker (an
        empty tuple value) is reached.'''
        key = self._generate_key(seed, dir_dict)
        text = list(key[-self.key_gram_size:])
        # If not end/begin of sent, run
        while True:
            # Values is a list of tuples; a uniform choice reproduces the
            # corpus distribution because duplicates appear multiple times
            values = dir_dict.get(key)
            if not values:
                # Missing/empty transition used to raise; stop the chain
                break
            value = values[np.random.choice(len(values))]
            if value == () or () in value:  # End condition
                break
            # Add a value_gram_size phrase to the text
            text += value[:self.value_gram_size]
            # Create new lookup key
            key = tuple(text[-self.markov_dict['gram_size']:])
        return text

    def _get_sentence(self, seed):
        '''Return a sentence (word list) built forward and backward from seed.'''
        f_text = self._run_chain(seed, self.markov_dict['f_dict'])
        b_text = self._run_chain(seed, self.markov_dict['b_dict'])
        # b_text was generated right-to-left, so turn it around
        b_text = list(reversed(b_text))
        # Only include seed once
        return b_text[:-1] + f_text

    def _get_sentence_str(self, sent):
        '''Return sent joined into a string with no space before punctuation.'''
        if self.gtype != 'naive':
            # Non-naive grams are (word, tag) tuples; keep just the word
            sent = [w[0] for w in sent]
        text = ' '.join(sent)
        # Drop the space that ' '.join put before punctuation tokens.  The
        # old index-juggling loop used py2-only xrange and could skip
        # matches after a deletion shifted the string; one regex pass can't.
        return re.sub(' (?=[{}])'.format(re.escape(punctuation)), '', text)

    def run(self, input_text, key_gram_size=2, value_gram_size=1):
        '''Return a sentence string seeded from input_text.

        Larger key_gram_size gives more deterministic phrases.  Neither
        size may exceed the dictionary's gram_size, and together they must
        cover it so that key advancement stays aligned.'''
        gram_size = self.markov_dict['gram_size']
        self.key_gram_size = min(key_gram_size, gram_size)
        self.value_gram_size = min(value_gram_size, gram_size)
        while self.key_gram_size + self.value_gram_size < gram_size:
            self.value_gram_size += 1
        seed = self._get_input(input_text)
        # If seed not in corpus and no neighbor found, return random sent
        if not seed:
            return np.random.choice(self.not_found_list)
        sent = self._get_sentence(seed)
        # Turn into string for output (also fixes space before punc)
        sent_str = self._get_sentence_str(sent)
        return self.truecaser.truecase(sent_str)