def testSimpleMarkovClassifier():
    """Check the letter-transition probabilities learned by SimpleMarkovClassifier.

    The classifier is trained word by word on a short sentence, using the
    previous letter as the feature and the following letter as the label.
    The test then verifies that

    * the input dimension is 1,
    * the probabilities for every seen feature sum to one,
    * the number of non-zero transitions equals the number of distinct
      letter pairs in the sentence, and
    * after the letter 'e' only a space, 'r', 't' or 'i' has a
      non-negligible probability.
    """
    text = "after the letter e follows either space or the letters r t or i"
    mc = SimpleMarkovClassifier(dtype="c")

    # Every word is padded with one space in front (features) and one space
    # behind (labels), so word starts and word ends are learned as well.
    for token in text.split():
        token = token.lower()
        previous_letters = list(zip(" " + token))
        next_letters = list(token + " ")
        mc.train(mdp.numx.array(previous_letters), next_letters)

    assert mc.input_dim == 1

    num_transitions = 0
    # Snapshot of the (feature, count) pairs taken before mc.prob is queried.
    seen_features = list(mc.features.items())
    for feature, count in seen_features:
        if not count:
            continue
        values = []
        for distribution in mc.prob(mdp.numx.array([feature])):
            for _, value in distribution.items():
                values.append(value)
        prob_sum = sum(values)
        num_transitions += sum(1 for value in values if value)
        assert abs(prob_sum - 1.0) < 1e-5

    # Expected transitions: every distinct (letter, preceding letter) pair of
    # the sentence. A (' ', ' ') pair is dropped because a space is never
    # trained as being followed by another space.
    joined = " ".join(text.split())
    expected_pairs = set(zip(joined + " ", " " + joined))
    expected_pairs.discard((' ', ' '))
    assert num_transitions == len(expected_pairs)

    letters_following_e = [' ', 'r', 't', 'i']
    after_e = mc.prob(mdp.numx.array([['e']]))[0]
    total = 0
    for letter, probability in after_e.items():
        total += probability
        if probability > 1e-5:
            assert letter in letters_following_e

    assert abs(total - 1.0) < 1e-5
class DictionaryDemo(object):
    """This demo generates words from a selected dictionary by calculating
    the transition probabilities from the preceding `correlation` letters to
    the next one.
    """
    # NOTE: the class docstring above is printed verbatim in verbose mode,
    # so it is user-facing text as well as documentation.

    def __init__(self, dictionary, correlation, verbose=False):
        """Train a SimpleMarkovClassifier on the words of a dictionary file.

        dictionary  -- path of the word list; it is read as latin-1 text
        correlation -- number of preceding letters used as context
        verbose     -- if true, print the class description, a start message
                       and the learned transition probabilities
        """
        self._correlation = correlation
        self._dictionary = dictionary
        self._verbose = verbose

        if self._verbose:
            print(self.__doc__)

        self.mc = SimpleMarkovClassifier(dtype="unicode")
        self.trainSimpleMarkovClassifier()

        if self._verbose:
            self.print_transition_probabilities()

    def _count_lines(self):
        """Return the number of lines of the dictionary file (0 if empty).

        The file is decoded exactly like in the training loop so that the
        count matches the number of lines that loop iterates over.
        """
        with codecs.open(self._dictionary, "r", "latin-1") as handle:
            return sum(1 for _ in handle)

    @staticmethod
    def _training_pairs(word, correlation):
        """Return the (features, labels) training data for one word.

        Each feature is a tuple holding the `correlation` letters that
        precede a label; the word is padded with spaces in front and the
        labels end with a single space.
        """
        shifted_words = [" " * i + word for i in range(correlation, 0, -1)]
        # zip() is materialised explicitly: on Python 3 it returns an
        # iterator, which must not be handed to the array constructor.
        features = list(zip(*shifted_words))
        labels = list(word + " ")
        return features, labels

    def trainSimpleMarkovClassifier(self):
        """Feed every usable line of the dictionary file to the classifier.

        Punctuation is replaced by spaces, the line is lower-cased and only
        its first whitespace-separated token is used; lines without any
        token are skipped.
        """
        punctuation = re.compile('[%s]' % re.escape(string.punctuation))
        total = self._count_lines()

        if self._verbose:
            print("Start learning from ‘%s’." % self._dictionary)

        if not total:
            # Empty dictionary: nothing to learn from.
            return

        with codecs.open(self._dictionary, "r", "latin-1") as dictfile:
            # progressinfo needs the length because a file has no len().
            for line in mdp.utils.progressinfo(dictfile, total):
                tokens = punctuation.sub(' ', line).lower().strip().split()
                if not tokens:
                    continue
                features, labels = self._training_pairs(tokens[0],
                                                        self._correlation)
                self.mc.train(mdp.numx.array(features), labels)

    def print_transition_probabilities(self):
        """Print every non-zero transition probability, one per line.

        Spaces are shown as underscores, e.g. ``_a -> b (  12.500 % )``.
        """
        print("Transition probabilities:")
        for feature, count in self.mc.features.items():
            if not count:
                continue
            context = "".join(feature).replace(" ", "_")
            for distribution in self.mc.prob(mdp.numx.array([feature])):
                for letter, probability in distribution.items():
                    if probability:
                        # A single pre-formatted string prints identically
                        # with the print statement and the print function.
                        print("%s -> %s ( %7.3f %% )"
                              % (context, letter.replace(" ", "_"),
                                 probability * 100))

    def get_words(self, num_words, max_length=50):
        """Generate and print `num_words` random words.

        num_words  -- number of words to print
        max_length -- maximum number of letters drawn per word

        Each word starts from a context of `correlation` spaces, which is
        also part of the printed string.
        """
        for _ in range(num_words):
            features = [" "] * self._correlation
            for __ in range(max_length):
                f = mdp.numx.array([features[-self._correlation:]])
                # weighted_choice is defined elsewhere; presumably it draws
                # a key according to its weight -- TODO confirm. A return
                # value of None ends the word.
                new_f = weighted_choice(self.mc.prob(f)[0], True)
                if new_f is None:
                    break
                features.append(new_f)
            print("".join(features))
class DictionaryDemo(object):
    """This demo generates words from a selected dictionary by calculating
    the transition probabilities from the preceding `correlation` letters to
    the next one.
    """
    # NOTE: the class docstring above is printed verbatim in verbose mode,
    # so it is user-facing text as well as documentation.

    def __init__(self, dictionary, correlation, verbose=False):
        """Train a SimpleMarkovClassifier on the words of a dictionary file.

        dictionary  -- path of the word list; it is read as latin-1 text
        correlation -- number of preceding letters used as context
        verbose     -- if true, print the class description, a start message
                       and the learned transition probabilities
        """
        self._correlation = correlation
        self._dictionary = dictionary
        self._verbose = verbose

        if self._verbose:
            print(self.__doc__)

        self.mc = SimpleMarkovClassifier(dtype="unicode")
        self.trainSimpleMarkovClassifier()

        if self._verbose:
            self.print_transition_probabilities()

    def _count_lines(self):
        """Return the number of lines of the dictionary file (0 if empty).

        The file is decoded exactly like in the training loop so that the
        count matches the number of lines that loop iterates over.
        """
        with codecs.open(self._dictionary, "r", "latin-1") as handle:
            return sum(1 for _ in handle)

    @staticmethod
    def _training_pairs(word, correlation):
        """Return the (features, labels) training data for one word.

        Each feature is a tuple holding the `correlation` letters that
        precede a label; the word is padded with spaces in front and the
        labels end with a single space.
        """
        shifted_words = [" " * i + word for i in range(correlation, 0, -1)]
        # zip() is materialised explicitly: on Python 3 it returns an
        # iterator, which must not be handed to the array constructor.
        features = list(zip(*shifted_words))
        labels = list(word + " ")
        return features, labels

    def trainSimpleMarkovClassifier(self):
        """Feed every usable line of the dictionary file to the classifier.

        Punctuation is replaced by spaces, the line is lower-cased and only
        its first whitespace-separated token is used; lines without any
        token are skipped.
        """
        punctuation = re.compile('[%s]' % re.escape(string.punctuation))
        total = self._count_lines()

        if self._verbose:
            print("Start learning from ‘%s’." % self._dictionary)

        if not total:
            # Empty dictionary: nothing to learn from.
            return

        with codecs.open(self._dictionary, "r", "latin-1") as dictfile:
            # progressinfo needs the length because a file has no len().
            for line in mdp.utils.progressinfo(dictfile, total):
                tokens = punctuation.sub(' ', line).lower().strip().split()
                if not tokens:
                    continue
                features, labels = self._training_pairs(tokens[0],
                                                        self._correlation)
                self.mc.train(mdp.numx.array(features), labels)

    def print_transition_probabilities(self):
        """Print every non-zero transition probability, one per line.

        Spaces are shown as underscores, e.g. ``_a -> b (  12.500 % )``.
        """
        print("Transition probabilities:")
        for feature, count in self.mc.features.items():
            if not count:
                continue
            context = "".join(feature).replace(" ", "_")
            for distribution in self.mc.prob(mdp.numx.array([feature])):
                for letter, probability in distribution.items():
                    if probability:
                        # A single pre-formatted string prints identically
                        # with the print statement and the print function.
                        print("%s -> %s ( %7.3f %% )"
                              % (context, letter.replace(" ", "_"),
                                 probability * 100))

    def get_words(self, num_words, max_length=50):
        """Generate and print `num_words` random words.

        num_words  -- number of words to print
        max_length -- maximum number of letters drawn per word

        Each word starts from a context of `correlation` spaces, which is
        also part of the printed string.
        """
        for _ in range(num_words):
            features = [" "] * self._correlation
            for __ in range(max_length):
                f = mdp.numx.array([features[-self._correlation:]])
                # weighted_choice is defined elsewhere; presumably it draws
                # a key according to its weight -- TODO confirm. A return
                # value of None ends the word.
                new_f = weighted_choice(self.mc.prob(f)[0], True)
                if new_f is None:
                    break
                features.append(new_f)
            print("".join(features))