Exemple #1
0
def testSimpleMarkovClassifier():
    """Train a one-character-context classifier on a sentence and check it.

    Every word of the sentence is fed to the classifier as pairs of
    (preceding character, current character), with a space standing in for
    the word boundary on both sides.  The test then verifies that

    * the classifier reports an input dimension of 1,
    * the probabilities returned for every counted feature sum to one,
    * the number of non-zero transitions equals the number of distinct
      character pairs in the sentence, and
    * every letter with a noticeable probability of following ``e`` is one
      of the letters the sentence itself names.
    """
    text = "after the letter e follows either space or the letters r t or i"
    mc = SimpleMarkovClassifier(dtype="c")

    for raw_word in text.split():
        token = raw_word.lower()

        # One-element tuples: the single feature is the preceding character
        # (a space before the first letter); the label is the character that
        # follows it (a space after the last letter).
        preceding = list(zip(" " + token))
        following = list(token + " ")

        mc.train(mdp.numx.array(preceding), following)

    assert mc.input_dim == 1

    num_transitions = 0
    # Snapshot the items first, since mc.prob() is called while looping.
    for feature, count in list(mc.features.items()):
        if not count:
            continue

        total = 0
        for distribution in mc.prob(mdp.numx.array([feature])):
            for value in distribution.values():
                total += value
                if value:
                    num_transitions += 1

        assert abs(total - 1.0) < 1e-5

    # Expected number of transitions: join the words with two spaces so that
    # pairing the string with a one-character shift of itself yields every
    # (next, previous) pair, then drop the space/space pair, which is only
    # an artefact of the doubled separator.
    spaced = "  ".join(text.split())
    expected_pairs = set(zip(spaced + " ", " " + spaced))
    expected_pairs.discard((' ', ' '))
    assert num_transitions == len(expected_pairs)

    allowed_after_e = (' ', 'r', 't', 'i')
    after_e = mc.prob(mdp.numx.array([['e']]))[0]
    total = 0
    for letter, probability in after_e.items():
        total += probability
        if probability > 1e-5:
            assert letter in allowed_after_e

    assert abs(total - 1.0) < 1e-5
Exemple #2
0
class DictionaryDemo(object):
    """This demo generates words from a selected dictionary by calculating
    the transition probabilities from two consecutive letters to the next.
    """
    # NOTE: the class docstring above is printed verbatim in verbose mode
    # (see __init__), so it is part of the program's output and is kept
    # unchanged.  The number of preceding letters actually used is the
    # ``correlation`` argument.

    def __init__(self, dictionary, correlation, verbose=False):
        """Create the demo and immediately train it on *dictionary*.

        :param dictionary: path of the word list file; it is read with the
            latin-1 codec, one entry per line.
        :param correlation: number of preceding characters used as the
            feature for predicting the next character.
        :param verbose: if true, print the class description, a start
            message and, after training, all transition probabilities.
        """
        self._correlation = correlation
        self._dictionary = dictionary
        self._verbose = verbose

        if self._verbose:
            print(self.__doc__)

        self.mc = SimpleMarkovClassifier(dtype="unicode")

        self.trainSimpleMarkovClassifier()
        if self._verbose:
            self.print_transition_probabilities()

    @staticmethod
    def _file_len(fname):
        """Return the number of lines in the file *fname* (0 when empty).

        The file is opened in binary mode so that counting never depends on
        text decoding, and it is closed even if reading fails.
        """
        with open(fname, "rb") as f:
            return sum(1 for _ in f)

    def trainSimpleMarkovClassifier(self):
        """Feed every dictionary entry to the classifier.

        For each line, punctuation is replaced by spaces, the text is
        lower-cased and only the first whitespace-separated token is kept;
        lines without any token are skipped.
        """
        regex = re.compile('[%s]' % re.escape(string.punctuation))

        # The context manager closes the dictionary even when training or
        # the progress display raises.
        with codecs.open(self._dictionary, "r", "latin-1") as dictfile:
            if self._verbose:
                print("Start learning from ‘%s’." % self._dictionary)

            # The line count is only used to size the progress display.
            # NOTE(review): an empty dictionary now yields 0 here instead of
            # crashing; progressinfo is assumed to cope with an empty
            # sequence -- confirm against the mdp version in use.
            total = self._file_len(self._dictionary)

            for _num, line in mdp.utils.progressinfo(enumerate(dictfile),
                                                     total):
                # remove punctuation and keep the first token only
                tokens = regex.sub(' ', line).lower().strip().split()
                if not tokens:
                    continue
                word = tokens[0]

                # Pad the word with correlation, correlation - 1, ..., 1
                # leading spaces; zipping the padded copies yields, for each
                # position, the tuple of characters preceding it.
                shifted_words = [
                    " " * i + word for i in range(self._correlation, 0, -1)
                ]
                # list(): on Python 3 zip() is lazy and must be materialised
                # before it is handed to the array constructor.
                words = list(zip(*shifted_words))
                labels = list(word + " ")
                self.mc.train(mdp.numx.array(words), labels)

    def print_transition_probabilities(self):
        """Print every non-zero transition, with spaces shown as ``_``."""
        print("Transition probabilities:")
        # Snapshot the items first, since mc.prob() is called while looping.
        for feature, count in list(self.mc.features.items()):
            if not count:
                continue
            for p in self.mc.prob(mdp.numx.array([feature])):
                for k, v in p.items():
                    if v:
                        # One pre-formatted string so the output is the same
                        # under the Python 2 and Python 3 print forms.
                        print("%s -> %s ( %7.3f %% )" % (
                            "".join(feature).replace(" ", "_"),
                            k.replace(" ", "_"),
                            v * 100))

    def get_words(self, num_words):
        """Generate and print *num_words* words, one per line.

        Each word starts from a context of ``correlation`` spaces and grows
        one character at a time, chosen by ``weighted_choice`` from the
        classifier's probabilities for the current context.  Generation of
        a word stops when ``weighted_choice`` returns ``None`` or after 50
        characters.
        """
        for _ in range(num_words):
            features = [" "] * (self._correlation)
            for __ in range(50):  # have a maximum length
                f = mdp.numx.array([features[-self._correlation:]])
                new_f = weighted_choice(self.mc.prob(f)[0], True)
                if new_f is None:
                    break
                features.append(new_f)
            print("".join(features))
Exemple #3
0
class DictionaryDemo(object):
    """This demo generates words from a selected dictionary by calculating
    the transition probabilities from two consecutive letters to the next.
    """
    # NOTE: the class docstring above is printed verbatim in verbose mode
    # (see __init__), so it is part of the program's output and is kept
    # unchanged.  The number of preceding letters actually used is the
    # ``correlation`` argument.

    def __init__(self, dictionary, correlation, verbose=False):
        """Create the demo and immediately train it on *dictionary*.

        :param dictionary: path of the word list file; it is read with the
            latin-1 codec, one entry per line.
        :param correlation: number of preceding characters used as the
            feature for predicting the next character.
        :param verbose: if true, print the class description, a start
            message and, after training, all transition probabilities.
        """
        self._correlation = correlation
        self._dictionary = dictionary
        self._verbose = verbose

        if self._verbose:
            print(self.__doc__)

        self.mc = SimpleMarkovClassifier(dtype="unicode")

        self.trainSimpleMarkovClassifier()
        if self._verbose:
            self.print_transition_probabilities()

    @staticmethod
    def _file_len(fname):
        """Return the number of lines in the file *fname* (0 when empty).

        The file is opened in binary mode so that counting never depends on
        text decoding, and it is closed even if reading fails.
        """
        with open(fname, "rb") as f:
            return sum(1 for _ in f)

    def trainSimpleMarkovClassifier(self):
        """Feed every dictionary entry to the classifier.

        For each line, punctuation is replaced by spaces, the text is
        lower-cased and only the first whitespace-separated token is kept;
        lines without any token are skipped.
        """
        regex = re.compile('[%s]' % re.escape(string.punctuation))

        # The context manager closes the dictionary even when training or
        # the progress display raises.
        with codecs.open(self._dictionary, "r", "latin-1") as dictfile:
            if self._verbose:
                print("Start learning from ‘%s’." % self._dictionary)

            # The line count is only used to size the progress display.
            # NOTE(review): an empty dictionary now yields 0 here instead of
            # crashing; progressinfo is assumed to cope with an empty
            # sequence -- confirm against the mdp version in use.
            total = self._file_len(self._dictionary)

            for _num, line in mdp.utils.progressinfo(enumerate(dictfile),
                                                     total):
                # remove punctuation and keep the first token only
                tokens = regex.sub(' ', line).lower().strip().split()
                if not tokens:
                    continue
                word = tokens[0]

                # Pad the word with correlation, correlation - 1, ..., 1
                # leading spaces; zipping the padded copies yields, for each
                # position, the tuple of characters preceding it.
                shifted_words = [
                    " " * i + word for i in range(self._correlation, 0, -1)
                ]
                # list(): on Python 3 zip() is lazy and must be materialised
                # before it is handed to the array constructor.
                words = list(zip(*shifted_words))
                labels = list(word + " ")
                self.mc.train(mdp.numx.array(words), labels)

    def print_transition_probabilities(self):
        """Print every non-zero transition, with spaces shown as ``_``."""
        print("Transition probabilities:")
        # Snapshot the items first, since mc.prob() is called while looping.
        for feature, count in list(self.mc.features.items()):
            if not count:
                continue
            for p in self.mc.prob(mdp.numx.array([feature])):
                for k, v in p.items():
                    if v:
                        # One pre-formatted string so the output is the same
                        # under the Python 2 and Python 3 print forms.
                        print("%s -> %s ( %7.3f %% )" % (
                            "".join(feature).replace(" ", "_"),
                            k.replace(" ", "_"),
                            v * 100))

    def get_words(self, num_words):
        """Generate and print *num_words* words, one per line.

        Each word starts from a context of ``correlation`` spaces and grows
        one character at a time, chosen by ``weighted_choice`` from the
        classifier's probabilities for the current context.  Generation of
        a word stops when ``weighted_choice`` returns ``None`` or after 50
        characters.
        """
        for _ in range(num_words):
            features = [" "] * (self._correlation)
            for __ in range(50):  # have a maximum length
                f = mdp.numx.array([features[-self._correlation:]])
                new_f = weighted_choice(self.mc.prob(f)[0], True)
                if new_f is None:
                    break
                features.append(new_f)
            print("".join(features))