Code Example #1
def histogram_as_dict(source_text_as_list):
    """Return histogram as a dict: {'one': 1, 'fish': 4}..."""
    output_dict = Dictogram()

    for word in source_text_as_list:
        output_dict.add_count(word)

    return output_dict
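
Every example on this page calls into a Dictogram histogram class defined elsewhere in its project, and the implementations vary (Code Example #6, for instance, gives add_count an extra previous_word parameter). Judging from the calls made here (add_count, frequency, sample, and the types/tokens counters), a minimal, hypothetical sketch of the shared interface looks roughly like this:

import random

class Dictogram(dict):
    """Histogram of word frequencies stored as {word: count} pairs."""

    def __init__(self, word_list=None):
        super().__init__()
        self.types = 0   # count of distinct words
        self.tokens = 0  # total of all word counts
        if word_list is not None:
            for word in word_list:
                self.add_count(word)

    def add_count(self, word, count=1):
        """Increase the frequency count of word by count."""
        if word not in self:
            self.types += 1
        self[word] = self.get(word, 0) + count
        self.tokens += count

    def frequency(self, word):
        """Return the frequency count of word, or 0 if unseen."""
        return self.get(word, 0)

    def sample(self):
        """Return a random word, weighted by frequency."""
        dart = random.uniform(0, self.tokens)
        fence = 0
        for word, count in self.items():
            fence += count
            if fence >= dart:
                return word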
Code Example #2
 def test_types(self):
     histogram = Dictogram(self.fish_words)
     # Verify count of distinct word types
     assert len(set(self.fish_words)) == 5
     assert histogram.types == 5
     # Adding words again should not change count of distinct word types
     for word in self.fish_words:
         histogram.add_count(word)
     assert histogram.types == 5
Code Example #3
 def test_tokens(self):
     histogram = Dictogram(self.fish_words)
     # Verify total count of all word tokens
     assert len(self.fish_words) == 8
     assert histogram.tokens == 8
     # Adding words again should double total count of all word tokens
     for word in self.fish_words:
         histogram.add_count(word)
     assert histogram.tokens == 8 * 2
Code Example #4
def test_types():
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']

    histogram = Dictogram(fish_words)
    # Verify count of distinct word types
    assert len(set(fish_words)) == 5
    assert histogram.types == 5
    # Adding words again should not change count of distinct word types
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.types == 5
Code Example #5
def test_tokens():
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']

    histogram = Dictogram(fish_words)
    # Verify total count of all word tokens
    assert len(fish_words) == 8
    assert histogram.tokens == 8
    # Adding words again should double total count of all word tokens
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.tokens == 8 * 2
Code Example #6
 def add_count(self, word, previous_word, count=1):
     """Increase frequency count of given word by given count amount."""
     if previous_word != '':
         try:
             # Reuse the nested histogram kept for this previous word
             dicto = self[previous_word]
         except KeyError:
             # First time seeing this previous word: start a new nested
             # histogram and count a new word type
             dicto = Dictogram()
             self.types += 1
         dicto.add_count(word)  # nested Dictogram uses the plain (word, count) signature
         self[previous_word] = dicto
         self.tokens += count
Code Example #7
def markov_chain(list_of_values):
    """Build a third-order Markov chain: each key is a tuple of three
    consecutive words, mapped to a Dictogram of the words that follow."""
    mc = Dictogram()

    for i in range(len(list_of_values) - 3):
        # Window of three consecutive words, used as the chain's key
        window = (list_of_values[i], list_of_values[i + 1], list_of_values[i + 2])
        next_word = list_of_values[i + 3]  # the word that follows this window

        if window in mc:
            # Window already seen: increment the count of the next word
            mc[window].add_count(next_word)
        else:
            # First occurrence: start a new histogram for this window
            new_dict = Dictogram()
            new_dict.add_count(next_word)
            mc[window] = new_dict
    return mc
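
A hypothetical run against the fish-words list from the tests above, assuming the Dictogram sketch from Code Example #1; it shows the three-word tuple keys the function builds:

words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
mc = markov_chain(words)
for window, followers in mc.items():
    print(window, '->', dict(followers))
# ('one', 'fish', 'two') -> {'fish': 1}
# ('fish', 'two', 'fish') -> {'red': 1}
# ...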
Code Example #8
import random
import re
import string

# Assumes the Dictogram histogram class (see the sketch under Code Example #1)

class MarkovChain(Dictogram):
    def __init__(self, word_list):
        super().__init__()
        self.start_tokens = Dictogram()  # words that begin sentences
        self.stop_tokens = Dictogram()   # words that end sentences

        # First-order Markov chain: the very first word starts a sentence
        word_list[0] = re.sub("[^a-zA-Z]", '', word_list[0])
        self.start_tokens.add_count(word_list[0].lower(), 1)

        # A capitalized word following punctuation also starts a sentence
        for i in range(1, len(word_list) - 1):
            if word_list[i][0].isupper() and word_list[i - 1][-1] in string.punctuation:
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                self.start_tokens.add_count(word_list[i].lower(), 1)

        # A word ending in punctuation ends a sentence
        for i in range(len(word_list)):
            if word_list[i][-1] in string.punctuation:
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                self.stop_tokens.add_count(word_list[i], 1)

        # Count which word follows which (keys are stored lowercased)
        for i in range(len(word_list) - 1):
            word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
            word_list[i + 1] = re.sub("[^a-zA-Z]", '', word_list[i + 1])
            if word_list[i].lower() in self:
                self[word_list[i].lower()].add_count(word_list[i + 1].lower(), 1)
            else:
                self[word_list[i].lower()] = Dictogram([word_list[i + 1].lower()])

    def random_walk(self, length=10):
        word = self.start_word()
        sentence = word + " "
        word = word.lower()
        for i in range(length - 1):
            word = self[word].sample()
            sentence += word + " "
        return sentence + self.end_word() + ". "

    def start_word(self):
        # Throw a weighted dart over the total count of start tokens
        tokens = sum(self.start_tokens[key] for key in self.start_tokens)
        dart = random.randrange(0, tokens)
        fence = 0
        for key in self.start_tokens:
            fence += self.start_tokens[key]
            if fence > dart:
                return key.capitalize()

    def end_word(self):
        # Same weighted dart, thrown over the stop tokens
        tokens = sum(self.stop_tokens[key] for key in self.stop_tokens)
        dart = random.randrange(0, tokens)
        fence = 0
        for key in self.stop_tokens:
            fence += self.stop_tokens[key]
            if fence > dart:
                return key
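
A hypothetical smoke test for the class above, assuming the Dictogram sketch from Code Example #1 is in scope; the corpus and output are illustrative only:

corpus = "One fish. Two fish. Red fish. Blue fish.".split()
chain = MarkovChain(corpus)
print(chain.random_walk(length=5))  # e.g. "One fish two fish red fish. "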
Code Example #9
 def test_add_count(self):
     histogram = Dictogram(self.fish_words)
     # Add more words to update frequency counts
     histogram.add_count('two', 2)
     histogram.add_count('blue', 3)
     histogram.add_count('fish', 4)
     histogram.add_count('food', 5)
     # Verify updated frequency count of all words
     assert histogram.frequency('one') == 1
     assert histogram.frequency('two') == 3
     assert histogram.frequency('red') == 1
     assert histogram.frequency('blue') == 4
     assert histogram.frequency('fish') == 8
     assert histogram.frequency('food') == 5
     # Verify count of distinct word types
     assert histogram.types == 6
     # Verify total count of all word tokens
     assert histogram.tokens == 8 + 14
Code Example #10
# Assumes the same imports as Code Example #8: random, re, string, and Dictogram

class MarkovChain(Dictogram):
    def __init__(self, word_list, order):
        super().__init__()
        self.order = order
        self.start_tokens = Dictogram()  # phrases that begin sentences
        self.stop_tokens = Dictogram()   # words that end sentences

        # A capitalized word following punctuation starts a sentence; for
        # higher orders the start token is a tuple of `order` words
        for i in range(1, len(word_list) - 1):
            try:
                if (word_list[i][0].isupper()
                        and word_list[i - 1][-1] in string.punctuation):
                    if self.order > 1:
                        temp = list()
                        for j in range(self.order):
                            word_list[i + j] = re.sub("[^a-zA-Z]", '',
                                                      word_list[i + j])
                            temp.append(word_list[i + j].lower())
                        temp = tuple(temp)
                    else:
                        word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                        temp = word_list[i].lower()
                    self.start_tokens.add_count(temp, 1)
            except IndexError:
                pass  # empty word, or a start window running past the list

        # A word still ending in punctuation ends a sentence (words cleaned
        # by the pass above no longer carry their punctuation)
        for i in range(len(word_list)):
            try:
                if word_list[i][-1] in string.punctuation:
                    word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                    self.stop_tokens.add_count(word_list[i], 1)
            except IndexError:
                pass

        # Count which word follows each `order`-word window
        for i in range(len(word_list) - self.order):
            if self.order > 1:
                temp = list()
                for j in range(self.order):
                    word_list[i + j] = re.sub("[^a-zA-Z]", '',
                                              word_list[i + j])
                    temp.append(word_list[i + j].lower())
                temp = tuple(temp)
            else:
                word_list[i] = re.sub("[^a-zA-Z]", '', word_list[i])
                temp = word_list[i].lower()
            if temp in self:
                self[temp].add_count(word_list[i + self.order].lower(), 1)
            else:
                self[temp] = Dictogram([word_list[i + self.order].lower()])

    def random_walk(self, length=10):
        sentence = self.start_word().capitalize() + " "
        for i in range(length - self.order - 1):
            next_word = self.sample(sentence)
            sentence += next_word + " "
        sentence += self.end_word() + "."
        return sentence

    def start_word(self):
        # Weighted random pick over the recorded sentence starters
        tokens = sum(self.start_tokens[elm] for elm in self.start_tokens)
        dart = random.randrange(0, tokens)
        fence = 0
        for elm in self.start_tokens:
            fence += self.start_tokens[elm]
            if fence > dart:
                if self.order > 1:
                    return ' '.join(elm)  # tuple key becomes a phrase
                return elm

    def end_word(self):
        # Weighted random pick over the recorded sentence enders
        tokens = sum(self.stop_tokens[elm] for elm in self.stop_tokens)
        dart = random.randrange(0, tokens)
        fence = 0
        for elm in self.stop_tokens:
            fence += self.stop_tokens[elm]
            if fence > dart:
                return elm

    def sample(self, sentence):
        """Sample a next word given the last `order` words of the sentence."""
        words = sentence.split()
        if self.order > 1:
            # Keys were stored in forward word order
            key = tuple(w.lower() for w in words[-self.order:])
        else:
            key = words[-1].lower()
        tokens = sum(self[key][elm] for elm in self[key])
        dart = random.randrange(0, tokens)
        fence = 0
        for elm in self[key]:
            fence += self[key][elm]
            if fence > dart:
                return elm
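
A hypothetical run of the order-2 class above, again assuming the Dictogram sketch and the random/re/string imports; the output is illustrative:

words = "One fish. Two fish. Red fish. Blue fish. One fish again.".split()
chain = MarkovChain(words, order=2)
print(chain.random_walk(length=6))  # e.g. "Two fish red fish blue fish."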
Code Example #11
import random

class MarkovChain(dict):
    def __init__(self, sentences, order=1):
        super(MarkovChain, self).__init__()
        self.sentenceStarters = Dictogram()
        # NOTE: order is accepted but unused; this constructor always
        # builds a second-order chain via compile2
        for s in sentences:
            self.compile2(s)

    # First Order
    def compile(self, sentence):
        words = sentence.split(' ')
        for i in range(len(words) - 1):
            if words[i] not in self:
                self[words[i]] = Dictogram()
            self[words[i]].add_count(words[i + 1])

    # Second Order
    def compile2(self, sentence):
        # Split on spaces, drop empty strings, and append '###' as an end-of-sentence marker
        words = list(filter(lambda w: w != '', sentence.split(' '))) + ['###']

        # Ensures this sentence can be evaluated in second order
        if len(words) > 2:
            # Add tuple of first two words to our sentence starters dictogram
            self.sentenceStarters.add_count((words[0], words[1]))

            # Walk the word list and build the second-order chain
            for i in range(len(words) - 2):
                newKey = (words[i], words[i + 1])
                if newKey not in self:
                    self[newKey] = Dictogram()
                self[newKey].add_count(words[i + 2])

    # Nth Order
    def compileN(self, sentence, n):
        words = list(filter(lambda w: w != '', sentence.split(' ')))
        if len(words) > n:
            for i in range(len(words) - n):
                newKey = tuple(words[i:i + n])
                # Skip windows that contain sentence-ending punctuation
                if len([w for w in newKey if w[-1] not in '!?.']) == n:
                    if newKey not in self:
                        self[newKey] = Dictogram()
                    self[newKey].add_count(words[i + n])

    def probableWordFrom(self, dictogram):
        '''Picks a random word from a histogram of words and weights'''
        words, weights = zip(*dictogram.items())

        # Build cumulative weights, which act as separators between words
        accumulator, separators = 0, []
        for weight in weights:
            accumulator += weight
            separators.append(accumulator)

        # The words and separators lists share indices, so return the word
        # at the first separator that the random dart does not exceed
        rand = random.randint(0, accumulator)
        for index, separator in enumerate(separators):
            if rand <= separator:
                return words[index]

    def makeSentence(self):
        words = list(self.probableWordFrom(self.sentenceStarters))
        newWord = self.probableWordFrom(self[(words[-2], words[-1])])
        while newWord != '###':
            words.append(newWord)
            newWord = self.probableWordFrom(self[(words[-2], words[-1])])
        return ' '.join(words)
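
A hypothetical usage of the class above (the Dictogram sketch and import random assumed): compile2 appends '###' as a terminator, so makeSentence stops once it samples one.

chain = MarkovChain(['one fish two fish', 'red fish two fish'])
print(chain.makeSentence())  # e.g. 'one fish two fish'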
Code Example #12
File: tuple_markovchain.py, Project: type9/twatter
import copy

# Queue, Node, and Dictogram are project helpers; a sketch of Queue and
# Node follows this example.

class MarkovChain():
    def __init__(self, order=2, starttoken='!START', stoptoken='!STOP'):
        self.order = order  # number of words per phrase in the chain

        self.nodes = dict()
        self.starttokens = Dictogram()
        self.stoptokens = Dictogram()

        self.STARTTOKEN = starttoken
        self.STOPTOKEN = stoptoken

    def get_phrase(self, text_q):
        phrase = ()  # tuple of the next `order` words
        this_q = copy.copy(text_q)

        # Read `order` words from a copy of the queue, leaving the original intact
        for i in range(self.order):
            phrase += (this_q.dequeue(),)

        if self.STARTTOKEN in phrase:
            self.starttokens.add_count(phrase)
        return phrase

    def gen_nodes(self, text):
        '''Iterates across a list of words, creating or updating nodes'''
        text_q = Queue()
        for token in text:
            text_q.enqueue(token)

        # Slide a one-word window across the text: each phrase records
        # the overlapping phrase that follows it
        while text_q.length() > self.order:
            this_phrase = self.get_phrase(text_q)
            text_q.dequeue()
            next_phrase = self.get_phrase(text_q)

            if this_phrase not in self.nodes:
                self.nodes[this_phrase] = Node(this_phrase)
            if next_phrase:
                self.nodes[this_phrase].add_count(next_phrase)

    def get_start(self):
        if self.order == 1:
            return self.nodes[(self.STARTTOKEN,)].walk()
        return self.starttokens.sample()

    def gen_sentence(self):
        '''Generates a sentence starting with a start token'''
        sentence = ""

        this_phrase = self.get_start()  # start with a phrase containing the start token

        while self.STOPTOKEN not in this_phrase:
            # Phrases overlap by order - 1 words, so only the last word
            # of each phrase is new
            sentence += ' '.join(this_phrase[self.order - 1:]) + ' '
            this_phrase = self.nodes[this_phrase].walk()  # sample the next phrase

        # The stop phrase adds nothing new: its leading words were already
        # emitted, and the stop token itself is excluded
        return sentence
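
This example leans on three helpers from type9/twatter that are not shown here: Queue, Node, and Dictogram (the latter sketched under Code Example #1). Their exact implementations are not included on this page; a minimal, hypothetical sketch consistent with how they are called might be:

from collections import deque
import copy

class Queue:
    """Minimal FIFO queue with the interface the example assumes."""
    def __init__(self):
        self._items = deque()

    def enqueue(self, item):
        self._items.append(item)

    def dequeue(self):
        return self._items.popleft()

    def length(self):
        return len(self._items)

    def __copy__(self):
        # get_phrase copies the queue before reading from it, so the copy
        # must not share its underlying deque with the original
        new_q = Queue()
        new_q._items = copy.copy(self._items)
        return new_q

class Node:
    """A chain node: a phrase plus a Dictogram of the phrases that follow it."""
    def __init__(self, phrase):
        self.phrase = phrase
        self.followers = Dictogram()

    def add_count(self, next_phrase):
        self.followers.add_count(next_phrase)

    def walk(self):
        # Weighted random step to one of the recorded next phrases
        return self.followers.sample()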