Esempio n. 1
0
    def generate_random_sentence(self, length, start_sequence):
        """
        Generate a token sequence by repeatedly sampling the next token.

        The current window of tokens is joined into a space-separated key,
        passed to ``self.get_sequence`` and wrapped in a ``Dictogram`` from
        which one weighted random token is drawn.  The window then slides
        forward by one token.  Generation stops once ``length`` end symbols
        (``self.tokenizer.end_symbol``) have been drawn.

        :param length: number of end symbols to generate before stopping
        :type length: int
        :param start_sequence: space-separated integer tokens that open the
            sequence
        :type start_sequence: str
        :return: the start tokens followed by every generated token
        :rtype: list
        """
        window = tuple(int(token) for token in start_sequence.split())
        sentence = list(window)
        finished = 0
        while finished < length:
            key = ' '.join(str(token) for token in window)
            candidates = Dictogram(self.get_sequence(key))
            next_token = candidates.return_weighted_random_word()
            # Slide the window: drop the oldest token, add the new one.
            window = window[1:] + (next_token,)
            sentence.append(next_token)
            if next_token == self.tokenizer.end_symbol:
                finished += 1
        return sentence
Esempio n. 2
0
 def populate_chain(self, nth_order):
     """Rebuild ``self.chain`` from ``self.corpus`` and return it.

     For every start position a window of ``nth_order`` words is read
     together with the window shifted one word to the right.  Each window
     is joined with single spaces; the first becomes a key of the chain and
     the second is counted in that key's ``Dictogram``.

     Words are passed through ``self.transition`` (``put`` then ``get``)
     exactly as they are read -- presumably a FIFO queue; verify against
     the class definition.
     """
     self.chain = Dictogram()
     window_count = self.total_words - nth_order
     for start in range(window_count):
         current_parts = []
         next_parts = []
         for offset in range(nth_order):
             self.transition.put(self.corpus[start + offset])
             self.transition.put(self.corpus[start + offset + 1])
             current_parts.append(self.transition.get())
             next_parts.append(self.transition.get())
         cur = " ".join(current_parts)
         nxt = " ".join(next_parts)
         if self.chain.get(cur, None) is None:
             # First time this window is seen: start its histogram.
             self.chain[cur] = Dictogram([nxt])
         else:
             self.chain[cur].add_count(nxt)
     return self.chain
 def create_markov_chain(self, data_list):
     """Build a dict of ``Dictogram`` histograms keyed by word context.

     The first element of ``data_list`` is counted under ``'START'``.
     Every element except the last is then turned into a key and the
     element that follows it is counted under that key:

     * ``'START'`` / ``'STOP'`` elements are used as the key themselves;
     * an element directly after ``'START'`` is keyed as
       ``('START', element)``;
     * anything else is keyed by the previous two elements plus itself
       (positions not yet seen are ``None``).
     """
     chain = {'START': Dictogram()}
     chain['START'].add_count(data_list[0])
     previous = None
     current = None
     for index in range(len(data_list) - 1):
         # Shift the three-word context forward by one element.
         oldest, previous, current = previous, current, data_list[index]
         if current == 'START' or current == 'STOP':
             key = current
         elif previous == "START":
             key = (previous, current)
         else:
             key = (oldest, previous, current)
         if key not in chain:
             chain[key] = Dictogram()
         chain[key].add_count(data_list[index + 1])
     return chain
    def __init__(self, word_list, nth_order=1):
        """Build a second-order chain from ``word_list``.

        ``word_list`` is normalised with ``create_list`` and stored, along
        with a ``Dictogram`` of all words.  Every pair of consecutive words
        ``(w0, w1)`` becomes a key of this object whose value is a
        ``Dictogram`` counting the pairs ``(w1, w2)`` that follow it.

        :param word_list: source passed to ``create_list``
        :param nth_order: stored as ``self.nth_order``; values above 2 also
            shorten the scanned range, as before
        """
        self.word_list = create_list(word_list)
        self.dictionary_histogram = Dictogram(self.word_list)
        self.nth_order = nth_order

        # Each step reads three consecutive words, so the scan must stop at
        # least two positions before the end.  The previous bound of
        # ``len - nth_order`` ran past the list for the default nth_order=1.
        lookahead = max(nth_order, 2)
        for index in range(len(self.word_list) - lookahead):
            word = self.word_list[index]
            next_word = self.word_list[index + 1]
            word_after_next = self.word_list[index + 2]

            state = (word, next_word)
            following = (next_word, word_after_next)
            if state not in self:
                self[state] = Dictogram([following])
            else:
                self[state].add_count(following)
    def create_histograms(self, word_list):
        """Return a dict of second-order histograms for ``word_list``.

        The ``'END'`` entry is a ``Dictogram`` built from
        ``self.generate_start(word_list)``.  Every pair of consecutive words
        is then used as a key whose ``Dictogram`` counts the word that
        follows the pair.
        """
        histograms = {'END': Dictogram(self.generate_start(word_list))}
        # A window needs two words plus a follower, hence the -2.
        for index in range(len(word_list) - 2):
            window = (word_list[index], word_list[index + 1])
            follower = word_list[index + 2]
            if window in histograms:
                histograms[window].add_count(follower)
            else:
                histograms[window] = Dictogram([follower])
        return histograms
    def __init__(self, word_list):
        """Build a first-order chain plus start/stop histograms.

        Each word is stripped of every non ASCII-letter character.  The
        lowercased result is used as a key of this object, whose value is a
        ``Dictogram`` of the (lowercased, stripped) words that follow it.

        * ``self.start_tokens`` counts the first word and every capitalised
          word (except the last one) that follows a word ending in
          punctuation.
        * ``self.stop_tokens`` counts every word ending in punctuation
          (stripped, original case).

        Unlike the previous implementation, ``word_list`` is not modified.
        Stripping words in place removed the punctuation that later start
        and stop checks depended on.  The key lookup now uses the
        lowercased word, so capitalised words no longer replace their
        existing histogram.  An empty list yields an empty chain.

        :param word_list: raw word tokens (strings)
        """
        super().__init__()
        self.start_tokens = Dictogram()
        self.stop_tokens = Dictogram()
        if not word_list:
            return

        cleaned = [re.sub("[^a-zA-Z]", '', token) for token in word_list]

        # The very first word always opens a sentence.
        self.start_tokens.add_count(cleaned[0].lower(), 1)
        for i in range(1, len(word_list) - 1):
            token = word_list[i]
            previous = word_list[i - 1]
            # Slicing (token[:1]) and the truthiness check keep empty
            # tokens from raising IndexError.
            if (token[:1].isupper() and previous
                    and previous[-1] in string.punctuation):
                self.start_tokens.add_count(cleaned[i].lower(), 1)

        for token, stripped in zip(word_list, cleaned):
            if token and token[-1] in string.punctuation:
                self.stop_tokens.add_count(stripped, 1)

        for current, following in zip(cleaned, cleaned[1:]):
            key = current.lower()
            if key in self:
                self[key].add_count(following.lower(), 1)
            else:
                self[key] = Dictogram([following.lower()])
Esempio n. 7
0
 def __init__(self, word_list=None):
     """Build a second-order chain from ``word_list``.

     Keys are tuples of two consecutive words; each value is a
     ``Dictogram`` counting the words that follow that pair.  The final
     pair of the list is also present as a key (possibly with an empty
     histogram).

     :param word_list: sequence of words, or ``None`` for an empty chain.
         Lists with fewer than two words cannot form a state and also
         produce an empty chain (previously an ``IndexError``).
     """
     super(Markov, self).__init__()
     if word_list is None or len(word_list) < 2:
         return

     state = (word_list[0], word_list[1])
     self[state] = Dictogram()
     # The first two words form the initial state, so start at the third.
     for word in word_list[2:]:
         histogram = self[state]
         if word in histogram:
             histogram[word] += 1
         else:
             histogram[word] = 1
         histogram.tokens += 1

         # Slide the state forward and make sure it has a histogram.
         state = (state[1], word)
         if state not in self:
             self[state] = Dictogram()
Esempio n. 8
0
def dictogram(words):
    """Map each word to a ``Dictogram`` counting the words that follow it.

    The last word of ``words`` has no follower and therefore only appears
    as a key if it also occurs earlier in the sequence.
    """
    chain = Dictogram()
    for current, following in zip(words, words[1:]):
        if current not in chain:
            chain[current] = Dictogram()
        chain[current].add_count(following)
    return chain
    def __init__(self, words_list=None):
        """Initialise the chain, optionally populating it from ``words_list``.

        When a list is given, ``create_markov_chain`` is called with it and
        the ``'start'`` / ``'end'`` entries are then set to fixed
        single-word histograms (``'the'`` and ``'.'``).
        """
        super(MarkovChain, self).__init__()

        if words_list is None:
            return

        self.create_markov_chain(words_list)
        self['start'] = Dictogram(['the'])
        self['end'] = Dictogram(['.'])
Esempio n. 10
0
def histogram_as_dict(source_text_as_list):
    """Return a ``Dictogram`` with one count added per word of the input.

    Example shape of the result: ``{'one': 1, 'fish': 4, ...}``.
    """
    histogram = Dictogram()
    for token in source_text_as_list:
        histogram.add_count(token)
    return histogram
Esempio n. 11
0
def test_frequency():
    """Check ``Dictogram.frequency`` for every word in ``fish_words``."""
    histogram = Dictogram(fish_words)
    expected_counts = {'one': 1, 'two': 1, 'red': 1, 'blue': 1, 'fish': 4}
    for word, expected in expected_counts.items():
        assert histogram.frequency(word) == expected
Esempio n. 12
0
    def __init__(self, order=2, starttoken='!START', stoptoken='!STOP'):
        """Create an empty chain.

        :param order: number of orders to generate the chain with
        :param starttoken: value stored as ``self.STARTTOKEN``
        :param stoptoken: value stored as ``self.STOPTOKEN``
        """
        self.order = order
        self.STARTTOKEN = starttoken
        self.STOPTOKEN = stoptoken

        # Containers start empty.
        self.nodes = {}
        self.starttokens = Dictogram()
        self.stoptokens = Dictogram()
Esempio n. 13
0
 def test_types(self):
     """The number of distinct types is 5 and re-adding words keeps it."""
     words = self.fish_words
     histogram = Dictogram(words)
     # Sanity-check the fixture, then the histogram built from it.
     assert len(set(words)) == 5
     assert histogram.types == 5
     # Counting the same words a second time adds no new types.
     for word in words:
         histogram.add_count(word)
     assert histogram.types == 5
Esempio n. 14
0
 def test_tokens(self):
     """The token total is 8 and doubles when every word is added again."""
     words = self.fish_words
     histogram = Dictogram(words)
     # Sanity-check the fixture, then the histogram built from it.
     assert len(words) == 8
     assert histogram.tokens == 8
     # Counting every word once more doubles the total.
     for word in words:
         histogram.add_count(word)
     assert histogram.tokens == 16
 def test_entries(self):
     """The histogram exposes its entries both as a mapping and as items."""
     histogram = Dictogram(self.fish_words)
     # Mapping view: compared with fish_dict, order ignored.
     assert len(histogram) == 5
     self.assertCountEqual(histogram, self.fish_dict)
     # Item view: (word, count) pairs compared with fish_list, order ignored.
     entries = histogram.items()
     assert len(entries) == 5
     self.assertCountEqual(entries, self.fish_list)
Esempio n. 16
0
 def create_markov_chain(self, data_list):
     """Build a first-order chain as a dict of ``Dictogram`` histograms.

     The first element of ``data_list`` is counted under ``'START'``.
     Every element except the last becomes a key whose histogram counts
     the element that follows it.
     """
     chain = {'START': Dictogram()}
     chain['START'].add_count(data_list[0])
     for current, following in zip(data_list, data_list[1:]):
         if current not in chain:
             chain[current] = Dictogram()
         chain[current].add_count(following)
     return chain
Esempio n. 17
0
def test_types():
    """The number of distinct types is 5 and re-adding words keeps it."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(fish_words)

    # Sanity-check the fixture, then the histogram built from it.
    distinct_words = set(fish_words)
    assert len(distinct_words) == 5
    assert histogram.types == 5

    # Counting the same words a second time adds no new types.
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.types == 5
 def build_state_histogram(self, words_list):
     """Populate this mapping with n-word states and their followers.

     A ``Queue`` holding the current ``self.n`` words is slid over
     ``words_list``.  Each state, stored as a tuple of the queued words,
     maps to a ``Dictogram`` counting the word that follows it.  The final
     state, which has no follower, counts ``'**STOP**'`` instead.

     Changes from the previous version:

     * membership is tested with the tuple key; testing the ``Queue``
       object itself meant an already-seen state had its histogram
       replaced by an empty one;
     * the end of the list is detected explicitly instead of through a
       bare ``except``;
     * the stop marker is added to the state's histogram rather than
       replacing it.
     """
     tokens = Queue(words_list[0:self.n])
     word_count = len(words_list)
     for index in range(self.n - 1, word_count):
         state = tuple(tokens)
         if state not in self:
             self[state] = Dictogram()
         if index + 1 < word_count:
             next_word = words_list[index + 1]
             self[state].add_count(next_word)
             tokens.enqueue(next_word)
         else:
             # Last window: nothing follows it.
             self[state].add_count('**STOP**')
         tokens.dequeue()
Esempio n. 19
0
 def add_count(self, word, previous_word, count=1):
     """Increase the count of ``word`` following ``previous_word``.

     Nothing happens when ``previous_word`` is the empty string.
     Otherwise the histogram stored under ``previous_word`` is fetched,
     or created (incrementing ``self.types``), and ``word`` is counted in
     it ``count`` times.  ``self.tokens`` grows by ``count``.

     ``count`` was previously added to ``self.tokens`` but not passed to
     the inner histogram, so the two totals diverged for ``count != 1``.

     :param word: the word to count
     :param previous_word: key under which ``word`` is counted
     :param count: amount to add (default 1)
     """
     if previous_word == '':
         return
     try:
         followers = self[previous_word]
     except KeyError:
         followers = Dictogram()
         self.types += 1
     followers.add_count(word, count)
     self[previous_word] = followers
     self.tokens = self.tokens + count
    def __init__(self, word_list, order):
        """Build an ``order``-th order chain plus start/stop histograms.

        States are the lowercased words with every non ASCII-letter
        character removed: a single string when ``order`` is 1 or less,
        otherwise a tuple of ``order`` consecutive words.  Each state maps
        to a ``Dictogram`` of the lowercased word that follows it.

        * ``self.start_tokens`` counts states that begin at a capitalised
          word preceded by a word ending in punctuation.
        * ``self.stop_tokens`` counts words ending in punctuation.

        ``word_list`` is cleaned in place as it is scanned, as before.
        The two bare ``except: pass`` handlers are replaced by a narrow
        ``IndexError`` handler and an explicit empty-token check, so
        unrelated errors are no longer hidden.

        NOTE(review): the base-class initialiser call was already disabled
        in the original; ``self`` is still used as a mapping below, so the
        enclosing class presumably derives from ``dict`` -- confirm.

        :param word_list: raw word tokens (strings); modified in place
        :param order: number of words per state
        """
        self.order = order
        self.start_tokens = Dictogram()
        self.stop_tokens = Dictogram()

        def clean_state(start):
            """Strip the words of the state at ``start`` in place; return its key."""
            if self.order > 1:
                words = []
                for offset in range(self.order):
                    word_list[start + offset] = re.sub(
                        "[^a-zA-Z]", '', word_list[start + offset])
                    words.append(word_list[start + offset].lower())
                return tuple(words)
            word_list[start] = re.sub("[^a-zA-Z]", '', word_list[start])
            return word_list[start].lower()

        for i in range(1, len(word_list) - 1):
            try:
                if (word_list[i][0].isupper()
                        and word_list[i - 1][-1] in string.punctuation):
                    self.start_tokens.add_count(clean_state(i), 1)
            except IndexError:
                # Empty token, or the state would run past the end.
                continue

        for i, token in enumerate(word_list):
            if token and token[-1] in string.punctuation:
                word_list[i] = re.sub("[^a-zA-Z]", '', token)
                self.stop_tokens.add_count(word_list[i], 1)

        for i in range(len(word_list) - self.order):
            state = clean_state(i)
            follower = word_list[i + self.order].lower()
            if state in self:
                self[state].add_count(follower, 1)
            else:
                self[state] = Dictogram([follower])
Esempio n. 21
0
def test_tokens():
    """The token total is 8 and doubles when every word is added again."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    histogram = Dictogram(fish_words)

    # Sanity-check the fixture, then the histogram built from it.
    word_total = len(fish_words)
    assert word_total == 8
    assert histogram.tokens == 8

    # Counting every word once more doubles the total.
    for word in fish_words:
        histogram.add_count(word)
    assert histogram.tokens == 16
Esempio n. 22
0
 def add_token(self, current_type, next_type):
     """Record that ``next_type`` follows ``current_type``.

     While ``self.empty`` is set, the flag is cleared and a new histogram
     is stored for ``current_type``.  Afterwards an existing key has
     ``next_type`` added to its histogram, and an unseen key gets a new
     ``Dictogram`` containing ``next_type``.
     """
     if self.empty:
         self.empty = False
     elif current_type in self:
         self[current_type].add_count(next_type)
         return
     # First token overall, or first time this key is seen.
     self[current_type] = Dictogram([next_type])
Esempio n. 23
0
def walk(word_list, length):
    """Return a list of words produced by a random walk.

    The first word is sampled from a ``Dictogram`` of ``word_list``.  For
    each of the remaining ``length - 1`` steps, ``new_chain`` is built for
    the latest word.  If that chain is non-empty, a word is sampled from it
    and appended; otherwise the step adds nothing.
    """
    current = Dictogram(word_list).sample()
    sentence = [current]
    for _ in range(length - 1):
        chain = new_chain(word_list, current)
        if len(chain) > 0:
            current = chain.sample()
            sentence.append(current)
    return sentence
Esempio n. 24
0
    def walk(self, num_words):
        """Return a space-joined sentence of sampled words.

        One word is always sampled.  Each of the remaining
        ``num_words - 1`` steps samples another word, provided
        ``self.markov_chain`` is non-empty.

        NOTE(review): every word is drawn from the overall word histogram;
        ``self.markov_chain`` is only checked for emptiness and never
        sampled -- confirm whether that is intended.
        """
        histogram = Dictogram(self.word_list)
        words = [histogram.sample()]
        for _ in range(num_words - 1):
            if len(self.markov_chain) > 0:
                words.append(histogram.sample())
        return ' '.join(words)
Esempio n. 25
0
def test_entries():
    """The histogram exposes its entries both as a mapping and as items."""
    fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
    fish_list = [('one', 1), ('fish', 4), ('two', 1), ('red', 1), ('blue', 1)]
    fish_dict = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}

    # A bare TestCase instance gives access to assertCountEqual.
    checker = unittest.TestCase()
    histogram = Dictogram(fish_words)

    # Mapping view: compared with fish_dict, order ignored.
    assert len(histogram) == 5
    checker.assertCountEqual(histogram, fish_dict)

    # Item view: (word, count) pairs compared with fish_list, order ignored.
    entries = histogram.items()
    assert len(entries) == 5
    checker.assertCountEqual(entries, fish_list)
class MarkovChain(Dictogram):
    """First-order Markov chain over lowercased, letters-only words.

    Each key is a word; its value is a ``Dictogram`` of the words that
    follow it.  ``start_tokens`` and ``stop_tokens`` hold the words seen at
    sentence starts and ends.
    """

    def __init__(self, word_list):
        """Build the chain and the start/stop histograms.

        Each word is stripped of every non ASCII-letter character.

        * ``self.start_tokens`` counts the first word and every capitalised
          word (except the last one) that follows a word ending in
          punctuation.
        * ``self.stop_tokens`` counts every word ending in punctuation
          (stripped, original case).

        Unlike the previous implementation, ``word_list`` is not modified.
        Stripping words in place removed the punctuation that later start
        and stop checks depended on.  The key lookup now uses the
        lowercased word, so capitalised words no longer replace their
        existing histogram.  An empty list yields an empty chain.

        :param word_list: raw word tokens (strings)
        """
        super().__init__()
        self.start_tokens = Dictogram()
        self.stop_tokens = Dictogram()
        if not word_list:
            return

        cleaned = [re.sub("[^a-zA-Z]", '', token) for token in word_list]

        # The very first word always opens a sentence.
        self.start_tokens.add_count(cleaned[0].lower(), 1)
        for i in range(1, len(word_list) - 1):
            token = word_list[i]
            previous = word_list[i - 1]
            # Slicing (token[:1]) and the truthiness check keep empty
            # tokens from raising IndexError.
            if (token[:1].isupper() and previous
                    and previous[-1] in string.punctuation):
                self.start_tokens.add_count(cleaned[i].lower(), 1)

        for token, stripped in zip(word_list, cleaned):
            if token and token[-1] in string.punctuation:
                self.stop_tokens.add_count(stripped, 1)

        for current, following in zip(cleaned, cleaned[1:]):
            key = current.lower()
            if key in self:
                self[key].add_count(following.lower(), 1)
            else:
                self[key] = Dictogram([following.lower()])

    @staticmethod
    def _weighted_choice(histogram):
        """Return a key chosen in proportion to its count, or None if empty."""
        if not histogram:
            return None
        keys = list(histogram.keys())
        weights = [histogram[key] for key in keys]
        return random.choices(keys, weights=weights)[0]

    def random_walk(self, length=10):
        """Return a sentence of up to ``length`` words plus an end word.

        The walk starts from ``start_word()`` and repeatedly samples the
        next word from the current word's histogram.  It stops early when
        the current word has no recorded follower (previously a
        ``KeyError``).  The result is the words joined by single spaces,
        followed by ``". "``.  An empty chain yields an empty string.
        """
        word = self.start_word()
        if word is None:
            return ""
        words = [word]
        word = word.lower()
        for _ in range(length - 1):
            if word not in self:
                # Dead end: nothing ever followed this word.
                break
            word = self[word].sample()
            words.append(word)
        ending = self.end_word()
        if ending is not None:
            words.append(ending)
        return " ".join(words) + ". "

    def start_word(self):
        """Return a capitalised start word, weighted by its count.

        Returns ``None`` when no start tokens were recorded.  The earlier
        version sized its random draw by the number of distinct tokens and
        could fall through without returning a word.
        """
        choice = self._weighted_choice(self.start_tokens)
        return None if choice is None else choice.capitalize()

    def end_word(self):
        """Return a stop word, weighted by its count.

        Returns ``None`` when no stop tokens were recorded; the earlier
        version looped forever in that case.
        """
        return self._weighted_choice(self.stop_tokens)
Esempio n. 27
0
 def test_update_2(self):
     """``update`` adds one count per word of the given iterable.

     Uses ``assertEqual``; the ``assertEquals`` alias is deprecated and
     was removed in Python 3.12.
     """
     dsg = Dictogram(self.words)
     dsg.update(self.words)
     self.assertEqual({'a': 2, 'b': 2, 'c': 2}, dsg)
     dsg.update(('c', 'b',))
     self.assertEqual({'a': 2, 'b': 3, 'c': 3}, dsg)
     dsg.update(('b',))
     self.assertEqual({'a': 2, 'b': 4, 'c': 3}, dsg)
def random_walk(word_list, length):
    """Return a list of words produced by a random walk over ``word_list``.

    The first word is sampled from a ``Dictogram`` of all words.  For each
    of the remaining ``length - 1`` steps, ``new_chain`` is built for the
    latest word.  If that chain is non-empty, a word is sampled from it and
    appended; otherwise the step adds nothing.
    """
    current = Dictogram(word_list).sample()
    sentence = [current]
    for _ in range(length - 1):
        chain = new_chain(word_list, current)
        if len(chain) > 0:
            current = chain.sample()
            sentence.append(current)
    return sentence
Esempio n. 29
0
    def add_count(self, word_1, word_2):
        """Record one occurrence of ``word_2`` following ``word_1``.

        ``self.tokens`` always grows by one.  A ``word_1`` already present
        has ``word_2`` added to its histogram.  Otherwise a new
        ``Dictogram`` containing ``word_2`` is stored and ``self.types``
        grows by one.
        """
        self.tokens += 1

        if word_1 in self:
            self[word_1].add_count(word_2)
            return

        # Unseen key (including the very first insertion).
        self[word_1] = Dictogram([word_2])
        self.types += 1
Esempio n. 30
0
    def create_nth_chain(self, words_list):
        """Populate this mapping with ``self.order``-word states.

        A window of ``self.order`` words slides over ``words_list``.  Each
        window, joined with single spaces, becomes a key whose value is a
        ``Dictogram``; the single word that follows the window is counted
        in it.  Windows whose first character is an ASCII capital letter
        are also counted in the ``'start'`` entry.

        NOTE(review): ``self.get('start')`` is used without a check, so a
        ``'start'`` entry presumably has to exist before this is called --
        confirm against the caller.
        """
        # Index of the first word of the current window
        start = 0
        # Index one past the last word of the window (slices exclude it)
        end = self.order

        while end <= len(words_list) :
            # Current state: the words in the window joined by spaces
            state = ' '.join(words_list[start:end])

            # Create a histogram for the state the first time it is seen
            if self.get(state) == None:
                # Not in the mapping yet, so add it
                self[state] = Dictogram()


            # A state beginning with a capital letter counts as a start state
            # (re.match only matches at the beginning of the string)
            if re.match('[A-Z]', state) is not None:
                self.get('start').add_count(state)


            # Slide the window forward by one word
            start += 1
            end += 1

            # Only record a follower if the window is still inside the list
            if end <= len(words_list):
                # After the increment, end-1 is the word right after the
                # state that was just processed
                next_state = ' '.join(words_list[end-1:end])
                # Count that word as a follower of the processed state
                self.get(state).add_count(next_state)