Example no. 1
def higher_markov(corpus):
    '''Creates a 2nd order markov chain with a histogram for each word pair.'''
    markov_dict = {}

    # stop two short of the end so corpus[i + 2] is always in range
    for i in range(len(corpus) - 2):
        first = corpus[i]
        second = corpus[i + 1]

        if second != '<STOP>':
            third = corpus[i + 2]

            key = (first, second)
            if key not in markov_dict:
                markov_dict[key] = Dictogram()

            markov_dict[key].add_count(third)

    return markov_dict
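As a quick check of the shape this produces, here is a hypothetical call on a tiny corpus (assuming a Dictogram class like the sketch included after the test example further down): the keys are (word, next word) pairs, and each value counts the word that follows that pair.

corpus = ['one', 'fish', 'two', 'fish', '<STOP>']
chain = higher_markov(corpus)

# keys are (word, next word) pairs; values count the word that follows
print(chain[('one', 'fish')].frequency('two'))   # 1
print(chain[('fish', 'two')].frequency('fish'))  # 1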
def create_markov_dict(text):
    """Create a dictionary with a tuple of two words as key and a Dictogram of follower words as value."""
    word_list = text.split(" ")
    markov_dict = {}

    for word_index in range(len(word_list) - 2):
        current_tuple = tuple(word_list[word_index:word_index + 2])
        next_word = word_list[word_index + 2]

        if current_tuple in markov_dict:
            markov_dict[current_tuple].add_count(next_word)
        else:
            markov_dict[current_tuple] = Dictogram([next_word])

    pprint(markov_dict)

    return markov_dict
    def test_add_count(self):
        histogram = Dictogram(self.fish_words)
        # Add more words to update frequency counts
        histogram.add_count('two', 2)
        histogram.add_count('blue', 3)
        histogram.add_count('fish', 4)
        histogram.add_count('food', 5)
        # Verify updated frequency count of all words
        assert histogram.frequency('one') == 1
        assert histogram.frequency('two') == 3
        assert histogram.frequency('red') == 1
        assert histogram.frequency('blue') == 4
        assert histogram.frequency('fish') == 8
        assert histogram.frequency('food') == 5
        # Verify count of distinct word types
        assert histogram.types == 6
        # Verify total count of all word tokens
        assert histogram.tokens == 8 + 14
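None of the examples on this page include the Dictogram class itself. A minimal sketch consistent with the API the snippets exercise (add_count with an optional count, frequency, the types and tokens counters, and weighted sampling) could look like the following. This is an assumption pieced together from the snippets, not the implementation any particular example used; some examples instead wrap a plain dict in a dictionary_histogram attribute, or expose sample_by_frequency / return_weighted_random_word as their sampling method.

import random


class Dictogram(dict):
    """Hypothetical histogram of word frequencies backed by a dict."""

    def __init__(self, word_list=None):
        super().__init__()
        self.types = 0   # count of distinct words
        self.tokens = 0  # total count of all words
        if word_list is not None:
            for word in word_list:
                self.add_count(word)

    def add_count(self, word, count=1):
        """Increase the frequency of word by count (default 1)."""
        if word not in self:
            self.types += 1
        self[word] = self.get(word, 0) + count
        self.tokens += count

    def frequency(self, word):
        """Return how many times word appears, or 0 if absent."""
        return self.get(word, 0)

    def sample(self):
        """Return a word chosen at random, weighted by frequency."""
        target = random.uniform(0, self.tokens)
        running_total = 0
        for word, count in self.items():
            running_total += count
            if running_total >= target:
                return word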
Example no. 4
    def nth_order_markov_chain(self, order, text_list):
        """Build an nth-order markov chain: each window of `order` words maps to a
        Dictogram of the words observed to come after that window."""
        markov_dict = dict()
        # slide a window of `order` words across the list
        for index in range(len(text_list) - order):
            window = tuple(text_list[index:index + order])
            # save the follower once instead of indexing into the list twice
            follower = text_list[index + order]
            if window in markov_dict:
                # window already seen: update its histogram with the follower
                markov_dict[window].add_count(follower)
            else:
                # first time: create a new entry with window as key and a Dictogram as value
                markov_dict[window] = Dictogram([follower])
        return markov_dict
Example no. 5
    def build_markov(self, midi_data):
        markov_chain = {}

        # walk the note sequence pairwise: current note and the note after it
        for i in range(len(midi_data) - 1):
            current_note = midi_data[i]
            next_note = midi_data[i + 1]
            if current_note in markov_chain:  # already there
                # update the histogram for that note in the chain
                markov_chain[current_note].add_count(next_note)
            else:  # first entry
                markov_chain[current_note] = Dictogram([next_note])

        return markov_chain
    def build_markov(self):
        markov_dict = {}
        with open('holmes-text.txt') as file:
            corpus = file.read().split()

        i = 0
        while i + 1 < len(corpus):
            word = corpus[i]
            next_word = corpus[i + 1]
            if markov_dict.get(word) is None:
                markov_dict[word] = Dictogram([next_word])
            else:
                markov_dict[word].add_count(next_word)
            i += 1

        return markov_dict
def order_sample(word_list, order=2):
    histogram = Dictogram(word_list)
    next_words = []

    # sample a random word from histogram
    next_word_string = histogram.sample()
    # find all the words that come after
    chain = new_chain(word_list, next_word_string)
    # append both words to a list
    next_words.append(next_word_string)

    for i in range(order - 1):
        if len(chain) > 0:
            next_word_string = chain.sample()
            next_words.append(next_word_string)
            chain = new_chain(word_list, next_word_string)

    words_str = " ".join(next_words)
    return words_str
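order_sample relies on a new_chain helper that is not shown here. Going by the comment "find all the words that come after", it presumably builds a histogram of the words that follow a given word in the list. A hypothetical version, assuming the dict-based Dictogram sketched above:

def new_chain(word_list, word):
    """Hypothetical helper: histogram of the words that appear right after `word`."""
    followers = [word_list[i + 1]
                 for i in range(len(word_list) - 1)
                 if word_list[i] == word]
    return Dictogram(followers)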
Example no. 8
def markov_dictograms(text_string):
    """
    argument: a string of text
    return: a dictionary of words and dictograms
    """

    text_array = text_string.split()
    dictogram_dictionary = {}

    for word_index in range(len(text_array) - 1):
        current_word = text_array[word_index]
        next_word = text_array[word_index + 1]

        if current_word in dictogram_dictionary:
            dictogram_dictionary[current_word].add_count(next_word)
        else:
            dictogram_dictionary[current_word] = Dictogram([next_word])

    return (dictogram_dictionary)
Example no. 9
    def build_markov(self, word_list):
        markov_chain = {}

        for i in range(len(word_list) - 1):
            # get the current word and the word after
            current_word = word_list[i]
            next_word = word_list[i + 1]

            if current_word in markov_chain:  # already there
                # update the histogram for that word so tokens/types stay in sync
                markov_chain[current_word].add_count(next_word)
            else:  # first entry
                markov_chain[current_word] = Dictogram([next_word])

        return markov_chain
Example no. 10
    def make_chain(self, word_list):
        """Create and return a markov chain from a given list of words"""
        markov = {}
        q = Queue()
        for i in range(len(word_list)):
            if i < self.n:
                q.enqueue(word_list[i])
            else:
                key = str(q)
                q.dequeue()
                q.enqueue(word_list[i])
                if markov.get(key) is None:
                    markov[key] = []
                markov[key].append(str(q))

        for key in markov:
            markov[key] = Dictogram(markov[key])

        return markov
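Example no. 10 assumes a Queue class with enqueue, dequeue, and a string form that joins the queued words (str(q) is used as the dictionary key for the current window). A minimal hypothetical stand-in consistent with that usage:

class Queue:
    """Hypothetical FIFO queue whose string form is its space-joined contents."""

    def __init__(self):
        self._items = []

    def enqueue(self, item):
        self._items.append(item)

    def dequeue(self):
        return self._items.pop(0)

    def __str__(self):
        # make_chain uses str(q) as the dictionary key for the current window
        return " ".join(self._items)

With this, the chain maps each n-word window (as a string) to a Dictogram of the windows that follow it.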
Example no. 11
    def walk(self, num_words):
        """Walk the stored markov chain and return num_words words as a string."""
        # select a random starting word from the chain's keys
        current_word = random.choice(list(self.markov_chain.keys()))
        # build up the sentence word by word
        sentence = []
        for i in range(num_words):
            # if the walk hits a word with no recorded followers, restart at random
            if current_word not in self.markov_chain:
                current_word = random.choice(list(self.markov_chain.keys()))
            # pick the next word by weighted frequency from this word's Dictogram
            temp_word = self.markov_chain[current_word].sample_by_frequency()
            # append it to our sentence
            sentence.append(temp_word)
            # move the walk forward to the word we just chose
            current_word = temp_word
        # return the list as a string
        return " ".join(sentence)
Example no. 12
    def build_markov(self, word_list):
        markov_chain = {}

        for i in range(len(word_list) - 1):
            #get the current word and the word after
            current_word = word_list[i]
            next_word = word_list[i + 1]

            if current_word in markov_chain.keys():  #already there
                #get the histogram for that word in the chain
                histogram = markov_chain[current_word]
                #add to count
                # histogram[next_word] = histogram.get(next_word, 0) + 1
                histogram.add_count(
                    next_word)  # ensures that tokens/total is updated
            else:  #first entry
                markov_chain[current_word] = Dictogram([next_word])

        return markov_chain
Example no. 13
def markov_chain_nth_order(token_list, order=4):
    """Returns an nth-order chain: each key is a tuple of `order` consecutive words,
    each value a Dictogram of the words that follow that window.
    Ex (order=1): ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish', 'two', 'fish']
    {('one',): {'fish': 1}, ('fish',): {'two': 2, 'red': 1, 'blue': 1}, ('two',): {'fish': 2}, ('red',): {'fish': 1}, ('blue',): {'fish': 1}}"""
    nest_dict = {}

    for i in range(len(token_list) - order):
        # the window of `order` words acts as the state / dictionary key
        type_storage = tuple(token_list[i + index] for index in range(order))
        if type_storage not in nest_dict:
            nest_dict[type_storage] = Dictogram()
        # count the word that follows this window
        nest_dict[type_storage].add_count(token_list[i + order])

    return nest_dict
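Once markov_chain_nth_order has built the tuple-keyed chain, generating text means sliding the tuple window one word at a time, much as Example no. 29 below does. A hypothetical walk over such a chain, assuming the Dictogram sketch shown earlier:

import random

def walk_nth_order(chain, num_words):
    """Hypothetical walk: slide a tuple window over an nth-order chain."""
    state = random.choice(list(chain.keys()))
    words = list(state)
    for _ in range(num_words):
        if state not in chain:
            break  # dead end: no observed continuation for this window
        next_word = chain[state].sample()
        words.append(next_word)
        state = state[1:] + (next_word,)
    return " ".join(words)

# chain = markov_chain_nth_order("one fish two fish red fish blue fish".split(), order=2)
# print(walk_nth_order(chain, 10))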
Example no. 14
def markov_chain(words_list):
    """Count occurences in the given list of words and
        return that data structure"""
    # create new dictionary
    markov = {}
    # iterate over the corpus
    for i in range(len(words_list) - 1):
        # create two variables for current word and current word + 1 (next)
        current_word = words_list[i]
        next_word = words_list[i + 1]
        # check if word is key in dictionary
        if current_word in markov:
            # if key is in big dictionary, update Dictogram
            markov[current_word].add_count(next_word)
        # if word is not a key in dictionary, create key with value as dictogram
        else:
            # Dictogram key will be next with a value of 1
            markov[current_word] = Dictogram([next_word])
    # return dictionary
    return markov
    def _create_chain(self, word_list):
        for i in range(0, len(word_list)):
            # The current word in the iteration
            word = word_list[i]
            try:
                # The word 1 index ahead of the current word, aka the next word in the sentence
                # has to be in try except because of index out of range exception on end of list
                next_word = word_list[i + 1]
            # If index error then no new words in list to add so break out
            except IndexError:
                break
            # The word dictogram if it exists
            word_dicto = self.get(word, None)
            # If word exists then add a count of 1, otherwise create a new dictogram with the next word
            if word_dicto:
                # Add a new word entry to the dictogram if the dictogram already exists
                word_dicto.add_count(next_word, 1)
            else:
                # Create a new dictogram for the word
                self[word] = Dictogram([next_word])
Example no. 16
    def populate_chain(self):
        """Construct a dictionary to represent the MarkovChain state
           transitions of any order.

        """
        chain = dict()
        i = 0
        num_words = len(self.words_list)
        while i < num_words - self.order:  # avoid IndexError at end
            state, state_after = self.form_states(i)
            # create a word frequency dict to go along with each state
            if chain.get(state, None) is None:
                list_of_states = []
                list_of_states.append(state_after)
                chain[state] = Dictogram(list_of_states)
            # if the state already exists, add the token and count
            else:
                chain[state].add_count(state_after)
            i += 1  # move index over to start recording of next state
        return chain
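populate_chain leans on a form_states helper that is not shown. Given how it is used, it presumably returns the tuple of self.order words starting at index i together with the single word that follows. A hypothetical version:

    def form_states(self, i):
        """Hypothetical helper: the order-length window at i and the word after it."""
        state = tuple(self.words_list[i:i + self.order])
        state_after = self.words_list[i + self.order]
        return state, state_after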
def second_order(file):
    with open(file) as f:  # access file
        text = f.read()  # read the file contents
    markov_dict = {}  # empty dictionary for the chain
    word_array = [
        word for line in text.split('\n') for word in line.split(' ')
    ]  # removes line breaks and whitespace, returns a list of individual words

    for index in range(len(word_array) - 2):
        start_word = word_array[index]
        next_word = word_array[index + 1]
        next_next_word = word_array[index + 2]
        word_pair = (start_word, next_word)

        if word_pair not in markov_dict:
            markov_dict[word_pair] = Dictogram([next_next_word])
        else:
            markov_dict[word_pair].add_count(next_next_word)
    return markov_dict
Example no. 18
    def build_sentence(self, sentence):
        next_word_index = 0
        next_next_word_index = 1
        # pad the start of the sentence with two start tokens
        for _ in range(2):
            sentence.insert(0, '$START$')
        for word in sentence:
            next_word_index += 1
            next_next_word_index += 1
            if next_next_word_index >= len(sentence):
                break
            next_word = sentence[next_word_index]
            next_next_word = sentence[next_next_word_index]
            pair = (word, next_word)
            if pair not in self:
                self[pair] = Dictogram()
            self[pair].add_count(next_next_word)
        return self
Example no. 19
def nth_markov_dictograms(text_string, nth_order):
    """
    argument: a string of text and whatever order markov chain
    return: a dictionary of word snippets and dictograms of single words that follow the snippets
    """

    text_array = text_string.split()
    dictogram_dictionary = {}

    for word_index in range(len(text_array) - nth_order):
        current_tuple = tuple(text_array[word_index:word_index + nth_order])
        next_word = text_array[word_index + nth_order]

        if current_tuple in dictogram_dictionary:
            dictogram_dictionary[current_tuple].add_count(next_word)
        else:
            dictogram_dictionary[current_tuple] = Dictogram([next_word])

    return dictogram_dictionary
Example no. 20
    def build_markov(self, word_list):
        markov_chain = {}

        #traverse the string
        for i in range(len(word_list) - 1):
            #get the current word and the word after
            current_word = word_list[i]
            next_word = word_list[i + 1]

            #checks if current word is in the chain already
            if current_word in markov_chain.keys():
                #get the histogram(dic) for that word in the chain
                histogram = markov_chain[current_word]
                #finds the next_word in the histogram and adds to its weight (chance of it showing up next after current word)
                histogram.add_count(next_word)
                #histogram[next_word] = histogram[next_word].add_count(next_word)
            else:  #first entry in the chain and creates a new dictionary with next_word as the first entry
                markov_chain[current_word] = Dictogram([next_word])
            print(markov_chain[current_word], "|")

        return markov_chain
Example no. 21
    def build_markov(self):

        with open(sys.argv[1]) as f:
            content = f.read().split()

        markov_dict = {}

        i = 0
        while i + 1 < len(content):
            word = content[i]
            next_word = content[i + 1]
            if markov_dict.get(word) is None:
                markov_dict[word] = Dictogram([next_word])
            else:
                markov_dict[word].add_count(next_word)
            i += 1

        return markov_dict
Example no. 22
    def second_order(self):

        with open(sys.argv[1]) as f:
            content = f.read().split()

        markov_dict = {}
        i = 0

        while i + 2 < len(content):
            word_tuple = (content[i], content[i + 1])
            next_word = content[i + 2]
            if markov_dict.get(word_tuple) is None:
                markov_dict[word_tuple] = Dictogram([next_word])
            else:
                markov_dict[word_tuple].add_count(next_word)
            i += 1

        return markov_dict
Example no. 23
def second_order_markov_histo(word_list):
    '''Creates a histogram that represents the markov chain for each consecutive word pair in a list.'''
    histo = {}
    for i in range(len(word_list) - 1):
        key_word = word_list[i]
        next_key_word = word_list[i + 1]
        new_key = (key_word, next_key_word)

        if i + 2 < len(word_list):
            next_next_key = word_list[i + 2]

            if new_key not in histo.keys():
                key_histo = []
                histo[new_key] = key_histo
            histo[new_key].append(next_next_key)

    value_list = histo.items()
    for key, value in value_list:
        histo[key] = Dictogram(value)

    return histo
def markov_chain(text):
    """Create a chain to scramble the words based on frequency and Dictogram."""
    markov = {}

    for index in range(len(text) - 1):
        word_1 = text[index]
        word_2 = text[index + 1]

        # create an empty Dictogram the first time a word is seen
        if word_1 not in markov:
            markov[word_1] = Dictogram()

        # count the word that follows
        markov[word_1].add_count(word_2)

    return markov
Example no. 25
def main():
    """Start main process."""
    start_time = time.time()
    file_input = grab_file()
    # Grabs how long you want string to be for later; defaults to 10
    output_len = request.args.get('num', '')
    if output_len == '':
        word_amt = 10
    else:
        word_amt = int(output_len)

    # Grab the input, make into dictionary/list of words + occurences
    # Changed into a class for more functionality; trades speed though
    input_histo = Dictogram(file_input)
    input_len = input_histo.tokens

    # List based on probability
    # Grabs input, length of input, and desired string length
    finished_list = probability_gen(input_histo, input_len, word_amt)

    if word_amt == 1:
        word_print(finished_list)
    else:
        sentence_print(finished_list)
    """Below are the three alternate functions not needed for the tweetgen."""
    # Word you want to search up for frequency; change as needed
    word = "the"

    histogram(input_histo)
    # unique words... was replaced by dictogram
    # unique_words(input_histo.tokens)
    input_histo.tokens
    # word frequency... also replaced
    input_histo.count(word)

    print("--- %s seconds --- end after frequency \n" %
          (time.time() - start_time))
    return render_template('main.html', output=sentence_print(finished_list))
Example no. 26
def table_generator(corpus_text, order):
    """Make the actual markov table."""
    """It's a hashtable with tuples, list => tuples, dictionary."""
    # Corpus in linkedlist form
    corpus_ll = LinkedList(corpus_text)
    # Window and the table
    window_queue = LinkedList()
    current_table = HashTable()

    # Current window for iterating through corpus; order changes size
    for i in range(order):
        window_queue.append(corpus_text[i])

    # Add above to hashtable + the word that comes after
    current_table.set((window_queue.items()), [corpus_text[order + 1]])

    # For the rest
    for i in range(corpus_ll.length() - (order + 1)):
        # Dequeue window, add next to window;
        window_queue.move()
        window_queue.append(corpus_text[i + order])
        # Word after window
        next_word = corpus_text[i + order + 1]
        # Check if window exists in hash table already
        # Add tuple + new word to list, or tuple and list w/ new word
        if current_table.contains((window_queue.items())):
            currentvalues = current_table.get(window_queue.items())
            currentvalues.append(next_word)
            new_value = currentvalues
            current_table.set((window_queue.items()), new_value)
        else:
            current_table.set((window_queue.items()), [next_word])

    # Turn the second element (list) into a dictionary
    for key, value in current_table.items():
        current_table.set(key, Dictogram(value))

    return current_table
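Example no. 26 depends on custom LinkedList and HashTable classes that are not included here. From the calls made in table_generator, the assumed interface is roughly: LinkedList(iterable), append, move (drop the head), items (a tuple usable as a key), and length; HashTable with set, get, contains, and items. Hypothetical list- and dict-backed stand-ins, only to make the example readable:

class LinkedList:
    """Hypothetical minimal stand-in (list-backed) for the custom LinkedList."""
    def __init__(self, iterable=None):
        self._data = list(iterable) if iterable is not None else []
    def append(self, item):
        self._data.append(item)
    def move(self):
        return self._data.pop(0)   # drop the head of the window
    def items(self):
        return tuple(self._data)   # tuple so it can be used as a dict key
    def length(self):
        return len(self._data)

class HashTable:
    """Hypothetical minimal stand-in (dict-backed) for the custom HashTable."""
    def __init__(self):
        self._data = {}
    def set(self, key, value):
        self._data[key] = value
    def get(self, key):
        return self._data[key]
    def contains(self, key):
        return key in self._data
    def items(self):
        return list(self._data.items())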
Example no. 27
    def build_markov_queue(self, word_list):
        markov_chain = {}
        curr_q = []
        nex_q = []

        for i in range(len(word_list) - 1):
            curr_q.append(word_list[i])
            nex_q.append(word_list[i + 1])

            if len(curr_q) == 2:
                current_q = tuple(curr_q)
                next_q = tuple(nex_q)
                curr_q.pop(0)
                nex_q.pop(0)

                if current_q in markov_chain:
                    # update the existing histogram so tokens/types stay in sync
                    markov_chain[current_q].add_count(next_q)
                else:
                    markov_chain[current_q] = Dictogram([next_q])
        return markov_chain
    def higher_order(self, new_words):
        """ Goes through word_list and combines two words in a string.
        The amount of words is based on the order number. Checks if
        string matches and combines with the string"""
        dictionary = dict()
        key_words = new_words.split()
        words = []
        next_words = []
        pairs = []

        for i in range(len(self.word_list) - 1):
            words.clear()
            for x in range(self.order):
                if i < (len(self.word_list) - self.order):
                    words.append(self.word_list[i + x])
            if words == key_words:
                next_words.clear()
                for x in range(self.order):
                    next_words.append(self.word_list[i + (x + 1)])
                next_words_str = ' '.join(next_words)
                pairs.append(next_words_str)

        dictionary[new_words] = Dictogram(pairs)
        return dictionary
Example no. 29
    def generate_random_sentence(self, length, start_sequence):
        """
        Генерация предложения

        :param length: количестов слов в предложение
        :type length: int
        :param start_sequence: начало предложения
        :type start_sequence: str
        :return: предложение
        :rtype: str
        """
        current_word_sequence = tuple(map(int, start_sequence.split()))
        sentence = list(current_word_sequence)
        sentence_num = 0
        while sentence_num < length:
            current_dictogram = Dictogram(
                self.get_sequence(' '.join(map(str, current_word_sequence))))
            random_weighted_word = current_dictogram.return_weighted_random_word(
            )
            current_word_sequence = current_word_sequence[1:] + tuple(
                [random_weighted_word])
            sentence.append(current_word_sequence[-1])
            sentence_num += 1
        return sentence
    def check_key(self, key, value):
        if key in self:
            self[key].add_count(value)
        else:
            self[key] = Dictogram([value])