Example #1
0
def unsupervised_learning(n_states, n_iters):
    '''
    Trains an HMM with unsupervised_HMM on the sequences returned by
    Utility.load_ron_hidden() and prints the learned transition (A) and
    observation (O) matrices.

    Arguments:
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument
                    (presumably the number of training iterations).
    '''
    def show_matrix(title, matrix):
        # One output line per matrix row; every entry is left-aligned in a
        # 12-character column in scientific notation.
        print(title)
        print('#' * 70)
        for row_idx in range(len(matrix)):
            row = matrix[row_idx]
            print(''.join("{:<12.3e}".format(row[col_idx])
                          for col_idx in range(len(row))))
        print('')
        print('')

    # The second element of the loader's result is not needed here.
    genres, _ = Utility.load_ron_hidden()

    model = unsupervised_HMM(genres, n_states, n_iters)

    show_matrix("Transition Matrix:", model.A)
    show_matrix("Observation Matrix:  ", model.O)
Example #2
0
def generate_sonnet(poems, lines, syllables, rhymes=None):
    """
    Train a 25-state HMM (100 iterations) on featurize(poems), draw a single
    10-observation emission and render it into a 14-line sonnet with
    generate_words. The sonnet is printed and returned as one string.

    Arguments:
        poems:      Input passed to featurize().
        lines:      Not used by this function.
        syllables:  Forwarded unchanged to generate_words().
        rhymes:     Optional dict mapping a word to a collection of words
                    that rhyme with it. When given, line endings are seeded
                    in rhyming pairs following abab cdcd efef gg.
    """
    POSList, POSlookup, features = featurize(poems)
    model = unsupervised_HMM(features, 25, 100)
    emission, states = model.generate_emission(10)

    if rhymes is not None:
        # abab cdcd efef gg: each tuple is (line index, index of its partner).
        partners = [(0, 2), (1, 3), (4, 6), (5, 7), (8, 10), (9, 11),
                    (12, 13)]
        seeds = ["" for _ in range(14)]
        for first, second in partners:
            # Pick a random dictionary entry, then a random rhyme for it.
            key, val = random.choice(list(rhymes.items()))
            pair = np.random.choice(val)
            seeds[first] += str(key)
            seeds[second] += str(pair)
        rendered = [
            generate_words(emission, POSlookup, syllables, True, seed)
            for seed in seeds
        ]
        sonnet = "\n".join(rendered)
    else:
        # Unrhymed: every line is rendered from the same emission and is
        # followed by a newline (including the last one).
        sonnet = "".join(
            generate_words(emission, POSlookup, syllables) + "\n"
            for _ in range(14))

    print(sonnet)
    return sonnet
def unsupervised_learning(sequence, n_states, n_iters):
    '''
    Trains an HMM on `sequence` with unsupervised_HMM and returns its learned
    matrices. (Matrix printing is disabled in this variant.)

    Arguments:
        sequence:   Training data handed straight to unsupervised_HMM.
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.

    Returns:
        The tuple (A, O) read from the trained model.
    '''
    model = unsupervised_HMM(sequence, n_states, n_iters)
    transition, observation = model.A, model.O
    return transition, observation
Example #4
0
def train(X, n_states, name):
    """
    Fit an HMM to X with unsupervised_HMM (1000 passed as its third
    argument), print the transition and observation matrices, and hand them
    to writeModel together with `name`.
    """
    model = unsupervised_HMM(X, n_states, 1000)
    transition = model.A
    observation = model.O

    # Both matrices are dumped in the same layout: a heading, a rule, one
    # line per row (12-wide scientific-notation columns), two blank lines.
    for heading, matrix in (("Transition Matrix:", transition),
                            ("Observation Matrix:  ", observation)):
        print(heading)
        print('#' * 70)
        for row in range(len(matrix)):
            cells = ("{:<12.3e}".format(matrix[row][col])
                     for col in range(len(matrix[row])))
            print(''.join(cells))
        print('')
        print('')

    # Write trained model to files
    writeModel(transition, observation, name)
Example #5
0
    def __init__(self, X, word2index, word2syllable, n_states, N_iters):
        """
        Train an HMM on X and keep the vocabulary lookups on the instance.

        Sets self.HMM, self.word2index, self.word2syllable and
        self.index2word (the inverse of word2index).
        """
        # Unsupervised training; labelling words by syllable count would
        # allow supervised_HMM(X, Y) to be used instead.
        self.HMM = unsupervised_HMM(X, n_states, N_iters)

        self.word2index = word2index
        self.word2syllable = word2syllable

        # Inverse lookup: index -> word.
        reverse_lookup = {}
        for word, index in self.word2index.items():
            reverse_lookup[index] = word
        self.index2word = reverse_lookup
Example #6
0
def unsupervised_learning(n_states, n_iters):
    '''
    Trains an HMM using unsupervised learning on the sequences returned by
    Utility.load_poem_hidden() and generates one line of words whose total
    syllable count does not exceed 10.

    Arguments:
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.

    Returns:
        A list of words (strings looked up through the inverted genre_map).
    '''
    genres, genre_map = Utility.load_poem_hidden()
    # The last loaded sequence is dropped before training.
    # NOTE(review): presumably a trailing empty/partial entry - confirm.
    genres = genres[:-1]

    # Train the HMM.
    HMM = unsupervised_HMM(genres, n_states, n_iters)

    # .items() works on Python 2 and 3; iteritems() exists only on Python 2.
    inv_map = {v: k for k, v in genre_map.items()}

    # generate_emission is expected to return whitespace-separated integers.
    word_emission = []
    for token in HMM.generate_emission(10).split():
        num1 = int(token)
        # Observations that map to '%' are never emitted: resample single
        # observations until something else is drawn.
        while inv_map[num1] == '%':
            num1 = int(HMM.generate_emission(1).split()[0])
        word_emission.append(inv_map[num1])

    # Keep the longest prefix that stays within 10 syllables.
    syl_count = 0
    for i, word in enumerate(word_emission):
        word_syllables = nsyl(word)[0]
        if syl_count + word_syllables > 10:
            word_emission = word_emission[:i]
            break
        syl_count += word_syllables

    return word_emission
def ten_syllables_poem_generator(n_states, N_iters, k):
    '''
    Trains an HMM using unsupervised learning and generates k 14-line sonnets.
    Each sonnet is printed and appended to 'output/10_syllables_poem.txt'.
    A fresh HMM is trained for every sonnet.

    Arguments:
        k:          Number of sonnets to generate.
        n_states:   Number of hidden states that the HMM should have.
        N_iters:    Number of iterations for the unsupervised learning
                    (EM algorithm)
    '''
    # Data to train on from pre-processing
    data, words_list, syllables, end_syllables, rhyme_dict, stress_dict = \
        processed_shakespeare_data2()

    # Generate k input sequences.
    for i in range(k):

        print('Training unsupervised HMM...')

        # Train the HMM.
        HMM = unsupervised_HMM(data, n_states, N_iters)

        print('Generating emission...')

        # Generate a 14-line sonnet with 10 syllables in each line
        sonnet_lines = HMM.generate_sonnet_emission(words_list, syllables,
                                                    end_syllables)
        punct_marks, punct_freq = punctuation_freq_shakespeare()

        print('Sonnet # ' + str(i))

        # The context manager closes the file even if a line fails to render.
        with open('output/10_syllables_poem.txt', 'a+') as f:
            for s, emission in enumerate(sonnet_lines):
                line = ' '.join([words_list[j] for j in emission])
                if s == 13:
                    # Last line of sonnet ends with period.
                    line += '.'
                else:
                    # Other lines end with a mark sampled by frequency.
                    line += random.choices(punct_marks,
                                           weights=punct_freq)[0]
                # Slicing instead of line[0] keeps an empty line from raising.
                line = line[:1].upper() + line[1:]
                print(line)
                f.write(line)
                f.write('\n')
            f.write('\n\n')

    print('')
    print('')
def unsupervised_learning(n_states, n_iters):
    '''
    Loads observations through Utility.load and trains an HMM on them using
    unsupervised learning.

    Arguments:
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.

    Returns:
        The trained HMM and the observation map produced by Utility.load.
    '''
    # `shakespeare` is not defined in this function; it has to be supplied
    # by the enclosing module.
    sequences, obs_map = Utility.load(shakespeare)

    model = unsupervised_HMM(sequences, n_states, n_iters)
    return model, obs_map
Example #9
0
def train_HMM(ps, D, n_states, N_iters):
    '''
    Trains two HMMs using unsupervised learning and saves each with
    save_HMM: the first on the poems as given (saved with tag 2), the second
    after every poem has been rewritten in place (saved with tag 1).

    Note that `ps` is modified in place: each poem is reversed, a 0 is
    inserted at the front and the last element is removed.

    Arguments:
        ps: the list of poems, where each poem is a list of integers
            representing the tokens (words) of the poem.
        D: the number of "words" contained in ps.
        n_states: number of hidden states that the HMM should have.
        N_iters: the number of iterations the HMM should train for.
    '''
    # Purely informational banner.
    rule = '#' * 70
    heading = "{:^70}".format("Running Unsupervised Learning With %d States")
    for _ in range(2):
        print('')
    print(rule)
    print(heading % n_states)
    print(rule)
    for _ in range(2):
        print('')

    # First model: poems in their original order.
    model = unsupervised_HMM(ps, D, n_states, N_iters)
    model.save_HMM(n_states, N_iters, 2)

    # Rewrite every poem in place before the second training run.
    for poem in ps:
        poem.reverse()
        poem.insert(0, 0)
        poem.pop()

    # Second model: trained on the rewritten poems.
    model = unsupervised_HMM(ps, D, n_states, N_iters)
    model.save_HMM(n_states, N_iters, 1)
Example #10
0
def main():
    """
    Train a 10-state HMM on data/shakespeare.txt (relative to the current
    working directory) and produce the visualizations: a word cloud of the
    corpus, sparsity plots of A and O, per-state word clouds and an
    emission animation.
    """
    corpus_path = os.path.join(os.getcwd(), 'data/shakespeare.txt')
    # Read inside a context manager so the file handle is closed promptly.
    with open(corpus_path) as corpus_file:
        text = corpus_file.read()
    text_to_wordcloud(text, title='Shakespeare')

    obs, obs_map = parse_observations(text)
    hmm8 = unsupervised_HMM(obs, 10, 100)

    visualize_sparsities(hmm8, O_max_cols=50)
    states_to_wordclouds(hmm8, obs_map)

    print("animate_emission")
    anim = animate_emission(hmm8, obs_map, M=8)
    # This part only works in Jupyter Notebook
    HTML(anim.to_html5_video())
Example #11
0
def unsupervised_learning(n_states, n_iters):
    '''
    Trains an HMM using unsupervised learning on the lines returned by
    load_Shakespeare_Lines() and prints 14 generated emissions of 10 words
    each.

    Arguments:
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.
    '''
    training_lines = load_Shakespeare_Lines()

    # Train the HMM.
    model = unsupervised_HMM(training_lines, n_states, n_iters)

    line_count = 14
    words_per_line = 10
    for _ in range(line_count):
        print(model.generate_emission(words_per_line, model.indexes))
Example #12
0
def matching_line(last_word, n_states, n_iters):
    '''
    Trains an HMM on the sequences returned by Utility.load_poem_hidden(),
    generates a line of at most 10 syllables and replaces its final word
    with a sampled word that rhymes with `last_word`.

    Arguments:
        last_word:  Word the generated line has to rhyme with.
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.

    Returns:
        A list of words. The rhyme substitution happens after the syllable
        trim, so the final syllable count is not re-checked.
    '''
    genres, genre_map = Utility.load_poem_hidden()
    # The last loaded sequence is dropped before training.
    genres = genres[:-1]
    # Train the HMM.
    HMM = unsupervised_HMM(genres, n_states, n_iters)

    # .items() works on Python 2 and 3; iteritems() exists only on Python 2.
    inv_map = {v: k for k, v in genre_map.items()}

    # generate_emission is expected to return whitespace-separated integers.
    word_emission = []
    for token in HMM.generate_emission(10).split():
        num1 = int(token)
        # Observations that map to '%' are never emitted: resample until
        # something else is drawn.
        while inv_map[num1] == '%':
            num1 = int(HMM.generate_emission(1).split()[0])
        word_emission.append(inv_map[num1])

    # Keep the longest prefix that stays within 10 syllables.
    syl_count = 0
    for i, word in enumerate(word_emission):
        word_syllables = nsyl(word)[0]
        if syl_count + word_syllables > 10:
            word_emission = word_emission[:i]
            break
        syl_count += word_syllables

    # Sample single observations until one rhymes with (and differs from)
    # last_word. NOTE: this loops forever if the model can never emit such
    # a word.
    while True:
        candidate_id = int(HMM.generate_emission(1).split()[0])
        potential_word = inv_map[candidate_id]
        if poetrytools.rhymes(potential_word,
                              last_word) and (potential_word != last_word):
            if word_emission:
                word_emission[-1] = potential_word
            else:
                # The trim above can leave nothing when the first word
                # already exceeds 10 syllables; assigning to index -1 would
                # raise IndexError.
                word_emission.append(potential_word)
            break

    return word_emission
Example #13
0
def unsupervised_learning(words, word_map, n_states, n_iters):
    '''
    Trains an HMM on `words` using unsupervised learning and returns it.

    Arguments:
        words:      Training sequences handed to unsupervised_HMM.
        word_map:   Accepted but not used by this function.
        n_states:   Number of hidden states that the HMM should have.
        n_iters:    Forwarded to unsupervised_HMM as its third argument.
    '''
    # Train the HMM. The trained model (including its A and O matrices) is
    # returned without being printed.
    trained_model = unsupervised_HMM(words, n_states, n_iters)
    return trained_model
Example #14
0
def unsupervised_learning(tokenized_lines):
    '''
    Generate a sonnet by training a HMM using unsupervised learning and then
    using the HMM to generate and print 14 lines of 10 words each. The number
    of hidden states used is the same as the number of part of speech labels
    for the words in the sonnet dataset (i.e. the same number of hidden
    states as supervised learning would use).

    Arguments:
        tokenized_lines: a list of lines tokenized as words
    '''

    # The part-of-speech map only determines how many hidden states to use;
    # the observation map turns generated observations back into words. The
    # list of observation sequences is what the HMM is trained on.
    states, state_POS_map, POS_state_map = convert_POS_to_states(
        tokenized_lines)
    observations, observation_word_map, word_observation_map = convert_lines_observations(
        tokenized_lines)

    # Create HMM and train it with unsupervised learning (50 iterations).
    # No A/O matrices are pre-allocated here: nothing read them.
    n_states = len(state_POS_map)
    hmm = unsupervised_HMM(observations, n_states, 50)

    # Generate 14 lines with 10 words each and print them out. Every word is
    # followed by a single space, including the last one on the line.
    for _ in range(14):
        obs = hmm.preethi_generate_emission(10)
        print(''.join(observation_word_map[j] + " " for j in obs))
Example #15
0
from HMM import unsupervised_HMM
from HMM_helper import (
    text_to_wordcloud,
    states_to_wordclouds,
    parse_observations,
    sample_sentence,
    visualize_sparsities,
    animate_emission,
)

# pre-processing
# NOTE(review): parse_data is not among the names imported above; confirm
# where it comes from.
poem_lists, uatrain_lists, volta_lists, couplet_lists, word_to_int, int_to_word = parse_data('data/shakespeare.txt')

# train HMM (20 hidden states, 10 iterations) on the poem sequences only
hmm = unsupervised_HMM(poem_lists, n_states=20, N_iters=10)

# sample naive sentence: print a heading, then one 10-word sample
print('Sample Naive Sentence:\n====================')
print(sample_sentence(hmm, word_to_int, n_words=10))

def write_naive_sonnet():
    """
    Build a 14-line sonnet from independent 10-word samples of the
    module-level `hmm`. Every line ends with ',\n' and a blank line is
    inserted before lines 0, 4, 8 and 12.
    """
    pieces = []
    for line_no in range(14):
        if line_no % 4 == 0:
            pieces.append('\n')
        pieces.append(sample_sentence(hmm, word_to_int, 10) + ',\n')
    return ''.join(pieces)

# Print a heading followed by one freshly generated naive sonnet and four
# trailing newlines.
# NOTE(review): 'Sonet' in the heading looks like a typo for 'Sonnet'; it is
# runtime output and is left unchanged here.
print('Naive Sonet:\n====================')
print (write_naive_sonnet() + '\n\n\n\n')
Example #16
0
# Convert every word of `text` into its position in `new_unique`. A word is
# looked up with a leading apostrophe when that form is listed in `outliers`.
ids = []
for i in text:
    line = []
    for j in i:
        word = '\'' + j
        if word in outliers:
            line.append(new_unique.index(word))
        else:
            line.append(new_unique.index(j))
    ids.append(line)

# Two consecutive pickles in one file: the id sequences, then the id map.
with open('obs.p', 'wb') as f:
    pickle.dump(ids, f)
    pickle.dump(ids_map, f)

# Train one model per hidden-state count, pickle it and print a 25-word
# sample. Each pickle file is written inside a context manager so the handle
# is flushed and closed deterministically.
# NOTE(review): hmm8 is trained with 10 states despite its name - confirm.
hmm8 = unsupervised_HMM(ids, 10, 100)
with open('hmm8.p', 'wb') as f:
    pickle.dump(hmm8, f)
print('Sample Sentence:\n====================')
print(sample_sentence(hmm8, ids_map, n_words=25))

hmm1 = unsupervised_HMM(ids, 1, 100)
with open('hmm1.p', 'wb') as f:
    pickle.dump(hmm1, f)
print('\nSample Sentence:\n====================')
print(sample_sentence(hmm1, ids_map, n_words=25))

hmm2 = unsupervised_HMM(ids, 2, 100)
with open('hmm2.p', 'wb') as f:
    pickle.dump(hmm2, f)
print('\nSample Sentence:\n====================')
print(sample_sentence(hmm2, ids_map, n_words=25))

hmm4 = unsupervised_HMM(ids, 4, 100)
Example #17
0
def naive_poem2_generator(n_states, N_iters, k):
    '''
    Trains an HMM using unsupervised learning and generates k 14-line sonnets.
    Each sonnet is printed and appended to 'output/naive_poem2.txt'; the
    printed and the written text are identical. A fresh HMM is trained for
    every sonnet.

    Arguments:
        k:          Number of sonnets to generate.
        n_states:   Number of hidden states that the HMM should have.
        N_iters:    Number of iterations for the unsupervised learning (EM algorithm)
    '''
    # Data to train on from pre-processing
    data, words_list, syllables, end_syllables, rhyme_dict = \
        processed_shakespeare_data2()

    # Generate k input sequences.
    for i in range(k):

        print('Training unsupervised HMM...')

        # Train the HMM.
        HMM = unsupervised_HMM(data, n_states, N_iters)

        print('Generating emission...')

        # Generate a 14-line sonnet in one long sequence of integers
        emission, states = HMM.generate_emission(140)

        sonnet_lines = [[] for _ in range(14)]
        e = 0
        # Split the sonnet sequence into 14 lines: keep consuming words
        # until a line holds at least 10 syllables. The bound on `e` keeps
        # the loop from indexing past the end of the emission.
        for j in range(14):
            line_syllables = 0
            while line_syllables < 10 and e < len(emission):
                word = words_list[emission[e]]
                # Capitalize first word in every line
                if line_syllables == 0:
                    word = word.capitalize()
                sonnet_lines[j].append(word)
                # Add number of syllables
                line_syllables += syllables[emission[e]][0]
                e += 1

        print('Naive Sonnet # ' + str(i))

        # The context manager closes the file even if writing fails.
        with open('output/naive_poem2.txt', 'a+') as f:
            for j, line in enumerate(sonnet_lines):
                if j == 13:
                    # Last line of sonnet ends with period.
                    ending = '.'
                else:
                    # Add some punctuation to the end of every sentence.
                    # Drawn once per line so the console and the file show
                    # the same mark.
                    ending = random.choice([';', '.', ',', ':', '!', ''])
                rendered = ' '.join(line) + ending
                print(rendered)
                f.write(rendered)
                f.write('\n')
            f.write('\n\n')

    print('')
    print('')
    """
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """
    Unpickle and return the object stored in the file '<name>.pkl'.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        payload = pickle.load(handle)
    return payload


#########################################################################
#                               Main code                               #
#########################################################################

# load data
training = load_obj('./sonnet_preprocessing_data/training_poem_data')

# training: one model per hidden-state count 10, 15, ..., 40 (500 iterations
# each); the matrices and the model itself are saved separately.
for i in range(10, 41, 5):
    # print() with a single argument behaves the same on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print('i =' + str(i))
    HMM = unsupervised_HMM(training, i, 500)
    A = HMM.A
    O = HMM.O

    save_obj(A, './data/transition_matrix_line_' + str(i))
    save_obj(O, './data/observation_matrix_line_' + str(i))
    save_obj(HMM, './data/hmm_line_' + str(i))
# We will be using the Constitution as our dataset. First, we visualize the entirety of the Constitution as a wordcloud:

# In[7]:

# Read the corpus inside a context manager so the file handle is closed.
with open(os.path.join(os.getcwd(), 'data/constitution.txt')) as corpus_file:
    text = corpus_file.read()
wordcloud = text_to_wordcloud(text, title='Constitution')

# ## Training an HMM

# Now we train an HMM on our dataset. We use 10 hidden states and train over 100 iterations:

# In[8]:

obs, obs_map = parse_observations(text)
hmm8 = unsupervised_HMM(obs, 10, 100)

# ## Part G: Visualization of the sparsities of A and O

# We can visualize the sparsities of the A and O matrices by treating the matrix entries as intensity values and showing them as images. What patterns do you notice?

# In[9]:

visualize_sparsities(hmm8, O_max_cols=50)

# ## Generating a sample sentence

# As you have already seen, an HMM can be used to generate sample sequences based on the given dataset. Run the cell below to show a sample sentence based on the Constitution.

# In[5]:
Example #20
0
def ten_syllables_rhyme_generator(n_states, N_iters, k, train_on='line'):
    '''
    Trains an HMM using unsupervised learning and generates k 14-line sonnets.
    A header and every sonnet are printed and appended to
    'output/10_syllables_rhyme.txt'. The HMM is trained once and reused for
    all k sonnets.

    Arguments:
        k:          Number of sonnets to generate.
        n_states:   Number of hidden states that the HMM should have.
        N_iters:    Number of iterations for the unsupervised learning
                    (EM algorithm)
        train_on:   Optional argument. Train on either line or sonnet.
                    Default to line.
    '''
    # Data to train on from pre-processing.
    data, words_list, syllables, end_syllables, rhyme_dict, stress_dict = \
        processed_shakespeare_data2()

    # If train on sonnet instead of line. rhyme_dict (and stress_dict) are
    # kept from the line-level loader above; only the other four change.
    if train_on == 'sonnet':
        data, words_list, syllables, end_syllables = \
            processed_shakespeare_data()

    print('Training unsupervised HMM...')

    # The context manager closes the file even if training or generation
    # raises.
    with open('output/10_syllables_rhyme.txt', 'a+') as f:
        header = '(%d states, %d iterations, training on each %s)\n\n' % \
            (n_states, N_iters, train_on)
        print(header)
        # The file gets one more trailing newline than the console.
        f.write(header + '\n')

        # Train the HMM.
        HMM = unsupervised_HMM(data, n_states, N_iters)

        # Generate k input sequences.
        for i in range(k):

            # Generate a 14-line sonnet
            sonnet_lines = HMM.generate_sonnet_rhyme(words_list, syllables,
                                                     end_syllables,
                                                     rhyme_dict)
            punct_marks, punct_freq = punctuation_freq_shakespeare()

            print('\n\nSonnet # ' + str(i + 1))

            for s, emission in enumerate(sonnet_lines):
                line = ' '.join([words_list[j] for j in emission])
                if s == 13:
                    # Last line of sonnet ends with period.
                    line += '.'
                else:
                    # Other lines end with a mark sampled by frequency.
                    line += random.choices(punct_marks,
                                           weights=punct_freq)[0]
                # Slicing instead of line[0] keeps an empty line from raising.
                line = line[:1].upper() + line[1:]
                print(line)
                f.write(line)
                f.write('\n')
            f.write('\n\n')

    print('')
    print('')
Example #21
0
    emStr = ''
    for obs in emission:
        emRate = [row[1] for row in POSlookup[obs]]
        emWords = [row[0] for row in POSlookup[obs]]
        emRate = np.array(emRate)
        emRate = emRate / sum(emRate)
        index = np.random.choice(np.arange(len(emRate)), p=emRate)
        emStr = emStr + emWords[index] + ' '
    return emStr


# Flatten the nested text into one list and join it into a single
# space-separated string for the corpus-wide word cloud.
text, chars = block_text()
flat_text = [item for sublist in text for item in sublist]
blockText = ' '.join(map(str, flat_text))
#### WORDCLOUD VISUALIZATION ####
wordcloud = text_to_wordcloud(blockText, title='Shakespeare Poems')

#### A AND O MATRIX VISUALIZATIONS ####

# Featurize the poems and train a 10-state HMM (10 passed as the third
# argument) on the resulting features.
poems, syllables = read_files(sep='poem')
POSList, POSlookup, features = featurize(poems)

HMM = unsupervised_HMM(features, 10, 10)

visualize_sparsities(HMM, O_max_cols=50)

#### STATE WORDCLOUD VISUALIZATIONS ####
# One word cloud per hidden state, titled "State <index>".
stateTexts = state_text(HMM.O, POSlookup)
for state, string in enumerate(stateTexts):
    title = "State " + str(state)
    stateCloud = text_to_wordcloud(string, title=title)
Example #22
0
    if len(text[idx]) > 25:
        word_list += nltk.word_tokenize(text[idx])

# Vocabulary: the distinct tokens collected above.
# NOTE(review): iteration order of a set of strings can differ between runs,
# so the word <-> integer ids below are presumably not reproducible - confirm
# whether that matters for saved models.
words = list(set(word_list))
# NOTE(review): num_words is computed before the punctuation tokens are
# filtered out below, so it still counts them - confirm this is intended.
num_words = len(words)

# Punctuation tokens that are excluded from the vocabulary and the sequences.
grammar = ["'", "'s", "(", ")", ",", ";", ".", ":", "?"]
words = [x for x in words if x not in grammar]

# Create a word -> integer mapping
word_to_int = {word: i for i, word in enumerate(words)}

# Create an integer -> word mapping
int_to_word = {i: word for i, word in enumerate(words)}

# Create training set for the HMM
train_x = []

# Only entries longer than 25 characters are used; each one is tokenized and
# converted to a sequence of word ids with the punctuation tokens dropped.
for idx in range(len(text)):
    if len(text[idx]) > 25:
        new_seq = nltk.word_tokenize(text[idx])
        new_seq = [
            int(word_to_int[word]) for word in new_seq if (word not in grammar)
        ]
        train_x.append(new_seq)

# Train HMM
print("Training HMM...")
n_states = 5
# No iteration count is passed here, unlike most other call sites.
HMM = unsupervised_HMM(train_x, n_states)
Example #23
0
    visualize_sparsities,
)
from Utility import Utility

# Print the transition matrix.

if __name__ == '__main__':
    train = False
    n_states = 10
    N_iters = 50
    text = open(os.path.join(os.getcwd(), '../data/shakespeare.txt')).read()
    obs, obs_map, stress_dic = parse_observations(text)
    #print(obs)
    # Train the HMM.
    if train:
        HMM = unsupervised_HMM(obs, n_states, N_iters)
        file = open('hmm_10.txt', 'wb')
        pickle.dump(HMM, file)
        file.close()
    else:
        file = open("hmm_10.txt", "rb")
        HMM = pickle.load(file)

    #######
    dic = open(os.path.join(os.getcwd(),
                            '../data/Syllable_dictionary.txt')).read()
    lines = [line.split() for line in dic.split('\n') if line.split()]

    syl_dic = {}

    for line in lines:
Example #24
0
    for i in range(VERSE_LENGTH[verse] / 2):
        group = random.randrange(num_groups)
        rhyme_pair = random.sample(rhymes[verse][group], 2)
        rhy.append(rhyme_pair[0])
        rhy.append(rhyme_pair[1])

# Seed words per verse type, taken from the rhyme list built above and
# interleaved so that the chosen rhyme pairs land on alternating lines.
seeds = {}
seeds[VERSES[0]] = [
    rhy[0], rhy[2], rhy[1], rhy[3], rhy[4], rhy[6], rhy[5], rhy[7]
]  # quatrain seeds
seeds[VERSES[1]] = [rhy[8], rhy[10], rhy[9], rhy[11]]  # volta seeds
seeds[VERSES[2]] = [rhy[12], rhy[13]]  # couplet seeds

for verse in VERSES:
    X_processed, X_conversion = dh.quantify_observations(X[verse])
    syllDict = nh.getDict(X_conversion)

    print(nh.numMiss)
    # Either train and store a model for this verse type or load a stored
    # one. Files are handled by context managers so they are always closed.
    if trainHMM:
        HMM = unsupervised_HMM(X_processed, nStates)
        with open(WRITE_FOLDER + 'HMM_' + verse + '.p', 'wb') as model_file:
            pickle.dump(HMM, model_file)
    else:
        # Only load model files you trust: unpickling can execute code.
        with open(READ_FOLDER + 'HMM_' + verse + '.p', 'rb') as model_file:
            HMM = pickle.load(model_file)

    # Generate one line per seed word; count % 2 is passed along as the
    # third argument of convert_to_sentence.
    count = 0
    for word in seeds[verse]:
        count += 1
        seed_num = X_conversion.index(word)
        emission = HMM.generate_new_emission(seed_num, X_conversion, syllDict)
        # print() with a single argument behaves the same on Python 2 and 3;
        # the bare print statement is a SyntaxError on Python 3.
        print(dh.convert_to_sentence(emission, X_conversion, count % 2))
from HMM import unsupervised_HMM
from HMM_utils import SyllableDict, RhymeDict, get_training_data
import sys


def print_line(line):
    """Print the words of `line` in reverse order, separated by spaces."""
    words_in_reading_order = list(reversed(line))
    print(' '.join(words_in_reading_order))


# The number of hidden states is required as the first command-line argument.
# NOTE(review): sys.exit() without an argument exits with status 0 even
# though this is a usage error - confirm whether a non-zero status is wanted.
if len(sys.argv) < 2:
    print('Enter number of states')
    sys.exit()

# Pre-processing; get training data and generate syllable and rhyming dictionaries
sd = SyllableDict()
X = get_training_data(sd)
rd = RhymeDict(X)

# Load in previous HMM
# 0 is passed as the third argument together with a file name derived from
# the state count (presumably: no further training, load from file - confirm).
fname = 'HMM/' + sys.argv[1] + '_states.txt'
hmm = unsupervised_HMM(X, int(sys.argv[1]), 0, fname)

# Generate a sonnet as 14 lines.
# Each emission is mapped to words and printed through print_line, which
# reverses the word order.
for i in range(14):
    print_line([
        sd.word_from_id(e) for e in hmm.generate_emission_without_structure(10)
    ])
Example #26
0
from our_utilities import loadsonnets, writeHMM, readHMM
from HMM import unsupervised_HMM
import sys

# Command line: <number of hidden states> <number of epochs>.
# Both arguments are required; a missing one raises IndexError here.
n_hidden = int(sys.argv[1])

n_epochs = int(sys.argv[2])

sonnets, word2syll, word2index, index2word = loadsonnets()

# This project's unsupervised_HMM takes the word -> index map as its second
# argument.
HMM = unsupervised_HMM(sonnets, word2index, n_hidden, n_epochs)

# filename=None is passed explicitly; the output location is decided inside
# writeHMM.
writeHMM(HMM, filename=None)
Example #27
0
    spenser_indexed_sonnets = [
        sonnet_to_indices(s, whitespace=False) for s in spenser_sonnets
    ]
    for i, idx in enumerate(spenser_indexed_sonnets):
        # Drop any words that we don't have syllable data for
        spenser_indexed_sonnets[i] = list(
            filter(lambda x: type(x) == type(int(0)), idx))

    sonnets = generate_sonnet_list('shakespeare.txt')
    states = list(syllables.keys())

    indexed_sonnets = [sonnet_to_indices(s) for s in sonnets]
    all_sonnets = indexed_sonnets + spenser_indexed_sonnets

    # Train unsupervised hmms on the three training sets
    hmm = unsupervised_HMM(indexed_sonnets, 10, 1000)
    print("Trained shakespeare")
    spenser_hmm = unsupervised_HMM(spenser_indexed_sonnets,
                                   10,
                                   1000,
                                   D=len(states))
    print("Trained spenser")
    all_hmm = unsupervised_HMM(all_sonnets, 10, 1000)
    print("Trained both")

    # Generate three emissions for each hmm and each type of poem and save to file
    for i in range(1, 4):
        print(
            hmm.generate_sonnet(states, syllables,
                                "sonnet_shakespeare" + str(i) + ".txt"))
        print()
Example #28
0
# text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()
# Read the corpus inside a context manager so the file handle is closed.
with open(os.path.join(os.getcwd(), 'data/allpoems.txt')) as corpus_file:
    text = corpus_file.read()
# visualization of whole data set
wordcloud = text_to_wordcloud(text, title='Shakespeare')
# TODO: extract words
# - keep hyphenated words hyphenated
# - some words could be tokenized as bigrams
# - separate punctuation from words, and store them separately
obs, obs_map = parse_observations(text)
syllables = cmudict.dict()
# Register each punctuation mark in the pronunciation dictionary with a
# single empty pronunciation.
for punct in [".", ",", ":", ";", "!", "?"]:
    syllables.update({punct:[[]]})

# UNSUPERVISED LEARNING
#Was 20
hmm8 = unsupervised_HMM(obs, 10, 100)

# visualizations of sparsity of A, O as well as
# visualizations of states as wordclouds
visualize_sparsities(hmm8, O_max_cols=50)
wordclouds = states_to_wordclouds(hmm8, obs_map)

#This part only works in Jupyter Notebook
anim = animate_emission(hmm8, obs_map, M=8)
HTML(anim.to_html5_video())

# POETRY GENERATION, PART 1: HMMs
# TODO: write poem generation using hmm.generate_emission()
# - (suggested on piazza) in generate_emission() function, before generating 
#   the next word, go through all possibilities and check for (1) whether 
#   there are still enough syllables left for it and (2) whether it starts
Example #29
0
# Fit the tokenizer's vocabulary on the word sequence.
Tokenizer.fit_on_texts(word_sequence)

# convert text to observation (list of list of tokens) using Tokenizer
obs = parse_poetry_2(text, Tokenizer)

# reverse the order so that prediction happens backwards (for Rhyme)
# (each line is reversed in place)
for line in obs:
    line.reverse()

# %% Generate

# Hidden-state counts to train with.
ks = [6, 10, 14, 18, 22, 26, 30]
# ks = [22, 26, 30]

for k in ks:

    display_title('k = ' + str(k))

    # learn line-wise backwards HMM
    HMM = unsupervised_HMM(obs, k, 100)
    # Only the O and A matrices are stored, as JSON, in
    # dataPath + 'OA_k<k>.json'.
    # NOTE(review): json.dump needs O and A to be JSON-serializable (e.g.
    # nested lists rather than arrays) - confirm.
    data = {}
    data['O'] = HMM.O
    data['A'] = HMM.A

    fname = 'OA_k' + str(k) + '.json'
    with open(dataPath + fname, 'w') as outfile:
        json.dump(data, outfile)

    # poem = poem_that_rhymes(HMM,Tokenizer,r2w_dict,w2s_dict)

    # print(poem)
Example #30
0
    index_sonnets = [[[w2i[word] for word in line if word in w2i]
                                 for line in sonnet]
                                 for sonnet in sonnet_list]
    
    # Generate the data as one long list of lines. We have to generate the
    # lines in reverse because it is easier to make the first word rhyme.
    X = []
    for sonnet in index_sonnets:
        for line in sonnet:
            X.append(list(reversed(line)))

    # Get the maximum word index is the number of words - 1.
    m = max([max(line) for line in X])

    # Train the Hidden Markov Model.
    hmm = unsupervised_HMM(X, 20, 1)

    ## Generate a sonnet using the HMM
    output = []
    last = []
    num_lines = 14
    rhyme_lines = [3, 4, 7, 8, 11, 12, 14]
    for i in range(1, num_lines + 1):
        if i in rhyme_lines:
            rhyme_bank = [w2i[word] for word in rhyme(i2w[last[i - 3]], w2i.keys())]
            lst = hmm.generate_emission(10, rhyme_bank)[0]
        else:
            lst = hmm.generate_emission(10)[0]
            
        output.append(' '.join(map(lambda idx: i2w[idx], reversed(lst))))
        last.append(lst[0])