Example no. 1
import numpy as np
from pickle import dump
import tensorflow as tf
import doc_methods

in_filename = "ss_char.txt"
raw_text = doc_methods.load_doc(in_filename)
lines = raw_text.split("\n")

# integer encode
chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
sequences = list()
for line in lines:
    encoded_seq = [mapping[char] for char in line]
    sequences.append(encoded_seq)
vocab_size = len(mapping)
print("Vocab size: %d" % vocab_size)

# separate into input (all characters but the last) and output (the last character)
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
# one-hot encode each input character and the output character
sequences = [tf.keras.utils.to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(sequences)
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# define model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))
model.summary()
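
Example no. 1 stops at the model definition and never uses the dump import at the top. A minimal sketch of how the training and saving step could continue, assuming categorical cross-entropy and a pickled mapping as that import suggests; the epoch count and output file names below are illustrative assumptions, not the original code:

# Sketch only: epochs and the file names "ss_char.h5" / "mapping.pkl" are assumptions.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, epochs=100, verbose=2)
# save the trained model and the char-to-index mapping for later generation
model.save("ss_char.h5")
dump(mapping, open("mapping.pkl", "wb"))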
Example no. 2
# Extension: use complete sentences + padding as input size
import doc_methods


doc = doc_methods.load_doc("shakespeare_input.txt")
# clean document and cut down size
tokens = doc_methods.clean_doc(doc[:int(len(doc)/4)])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))
# organize into overlapping sequences of tokens (50 input words + 1 output word)
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i - length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'ss_words.txt'
doc_methods.save_doc(sequences, out_filename)
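
The code that trains the word-level model and writes the "ss_words.h5" and "tokenizer.pkl" files loaded further down is not part of this excerpt. A minimal sketch of what that step could look like, assuming an embedding + LSTM architecture in the spirit of Example no. 1; the layer sizes, epochs, and batch size are illustrative assumptions:

import numpy as np
import tensorflow as tf
from pickle import dump

# Sketch only: fit a tokenizer on the word sequences prepared above, train a small
# model, and save both artifacts. Layer sizes, epochs, and batch size are assumptions.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sequences)
encoded = np.array(tokenizer.texts_to_sequences(sequences))
vocab_size = len(tokenizer.word_index) + 1

X, y = encoded[:, :-1], encoded[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 50))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, batch_size=128, epochs=100)

# save the trained model and the tokenizer for the generation step below
model.save("ss_words.h5")
dump(tokenizer, open("tokenizer.pkl", "wb"))
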
# Generation script: load the saved model and tokenizer and generate new text.
from random import randint
from pickle import load
import numpy as np
import tensorflow as tf
import doc_methods


def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # integer encode the text generated so far
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = tf.keras.preprocessing.sequence.pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the index of the next word (predict_classes is no longer available in tf.keras)
        yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)[0]
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += " " + out_word
        result.append(out_word)
    return " ".join(result)

# load cleaned text sequences
in_filename = 'ss_words.txt'
doc = doc_methods.load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
# load model
model = tf.keras.models.load_model("ss_words.h5")
tokenizer = load(open("tokenizer.pkl", "rb"))
# select a seed text
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + "\n")
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 10)
print(generated)