import numpy as np
from pickle import dump
import tensorflow as tf
import doc_methods

# load the prepared character-level sequences
in_filename = "ss_char.txt"
raw_text = doc_methods.load_doc(in_filename)
lines = raw_text.split("\n")

# integer encode: map each distinct character to an index
chars = sorted(list(set(raw_text)))
mapping = dict((c, i) for i, c in enumerate(chars))
sequences = list()
for line in lines:
    encoded_seq = [mapping[char] for char in line]
    sequences.append(encoded_seq)
vocab_size = len(mapping)
print("Vocab size: %d" % vocab_size)

# separate I/O: the last character of each sequence is the target
sequences = np.array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]

# one-hot encode inputs and targets
sequences = [tf.keras.utils.to_categorical(x, num_classes=vocab_size) for x in X]
X = np.array(sequences)
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# define model: a single LSTM layer followed by a softmax over the character vocabulary
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(75, input_shape=(X.shape[1], X.shape[2])))
model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))
print(model.summary())
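# The script above defines the character model but stops before compiling,
# fitting, or saving it, and the `dump` import is never used. A minimal
# sketch of how training might continue is below; the "ss_char.h5" and
# "mapping.pkl" filenames and the 100-epoch setting are assumptions, not
# taken from the original.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, epochs=100, verbose=2)

# persist the model and the char-to-index mapping for later generation
model.save("ss_char.h5")
dump(mapping, open("mapping.pkl", "wb"))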
# Extension: use complete sentences + padding as input size
import doc_methods

doc = doc_methods.load_doc("shakespeare_input.txt")

# clean document and cut down size
tokens = doc_methods.clean_doc(doc[:int(len(doc) / 4)])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

# organize into sequences of tokens (50 input words + 1 output word)
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    # select sequence of tokens
    seq = tokens[i - length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

# save sequences to file
out_filename = 'ss_words.txt'
doc_methods.save_doc(sequences, out_filename)
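# The generation script below loads a trained model from "ss_words.h5" and a
# fitted tokenizer from "tokenizer.pkl", but the word-level training step
# itself is not shown. The following is a minimal sketch under those
# assumptions; the embedding size, LSTM width, batch size, and epoch count
# are illustrative choices, not taken from the original.
import numpy as np
import tensorflow as tf
from pickle import dump
import doc_methods

lines = doc_methods.load_doc("ss_words.txt").split("\n")

# integer encode the word sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = np.array(tokenizer.texts_to_sequences(lines))
vocab_size = len(tokenizer.word_index) + 1

# separate into input (50 words) and one-hot encoded output word
X, y = sequences[:, :-1], sequences[:, -1]
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

# embedding + LSTM language model over words
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 50))
model.add(tf.keras.layers.LSTM(100))
model.add(tf.keras.layers.Dense(vocab_size, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, batch_size=128, epochs=100, verbose=2)

# persist the artifacts the generation script expects
model.save("ss_words.h5")
dump(tokenizer, open("tokenizer.pkl", "wb"))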
from random import randint
from pickle import load
import numpy as np
import tensorflow as tf
import doc_methods

# generate a fixed number of words from a language model given a seed text
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
        # integer encode the text generated so far
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = tf.keras.preprocessing.sequence.pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the index of the next word
        # (predict_classes was removed from recent tf.keras; argmax over predict is equivalent)
        yhat = np.argmax(model.predict(encoded, verbose=0), axis=-1)[0]
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += " " + out_word
        result.append(out_word)
    return " ".join(result)

# load cleaned text sequences
in_filename = 'ss_words.txt'
doc = doc_methods.load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

# load model and tokenizer
model = tf.keras.models.load_model("ss_words.h5")
tokenizer = load(open("tokenizer.pkl", "rb"))

# select a seed text
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + "\n")

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 10)
print(generated)
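# The doc_methods helpers (load_doc, save_doc, clean_doc) are used throughout
# but never shown. Below is a minimal sketch of what they might look like,
# assuming plain-text files and simple whitespace/punctuation cleaning; the
# actual cleaning rules in the original may differ.
import string

def load_doc(filename):
    # read an entire text file into memory
    with open(filename, "r") as f:
        return f.read()

def save_doc(lines, filename):
    # write one sequence per line
    with open(filename, "w") as f:
        f.write("\n".join(lines))

def clean_doc(doc):
    # split on whitespace, strip punctuation, keep lowercase alphabetic tokens
    tokens = doc.split()
    table = str.maketrans("", "", string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    return [w.lower() for w in tokens if w.isalpha()]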