import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import random
from itertools import groupby

# ## Instantiate Embeddings
embeddings = Embeddings(300, 4, 1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# Shift the indexes by 1 so that index 0 is reserved for masking
word2index = {word: index + 1 for word, index in word2index.items()}
index2word = {index: word for word, index in word2index.items()}

# Prepend a zero vector (the mask row) to the word2vec weights
new_weights = np.zeros((1, word2vec_weights.shape[1]))
new_weights = np.append(new_weights, word2vec_weights, axis=0)

# ## Defining model
# Changes to the model to be done here
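# A minimal sketch of one possible model definition to go under the marker
# above, using the mask-padded `new_weights` matrix built earlier in this
# file. The layer sizes, output layer and loss here are assumptions, not
# values taken from this file.
model = Sequential()
model.add(Embedding(input_dim=new_weights.shape[0],
                    output_dim=new_weights.shape[1],
                    weights=[new_weights],
                    mask_zero=True))  # row 0 of new_weights is the zero mask vector
model.add(LSTM(1024, return_sequences=True))
model.add(LSTM(512))
model.add(Dense(new_weights.shape[1], activation='relu'))  # predicts a word vector
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])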
import random

import numpy as np
from intersect_embeddings import Embeddings

model_name = "lstm-1024-512-epochs-25-batchsize-128-acc-1"
word_embedding_dimension = 300
word_embedding_window_size = 4
batch_size = 256
epochs = 15
window_size = 5
accuracy_threshold = 1
activation = 'relu'
custom_accuracy = 0
loss_function = 'mse'

# ## Instantiate Embeddings
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2vec_weights = word2vec_model.wv.syn0
word2index, index2word = embeddings.get_vocabulary()
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# ### Generating training data
vocab_size = len(word2index)
print(vocab_size)

seq_in = []
seq_out = []

# Generating the dataset
for sentence in tokenized_indexed_sentences:
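    # The loop body below is a minimal sketch of the dataset generation,
    # assuming each sliding window of `window_size` word indexes is mapped to
    # the word vector of the word that follows it (consistent with the 'mse'
    # loss configured above); the inner loop and the array conversion after
    # it are assumptions.
    for i in range(len(sentence) - window_size):
        seq_in.append(sentence[i:i + window_size])
        seq_out.append(word2vec_weights[sentence[i + window_size]])

# Convert the accumulated windows to arrays for training (also part of the sketch)
seq_in = np.array(seq_in)
seq_out = np.array(seq_out)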
import os
import sys
import json

import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Dense, LSTM
from keras.preprocessing import sequence
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from intersect_embeddings import Embeddings

# The UPPERCASE hyperparameters referenced below (WORD_EMBEDDING_DIMENSION,
# WORD_EMBEDDING_WINDOW_SIZE, EPOCHS, BATCH_SIZE, ...) are expected to be
# defined at module level before this class is used.


class LSTMModel():

    def __init__(self):
        # Instantiate Embeddings
        self.embeddings = Embeddings(WORD_EMBEDDING_DIMENSION, WORD_EMBEDDING_WINDOW_SIZE, 1, 4)
        # Gets word2vec_model, word2index, index2word, word2vec_weights, tokenized_indexed_sentences
        self.word2vec_model = self.embeddings.get_intersected_model()
        word2index = self.embeddings.get_vocabulary()[0]
        word2vec_weights = self.word2vec_model.wv.syn0
        indexed_sentences = self.embeddings.get_indexed_sentences()
        # Shifting the indexes by 1 so as to reserve index 0 for masking
        self.word2index = {word: index + 1 for word, index in word2index.items()}
        self.index2word = {index: word for word, index in self.word2index.items()}
        self.vocab_size = len(word2index) + 1  # +1 for the reserved mask index
        indexed_sentences = [np.array(sentence) + 1 for sentence in indexed_sentences if len(sentence) > 0]
        # Creating a zero vector for masking and prepending it to word2vec_weights
        mask_vector = np.zeros((1, word2vec_weights.shape[1]))
        self.word2vec_weights = np.append(mask_vector, word2vec_weights, axis=0)
        # Padding sentences to the length of the longest sentence
        max_sentence_len = max(len(sentence) for sentence in indexed_sentences)
        self.indexed_sentences = sequence.pad_sequences(indexed_sentences, maxlen=max_sentence_len, padding='post')

    def generate_sequences(self):
        # For each padded sentence the target is the input shifted left by one
        # word (the last word repeats as its own target), one-hot encoded per timestep.
        for seq_in in self.indexed_sentences:
            seq_out = np.append(seq_in[1:], seq_in[-1])
            one_hot_encoded_y = [np_utils.to_categorical(index, num_classes=self.vocab_size) for index in seq_out]
            yield (seq_in, one_hot_encoded_y)

    def train_model(self):
        # Defining model
        model = Sequential()
        model.add(Embedding(input_dim=self.word2vec_weights.shape[0],
                            output_dim=self.word2vec_weights.shape[1],
                            weights=[self.word2vec_weights],
                            mask_zero=True))
        model.add(LSTM(1024, return_sequences=True))
        model.add(LSTM(1024, return_sequences=True))
        # Softmax over the vocabulary for per-timestep word classification
        model.add(Dense(self.vocab_size, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        model_weights_path = "../weights/lstm-2-1024-1024-batchsize-256-epochs-30-classification"
        if not os.path.exists(model_weights_path):
            os.makedirs(model_weights_path)
        checkpoint_path = model_weights_path + '/weights.{epoch:02d}.hdf5'
        checkpoint = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=False, mode='max')
        print("Model Summary")
        model.summary()
        # Materialise the training pairs produced by generate_sequences()
        seq_in, seq_out = zip(*self.generate_sequences())
        model.fit(np.array(seq_in), np.array(seq_out), epochs=EPOCHS, verbose=1,
                  batch_size=BATCH_SIZE, callbacks=[checkpoint])
        return model

    def predict(self):
        model = self.train_model()
        sentence_test = "In which regions in particular did"
        indexed_sentences = self.embeddings.get_indexed_query(sentence_test)
        # Shift the query indexes by 1, as was done for the training data
        sent = np.array(indexed_sentences) + 1
        pattern = list(sent)
        print(' '.join(self.index2word[index] for index in pattern))
        for i in range(10):
            prediction = model.predict(np.array([pattern]))
            # The model outputs a distribution over the vocabulary for every
            # timestep; take the argmax of the last timestep as the next word,
            # skipping the reserved mask index 0.
            pred_index = np.argmax(prediction[0][-1][1:]) + 1
            pred_word = self.index2word[pred_index]
            sys.stdout.write(pred_word + " ")
            pattern.append(self.word2index[pred_word])

    def accuracy(self, model, seq_in, seq_out):
        # Nearest-word accuracy: a prediction counts as correct when the word
        # closest to the predicted vector matches the word closest to the true vector.
        count = 0
        correct = 0
        for sub_sample_in, sub_sample_out in zip(seq_in, seq_out):
            ypred = model.predict_on_batch(np.expand_dims(sub_sample_in, axis=0))[0]
            ytrue = sub_sample_out
            pred_word = self.word2vec_model.similar_by_vector(ypred)[0][0]
            true_word = self.word2vec_model.similar_by_vector(ytrue)[0][0]
            similarity = self.word2vec_model.similarity(pred_word, true_word)
            if similarity == 1:
                correct += 1
            count += 1
        print("Accuracy {0}".format(correct / count))

    def model_summary(self, model, model_fit_summary):
        # model_fit_summary is the History object returned by model.fit()
        model_results = model_fit_summary.history
        model_results.update(model_fit_summary.params)
        model_results["word_embedding_dimension"] = WORD_EMBEDDING_DIMENSION
        model_results["word_embedding_window_size"] = WORD_EMBEDDING_WINDOW_SIZE
        model_results["window_size"] = WINDOW_SIZE
        model_results["batch_size"] = BATCH_SIZE
        model_results["epochs"] = EPOCHS
        model_results["model_name"] = MODEL_NAME
        model_results["accuracy_threshold"] = ACCURACY_THRESHOLD
        model_results["activation"] = ACTIVATION
        model_results["custom_accuracy"] = CUSTOM_ACCURACY
        model_results["loss_function"] = LOSS_FUNCTION
        model_results["layers"] = []
        model_results["dropouts"] = []
        for layer in model.layers:
            if hasattr(layer, "units"):
                layer_summary = {}
                layer_summary["units"] = layer.get_config()["units"]
                layer_summary["name"] = layer.name
                model_results["layers"].append(layer_summary)
            if hasattr(layer, "rate"):
                dropout_summary = {}
                dropout_summary["rate"] = layer.get_config()["rate"]
                model_results["dropouts"].append(dropout_summary)
        text_file_path = "../weights/{0}/model_results.json".format(MODEL_NAME)
        with open(text_file_path, "w") as f:
            json.dump(model_results, f)
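# ## Example driver (sketch)
# A minimal sketch of one way to drive the class above, assuming the
# intersect_embeddings module and the UPPERCASE hyperparameters are available.
# The call sequence here is an assumption; predict() trains the model itself
# via train_model(), so it exercises the whole pipeline.
if __name__ == "__main__":
    lstm_model = LSTMModel()
    lstm_model.predict()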