def __init__(self):
    # Instantiate Embeddings
    self.embeddings = Embeddings(WORD_EMBEDDING_DIMENSION, WORD_EMBEDDING_WINDOW_SIZE, 1, 4)

    # Get word2vec_model, word2index, word2vec_weights and the indexed sentences
    self.word2vec_model = self.embeddings.get_intersected_model()
    word2index = self.embeddings.get_vocabulary()[0]
    word2vec_weights = self.word2vec_model.wv.syn0
    indexed_sentences = self.embeddings.get_indexed_sentences()

    # Shift the indexes by 1 so as to reserve index 0 for masking
    self.word2index = {word: index + 1 for word, index in word2index.items()}
    self.index2word = {index: word for word, index in self.word2index.items()}
    self.vocab_size = len(word2index)  # does not count the reserved mask index
    indexed_sentences = [np.array(sentence) + 1 for sentence in indexed_sentences if len(sentence) > 0]

    # Create a zero vector for masking and prepend it to word2vec_weights
    mask_vector = np.zeros((1, word2vec_weights.shape[1]))
    self.word2vec_weights = np.append(mask_vector, word2vec_weights, axis=0)

    # Pad sentences to the length of the longest sentence
    max_sentence_len = max(len(sentence) for sentence in indexed_sentences)
    self.indexed_sentences = sequence.pad_sequences(indexed_sentences, maxlen=max_sentence_len, padding='post')
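# A minimal, self-contained sketch of the masking trick used in the constructor above.
# The toy vocabulary, weights and sentences below are hypothetical (not produced by
# Embeddings): indexes are shifted by 1 so index 0 is free for padding/masking, and a
# zero row is prepended to the weights so that index 0 looks up an all-zero vector.
import numpy as np
from keras.preprocessing import sequence

toy_word2index = {"the": 0, "cat": 1, "sat": 2}   # hypothetical vocabulary
toy_weights = np.random.rand(3, 4)                # vocab_size x embedding_dim
toy_sentences = [[0, 1, 2], [1, 2]]

toy_word2index = {w: i + 1 for w, i in toy_word2index.items()}  # shift by 1
toy_sentences = [np.array(s) + 1 for s in toy_sentences]

mask_vector = np.zeros((1, toy_weights.shape[1]))               # row for index 0
toy_weights = np.append(mask_vector, toy_weights, axis=0)

max_len = max(len(s) for s in toy_sentences)
padded = sequence.pad_sequences(toy_sentences, maxlen=max_len, padding='post')
print(padded)             # [[1 2 3] [2 3 0]] -- the trailing 0 is the masked position
print(toy_weights.shape)  # (4, 4): one extra zero row for the mask index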
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import random
from itertools import groupby

# ## Instantiate Embeddings
embeddings = Embeddings(300, 4, 1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# Shift the indexes by 1 so as to reserve index 0 for masking
word2index = {word: index + 1 for word, index in word2index.items()}
index2word = {index: word for word, index in word2index.items()}

# Prepend a zero vector (the mask embedding) to the word2vec weights
new_weights = np.zeros((1, word2vec_weights.shape[1]))
new_weights = np.append(new_weights, word2vec_weights, axis=0)

# ## Defining model
# Changes to the model to be done here
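# A minimal sketch of one possible model definition at this point. The 1024/512 LSTM
# sizes are taken from the model_name used below, and the output layer regressing a
# word embedding with an 'mse' loss and 'relu' activation is an assumption based on
# the configuration that follows, not a confirmed design.
model = Sequential()
model.add(Embedding(input_dim=new_weights.shape[0],   # vocab + 1 mask row
                    output_dim=new_weights.shape[1],
                    weights=[new_weights],
                    mask_zero=True))
model.add(LSTM(1024, return_sequences=True))
model.add(LSTM(512))
model.add(Dropout(0.2))
model.add(Dense(new_weights.shape[1], activation='relu'))  # predict the next word's embedding
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
model.summary()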
import random
from intersect_embeddings import Embeddings

model_name = "lstm-1024-512-epochs-25-batchsize-128-acc-1"
word_embedding_dimension = 300
word_embedding_window_size = 4
batch_size = 256
epochs = 15
window_size = 5
accuracy_threshold = 1
activation = 'relu'
custom_accuracy = 0
loss_function = 'mse'

# ## Instantiate Embeddings
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size, 1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2vec_weights = word2vec_model.wv.syn0
word2index, index2word = embeddings.get_vocabulary()
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# ### Generating training data
vocab_size = len(word2index)
print(vocab_size)

seq_in = []
seq_out = []

# Generating dataset
for sentence in tokenized_indexed_sentences: