Example #1
    def __init__(self):

        # Instantiate Embeddings
        self.embeddings = Embeddings(WORD_EMBEDDING_DIMENSION, WORD_EMBEDDING_WINDOW_SIZE, 1, 4)

        # Gets word2vec_model, word2index, index2word, word2vec_weights, tokenized_indexed_sentences
        self.word2vec_model = self.embeddings.get_intersected_model()
        word2index = self.embeddings.get_vocabulary()[0]
        word2vec_weights = self.word2vec_model.wv.syn0
        indexed_sentences = self.embeddings.get_indexed_sentences()

        # Shifting the indexes by 1 so as to reserve space for Masking
        self.word2index = {word:index + 1 for word, index in word2index.items()}
        self.index2word = {index:word for word, index in self.word2index.items()}
        self.vocab_size = len(word2index)
        indexed_sentences = [np.array(sentence) + 1
                             for sentence in indexed_sentences
                             if len(sentence) > 0]

        # Creating a zero vector for masking and then appending with word2vec_weights
        mask_vector = np.zeros((1, word2vec_weights.shape[1]))
        self.word2vec_weights = np.append(mask_vector, word2vec_weights, axis=0)

        # Padding Sentences
        max_sentence_len = max(len(sentence) for sentence in indexed_sentences)
        self.indexed_sentences = sequence.pad_sequences(indexed_sentences,
                                                        maxlen=max_sentence_len,
                                                        padding='post')
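
# A minimal, self-contained sketch (toy data, not from the original class) showing the
# effect of the index shift plus post-padding above: index 0 is reserved for masking,
# so padded positions can later be skipped by a masking-aware layer.
import numpy as np
from keras.preprocessing import sequence

toy_indexed_sentences = [np.array([3, 7, 2]) + 1, np.array([5]) + 1]
toy_padded = sequence.pad_sequences(toy_indexed_sentences, maxlen=3, padding='post')
# toy_padded -> [[4, 8, 3],
#                [6, 0, 0]]  # trailing zeros are the reserved mask index
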
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.preprocessing import sequence
from intersect_embeddings import Embeddings
from keras.callbacks import ModelCheckpoint
from nltk.tokenize import word_tokenize
import numpy as np
import random
from itertools import groupby

# ## Instantiate Embeddings
embeddings = Embeddings(300, 4, 1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2index, index2word = embeddings.get_vocabulary()
word2vec_weights = word2vec_model.wv.syn0
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# Shifting the indexes by 1 so as to reserve index 0 for masking
word2index = {word: index + 1 for word, index in word2index.items()}
index2word = {index: word for word, index in word2index.items()}

# Prepending a zero vector (the mask at index 0) to the word2vec weights
new_weights = np.zeros((1, word2vec_weights.shape[1]))
new_weights = np.append(new_weights, word2vec_weights, axis=0)
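
# A hedged sketch (an assumption; the model definition is not shown in this snippet) of
# how `new_weights` could back a Keras Embedding layer: row 0 is the zero mask vector,
# so mask_zero=True lets downstream recurrent layers skip padded timesteps.
embedding_layer = Embedding(input_dim=new_weights.shape[0],
                            output_dim=new_weights.shape[1],
                            weights=[new_weights],
                            mask_zero=True,
                            trainable=False)  # trainable=False is an illustrative choice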

# ## Defining model
# Changes to the model to be done here

model_name = "lstm-1024-512-epochs-25-batchsize-128-acc-1"

word_embedding_dimension = 300
word_embedding_window_size = 4
batch_size = 256
epochs = 15
window_size = 5
accuracy_threshold = 1
activation = 'relu'
custom_accuracy = 0
loss_function = 'mse'
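
# A speculative sketch only: the actual network is defined later and is not part of this
# snippet. Assuming the 1024/512 LSTM sizes implied by model_name, the hyperparameters
# above could feed a model along these lines (example_model is a hypothetical name).
example_model = Sequential()
example_model.add(Embedding(input_dim=new_weights.shape[0],
                            output_dim=word_embedding_dimension,
                            weights=[new_weights],
                            mask_zero=True))
example_model.add(LSTM(1024, return_sequences=True))
example_model.add(LSTM(512))
example_model.add(Dropout(0.2))
example_model.add(Dense(word_embedding_dimension, activation=activation))
example_model.compile(loss=loss_function, optimizer='adam')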

# ## Instantiate Embeddings
embeddings = Embeddings(word_embedding_dimension, word_embedding_window_size,
                        1, 4)

# ### Getting data from preprocessing
word2vec_model = embeddings.get_intersected_model()
word2vec_weights = word2vec_model.wv.syn0
word2index, index2word = embeddings.get_vocabulary()
tokenized_indexed_sentences = embeddings.get_indexed_sentences()

# ### generating training data
vocab_size = len(word2index)
print(vocab_size)

seq_in = []
seq_out = []
# generating dataset
for sentence in tokenized_indexed_sentences: