# Shared imports for the snippets below; create_train_dev_set and
# word_embed_meta_data are project helpers assumed to be importable.
import os
import time

import keras
from keras import layers
from keras.backend import expand_dims
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.layers import (LSTM, BatchNormalization, Bidirectional, Dense,
                          Dropout, Embedding, Input, Lambda, concatenate)
from keras.models import Model, load_model


def update_model(self, saved_model_path, new_sentences_pair, is_similar,
                 embedding_meta_data):
    """
    Update an already trained Siamese model with new sentence pairs.

    Steps involved:
        1. Pass each sentence from new_sentences_pair to the bidirectional
           LSTM encoder.
        2. Merge the vectors from the LSTM encoders and pass them to a
           dense layer.
        3. Pass the dense layer vectors to a sigmoid output layer.
        4. Use cross-entropy loss to train the weights.

    Args:
        saved_model_path (str): path of the already trained Siamese model
        new_sentences_pair (list): list of tuples of new sentence pairs
        is_similar (list): target value, 1 if the sentences in a pair are
            similar, otherwise 0
        embedding_meta_data (dict): dict containing the tokenizer and the
            word embedding matrix

    Returns:
        new_model_path (str): path of the updated best model
    """
    tokenizer = embedding_meta_data['tokenizer']
    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, new_sentences_pair, is_similar,
            self.max_sequence_length, self.validation_split_ratio)

    model = load_model(saved_model_path)
    model_file_name = saved_model_path.split('/')[-1]
    # Build a sibling checkpoint directory named after the current timestamp;
    # the path components from split() must be joined back into a string.
    new_model_checkpoint_path = '/'.join(saved_model_path.split('/')[:-2]) \
        + '/' + str(int(time.time())) + '/'
    new_model_path = new_model_checkpoint_path + model_file_name
    if not os.path.exists(new_model_checkpoint_path):
        os.makedirs(new_model_checkpoint_path)

    model_checkpoint = ModelCheckpoint(new_model_checkpoint_path + model_file_name,
                                       save_best_only=True,
                                       save_weights_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    tensorboard = TensorBoard(log_dir=new_model_checkpoint_path
                              + "logs/{}".format(time.time()))

    model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
              validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
              epochs=50, batch_size=3, shuffle=True,
              callbacks=[early_stopping, model_checkpoint, tensorboard])

    return new_model_path
def update_model(self, saved_model_path, new_sentences_pair, is_similar,
                 embedding_meta_data):
    tokenizer = embedding_meta_data['tokenizer']
    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, new_sentences_pair, is_similar,
            self.max_sequence_length, self.validation_split_ratio)

    model = load_model(saved_model_path)
    model_file_name = saved_model_path.split('/')[-1]
    # Sibling checkpoint directory named after the current timestamp; join
    # the split() components back into a string before appending.
    new_model_checkpoint_path = '/'.join(saved_model_path.split('/')[:-2]) \
        + '/' + str(int(time.time())) + '/'
    new_model_path = new_model_checkpoint_path + model_file_name
    if not os.path.exists(new_model_checkpoint_path):
        os.makedirs(new_model_checkpoint_path)

    model_checkpoint = ModelCheckpoint(new_model_checkpoint_path + model_file_name,
                                       save_best_only=True,
                                       save_weights_only=False)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    tensorboard = TensorBoard(log_dir="logs/{}".format(time.time()))

    model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
              validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
              epochs=50, batch_size=8, shuffle=True,
              callbacks=[early_stopping, model_checkpoint, tensorboard])

    return new_model_path
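# A minimal usage sketch for the update_model variants above: fine-tune an
# already saved Siamese model on freshly labelled pairs. The checkpoint path
# and the `siamese` instance are hypothetical placeholders; embedding_meta_data
# must contain the tokenizer used when the model was first trained.
new_pairs = [("How do I learn Python?", "How can I learn Python?"),
             ("What is AI?", "Where is Paris?")]
new_labels = [1, 0]
updated_path = siamese.update_model('./checkpoints/1546210000/lstm_50_50_0.17_0.25.h5',
                                    new_pairs, new_labels, embedding_meta_data)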
def train_model(self, sentences_pair, is_similar, embedding_meta_data,
                model_save_directory='./'):
    """
    Train a Siamese network to find the similarity between the sentences
    in `sentences_pair`.

    Steps involved:
        1. Pass each sentence from sentences_pair to the bidirectional
           LSTM encoder.
        2. Merge the vectors from the LSTM encoders and pass them to a
           dense layer.
        3. Pass the dense layer vectors to a sigmoid output layer.
        4. Use cross-entropy loss to train the weights.

    Args:
        sentences_pair (list): list of tuples of sentence pairs
        is_similar (list): target value, 1 if the sentences in a pair are
            similar, otherwise 0
        embedding_meta_data (dict): dict containing the tokenizer and the
            word embedding matrix
        model_save_directory (str): working directory where models are saved

    Returns:
        bst_model_path (str): path of the best model
    """
    tokenizer, embedding_matrix = (embedding_meta_data['tokenizer'],
                                   embedding_meta_data['embedding_matrix'])

    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar,
            self.max_sequence_length, self.validation_split_ratio)

    if train_data_x1 is None:
        print("++++ !! Failure: Unable to train model ++++")
        return None

    nb_words = len(tokenizer.word_index) + 1

    # Creating word embedding layer
    embedding_layer = Embedding(nb_words, self.embedding_dim,
                                weights=[embedding_matrix],
                                input_length=self.max_sequence_length,
                                trainable=False)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(LSTM(self.number_lstm_units,
                                    dropout=self.rate_drop_lstm,
                                    recurrent_dropout=self.rate_drop_lstm))

    # Creating LSTM Encoder layer for First Sentence
    sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # Creating LSTM Encoder layer for Second Sentence
    sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    # Creating leaks input
    leaks_input = Input(shape=(leaks_train.shape[1],))
    leaks_dense = Dense(int(self.number_dense_units / 2),
                        activation=self.activation_function)(leaks_input)

    # Merging the two LSTM encoder vectors and the leak features, then
    # passing them to a dense layer with dropout and batch normalisation
    merged = concatenate([x1, x2, leaks_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    merged = Dense(self.number_dense_units,
                   activation=self.activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['acc'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units,
                                      self.number_dense_units,
                                      self.rate_drop_lstm,
                                      self.rate_drop_dense)

    checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=False)
    tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

    model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
              validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
              epochs=200, batch_size=64, shuffle=True,
              callbacks=[early_stopping, model_checkpoint, tensorboard])

    return bst_model_path
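# A hedged sketch of the helper create_train_dev_set used throughout these
# snippets; its real implementation is not shown, so the exact "leak" features
# are an assumption. A common choice is simple per-pair statistics (sequence
# lengths and shared-token counts) alongside padded word-index sequences and
# a shuffled train/dev split.
import numpy as np
from keras.preprocessing.sequence import pad_sequences

def create_train_dev_set(tokenizer, sentences_pair, is_similar,
                         max_sequence_length, validation_split_ratio):
    seq1 = tokenizer.texts_to_sequences([pair[0] for pair in sentences_pair])
    seq2 = tokenizer.texts_to_sequences([pair[1] for pair in sentences_pair])
    # Leak features: length of each sequence plus the shared-token count.
    leaks = np.array([[len(a), len(b), len(set(a) & set(b))]
                      for a, b in zip(seq1, seq2)])
    x1 = pad_sequences(seq1, maxlen=max_sequence_length)
    x2 = pad_sequences(seq2, maxlen=max_sequence_length)
    labels = np.array(is_similar)

    idx = np.random.permutation(len(labels))
    split = int(len(labels) * (1 - validation_split_ratio))
    train, val = idx[:split], idx[split:]
    return (x1[train], x2[train], labels[train], leaks[train],
            x1[val], x2[val], labels[val], leaks[val])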
def Execute_Model(self):
    EMBEDDING_DIM = 50
    MAX_SEQUENCE_LENGTH = 10
    RATE_DROP_LSTM = 0.17
    RATE_DROP_DENSE = 0.25
    NUMBER_LSTM = 50
    NUMBER_DENSE_UNITS = 50
    ACTIVATION_FUNCTION = 'relu'
    VALIDATION_SPLIT = 0.1

    sentences1 = list(self.df['question1'].astype(str))
    sentences2 = list(self.df['question2'].astype(str))
    is_similar = list(self.df['is_duplicate'])

    tokenizer, embedding_matrix = word_embed_meta_data(sentences1 + sentences2,
                                                       EMBEDDING_DIM)
    embedding_meta_data = {
        'tokenizer': tokenizer,
        'embedding_matrix': embedding_matrix
    }
    sentences_pair = [(x1, x2) for x1, x2 in zip(sentences1, sentences2)]

    nb_words = len(tokenizer.word_index) + 1
    embedding_layer = layers.Embedding(nb_words, EMBEDDING_DIM,
                                       weights=[embedding_matrix],
                                       input_length=MAX_SEQUENCE_LENGTH,
                                       trainable=False)
    lstm_layer = layers.Bidirectional(
        layers.LSTM(NUMBER_LSTM, dropout=RATE_DROP_LSTM,
                    recurrent_dropout=RATE_DROP_LSTM))

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    left_output = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    right_output = lstm_layer(embedded_sequences_2)

    merged = layers.concatenate([left_output, right_output], axis=-1)
    merged = BatchNormalization()(merged)
    merged = layers.Dropout(0.1)(merged)
    merged = layers.Dense(128, activation='relu')(merged)
    merged = BatchNormalization()(merged)
    merged = layers.Dropout(0.1)(merged)
    predictions = layers.Dense(1, activation='sigmoid')(merged)

    model = Model([sequence_1_input, sequence_2_input], predictions)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['acc'])
    model.summary()

    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar,
            MAX_SEQUENCE_LENGTH, VALIDATION_SPLIT)

    # Raw string so the backslashes in the Windows path are not treated as
    # escape sequences.
    callbacks = [
        keras.callbacks.TensorBoard(
            log_dir=r'E:\workdirectory\Code Name Val Halen\DS Sup\DL\Chapter 15\logs',
            histogram_freq=1)
    ]
    # This variant ignores the leak features and feeds only the two
    # sentence inputs.
    self.history = model.fit([train_data_x1, train_data_x2], train_labels,
                             validation_data=([val_data_x1, val_data_x2],
                                              val_labels),
                             epochs=200, batch_size=64, shuffle=True,
                             callbacks=callbacks)
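# A hedged sketch of the helper word_embed_meta_data used by Execute_Model
# above; its real implementation is not shown. This stand-in fits a Keras
# Tokenizer and fills the embedding matrix row by row; the random vectors are
# a placeholder for a pre-trained GloVe/word2vec lookup.
import numpy as np
from keras.preprocessing.text import Tokenizer

def word_embed_meta_data(documents, embedding_dim):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(documents)
    nb_words = len(tokenizer.word_index) + 1
    # Row i holds the vector for the word with index i; row 0 stays zero
    # for the padding token.
    embedding_matrix = np.zeros((nb_words, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # Placeholder: substitute the pre-trained vector for `word` here.
        embedding_matrix[i] = np.random.normal(scale=0.1, size=embedding_dim)
    return tokenizer, embedding_matrix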
def train_model(self, sentences_pair, is_similar, embedding_meta_data,
                model_save_directory='./'):
    tokenizer, embedding_matrix = (embedding_meta_data['tokenizer'],
                                   embedding_meta_data['embedding_matrix'])

    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar,
            self.max_sequence_length, self.validation_split_ratio)

    nb_words = len(tokenizer.word_index) + 1
    embedding_layer = Embedding(nb_words, self.embedding_dim,
                                weights=[embedding_matrix],
                                input_length=self.max_sequence_length,
                                trainable=False)
    lstm_layer = Bidirectional(LSTM(self.number_lstm_units,
                                    dropout=self.rate_drop_lstm))

    sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    leaks_input = Input(shape=(leaks_train.shape[1],))
    leaks_dense = Dense(int(self.number_dense_units),
                        activation=self.activation_function)(leaks_input)

    merged = concatenate([x1, x2, leaks_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    merged = Dense(self.number_dense_units,
                   activation=self.activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                  outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['acc'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    STAMP = 'lstm_new'
    checkpoint_dir = model_save_directory + 'stored_model/' + str(int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=False)

    history = model.fit([train_data_x1, train_data_x2, leaks_train],
                        train_labels,
                        validation_data=([val_data_x1, val_data_x2, leaks_val],
                                         val_labels),
                        epochs=40, batch_size=64, shuffle=True,
                        callbacks=[early_stopping, model_checkpoint])

    return bst_model_path
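# Minimal usage sketch for the train_model variants above. SiameseBiLSTM is a
# hypothetical wrapper class holding the hyperparameters referenced via self
# (max_sequence_length, number_lstm_units, rate_drop_lstm, and so on); the
# sentences shown are toy data.
sentences1 = ["How do I learn Python?", "What is machine learning?"]
sentences2 = ["How can I learn Python?", "Where is Paris?"]
is_similar = [1, 0]

tokenizer, embedding_matrix = word_embed_meta_data(sentences1 + sentences2, 50)
embedding_meta_data = {'tokenizer': tokenizer,
                       'embedding_matrix': embedding_matrix}

siamese = SiameseBiLSTM(embedding_dim=50, max_sequence_length=10,
                        number_lstm_units=50, number_dense_units=50,
                        rate_drop_lstm=0.17, rate_drop_dense=0.25,
                        activation_function='relu', validation_split_ratio=0.1)
best_model_path = siamese.train_model(list(zip(sentences1, sentences2)),
                                      is_similar, embedding_meta_data,
                                      model_save_directory='./')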
def train_model(self, sentences_pair, is_similar, embedding_meta_data,
                model_save_directory='./'):
    tokenizer, embedding_matrix = (embedding_meta_data['tokenizer'],
                                   embedding_meta_data['embedding_matrix'])

    train_data_x1, train_data_x2, train_labels, leaks_train, \
        val_data_x1, val_data_x2, val_labels, leaks_val = create_train_dev_set(
            tokenizer, sentences_pair, is_similar,
            self.max_sequence_length, self.validation_split_ratio)

    if train_data_x1 is None:
        return None

    nb_words = len(tokenizer.word_index) + 1

    # Creating word embedding layer
    embedding_layer = Embedding(nb_words, self.embedding_dim,
                                weights=[embedding_matrix],
                                input_length=self.max_sequence_length,
                                trainable=False)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(LSTM(self.number_lstm_units,
                                    dropout=self.rate_drop_lstm,
                                    recurrent_dropout=self.rate_drop_lstm))

    # Creating LSTM Encoder layer for First Sentence
    sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # Creating LSTM Encoder layer for Second Sentence
    sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    x2 = lstm_layer(embedded_sequences_2)

    # Creating leaks input
    leaks_input = Input(shape=(leaks_train.shape[1],))
    leaks_dense = Dense(int(self.number_dense_units / 2),
                        activation=self.activation_function)(leaks_input)

    # Merging the two LSTM encoder vectors and the leak features, then
    # passing them to a dense layer with dropout and batch normalisation
    merged = concatenate([x1, x2, leaks_dense])
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    merged = Dense(self.number_dense_units,
                   activation=self.activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    # This variant uses a four-class softmax head instead of the sigmoid
    # output of the other variants.
    preds = Dense(4, activation='softmax')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input],
                  outputs=preds)
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['acc'])

    # Note: early_stopping is defined here but not included in the
    # callbacks list passed to fit() below.
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units,
                                      self.number_dense_units,
                                      self.rate_drop_lstm,
                                      self.rate_drop_dense)

    checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=False)
    tensorboard = TensorBoard(log_dir="logs/{}".format(time.time()))

    model.fit([train_data_x1, train_data_x2, leaks_train], train_labels,
              validation_data=([val_data_x1, val_data_x2, leaks_val], val_labels),
              epochs=200, batch_size=128, shuffle=True,
              callbacks=[model_checkpoint, tensorboard])

    return bst_model_path
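# The softmax head above emits four classes, so categorical_crossentropy
# requires one-hot targets of shape (n_samples, 4). A sketch of the
# conversion, assuming the labels hold integer class ids rather than 0/1
# flags:
from keras.utils import to_categorical
train_labels = to_categorical(train_labels, num_classes=4)
val_labels = to_categorical(val_labels, num_classes=4)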
def train_model(self, sentences_pair, is_similar, model_save_directory='./'):
    train_data_1, train_data_2, labels_train, val_data_1, val_data_2, labels_val = \
        create_train_dev_set(sentences_pair, is_similar,
                             self.validation_split_ratio)

    if train_data_1 is None:
        print("++++ !! Failure: Unable to train model ++++")
        return None

    # embedding_layer = Embedding(121, self.embedding_dim,
    #                             input_length=self.max_sequence_length,
    #                             trainable=False)

    # Creating LSTM Encoder
    lstm_layer = Bidirectional(LSTM(self.number_lstm_units,
                                    dropout=self.rate_drop_lstm,
                                    recurrent_dropout=self.rate_drop_lstm))

    # Creating LSTM Encoder layer for First Sentence; the inputs are raw
    # float sequences, so expand_dims adds the trailing feature dimension
    # the LSTM expects.
    sequence_1_input = Input(shape=(self.max_sequence_length,), dtype='float32')
    sequence_1 = Lambda(lambda x: expand_dims(x, axis=-1))(sequence_1_input)
    # embedded_sequences_1 = embedding_layer(sequence_1_input)
    # x1 = lstm_layer(embedded_sequences_1)
    x1 = lstm_layer(sequence_1)

    # Creating LSTM Encoder layer for Second Sentence
    sequence_2_input = Input(shape=(self.max_sequence_length,), dtype='float32')
    sequence_2 = Lambda(lambda x: expand_dims(x, axis=-1))(sequence_2_input)
    # embedded_sequences_2 = embedding_layer(sequence_2_input)
    # x2 = lstm_layer(embedded_sequences_2)
    x2 = lstm_layer(sequence_2)

    # Merging the two LSTM encoder vectors, then passing them to a dense
    # layer with dropout and batch normalisation
    merged = concatenate([x1, x2])
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    merged = Dense(self.number_dense_units,
                   activation=self.activation_function)(merged)
    merged = BatchNormalization()(merged)
    merged = Dropout(self.rate_drop_dense)(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='nadam',
                  metrics=['acc'])

    early_stopping = EarlyStopping(monitor='val_loss', patience=20)
    STAMP = 'lstm_%d_%d_%.2f_%.2f' % (self.number_lstm_units,
                                      self.number_dense_units,
                                      self.rate_drop_lstm,
                                      self.rate_drop_dense)

    checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time.time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    bst_model_path = checkpoint_dir + STAMP + '.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=False)
    tensorboard = TensorBoard(log_dir=checkpoint_dir + "logs/{}".format(time.time()))

    model.fit([train_data_1, train_data_2], labels_train,
              validation_data=([val_data_1, val_data_2], labels_val),
              epochs=200, batch_size=64, shuffle=True,
              callbacks=[early_stopping, model_checkpoint, tensorboard])

    return bst_model_path
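# Usage sketch for the embedding-free variant above: it consumes raw numeric
# sequences (each a vector of max_sequence_length floats), and the
# Lambda(expand_dims) step reshapes them to (timesteps, features=1) for the
# LSTM. The `numeric_siamese` instance and the random data are hypothetical.
import numpy as np
seq_a = np.random.rand(100, 10).astype('float32')  # 100 left sequences, length 10
seq_b = np.random.rand(100, 10).astype('float32')  # 100 right sequences
pairs = list(zip(seq_a, seq_b))
labels = np.random.randint(0, 2, size=100).tolist()
best_path = numeric_siamese.train_model(pairs, labels, model_save_directory='./')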