def build_ep_inference_model(model_settings):
    # architecture
    _input = Input(shape=(model_settings['max_len'], model_settings['emb_dim']), name='input')
    model = Bidirectional(LSTM(units=100, return_sequences=True), name='bilstm1')(_input)  # biLSTM
    model = Bidirectional(LSTM(units=100, return_sequences=True), name='bilstm2')(model)  # 2nd biLSTM
    model = TimeDistributed(Dense(model_settings['n_tags'], activation=None), name='td')(model)  # a dense layer
    crf = CRF(model_settings['n_tags'], name='crf')  # CRF layer
    out = crf(model)  # output

    model = Model(_input, out)
    model.compile(optimizer=Nadam(lr=0.01, clipnorm=1), loss=losses.crf_loss, metrics=[metrics.crf_accuracy])
    model.summary()
    return model
def build_qt_inference_model(model_settings):
    # architecture
    _input = Input(shape=(model_settings['max_len'], model_settings['emb_dim']), name='input')
    model = Bidirectional(LSTM(units=100, return_sequences=True, dropout=0.5, recurrent_dropout=0.5), name='bilstm1')(_input)  # biLSTM
    model = Bidirectional(LSTM(units=100, return_sequences=False, dropout=0.5, recurrent_dropout=0.5), name='bilstm2')(model)  # 2nd biLSTM
    _output = Dense(model_settings['n_tags'], activation='softmax', name='output')(model)  # a dense layer

    model = Model(_input, _output)
    model.compile(optimizer=Nadam(clipnorm=1), loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model
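# Usage sketch for the two builders above -- an assumption for illustration, not part
# of the original code. `model_settings` is taken to be a plain dict with 'max_len',
# 'emb_dim', and 'n_tags', and the CRF head is assumed to come from keras_contrib
# (e.g. `from keras_contrib.layers import CRF` together with
# `from keras_contrib import losses, metrics` for crf_loss / crf_accuracy).
model_settings = {'max_len': 100, 'emb_dim': 300, 'n_tags': 9}  # hypothetical values
ep_model = build_ep_inference_model(model_settings)  # per-token tags via the CRF layer
qt_model = build_qt_inference_model(model_settings)  # one softmax label per sequence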
def draw(self):
    with open(self.conf.train_dict, "rb") as fp:
        vocabulary = pickle.load(fp)

    # Model Configuration
    input_character = Input(shape=(None,), name="character")
    feature_character = Embedding(len(vocabulary.keys()) + 1, self.embedding_dim, mask_zero=True)(input_character)
    feature_character = Dropout(0.1)(feature_character)
    feature_character = Bidirectional(LSTM(self.bi_rnn_units // 2, return_sequences=True,
                                           recurrent_dropout=0.1))(feature_character)

    input_construction = Input(shape=(None, 4), name="cxn")

    model = concatenate([feature_character, input_construction])
    model = Bidirectional(LSTM(self.bi_rnn_units // 2, return_sequences=True, recurrent_dropout=0.6))(model)
    output = TimeDistributed(Dense(5, activation="softmax"))(model)

    model = Model(inputs=[input_character, input_construction], outputs=output)
    plot_model(model, self.conf.model_image.format("multi_input_and_output_model.png"), show_shapes=True)
    model.compile("rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
    model.summary()
    return model
def train_model(self, X, y, labels, word_index, MAX_SEQUENCE_LENGTH, model_save_directory='./models/'):
    """Train deep learning model"""
    embedding_matrix, nb_words = get_embedding('glove', word_index)

    input1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
    embedding = Embedding(input_dim=len(embedding_matrix),
                          output_dim=self.embedding_dim,
                          weights=[embedding_matrix],
                          input_length=MAX_SEQUENCE_LENGTH,
                          trainable=False)(input1)
    # embedding = Dropout(self.drop_rate_embedding)(embedding)
    model = Bidirectional(LSTM(units=self.num_lstm_units,
                               return_sequences=True,
                               recurrent_dropout=self.drop_rate_lstm))(embedding)
    model = TimeDistributed(Dense(units=self.num_lstm_units, activation=self.activation_function))(model)
    crf = CRF(units=len(labels))
    output1 = crf(model)

    model = Model(input1, output1)
    model.compile(optimizer='rmsprop',
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    print(model.summary())

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    STAMP = 'lstm_%f_%.2f' % (self.num_lstm_units, self.drop_rate_lstm)
    checkpoint_dir = model_save_directory + 'checkpoints/' + str(int(time())) + '/'
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    bst_model_path = checkpoint_dir + STAMP + '.h5'  # assumed checkpoint path built from checkpoint_dir and STAMP
    with open(bst_model_path + ".json", "w") as json_file:
        json_file.write(model.to_json())
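    # Plausible continuation, hedged (kept commented out because it is not part of the
    # snippet above): checkpoint the best weights to bst_model_path and fit with the
    # callbacks defined above; batch size, epochs, and validation split are hypothetical.
    # model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
    #                                    save_best_only=True, save_weights_only=True)
    # model.fit(X, np.array(y), batch_size=32, epochs=25, validation_split=0.1,
    #           callbacks=[early_stopping, model_checkpoint])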
def define_model():
    wordseq = Input(shape=(max_sent_length,))
    charSeq = Input(shape=(max_sent_length, max_wrd_len))

    wm = wordmodel(max_sent_length)(wordseq)
    cm = TimeDistributed(charmodel(max_wrd_len))(charSeq)
    cm = Reshape((max_sent_length, -1))(cm)

    combined_input = concatenate([wm, cm])
    model = Bidirectional(LSTM(units=100, recurrent_dropout=0.1))(combined_input)
    out = Dense(10, activation="softmax")(model)  # softmax output layer

    model = Model([wordseq, charSeq], out)

    # load existing weights if they exist
    if os.path.isfile(outFileName + "-best.hdf5"):
        model.load_weights(outFileName + "-best.hdf5")

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    # plot_model(model, show_shapes=True, to_file=outFileName + '-plot.png')
    return model
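# `wordmodel` and `charmodel` are referenced above but not shown here; the stand-ins
# below are hypothetical sketches (assumptions, not the original implementations),
# using the standard Keras Input/Embedding/LSTM/Model layers already used by the
# surrounding code. Each returns a sub-model that define_model() applies to a tensor.
def wordmodel(max_sent_length):
    # word-level branch: integer word ids -> embedding per position
    w_in = Input(shape=(max_sent_length,))
    w_emb = Embedding(input_dim=20000, output_dim=100)(w_in)  # vocabulary size is a placeholder
    return Model(w_in, w_emb)


def charmodel(max_wrd_len):
    # character-level branch: char ids of one word -> fixed-size encoding
    c_in = Input(shape=(max_wrd_len,))
    c_emb = Embedding(input_dim=100, output_dim=25)(c_in)  # character vocabulary size is a placeholder
    c_enc = LSTM(units=25)(c_emb)
    return Model(c_in, c_enc)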
def train(self, epochs, embedding=None):
    # Embedded Words
    txt_input = Input(shape=(None,), name='txt_input')
    txt_embed = Embedding(input_dim=self.num_words, output_dim=MAX_LEN, input_length=None,
                          name='txt_embedding', trainable=False, weights=[embedding])(txt_input)
    txt_drpot = Dropout(0.1, name='txt_dropout')(txt_embed)

    # Embedded Part of Speech
    pos_input = Input(shape=(None,), name='pos_input')
    pos_embed = Embedding(input_dim=self.num_pos, output_dim=MAX_LEN, input_length=None,
                          name='pos_embedding')(pos_input)
    pos_drpot = Dropout(0.1, name='pos_dropout')(pos_embed)

    # Embedded Characters
    char_in = Input(shape=(None, MAX_LEN_CHAR), name="char_input")
    emb_char = TimeDistributed(Embedding(input_dim=self.num_chars, output_dim=MAX_LEN_CHAR,
                                         input_length=None))(char_in)
    char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                    recurrent_dropout=0.5))(emb_char)

    # Concatenate inputs
    x = concatenate([txt_drpot, pos_drpot, char_enc], axis=2)
    x = SpatialDropout1D(0.3)(x)

    # Deep Layers
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)

    # Output
    out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)

    model = Model(inputs=[txt_input, pos_input, char_in], outputs=[out])
    model.compile(optimizer="rmsprop", loss='categorical_crossentropy', metrics=['accuracy'])
    plot_model(model, to_file=self.save_path + 'model_structure.png')
    print(model.summary())

    history = model.fit([self.X_train, self.train_pos, self.train_characters],
                        np.array(self.Y_train),
                        batch_size=32,
                        epochs=epochs,
                        validation_data=([self.X_validation, self.valid_pos, self.valid_characters],
                                         np.array(self.Y_validation)),
                        verbose=1)
    model.save(self.save_path + 'model_ner')

    test_eval = model.evaluate([self.X_test, self.test_pos, self.test_characters],
                               np.array(self.Y_test))
    print('Test loss:', test_eval[0])
    print('Test accuracy:', test_eval[1])

    return model, history
def bilstm(X_train, X_test, Y_train, Y_test, wordembeddings):
    np.random.seed(1234)
    tf.random.set_seed(1234)
    random.seed(1234)

    max_length_sentence = X_train.str.split().str.len().max()
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    EMBEDDING_DIM = 300
    vocabulary_size = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_valid = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(sequences_train, maxlen=max_length_sentence)
    X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
    y_train = np.asarray(Y_train)
    y_val = np.asarray(Y_test)
    # print(word_index)
    '''
    print('Shape of data tensor:', X_train.shape)
    print('Shape of data tensor:', X_val.shape)
    print('Shape of data tensor:', y_train.shape)
    print('Shape of data tensor:', y_val.shape)
    '''

    # build the embedding matrix; back off to title/upper case, then to a random vector
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in wordembeddings.keys():
            embedding_vector = wordembeddings[word]
            if len(embedding_vector) == 0:  # if array is empty
                embedding_vector = wordembeddings[word.title()]
                if len(embedding_vector) == 0:
                    embedding_vector = wordembeddings[word.upper()]
                    if len(embedding_vector) == 0:
                        embedding_vector = np.array([round(np.random.rand(), 8) for _ in range(0, 300)])
        else:
            # word not in the embedding dictionary
            embedding_vector = np.array([round(np.random.rand(), 8) for _ in range(0, 300)])
        if len(embedding_vector) != 0:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(vocabulary_size, EMBEDDING_DIM,
                                weights=[embedding_matrix], trainable=False)  # try with trainable=True

    inputs = Input(shape=(X_train.shape[1],))
    model = Embedding(vocabulary_size, EMBEDDING_DIM, input_length=max_length_sentence,
                      weights=[embedding_matrix])(inputs)
    model = Bidirectional(GRU(64))(model)  # change this layer for other models
    model = Dense(900, activation='relu')(model)
    model = Dense(400, activation='relu')(model)
    model = Dense(250, activation='relu')(model)
    model = Dense(204, activation='softmax')(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    callbacks = [EarlyStopping(monitor='val_loss')]
    hist_adam = model.fit(X_train, y_train, batch_size=1000, epochs=200, verbose=1,
                          validation_data=(X_val, y_val), callbacks=callbacks)

    model.save(config.bigru_prepocessed_dataset1_chai)  # change this for other models

    y_pred = model.predict(X_val)
    print(y_pred)
    y_val_class = pd.DataFrame(y_val).idxmax(axis=1)
    print(y_val_class)
    y_val_class_argmax = np.argmax(y_val, axis=1)
    y_pred_class_argmax = np.argmax(y_pred, axis=1)
    y_pred_class = pd.DataFrame(y_pred).idxmax(axis=1)
    print(y_pred_class)
    print(classification_report(y_val_class, y_pred_class))

    plt.suptitle('Optimizer : Adam', fontsize=10)
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
    plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
    plt.legend(loc='upper right')
    plt.savefig('/home/ubuntu/asset_classification/results/bigru_model_dataset1_preprocessed_chai.png')  # change this for other models

    tf.keras.utils.plot_model(model, to_file=config.bigru_architecture, show_shapes=True)  # change this for other models

    return (y_pred, y_val_class, y_pred_class, y_val_class_argmax, y_pred_class_argmax)
#                              recurrent_dropout=0.25)(inputs)
# attention layer
# model = attention_3d_block(model)

# Output FC layer
model = TimeDistributed(Dense(nb_classes, activation="softmax"))(model)
model = Model(inputs=inputs, outputs=model)
# model = multi_gpu_model(model, gpus=2)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              sample_weight_mode="temporal",
              metrics=['accuracy'])
model.summary()

# train on videos with sample weighting
# model.fit(x=X_train_m,
#           y=Y_train_,
#           validation_data=(X_vali_m, Y_vali_, M_vali[:, :, 0]),
#           epochs=nb_epoch,
#           batch_size=batch_size,
#           verbose=1,
#           # sample_weight=M_train[:, :, 0],
#           sample_weight=sample_weights,
#           callbacks=[lr_reducer, early_stopper, tensor_board, checkpointer])
model.fit_generator(train_generator(X_train, Y_train),
                    verbose=1,
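# `train_generator` is referenced above but not shown; the sketch below is a
# hypothetical stand-in (an assumption, not the original): a generator compatible with
# sample_weight_mode="temporal", yielding (inputs, one-hot targets, per-timestep weights).
import numpy as np

def train_generator(X, Y, batch_size=8):
    n = len(X)
    while True:
        for start in range(0, n, batch_size):
            x_batch = X[start:start + batch_size]
            y_batch = Y[start:start + batch_size]  # shape: (batch, timesteps, nb_classes)
            # weight every timestep equally; padded timesteps could be zeroed out instead
            w_batch = np.ones(y_batch.shape[:2])
            yield x_batch, y_batch, w_batch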
def main(self, glove):
    # get word embeddings
    utils = wordUtils.Utils()
    if glove:
        # use GloVe vectors
        self.words_list, self.embedding_matrix = utils.load_glove()
    else:
        self.words_list, self.embedding_matrix = utils.load_word2vec()
    unword_n = len(self.words_list)

    # get the training corpus
    cr = corpusreader.CorpusReader(self.textfile, self.annotfile)
    corpus = cr.trainseqs
    print(len(corpus))

    train = []
    print("Processing training data", datetime.now())
    for doc in corpus:
        tmp_dic = {}
        tmp_dic['tokens'] = doc['tokens']
        # convert SOBIE tags to numbers
        tags = doc['bio']
        tags = [self.lablist[i] for i in tags]
        tmp_dic['bion'] = tags
        train.append(tmp_dic)

    n_emb = 0
    n_unk = 0
    # replace each token by the index of its embedding
    for idx in range(len(train)):
        words = train[idx]['tokens']
        words_id = []
        for i in words:
            try:
                # the index of the word in the embedding matrix
                index = self.words_list.index(i)
                n_emb = n_emb + 1
            except ValueError:
                # use the embedding full of zeros to identify an unknown word
                n_unk = n_unk + 1
                index = unword_n
            words_id.append(index)
        train[idx]['tokens'] = words_id

    # bucket the training sequences and their labels by length
    train_l_d = {}
    train_l_labels = {}
    for seq in train:
        # corpus
        l = len(seq['tokens'])
        if l not in train_l_d:
            train_l_d[l] = []
        train_l_d[l].append(seq['tokens'])

        # labels
        l1 = len(seq['bion'])
        if l1 not in train_l_labels:
            train_l_labels[l1] = []
        train_l_labels[l1].append(seq['bion'])

    # sanity-check that tokens and labels stay aligned
    sizes = list(train_l_d.keys())
    for i in sizes:
        if len(train_l_d[i]) != len(train_l_labels[i]):
            print("Mismatch between token and label buckets for length", i)
        for m in range(len(train_l_d[i])):
            if len(train_l_d[i][m]) != len(train_l_labels[i][m]):
                print("Token/label length mismatch in a sequence of length", i)

    # model: frozen embeddings -> variational biLSTM -> dense -> CRF
    _input = Input(shape=(None,))
    el = Embedding(len(self.words_list) + 1, 200,
                   weights=[self.embedding_matrix], trainable=False)(_input)
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(el)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(self.lab_len)  # CRF layer
    out = crf(model)  # output

    model = Model(_input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()

    f_best = -1
    f_index = -1

    # OK, start actually training
    for epoch in range(self.epochsN):
        print("Epoch", epoch, "start at", datetime.now())
        # Train in batches of different sizes - randomize the order of sizes,
        # except for the first few epochs
        if epoch > 2:
            random.shuffle(sizes)
        for size in sizes:
            batch = train_l_d[size]
            labs = train_l_labels[size]
            tx = np.array([seq for seq in batch])
            y = [seq for seq in labs]
            ty = [to_categorical(i, num_classes=self.lab_len) for i in y]
            # This trains in mini-batches
            model.fit(tx, np.array(ty), verbose=0, epochs=1)
        print("Trained at", datetime.now())

        # save all epochs
        save_load_utils.save_all_weights(model, 'words-results/epoch_%s.h5' % epoch)

        # test the results
        test_data = 'corpus_char/tmVarCorpus/treated/test_data.txt'
        test_labels = 'corpus_char/tmVarCorpus/treated/test_labels.tsv'
        self.test_model(test_data, test_labels, model, glove)
        f = self.eval()
        if f > f_best:
            f_best = f
            f_index = epoch

    # Pick the best model, and save it with a useful name
    print("Choosing the best epoch")
    shutil.copyfile("words-results/epoch_%s.h5" % f_index, "words_glove_%s.h5" % f_index)
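# Reloading one of the saved epochs for inference -- an assumption, not shown in the
# original: keras_contrib's save_load_utils also provides load_all_weights, so the best
# epoch picked above can be restored into a freshly built model of the same architecture.
# from keras_contrib.utils import save_load_utils
# save_load_utils.load_all_weights(model, 'words_glove_%s.h5' % f_index)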