def data():
    """Load pre-computed BERT embeddings and labels for the configured dataset.

    Reads pickled train/test DataFrames plus claim- and sentence-embedding
    pickles from ``./bert/`` for ``dataset_name`` and returns them in the
    (train, test) order hyperas' ``data`` contract expects.

    Returns:
        tuple: ``(x_claim, x_sents, x_labels,
        test_claims_data, test_sents_data, test_labels)``
    """
    def _load_pickle(path):
        # Context manager guarantees the handle is closed; the original
        # `pickle.load(open(...))` pattern leaked file descriptors.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    dataset_name = 'fever_sup'
    nb_classes = 1  # only used for one-hot encoding in the 'fever_3' case

    train_data = _load_pickle("./bert/datasets/train_" + str(dataset_name) + ".pkl")
    x_claim = _load_pickle("./bert/new_embeddings/train_claims_" + str(dataset_name) + "_embed" + ".pkl")
    x_sents = _load_pickle("./bert/new_embeddings/train_sents_" + str(dataset_name) + "_embed" + ".pkl")
    # NOTE(review): "lablel" is how the key is spelled in the pickled data —
    # keep it as-is; "fixing" the spelling here would KeyError at runtime.
    x_labels = train_data["lablel"]
    if dataset_name == 'fever_3':
        # 3-class variant stores integer labels; convert to one-hot.
        x_labels = np_utils.to_categorical(x_labels, nb_classes)

    test_data = _load_pickle("./bert/datasets/test_" + str(dataset_name) + ".pkl")
    test_claims_data = _load_pickle("./bert/new_embeddings/test_claims_" + str(dataset_name) + "_embed" + ".pkl")
    test_sents_data = _load_pickle("./bert/new_embeddings/test_sents_" + str(dataset_name) + "_embed" + ".pkl")
    test_labels = test_data["lablel"]
    if dataset_name == 'fever_3':
        test_labels = np_utils.to_categorical(test_labels, nb_classes)

    return (x_claim, x_sents, x_labels,
            test_claims_data, test_sents_data, test_labels)
def lstm_model(x_claim, x_sents, x_labels, test_claims_data, test_sents_data,
               test_labels):
    """Build, train and evaluate the two-branch claim/sentence LSTM model.

    This is a hyperas template function: every ``{{choice(...)}}`` /
    ``{{uniform(...)}}`` token is substituted by the hyperas optimizer before
    execution and must stay verbatim.

    Args:
        x_claim / x_sents / x_labels: training claim embeddings, sentence
            embeddings and labels (as produced by ``data()``).
        test_claims_data / test_sents_data / test_labels: held-out test split.

    Returns:
        dict: hyperopt result — ``{'loss': -acc, 'status': STATUS_OK,
        'model': model}`` (negated accuracy so hyperopt minimizes).
    """
    claim_length = 65
    sents_length = 138
    embedding_dim = 768  # BERT base hidden size
    nb_classes = 1       # binary setup; the softmax branch below is kept for fever_3

    # Claim branch: two stacked LSTMs over the claim token embeddings.
    claims_input = Input(shape=(claim_length, embedding_dim), dtype='float32',
                         name='claims')
    encoded_claims = LSTM({{choice([8, 16, 64, 128, 256, 512, 1024])}},
                          return_sequences=True,
                          recurrent_dropout={{uniform(0, 1)}},
                          dropout={{uniform(0, 1)}})(claims_input)
    encoded_claims = LSTM({{choice([8, 16, 64, 128, 256, 512, 1024])}},
                          recurrent_dropout={{uniform(0, 1)}},
                          dropout={{uniform(0, 1)}})(encoded_claims)

    # Evidence-sentence branch: same structure over the sentence embeddings.
    sentences_input = Input(shape=(sents_length, embedding_dim), dtype='float32',
                            name='sentences')
    encoded_sentences = LSTM({{choice([8, 16, 64, 128, 256, 512, 1024])}},
                             return_sequences=True,
                             recurrent_dropout={{uniform(0, 1)}},
                             dropout={{uniform(0, 1)}})(sentences_input)
    encoded_sentences = LSTM({{choice([16, 64, 256, 128, 512, 1024])}},
                             recurrent_dropout={{uniform(0, 1)}},
                             dropout={{uniform(0, 1)}})(encoded_sentences)

    # Merge both branches, then classify through two regularized dense layers.
    concatenate_layers = concatenate([encoded_claims, encoded_sentences],
                                     axis=-1)
    concatenate_layers = Dropout({{uniform(0, 1)}})(concatenate_layers)
    concatenate_layers = Dense({{choice([8, 16, 32, 64, 256, 512, 1024])}},
                               kernel_regularizer=regularizers.l2(0.001),
                               activation='relu')(concatenate_layers)
    concatenate_layers = Dense({{choice([8, 16, 32, 64, 256, 512, 1024])}},
                               kernel_regularizer=regularizers.l2(0.001),
                               activation='relu')(concatenate_layers)
    concatenate_layers = Dropout({{uniform(0, 1)}})(concatenate_layers)

    # FIX: keep the loss consistent with the output activation. The original
    # always compiled with binary_crossentropy, which is wrong for the 3-class
    # softmax head. Behavior is unchanged today since nb_classes is 1.
    if nb_classes == 3:
        pred_label = Dense(3, activation='softmax')(concatenate_layers)
        loss = 'categorical_crossentropy'
    else:
        pred_label = Dense(1, activation='sigmoid')(concatenate_layers)
        loss = 'binary_crossentropy'

    model = Model([claims_input, sentences_input], pred_label)

    early_stopping = EarlyStopping(monitor='val_loss', patience=2)
    # Keep only the weights of the best validation-loss epoch.
    checkpointer = ModelCheckpoint(filepath='bert_keras_weights_feverbin.h5',
                                   verbose=1, save_best_only=True)

    model.compile(loss=loss, optimizer=Adam(), metrics=['accuracy'])
    model.fit({'claims': x_claim, 'sentences': x_sents},
              x_labels,
              batch_size={{choice([64, 128])}},
              epochs=20,
              verbose=2,
              validation_split=0.1,
              callbacks=[early_stopping, checkpointer])

    score, acc = model.evaluate(
        {'claims': test_claims_data, 'sentences': test_sents_data},
        test_labels, verbose=0)
    print('Test accuracy:', acc)
    # hyperopt minimizes 'loss', so return negated accuracy.
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}
# NOTE(review): collapsed fragment cut at both edges. It begins with the tail
# of a function whose `def` lies outside this view (a bare `return train_data`
# is illegal at module level), then the script entry point: parse the `task`
# CLI argument ('sent_retrieval' or 'claim_classification') and set up dataset
# names/paths. The final `elif` body appears to continue past the visible
# source, so the code is left byte-identical — do not reformat without the
# surrounding context.
train_data = pd.DataFrame(data=tmp_dict) return train_data if __name__ == '__main__': parser = argparse.ArgumentParser(description='start training') parser.add_argument('task', choices=['sent_retrieval', 'claim_classification'], help="what task should be performed?") task = parser.parse_args().task preprocess = preProcessing() training = TrainModel() metrics = Metrics() max_claims_length = 40 max_sents_length = 250 if task == 'sent_retrieval': nb_classes = 2 train_dataset_name = "fever_full_binary_train" test_dataset_name = "fever_full_binary_dev" train_data = "/scratch/kkuma12s/github/fact-validation/thesis-code/Proof_Extraction/data/fever-full/" + train_dataset_name + ".jsonl" elif task == 'claim_classification': print("claim classification selected ")