def lstm_model_headline_body_combin(body_length, numb_epoch):
    """Train the combined headline+body LSTM and evaluate it on the test set.

    The function loads the train and test splits, cleans and tokenizes the
    text, concatenates each headline with its body, fits the model built by
    ``models.lstm_with_combine_headline_body`` and finally writes the
    predicted stances for the test set to a CSV file that is handed to
    ``print_result``.

    Args:
        body_length: Body length passed to the model builder. It is coerced
            with ``int()`` where a number is needed (the padded sequence
            length is ``MAX_HEADLINE_LENGTH + int(body_length)``) and its
            ``str()`` form is embedded in the output file names.
        numb_epoch: Upper bound on the number of training epochs; coerced
            with ``int()``.

    Returns:
        None. Results are produced as side effects:

        * ``Fake_news_nlp.h5`` - weights saved by the model checkpoint;
        * ``<OBJECT_DUMP>/lstm_headline_body_combine<body_length>.txt`` -
          pickled list ``[acc, val_acc, loss, val_loss]`` from training;
        * ``<RESULT_FILE>_<body_length>.csv`` - predicted stances.

    Raises:
        ValueError, TypeError: If ``body_length`` or ``numb_epoch`` cannot
            be converted with ``int()``. The conversion happens before any
            data is loaded so a bad argument fails immediately.
    """
    # Convert the numeric arguments once, up front, so an invalid value is
    # rejected before the expensive loading and training steps start.
    body_len = int(body_length)
    epochs = int(numb_epoch)
    sequence_length = MAX_HEADLINE_LENGTH + body_len
    # The original (unconverted) value is what appears in file names.
    result_path = RESULT_FILE + "_" + str(body_length) + ".csv"

    fexc = Preprocessing()
    data = load_data()

    # Loading train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(
        train_stance_data, train_bodies_data)

    # Removing punctuation and stop words from the headline and body of
    # train data
    train_headlines_cl = fexc.get_clean_data(train_headlines)
    train_bodies_cl = fexc.get_clean_data(train_bodies)
    train_stances_cl = fexc.get_clean_data(train_stances)

    # Convert labels to integer
    train_stances_in = fexc.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(
        test_stance_data, test_bodies_data, data_type="test")

    # Removing punctuation and stop words from the headline and body of
    # test data
    test_headlines_cl = fexc.get_clean_data(test_headlines)
    test_bodies_cl = fexc.get_clean_data(test_bodies)

    # NOTE(review): the extra stop-word pass is applied to the test bodies
    # only; the matching call for the test headlines was disabled in the
    # original code and the train data never gets it. Confirm this
    # asymmetry is intended.
    test_bodies_cl = fexc.remove_stop_words_list(test_bodies_cl)

    # Set the tokenizer (fitted on train and test text together)
    alltext = (train_headlines_cl + train_bodies_cl
               + test_headlines_cl + test_bodies_cl)
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(alltext)
    word_index = token.word_index
    print('Number of Unique words: ' + str(len(word_index)))

    # Combine the headline and bodies of training data, convert the result
    # to integer sequences and pad them to a common length
    train_data = fexc.combine_heading_body(train_headlines_cl, train_bodies_cl)
    train_data = token.texts_to_sequences(train_data)
    train_data = pad_sequences(train_data, maxlen=sequence_length)

    # Converting the labels to one hot encoder
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Splitting the data in train and validation
    train_data, val_data, train_stances_final, stances_val = train_test_split(
        train_data, train_stances_in, test_size=0.2, random_state=42)

    # Same combine / sequence / pad pipeline for the test data
    test_data = fexc.combine_heading_body(test_headlines_cl, test_bodies_cl)
    test_data = token.texts_to_sequences(test_data)
    test_data = pad_sequences(test_data, maxlen=sequence_length)

    # Getting embedding index
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)
    print('Found %s word vectors.' % len(embeddings_index))

    # Getting embedding matrix
    embedding_matrix = models.get_embedding_matrix(
        embedding_dim=EMBEDDING_DIM,
        embeddings_index=embeddings_index,
        word_index=word_index)

    # Building the Model
    fake_nn = models.lstm_with_combine_headline_body(
        headline_length=MAX_HEADLINE_LENGTH,
        body_length=body_len,
        embedding_dim=EMBEDDING_DIM,
        word_index=word_index,
        embedding_matrix=embedding_matrix,
        activation='relu',
        drop_out=0.5,
        numb_layers=300,
        cells=200)

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(
        bst_model_path, save_best_only=True, save_weights_only=True)

    # Fitting the model
    fake_hist = fake_nn.fit(
        train_data,
        train_stances_final,
        batch_size=128,
        epochs=epochs,
        shuffle=True,
        validation_data=(val_data, stances_val),
        callbacks=[early_stopping, model_checkpoint])

    # Storing the training and validation accuracy and loss in file for plot
    history = fake_hist.history
    lstm_data = [
        history['acc'],
        history['val_acc'],
        history['loss'],
        history['val_loss'],
    ]
    history_path = os.path.join(
        OBJECT_DUMP,
        "lstm_headline_body_combine" + str(body_length) + ".txt")
    with open(history_path, 'wb') as bow_hist:
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for test data
    # NOTE(review): prediction uses the model as it stands after fit(); the
    # weights checkpointed to bst_model_path are not loaded back here.
    result = fake_nn.predict([test_data], batch_size=128)

    # Store the results in the result file
    result_str = fexc.convert_lable_string(result)

    # The csv module expects its files to be opened with newline='' so that
    # it controls line endings itself. The reader is consumed lazily, so the
    # input file has to stay open for as long as rows are being written.
    with io.open(TEST_FILE, mode='r', encoding='utf8',
                 newline='') as read_file, \
            io.open(result_path, mode='w', encoding='utf8',
                    newline='') as write_file:
        test_stance = csv.DictReader(read_file)
        writer = csv.DictWriter(
            write_file, fieldnames=['Headline', 'Body ID', 'Stance'])
        writer.writeheader()
        writer.writerows(
            {
                'Body ID': sample['Body ID'],
                'Headline': sample['Headline'],
                'Stance': prediction,
            }
            for sample, prediction in zip(test_stance, result_str))

    # Print the Accuracy, competition score and confusion matrix
    print_result("fnc-1-master/competition_test_stances.csv", result_path)