vob = vob.union(set(data[i][0]))

vecs = load_my_vecs("./output/word2vect.txt", vob)
vecs = handle_unknow(list(vob), vecs)

# build the word-to-index mapping; its order matches the rows of pretrained_weight
word_to_ix = {}
tag_to_ix = {"K": 0, "o": 1}
for i, word in enumerate(vecs.keys()):
    word_to_ix[word] = i
pretrained_weight = list(vecs.values())

train(X_train, word_to_ix, pretrained_weight)

result = []
for i in range(EPOCHS):
    # results on the training set
    yp_train = lstm.test("./model/lstm_w2v" + str(i) + ".pkl", X_train, word_to_ix)
    p_t, r_t, f1_t = lstm.calculate(y_train, yp_train)
    # results on the test set
    yp_test = lstm.test("./model/lstm_w2v" + str(i) + ".pkl", X_test, word_to_ix)
    p, r, f1 = lstm.calculate(y_test, yp_test)
    now = datetime.strftime(datetime.now(), "%m-%d %H:%M:%S")
    print(now, "epoch:", i)
    result.append(([p_t, r_t, f1_t], [p, r, f1]))

with open("./output/lstmw2vresult.txt", "w", encoding="utf-8") as f:
    for train_metrics, test_metrics in result:
        f.write(" ".join(map(str, train_metrics)))
        f.write("\t")
        f.write(" ".join(map(str, test_metrics)))
        f.write("\n")
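# NOTE: minimal sketches of the two helpers used above, for context only.
# Assumptions (not from the original code): the word2vec file is plain text
# with one "token v1 v2 ..." entry per line, and handle_unknow backfills
# out-of-vocabulary words with the mean of the known vectors. The real
# implementations may differ, hence the _sketch suffixes.
def load_my_vecs_sketch(path, vocab):
    """Load vectors only for tokens that appear in vocab."""
    vecs = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split()
            if parts and parts[0] in vocab:
                vecs[parts[0]] = [float(x) for x in parts[1:]]
    return vecs

def handle_unknow_sketch(vocab, vecs):
    """Assign the mean vector to every vocab word missing from vecs."""
    mean = [sum(col) / len(vecs) for col in zip(*vecs.values())]
    for word in vocab:
        vecs.setdefault(word, mean)
    return vecs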
import os
import argparse

from lstm import test

# parser = argparse.ArgumentParser()
# parser.add_argument('testing_list')
# parser.add_argument('feat_path')
# option = parser.parse_args()

testing_dir = './MLDS_hw2_data/testing_data/feat'
testing_list = './MLDS_hw2_data/testing_id.txt'

test(testing_dir, testing_list)
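# Usage sketch (assumption: lstm.test takes (feature_dir, id_list_path) in the
# order used above). To drive the paths from the command line instead of
# hard-coding them, restore the argparse block and call:
#     test(option.feat_path, option.testing_list)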
val_x = dev_x
val_y = change_dev_y

model = model_input.LSTModel(
    class_num=class_num,
    lstm_dims=lstm_sizes,
    fc_size=fc_size,
    max_sent_len=max_sent_len,
)
model.build()
print("****model build finished***")

# # ==================train=================
# min_dev_loss = model_input.train(
#     model,
#     learning_rate,
#     train_x,
#     train_y,
#     val_x,
#     val_y,
#     max_epochs,
#     batch_size,
#     keep_prob,
#     l2reg,
#     show_step=show_step
# )
# logger.info(f' ** The minimum dev_loss is {min_dev_loss}')

# ==================test =================
model_input.test(model, test_x, test_y, batch_size)
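# Sketch of the hyperparameters the constructor above expects. The values are
# placeholders (assumptions; the real values are defined elsewhere in the file):
# class_num = 2            # number of output classes
# lstm_sizes = [128, 64]   # hidden size of each stacked LSTM layer
# fc_size = 64             # width of the fully connected layer
# max_sent_len = 200       # sentences padded/truncated to this length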
def experiment_model(
    selection_problem,
    selection_test_fold,
    selection_source,
    selection_test_source,
    selection_count,
    selection_random_seed,
    selection_tag,
    selection_reject_minimum,
    selection_overwrite,
    al_threshold,
    embedding_type,
    embedding_shape,
    embedding_overwrite,
    model_type,
    model_arch_num,
    model_layer_sizes,
    model_maxlen,
    model_batch_size,
    model_learning_rate,
    model_epochs,
    model_num,
    experiment_tag,
    verbose=True,
    params=None
):
    # embed_df, sel_df, name = experiment_dataset(
    #     selection_problem,
    #     selection_source,
    #     selection_count,
    #     selection_random_seed,
    #     selection_reject_minimum,
    #     selection_overwrite,
    #     embedding_type,
    #     embedding_shape,
    #     embedding_overwrite,
    #     verbose=verbose
    # )
    embed_df, sel_df, name, test_selection_df, test_embedding_df, al_selection_df, al_embedding_df = experiment_dataset(
        selection_problem,
        selection_test_fold,
        selection_source,
        selection_test_source,
        selection_count,
        selection_random_seed,
        selection_tag,
        selection_reject_minimum,
        selection_overwrite,
        al_threshold,
        embedding_type,
        embedding_shape,
        embedding_overwrite,
    )

    X = embed_df
    X_test = test_embedding_df
    X_al_test = al_embedding_df

    target_col = ""
    if selection_problem == "reliability":
        target_col = "reliable"
        y = sel_df.reliable
        y_test = test_selection_df.reliable
        y_al_test = al_selection_df.reliable
    elif selection_problem in ("biased", "extreme_biased", "bias_direction"):
        # NOTE: unsure if this is where bias_direction should go?
        target_col = "biased"
        y = sel_df.biased
        y_test = test_selection_df.biased
        y_al_test = al_selection_df.biased

    # pad as needed
    data_width = 0
    if embedding_shape == "sequence":
        X = lstm.pad_data(X, maxlen=model_maxlen)
        X_test = lstm.pad_data(X_test, maxlen=model_maxlen)
        X_al_test = lstm.pad_data(X_al_test, maxlen=model_maxlen)
        # TODO: 300 actually needs to be width (num cols) of dataset
        data_width = X.shape[-1]
        if model_type == "cnn":
            X = np.reshape(X, (X.shape[0], model_maxlen*data_width, 1))
            X_test = np.reshape(X_test, (X_test.shape[0], model_maxlen*data_width, 1))
            X_al_test = np.reshape(X_al_test, (X_al_test.shape[0], model_maxlen*data_width, 1))
    else:
        X = np.array(X)
        y = np.array(y)
        X_test = np.array(X_test)
        y_test = np.array(y_test)
        X_al_test = np.array(X_al_test)
        y_al_test = np.array(y_al_test)
        print(X)
        data_width = X.shape[-1]

    if selection_problem == "bias_direction" and model_type != "svm":
        y = keras.utils.to_categorical(y, num_classes=3)
        y_test = keras.utils.to_categorical(y_test, num_classes=3)
        y_al_test = keras.utils.to_categorical(y_al_test, num_classes=3)

    if "AL_TRAINING" in experiment_tag:
        model = svm.LinearSVC()
        print(X_al_test.shape, y_al_test.shape)
        cv_results = cross_validate(model, X_al_test, y_al_test, cv=10)
        print("_"*80)
        print(cv_results["test_score"])
        results_scores = []
        total = 0
        for num in cv_results["test_score"]:
            results_scores.append(num)
            total += num
        total /= len(cv_results["test_score"])
        print(total)
        save_data = {"average": float(total), "scores": results_scores}
        output_path = f"../data/output/{experiment_tag}"
        util.create_dir(output_path)
        with open(output_path + "/" + experiment_tag + ".json", 'w') as outfile:
            json.dump(save_data, outfile)
        exit()

    name = f'{experiment_tag}_{name}_{model_type}_{model_arch_num}_{model_num}_{model_maxlen}_{model_batch_size}_{model_learning_rate}'

    if model_type == "lstm":
        model, history, loss, acc, predictions = lstm.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name, data_width,
            selection_problem)
        loss_al, acc_al, predictions_al = lstm.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "cnn":
        model, history, loss, acc, predictions = cnn.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name)
    elif model_type == "nn":
        model, history, loss, acc, predictions = nn.train_test(
            X, y, model_arch_num, model_layer_sizes, model_maxlen,
            model_batch_size, model_learning_rate, model_epochs,
            X_test, y_test, name, data_width, selection_problem)
        loss_al, acc_al, predictions_al = nn.test(X_al_test, y_al_test, model_batch_size, model)
    elif model_type == "svm":
        model = svm.LinearSVC(random_state=42)
        model.fit(X, y)
        history = None
        loss = 0
        acc = model.score(X_test, y_test)
        predictions = model.predict(X_test)
        loss_al = 0
        acc_al = model.score(X_al_test, y_al_test)
        predictions_al = model.predict(X_al_test)

    print("Training done")
    logging.info("%s", str(test_selection_df[target_col].value_counts()))
    print(test_selection_df[target_col].value_counts())

    # turn predictions into dataframe
    #pred = pd.DataFrame({"predicted": predictions})
    #pred.index = test_selection_df.index
    if selection_problem == "bias_direction" and model_type != "svm":
        test_selection_df["predicted"] = np.argmax(predictions, axis=1)
        test_selection_df["pred_class"] = np.argmax(predictions, axis=1)
        al_selection_df["predicted"] = np.argmax(predictions_al, axis=1)
        al_selection_df["pred_class"] = np.argmax(predictions_al, axis=1)
    else:
        test_selection_df["predicted"] = predictions
        test_selection_df["pred_class"] = round(test_selection_df.predicted).astype(int)
        al_selection_df["predicted"] = predictions_al
        al_selection_df["pred_class"] = round(al_selection_df.predicted).astype(int)

    #al_unique_selection_df = []

    # get list of sources for MBC that aren't in training set
    training_sources = list(set(sel_df.source))
    mbc_sources = list(set(al_selection_df.Source))
    unseen_mbc_sources = [x for x in mbc_sources
                          if x not in training_sources
                          and not (x in util.MBC_to_NELA and util.MBC_to_NELA[x] in training_sources)]
    al_unseen_selection_df = al_selection_df[al_selection_df.Source.isin(unseen_mbc_sources)]

    print("="*20, "TRAINING", "="*20)
    print(training_sources)
    print("="*20, "MBC", "="*20)
    print(mbc_sources)
    print("="*20, "UNSEEN", "="*20)
    print(unseen_mbc_sources)

    overall_counts = []
    overall_counts_al = []
    overall_counts_al_unseen = []
    # only unique sources
    if selection_problem != "bias_direction":
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=True)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=True)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=True)
    else:
        overall_counts = calculate_cm_counts(test_selection_df, target_col, binary=False)
        overall_counts_al = calculate_cm_counts(al_selection_df, target_col, binary=False)
        overall_counts_al_unseen = calculate_cm_counts(al_unseen_selection_df, target_col, binary=False)

    # make output directory (based on experiment tag)
    output_path = f"../data/output/{experiment_tag}"
    breakdown_output_path = output_path + "/persource"
    albreakdown_output_path = output_path + "/alpersource"
    util.create_dir(output_path)
    util.create_dir(breakdown_output_path)
    util.create_dir(albreakdown_output_path)

    logging.info("Overall confusion analysis")
    confusion_analysis(overall_counts, output_path, experiment_tag, name, history, loss, acc, params, False)
    logging.info("Overall analysis complete")
    groups = test_selection_df.groupby(test_selection_df.source)
    logging.info("There are %i groups", len(groups))
    for group_name, group in groups:
        logging.info("Next group %s", group_name)
        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
        confusion_analysis(group_counts, breakdown_output_path, experiment_tag, name + "_persource", history, loss, acc, params, source=group_name)

    #with open("../data/output/" + name + "_predictions.pkl", 'wb') as outfile:
    with open(output_path + "/" + name + "_predictions.pkl", 'wb') as outfile:
        pickle.dump(test_selection_df, outfile)

    logging.info("*****-----------------------------------------*****")
    logging.info("Article-level analysis")
    confusion_analysis(overall_counts_al, output_path, experiment_tag, name + "_al", None, loss_al, acc_al, params, False)
    logging.info("--- (With only unseen sources)")
    confusion_analysis(overall_counts_al_unseen, output_path, experiment_tag, name + "_al_unseen", None, loss_al, acc_al, params, False)

    with open(output_path + "/" + name + "_al_unseensourcelist.json", 'w') as outfile:
        json.dump(unseen_mbc_sources, outfile)

    # TODO: move unseen source calc to bottom and redo groups?
    groups = al_selection_df.groupby(al_selection_df.Source)
    logging.info("There are %i al groups", len(groups))
    for group_name, group in groups:
        logging.info("Next group %s", group_name)
        group_counts = []
        if selection_problem != "bias_direction":
            group_counts = calculate_cm_counts(group, target_col, binary=True)
        else:
            group_counts = calculate_cm_counts(group, target_col, binary=False)
        confusion_analysis(group_counts, albreakdown_output_path, experiment_tag, name + "_peralsource", None, loss_al, acc_al, params, source=group_name)

    with open(output_path + "/" + name + "_predictionsal.pkl", 'wb') as outfile:
        pickle.dump(al_selection_df, outfile)
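# NOTE: hypothetical sketch of the calculate_cm_counts helper used throughout
# experiment_model, assuming it tallies an n x n confusion matrix from the true
# label column (target_col) and the "pred_class" column added above, and that
# numpy is imported as np as elsewhere in this file. The project's real helper
# may return a different structure, hence the _sketch suffix.
def calculate_cm_counts_sketch(df, target_col, binary=True):
    n_classes = 2 if binary else 3  # bias_direction uses 3 classes
    counts = np.zeros((n_classes, n_classes), dtype=int)
    for true_label, pred_label in zip(df[target_col], df["pred_class"]):
        counts[int(true_label)][int(pred_label)] += 1
    return counts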
def main():
    print("Predict Youtube cross genre")
    directory = 'data/csv/'
    '''df_data, y = preprocess_data(directory, 'train_news_twitter.csv')
    df_test, test_y = preprocess_data(directory, 'youtube_train.csv')
    train_and_test(df_data, y, df_test, test_y, 100, 'youtube')

    print("Predict News cross genre")
    directory = 'data/csv/'
    df_data, y = preprocess_data(directory, 'train_youtube_twitter.csv')
    df_test, test_y = preprocess_data(directory, 'news_train.csv')
    train_and_test(df_data, y, df_test, test_y, 100, 'news')'''

    print("Predict Twitter cross genre")
    #directory = 'data/csv/'
    #df_data, y = preprocess_data(directory, 'twitter_train.csv')
    #df_test, test_y = preprocess_data(directory, 'twitter_train.csv')
    #print("Shape of train and test: ", df_data.shape, df_test.shape)
    #train_and_test(df_data, y, df_test, test_y, 100, 'twitter')

    '''directory = 'data/csv/'
    df_data, y, df_test, test_y = preprocess_data(directory, 'surprise_test.csv', split=True)
    print("Shape of train and test: ", df_data.shape, df_test.shape)
    train_and_test(df_data, y, df_test, test_y, 100, 'surprise')'''

    # cross genre
    '''model = 'models/news_model_cg_0.557.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_cg_0.557.pk', 'rb'))
    corpus.batch_size = 16
    model.batch_size = 16
    df_test, test_y = preprocess_data(directory, 'twitter_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_twitter_2', test=False)

    model = 'models/news_model_cg_0.557.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_cg_0.557.pk', 'rb'))
    corpus.batch_size = 10
    model.batch_size = 10
    df_test, test_y = preprocess_data(directory, 'news_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_news_1', test=False)

    model = 'models/youtube_model_cg_0.558.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/youtube_corpus_cg_0.558.pk', 'rb'))
    corpus.batch_size = 2
    model.batch_size = 2
    df_test, test_y = preprocess_data(directory, 'youtube_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_youtube_1', test=False)'''

    '''model = 'models/news_model_in.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/news_corpus_in.pk', 'rb'))
    corpus.batch_size = 1
    model.batch_size = 1
    df_test, test_y = preprocess_data(directory, 'surprise_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_CROSS_kb_1', test=False)'''

    # in genre
    model = 'models/youtube_model_in.pt'
    model = torch.load(model)
    corpus = pickle.load(open('models/youtube_corpus_in.pk', 'rb'))
    corpus.batch_size = 2
    model.batch_size = 2
    df_test, test_y = preprocess_data(directory, 'youtube_test.csv', predict=True)
    test(df_test, test_y, model, corpus, 'IJS-KD_IN_youtube_1', test=False)

    '''model = 'models/news_model_in.pt'