def objective_function_veracity_branchLSTM_RumEv(params):
    """Objective for hyper-parameter search on the RumEval2019 train/dev split.

    Loads the saved train and dev arrays, fits/predicts through
    ``LSTM_model_veracity`` and scores the tree-level predictions produced by
    ``branch2treelabels`` with macro-averaged F1.

    Args:
        params: hyper-parameter set, passed through to ``LSTM_model_veracity``
            and echoed back in the result.

    Returns:
        dict with ``'loss'`` (1 - macro F1), ``'Params'``, ``'status'``
        (``STATUS_OK``) and ``'attachments'`` holding tree ids, predictions
        and labels.
    """
    data_dir = 'preprocessing/saved_dataRumEval2019'

    # Training split; labels are converted with to_categorical before fitting.
    x_train = np.load(os.path.join(data_dir, 'train/train_array.npy'))
    y_train = to_categorical(
        np.load(os.path.join(data_dir, 'train/labels.npy')),
        num_classes=None)

    # Dev split is used as the evaluation set for the search.
    x_test = np.load(os.path.join(data_dir, 'dev/train_array.npy'))
    y_test = np.load(os.path.join(data_dir, 'dev/labels.npy'))
    ids_test = np.load(os.path.join(data_dir, 'dev/ids.npy'))

    y_pred, confidence = LSTM_model_veracity(x_train, y_train, x_test, params)

    # Branch-level outputs are aggregated to tree level; the fourth return
    # value is not needed here.
    trees, tree_prediction, tree_label, _ = branch2treelabels(
        ids_test, y_test, y_pred, confidence)
    macro_f1 = f1_score(tree_label, tree_prediction, average='macro')

    return {
        'loss': 1 - macro_f1,
        'Params': params,
        'status': STATUS_OK,
        'attachments': {
            'ID': trees,
            'Predictions': tree_prediction,
            'Labels': tree_label,
        },
    }
def objective_function_veracity_branchLSTM_fullPHEME(params):
    """Objective for hyper-parameter search on the full PHEME event folders.

    Trains ``heteroscedastic_model`` (3 output classes) on seven events and
    evaluates on the held-out ``'charliehebdo'`` event using macro F1 over the
    tree-level labels returned by ``branch2treelabels``.

    Args:
        params: hyper-parameter mapping; ``'mb_size'`` and ``'num_epochs'``
            are read here, the whole mapping is passed to the model builder.

    Returns:
        dict with ``'loss'`` (1 - macro F1), ``'Params'`` and ``'status'``.
    """
    path = 'saved_data_fullPHEME'
    training_events = (
        'ebola-essien', 'ferguson', 'gurlitt', 'ottawashooting',
        'prince-toronto', 'putinmissing', 'sydneysiege',
    )
    test = 'charliehebdo'
    max_branch_len = 25

    collected_x = []
    collected_y = []
    for event in training_events:
        event_x = np.load(os.path.join(path, event, 'train_array.npy'))
        event_y = np.load(os.path.join(path, event, 'labels.npy'))
        # Every event is padded/truncated to the same branch length so the
        # per-event arrays can be stacked together.
        event_x = pad_sequences(event_x,
                                maxlen=max_branch_len,
                                dtype='float32',
                                padding='post',
                                truncating='post',
                                value=0.)
        collected_x.extend(event_x)
        collected_y.extend(event_y)
    x_train = np.asarray(collected_x)
    y_train = np.asarray(collected_y)

    # NOTE(review): the test array is used as loaded (no pad_sequences call),
    # unlike the training arrays -- confirm this is intended.
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    y_train = to_categorical(y_train, num_classes=None)
    model = heteroscedastic_model(x_train, y_train, params, output_classes=3)

    mb_size = params['mb_size']
    # The same targets are supplied once per model output.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=params['num_epochs'],
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test, batch_size=mb_size, verbose=False)
    # Only the second output is used for the class decision.
    y_pred = np.argmax(predictions_test[1], axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)
    macro_f1 = f1_score(tree_label, tree_prediction, average='macro')

    return {'loss': 1 - macro_f1, 'Params': params, 'status': STATUS_OK}
def objective_function_branchLSTM_Twitter15(params):
    """Objective for hyper-parameter search on fold 0 of the Twitter15 data.

    Fits ``heteroscedastic_model`` (4 output classes) on ``0/train`` and scores
    ``0/test`` with macro F1 over the tree-level labels.

    Args:
        params: hyper-parameter mapping; ``'mb_size'`` and ``'num_epochs'``
            are read here, the whole mapping is passed to the model builder.

    Returns:
        dict with ``'loss'`` (1 - macro F1), ``'Params'`` and ``'status'``.
    """
    path = 'preprocessing/saved_data_15'
    # Fold 0 serves as the development set.
    train_dir = os.path.join(path, '0/train')
    test_dir = os.path.join(path, '0/test')
    max_branch_len = 25

    raw_x_train = np.load(os.path.join(train_dir, 'train_array.npy'))
    y_train = np.load(os.path.join(train_dir, 'labels.npy'))
    # Pad/truncate every branch to max_branch_len steps.
    x_train = pad_sequences(raw_x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)

    x_test = np.load(os.path.join(test_dir, 'train_array.npy'))
    y_test = np.load(os.path.join(test_dir, 'labels.npy'))
    ids_test = np.load(os.path.join(test_dir, 'ids.npy'))

    y_train = to_categorical(y_train, num_classes=None)
    model = heteroscedastic_model(x_train, y_train, params, output_classes=4)

    mb_size = params['mb_size']
    # The same targets are supplied once per model output.
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=params['num_epochs'],
              shuffle=False,
              class_weight=None)

    predictions_test = model.predict(x_test, batch_size=mb_size, verbose=False)
    # Only the second output is used for the class decision.
    y_pred = np.argmax(predictions_test[1], axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)
    macro_f1 = f1_score(tree_label, tree_prediction, average='macro')

    return {'loss': 1 - macro_f1, 'Params': params, 'status': STATUS_OK}
def objective_MTL2_detection_CV5(params):
    """Objective for hyper-parameter search on the rumour-detection task.

    Reads the training and hold-out folders from the module-level names
    ``train_path`` and ``holdout_path`` (not defined in this function), trains
    via ``training`` and scores the hold-out tree-level predictions with
    binary-averaged F1. Progress information is printed to stdout.

    Args:
        params: hyper-parameter set, passed through to ``training``.

    Returns:
        dict with ``'loss'`` (1 - binary F1), ``'Params'`` and ``'status'``.
    """
    max_branch_len = 25

    print("Train ", train_path)
    print("Holdout ", holdout_path)
    print("")

    features = np.load(os.path.join(train_path, 'train_array.npy'))
    targets = np.load(os.path.join(train_path, 'rnr_labels.npy'))
    features = np.asarray(
        pad_sequences(features,
                      maxlen=max_branch_len,
                      dtype='float32',
                      padding='post',
                      truncating='post',
                      value=0.))
    print("X train shape ", features.shape)
    targets = np.asarray(targets)
    # Shape is reported before the labels are expanded to two columns.
    print("y train ", targets.shape)
    targets = to_categorical(targets, num_classes=2)
    print("")

    x_holdout = np.load(os.path.join(holdout_path, 'train_array.npy'))
    print("X holdout shape ", x_holdout.shape)
    print("")
    y_holdout = np.load(os.path.join(holdout_path, 'rnr_labels.npy'))
    ids_holdout = np.load(os.path.join(holdout_path, 'ids.npy'))

    # Time only the training call.
    started = time.time()
    model = training(params, features, targets)
    finished = time.time()
    print("** Elapsed time ", finished - started)
    print("")

    holdout_classes = np.argmax(model.predict(x_holdout, verbose=0), axis=1)
    trees, tree_prediction, tree_label = branch2treelabels(
        ids_holdout, y_holdout, holdout_classes)
    binary_f1 = f1_score(tree_label, tree_prediction, average='binary')

    return {
        'loss': (1 - binary_f1),
        'Params': params,
        'status': STATUS_OK,
    }
def main():
    """Train on the saved SDQC train split, evaluate on dev and report results.

    Runs ``LSTM_model_veracity``, aggregates branch predictions to tree level,
    folds class 2 into class 1 for both predictions and labels, writes the
    result through ``save_output`` and prints accuracy, a classification
    report and a confusion-matrix plot.
    """
    data_dir = 'Feature_Engineering/Saved_dataset_sdqc'

    x_train = np.load(os.path.join(data_dir, 'train/train_array.npy'))
    y_train = to_categorical(
        np.load(os.path.join(data_dir, 'train/labels.npy')),
        num_classes=None)
    x_test = np.load(os.path.join(data_dir, 'dev/train_array.npy'))
    y_test = np.load(os.path.join(data_dir, 'dev/labels.npy'))
    ids_test = np.load(os.path.join(data_dir, 'dev/ids.npy'))

    y_pred, confidence = LSTM_model_veracity(x_train, y_train, x_test)
    trees, tree_prediction, tree_label, veracity_confidence = \
        branch2treelabels(ids_test, y_test, y_pred, confidence)

    # Collapse class 2 into class 1. A prediction of class 2 also has its
    # confidence reset to 0.
    # NOTE(review): the label check is treated as independent of the
    # prediction check (the two-name classification report below needs every
    # label collapsed) -- confirm against the original layout.
    for pos in range(len(veracity_confidence)):
        if tree_prediction[pos] == 2:
            veracity_confidence[pos] = 0
            tree_prediction[pos] = 1
        if tree_label[pos] == 2:
            tree_label[pos] = 1

    # The same empty list is passed for the first two arguments.
    dummy = []
    save_output(dummy, dummy, trees, tree_prediction, veracity_confidence)

    print(trees)
    print("Predictions", tree_prediction)
    print("Labels", tree_label)
    print("Confidence", veracity_confidence)
    print("Accuracy :", accuracy_score(tree_label, tree_prediction))

    from sklearn.metrics import classification_report
    report = classification_report(tree_label,
                                   tree_prediction,
                                   target_names=["true", "false"])
    print(report)

    import scikitplot as skplt
    import matplotlib.pyplot as plt
    skplt.metrics.plot_confusion_matrix(tree_label, tree_prediction)
    plt.show()
def objective_MTL2_RumEval(params):
    """Objective for hyper-parameter search of the two-task RumEval model.

    Trains via ``training`` on the saved train split with Task A targets
    (``fold_stance_labels.npy``) and Task B targets (``labels.npy``), then
    scores both tasks on the dev split.

    Args:
        params: hyper-parameter set, passed through to ``training``.

    Returns:
        dict with ``'loss'`` -- the sum of (1 - macro F1) over Task A and
        Task B -- plus ``'Params'`` and ``'status'``.
    """
    path = 'saved_data/saved_data_RumEv'

    x_train = np.load(os.path.join(path, 'train/train_array.npy'))
    y_trainA = np.load(os.path.join(path, 'train/fold_stance_labels.npy'))
    y_trainB = np.load(os.path.join(path, 'train/labels.npy'))
    y_trainB = to_categorical(y_trainB, num_classes=3)

    x_test = np.load(os.path.join(path, 'dev/train_array.npy'))
    y_testA = np.load(os.path.join(path, 'dev/fold_stance_labels.npy'))
    y_testB = np.load(os.path.join(path, 'dev/labels.npy'))
    ids_testA = np.load(os.path.join(path, 'dev/tweet_ids.npy'))
    ids_testB = np.load(os.path.join(path, 'dev/ids.npy'))

    model = training(params, x_train, y_trainA, y_trainB)
    pred_probabilities_a, pred_probabilities_b = model.predict(x_test,
                                                               verbose=0)
    # Task A has one prediction per position in a branch (argmax over the
    # last of three axes); Task B has one prediction per branch.
    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)

    # Task B: aggregate branch predictions to tree level.
    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, y_testB, Y_pred_b)
    mactest_F_b = f1_score(tree_label, tree_prediction, average='macro')

    # Task A: a tweet id can occur in several branches, so each id is scored
    # once, at its first occurrence in the flattened arrays.
    fids_test = ids_testA.flatten()
    fy_pred = Y_pred_a.flatten()
    fy_test = np.argmax(y_testA, axis=2).flatten()
    _, uindices2 = np.unique(fids_test, return_index=True)
    uindices2 = uindices2.tolist()
    # The smallest unique id is discarded -- presumably the padding marker
    # used for short branches (TODO confirm).
    del uindices2[0]
    uniq_dev_prediction = [fy_pred[i] for i in uindices2]
    uniq_dev_label = [fy_test[i] for i in uindices2]
    # scikit-learn's signature is f1_score(y_true, y_pred): labels go first.
    mactest_F_a = f1_score(uniq_dev_label,
                           uniq_dev_prediction,
                           average='macro')

    return {
        'loss': (1 - mactest_F_a) + (1 - mactest_F_b),
        'Params': params,
        'status': STATUS_OK
    }
def eval_MTL2_RumEval(params, fname):
    """Train the two-task RumEval model on train+dev and evaluate on test.

    The dev arrays are padded to the training sequence length and appended to
    the training data. After training, accuracy and macro / micro / per-class
    precision, recall and F-score are computed for Task A and Task B, and the
    full result is pickled to ``output/output<fname>.pkl``.

    Args:
        params: hyper-parameter set, passed through to ``training``.
        fname: suffix inserted into the name of the pickle file.

    Returns:
        dict with ``'Params'``, ``'TaskA'``, ``'TaskB'`` metric groups and
        ``'attachments'`` (ids, labels and predictions for both tasks).
    """
    path = 'saved_data/saved_data_RumEv'

    x_train = np.load(os.path.join(path, 'train/train_array.npy'))
    y_trainA = np.load(os.path.join(path, 'train/fold_stance_labels.npy'))
    y_trainB = np.load(os.path.join(path, 'train/labels.npy'))
    y_trainB = to_categorical(y_trainB, num_classes=3)

    x_dev = np.load(os.path.join(path, 'dev/train_array.npy'))
    y_devA = np.load(os.path.join(path, 'dev/fold_stance_labels.npy'))
    y_devB = np.load(os.path.join(path, 'dev/labels.npy'))
    y_devB = to_categorical(y_devB, num_classes=3)

    x_test = np.load(os.path.join(path, 'test/train_array.npy'))
    y_testA = np.load(os.path.join(path, 'test/fold_stance_labels.npy'))
    y_testB = np.load(os.path.join(path, 'test/labels.npy'))
    ids_testA = np.load(os.path.join(path, 'test/tweet_ids.npy'))
    ids_testB = np.load(os.path.join(path, 'test/ids.npy'))

    # Bring dev sequences to the training sequence length so that the two
    # sets can be concatenated along the sample axis.
    x_dev = pad_sequences(x_dev,
                          maxlen=len(x_train[0]),
                          dtype='float32',
                          padding='post',
                          truncating='post',
                          value=0.)
    y_devA = pad_sequences(y_devA,
                           maxlen=len(y_trainA[0]),
                           dtype='float32',
                           padding='post',
                           truncating='post',
                           value=0.)
    x_train = np.concatenate((x_train, x_dev), axis=0)
    y_trainA = np.concatenate((y_trainA, y_devA), axis=0)
    y_trainB = np.concatenate((y_trainB, y_devB), axis=0)

    model = training(params, x_train, y_trainA, y_trainB)
    pred_probabilities_a, pred_probabilities_b = model.predict(x_test,
                                                               verbose=0)
    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)

    # ---- Task B: tree-level scores --------------------------------------
    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, y_testB, Y_pred_b)
    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction, average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction, average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction)
    Bacc = accuracy_score(tree_label, tree_prediction)

    # ---- Task A: one score per unique tweet id --------------------------
    fids_test = ids_testA.flatten()
    fy_pred = Y_pred_a.flatten()
    fy_test = np.argmax(y_testA, axis=2).flatten()
    uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
    uniqtwid = uniqtwid.tolist()
    uindices2 = uindices2.tolist()
    # The smallest unique id is discarded -- presumably the padding marker
    # used for short branches (TODO confirm).
    del uniqtwid[0]
    del uindices2[0]
    uniq_dev_prediction = [fy_pred[i] for i in uindices2]
    uniq_dev_label = [fy_test[i] for i in uindices2]

    Amactest_P, Amactest_R, Amactest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction, average='macro')
    Amictest_P, Amictest_R, Amictest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction, average='micro')
    Atest_P, Atest_R, Atest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction)
    Aacc = accuracy_score(uniq_dev_label, uniq_dev_prediction)

    output = {
        'Params': params,
        'TaskA': {
            'accuracy': Aacc,
            'Macro': {
                'Macro_Precision': Amactest_P,
                'Macro_Recall': Amactest_R,
                'Macro_F_score': Amactest_F
            },
            'Micro': {
                'Micro_Precision': Amictest_P,
                'Micro_Recall': Amictest_R,
                'Micro_F_score': Amictest_F
            },
            'Per_class': {
                'Pclass_Precision': Atest_P,
                'Pclass_Recall': Atest_R,
                'Pclass_F_score': Atest_F
            }
        },
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'attachments': {
            'Task A': {
                'ID': uniqtwid,
                'Label': uniq_dev_label,
                'Prediction': uniq_dev_prediction
            },
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction,
                'Branch': {
                    'ID': ids_testB,
                    'Label': y_testB,
                    'Prediction': Y_pred_b
                }
            }
        }
    }

    directory = "output"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(directory, exist_ok=True)
    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)
    return output
def eval_MTL2_detection_CV(params, data, fname):
    """Leave-one-event-out evaluation of the two-task (B + C) model.

    For every event folder in the fold list, a model is trained via
    ``training`` on all other events and evaluated on the held-out one.
    Tree-level predictions of all folds are pooled and scored: Task B with
    macro / micro / per-class metrics over labels ``[0, 1, 2]``, Task C with
    binary-averaged and per-class metrics. The result is pickled to
    ``output/output<fname>.pkl``.

    Args:
        params: hyper-parameter set, passed through to ``training``.
        data: ``'PHEME5'`` selects the five-event fold list; any other value
            selects the nine-event list.
        fname: suffix inserted into the name of the pickle file.

    Returns:
        dict with ``'Params'``, ``'TaskB'``, ``'TaskC'`` metric groups and
        ``'attachments'`` (pooled ids/labels/predictions and per-fold
        results under ``'allfolds'``).
    """
    path = 'saved_data/saved_data_MTL2_detection'
    if data == 'PHEME5':
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege'
        ]
    else:
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege', 'putinmissing', 'prince-toronto', 'gurlitt',
            'ebola-essien'
        ]
    max_branch_len = 25

    allfolds = []
    cv_ids_b, cv_prediction_b, cv_label_b = [], [], []
    cv_ids_c, cv_prediction_c, cv_label_c = [], [], []

    for number, test in enumerate(folds):
        print(test)
        # Every event except the held-out one is used for training.
        train = folds[:number] + folds[number + 1:]

        x_train, yb_train, yc_train = [], [], []
        for t in train:
            temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
            temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))
            # Pad/truncate to a common branch length so events can be stacked.
            temp_x_train = pad_sequences(temp_x_train,
                                         maxlen=max_branch_len,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post',
                                         value=0.)
            x_train.extend(temp_x_train)
            yb_train.extend(temp_yb_train)
            yc_train.extend(temp_yc_train)
        x_train = np.asarray(x_train)
        yb_train = np.asarray(yb_train)
        yc_train = to_categorical(np.asarray(yc_train), num_classes=2)

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        yb_test = np.load(os.path.join(path, test, 'labels.npy'))
        yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
        ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

        model = training(params, x_train, yb_train, yc_train)
        pred_probabilities_b, pred_probabilities_c = model.predict(x_test,
                                                                   verbose=0)
        Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
        Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

        # Task B is scored only on branches whose label row has a non-zero
        # entry; all-zero rows presumably carry no Task B label (TODO confirm).
        maskB = np.any(yb_test, axis=1)
        ids_testB = np.asarray(list(compress(ids_testBC, maskB)))
        yb_test = np.asarray(list(compress(yb_test, maskB)))
        Y_pred_b = np.asarray(list(compress(Y_pred_b, maskB)))
        yb_test = np.argmax(yb_test, axis=1)

        trees, tree_prediction, tree_label = branch2treelabels(
            ids_testB, yb_test, Y_pred_b)
        treesC, tree_predictionC, tree_labelC = branch2treelabels(
            ids_testBC, yc_test, Y_pred_c)

        perfold_result = {
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'Task C': {
                'ID': treesC,
                'Label': tree_labelC,
                'Prediction': tree_predictionC
            }
        }

        cv_ids_c.extend(treesC)
        cv_prediction_c.extend(tree_predictionC)
        cv_label_c.extend(tree_labelC)
        cv_ids_b.extend(trees)
        cv_prediction_b.extend(tree_prediction)
        cv_label_b.extend(tree_label)
        allfolds.append(perfold_result)

    # Task C: the 'Macro' and 'Micro' slots both hold the binary-averaged
    # scores (as before); the identical call is now made only once.
    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='binary')
    Cmictest_P, Cmictest_R, Cmictest_F = Cmactest_P, Cmactest_R, Cmactest_F
    Ctest_P, Ctest_R, Ctest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c)
    Cacc = accuracy_score(cv_label_c, cv_prediction_c)

    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2])
    Bacc = accuracy_score(cv_label_b, cv_prediction_b)

    output = {
        'Params': params,
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            },
            'Micro': {
                'Micro_Precision': Cmictest_P,
                'Micro_Recall': Cmictest_R,
                'Micro_F_score': Cmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Ctest_P,
                'Pclass_Recall': Ctest_R,
                'Pclass_F_score': Ctest_F
            }
        },
        'attachments': {
            'Task B': {
                'ID': cv_ids_b,
                'Label': cv_label_b,
                'Prediction': cv_prediction_b
            },
            'Task C': {
                'ID': cv_ids_c,
                'Label': cv_label_c,
                'Prediction': cv_prediction_c
            },
            'allfolds': allfolds
        }
    }

    directory = "output"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(directory, exist_ok=True)
    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)
    return output
def objective_MTL2_detection_CV9(params):
    """Objective for hyper-parameter search of the two-task (B + C) model.

    Trains via ``training`` on seven event folders and evaluates on the
    held-out ``'charliehebdo'`` event.

    Args:
        params: hyper-parameter set, passed through to ``training``.

    Returns:
        dict with ``'loss'`` -- (1 - macro F1 of Task B over labels 0..2) plus
        (1 - binary F1 of Task C) -- and ``'Params'`` and ``'status'``.
    """
    path = 'saved_data/saved_data_MTL2_detection'
    training_events = (
        'ferguson', 'ottawashooting', 'sydneysiege', 'putinmissing',
        'prince-toronto', 'gurlitt', 'ebola-essien',
    )
    test = 'charliehebdo'
    max_branch_len = 25

    stacked_x = []
    stacked_yb = []
    stacked_yc = []
    for event in training_events:
        event_x = np.load(os.path.join(path, event, 'train_array.npy'))
        event_yb = np.load(os.path.join(path, event, 'labels.npy'))
        event_yc = np.load(os.path.join(path, event, 'rnr_labels.npy'))
        # Pad/truncate to a common branch length so events can be stacked.
        event_x = pad_sequences(event_x,
                                maxlen=max_branch_len,
                                dtype='float32',
                                padding='post',
                                truncating='post',
                                value=0.)
        stacked_x.extend(event_x)
        stacked_yb.extend(event_yb)
        stacked_yc.extend(event_yc)
    x_train = np.asarray(stacked_x)
    yb_train = np.asarray(stacked_yb)
    yc_train = to_categorical(np.asarray(stacked_yc), num_classes=2)

    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    yb_test = np.load(os.path.join(path, test, 'labels.npy'))
    yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
    ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

    model = training(params, x_train, yb_train, yc_train)
    prob_b, prob_c = model.predict(x_test, verbose=0)
    pred_b = np.argmax(prob_b, axis=1)
    pred_c = np.argmax(prob_c, axis=1)

    # Task B is scored only on branches whose label row has a non-zero entry;
    # all-zero rows presumably carry no Task B label (TODO confirm).
    keep_b = np.any(yb_test, axis=1)
    ids_testB = np.asarray(list(compress(ids_testBC, keep_b)))
    true_b = np.asarray(list(compress(yb_test, keep_b)))
    pred_b = np.asarray(list(compress(pred_b, keep_b)))
    true_b = np.argmax(true_b, axis=1)

    _, tree_prediction, tree_label = branch2treelabels(
        ids_testB, true_b, pred_b)
    _, tree_predictionC, tree_labelC = branch2treelabels(
        ids_testBC, yc_test, pred_c)

    f1_b = f1_score(tree_label,
                    tree_prediction,
                    average='macro',
                    labels=[0, 1, 2])
    f1_c = f1_score(tree_labelC, tree_predictionC, average='binary')

    return {
        'loss': (1 - f1_b) + (1 - f1_c),
        'Params': params,
        'status': STATUS_OK,
    }
def eval_MTL2_detection_CV(params, data, fname):
    """Train, persist and evaluate the single-task rumour-detection model.

    Reads its folders from the module-level names ``train_path``,
    ``test_path`` and ``save_path`` (not defined in this function). The
    trained model is saved to ``<save_path>/model_<fname>.h5`` and reloaded
    from there before predicting, so the scores reflect the persisted model.
    Results are pickled to ``<save_path>/output_<fname>.pkl``.

    NOTE(review): another function with this same name is defined earlier in
    this file; if both live in one module, this later definition shadows it.

    Args:
        params: hyper-parameter set, passed through to ``training``.
        data: not used by this function.
        fname: suffix used in the model and pickle file names.

    Returns:
        dict with ``'Params'``, ``'TaskC'`` (accuracy and binary-averaged
        precision/recall/F-score stored under the ``'Macro'`` key) and
        ``'attachments'``.
    """
    max_branch_len = 25

    x_train = np.load(os.path.join(train_path, 'train_array.npy'))
    y_train = np.load(os.path.join(train_path, 'rnr_labels.npy'))
    x_train = pad_sequences(x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)
    x_train = np.asarray(x_train)
    y_train = to_categorical(np.asarray(y_train), num_classes=2)

    x_test = np.load(os.path.join(test_path, 'train_array.npy'))
    y_test = np.load(os.path.join(test_path, 'rnr_labels.npy'))
    ids_test = np.load(os.path.join(test_path, 'ids.npy'))

    model = training(params, x_train, y_train)

    # The output directory must exist before the model is written into it;
    # previously it was only created after model.save(), so a missing
    # save_path could make the save fail.
    os.makedirs(save_path, exist_ok=True)
    model_file = os.path.join(save_path, 'model_' + fname + '.h5')
    model.save(model_file)
    del model
    model = load_model(model_file)

    pred_probabilities = model.predict(x_test, verbose=0)
    Y_pred = np.argmax(pred_probabilities, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, Y_pred)
    perfold_result = {
        'Task C': {
            'ID': trees,
            'Label': tree_label,
            'Prediction': tree_prediction
        }
    }

    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction, average='binary')
    Cacc = accuracy_score(tree_label, tree_prediction)

    output = {
        'Params': params,
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            }
        },
        'attachments': {
            'Task C': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'allfolds': perfold_result
        }
    }

    print("-- output")
    directory = save_path
    print(directory)
    with open(os.path.join(directory, 'output_' + fname + '.pkl'),
              'wb') as outfile:
        pickle.dump(output, outfile)
    return output
def eval_MTL3(params, data, fname):
    """Leave-one-event-out evaluation of the three-task (A + B + C) model.

    For every event folder in the fold list, a model is trained via
    ``training`` on all other events and evaluated on the held-out one.
    Predictions of all folds are pooled and scored: Task A over labels
    ``[0, 1, 2, 3]`` and Task B over labels ``[0, 1, 2]`` with macro / micro /
    per-class metrics, Task C with binary-averaged and per-class metrics.
    The result is pickled to ``output/output<fname>.pkl``.

    Args:
        params: hyper-parameter set, passed through to ``training``.
        data: ``'PHEME5'`` selects the five-event fold list; any other value
            selects the nine-event list.
        fname: suffix inserted into the name of the pickle file.

    Returns:
        dict with ``'Params'``, ``'TaskA'``, ``'TaskB'``, ``'TaskC'`` metric
        groups and ``'attachments'`` (pooled ids/labels/predictions and
        per-fold results under ``'allfolds'``).

    Raises:
        ValueError: if a fold has Task A labels but the id ``b'a'`` does not
            occur among its flattened tweet ids.
    """
    path = 'saved_data/saved_data_MTL3'
    if data == 'PHEME5':
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege'
        ]
    else:
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege', 'putinmissing', 'prince-toronto', 'gurlitt',
            'ebola-essien'
        ]
    max_branch_len = 25

    allfolds = []
    cv_ids_a, cv_prediction_a, cv_label_a = [], [], []
    cv_ids_b, cv_prediction_b, cv_label_b = [], [], []
    cv_ids_c, cv_prediction_c, cv_label_c = [], [], []

    for number, test in enumerate(folds):
        # Every event except the held-out one is used for training.
        train = folds[:number] + folds[number + 1:]

        x_train, ya_train, yb_train, yc_train = [], [], [], []
        for t in train:
            temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            temp_ya_train = np.load(
                os.path.join(path, t, 'fold_stance_labels.npy'))
            temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
            temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))
            # Pad/truncate features and per-step Task A labels to a common
            # branch length so events can be stacked.
            temp_x_train = pad_sequences(temp_x_train,
                                         maxlen=max_branch_len,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post',
                                         value=0.)
            temp_ya_train = pad_sequences(temp_ya_train,
                                          maxlen=max_branch_len,
                                          dtype='int32',
                                          padding='post',
                                          truncating='post',
                                          value=0.)
            x_train.extend(temp_x_train)
            ya_train.extend(temp_ya_train)
            yb_train.extend(temp_yb_train)
            yc_train.extend(temp_yc_train)
        x_train = np.asarray(x_train)
        ya_train = np.asarray(ya_train)
        yb_train = np.asarray(yb_train)
        yc_train = to_categorical(np.asarray(yc_train), num_classes=2)

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        ya_test = np.load(os.path.join(path, test, 'fold_stance_labels.npy'))
        yb_test = np.load(os.path.join(path, test, 'labels.npy'))
        yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
        ids_testA = np.load(os.path.join(path, test, 'tweet_ids.npy'))
        ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

        model = training(params, x_train, ya_train, yb_train, yc_train)
        (pred_probabilities_a, pred_probabilities_b,
         pred_probabilities_c) = model.predict(x_test, verbose=0)
        Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
        Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
        Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

        # ---- Task B ------------------------------------------------------
        # Scored only on branches whose label row has a non-zero entry;
        # all-zero rows presumably carry no Task B label (TODO confirm).
        maskB = np.any(yb_test, axis=1)
        ids_testB = np.asarray(list(compress(ids_testBC, maskB)))
        yb_test = np.asarray(list(compress(yb_test, maskB)))
        Y_pred_b = np.asarray(list(compress(Y_pred_b, maskB)))
        yb_test = np.argmax(yb_test, axis=1)

        trees, tree_prediction, tree_label = branch2treelabels(
            ids_testB, yb_test, Y_pred_b)

        # ---- Task C ------------------------------------------------------
        treesC, tree_predictionC, tree_labelC = branch2treelabels(
            ids_testBC, yc_test, Y_pred_c)

        # ---- Task A ------------------------------------------------------
        # Keep only branches that contain at least one non-zero label entry.
        maskA = np.any(np.any(ya_test, axis=2), axis=1)
        if np.any(maskA):
            Y_true_a = np.argmax(ya_test[maskA], axis=2)
            Y_pred_a = Y_pred_a[maskA]
            ids_testA = ids_testA[maskA]

            fids_test = ids_testA.flatten()
            fy_pred = Y_pred_a.flatten()
            fy_test = Y_true_a.flatten()
            # Each tweet id is scored once, at its first occurrence.
            uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
            uniqtwid = uniqtwid.tolist()
            uindices2 = uindices2.tolist()
            # The id b'a' is excluded from scoring -- presumably the padding
            # marker for short branches (TODO confirm).
            pad_position = uniqtwid.index(b'a')
            del uindices2[pad_position]
            del uniqtwid[pad_position]
            uniq_dev_prediction = [fy_pred[i] for i in uindices2]
            uniq_dev_label = [fy_test[i] for i in uindices2]
        else:
            uniq_dev_prediction = []
            uniq_dev_label = []
            uniqtwid = []

        perfold_result = {
            'Task A': {
                'ID': uniqtwid,
                'Label': uniq_dev_label,
                'Prediction': uniq_dev_prediction
            },
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'Task C': {
                'ID': treesC,
                'Label': tree_labelC,
                'Prediction': tree_predictionC
            }
        }

        cv_ids_c.extend(treesC)
        cv_prediction_c.extend(tree_predictionC)
        cv_label_c.extend(tree_labelC)
        cv_ids_b.extend(trees)
        cv_prediction_b.extend(tree_prediction)
        cv_label_b.extend(tree_label)
        cv_ids_a.extend(uniqtwid)
        cv_prediction_a.extend(uniq_dev_prediction)
        cv_label_a.extend(uniq_dev_label)
        allfolds.append(perfold_result)

    # Task C: the 'Macro' and 'Micro' slots both hold the binary-averaged
    # scores (as before); the identical call is now made only once.
    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='binary')
    Cmictest_P, Cmictest_R, Cmictest_F = Cmactest_P, Cmactest_R, Cmactest_F
    Ctest_P, Ctest_R, Ctest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c)
    Cacc = accuracy_score(cv_label_c, cv_prediction_c)

    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2])
    Bacc = accuracy_score(cv_label_b, cv_prediction_b)

    Amactest_P, Amactest_R, Amactest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3], average='macro')
    Amictest_P, Amictest_R, Amictest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3], average='micro')
    Atest_P, Atest_R, Atest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3])
    Aacc = accuracy_score(cv_label_a, cv_prediction_a)

    output = {
        'Params': params,
        'TaskA': {
            'accuracy': Aacc,
            'Macro': {
                'Macro_Precision': Amactest_P,
                'Macro_Recall': Amactest_R,
                'Macro_F_score': Amactest_F
            },
            'Micro': {
                'Micro_Precision': Amictest_P,
                'Micro_Recall': Amictest_R,
                'Micro_F_score': Amictest_F
            },
            'Per_class': {
                'Pclass_Precision': Atest_P,
                'Pclass_Recall': Atest_R,
                'Pclass_F_score': Atest_F
            }
        },
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            },
            'Micro': {
                'Micro_Precision': Cmictest_P,
                'Micro_Recall': Cmictest_R,
                'Micro_F_score': Cmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Ctest_P,
                'Pclass_Recall': Ctest_R,
                'Pclass_F_score': Ctest_F
            }
        },
        'attachments': {
            'Task A': {
                'ID': cv_ids_a,
                'Label': cv_label_a,
                'Prediction': cv_prediction_a
            },
            'Task B': {
                'ID': cv_ids_b,
                'Label': cv_label_b,
                'Prediction': cv_prediction_b
            },
            'Task C': {
                'ID': cv_ids_c,
                'Label': cv_label_c,
                'Prediction': cv_prediction_c
            },
            'allfolds': allfolds
        }
    }

    directory = "output"
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(directory, exist_ok=True)
    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)
    return output
def objective_MTL3_CV9(params):
    """Objective for hyper-parameter search of the three-task model.

    Trains via ``training`` on seven event folders and evaluates on the
    held-out ``'charliehebdo'`` event.

    Args:
        params: hyper-parameter set, passed through to ``training``.

    Returns:
        dict with ``'loss'`` -- the sum of (1 - F1) over Task A (macro,
        labels 0..3), Task B (macro, labels 0..2) and Task C (binary) --
        plus ``'Params'`` and ``'status'``. Task A contributes an F1 of 0
        when the test event has no Task A labels.

    Raises:
        ValueError: if Task A labels exist but the id ``b'a'`` does not occur
            among the flattened tweet ids.
    """
    path = 'saved_data/saved_data_MTL3'
    train = [
        'ferguson', 'ottawashooting', 'sydneysiege', 'putinmissing',
        'prince-toronto', 'gurlitt', 'ebola-essien'
    ]
    test = 'charliehebdo'
    max_branch_len = 25

    x_train, ya_train, yb_train, yc_train = [], [], [], []
    for t in train:
        temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
        temp_ya_train = np.load(
            os.path.join(path, t, 'fold_stance_labels.npy'))
        temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
        temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))
        # Pad/truncate features and per-step Task A labels to a common
        # branch length so events can be stacked.
        temp_x_train = pad_sequences(temp_x_train,
                                     maxlen=max_branch_len,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        temp_ya_train = pad_sequences(temp_ya_train,
                                      maxlen=max_branch_len,
                                      dtype='int32',
                                      padding='post',
                                      truncating='post',
                                      value=0.)
        x_train.extend(temp_x_train)
        ya_train.extend(temp_ya_train)
        yb_train.extend(temp_yb_train)
        yc_train.extend(temp_yc_train)
    x_train = np.asarray(x_train)
    ya_train = np.asarray(ya_train)
    yb_train = np.asarray(yb_train)
    yc_train = to_categorical(np.asarray(yc_train), num_classes=2)

    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    ya_test = np.load(os.path.join(path, test, 'fold_stance_labels.npy'))
    yb_test = np.load(os.path.join(path, test, 'labels.npy'))
    yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
    ids_testA = np.load(os.path.join(path, test, 'tweet_ids.npy'))
    ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

    model = training(params, x_train, ya_train, yb_train, yc_train)
    (pred_probabilities_a, pred_probabilities_b,
     pred_probabilities_c) = model.predict(x_test, verbose=0)
    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
    Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

    # ---- Task B ----------------------------------------------------------
    # Scored only on branches whose label row has a non-zero entry; all-zero
    # rows presumably carry no Task B label (TODO confirm).
    maskB = np.any(yb_test, axis=1)
    ids_testB = np.asarray(list(compress(ids_testBC, maskB)))
    yb_test = np.asarray(list(compress(yb_test, maskB)))
    Y_pred_b = np.asarray(list(compress(Y_pred_b, maskB)))
    yb_test = np.argmax(yb_test, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, yb_test, Y_pred_b)
    mactest_F_b = f1_score(tree_label,
                           tree_prediction,
                           average='macro',
                           labels=[0, 1, 2])

    # ---- Task C ----------------------------------------------------------
    treesC, tree_predictionC, tree_labelC = branch2treelabels(
        ids_testBC, yc_test, Y_pred_c)
    mactest_F_c = f1_score(tree_labelC, tree_predictionC, average='binary')

    # ---- Task A ----------------------------------------------------------
    # Keep only branches that contain at least one non-zero label entry.
    maskA = np.any(np.any(ya_test, axis=2), axis=1)
    if np.any(maskA):
        Y_true_a = np.argmax(ya_test[maskA], axis=2)
        Y_pred_a = Y_pred_a[maskA]
        ids_testA = ids_testA[maskA]

        fids_test = ids_testA.flatten()
        fy_pred = Y_pred_a.flatten()
        fy_test = Y_true_a.flatten()
        # Each tweet id is scored once, at its first occurrence.
        uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
        uindices2 = uindices2.tolist()
        # The id b'a' is excluded from scoring -- presumably the padding
        # marker for short branches (TODO confirm).
        del uindices2[uniqtwid.tolist().index(b'a')]
        uniq_dev_prediction = [fy_pred[i] for i in uindices2]
        uniq_dev_label = [fy_test[i] for i in uindices2]
        # scikit-learn's signature is f1_score(y_true, y_pred): labels first.
        mactest_F_a = f1_score(uniq_dev_label,
                               uniq_dev_prediction,
                               average='macro',
                               labels=[0, 1, 2, 3])
    else:
        # No Task A labels in the test event: count the task as fully wrong.
        mactest_F_a = 0

    return {
        'loss': (1 - mactest_F_a) + (1 - mactest_F_b) + (1 - mactest_F_c),
        'Params': params,
        'status': STATUS_OK,
    }