def objective_function_veracity_branchLSTM_RumEv(params):
    x_train = np.load(
        os.path.join('preprocessing/saved_dataRumEval2019',
                     'train/train_array.npy'))
    y_train = np.load(
        os.path.join('preprocessing/saved_dataRumEval2019',
                     'train/labels.npy'))
    y_train = to_categorical(y_train, num_classes=None)
    x_test = np.load(
        os.path.join('preprocessing/saved_dataRumEval2019',
                     'dev/train_array.npy'))
    y_test = np.load(
        os.path.join('preprocessing/saved_dataRumEval2019', 'dev/labels.npy'))
    ids_test = np.load(
        os.path.join('preprocessing/saved_dataRumEval2019', 'dev/ids.npy'))
    y_pred, confidence = LSTM_model_veracity(x_train, y_train, x_test, params)
    trees, tree_prediction, tree_label, _ = branch2treelabels(
        ids_test, y_test, y_pred, confidence)
    mactest_F = f1_score(tree_label, tree_prediction, average='macro')
    output = {
        'loss': 1 - mactest_F,
        'Params': params,
        'status': STATUS_OK,
        'attachments': {
            'ID': trees,
            'Predictions': tree_prediction,
            'Labels': tree_label
        }
    }
    return output
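Each of these objective functions returns the dict shape hyperopt expects (loss, status, optional attachments), so they can be handed directly to hyperopt's fmin. Below is a minimal sketch of such a search; the space and parameter names (num_lstm_units, num_dense_units, learn_rate, mb_size, num_epochs) are illustrative assumptions and must match whatever LSTM_model_veracity actually reads from params.

from hyperopt import fmin, tpe, hp, Trials

# hypothetical search space; keys must match what the model-building code expects
search_space = {
    'num_lstm_units': hp.choice('num_lstm_units', [100, 200, 300]),
    'num_dense_units': hp.choice('num_dense_units', [200, 300, 400]),
    'learn_rate': hp.loguniform('learn_rate', -6, -2),
    'mb_size': hp.choice('mb_size', [32, 64]),
    'num_epochs': hp.choice('num_epochs', [30, 50]),
}

trials = Trials()
best = fmin(objective_function_veracity_branchLSTM_RumEv,
            space=search_space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print(best)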
Example 2
def objective_function_veracity_branchLSTM_fullPHEME(params):
    path = 'saved_data_fullPHEME'

    train = [
        'ebola-essien', 'ferguson', 'gurlitt', 'ottawashooting',
        'prince-toronto', 'putinmissing', 'sydneysiege'
    ]

    test = 'charliehebdo'
    max_branch_len = 25
    x_train = []
    y_train = []

    for t in train:
        temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
        temp_y_train = np.load(os.path.join(path, t, 'labels.npy'))

        temp_x_train = pad_sequences(temp_x_train,
                                     maxlen=max_branch_len,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)

        x_train.extend(temp_x_train)
        y_train.extend(temp_y_train)

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)

    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))
    y_train = to_categorical(y_train, num_classes=None)

    model = heteroscedastic_model(x_train, y_train, params, output_classes=3)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    verbose = False
    predictions_test = model.predict(x_test,
                                     batch_size=mb_size,
                                     verbose=verbose)
    # the model has two outputs; the second is taken as the softmax head
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)

    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    output = {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
    return output
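branch2treelabels (not shown in this listing) aggregates per-branch predictions back to one label per conversation tree. A hedged sketch of the assumed behaviour, grouping branches by tree id and taking a majority vote; the real implementation may break ties or handle the confidence values differently:

import numpy as np
from collections import Counter

def branch2treelabels_sketch(ids, y_true, y_pred):
    # ids holds one tree id per branch; all branches of a tree share its gold label
    trees = np.unique(ids)
    tree_label = np.zeros(len(trees), dtype=int)
    tree_prediction = np.zeros(len(trees), dtype=int)
    for i, tree in enumerate(trees):
        mask = ids == tree
        tree_label[i] = y_true[mask][0]
        # tree-level prediction = majority vote over branch predictions
        tree_prediction[i] = Counter(y_pred[mask]).most_common(1)[0][0]
    return trees, tree_prediction, tree_label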
Example 3
def objective_function_branchLSTM_Twitter15(params):
    path = 'preprocessing/saved_data_15'

    #   fold 0 is development set

    train = '0/train'
    test = '0/test'

    max_branch_len = 25

    temp_x_train = np.load(os.path.join(path, train, 'train_array.npy'))
    y_train = np.load(os.path.join(path, train, 'labels.npy'))

    #   pad sequences to the size of the largest
    x_train = pad_sequences(temp_x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)

    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    y_test = np.load(os.path.join(path, test, 'labels.npy'))
    ids_test = np.load(os.path.join(path, test, 'ids.npy'))

    y_train = to_categorical(y_train, num_classes=None)

    model = heteroscedastic_model(x_train, y_train, params, output_classes=4)
    mb_size = params['mb_size']
    num_epochs = params['num_epochs']
    model.fit(x_train, [y_train, y_train],
              batch_size=mb_size,
              epochs=num_epochs,
              shuffle=False,
              class_weight=None)

    verbose = False
    predictions_test = model.predict(x_test,
                                     batch_size=mb_size,
                                     verbose=verbose)
    softmax_test = predictions_test[1]
    y_pred = np.argmax(softmax_test, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_test, y_test, y_pred)

    mactest_F = f1_score(tree_label, tree_prediction, average='macro')

    output = {'loss': 1 - mactest_F, 'Params': params, 'status': STATUS_OK}
    return output
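The padding convention above (post-padding and post-truncating to max_branch_len) is worth seeing on a toy input; the feature dimension of 3 here is purely illustrative:

import numpy as np
from keras.preprocessing.sequence import pad_sequences

# two branches of 4 and 30 time steps, 3 features each
branches = [np.ones((4, 3)), np.ones((30, 3))]
padded = pad_sequences(branches, maxlen=25, dtype='float32',
                       padding='post', truncating='post', value=0.)
print(padded.shape)  # (2, 25, 3): short branches zero-padded at the end,
                     # long ones cut off at the end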
Example 4
def objective_MTL2_detection_CV5(params):
    max_branch_len = 25

    # train_path and holdout_path are expected to be defined at module level
    print("Train ", train_path)
    print("Holdout ", holdout_path)
    print("")
    x_train = np.load(os.path.join(train_path, 'train_array.npy'))
    y_train = np.load(os.path.join(train_path, 'rnr_labels.npy'))
    x_train = pad_sequences(x_train,
                            maxlen=max_branch_len,
                            dtype='float32',
                            padding='post',
                            truncating='post',
                            value=0.)
    x_train = np.asarray(x_train)
    print("X train shape ", x_train.shape)
    y_train = np.asarray(y_train)
    print("y train ", y_train.shape)
    y_train = to_categorical(y_train, num_classes=2)
    print("")
    x_holdout = np.load(os.path.join(holdout_path, 'train_array.npy'))
    print("X holdout shape ", x_holdout.shape)
    print("")
    y_holdout = np.load(os.path.join(holdout_path, 'rnr_labels.npy'))
    ids_holdout = np.load(os.path.join(holdout_path, 'ids.npy'))
    start = time.time()
    model = training(params, x_train, y_train)
    end = time.time()
    print("** Elapsed time ", end - start)
    print("")
    pred_probabilities = model.predict(x_holdout, verbose=0)

    Y_pred = np.argmax(pred_probabilities, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(ids_holdout,
                                                           y_holdout,
                                                           Y_pred)

    mactest_F = f1_score(tree_label,
                         tree_prediction,
                         average='binary')

    output = {
        'loss': 1 - mactest_F,
        'Params': params,
        'status': STATUS_OK,
    }

    return output
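objective_MTL2_detection_CV5 reads train_path and holdout_path from module scope rather than taking them as arguments. A hypothetical setup for one cross-validation fold might look like this (the directory layout is an assumption):

import os

fold = 0
train_path = os.path.join('saved_data', 'cv_folds', str(fold), 'train')
holdout_path = os.path.join('saved_data', 'cv_folds', str(fold), 'holdout')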
def main():
    x_train = np.load(
        os.path.join('Feature_Engineering/Saved_dataset_sdqc',
                     'train/train_array.npy'))
    y_train = np.load(
        os.path.join('Feature_Engineering/Saved_dataset_sdqc',
                     'train/labels.npy'))
    y_train = to_categorical(y_train, num_classes=None)
    x_test = np.load(
        os.path.join('Feature_Engineering/Saved_dataset_sdqc',
                     'dev/train_array.npy'))
    y_test = np.load(
        os.path.join('Feature_Engineering/Saved_dataset_sdqc',
                     'dev/labels.npy'))
    ids_test = np.load(
        os.path.join('Feature_Engineering/Saved_dataset_sdqc', 'dev/ids.npy'))
    y_pred, confidence = LSTM_model_veracity(x_train, y_train, x_test)
    trees, tree_prediction, tree_label, veracity_confidence = branch2treelabels(
        ids_test, y_test, y_pred, confidence)
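    # fold class 2 (assumed to be the "unverified" class) into class 1 with
    # zero confidence, so the report below is binary true/false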
    for i in range(len(veracity_confidence)):
        if tree_prediction[i] == 2:
            veracity_confidence[i] = 0
            tree_prediction[i] = 1
        if tree_label[i] == 2:
            tree_label[i] = 1
    dummy = []
    save_output(dummy, dummy, trees, tree_prediction, veracity_confidence)
    # print("Charliehebdo event is the Test Data")
    print(trees)
    print("Predictions", tree_prediction)
    print("Labels", tree_label)
    print("Confidence", veracity_confidence)

    print("Accuracy :", accuracy_score(tree_label, tree_prediction))
    from sklearn.metrics import classification_report
    print(
        classification_report(tree_label,
                              tree_prediction,
                              target_names=["true", "false"]))

    import scikitplot as skplt
    import matplotlib.pyplot as plt
    skplt.metrics.plot_confusion_matrix(tree_label, tree_prediction)
    plt.show()
def objective_MTL2_RumEval(params):

    path = 'saved_data/saved_data_RumEv'

    x_train = np.load(os.path.join(path, 'train/train_array.npy'))
    y_trainA = np.load(os.path.join(path, 'train/fold_stance_labels.npy'))
    y_trainB = np.load(os.path.join(path, 'train/labels.npy'))
    y_trainB = to_categorical(y_trainB, num_classes=3)
    y_train_cat = y_trainA
    x_test = np.load(os.path.join(path, 'dev/train_array.npy'))
    y_testA = np.load(os.path.join(path, 'dev/fold_stance_labels.npy'))
    y_testB = np.load(os.path.join(path, 'dev/labels.npy'))
    ids_testA = np.load(os.path.join(path, 'dev/tweet_ids.npy'))
    ids_testB = np.load(os.path.join(path, 'dev/ids.npy'))

    model = training(params, x_train, y_train_cat, y_trainB)
    pred_probabilities_a, pred_probabilities_b = model.predict(x_test,
                                                               verbose=0)
    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, y_testB, Y_pred_b)
    mactest_F_b = f1_score(tree_label, tree_prediction, average='macro')
    fids_test = ids_testA.flatten()
    fy_pred = Y_pred_a.flatten()
    Y_test_A = np.argmax(y_testA, axis=2)
    fy_test = Y_test_A.flatten()
    uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
    uniqtwid = uniqtwid.tolist()
    uindices2 = uindices2.tolist()
    # drop the first entry, assumed to be the padding id (np.unique sorts it first)
    del uniqtwid[0]
    del uindices2[0]
    uniq_dev_prediction = [fy_pred[i] for i in uindices2]
    uniq_dev_label = [fy_test[i] for i in uindices2]
    mactest_F_a = f1_score(uniq_dev_label,
                           uniq_dev_prediction,
                           average='macro')
    output = {
        'loss': (1 - mactest_F_a) + (1 - mactest_F_b),
        'Params': params,
        'status': STATUS_OK
    }
    return output
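Because branches overlap, the same tweet can appear in several branches; the block above de-duplicates per-tweet stance predictions by keeping the first occurrence of each id and then discarding the padding entry. A toy illustration of the np.unique step:

import numpy as np

ids = np.array(['pad', 't1', 't2', 't1', 't3'])
uniq, first_idx = np.unique(ids, return_index=True)
print(uniq)       # ['pad' 't1' 't2' 't3']  (sorted; the padding id happens to sort first)
print(first_idx)  # [0 1 2 4]
# the code above then deletes entry 0, i.e. the padding id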
def eval_MTL2_RumEval(params, fname):

    path = 'saved_data/saved_data_RumEv'

    x_train = np.load(os.path.join(path, 'train/train_array.npy'))
    y_trainA = np.load(os.path.join(path, 'train/fold_stance_labels.npy'))
    y_trainB = np.load(os.path.join(path, 'train/labels.npy'))
    y_trainB = to_categorical(y_trainB, num_classes=3)

    x_dev = np.load(os.path.join(path, 'dev/train_array.npy'))
    y_devA = np.load(os.path.join(path, 'dev/fold_stance_labels.npy'))
    y_devB = np.load(os.path.join(path, 'dev/labels.npy'))
    y_devB = to_categorical(y_devB, num_classes=3)

    x_test = np.load(os.path.join(path, 'test/train_array.npy'))
    y_testA = np.load(os.path.join(path, 'test/fold_stance_labels.npy'))
    y_testB = np.load(os.path.join(path, 'test/labels.npy'))

    ids_testA = np.load(os.path.join(path, 'test/tweet_ids.npy'))
    ids_testB = np.load(os.path.join(path, 'test/ids.npy'))

    x_dev = pad_sequences(x_dev,
                          maxlen=len(x_train[0]),
                          dtype='float32',
                          padding='post',
                          truncating='post',
                          value=0.)
    y_devA = pad_sequences(y_devA,
                           maxlen=len(y_trainA[0]),
                           dtype='float32',
                           padding='post',
                           truncating='post',
                           value=0.)

    x_train = np.concatenate((x_train, x_dev), axis=0)
    y_trainA = np.concatenate((y_trainA, y_devA), axis=0)
    y_trainB = np.concatenate((y_trainB, y_devB), axis=0)
    y_train_cat = y_trainA

    model = training(params, x_train, y_train_cat, y_trainB)

    pred_probabilities_a, pred_probabilities_b = model.predict(x_test,
                                                               verbose=0)

    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, y_testB, Y_pred_b)

    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction, average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction, average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        tree_label, tree_prediction)
    Bacc = accuracy_score(tree_label, tree_prediction)
    fids_test = ids_testA.flatten()
    fy_pred = Y_pred_a.flatten()
    Y_test_A = np.argmax(y_testA, axis=2)
    fy_test = Y_test_A.flatten()
    uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
    uniqtwid = uniqtwid.tolist()
    uindices2 = uindices2.tolist()
    del uniqtwid[0]
    del uindices2[0]
    uniq_dev_prediction = [fy_pred[i] for i in uindices2]
    uniq_dev_label = [fy_test[i] for i in uindices2]

    Amactest_P, Amactest_R, Amactest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction, average='macro')
    Amictest_P, Amictest_R, Amictest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction, average='micro')
    Atest_P, Atest_R, Atest_F, _ = precision_recall_fscore_support(
        uniq_dev_label, uniq_dev_prediction)
    Aacc = accuracy_score(uniq_dev_label, uniq_dev_prediction)

    output = {
        'Params': params,
        'TaskA': {
            'accuracy': Aacc,
            'Macro': {
                'Macro_Precision': Amactest_P,
                'Macro_Recall': Amactest_R,
                'Macro_F_score': Amactest_F
            },
            'Micro': {
                'Micro_Precision': Amictest_P,
                'Micro_Recall': Amictest_R,
                'Micro_F_score': Amictest_F
            },
            'Per_class': {
                'Pclass_Precision': Atest_P,
                'Pclass_Recall': Atest_R,
                'Pclass_F_score': Atest_F
            }
        },
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'attachments': {
            'Task A': {
                'ID': uniqtwid,
                'Label': uniq_dev_label,
                'Prediction': uniq_dev_prediction
            },
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction,
                'Branch': {
                    'ID': ids_testB,
                    'Label': y_testB,
                    'Prediction': Y_pred_b
                }
            }
        }
    }
    directory = "output"
    if not os.path.exists(directory):
        os.mkdir(directory)

    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)

    return output
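The 'Per_class' entries come from calling precision_recall_fscore_support without an average argument, which returns one value per class rather than a single scalar:

from sklearn.metrics import precision_recall_fscore_support

p, r, f, support = precision_recall_fscore_support([0, 1, 2, 1], [0, 2, 2, 1])
print(p)        # one precision per class
print(support)  # number of true instances per class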
def eval_MTL2_detection_CV(params, data, fname):

    path = 'saved_data/saved_data_MTL2_detection'

    if data == 'PHEME5':
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege'
        ]
    else:
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege', 'putinmissing', 'prince-toronto', 'gurlitt',
            'ebola-essien'
        ]

    allfolds = []
    cv_ids_b = []
    cv_prediction_b = []
    cv_label_b = []

    cv_ids_c = []
    cv_prediction_c = []
    cv_label_c = []

    for number in range(len(folds)):

        test = folds[number]
        print(test)
        train = deepcopy(folds)
        del train[number]

        max_branch_len = 25
        x_train = []
        yb_train = []
        yc_train = []

        for t in train:
            temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
            temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))

            temp_x_train = pad_sequences(temp_x_train,
                                         maxlen=max_branch_len,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post',
                                         value=0.)
            x_train.extend(temp_x_train)
            yb_train.extend(temp_yb_train)
            yc_train.extend(temp_yc_train)

        x_train = np.asarray(x_train)
        yb_train = np.asarray(yb_train)
        yc_train = np.asarray(yc_train)
        yc_train = to_categorical(yc_train, num_classes=2)

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        yb_test = np.load(os.path.join(path, test, 'labels.npy'))
        yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
        ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

        model = training(params, x_train, yb_train, yc_train)

        pred_probabilities_b, pred_probabilities_c = model.predict(x_test,
                                                                   verbose=0)

        Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
        Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

        maskB = np.any(yb_test, axis=1)
        ids_testB = list(compress(ids_testBC, maskB))
        yb_test = list(compress(yb_test, maskB))
        Y_pred_b = list(compress(Y_pred_b, maskB))

        ids_testB = np.asarray(ids_testB)
        yb_test = np.asarray(yb_test)
        Y_pred_b = np.asarray(Y_pred_b)
        yb_test = np.argmax(yb_test, axis=1)
        trees, tree_prediction, tree_label = branch2treelabels(
            ids_testB, yb_test, Y_pred_b)
        treesC, tree_predictionC, tree_labelC = branch2treelabels(
            ids_testBC, yc_test, Y_pred_c)

        perfold_result = {
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'Task C': {
                'ID': treesC,
                'Label': tree_labelC,
                'Prediction': tree_predictionC
            }
        }
        cv_ids_c.extend(treesC)
        cv_prediction_c.extend(tree_predictionC)
        cv_label_c.extend(tree_labelC)

        cv_ids_b.extend(trees)
        cv_prediction_b.extend(tree_prediction)
        cv_label_b.extend(tree_label)

        allfolds.append(perfold_result)

    # task C is binary (rumour vs non-rumour), so binary averaging reports the
    # positive-class scores
    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='binary')
    Cmictest_P, Cmictest_R, Cmictest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='micro')
    Ctest_P, Ctest_R, Ctest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c)
    Cacc = accuracy_score(cv_label_c, cv_prediction_c)

    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2])
    Bacc = accuracy_score(cv_label_b, cv_prediction_b)

    output = {
        'Params': params,
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            },
            'Micro': {
                'Micro_Precision': Cmictest_P,
                'Micro_Recall': Cmictest_R,
                'Micro_F_score': Cmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Ctest_P,
                'Pclass_Recall': Ctest_R,
                'Pclass_F_score': Ctest_F
            }
        },
        'attachments': {
            'Task B': {
                'ID': cv_ids_b,
                'Label': cv_label_b,
                'Prediction': cv_prediction_b
            },
            'Task C': {
                'ID': cv_ids_c,
                'Label': cv_label_c,
                'Prediction': cv_prediction_c
            },
            'allfolds': allfolds
        }
    }
    directory = "output"
    if not os.path.exists(directory):
        os.mkdir(directory)
    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)

    return output
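Task B (veracity) is only defined for rumours; non-rumour branches carry an all-zero one-hot veracity label, so np.any(..., axis=1) masks them out before scoring, as in the folds above. A toy version of that filter:

import numpy as np
from itertools import compress

yb = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 1]])   # branch 0 is a non-rumour
mask = np.any(yb, axis=1)                           # [False, True, True]
print(list(compress(['b0', 'b1', 'b2'], mask)))     # ['b1', 'b2']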
def objective_MTL2_detection_CV9(params):

    path = 'saved_data/saved_data_MTL2_detection'

    train = [
        'ferguson', 'ottawashooting', 'sydneysiege', 'putinmissing',
        'prince-toronto', 'gurlitt', 'ebola-essien'
    ]
    test = 'charliehebdo'

    max_branch_len = 25
    x_train = []

    yb_train = []
    yc_train = []

    for t in train:
        temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))

        temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
        temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))

        temp_x_train = pad_sequences(temp_x_train,
                                     maxlen=max_branch_len,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        x_train.extend(temp_x_train)
        yb_train.extend(temp_yb_train)
        yc_train.extend(temp_yc_train)

    x_train = np.asarray(x_train)
    yb_train = np.asarray(yb_train)
    yc_train = np.asarray(yc_train)
    yc_train = to_categorical(yc_train, num_classes=2)
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    yb_test = np.load(os.path.join(path, test, 'labels.npy'))
    yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))

    ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

    model = training(params, x_train, yb_train, yc_train)

    pred_probabilities_b, pred_probabilities_c = model.predict(x_test,
                                                               verbose=0)

    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
    Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

    maskB = np.any(yb_test, axis=1)
    ids_testB = list(compress(ids_testBC, maskB))
    yb_test = list(compress(yb_test, maskB))
    Y_pred_b = list(compress(Y_pred_b, maskB))

    ids_testB = np.asarray(ids_testB)
    yb_test = np.asarray(yb_test)
    Y_pred_b = np.asarray(Y_pred_b)
    yb_test = np.argmax(yb_test, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, yb_test, Y_pred_b)

    treesC, tree_predictionC, tree_labelC = branch2treelabels(
        ids_testBC, yc_test, Y_pred_c)

    mactest_F_b = f1_score(tree_label,
                           tree_prediction,
                           average='macro',
                           labels=[0, 1, 2])
    mactest_F_c = f1_score(tree_labelC, tree_predictionC, average='binary')

    output = {
        'loss': (1 - mactest_F_b) + (1 - mactest_F_c),
        'Params': params,
        'status': STATUS_OK,
    }

    return output
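to_categorical turns the integer rumour/non-rumour labels into one-hot rows, matching the two-unit softmax head assumed for task C:

from keras.utils import to_categorical

print(to_categorical([0, 1, 1], num_classes=2))
# [[1. 0.]
#  [0. 1.]
#  [0. 1.]]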
Example 10
def eval_MTL2_detection_CV(params, data, fname):
    # train_path, test_path and save_path are expected to be defined at module
    # level; the data argument is unused in this variant
    max_branch_len = 25
    x_train = np.load(os.path.join(train_path, 'train_array.npy'))
    y_train = np.load(os.path.join(train_path, 'rnr_labels.npy'))
    x_train = pad_sequences(x_train, maxlen=max_branch_len,
                            dtype='float32', padding='post',
                            truncating='post', value=0.)
    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    y_train = to_categorical(y_train, num_classes=2)

    x_test = np.load(os.path.join(test_path, 'train_array.npy'))
    y_test = np.load(os.path.join(test_path, 'rnr_labels.npy'))
    ids_test = np.load(os.path.join(test_path, 'ids.npy'))


    model = training(params, x_train, y_train)
    model.save(os.path.join(save_path, 'model_' + fname + '.h5'))
    # reload from disk so the evaluated model is the persisted artefact
    del model
    model = load_model(os.path.join(save_path, 'model_' + fname + '.h5'))

    pred_probabilities = model.predict(x_test, verbose=0)

    Y_pred = np.argmax(pred_probabilities, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(ids_test,
                                                           y_test,
                                                           Y_pred)

    perfold_result = {
        'Task C': {
            'ID': trees,
            'Label': tree_label,
            'Prediction': tree_prediction
        }
    }


    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        tree_label,
        tree_prediction,
        average='binary')

    Cacc = accuracy_score(tree_label, tree_prediction)

    output = {
        'Params': params,
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            }
        },
        'attachments': {
            'Task C': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'allfolds': perfold_result
        }
    }
    print("-- output")

    directory = save_path

    print(directory)
    os.makedirs(directory, exist_ok=True)
    with open(os.path.join(directory, 'output_' + fname + '.pkl'), 'wb') as outfile:
        pickle.dump(output, outfile)

    return output
Example 11
def eval_MTL3(params, data, fname):

    path = 'saved_data/saved_data_MTL3'

    if data == 'PHEME5':
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege'
        ]
    else:
        folds = [
            'charliehebdo', 'germanwings-crash', 'ferguson', 'ottawashooting',
            'sydneysiege', 'putinmissing', 'prince-toronto', 'gurlitt',
            'ebola-essien'
        ]

    allfolds = []

    cv_ids_b = []
    cv_prediction_b = []
    cv_label_b = []

    cv_ids_a = []
    cv_prediction_a = []
    cv_label_a = []

    cv_ids_c = []
    cv_prediction_c = []
    cv_label_c = []

    for number in range(len(folds)):

        test = folds[number]
        train = deepcopy(folds)
        del train[number]

        max_branch_len = 25
        x_train = []
        ya_train = []
        yb_train = []
        yc_train = []

        for t in train:
            temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
            temp_ya_train = np.load(
                os.path.join(path, t, 'fold_stance_labels.npy'))
            temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
            temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))

            # pad sequences to the size of the largest
            temp_x_train = pad_sequences(temp_x_train,
                                         maxlen=max_branch_len,
                                         dtype='float32',
                                         padding='post',
                                         truncating='post',
                                         value=0.)
            temp_ya_train = pad_sequences(temp_ya_train,
                                          maxlen=max_branch_len,
                                          dtype='int32',
                                          padding='post',
                                          truncating='post',
                                          value=0.)

            x_train.extend(temp_x_train)
            ya_train.extend(temp_ya_train)
            yb_train.extend(temp_yb_train)
            yc_train.extend(temp_yc_train)

        x_train = np.asarray(x_train)
        ya_train = np.asarray(ya_train)
        yb_train = np.asarray(yb_train)
        yc_train = np.asarray(yc_train)
        yc_train = to_categorical(yc_train, num_classes=2)

        x_test = np.load(os.path.join(path, test, 'train_array.npy'))
        ya_test = np.load(os.path.join(path, test, 'fold_stance_labels.npy'))
        yb_test = np.load(os.path.join(path, test, 'labels.npy'))
        yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))

        ids_testA = np.load(os.path.join(path, test, 'tweet_ids.npy'))
        ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

        model = training(params, x_train, ya_train, yb_train, yc_train)

        pred_probabilities_a, pred_probabilities_b, pred_probabilities_c = model.predict(
            x_test, verbose=0)
        Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
        Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
        Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

        maskB = np.any(yb_test, axis=1)
        ids_testB = list(compress(ids_testBC, maskB))
        yb_test = list(compress(yb_test, maskB))
        Y_pred_b = list(compress(Y_pred_b, maskB))

        ids_testB = np.asarray(ids_testB)
        yb_test = np.asarray(yb_test)
        Y_pred_b = np.asarray(Y_pred_b)
        yb_test = np.argmax(yb_test, axis=1)
        trees, tree_prediction, tree_label = branch2treelabels(
            ids_testB, yb_test, Y_pred_b)
        treesC, tree_predictionC, tree_labelC = branch2treelabels(
            ids_testBC, yc_test, Y_pred_c)

        maskA = np.any(np.any(ya_test, axis=2), axis=1)

        if np.any(maskA):

            Y_true_a = ya_test[maskA]
            Y_true_a = np.argmax(Y_true_a, axis=2)
            Y_pred_a = Y_pred_a[maskA]
            ids_testA = ids_testA[maskA]

            fids_test = ids_testA.flatten()
            fy_pred = Y_pred_a.flatten()
            fy_test = Y_true_a.flatten()

            uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
            uniqtwid = uniqtwid.tolist()
            uindices2 = uindices2.tolist()
            # drop the b'a' entry, used as the padding tweet id
            del uindices2[uniqtwid.index(b'a')]
            del uniqtwid[uniqtwid.index(b'a')]

            uniq_dev_prediction = [fy_pred[i] for i in uindices2]
            uniq_dev_label = [fy_test[i] for i in uindices2]

        else:

            uniq_dev_prediction = []
            uniq_dev_label = []
            uniqtwid = []

        perfold_result = {
            'Task A': {
                'ID': uniqtwid,
                'Label': uniq_dev_label,
                'Prediction': uniq_dev_prediction
            },
            'Task B': {
                'ID': trees,
                'Label': tree_label,
                'Prediction': tree_prediction
            },
            'Task C': {
                'ID': treesC,
                'Label': tree_labelC,
                'Prediction': tree_predictionC
            }
        }

        cv_ids_c.extend(treesC)
        cv_prediction_c.extend(tree_predictionC)
        cv_label_c.extend(tree_labelC)

        cv_ids_b.extend(trees)
        cv_prediction_b.extend(tree_prediction)
        cv_label_b.extend(tree_label)

        cv_ids_a.extend(uniqtwid)
        cv_prediction_a.extend(uniq_dev_prediction)
        cv_label_a.extend(uniq_dev_label)

        allfolds.append(perfold_result)

    # task C is binary (rumour vs non-rumour), so binary averaging reports the
    # positive-class scores
    Cmactest_P, Cmactest_R, Cmactest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='binary')
    Cmictest_P, Cmictest_R, Cmictest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c, average='micro')
    Ctest_P, Ctest_R, Ctest_F, _ = precision_recall_fscore_support(
        cv_label_c, cv_prediction_c)
    Cacc = accuracy_score(cv_label_c, cv_prediction_c)

    Bmactest_P, Bmactest_R, Bmactest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='macro')
    Bmictest_P, Bmictest_R, Bmictest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2], average='micro')
    Btest_P, Btest_R, Btest_F, _ = precision_recall_fscore_support(
        cv_label_b, cv_prediction_b, labels=[0, 1, 2])
    Bacc = accuracy_score(cv_label_b, cv_prediction_b)

    Amactest_P, Amactest_R, Amactest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3], average='macro')
    Amictest_P, Amictest_R, Amictest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3], average='micro')
    Atest_P, Atest_R, Atest_F, _ = precision_recall_fscore_support(
        cv_label_a, cv_prediction_a, labels=[0, 1, 2, 3])
    Aacc = accuracy_score(cv_label_a, cv_prediction_a)

    output = {
        'Params': params,
        'TaskA': {
            'accuracy': Aacc,
            'Macro': {
                'Macro_Precision': Amactest_P,
                'Macro_Recall': Amactest_R,
                'Macro_F_score': Amactest_F
            },
            'Micro': {
                'Micro_Precision': Amictest_P,
                'Micro_Recall': Amictest_R,
                'Micro_F_score': Amictest_F
            },
            'Per_class': {
                'Pclass_Precision': Atest_P,
                'Pclass_Recall': Atest_R,
                'Pclass_F_score': Atest_F
            }
        },
        'TaskB': {
            'accuracy': Bacc,
            'Macro': {
                'Macro_Precision': Bmactest_P,
                'Macro_Recall': Bmactest_R,
                'Macro_F_score': Bmactest_F
            },
            'Micro': {
                'Micro_Precision': Bmictest_P,
                'Micro_Recall': Bmictest_R,
                'Micro_F_score': Bmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Btest_P,
                'Pclass_Recall': Btest_R,
                'Pclass_F_score': Btest_F
            }
        },
        'TaskC': {
            'accuracy': Cacc,
            'Macro': {
                'Macro_Precision': Cmactest_P,
                'Macro_Recall': Cmactest_R,
                'Macro_F_score': Cmactest_F
            },
            'Micro': {
                'Micro_Precision': Cmictest_P,
                'Micro_Recall': Cmictest_R,
                'Micro_F_score': Cmictest_F
            },
            'Per_class': {
                'Pclass_Precision': Ctest_P,
                'Pclass_Recall': Ctest_R,
                'Pclass_F_score': Ctest_F
            }
        },
        'attachments': {
            'Task A': {
                'ID': cv_ids_a,
                'Label': cv_label_a,
                'Prediction': cv_prediction_a
            },
            'Task B': {
                'ID': cv_ids_b,
                'Label': cv_label_b,
                'Prediction': cv_prediction_b
            },
            'Task C': {
                'ID': cv_ids_c,
                'Label': cv_label_c,
                'Prediction': cv_prediction_c
            },
            'allfolds': allfolds
        }
    }

    directory = "output"
    if not os.path.exists(directory):
        os.mkdir(directory)

    with open('output/output' + fname + '.pkl', 'wb') as outfile:
        pickle.dump(output, outfile)

    return output
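Pinning labels=[...] in the scoring calls above keeps macro averages comparable across folds: a class that never occurs in a given fold still contributes an F-score of zero instead of silently shrinking the label set.

from sklearn.metrics import f1_score

# label 3 is absent here but is still averaged in (as 0, with a warning)
print(f1_score([0, 1, 1], [0, 1, 2], labels=[0, 1, 2, 3], average='macro'))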
Example 12
def objective_MTL3_CV9(params):
    path = 'saved_data/saved_data_MTL3'

    train = [
        'ferguson', 'ottawashooting', 'sydneysiege', 'putinmissing',
        'prince-toronto', 'gurlitt', 'ebola-essien'
    ]
    test = 'charliehebdo'

    max_branch_len = 25
    x_train = []
    ya_train = []
    yb_train = []
    yc_train = []

    for t in train:
        temp_x_train = np.load(os.path.join(path, t, 'train_array.npy'))
        temp_ya_train = np.load(os.path.join(path, t,
                                             'fold_stance_labels.npy'))
        temp_yb_train = np.load(os.path.join(path, t, 'labels.npy'))
        temp_yc_train = np.load(os.path.join(path, t, 'rnr_labels.npy'))

        # pad sequences to the size of the largest
        temp_x_train = pad_sequences(temp_x_train,
                                     maxlen=max_branch_len,
                                     dtype='float32',
                                     padding='post',
                                     truncating='post',
                                     value=0.)
        temp_ya_train = pad_sequences(temp_ya_train,
                                      maxlen=max_branch_len,
                                      dtype='int32',
                                      padding='post',
                                      truncating='post',
                                      value=0.)

        x_train.extend(temp_x_train)
        ya_train.extend(temp_ya_train)
        yb_train.extend(temp_yb_train)
        yc_train.extend(temp_yc_train)

    x_train = np.asarray(x_train)
    ya_train = np.asarray(ya_train)
    yb_train = np.asarray(yb_train)
    yc_train = np.asarray(yc_train)
    yc_train = to_categorical(yc_train, num_classes=2)
    x_test = np.load(os.path.join(path, test, 'train_array.npy'))
    ya_test = np.load(os.path.join(path, test, 'fold_stance_labels.npy'))
    yb_test = np.load(os.path.join(path, test, 'labels.npy'))
    yc_test = np.load(os.path.join(path, test, 'rnr_labels.npy'))
    ids_testA = np.load(os.path.join(path, test, 'tweet_ids.npy'))
    ids_testBC = np.load(os.path.join(path, test, 'ids.npy'))

    model = training(params, x_train, ya_train, yb_train, yc_train)

    pred_probabilities_a, pred_probabilities_b, pred_probabilities_c = model.predict(
        x_test, verbose=0)

    Y_pred_a = np.argmax(pred_probabilities_a, axis=2)
    Y_pred_b = np.argmax(pred_probabilities_b, axis=1)
    Y_pred_c = np.argmax(pred_probabilities_c, axis=1)

    maskB = np.any(yb_test, axis=1)
    ids_testB = list(compress(ids_testBC, maskB))
    yb_test = list(compress(yb_test, maskB))
    Y_pred_b = list(compress(Y_pred_b, maskB))

    ids_testB = np.asarray(ids_testB)
    yb_test = np.asarray(yb_test)
    Y_pred_b = np.asarray(Y_pred_b)
    yb_test = np.argmax(yb_test, axis=1)

    trees, tree_prediction, tree_label = branch2treelabels(
        ids_testB, yb_test, Y_pred_b)

    treesC, tree_predictionC, tree_labelC = branch2treelabels(
        ids_testBC, yc_test, Y_pred_c)

    mactest_F_b = f1_score(tree_label,
                           tree_prediction,
                           average='macro',
                           labels=[0, 1, 2])
    mactest_F_c = f1_score(tree_labelC, tree_predictionC, average='binary')

    maskA = np.any(np.any(ya_test, axis=2), axis=1)

    if np.any(maskA):

        Y_true_a = ya_test[maskA]
        Y_true_a = np.argmax(Y_true_a, axis=2)
        Y_pred_a = Y_pred_a[maskA]
        ids_testA = ids_testA[maskA]

        fids_test = ids_testA.flatten()
        fy_pred = Y_pred_a.flatten()
        fy_test = Y_true_a.flatten()

        uniqtwid, uindices2 = np.unique(fids_test, return_index=True)
        uniqtwid = uniqtwid.tolist()
        uindices2 = uindices2.tolist()
        # drop the b'a' entry, used as the padding tweet id
        del uindices2[uniqtwid.index(b'a')]
        del uniqtwid[uniqtwid.index(b'a')]

        uniq_dev_prediction = [fy_pred[i] for i in uindices2]
        uniq_dev_label = [fy_test[i] for i in uindices2]

        mactest_F_a = f1_score(uniq_dev_label,
                               uniq_dev_prediction,
                               average='macro',
                               labels=[0, 1, 2, 3])
    else:
        mactest_F_a = 0
        uniq_dev_prediction = []
        uniq_dev_label = []

    output = {
        'loss': (1 - mactest_F_a) + (1 - mactest_F_b) + (1 - mactest_F_c),
        'Params': params,
        'status': STATUS_OK,
    }

    return output