Beispiel #1
0
def run_experi(X_matrix_bal, X_matrix_imb, feat_name):
    """Cross-validate logistic regression on the balanced and imbalanced sets.

    Each dataset is cut row-wise into ``n_folds`` (5) contiguous folds.  Every
    fold is used once as the test set while the remaining rows form the
    training set.  ``logit.run_logit`` is called once per fold and dataset;
    elements 0-3 of its result (accumulated here as accuracy, precision,
    recall and F1) are averaged over the folds.

    Labels are read from the module-level ``labels_bal`` and ``labels_imb``.

    Fold boundaries are computed as ``i * n // n_folds`` so that every sample
    lands in exactly one test fold, even when the dataset size is not a
    multiple of ``n_folds``, and so that the boundaries are exact integers
    regardless of the interpreter's division semantics.

    Args:
        X_matrix_bal: 2-D feature matrix; row ``i`` is paired with
            ``labels_bal[i]``.
        X_matrix_imb: 2-D feature matrix; row ``i`` is paired with
            ``labels_imb[i]``.
        feat_name: Name of the feature set, used only in the progress message.

    Returns:
        An 8-tuple of fold-averaged scores:
        ``(acc_bal, prec_bal, rec_bal, f1_bal,
        acc_imb, prec_imb, rec_imb, f1_imb)``.
    """
    n_folds = 5
    n_bal = len(labels_bal)
    n_imb = len(labels_imb)

    # Running sums of the four scores (result indices 0-3) per dataset.
    totals_bal = [0, 0, 0, 0]
    totals_imb = [0, 0, 0, 0]

    for fold in range(1, n_folds + 1):
        # Single-argument call form: same output on Python 2 and Python 3.
        print("running %s fold %s" % (feat_name, fold))

        # Exact integer row range of this fold's test set.
        lo_bal = (fold - 1) * n_bal // n_folds
        hi_bal = fold * n_bal // n_folds
        lo_imb = (fold - 1) * n_imb // n_folds
        hi_imb = fold * n_imb // n_folds

        # Labels: everything outside [lo, hi) trains, the slice itself tests.
        y_train_bal = np.hstack((labels_bal[:lo_bal], labels_bal[hi_bal:]))
        y_test_bal = labels_bal[lo_bal:hi_bal]

        y_train_imb = np.hstack((labels_imb[:lo_imb], labels_imb[hi_imb:]))
        y_test_imb = labels_imb[lo_imb:hi_imb]

        # Features: same row ranges as the labels.
        X_train_bal = np.vstack((X_matrix_bal[:lo_bal, :], X_matrix_bal[hi_bal:, :]))
        X_test_bal = X_matrix_bal[lo_bal:hi_bal]

        X_train_imb = np.vstack((X_matrix_imb[:lo_imb, :], X_matrix_imb[hi_imb:, :]))
        X_test_imb = X_matrix_imb[lo_imb:hi_imb]

        # Run logistic regression on each dataset and accumulate the scores.
        scores_bal = logit.run_logit(X_train_bal, X_test_bal, y_train_bal, y_test_bal)
        scores_imb = logit.run_logit(X_train_imb, X_test_imb, y_train_imb, y_test_imb)
        for k in range(4):
            totals_bal[k] += scores_bal[k]
            totals_imb[k] += scores_imb[k]

    # Average over the number of folds actually run (not a hard-coded 5).
    return tuple(total / n_folds for total in totals_bal + totals_imb)
Beispiel #2
0
def run_ahst(combination):
    """Cross-validate logistic regression on historical-salient-term features.

    For each of ``n_folds`` (5) folds, a fold-specific feature matrix is
    loaded from ``.npy`` files in the current working directory, split
    row-wise into train and test parts, and passed to ``logit.run_logit``
    for both the balanced and the imbalanced dataset.  Elements 0-3 of each
    result (accumulated here as accuracy, precision, recall and F1) are
    averaged over the folds.

    Labels are read from the module-level ``labels_bal`` and ``labels_imb``.

    The feature matrix depends on ``combination``:

    * ``"tauth"``: ``X_array9_*`` stacked column-wise with ``X_ahst<fold>_*``
    * ``"r"``:     ``X_rhst<fold>_*`` only
    * ``"taud"``:  ``X_array11_*`` stacked with ``X_rhst<fold>_*``
    * ``"all"``:   ``X_array15_*``, ``X_rhst<fold>_*`` and ``X_ahst<fold>_*``
    * any other value: ``X_ahst<fold>_*`` only

    where ``<fold>`` is the zero-based fold index and ``*`` is ``bal`` or
    ``imb``.  Only the files actually needed for the requested combination
    are loaded.

    Fold boundaries are computed as ``i * n // n_folds`` so that every sample
    lands in exactly one test fold, even when the dataset size is not a
    multiple of ``n_folds``.

    Args:
        combination: Key selecting which feature matrices are combined
            (see the list above); also used in the progress message.

    Returns:
        An 8-tuple of fold-averaged scores:
        ``(acc_bal, prec_bal, rec_bal, f1_bal,
        acc_imb, prec_imb, rec_imb, f1_imb)``.
    """
    n_folds = 5
    n_bal = len(labels_bal)
    n_imb = len(labels_imb)

    # Running sums of the four scores (result indices 0-3) per dataset.
    totals_bal = [0, 0, 0, 0]
    totals_imb = [0, 0, 0, 0]

    # Decide once which on-disk matrices this combination needs.
    needs_author = combination not in ("r", "taud")
    needs_audience = combination in ("r", "taud", "all")

    for iteration in range(n_folds):
        # Single-argument call form: same output on Python 2 and Python 3.
        print("running %s hst %s" % (combination, iteration + 1))

        # Exact integer row range of this fold's test set.
        lo_bal = iteration * n_bal // n_folds
        hi_bal = (iteration + 1) * n_bal // n_folds
        lo_imb = iteration * n_imb // n_folds
        hi_imb = (iteration + 1) * n_imb // n_folds

        # Labels: everything outside [lo, hi) trains, the slice itself tests.
        y_train_bal = np.hstack((labels_bal[:lo_bal], labels_bal[hi_bal:]))
        y_test_bal = labels_bal[lo_bal:hi_bal]

        y_train_imb = np.hstack((labels_imb[:lo_imb], labels_imb[hi_imb:]))
        y_test_imb = labels_imb[lo_imb:hi_imb]

        # Author historical salient terms for this fold.
        if needs_author:
            ahst_bal = np.load("X_ahst%s_bal.npy" % iteration)
            ahst_imb = np.load("X_ahst%s_imb.npy" % iteration)

        # Audience (response) historical salient terms for this fold.
        if needs_audience:
            rhst_bal = np.load("X_rhst%s_bal.npy" % iteration)
            rhst_imb = np.load("X_rhst%s_imb.npy" % iteration)

        # Assemble the feature matrix for the requested combination.
        if combination == "tauth":
            X_bal = np.hstack((X_array9_bal, ahst_bal))
            X_imb = np.hstack((X_array9_imb, ahst_imb))
        elif combination == "r":
            X_bal = rhst_bal
            X_imb = rhst_imb
        elif combination == "taud":
            X_bal = np.hstack((X_array11_bal, rhst_bal))
            X_imb = np.hstack((X_array11_imb, rhst_imb))
        elif combination == "all":
            X_bal = np.hstack((X_array15_bal, rhst_bal, ahst_bal))
            X_imb = np.hstack((X_array15_imb, rhst_imb, ahst_imb))
        else:
            X_bal = ahst_bal
            X_imb = ahst_imb

        # Features: same row ranges as the labels.
        X_train_bal = np.vstack((X_bal[:lo_bal, :], X_bal[hi_bal:, :]))
        X_test_bal = X_bal[lo_bal:hi_bal]

        X_train_imb = np.vstack((X_imb[:lo_imb, :], X_imb[hi_imb:, :]))
        X_test_imb = X_imb[lo_imb:hi_imb]

        # Run logistic regression on each dataset and accumulate the scores.
        scores_bal = logit.run_logit(X_train_bal, X_test_bal, y_train_bal, y_test_bal)
        scores_imb = logit.run_logit(X_train_imb, X_test_imb, y_train_imb, y_test_imb)
        for k in range(4):
            totals_bal[k] += scores_bal[k]
            totals_imb[k] += scores_imb[k]

    # Average over the number of folds actually run (not a hard-coded 5).
    return tuple(total / n_folds for total in totals_bal + totals_imb)