コード例 #1
0
def do_reg():
    """Cross-validate a gradient-boosting classifier and report on the hold-out set.

    The data set is split into 10 folds plus a hold-out set. For every fold a
    ``GradientBoostingClassifier`` is trained on all other folds and scored on
    the fold itself; the score is normalised by the score obtained when the
    gold labels are submitted as predictions. The classifier with the highest
    normalised score is then run on the hold-out set and the result is passed
    to ``report_score``.

    Returns:
        None. Per-fold scores are printed; the hold-out result is reported
        through ``report_score``.
    """
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = {}
    ys = {}

    # Load/precompute all features now so the training loop only stacks arrays.
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Train one classifier per fold, holding that fold out for evaluation.
    for fold in fold_stances:
        train_ids = [i for i in range(len(folds)) if i != fold]

        X_train = np.vstack([Xs[i] for i in train_ids])
        y_train = np.hstack([ys[i] for i in train_ids])

        X_test = Xs[fold]
        y_test = ys[fold]

        # The classifier must be bound to the same name that is fitted and
        # used for prediction below (the original bound it to a different
        # name, which raised NameError on the first fold).
        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        # Scoring the gold labels against themselves gives the normaliser.
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        # Always keep the first classifier so a run in which every fold
        # scores 0 still has a model to evaluate on the hold-out set.
        if best_fold is None or score > best_score:
            best_score = score
            best_fold = clf

    # Run on the hold-out set and report the final score.
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
コード例 #2
0
    def __init__(self, dataset, n_folds=10):
        """Split *dataset* into folds and create the empty per-instance stores.

        Args:
            dataset: Data set handed to ``kfold_split`` and
                ``get_stances_for_folds``; also kept as ``self.dataset``.
            n_folds: Number of folds requested from ``kfold_split``.
        """
        self.dataset = dataset

        folds, hold_out = kfold_split(dataset, n_folds=n_folds)
        self.folds = folds
        self.hold_out = hold_out

        fold_stances, hold_out_stances = get_stances_for_folds(
            dataset, folds, hold_out)
        self.fold_stances = fold_stances
        self.hold_out_stances = hold_out_stances

        # Every container below starts as its own empty dict.
        empty_stores = (
            "ys",
            "Xcs",
            "ys_nb",
            "Xcs_nb",
            "fold_stances_nb",
            "ys_true",
            "Xbasenb",
            "Xtotalnb",
            "Xtotal",
            "X_baseline",
            "y_baseline",
        )
        for store_name in empty_stores:
            setattr(self, store_name, {})
コード例 #3
0
              X_overlap_pos, X_overlap_pos_sentence, X_tfidf, X_tfidf_max,
              X_overlap_bpe_SS]
    return X, y


if __name__ == "__main__":
    # Script entry point (excerpt): fetch the deep-learning model's
    # predictions, then split the training data into folds and precompute
    # features for the competition set, the hold-out set and every fold.
    check_version()

    print('Running Conditioned CNN on FNC1 Dataset')
    # Three values are returned; only the first is kept under a meaningful
    # name, the other two are bound to throwaway names.
    dl_model_pred, _unused1, _unused2 = get_predictions_from_FNC_1_Test(
        params.dl_weights_file, params.apply_pos_filter, DEVICE)

    # Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    # NOTE(review): `stances` is not read in the visible part of the script;
    # confirm it is used further down before removing it.
    stances = pd.DataFrame(competition_dataset.stances)
    # NOTE(review): the third argument presumably names the feature set
    # (e.g. for caching) -- confirm against generate_features.
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # Per-fold outputs of generate_features, keyed by fold.
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))
コード例 #4
0
def run_stage(fn, d, competition_dataset):
    """Run one cross-validated stage and write its predictions onto the stances.

    The global ``runpass`` counter is incremented and appended to every
    feature-set name passed to ``fn``. A ``GradientBoostingClassifier`` is
    trained per fold on all other folds; the classifier with the highest
    normalised fold score is then used to predict the hold-out set and the
    competition set. Results are stored under the ``'Predict'`` key of the
    corresponding stance entries.

    Args:
        fn: Feature generator called as ``fn(stances, dataset, name)`` and
            returning ``(X, y, ids)``.
        d: Training data set; ``d.stances`` is updated in place.
        competition_dataset: Competition data set; its ``stances`` are
            updated in place with the predicted labels.

    Returns:
        The ids returned by ``fn`` for the hold-out stances.
    """
    global runpass
    runpass += 1

    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load/precompute all features now.
    Xs = {}
    ys = {}
    ids = {}
    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(comp_stances, competition_dataset,
                                 "competition_{}".format(str(runpass)))
    X_holdout, y_holdout, id_holdout = fn(hold_out_stances, d,
                                          "holdout_{}".format(str(runpass)))
    for fold in fold_stances:
        Xs[fold], ys[fold], ids[fold] = fn(
            fold_stances[fold], d, "{}_{}".format(str(fold), str(runpass)))

    best_score = 0
    best_fold = None

    # Train one classifier per fold, holding that fold out for evaluation.
    for fold in fold_stances:
        train_folds = [i for i in range(len(fold_stances)) if i != fold]
        X_train = np.vstack([Xs[i] for i in train_folds])
        y_train = np.hstack([ys[i] for i in train_folds])
        id_test = ids[fold]
        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]
        # Labels of the training folds are known, so the gold label (not the
        # model output) is what gets stored on these stances.
        for stance_id, gold_label in zip(id_test, actual_test):
            d.stances[stance_id]['Predict'] = gold_label

        fold_score, _ = score_submission(actual_test, predicted_test)
        # Scoring the gold labels against themselves gives the normaliser.
        max_fold_score, _ = score_submission(actual_test, actual_test)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        # Always keep the first classifier so a run in which every fold
        # scores 0 still has a model for the hold-out/competition passes.
        if best_fold is None or score > best_score:
            best_score = score
            best_fold = clf

    # Hold-out labels are treated as unknown: store the model's prediction.
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    for stance_id, label in zip(id_holdout, predicted_hold):
        d.stances[stance_id]['Predict'] = label

    # Competition labels are unknown as well; iterate over the predictions
    # themselves, mirroring the hold-out pass.
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    for stance_id, label in zip(id_comp, predicted_comp):
        competition_dataset.stances[stance_id]['Predict'] = label

    return id_holdout
コード例 #5
0
ファイル: FinalClassifier.py プロジェクト: aiThanet/FNC-1
if __name__ == "__main__":
    # Script entry point (excerpt): two-step classification. Step 1 uses a
    # model stored in models/finalClassifier.1.model, step 2 a model stored
    # in models/finalClassifier.2.model.
    check_version()
    parse_params()

    # Load the training dataset and generate folds
    d = DataSet()

    # Load the competition dataset
    # generate_features returns three values here; the third is bound to a
    # *_bi name (presumably the binary related/unrelated labels -- confirm).
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition, y_competition_bi = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    # step1 : classification model for related or unrelated
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    # Only call generate_model when no saved model file exists; the model is
    # then loaded from disk either way.
    # NOTE(review): presumably generate_model(..., 1) writes this exact file
    # -- confirm, otherwise joblib.load fails on a fresh checkout.
    if not os.path.isfile("models/finalClassifier.1.model"):
        generate_model(fold_stances, 1)
    best_fold = joblib.load("models/finalClassifier.1.model")

    # Load/Precompute all features now
    X_holdout, y_holdout, y_holdout_bi = generate_features(
        hold_out_stances, d, "holdout")

    # step2 : classification model for related (3 classes : Agree, Disagree, Discuss)
    # A second split is made with biClass=True and the stances are fetched
    # with only_related=True.
    related_folds, related_hold_out = kfold_split(d, n_folds=10, biClass=True)
    related_fold_stances, related_hold_out_stances = get_stances_for_folds(
        d, related_folds, related_hold_out, only_related=True)
    # Same generate-if-missing-then-load pattern as step 1.
    if not os.path.isfile("models/finalClassifier.2.model"):
        generate_model(related_fold_stances, 2)
    related_best_fold = joblib.load("models/finalClassifier.2.model")
コード例 #6
0
ファイル: fnc_kfold.py プロジェクト: paris5020/athene_system
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X,y





if __name__ == "__main__":
    # Script entry point (excerpt): split the data into folds and precompute
    # the features for the hold-out set and for every fold.

    # Refuse to run on Python 2: write a message to stderr and exit with
    # status 1.
    if sys.version_info.major < 3:
        sys.stderr.write('Please use Python version 3 and above\n')
        sys.exit(1)

    d = DataSet()
    folds,hold_out = kfold_split(d,n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d,folds,hold_out)

    # Per-fold outputs of generate_features, keyed by fold.
    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout,y_holdout = generate_features(hold_out_stances,d,"holdout")
    for fold in fold_stances:
        Xs[fold],ys[fold] = generate_features(fold_stances[fold],d,str(fold))


    # Best normalised fold score seen so far and the classifier that
    # achieved it (none yet).
    best_score = 0
    best_fold = None


    # Classifier for each fold