コード例 #1
0
    def run(self, X_test, y_test_true):
        """Predict stance labels for X_test and score them against the truth.

        Returns a (predicted_labels, actual_labels, relative_score) tuple,
        where relative_score is the submission score normalised by the
        maximum score achievable on this test set.
        """
        raw_predictions = self.classifier.predict(X_test)
        final = [LABELS[int(label)] for label in raw_predictions]
        actual = [LABELS[int(label)] for label in y_test_true]

        # Normalise by the score a perfect submission would obtain
        # on the same gold labels.
        fold_score, _ = score_submission(actual, final)
        max_fold_score, _ = score_submission(actual, actual)

        return final, actual, fold_score / max_fold_score
コード例 #2
0
def naive_bayes_train(fold_stances, dataset, repl):
    """Train a Multinomial Naive Bayes stance classifier with k-fold estimation.

    For each fold, a CountVectorizer -> TfidfTransformer -> MultinomialNB
    pipeline is fit on the remaining folds and evaluated on the held-out
    fold.  The vectorizer, transformer and model of the best-scoring fold
    are published through the module-level globals ``cvec``, ``tfidf`` and
    ``mnb``.

    Source: http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
    Added by Julian
    """
    global cvec, tfidf, mnb
    best_score = 0
    ids = dict()
    Hs = dict()
    Bs = dict()
    ys = dict()
    # Precompute (ids, headlines, bodies, labels) per fold.
    for fold in fold_stances:
        ids[fold], Hs[fold], Bs[fold], ys[fold] = init_features(
            fold_stances[fold], dataset, repl)
    for fold in fold_stances:
        y_train = np.hstack(
            tuple([ys[i] for i in range(len(fold_stances)) if i != fold]))
        id_test = ids[fold]
        H_test = Hs[fold]
        B_test = Bs[fold]
        y_test = ys[fold]

        # Training corpus: "headline body" strings from every fold except
        # the held-out test fold.
        articles = []
        for i in range(len(fold_stances)):
            if i == fold: continue
            for h, b in zip(Hs[i], Bs[i]):
                articles.append(h + " " + b)
        _cvec = CountVectorizer()
        X_train_counts = _cvec.fit_transform(articles)
        # BUG FIX: removed the unused punctuation CountVectorizer
        # (``_cvec2``) — it was constructed but never used.
        _tfidf = TfidfTransformer()
        X_train_tfidf = _tfidf.fit_transform(X_train_counts)
        _mnb = MultinomialNB().fit(X_train_tfidf, y_train)

        # Vectorize the held-out fold with the vocabulary learned on the
        # training folds (transform only — no refitting).
        articles = []
        for h, b in zip(H_test, B_test):
            articles.append(h + " " + b)
        X_test_counts = _cvec.transform(articles)
        X_test_tfidf = _tfidf.transform(X_test_counts)

        predicted_test = [LABELS[int(a)] for a in _mnb.predict(X_test_tfidf)]
        actual_test = [LABELS[int(a)] for a in y_test]
        for i in range(len(actual_test)):
            dataset.stances[id_test[i]]['Predict'] = actual_test[
                i]  # Data is known

        # Score relative to a perfect submission on the same labels.
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            cvec = _cvec
            tfidf = _tfidf
            mnb = _mnb
コード例 #3
0
def do_reg():
    """Train a GradientBoostingClassifier per fold and report holdout score.

    Splits the dataset into 10 folds plus a holdout set, trains one
    classifier per fold on the remaining folds, keeps the best-scoring
    classifier, and finally reports its score on the holdout set.
    """
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        # BUG FIX: the classifier was assigned to ``clf_stage1`` but used
        # as ``clf`` below, which raised NameError — use a single name.
        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    report_score(actual, predicted)
コード例 #4
0
    def run_stage2(self, X_test_stg2):
        """Refine stage-1 predictions with the stage-2 classifier.

        Re-classifies only the samples whose stage-1 prediction
        (``self.init_pred``) was class 0, merges the new labels into a copy
        of ``self.predicted``, and scores the merged result.

        Returns a (final_labels, actual_labels, relative_score) tuple.
        """
        # Indices of samples the first-stage classifier assigned class 0.
        init_pred_ind = [i for i, e in enumerate(self.init_pred) if e == 0]
        X_test_temp = [X_test_stg2[x] for x in init_pred_ind]
        predicted_new = [
            LABELS[int(a)] for a in self.classifier2.predict(X_test_temp)
        ]

        # BUG FIX: copy the list instead of aliasing it, so updating
        # self.final no longer silently mutates self.predicted as well.
        self.final = list(self.predicted)
        for i, e in enumerate(init_pred_ind):
            self.final[e] = predicted_new[i]

        fold_score, _ = score_submission(self.actual, self.final)
        max_fold_score, _ = score_submission(self.actual, self.actual)
        score = fold_score / max_fold_score

        return self.final, self.actual, score
コード例 #5
0
ファイル: XGBoostClassifier.py プロジェクト: aiThanet/FNC-1
def generate_model(fold_stances):
    """Fit one XGBoost classifier per fold and persist the best one.

    Each fold's classifier is trained on the remaining folds, scored against
    a perfect submission on the held-out fold, and the best-scoring model is
    dumped to models/xgboost.model.
    """
    Xs = dict()
    ys = dict()

    for fold in fold_stances:
        Xs[fold], ys[fold] = generate_features(fold_stances[fold], d,
                                               str(fold))

    best_score = 0
    best_fold = None

    # Train and evaluate one classifier per fold.
    for fold in fold_stances:
        train_ids = [i for i in range(len(folds)) if i != fold]

        X_train = np.vstack(tuple([Xs[i] for i in train_ids]))
        y_train = np.hstack(tuple([ys[i] for i in train_ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = xgb.XGBClassifier(seed=12345)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(label)] for label in clf.predict(X_test)]
        actual = [LABELS[int(label)] for label in y_test]

        # Normalise by the maximum achievable score on this fold.
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    joblib.dump(best_fold, "models/xgboost.model")
コード例 #6
0
            clf.fit(X_train, y_train)

            # Two evaluation modes: a 2-class related/unrelated task scored
            # with score_cal, or the full 4-class task scored with
            # score_submission (which returns a (score, ...) tuple).
            if params.run_2_class:
                predicted = [
                    LABELS_RELATED[int(a)] for a in clf.predict(X_test)
                ]
                actual = [LABELS_RELATED[int(a)] for a in y_test]

                fold_score = score_cal(actual, predicted)
                max_fold_score = score_cal(actual, actual)

            else:
                predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
                actual = [LABELS[int(a)] for a in y_test]

                fold_score, _ = score_submission(actual, predicted)
                max_fold_score, _ = score_submission(actual, actual)

            # Relative score: fold score over the perfect-submission score.
            score = fold_score / max_fold_score

            print("Score for fold " + str(fold) + " was - " + str(score))
            if score > best_score:
                best_score = score
                best_fold = clf
        # Persist the best fold's classifier once all folds are done.
        pickle.dump(best_fold, open(params.gb_weights_file, 'wb'))

    # Reload the persisted model (also covers the train-skipped path,
    # presumably — TODO confirm against the enclosing function).
    best_fold = pickle.load(open(params.gb_weights_file, 'rb'))
    # Run on Holdout set and report the final score on the holdout set
    if params.run_2_class:
        predicted = [
            LABELS_RELATED[int(a)] for a in best_fold.predict(X_holdout)
コード例 #7
0
                    # Run the TF graph's prediction op on the test batch;
                    # labels are fed one-hot encoded.
                    predic = sess.run(pred,
                                      feed_dict={
                                          x: test,
                                          labels: to_one_hot(y_test)
                                      })
                    predic_lab = [LABELS[int(a)] for a in from_one_hot(predic)]
                    actual = [
                        LABELS[int(a)]
                        for a in np.argmax(to_one_hot(y_test), 1)
                    ]
                    # Overwrite the baseline predictions only at the indices
                    # this fold's stage-2 model re-classified.
                    for i, e in enumerate(init_pred_ind[fold]):
                        base_pred[e] = predic_lab[i]
                    print('confusion matrix')
                    #report_score(base_act, base_pred)

                    # Relative score vs. a perfect submission on base_act.
                    fold_score, _ = score_submission(base_act, base_pred)
                    max_fold_score, _ = score_submission(base_act, base_act)
                    score = fold_score / max_fold_score
                    print(fold, " : ", score)
                    if score > best_score:
                        best_score = score
                        best_fold1 = clf

                    # Rebuild the baseline holdout predictions: class 1 maps
                    # to LABELS[3], everything else to LABELS[0].
                    base_pred = [
                        LABELS[3] if a == 1 else LABELS[0]
                        for a in clf.predict(X_baseline_holdout)
                    ]
                    base_act = [LABELS[int(a)] for a in y_holdout]
                    init_pred = dict()
                    init_pred_ind = dict()
                    init_pred = [
コード例 #8
0
ファイル: lstm.py プロジェクト: JayanthRR/fake-news-challenge
                            lstm.labels: y_tr_batch
                        })

                    classes = lstm.test(lstm.batch_size)

                    # Periodically evaluate on the full test split.
                    if (n_step % lstm.display_step) == 0:
                        outputs = sess.run(classes,
                                           feed_dict={
                                               lstm.head: XH_test,
                                               lstm.head_lengths: X_htest_len,
                                               lstm.body: XB_test,
                                               lstm.body_lengths: X_btest_len
                                           })

                        print(type(outputs))
                        predicted_labels = [LABELS[int(a)] for a in outputs]
                        actual_labels = [LABELS[int(a)] for a in y_test]

                        # Relative score vs. a perfect submission.
                        fold_score, _ = score_submission(
                            actual_labels, predicted_labels)
                        max_fold_score, _ = score_submission(
                            actual_labels, actual_labels)
                        score = fold_score / max_fold_score

                    # NOTE(review): ``score`` is only refreshed on display
                    # steps, so this prints a stale score in between.
                    print("step is :" + str(n_step) + "cost is :" +
                          str(result["cost"]) + "score is :" + str(score))

                    # Advance to the next mini-batch.
                    start_ind += lstm.batch_size
                    n_step += 1
                epoch += 1
コード例 #9
0
        # Hard class predictions: argmax over the per-class outputs.
        predicted = np.argmax(np.asarray(y_pred), axis=1)
        actual = np.asarray(y_target)

        print(predicted)
        print(actual)

        confmat = confusion_matrix(actual, predicted)
        print('\nconfusion matrix:')
        print(confmat)
        accu = accuracy_score(actual, predicted)
        # NOTE(review): "accyracy" is a typo in the printed message.
        print('\naccyracy = {:>.4f}\n'.format(accu))

        # Map integer classes to label strings for the FNC scorer.
        predicted1 = [LABELS[int(a)] for a in predicted]
        actual1 = [LABELS[int(a)] for a in actual]

        fold_score, _ = score_submission(actual1, predicted1)
        max_fold_score, _ = score_submission(actual1, actual1)

        # Relative score vs. a perfect submission on the same labels.
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score

    X_holdout = np.array(X_holdout)
    y_holdout = np.array(y_holdout)

    # Truncate the holdout set to a multiple of 100 — presumably the
    # evaluation batch size; TODO confirm against the enclosing loop.
    testVal = X_holdout.shape[0] % 100
    testVal = X_holdout.shape[0] - testVal

    test = data_utils.TensorDataset(
コード例 #10
0
        # Linear SVM with fixed seed and manual class weights (presumably
        # compensating class imbalance — TODO confirm against the dataset).
        clf = LinearSVC(verbose=True,
                        random_state=14128,
                        class_weight={
                            0: 0.74535,
                            1: 2.7549
                        },
                        loss='hinge',
                        max_iter=15000)

        print(X_train.shape)
        clf.fit(X_train, y_train)

        predic = [LABELS[int(a)] for a in clf.predict(X_test)]
        act = [LABELS[int(a)] for a in y_test]

        # NOTE(review): this project's score_submission returns a 3-tuple,
        # unlike the 2-tuple variants elsewhere on this page.
        fold_score, _, fa = score_submission(act, predic)
        max_fold_score, _, bfa = score_submission(act, act)

        # Relative score vs. a perfect submission.
        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predic = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    act = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(act, predic)
コード例 #11
0
                        X_holdout = X_stg1["holdout"]

                        # Unpack stage-1 label splits from the y_stg1 dict.
                        y_test_true = y_stg1["true"]
                        y_train = y_stg1["train"]
                        y_test = y_stg1["test"]
                        y_holdout = y_stg1["holdout"]

                        clf = RandomForestClassifier(n_estimators=200,
                                                     n_jobs=4,
                                                     verbose=False)
                        clf.fit(X_train, y_train)

                        final = [LABELS[int(a)] for a in clf.predict(X_test)]
                        actual = [LABELS[int(a)] for a in y_test_true]

                        # Relative score vs. a perfect submission.
                        fold_score, _ = score_submission(actual, final)
                        max_fold_score, _ = score_submission(actual, actual)
                        score = fold_score / max_fold_score

                        print("Score for fold " + str(fold) + " was - " +
                              str(score))
                        if score > best_score:
                            best_score = score
                            best_fold1 = clf
                            #best_fold2 = clf2

                    # Persist the best fold's model under a name built from
                    # the run mode and cross-validation setting.
                    filename = model_dir + "_" + mode + "_" + cval[cval_ind]
                    pickle.dump(best_fold1, open(filename, "wb"))

                    final = [
                        LABELS[int(a)] for a in best_fold1.predict(X_holdout)
コード例 #12
0
def run_stage(fn, d, competition_dataset):
    """Run one feature-generation/classification stage over k folds.

    ``fn`` is a feature-generation callable returning (X, y, ids).  Trains a
    GradientBoostingClassifier per fold, keeps the best-scoring one, and
    writes its predictions back into the stance dicts of the holdout and
    competition datasets.

    Returns the holdout stance ids.
    """
    global runpass
    runpass += 1  # distinguishes feature caches across repeated stages

    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load/Precompute all features now
    Xs = dict()
    ys = dict()
    ids = dict()
    comp_stances = competition_dataset.get_unlabelled_stances()
    X_comp, y_comp, id_comp = fn(comp_stances, competition_dataset,
                                 "competition_{}".format(str(runpass)))
    X_holdout, y_holdout, id_holdout = fn(hold_out_stances, d,
                                          "holdout_{}".format(str(runpass)))
    for fold in fold_stances:
        Xs[fold], ys[fold], ids[fold] = fn(
            fold_stances[fold], d, "{}_{}".format(str(fold), str(runpass)))

    best_score = 0
    best_fold = None

    # Classifier for each fold
    for fold in fold_stances:
        # Removed the unused ``id_train`` aggregation — only the test-fold
        # ids are needed for writing predictions back.
        X_train = np.vstack(
            tuple([Xs[i] for i in range(len(fold_stances)) if i != fold]))
        y_train = np.hstack(
            tuple([ys[i] for i in range(len(fold_stances)) if i != fold]))
        id_test = ids[fold]
        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted_test = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual_test = [LABELS[int(a)] for a in y_test]
        for i in range(len(actual_test)):
            d.stances[id_test[i]]['Predict'] = actual_test[i]  # Data is known

        # Relative score vs. a perfect submission on the same labels.
        fold_score, _ = score_submission(actual_test, predicted_test)
        max_fold_score, _ = score_submission(actual_test, actual_test)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    #Run on Holdout set and report the final score on the holdout set
    predicted_hold = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual_hold = [LABELS[int(a)] for a in y_holdout]
    for i in range(len(predicted_hold)):
        d.stances[id_holdout[i]]['Predict'] = predicted_hold[
            i]  # Data is unknown

    #Run on competition dataset
    predicted_comp = [LABELS[int(a)] for a in best_fold.predict(X_comp)]
    actual_comp = [LABELS[int(a)] for a in y_comp]
    for i in range(len(actual_comp)):
        competition_dataset.stances[id_comp[i]]['Predict'] = predicted_comp[
            i]  # Data is unknown

    return id_holdout
コード例 #13
0
                    # Stage-1 predictions for this fold, as raw ints.
                    init_pred[fold] = [
                        int(a) for a in clf.predict(X_baseline[fold])
                    ]
                    # Indices the stage-1 model assigned class 0; these are
                    # re-classified by the stage-2 model below.
                    init_pred_ind[fold] = [
                        i for i, e in enumerate(init_pred[fold]) if e == 0
                    ]

                    Xcs_temp = [Xcs[fold][x] for x in init_pred_ind[fold]]
                    predicted_new = [
                        LABELS[int(a)] for a in clf2.predict(Xcs_temp)
                    ]
                    # NOTE(review): ``final = predicted`` aliases the list —
                    # the loop below also mutates ``predicted``; verify this
                    # is intended.
                    final = predicted
                    for i, e in enumerate(init_pred_ind[fold]):
                        final[e] = predicted_new[i]

                    # Relative score vs. a perfect submission.
                    fold_score, _ = score_submission(actual, final)
                    max_fold_score, _ = score_submission(actual, actual)
                    score = fold_score / max_fold_score

                    #for f in ids:

                    #    init_pred[f] = [int(a) for a in clf.predict(Xcs[f])]
                    #    init_pred_ind[f] = [i for i,e in enumerate(init_pred[f]) if e==0]
                    #    fold_stances_new[f] = [fold_stances[1][x]['Stance'] for x in init_pred_ind[f]]
                    #    Xcs_new[f],Xhs_new[f],Xbs_new[f],ys_new[f]=generate_features(fold_stances_new[f],d,str(f),model,binary=False)

                    print("Score for fold " + str(fold) + " was - " +
                          str(score))
                    if score > best_score:
                        best_score = score
                        best_fold1 = clf
コード例 #14
0
            # Stack training features; drop the held-out fold's labels from
            # a copy of ys before stacking the rest.
            trains = np.vstack(t)
            y_train = dict(ys)
            del y_train[index]

            trainsy = np.hstack(tuple([y_train[i] for i in y_train]))

            clf = GradientBoostingClassifier()
            clf.fit(trains, trainsy)

            X_test = Xs[index]
            y_test = ys[index]

            # Raw integer predictions/labels (no LABELS mapping here).
            predicted = [a for a in clf.predict(X_test)]
            actual = [a for a in y_test]

            # NOTE(review): score_submission is used as a scalar here, while
            # other snippets on this page unpack a tuple from it — confirm
            # this project's score_submission signature.
            predicted_score = score_submission(actual, predicted)
            max_fold_score = score_submission(actual, actual)

            score = predicted_score / max_fold_score

            if score > best_score:
                best_score = score
                best_fold = clf
                best_predicted = predicted
                best_actual = actual

            index += 1
        #print the possibility's string format
        print(p)
        #print the possibility's best-fold score and the holdout score
コード例 #15
0
ファイル: fnc_kfold.py プロジェクト: paris5020/athene_system
        # Train on every fold except the current one.
        ids = list(range(len(folds)))
        del ids[fold]

        X_train = np.vstack(tuple([Xs[i] for i in ids]))
        y_train = np.hstack(tuple([ys[i] for i in ids]))

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200, random_state=14128, verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        # Relative score: fold score over the perfect-submission score.
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score/max_fold_score

        print("Score for fold "+ str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf



    #Run on Holdout set and report the final score on the holdout set
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]