Code example #1
import logging
import pickle

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline


def create_best_gradboost(data_path, output_path):
    # prepare_data is a project-level helper that returns train features,
    # train labels, test features and test ids.
    X, y, X_test, _ = prepare_data(data_path)
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.10,
                                                          random_state=42)
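    # Feature selection feeds a gradient-boosted classifier; GridSearchCV
    # tunes both stages via the step-prefixed parameter names below.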
    pipe = Pipeline([('feature_selection', SelectKBest(f_classif)),
                     ('clf', GradientBoostingClassifier(random_state=2))])
    params = {
        'feature_selection__k': [12, 8, 4],
        'clf__n_estimators': [50, 100, 500]
    }

    grid_search = GridSearchCV(pipe, param_grid=params, scoring='roc_auc')
    grid_search.fit(X_train, y_train)
    logging.info(str(grid_search.best_params_))

    # Evaluate the selected model on the held-out validation split.
    y_pred = grid_search.best_estimator_.predict(X_valid)
    tn, fp, fn, tp = confusion_matrix(y_valid, y_pred).ravel()
    logging.info("= Gradient Boosting Classifier =")
    logging.info("Accuracy: {}".format((tp + tn) / (tn + fp + fn + tp)))
    logging.info("Recall: {}".format(tp / (fn + tp)))
    logging.info("Specificity: {}".format(tn / (tn + fp)))
    logging.info("F1 Score: {}".format(f1_score(y_valid, y_pred)))
    logging.info("AUC: {}".format(roc_auc_score(y_valid, y_pred)))

    model_path = "{0}{1}".format(output_path, 'gradboost.pkl')
    with open(model_path, 'wb') as file:
        pickle.dump(grid_search.best_estimator_, file=file)

    return model_path, roc_auc_score(
        y_valid, grid_search.best_estimator_.predict(X_valid))
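The function returns both the saved model path and the validation AUC, so callers can compare candidate models. A minimal usage sketch, assuming prepare_data is importable and the directories already exist (both paths are hypothetical):

# Hypothetical paths; create_best_gradboost appends the file name itself.
model_path, valid_auc = create_best_gradboost('data/', 'models/')
print("Saved {} (validation AUC: {:.3f})".format(model_path, valid_auc))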
Code example #2
File: main.py  Project: ItayDev/nli-span-info
from functools import partial

# TrainLoaderFactory, MODE, collate and prepare_data are project-internal
# imports from ItayDev/nli-span-info.
def build_data_loaders(args, tokenizer):
    snli, mnli = prepare_data(tokenizer)
    # Bind the collation options once so every loader shares them.
    injected_collate = partial(collate,
                               span_drop=args.span_drop,
                               max_spans=args.max_spans,
                               padding_token_id=tokenizer.pad_token_id)

    train_loader_factory = TrainLoaderFactory(args.batch_size, snli, mnli,
                                              injected_collate)

    train_all = train_loader_factory.get_loader(MODE.TRAIN_ALL)
    train_snli = train_loader_factory.get_loader(MODE.TRAIN_SNLI)
    train_mnli = train_loader_factory.get_loader(MODE.TRAIN_MNLI)
    test_snli = train_loader_factory.get_loader(MODE.TEST_SNLI)
    test_mnli_matched = train_loader_factory.get_loader(MODE.TEST_MNLI_MATCHED)
    test_mnli_mismatched = train_loader_factory.get_loader(
        MODE.TEST_MNLI_MISMATCHED)

    if args.dataset == 'all':
        return train_all, {
            'snli': test_snli,
            "mnli_matched": test_mnli_matched,
            "mnli_mismatched": test_mnli_mismatched
        }
    elif args.dataset == 'snli':
        return train_snli, {'snli': test_snli}
    elif args.dataset == 'mnli':
        return train_mnli, {
            "mnli_matched": test_mnli_matched,
            "mnli_mismatched": test_mnli_mismatched
        }
    else:
        raise ValueError("--dataset has to contain one of: all, mnli or snli")
Code example #3
import pickle

import pandas as pd


def get_predictions(data_path="", model_path=""):
    # Load the previously persisted champion model.
    champion_path = "{0}{1}".format(model_path, 'champion.pkl')
    print("Loading champion from: " + champion_path)
    with open(champion_path, 'rb') as file:
        champion = pickle.load(file)

    # Future extension: Data path or data retrieval process for consistently updated data
    _, _, X_test, test_id = prepare_data(data_path)

    # Future extension: Write to another file, database, etc.
    predictions = champion.predict(X_test)
    print(predictions)
    submission_path = "{0}{1}".format(model_path, "submission.csv")
    print("Writing submission to: " + submission_path)

    submission = pd.DataFrame({'PassengerId': test_id,
                               'Survived': predictions})
    # index=False keeps the file to the two expected submission columns.
    submission.to_csv(submission_path, index=False)
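Note the mismatch in file names: create_best_gradboost saves gradboost.pkl, while get_predictions loads champion.pkl, so some selection step presumably promotes the winning model. An end-to-end sketch under that assumption, with hypothetical paths and a plain copy standing in for the promotion step:

import shutil

model_path, auc = create_best_gradboost('data/', 'models/')
shutil.copy(model_path, 'models/champion.pkl')  # promote to champion
get_predictions(data_path='data/', model_path='models/')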