Example #1
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects that lack a fit
    # method
    assert_raises(TypeError, Pipeline, [('svc', IncorrectT)])
    # Smoke test with only an estimator
    clf = T()
    pipe = Pipeline([('svc', clf)])
    assert_equal(
        pipe.get_params(deep=True),
        dict(
            svc__a=None, svc__b=None, svc=clf, **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't use the same stage name twice
    assert_raises(ValueError, Pipeline, [('svc', SVC()), ('svc', SVC())])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)
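
This example relies on two small mock estimators, T and IncorrectT, defined elsewhere in the test module. A minimal sketch of definitions consistent with how the test uses them (the exact upstream code may differ): T exposes constructor parameters a and b and implements fit, while IncorrectT lacks a fit method.

from sklearn.base import BaseEstimator

class IncorrectT:
    # Deliberately has no fit method, so Pipeline rejects it with a TypeError.
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b

class T(IncorrectT, BaseEstimator):
    # BaseEstimator supplies get_params/set_params, which makes the
    # svc__a-style parameter routing in the test work.
    def fit(self, X, y):
        return self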
Example #2
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects that lack a fit
    # method
    error_regex = ("Last step of Pipeline should implement fit or be the "
                   "string 'passthrough'")
    with raises(TypeError, match=error_regex):
        Pipeline([("clf", NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([("svc", clf)])
    expected = dict(svc__a=None,
                    svc__b=None,
                    svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC(gamma="scale")
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([("anova", filter1), ("svc", clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = "implement fit and transform or fit_resample"
    with raises(TypeError, match=error_regex):
        Pipeline([("t", NoTrans()), ("svc", clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps["svc"] is not pipe2.named_steps["svc"]

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop("svc")
    params.pop("anova")
    params2.pop("svc")
    params2.pop("anova")
    assert params == params2
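
The set_params(svc__C=0.1) check above exercises Pipeline's double-underscore routing: a parameter named <step>__<param> is forwarded to that step's own set_params, mutating the underlying estimator in place. A small standalone demo:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

clf = SVC()
pipe = Pipeline([("anova", SelectKBest(f_classif)), ("svc", clf)])
pipe.set_params(svc__C=0.1)  # routed to the 'svc' step
assert clf.C == 0.1          # the original estimator object was modified
# get_params(deep=True) exposes the same names, e.g. 'svc__C', 'anova__k'.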
Example #3
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects that lack a fit
    # method
    assert_raises_regex(
        TypeError, 'Last step of Pipeline should implement fit. '
        '.*NoFit.*', Pipeline, [('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None,
                    svc__b=None,
                    svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(TypeError, 'implement fit and transform or sample',
                        Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
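
The clone check above verifies that clone() builds a new, unfitted pipeline whose steps are fresh objects carrying the same parameters. A compact illustration:

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

pipe = Pipeline([("anova", SelectKBest(f_classif)), ("svc", SVC(C=0.1))])
pipe2 = clone(pipe)
assert pipe2.named_steps["svc"] is not pipe.named_steps["svc"]  # new objects
assert pipe2.named_steps["svc"].C == 0.1                        # same params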
Example #4
def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    with raises(TypeError):
        Pipeline()
    # Check that we can't instantiate pipelines with objects that lack a fit
    # method
    error_regex = 'Last step of Pipeline should implement fit. .*NoFit.*'
    with raises(TypeError, match=error_regex):
        Pipeline([('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    expected = dict(svc__a=None, svc__b=None, svc=clf,
                    **pipe.get_params(deep=False))
    assert pipe.get_params(deep=True) == expected

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert clf.a == 0.1
    assert clf.b is None
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    error_regex = 'implement fit and transform or sample'
    with raises(TypeError, match=error_regex):
        Pipeline([('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert clf.C == 0.1
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    with raises(ValueError):
        pipe.set_params(anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert pipe.named_steps['svc'] is not pipe2.named_steps['svc']

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert params == params2
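
The final block of each test compares the deep parameter dicts of a pipeline and its clone after popping the shallow entries and the estimator objects themselves. A compact equivalent of that filtering (the leaf_params helper is ours, not part of the listing):

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.svm import SVC

pipe = Pipeline([("anova", SelectKBest(f_classif)), ("svc", SVC())])
pipe2 = clone(pipe)

def leaf_params(p):
    # Keep only nested '<step>__<param>' entries; drop the shallow params
    # ('steps', 'memory', ...) and the step estimators themselves.
    drop = set(p.get_params(deep=False)) | {"anova", "svc"}
    return {k: v for k, v in p.get_params(deep=True).items() if k not in drop}

assert leaf_params(pipe) == leaf_params(pipe2)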
def select_model(X, y):
    # Make the hold-in/holdout split.
    (
        X_in,
        X_out,
        y_in,
        y_out,
        indices_in,
        indices_out,
        removed_features,
    ) = dh.split_holdout(X, y)

    logging.info("Writing holdin-holdout split data and info to file.")

    dump_svmlight_file(X_in,
                       y_in,
                       os.path.join(s.OPT_DIRP, "holdin.svm"),
                       zero_based=True)
    dump_svmlight_file(X_out,
                       y_out,
                       os.path.join(s.OPT_DIRP, "holdout.svm"),
                       zero_based=True)
    with open(os.path.join(s.OPT_DIRP, "holdinout_split_indices.json"),
              "wt") as f:
        json.dump(
            {
                "holdin": indices_in.tolist(),
                "holdout": indices_out.tolist(),
                "num_features": X_in.shape[1],
            },
            f,
        )

    steps, param_grids = pipeline.make_pipelines(s.PIPE_STEPS,
                                                 alt_order=s.ALT_ORDER)
    steps_param_grids = zip(steps, param_grids)

    # Filter with partial-run info from the list pkl generated by reporter.py.
    if s.PARTIALRUN:
        with open(s.PARTIALRUN, "rt") as f:
            partialinfo = json.load(f)
        steps_param_grids = pipeline.filter_partialrun(steps_param_grids,
                                                       partialinfo)

    all_results = {}
    fit_pred_duration = {}
    cv_pipe_dir = os.path.join(s.OPT_DIRP, "cv_pipelines")
    util.ensure_dir(cv_pipe_dir)

    for (steps, param_grid) in steps_param_grids:

        # Generate a human-readable name for the current pipeline from its
        # steps.
        pipe_name = []
        for (name, step) in steps:
            if "SelectPercentile" not in str(step):
                pipe_name.append(str(step).split("(")[0].lower())
            else:
                # SelectPercentile steps are named after their score function
                # (e.g. f_classif); __name__ is the Python 3 attribute.
                pipe_name.append(step.score_func.__name__.lower())
        pipe_name = "+".join(pipe_name)
        # Append the pipeline name to the dataset name for logging and
        # metadata purposes.
        DATASET_NAME = "{}_{}".format(pipe_name, s.DATASET_NAME)
        pipe_opt_dir = os.path.join(cv_pipe_dir, pipe_name)
        util.ensure_dir(pipe_opt_dir)

        pipe = Pipeline(steps)
        grid_search = GridSearchCV(
            pipe,
            param_grid=param_grid,
            scoring=pipeline.my_scorer,
            n_jobs=s.CV_N_JOBS,
            cv=s.CV,
            verbose=10,
            error_score=0,
            return_train_score=False,
        )

        logging.info("{}: Doing model selection with {}.".format(
            pipe_name, grid_search))
        start_pipefit = timeit.default_timer()
        grid_search.fit(X_in, y_in)

        # save grid_search object
        logging.info("{}: Pickling cross-validation object...".format(pipe_name))
        dump(
            grid_search,
            os.path.join(pipe_opt_dir,
                         "%s_grid_search.joblibpkl" % s.TIMESTAMP),
            compress=1,
        )

        # save all intermediate results
        all_results[pipe_name] = grid_search.cv_results_
        with open(os.path.join(s.OPT_DIRP, "all_pipeline_cv_results.pkl"),
                  "wb") as all_res_out:
            pickle.dump(all_results, all_res_out)

        logging.info(
            "{}: Evaluating winning model on holdout test set.".format(
                pipe_name))

        logging.info("{}: Evaluating holdout performance.".format(pipe_name))
        y_pred = grid_search.predict(X_out).astype(int)
        y_out_true_y_out_pred = {
            "y_out_true": y_out.tolist(),
            "y_out_pred": y_pred.tolist(),
        }
        with open(os.path.join(pipe_opt_dir, "y_out_true-y_out_pred.json"),
                  "wt") as f:
            json.dump(y_out_true_y_out_pred, f)

        # save all intermediate fit and predict durations
        elapsed = timeit.default_timer() - start_pipefit
        fit_pred_duration[pipe_name] = elapsed
        with open(
                os.path.join(s.OPT_DIRP,
                             "all_pipeline_fit_predict_duration.json"),
                "wt") as f:
            json.dump(fit_pred_duration, f)

        precision, recall, fscore, support = precision_recall_fscore_support(
            y_out, y_pred, average=s.SCORE_AVERAGING)
        acc = accuracy_score(y_out, y_pred)
        if s.MULTICLASS:
            auc = None
        else:
            auc = roc_auc_score(y_out, y_pred)

        # make report
        params = grid_search.best_params_
        winscore = grid_search.best_score_
        ablation_name = "blah"
        report = ("%s\t%s\t%s"
                  "\nSettings: %s"
                  "\nTested parameters: %s"
                  "\nWinning parameters: %s"
                  "\nWinning model CV score: %s %s"
                  "\nHoldout score:"
                  "\nfscore\tprecision\trecall\tacc\tauc"
                  "\n%s\t%s\t%s\t%s\t%s" % (
                      s.DATA_FP,
                      ablation_name,
                      str(pipe.get_params()),
                      s.__file__,
                      s.PIPE_STEPS,
                      params,
                      winscore,
                      s.SCORER_METRIC,
                      fscore,
                      precision,
                      recall,
                      acc,
                      auc,
                  ))
        print(report)
        with open(os.path.join(pipe_opt_dir, "%s_results.txt" % s.TIMESTAMP),
                  "wt") as f:
            f.write(report)
        report_as_dict = {
            "data_path": s.DATA_FP,
            "feature_groups": ablation_name,
            # 'classifier_type': str(type(clf)),
            "settings": str(s.__file__),
            "param_grid": str(s.PIPE_STEPS),
            "best_params": str(params),
            "score_grid_search": winscore,
            "metric_grid_search": s.SCORER_METRIC,
            "fscore_holdout": fscore,
            "precision_holdout": precision,
            "recall_holdout": recall,
            "acc_holdout": acc,
            "auc_holdout": auc,
            "support_holdout": support,
            "predictions_holdout": y_pred.tolist(),
            "y_true_holdout": y_out.tolist(),
        }

        with open(
                os.path.join(pipe_opt_dir, "%s_finalreport.txt" % s.TIMESTAMP),
                "wt") as f:
            f.write(report)
        with open(os.path.join(pipe_opt_dir, "report.json"), "wt") as f:
            json.dump(report_as_dict, f)

        logging.info("{}: Model selection done. Duration: {}".format(
            pipe_name.upper(), str(datetime.timedelta(seconds=elapsed))))

    logging.info("DONE.")