コード例 #1
0
def test_fit_stacking_classifier():
    """Check that ``StackingClassifier.fit`` flags the estimator as fitted.

    Loads the train/target CSV fixtures, fits a verbose classifier and asserts
    that the private ``__fitOK`` flag is truthy afterwards.
    """
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the columns axis after loading is the documented
    # replacement and also works on older pandas versions.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    stacking_classifier = StackingClassifier(verbose=True)
    stacking_classifier.fit(df_train, y_train)
    # Name-mangled access to the private ``__fitOK`` attribute.
    assert stacking_classifier._StackingClassifier__fitOK
コード例 #2
0
def test_get_params_stacking_classifier():
    """Check the default values reported by ``StackingClassifier.get_params``.

    A default-constructed classifier is expected to expose three base
    estimators, a ``LogisticRegression`` level estimator, 5 folds,
    ``copy`` falsy, ``drop_first`` truthy, ``random_state`` 1 and
    ``verbose`` truthy.
    """
    stacking_classifier = StackingClassifier()
    # Named ``params`` rather than ``dict`` so the builtin is not shadowed.
    params = stacking_classifier.get_params()
    assert len(params["base_estimators"]) == 3
    assert isinstance(params["level_estimator"], LogisticRegression)
    assert params["n_folds"] == 5
    assert not params["copy"]
    assert params["drop_first"]
    assert params["random_state"] == 1
    assert params["verbose"]
コード例 #3
0
def test_transform_stacking_classifier():
    """Check ``StackingClassifier.transform`` error handling and output width.

    ``transform`` must raise ``ValueError`` both for a ``None`` input and when
    it is called before ``fit_transform``. After ``fit_transform`` on the train
    fixtures, transforming the test fixture must yield exactly 3 columns.
    """
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the columns axis after loading is the documented
    # replacement and also works on older pandas versions.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    df_test = pd.read_csv("data_for_tests/clean_test.csv")
    stacking_classifier = StackingClassifier()
    with pytest.raises(ValueError):
        stacking_classifier.transform(None)
    # ``fit_transform`` has not been called yet at this point.
    with pytest.raises(ValueError):
        stacking_classifier.transform(df_test)
    stacking_classifier.fit_transform(df_train, y_train)
    results = stacking_classifier.transform(df_test)
    # The comparison must sit outside ``len``: ``len(results.columns == 3)``
    # measured the length of an element-wise comparison and therefore passed
    # for any non-empty frame.
    assert len(results.columns) == 3
コード例 #4
0
def test_init_stacking_classifier():
    """Exercise the ``StackingClassifier`` constructor.

    Each wrongly-typed keyword argument must be rejected with ``ValueError``;
    a default-constructed instance must expose the expected default
    attributes and must not be flagged as fitted.
    """
    # One invalid keyword argument per constructor call.
    rejected_kwargs = (
        {"base_estimators": {}},
        {"n_folds": {}},
        {"copy": "True"},
        {"drop_first": "True"},
        {"random_state": "1"},
        {"verbose": "True"},
    )
    for bad_kwargs in rejected_kwargs:
        with pytest.raises(ValueError):
            StackingClassifier(**bad_kwargs)

    clf = StackingClassifier()
    assert len(clf.base_estimators) == 3
    assert isinstance(clf.level_estimator, LogisticRegression)
    assert clf.n_folds == 5
    assert not clf.copy
    assert clf.drop_first
    assert clf.random_state == 1
    assert clf.verbose
    # Name-mangled access to the private fitted-state flags.
    assert not clf._StackingClassifier__fitOK
    assert not clf._StackingClassifier__fittransformOK
コード例 #5
0
def test_fit_transform_stacking_classifier():
    """Check ``StackingClassifier.fit_transform`` validation and fitted flag.

    ``fit_transform`` must raise ``ValueError`` when either the features or the
    target is ``None``; after a successful call the private
    ``__fittransformOK`` flag must be truthy.
    """
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the columns axis after loading is the documented
    # replacement and also works on older pandas versions.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    stacking_classifier = StackingClassifier()
    with pytest.raises(ValueError):
        stacking_classifier.fit_transform(None, y_train)
    with pytest.raises(ValueError):
        stacking_classifier.fit_transform(df_train, None)
    stacking_classifier.fit_transform(df_train, y_train)
    # Name-mangled access to the private ``__fittransformOK`` attribute.
    assert stacking_classifier._StackingClassifier__fittransformOK
コード例 #6
0
def test_predict_stacking_classifier():
    """Check ``StackingClassifier.predict`` before and after fitting.

    ``predict`` must raise ``ValueError`` when called before ``fit``; after
    fitting on the train fixtures it must return a one-dimensional result of
    shape ``(418,)`` for the test fixture.
    """
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    # ``read_csv(..., squeeze=True)`` was deprecated in pandas 1.4 and removed
    # in 2.0; squeezing the columns axis after loading is the documented
    # replacement and also works on older pandas versions.
    y_train = pd.read_csv("data_for_tests/clean_target.csv").squeeze("columns")
    df_test = pd.read_csv("data_for_tests/clean_test.csv")
    stacking_classifier = StackingClassifier()
    # ``fit`` has not been called yet at this point.
    with pytest.raises(ValueError):
        stacking_classifier.predict(df_test)
    stacking_classifier.fit(df_train, y_train)
    results = stacking_classifier.predict(df_test)
    assert np.shape(results) == (418, )
コード例 #7
0
ファイル: predictor.py プロジェクト: MetaLearners/Auto-Stream
    def fit_predict(self, params, df):
        """Fit the whole pipeline on the train set and predict on the test set.

        The task is inferred from the dtype of ``df['target']`` ('int' for a
        classification, 'float' for a regression). The fitted pipeline is
        stored in ``self.pp``. When the final estimator exposes feature
        importances, they are saved as a .png file under ``self.to_path``
        (and plotted when ``self.verbose`` is set).

        Parameters
        ----------
        params : dict or None
            Hyper-parameters dictionary for the whole pipeline. ``None`` keeps
            the default configuration of every step.
            - The keys must respect the following syntax : "enc__param".
                - "enc" = "ne" for na encoder
                - "enc" = "ce" for categorical encoder
                - "enc" = "fs" for feature selector [OPTIONAL]
                - "enc" = "stck"+str(i) to add layer number i of
                  meta-features [OPTIONAL]
                - "enc" = "est" for the final estimator
                - "param" : a correct associated parameter for each step.
                  Ex: "max_depth" for "enc"="est", ...
            - The values are those of the parameters.
              Ex: 4 for key = "est__max_depth", ...
        df : dict
            Dataset dictionary. Must contain keys and values:
            - "train": pandas DataFrame for the train set.
            - "test" : pandas DataFrame for the test set.
            - "target" : encoded pandas Serie for the target on train set
              (with dtype='float' for a regression or dtype='int' for a
              classification). Indexes should match the train set.

        Returns
        -------
        numpy.ndarray or None
            - classification : values of the column labelled '1' of the
              ``predict_proba`` output on the test set (the output is
              assumed to have exactly two columns).
            - regression : predicted values on the test set.
            - None when the test set is empty (a warning is issued).

        Raises
        ------
        ValueError
            If ``self.to_path`` is None, if the task cannot be determined
            from the target dtype, or if the pipeline cannot be configured
            with ``params``, cannot be fitted or cannot predict.
        """

        if self.to_path is None:
            raise ValueError("You must specify a path to save your model "
                             "and your predictions")

        ne = NA_encoder()
        ce = Categorical_encoder()

        ##########################################
        #    Automatically checking the task
        ##########################################

        target = df['target']
        is_classification = (target.dtype == 'int')
        is_regression = (target.dtype == 'float')

        if is_classification:
            est = Classifier()
            selector_cls = Clf_feature_selector
            stacker_cls = StackingClassifier
        elif is_regression:
            est = Regressor()
            selector_cls = Reg_feature_selector
            stacker_cls = StackingRegressor
        else:
            raise ValueError("Impossible to determine the task. "
                             "Please check that your target is encoded.")

        param_names = list(params) if params is not None else []

        # Feature selection only if at least one "fs__*" parameter is given.
        fs = None
        if any(name.startswith("fs__") for name in param_names):
            fs = selector_cls()

        # One stacking layer per distinct "stck<i>" prefix.
        stackers = {name.split("__")[0]: stacker_cls()
                    for name in param_names if name.startswith("stck")}

        ##########################################
        #          Creating the Pipeline
        ##########################################

        # Transformers are cached for entity embeddings, for any feature
        # selection strategy other than "variance", and whenever stacking
        # layers are present.
        cache = bool(stackers)
        if (params is not None
                and "ce__strategy" in params
                and params["ce__strategy"] == "entity_embedding"):
            cache = True
        if (fs is not None
                and "fs__strategy" in params
                and params["fs__strategy"] != "variance"):
            cache = True

        pipe = [("ne", ne), ("ce", ce)]
        if fs is not None:
            pipe.append(("fs", fs))
        # Stacking layers are appended in sorted name order.
        pipe.extend((name, stackers[name]) for name in sorted(stackers))
        pipe.append(("est", est))

        if cache:
            pp = Pipeline(pipe, memory=self.to_path)
        else:
            pp = Pipeline(pipe)

        ##########################################
        #          Fitting the Pipeline
        ##########################################

        start_time = time.time()

        if params is None:
            # No params : default configuration
            print("")
            print('> No parameters set. Default configuration is tested')
        else:
            try:
                pp = pp.set_params(**params)
            except Exception:
                raise ValueError("Pipeline cannot be set with these parameters."
                                 " Check the name of your stages.")

        try:
            if self.verbose:
                print("")
                print("fitting the pipeline ...")

            pp.fit(df['train'], target)

            if self.verbose:
                print("CPU time: %s seconds" % (time.time() - start_time))

            try:
                os.mkdir(self.to_path)
            except OSError:
                # Best effort: typically the directory already exists.
                pass

            # Feature importances are optional: a failure only warns.
            try:
                importance = est.feature_importances()
                self.__save_feature_importances(importance,
                                                self.to_path
                                                + "/"
                                                + est.get_params()["strategy"]
                                                + "_feature_importance.png")

                if self.verbose:
                    self.__plot_feature_importances(importance, 10)
                    print("")
                    print("> Feature importances dumped into directory : "
                          + self.to_path)

            except Exception:
                warnings.warn("Unable to get feature importances !")

        except Exception:
            raise ValueError("Pipeline cannot be fitted")

        ##########################################
        #               Predicting
        ##########################################

        # Stays None when there is nothing to predict on, so that the final
        # return never touches an unbound name.
        predictions = None

        if df["test"].shape[0] == 0:
            warnings.warn("You have no test dataset. Cannot predict !")

        else:
            start_time = time.time()

            if is_classification:
                try:
                    if self.verbose:
                        print("")
                        print("predicting ...")

                    pred = pd.DataFrame(pp.predict_proba(df['test']),
                                        columns=['0', '1'],
                                        index=df['test'].index)

                except Exception:
                    raise ValueError("Can not predict")

                predictions = pred['1'].values

            else:
                # Regression: the only other task that passes the dtype check.
                column = target.name + "_predicted"
                pred = pd.DataFrame([],
                                    columns=[column],
                                    index=df['test'].index)

                try:
                    if self.verbose:
                        print("")
                        print("predicting...")

                    pred[column] = pp.predict(df['test'])

                except Exception:
                    raise ValueError("Can not predict")

                # The regression frame has no '1' column; return the
                # predicted column instead of raising KeyError.
                predictions = pred[column].values

            if self.verbose:
                print("CPU time: %s seconds" % (time.time() - start_time))

        self.pp = pp
        return predictions
コード例 #8
0
def test_set_params_stacking_classifier():
    """Exercise ``StackingClassifier.set_params``.

    Each supported parameter is updated in turn and read back from the
    matching attribute; an unknown parameter name must emit exactly one
    ``UserWarning``.
    """
    clf = StackingClassifier()

    clf.set_params(n_folds=6)
    assert clf.n_folds == 6

    clf.set_params(copy=True)
    assert clf.copy

    clf.set_params(drop_first=False)
    assert not clf.drop_first

    clf.set_params(random_state=2)
    assert clf.random_state == 2

    clf.set_params(verbose=False)
    assert not clf.verbose

    # An unrecognised parameter name is reported through a warning.
    with pytest.warns(UserWarning) as caught:
        clf.set_params(wrong_parameters=None)
    assert len(caught) == 1