def test_fit_stacking_classifier():
    """Test fit method of StackingClassifier class."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    stacking_classifier = StackingClassifier(verbose=True)
    stacking_classifier.fit(df_train, y_train)
    assert stacking_classifier._StackingClassifier__fitOK
def test_get_params_stacking_classifier():
    """Test get_params method of StackingClassifier class."""
    stacking_classifier = StackingClassifier()
    params = stacking_classifier.get_params()
    assert len(params["base_estimators"]) == 3
    assert isinstance(params["level_estimator"], type(LogisticRegression()))
    assert params["n_folds"] == 5
    assert not params["copy"]
    assert params["drop_first"]
    assert params["random_state"] == 1
    assert params["verbose"]
def test_transform_stacking_classifier():
    """Test transform method of StackingClassifier class."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    df_test = pd.read_csv("data_for_tests/clean_test.csv")
    stacking_classifier = StackingClassifier()
    with pytest.raises(ValueError):
        stacking_classifier.transform(None)
    with pytest.raises(ValueError):
        stacking_classifier.transform(df_test)
    stacking_classifier.fit_transform(df_train, y_train)
    results = stacking_classifier.transform(df_test)
    assert len(results.columns) == 3
def test_init_stacking_classifier():
    """Test init method of StackingClassifier class."""
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(base_estimators=dict())
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(n_folds=dict())
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(copy="True")
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(drop_first="True")
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(random_state="1")
    with pytest.raises(ValueError):
        stacking_classifier = StackingClassifier(verbose="True")
    stacking_classifier = StackingClassifier()
    assert len(stacking_classifier.base_estimators) == 3
    assert isinstance(stacking_classifier.level_estimator,
                      type(LogisticRegression()))
    assert stacking_classifier.n_folds == 5
    assert not stacking_classifier.copy
    assert stacking_classifier.drop_first
    assert stacking_classifier.random_state == 1
    assert stacking_classifier.verbose
    assert not stacking_classifier._StackingClassifier__fitOK
    assert not stacking_classifier._StackingClassifier__fittransformOK
def test_fit_transform_stacking_classifier():
    """Test fit_transform method of StackingClassifier class."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    stacking_classifier = StackingClassifier()
    with pytest.raises(ValueError):
        stacking_classifier.fit_transform(None, y_train)
    with pytest.raises(ValueError):
        stacking_classifier.fit_transform(df_train, None)
    stacking_classifier.fit_transform(df_train, y_train)
    assert stacking_classifier._StackingClassifier__fittransformOK
def test_predict_stacking_classifier():
    """Test predict method of StackingClassifier class."""
    df_train = pd.read_csv("data_for_tests/clean_train.csv")
    y_train = pd.read_csv("data_for_tests/clean_target.csv", squeeze=True)
    df_test = pd.read_csv("data_for_tests/clean_test.csv")
    stacking_classifier = StackingClassifier()
    with pytest.raises(ValueError):
        stacking_classifier.predict(df_test)
    stacking_classifier.fit(df_train, y_train)
    results = stacking_classifier.predict(df_test)
    assert np.shape(results) == (418,)
def fit_predict(self, params, df):
    """Fits the model and predicts on the test set.

    Also outputs feature importances and the submission file
    (.png and .csv format).

    Parameters
    ----------
    params : dict, default = None
        Hyper-parameters dictionary for the whole pipeline.

        - The keys must respect the following syntax : "enc__param".

            - "enc" = "ne" for na encoder
            - "enc" = "ce" for categorical encoder
            - "enc" = "fs" for feature selector [OPTIONAL]
            - "enc" = "stck"+str(i) to add layer n°i of meta-features [OPTIONAL]
            - "enc" = "est" for the final estimator
            - "param" : a correct associated parameter for each step.
              Ex: "max_depth" for "enc"="est", ...

        - The values are those of the parameters.
          Ex: 4 for key = "est__max_depth", ...

    df : dict, default = None
        Dataset dictionary. Must contain keys and values:

        - "train" : pandas DataFrame for the train set.
        - "test" : pandas DataFrame for the test set.
        - "target" : encoded pandas Series for the target on train set
          (with dtype='float' for a regression or dtype='int' for a
          classification). Indexes should match the train set.

    Returns
    -------
    np.ndarray
        Predicted probabilities of class '1' on the test set
        (classification task).
    """
    if self.to_path is None:
        raise ValueError("You must specify a path to save your model "
                         "and your predictions")

    else:

        ne = NA_encoder()
        ce = Categorical_encoder()

        ##########################################
        #    Automatically checking the task
        ##########################################

        ##########################################
        #             Classification
        ##########################################

        if df['target'].dtype == 'int':

            # Estimator
            est = Classifier()

            # Feature selection if specified
            fs = None
            if params is not None:
                for p in params.keys():
                    if p.startswith("fs__"):
                        fs = Clf_feature_selector()

            # Stacking if specified
            STCK = {}
            if params is not None:
                for p in params.keys():
                    if p.startswith("stck"):
                        STCK[p.split("__")[0]] = StackingClassifier()

        ##########################################
        #               Regression
        ##########################################

        elif df['target'].dtype == 'float':

            # Estimator
            est = Regressor()

            # Feature selection if specified
            fs = None
            if params is not None:
                for p in params.keys():
                    if p.startswith("fs__"):
                        fs = Reg_feature_selector()

            # Stacking if specified
            STCK = {}
            if params is not None:
                for p in params.keys():
                    if p.startswith("stck"):
                        STCK[p.split("__")[0]] = StackingRegressor()

        else:
            raise ValueError("Impossible to determine the task. "
                             "Please check that your target is encoded.")

        ##########################################
        #          Creating the Pipeline
        ##########################################

        pipe = [("ne", ne), ("ce", ce)]

        # Do we need to cache transformers?
        cache = False

        if params is not None:
            if "ce__strategy" in params:
                if params["ce__strategy"] == "entity_embedding":
                    cache = True

        if fs is not None:
            if "fs__strategy" in params:
                if params["fs__strategy"] != "variance":
                    cache = True

        if len(STCK) != 0:
            cache = True

        # Pipeline creation
        if fs is not None:
            pipe.append(("fs", fs))

        for stck in np.sort(list(STCK)):
            pipe.append((stck, STCK[stck]))

        pipe.append(("est", est))

        if cache:
            pp = Pipeline(pipe, memory=self.to_path)
        else:
            pp = Pipeline(pipe)

        ##########################################
        #          Fitting the Pipeline
        ##########################################

        start_time = time.time()

        # No params : default configuration
        if params is None:
            print("")
            print("> No parameters set. Default configuration is tested")
            set_params = True

        else:
            try:
                pp = pp.set_params(**params)
                set_params = True
            except:
                set_params = False

        if set_params:
            try:
                if self.verbose:
                    print("")
                    print("fitting the pipeline ...")

                pp.fit(df['train'], df['target'])

                if self.verbose:
                    print("CPU time: %s seconds" % (time.time() - start_time))

                try:
                    os.mkdir(self.to_path)
                except OSError:
                    pass

                # Feature importances
                try:
                    importance = est.feature_importances()
                    self.__save_feature_importances(
                        importance,
                        self.to_path + "/" + est.get_params()["strategy"]
                        + "_feature_importance.png")

                    if self.verbose:
                        self.__plot_feature_importances(importance, 10)
                        print("")
                        print("> Feature importances dumped into directory : "
                              + self.to_path)

                except:
                    warnings.warn("Unable to get feature importances !")

            except:
                raise ValueError("Pipeline cannot be fitted")
        else:
            raise ValueError("Pipeline cannot be set with these parameters."
                             " Check the name of your stages.")

        ##########################################
        #               Predicting
        ##########################################

        if df["test"].shape[0] == 0:
            warnings.warn("You have no test dataset. Cannot predict !")

        else:

            start_time = time.time()

            ##########################################
            #             Classification
            ##########################################

            if df['target'].dtype == 'int':

                '''
                enc_name = "target_encoder.obj"

                try:
                    fhand = open(self.to_path + "/" + enc_name, 'rb')
                    enc = pickle.load(fhand)
                    fhand.close()
                except:
                    raise ValueError("Unable to load '" + enc_name
                                     + "' from directory : " + self.to_path)
                '''

                try:
                    if self.verbose:
                        print("")
                        print("predicting ...")

                    pred = pd.DataFrame(pp.predict_proba(df['test']),
                                        columns=['0', '1'],
                                        index=df['test'].index)

                except:
                    raise ValueError("Can not predict")

            ##########################################
            #               Regression
            ##########################################

            elif df['target'].dtype == 'float':

                pred = pd.DataFrame([],
                                    columns=[df['target'].name + "_predicted"],
                                    index=df['test'].index)

                try:
                    if self.verbose:
                        print("")
                        print("predicting...")

                    pred[df['target'].name + "_predicted"] = pp.predict(df['test'])  # noqa

                except:
                    raise ValueError("Can not predict")

            if self.verbose:
                print("CPU time: %s seconds" % (time.time() - start_time))

        self.pp = pp

        return pred['1'].values
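# ---------------------------------------------------------------------------
# Minimal usage sketch for fit_predict, kept as a comment since it is not part
# of the original module. It only illustrates the "enc__param" key syntax and
# the df dictionary format described in the docstring above. The Predictor
# class name, the to_path value and the chosen parameter values are
# illustrative assumptions; the CSV paths are the ones used by the tests in
# this file.
#
#   df = {"train": pd.read_csv("data_for_tests/clean_train.csv"),
#         "test": pd.read_csv("data_for_tests/clean_test.csv"),
#         "target": pd.read_csv("data_for_tests/clean_target.csv",
#                               squeeze=True)}   # int dtype -> classification
#
#   params = {"fs__strategy": "variance",   # optional feature selector step
#             "stck1__n_folds": 5,          # optional stacking layer n°1
#             "est__max_depth": 4}          # final estimator parameter
#
#   predictor = Predictor(to_path="save", verbose=True)   # hypothetical owner class
#   proba_class_1 = predictor.fit_predict(params, df)
# ---------------------------------------------------------------------------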
def test_set_params_stacking_classifier():
    """Test set_params method of StackingClassifier class."""
    stacking_classifier = StackingClassifier()
    stacking_classifier.set_params(n_folds=6)
    assert stacking_classifier.n_folds == 6
    stacking_classifier.set_params(copy=True)
    assert stacking_classifier.copy
    stacking_classifier.set_params(drop_first=False)
    assert not stacking_classifier.drop_first
    stacking_classifier.set_params(random_state=2)
    assert stacking_classifier.random_state == 2
    stacking_classifier.set_params(verbose=False)
    assert not stacking_classifier.verbose
    with pytest.warns(UserWarning) as record:
        stacking_classifier.set_params(wrong_parameters=None)
    assert len(record) == 1