import sys
from unittest import mock

import numpy as np
import pandas as pd
import pytest
from sklearn.metrics import make_scorer

from mlbox.preprocessing.reader import Reader
from mlbox.preprocessing.drift_thresholder import Drift_thresholder
from mlbox.optimisation.optimiser import Optimiser
from mlbox.prediction.predictor import Predictor


def test_clean_reader():
    """Test clean method of Reader class."""
    reader = Reader()
    with pytest.raises(ValueError):
        reader.clean(path=None, drop_duplicate=False)
    with pytest.raises(ValueError):
        reader.clean(path="data_for_tests/train.csv")
    reader = Reader(sep=",")
    df = reader.clean(path="data_for_tests/train.csv")
    assert np.shape(df) == (891, 12)
    with pytest.raises(ValueError):
        reader.clean(path="data_for_tests/train.wrong_extension")
    df_drop = reader.clean(path="data_for_tests/train.csv", drop_duplicate=True)
    assert np.shape(df_drop) == (891, 12)
    assert np.all(df["Name"] == df_drop["Name"])
    reader = Reader()
    df_excel = reader.clean(path="data_for_tests/train.xls")
    assert np.shape(df_excel) == (891, 12)
    assert np.all(df["Name"] == df_excel["Name"])
    # HDF5 reading is skipped on Windows with Python <= 3.5.
    if not (sys.platform == "win32"
            and sys.version_info[0] <= 3 and sys.version_info[1] <= 5):
        if sys.version_info[0] >= 3:
            df_hdf = reader.clean(path="data_for_tests/train.h5")
            assert np.shape(df_hdf) == (891, 12)
            assert np.all(df["Name"] == df_hdf["Name"])
    df_json = reader.clean(path="data_for_tests/train.json")
    assert np.shape(df_json) == (891, 12)
def test_train_test_split_reader():
    """Test train_test_split method of Reader class."""
    reader = Reader(sep=",")
    with pytest.raises(ValueError):
        reader.train_test_split(Lpath=None, target_name="target")
    with pytest.raises(ValueError):
        reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                target_name=None)
    with pytest.raises(ValueError):
        reader = Reader(to_path=None)
        reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                target_name="Survived")
    reader = Reader(sep=",")
    dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                   target_name="Survived")
    assert len(dict) == 3
    assert "train" in list(dict.keys())
    assert "test" in list(dict.keys())
    assert "target" in list(dict.keys())
    assert np.all(dict["train"].columns == dict["train"].columns)
    if sys.version_info[0] >= 3 and sys.platform != "win32":
        reader = Reader(to_hdf5=True)
        dict = reader.train_test_split(Lpath=["data_for_tests/train.h5"],
                                       target_name="Survived")
        assert len(dict) == 3
        assert "train" in list(dict.keys())
        assert "test" in list(dict.keys())
        assert "target" in list(dict.keys())
        assert np.all(dict["train"].columns == dict["train"].columns)
def test_evaluate_regression_optimiser():
    """Test evaluate method of Optimiser class for regression."""
    reader = Reader(sep=",")
    dict = reader.train_test_split(Lpath=["data_for_tests/train_regression.csv",
                                          "data_for_tests/test_regression.csv"],
                                   target_name="SalePrice")
    drift_thresholder = Drift_thresholder()
    drift_thresholder = drift_thresholder.fit_transform(dict)

    # Custom mean absolute percentage error (MAPE) scorer.
    mape = make_scorer(lambda y_true, y_pred: 100 * np.sum(
        np.abs(y_true - y_pred) / y_true) / len(y_true),
        greater_is_better=False,
        needs_proba=False)
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=mape, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, dict)
    assert -np.inf <= score

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=None, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, dict)
    assert -np.inf <= score

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="wrong_scoring", n_folds=3)
    assert len(record) == 1
    with pytest.warns(UserWarning) as record:
        score = opt.evaluate(None, dict)
    assert -np.inf <= score
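# Illustrative sanity check (an addition, not part of the original suite):
# verifies on a toy example that the MAPE lambda used above computes the mean
# absolute percentage error.
def test_mape_formula_toy_example():
    """Toy check of the MAPE formula used by the custom scorer above."""
    y_true = np.array([100.0, 200.0])
    y_pred = np.array([110.0, 180.0])
    # |100-110|/100 = 0.10 and |200-180|/200 = 0.10, so MAPE = 10%.
    mape_value = 100 * np.sum(np.abs(y_true - y_pred) / y_true) / len(y_true)
    assert np.isclose(mape_value, 10.0)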
def test_evaluate_classification_optimiser():
    """Test evaluate method of Optimiser class for classification."""
    reader = Reader(sep=",")
    dict = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    drift_thresholder = Drift_thresholder()
    drift_thresholder = drift_thresholder.fit_transform(dict)

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=None, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, dict)
    assert -np.inf <= score

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="roc_auc", n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, dict)
    assert 0. <= score <= 1.

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="wrong_scoring", n_folds=3)
    assert len(record) == 1
    with pytest.warns(UserWarning) as record:
        score = opt.evaluate(None, dict)
    # An unknown scoring string falls back to neg_log_loss for classification.
    assert opt.scoring == "neg_log_loss"
def test_init_reader():
    """Test init method of Reader class."""
    reader = Reader()
    assert not reader.sep
    assert reader.header == 0
    assert not reader.to_hdf5
    assert reader.to_path == "save"
    assert reader.verbose
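# Companion illustration (an addition, not part of the original suite): the
# defaults checked above can be overridden at construction time. This assumes
# Reader simply stores its constructor arguments as attributes, as the
# defaults test suggests; the argument values here are arbitrary.
def test_init_reader_custom_args():
    """Toy check that Reader keeps non-default constructor arguments."""
    reader = Reader(sep=";", header=None, to_hdf5=True,
                    to_path="tmp", verbose=False)
    assert reader.sep == ";"
    assert reader.header is None
    assert reader.to_hdf5
    assert reader.to_path == "tmp"
    assert not reader.verbose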
def test_drifts():
    """Test drifts method of Drift_thresholder class."""
    drift_thresholder = Drift_thresholder()
    with pytest.raises(ValueError):
        drift_thresholder.drifts()
    reader = Reader(sep=",")
    dict = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    drift_thresholder.fit_transform(dict)
    drifts = drift_thresholder.drifts()
    assert drifts != {}
# Assumed decorator: the ``mock_show`` argument implies matplotlib's ``show``
# is patched so the test runs headless without blocking on plot windows.
@mock.patch("matplotlib.pyplot.show")
def test_fit_predict_predictor_regression(mock_show):
    """Test fit_predict method of Predictor class for regression."""
    rd = Reader(sep=',')
    dt = rd.train_test_split(Lpath=["data_for_tests/train_regression.csv",
                                    "data_for_tests/test_regression.csv"],
                             target_name="SalePrice")
    drift_thresholder = Drift_thresholder()
    df = drift_thresholder.fit_transform(dt)
    mape = make_scorer(lambda y_true, y_pred: 100 * np.sum(
        np.abs(y_true - y_pred) / y_true) / len(y_true),
        greater_is_better=False,
        needs_proba=False)
    opt = Optimiser(scoring=mape, n_folds=3)
    opt.evaluate(None, df)
    space = {
        'ne__numerical_strategy': {"search": "choice", "space": [0]},
        'ce__strategy': {"search": "choice",
                         "space": ["label_encoding", "random_projection",
                                   "entity_embedding"]},
        'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
        'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]}
    }
    best = opt.optimise(space, df, 1)
    prd = Predictor(verbose=True)
    prd.fit_predict(best, df)
    pred_df = pd.read_csv("save/SalePrice_predictions.csv")
    assert np.all(list(pred_df.columns) ==
                  ['Unnamed: 0', 'SalePrice_predicted'])
    assert np.shape(pred_df) == (1459, 2)
def test_evaluate_and_optimise_classification():
    """Test evaluate_and_optimise method of Optimiser class."""
    reader = Reader(sep=",")
    dict = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    drift_thresholder = Drift_thresholder()
    drift_thresholder = drift_thresholder.fit_transform(dict)

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring='accuracy', n_folds=3)
    assert len(record) == 1
    # A non-numeric target must be rejected.
    dict_error = dict.copy()
    dict_error["target"] = dict_error["target"].astype(str)
    with pytest.raises(ValueError):
        score = opt.evaluate(None, dict_error)

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring='accuracy', n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, dict)
    assert 0. <= score <= 1.

    space = {
        'ne__numerical_strategy': {"search": "choice", "space": [0]},
        'ce__strategy': {"search": "choice",
                         "space": ["label_encoding", "random_projection",
                                   "entity_embedding"]},
        'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
        'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]}
    }
    best = opt.optimise(space, dict, 1)
    # ``dict`` is the local variable (a Python dict), so this checks that
    # optimise returns a dict of best hyperparameters.
    assert type(best) == type(dict)
def test_fit_predict_predictor_classification():
    """Test fit_predict method of Predictor class for classification."""
    reader = Reader(sep=",")
    dict = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    drift_thresholder = Drift_thresholder()
    drift_thresholder = drift_thresholder.fit_transform(dict)
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring='accuracy', n_folds=3)
    assert len(record) == 1
    space = {
        'ne__numerical_strategy': {"search": "choice", "space": [0]},
        'ce__strategy': {"search": "choice",
                         "space": ["label_encoding", "random_projection",
                                   "entity_embedding"]},
        'fs__threshold': {"search": "uniform", "space": [0.01, 0.3]},
        'est__max_depth': {"search": "choice", "space": [3, 4, 5, 6, 7]}
    }
    optimal_hyper_parameters = opt.optimise(space, dict, 1)
    predictor = Predictor(verbose=False)
    predictor.fit_predict(optimal_hyper_parameters, dict)
    # Predictions file: class probabilities ('0.0', '1.0') plus the label.
    pred_df = pd.read_csv("save/Survived_predictions.csv")
    assert np.all(list(pred_df.columns) ==
                  ['Unnamed: 0', '0.0', '1.0', 'Survived_predicted'])
    assert np.shape(pred_df) == (418, 4)
def test_fit_transform():
    """Test fit_transform method of Drift_thresholder class."""
    drift_thresholder = Drift_thresholder()
    reader = Reader(sep=",")
    # With no test file provided there is nothing to compare against,
    # so the fit flag must stay False.
    dict = reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                   target_name="Survived")
    drift_thresholder.fit_transform(dict)
    assert not drift_thresholder._Drift_thresholder__fitOK
    dict = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    drift_thresholder.fit_transform(dict)
    assert drift_thresholder._Drift_thresholder__fitOK
    dict = reader.train_test_split(Lpath=["data_for_tests/inplace_train.csv",
                                          "data_for_tests/inplace_test.csv"],
                                   target_name="Survived")
    drift_thresholder.inplace = True
    drift_thresholder.fit_transform(dict)
    assert drift_thresholder._Drift_thresholder__fitOK
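# Note on the assertions above: ``__fitOK`` is a double-underscore attribute
# of Drift_thresholder, so Python name-mangles it to
# ``_Drift_thresholder__fitOK`` outside the class. A minimal illustration of
# that mechanism (toy class, an addition, not part of MLBox):
def test_name_mangling_toy_example():
    """Toy check of the name mangling relied on by the tests above."""
    class Toy:
        def __init__(self):
            self.__flag = True  # stored on the instance as _Toy__flag

    assert Toy()._Toy__flag
    assert not hasattr(Toy(), "__flag")  # the unmangled name does not exist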
# Standalone usage example (a separate script, not part of the test suite above).
import warnings
warnings.filterwarnings("ignore")

import pandas as pd

from mlbox.preprocessing.reader import Reader
from mlbox.preprocessing.drift_thresholder import Drift_thresholder
from mlbox.optimisation.optimiser import Optimiser
from mlbox.prediction.predictor import Predictor
# from mlbox.encoding import Categorical_encoder
from mlbox.model.classification import StackingClassifier, Classifier

paths = ["train_1.csv", "test.csv"]
target_name = "Class"

# Read and split the raw files into train / test / target.
rd = Reader(sep=",")
df = rd.train_test_split(paths, target_name)
print(df["train"].head())

# Remove features whose distribution drifts between train and test.
dft = Drift_thresholder()
df = dft.fit_transform(df)

# Baseline evaluation with the default pipeline and scoring.
opt = Optimiser()
warnings.filterwarnings('ignore', category=DeprecationWarning)
score = opt.evaluate(None, df)

# Hyperparameter search space over the pipeline steps
# (ne = missing-value strategy, ce = categorical encoder).
space = {
    'ne__numerical_strategy': {"search": "choice", "space": [0, "mean"]},
    'ce__strategy': {"search": "choice",
                     "space": ["label_encoding", "random_projection",
                               "entity_embedding"]},
}
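# Assumed continuation, mirroring the pattern of the test functions above;
# the evaluation budget passed to optimise is an arbitrary illustrative value,
# not from the original script.
best = opt.optimise(space, df, 10)
prd = Predictor()
prd.fit_predict(best, df)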