Exemple #1
0
def test_clean_reader():
    """Check Reader.clean on invalid arguments and on each test data format."""
    csv_path = "data_for_tests/train.csv"
    expected_shape = (891, 12)

    # A reader built without a separator is expected to reject both a
    # missing path and a CSV file.
    plain_reader = Reader()
    with pytest.raises(ValueError):
        plain_reader.clean(path=None, drop_duplicate=False)
    with pytest.raises(ValueError):
        plain_reader.clean(path=csv_path)

    # With an explicit separator the CSV loads with the expected shape.
    csv_reader = Reader(sep=",")
    df_csv = csv_reader.clean(path=csv_path)
    assert df_csv.shape == expected_shape

    # Unknown file extensions are refused.
    with pytest.raises(ValueError):
        csv_reader.clean(path="data_for_tests/train.wrong_extension")

    # Dropping duplicates leaves the shape and the "Name" column unchanged.
    df_dedup = csv_reader.clean(path=csv_path, drop_duplicate=True)
    assert df_dedup.shape == expected_shape
    assert np.all(df_csv["Name"] == df_dedup["Name"])

    # The remaining formats are read with a default-constructed reader.
    other_reader = Reader()
    df_xls = other_reader.clean(path="data_for_tests/train.xls")
    assert df_xls.shape == expected_shape
    assert np.all(df_csv["Name"] == df_xls["Name"])

    # The HDF5 and JSON checks are skipped on win32 when the major version
    # is <= 3 and the minor version is <= 5.
    skip_hdf_and_json = (sys.platform == "win32"
                         and sys.version_info[0] <= 3
                         and sys.version_info[1] <= 5)
    if skip_hdf_and_json:
        return

    # HDF5 is only exercised on Python 3 and later.
    if sys.version_info[0] >= 3:
        df_h5 = other_reader.clean(path="data_for_tests/train.h5")
        assert df_h5.shape == expected_shape
        assert np.all(df_csv["Name"] == df_h5["Name"])

    df_js = other_reader.clean(path="data_for_tests/train.json")
    assert df_js.shape == expected_shape
Exemple #2
0
def test_train_test_split_reader():
    """Test train_test_split method of Reader class.

    Checks that ValueError is raised for a missing path list, a missing
    target name and a reader built with ``to_path=None``, then checks the
    keys of the returned mapping and that the train and test frames expose
    the same columns (for CSV input and, where enabled, HDF5 input).
    """

    def _check_split(split):
        """Assert the split has the three expected keys and aligned columns."""
        assert len(split) == 3
        assert "train" in split
        assert "test" in split
        assert "target" in split
        # Compare train against test: comparing train with itself (as the
        # previous version did) can never fail.
        assert np.all(split["train"].columns == split["test"].columns)

    reader = Reader(sep=",")
    with pytest.raises(ValueError):
        reader.train_test_split(Lpath=None, target_name="target")
    with pytest.raises(ValueError):
        reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                target_name=None)
    with pytest.raises(ValueError):
        reader = Reader(to_path=None)
        reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                target_name="Survived")

    reader = Reader(sep=",")
    split = reader.train_test_split(Lpath=["data_for_tests/train.csv"],
                                    target_name="Survived")
    _check_split(split)

    # The HDF5 round trip is only exercised on Python 3+ outside win32.
    if sys.version_info[0] >= 3 and sys.platform != "win32":
        reader = Reader(to_hdf5=True)
        split = reader.train_test_split(Lpath=["data_for_tests/train.h5"],
                                        target_name="Survived")
        _check_split(split)
Exemple #3
0
def test_evaluate_regression_optimiser():
    """Test evaluate method of Optimiser class for regression.

    Builds an Optimiser with a custom scorer, with no scorer and with an
    invalid scorer name, checks that each construction emits exactly one
    UserWarning, and checks the score returned by ``evaluate``.
    """
    reader = Reader(sep=",")
    data = reader.train_test_split(
        Lpath=[
            "data_for_tests/train_regression.csv",
            "data_for_tests/test_regression.csv"
        ],
        target_name="SalePrice")
    # The transformed output is not used below; the call is kept so the
    # test still exercises fit_transform on the split data.
    Drift_thresholder().fit_transform(data)

    # Mean absolute percentage error, registered as a loss (lower is better).
    mape = make_scorer(lambda y_true, y_pred: 100 * np.sum(
        np.abs(y_true - y_pred) / y_true) / len(y_true),
                       greater_is_better=False,
                       needs_proba=False)

    # np.inf is used instead of np.Inf: the capitalised alias was removed
    # in NumPy 2.0. For a float score the comparison only fails on NaN.
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=mape, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, data)
    assert -np.inf <= score

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=None, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, data)
    assert -np.inf <= score

    # An unknown scoring name warns at construction and again at evaluation.
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="wrong_scoring", n_folds=3)
    assert len(record) == 1
    with pytest.warns(UserWarning):
        score = opt.evaluate(None, data)
    assert -np.inf <= score
Exemple #4
0
def test_evaluate_classification_optimiser():
    """Test evaluate method of Optimiser class for classification.

    Builds an Optimiser with no scorer, with ``"roc_auc"`` and with an
    invalid scorer name, checks that each construction emits exactly one
    UserWarning, and checks the outcome of ``evaluate``.
    """
    reader = Reader(sep=",")
    data = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    # The transformed output is not used below; the call is kept so the
    # test still exercises fit_transform on the split data.
    Drift_thresholder().fit_transform(data)

    # np.inf is used instead of np.Inf: the capitalised alias was removed
    # in NumPy 2.0. For a float score the comparison only fails on NaN.
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring=None, n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, data)
    assert -np.inf <= score

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="roc_auc", n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, data)
    assert 0. <= score <= 1.

    # An unknown scoring name warns at construction and again at
    # evaluation, after which the optimiser reports "neg_log_loss".
    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring="wrong_scoring", n_folds=3)
    assert len(record) == 1
    with pytest.warns(UserWarning):
        opt.evaluate(None, data)
    assert opt.scoring == "neg_log_loss"
Exemple #5
0
def test_init_reader():
    """Check the attributes of a Reader constructed with no arguments."""
    default_reader = Reader()
    # Attributes expected to be falsy by default.
    assert not default_reader.sep
    assert not default_reader.to_hdf5
    # Attributes expected to carry a specific or truthy default.
    assert default_reader.header == 0
    assert default_reader.to_path == "save"
    assert default_reader.verbose
def test_drifts():
    """Check Drift_thresholder.drifts before and after fitting."""
    thresholder = Drift_thresholder()

    # Asking for drifts before any fit is expected to raise.
    with pytest.raises(ValueError):
        thresholder.drifts()

    csv_reader = Reader(sep=",")
    data = csv_reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    thresholder.fit_transform(data)

    # After fitting on train and test data the result must not be empty.
    computed_drifts = thresholder.drifts()
    assert computed_drifts != {}
Exemple #7
0
    def test_fit_predict_predictor_regression(mock_show):
        """Check Predictor.fit_predict end to end on the regression data."""

        def _mape(y_true, y_pred):
            """Return the mean absolute percentage error, in percent."""
            return 100 * np.sum(
                np.abs(y_true - y_pred) / y_true) / len(y_true)

        csv_reader = Reader(sep=',')
        raw_data = csv_reader.train_test_split(
            Lpath=[
                "data_for_tests/train_regression.csv",
                "data_for_tests/test_regression.csv"
            ],
            target_name="SalePrice")

        thresholder = Drift_thresholder()
        data = thresholder.fit_transform(raw_data)

        scorer = make_scorer(_mape,
                             greater_is_better=False,
                             needs_proba=False)
        optimiser = Optimiser(scoring=scorer, n_folds=3)
        optimiser.evaluate(None, data)

        # Hyper-parameter search space handed to the optimiser.
        search_space = {
            'ne__numerical_strategy': {
                "search": "choice",
                "space": [0]
            },
            'ce__strategy': {
                "search": "choice",
                "space": [
                    "label_encoding",
                    "random_projection",
                    "entity_embedding",
                ]
            },
            'fs__threshold': {
                "search": "uniform",
                "space": [0.01, 0.3]
            },
            'est__max_depth': {
                "search": "choice",
                "space": [3, 4, 5, 6, 7]
            }
        }
        best_params = optimiser.optimise(search_space, data, 1)

        predictor = Predictor(verbose=True)
        predictor.fit_predict(best_params, data)

        # The predictions file is read back from the "save" directory.
        predictions = pd.read_csv("save/SalePrice_predictions.csv")
        expected_columns = ['Unnamed: 0', 'SalePrice_predicted']
        assert list(predictions.columns) == expected_columns
        assert predictions.shape == (1459, 2)
def test_evaluate_and_optimise_classification():
    """Test evaluate and optimise methods of Optimiser for classification.

    Checks that ``evaluate`` raises ValueError when the target is cast to
    strings, that it returns a score in [0, 1] on the untouched data with
    ``"accuracy"`` scoring, and that ``optimise`` returns a dict.
    """
    reader = Reader(sep=",")

    data = reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    # The transformed output is not used below; the call is kept so the
    # test still exercises fit_transform on the split data.
    Drift_thresholder().fit_transform(data)

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring='accuracy', n_folds=3)
    assert len(record) == 1
    # Copy the mapping and rebind only its "target" key, so ``data`` keeps
    # its original target for the checks further down.
    data_error = data.copy()
    data_error["target"] = data_error["target"].astype(str)
    with pytest.raises(ValueError):
        opt.evaluate(None, data_error)

    with pytest.warns(UserWarning) as record:
        opt = Optimiser(scoring='accuracy', n_folds=3)
    assert len(record) == 1
    score = opt.evaluate(None, data)
    assert 0. <= score <= 1.

    space = {
        'ne__numerical_strategy': {
            "search": "choice",
            "space": [0]
        },
        'ce__strategy': {
            "search": "choice",
            "space":
            ["label_encoding", "random_projection", "entity_embedding"]
        },
        'fs__threshold': {
            "search": "uniform",
            "space": [0.01, 0.3]
        },
        'est__max_depth': {
            "search": "choice",
            "space": [3, 4, 5, 6, 7]
        }
    }

    best = opt.optimise(space, data, 1)
    # isinstance replaces ``type(best) == type(dict)``, which only worked
    # because a local variable named ``dict`` shadowed the builtin.
    assert isinstance(best, dict)
Exemple #9
0
def test_fit_predict_predictor_classification():
    """Check Predictor.fit_predict end to end on the classification data."""
    csv_reader = Reader(sep=",")
    data = csv_reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    # The transformed output is unused; later steps work on ``data``.
    thresholder = Drift_thresholder()
    thresholder.fit_transform(data)

    # Building the optimiser must emit exactly one UserWarning.
    with pytest.warns(UserWarning) as caught:
        optimiser = Optimiser(scoring='accuracy', n_folds=3)
    assert len(caught) == 1

    # Hyper-parameter search space handed to the optimiser.
    search_space = {
        'ne__numerical_strategy': {
            "search": "choice",
            "space": [0]
        },
        'ce__strategy': {
            "search": "choice",
            "space": [
                "label_encoding",
                "random_projection",
                "entity_embedding",
            ]
        },
        'fs__threshold': {
            "search": "uniform",
            "space": [0.01, 0.3]
        },
        'est__max_depth': {
            "search": "choice",
            "space": [3, 4, 5, 6, 7]
        }
    }
    best_params = optimiser.optimise(search_space, data, 1)

    Predictor(verbose=False).fit_predict(best_params, data)

    # The predictions file is read back from the "save" directory.
    predictions = pd.read_csv("save/Survived_predictions.csv")
    expected_columns = ['Unnamed: 0', '0.0', '1.0', 'Survived_predicted']
    assert list(predictions.columns) == expected_columns
    assert predictions.shape == (418, 4)
def test_fit_transform():
    """Check the fitted flag of Drift_thresholder after fit_transform."""
    thresholder = Drift_thresholder()
    csv_reader = Reader(sep=",")

    # Train file only: the thresholder must not be flagged as fitted.
    train_only = csv_reader.train_test_split(
        Lpath=["data_for_tests/train.csv"],
        target_name="Survived")
    thresholder.fit_transform(train_only)
    assert not thresholder._Drift_thresholder__fitOK

    # Train and test files: the thresholder must be flagged as fitted.
    train_and_test = csv_reader.train_test_split(
        Lpath=["data_for_tests/train.csv", "data_for_tests/test.csv"],
        target_name="Survived")
    thresholder.fit_transform(train_and_test)
    assert thresholder._Drift_thresholder__fitOK

    # Same check with the in-place option switched on, using the
    # dedicated "inplace" fixture files.
    inplace_data = csv_reader.train_test_split(
        Lpath=[
            "data_for_tests/inplace_train.csv",
            "data_for_tests/inplace_test.csv"
        ],
        target_name="Survived")
    thresholder.inplace = True
    thresholder.fit_transform(inplace_data)
    assert thresholder._Drift_thresholder__fitOK
Exemple #11
0
import warnings
warnings.filterwarnings("ignore")

from mlbox.preprocessing.reader import Reader
from mlbox.preprocessing.drift_thresholder import Drift_thresholder
from mlbox.optimisation.optimiser import Optimiser 
from mlbox.prediction.predictor import Predictor
#from mlbox.encoding import Categorical_encoder
from mlbox.model.classification import StackingClassifier, Classifier
import pandas as pd 

# Input file names passed to the reader, and the name of the target column.
paths = ["train_1.csv", "test.csv"]
target_name = "Class"

# Split the input files with a comma-separated reader. The result is
# indexed by "train" below to preview the first rows of that frame.
rd = Reader(sep=",")
df = rd.train_test_split(paths, target_name)
print(df["train"].head())

# Run the drift thresholder over the split data and keep its output.
# NOTE(review): presumably this drops features that drift between train
# and test — confirm against the Drift_thresholder documentation.
dft = Drift_thresholder()
df = dft.fit_transform(df)

# Evaluate an Optimiser built with its default settings.
# NOTE(review): passing None as the first argument presumably selects the
# default pipeline configuration — confirm.
opt = Optimiser()
# Silence DeprecationWarning before running the evaluation.
warnings.filterwarnings('ignore', category=DeprecationWarning)
score = opt.evaluate(None, df)

space = {
        'ne__numerical_strategy':{"search":"choice",
                                 "space":[0, "mean"]},
        'ce__strategy':{"search":"choice",
                        "space":["label_encoding", "random_projection", "entity_embedding"]},