Ejemplo n.º 1
0
 def test_validate_hdl2(self):
     """Check that an unknown ``highR_nan`` imputer package is rejected.

     ``"operate.pop"`` is passed through ``included_highR_nan_imputers``;
     the test expects ``fit`` to raise, and compares the exception text
     with the message naming the offending step and package.
     """
     train_df, test_df = datasets.load("titanic", return_train_test=True)
     trained_pipeline = AutoFlowClassifier(
         initial_runs=1,
         run_limit=3,
         n_jobs=1,
         included_highR_nan_imputers=["operate.pop"],
         debug=True,
         n_jobs_in_algorithm=5,
         resource_manager=self.mock_resource_manager)
     column_descriptions = {
         "id": "PassengerId",
         "target": "Survived",
         "text": "Name"
     }
     # The previous ``assert Exception(...)`` was always truthy, so a fit
     # that raised nothing let the test pass. ``assertRaises`` fails the
     # test explicitly when no exception is raised.
     with self.assertRaises(Exception) as raised:
         trained_pipeline.fit(X_train=train_df,
                              X_test=test_df,
                              column_descriptions=column_descriptions,
                              splitter=KFold(n_splits=3,
                                             shuffle=True,
                                             random_state=42),
                              fit_ensemble_params=True,
                              is_not_realy_run=True)
     self.assertEqual(
         str(raised.exception),
         "In step 'highR_nan->nan', user defined packege : 'operate.pop' does not exist!"
     )
Ejemplo n.º 2
0
 def test_3(self):
     """Upload containers and check that the downloaded copies match.

     Covers a ``DataFrameContainer`` built from the titanic frame (with a
     reversed index) and an ``NdArrayContainer`` built from a plain list.
     """
     frame = load("titanic")
     frame.index = reversed(frame.index)
     uploaded = DataFrameContainer(
         dataset_instance=frame,
         resource_manager=self.http_mock_resource_manager)
     groups = [f"feat_{col_no}" for col_no in range(uploaded.shape[1])]
     uploaded.set_feature_groups(groups)
     # The value is not used below; the attribute read is kept as in the
     # original test.
     _descriptions = uploaded.column_descriptions
     uploaded.upload()
     fetched = DataFrameContainer(
         "Unittest",
         dataset_id=uploaded.dataset_id,
         resource_manager=self.http_mock_resource_manager)

     # NaN != NaN, so missing values are filled before the cell-wise compare.
     same_cells = fetched.data.fillna(0) == uploaded.data.fillna(0)
     self.assertTrue(np.all(same_cells))
     # Element-wise comparisons, reduced with np.all.
     for attr in ("feature_groups", "columns", "index"):
         self.assertTrue(
             np.all(getattr(fetched, attr) == getattr(uploaded, attr)))
     # Plain equality comparisons.
     for attr in ("column_descriptions", "columns_mapper", "dataset_type",
                  "dataset_source"):
         self.assertEqual(getattr(fetched, attr), getattr(uploaded, attr))

     # Same round trip for a one-dimensional array container.
     values = [1, 2, 3, 4, 5]
     array_uploaded = NdArrayContainer(
         dataset_instance=values,
         resource_manager=self.http_mock_resource_manager)
     array_uploaded.upload()
     array_fetched = NdArrayContainer(
         dataset_id=array_uploaded.dataset_id,
         resource_manager=self.http_mock_resource_manager)
     self.assertTrue(np.all(array_fetched.data == np.array(values)))
Ejemplo n.º 3
0
 def test_set_dirty_columns(self):
     """Every column is named ``"@"``; expect ``col``, ``col_1`` .. ``col_11``."""
     frame = load("titanic")
     frame.columns = ["@"] * len(frame.columns)
     container = DataFrameContainer(
         dataset_instance=frame,
         resource_manager=self.mock_resource_manager)
     # First name has no suffix, the remaining eleven are numbered from 1.
     expected = Index(["col"] + [f"col_{n}" for n in range(1, 12)],
                      dtype='object')
     self.assertTrue(np.all(container.columns == expected))
Ejemplo n.º 4
0
 def test_set_same_column(self):
     """Duplicate and ``"@"`` column names are expected to come back unique.

     Three columns are named ``"same"``, two ``"ok"`` and the rest ``"@"``;
     the test compares the container's columns with the expected suffixed
     names.
     """
     frame = load("titanic")
     names = ["@"] * len(frame.columns)
     for position, label in ((1, "same"), (2, "same"), (3, "same"),
                             (5, "ok"), (6, "ok")):
         names[position] = label
     frame.columns = names
     container = DataFrameContainer(
         dataset_instance=frame,
         resource_manager=self.mock_resource_manager)
     expected = Index(
         ['col', 'same_1', 'same_2', 'same_3', 'col_1', 'ok_1', 'ok_2']
         + [f"col_{n}" for n in range(2, 7)],
         dtype='object')
     self.assertTrue(np.all(container.columns == expected))
 def test_draw_only_estimator(self):
     """Build an estimator-only workflow space and dump it to disk.

     Writes ``test_draw_workspace.html``, ``.xlsx`` and ``.gv`` into the
     current working directory, then invokes the ``dot`` command to render
     the ``.gv`` file to ``.png``. The exit status of ``dot`` is not checked.
     """
     name = "test_draw_workspace"
     train_df = load("qsar")
     remain_cols = list(train_df.columns)
     remain_cols.remove("target")
     column_descriptions = {"num": remain_cols, "target": "target"}
     data_manager = DataManager(self.mock_resource_manager,
                                train_df,
                                column_descriptions=column_descriptions)
     hdl_constructor = HDL_Constructor(
         DAG_workflow={"num->target": ["lightgbm", "catboost"]})
     hdl_constructor.run(data_manager)
     hdl_df = hdl_constructor.get_hdl_dataframe()
     Path(f"{name}.html").write_text(hdl_df.to_html())
     # Original note for the Excel export: pip install openpyxl
     hdl_df.to_excel(f"{name}.xlsx")
     print(hdl_df)
     graph = hdl_constructor.draw_workflow_space()
     # write_text closes the file before ``dot`` reads it; the previous bare
     # ``open(...).write(...)`` never closed the handle explicitly.
     Path(f"{name}.gv").write_text(graph.source)
     # ``\\!`` yields the same backslash-bang the shell received before,
     # without relying on the invalid ``\!`` escape sequence.
     cmd = f'dot -Tpng -Gsize=9,15\\! -Gdpi=300 -o{name}.png {name}.gv'
     os.system(cmd)
Ejemplo n.º 6
0
 def test_set_column_descriptions(self):
     """Apply explicit column descriptions and check the feature groups.

     Also checks that the container's columns equal those of an untouched
     copy of the test frame.
     """
     _, test_df = load("titanic", return_train_test=True)
     untouched = deepcopy(test_df)
     container = DataFrameContainer(
         "Unittest",
         dataset_instance=test_df,
         resource_manager=self.mock_resource_manager)
     container.set_column_descriptions({
         'id': 'PassengerId',
         'target': 'Survived',
         'text': ['Name'],
         'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
         'cat': ['Sex', 'Cabin', 'Embarked'],
         'highC_cat': ['Ticket']
     })
     expected_groups = pd.Series([
         'id', 'num', 'text', 'cat', 'num', 'num', 'num', 'highC_cat',
         'num', 'cat', 'cat'
     ])
     self.assertTrue(np.all(container.feature_groups == expected_groups))
     self.assertTrue(np.all(untouched.columns == container.columns))
Ejemplo n.º 7
0
 def setUp(self) -> None:
     """Prepare the qsar fixtures shared by the tests of this case.

     Sets ``self.L``, ``self.index``, ``self.X`` (features, all in the
     ``"num"`` group), ``self.y`` (the ``target`` column) and ``self.y_reg``
     (the target plus uniform random noise).
     """
     super(TestFeatureSelection, self).setUp()
     self.L = 1024
     features = load("qsar")
     target = features.pop("target")
     features[features == 0] = -1  # replace every zero cell with -1
     features.index = reversed(features.index)
     self.index = deepcopy(features.index)
     feature_container = DataFrameContainer(
         "TrainSet",
         dataset_instance=features,
         resource_manager=self.mock_resource_manager)
     feature_container.set_feature_groups(
         ["num"] * feature_container.shape[1])
     self.X = feature_container
     self.y = NdArrayContainer(
         "TrainSet",
         dataset_instance=target,
         resource_manager=self.mock_resource_manager)
     # Unseeded noise: the regression target differs from run to run.
     noisy_target = target + np.random.rand(*target.shape)
     self.y_reg = NdArrayContainer(
         "TrainSet",
         dataset_instance=noisy_target,
         resource_manager=self.mock_resource_manager)
Ejemplo n.º 8
0
 def setUp(self) -> None:
     """Prepare two qsar fixtures: the full data and its first 500 rows.

     Sets ``self.L``, ``self.index``, ``self.Xs`` (feature containers) and
     ``self.ys`` (matching label containers), full set first.
     """
     super(RunReduce, self).setUp()
     self.L = 1024
     features = load("qsar")
     labels = features.pop("target")
     features[features == 0] = -1  # replace every zero cell with -1
     features.index = reversed(features.index)
     self.index = deepcopy(features.index)
     full_set = DataFrameContainer("TrainSet", dataset_instance=features)
     full_set.set_feature_groups(["num"] * full_set.shape[1])

     # Second fixture: deep copies truncated to the first ``n_rows`` rows.
     subset = deepcopy(full_set)
     subset_labels = deepcopy(labels)
     n_rows = 500
     subset.data = subset.data.iloc[:n_rows, :]
     subset.set_feature_groups(["num"] * subset.shape[1])
     subset_labels = subset_labels.iloc[:n_rows]

     self.Xs = [full_set, subset]
     self.ys = [
         NdArrayContainer("TrainLabel", dataset_instance=labels),
         NdArrayContainer("TrainLabel", dataset_instance=subset_labels),
     ]
Ejemplo n.º 9
0
    def test_instancing1(self):
        """Build ``DataManager`` four ways and check its column metadata.

        The manager is created from in-memory frames (train + test, train
        only) and from the dataset ids recorded by the first manager (train
        only, train + test). ``do_assert`` holds the shared expectations.
        """

        def do_assert(data_manager, remote=False, stacked=True):
            """Assert descriptions, feature groups and columns of a manager.

            ``remote`` selects which ``column_descriptions`` are expected;
            ``stacked`` selects the expected group of the ``Fare`` column.
            """
            inferred = {
                'id': 'PassengerId',
                'target': 'Survived',
                'text': ['Name'],
                'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
                'cat': ['Sex', 'Cabin', 'Embarked'],
                'highC_cat': ['Ticket']
            }
            assert sort_dict(
                data_manager.final_column_descriptions) == sort_dict(inferred)

            if remote:
                expected_descriptions = inferred
            else:
                # A fresh literal on purpose: it must not share state with
                # the dict that was handed to DataManager.
                expected_descriptions = {
                    'id': 'PassengerId',
                    'target': 'Survived',
                    'text': 'Name'
                }
            assert sort_dict(data_manager.column_descriptions) == sort_dict(
                expected_descriptions)

            # Only the eighth entry (column 'Fare') depends on ``stacked``.
            fare_group = 'nan' if stacked else 'num'
            expected_groups = pd.Series([
                'num', 'text', 'cat', 'nan', 'num', 'num', 'highC_cat',
                fare_group, 'highR_nan', 'nan'
            ])
            assert np.all(
                pd.Series(data_manager.feature_groups) == expected_groups)

            expected_columns = Index([
                'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
                'Fare', 'Cabin', 'Embarked'
            ],
                                     dtype='object')
            assert np.all(data_manager.columns == expected_columns)

        column_descriptions = {
            "id": "PassengerId",
            "target": "Survived",
            "text": "Name"
        }

        # 1) in-memory train and test frames
        train_df, test_df = datasets.load("titanic", return_train_test=True)
        from_frames = DataManager(
            X_train=train_df,
            X_test=test_df,
            column_descriptions=column_descriptions,
            resource_manager=self.mock_resource_manager)
        do_assert(from_frames, remote=False, stacked=True)

        # 2) freshly loaded train frame only
        train_df, _ = datasets.load("titanic", return_train_test=True)
        train_only = DataManager(
            X_train=train_df,
            column_descriptions=column_descriptions,
            resource_manager=self.mock_resource_manager)
        do_assert(train_only, remote=False, stacked=False)

        # 3) train set referenced by the id recorded in step 1
        remote_train = DataManager(
            X_train=from_frames.train_set_id,
            column_descriptions=column_descriptions,
            resource_manager=self.mock_resource_manager)
        do_assert(remote_train, remote=True, stacked=False)

        # 4) train and test sets both referenced by id
        remote_both = DataManager(
            X_train=from_frames.train_set_id,
            X_test=from_frames.test_set_id,
            column_descriptions=column_descriptions,
            resource_manager=self.mock_resource_manager)
        do_assert(remote_both, remote=True, stacked=True)
Ejemplo n.º 10
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import KFold

import autoflow
from autoflow import AutoFlowClassifier
from autoflow.datasets import load

train_df = load("qsar")
trained_pipeline = AutoFlowClassifier(
    initial_runs=1,
    run_limit=5,
    n_jobs=1,
    included_classifiers=["lightgbm"],
    debug=True,
    num2purified_workflow={
        "num->compressed": {
            "_name": "compress.f1score",
            "threshold": 0.9,
            "n_jobs": 12,
            # "cache_intermediate":False
        },
        "compressed->purified": ["scale.standardize", "operate.keep_going"],
    }
    # should_store_intermediate_result=True,  # 测试对中间结果存储的正确性
Ejemplo n.º 11
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
import pickle
from pathlib import Path

from sklearn.model_selection import KFold

from autoflow import AutoFlowClassifier
from autoflow import datasets

# Load the titanic dataset as separate train and test frames.
train_df, test_df = datasets.load("titanic", return_train_test=True)
# Classifier restricted to catboost, with a single initial run and a run
# limit of one.
trained_pipeline = AutoFlowClassifier(
    initial_runs=1,
    run_limit=1,
    n_jobs=1,
    included_classifiers=["catboost"],
    debug=True,
    n_jobs_in_algorithm=5
    # should_store_intermediate_result=True,  # test that intermediate results are stored correctly
)
# Column roles: id column, target column and free-text column.
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "text": "Name"
}
# if not os.path.exists("autoflow_classification.bz2"):
trained_pipeline.fit(
    X_train=train_df,
    X_test=test_df,
Ejemplo n.º 12
0
 def test_get_hash_of_dataframe_csv(self):
     """The titanic frame's hash is expected to be equal for ``L=51`` and ``L=100``."""
     train_df = datasets.load("titanic")
     hash_value1 = get_hash_of_dataframe_csv(train_df, L=51)
     hash_value2 = get_hash_of_dataframe_csv(train_df, L=100)
     # assertEqual reports both hashes on failure; assertTrue(a == b) only
     # reported "False is not true".
     self.assertEqual(hash_value1, hash_value2)