def test_validate_hdl2(self):
    train_df, test_df = datasets.load("titanic", return_train_test=True)
    trained_pipeline = AutoFlowClassifier(
        initial_runs=1, run_limit=3, n_jobs=1,
        included_highR_nan_imputers=["operate.pop"],
        debug=True,
        n_jobs_in_algorithm=5,
        resource_manager=self.mock_resource_manager)
    column_descriptions = {
        "id": "PassengerId",
        "target": "Survived",
        "text": "Name"
    }
    try:
        trained_pipeline.fit(
            X_train=train_df, X_test=test_df,
            column_descriptions=column_descriptions,
            splitter=KFold(n_splits=3, shuffle=True, random_state=42),
            fit_ensemble_params=True,
            is_not_realy_run=True)
    except Exception as e:
        # NB: "packege" (sic) matches the error string raised by the library.
        self.assertEqual(
            str(e),
            "In step 'highR_nan->nan', user defined packege : 'operate.pop' does not exist!"
        )
    else:
        # fit() must raise on the invalid HDL; fail explicitly if it did not.
        self.fail("didn't detect wrong HDL.")
def test_3(self):
    titanic_df = load("titanic")
    titanic_df.index = reversed(titanic_df.index)
    dc = DataFrameContainer(
        dataset_instance=titanic_df,
        resource_manager=self.http_mock_resource_manager)
    feat_grp = [f"feat_{i}" for i in range(dc.shape[1])]
    dc.set_feature_groups(feat_grp)
    column_descriptions = dc.column_descriptions
    dc.upload()
    dataset_id = dc.dataset_id
    # Download the uploaded dataset by id and verify the round trip.
    download_dc = DataFrameContainer(
        "Unittest", dataset_id=dataset_id,
        resource_manager=self.http_mock_resource_manager)
    self.assertTrue(
        np.all(download_dc.data.fillna(0) == dc.data.fillna(0)))
    self.assertTrue(
        np.all(download_dc.feature_groups == dc.feature_groups))
    self.assertTrue(np.all(download_dc.columns == dc.columns))
    self.assertTrue(np.all(download_dc.index == dc.index))
    self.assertEqual(download_dc.column_descriptions,
                     dc.column_descriptions)
    self.assertEqual(download_dc.columns_mapper, dc.columns_mapper)
    self.assertEqual(download_dc.dataset_type, dc.dataset_type)
    self.assertEqual(download_dc.dataset_source, dc.dataset_source)
    ###################################################################
    # Same round trip for NdArrayContainer.
    in_data = [1, 2, 3, 4, 5]
    dc = NdArrayContainer(dataset_instance=in_data,
                          resource_manager=self.http_mock_resource_manager)
    dc.upload()
    d_dc = NdArrayContainer(
        dataset_id=dc.dataset_id,
        resource_manager=self.http_mock_resource_manager)
    self.assertTrue(np.all(d_dc.data == np.array(in_data)))
def test_set_dirty_columns(self):
    titanic_df = load("titanic")
    # Every column name is "dirty" (no word characters), so the container
    # must synthesize placeholder names.
    titanic_df.columns = ["@"] * len(titanic_df.columns)
    dc = DataFrameContainer(dataset_instance=titanic_df,
                            resource_manager=self.mock_resource_manager)
    wanted_columns = Index([
        'col', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6',
        'col_7', 'col_8', 'col_9', 'col_10', 'col_11'
    ], dtype='object')
    self.assertTrue(np.all(dc.columns == wanted_columns))
def test_set_same_column(self):
    titanic_df = load("titanic")
    columns = ["@"] * len(titanic_df.columns)
    columns[1] = "same"
    columns[2] = "same"
    columns[3] = "same"
    columns[5] = "ok"
    columns[6] = "ok"
    titanic_df.columns = columns
    dc = DataFrameContainer(dataset_instance=titanic_df,
                            resource_manager=self.mock_resource_manager)
    wanted = Index([
        'col', 'same_1', 'same_2', 'same_3', 'col_1', 'ok_1', 'ok_2',
        'col_2', 'col_3', 'col_4', 'col_5', 'col_6'
    ], dtype='object')
    self.assertTrue(np.all(dc.columns == wanted))
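# The two tests above imply a renaming rule: names without word characters
# become sequential "col" placeholders (first bare, then col_1, col_2, ...),
# and afterwards every occurrence of a duplicated name gets a _1, _2, ...
# suffix. A minimal standalone sketch of that rule, for illustration only
# (an assumption, not DataFrameContainer's actual implementation):
import re
from collections import Counter

def dedup_columns_sketch(columns):
    # Pass 1: replace "dirty" names with sequential placeholders.
    renamed, n_dirty = [], 0
    for name in columns:
        if not re.search(r"\w", str(name)):
            renamed.append("col" if n_dirty == 0 else f"col_{n_dirty}")
            n_dirty += 1
        else:
            renamed.append(str(name))
    # Pass 2: number every occurrence of a duplicated name, starting at 1.
    counts, seen, deduped = Counter(renamed), Counter(), []
    for name in renamed:
        if counts[name] > 1:
            seen[name] += 1
            deduped.append(f"{name}_{seen[name]}")
        else:
            deduped.append(name)
    return deduped

# Reproduces the expected Index values above, e.g.:
assert dedup_columns_sketch(["@"] * 3) == ["col", "col_1", "col_2"]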
def test_draw_only_estimator(self):
    name = "test_draw_workspace"
    train_df = load("qsar")
    remain_cols = list(train_df.columns)
    remain_cols.remove("target")
    column_descriptions = {"num": remain_cols, "target": "target"}
    data_manager = DataManager(self.mock_resource_manager, train_df,
                               column_descriptions=column_descriptions)
    hdl_constructor = HDL_Constructor(
        DAG_workflow={"num->target": ["lightgbm", "catboost"]})
    hdl_constructor.run(data_manager)
    hdl_df = hdl_constructor.get_hdl_dataframe()
    Path(f"{name}.html").write_text(hdl_df.to_html())
    hdl_df.to_excel(f"{name}.xlsx")  # pip install openpyxl
    print(hdl_df)
    graph = hdl_constructor.draw_workflow_space()
    with open(f"{name}.gv", "w+") as f:
        f.write(graph.source)
    # Render the DOT source to PNG (requires a Graphviz installation).
    cmd = f'''dot -Tpng -Gsize=9,15\\! -Gdpi=300 -o{name}.png {name}.gv'''
    os.system(cmd)
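# Alternative to shelling out to `dot`: the `graphviz` package can render
# DOT source directly. A sketch, assuming both the Python package and a
# Graphviz binary are installed (this is not what the test above does):
import graphviz

def render_png_sketch(source: str, name: str) -> None:
    # Writes {name}.png and removes the intermediate DOT file.
    graphviz.Source(source).render(name, format="png", cleanup=True)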
def test_set_column_descriptions(self):
    final_column_descriptions = {
        'id': 'PassengerId',
        'target': 'Survived',
        'text': ['Name'],
        'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
        'cat': ['Sex', 'Cabin', 'Embarked'],
        'highC_cat': ['Ticket']
    }
    train_df, test_df = load("titanic", return_train_test=True)
    origin = deepcopy(test_df)
    test_dc = DataFrameContainer(
        "Unittest", dataset_instance=test_df,
        resource_manager=self.mock_resource_manager)
    test_dc.set_column_descriptions(final_column_descriptions)
    self.assertTrue(
        np.all(test_dc.feature_groups == pd.Series([
            'id', 'num', 'text', 'cat', 'num', 'num', 'num',
            'highC_cat', 'num', 'cat', 'cat'
        ])))
    self.assertTrue(np.all(origin.columns == test_dc.columns))
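# The expected Series above is just the per-column expansion of
# final_column_descriptions, in column order. A sketch of that mapping
# (an assumption for illustration, not DataFrameContainer's code):
def expand_descriptions_sketch(columns, descriptions):
    col2group = {}
    for group, cols in descriptions.items():
        for col in (cols if isinstance(cols, list) else [cols]):
            col2group[col] = group
    return [col2group[c] for c in columns]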
def setUp(self) -> None:
    super(TestFeatureSelection, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X,
                           resource_manager=self.mock_resource_manager)
    X.set_feature_groups(["num"] * X.shape[1])
    self.X = X
    self.y = NdArrayContainer("TrainSet", dataset_instance=y,
                              resource_manager=self.mock_resource_manager)
    # Build a regression target by adding uniform noise to the labels.
    y_reg = y + np.random.rand(*y.shape)
    self.y_reg = NdArrayContainer(
        "TrainSet", dataset_instance=y_reg,
        resource_manager=self.mock_resource_manager)
def setUp(self) -> None:
    super(RunReduce, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X)
    X.set_feature_groups(["num"] * X.shape[1])
    # A truncated copy (first N rows) to exercise the reduced path.
    X2 = deepcopy(X)
    y2 = deepcopy(y)
    N = 500
    X2.data = X2.data.iloc[:N, :]
    X2.set_feature_groups(["num"] * X2.shape[1])
    y2 = y2.iloc[:N]
    self.Xs = [X, X2]
    self.ys = [
        NdArrayContainer("TrainLabel", dataset_instance=y),
        NdArrayContainer("TrainLabel", dataset_instance=y2)
    ]
def test_instancing1(self):
    def do_assert(data_manager, remote=False, stacked=True):
        final_column_descriptions = {
            'id': 'PassengerId',
            'target': 'Survived',
            'text': ['Name'],
            'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
            'cat': ['Sex', 'Cabin', 'Embarked'],
            'highC_cat': ['Ticket']
        }
        assert sort_dict(data_manager.final_column_descriptions) == \
               sort_dict(final_column_descriptions)
        if not remote:
            assert sort_dict(data_manager.column_descriptions) == sort_dict({
                'id': 'PassengerId',
                'target': 'Survived',
                'text': 'Name'
            })
        else:
            assert sort_dict(data_manager.column_descriptions) == \
                   sort_dict(final_column_descriptions)
        if stacked:
            assert np.all(
                pd.Series(data_manager.feature_groups) == pd.Series([
                    'num', 'text', 'cat', 'nan', 'num', 'num',
                    'highC_cat', 'nan', 'highR_nan', 'nan'
                ]))
        else:
            assert np.all(
                pd.Series(data_manager.feature_groups) == pd.Series([
                    'num', 'text', 'cat', 'nan', 'num', 'num',
                    'highC_cat', 'num', 'highR_nan', 'nan'
                ]))
        assert np.all(data_manager.columns == Index([
            'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
            'Fare', 'Cabin', 'Embarked'
        ], dtype='object'))

    train_df, test_df = datasets.load("titanic", return_train_test=True)
    column_descriptions = {
        "id": "PassengerId",
        "target": "Survived",
        "text": "Name"
    }
    # Local DataFrames for both train and test: feature groups are
    # inferred from the stacked (train + test) data.
    data_manager1 = DataManager(
        X_train=train_df,
        X_test=test_df,
        column_descriptions=column_descriptions,
        resource_manager=self.mock_resource_manager)
    do_assert(data_manager1, remote=False, stacked=True)
    # -------------------------------------------------------------------
    # Train set only: feature groups are inferred from the train set alone.
    train_df, test_df = datasets.load("titanic", return_train_test=True)
    data_manager2 = DataManager(
        X_train=train_df,
        # X_test=test_df,
        column_descriptions=column_descriptions,
        resource_manager=self.mock_resource_manager)
    do_assert(data_manager2, remote=False, stacked=False)
    # -------------------------------------------------------------------
    # Remote train set, referenced by dataset id.
    data_manager3 = DataManager(
        X_train=data_manager1.train_set_id,
        # X_test=test_df,
        column_descriptions=column_descriptions,
        resource_manager=self.mock_resource_manager)
    do_assert(data_manager3, remote=True, stacked=False)
    # -------------------------------------------------------------------
    # Remote train and test sets, both referenced by dataset id.
    data_manager4 = DataManager(
        X_train=data_manager1.train_set_id,
        X_test=data_manager1.test_set_id,
        column_descriptions=column_descriptions,
        resource_manager=self.mock_resource_manager)
    do_assert(data_manager4, remote=True, stacked=True)
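# `sort_dict` is used above but not defined in this file. A minimal
# equivalent (an assumption for illustration, not the project's code)
# normalizes dicts so the comparisons are order-insensitive:
def sort_dict_sketch(d):
    return {k: sorted(v) if isinstance(v, list) else v
            for k, v in sorted(d.items())}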
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : qichun tang
# @Contact : [email protected]
import pickle
from pathlib import Path

import pandas as pd
from sklearn.model_selection import KFold

import autoflow
from autoflow import AutoFlowClassifier
from autoflow.datasets import load

train_df = load("qsar")
trained_pipeline = AutoFlowClassifier(
    initial_runs=1, run_limit=5, n_jobs=1,
    included_classifiers=["lightgbm"],
    debug=True,
    num2purified_workflow={
        "num->compressed": {
            "_name": "compress.f1score",
            "threshold": 0.9,
            "n_jobs": 12,
            # "cache_intermediate": False
        },
        "compressed->purified": ["scale.standardize", "operate.keep_going"],
    }
    # should_store_intermediate_result=True,  # tests that intermediate results are stored correctly
)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author : qichun tang
# @Contact : [email protected]
import pickle
from pathlib import Path

from sklearn.model_selection import KFold

from autoflow import AutoFlowClassifier
from autoflow import datasets

train_df, test_df = datasets.load("titanic", return_train_test=True)
trained_pipeline = AutoFlowClassifier(
    initial_runs=1, run_limit=1, n_jobs=1,
    included_classifiers=["catboost"],
    debug=True,
    n_jobs_in_algorithm=5
    # should_store_intermediate_result=True,  # tests that intermediate results are stored correctly
)
column_descriptions = {
    "id": "PassengerId",
    "target": "Survived",
    "text": "Name"
}
# if not os.path.exists("autoflow_classification.bz2"):
trained_pipeline.fit(
    X_train=train_df, X_test=test_df,
    column_descriptions=column_descriptions,
)
def test_get_hash_of_dataframe_csv(self):
    # The digest must be independent of the chunk size L.
    train_df = datasets.load("titanic")
    hash_value1 = get_hash_of_dataframe_csv(train_df, L=51)
    hash_value2 = get_hash_of_dataframe_csv(train_df, L=100)
    self.assertEqual(hash_value1, hash_value2)
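# Why the chunk size cannot matter: if the implementation hashes the
# serialized rows incrementally, any split of the same byte stream yields
# the same digest. A minimal sketch of that property (hash_csv_bytes is a
# hypothetical helper, not the library's get_hash_of_dataframe_csv):
import hashlib

def hash_csv_bytes(data: bytes, L: int) -> str:
    m = hashlib.md5()
    for i in range(0, len(data), L):  # feed the stream in L-byte chunks
        m.update(data[i:i + L])
    return m.hexdigest()

payload = b"PassengerId,Survived\n1,0\n2,1\n"
assert hash_csv_bytes(payload, 51) == hash_csv_bytes(payload, 100)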