def _transform(self, X: DataFrameContainer):
    if X is None:
        return None
    X_ = X.filter_feature_groups(self.in_feature_groups, True)
    X_data = self.before_trans_X(X_)
    X_trans = self._transform_procedure(X_data)
    return X.replace_feature_groups(self.in_feature_groups, X_trans,
                                    self.out_feature_groups)
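# The filter/replace pattern above is the core of the feature-group
# workflow: select the columns tagged `in_feature_groups`, transform them,
# and splice the result back under `out_feature_groups`. Below is a rough,
# pandas-only sketch of that idea; `transform_group` and its signature are
# made up for illustration and are not autoflow's API.
import pandas as pd


def transform_group(df, groups, in_group, out_group, func):
    # columns tagged `in_group` are transformed by `func` and re-tagged
    # `out_group`; all other columns pass through unchanged
    in_mask = pd.Series(groups) == in_group
    passed = df.loc[:, (~in_mask).values]
    transformed = func(df.loc[:, in_mask.values])
    out = pd.concat([passed, transformed], axis=1)
    new_groups = ([g for g, m in zip(groups, in_mask) if not m]
                  + [out_group] * transformed.shape[1])
    return out, new_groups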
def _transform(self, X: DataFrameContainer, y: Optional[NdArrayContainer]):
    y_data = y.data
    X_data, y_data = self._transform_proc(X.data, y_data)
    # copy the containers so the originals are left untouched
    X = X.copy()
    y = y.copy()
    # rebuild the DataFrame so the original column names are preserved
    X_data = pd.DataFrame(X_data, columns=X.columns)
    X.data = X_data
    y.data = y_data
    return X, y
def setUp(self) -> None:
    super(TestBalance, self).setUp()
    X, y = load_iris(return_X_y=True)
    # collapse class 2 into class 1 to obtain a binary problem
    y[y == 2] = 1
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train,
                               resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer("TestLabel", dataset_instance=y_test,
                              resource_manager=self.mock_resource_manager)
    X_train.set_feature_groups(["num"] * 4)
    X_test.set_feature_groups(["num"] * 4)
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
def test_classifier(self): train_df = datasets.load("titanic")[["Name", "Survived"]] y = np.array(train_df.pop("Survived")) X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, random_state=0) X_train = DataFrameContainer( "TrainSet", dataset_instance=X_train, resource_manager=self.mock_resource_manager) X_test = DataFrameContainer( "TestSet", dataset_instance=X_test, resource_manager=self.mock_resource_manager) y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train, resource_manager=self.mock_resource_manager) y_test = NdArrayContainer("TestLabel", dataset_instance=y_test, resource_manager=self.mock_resource_manager) X_train.set_feature_groups(["text"]) X_test.set_feature_groups(["text"]) est_cls_list = [ TsvdTransformer, NmfTransformer, LsiTransformer, LdaTransformer, RpTransformer, ] for cls in est_cls_list: print("=========================") print(cls.__name__) print("=========================") tokenizer = SimpleTokenlizer( **get_default_hp_of_cls(SimpleTokenlizer)) tokenizer.in_feature_groups = "text" tokenizer.out_feature_groups = "token" transformer = cls(**get_default_hp_of_cls(cls)) transformer.in_feature_groups = "token" transformer.out_feature_groups = "num" classifier = RandomForestClassifier( **get_default_hp_of_cls(RandomForestClassifier)) pipeline = ML_Workflow([ ("tokenizer", tokenizer), ("transformer", transformer), ("classifier", classifier), ], resource_manager=self.mock_resource_manager) start = time() pipeline.fit(X_train, y_train, X_test, y_test) y_pred = pipeline.predict(X_test) score = accuracy_score(y_test.data, y_pred) end = time() print("score:", score) print("time:", end - start) self.assertGreater(score, 0.6) print('\n' * 2)
def get_cache_key(self, config_id, X_train: DataFrameContainer,
                  y_train: NdArrayContainer):
    experiment_id = str(self.resource_manager.experiment_id)
    return "-".join(
        [experiment_id, config_id, X_train.get_hash(), y_train.get_hash()])
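# The cache key is the experiment id, the config id, and content digests of
# the training data, joined by "-", so any change to the data invalidates
# the entry. A self-contained sketch of the same layout, assuming
# get_hash() returns a content digest (hashlib.md5 over raw bytes is a
# stand-in here, not autoflow's implementation):
import hashlib


def toy_cache_key(experiment_id, config_id, X_bytes: bytes,
                  y_bytes: bytes) -> str:
    # hypothetical illustration of the key layout used above
    x_hash = hashlib.md5(X_bytes).hexdigest()
    y_hash = hashlib.md5(y_bytes).hexdigest()
    return "-".join([str(experiment_id), config_id, x_hash, y_hash])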
def test_upload_download(self): titanic_df = load("titanic") titanic_df.index = reversed(titanic_df.index) dc = DataFrameContainer(dataset_instance=titanic_df, resource_manager=self.mock_resource_manager) feat_grp = [f"feat_{i}" for i in range(dc.shape[1])] dc.set_feature_groups(feat_grp) column_descriptions = dc.column_descriptions dc.upload() dataset_id = dc.dataset_id download_dc = DataFrameContainer( "Unittest", dataset_id=dataset_id, resource_manager=self.mock_resource_manager) self.assertTrue( np.all(download_dc.data.fillna(0) == dc.data.fillna(0))) self.assertTrue( np.all(download_dc.feature_groups == dc.feature_groups)) self.assertTrue(np.all(download_dc.columns == dc.columns)) self.assertTrue(np.all(download_dc.index == dc.index)) self.assertEqual(download_dc.column_descriptions, dc.column_descriptions) self.assertEqual(download_dc.columns_mapper, dc.columns_mapper) self.assertEqual(download_dc.dataset_type, dc.dataset_type) self.assertEqual(download_dc.dataset_source, dc.dataset_source)
def parse_data_container(self, dataset_source, X,
                         y) -> Tuple[Optional[DataFrameContainer], str]:
    if X is None:
        return X, ""
    # input_dataset_id is only set if X is a dataset_id;
    # keep it for the sample test and to make sure the dataset is
    # invariant across the upload/download process
    input_dataset_id = ""
    self.final_column_descriptions = None
    # X is a file path or a dataset_id
    if isinstance(X, str):
        # file path
        if os.path.exists(X):
            self.logger.info(f"'{X}' will be treated as a file path.")
            X = DataFrameContainer(dataset_source, dataset_path=X,
                                   resource_manager=self.resource_manager,
                                   dataset_metadata=self.dataset_metadata)
        # dataset_id
        else:
            self.logger.info(
                f"'{X}' will be treated as a dataset ID and downloaded "
                f"from the database.")
            input_dataset_id = X
            X = DataFrameContainer(dataset_source, dataset_id=X,
                                   resource_manager=self.resource_manager,
                                   dataset_metadata=self.dataset_metadata)
            self.final_column_descriptions = deepcopy(
                X.column_descriptions)
    elif isinstance(X, DataFrameContainer):
        pass
    else:
        # create column names and concatenate X and y
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(
                X, columns=[f"column_{i}" for i in range(X.shape[1])])
        # this step implicitly updates the "target" field of
        # column_descriptions
        X = self.concat_y(X, y)
        X = DataFrameContainer(dataset_source, dataset_instance=X,
                               resource_manager=self.resource_manager,
                               dataset_metadata=self.dataset_metadata)
    return X, input_dataset_id
def process_X(self, X: DataFrameContainer, X_origin):
    if X is None:
        return None
    assert X.shape[1] == len(self.columns)
    if isinstance(X_origin, np.ndarray):
        X.columns = self.columns
    elif isinstance(X_origin, pd.DataFrame):
        assert set(X.columns) == set(self.columns)
        if not np.all(X.columns == self.columns):
            self.logger.warning(
                f"{X.dataset_source}'s columns do not match the TrainSet's "
                f"columns by position!")
            # reorder the columns to match the TrainSet
            X.data = X.data[self.columns]
    elif isinstance(X_origin, DataFrameContainer):
        pass
    else:
        raise NotImplementedError
    X.set_feature_groups(self.feature_groups)
    return X
def test_ordinal_encode_category(self):
    df2 = pd.DataFrame([
        ['C', '3'],
        ['D', '4'],
        ['D', '4'],
    ], columns=['alpha', 'digits'])
    df2["digits"] = df2["digits"].astype(
        CategoricalDtype(categories=["4", "3"], ordered=True))
    df2["alpha"] = df2["alpha"].astype(
        CategoricalDtype(categories=["D", "C"], ordered=True))
    df2_ = df2.loc[1:, :]
    df2_1 = df2.loc[:1, :]
    df2_c = pd.concat([df2_, df2_1])
    df2_c.index = range(4)
    encoder = OrdinalEncoder()
    encoder.in_feature_groups = "cat"
    encoder.out_feature_groups = "ordinal"
    dc = DataFrameContainer(dataset_instance=df2_c)
    dc.set_feature_groups(["cat"] * 2)
    encoder.fit(X_train=dc)
    result = encoder.transform(X_train=dc)["X_train"]
    print(result)
    should_be = pd.DataFrame({
        'alpha': {0: 0, 1: 0, 2: 1, 3: 0},
        'digits': {0: 0, 1: 0, 2: 1, 3: 0},
    })
    assert np.all(result.data == should_be)
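# The expected codes above follow pandas' ordered-categorical semantics:
# with CategoricalDtype(categories=["D", "C"], ordered=True), "D" maps to
# code 0 and "C" to code 1, so the concatenated rows D, D, C, D encode as
# 0, 0, 1, 0. A minimal pandas-only check of that assumption:
import pandas as pd
from pandas.api.types import CategoricalDtype

s = pd.Series(["D", "D", "C", "D"]).astype(
    CategoricalDtype(categories=["D", "C"], ordered=True))
print(list(s.cat.codes))  # [0, 0, 1, 0]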
def test_set_column_descriptions(self):
    final_column_descriptions = {
        'id': 'PassengerId',
        'target': 'Survived',
        'text': ['Name'],
        'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
        'cat': ['Sex', 'Cabin', 'Embarked'],
        'highC_cat': ['Ticket']
    }
    train_df, test_df = load("titanic", return_train_test=True)
    origin = deepcopy(test_df)
    test_dc = DataFrameContainer(
        "Unittest", dataset_instance=test_df,
        resource_manager=self.mock_resource_manager)
    test_dc.set_column_descriptions(final_column_descriptions)
    self.assertTrue(
        np.all(test_dc.feature_groups == pd.Series([
            'id', 'num', 'text', 'cat', 'num', 'num', 'num', 'highC_cat',
            'num', 'cat', 'cat'
        ])))
    self.assertTrue(np.all(origin.columns == test_dc.columns))
def test_set_dirty_columns(self): titanic_df = load("titanic") columns = pd.Series(titanic_df.columns) columns = ["@"] * len(columns) titanic_df.columns = columns dc = DataFrameContainer(dataset_instance=titanic_df, resource_manager=self.mock_resource_manager) wanted_columns = Index([ 'col', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6', 'col_7', 'col_8', 'col_9', 'col_10', 'col_11' ], dtype='object') self.assertTrue(np.all(dc.columns == wanted_columns))
def setUp(self) -> None:
    super(TestFeatureSelection, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X,
                           resource_manager=self.mock_resource_manager)
    X.set_feature_groups(["num"] * X.shape[1])
    self.X = X
    self.y = NdArrayContainer("TrainSet", dataset_instance=y,
                              resource_manager=self.mock_resource_manager)
    y_reg = y + np.random.rand(*y.shape)
    self.y_reg = NdArrayContainer(
        "TrainSet", dataset_instance=y_reg,
        resource_manager=self.mock_resource_manager)
def setUp(self) -> None:
    super(RunReduce, self).setUp()
    self.L = 1024
    df = load("qsar")
    y = df.pop("target")
    X = df
    X[X == 0] = -1
    X.index = reversed(X.index)
    self.index = deepcopy(X.index)
    X = DataFrameContainer("TrainSet", dataset_instance=X)
    X.set_feature_groups(["num"] * X.shape[1])
    X2 = deepcopy(X)
    y2 = deepcopy(y)
    N = 500
    X2.data = X2.data.iloc[:N, :]
    X2.set_feature_groups(["num"] * X2.shape[1])
    y2 = y2.iloc[:N]
    self.Xs = [X, X2]
    self.ys = [
        NdArrayContainer("TrainLabel", dataset_instance=y),
        NdArrayContainer("TrainLabel", dataset_instance=y2)
    ]
def implement_subsample_budget(
        X_train: DataFrameContainer, y_train: NdArrayContainer,
        Xs: List[Optional[DataFrameContainer]], budget, random_state: int
) -> Tuple[DataFrameContainer, NdArrayContainer,
           List[Optional[DataFrameContainer]]]:
    rng = np.random.RandomState(random_state)
    samples = round(X_train.shape[0] * budget)
    features = X_train.shape[1]
    sub_sample_index = get_stratified_sampling_index(y_train.data, budget,
                                                     random_state)
    # sub-sample X_train and y_train
    X_train = X_train.sub_sample(sub_sample_index)
    y_train = y_train.sub_sample(sub_sample_index)
    # if there are more features than samples, sub-sample the features as
    # well to avoid over-fitting
    if features > samples:
        sub_feature_index = rng.permutation(X_train.shape[1])[:samples]
        X_train = X_train.sub_feature(sub_feature_index)
        res_Xs = []
        for X in Xs:
            res_Xs.append(
                X.sub_feature(sub_feature_index) if X is not None else None)
    else:
        res_Xs = Xs
    return X_train, y_train, res_Xs
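# get_stratified_sampling_index is used above but not shown. A plausible
# stand-in (an assumption, not the actual implementation) draws `budget`
# of the rows from each class, so the subsample keeps the label
# distribution of the full training set:
import numpy as np


def stratified_sampling_index_sketch(y, fraction, random_state):
    # hypothetical sketch: per-class random subsampling without replacement
    rng = np.random.RandomState(random_state)
    parts = []
    for label in np.unique(y):
        label_idx = np.where(y == label)[0]
        n_keep = max(1, round(len(label_idx) * fraction))
        parts.append(rng.permutation(label_idx)[:n_keep])
    return np.sort(np.concatenate(parts))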
def test_set_same_column(self): titanic_df = load("titanic") columns = pd.Series(titanic_df.columns) columns = ["@"] * len(columns) columns[1] = "same" columns[2] = "same" columns[3] = "same" columns[5] = "ok" columns[6] = "ok" titanic_df.columns = columns dc = DataFrameContainer(dataset_instance=titanic_df, resource_manager=self.mock_resource_manager) wanted = Index([ 'col', 'same_1', 'same_2', 'same_3', 'col_1', 'ok_1', 'ok_2', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6' ], dtype='object') self.assertTrue(np.all(dc.columns == wanted))
def test_handle_unknown(self):
    X_train = pd.DataFrame([
        ['A', 'alpha', 9],
        ['A', 'alpha', 1],
        ['B', 'beta', 2],
        ['B', 'beta', 3],
        ['C', 'gamma', 4],
        ['C', 'gamma', 5],
    ], columns=['col1', 'col2', 'col3'])
    X_valid = pd.DataFrame([
        ['D', 'kappa', 6],
        ['D', 'kappa', 6],
        ['E', 'sigma', 7],
        ['E', 'sigma', 7],
        ['F', 'mu', 8],
        ['F', 'mu', 8],
    ], columns=['col1', 'col2', 'col3'])
    X_train = DataFrameContainer(dataset_instance=X_train)
    X_valid = DataFrameContainer(dataset_instance=X_valid)
    X_train.set_feature_groups(['cat'] * 3)
    X_valid.set_feature_groups(['cat'] * 3)
    y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
    for cls in [
        EntityEncoder, OrdinalEncoder, OneHotEncoder, TargetEncoder,
        CatBoostEncoder
    ]:
        hp = get_default_hp_of_cls(cls)
        encoder = cls(**hp)
        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"
        result = encoder.fit_transform(X_train=X_train, X_valid=X_valid,
                                       y_train=y_train)
        assert np.all(
            encoder.transform(X_train)['X_train'].data ==
            result['X_train'].data)
        assert np.all(
            encoder.transform(X_valid)['X_train'].data ==
            result['X_valid'].data)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact : [email protected]
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier
from autoflow.data_container import DataFrameContainer
from autoflow.data_container import NdArrayContainer

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_test_ = DataFrameContainer(dataset_instance=X_test)
y_test_ = NdArrayContainer(dataset_instance=y_test)
pipe = AutoFlowClassifier()
estimator = pipe.fit_ensemble(
    task_id="2435e32babd7d09b6357e99aa7fa3b89",
    budget_id="afff102b36a43efe4f68e299ff21cadd",
    trials_fetcher_params={"k": 50}
)
# pipe.fit(X_train, y_train, fit_ensemble_params=False)
y_pred = estimator.predict(X_test_)
score = accuracy_score(y_test, y_pred)
print(score)
def test_classifier(self):
    X, y = datasets.load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0)
    X_train = DataFrameContainer(
        "TrainSet", dataset_instance=X_train,
        resource_manager=self.mock_resource_manager)
    X_test = DataFrameContainer(
        "TestSet", dataset_instance=X_test,
        resource_manager=self.mock_resource_manager)
    y_train = NdArrayContainer("TrainLabel", dataset_instance=y_train,
                               resource_manager=self.mock_resource_manager)
    y_test = NdArrayContainer("TestLabel", dataset_instance=y_test,
                              resource_manager=self.mock_resource_manager)
    est_cls_list = [
        LogisticRegression,
        GradientBoostingClassifier,
        RandomForestClassifier,
        ExtraTreesClassifier,
        SGDClassifier,
    ]
    for cls in est_cls_list:
        print("=========================")
        print(cls.__name__)
        print("=========================")
        est = cls(**get_default_hp_of_cls(cls))
        start = time()
        est.fit(X_train, y_train, X_test, y_test)
        score = est.component.score(X_test.data, y_test.data)
        end = time()
        print("score:", score)
        print("time:", end - start)
        self.assertTrue(score == np.max(est.performance_history))
        print("max_iterations:", est.max_iterations)
        print("best_iteration_:", est.best_iteration_)
        print("early_stopping_rounds:", est.early_stopping_rounds)
        print("early_stopping_tol:", est.early_stopping_tol)
        print("iter_inc:", est.iter_inc)
        print("iteration:", est.iteration)
        print("iter_ix:", est.iter_ix)
        print("min_performance:", np.min(est.performance_history))
        print("max_performance:", np.max(est.performance_history))
        print("learning_curve:", est.learning_curve)
        print("estimator:", est)
        print('\n' * 2)
        learning_curve = est.learning_curve
        plt.grid()
        plt.plot(learning_curve[0], learning_curve[1], label="Train Set")
        plt.plot(learning_curve[0], learning_curve[2], label="Valid Set")
        plt.xlabel(est.iterations_name)
        plt.ylabel("Accuracy")
        title = cls.__name__
        plt.title(title)
        plt.axvline(x=est.best_iteration_, ls="--", c="k")
        plt.legend(loc="best")
        plt.savefig(self.plot_dir + f"/{title}.png", quality=100, dpi=600)
        plt.close()