def _transform(self, X: DataFrameContainer):
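     # select the configured input feature groups, transform them, and splice
     # the result back into X under the output feature groups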
     if X is None:
         return None
     X_ = X.filter_feature_groups(self.in_feature_groups, True)
     X_data = self.before_trans_X(X_)
     X_trans = self._transform_procedure(X_data)
     return X.replace_feature_groups(self.in_feature_groups, X_trans, self.out_feature_groups)
 def _transform(self, X: DataFrameContainer, y: Optional[NdArrayContainer]):
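     # NOTE: despite the Optional annotation, this variant assumes y is not None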
     y_data = y.data
     X_data, y_data = self._transform_proc(X.data, y_data)
     X = X.copy()
     y = y.copy()
     X_data = pd.DataFrame(X_data, columns=X.columns)
     X.data = X_data
     y.data = y_data
     return X, y
Example #3
 def setUp(self) -> None:
     super(TestBalance, self).setUp()
     X, y = load_iris(return_X_y=True)
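     # collapse iris to a binary problem: merge class 2 into class 1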
     y[y == 2] = 1
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.2,
                                                         random_state=0)
     X_train = DataFrameContainer(
         "TrainSet",
         dataset_instance=X_train,
         resource_manager=self.mock_resource_manager)
     X_test = DataFrameContainer(
         "TestSet",
         dataset_instance=X_test,
         resource_manager=self.mock_resource_manager)
     y_train = NdArrayContainer("TrainLabel",
                                dataset_instance=y_train,
                                resource_manager=self.mock_resource_manager)
     y_test = NdArrayContainer("TestLabel",
                               dataset_instance=y_test,
                               resource_manager=self.mock_resource_manager)
     X_train.set_feature_groups(["num"] * 4)
     X_test.set_feature_groups(["num"] * 4)
     self.X_train = X_train
     self.X_test = X_test
     self.y_train = y_train
     self.y_test = y_test
    def test_classifier(self):
        train_df = datasets.load("titanic")[["Name", "Survived"]]
        y = np.array(train_df.pop("Survived"))

        X_train, X_test, y_train, y_test = train_test_split(train_df,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])
        est_cls_list = [
            TsvdTransformer,
            NmfTransformer,
            LsiTransformer,
            LdaTransformer,
            RpTransformer,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = cls(**get_default_hp_of_cls(cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
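            # wire the workflow by feature group: text -> token -> num, then classify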
            pipeline = ML_Workflow([
                ("tokenizer", tokenizer),
                ("transformer", transformer),
                ("classifier", classifier),
            ],
                                   resource_manager=self.mock_resource_manager)
            start = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test.data, y_pred)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
 def get_cache_key(self, config_id, X_train: DataFrameContainer,
                   y_train: NdArrayContainer):
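     # cache key: experiment_id - config_id - hash(X_train) - hash(y_train)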
     experiment_id = str(self.resource_manager.experiment_id)
     return "-".join(
         [experiment_id, config_id,
          X_train.get_hash(),
          y_train.get_hash()])
 def test_upload_download(self):
     titanic_df = load("titanic")
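     # reverse the index so the round-trip assertions below also cover a non-default index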
     titanic_df.index = reversed(titanic_df.index)
     dc = DataFrameContainer(dataset_instance=titanic_df,
                             resource_manager=self.mock_resource_manager)
     feat_grp = [f"feat_{i}" for i in range(dc.shape[1])]
     dc.set_feature_groups(feat_grp)
     column_descriptions = dc.column_descriptions
     dc.upload()
     dataset_id = dc.dataset_id
     download_dc = DataFrameContainer(
         "Unittest",
         dataset_id=dataset_id,
         resource_manager=self.mock_resource_manager)
     self.assertTrue(
         np.all(download_dc.data.fillna(0) == dc.data.fillna(0)))
     self.assertTrue(
         np.all(download_dc.feature_groups == dc.feature_groups))
     self.assertTrue(np.all(download_dc.columns == dc.columns))
     self.assertTrue(np.all(download_dc.index == dc.index))
     self.assertEqual(download_dc.column_descriptions,
                      dc.column_descriptions)
     self.assertEqual(download_dc.columns_mapper, dc.columns_mapper)
     self.assertEqual(download_dc.dataset_type, dc.dataset_type)
     self.assertEqual(download_dc.dataset_source, dc.dataset_source)
Example #7
 def parse_data_container(self, dataset_source, X,
                          y) -> Tuple[Optional[DataFrameContainer], str]:
     if X is None:
         return X, ""
     # input_dataset_id only takes effect when X is a dataset_id;
     # keep it so the sample test can verify that the dataset is
     # invariant across the upload and download process
     input_dataset_id = ""
     self.final_column_descriptions = None
     # filepath or dataset_id
     if isinstance(X, str):
         # filepath
         if os.path.exists(X):
             self.logger.info(f"'{X}' will be treated as a file path.")
             X = DataFrameContainer(dataset_source,
                                    dataset_path=X,
                                    resource_manager=self.resource_manager,
                                    dataset_metadata=self.dataset_metadata)
         # dataset_id
         else:
             self.logger.info(
                 f"'{X}' will be treated as a dataset ID and downloaded from the database."
             )
             input_dataset_id = X
             X = DataFrameContainer(dataset_source,
                                    dataset_id=X,
                                    resource_manager=self.resource_manager,
                                    dataset_metadata=self.dataset_metadata)
             self.final_column_descriptions = deepcopy(
                 X.column_descriptions)
     elif isinstance(X, DataFrameContainer):
         pass
     else:
         # X is a raw array: synthesize column names, then concat X and y
         if isinstance(X, np.ndarray):
             X = pd.DataFrame(
                 X, columns=[f"column_{i}" for i in range((X.shape[1]))])
         # in this step, column_descriptions will implicitly update "target" field
         X = self.concat_y(X, y)
         X = DataFrameContainer(dataset_source,
                                dataset_instance=X,
                                resource_manager=self.resource_manager,
                                dataset_metadata=self.dataset_metadata)
     return X, input_dataset_id
Example #8
 def process_X(self, X: DataFrameContainer, X_origin):
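     # align the transformed X's columns and feature groups with those of the TrainSet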
     if X is None:
         return None
     assert X.shape[1] == len(self.columns)
     if isinstance(X_origin, np.ndarray):
         X.columns = self.columns
     elif isinstance(X_origin, pd.DataFrame):
         assert set(X.columns) == set(self.columns)
         if not np.all(X.columns == self.columns):
             self.logger.warning(
                 f"{X.dataset_source}'s columns do not match the TrainSet's columns by position!"
             )
             X.data = X.data[self.columns]
     elif isinstance(X_origin, DataFrameContainer):
         pass
     else:
         raise NotImplementedError
     X.set_feature_groups(self.feature_groups)
     return X
Example #9
    def test_ordinal_encode_category(self):
        df2 = pd.DataFrame([
            ['C', '3'],
            ['D', '4'],
            ['D', '4'],
        ],
                           columns=['alpha', 'digits'])
        df2["digits"] = df2["digits"].astype(
            CategoricalDtype(categories=["4", "3"], ordered=True))
        df2["alpha"] = df2["alpha"].astype(
            CategoricalDtype(categories=["D", "C"], ordered=True))
        df2_ = df2.loc[1:, :]
        df2_1 = df2.loc[:1, :]
        df2_c = pd.concat([df2_, df2_1])
        df2_c.index = range(4)
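        # df2_c repeats rows under a fresh index; the expected codes follow the
        # declared category order (D=0, C=1; "4"=0, "3"=1), not alphabetical order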
        encoder = OrdinalEncoder()

        encoder.in_feature_groups = "cat"
        encoder.out_feature_groups = "ordinal"
        dc = DataFrameContainer(dataset_instance=df2_c)
        dc.set_feature_groups(["cat"] * 2)
        encoder.fit(X_train=dc)
        result = encoder.transform(X_train=dc)["X_train"]
        print(result)
        should_be = pd.DataFrame({
            'alpha': [0, 0, 1, 0],
            'digits': [0, 0, 1, 0],
        })
        assert np.all(result.data == should_be)
 def test_set_column_descriptions(self):
     final_column_descriptions = {
         'id': 'PassengerId',
         'target': 'Survived',
         'text': ['Name'],
         'num': ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
         'cat': ['Sex', 'Cabin', 'Embarked'],
         'highC_cat': ['Ticket']
     }
     train_df, test_df = load("titanic", return_train_test=True)
     origin = deepcopy(test_df)
     test_dc = DataFrameContainer(
         "Unittest",
         dataset_instance=test_df,
         resource_manager=self.mock_resource_manager)
     test_dc.set_column_descriptions(final_column_descriptions)
     self.assertTrue(
         np.all(test_dc.feature_groups == pd.Series([
             'id', 'num', 'text', 'cat', 'num', 'num', 'num', 'highC_cat',
             'num', 'cat', 'cat'
         ])))
     self.assertTrue(np.all(origin.columns == test_dc.columns))
 def test_set_dirty_columns(self):
     titanic_df = load("titanic")
     columns = pd.Series(titanic_df.columns)
     columns = ["@"] * len(columns)
     titanic_df.columns = columns
     dc = DataFrameContainer(dataset_instance=titanic_df,
                             resource_manager=self.mock_resource_manager)
     wanted_columns = Index([
         'col', 'col_1', 'col_2', 'col_3', 'col_4', 'col_5', 'col_6',
         'col_7', 'col_8', 'col_9', 'col_10', 'col_11'
     ],
                            dtype='object')
     self.assertTrue(np.all(dc.columns == wanted_columns))
Example #12
 def setUp(self) -> None:
     super(TestFeatureSelection, self).setUp()
     self.L = 1024
     df = load("qsar")
     y = df.pop("target")
     X = df
     X[X == 0] = -1
     X.index = reversed(X.index)
     self.index = deepcopy(X.index)
     X = DataFrameContainer("TrainSet",
                            dataset_instance=X,
                            resource_manager=self.mock_resource_manager)
     X.set_feature_groups(["num"] * X.shape[1])
     self.X = X
     self.y = NdArrayContainer("TrainSet",
                               dataset_instance=y,
                               resource_manager=self.mock_resource_manager)
     y_reg = y + np.random.rand(*y.shape)
     self.y_reg = NdArrayContainer(
         "TrainSet",
         dataset_instance=y_reg,
         resource_manager=self.mock_resource_manager)
Example #13
 def setUp(self) -> None:
     super(RunReduce, self).setUp()
     self.L = 1024
     df = load("qsar")
     y = df.pop("target")
     X = df
     X[X == 0] = -1
     X.index = reversed(X.index)
     self.index = deepcopy(X.index)
     X = DataFrameContainer("TrainSet", dataset_instance=X)
     X.set_feature_groups(["num"] * X.shape[1])
     X2 = deepcopy(X)
     y2 = deepcopy(y)
     N = 500
     X2.data = X2.data.iloc[:N, :]
     X2.set_feature_groups(["num"] * X2.shape[1])
     y2 = y2.iloc[:N]
     self.Xs = [
         X, X2
     ]
     self.ys = [
         NdArrayContainer("TrainLabel", dataset_instance=y),
         NdArrayContainer("TrainLabel", dataset_instance=y2)
     ]
Example #14
def implement_subsample_budget(
    X_train: DataFrameContainer, y_train: NdArrayContainer,
    Xs: List[Optional[DataFrameContainer]], budget, random_state: int
) -> Tuple[DataFrameContainer, NdArrayContainer,
           List[Optional[DataFrameContainer]]]:
    rng = np.random.RandomState(random_state)
    samples = round(X_train.shape[0] * budget)
    features = X_train.shape[1]
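    # stratified sampling preserves the class distribution of y_train at the reduced budget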
    sub_sample_index = get_stratified_sampling_index(y_train.data, budget,
                                                     random_state)
    # sub-sample X_train and y_train
    X_train = X_train.sub_sample(sub_sample_index)
    y_train = y_train.sub_sample(sub_sample_index)
    # if features > samples, also sub-sample the feature dimension to avoid over-fitting
    if features > samples:
        sub_feature_index = rng.permutation(X_train.shape[1])[:samples]
        X_train = X_train.sub_feature(sub_feature_index)
        res_Xs = []
        for X in Xs:
            res_Xs.append(
                X.sub_feature(sub_feature_index) if X is not None else None)
    else:
        res_Xs = Xs
    return X_train, y_train, res_Xs
 def test_set_same_column(self):
     titanic_df = load("titanic")
     columns = pd.Series(titanic_df.columns)
     columns = ["@"] * len(columns)
     columns[1] = "same"
     columns[2] = "same"
     columns[3] = "same"
     columns[5] = "ok"
     columns[6] = "ok"
     titanic_df.columns = columns
     dc = DataFrameContainer(dataset_instance=titanic_df,
                             resource_manager=self.mock_resource_manager)
     wanted = Index([
         'col', 'same_1', 'same_2', 'same_3', 'col_1', 'ok_1', 'ok_2',
         'col_2', 'col_3', 'col_4', 'col_5', 'col_6'
     ],
                    dtype='object')
     self.assertTrue(np.all(dc.columns == wanted))
Example #16
 def test_handle_unknown(self):
     X_train = pd.DataFrame([
         ['A', 'alpha', 9],
         ['A', 'alpha', 1],
         ['B', 'beta', 2],
         ['B', 'beta', 3],
         ['C', 'gamma', 4],
         ['C', 'gamma', 5],
     ],
                            columns=['col1', 'col2', 'col3'])
     X_valid = pd.DataFrame([
         ['D', 'kappa', 6],
         ['D', 'kappa', 6],
         ['E', 'sigma', 7],
         ['E', 'sigma', 7],
         ['F', 'mu', 8],
         ['F', 'mu', 8],
     ],
                            columns=['col1', 'col2', 'col3'])
     X_train = DataFrameContainer(dataset_instance=X_train)
     X_valid = DataFrameContainer(dataset_instance=X_valid)
     X_train.set_feature_groups(['cat'] * 3)
     X_valid.set_feature_groups(['cat'] * 3)
     y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
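     # the validation set contains only categories unseen at fit time;
     # every encoder must handle them without raising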
     for cls in [
             EntityEncoder, OrdinalEncoder, OneHotEncoder, TargetEncoder,
             CatBoostEncoder
     ]:
         hp = get_default_hp_of_cls(cls)
         encoder = cls(**hp)
         encoder.in_feature_groups = "cat"
         encoder.out_feature_groups = "ordinal"
         result = encoder.fit_transform(X_train=X_train,
                                        X_valid=X_valid,
                                        y_train=y_train)
         assert np.all(
             encoder.transform(X_train)['X_train'].data ==
             result['X_train'].data)
         assert np.all(
             encoder.transform(X_valid)['X_train'].data ==
             result['X_valid'].data)
Example #17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier
from autoflow.data_container import DataFrameContainer
from autoflow.data_container import NdArrayContainer

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_test_ = DataFrameContainer(dataset_instance=X_test)
y_test_ = NdArrayContainer(dataset_instance=y_test)
pipe = AutoFlowClassifier()
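# build an ensemble from trials previously persisted under this task_id / budget_id
# (the trials fetcher pulls k=50 trials)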
estimator = pipe.fit_ensemble(
    task_id="2435e32babd7d09b6357e99aa7fa3b89",
    budget_id="afff102b36a43efe4f68e299ff21cadd",
    trials_fetcher_params={"k": 50}
)
# pipe.fit(X_train, y_train, fit_ensemble_params=False)
y_pred = estimator.predict(X_test_)
score = accuracy_score(y_test, y_pred)
print(score)
    def test_classifier(self):
        X, y = datasets.load_digits(return_X_y=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)

        est_cls_list = [
            LogisticRegression,
            GradientBoostingClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
            SGDClassifier,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            est = cls(**get_default_hp_of_cls(cls))
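            # fit with an explicit validation set so the iterative estimator can
            # early-stop and record a learning curve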
            start = time()
            est.fit(X_train, y_train, X_test, y_test)
            score = est.component.score(X_test.data, y_test.data)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertEqual(score, np.max(est.performance_history))
            print("max_iterations:", est.max_iterations)
            print("best_iteration_:", est.best_iteration_)
            print("early_stopping_rounds:", est.early_stopping_rounds)
            print("early_stopping_tol:", est.early_stopping_tol)
            print("iter_inc:", est.iter_inc)
            print("iteration:", est.iteration)
            print("iter_ix:", est.iter_ix)
            print("min_performance:", np.min(est.performance_history))
            print("max_performance:", np.max(est.performance_history))
            print("learning_curve:", est.learning_curve)
            print("estimator:", est)
            print('\n' * 2)
            learning_curve = est.learning_curve
            plt.grid()
            plt.plot(learning_curve[0], learning_curve[1], label="Train Set")
            plt.plot(learning_curve[0], learning_curve[2], label="Valid Set")
            plt.xlabel(est.iterations_name)
            plt.ylabel("Accuracy")
            title = cls.__name__
            plt.title(title)
            plt.axvline(x=est.best_iteration_, ls="--", c="k")
            plt.legend(loc="best")
            plt.savefig(self.plot_dir + f"/{title}.png", dpi=600)
            plt.close()