Example #1
 def setUp(self) -> None:
     super(TestBalance, self).setUp()
     X, y = load_iris(return_X_y=True)
     y[y == 2] = 1
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.2,
                                                         random_state=0)
     X_train = DataFrameContainer(
         "TrainSet",
         dataset_instance=X_train,
         resource_manager=self.mock_resource_manager)
     X_test = DataFrameContainer(
         "TestSet",
         dataset_instance=X_test,
         resource_manager=self.mock_resource_manager)
     y_train = NdArrayContainer("TrainLabel",
                                dataset_instance=y_train,
                                resource_manager=self.mock_resource_manager)
     y_test = NdArrayContainer("TestLabel",
                               dataset_instance=y_test,
                               resource_manager=self.mock_resource_manager)
     X_train.set_feature_groups(["num"] * 4)
     X_test.set_feature_groups(["num"] * 4)
     self.X_train = X_train
     self.X_test = X_test
     self.y_train = y_train
     self.y_test = y_test
Example #2
    def test_classifier(self):
        train_df = datasets.load("titanic")[["Name", "Survived"]]
        y = np.array(train_df.pop("Survived"))

        X_train, X_test, y_train, y_test = train_test_split(train_df,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)
        X_train.set_feature_groups(["text"])
        X_test.set_feature_groups(["text"])
        est_cls_list = [
            TsvdTransformer,
            NmfTransformer,
            LsiTransformer,
            LdaTransformer,
            RpTransformer,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            tokenizer = SimpleTokenlizer(
                **get_default_hp_of_cls(SimpleTokenlizer))
            tokenizer.in_feature_groups = "text"
            tokenizer.out_feature_groups = "token"
            transformer = cls(**get_default_hp_of_cls(cls))
            transformer.in_feature_groups = "token"
            transformer.out_feature_groups = "num"
            classifier = RandomForestClassifier(
                **get_default_hp_of_cls(RandomForestClassifier))
            pipeline = ML_Workflow([
                ("tokenizer", tokenizer),
                ("transformer", transformer),
                ("classifier", classifier),
            ],
                                   resource_manager=self.mock_resource_manager)
            start = time()
            pipeline.fit(X_train, y_train, X_test, y_test)
            y_pred = pipeline.predict(X_test)
            score = accuracy_score(y_test.data, y_pred)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
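
The transformers exercised above wrap standard text-decomposition methods. As a rough standalone analogue in plain scikit-learn (not the autoflow components themselves), a Tsvd-style reduction of token counts into numerical features looks like this:

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

docs = ["the cat sat", "the dog ran", "a cat and a dog"]
# text -> token counts -> dense numerical features
pipe = make_pipeline(CountVectorizer(), TruncatedSVD(n_components=2))
X_num = pipe.fit_transform(docs)
print(X_num.shape)  # (3, 2)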
Example #3
 def get_cache_key(self, config_id, X_train: DataFrameContainer,
                   y_train: NdArrayContainer):
     experiment_id = str(self.resource_manager.experiment_id)
     return "-".join(
         [experiment_id, config_id,
          X_train.get_hash(),
          y_train.get_hash()])
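
The key above is just a dash-joined composite of the experiment id, the configuration id, and the content hashes of the training data. A minimal sketch of the resulting format, with all values hypothetical:

experiment_id, config_id = "17", "9f8e7d"
x_hash, y_hash = "ab12cd", "ef34gh"  # stand-ins for get_hash() results
cache_key = "-".join([experiment_id, config_id, x_hash, y_hash])
print(cache_key)  # 17-9f8e7d-ab12cd-ef34gh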
Example #4
 def before_fit_y(self, y: NdArrayContainer):
     if y is None:
         return None
     y = deepcopy(y.data)
     self.scaler = StandardScaler(copy=True)
     y = y.ravel().reshape([-1, 1])
     self.scaler.fit(y)
     return self.scaler.transform(y)
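
The reshape to [-1, 1] above is needed because StandardScaler only accepts 2-D input. A self-contained sketch of the same round trip in plain scikit-learn, independent of the containers:

import numpy as np
from sklearn.preprocessing import StandardScaler

y = np.array([3.0, 1.0, 2.0])        # 1-D label vector
scaler = StandardScaler(copy=True)
y_2d = y.ravel().reshape([-1, 1])    # StandardScaler requires a 2-D array
y_scaled = scaler.fit_transform(y_2d)
y_restored = scaler.inverse_transform(y_scaled)  # back to the original scale
assert np.allclose(y_restored.ravel(), y)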
Example #5
 def setUp(self) -> None:
     super(TestFeatureSelection, self).setUp()
     self.L = 1024
     df = load("qsar")
     y = df.pop("target")
     X = df
     X[X == 0] = -1
     X.index = reversed(X.index)
     self.index = deepcopy(X.index)
     X = DataFrameContainer("TrainSet",
                            dataset_instance=X,
                            resource_manager=self.mock_resource_manager)
     X.set_feature_groups(["num"] * X.shape[1])
     self.X = X
     self.y = NdArrayContainer("TrainSet",
                               dataset_instance=y,
                               resource_manager=self.mock_resource_manager)
     y_reg = y + np.random.rand(*y.shape)
     self.y_reg = NdArrayContainer(
         "TrainSet",
         dataset_instance=y_reg,
         resource_manager=self.mock_resource_manager)
Example #6
 def setUp(self) -> None:
     super(RunReduce, self).setUp()
     self.L = 1024
     df = load("qsar")
     y = df.pop("target")
     X = df
     X[X == 0] = -1
     X.index = reversed(X.index)
     self.index = deepcopy(X.index)
     X = DataFrameContainer("TrainSet", dataset_instance=X)
     X.set_feature_groups(["num"] * X.shape[1])
     X2 = deepcopy(X)
     y2 = deepcopy(y)
     N = 500
     X2.data = X2.data.iloc[:N, :]
     X2.set_feature_groups(["num"] * X2.shape[1])
     y2 = y2.iloc[:N]
     self.Xs = [
         X, X2
     ]
     self.ys = [
         NdArrayContainer("TrainLabel", dataset_instance=y),
         NdArrayContainer("TrainLabel", dataset_instance=y2)
     ]
Example #7
    def test_under_sample(self):

        est_cls_list = [
            AllKNN,
            ClusterCentroids,
            CondensedNearestNeighbour,
            EditedNearestNeighbours,
            InstanceHardnessThreshold,
            NearMiss,
            NeighbourhoodCleaningRule,
            OneSidedSelection,
            RandomUnderSampler,
            RepeatedEditedNearestNeighbours,
            TomekLinks,
        ]

        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            balancer = cls(**get_default_hp_of_cls(cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            pipeline = ML_Workflow([
                ("balancer", balancer),
                ("classifier", classifier),
            ],
                                   resource_manager=self.mock_resource_manager,
                                   should_store_intermediate_result=True)
            start = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            y_pred = pipeline.predict(self.X_test)
            score = accuracy_score(self.y_test.data, y_pred)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
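
The under-samplers above come from imbalanced-learn; outside the ML_Workflow wrapper they expose the standard fit_resample API. A minimal sketch on toy data:

import numpy as np
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

X = np.arange(20).reshape(10, 2)
y = np.array([0] * 8 + [1] * 2)  # imbalanced 8:2 labels
X_res, y_res = RandomUnderSampler(random_state=0).fit_resample(X, y)
print(Counter(y))      # Counter({0: 8, 1: 2})
print(Counter(y_res))  # Counter({0: 2, 1: 2})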
Example #8
 def test_handle_unknown(self):
     X_train = pd.DataFrame([
         ['A', 'alpha', 9],
         ['A', 'alpha', 1],
         ['B', 'beta', 2],
         ['B', 'beta', 3],
         ['C', 'gamma', 4],
         ['C', 'gamma', 5],
     ],
                            columns=['col1', 'col2', 'col3'])
     X_valid = pd.DataFrame([
         ['D', 'kappa', 6],
         ['D', 'kappa', 6],
         ['E', 'sigma', 7],
         ['E', 'sigma', 7],
         ['F', 'mu', 8],
         ['F', 'mu', 8],
     ],
                            columns=['col1', 'col2', 'col3'])
     X_train = DataFrameContainer(dataset_instance=X_train)
     X_valid = DataFrameContainer(dataset_instance=X_valid)
     X_train.set_feature_groups(['cat'] * 3)
     X_valid.set_feature_groups(['cat'] * 3)
     y_train = NdArrayContainer(dataset_instance=[0, 1, 0, 1, 0, 1])
     for cls in [
             EntityEncoder, OrdinalEncoder, OneHotEncoder, TargetEncoder,
             CatBoostEncoder
     ]:
         hp = get_default_hp_of_cls(cls)
         encoder = cls(**hp)
         encoder.in_feature_groups = "cat"
         encoder.out_feature_groups = "ordinal"
         result = encoder.fit_transform(X_train=X_train,
                                        X_valid=X_valid,
                                        y_train=y_train)
         assert np.all(
             encoder.transform(X_train)['X_train'].data ==
             result['X_train'].data)
         assert np.all(
             encoder.transform(X_valid)['X_train'].data ==
             result['X_valid'].data)
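
The encoder components above mirror the category_encoders library. As a standalone sketch of the handle-unknown behaviour being tested, assuming plain category_encoders rather than the autoflow wrappers:

import pandas as pd
from category_encoders import TargetEncoder

X_train = pd.DataFrame({"col1": ["A", "A", "B", "B"]})
y_train = [0, 1, 0, 1]
X_valid = pd.DataFrame({"col1": ["C", "C"]})  # categories unseen at fit time

enc = TargetEncoder(cols=["col1"])
enc.fit(X_train, y_train)
# With the default handle_unknown="value", unseen categories fall back
# to the global target mean (0.5 here).
print(enc.transform(X_valid))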
Example #9
    def test_over_sample(self):
        est_cls_list = [
            RandomOverSampler,
            # ADASYN,
            BorderlineSMOTE,
            KMeansSMOTE,
            SMOTE,
            SVMSMOTE,
        ]

        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            balancer = cls(**get_default_hp_of_cls(cls))
            classifier = LinearSVC(**get_default_hp_of_cls(LinearSVC))
            pipeline = ML_Workflow([
                ("balancer", balancer),
                ("classifier", classifier),
            ],
                                   resource_manager=self.mock_resource_manager,
                                   should_store_intermediate_result=True)
            start = time()
            pipeline.fit(self.X_train, self.y_train, self.X_test, self.y_test)
            balanced_y_train = NdArrayContainer(
                dataset_id=pipeline.intermediate_result["balancer"]["y_train"],
                resource_manager=self.mock_resource_manager)
            print("y_train:")
            print(Counter(self.y_train.data))
            print("balanced y_train:")
            print(Counter(balanced_y_train.data))

            y_pred = pipeline.predict(self.X_test)
            score = accuracy_score(self.y_test.data, y_pred)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertGreater(score, 0.6)
            print('\n' * 2)
Example #10
def implement_subsample_budget(
    X_train: DataFrameContainer, y_train: NdArrayContainer,
    Xs: List[Optional[DataFrameContainer]], budget, random_state: int
) -> Tuple[DataFrameContainer, NdArrayContainer,
           List[Optional[DataFrameContainer]]]:
    rng = np.random.RandomState(random_state)
    samples = round(X_train.shape[0] * budget)
    features = X_train.shape[1]
    sub_sample_index = get_stratified_sampling_index(y_train.data, budget,
                                                     random_state)
    # sub-sample X_train and y_train
    X_train = X_train.sub_sample(sub_sample_index)
    y_train = y_train.sub_sample(sub_sample_index)
    # if features > samples, select a feature subset to avoid over-fitting
    if features > samples:
        sub_feature_index = rng.permutation(X_train.shape[1])[:samples]
        X_train = X_train.sub_feature(sub_feature_index)
        res_Xs = []
        for X in Xs:
            res_Xs.append(
                X.sub_feature(sub_feature_index) if X is not None else None)
    else:
        res_Xs = Xs
    return X_train, y_train, res_Xs
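
get_stratified_sampling_index is not shown in this example. A plausible stand-in (an assumption, not the library's actual implementation) can be built on scikit-learn's stratified splitting:

import numpy as np
from sklearn.model_selection import train_test_split

def stratified_sampling_index(y, budget, random_state):
    # Hypothetical stand-in: keep a `budget` fraction of row indices
    # while preserving the class proportions of y.
    indices = np.arange(len(y))
    kept, _ = train_test_split(indices,
                               train_size=budget,
                               stratify=y,
                               random_state=random_state)
    return kept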
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

from autoflow.core.classifier import AutoFlowClassifier
from autoflow.data_container import DataFrameContainer
from autoflow.data_container import NdArrayContainer

X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_test_ = DataFrameContainer(dataset_instance=X_test)
y_test_ = NdArrayContainer(dataset_instance=y_test)
pipe = AutoFlowClassifier()
estimator = pipe.fit_ensemble(
    task_id="2435e32babd7d09b6357e99aa7fa3b89",
    budget_id="afff102b36a43efe4f68e299ff21cadd",
    trials_fetcher_params={"k": 50}
)
# pipe.fit(X_train, y_train, fit_ensemble_params=False)
# score = accuracy_score(y_test, y_pred)
y_pred = estimator.predict(X_test_)
score = accuracy_score(y_test, y_pred)
print(score)
Example #12
    def test_classifier(self):
        X, y = datasets.load_digits(return_X_y=True)

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=0)
        X_train = DataFrameContainer(
            "TrainSet",
            dataset_instance=X_train,
            resource_manager=self.mock_resource_manager)
        X_test = DataFrameContainer(
            "TestSet",
            dataset_instance=X_test,
            resource_manager=self.mock_resource_manager)
        y_train = NdArrayContainer("TrainLabel",
                                   dataset_instance=y_train,
                                   resource_manager=self.mock_resource_manager)
        y_test = NdArrayContainer("TestLabel",
                                  dataset_instance=y_test,
                                  resource_manager=self.mock_resource_manager)

        est_cls_list = [
            LogisticRegression,
            GradientBoostingClassifier,
            RandomForestClassifier,
            ExtraTreesClassifier,
            SGDClassifier,
        ]
        for cls in est_cls_list:
            print("=========================")
            print(cls.__name__)
            print("=========================")
            est = cls(**get_default_hp_of_cls(cls))
            start = time()
            est.fit(X_train, y_train, X_test, y_test)
            score = est.component.score(X_test.data, y_test.data)
            end = time()
            print("score:", score)
            print("time:", end - start)
            self.assertTrue(score == np.max(est.performance_history))
            print("max_iterations:", est.max_iterations)
            print("best_iteration_:", est.best_iteration_)
            print("early_stopping_rounds:", est.early_stopping_rounds)
            print("early_stopping_tol:", est.early_stopping_tol)
            print("iter_inc:", est.iter_inc)
            print("iteration:", est.iteration)
            print("iter_ix:", est.iter_ix)
            print("min_performance:", np.min(est.performance_history))
            print("max_performance:", np.max(est.performance_history))
            print("learning_curve:", est.learning_curve)
            print("estimator:", est)
            print('\n' * 2)
            learning_curve = est.learning_curve
            plt.grid()
            plt.plot(learning_curve[0], learning_curve[1], label="Train Set")
            plt.plot(learning_curve[0], learning_curve[2], label="Valid Set")
            plt.xlabel(est.iterations_name)
            plt.ylabel("Accuracy")
            title = cls.__name__
            plt.title(title)
            plt.axvline(x=est.best_iteration_, ls="--", c="k")
            plt.legend(loc="best")
            # quality= applied only to JPEG and was removed from recent Matplotlib
            plt.savefig(self.plot_dir + f"/{title}.png", dpi=600)
            plt.close()
Example #13
    def __init__(self,
                 resource_manager=None,
                 X_train: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                                None, str] = None,
                 y_train: Union[pd.Series, np.ndarray, None] = None,
                 X_test: Union[pd.DataFrame, DataFrameContainer, np.ndarray,
                               None, str] = None,
                 y_test: Union[pd.Series, np.ndarray, None] = None,
                 dataset_metadata: Dict[str, Any] = frozendict(),
                 column_descriptions: Dict[str, Union[List[str],
                                                      str]] = frozendict(),
                 highR_nan_threshold: float = 0.5,
                 highC_cat_threshold: int = 4,
                 consider_ordinal_as_cat=False,
                 upload_type="fs"):
        '''

        Parameters
        ----------
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_train: :class:`numpy.ndarray`
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_test: :class:`numpy.ndarray`
        dataset_metadata: dict
        column_descriptions: dict
            ``column_descriptions`` is a dict whose keys are ``feature_group``
            names and whose values are a single column name or a list of
            column names.

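            For example (these column names are invented):
            ``{"id": "PassengerId", "target": "Survived", "cat": ["Sex", "Embarked"]}``
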
            This is a list of some frequently-used built-in ``feature_group`` values:
                * ``id``       - id of this table.
                * ``ignore``   - columns which contain irrelevant information.
                * ``target``   - the column your model will learn to predict.
                * ``nan``      - Not a Number; a column containing missing values.
                * ``num``      - numerical features, such as [1, 2, 3].
                * ``cat``      - categorical features, such as ["a", "b", "c"].
                * ``num_nan``  - numerical features containing missing values, such as [1, 2, NaN].
                * ``cat_nan``  - categorical features containing missing values, such as ["a", "b", NaN].
                * ``highR_nan``  - high NaN-ratio columns. Explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
                * ``lowR_nan``   - low NaN-ratio columns. Explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
                * ``highC_cat``  - high-cardinality categorical columns. Explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
                * ``lowR_cat``   - low-cardinality categorical columns. Explained in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.

        highR_nan_threshold: float
            high NaN-ratio threshold; you can find examples and practice in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`
        '''
        self.upload_type = upload_type
        from autoflow.resource_manager.base import ResourceManager
        self.logger = get_logger(self)
        if resource_manager is None:
            self.logger.warning(
                "In DataManager __init__, resource_manager is None; creating a default local resource_manager."
            )
            resource_manager = ResourceManager()
        self.resource_manager: ResourceManager = resource_manager
        self.highC_cat_threshold = highC_cat_threshold
        self.consider_ordinal_as_cat = consider_ordinal_as_cat
        dataset_metadata = dict(dataset_metadata)
        self.highR_nan_threshold = highR_nan_threshold
        self.dataset_metadata = dataset_metadata
        self.column_descriptions = dict(column_descriptions)
        # --load data to container---------------------------------
        self.X_test, self.input_test_hash = self.parse_data_container(
            "TestSet", X_test, y_test)
        # the train set is parsed last, so its column_descriptions take precedence
        self.X_train, self.input_train_hash = self.parse_data_container(
            "TrainSet", X_train, y_train)
        # --migrate column descriptions------------------------------
        # if X is a dataset_id, the remote data_container's column_descriptions will be assigned to final_column_descriptions
        if self.final_column_descriptions is not None:
            self.column_descriptions = deepcopy(self.final_column_descriptions)
        # --column descriptions------------------------------
        self.parse_column_descriptions()
        # note: feature_groups and columns are no longer matched one-to-one here, since auxiliary feature groups were removed
        # ---check target-----------------------------------------------------
        assert "target" in self.column_descriptions
        self.target_col_name = self.column_descriptions["target"]
        # todo: handle prediction on the test set
        # --final column descriptions------------------------------
        # neither user-defined column descriptions nor those downloaded from the remote should contain nan entries
        # update `column2essential_feature_groups` to `final_column_descriptions`
        if self.final_column_descriptions is None:
            final_column_descriptions = defaultdict(list)
            final_column_descriptions.update(self.column_descriptions)
            # first, normalize non-unique feature groups into lists
            for feat_grp, cols in final_column_descriptions.items():
                if feat_grp not in UNIQUE_FEATURE_GROUPS:
                    if isinstance(cols, str):
                        final_column_descriptions[feat_grp] = [cols]
            # then perform the update
            for column, essential_feature_group in self.column2feature_groups.items(
            ):
                if column not in final_column_descriptions[
                        essential_feature_group]:
                    final_column_descriptions[essential_feature_group].append(
                        column)
            self.final_column_descriptions = final_column_descriptions
        self.final_column_descriptions = dict(self.final_column_descriptions)
        # ---set column descriptions, upload to dataset-----------------------------------------------------
        if self.X_train is not None:
            self.X_train.set_column_descriptions(
                self.final_column_descriptions)
            self.X_train.upload(self.upload_type)
            self.logger.info(
                f"TrainSet's DataSet ID = {self.X_train.dataset_id}")
        if self.X_test is not None:
            self.X_test.set_column_descriptions(self.final_column_descriptions)
            self.X_test.upload(self.upload_type)
            self.logger.info(
                f"TestSet's DataSet ID = {self.X_test.dataset_id}")
        # ---origin hash-----------------------------------------------------
        self.train_set_id = self.X_train.get_hash(
        ) if self.X_train is not None else ""
        self.test_set_id = self.X_test.get_hash(
        ) if self.X_test is not None else ""
        if self.input_train_hash:
            assert self.input_train_hash == self.train_set_id
        if self.input_test_hash:
            assert self.input_test_hash == self.test_set_id
        # ---pop auxiliary columns-----------------------------------------------------
        y_train, y_test = self.pop_auxiliary_feature_groups()
        # -- verify that X_train and X_test have the same columns
        if self.X_test is not None and self.X_train is not None:
            assert self.X_train.shape[1] == self.X_test.shape[1]
            assert np.all(self.X_train.columns == self.X_test.columns)
        # -- set feature_groups --
        if self.X_train is not None:
            self.X_train.set_feature_groups(self.feature_groups)
        if self.X_test is not None:
            self.X_test.set_feature_groups(self.feature_groups)
        # -- set parameters --
        y_train = to_array(y_train)
        y_test = to_array(y_test)
        # encode label
        assert y_train is not None, ValueError(
            f"{self.target_col_name} does not exist!")
        self.label_encoder = None
        if is_target_need_label_encode(y_train):
            self.label_encoder = LabelEncoder()
            y_train = self.label_encoder.fit_transform(y_train)
            y_test = self.encode_label(y_test)
        if y_train is not None:
            y_train = NdArrayContainer("TrainLabel",
                                       dataset_instance=y_train,
                                       resource_manager=self.resource_manager)
            y_train.upload()
        if y_test is not None:
            y_test = NdArrayContainer("TestLabel",
                                      dataset_instance=y_test,
                                      resource_manager=self.resource_manager)
            y_test.upload()
        self.ml_task: MLTask = get_ml_task_from_y(y_train.data)
        self.y_train = y_train
        self.y_test = y_test
        self.train_label_id = self.y_train.get_hash(
        ) if self.y_train is not None else ""
        self.test_label_id = self.y_test.get_hash(
        ) if self.y_test is not None else ""
        if self.X_train is not None:
            self.columns = self.X_train.columns
        else:
            self.columns = self.X_test.columns
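
For orientation, a hedged usage sketch of the constructor above; the class appears to be autoflow's DataManager, the import path is an assumption, and the column names are invented:

import pandas as pd
from autoflow.data_manager import DataManager  # import path assumed

# Toy table; "label" is declared as the target column.
train_df = pd.DataFrame({
    "row_id": [1, 2, 3, 4],
    "f0": [0.1, 0.2, 0.3, 0.4],
    "label": [0, 1, 0, 1],
})
dm = DataManager(
    X_train=train_df,
    column_descriptions={"id": "row_id", "target": "label"},
    highR_nan_threshold=0.5,
)
print(dm.ml_task, dm.train_set_id)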