Code example #1
0
def concat_pipeline(*args) -> Optional[GenericPipeline]:
    """Merge the steps of every ``Pipeline`` argument into one ``GenericPipeline``.

    Arguments that are not ``Pipeline`` instances are silently ignored.
    Returns ``None`` when no steps were collected.
    """
    merged_steps = [
        step
        for candidate in args
        if isinstance(candidate, Pipeline)
        for step in candidate.steps
    ]
    return GenericPipeline(merged_steps) if merged_steps else None
Code example #2
0
 def create_preprocessor(self, dhp: Dict) -> Optional[GenericPipeline]:
     """Assemble the PHASE1 preprocessing pipeline described by ``dhp``.

     Each key of ``dhp[PHASE1]`` encodes a feature-group routing (e.g.
     ``"cat->num"``) that ``parse_key`` decodes; entries whose value is
     ``None`` contribute no component.  Returns ``None`` when nothing is
     configured.
     """
     phase1_config: dict = dhp[PHASE1]
     steps = []
     for name, sub_dict in phase1_config.items():
         # parse_key decodes the routing encoded in the key itself
         in_fg, out_fg, edge_info = self.parse_key(name)
         if sub_dict is None:
             continue
         component = self.create_component(
             sub_dict, PHASE1, name, in_fg, out_fg, edge_info)
         steps.extend(component)
     return GenericPipeline(steps) if steps else None
Code example #3
0
 def evaluate(self, model: GenericPipeline, X, y, X_test, y_test):
     """Cross-validate ``model`` over ``self.splitter`` folds and collect results.

     For each train/valid split, ``model.procedure`` is run; per-fold losses,
     scores and predictions are accumulated into the returned ``info`` dict.
     A failing fold records FAILED status and aborts the remaining folds.
     All stderr output produced during evaluation is captured and returned
     under ``info["warning_info"]``.
     """
     assert self.resource_manager is not None
     warning_info = StringIO()
     # Capture any warnings printed to stderr while fitting/predicting.
     with redirect_stderr(warning_info):
         # the splitter must exist
         losses = []
         models = []
         y_true_indexes = []
         y_preds = []
         y_test_preds = []
         all_scores = []
         status = "SUCCESS"
         failed_info = ""
         for train_index, valid_index in self.splitter.split(X, y):
             X: GenericDataFrame
             X_train, X_valid = X.split([train_index, valid_index])
             y_train, y_valid = y[train_index], y[valid_index]
             # Only collect intermediate results when explicitly requested.
             if self.should_store_intermediate_result:
                 intermediate_result = []
             else:
                 intermediate_result = None
             try:
                 procedure_result = model.procedure(self.ml_task, X_train,
                                                    y_train, X_valid,
                                                    y_valid, X_test, y_test,
                                                    self.resource_manager,
                                                    intermediate_result)
             except Exception as e:
                 # Record the traceback and stop evaluating further folds.
                 failed_info = get_trance_back_msg()
                 status = "FAILED"
                 if self.debug:
                     self.logger.error("re-raise exception")
                     raise sys.exc_info()[1]
                 break
             models.append(model)
             y_true_indexes.append(valid_index)
             y_pred = procedure_result["pred_valid"]
             y_test_pred = procedure_result["pred_test"]
             y_preds.append(y_pred)
             y_test_preds.append(y_test_pred)
             loss, all_score = self.loss(y_valid, y_pred)
             losses.append(float(loss))
             all_scores.append(all_score)
         if len(losses) > 0:
             final_loss = float(np.array(losses).mean())
         else:
             # Sentinel loss when no fold completed successfully.
             final_loss = 65535
         if len(all_scores) > 0 and all_scores[0]:
             # Average each named metric across the completed folds.
             all_score = defaultdict(list)
             for cur_all_score in all_scores:
                 if isinstance(cur_all_score, dict):
                     for key, value in cur_all_score.items():
                         all_score[key].append(value)
                 else:
                     self.logger.warning(
                         f"TypeError: cur_all_score is not dict.\ncur_all_score = {cur_all_score}"
                     )
             for key in all_score.keys():
                 all_score[key] = float(np.mean(all_score[key]))
         else:
             all_score = {}
             all_scores = []
         # NOTE(review): intermediate_result (and the y_* lists' contents) come
         # from the last executed fold; intermediate_result is unbound if the
         # splitter yields no folds at all — confirm splitter always yields >=1.
         info = {
             "loss": final_loss,
             "losses": losses,
             "all_score": all_score,
             "all_scores": all_scores,
             "models": models,
             "y_true_indexes": y_true_indexes,
             "y_preds": y_preds,
             "intermediate_result": intermediate_result,
             "status": status,
             "failed_info": failed_info
         }
         # todo
         if y_test is not None:
             # Combine the fold models' predictions to predict the test set.
             if self.ml_task.mainTask == "classification":
                 y_test_pred = vote_predicts(y_test_preds)
             else:
                 y_test_pred = mean_predicts(y_test_preds)
             test_loss, test_all_score = self.loss(y_test, y_test_pred)
             info.update({
                 "test_loss": test_loss,
                 "test_all_score": test_all_score,
                 "y_test_true": y_test,
                 "y_test_pred": y_test_pred
             })
     info["warning_info"] = warning_info.getvalue()
     return info
Code example #4
0
 def create_estimator(self, dhp: Dict) -> GenericPipeline:
     """Construct an estimator pipeline from the PHASE2 hyper-parameters."""
     estimator_steps = self.create_component(
         dhp[PHASE2], PHASE2, self.ml_task.role)
     return GenericPipeline(estimator_steps)
Code example #5
0
File: test_pipeline.py  Project: chengdake/autoflow
    def test_pipeline(self):
        """End-to-end check of GenericPipeline on the Titanic example data.

        Exercises three configurations: a full preprocess+estimator pipeline,
        a preprocess-only pipeline (transform vs fit_transform equivalence),
        and an estimator-only pipeline fed with the preprocessed output.
        Requires ../examples/classification/train_classification.csv.
        """
        self.logger = get_logger(self)
        # Load the Titanic training data; keep two categorical columns
        # (with NaNs) and one numeric column (with NaNs).
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        feature_groups = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=10)
        df_train = GenericDataFrame(df_train, feature_groups=feature_groups)
        df_test = GenericDataFrame(df_test, feature_groups=feature_groups)
        # Carve one train/valid fold out of the training portion.
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        # Imputers: categorical NaNs -> "<NULL>" token, numeric NaNs -> median.
        fill_cat = FillCat()
        fill_cat.in_feature_groups = "cat_nan"
        fill_cat.out_feature_groups = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        fill_num = FillNum()
        fill_num.in_feature_groups = "num_nan"
        fill_num.out_feature_groups = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        # One-hot encode the imputed categoricals into numeric features.
        ohe = OneHotEncoder()
        ohe.in_feature_groups = "cat"
        ohe.out_feature_groups = "num"

        # Logistic-loss SGD classifier consuming all numeric features.
        sgd = SGD()
        sgd.in_feature_groups = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        # Configuration 1: full preprocess + estimator pipeline.
        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
            ("sgd", sgd),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        pred_train = pipeline.predict(df_train)
        pred_test = pipeline.predict(df_test)
        pred_valid = pipeline.predict(df_valid)
        score_valid = pipeline.predict_proba(df_valid)
        self.logger.info(accuracy_score(y_train, pred_train))
        self.logger.info(accuracy_score(y_valid, pred_valid))
        self.logger.info(accuracy_score(y_test, pred_test))
        # procedure() returns probability predictions; threshold at 0.5.
        result = pipeline.procedure(constants.binary_classification_task,
                                    df_train, y_train, df_valid, y_valid,
                                    df_test, y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

        # Configuration 2: preprocess-only pipeline — transform after fit
        # must match fit_transform output exactly.
        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        ret1 = pipeline.transform(df_train, df_valid, df_test)
        ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid,
                                      df_test, y_test)
        for key in ["X_train", "X_valid", "X_test"]:
            assert np.all(ret1[key] == ret2[key])

        # Configuration 3: estimator-only pipeline on preprocessed data.
        pipeline = GenericPipeline([
            ("sgd", sgd),
        ])

        result = pipeline.procedure(constants.binary_classification_task,
                                    ret1["X_train"], y_train, ret1["X_valid"],
                                    y_valid, ret1["X_test"], y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
Code example #6
0
from autoflow.pipeline.dataframe import GenericDataFrame
from autoflow.pipeline.pipeline import GenericPipeline

# Load the example data and keep two categorical columns and one numeric one.
df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Ticket", "Pclass"]]

# Demo 1: split categorical columns into high-/low-cardinality groups.
frame = GenericDataFrame(df, feature_groups=["cat", "cat", "num"])

splitter = SplitCat()
splitter.in_feature_groups = "cat"
splitter.update_hyperparams({
    "highR": "highR_cat",
    "lowR": "lowR_cat",
    "threshold": 0.5
})
logging.info(splitter.fit_transform(frame))

# Demo 2: drop every column, then run the category splitter in a pipeline.
frame = GenericDataFrame(df, feature_groups=["cat", "cat", "num"])
dropper = DropAll()
dropper.in_feature_groups = ["cat", "num"]
dropper.out_feature_groups = "drop"

splitter = SplitCat()
splitter.in_feature_groups = "cat"

pipeline = GenericPipeline([("drop_all", dropper), ("split_cat", splitter)])
logging.info(pipeline.fit_transform(frame))