def concat_pipeline(*args) -> Optional[GenericPipeline]:
    # Flatten the steps of every Pipeline argument into a single
    # GenericPipeline; non-Pipeline arguments are ignored. Returns None
    # if nothing was collected.
    pipeline_list = []
    for node in args:
        if isinstance(node, Pipeline):
            pipeline_list.extend(node.steps)
    if pipeline_list:
        return GenericPipeline(pipeline_list)
    else:
        return None
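# Usage sketch for concat_pipeline (the pipelines "preprocessor" and
# "estimator" below are hypothetical, not defined in this file):
#
#   combined = concat_pipeline(preprocessor, estimator)
#   # -> GenericPipeline whose steps are preprocessor.steps + estimator.steps
#   assert concat_pipeline() is None          # nothing to concatenate
#   assert concat_pipeline(None, 42) is None  # non-Pipeline arguments are skipped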
def create_preprocessor(self, dhp: Dict) -> Optional[GenericPipeline]:
    # Build the preprocessing pipeline from the PHASE1 part of the
    # hyper-parameter dict; return None if no component is configured.
    preprocessing_dict: dict = dhp[PHASE1]
    pipeline_list = []
    for key in preprocessing_dict:
        name = key  # like: "cat->num"
        in_feature_groups, out_feature_groups, outsideEdge_info = self.parse_key(key)
        sub_dict = preprocessing_dict[name]
        if sub_dict is None:
            continue
        preprocessor = self.create_component(sub_dict, PHASE1, name, in_feature_groups,
                                             out_feature_groups, outsideEdge_info)
        pipeline_list.extend(preprocessor)
    if pipeline_list:
        return GenericPipeline(pipeline_list)
    else:
        return None
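# Sketch of the dict shape create_preprocessor consumes, inferred from
# parse_key() and the `continue` branch; the component name and PHASE1
# layout below are assumptions for illustration only:
#
#   dhp = {
#       PHASE1: {
#           "cat->num": {"encode.one_hot": {}},  # hypothetical component config
#           "num->num": None,                    # None means: skip this edge
#       }
#   }
#   preprocessor = self.create_preprocessor(dhp)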
def evaluate(self, model: GenericPipeline, X, y, X_test, y_test):
    assert self.resource_manager is not None
    warning_info = StringIO()
    with redirect_stderr(warning_info):
        # the splitter must exist
        losses = []
        models = []
        y_true_indexes = []
        y_preds = []
        y_test_preds = []
        all_scores = []
        status = "SUCCESS"
        failed_info = ""
        for train_index, valid_index in self.splitter.split(X, y):
            X: GenericDataFrame
            X_train, X_valid = X.split([train_index, valid_index])
            y_train, y_valid = y[train_index], y[valid_index]
            if self.should_store_intermediate_result:
                intermediate_result = []
            else:
                intermediate_result = None
            try:
                procedure_result = model.procedure(self.ml_task, X_train, y_train,
                                                   X_valid, y_valid, X_test, y_test,
                                                   self.resource_manager,
                                                   intermediate_result)
            except Exception:
                failed_info = get_trance_back_msg()
                status = "FAILED"
                if self.debug:
                    self.logger.error("re-raise exception")
                    raise
                break
            models.append(model)
            y_true_indexes.append(valid_index)
            y_pred = procedure_result["pred_valid"]
            y_test_pred = procedure_result["pred_test"]
            y_preds.append(y_pred)
            y_test_preds.append(y_test_pred)
            loss, all_score = self.loss(y_valid, y_pred)
            losses.append(float(loss))
            all_scores.append(all_score)
        if len(losses) > 0:
            final_loss = float(np.array(losses).mean())
        else:
            final_loss = 65535
        # merge the per-fold score dicts key-wise, then average each metric
        if len(all_scores) > 0 and all_scores[0]:
            all_score = defaultdict(list)
            for cur_all_score in all_scores:
                if isinstance(cur_all_score, dict):
                    for key, value in cur_all_score.items():
                        all_score[key].append(value)
                else:
                    self.logger.warning(
                        f"TypeError: cur_all_score is not dict.\ncur_all_score = {cur_all_score}")
            for key in all_score.keys():
                all_score[key] = float(np.mean(all_score[key]))
        else:
            all_score = {}
            all_scores = []
        info = {
            "loss": final_loss,
            "losses": losses,
            "all_score": all_score,
            "all_scores": all_scores,
            "models": models,
            "y_true_indexes": y_true_indexes,
            "y_preds": y_preds,
            "intermediate_result": intermediate_result,
            "status": status,
            "failed_info": failed_info
        }
        # todo
        if y_test is not None:
            # ensemble the fold models' predictions to score the test set
            if self.ml_task.mainTask == "classification":
                y_test_pred = vote_predicts(y_test_preds)
            else:
                y_test_pred = mean_predicts(y_test_preds)
            test_loss, test_all_score = self.loss(y_test, y_test_pred)
            info.update({
                "test_loss": test_loss,
                "test_all_score": test_all_score,
                "y_test_true": y_test,
                "y_test_pred": y_test_pred
            })
    info["warning_info"] = warning_info.getvalue()
    return info
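# Standalone sketch (numpy only) of the per-fold score aggregation performed
# in evaluate() above: fold dicts are merged key-wise into lists, then each
# metric is averaged. The metric names and values are made up for illustration.
#
#   import numpy as np
#   from collections import defaultdict
#
#   fold_scores = [{"accuracy": 0.81, "f1": 0.74},
#                  {"accuracy": 0.79, "f1": 0.70}]
#   merged = defaultdict(list)
#   for fold_score in fold_scores:
#       for metric, value in fold_score.items():
#           merged[metric].append(value)
#   mean_score = {metric: float(np.mean(values))
#                 for metric, values in merged.items()}
#   # -> roughly {"accuracy": 0.80, "f1": 0.72}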
def create_estimator(self, dhp: Dict) -> GenericPipeline:
    # construct an estimator pipeline from the PHASE2 hyper-parameters
    return GenericPipeline(
        self.create_component(dhp[PHASE2], PHASE2, self.ml_task.role))
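# Counterpart sketch to the PHASE1 dict above: PHASE2 is assumed to hold a
# single estimator sub-dict keyed by component name (the name and
# hyper-parameters here are hypothetical, echoing the SGD test below):
#
#   dhp = {PHASE2: {"sgd": {"loss": "log", "random_state": 10}}}
#   estimator = self.create_estimator(dhp)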
def test_pipeline(self):
    self.logger = get_logger(self)
    df = pd.read_csv("../examples/classification/train_classification.csv")
    y = df.pop("Survived").values
    df = df.loc[:, ["Sex", "Cabin", "Age"]]
    feature_groups = ["cat_nan", "cat_nan", "num_nan"]
    df_train, df_test, y_train, y_test = train_test_split(df, y, test_size=0.2,
                                                          random_state=10)
    df_train = GenericDataFrame(df_train, feature_groups=feature_groups)
    df_test = GenericDataFrame(df_test, feature_groups=feature_groups)
    cv = KFold(n_splits=5, random_state=10, shuffle=True)
    train_ix, valid_ix = next(cv.split(df_train))
    df_train, df_valid = df_train.split([train_ix, valid_ix])
    y_valid = y_train[valid_ix]
    y_train = y_train[train_ix]

    fill_cat = FillCat()
    fill_cat.in_feature_groups = "cat_nan"
    fill_cat.out_feature_groups = "cat"
    fill_cat.update_hyperparams({"strategy": "<NULL>"})

    fill_num = FillNum()
    fill_num.in_feature_groups = "num_nan"
    fill_num.out_feature_groups = "num"
    fill_num.update_hyperparams({"strategy": "median"})

    ohe = OneHotEncoder()
    ohe.in_feature_groups = "cat"
    ohe.out_feature_groups = "num"

    sgd = SGD()
    sgd.in_feature_groups = "num"
    sgd.update_hyperparams({"loss": "log", "random_state": 10})

    # full pipeline: imputation -> one-hot encoding -> SGD classifier
    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
        ("sgd", sgd),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    pred_train = pipeline.predict(df_train)
    pred_test = pipeline.predict(df_test)
    pred_valid = pipeline.predict(df_valid)
    score_valid = pipeline.predict_proba(df_valid)
    self.logger.info(accuracy_score(y_train, pred_train))
    self.logger.info(accuracy_score(y_valid, pred_valid))
    self.logger.info(accuracy_score(y_test, pred_test))

    result = pipeline.procedure(constants.binary_classification_task, df_train,
                                y_train, df_valid, y_valid, df_test, y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(
        accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(
        accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

    # preprocessing-only pipeline: transform and fit_transform must agree
    pipeline = GenericPipeline([
        ("fill_cat", fill_cat),
        ("fill_num", fill_num),
        ("ohe", ohe),
    ])
    pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
    ret1 = pipeline.transform(df_train, df_valid, df_test)
    ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid, df_test, y_test)
    for key in ["X_train", "X_valid", "X_test"]:
        assert np.all(ret1[key] == ret2[key])

    # estimator-only pipeline on the preprocessed data
    pipeline = GenericPipeline([
        ("sgd", sgd),
    ])
    result = pipeline.procedure(constants.binary_classification_task,
                                ret1["X_train"], y_train, ret1["X_valid"], y_valid,
                                ret1["X_test"], y_test)
    pred_test = result["pred_test"]
    pred_valid = result["pred_valid"]
    self.logger.info(
        accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
    self.logger.info(
        accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
import logging

import pandas as pd

from autoflow.pipeline.dataframe import GenericDataFrame
from autoflow.pipeline.pipeline import GenericPipeline

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values
df = df.loc[:, ["Sex", "Ticket", "Pclass"]]
df2 = GenericDataFrame(df, feature_groups=["cat", "cat", "num"])

# split the "cat" feature group into high/low-cardinality sub-groups
split_cat = SplitCat()
split_cat.in_feature_groups = "cat"
split_cat.update_hyperparams({
    "highR": "highR_cat",
    "lowR": "lowR_cat",
    "threshold": 0.5
})
result = split_cat.fit_transform(df2)
logging.info(result)

# drop everything, then try to split what remains
df2 = GenericDataFrame(df, feature_groups=["cat", "cat", "num"])
drop_all = DropAll()
drop_all.in_feature_groups = ["cat", "num"]
drop_all.out_feature_groups = "drop"
split_cat = SplitCat()
split_cat.in_feature_groups = "cat"
pipeline = GenericPipeline([("drop_all", drop_all), ("split_cat", split_cat)])
result = pipeline.fit_transform(df2)
logging.info(result)