def test_graphpipeline_no_concat():
    """Check what a node listed in ``no_concat_nodes`` receives and returns.

    Node "C" has two parents ("A" and "B") and is flagged as a no-concat node.
    Two final nodes are exercised: a pass-through and a ``TransformToBlockManager``.
    """
    graph_edges = [("A", "C"), ("B", "C")]
    parent_names = ("A", "B")

    # Scenario 1: every node is a pass-through; the result is expected to be
    # a dict keyed by the parent node names, each entry equal to the input.
    passthrough_models = {name: DebugPassThrough(debug=True) for name in ("A", "B", "C")}
    gpipeline = GraphPipeline(passthrough_models, edges=graph_edges, no_concat_nodes={"C"})

    Xtransformed = gpipeline.fit_transform(X)

    assert isinstance(Xtransformed, dict)
    assert set(Xtransformed.keys()) == set(parent_names)
    for parent in parent_names:
        assert (Xtransformed[parent] == X).all().all()

    # Scenario 2: the final node turns what it receives into a BlockManager;
    # each parent block must still be equal to the input.
    blockmanager_models = {name: DebugPassThrough(debug=True) for name in parent_names}
    blockmanager_models["C"] = TransformToBlockManager()
    gpipeline = GraphPipeline(blockmanager_models, edges=graph_edges, no_concat_nodes={"C"})

    Xtransformed = gpipeline.fit_transform(X)

    assert isinstance(Xtransformed, BlockManager)
    for parent in parent_names:
        assert (Xtransformed[parent] == X).all().all()
def test_graphpipeline_nodes_concat_order():
    """Check that parent outputs are concatenated in the order the edges are declared.

    Each node is a ``DebugPassThrough`` that prefixes column names, so the
    resulting column names reveal which parent ended up on the left and which
    on the right. ``get_feature_names`` must agree with the actual columns.
    """
    cols = list(dfX.columns)

    def check_column_order(node_names, edges, expected_prefixes):
        # One pass-through per node, using the prefix "PT<k>_" for node "pt<k>".
        models = {
            name: DebugPassThrough(column_prefix=name.upper() + "_", debug=True)
            for name in node_names
        }
        pipeline = GraphPipeline(models, edges=edges)

        Xres = pipeline.fit_transform(dfX)

        expected_columns = [prefix + c for prefix in expected_prefixes for c in cols]
        assert list(Xres.columns) == expected_columns
        assert list(Xres.columns) == pipeline.get_feature_names()

    three_nodes = ("pt1", "pt2", "pt3")
    four_nodes = ("pt1", "pt2", "pt3", "pt4")

    scenarios = [
        ### 1 : pt1 declared first => PT1 on the left, PT2 on the right
        (three_nodes, [("pt1", "pt3"), ("pt2", "pt3")], ["PT3__PT1__", "PT3__PT2__"]),
        ### 2 : reverse order => PT2 on the left, PT1 on the right
        (three_nodes, [("pt2", "pt3"), ("pt1", "pt3")], ["PT3__PT2__", "PT3__PT1__"]),
        ### 3 : with 4 nodes => PT1 on the left, PT2 on the right
        (
            four_nodes,
            [("pt1", "pt3", "pt4"), ("pt2", "pt3", "pt4")],
            ["PT4__PT3__PT1__", "PT4__PT3__PT2__"],
        ),
        (
            four_nodes,
            [("pt1", "pt3", "pt4"), ("pt2", "pt3")],
            ["PT4__PT3__PT1__", "PT4__PT3__PT2__"],
        ),
        ### 4 : with 4 nodes, reverse order => PT2 on the left, PT1 on the right
        (
            four_nodes,
            [("pt2", "pt3", "pt4"), ("pt1", "pt3", "pt4")],
            ["PT4__PT3__PT2__", "PT4__PT3__PT1__"],
        ),
        (
            four_nodes,
            [("pt2", "pt3", "pt4"), ("pt1", "pt3")],
            ["PT4__PT3__PT2__", "PT4__PT3__PT1__"],
        ),
    ]

    for node_names, edges, expected_prefixes in scenarios:
        check_column_order(node_names, edges, expected_prefixes)
def fit_metric_model(self):
    """Fit the model predicting the (transformed) test metric from job parameters.

    The aggregated results and the parameters of the jobs already run are
    loaded from ``self.result_reader``, merged on ``job_id``, and used to fit:

    * ``self.transformer_model``: encoder + imputer applied to the parameters,
    * ``self.random_forest``: a random forest regressor on the encoded parameters,
    * ``self.random_forest_variance``: a ``RandomForestVariance`` built on that forest.

    Nothing is fitted when there are at most ``self.min_nb_of_models`` results,
    or when a model has already been fitted and the number of results has not
    changed since the previous call. ``self._nb_models_done`` is refreshed on
    every call, including the early returns.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If ``self.metric_transformation`` is not one of
        ``None``, ``"rank"``, ``"normal"``, ``"default"``.
    """
    logger.info("start computing metric model...")

    ### Load the results
    df_results = self.result_reader.load_all_results(aggregate=True)

    # The count seen by the PREVIOUS call must be captured before it is
    # overwritten; comparing the fresh count with itself would always be
    # true, and the model would then never be refitted after its first fit.
    previous_nb_models_done = getattr(self, "_nb_models_done", None)
    self._nb_models_done = len(df_results)

    if self._nb_models_done <= self.min_nb_of_models:
        # Not enough results yet to fit anything meaningful
        return self

    if previous_nb_models_done == self._nb_models_done and self.params_training_columns is not None:
        # A model is already fitted and no new result arrived: keep it
        return self

    ### Load the params
    df_params = self.result_reader.load_all_params()

    df_merged_result = pd.merge(df_params, df_results, how="inner", on="job_id")

    # Every params column except the join key
    # (NOTE(review): `diff` is presumably an order-preserving list difference, confirm)
    training_cols = diff(list(df_params.columns), ["job_id"])

    # X dataframe for parameters
    dfX_params = df_merged_result.loc[:, training_cols]

    ### Retrieve the target metric
    if self.avg_metrics:
        scorers = self.job_config.scoring
    else:
        scorers = [self.job_config.main_scorer]  # only the main scorer is used

    N = dfX_params.shape[0]
    all_y_params = []
    for scorer in scorers:
        y_params = df_merged_result["test_%s" % scorer]  # raw metric

        # Replace NaN by the scorer's observed minimum score;
        # if y_params contains only NaN this won't work
        y_params = y_params.fillna(y_params.min()).values

        if self.metric_transformation is None:
            pass

        elif self.metric_transformation == "rank":
            ### Non-parametric rank transformation => behaves like a uniform law
            y_params = kde_transfo_quantile(y_params)

        elif self.metric_transformation == "normal":
            ### Non-parametric normal transformation => behaves like a normal law
            y_params = norm.ppf(kde_transfo_quantile(y_params))

        elif self.metric_transformation == "default":
            ### Default per-metric transformation (log-like function)
            try:
                f = get_metric_default_transformation(scorer)
            except ValueError:
                logger.info(
                    "I don't know how to transform this metric %s, I'll use default normal transformation",
                    str(scorer),
                )
                f = None

            if f is None:
                # Unknown metric: fall back on the normal transformation
                y_params = norm.ppf(kde_transfo_quantile(y_params))
            else:
                y_params = f(y_params)

            if self.avg_metrics:
                # When averaging several metrics, standardize each one first
                y_params = (y_params - np.mean(y_params)) / np.std(y_params)

        else:
            raise ValueError("I don't know this metric_transformation %s" % self.metric_transformation)

        all_y_params.append(y_params.reshape((N, 1)))

    if len(all_y_params) > 1:
        # Several scorers: the target is the row-wise average of the transformed metrics
        y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
    else:
        y_params = all_y_params[0].reshape((N,))

    ### Create and fit the models
    transformer_model = GraphPipeline(
        models={"encoder": NumericalEncoder(), "imputer": NumImputer()},
        edges=[("encoder", "imputer")],
    )

    xx_params = transformer_model.fit_transform(dfX_params)

    random_forest = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)
    random_forest.fit(xx_params, y_params)

    random_forest_variance = RandomForestVariance(random_forest)
    random_forest_variance.fit(xx_params, y_params)

    # Publish the fitted objects only once everything above succeeded
    self.params_training_columns = training_cols
    self.transformer_model = transformer_model
    self.random_forest = random_forest
    self.random_forest_variance = random_forest_variance

    logger.info("metric model fitted")

    return self
def test_graphpipeline_blockselector():
    """Check ``BlockSelector`` nodes inside a ``GraphPipeline`` for several input containers.

    The input holds a text block and a numerical block; it is passed in turn
    as a dict, as a list and as a ``BlockManager``.
    """
    Xnum, y = make_classification(n_samples=100)
    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    ### Full model: each block follows its own branch before the classifier
    X = {"text": dfX_text, "num": Xnum}
    classifier_pipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )
    classifier_pipeline.fit(X, y)
    yhat = classifier_pipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    def check_selected_blocks(X, text_key, num_key):
        # Select both blocks, concatenate them through a pass-through node,
        # and verify that the two blocks come out side by side and unchanged.
        selector_pipeline = GraphPipeline(
            models={
                "BS_text": BlockSelector(text_key),
                "BS_num": BlockSelector(num_key),
                "PT": DebugPassThrough(),
            },
            edges=[("BS_text", "PT"), ("BS_num", "PT")],
        )
        Xhat = selector_pipeline.fit_transform(X)

        n_rows, n_cols = Xhat.shape[0], Xhat.shape[1]
        assert n_rows == dfX_text.shape[0]
        assert n_cols == dfX_text.shape[1] + Xnum.shape[1]

        text_columns = ["text1", "text2"]
        assert "text1" in Xhat.columns
        assert "text2" in Xhat.columns
        assert (Xhat.loc[:, text_columns] == dfX_text).all().all()

        # Whatever is not a text column must be the numerical block
        other_columns = diff(list(Xhat.columns), ["text1", "text2"])
        assert (Xhat.loc[:, other_columns].values == Xnum).all()

    ### X = dict
    check_selected_blocks({"text": dfX_text, "num": Xnum}, "text", "num")

    ### X = list: blocks are selected by position
    check_selected_blocks([dfX_text, Xnum], 0, 1)

    ### X = BlockManager
    check_selected_blocks(BlockManager({"text": dfX_text, "num": Xnum}), "text", "num")