Example #1
def test_graphpipeline_no_concat():

    gpipeline = GraphPipeline(
        {
            "A": DebugPassThrough(debug=True),
            "B": DebugPassThrough(debug=True),
            "C": DebugPassThrough(debug=True)
        },
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},  # inputs of "C" are not concatenated: it receives a dict keyed by parent node
    )

    Xtransformed = gpipeline.fit_transform(X)
    assert isinstance(Xtransformed, dict)
    assert set(Xtransformed.keys()) == {"A", "B"}
    assert (Xtransformed["A"] == X).all().all()
    assert (Xtransformed["B"] == X).all().all()

    gpipeline = GraphPipeline(
        {
            "A": DebugPassThrough(debug=True),
            "B": DebugPassThrough(debug=True),
            "C": TransformToBlockManager()
        },
        edges=[("A", "C"), ("B", "C")],
        no_concat_nodes={"C"},
    )

    Xtransformed = gpipeline.fit_transform(X)
    assert isinstance(Xtransformed, BlockManager)
    assert (Xtransformed["A"] == X).all().all()
    assert (Xtransformed["B"] == X).all().all()
Example #2
def test_graphpipeline_nodes_concat_order():
    
    cols = list(dfX.columns)
    
    ### 1
    pipeline = GraphPipeline({"pt1":DebugPassThrough(column_prefix="PT1_",debug=True),
                              "pt2":DebugPassThrough(column_prefix="PT2_",debug=True),
                              "pt3":DebugPassThrough(column_prefix="PT3_",debug=True),
                              },
                              edges = [("pt1","pt3"),("pt2","pt3")]
                              )

    Xres = pipeline.fit_transform(dfX)
    assert list(Xres.columns) == ["PT3__PT1__" + c for c in cols] + ["PT3__PT2__" + c for c in cols] # PT1 on the left, PT2 on the right
    assert list(Xres.columns) == pipeline.get_feature_names()

    ### 2 : reverse order    
    pipeline = GraphPipeline({"pt1":DebugPassThrough(column_prefix="PT1_",debug=True),
                              "pt2":DebugPassThrough(column_prefix="PT2_",debug=True),
                              "pt3":DebugPassThrough(column_prefix="PT3_",debug=True),
                              },
                              edges = [("pt2","pt3"),("pt1","pt3")]
                              )

    Xres = pipeline.fit_transform(dfX)
    assert list(Xres.columns) == ["PT3__PT2__" + c for c in cols] + ["PT3__PT1__" + c for c in cols] # PT1 on the left, PT2 on the right
    assert list(Xres.columns) == pipeline.get_feature_names()


    ### 3 : with 4 nodes
    for edges in ( [("pt1","pt3","pt4"),("pt2","pt3","pt4")] , [("pt1","pt3","pt4"),("pt2","pt3")] ):
        pipeline = GraphPipeline({"pt1":DebugPassThrough(column_prefix="PT1_",debug=True),
                                  "pt2":DebugPassThrough(column_prefix="PT2_",debug=True),
                                  "pt3":DebugPassThrough(column_prefix="PT3_",debug=True),
                                  "pt4":DebugPassThrough(column_prefix="PT4_",debug=True)} ,
                                  edges = edges
                                  )
        Xres = pipeline.fit_transform(dfX)
        assert list(Xres.columns) == ["PT4__PT3__PT1__" + c for c in cols] + ["PT4__PT3__PT2__" + c for c in cols] # PT1 on the left, PT2 on the right
        assert list(Xres.columns) == pipeline.get_feature_names()

    
    ### 4 : reverse order
    for edges in ( [("pt2","pt3","pt4"),("pt1","pt3","pt4")] , [("pt2","pt3","pt4"),("pt1","pt3")] ):
        pipeline = GraphPipeline({"pt1":DebugPassThrough(column_prefix="PT1_",debug=True),
                                  "pt2":DebugPassThrough(column_prefix="PT2_",debug=True),
                                  "pt3":DebugPassThrough(column_prefix="PT3_",debug=True),
                                  "pt4":DebugPassThrough(column_prefix="PT4_",debug=True)} ,
                                  edges = edges
                                  )
        Xres = pipeline.fit_transform(dfX)
        assert list(Xres.columns) == ["PT4__PT3__PT2__" + c for c in cols] + ["PT4__PT3__PT1__" + c for c in cols] # PT1 on the left, PT2 on the right
        assert list(Xres.columns) == pipeline.get_feature_names()
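
The pattern across all four cases: when several parent nodes feed the same child, their outputs are concatenated in the order the edges were declared, not in node-name order, and get_feature_names() reports the columns in that same order. A standalone check of this, reusing the hypothetical DebugPassThrough sketched after Example #1 (the aikit import path is an assumption):

from aikit.pipeline import GraphPipeline  # assumed import path

def make_models():
    return {"pt1": DebugPassThrough(column_prefix="PT1_"),
            "pt2": DebugPassThrough(column_prefix="PT2_"),
            "pt3": DebugPassThrough(column_prefix="PT3_")}

left_first = GraphPipeline(make_models(), edges=[("pt1", "pt3"), ("pt2", "pt3")])
right_first = GraphPipeline(make_models(), edges=[("pt2", "pt3"), ("pt1", "pt3")])

# Same set of columns in both outputs, but concatenated in a different order.
assert sorted(left_first.fit_transform(dfX).columns) == sorted(right_first.fit_transform(dfX).columns)
assert list(left_first.fit_transform(dfX).columns) != list(right_first.fit_transform(dfX).columns)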
Example #3
    def fit_metric_model(self):
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        self._nb_models_done = len(df_results)
        if self._nb_models_done <= self.min_nb_of_models:
            return self

        # NOTE: _nb_models_done was refreshed just above, so the first two
        # clauses always hold; this guard only skips refitting once a metric
        # model has already been trained.
        if (self._nb_models_done is not None
                and len(df_results) == self._nb_models_done
                and self.params_training_columns is not None):
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        df_merged_result = pd.merge(df_params, df_results, how="inner", on="job_id")

        # all parameter columns except the job identifier
        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric

        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer]  # use only the main scorer

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" %
                                        scorer]  # Retrive the raw metric
            # replace NaN by scorer's observed minimum score ; if y_params contains
            # only NaN -> won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform in non-parametric rank ....
                y_params = kde_transfo_quantile(y_params)

                # => This behave likes a uniform law

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal ...
                y_params = norm.ppf(kde_transfo_quantile(y_params))

                # => This behaves likes a normal law

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                try:
                    f = get_metric_default_transformation(scorer)
                except ValueError:
                    logger.info(
                        "I don't know how to transform this metric %s, I'll use default normal transformation"
                        % str(scorer))
                    f = None

                if f is None:
                    y_params = norm.ppf(kde_transfo_quantile(y_params))
                else:
                    y_params = f(y_params)

                if self.avg_metrics:
                    # if I'm averaging I'd rather have something centered
                    y_params = (y_params - np.mean(y_params)) / np.std(y_params)

            else:
                raise ValueError("I don't know this metric_transformation %s"
                                 % self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        if len(all_y_params) > 1:
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))


        # create the model mapping parameters to the (transformed) metric
        transformer_model = GraphPipeline(
            models={"encoder": NumericalEncoder(), "imputer": NumImputer()},
            edges=[("encoder", "imputer")],
        )

        xx_params = transformer_model.fit_transform(dfX_params)

        random_forest = RandomForestRegressor(n_estimators=100, min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        self._nb_models_done = len(df_results)

        logger.info("metric model fitted")

        return self
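
fit_metric_model regresses the observed cross-validation metric on the hyperparameters that were tried: it encodes and imputes the parameters with a small GraphPipeline, maps the raw metric to a better-behaved target ("rank", "normal", or a metric-specific default), then fits a RandomForestRegressor plus a variance estimator. The "normal" branch can be sketched with a plain empirical CDF standing in for aikit's KDE-smoothed kde_transfo_quantile (an assumption, for illustration only):

import numpy as np
from scipy.stats import norm, rankdata

def quantile_normal_transform(y):
    # Map raw scores to (0, 1) empirical quantiles, then through the normal
    # inverse CDF, so the regression target is approximately N(0, 1).
    q = (rankdata(y) - 0.5) / len(y)
    return norm.ppf(q)

scores = np.array([0.61, 0.72, 0.70, 0.55, 0.90])
print(quantile_normal_transform(scores))  # monotone in the original scores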
Example #4
def test_graphpipeline_blockselector():

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = BlockManager
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
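
Taken together, the three variants show that the same GraphPipeline runs unchanged whether the heterogeneous input arrives as a plain dict, as a list (with integer keys for BlockSelector), or as an aikit BlockManager: each BlockSelector node routes one block to its branch, and downstream nodes see the usual concatenated result.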