Example #1
def test_diff():
    list1 = [1, 2, 3]
    list2 = [3, 4, 5]

    assert diff(list1, list2) == [1, 2]
    assert diff(list2, list1) == [4, 5]

    assert diff(list1, []) == list1

    list1 = ["a", "b", "c"]
    list2 = ["d", "c", "e"]

    assert diff(list1, list2) == ["a", "b"]
    assert diff(list2, list1) == ["d", "e"]

    assert diff(list1, []) == list1

    assert isinstance(diff((1, 2, 3), (1, 2)), tuple)
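The assertions above pin down diff's contract: diff(a, b) keeps the elements of a that are absent from b, preserves a's order, and returns the same container type as a. A minimal sketch consistent with those assertions (the real aikit helper may differ in details):

def diff(list1, list2):
    # assumes hashable elements; a set gives O(1) membership tests (sketch, not aikit's code)
    exclude = set(list2)
    kept = [x for x in list1 if x not in exclude]
    # list in -> list out, tuple in -> tuple out (cf. the isinstance assert)
    return type(list1)(kept)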
Example #2
def test_FeaturesSelectorClassifier_get_feature_names():

    vect = CountVectorizer(analyzer="char", ngram_range=(1, 3))

    df = get_sample_df(100, seed=123)
    xx = vect.fit_transform(df["text_col"])
    y = 1 * (np.random.rand(xx.shape[0]) > 0.5)

    sel = FeaturesSelectorClassifier(n_components=10)
    sel.fit_transform(xx, y)

    ff0 = vect.get_feature_names()
    ff1 = sel.get_feature_names()

    assert len(diff(ff1, list(range(xx.shape[1])))) == 0

    ff2 = sel.get_feature_names(input_features=ff0)

    assert len(ff1) == len(ff2)

    for f1, f2 in zip(ff1, ff2):
        assert ff0[f1] == f2
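The last loop encodes the get_feature_names contract: called without arguments the selector returns integer column positions, and with input_features each position is mapped through the supplied names. A hedged standalone sketch of that mapping (the function below is illustrative, not aikit's API):

def get_feature_names(selected_index, input_features=None):
    # selected_index: positions of the kept columns (illustrative argument)
    if input_features is None:
        return list(selected_index)
    return [input_features[i] for i in selected_index]

assert get_feature_names([2, 0]) == [2, 0]
assert get_feature_names([2, 0], ["a", "b", "c"]) == ["c", "a"]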
Example #3
def test_intersect():
    list1 = [1, 2, 3]
    list2 = [3, 4, 5]

    assert intersect(list1, list2) == [3]
    assert intersect(list2, list1) == [3]

    list1 = [1, 2, 3, 4]
    list2 = [4, 3, 5, 6]

    assert intersect(list1, list2) == [3, 4]
    assert intersect(list2, list1) == [4, 3]

    assert intersect(list1, []) == []

    list1 = ["a", "b", "c"]
    list2 = ["d", "c", "e"]

    assert intersect(list1, list2) == ["c"]
    assert intersect(list2, list1) == ["c"]

    assert intersect(list1, []) == []
    assert isinstance(intersect((1, 2, 3), (1, 2)), tuple)
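As with diff, the assertions fix intersect's behavior: intersect(a, b) keeps the elements of a that also appear in b, in a's order, and returns a's container type. A minimal sketch under those assumptions:

def intersect(list1, list2):
    keep = set(list2)  # assumes hashable elements (sketch, not aikit's code)
    return type(list1)([x for x in list1 if x in keep])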
Example #4
    def fit(self, X, y=None):
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ######################################
        ### Special case : keep everything ###
        ######################################
        self._return_data_as_inputed = False
        if isinstance(self.columns_to_use, str) and self.columns_to_use == "all" and self.columns_to_drop is None:
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            self._final_columns_to_use = list(range(X.shape[1]))
            self._return_data_as_inputed = True
            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

            return self

        ### Columns to use ###
        list_columns_to_use = self._get_list_of_columns(columns=self.columns_to_use, X=X, regex_match=self.regex_match)
        list_columns_to_drop = self._get_list_of_columns(
            columns=self.columns_to_drop, X=X, regex_match=self.regex_match
        )

        #################################
        ### Special case : no columns ###
        #################################
        if list_columns_to_use is not None and len(list_columns_to_use) == 0:
            # This means that there is nothing to do : no columns will be kept
            self._already_fitted = True
            self._columns_to_use_is_integer = True
            self._final_columns_to_use = []

            if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
                self._Xcolumns = list(X.columns)
            else:
                self._Xcolumns = list(range(self._expected_nbcols))

            return self

        ### What is the type of columns_to_use and columns_to_drop :
        if list_columns_to_use is not None:
            is_int = "int" in str(type(list_columns_to_use[0]))
        else:
            is_int = None

        if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
            is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
        else:
            is_int_to_drop = is_int

        ### Verify type:
        if is_int is not None and is_int_to_drop is not None:
            if is_int != is_int_to_drop:
                raise ValueError(
                    "Please be consistent between 'columns_to_use' and 'columns_to_drop', both can be integer or str, but they should have the same type"
                )

        if is_int is None and is_int_to_drop is None:
            is_int = True
            is_int_to_drop = True

        if is_int is None and is_int_to_drop is not None:
            is_int = is_int_to_drop
        if is_int_to_drop is None and is_int is not None:
            is_int_to_drop = is_int

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if is_int:

                ##############################################
                ### Case 1 : DataFrame + Integer selection ###
                ##############################################

                if self.regex_match:
                    #######################
                    ## Case 1a : + Regex ##
                    #######################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    # Check all columns are available

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                    # final_columns_to_use = intersect( list_columns_to_use  , list(range(self._expected_nbcols)) )
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:

                #############################################
                ### Case 2 : DataFrame + String selection ###
                #############################################
                if self.regex_match:
                    #######################
                    ## Case 2a : + Regex ##
                    #######################
                    if list_columns_to_use is not None:
                        cols_that_match = []
                        for col in list(X.columns):
                            for r in list_columns_to_use:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match.append(col)
                                    break

                    if list_columns_to_drop is not None:
                        cols_that_match_drop = []
                        for col in list(X.columns):
                            for r in list_columns_to_drop:
                                if re.search(r, col) is not None:  # TODO : allow a compiled regex
                                    cols_that_match_drop.append(col)
                                    break

                    if list_columns_to_use is not None:
                        final_columns_to_use = cols_that_match
                        # final_columns_to_use = intersect(cols_that_match ,  list(X.columns)) # technically the intersect is useless
                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:
                        final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

                else:
                    ########################
                    ## Case 2b : no Regex ##
                    ########################
                    cols_set = set(X.columns)
                    if list_columns_to_use is not None:

                        for l in list_columns_to_use:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                        final_columns_to_use = list_columns_to_use  # intersect(list_columns_to_use, list(X.columns))

                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:

                        for l in list_columns_to_drop:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                        final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:

            if is_int or is_int is None:
                ##########################################
                ### Case 3 : Array + Integer selection ###
                ##########################################
                if self.regex_match:

                    ########################
                    ## Case 3a  : + Regex ##
                    ########################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                ########################
                ## Case 3b : no Regex ##
                ########################
                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:
                #########################################
                ### Case 4 : Array + String selection ###
                #########################################
                raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        self._columns_to_use_is_integer = is_int
        self._final_columns_to_use = final_columns_to_use

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self
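The is_int bookkeeping above reduces to one rule: columns_to_use and columns_to_drop must agree on int-vs-str, a missing side inherits the other side's type, and integer selection is the default when both are absent. A standalone sketch of just that rule (function and argument names are illustrative):

def reconcile_is_int(use, drop):
    # hedged restatement of the reconciliation logic in fit
    is_int = isinstance(use[0], int) if use else None
    is_int_drop = isinstance(drop[0], int) if drop else is_int
    if is_int is not None and is_int_drop is not None and is_int != is_int_drop:
        raise ValueError("columns_to_use and columns_to_drop must share one type")
    if is_int is None:
        is_int = True if is_int_drop is None else is_int_drop
    return is_int

assert reconcile_is_int([0, 2], None) is True
assert reconcile_is_int(["a"], ["b"]) is False
assert reconcile_is_int(None, None) is True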
Example #5
    def fit_metric_model(self):
        logger.info("start computing metric model...")

        ### Load the results
        df_results = self.result_reader.load_all_results(aggregate=True)

        if len(df_results) <= self.min_nb_of_models:
            return self

        # nothing new since the last fit : no need to refit the metric model
        if (self._nb_models_done is not None
                and len(df_results) == self._nb_models_done
                and self.params_training_columns is not None):
            return self

        ### Load the params
        df_params = self.result_reader.load_all_params()

        df_merged_result = pd.merge(df_params,
                                    df_results,
                                    how="inner",
                                    on="job_id")

        training_cols = diff(list(df_params.columns), ["job_id"])

        # X dataframe for parameters
        dfX_params = df_merged_result.loc[:, training_cols]

        ### Retrieve the target metric

        if self.avg_metrics:
            scorers = self.job_config.scoring
        else:
            scorers = [self.job_config.main_scorer]  # I'll use only the main_scorer

        N = dfX_params.shape[0]
        all_y_params = []
        for scorer in scorers:
            y_params = df_merged_result["test_%s" % scorer]  # Retrieve the raw metric
            # replace NaN by the scorer's observed minimum score; if y_params
            # contains only NaN this won't work
            y_params = y_params.fillna(y_params.min()).values

            if self.metric_transformation is None:
                pass

            elif self.metric_transformation == "rank":
                ### Transform in non-parametric rank ....
                y_params = kde_transfo_quantile(y_params)

                # => This behave likes a uniform law

            elif self.metric_transformation == "normal":
                ### Transform into non-parametric normal ...
                y_params = norm.ppf(kde_transfo_quantile(y_params))

                # => This behaves likes a normal law

            elif self.metric_transformation == "default":
                ### Transform using default transformation (log like function)
                try:
                    f = get_metric_default_transformation(scorer)
                except ValueError:
                    logger.info(
                        "I don't know how to transform this metric %s, I'll use default normal transformation"
                        % str(scorer))
                    f = None

                if f is None:
                    y_params = norm.ppf(kde_transfo_quantile(y_params))
                else:
                    y_params = f(y_params)

                if self.avg_metrics:
                    # If I'm averaging I'd rather have something centered
                    y_params = (y_params -
                                np.mean(y_params)) / np.std(y_params)

            else:
                raise ValueError("I don't know this metric_transformation %s" %
                                 self.metric_transformation)

            all_y_params.append(y_params.reshape((N, 1)))

        if len(all_y_params) > 1:
            y_params = np.concatenate(all_y_params, axis=1).mean(axis=1)
        else:
            y_params = all_y_params[0].reshape((N, ))

        # create model
        transformer_model = GraphPipeline(
            models={"encoder": NumericalEncoder(), "imputer": NumImputer()},
            edges=[("encoder", "imputer")],
        )

        xx_params = transformer_model.fit_transform(dfX_params)

        random_forest = RandomForestRegressor(n_estimators=100,
                                              min_samples_leaf=5)

        random_forest.fit(xx_params, y_params)

        random_forest_variance = RandomForestVariance(random_forest)
        random_forest_variance.fit(xx_params, y_params)

        self.params_training_columns = training_cols
        self.transformer_model = transformer_model
        self.random_forest = random_forest
        self.random_forest_variance = random_forest_variance

        self._nb_models_done = len(df_results)

        logger.info("metric model fitted")

        return self
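The metric_transformation branches map raw scores onto a common scale: "rank" pushes them toward a uniform distribution, "normal" additionally applies the normal quantile function, and "default" uses a metric-specific link. A self-contained illustration that uses a plain rank-based quantile as a stand-in for kde_transfo_quantile (the real helper smooths the ranks with a KDE):

import numpy as np
from scipy.stats import norm, rankdata

def rank_quantile(y):
    # stand-in for kde_transfo_quantile: maps scores into (0, 1)
    return (rankdata(y) - 0.5) / len(y)

y = np.array([0.2, 0.9, 0.4, 0.7])
uniform_like = rank_quantile(y)        # "rank": behaves like a uniform law
normal_like = norm.ppf(uniform_like)   # "normal": behaves like a normal law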
Example #6
def create_graphical_representation(steps):
    """ from a an OrderedDict of steps create a Graphical reprensetation of the model we'll use """

    # Note: a priori, we should put the step numbers in the graph
    # and set the correct labels
    # so that we can have several nodes with the same name (e.g. Scaler...)

    ### 1) Split Composition Steps vs Rest
    all_composition_steps = []
    all_others = []
    for (step_name, model_name), var_type in steps.items():
        if StepCategories.is_composition_step(step_name):
            all_composition_steps.append((step_name, model_name, var_type))
        else:
            all_others.append((step_name, model_name, var_type))

    ### 2) Create Graph for non-composition step
    new_steps = OrderedDict()

    G = nx.DiGraph()
    for step_name, model_name, var_type in all_others:

        unested_var_type = unnest_tuple(var_type)

        terminal_nodes = gh.get_terminal_nodes(G)  # Terminal nodes: I'll add the new step after one (or more) of those

        ending_node_type = {
            unnest_tuple(steps[node]): node
            for node in terminal_nodes
        }

        node_name = (step_name, model_name)  # 2-tuple
        if node_name in G.nodes:
            raise ValueError("This node already exists '(%s,%s)'" % node_name)

        # 1) Either I attach the new node to ONE terminal node
        # 2) Or I create a new branch (a new node attached to nothing)
        # 3) Or I attach it to SEVERAL terminal nodes

        elif unested_var_type in ending_node_type:
            ### 1) I already have a branch of this type
            last_node = ending_node_type[unested_var_type]
            G = gh.add_node_after(G, node_name, last_node)

        ### I don't have a branch ###
        else:
            all_candidates = [(t, n) for t, n in ending_node_type.items()
                              if tuple_include(t, unested_var_type)]
            # I need to look where I want to plug it #
            if len(all_candidates) == 0:
                ### 2) I must create a new branch: no candidate node ###
                G = gh.add_node_after(G, node_name)
            else:
                ### 3) I attach it to several nodes

                ### Here: we sometimes need to add an UPSTREAM node, when some types haven't been added yet
                types_added = unnest_tuple([t for t, n in all_candidates])
                types_not_added = diff(unested_var_type, types_added)
                if len(types_not_added) > 0:

                    name_of_cat = "Selector_%s" % unnest_tuple(types_not_added)
                    new_node = (name_of_cat, (name_of_cat, SpecialModels.ColumnsSelector))

                    G = gh.add_node_after(G, new_node)

                    # I also must dynamically add the node to the list of steps
                    new_steps[new_node] = types_not_added

                    all_candidates = all_candidates + [(types_not_added, new_node)]

                G = gh.add_node_after(G, node_name, *[n for t, n in all_candidates])

    ### 3) Include composition node on top
    for step_name, model_name, _ in reversed(all_composition_steps):
        starting_nodes = gh.get_starting_nodes(G)
        for n in starting_nodes:
            G.add_edge((step_name, model_name), n)

    ### 4) Verify the Graph structure

    for (step_name, model_name), _ in steps.items():
        if (step_name, model_name) not in G:
            raise ValueError("'(%s , %s)' should be in graph" %
                             (step_name, model_name))
    # check that all graph nodes correspond to known steps
    for node in G.nodes():
        if node not in steps and node not in new_steps:
            raise ValueError("'(%s,%s)' shouldn't be in graph" % node)

    assert_model_graph_structure(G)

    return G, new_steps
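The routine leans on two helpers whose behavior can be inferred from their call sites: unnest_tuple flattens nested tuples into one flat tuple, and tuple_include tests whether every element of one tuple appears in another. Hedged sketches (aikit's actual implementations may differ):

def unnest_tuple(t):
    # recursively flattens nested tuples/lists into one flat tuple (sketch)
    result = []
    for x in (t if isinstance(t, (tuple, list)) else (t,)):
        if isinstance(x, (tuple, list)):
            result.extend(unnest_tuple(x))
        else:
            result.append(x)
    return tuple(result)

def tuple_include(t1, t2):
    # True if every element of t1 also appears in t2 (sketch)
    return all(x in t2 for x in t1)

assert unnest_tuple((("a",), ("b", "c"))) == ("a", "b", "c")
assert tuple_include(("a",), ("a", "b")) and not tuple_include(("c",), ("a", "b"))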
Example #7
    def fit(self, X, y):

        if y is None:
            raise ValueError("I need a value for 'y'")

        self._random_gen = check_random_state(self.random_state)

        Xtype = get_type(X)
        if Xtype != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")
        Xcolumns = list(X.columns)

        if not isinstance(y, pd.Series):
            sy = pd.Series(y)
        else:
            sy = y

        # Columns to encode and to keep
        if self.columns_to_encode is None:
            self._columns_to_encode = self.guess_columns_to_encode(X)

        elif isinstance(self.columns_to_encode,
                        str) and self.columns_to_encode == "--object--":
            self._columns_to_encode = list(X.columns[X.dtypes == "object"])

        else:
            self._columns_to_encode = list(self.columns_to_encode)

        X = get_rid_of_categories(X)

        # Verif:
        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")

        for c in self._columns_to_encode:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        if self.columns_to_keep is None:
            self._columns_to_keep = diff(Xcolumns, self._columns_to_encode)
        else:
            self._columns_to_keep = list(self.columns_to_keep)

        # Verif:
        if not isinstance(self._columns_to_keep, list):
            raise TypeError("_columns_to_keep should be a list")

        for c in self._columns_to_keep:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        # Target information
        if self.is_regression:

            self.target_classes = None  # No target classes for Regressor
            self.global_std = np.std(sy)

        else:
            # For classification I need to store it
            self.global_std = None
            self.target_classes = list(np.unique(sy))

            if len(self.target_classes) == 2:
                self.target_classes = self.target_classes[1:]

        # Columns on which we want None to be a special modality
        self._na_to_null = dict()
        for col in self._columns_to_encode:
            ii_null = X[col].isnull()
            self._na_to_null[col] = ii_null.sum() >= self.max_na_percentage * len(X)

        self._target_aggregat, self._target_aggregat_global = self._fit_aggregat(
            X, sy, noise_level=None)

        # Features names
        self._feature_names = [c for c in self._columns_to_keep]  # copy
        for col in self._columns_to_encode:
            self._feature_names += self._get_output_column_name(
                col=col, target_classes=self.target_classes)
            # self._feature_names += ["%s__target_%s" % (col,str(t)) for t in self.target_classes]

        return self
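The _na_to_null computation gives a column its own "missing" modality only when the share of nulls reaches max_na_percentage. A self-contained illustration of that rule, assuming max_na_percentage is a fraction of the rows:

import pandas as pd

X = pd.DataFrame({"a": [1.0, None, None, 4.0], "b": [1.0, 2.0, 3.0, None]})
max_na_percentage = 0.3  # assumed to be a fraction, as in the code above

na_to_null = {col: X[col].isnull().sum() >= max_na_percentage * len(X) for col in X.columns}
assert na_to_null == {"a": True, "b": False}  # 50% null vs 25% null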
Example #8
def test_graphpipeline_blockselector():

    Xnum, y = make_classification(n_samples=100)

    dfX_text = pd.DataFrame({"text1": get_random_strings(100), "text2": get_random_strings(100)})

    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={
            "BS_text": BlockSelector("text"),
            "CV": CountVectorizerWrapper(analyzer="char"),
            "BS_num": BlockSelector("num"),
            "RF": DecisionTreeClassifier(),
        },
        edges=[("BS_text", "CV", "RF"), ("BS_num", "RF")],
    )

    graphpipeline.fit(X, y)
    yhat = graphpipeline.predict(X)

    assert yhat.ndim == 1
    assert yhat.shape[0] == y.shape[0]

    ### X = dict ###
    X = {"text": dfX_text, "num": Xnum}

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = list
    X = [dfX_text, Xnum]

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector(0), "BS_num": BlockSelector(1), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()

    ### X = BlockManager
    X = BlockManager({"text": dfX_text, "num": Xnum})

    graphpipeline = GraphPipeline(
        models={"BS_text": BlockSelector("text"), "BS_num": BlockSelector("num"), "PT": DebugPassThrough()},
        edges=[("BS_text", "PT"), ("BS_num", "PT")],
    )

    Xhat = graphpipeline.fit_transform(X)

    assert Xhat.shape[0] == dfX_text.shape[0]
    assert Xhat.shape[1] == dfX_text.shape[1] + Xnum.shape[1]

    assert "text1" in Xhat.columns
    assert "text2" in Xhat.columns
    assert (Xhat.loc[:, ["text1", "text2"]] == dfX_text).all().all()

    cols = diff(list(Xhat.columns), ["text1", "text2"])
    assert (Xhat.loc[:, cols].values == Xnum).all()
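Each BlockSelector in the test simply pulls one named (or indexed) block out of the composite input, which is why the same pipeline shape works for a dict, a list, and a BlockManager. A minimal sketch of that contract (the real aikit class also validates input types):

class BlockSelectorSketch:
    # sketch of the selection contract, not aikit's implementation
    def __init__(self, block_to_select):
        self.block_to_select = block_to_select

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn

    def transform(self, X):
        # dict -> string key, list/tuple -> integer index,
        # BlockManager -> block name, all via the same __getitem__
        return X[self.block_to_select]

    def fit_transform(self, X, y=None):
        return self.fit(X, y).transform(X)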
Example #9
    def fit(self, X, y=None):
        self._expected_type = dsh.get_type(X)
        self._expected_nbcols = dsh._nbcols(X)

        ### Columns to use ###
        if self.columns_to_use is None:
            list_columns_to_use = None  # [i for i in range(self._expected_nbcols)]
        else:
            list_columns_to_use = self.convert_to_list(cols_list=self.columns_to_use)

        ### Columns to drop ###
        if self.columns_to_drop is None:
            list_columns_to_drop = None
        else:
            list_columns_to_drop = self.convert_to_list(cols_list=self.columns_to_drop)

        if list_columns_to_use is not None and len(list_columns_to_use) == 0:
            raise ValueError("columns_to_use is empty")

        ### What is the type of columns_to_use and columns_to_drop :
        if list_columns_to_use is not None:
            is_int = "int" in str(type(list_columns_to_use[0]))
        else:
            is_int = None

        if list_columns_to_drop is not None and len(list_columns_to_drop) > 0:
            is_int_to_drop = "int" in str(type(list_columns_to_drop[0]))
        else:
            is_int_to_drop = is_int

        ### Verify type:
        if is_int is not None and is_int_to_drop is not None:
            if is_int != is_int_to_drop:
                raise ValueError(
                    "Please be consistent between columns_to_use and columns_to_drop, both can be integer or str, but they should have the same type"
                )

        if is_int is None and is_int_to_drop is None:
            is_int = True
            is_int_to_drop = True

        if is_int is None and is_int_to_drop is not None:
            is_int = is_int_to_drop
        if is_int_to_drop is None and is_int is not None:
            is_int_to_drop = is_int

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            if is_int:

                ##############################################
                ### Case 1 : DataFrame + Integer selection ###
                ##############################################

                if self.regex_match:
                    #######################
                    ## Case 1a : + Regex ##
                    #######################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    # Check all columns are available

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = list_columns_to_use
                    # final_columns_to_use = intersect( list_columns_to_use  , list(range(self._expected_nbcols)) )
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:

                #############################################
                ### Case 2 : DataFrame + String selection ###
                #############################################
                if self.regex_match:
                    #######################
                    ## Case 2a : + Regex ##
                    #######################
                    if list_columns_to_use is not None:
                        cols_that_match = []
                        for col in list(X.columns):
                            for r in list_columns_to_use:
                                if re.search(r, col) is not None:
                                    cols_that_match.append(col)
                                    break

                    if list_columns_to_drop is not None:
                        cols_that_match_drop = []
                        for col in list(X.columns):
                            for r in list_columns_to_drop:
                                if re.search(r, col) is not None:
                                    cols_that_match_drop.append(col)
                                    break

                    if list_columns_to_use is not None:
                        final_columns_to_use = cols_that_match
                        # final_columns_to_use = intersect(cols_that_match ,  list(X.columns)) # technically the intersect is useless
                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:
                        final_columns_to_use = diff(final_columns_to_use, cols_that_match_drop)

                else:
                    ########################
                    ## Case 2b : no Regex ##
                    ########################
                    cols_set = set(X.columns)
                    if list_columns_to_use is not None:

                        for l in list_columns_to_use:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)
                        final_columns_to_use = list_columns_to_use  # intersect(list_columns_to_use, list(X.columns))

                    else:
                        final_columns_to_use = list(X.columns)

                    if list_columns_to_drop is not None:

                        for l in list_columns_to_drop:
                            if l not in cols_set:
                                raise ValueError("Column %s isn't in the columns of the DataFrame" % l)

                        final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

        else:

            if is_int:
                ##########################################
                ### Case 3 : Array + Integer selection ###
                ##########################################
                if self.regex_match:

                    ########################
                    ## Case 3a  : + Regex ##
                    ########################
                    raise ValueError("regex_match can only work with strings 'columns_to_use' not int")

                ########################
                ## Case 3b : no Regex ##
                ########################
                cols_set = set(range(self._expected_nbcols))
                if list_columns_to_use is not None:

                    for l in list_columns_to_use:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = intersect(list_columns_to_use, list(range(self._expected_nbcols)))
                else:
                    final_columns_to_use = list(range(self._expected_nbcols))

                if list_columns_to_drop is not None:

                    for l in list_columns_to_drop:
                        if l not in cols_set:
                            raise ValueError("Column %d isn't in the columns of the DataFrame" % l)

                    final_columns_to_use = diff(final_columns_to_use, list_columns_to_drop)

            else:
                #########################################
                ### Case 4 : Array + String selection ###
                #########################################
                raise ValueError("columns_to_use must be integers when type is array or sparseArray")

        self._columns_to_use_is_integer = is_int
        self._final_columns_to_use = final_columns_to_use

        if self._expected_type in (DataTypes.DataFrame, DataTypes.SparseDataFrame):
            self._Xcolumns = list(X.columns)
        else:
            self._Xcolumns = list(range(self._expected_nbcols))

        ## TODO : here make a simplification into a slice when it is possible

        self._already_fitted = True

        return self
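The regex branch (Case 2a) keeps a column as soon as any pattern matches its name via re.search. A tiny standalone illustration of that matching rule:

import re

columns = ["num_1", "num_2", "text_a"]  # illustrative column names
patterns = ["^num"]
cols_that_match = [c for c in columns if any(re.search(p, c) for p in patterns)]
assert cols_that_match == ["num_1", "num_2"]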
Example #10
    def fit(self, X, y=None):

        Xtype = get_type(X)
        if Xtype != DataTypes.DataFrame:
            raise TypeError("X should be a DataFrame")

        Xcolumns = list(X.columns)

        # Columns to encode and to keep
        if self.columns_to_encode is None:
            self._columns_to_encode = self.guess_columns_to_encode(X)

        elif isinstance(self.columns_to_encode,
                        str) and self.columns_to_encode == "--object--":
            self._columns_to_encode = list(X.columns[X.dtypes == "object"])
        else:
            self._columns_to_encode = list(self.columns_to_encode)

        # Verif:
        if not isinstance(self._columns_to_encode, list):
            raise TypeError("_columns_to_encode should be a list")

        for c in self._columns_to_encode:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        if self.columns_to_keep is None:
            self._columns_to_keep = diff(Xcolumns, self._columns_to_encode)
        else:
            self._columns_to_keep = list(self.columns_to_keep)

        # Verif:
        if not isinstance(self._columns_to_keep, list):
            raise TypeError("_columns_to_keep should be a list")

        for c in self._columns_to_keep:
            if c not in Xcolumns:
                raise ValueError("column %s isn't in the DataFrame" % c)

        self.variable_modality_mapping = {
            col: self.modalities_filter(X[col])
            for col in self._columns_to_encode
        }

        # Note: if we don't want an encoding where the modalities are numbered in increasing order, we could randomize the numbers here

        if self.encoding_type == "num":
            self._feature_names = self._columns_to_keep + self._columns_to_encode

            self.columns_mapping = {c: [c] for c in self._feature_names}

        elif self.encoding_type == "dummy":

            self.columns_mapping = {c: [c] for c in self._columns_to_keep}

            index_column = {}
            self._variable_shift = {}
            cum_max = 0
            for col in self._columns_to_encode:

                self.columns_mapping[col] = []

                for i, (mod, ind) in enumerate(
                        self.variable_modality_mapping[col].items()):
                    index_column[ind + cum_max] = col + "__" + str(mod)

                    self.columns_mapping[col].append(col + "__" + str(mod))

                self._variable_shift[col] = cum_max
                cum_max += i + 1

            self._dummy_size = cum_max
            self._dummy_feature_names = [
                index_column[i] for i in range(cum_max)
            ]
            self._feature_names = self._columns_to_keep + self._dummy_feature_names

        else:
            raise NotImplementedError("I don't know that type of encoding %s" %
                                      self.encoding_type)

        return self
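In the "dummy" branch, each encoded column's modalities receive consecutive global indices, offset by the cumulative size of the columns before it (_variable_shift). A self-contained walk-through with a toy mapping; note that, exactly as in the loop above, a column with an empty mapping would leave i undefined:

# toy stand-in for self.variable_modality_mapping
variable_modality_mapping = {"color": {"red": 0, "blue": 1}, "size": {"S": 0, "M": 1, "L": 2}}

index_column, variable_shift, cum_max = {}, {}, 0
for col, mapping in variable_modality_mapping.items():
    for i, (mod, ind) in enumerate(mapping.items()):
        index_column[ind + cum_max] = col + "__" + str(mod)
    variable_shift[col] = cum_max
    cum_max += i + 1  # shift the next column past this one's modalities

assert [index_column[j] for j in range(cum_max)] == [
    "color__red", "color__blue", "size__S", "size__M", "size__L"]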