Ejemplo n.º 1
0
 def fit_transform(self, X, y=None, **fit_params):
     """
     Fit the unsupervised treatment plan on X and return the treated frame.

     :param X: explanatory values
     :param y: must be None
     :param fit_params: accepted but not read by this method
     :return: treated data as produced by
         vtreat_impl.back_to_orig_type_data_frame
     :raises ValueError: if y is not None
     """
     X, input_kind = vtreat_impl.ready_data_frame(X)
     self.check_column_names(X.columns)
     if y is not None:
         raise ValueError("y should be None")
     self.clear()
     # Remember a hash of the frame this plan is being fit on.
     self.last_fit_x_id_ = vtreat.util.hash_data_frame(X)
     X = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     self.plan_ = vtreat_impl.fit_unsupervised_treatment(
         X=X,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
         imputation_map=self.imputation_map_,
     )
     treated = vtreat_impl.perform_transform(
         x=X, transform=self, params=self.params_
     )
     self.score_frame_ = vtreat_impl.pseudo_score_plan_variables(
         cross_frame=treated, plan=self.plan_, params=self.params_
     )
     filter_key_present = "filter_to_recommended" in self.params_.keys()
     if filter_key_present and self.params_["filter_to_recommended"]:
         # Restrict results to the variables whose "recommended" flag is set.
         scores = self.score_frame_
         recommended_vars = scores["variable"][scores["recommended"]]
         self.set_result_restriction(set(recommended_vars))
     treated = vtreat_impl.limit_to_appropriate_columns(
         res=treated, transform=self
     )
     treated, out_columns = vtreat_impl.back_to_orig_type_data_frame(
         treated, input_kind
     )
     self.last_result_columns = out_columns
     return treated
Ejemplo n.º 2
0
    def transform(self, X):
        """
        Apply the already-fit treatment plan to data.

        :param X: data
        :return: transformed data
        :raises ValueError: if the treatment has not been fit yet, or if X
            hashes the same as the frame used to fit while both
            check_for_duplicate_frames and error_on_duplicate_frames are set
        """

        X, input_kind = vtreat_impl.ready_data_frame(X)
        self.check_column_names(X.columns)
        if self.last_fit_x_id_ is None:
            raise ValueError("called transform on not yet fit treatment")
        if self.params_["check_for_duplicate_frames"]:
            # The hash is only computed when the duplicate check is enabled.
            if self.last_fit_x_id_ == vtreat.util.hash_data_frame(X):
                if self.params_["error_on_duplicate_frames"]:
                    raise ValueError(
                        "possibly called transform on same data used to fit\n"
                        + "(this causes over-fit, please use fit_transform() instead)"
                    )
                warnings.warn(
                    "possibly called transform on same data used to fit\n"
                    + "(this causes over-fit, please use fit_transform() instead)"
                )
        numeric_and_categorical = self.plan_.num_list + self.plan_.cat_list
        X = vtreat_impl.pre_prep_frame(
            X,
            col_list=numeric_and_categorical,
            cols_to_copy=self.cols_to_copy_,
            cat_cols=self.plan_.cat_list,
        )
        treated = vtreat_impl.perform_transform(
            x=X, transform=self, params=self.params_
        )
        treated = vtreat_impl.limit_to_appropriate_columns(
            res=treated, transform=self
        )
        treated, out_columns = vtreat_impl.back_to_orig_type_data_frame(
            treated, input_kind
        )
        self.last_result_columns = out_columns
        return treated
Ejemplo n.º 3
0
 def fit_transform(self, X, y):
     """
     Fit the binomial-outcome treatment plan and return the cross frame.

     :param X: pandas.DataFrame of explanatory values
     :param y: dependent values; when None the outcome_name_ column of X
         is used
     :return: cross frame limited by
         vtreat_impl.limit_to_appropriate_columns
     :raises TypeError: if X is not a pandas.DataFrame
     :raises ValueError: if len(y) differs from the row count of X, or if
         y == outcome_target_ is all True or all False
     """
     if not isinstance(X, pandas.DataFrame):
         raise TypeError("X should be a Pandas DataFrame")
     self.check_column_names(X.columns)
     if y is None:
         y = X[self.outcome_name_]
     n_rows = X.shape[0]
     if not n_rows == len(y):
         raise ValueError("X.shape[0] should equal len(y)")
     # Fraction of rows that hit the target; it must lie strictly between
     # 0 and 1 for the outcome to vary.
     target_rate = numpy.mean(y == self.outcome_target_)
     if target_rate <= 0 or target_rate >= 1:
         raise ValueError("y==outcome_target does not vary")
     X = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     if isinstance(y, pandas.Series):
         y = y.reset_index(inplace=False, drop=True)
     # model for independent transforms
     self.plan_ = None
     self.score_frame_ = None
     self.n_training_rows_ = X.shape[0]
     self.plan_ = vtreat_impl.fit_binomial_outcome_treatment(
         X=X,
         y=y,
         outcome_target=self.outcome_target_,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
         imputation_map=self.imputation_map_,
     )
     treated = vtreat_impl.perform_transform(
         x=X, transform=self, params=self.params_
     )
     # patch in cross-frame versions of complex columns such as impact
     splitter = self.params_["cross_validation_plan"]
     self.cross_plan_ = splitter.split_plan(
         n_rows=X.shape[0],
         k_folds=self.params_["cross_validation_k"],
         data=X,
         y=y,
     )
     cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
         x=X, y=y, res=treated, plan=self.plan_, cross_plan=self.cross_plan_
     )
     cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
         x=cross_frame,
         y=y,
         res=treated,
         params=self.params_,
         cross_plan=self.cross_plan_,
     )
     # use cross_frame to compute variable effects
     is_target = numpy.asarray(
         numpy.asarray(y) == self.outcome_target_, dtype=float
     )
     self.score_frame_ = vtreat_impl.score_plan_variables(
         cross_frame=cross_frame,
         outcome=is_target,
         plan=self.plan_,
         params=self.params_,
     )
     return vtreat_impl.limit_to_appropriate_columns(
         res=cross_frame, transform=self
     )
Ejemplo n.º 4
0
 def transform(self, X):
     """
     Apply the fitted treatment plan to a pandas.DataFrame.

     :param X: pandas.DataFrame to treat
     :return: treated frame limited by
         vtreat_impl.limit_to_appropriate_columns
     :raises TypeError: if X is not a pandas.DataFrame
     """
     if isinstance(X, pandas.DataFrame):
         prepared = vtreat_impl.pre_prep_frame(
             X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
         )
         treated = vtreat_impl.perform_transform(
             x=prepared, transform=self, params=self.params_
         )
         return vtreat_impl.limit_to_appropriate_columns(
             res=treated, transform=self
         )
     raise TypeError("X should be a Pandas DataFrame")
Ejemplo n.º 5
0
 def fit_transform(self, X, y):
     """
     Fit the numeric-outcome treatment plan and return the cross frame.

     :param X: pandas.DataFrame of explanatory values
     :param y: dependent values; when None the outcome_name_ column of X
         is used
     :return: cross frame limited by
         vtreat_impl.limit_to_appropriate_columns
     :raises TypeError: if X is not a pandas.DataFrame
     :raises ValueError: if len(y) differs from the row count of X, if y
         contains missing values, or if y does not vary
     """
     if not isinstance(X, pandas.DataFrame):
         raise TypeError("X should be a Pandas DataFrame")
     if y is None:
         y = X[self.outcome_name_]
     if not X.shape[0] == len(y):
         raise ValueError("X.shape[0] should equal len(y)")
     # From here on y is a plain float64 numpy array, so it carries no
     # pandas index that would need resetting later.
     y = numpy.asarray(y, dtype=numpy.float64)
     if vtreat.util.is_bad(y).sum() > 0:
         raise ValueError("y should not have any missing/NA/NaN values")
     if numpy.max(y) <= numpy.min(y):
         raise ValueError("y does not vary")
     X = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     # model for independent transforms
     self.plan_ = None
     self.score_frame_ = None
     self.n_training_rows_ = X.shape[0]
     self.plan_ = vtreat_impl.fit_numeric_outcome_treatment(
         X=X,
         y=y,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
     )
     res = vtreat_impl.perform_transform(x=X, transform=self, params=self.params_)
     # patch in cross-frame versions of complex columns such as impact
     self.cross_plan_ = self.params_["cross_validation_plan"].split_plan(
         n_rows=X.shape[0], k_folds=self.params_["cross_validation_k"], data=X, y=y
     )
     cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
         x=X, y=y, res=res, plan=self.plan_, cross_plan=self.cross_plan_
     )
     cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
         x=cross_frame,
         y=y,
         res=res,
         params=self.params_,
         cross_plan=self.cross_plan_,
     )
     # use cross_frame to compute variable effects
     self.score_frame_ = vtreat_impl.score_plan_variables(
         cross_frame=cross_frame, outcome=y, plan=self.plan_, params=self.params_
     )
     cross_frame = vtreat_impl.limit_to_appropriate_columns(
         res=cross_frame, transform=self
     )
     return cross_frame
Ejemplo n.º 6
0
 def transform(self, X):
     """
     Apply the already-fit treatment plan to X.

     :param X: data to treat
     :return: treated data as produced by
         vtreat_impl.back_to_orig_type_data_frame
     :raises ValueError: if last_fit_x_id_ is None (treatment not yet fit)
     """
     X, input_kind = vtreat_impl.ready_data_frame(X)
     self.check_column_names(X.columns)
     if self.last_fit_x_id_ is None:
         raise ValueError("called transform on not yet fit treatment")
     prepared = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     treated = vtreat_impl.perform_transform(
         x=prepared, transform=self, params=self.params_
     )
     treated = vtreat_impl.limit_to_appropriate_columns(
         res=treated, transform=self
     )
     treated, out_columns = vtreat_impl.back_to_orig_type_data_frame(
         treated, input_kind
     )
     self.last_result_columns = out_columns
     return treated
Ejemplo n.º 7
0
 def fit_transform(self, X, y=None):
     """
     Fit the unsupervised treatment plan on X and return the treated frame.

     :param X: explanatory values
     :param y: must be None
     :return: treated frame limited by
         vtreat_impl.limit_to_appropriate_columns
     :raises ValueError: if y is not None
     """
     if y is not None:
         raise ValueError("y should be None")
     prepared = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     self.n_training_rows_ = prepared.shape[0]
     self.plan_ = vtreat_impl.fit_unsupervised_treatment(
         X=prepared,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
     )
     treated = vtreat_impl.perform_transform(
         x=prepared, transform=self, params=self.params_
     )
     # Scores are computed on the full treated frame, before columns are
     # limited for the caller.
     self.score_frame_ = vtreat_impl.pseudo_score_plan_variables(
         cross_frame=treated, plan=self.plan_, params=self.params_
     )
     return vtreat_impl.limit_to_appropriate_columns(
         res=treated, transform=self
     )
Ejemplo n.º 8
0
 def transform(self, X):
     """
     Apply the already-fit treatment plan to X.

     Emits a warning when check_for_duplicate_frames is set and the
     recorded fit id equals id(X).

     :param X: data to treat
     :return: treated data as produced by
         vtreat_impl.back_to_orig_type_data_frame
     :raises ValueError: if last_fit_x_id_ is None (treatment not yet fit)
     """
     X, input_kind = vtreat_impl.ready_data_frame(X)
     self.check_column_names(X.columns)
     if self.last_fit_x_id_ is None:
         raise ValueError("called transform on not yet fit treatment")
     if self.params_['check_for_duplicate_frames']:
         # NOTE(review): id() is taken on the frame returned by
         # ready_data_frame, not on the caller's argument -- confirm that
         # helper returns the same object when no conversion is needed.
         if self.last_fit_x_id_ == id(X):
             warnings.warn(
                 "possibly called transform on same data used to fit\n"
                 + "(this causes over-fit, please use fit_transform() instead)"
             )
     prepared = vtreat_impl.pre_prep_frame(
         X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
     )
     treated = vtreat_impl.perform_transform(
         x=prepared, transform=self, params=self.params_
     )
     treated = vtreat_impl.limit_to_appropriate_columns(
         res=treated, transform=self
     )
     treated, out_columns = vtreat_impl.back_to_orig_type_data_frame(
         treated, input_kind
     )
     self.last_result_columns = out_columns
     return treated
Ejemplo n.º 9
0
    def fit_transform(self, X, y=None, **fit_params):
        """
        fit_transform data, this is the way to fit with cross methods.

        Fits the multinomial-outcome treatment plan, scores every variable
        once per distinct outcome value, and returns the cross frame.

        :param X: explanatory values
        :param y: dependent values; when None the outcome_name_ column of X
            is used
        :param fit_params: accepted but not read by this method
        :return: transformed data
        :raises ValueError: if y is None and outcome_name_ is not set, if
            len(y) differs from the row count of X, if a supplied y
            disagrees with X[outcome_name_], or if y takes fewer than 2
            distinct values
        """

        X, orig_type = vtreat_impl.ready_data_frame(X)
        self.check_column_names(X.columns)
        y_was_supplied = y is not None
        if y_was_supplied:
            y = numpy.asarray(y)
        else:
            if self.outcome_name_ is None:
                raise ValueError(".fit_transform(X) must have outcome_name set")
            y = numpy.asarray(X[self.outcome_name_])
        # Validate the length first so a mismatch reports the message below
        # instead of failing inside the element-wise comparison.
        if not X.shape[0] == len(y):
            raise ValueError("X.shape[0] should equal len(y)")
        if (
            y_was_supplied
            and (self.outcome_name_ is not None)
            and (self.outcome_name_ in X.columns)
        ):
            if not numpy.all(X[self.outcome_name_] == y):
                raise ValueError(
                    ".fit_transform(X, y) called with y != X[outcome_name]"
                )
        outcomes = numpy.unique(y)
        if len(outcomes) <= 1:
            raise ValueError("y must take on at least 2 values")
        cross_rows = None
        cross_plan = None
        if self.params_["retain_cross_plan"]:
            # Read the retained plan before clear() is called below.
            cross_rows = self.cross_rows_
            cross_plan = self.cross_plan_
        self.clear()
        self.last_fit_x_id_ = vtreat.util.hash_data_frame(X)
        X = vtreat_impl.pre_prep_frame(
            X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
        )
        # y is always a numpy array here (see numpy.asarray above), so there
        # is no pandas index to reset.
        # model for independent transforms
        self.plan_ = None
        self.score_frame_ = None
        self.outcomes_ = outcomes
        self.plan_ = vtreat_impl.fit_multinomial_outcome_treatment(
            X=X,
            y=y,
            var_list=self.var_list_,
            outcome_name=self.outcome_name_,
            cols_to_copy=self.cols_to_copy_,
            params=self.params_,
            imputation_map=self.imputation_map_,
        )
        cross_frame = vtreat_impl.perform_transform(
            x=X, transform=self, params=self.params_
        )
        if (cross_plan is None) or (cross_rows != X.shape[0]):
            if cross_plan is not None:
                warnings.warn(
                    "Number of rows different than previous fit with retain_cross_plan==True"
                )
            cross_plan = self.params_["cross_validation_plan"].split_plan(
                n_rows=X.shape[0],
                k_folds=self.params_["cross_validation_k"],
                data=X,
                y=y,
            )
            cross_rows = X.shape[0]
        # patch in cross-frame versions of complex columns such as impact;
        # the return values are not used here, cross_frame is passed as res
        vtreat_impl.cross_patch_refit_y_aware_cols(
            x=X, y=y, res=cross_frame, plan=self.plan_, cross_plan=cross_plan
        )
        vtreat_impl.cross_patch_user_y_aware_cols(
            x=cross_frame,
            y=y,
            res=cross_frame,
            params=self.params_,
            cross_plan=cross_plan,
        )
        # use cross_frame to compute variable effects

        def si(oi):
            """score i-th outcome group"""
            sf = vtreat_impl.score_plan_variables(
                cross_frame=cross_frame,
                outcome=numpy.asarray(numpy.asarray(y) == oi, dtype=float),
                plan=self.plan_,
                params=self.params_,
                is_classification=True,
            )
            sf["outcome_target"] = oi
            return sf

        score_frames = [si(oi) for oi in self.outcomes_]
        self.score_frame_ = pandas.concat(score_frames, axis=0)
        self.score_frame_.reset_index(inplace=True, drop=True)
        if ("filter_to_recommended" in self.params_.keys()) and self.params_[
            "filter_to_recommended"
        ]:
            recommended = self.score_frame_["variable"][
                self.score_frame_["recommended"]
            ]
            self.set_result_restriction(set(recommended))
        cross_frame = vtreat_impl.limit_to_appropriate_columns(
            res=cross_frame, transform=self
        )
        cross_frame, res_columns = vtreat_impl.back_to_orig_type_data_frame(
            cross_frame, orig_type
        )
        self.last_result_columns = res_columns
        if self.params_["retain_cross_plan"]:
            self.cross_plan_ = cross_plan
            self.cross_rows_ = cross_rows
        else:
            self.cross_plan_ = None
            self.cross_rows_ = None
        return cross_frame
Ejemplo n.º 10
0
    def fit_transform(self, X, y=None, **fit_params):
        """
        fit_transform data, this is the way to fit with cross methods.

        Fits the numeric-outcome treatment plan and returns the cross frame.

        :param X: explanatory values
        :param y: dependent values; when None the outcome_name_ column of X
            is used
        :param fit_params: accepted but not read by this method
        :return: transformed data
        :raises ValueError: if y is None and outcome_name_ is not set, if
            len(y) differs from the row count of X, if a supplied y
            disagrees with X[outcome_name_], if y has missing values, or if
            y does not vary
        """

        X, orig_type = vtreat_impl.ready_data_frame(X)
        self.check_column_names(X.columns)
        y_was_supplied = y is not None
        if y_was_supplied:
            y = numpy.asarray(y)
        else:
            if self.outcome_name_ is None:
                raise ValueError(".fit_transform(X) must have outcome_name set")
            y = numpy.asarray(X[self.outcome_name_])
        # Validate the length first so a mismatch reports the message below
        # instead of failing inside the element-wise comparison.
        if not X.shape[0] == len(y):
            raise ValueError("X.shape[0] should equal len(y)")
        if (
            y_was_supplied
            and (self.outcome_name_ is not None)
            and (self.outcome_name_ in X.columns)
        ):
            if not numpy.all(X[self.outcome_name_] == y):
                raise ValueError(
                    ".fit_transform(X, y) called with y != X[outcome_name]"
                )
        y = vtreat.util.safe_to_numeric_array(y)
        if vtreat.util.is_bad(y).sum() > 0:
            raise ValueError("y should not have any missing/NA/NaN values")
        if numpy.max(y) <= numpy.min(y):
            raise ValueError("y does not vary")
        cross_rows = None
        cross_plan = None
        if self.params_["retain_cross_plan"]:
            # Read the retained plan before clear() is called below.
            cross_rows = self.cross_rows_
            cross_plan = self.cross_plan_
        self.clear()
        self.last_fit_x_id_ = vtreat.util.hash_data_frame(X)
        X = vtreat_impl.pre_prep_frame(
            X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
        )
        # NOTE(review): this only matters if safe_to_numeric_array can return
        # a pandas.Series -- confirm against that helper.
        if isinstance(y, pandas.Series):
            y = y.reset_index(inplace=False, drop=True)
        # model for independent transforms
        self.plan_ = None
        self.score_frame_ = None
        self.plan_ = vtreat_impl.fit_numeric_outcome_treatment(
            X=X,
            y=y,
            var_list=self.var_list_,
            outcome_name=self.outcome_name_,
            cols_to_copy=self.cols_to_copy_,
            params=self.params_,
            imputation_map=self.imputation_map_,
        )
        cross_frame = vtreat_impl.perform_transform(
            x=X, transform=self, params=self.params_
        )
        if (cross_plan is None) or (cross_rows != X.shape[0]):
            if cross_plan is not None:
                warnings.warn(
                    "Number of rows different than previous fit with retain_cross_plan==True"
                )
            cross_plan = self.params_["cross_validation_plan"].split_plan(
                n_rows=X.shape[0],
                k_folds=self.params_["cross_validation_k"],
                data=X,
                y=y,
            )
            cross_rows = X.shape[0]
        # patch in cross-frame versions of complex columns such as impact;
        # the return values are not used here, cross_frame is passed as res
        vtreat_impl.cross_patch_refit_y_aware_cols(
            x=X, y=y, res=cross_frame, plan=self.plan_, cross_plan=cross_plan
        )
        vtreat_impl.cross_patch_user_y_aware_cols(
            x=cross_frame,
            y=y,
            res=cross_frame,
            params=self.params_,
            cross_plan=cross_plan,
        )
        # use cross_frame to compute variable effects
        self.score_frame_ = vtreat_impl.score_plan_variables(
            cross_frame=cross_frame,
            outcome=y,
            plan=self.plan_,
            params=self.params_,
            is_classification=False,
        )
        if ("filter_to_recommended" in self.params_.keys()) and self.params_[
            "filter_to_recommended"
        ]:
            recommended = self.score_frame_["variable"][
                self.score_frame_["recommended"]
            ]
            self.set_result_restriction(set(recommended))
        cross_frame = vtreat_impl.limit_to_appropriate_columns(
            res=cross_frame, transform=self
        )
        cross_frame, res_columns = vtreat_impl.back_to_orig_type_data_frame(
            cross_frame, orig_type
        )
        self.last_result_columns = res_columns
        if self.params_["retain_cross_plan"]:
            self.cross_plan_ = cross_plan
            self.cross_rows_ = cross_rows
        else:
            self.cross_plan_ = None
            self.cross_rows_ = None
        return cross_frame
Ejemplo n.º 11
0
 def fit_transform(self, X, y=None, **fit_params):
     """
     Fit the binomial-outcome treatment plan and return the cross frame.

     :param X: explanatory values
     :param y: dependent values; when None the outcome_name_ column of X
         is used
     :param fit_params: accepted but not read by this method
     :return: transformed data
     :raises ValueError: if y is None and outcome_name_ is not set, if
         len(y) differs from the row count of X, if a supplied y disagrees
         with X[outcome_name_], or if y == outcome_target_ is all True or
         all False
     """
     X, orig_type = vtreat_impl.ready_data_frame(X)
     self.check_column_names(X.columns)
     y_was_supplied = y is not None
     if y_was_supplied:
         y = numpy.asarray(y)
     else:
         if self.outcome_name_ is None:
             raise ValueError(
                 ".fit_transform(X) must have outcome_name set")
         y = numpy.asarray(X[self.outcome_name_])
     # Validate the length first so a mismatch reports the message below
     # instead of failing inside the element-wise comparison.
     if not X.shape[0] == len(y):
         raise ValueError("X.shape[0] should equal len(y)")
     if (y_was_supplied and (self.outcome_name_ is not None)
             and (self.outcome_name_ in X.columns)):
         if not numpy.all(X[self.outcome_name_] == y):
             raise ValueError(
                 ".fit_transform(X, y) called with y != X[outcome_name]"
             )
     y_mean = numpy.mean(y == self.outcome_target_)
     if y_mean <= 0 or y_mean >= 1:
         raise ValueError("y==outcome_target does not vary")
     cross_rows = None
     cross_plan = None
     if self.params_['retain_cross_plan']:
         # Read the retained plan before clear() is called below.
         cross_rows = self.cross_rows_
         cross_plan = self.cross_plan_
     self.clear()
     self.last_fit_x_id_ = vtreat.util.hash_data_frame(X)
     X = vtreat_impl.pre_prep_frame(X,
                                    col_list=self.var_list_,
                                    cols_to_copy=self.cols_to_copy_)
     # y is always a numpy array here (see numpy.asarray above), so there
     # is no pandas index to reset.
     # model for independent transforms
     self.plan_ = None
     self.score_frame_ = None
     self.plan_ = vtreat_impl.fit_binomial_outcome_treatment(
         X=X,
         y=y,
         outcome_target=self.outcome_target_,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
         imputation_map=self.imputation_map_,
     )
     res = vtreat_impl.perform_transform(x=X,
                                         transform=self,
                                         params=self.params_)
     if (cross_plan is None) or (cross_rows != X.shape[0]):
         if cross_plan is not None:
             warnings.warn(
                 "Number of rows different than previous fit with retain_cross_plan==True"
             )
         cross_plan = self.params_["cross_validation_plan"].split_plan(
             n_rows=X.shape[0],
             k_folds=self.params_["cross_validation_k"],
             data=X,
             y=y)
         cross_rows = X.shape[0]
     # patch in cross-frame versions of complex columns such as impact
     cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
         x=X, y=y, res=res, plan=self.plan_, cross_plan=cross_plan)
     cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
         x=cross_frame,
         y=y,
         res=res,
         params=self.params_,
         cross_plan=cross_plan,
     )
     # use cross_frame to compute variable effects
     self.score_frame_ = vtreat_impl.score_plan_variables(
         cross_frame=cross_frame,
         outcome=numpy.asarray(numpy.asarray(y) == self.outcome_target_,
                               dtype=float),
         plan=self.plan_,
         params=self.params_,
         is_classification=True)
     if ("filter_to_recommended" in self.params_.keys()
         ) and self.params_["filter_to_recommended"]:
         recommended = self.score_frame_["variable"][
             self.score_frame_["recommended"]]
         self.set_result_restriction(set(recommended))
     cross_frame = vtreat_impl.limit_to_appropriate_columns(res=cross_frame,
                                                            transform=self)
     cross_frame, res_columns = vtreat_impl.back_to_orig_type_data_frame(
         cross_frame, orig_type)
     self.last_result_columns = res_columns
     if self.params_['retain_cross_plan']:
         self.cross_plan_ = cross_plan
         self.cross_rows_ = cross_rows
     else:
         self.cross_plan_ = None
         self.cross_rows_ = None
     return cross_frame
Ejemplo n.º 12
0
    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit the multinomial-outcome treatment plan and return the cross frame.

        Every variable is scored once per distinct outcome value.

        :param X: explanatory values
        :param y: dependent values; when None the outcome_name_ column of X
            is used
        :param fit_params: accepted but not read by this method
        :return: transformed data
        :raises ValueError: if y is None and outcome_name_ is not set, if
            len(y) differs from the row count of X, if a supplied y
            disagrees with X[outcome_name_], or if y takes fewer than 2
            distinct values
        """
        X, orig_type = vtreat_impl.ready_data_frame(X)
        self.check_column_names(X.columns)
        y_was_supplied = y is not None
        if y_was_supplied:
            y = numpy.asarray(y)
        else:
            if self.outcome_name_ is None:
                raise ValueError(
                    ".fit_transform(X) must have outcome_name set")
            y = numpy.asarray(X[self.outcome_name_])
        # Validate the length first so a mismatch reports the message below
        # instead of failing inside the element-wise comparison.
        if not X.shape[0] == len(y):
            raise ValueError("X.shape[0] should equal len(y)")
        # Only cross-check against X when the outcome column is actually
        # present; X may legitimately hold explanatory columns only.
        if (y_was_supplied and (self.outcome_name_ is not None)
                and (self.outcome_name_ in X.columns)):
            if not numpy.all(X[self.outcome_name_] == y):
                raise ValueError(
                    ".fit_transform(X, y) called with y != X[outcome_name]"
                )
        outcomes = numpy.unique(y)
        if len(outcomes) <= 1:
            raise ValueError("y must take on at least 2 values")
        self.clear()
        self.last_fit_x_id_ = id(X)
        X = vtreat_impl.pre_prep_frame(X,
                                       col_list=self.var_list_,
                                       cols_to_copy=self.cols_to_copy_)
        # y is always a numpy array here (see numpy.asarray above), so there
        # is no pandas index to reset.
        # model for independent transforms
        self.plan_ = None
        self.score_frame_ = None
        self.outcomes_ = outcomes
        self.plan_ = vtreat_impl.fit_multinomial_outcome_treatment(
            X=X,
            y=y,
            var_list=self.var_list_,
            outcome_name=self.outcome_name_,
            cols_to_copy=self.cols_to_copy_,
            params=self.params_,
            imputation_map=self.imputation_map_,
        )
        res = vtreat_impl.perform_transform(x=X,
                                            transform=self,
                                            params=self.params_)
        # patch in cross-frame versions of complex columns such as impact
        cross_plan = self.params_["cross_validation_plan"].split_plan(
            n_rows=X.shape[0],
            k_folds=self.params_["cross_validation_k"],
            data=X,
            y=y)
        cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
            x=X, y=y, res=res, plan=self.plan_, cross_plan=cross_plan)
        cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
            x=cross_frame,
            y=y,
            res=res,
            params=self.params_,
            cross_plan=cross_plan,
        )

        # use cross_frame to compute variable effects

        def si(oi):
            """Score the plan variables against the indicator y == oi."""
            sf = vtreat_impl.score_plan_variables(cross_frame=cross_frame,
                                                  outcome=numpy.asarray(
                                                      numpy.asarray(y) == oi,
                                                      dtype=float),
                                                  plan=self.plan_,
                                                  params=self.params_,
                                                  is_classification=True)
            sf["outcome_target"] = oi
            return sf

        score_frames = [si(oi) for oi in self.outcomes_]
        self.score_frame_ = pandas.concat(score_frames, axis=0)
        self.score_frame_.reset_index(inplace=True, drop=True)
        cross_frame = vtreat_impl.limit_to_appropriate_columns(res=cross_frame,
                                                               transform=self)
        cross_frame, res_columns = vtreat_impl.back_to_orig_type_data_frame(
            cross_frame, orig_type)
        self.last_result_columns = res_columns
        if self.params_['retain_cross_plan']:
            self.cross_plan_ = cross_plan
        return cross_frame
Ejemplo n.º 13
0
 def fit_transform(self, X, y=None, **fit_params):
     """
     Fit the numeric-outcome treatment plan and return the cross frame.

     :param X: explanatory values
     :param y: dependent values; when None the outcome_name_ column of X
         is used
     :param fit_params: accepted but not read by this method
     :return: transformed data
     :raises ValueError: if y is None and outcome_name_ is not set, if
         len(y) differs from the row count of X, if a supplied y disagrees
         with X[outcome_name_], if y has missing values, or if y does not
         vary
     """
     X, orig_type = vtreat_impl.ready_data_frame(X)
     self.check_column_names(X.columns)
     y_was_supplied = y is not None
     if y_was_supplied:
         y = numpy.asarray(y)
     else:
         if self.outcome_name_ is None:
             raise ValueError(
                 ".fit_transform(X) must have outcome_name set")
         y = numpy.asarray(X[self.outcome_name_])
     # Validate the length first so a mismatch reports the message below
     # instead of failing inside the element-wise comparison.
     if not X.shape[0] == len(y):
         raise ValueError("X.shape[0] should equal len(y)")
     # Only cross-check against X when the outcome column is actually
     # present; X may legitimately hold explanatory columns only.
     if (y_was_supplied and (self.outcome_name_ is not None)
             and (self.outcome_name_ in X.columns)):
         if not numpy.all(X[self.outcome_name_] == y):
             raise ValueError(
                 ".fit_transform(X, y) called with y != X[outcome_name]"
             )
     y = vtreat.util.safe_to_numeric_array(y)
     if vtreat.util.is_bad(y).sum() > 0:
         raise ValueError("y should not have any missing/NA/NaN values")
     if numpy.max(y) <= numpy.min(y):
         raise ValueError("y does not vary")
     self.clear()
     self.last_fit_x_id_ = id(X)
     X = vtreat_impl.pre_prep_frame(X,
                                    col_list=self.var_list_,
                                    cols_to_copy=self.cols_to_copy_)
     # NOTE(review): this only matters if safe_to_numeric_array can return
     # a pandas.Series -- confirm against that helper.
     if isinstance(y, pandas.Series):
         y = y.reset_index(inplace=False, drop=True)
     # model for independent transforms
     self.plan_ = None
     self.score_frame_ = None
     self.plan_ = vtreat_impl.fit_numeric_outcome_treatment(
         X=X,
         y=y,
         var_list=self.var_list_,
         outcome_name=self.outcome_name_,
         cols_to_copy=self.cols_to_copy_,
         params=self.params_,
         imputation_map=self.imputation_map_,
     )
     res = vtreat_impl.perform_transform(x=X,
                                         transform=self,
                                         params=self.params_)
     # patch in cross-frame versions of complex columns such as impact
     cross_plan = self.params_["cross_validation_plan"].split_plan(
         n_rows=X.shape[0],
         k_folds=self.params_["cross_validation_k"],
         data=X,
         y=y)
     cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
         x=X, y=y, res=res, plan=self.plan_, cross_plan=cross_plan)
     cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
         x=cross_frame,
         y=y,
         res=res,
         params=self.params_,
         cross_plan=cross_plan,
     )
     # use cross_frame to compute variable effects
     self.score_frame_ = vtreat_impl.score_plan_variables(
         cross_frame=cross_frame,
         outcome=y,
         plan=self.plan_,
         params=self.params_,
         is_classification=False)
     cross_frame = vtreat_impl.limit_to_appropriate_columns(res=cross_frame,
                                                            transform=self)
     cross_frame, res_columns = vtreat_impl.back_to_orig_type_data_frame(
         cross_frame, orig_type)
     self.last_result_columns = res_columns
     if self.params_['retain_cross_plan']:
         self.cross_plan_ = cross_plan
     return cross_frame
Ejemplo n.º 14
0
    def fit_transform(self, X, y):
        """
        Fit the multinomial-outcome treatment plan and return the cross frame.

        Every variable is scored once per distinct outcome value and the
        per-outcome score frames are stacked into score_frame_.

        :param X: pandas.DataFrame of explanatory values
        :param y: dependent values; when None the outcome_name_ column of X
            is used
        :return: cross frame limited by
            vtreat_impl.limit_to_appropriate_columns
        :raises TypeError: if X is not a pandas.DataFrame
        :raises ValueError: if len(y) differs from the row count of X, or
            if y takes fewer than 2 distinct values
        """
        if not isinstance(X, pandas.DataFrame):
            raise TypeError("X should be a Pandas DataFrame")
        if y is None:
            y = X[self.outcome_name_]
        row_count = X.shape[0]
        if not row_count == len(y):
            raise ValueError("X.shape[0] should equal len(y)")
        if len(numpy.unique(y)) <= 1:
            raise ValueError("y must take on at least 2 values")
        X = vtreat_impl.pre_prep_frame(
            X, col_list=self.var_list_, cols_to_copy=self.cols_to_copy_
        )
        if isinstance(y, pandas.Series):
            y = y.reset_index(inplace=False, drop=True)
        # model for independent transforms
        self.plan_ = None
        self.score_frame_ = None
        self.n_training_rows_ = X.shape[0]
        self.outcomes_ = numpy.unique(y)
        self.plan_ = vtreat_impl.fit_multinomial_outcome_treatment(
            X=X,
            y=y,
            var_list=self.var_list_,
            outcome_name=self.outcome_name_,
            cols_to_copy=self.cols_to_copy_,
            params=self.params_,
        )
        treated = vtreat_impl.perform_transform(
            x=X, transform=self, params=self.params_
        )
        # patch in cross-frame versions of complex columns such as impact
        splitter = self.params_["cross_validation_plan"]
        self.cross_plan_ = splitter.split_plan(
            n_rows=X.shape[0],
            k_folds=self.params_["cross_validation_k"],
            data=X,
            y=y,
        )
        cross_frame = vtreat_impl.cross_patch_refit_y_aware_cols(
            x=X, y=y, res=treated, plan=self.plan_, cross_plan=self.cross_plan_
        )
        cross_frame = vtreat_impl.cross_patch_user_y_aware_cols(
            x=cross_frame,
            y=y,
            res=treated,
            params=self.params_,
            cross_plan=self.cross_plan_,
        )

        # use cross_frame to compute variable effects
        def score_for_outcome(level):
            """Score the plan variables against the indicator y == level."""
            indicator = numpy.asarray(
                numpy.asarray(y) == level, dtype=numpy.float64
            )
            level_scores = vtreat_impl.score_plan_variables(
                cross_frame=cross_frame,
                outcome=indicator,
                plan=self.plan_,
                params=self.params_,
            )
            level_scores["outcome_target"] = level
            return level_scores

        self.score_frame_ = pandas.concat(
            [score_for_outcome(level) for level in self.outcomes_], axis=0
        )
        self.score_frame_.reset_index(inplace=True, drop=True)
        return vtreat_impl.limit_to_appropriate_columns(
            res=cross_frame, transform=self
        )