Ejemplo n.º 1
0
def test_not_dataframe_Validate_dataframe():
    """validate_dataframe must raise TypeError for list and tuple inputs."""
    non_dataframes = (
        [[1, 4, 5], [-5, 8, 9]],  # nested list
        ([1, 4, 5], [-5, 8, 9]),  # tuple of lists
    )
    for candidate in non_dataframes:
        with pytest.raises(TypeError):
            validate_dataframe(candidate)
Ejemplo n.º 2
0
 def fit(self, X, y=None, **fitparams):
     """Fit a fresh clone of every transformer in ``list_of_transformers``.

     Args:
         X: Input data, passed through ``validate_dataframe``.
         y: Optional target, forwarded to each transformer's ``fit``.
         **fitparams: Extra keyword arguments forwarded to each ``fit``.

     Returns:
         self, with the fitted clones stored in ``fitted_transformers_``
         in the same order as ``list_of_transformers``.
     """
     X = validate_dataframe(X)
     # Forward the caller's y: hard-coding y=None would fit supervised
     # transformers without their target.
     self.fitted_transformers_ = [
         clone(transformer).fit(X, y=y, **fitparams)
         for transformer in self.list_of_transformers
     ]
     return self
Ejemplo n.º 3
0
 def fit(self, X, y=None, **fitparams):
     """Record an inferred dtype for every column of X.

     After validating the input and the parameters, stores
     ``{column: self._infer_dtype(X[column])}`` in ``self.types``.
     ``y`` and ``fitparams`` are not used here.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self._validate_params(X)
     self.types = {
         name: self._infer_dtype(X[name]) for name in X.columns
     }
     return self
Ejemplo n.º 4
0
 def transform(self, X, **transformparams):
     """Apply every fitted transformer to X and join the outputs column-wise.

     Each entry of ``fitted_transformers_`` transforms the same copy of the
     validated input; the results are concatenated along ``axis=1``.

     Returns:
         A copy of the concatenated result.
     """
     frame = validate_dataframe(X).copy()
     pieces = [fitted.transform(frame) for fitted in self.fitted_transformers_]
     combined = pd.concat(pieces, axis=1)
     return combined.copy()
 def fit(self, X, y=None, **fitparams):
     """Fit a StandardScaler on X, then a KernelPCA on the scaled data.

     The fitted objects are stored as ``self.scaler`` and
     ``self.kernelpca``. ``y`` and ``fitparams`` are not used here.

     Returns:
         self
     """
     X = validate_dataframe(X)

     # Stage 1: centre and scale the input.
     self.scaler = StandardScaler(
         copy=self.copy,
         with_mean=True,
         with_std=True,
     )
     self.scaler.fit(X)

     # Stage 2: kernel PCA on the standardised values.
     self.kernelpca = KernelPCA(
         n_components=self.n_components,
         kernel=self.kernel,
         gamma=self.gamma,
         degree=self.degree,
     )
     standardised = self.scaler.transform(X)
     self.kernelpca.fit(standardised)
     return self
 def fit(self, X, y=None, **fitparams):
     """Fit a StandardScaler on X, then a whitening PCA on the scaled data.

     The fitted objects are stored as ``self.scaler`` and ``self.pca``.
     ``y`` and ``fitparams`` are not used here.

     Returns:
         self
     """
     X = validate_dataframe(X)

     # Stage 1: centre and scale the input.
     self.scaler = StandardScaler(
         copy=self.copy,
         with_mean=True,
         with_std=True,
     )
     self.scaler.fit(X)

     # Stage 2: PCA (whiten=True) on the standardised values.
     self.pca = PCA(n_components=self.n_components, whiten=True)
     standardised = self.scaler.transform(X)
     self.pca.fit(standardised)
     return self
 def transform(self, X, **transformparams):
     """Replace blank strings and missing values in every column.

     Cells whose stripped string value equals ``''`` are set to
     ``self.value_if_empty``; afterwards, cells that are null are set to
     ``self.value_if_none``.

     Args:
         X: Input data, passed through ``validate_dataframe``. Every
             column is accessed through the ``.str`` accessor.
         **transformparams: Unused.

     Returns:
         A modified copy of X.
     """
     X = validate_dataframe(X)
     Xout = X.copy()
     for col in Xout.columns:
         # Assign through a single .loc call. Chained indexing
         # (Xout[col][mask] = ...) may write to a temporary object and is
         # a no-op under pandas Copy-on-Write.
         is_blank = Xout[col].str.strip() == ''
         Xout.loc[is_blank, col] = self.value_if_empty
         is_missing = pd.isnull(Xout[col])
         Xout.loc[is_missing, col] = self.value_if_none
     return Xout
 def transform(self, X, **transformparams):
     """Project X onto the fitted kernel principal components.

     X is scaled with ``self.scaler`` and then transformed with
     ``self.kernelpca``. Output columns are named
     ``prefix + zero-padded component number + suffix``.

     Returns:
         A DataFrame of the components that keeps X's index.
     """
     frame = validate_dataframe(X).copy()
     scaled = self.scaler.transform(frame)
     components = self.kernelpca.transform(scaled)

     names = []
     for n in range(components.shape[1]):
         names.append(self.prefix + '{0:03g}'.format(n) + self.suffix)

     return pd.DataFrame(components, index=frame.index, columns=names)
Ejemplo n.º 9
0
    def transform(self, X, **transformparams):
        """Keep only the columns whose names pass the configured filters.

        Three per-column masks are built from ``self.contains``,
        ``self.prefix`` and ``self.suffix``. A filter set to ``None``
        matches every column. The masks are combined with ``self.operator``
        (``'and'`` or ``'or'``).

        Returns:
            A copy of X restricted to the selected columns.

        Raises:
            ValueError: If ``self.operator`` is neither ``'and'`` nor
                ``'or'``.
        """
        X = validate_dataframe(X)
        X = X.copy()
        names = X.columns
        match_all = [True] * len(names)

        contains_mask = (match_all if self.contains is None
                         else names.str.contains(self.contains))
        prefix_mask = (match_all if self.prefix is None
                       else names.str.startswith(self.prefix))
        suffix_mask = (match_all if self.suffix is None
                       else names.str.endswith(self.suffix))

        if self.operator == 'and':
            def combine(c, p, s):
                return c and p and s
        elif self.operator == 'or':
            def combine(c, p, s):
                return c or p or s
        else:
            raise ValueError("Operator {0} is not implemented".format(
                self.operator))

        selected_columns = [
            combine(c, p, s)
            for c, p, s in zip(contains_mask, prefix_mask, suffix_mask)
        ]
        return X.loc[:, selected_columns]
 def fit(self, X, y=None, **fitparams):
     """Fit a MinMaxScaler on X and expose its parameters per column.

     The scaler is stored as ``self.scaler``; its ``scale_`` and ``min_``
     arrays are re-exposed as Series indexed by X's column names.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self.scaler = MinMaxScaler(
         feature_range=self.feature_range,
         copy=self.copy,
     )
     self.scaler.fit(X)

     columns = X.columns
     self.scale_ = pd.Series(self.scaler.scale_, index=columns)
     self.min_ = pd.Series(self.scaler.min_, index=columns)
     return self
Ejemplo n.º 11
0
 def transform(self, X, **transformparams):
     """Return a copy of X restricted to the columns in ``k_best_columns``.

     Columns are kept in the order in which they appear in X.
     """
     frame = validate_dataframe(X).copy()
     keep = []
     for name in frame.columns:
         if name in self.k_best_columns:
             keep.append(name)
     return frame.loc[:, keep]
    def fit(self, X, y=None, **fitparams):
        """Store the sorted unique values of each column in ``self.encodings``.

        Returns:
            self
        """
        X = validate_dataframe(X)
        self.encodings = {
            name: np.sort(X[name].unique()) for name in X.columns
        }
        return self
Ejemplo n.º 13
0
    def transform(self, X, **transformparams):
        """Report values in X that fall outside what was recorded earlier.

        The check applied to a column depends on its entry in
        ``self.types``:

        * ``'object'``, ``'bool'``, ``'category'``: values not present in
          ``self.unique_vals[col]`` are reported. The check is skipped when
          that entry is ``None``.
        * ``'int64'``, ``'float64'``, ``'datetime64'``, ``'timedelta'``: a
          message is emitted when the column minimum is below
          ``self.minmax[col][0]`` or its maximum is above
          ``self.minmax[col][1]``.

        All messages go through ``self.print_message``.

        Returns:
            The validated X; no column is modified by this method.
        """
        X = validate_dataframe(X)
        for col in X.columns:
            col_type = self.types[col]
            if col_type in ('object', 'bool', 'category'):
                known_values = self.unique_vals[col]
                if known_values is not None:
                    not_in_list = ~X[col].isin(known_values)
                    # Vectorised check; avoids summing the mask in Python.
                    if not_in_list.any():
                        new_values = str(X[col][not_in_list].unique().tolist())
                        self.print_message(
                            'New Categories specified for column {col}: Received {received}'
                            .format(col=col, received=new_values))
            elif col_type in ('int64', 'float64', 'datetime64',
                              'timedelta'):
                minX = X[col].min()
                maxX = X[col].max()
                if minX < self.minmax[col][0]:
                    self.print_message(
                        'Low Value warning for column {col}: Lowest Training value {lowtrain}, Lowest Scoring value {lowscore}'
                        .format(col=col,
                                lowtrain=self.minmax[col][0],
                                lowscore=minX))
                if maxX > self.minmax[col][1]:
                    self.print_message(
                        'High Value warning for column {col}: Largest Training value {hightrain}, Largest Scoring value {highscore}'
                        .format(col=col,
                                hightrain=self.minmax[col][1],
                                highscore=maxX))

        return X
 def fit(self, X, y=None, **fitparams):
     """Fit a RobustScaler on X and expose its parameters per column.

     The scaler is stored as ``self.scaler``; its ``center_`` and
     ``scale_`` arrays are re-exposed as Series indexed by X's column
     names.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self.scaler = RobustScaler(
         with_centering=self.with_centering,
         with_scaling=self.with_scaling,
         quantile_range=self.quantile_range,
     )
     self.scaler.fit(X)

     columns = X.columns
     self.center_ = pd.Series(self.scaler.center_, index=columns)
     self.scale_ = pd.Series(self.scaler.scale_, index=columns)
     return self
Ejemplo n.º 15
0
 def transform(self, X, y=None):
     """Drop rows that hold NaN or +/-inf in any non-excluded column.

     Infinite values are replaced by NaN across the whole frame before
     rows are dropped. Only columns outside ``self.excluded_columns``
     decide whether a row is removed.

     Returns:
         A new DataFrame without the offending rows.
     """
     frame = validate_dataframe(X).copy()
     checked_columns = [
         name for name in frame.columns
         if name not in self.excluded_columns
     ]
     # Treat +/-inf as missing so dropna removes those rows as well.
     without_inf = frame.replace([np.inf, -np.inf], np.nan)
     return without_inf.dropna(axis=0,
                               how='any',
                               inplace=False,
                               subset=checked_columns)
Ejemplo n.º 16
0
 def fit(self, X, y=None):
     """Collect the columns that have too few unique values.

     A column is listed in ``self.drop_columns`` when ``Series.nunique()``
     for it is below ``self.min_unique_values``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self._validate_params(X)

     too_few = []
     for name in X.columns:
         if X[name].nunique() < self.min_unique_values:
             too_few.append(name)

     self.drop_columns = too_few
     return self
 def fit(self, X, y=None, **fitparams):
     """Estimate one ``yeojohnson`` lambda per column.

     Each column is cast to float and passed to ``yeojohnson``; the second
     element of the returned pair is stored in ``self.lams[column]``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self.lams = {}
     for name in X.columns:
         as_float = X[name].astype(float)
         # Called without a lambda, yeojohnson yields a pair; the first
         # element (presumably the transformed data) is discarded.
         _transformed, fitted_lambda = yeojohnson(as_float)
         self.lams[name] = fitted_lambda
     return self
    def transform(self, X, **transformparams):
        """Replace values outside ``common_categories`` by ``value_if_rare``.

        For every column, values found in ``self.common_categories[column]``
        are kept and all others are replaced.

        Returns:
            A modified copy of X.
        """
        X = validate_dataframe(X)
        Xout = X.copy()
        for name in Xout.columns:
            values = Xout[name]
            is_common = values.isin(self.common_categories[name])
            Xout[name] = np.where(is_common, values, self.value_if_rare)

        return Xout
 def transform(self, X, **transformparams):
     """Map each value of each column through ``self.agg_series[column]``.

     Results are written to new columns named ``prefix + column + suffix``.

     Returns:
         A DataFrame holding only the newly created columns.
     """
     X = validate_dataframe(X)
     Xout = X.copy()
     created = []
     for name in X.columns:
         target = self.prefix + name + self.suffix
         created.append(target)
         # Element-wise lookup of every original value.
         Xout[target] = [self.agg_series[name][value] for value in X[name]]
     return Xout.loc[:, created]
Ejemplo n.º 20
0
 def fit(self, X, y, **fitparams):
     """Select the ``k`` best-scoring columns of X.

     ``self.score_func(X, y)`` is evaluated. If it returns a tuple, its
     last element is used as the per-column score. Scores are ordered with
     ``sort_values(ascending=self.ascending)`` and the labels of the first
     ``self.k`` entries are stored in ``self.k_best_columns``.

     Args:
         X: Input data, passed through ``validate_dataframe``.
         y: Target passed to ``self.score_func``.
         **fitparams: Unused.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self._validate_params(X)
     var_performance = self.score_func(X, y)
     # isinstance rather than an exact type comparison, so tuple subclasses
     # (e.g. namedtuple results) are unpacked too.
     if isinstance(var_performance, tuple):
         var_performance = pd.Series(data=var_performance[-1],
                                     index=X.columns)
     elif not isinstance(var_performance, pd.Series):
         # A bare array or list has no sort_values(); label the scores
         # with the column names so they can be ranked.
         var_performance = pd.Series(data=var_performance, index=X.columns)
     self.k_best_columns = var_performance.sort_values(
         ascending=self.ascending).head(self.k).index
     return self
 def fit(self, X, y=None, **fitparams):
     """Compute one imputation value per column.

     Uses ``fitparams['sample_weight']`` as weights when given, otherwise
     a Series of ones with one entry per row of X. Each column's value
     comes from ``self._calc_impute_val`` and is passed through
     ``np.nan_to_num`` before being stored in ``self.impute_val``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     # NOTE(review): the default weights carry a fresh 0..n-1 index, not
     # X.index. Confirm _calc_impute_val does not align on index.
     w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
          else pd.Series(np.ones(X.shape[0])))
     self.impute_val = {
         name: np.nan_to_num(self._calc_impute_val(X.loc[:, name], w))
         for name in X.columns
     }
     return self
    def transform(self, X, **transformparams):
        """Clip each column to the bounds stored in ``self.clips``.

        ``self.clips[column][0]`` is the lower bound and
        ``self.clips[column][1]`` the upper bound. Clipped values are
        written to columns named ``prefix + column + suffix``.

        Returns:
            A DataFrame holding only the clipped columns.
        """
        frame = validate_dataframe(X).copy()
        created = []
        for name in frame.columns:
            target = self.prefix + name + self.suffix
            created.append(target)
            bounds = self.clips[name]
            frame[target] = frame[name].clip(bounds[0], bounds[1])

        return frame.loc[:, created]
    def transform(self, X, **transformparams):
        """Bin each column with ``pd.cut`` using the edges in ``self.cuts``.

        Binned values are written to columns named
        ``prefix + column + suffix``. Duplicate bin edges are dropped.

        Returns:
            A DataFrame holding only the binned columns.
        """
        frame = validate_dataframe(X).copy()
        created = []
        for name in frame.columns:
            target = self.prefix + name + self.suffix
            created.append(target)
            frame[target] = pd.cut(
                x=frame[name],
                bins=self.cuts[name],
                duplicates='drop',
            )

        return frame.loc[:, created]
 def transform(self, X, **transformparams):
     """Apply ``yeojohnson`` to each column with its stored lambda.

     Each column is cast to float and transformed using
     ``self.lams[column]``. Results are written to columns named
     ``prefix + column + suffix``.

     Returns:
         A DataFrame holding only the transformed columns.
     """
     frame = validate_dataframe(X).copy()
     created = []
     for name in frame.columns:
         target = self.prefix + name + self.suffix
         created.append(target)
         as_float = frame[name].astype(float)
         frame[target] = yeojohnson(as_float, lmbda=self.lams[name])
     return frame.loc[:, created]
 def fit(self, X, y=None, **fitparams):
     """Precompute grouped aggregates for every group/metric pair.

     For each ``gb`` in ``self.groupby_vars`` and each ``metric`` in
     ``self.metric_vars``, X is grouped by ``gb`` and ``metric`` is
     aggregated with ``self._agg_func``. The resulting Series is stored at
     ``self.agg_series[gb][metric]``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self._validate_params(X)
     self.agg_series = {}
     for gb in self.groupby_vars:
         per_metric = {}
         self.agg_series[gb] = per_metric
         for metric in self.metric_vars:
             aggregated = X.groupby(gb).agg({metric: self._agg_func})
             per_metric[metric] = aggregated[metric]
     return self
    def transform(self, X, **transformparams):
        """Create one boolean indicator column per stored category.

        For each column and each category in ``self.encodings[column]``, a
        column named ``column + '_' + str(category)`` is added. It holds the
        element-wise comparison of the column against that category.

        Returns:
            A DataFrame holding only the indicator columns.
        """
        X = validate_dataframe(X)
        Xout = X.copy()
        # Lazily enumerate (column, category) pairs in nested-loop order.
        pairs = (
            (col, cat) for col in X.columns for cat in self.encodings[col]
        )
        indicator_names = []
        for col, cat in pairs:
            indicator = col + '_' + str(cat)
            Xout[indicator] = Xout[col] == cat
            indicator_names.append(indicator)

        return Xout.loc[:, indicator_names]
    def transform(self, X, **transformparams):
        """Bin each column with ``self._apply_bins`` and ``self.cuts``.

        A copy of each column is passed to ``self._apply_bins`` together
        with ``self.cuts[column]``. Results are written to columns named
        ``prefix + column + suffix``.

        Returns:
            A DataFrame holding only the binned columns.
        """
        frame = validate_dataframe(X).copy()
        created = []
        for name in frame.columns:
            target = self.prefix + name + self.suffix
            source = frame[name].copy()
            frame[target] = self._apply_bins(source, self.cuts[name])
            created.append(target)

        return frame.loc[:, created]
Ejemplo n.º 28
0
 def fit(self, X, y=None, **fitparams):
     """Find columns whose absolute correlation exceeds the threshold.

     The absolute correlation matrix of X is computed with
     ``self.method``. Only entries strictly above the diagonal are kept. A
     column is listed in ``self.drop_columns`` when any remaining entry in
     it is greater than ``self.threshold``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self._validate_params(X)

     abs_corr = X.corr(self.method).abs()
     # Strict upper triangle (k=1): every pair is seen once and the
     # diagonal is ignored.
     above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
     upper = abs_corr.where(above_diagonal)

     flagged = []
     for name in upper.columns:
         if any(upper[name] > self.threshold):
             flagged.append(name)
     self.drop_columns = flagged
     return self
 def fit(self, X, y, **fitparams):
     """Compute cut points for every column.

     Uses ``fitparams['sample_weight']`` as weights when given, otherwise
     a Series of ones of length ``len(y)``. For each column a contingency
     table is built with ``self._create_contingency_table`` and passed to
     ``self._calc_optimal_cuts``. A copy of the result is stored in
     ``self.cuts[column]``.

     Returns:
         self
     """
     X = validate_dataframe(X)
     self.cuts = {}
     # NOTE(review): the default weights carry a fresh 0..n-1 index.
     # Confirm _create_contingency_table does not align on index.
     w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
          else pd.Series(np.ones(len(y))))
     for name in X.columns:
         contingency = self._create_contingency_table(X[name], y, w)
         optimal = self._calc_optimal_cuts(contingency)
         self.cuts[name] = optimal[:]
     return self
    def fit(self, X, y=None, **fitparams):
        """Compute lower and upper clipping bounds for every column.

        Uses ``fitparams['sample_weight']`` as weights when given,
        otherwise a Series of ones with one entry per row of X. The bounds
        come from ``weighted_percentile`` at ``self.clip_p`` and
        ``1 - self.clip_p`` and are stored as a list in
        ``self.clips[column]``.

        Returns:
            self
        """
        X = validate_dataframe(X)
        w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
             else pd.Series(np.ones(X.shape[0])))

        # weighted_percentile is used instead of Series.quantile so that
        # sample weights are taken into account.
        quantiles = [self.clip_p, 1 - self.clip_p]
        self.clips = {}
        for name in X.columns:
            bounds = weighted_percentile(X[name], q=quantiles, w=w)
            self.clips[name] = bounds.tolist()
        return self