def test_not_dataframe_Validate_dataframe():
    """Check that ``validate_dataframe`` raises ``TypeError`` for non-DataFrames.

    Two inputs are exercised: a list of lists and a tuple of lists.
    """
    non_frames = (
        [[1, 4, 5], [-5, 8, 9]],
        ([1, 4, 5], [-5, 8, 9]),
    )
    for candidate in non_frames:
        with pytest.raises(TypeError):
            validate_dataframe(candidate)
def fit(self, X, y=None, **fitparams):
    """Fit a clone of every transformer in ``self.list_of_transformers``.

    Parameters
    ----------
    X : pandas.DataFrame
        Training data; checked with ``validate_dataframe``.
    y : optional
        Target values, forwarded unchanged to each transformer's ``fit``.
    **fitparams
        Extra keyword arguments forwarded to each transformer's ``fit``.

    Returns
    -------
    self
        The fitted clones are stored, in order, in
        ``self.fitted_transformers_``.
    """
    X = validate_dataframe(X)
    # Each transformer is cloned and the clone is what gets fitted and kept.
    # ``y`` is forwarded (rather than a hard-coded ``None``) so transformers
    # that need the target actually receive it.
    self.fitted_transformers_ = [
        clone(transformer).fit(X, y=y, **fitparams)
        for transformer in self.list_of_transformers
    ]
    return self
def fit(self, X, y=None, **fitparams):
    """Store the result of ``self._infer_dtype`` for every column of ``X``.

    ``X`` is checked with ``validate_dataframe`` and ``self._validate_params``.
    The per-column results are kept in ``self.types`` keyed by column name.
    ``y`` and ``fitparams`` are accepted but not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self._validate_params(X)
    self.types = {}
    # Filled entry by entry, in column order.
    self.types.update(
        (name, self._infer_dtype(X[name])) for name in X.columns
    )
    return self
def transform(self, X, **transformparams):
    """Run every fitted transformer on ``X`` and join the outputs column-wise.

    Each element of ``self.fitted_transformers_`` receives the same copy of
    ``X``; the results are concatenated along ``axis=1``.
    ``transformparams`` is accepted but not used.

    Returns
    -------
    pandas.DataFrame
        A copy of the concatenated result.
    """
    X = validate_dataframe(X)
    X = X.copy()
    pieces = []
    for fitted in self.fitted_transformers_:
        pieces.append(fitted.transform(X))
    combined = pd.concat(pieces, axis=1)
    return combined.copy()
def fit(self, X, y=None, **fitparams):
    """Fit a ``StandardScaler`` on ``X`` and a ``KernelPCA`` on the scaled data.

    The scaler is stored in ``self.scaler`` and the decomposition in
    ``self.kernelpca``. ``y`` and ``fitparams`` are accepted but not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)

    scaler_options = dict(copy=self.copy, with_mean=True, with_std=True)
    self.scaler = StandardScaler(**scaler_options)
    self.scaler.fit(X)

    pca_options = dict(
        n_components=self.n_components,
        kernel=self.kernel,
        gamma=self.gamma,
        degree=self.degree,
    )
    self.kernelpca = KernelPCA(**pca_options)
    standardized = self.scaler.transform(X)
    self.kernelpca.fit(standardized)
    return self
def fit(self, X, y=None, **fitparams):
    """Fit a ``StandardScaler`` on ``X`` and a whitening ``PCA`` on the result.

    The scaler is stored in ``self.scaler`` and the decomposition in
    ``self.pca``. ``y`` and ``fitparams`` are accepted but not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)

    scaler_options = dict(copy=self.copy, with_mean=True, with_std=True)
    self.scaler = StandardScaler(**scaler_options)
    self.scaler.fit(X)

    self.pca = PCA(n_components=self.n_components, whiten=True)
    standardized = self.scaler.transform(X)
    self.pca.fit(standardized)
    return self
def transform(self, X, **transformparams):
    """Replace blank strings and missing values in every column of ``X``.

    For each column, entries equal to ``''`` after ``str.strip`` are set to
    ``self.value_if_empty``; afterwards entries that are null are set to
    ``self.value_if_none``. The input frame is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Checked with ``validate_dataframe``. Columns are assumed to hold
        strings, since the ``.str`` accessor is applied to each of them.
    **transformparams
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``X``.
    """
    X = validate_dataframe(X)
    Xout = X.copy()
    for col in Xout.columns:
        # Assign through a single ``.loc`` call. Chained indexing
        # (``frame[col][mask] = value``) writes to an intermediate object
        # and does not reliably update the frame.
        is_blank = Xout[col].str.strip() == ''
        Xout.loc[is_blank, col] = self.value_if_empty
        # Computed after the blank replacement on purpose: if
        # ``value_if_empty`` is itself null it is mapped to ``value_if_none``.
        is_missing = pd.isnull(Xout[col])
        Xout.loc[is_missing, col] = self.value_if_none
    return Xout
def transform(self, X, **transformparams):
    """Scale ``X`` and project it with the fitted kernel PCA.

    ``self.scaler`` is applied first, then ``self.kernelpca``. Output columns
    are named ``self.prefix`` + component number (``'{0:03g}'`` format,
    starting at 0) + ``self.suffix``. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Projected data carrying the index of ``X``.
    """
    X = validate_dataframe(X)
    X = X.copy()
    scaled = self.scaler.transform(X)
    components = self.kernelpca.transform(scaled)

    column_names = []
    for position in range(components.shape[1]):
        label = self.prefix + '{0:03g}'.format(position) + self.suffix
        column_names.append(label)

    return pd.DataFrame(components, index=X.index, columns=column_names)
def transform(self, X, **transformparams):
    """Select the columns of ``X`` whose names meet the configured criteria.

    Up to three criteria are applied to the column names:
    ``self.contains`` (passed to ``str.contains``), ``self.prefix``
    (``str.startswith``) and ``self.suffix`` (``str.endswith``). A criterion
    set to ``None`` is inactive and takes no part in the selection.
    ``self.operator`` decides whether a column must satisfy all active
    criteria (``'and'``) or at least one of them (``'or'``). When no
    criterion is active, every column is kept.

    Parameters
    ----------
    X : pandas.DataFrame
        Checked with ``validate_dataframe``.
    **transformparams
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` restricted to the selected columns.

    Raises
    ------
    ValueError
        If ``self.operator`` is neither ``'and'`` nor ``'or'``.
    """
    X = validate_dataframe(X)
    X = X.copy()

    if self.operator == 'and':
        combine = all
    elif self.operator == 'or':
        combine = any
    else:
        raise ValueError("Operator {0} is not implemented".format(
            self.operator))

    # Only active criteria produce a mask. Substituting an all-True mask for
    # an inactive criterion would make 'or' select every column.
    active_masks = []
    if self.contains is not None:
        active_masks.append(X.columns.str.contains(self.contains))
    if self.prefix is not None:
        active_masks.append(X.columns.str.startswith(self.prefix))
    if self.suffix is not None:
        active_masks.append(X.columns.str.endswith(self.suffix))

    if not active_masks:
        # Nothing to filter on: keep all columns.
        return X

    selected_columns = [combine(flags) for flags in zip(*active_masks)]
    return X.loc[:, selected_columns]
def fit(self, X, y=None, **fitparams):
    """Fit a ``MinMaxScaler`` on ``X`` and expose its statistics per column.

    The scaler is stored in ``self.scaler``; its ``scale_`` and ``min_``
    arrays are stored as Series indexed by the columns of ``X`` in
    ``self.scale_`` and ``self.min_``. ``y`` and ``fitparams`` are not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self.scaler = MinMaxScaler(feature_range=self.feature_range,
                               copy=self.copy)
    self.scaler.fit(X)
    # Mirror the fitted arrays onto this object, labelled by column name.
    for statistic in ('scale_', 'min_'):
        labelled = pd.Series(getattr(self.scaler, statistic), index=X.columns)
        setattr(self, statistic, labelled)
    return self
def transform(self, X, **transformparams):
    """Keep only the columns of ``X`` that appear in ``self.k_best_columns``.

    Column order follows ``X``. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` restricted to the retained columns.
    """
    X = validate_dataframe(X)
    X = X.copy()
    retained = []
    for name in X.columns:
        if name in self.k_best_columns:
            retained.append(name)
    return X.loc[:, retained]
def fit(self, X, y=None, **fitparams):
    """Record the sorted unique values of every column of ``X``.

    The values are stored in ``self.encodings`` keyed by column name.
    ``y`` and ``fitparams`` are accepted but not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self.encodings = {
        name: np.sort(X[name].unique()) for name in X.columns
    }
    return self
def transform(self, X, **transformparams):
    """Report values in ``X`` outside what is recorded for each column.

    ``X`` is not modified; findings are passed to ``self.print_message``.

    * Columns whose ``self.types`` entry is ``'object'``, ``'bool'`` or
      ``'category'``: values not contained in ``self.unique_vals[col]`` are
      reported. Columns whose entry there is ``None`` are skipped.
    * Columns whose ``self.types`` entry is ``'int64'``, ``'float64'``,
      ``'datetime64'`` or ``'timedelta'``: the column minimum and maximum are
      compared with ``self.minmax[col][0]`` and ``self.minmax[col][1]``.

    Columns of any other recorded type are ignored.

    Parameters
    ----------
    X : pandas.DataFrame
        Checked with ``validate_dataframe``.
    **transformparams
        Accepted but not used.

    Returns
    -------
    pandas.DataFrame
        ``X`` as returned by ``validate_dataframe``, otherwise unchanged.
    """
    X = validate_dataframe(X)
    for col in X.columns:
        col_type = self.types[col]
        if col_type in ('object', 'bool', 'category'):
            known_values = self.unique_vals[col]
            if known_values is None:
                continue
            unseen = ~X[col].isin(known_values)
            if unseen.any():
                new_values = str(X[col][unseen].unique().tolist())
                self.print_message(
                    'New Categories specified for column {col}: Received {received}'
                    .format(col=col, received=new_values))
        elif col_type in ('int64', 'float64', 'datetime64', 'timedelta'):
            low_train = self.minmax[col][0]
            high_train = self.minmax[col][1]
            minX = X[col].min()
            maxX = X[col].max()
            if minX < low_train:
                self.print_message(
                    'Low Value warning for column {col}: Lowest Training value {lowtrain}, Lowest Scoring value {lowscore}'
                    .format(col=col, lowtrain=low_train, lowscore=minX))
            if maxX > high_train:
                self.print_message(
                    'High Value warning for column {col}: Largest Training value {hightrain}, Largest Scoring value {highscore}'
                    .format(col=col, hightrain=high_train, highscore=maxX))
    return X
def fit(self, X, y=None, **fitparams):
    """Fit a ``RobustScaler`` on ``X`` and expose its statistics per column.

    The scaler is stored in ``self.scaler``; its ``center_`` and ``scale_``
    arrays are stored as Series indexed by the columns of ``X`` in
    ``self.center_`` and ``self.scale_``. ``y`` and ``fitparams`` are not
    used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    scaler_options = dict(
        with_centering=self.with_centering,
        with_scaling=self.with_scaling,
        quantile_range=self.quantile_range,
    )
    self.scaler = RobustScaler(**scaler_options)
    self.scaler.fit(X)

    feature_names = X.columns
    self.center_ = pd.Series(self.scaler.center_, index=feature_names)
    self.scale_ = pd.Series(self.scaler.scale_, index=feature_names)
    return self
def transform(self, X, y=None):
    """Drop rows of ``X`` holding NaN or infinite values in checked columns.

    Infinite values are first converted to NaN; rows with any NaN in a
    column not listed in ``self.excluded_columns`` are then removed.
    ``y`` is accepted but not used.

    Returns
    -------
    pandas.DataFrame
        A new frame without the offending rows.
    """
    X = validate_dataframe(X)
    X = X.copy()

    checked_columns = []
    for name in X.columns:
        if name not in self.excluded_columns:
            checked_columns.append(name)

    without_inf = X.replace([np.inf, -np.inf], np.nan)
    return without_inf.dropna(axis=0, how='any', inplace=False,
                              subset=checked_columns)
def fit(self, X, y=None):
    """Find the columns of ``X`` with too few distinct values.

    A column is listed in ``self.drop_columns`` when its ``nunique()`` is
    strictly below ``self.min_unique_values``. ``X`` is checked with
    ``validate_dataframe`` and ``self._validate_params``; ``y`` is not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self._validate_params(X)
    low_cardinality = []
    for name in X.columns:
        if X[name].nunique() < self.min_unique_values:
            low_cardinality.append(name)
    self.drop_columns = low_cardinality
    return self
def fit(self, X, y=None, **fitparams):
    """Estimate a Yeo-Johnson lambda for every column of ``X``.

    Each column is cast to ``float`` and passed to ``yeojohnson``; the second
    element of its result is stored in ``self.lams`` under the column name.
    ``y`` and ``fitparams`` are accepted but not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self.lams = {}
    for name in X.columns:
        as_float = X[name].astype(float)
        # yeojohnson yields a pair here; only the second item is kept.
        _transformed, fitted_lambda = yeojohnson(as_float)
        self.lams[name] = fitted_lambda
    return self
def transform(self, X, **transformparams):
    """Replace values that are not common categories with a placeholder.

    In each column, values found in ``self.common_categories[col]`` are kept
    and all others become ``self.value_if_rare``. The input frame is not
    modified. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        A modified copy of ``X``.
    """
    X = validate_dataframe(X)
    Xout = X.copy()
    for name in Xout.columns:
        is_common = Xout[name].isin(self.common_categories[name])
        current = Xout[name]
        Xout[name] = np.where(is_common, current, self.value_if_rare)
    return Xout
def transform(self, X, **transformparams):
    """Map every value of ``X`` through the per-column lookup in ``agg_series``.

    For each column a new column named ``self.prefix + col + self.suffix``
    is created whose entries are ``self.agg_series[col][value]`` for each
    value in the original column. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the newly created columns, in the column order of ``X``.
    """
    X = validate_dataframe(X)
    Xout = X.copy()
    created = []
    for source in X.columns:
        target = self.prefix + source + self.suffix
        created.append(target)
        # Values are read from the untouched input, not from Xout.
        looked_up = []
        for key in X[source]:
            looked_up.append(self.agg_series[source][key])
        Xout[target] = looked_up
    return Xout.loc[:, created]
def fit(self, X, y, **fitparams):
    """Score every column with ``self.score_func`` and remember the best ``k``.

    Parameters
    ----------
    X : pandas.DataFrame
        Checked with ``validate_dataframe`` and ``self._validate_params``.
    y
        Passed to ``self.score_func`` together with ``X``.
    **fitparams
        Accepted but not used.

    Returns
    -------
    self
        ``self.k_best_columns`` holds the index labels of the first
        ``self.k`` scores after sorting with ``ascending=self.ascending``.

    Notes
    -----
    ``self.score_func(X, y)`` may return:

    * a tuple (including tuple subclasses such as namedtuples): its last
      element is used as the per-column score;
    * an object with ``sort_values`` (e.g. a Series): used as is;
    * anything else (e.g. an array with one score per column): wrapped in a
      Series indexed by the columns of ``X``.
    """
    X = validate_dataframe(X)
    self._validate_params(X)
    var_performance = self.score_func(X, y)
    if isinstance(var_performance, tuple):
        var_performance = pd.Series(data=var_performance[-1],
                                    index=X.columns)
    elif not hasattr(var_performance, 'sort_values'):
        # A bare array/list of scores cannot be sorted by label below.
        var_performance = pd.Series(data=var_performance, index=X.columns)
    ranked = var_performance.sort_values(ascending=self.ascending)
    self.k_best_columns = ranked.head(self.k).index
    return self
def fit(self, X, y=None, **fitparams):
    """Compute one imputation value per column of ``X``.

    ``self._calc_impute_val`` is called with each column and the weights;
    its result, passed through ``np.nan_to_num``, is stored in
    ``self.impute_val`` under the column name. Weights come from
    ``fitparams['sample_weight']`` when that key is present, otherwise a
    Series of ones with one entry per row of ``X``. ``y`` is not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
         else pd.Series(np.ones(X.shape[0])))
    self.impute_val = {}
    for name in X.columns:
        raw_value = self._calc_impute_val(X.loc[:, name], w)
        self.impute_val[name] = np.nan_to_num(raw_value)
    return self
def transform(self, X, **transformparams):
    """Clip every column of ``X`` to the bounds stored in ``self.clips``.

    ``self.clips[col][0]`` and ``self.clips[col][1]`` are passed as the two
    bounds to ``Series.clip``. Results are written to columns named
    ``self.prefix + col + self.suffix``. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the clipped columns, in the column order of ``X``.
    """
    X = validate_dataframe(X)
    X = X.copy()
    clipped_names = []
    for source in X.columns:
        target = self.prefix + source + self.suffix
        clipped_names.append(target)
        bounds = self.clips[source]
        X[target] = X[source].clip(bounds[0], bounds[1])
    return X.loc[:, clipped_names]
def transform(self, X, **transformparams):
    """Bin every column of ``X`` with ``pd.cut`` using ``self.cuts[col]``.

    Duplicate bin edges are dropped (``duplicates='drop'``). Results are
    written to columns named ``self.prefix + col + self.suffix``.
    ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the binned columns, in the column order of ``X``.
    """
    X = validate_dataframe(X)
    X = X.copy()
    binned_names = []
    for source in X.columns:
        target = self.prefix + source + self.suffix
        binned_names.append(target)
        edges = self.cuts[source]
        X[target] = pd.cut(x=X[source], bins=edges, duplicates='drop')
    return X.loc[:, binned_names]
def transform(self, X, **transformparams):
    """Apply ``yeojohnson`` to every column of ``X`` with the stored lambdas.

    Each column is cast to ``float`` and transformed with
    ``lmbda=self.lams[col]``. Results are written to columns named
    ``self.prefix + col + self.suffix``. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the transformed columns, in the column order of ``X``.
    """
    X = validate_dataframe(X)
    X = X.copy()
    transformed_names = []
    for source in X.columns:
        target = self.prefix + source + self.suffix
        transformed_names.append(target)
        as_float = X[source].astype(float)
        X[target] = yeojohnson(as_float, lmbda=self.lams[source])
    return X.loc[:, transformed_names]
def fit(self, X, y=None, **fitparams):
    """Aggregate each metric column within each grouping column of ``X``.

    For every pair drawn from ``self.groupby_vars`` and ``self.metric_vars``
    the metric is aggregated with ``self._agg_func`` after grouping by the
    grouping variable. Results are stored as
    ``self.agg_series[groupby_var][metric_var]``. ``X`` is checked with
    ``validate_dataframe`` and ``self._validate_params``; ``y`` and
    ``fitparams`` are not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self._validate_params(X)
    self.agg_series = {}
    for group_key in self.groupby_vars:
        per_metric = {}
        self.agg_series[group_key] = per_metric
        for metric in self.metric_vars:
            aggregated = X.groupby(group_key).agg({metric: self._agg_func})
            per_metric[metric] = aggregated[metric]
    return self
def transform(self, X, **transformparams):
    """Build one boolean indicator column per (column, encoded value) pair.

    For each column of ``X`` and each value in ``self.encodings[col]`` a
    column named ``col + '_' + str(value)`` is created holding the
    element-wise comparison of the column with that value.
    ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the indicator columns, in creation order.
    """
    X = validate_dataframe(X)
    Xout = X.copy()
    indicator_names = []
    for source in X.columns:
        for level in self.encodings[source]:
            indicator = source + '_' + str(level)
            Xout[indicator] = Xout[source] == level
            indicator_names.append(indicator)
    return Xout.loc[:, indicator_names]
def transform(self, X, **transformparams):
    """Bin every column of ``X`` with ``self._apply_bins`` and stored cuts.

    A copy of each column is passed to ``self._apply_bins`` together with
    ``self.cuts[col]``. Results are written to columns named
    ``self.prefix + col + self.suffix``. ``transformparams`` is not used.

    Returns
    -------
    pandas.DataFrame
        Only the binned columns, in the column order of ``X``.
    """
    X = validate_dataframe(X)
    X = X.copy()
    binned_names = []
    for source in X.columns:
        target = self.prefix + source + self.suffix
        values = X[source].copy()
        X[target] = self._apply_bins(values, self.cuts[source])
        binned_names.append(target)
    return X.loc[:, binned_names]
def fit(self, X, y=None, **fitparams):
    """Flag columns that correlate too strongly with an earlier column.

    The absolute correlation matrix of ``X`` (``X.corr(self.method)``) is
    reduced to the part strictly above the diagonal. A column is listed in
    ``self.drop_columns`` when any remaining entry in its matrix column
    exceeds ``self.threshold``. ``y`` and ``fitparams`` are not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self._validate_params(X)

    abs_corr = X.corr(self.method).abs()
    # k=1 keeps only entries strictly above the diagonal, so each pair is
    # inspected once and self-correlation is ignored.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    flagged = []
    for column in upper.columns:
        if any(upper[column] > self.threshold):
            flagged.append(column)
    self.drop_columns = flagged
    return self
def fit(self, X, y, **fitparams):
    """Compute cut points for every column of ``X`` against ``y``.

    For each column, ``self._create_contingency_table`` is called with the
    column, ``y`` and the weights; its result goes to
    ``self._calc_optimal_cuts``. A slice copy of the cuts is stored in
    ``self.cuts`` under the column name. Weights come from
    ``fitparams['sample_weight']`` when that key is present, otherwise a
    Series of ones of length ``len(y)``.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    self.cuts = {}
    w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
         else pd.Series(np.ones(len(y))))
    for name in X.columns:
        contingency = self._create_contingency_table(X[name], y, w)
        optimal = self._calc_optimal_cuts(contingency)
        self.cuts[name] = optimal[:]
    return self
def fit(self, X, y=None, **fitparams):
    """Compute lower and upper clipping bounds for every column of ``X``.

    ``weighted_percentile`` is called per column with
    ``q=[self.clip_p, 1 - self.clip_p]`` and the weights; its result,
    converted with ``.tolist()``, is stored in ``self.clips`` under the
    column name. Weights come from ``fitparams['sample_weight']`` when that
    key is present, otherwise a Series of ones with one entry per row of
    ``X``. ``y`` is not used.

    Returns
    -------
    self
    """
    X = validate_dataframe(X)
    w = (fitparams['sample_weight'] if 'sample_weight' in fitparams
         else pd.Series(np.ones(X.shape[0])))
    percentiles = [self.clip_p, 1 - self.clip_p]
    self.clips = {}
    for name in X.columns:
        # NOTE: weighted percentiles are used here instead of the plain
        # X[col].quantile(...) call.
        bounds = weighted_percentile(X[name], q=percentiles, w=w)
        self.clips[name] = bounds.tolist()
    return self