def _df_missing(self, categorical_fill='none', numerical_fill='none'):
    """Fill missing values, and replace +/-inf, in this frame in place.

    Args:
        categorical_fill: ``'none'`` leaves the columns returned by
            ``self.categorical_like()`` untouched. ``'mode'`` is resolved per
            column through ``utils.get_col_aggregate``. Any other value is
            used as a constant fill for those columns.
        numerical_fill: a non-string value is used as a constant fill for the
            columns returned by ``self.numericals()`` and as the replacement
            for +/-inf anywhere in the frame. ``'none'`` skips numerical
            filling. Any other string is resolved per column through
            ``utils.get_col_aggregate`` and also replaces +/-inf in that
            column.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # repr() instead of Python-2-only backticks; the produced text is the same.
    misc.start('replacing missing data categorical[' + repr(categorical_fill) +
               '] numerical[' + repr(numerical_fill) + ']')

    # Do numerical constants on whole DF for performance
    if not isinstance(numerical_fill, str):
        numerical_cols = self.numericals()
        self[numerical_cols] = self[numerical_cols].fillna(numerical_fill)
        self.replace([np.inf, -np.inf], numerical_fill, inplace=True)
        numerical_fill = 'none'

    # Do categorical constants on whole DF for performance
    if categorical_fill != 'none' and categorical_fill != 'mode':
        categorical_cols = self.categorical_like()
        self[categorical_cols] = self[categorical_cols].fillna(categorical_fill)
        categorical_fill = 'none'

    # Get list of columns still left to fill
    categoricals_to_fill = []
    numericals_to_fill = []
    if categorical_fill != 'none':
        categoricals_to_fill += self.categorical_like()
    if numerical_fill != 'none':
        numericals_to_fill += self.numericals()

    # Prepare a dictionary of column -> fill values
    to_fill = {}
    for c in categoricals_to_fill:
        to_fill[c] = utils.get_col_aggregate(self[c], categorical_fill)
    for c in numericals_to_fill:
        # NOTE(review): the aggregate is computed while +/-inf are still in the
        # column; whether that skews it depends on utils.get_col_aggregate.
        to_fill[c] = utils.get_col_aggregate(self[c], numerical_fill)
        # Assign back rather than calling replace(inplace=True) on self[c]:
        # the column selection may be a temporary copy, in which case the
        # in-place edit never reaches the frame.
        self[c] = self[c].replace([np.inf, -np.inf], to_fill[c])

    # Do fill in one step for performance
    if to_fill:
        self.fillna(value=to_fill, inplace=True)

    misc.stop('done replacing missing data')
    return self
def _s_to_stat(self, y, stat='mean', missing_value='missing', missing_treatment='missing-category', noise_level=None): # if not self.is_categorical_like(): raise Exception('only supported for categorical like columns') if type(y) is not pd.Series: y = pd.Series(y) train = self[:len(y)] test = self[len(y):] df = pd.DataFrame({'c_1' : train, 'n_y': y.values}) def iqm(x): return np.mean(np.percentile(x, [75 ,25])) s = df.groupby('c_1')['n_y'].\ transform(iqm if stat == 'iqm' else stat) if len(test) > 0: _, not_in_train = train.difference_with(test, quiet=True) transformer = dict(zip(train, s)) test[test.isin(not_in_train)] = missing_value if \ missing_treatment == 'missing-category' and missing_value in transformer else 'use-whole-set' if (missing_treatment != 'missing-category' or missing_value not in transformer): transformer['use-whole-set'] = utils.get_col_aggregate(y, stat) s = s.append_bottom(test.map(transformer)) if noise_level > 0: s = s.add_noise(noise_level, 'gaussian') return s
def _s_to_stat(self, y, stat='mean', missing_value='missing',
               missing_treatment='missing-category'):
    """Map each value of this series to a per-category statistic of ``y``.

    The first ``len(y)`` entries of ``self`` are the training part, paired
    positionally with ``y``. Any remaining entries are the test part and are
    mapped through the category -> statistic table built from the training
    part.

    Args:
        y: target values for the training part; wrapped in a ``pd.Series``
            if it is not one already.
        stat: aggregation passed to ``groupby(...).transform``. ``'iqm'``
            selects the local helper (mean of the 75th and 25th percentiles).
        missing_value: label given to test values not seen in training when
            ``missing_treatment == 'missing-category'`` and that label is
            itself a training category.
        missing_treatment: when it is not ``'missing-category'``, or when
            ``missing_value`` is not a training category, unseen test values
            get ``utils.get_col_aggregate(y, stat)`` instead.

    Returns:
        The training statistics alone when there is no test part, otherwise
        the training statistics with the mapped test part appended.

    Raises:
        Exception: if ``self.is_categorical_like()`` is false.
    """
    if not self.is_categorical_like():
        raise Exception('only supported for categorical like columns')
    if not isinstance(y, pd.Series):
        y = pd.Series(y)

    n_train = len(y)
    train = self[:n_train]
    # Copy: unseen categories are relabelled below, and writing into a plain
    # slice could modify the caller's series.
    test = self[n_train:].copy()
    df = pd.DataFrame({'c_1': train, 'n_y': y.values})

    def iqm(x):
        return np.mean(np.percentile(x, [75, 25]))

    train_values = df.groupby('c_1')['n_y'].transform(
        iqm if stat == 'iqm' else stat)
    if len(test) == 0:
        return train_values

    _, not_in_train = train.difference_with(test, quiet=True)
    transformer = dict(zip(train, train_values))
    use_missing_category = (missing_treatment == 'missing-category'
                            and missing_value in transformer)
    test[test.isin(not_in_train)] = (
        missing_value if use_missing_category else 'use-whole-set')
    if not use_missing_category:
        # Fallback statistic over the whole target for unseen categories.
        transformer['use-whole-set'] = utils.get_col_aggregate(y, stat)
    return train_values.append_bottom(test.map(transformer))
def _s_missing(self, fill='none'):
    """Fill missing values and +/-inf in this series in place.

    Args:
        fill: passed to ``utils.get_col_aggregate(self, fill)`` to obtain the
            replacement value.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # repr() instead of Python-2-only backticks; the produced text is the same.
    misc.start('replacing series missing data fill[' + repr(fill) + ']')
    # NOTE(review): the fill value is computed while +/-inf are still present;
    # whether that skews it depends on utils.get_col_aggregate.
    val = utils.get_col_aggregate(self, fill)
    self.fillna(val, inplace=True)
    self.replace([np.inf, -np.inf], val, inplace=True)
    misc.stop('replacing series missing data')
    return self
def _s_missing(self, fill='none'):
    """Replace NaN and +/-inf entries of this series in place.

    Args:
        fill: forwarded to ``utils.get_col_aggregate(self, fill)``, whose
            result is used as the replacement value.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # Backtick repr only parses on Python 2; repr() yields the same text.
    misc.start('replacing series missing data fill[' + repr(fill) + ']')
    # NOTE(review): the replacement is derived before +/-inf are removed;
    # confirm utils.get_col_aggregate copes with infinities.
    val = utils.get_col_aggregate(self, fill)
    self.fillna(val, inplace=True)
    self.replace([np.inf, -np.inf], val, inplace=True)
    misc.stop('replacing series missing data')
    return self
def _s_categorical_outliers(self, min_size=0.01, fill_mode='mode'):
    """Overwrite rare values of this series with a fill value, in place.

    A value is rare when its ``value_counts()`` count is less than or equal
    to the threshold.

    Args:
        min_size: a float is taken as a fraction of ``len(self)``; any other
            value is used directly as the count threshold.
        fill_mode: passed to ``utils.get_col_aggregate(self, fill_mode)`` to
            obtain the fill value.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # isinstance also accepts float subclasses such as numpy.float64, which an
    # exact type() check would treat as an absolute count.
    if isinstance(min_size, float):
        threshold = float(len(self)) * min_size
    else:
        threshold = min_size

    fill = utils.get_col_aggregate(self, fill_mode)
    vc = self.value_counts()
    under = vc[vc <= threshold]
    if under.shape[0] > 0:
        # repr() instead of Python-2-only backticks; same text is produced.
        misc.debug('column [' + str(self.name) + '] threshold[' +
                   repr(threshold) + '] fill[' + repr(fill) +
                   '] num of rows[' + repr(len(under.index)) + ']')
        self[self.isin(under.index)] = fill
    return self
def _s_missing(self, fill="none"):
    """Fill missing values in this series in place; also +/-inf if numerical.

    Args:
        fill: passed to ``utils.get_col_aggregate(self, fill)`` to obtain the
            replacement value.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # repr() instead of Python-2-only backticks; the produced text is the same.
    misc.start("replacing series missing data fill[" + repr(fill) + "]")
    # NOTE(review): the fill value is computed while +/-inf are still present;
    # whether that skews it depends on utils.get_col_aggregate.
    val = utils.get_col_aggregate(self, fill)
    self.fillna(val, inplace=True)
    # Infinities are only replaced when the series reports itself numerical.
    if self.is_numerical():
        self.replace([np.inf, -np.inf], val, inplace=True)
    misc.stop("replacing series missing data")
    return self
def _df_missing(self, categorical_fill='none', numerical_fill='none'):
    """Replace missing values and +/-inf in this frame, in place.

    Args:
        categorical_fill: ``'none'`` skips the columns returned by
            ``self.categorical_like()``. ``'mode'`` is resolved per column
            via ``utils.get_col_aggregate``. Anything else is applied as a
            constant to those columns.
        numerical_fill: a non-string value is applied as a constant to the
            columns returned by ``self.numericals()`` and substituted for
            +/-inf anywhere in the frame. ``'none'`` skips numerical filling.
            Any other string is resolved per column via
            ``utils.get_col_aggregate`` and also replaces +/-inf in that
            column.

    Returns:
        ``self`` (the same object, modified in place).
    """
    # Backtick repr only parses on Python 2; repr() yields the same text.
    misc.start('replacing missing data categorical[' + repr(categorical_fill) +
               '] numerical[' + repr(numerical_fill) + ']')

    # Do numerical constants on whole DF for performance
    if not isinstance(numerical_fill, str):
        numerical_cols = self.numericals()
        self[numerical_cols] = self[numerical_cols].fillna(numerical_fill)
        self.replace([np.inf, -np.inf], numerical_fill, inplace=True)
        numerical_fill = 'none'

    # Do categorical constants on whole DF for performance
    if categorical_fill != 'none' and categorical_fill != 'mode':
        categorical_cols = self.categorical_like()
        self[categorical_cols] = self[categorical_cols].fillna(categorical_fill)
        categorical_fill = 'none'

    # Get list of columns still left to fill
    categoricals_to_fill = []
    numericals_to_fill = []
    if categorical_fill != 'none':
        categoricals_to_fill += self.categorical_like()
    if numerical_fill != 'none':
        numericals_to_fill += self.numericals()

    # Prepare a dictionary of column -> fill values
    to_fill = {}
    for c in categoricals_to_fill:
        to_fill[c] = utils.get_col_aggregate(self[c], categorical_fill)
    for c in numericals_to_fill:
        # NOTE(review): the aggregate is taken before +/-inf are removed;
        # confirm utils.get_col_aggregate copes with infinities.
        to_fill[c] = utils.get_col_aggregate(self[c], numerical_fill)
        # Write the cleaned column back explicitly: replace(inplace=True) on
        # self[c] may act on a temporary copy and leave the frame unchanged.
        self[c] = self[c].replace([np.inf, -np.inf], to_fill[c])

    # Do fill in one step for performance
    if to_fill:
        self.fillna(value=to_fill, inplace=True)

    misc.stop('done replacing missing data')
    return self