Exemple #1
0
def _df_missing(self, categorical_fill='none', numerical_fill='none'):
  """Fill missing (and infinite) values of this frame in place; return self.

  numerical_fill:
    - a non-string value is used as a constant: NaN in the columns listed by
      ``self.numericals()`` is filled with it, and +/-inf is replaced by it
      across the whole frame;
    - a string other than 'none' is handed to ``utils.get_col_aggregate``
      to compute one fill value per numerical column;
    - 'none' leaves numerical columns untouched.
  categorical_fill:
    - 'none' leaves the ``self.categorical_like()`` columns untouched;
    - 'mode' is handed to ``utils.get_col_aggregate`` per column;
    - any other value is used as a constant fill.
  """
  # repr() instead of backticks: same text, but also valid on Python 3.
  misc.start('replacing missing data categorical[' + repr(categorical_fill) +
             '] numerical[' + repr(numerical_fill) + ']')

  # Do numerical constants on whole DF for performance
  if not isinstance(numerical_fill, str):
    self[self.numericals()] = self[self.numericals()].fillna(numerical_fill)
    self.replace([np.inf, -np.inf], numerical_fill, inplace=True)
    numerical_fill = 'none'

  # Do categorical constants on whole DF for performance
  if categorical_fill != 'none' and categorical_fill != 'mode':
    self[self.categorical_like()] = self[self.categorical_like()].fillna(categorical_fill)
    categorical_fill = 'none'

  # Get list of columns still left to fill
  categoricals_to_fill = list(self.categorical_like()) if categorical_fill != 'none' else []
  numericals_to_fill = list(self.numericals()) if numerical_fill != 'none' else []

  # Prepare a dictionary of column -> fill values
  to_fill = {}
  for c in categoricals_to_fill:
    to_fill[c] = utils.get_col_aggregate(self[c], categorical_fill)
  for c in numericals_to_fill:
    # The aggregate is computed before the infinities are replaced.
    to_fill[c] = utils.get_col_aggregate(self[c], numerical_fill)
    # Assign back explicitly: an in-place replace on the temporary returned
    # by self[c] is not guaranteed to reach the frame.
    self[c] = self[c].replace([np.inf, -np.inf], to_fill[c])

  # Do fill in one step for performance
  if to_fill: self.fillna(value=to_fill, inplace=True)

  misc.stop('done replacing missing data')
  return self
Exemple #2
0
def _s_to_stat(self, y, stat='mean', 
      missing_value='missing', missing_treatment='missing-category', 
      noise_level=None):
  """Replace each category in this series by a statistic of ``y`` for it.

  The first ``len(y)`` entries of the series are treated as the training
  part and the remainder as the test part. Training entries get ``stat`` of
  ``y`` computed within their category ('iqm' selects the mean of the 25th
  and 75th percentiles; anything else is passed to groupby ``transform``).
  Test entries are mapped through the category -> statistic table built
  from the training part. Test values reported as absent from the training
  part are first rewritten to ``missing_value`` (when ``missing_treatment``
  is 'missing-category' and ``missing_value`` is a known training category)
  or otherwise to 'use-whole-set', which maps to
  ``utils.get_col_aggregate(y, stat)``.

  When ``noise_level`` is a positive number, noise is added through
  ``add_noise(noise_level, 'gaussian')``; ``None`` (the default) or a
  non-positive value adds none.

  Returns the encoded training values, followed by the encoded test values
  when a test part exists.
  """
  # if not self.is_categorical_like(): raise Exception('only supported for categorical like columns')
  if not isinstance(y, pd.Series): y = pd.Series(y)
  train = self[:len(y)]
  test = self[len(y):]
  df = pd.DataFrame({'c_1' : train, 'n_y': y.values})

  def iqm(x): return np.mean(np.percentile(x, [75 ,25]))

  s = df.groupby('c_1')['n_y'].\
      transform(iqm if stat == 'iqm' else stat)
  if len(test) > 0:
    _, not_in_train = train.difference_with(test, quiet=True)
    transformer = dict(zip(train, s))

    # NOTE(review): `test` is a slice of self, so this assignment may also
    # modify the caller's series depending on the pandas version - confirm.
    test[test.isin(not_in_train)] = missing_value if \
        missing_treatment == 'missing-category' and missing_value in transformer else 'use-whole-set'

    if (missing_treatment != 'missing-category' or missing_value not in transformer):
      transformer['use-whole-set'] = utils.get_col_aggregate(y, stat)
    s = s.append_bottom(test.map(transformer))

  # Guard against the None default: `None > 0` raises TypeError on Python 3.
  if noise_level is not None and noise_level > 0:
    s = s.add_noise(noise_level, 'gaussian')
  return s
Exemple #3
0
def _s_to_stat(self,
               y,
               stat='mean',
               missing_value='missing',
               missing_treatment='missing-category'):
    """Encode each category of this series as a statistic of ``y``.

    The leading ``len(y)`` entries form the training part; whatever follows
    is the test part. Training entries receive ``stat`` of ``y`` within
    their category ('iqm' means the average of the 25th and 75th
    percentiles; any other value is forwarded to groupby ``transform``).
    Test entries are looked up in the category -> statistic table derived
    from the training part; values reported as absent from training are
    first rewritten to ``missing_value`` or to 'use-whole-set' (which maps
    to ``utils.get_col_aggregate(y, stat)``).

    Raises Exception when the series is not categorical-like.
    Returns the training encoding, extended with the test encoding when a
    test part exists.
    """
    if not self.is_categorical_like():
        raise Exception('only supported for categorical like columns')
    if type(y) is not pd.Series:
        y = pd.Series(y)

    n_train = len(y)
    seen = self[:n_train]
    unseen = self[n_train:]

    def quartile_midpoint(values):
        return np.mean(np.percentile(values, [75, 25]))

    aggregator = quartile_midpoint if stat == 'iqm' else stat
    frame = pd.DataFrame({'c_1': seen, 'n_y': y.values})
    encoded = frame.groupby('c_1')['n_y'].transform(aggregator)

    # Nothing beyond the training part: the per-row statistics are the answer.
    if len(unseen) == 0:
        return encoded

    _, novel = seen.difference_with(unseen, quiet=True)
    lookup = dict(zip(seen, encoded))

    # Fall back to the whole-set statistic unless the caller asked for the
    # missing-category treatment and that category exists in training.
    use_whole_set = (missing_treatment != 'missing-category'
                     or missing_value not in lookup)
    if use_whole_set:
        unseen[unseen.isin(novel)] = 'use-whole-set'
        lookup['use-whole-set'] = utils.get_col_aggregate(y, stat)
    else:
        unseen[unseen.isin(novel)] = missing_value

    return encoded.append_bottom(unseen.map(lookup))
Exemple #4
0
def _s_missing(self, fill='none'):
  """Fill missing and infinite values of this series in place; return self.

  The fill value comes from ``utils.get_col_aggregate(self, fill)`` and is
  computed once, before any value is replaced. It is used both for NaN
  (``fillna``) and for +/-inf (``replace``).
  """
  # repr() instead of backticks: same text, but also valid on Python 3.
  misc.start('replacing series missing data fill[' + repr(fill) + ']')
  val = utils.get_col_aggregate(self, fill)
  self.fillna(val, inplace=True)
  self.replace([np.inf, -np.inf], val, inplace=True)

  misc.stop('replacing series missing data')
  return self
Exemple #5
0
def _s_missing(self, fill='none'):
    """Fill missing and infinite values of this series in place; return self.

    The fill value comes from ``utils.get_col_aggregate(self, fill)`` and is
    computed once, before any value is replaced. It is used both for NaN
    (``fillna``) and for +/-inf (``replace``).
    """
    # repr() instead of backticks: same text, but also valid on Python 3.
    misc.start('replacing series missing data fill[' + repr(fill) + ']')
    val = utils.get_col_aggregate(self, fill)
    self.fillna(val, inplace=True)
    self.replace([np.inf, -np.inf], val, inplace=True)

    misc.stop('replacing series missing data')
    return self
Exemple #6
0
def _s_categorical_outliers(self, min_size=0.01, fill_mode='mode'):
  """Overwrite rare categories of this series in place; return self.

  min_size: a float is taken as a fraction of ``len(self)``; any other
    value is used directly as the count threshold.
  fill_mode: passed to ``utils.get_col_aggregate`` to obtain the
    replacement value, computed before anything is overwritten.

  Every value whose count is at or below the threshold is replaced.
  """
  # isinstance also recognises float subclasses such as numpy.float64, which
  # an exact type comparison would treat as an absolute count.
  threshold = float(len(self)) * min_size if isinstance(min_size, float) else min_size
  fill = utils.get_col_aggregate(self, fill_mode)
  vc = self.value_counts()
  under = vc[vc <= threshold]
  if under.shape[0] > 0:
    # repr() instead of backticks: same text, but also valid on Python 3.
    # The bracketed number is the count of distinct rare values.
    misc.debug('column [' + str(self.name) + '] threshold[' + repr(threshold) +
               '] fill[' + repr(fill) + '] num of rows[' + repr(len(under.index)) + ']')
    self[self.isin(under.index)] = fill
  return self
Exemple #7
0
def _s_categorical_outliers(self, min_size=0.01, fill_mode='mode'):
  """Overwrite rare categories of this series in place; return self.

  min_size: a float is taken as a fraction of ``len(self)``; any other
    value is used directly as the count threshold.
  fill_mode: passed to ``utils.get_col_aggregate`` to obtain the
    replacement value, computed before anything is overwritten.

  Every value whose count is at or below the threshold is replaced.
  """
  # isinstance also recognises float subclasses such as numpy.float64, which
  # an exact type comparison would treat as an absolute count.
  threshold = float(len(self)) * min_size if isinstance(min_size, float) else min_size
  fill = utils.get_col_aggregate(self, fill_mode)
  vc = self.value_counts()
  under = vc[vc <= threshold]
  if under.shape[0] > 0:
    # repr() instead of backticks: same text, but also valid on Python 3.
    # The bracketed number is the count of distinct rare values.
    misc.debug('column [' + str(self.name) + '] threshold[' + repr(threshold) +
               '] fill[' + repr(fill) + '] num of rows[' + repr(len(under.index)) + ']')
    self[self.isin(under.index)] = fill
  return self
Exemple #8
0
def _s_missing(self, fill="none"):
    """Fill missing values of this series in place; return self.

    The fill value comes from ``utils.get_col_aggregate(self, fill)`` and is
    computed once, before any value is replaced. NaN is always filled;
    +/-inf is replaced only when ``self.is_numerical()`` is true.
    """
    # repr() instead of backticks: same text, but also valid on Python 3.
    misc.start("replacing series missing data fill[" + repr(fill) + "]")

    val = utils.get_col_aggregate(self, fill)
    self.fillna(val, inplace=True)
    if self.is_numerical():
        self.replace([np.inf, -np.inf], val, inplace=True)

    misc.stop("replacing series missing data")
    return self
Exemple #9
0
def _df_missing(self, categorical_fill='none', numerical_fill='none'):
    """Fill missing (and infinite) values of this frame in place; return self.

    numerical_fill:
      - a non-string value is used as a constant: NaN in the columns listed
        by ``self.numericals()`` is filled with it, and +/-inf is replaced
        by it across the whole frame;
      - a string other than 'none' is handed to ``utils.get_col_aggregate``
        to compute one fill value per numerical column;
      - 'none' leaves numerical columns untouched.
    categorical_fill:
      - 'none' leaves the ``self.categorical_like()`` columns untouched;
      - 'mode' is handed to ``utils.get_col_aggregate`` per column;
      - any other value is used as a constant fill.
    """
    # repr() instead of backticks: same text, but also valid on Python 3.
    misc.start('replacing missing data categorical[' + repr(categorical_fill) +
               '] numerical[' + repr(numerical_fill) + ']')

    # Do numerical constants on whole DF for performance
    if not isinstance(numerical_fill, str):
        self[self.numericals()] = self[self.numericals()].fillna(
            numerical_fill)
        self.replace([np.inf, -np.inf], numerical_fill, inplace=True)
        numerical_fill = 'none'

    # Do categorical constants on whole DF for performance
    if categorical_fill != 'none' and categorical_fill != 'mode':
        self[self.categorical_like()] = self[self.categorical_like()].fillna(
            categorical_fill)
        categorical_fill = 'none'

    # Get list of columns still left to fill
    categoricals_to_fill = []
    numericals_to_fill = []
    if categorical_fill != 'none':
        categoricals_to_fill = list(self.categorical_like())
    if numerical_fill != 'none':
        numericals_to_fill = list(self.numericals())

    # Prepare a dictionary of column -> fill values
    to_fill = {}
    for c in categoricals_to_fill:
        to_fill[c] = utils.get_col_aggregate(self[c], categorical_fill)
    for c in numericals_to_fill:
        # The aggregate is computed before the infinities are replaced.
        to_fill[c] = utils.get_col_aggregate(self[c], numerical_fill)
        # Assign back explicitly: an in-place replace on the temporary
        # returned by self[c] is not guaranteed to reach the frame.
        self[c] = self[c].replace([np.inf, -np.inf], to_fill[c])

    # Do fill in one step for performance
    if to_fill:
        self.fillna(value=to_fill, inplace=True)

    misc.stop('done replacing missing data')
    return self