def standardize(self, data):
    """
    Return a standardized copy of data.

    Parameters
    ----------
    data : pandas Series or DataFrame

    Returns
    -------
    A float copy of data.  Items that appear both in data and in
    self._should_standardize are replaced by (value - mu) / sigma, with
    mu and sigma read from self.stats.  All other items are left as is.

    Notes
    -----
    data is validated with self._check_compatible before anything else
    happens.  The input object itself is never modified.
    """
    self._check_compatible(data)

    fitted = self.stats
    result = data.copy().astype('float')

    # Nothing was flagged for standardization: the float copy is the answer
    if not self._should_standardize:
        return result

    # Only touch the flagged items that are actually present in data
    targets = common_math.get_item_names(data).intersection(
        self._should_standardize)
    centered = data[targets] - fitted.mu[targets]
    result[targets] = centered / fitted.sigma[targets]

    return result
def unstandardize_params(self, w_st):
    """
    Map parameters fitted on standardized data back to the original
    scale.  Returns "w" such that
        X.dot(w) = self.standardize(X).dot(w_st)

    Parameters
    ----------
    w_st : Pandas.Series
        Index is names of variables
        Values are the fitted parameter values

    Returns
    -------
    w : Pandas.Series
        Float copy of w_st with the standardized entries divided by their
        sigma and the entry at self._ones_column reduced by
        sum(mu * w_st / sigma) over the standardized entries.

    Raises
    ------
    AssertionError
        If self._ones_column is falsy.
    """
    self._check_compatible(w_st)
    assert self._ones_column, (
        "Specify a ones_column during initialization if you want to "
        "unstandardize")

    # Start from a float copy; entries that were never standardized
    # are returned unchanged.
    w = w_st.copy().astype('float')

    rescaled = common_math.get_item_names(w_st).intersection(
        self._should_standardize)
    if len(rescaled) == 0:
        return w

    # Undo the division by sigma for the standardized columns
    fitted_part = w_st[rescaled]
    sigma = self.stats.sigma[rescaled]
    w[rescaled] = fitted_part / sigma

    # Undo the mean shift: the "excess" is absorbed by the constant term
    mu = self.stats.mu[rescaled]
    w[self._ones_column] -= (mu * fitted_part / sigma).sum()

    return w
def _check_compatible(self, data):
    """
    Raise ValueError if the columns/index of the DataFrame/Series "data"
    are not all contained in self.known_columns.

    In that case we don't know how to standardize/unstandardize/winsorize
    data, so we must raise an exception.

    Parameters
    ----------
    data : pandas Series or DataFrame
        Its item names are obtained via common_math.get_item_names.

    Raises
    ------
    ValueError
        If at least one item name of data is missing from
        self.known_columns.  The message lists the unknown names.
    """
    # Use the explicit set-difference method.  The old ``.diff(other)``
    # spelling is no longer a set difference in current pandas.
    unknown = common_math.get_item_names(data).difference(
        self.known_columns)

    # Test the length rather than truthiness: the truth value of an
    # index/array is ambiguous with several elements, and a single falsy
    # label (e.g. 0 or '') would otherwise slip through unnoticed.
    if len(unknown) > 0:
        raise ValueError(
            "Data contained items we don't know how to work with: %s"
            % (unknown,))
def _get_clip_levels(self, df):
    """
    Build an object-dtype Series of clip levels, indexed by the item
    names of df.

    Items that are also in self._should_winsorize get whatever
    _get_clip_levels_series returns for that column (called with
    self.lower_quantile, self.upper_quantile and self.max_std).
    Every other item keeps NaN.
    """
    def levels_for(column):
        return _get_clip_levels_series(
            column, self.lower_quantile, self.upper_quantile, self.max_std)

    names = common_math.get_item_names(df)

    # Start with NaN everywhere; object dtype so that non-scalar levels
    # can be stored per item.
    clip_levels = pd.Series(
        np.full(len(names), np.nan), index=names).astype('O')

    to_clip = names.intersection(self._should_winsorize)
    if len(to_clip) > 0:
        # This cast to float prevents a mixed data type frame...which can
        # cause apply to act in a funny manner
        clip_levels[to_clip] = df[to_clip].astype('float').apply(levels_for)

    return clip_levels
def _get_clip_levels(self, df):
    """
    Compute the clip levels for every item of df.

    Returns an object-dtype Series indexed by the item names of df.
    Entries for items listed in self._should_winsorize hold the result of
    _get_clip_levels_series for that column; all remaining entries are NaN.
    """
    # NOTE(review): a method with this same name and logic is defined
    # directly before this one; if both live in the same class, this later
    # definition is the one that takes effect -- confirm and deduplicate.
    item_names = common_math.get_item_names(df)
    winsorized_names = item_names.intersection(self._should_winsorize)

    placeholder = np.nan * np.ones(len(item_names))
    result = pd.Series(placeholder, index=item_names).astype('O')

    # No item of df is flagged for winsorizing: everything stays NaN
    if len(winsorized_names) == 0:
        return result

    def series_levels(col):
        return _get_clip_levels_series(
            col, self.lower_quantile, self.upper_quantile, self.max_std)

    # This cast to float prevents a mixed data type frame...which can
    # cause apply to act in a funny manner
    as_float = df[winsorized_names].astype('float')
    result[winsorized_names] = as_float.apply(series_levels)
    return result
def winsorize(self, data):
    """
    Winsorize the data using the rules determined during initialization.

    Returns a copy of data in which every item that is also listed in
    self._should_winsorize is limited to the (lower, upper) pair stored
    under its name in self.clip_levels.  Other items are copied unchanged.

    data is validated with self._check_compatible first; the input itself
    is not modified.
    """
    self._check_compatible(data)

    def clip_column(column):
        # Each entry of self.clip_levels unpacks into (lower, upper)
        low, high = self.clip_levels[column.name]
        capped = np.minimum(high, column)
        return np.maximum(low, capped)

    targets = common_math.get_item_names(data).intersection(
        self._should_winsorize)
    result = data.copy()

    # Nothing in data is flagged for winsorizing
    if len(targets) == 0:
        return result

    result[targets] = result[targets].apply(clip_column)
    return result