def _inflate(self, df, transformedDf, timeCol=None, histCol=None):
    """
    Adjust a historical value column for inflation using the CPI lookup

    :param pandas.DataFrame df: raw data
    :param pandas.DataFrame transformedDf: accumulating transformed data
    :param key-like timeCol: column holding the historical date
    :param key-like histCol: column holding the historical value to inflate
    """
    timeCol = _build_key(timeCol)[0]
    histCol = _build_key(histCol)[0]
    # Normalize dates to month-year strings so they join against the CPI table
    transformedDf[timeCol] = month_year_ftime(df[timeCol])
    cpiDf = transformedDf.reset_index()\
        .merge(self.cpi, left_on=timeCol, right_on=self.cpiDate, how='left')\
        .set_index('index')
    transformedDf['inflatedValue'] = cpiDf['inflation'] * df[histCol]
    return transformedDf
def _build_categoricals(self, df, transformedDf, categoricalCols=None):
    categoricalCols = _build_key(categoricalCols)
    for col in categoricalCols:
        # Pin the category set to the training-time lookups so the dummy
        # columns stay identical between training and scoring
        series = df[col].astype('category', categories=self.lookups[col])
        dummified = pd.get_dummies(series, prefix=col, drop_first=True)
        transformedDf = pd.concat([transformedDf, dummified], axis=1)
    return transformedDf
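# Note on the fixed category set (an illustrative sketch; 'property_type' and
# its values are hypothetical, not taken from this project's data): pinning the
# categories keeps the dummy columns aligned even when the scoring data is
# missing a level seen in training, e.g.
#
#     >>> series = pd.Series(['condo']).astype('category',
#     ...                                      categories=['condo', 'house'])
#     >>> pd.get_dummies(series, prefix='property_type', drop_first=True)
#        property_type_house
#     0                    0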
def transform(self, df, groupId=('company_id', ), groupTarget=('key', ),
              aggTarget='n_scores'):
    """
    Aggregate risk scores into one row per group id, with one dummy column per
    target value weighted by the aggregation target

    :param pandas.DataFrame df: raw risk data
    :param key-like groupId: identifier columns to aggregate over
    :param key-like groupTarget: columns to dummify into wide features
    :param str aggTarget: column whose sums weight the dummy columns
    """
    groupId = _build_key(groupId)
    groupTarget = _build_key(groupTarget)
    groupKey = groupId + groupTarget

    # Sum the aggregation target within each (id, target) group
    data = df.groupby(groupKey).sum()[[aggTarget]].reset_index()
    # Dummify the target columns and weight each indicator by the aggregate
    data = pd.get_dummies(data, columns=groupTarget)
    aggValues = data.pop(aggTarget)
    companies = data.pop(groupId[0])
    data = data.multiply(aggValues, axis='index')
    data = pd.concat([companies, data], axis=1)
    # Collapse back down to one row per id
    aggregated = data.groupby(groupId).sum().reset_index()
    return aggregated
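# Illustrative behaviour (a minimal sketch; the frame below is hypothetical and
# `transformer` stands in for an instance of this class):
#
#     >>> raw = pd.DataFrame({'company_id': [1, 1, 2],
#     ...                     'key': ['a', 'b', 'a'],
#     ...                     'n_scores': [3, 5, 2]})
#     >>> transformer.transform(raw)
#        company_id  key_a  key_b
#     0           1      3      5
#     1           2      2      0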
def _run_risk(self, dumCols=('signal_group', 'metric_type', 'score_value')):
    self.dfCompanies = pd.read_csv(self.companiesPath)
    self.dfRisk = pd.read_csv(self.riskPath)
    dumCols = _build_key(dumCols)

    # Composite key => less overhead during dummification
    self.dfRisk['key'] = self.dfRisk[dumCols]\
        .fillna('NaN')\
        .apply(lambda x: "_".join(x), axis=1)
    self.dfRisk.drop(dumCols, inplace=True, axis=1)
def build_pipeline(clf, cols):
    """
    Build :py:class:`sklearn.Pipeline` with a feature union of cols followed by
    an Imputer, a StandardScaler, and an estimator

    :param sklearn.estimator clf: estimator
    :param key-like cols: columns to pull from the design matrix to build
        features for training
    """
    features = FeatureUnion([('fields', ItemSelector(_build_key(cols)))])
    pipeline = Pipeline([('build_features', features),
                         ('impute', Imputer()),
                         ('scale', StandardScaler()),
                         ('classifier', clf)])
    return pipeline
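# Example usage (a minimal sketch; LogisticRegression, designDf, labels, and the
# column names are illustrative assumptions, not part of this module):
#
#     >>> from sklearn.linear_model import LogisticRegression
#     >>> pipeline = build_pipeline(LogisticRegression(), ('dt', 'inflatedValue'))
#     >>> pipeline.fit(designDf, labels)
#     >>> predictions = pipeline.predict(designDf)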
def transform(self, df, transformKwargs=build_transform_kwargs(),
              dropCols=('last_sale_date', 'last_sale_amount'), customZips=True):
    """
    Transformer for cleaning raw housing data

    :param pandas.DataFrame df: raw data
    :param dict transformKwargs: dict of transformation operations; see
        :func:`windfall.src.helpers.build_transform_kwargs` and
        :func:`windfall.src.helpers.build_train_transform_kwargs`
    :param key-like dropCols: columns to drop from data
    :param bool customZips: switch for leveraging aggregated historical sale
        zipcode data
    """
    transformedDf = pd.DataFrame(index=df.index)
    # Dispatch each transformation method by name with its keyword arguments
    for operation, kwargs in transformKwargs.iteritems():
        transformedDf = self.__getattribute__(operation)(df, transformedDf, **kwargs)

    if customZips:
        transformedDf = self._custom_zips(df, transformedDf)

    transformedDf = self._custom_drops(df, transformedDf, _build_key(dropCols))

    if not self.isTransformed:
        self.isTransformed = True

    return transformedDf
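# Illustrative shape of transformKwargs (hypothetical; the real mapping comes
# from windfall.src.helpers.build_transform_kwargs): each key names a
# transformation method on this class and maps to its keyword arguments, e.g.
#
#     {'_diffTime': {'dtCol': 'last_sale_date'},
#      '_inflate': {'timeCol': 'last_sale_date', 'histCol': 'last_sale_amount'},
#      '_build_categoricals': {'categoricalCols': ('property_type', )}}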
def _diffTime(df, transformedDf, dtCol=None):
    dtCol = _build_key(dtCol)[0]
    # Days elapsed since the recorded date; unparseable dates coerce to NaT/NaN
    deltaT = pd.to_datetime('today') - pd.to_datetime(df[dtCol], errors='coerce')
    transformedDf['dt'] = deltaT.dt.days
    return transformedDf
def _build_categories(self, df, lookupCols):
    """
    Record the unique values of each lookup column for later dummification

    :param pandas.DataFrame df: raw data
    :param key-like lookupCols: columns whose category levels should be stored
    """
    lookupCols = _build_key(lookupCols)
    for col in lookupCols:
        self.lookups[col] = df[col].unique()