Ejemplo n.º 1
0
    def _inflate(self, df, transformedDf, timeCol=None, histCol=None):
        """
        Add an ``inflatedValue`` column scaled by the matching CPI inflation factor.

        The time column of ``df`` is passed through ``month_year_ftime`` and
        stored on ``transformedDf``; that column is then left-joined against
        ``self.cpi`` (on ``self.cpiDate``) to look up ``inflation`` per row.

        :param pandas.DataFrame df: source data holding the time and value columns
        :param pandas.DataFrame transformedDf: frame being built up; modified in place
        :param key-like timeCol: column with the date of the historical value (first key is used)
        :param key-like histCol: column with the historical value to inflate (first key is used)
        :returns: ``transformedDf`` with the formatted time column and ``inflatedValue`` added
        """
        timeKey = _build_key(timeCol)[0]
        histKey = _build_key(histCol)[0]
        transformedDf[timeKey] = month_year_ftime(df[timeKey])

        # Move the row labels into a column so they survive the merge and can
        # be restored afterwards; this relies on the index being unnamed so
        # that the column is called 'index'.
        flattened = transformedDf.reset_index()
        joined = flattened.merge(self.cpi,
                                 how='left',
                                 left_on=timeKey,
                                 right_on=self.cpiDate)
        joined = joined.set_index('index')

        transformedDf['inflatedValue'] = joined['inflation'] * df[histKey]
        return transformedDf
Ejemplo n.º 2
0
    def _build_categoricals(self, df, transformedDf, categoricalCols=None):
        """
        Append dummy (indicator) columns for each categorical column.

        Each column is cast to a categorical whose categories come from
        ``self.lookups[col]``, so the resulting dummy columns are the same
        regardless of which values happen to be present in ``df``. Values not
        found in the lookup become missing and get no indicator set.

        :param pandas.DataFrame df: source data holding the categorical columns
        :param pandas.DataFrame transformedDf: frame the dummy columns are appended to
        :param key-like categoricalCols: columns to dummify
        :returns: new frame made of ``transformedDf`` plus the dummy columns
        """
        categoricalCols = _build_key(categoricalCols)
        for col in categoricalCols:
            # pandas rejects null categories; missing values are represented
            # without a category of their own, so dropping them is lossless.
            categories = pd.Index(self.lookups[col]).dropna()
            # ``astype('category', categories=...)`` is no longer accepted by
            # pandas; build the categorical explicitly and keep df's index so
            # the concat below still aligns row by row.
            series = pd.Series(pd.Categorical(df[col], categories=categories),
                               index=df.index,
                               name=col)
            dummified = pd.get_dummies(series, prefix=col, drop_first=True)
            transformedDf = pd.concat([transformedDf, dummified], axis=1)

        return transformedDf
Ejemplo n.º 3
0
    def transform(self,
                  df,
                  groupId=('company_id', ),
                  groupTarget=('key', ),
                  aggTarget='n_scores'):
        """
        Spread per-group totals of ``aggTarget`` into one column per target value.

        ``aggTarget`` is summed for every (``groupId``, ``groupTarget``)
        combination, the ``groupTarget`` columns are dummified, each indicator
        is scaled by its total, and the rows are collapsed to one per
        ``groupId``.

        :param pandas.DataFrame df: long-format data to aggregate
        :param key-like groupId: column(s) identifying the entity each output row describes
        :param key-like groupTarget: column(s) whose values become output columns
        :param str aggTarget: numeric column to sum
        :returns: frame with the ``groupId`` column(s) plus one summed column per dummy
        """
        groupId = list(_build_key(groupId))
        groupTarget = list(_build_key(groupTarget))
        groupKey = groupId + groupTarget

        # Select the target column before summing so unrelated (possibly
        # non-numeric) columns are never aggregated.
        data = df.groupby(groupKey)[[aggTarget]].sum().reset_index()
        data = pd.get_dummies(data, columns=groupTarget)
        aggValues = data.pop(aggTarget)

        # Set aside every id column, not just the first one, so none of them
        # is scaled by the aggregate values below.
        ids = data[groupId]
        indicators = data.drop(groupId, axis=1)
        weighted = indicators.multiply(aggValues, axis='index')
        data = pd.concat([ids, weighted], axis=1)

        aggregated = data.groupby(groupId).sum().reset_index()
        return aggregated
Ejemplo n.º 4
0
    def _run_risk(self,
                  dumCols=('signal_group', 'metric_type', 'score_value')):
        """
        Load the companies and risk tables and fold ``dumCols`` into a single key.

        Reads ``self.companiesPath`` and ``self.riskPath`` as CSV into
        ``self.dfCompanies`` and ``self.dfRisk``, adds a ``key`` column to the
        risk table made of the ``dumCols`` values joined with ``_`` (missing
        values appear as the text ``NaN``), then drops ``dumCols`` from it.

        :param key-like dumCols: risk-table columns to merge into ``key``
        """
        def _joinRow(row):
            return "_".join(row)

        self.dfCompanies = pd.read_csv(self.companiesPath)
        self.dfRisk = pd.read_csv(self.riskPath)

        keyCols = _build_key(dumCols)
        # Composite key => less overhead during dummification
        filled = self.dfRisk[keyCols].fillna('NaN')
        self.dfRisk['key'] = filled.apply(_joinRow, axis=1)

        self.dfRisk.drop(keyCols, axis=1, inplace=True)
Ejemplo n.º 5
0
    def build_pipeline(clf, cols):
        """
        Build :py:class:`sklearn.Pipeline` that selects cols, imputes, scales and then fits an estimator

        :param sklearn.estimator clf: estimator used as the final ``classifier`` step
        :param key-like cols: columns to pull from design to build features for training
        :returns: pipeline with steps ``build_features``, ``impute``, ``scale`` and ``classifier``
        """
        selector = ItemSelector(_build_key(cols))
        steps = [
            ('build_features', FeatureUnion([('fields', selector)])),
            ('impute', Imputer()),
            ('scale', StandardScaler()),
            ('classifier', clf),
        ]
        return Pipeline(steps)
Ejemplo n.º 6
0
    def transform(self, df, transformKwargs=build_transform_kwargs(), dropCols=('last_sale_date', 'last_sale_amount'), customZips=True):
        """
        Transformer for cleaning raw housing data

        :param pandas.DataFrame df: raw data
        :param dict transformKwargs: dict mapping the name of a method on this
            object to the keyword arguments it is called with
            see :func:`windfall.src.helpers.build_transform_kwargs`
            and :func:`windfall.src.helpers.build_train_transform_kwargs`

        :param key-like dropCols: columns to drop from data
        :param bool customZips: switch for leveraging aggregated historical sale zipcode data
        :returns: the transformed frame, indexed like ``df``
        """
        # NOTE: the default ``transformKwargs`` is evaluated once, when the
        # method is defined, and shared by every call; it is only read here.
        transformedDf = pd.DataFrame(index=df.index)

        # ``items()`` works on both Python 2 and Python 3 dicts, whereas
        # ``iteritems()`` exists on Python 2 only.
        for operation, kwargs in transformKwargs.items():
            transformedDf = getattr(self, operation)(df, transformedDf, **kwargs)

        if customZips:
            transformedDf = self._custom_zips(df, transformedDf)
        transformedDf = self._custom_drops(df, transformedDf, _build_key(dropCols))

        if not self.isTransformed:
            self.isTransformed = True
        return transformedDf
Ejemplo n.º 7
0
    def _diffTime(df, transformedDf, dtCol=None):
        """
        Store the number of days between today and a date column as ``dt``.

        Entries that cannot be parsed as dates are coerced to missing values.

        :param pandas.DataFrame df: source data holding the date column
        :param pandas.DataFrame transformedDf: frame that receives the ``dt`` column; modified in place
        :param key-like dtCol: date column to measure from (first key is used)
        :returns: ``transformedDf`` with ``dt`` added
        """
        column = _build_key(dtCol)[0]
        today = pd.to_datetime('today')
        parsed = pd.to_datetime(df[column], errors='coerce')
        elapsed = today - parsed
        transformedDf['dt'] = elapsed.dt.days

        return transformedDf
Ejemplo n.º 8
0
 def _build_categories(self, df, lookupCols):
     """
     Record the distinct values of each lookup column in ``self.lookups``.

     :param pandas.DataFrame df: data to collect the values from
     :param key-like lookupCols: columns whose unique values are stored, keyed by column name
     """
     for name in _build_key(lookupCols):
         distinct = df[name].unique()
         self.lookups[name] = distinct