Beispiel #1
0
    def run(self, revised, X, aux):
        """Build the model matrix, outcome and train/test masks for one date.

        The split date is ``(self.year, self.month, self.day)``; training rows
        are taken from the ``self.train_years`` years before it.

        Args:
            revised: auxiliary table handed to ``revise_helper`` together
                with ``aux``; the helper's result is what the outcome, the
                optional ``train_query`` and the sample weights are computed
                from.
            X: feature matrix whose index has a ``date`` level. Rows outside
                the training window are dropped from it IN PLACE.
            aux: auxiliary table with a ``date`` index level and an
                ``address_min_date`` column. Also trimmed IN PLACE.

        Returns:
            dict with keys ``'X'``, ``'y'``, ``'train'``, ``'test'``,
            ``'aux'`` and ``'sample_weight'``.
        """
        today = util.timestamp(self.year, self.month, self.day)
        min_date = util.timestamp(self.year - self.train_years, self.month, self.day)

        # Trim both frames to [min_date, today]; each uses its own date index.
        for df in (X, aux):
            date = data.index_as_series(df, 'date')
            df.drop(df.index[(date < min_date) | (date > today)], inplace=True)

        logging.info('Splitting train and test sets')

        # don't include future addresses in training
        date = data.index_as_series(aux, 'date')
        train = (date < today) & (aux.address_min_date < today)
        test = date == today

        revised = revise_helper(revised=revised, aux=aux,
                train=train, test=test, today=today)
        if self.train_query is not None:
            # Keep only training rows that also satisfy the user query.
            train &= train.index.isin(
                    revised.query(self.train_query).index)

        aux = aux[(train | test)]
        X, train, test = data.train_test_subset(X, train, test, drop=True)

        aggregations = self.inputs[0].aggregations  # dictionary of Aggregations
        # .items() works on both Python 2 and 3 (iteritems() is Python 2 only).
        for a, args in self.aggregations.items():
            X = aggregations[a].select(X, args, inplace=True)

        # TODO: include gender, ethnicity, etc.
        y = revised.loc[X.index].eval(self.outcome_expr)
        X = data.select_features(X, exclude=self.EXCLUDE + self.exclude,
                include=self.include)

        if self.spacetime_normalize:
            # Iterate the Aggregation objects, not the dictionary keys:
            # iterating the dict itself yields keys, which are never
            # SpacetimeAggregation instances, so nothing would be normalized.
            prefixes = ['%s_.*' % a.prefix for a in aggregations.values()
                        if isinstance(a, SpacetimeAggregation)]
            spacetime = data.select_regexes(X.columns, prefixes)
            logging.info('Normalize %s columns', len(spacetime))
            # Standardize each spacetime column within its own date group.
            X.loc[:, spacetime] = X.loc[:, spacetime].groupby(
                    level='date').apply(
                        lambda x: pd.DataFrame(preprocessing.scale(x),
                        index=x.index, columns=x.columns))

        logging.info('Imputing')
        X = data.impute(X, train=train)

        # NOTE(review): weights are indexed like ``revised``, not like X --
        # confirm that consumers align them before use.
        sample_weight = 1 + (revised.wic * self.wic_sample_weight)

        c = data.non_numeric_columns(X)
        if len(c) > 0:
            logging.warning('Non-numeric columns: %s', c)

        return {'X': X, 'y': y,
                'train': train, 'test': test,
                'aux': aux, 'sample_weight': sample_weight}
Beispiel #2
0
    def run(self, aux, addresses):
        """
        Assign each row of ``aux`` a reference date and join the addresses.

        ``aux`` is modified IN PLACE before the merge: rows with a missing
        ``date_of_birth`` or one earlier than
        ``(self.year_min, self.month, self.day)`` are dropped, and a ``date``
        column is added.

        Returns:
            left: table with kid_id, address_id, date as well as additional
                address fields (e.g. census_tract_id)
            aux: table aligned with left that includes additional fields used
                for selecting training set (e.g. date of birth) and generating
                outcome variables (e.g. maximum BLL).

        Raises:
            ValueError: if ``addresses`` has no ``address`` column.
        """
        min_date = util.timestamp(self.year_min, self.month, self.day)
        aux.dropna(subset=['date_of_birth'], inplace=True)
        aux.drop(aux.index[aux.date_of_birth < min_date], inplace=True)

        # Date stuff
        logging.info('dates')
        aux['date'] = aux.date_of_birth.apply(
            util.date_ceil(self.month, self.day))

        # if bll6 happens before dob.date_ceil() use date_floor instead
        bll6_before_date = aux.first_bll6_sample_date < aux.date
        aux.loc[bll6_before_date, 'date'] = aux.loc[
            bll6_before_date, 'first_bll6_sample_date'].apply(
                util.date_floor(self.month, self.day))

        aux = aux.merge(addresses, on='address_id')

        # left keeps the ids, the date and every address field except the
        # raw address itself.
        left_columns = ['kid_id', 'date'] + list(addresses.columns)
        left_columns.remove('address')
        left = aux[left_columns]

        return {'left': left, 'aux': aux}
Beispiel #3
0
    def run(self, revised, X, aux):
        """
        Restrict X and aux to the training window and split train from test.

        Args:
            revised: auxiliary information revised for the train-test date
            X: the full feature matrix from LeadData
            aux: the unrevised auxiliary data from LeadData

        Returns:
            dict with keys 'X', 'aux', 'train' and 'test'.
        """
        logging.info('Splitting train and test sets')
        split_date = util.timestamp(self.year, self.month, self.day)
        window_start = util.timestamp(
            self.year - self.train_years, self.month, self.day)

        # One mask, derived from X's date index, filters both frames.
        in_window = data.index_as_series(X, 'date').between(
            window_start, split_date)
        X = X[in_window]
        aux = aux[in_window]

        aux_date = data.index_as_series(aux, 'date')
        test = aux_date == split_date
        train = (aux_date < split_date) & (aux.address_min_date < split_date)

        aux = revise_helper(
            revised=revised, aux=aux, train=train, test=test, today=split_date)

        if self.train_query is not None:
            # Only rows matching the query stay in the training set.
            queried_index = aux.query(self.train_query).index
            train &= train.index.isin(queried_index)

        keep = train | test
        aux = aux[keep]
        X, train, test = data.train_test_subset(X, train, test, drop=False)

        non_numeric = data.non_numeric_columns(X)
        if len(non_numeric) > 0:
            logging.warning('Non-numeric columns: %s' % non_numeric)

        return {'X': X, 'aux': aux, 'train': train, 'test': test}
Beispiel #4
0
 def run(self, addresses):
     """
     Pair every address with one reference date per year.

     A date is built for each year in ``[self.year_min, self.year_max]``
     using ``self.month`` and ``self.day``.

     Returns:
         - left: the cross product of the output.addresses table with the
             specified dates; the date is held in a ``date`` column.
     """
     dates = [timestamp(year, self.month, self.day)
              for year in range(self.year_min, self.year_max+1)]
     if len(dates) == 1:
         # when there's exactly one date modify in place for efficiency
         addresses['date'] = dates[0]
         left = addresses
     else:
         # Name the column explicitly: pd.DataFrame(dates) would label it
         # ``0``, unlike the ``date`` column of the single-date branch.
         left = cross_join(addresses, pd.DataFrame({'date': dates}))

     return {'left':left}
Beispiel #5
0
    def run(self, aux, addresses):
        """
        Assign each row of ``aux`` a reference date and join the addresses.

        ``aux`` is modified IN PLACE before the merge: rows whose
        ``date_of_birth`` is missing or earlier than
        ``(self.year_min, self.month, self.day)`` are dropped, and a ``date``
        column is added.

        Returns:
            dict with
            left: the ``kid_id`` and ``date`` columns plus every column of
                ``addresses`` except ``address``.
            aux: ``aux`` merged with ``addresses`` on ``address_id``.

        Raises:
            ValueError: if ``addresses`` has no ``address`` column.
        """
        min_date = util.timestamp(self.year_min, self.month, self.day)
        # A missing date of birth compares False against min_date, so it
        # would survive the filter below and be passed to util.date_ceil;
        # drop such rows up front.
        aux.dropna(subset=['date_of_birth'], inplace=True)
        aux.drop(aux.index[aux.date_of_birth < min_date], inplace=True)

        # Date stuff
        logging.info('dates')
        aux['date'] = aux.date_of_birth.apply(
                util.date_ceil(self.month, self.day))

        # if bll6 happens before dob.date_ceil() use date_floor instead
        bll6_before_date = aux.first_bll6_sample_date < aux.date
        aux.loc[bll6_before_date, 'date'] = aux.loc[
                bll6_before_date, 'first_bll6_sample_date'].apply(
                    util.date_floor(self.month, self.day))

        aux = aux.merge(addresses, on='address_id')

        left_columns = ['kid_id', 'date'] + list(addresses.columns)
        left_columns.remove('address')
        left = aux[left_columns]

        return {'left':left, 'aux':aux}
Beispiel #6
0
    def run(self, aux, addresses):
        """
        Compute a reference date per row of ``aux`` and attach address fields.

        Side effect: ``aux`` is trimmed IN PLACE (rows with a missing
        ``date_of_birth``, or one before
        ``(self.year_min, self.month, self.day)``, are removed) and gains a
        ``date`` column before being merged with ``addresses``.

        Returns:
            dict with
            left: ``kid_id``, ``date`` and all ``addresses`` columns other
                than ``address``.
            aux: the result of merging ``aux`` with ``addresses`` on
                ``address_id``.

        Raises:
            ValueError: if ``addresses`` has no ``address`` column.
        """
        min_date = util.timestamp(self.year_min, self.month, self.day)
        # NaT is never "< min_date", so rows without a date of birth would
        # otherwise slip through the cutoff and reach util.date_ceil.
        aux.dropna(subset=['date_of_birth'], inplace=True)
        aux.drop(aux.index[aux.date_of_birth < min_date], inplace=True)

        # Date stuff
        logging.info('dates')
        aux['date'] = aux.date_of_birth.apply(
            util.date_ceil(self.month, self.day))

        # if bll6 happens before dob.date_ceil() use date_floor instead
        bll6_before_date = aux.first_bll6_sample_date < aux.date
        floored = aux.loc[bll6_before_date, 'first_bll6_sample_date'].apply(
            util.date_floor(self.month, self.day))
        aux.loc[bll6_before_date, 'date'] = floored

        aux = aux.merge(addresses, on='address_id')

        left_columns = ['kid_id', 'date'] + list(addresses.columns)
        left_columns.remove('address')
        left = aux[left_columns]

        return {'left': left, 'aux': aux}