def run(self, revised, X, aux):
    today = util.timestamp(self.year, self.month, self.day)
    min_date = util.timestamp(self.year - self.train_years,
                              self.month, self.day)

    # restrict X and aux to dates within the training window
    for df in (X, aux):
        date = data.index_as_series(df, 'date')
        df.drop(df.index[(date < min_date) | (date > today)], inplace=True)

    logging.info('Splitting train and test sets')
    # don't include future addresses in training
    date = data.index_as_series(aux, 'date')
    train = (date < today) & (aux.address_min_date < today)
    test = date == today

    revised = revise_helper(revised=revised, aux=aux,
                            train=train, test=test, today=today)
    if self.train_query is not None:
        train &= train.index.isin(
                revised.query(self.train_query).index)

    # restrict aux to train and test rows, then align X with them
    aux = aux[train | test]
    X, train, test = data.train_test_subset(X, train, test, drop=True)

    # select the requested aggregation features
    aggregations = self.inputs[0].aggregations  # dictionary of Aggregations
    for a, args in self.aggregations.items():
        X = aggregations[a].select(X, args, inplace=True)

    # TODO: include gender, ethnicity, etc.
    y = revised.loc[X.index].eval(self.outcome_expr)
    X = data.select_features(X, exclude=self.EXCLUDE + self.exclude,
                             include=self.include)

    if self.spacetime_normalize:
        prefixes = ['%s_.*' % a.prefix
                    for a in self.inputs[0].aggregations
                    if isinstance(a, SpacetimeAggregation)]
        spacetime = data.select_regexes(X.columns, prefixes)
        logging.info('Normalize %s columns' % len(spacetime))
        # scale spacetime features to zero mean and unit variance
        # within each date group
        X.loc[:, spacetime] = X.loc[:, spacetime].groupby(
                level='date').apply(
                lambda x: pd.DataFrame(preprocessing.scale(x),
                                       index=x.index, columns=x.columns))

    logging.info('Imputing')
    X = data.impute(X, train=train)

    sample_weight = 1 + (revised.wic * self.wic_sample_weight)

    c = data.non_numeric_columns(X)
    if len(c) > 0:
        logging.warning('Non-numeric columns: %s' % c)

    return {'X': X, 'y': y, 'train': train, 'test': test,
            'aux': aux, 'sample_weight': sample_weight}
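# The spacetime normalization above scales each feature to zero mean and
# unit variance within every date group. A minimal standalone sketch of
# that technique, on a hypothetical toy frame (the column name, index and
# values below are made up for illustration, not part of the pipeline):

import pandas as pd
from sklearn import preprocessing

idx = pd.MultiIndex.from_product(
        [[1, 2, 3], pd.to_datetime(['2014-01-01', '2015-01-01'])],
        names=['kid_id', 'date'])
X_toy = pd.DataFrame({'tests_1y': [1., 4., 7., 2., 5., 8.]}, index=idx)

# transform() keeps the original index, so the result aligns with X_toy
X_toy_scaled = X_toy.groupby(level='date').transform(preprocessing.scale)
print(X_toy_scaled.groupby(level='date').mean())  # ~0 within every date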
def run(self, aux, addresses): """ Returns: left: table with kid_id, address_id, date as well as additional address fields (e.g. census_tract_id) aux: table aligned with left that includes additional fields used for selecting training set (e.g. date of birth) and generating outcome variables (e.g. maximum BLL). """ min_date = util.timestamp(self.year_min, self.month, self.day) aux.dropna(subset=['date_of_birth'], inplace=True) aux.drop(aux.index[aux.date_of_birth < min_date], inplace=True) # Date stuff logging.info('dates') aux['date'] = aux.date_of_birth.apply( util.date_ceil(self.month, self.day)) # if bll6 happens before dob.date_ceil() use date_floor instead bll6_before_date = aux.first_bll6_sample_date < aux.date aux.loc[bll6_before_date, 'date'] = aux.loc[bll6_before_date, 'first_bll6_sample_date'].apply( util.date_floor(self.month, self.day)) columns = aux.columns aux = aux.merge(addresses, on='address_id') left_columns = ['kid_id', 'date'] + list(addresses.columns) left_columns.remove('address') left = aux[left_columns] return {'left': left, 'aux': aux}
def run(self, revised, X, aux): """ Args: revised: auxillary informtaion revised for the train-test date X: the full feature matrix from LeadData aux: the unrevised auxillary data from LeadData """ logging.info('Splitting train and test sets') today = util.timestamp(self.year, self.month, self.day) min_date = util.timestamp(self.year - self.train_years, self.month, self.day) date = data.index_as_series(X, 'date') X = X[date.between(min_date, today)] aux = aux[date.between(min_date, today)] date = data.index_as_series(aux, 'date') train = (date < today) & (aux.address_min_date < today) test = date == today aux = revise_helper(revised=revised, aux=aux, train=train, test=test, today=today) if self.train_query is not None: train &= train.index.isin(aux.query(self.train_query).index) aux = aux[train | test] X, train, test = data.train_test_subset(X, train, test, drop=False) c = data.non_numeric_columns(X) if len(c) > 0: logging.warning('Non-numeric columns: %s' % c) return {'X': X, 'aux': aux, 'train': train, 'test': test}
def run(self, addresses): """ Returns: - left: the cross product of the output.addresses table with the specified dates. """ dates = [timestamp(year, self.month, self.day) for year in range(self.year_min, self.year_max+1)] if len(dates) == 1: # when there's exactly one date modify in place for efficiency addresses['date'] = dates[0] left = addresses else: left = cross_join(addresses, pd.DataFrame(dates)) return {'left':left}
def run(self, aux, addresses):
    min_date = util.timestamp(self.year_min, self.month, self.day)
    aux.drop(aux.index[aux.date_of_birth < min_date], inplace=True)

    # assign each kid a date: the (month, day) anniversary following birth
    logging.info('dates')
    aux['date'] = aux.date_of_birth.apply(
            util.date_ceil(self.month, self.day))

    # if the first BLL >= 6 sample happens before date_ceil(date_of_birth),
    # use date_floor of that sample date instead
    bll6_before_date = aux.first_bll6_sample_date < aux.date
    aux.loc[bll6_before_date, 'date'] = aux.loc[
            bll6_before_date, 'first_bll6_sample_date'].apply(
            util.date_floor(self.month, self.day))

    aux = aux.merge(addresses, on='address_id')

    left_columns = ['kid_id', 'date'] + list(addresses.columns)
    left_columns.remove('address')
    left = aux[left_columns]

    return {'left': left, 'aux': aux}