def run(self, revised, X, aux):
    """
    Split the data into train and test sets and build the model inputs.

    Args:
        revised: auxiliary information that is merged onto the training
            rows by ``revise_helper``
        X: the feature matrix, indexed (at least) by ``date``
        aux: the unrevised auxiliary data, indexed like ``X``

    Returns:
        dict with keys ``'X'`` (selected, optionally normalized and imputed
        features), ``'y'`` (``self.outcome_expr`` evaluated on the revised
        data), ``'train'`` and ``'test'`` (boolean masks), ``'aux'`` and
        ``'sample_weight'``.

    NOTE: ``X`` and ``aux`` are trimmed to the training window IN PLACE, so
    the frames passed in by the caller are modified.
    """
    today = util.timestamp(self.year, self.month, self.day)
    min_date = util.timestamp(self.year - self.train_years,
                              self.month, self.day)

    # Drop everything outside [min_date, today]; done in place on the
    # caller's frames (no copy is made).
    for df in (X, aux):
        date = data.index_as_series(df, 'date')
        df.drop(df.index[(date < min_date) | (date > today)], inplace=True)

    logging.info('Splitting train and test sets')
    # don't include future addresses in training
    date = data.index_as_series(aux, 'date')
    train = (date < today) & (aux.address_min_date < today)
    test = date == today

    revised = revise_helper(revised=revised, aux=aux,
                            train=train, test=test, today=today)
    if self.train_query is not None:
        train &= train.index.isin(
            revised.query(self.train_query).index)

    # keep only the aux rows that ended up in train or test
    aux = aux[(train | test)]
    X, train, test = data.train_test_subset(X, train, test, drop=True)

    aggregations = self.inputs[0].aggregations  # dictionary of Aggregations
    # .items() instead of the Python 2-only .iteritems(): same pairs, and it
    # also works on Python 3.
    for a, args in self.aggregations.items():
        X = aggregations[a].select(X, args, inplace=True)

    # TODO: include gender, ethnicity, etc.
    y = revised.loc[X.index].eval(self.outcome_expr)
    X = data.select_features(X, exclude=self.EXCLUDE + self.exclude,
                             include=self.include)

    if self.spacetime_normalize:
        prefixes = ['%s_.*' % a.prefix
                    for a in self.inputs[0].aggregations
                    if isinstance(a, SpacetimeAggregation)]
        spacetime = data.select_regexes(X.columns, prefixes)
        logging.info('Normalize %s columns', len(spacetime))
        # scale the spacetime columns separately within each date
        X.loc[:, spacetime] = X.loc[:, spacetime].groupby(
            level='date').apply(
                lambda x: pd.DataFrame(preprocessing.scale(x),
                                       index=x.index, columns=x.columns))

    logging.info('Imputing')
    X = data.impute(X, train=train)

    sample_weight = 1 + (revised.wic * self.wic_sample_weight)

    c = data.non_numeric_columns(X)
    if len(c) > 0:
        logging.warning('Non-numeric columns: %s', c)

    return {'X': X, 'y': y, 'train': train, 'test': test,
            'aux': aux, 'sample_weight': sample_weight}
def augment(y):
    """
    Add derived columns to the aux matrix, in place, for use in train and
    test queries.

    Adds ages in days (``age``, ``last_sample_age``, ``first_sample_age``,
    ``address_test_max_age``, ``first_bll6_sample_age``), the ``wic`` flag,
    the ``true4``/``true5``/``true6`` threshold flags, and three masked
    variants of ``true6``. Nothing is returned.
    """
    day = util.day
    birth = y.date_of_birth

    # age on the row's index date
    y['age'] = (data.index_as_series(y, 'date') - birth) / day

    # ages at various recorded dates, (new column, source date column)
    age_sources = (
        ('last_sample_age', 'last_sample_date'),
        ('first_sample_age', 'first_sample_date'),
        ('address_test_max_age', 'address_test_max_date'),
        ('first_bll6_sample_age', 'first_bll6_sample_date'),
    )
    for age_col, date_col in age_sources:
        y[age_col] = (y[date_col] - birth) / day

    y['wic'] = y.first_wic_date.notnull()

    max_bll = y.max_bll
    y['true6'] = max_bll >= 6
    y['true5'] = max_bll >= 5
    y['true4'] = max_bll >= 4

    # cutoff in days, "about age 2"
    about_two_years = 365 * 1.9
    reached_bll6 = y.max_bll >= 6
    tested_late = y.last_sample_age > about_two_years
    tested_late_here = y.address_test_max_age > about_two_years

    # bll6, or tested after about age 2
    y['true6_2y'] = y['true6'].where(reached_bll6 | tested_late)

    # bll6, or tested at this address after about age 2
    y['true6_2y_here'] = y['true6'].where(reached_bll6 | tested_late_here)

    # bll6 at this address, or tested at this address after about age 2
    reached_bll6_here = y.address_max_bll >= 6
    y['true6_here_2y_here'] = y['true6'].where(
        reached_bll6_here | tested_late_here)
def run(self, revised, X, aux):
    """
    Restrict the data to the training window and split it into train and
    test sets.

    Args:
        revised: auxiliary information revised for the train-test date
        X: the full feature matrix from LeadData
        aux: the unrevised auxiliary data from LeadData

    Returns:
        dict with keys ``'X'``, ``'aux'`` (the output of ``revise_helper``,
        restricted to train and test rows), ``'train'`` and ``'test'``
        (boolean masks).
    """
    logging.info('Splitting train and test sets')

    today = util.timestamp(self.year, self.month, self.day)
    min_date = util.timestamp(self.year - self.train_years,
                              self.month, self.day)

    # Filter each frame with a mask built from its OWN index. Reusing the
    # mask computed from X to filter aux only works when the two indexes
    # match; otherwise pandas has to realign the mask.
    X = X[data.index_as_series(X, 'date').between(min_date, today)]
    aux = aux[data.index_as_series(aux, 'date').between(min_date, today)]

    date = data.index_as_series(aux, 'date')
    # training rows are strictly before today and at addresses first seen
    # before today; test rows are exactly today
    train = (date < today) & (aux.address_min_date < today)
    test = date == today

    aux = revise_helper(revised=revised, aux=aux,
                        train=train, test=test, today=today)

    if self.train_query is not None:
        train &= train.index.isin(aux.query(self.train_query).index)

    aux = aux[train | test]
    X, train, test = data.train_test_subset(X, train, test, drop=False)

    c = data.non_numeric_columns(X)
    if len(c) > 0:
        logging.warning('Non-numeric columns: %s', c)

    return {'X': X, 'aux': aux, 'train': train, 'test': test}
def revise_helper(revised, aux, train, test, today):
    """
    Combine revised data for the training rows with unrevised data (aux)
    for the test rows.

    The training rows of ``aux`` are reduced to their index, left-joined to
    ``revised`` on ``kid_id`` and ``address_id``, re-indexed by
    ``(kid_id, address_id, date)`` and concatenated with the test rows of
    ``aux``. Age columns (in days) and the ``wic`` flag are then computed on
    the combined frame, which is returned.
    """
    # index-only view of the training rows (no columns)
    train_keys = aux[[]][train].reset_index()
    merged = train_keys.merge(revised, how='left',
                              on=['kid_id', 'address_id'])
    merged = merged.set_index(['kid_id', 'address_id', 'date'])

    revised = pd.concat([merged, aux[test]])

    day = util.day
    birth = revised.date_of_birth

    revised['last_sample_age'] = (revised.last_sample_date - birth) / day
    revised['wic'] = revised.first_wic_date.notnull()
    revised['today_age'] = (today - birth) / day
    revised['address_test_max_age'] = (
        revised.address_test_max_date - birth) / day

    index_date = data.index_as_series(revised, 'date')
    revised['age'] = (index_date - birth) / day

    return revised
# Export the 2016 bll6_forest predictions, joined with two inspection
# features, to the 'predictions' SQL table (replacing any existing table).
from drain import explore, model, step, data, util
from lead.model import steps
import lead.model.data
import lead.output.aggregations
import pandas as pd

step.BASEDIR = '/home/epotash/lead/data/drain/'
step.configure_yaml()

predictions = steps.bll6_forest()
query = 'address_wic_min_date < date'

# first prediction step whose transform was run for 2016
steps_2016 = [
    p for p in predictions
    if p.named_arguments[('transform', 'year')] == 2016
]
s = steps_2016[0]
s.load()
result = s.get_result()

y = result['y']
# age in days on the row's index date
y['age'] = (data.index_as_series(y, 'date') - y.date_of_birth) / util.day

# feature matrix from the first input of the step's 'transform' input
d = s.get_input('transform').inputs[0]
d.load()
X = d.get_result()['X']

engine = util.create_engine()

y_columns = [
    'score', 'age', 'address', 'first_name', 'last_name',
    'date_of_birth', 'max_bll', 'test_count', 'address_count',
    'address_wic_infant',
]
x_columns = [
    'inspections_address_1y_inspected',
    'inspections_address_1y_complied',
]

output = y.query(query)[y_columns].join(X[x_columns])
output.to_sql(name='predictions', con=engine, if_exists='replace')
# Load the bll6_forest prediction step for 2016, compute each row's age and
# write the selected rows, with two inspection features joined on, to the
# 'predictions' SQL table (an existing table is replaced).
from drain import explore, model, step, data, util
from lead.model import steps
import lead.model.data
import lead.output.aggregations
import pandas as pd

step.BASEDIR = '/home/epotash/lead/data/drain/'
step.configure_yaml()

predictions = steps.bll6_forest()
query = 'address_wic_min_date < date'

# keep the steps whose transform year is 2016 and use the first one
matching = [p for p in predictions
            if p.named_arguments[('transform', 'year')] == 2016]
s = matching[0]
s.load()

result = s.get_result()
y = result['y']
index_date = data.index_as_series(y, 'date')
y['age'] = (index_date - y.date_of_birth) / util.day  # age in days

transform = s.get_input('transform')
d = transform.inputs[0]
d.load()
X = d.get_result()['X']

engine = util.create_engine()

prediction_columns = ['score', 'age', 'address', 'first_name',
                      'last_name', 'date_of_birth', 'max_bll',
                      'test_count', 'address_count', 'address_wic_infant']
inspection_columns = ['inspections_address_1y_inspected',
                      'inspections_address_1y_complied']

selected = y.query(query)[prediction_columns]
joined = selected.join(X[inspection_columns])
joined.to_sql(name='predictions', con=engine, if_exists='replace')