Esempio n. 1
0
    def run(self, revised, X, aux):
        """
        Build the train/test feature matrix, outcome and sample weights.

        Args:
            revised: auxiliary information revised for the train-test date;
                merged with aux by revise_helper
            X: feature matrix whose index has a 'date' level
            aux: unrevised auxiliary data; its index has a 'date' level and
                it has an address_min_date column

        Returns:
            dict with keys 'X', 'y', 'train', 'test', 'aux' and
            'sample_weight'

        Note:
            X and aux are trimmed in place to the [min_date, today] window,
            so the frames passed in by the caller are modified.
        """
        today = util.timestamp(self.year, self.month, self.day)
        min_date = util.timestamp(self.year - self.train_years, self.month, self.day)

        # drop (in place) every row dated outside [min_date, today]
        for df in (X, aux):
            date = data.index_as_series(df, 'date')
            df.drop(df.index[(date < min_date) | (date > today)], inplace=True)

        logging.info('Splitting train and test sets')

        # don't include future addresses in training
        date = data.index_as_series(aux, 'date')
        train = (date < today) & (aux.address_min_date < today)
        test = date == today

        revised = revise_helper(revised=revised, aux=aux,
                train=train, test=test, today=today)
        if self.train_query is not None:
            # keep only training rows that also satisfy the train query
            train &= train.index.isin(
                    revised.query(self.train_query).index)

        # restrict aux and X to the rows used for training or testing
        aux = aux[(train | test)]
        X, train, test = data.train_test_subset(X, train, test, drop=True)

        aggregations = self.inputs[0].aggregations  # dictionary of Aggregations
        # items() works on both Python 2 and 3; iteritems() exists only on
        # Python 2 dicts
        for a, args in self.aggregations.items():
            X = aggregations[a].select(X, args, inplace=True)

        # TODO: include gender, ethnicity, etc.
        y = revised.loc[X.index].eval(self.outcome_expr)
        X = data.select_features(X, exclude=self.EXCLUDE + self.exclude,
                include=self.include)

        if self.spacetime_normalize:
            # NOTE(review): aggregations is used as a dictionary above, so
            # iterating it directly yields its keys; if those keys are names
            # rather than aggregation objects the isinstance test never
            # matches and nothing is normalized -- confirm whether .values()
            # was intended.
            prefixes = ['%s_.*' % a.prefix for a in
                    self.inputs[0].aggregations
                        if isinstance(a, SpacetimeAggregation)]
            spacetime = data.select_regexes(X.columns, prefixes)
            logging.info('Normalize %s columns' % len(spacetime))
            # scale the selected columns separately within each date
            X.loc[:, spacetime] = X.loc[:, spacetime].groupby(
                    level='date').apply(
                        lambda x: pd.DataFrame(preprocessing.scale(x),
                        index=x.index, columns=x.columns))

        logging.info('Imputing')
        X = data.impute(X, train=train)

        # wic rows get extra weight, scaled by wic_sample_weight
        sample_weight = 1 + (revised.wic * self.wic_sample_weight)

        c = data.non_numeric_columns(X)
        if len(c) > 0:
            logging.warning('Non-numeric columns: %s' % c)

        return {'X': X, 'y': y,
                'train': train, 'test': test,
                'aux': aux, 'sample_weight': sample_weight}
Esempio n. 2
0
def augment(y):
    """
    Add derived columns to the aux matrix for use in train and test queries.

    The frame is modified in place. New columns are ages relative to
    date_of_birth (in units of util.day), a wic flag, max_bll threshold
    flags and three variants of true6 that are masked unless the kid either
    reached the threshold or was tested after about age 2.
    """
    y['age'] = (data.index_as_series(y, 'date') - y.date_of_birth) / util.day

    # (new age column, source date column), in the order they are added
    event_ages = (
        ('last_sample_age', 'last_sample_date'),
        ('first_sample_age', 'first_sample_date'),
        ('address_test_max_age', 'address_test_max_date'),
        ('first_bll6_sample_age', 'first_bll6_sample_date'),
    )
    for age_column, date_column in event_ages:
        y[age_column] = (y[date_column] - y.date_of_birth) / util.day

    y['wic'] = y.first_wic_date.notnull()

    for level in (6, 5, 4):
        y['true%d' % level] = y.max_bll >= level

    about_age_two = 365 * 1.9
    ever_bll6 = y.max_bll >= 6
    tested_after_two = y.last_sample_age > about_age_two
    tested_here_after_two = y.address_test_max_age > about_age_two

    # bll6 or tested after about age 2
    y['true6_2y'] = y.true6.where(ever_bll6 | tested_after_two)
    # bll6 or tested at this address after about age 2
    y['true6_2y_here'] = y.true6.where(ever_bll6 | tested_here_after_two)
    # bll6 at this address or tested at this address after about age 2
    y['true6_here_2y_here'] = y.true6.where(
        (y.address_max_bll >= 6) | tested_here_after_two)
Esempio n. 3
0
    def run(self, revised, X, aux):
        """
        Split the feature matrix and auxiliary data into train and test sets.

        Args:
            revised: auxiliary information revised for the train-test date
            X: the full feature matrix from LeadData
            aux: the unrevised auxiliary data from LeadData

        Returns:
            dict with keys 'X', 'aux', 'train' and 'test'; X, train and test
            are the values returned by data.train_test_subset
        """
        logging.info('Splitting train and test sets')
        today = util.timestamp(self.year, self.month, self.day)
        min_date = util.timestamp(
            self.year - self.train_years, self.month, self.day)

        # one mask, built from X's dates, trims both frames to the
        # [min_date, today] window
        in_window = data.index_as_series(X, 'date').between(min_date, today)
        X = X[in_window]
        aux = aux[in_window]

        aux_date = data.index_as_series(aux, 'date')
        # don't include future addresses in training
        known_address = aux.address_min_date < today
        train = (aux_date < today) & known_address
        test = aux_date == today

        aux = revise_helper(
            revised=revised, aux=aux, train=train, test=test, today=today)

        if self.train_query is not None:
            # keep only training rows that also satisfy the train query
            queried_index = aux.query(self.train_query).index
            train &= train.index.isin(queried_index)

        aux = aux[train | test]
        X, train, test = data.train_test_subset(X, train, test, drop=False)

        non_numeric = data.non_numeric_columns(X)
        if len(non_numeric) > 0:
            logging.warning('Non-numeric columns: %s' % non_numeric)

        return dict(X=X, aux=aux, train=train, test=test)
Esempio n. 4
0
def revise_helper(revised, aux, train, test, today):
    """
    Combine revised kid_addresses for the training rows with the unrevised
    kid_addresses (aux) for the test rows, then add derived age and wic
    columns to the combined frame.
    """
    def age_at(when):
        # time elapsed since date_of_birth, in units of util.day
        return (when - revised.date_of_birth) / util.day

    # index-only view of the training rows, used as left-join keys
    train_keys = aux[[]][train].reset_index()
    train_revised = train_keys.merge(
        revised, how='left', on=['kid_id', 'address_id'])
    train_revised = train_revised.set_index(['kid_id', 'address_id', 'date'])

    # training rows come from the revised data, test rows from aux as-is
    revised = pd.concat((train_revised, aux[test]))

    revised['last_sample_age'] = age_at(revised.last_sample_date)
    revised['wic'] = revised.first_wic_date.notnull()
    revised['today_age'] = age_at(today)
    revised['address_test_max_age'] = age_at(revised.address_test_max_date)

    revised['age'] = age_at(data.index_as_series(revised, 'date'))

    return revised
Esempio n. 5
0
from drain import explore, model, step, data, util

from lead.model import steps
import lead.model.data
import lead.output.aggregations

import pandas as pd

# Script: write the 2016 bll6_forest predictions, joined with two inspection
# features, to the 'predictions' SQL table.
step.BASEDIR = '/home/epotash/lead/data/drain/'
step.configure_yaml()

predictions = steps.bll6_forest()
query = 'address_wic_min_date < date'

# collect every step whose transform year is 2016 and use the first one
matching = []
for p in predictions:
    if p.named_arguments[('transform', 'year')] == 2016:
        matching.append(p)
s = matching[0]
s.load()
result = s.get_result()
y = result['y']
y['age'] = (data.index_as_series(y, 'date') - y.date_of_birth) / util.day

# the feature matrix comes from the first input of the transform step
d = s.get_input('transform').inputs[0]
d.load()
X = d.get_result()['X']

prediction_columns = [
    'score', 'age', 'address', 'first_name', 'last_name', 'date_of_birth',
    'max_bll', 'test_count', 'address_count', 'address_wic_infant',
]
inspection_columns = [
    'inspections_address_1y_inspected',
    'inspections_address_1y_complied',
]

engine = util.create_engine()
selected = y.query(query)[prediction_columns]
# replaces any existing 'predictions' table
selected.join(X[inspection_columns]).to_sql(
    name='predictions', con=engine, if_exists='replace')
Esempio n. 6
0
from drain import explore, model, step, data, util

from lead.model import steps
import lead.model.data
import lead.output.aggregations

import pandas as pd

# Script: export the 2016 bll6_forest predictions, together with two
# inspection features, to the 'predictions' SQL table.
step.BASEDIR = '/home/epotash/lead/data/drain/'
step.configure_yaml()

predictions = steps.bll6_forest()
query = 'address_wic_min_date < date'

# first step (in iteration order) whose transform year is 2016
steps_2016 = [
    p for p in predictions
    if p.named_arguments[('transform', 'year')] == 2016
]
s = steps_2016[0]
s.load()
result = s.get_result()
y = result['y']
birth = y.date_of_birth
y['age'] = (data.index_as_series(y, 'date') - birth) / util.day

# feature matrix from the first input of the transform step
transform = s.get_input('transform')
d = transform.inputs[0]
d.load()
X = d.get_result()['X']

engine = util.create_engine()
queried = y.query(query)
output = queried[[
    'score', 'age', 'address', 'first_name', 'last_name',
    'date_of_birth', 'max_bll', 'test_count', 'address_count',
    'address_wic_infant',
]]
inspections = X[[
    'inspections_address_1y_inspected',
    'inspections_address_1y_complied',
]]
# if_exists='replace' overwrites any existing 'predictions' table
output.join(inspections).to_sql(
    name='predictions', con=engine, if_exists='replace')