Esempio n. 1
0
    def run(self, event):
        """Add a combined event/result code column and binarize the codes.

        A new ``event_res_code`` column is written onto ``event`` by joining
        ``event_code`` and ``res_code`` with an underscore (e.g. 'REINS_C').
        ``event_code`` and ``event_res_code`` are then passed to
        ``data.binarize`` together with their class lists.

        NOTE(review): the return value of ``data.binarize`` is discarded, so
        this presumably relies on it modifying ``event`` in place -- confirm.

        Returns:
            The same ``event`` object that was passed in.
        """
        # combined code, e.g. 'REINS_C'
        combined_code = event.event_code + '_' + event.res_code
        event['event_res_code'] = combined_code

        # classes to expand for each categorical column
        classes_by_column = {
            'event_code': event_codes,
            'event_res_code': event_res_codes,
        }
        data.binarize(event, classes_by_column)

        return event
Esempio n. 2
0
    def run(self, acs, left, aux=None):
        """
        Build the feature matrix by joining aggregations and ACS onto ``left``.

        Args:
            acs: ACS features, joined onto ``left`` together with the results
                of ``self.aggregation_joins``.
            left: frame containing the index columns plus ``ward_id``,
                ``community_area_id``, ``address_lat`` and ``address_lng``.
            aux: auxiliary features. Required when ``self.address`` is falsy;
                not used otherwise. Its index is set in place.

        Returns:
            A dict with:
            - X: the feature matrix, containing all aggregation features, as
                well as ACS features, and a handful of simple features like
                age and sex.
            - aux: auxiliary features used for selecting a training set,
                setting sample weights, and evaluation. Only present when
                ``self.address`` is falsy.

        Raises:
            ValueError: if ``self.address`` is falsy and ``aux`` is None.
        """
        # Fail fast: without aux the non-address path can only crash later
        # (at aux.set_index at the latest), after the expensive join and after
        # the cached aggregation results have already been deleted.
        if not self.address and aux is None:
            raise ValueError('aux is required when address is not set')

        if self.address:
            index_columns = ['address', 'date']
        else:
            index_columns = ['kid_id', 'address_id', 'date']

        left_columns = [
            'ward_id', 'community_area_id', 'address_lat', 'address_lng'
        ]
        left = left[index_columns + left_columns]

        logging.info('Binarizing community area and ward')
        left = data.binarize(left, ['community_area_id', 'ward_id'],
                             astype=self.dtype)

        logging.info('Joining aggregations')
        X = left.join([a.get_result() for a in self.aggregation_joins] + [acs])
        # delete all aggregation inputs so that memory can be freed
        for a in self.aggregation_joins:
            del a._result

        if not self.address:
            logging.info('Adding auxillary features')
            add_aux_features(X, aux, self.dtype)

        X.set_index(index_columns, inplace=True)

        c = data.non_numeric_columns(X)
        # len() rather than truthiness: c may be an array-like whose truth
        # value is ambiguous.
        if len(c) > 0:
            logging.warning('Non-numeric columns: %s', c)

        if self.address:
            return {'X': X}

        # NOTE: this mutates the caller's aux frame.
        aux.set_index(index_columns, inplace=True)
        return {'X': X, 'aux': aux}
Esempio n. 3
0
#!/usr/bin/python
from drain import util
import pandas as pd
from drain import data

engine = util.create_engine()

# read tables from db: one row per permit with a concatenated address, the
# issue date and a cleaned-up permit type (rows without issue date excluded)
permits_query = (
    "select street_number || ' ' || street_direction || ' ' || street_name"
    " || ' ' || suffix as address, issue_date,"
    " lower(replace(substring(permit_type from 10), '/', ' ')) as permit_type"
    " from input.building_permits where issue_date is not null"
)
building_permits = pd.read_sql(permits_query, engine)

# binarize permit_type over every distinct value present in the data
# NOTE(review): the return value is discarded, so this presumably relies on
# data.binarize modifying the frame in place -- confirm.
permit_type_classes = {'permit_type': building_permits.permit_type.unique()}
data.binarize(building_permits, permit_type_classes, all_classes=True)

# write the result to aux.building_permits, replacing any existing table
db = util.PgSQLDatabase(engine)
db.to_sql(
    frame=building_permits,
    name='building_permits',
    if_exists='replace',
    index=False,
    schema='aux',
)
Esempio n. 4
0
def test_binarize_not_inplace():
    """With inplace=False the input frame is untouched and a new one returned."""
    source = pd.DataFrame({'a': ['b', 'c']})
    result = data.binarize(source, ['a'], inplace=False)

    # the input still has only its original column
    assert list(source.columns) == ['a']
    # the returned frame has one indicator column per class
    assert list(result.columns) == ['a_b', 'a_c']
Esempio n. 5
0
def test_binarize_all_classes():
    """With all_classes=False only the a_b indicator column is produced."""
    frame = pd.DataFrame({'a': ['b', 'c']})
    data.binarize(frame, ['a'], all_classes=False, inplace=True)

    remaining = list(frame.columns)
    assert remaining == ['a_b']
Esempio n. 6
0
def test_binarize_drop():
    """With drop=False the source column is kept next to the indicators."""
    frame = pd.DataFrame({'a': ['b', 'c']})
    data.binarize(frame, ['a'], drop=False, inplace=True)

    remaining = list(frame.columns)
    assert remaining == ['a', 'a_b', 'a_c']
Esempio n. 7
0
def test_binarize_not_inplace():
    """binarize(..., inplace=False) returns a new frame and keeps the input."""
    values = ['b', 'c']
    df = pd.DataFrame(dict(a=values))

    binarized = data.binarize(df, ['a'], inplace=False)

    input_columns = df.columns.tolist()
    output_columns = binarized.columns.tolist()
    # input unchanged, output holds the indicator columns
    assert input_columns == ['a']
    assert output_columns == ['a_b', 'a_c']
Esempio n. 8
0
def test_binarize_all_classes():
    """binarize(..., all_classes=False) leaves a_b as the only column."""
    values = ['b', 'c']
    df = pd.DataFrame(dict(a=values))

    data.binarize(df, ['a'], all_classes=False, inplace=True)

    expected = ['a_b']
    assert df.columns.tolist() == expected
Esempio n. 9
0
def test_binarize_drop():
    """binarize(..., drop=False) keeps column a alongside a_b and a_c."""
    values = ['b', 'c']
    df = pd.DataFrame(dict(a=values))

    data.binarize(df, ['a'], drop=False, inplace=True)

    expected = ['a', 'a_b', 'a_c']
    assert df.columns.tolist() == expected