def run(self, event):
    """Add a combined event/result code column and binarize the code columns.

    Args:
        event: object exposing ``event_code`` and ``res_code`` as attributes
            and supporting item assignment (presumably a pandas DataFrame;
            confirm against the caller).

    Returns:
        The same ``event`` object that was passed in.
    """
    # Combined code is '<event_code>_<res_code>', e.g. 'REINS_C'.
    combined_code = event.event_code + '_' + event.res_code
    event['event_res_code'] = combined_code

    # Binarize both code columns against the class lists defined outside
    # this method. The return value of binarize is discarded, so this relies
    # on binarize modifying ``event`` in place.
    classes_by_column = {
        'event_code': event_codes,
        'event_res_code': event_res_codes,
    }
    data.binarize(event, classes_by_column)

    return event
def run(self, acs, left, aux=None):
    """
    Assemble the feature matrix from ``left``, the aggregations and ACS.

    Args:
        - acs: ACS features, joined onto ``left`` after the aggregations.
        - left: frame providing the index columns plus ward, community
            area and address coordinates.
        - aux: auxiliary features; only used when ``self.address`` is
            falsy, in which case it is passed to ``add_aux_features`` and
            re-indexed in place.

    Returns:
        A dict with:
        - X: the feature matrix, containing all aggregation features, the
            ACS features and the binarized community area / ward columns,
            indexed by the index columns.
        - aux: (only when ``self.address`` is falsy) the auxiliary
            features used for selecting a training set, setting sample
            weights, and evaluation, indexed like X.
    """
    # Address-level runs are keyed by address; otherwise by kid and address.
    index_columns = (['address', 'date'] if self.address
                     else ['kid_id', 'address_id', 'date'])

    left_columns = [
        'ward_id',
        'community_area_id',
        'address_lat',
        'address_lng',
    ]
    selected = index_columns + left_columns
    left = left[selected]

    logging.info('Binarizing community area and ward')
    left = data.binarize(left, ['community_area_id', 'ward_id'],
                         astype=self.dtype)

    logging.info('Joining aggregations')
    frames = [join.get_result() for join in self.aggregation_joins]
    frames.append(acs)
    X = left.join(frames)

    # Drop the cached aggregation results so their memory can be freed.
    for join in self.aggregation_joins:
        del join._result

    if not self.address:
        logging.info('Adding auxillary features')
        add_aux_features(X, aux, self.dtype)

    X.set_index(index_columns, inplace=True)

    non_numeric = data.non_numeric_columns(X)
    if len(non_numeric) > 0:
        logging.warning('Non-numeric columns: %s' % non_numeric)

    if self.address:
        return {'X': X}

    # aux is mutated in place so that it shares X's index.
    aux.set_index(index_columns, inplace=True)
    return {'X': X, 'aux': aux}
#!/usr/bin/python
"""Load building permits, binarize the permit type and store the result.

Reads from ``input.building_permits`` and replaces the
``aux.building_permits`` table with the binarized frame.
"""
import pandas as pd

from drain import data
from drain import util

# Builds a single address string and a normalized permit type for every
# permit that has an issue date.
PERMITS_QUERY = (
    "select street_number || ' ' || street_direction || ' ' || "
    "street_name || ' ' || suffix as address, issue_date, "
    "lower(replace(substring(permit_type from 10), '/', ' ')) as permit_type "
    "from input.building_permits where issue_date is not null"
)

engine = util.create_engine()

# read tables from db
building_permits = pd.read_sql(PERMITS_QUERY, engine)

# Binarize permit_type over every distinct value seen in the data. The
# return value is discarded, so this relies on binarize modifying the
# frame in place.
permit_types = building_permits.permit_type.unique()
data.binarize(building_permits, {'permit_type': permit_types},
              all_classes=True)

# Write the result out, replacing any existing aux.building_permits table.
db = util.PgSQLDatabase(engine)
db.to_sql(frame=building_permits,
          name='building_permits',
          if_exists='replace',
          index=False,
          schema='aux')
def test_binarize_not_inplace():
    """With inplace=False the input keeps its columns; the result is binarized."""
    original = pd.DataFrame({'a': ['b', 'c']})

    binarized = data.binarize(original, ['a'], inplace=False)

    # The input frame must be left untouched.
    assert list(original.columns) == ['a']
    # The returned frame has one indicator column per class.
    assert list(binarized.columns) == ['a_b', 'a_c']
def test_binarize_all_classes():
    """With all_classes=False only 'a_b' is produced for classes 'b' and 'c'."""
    frame = pd.DataFrame({'a': ['b', 'c']})

    data.binarize(frame, ['a'], all_classes=False, inplace=True)

    # Class 'c' gets no indicator column.
    remaining = list(frame.columns)
    assert remaining == ['a_b']
def test_binarize_drop():
    """With drop=False the source column is kept next to its indicators."""
    frame = pd.DataFrame({'a': ['b', 'c']})

    data.binarize(frame, ['a'], drop=False, inplace=True)

    # Original column first, then one indicator column per class.
    expected = ['a', 'a_b', 'a_c']
    assert list(frame.columns) == expected
def test_binarize_not_inplace():
    """binarize(..., inplace=False) returns a new frame and spares the input."""
    source = pd.DataFrame({'a': ['b', 'c']})
    result = data.binarize(source, ['a'], inplace=False)

    source_columns = list(source.columns)
    result_columns = list(result.columns)

    # Input unchanged; output holds the indicator columns.
    assert source_columns == ['a']
    assert result_columns == ['a_b', 'a_c']
def test_binarize_all_classes():
    """all_classes=False leaves a single indicator column, 'a_b'."""
    values = ['b', 'c']
    table = pd.DataFrame({'a': values})

    data.binarize(table, ['a'], all_classes=False, inplace=True)

    # Only the indicator for 'b' is present afterwards.
    assert list(table.columns) == ['a_b']
def test_binarize_drop():
    """drop=False retains column 'a' in addition to 'a_b' and 'a_c'."""
    values = ['b', 'c']
    table = pd.DataFrame({'a': values})

    data.binarize(table, ['a'], drop=False, inplace=True)

    # The categorical column survives alongside its indicator columns.
    assert list(table.columns) == ['a', 'a_b', 'a_c']