def get_aggregates(self, date, data):
    aggregates = [
        Count('permit_type_%s' % p, prop=True) for p in PERMIT_TYPES
    ]
    aggregates.append(Count())
    return aggregates
def aggregates(self):
    return [
        Count(),
        Count('Arrest'),
        Count(lambda c: c['Primary Type'] == 'THEFT', 'theft', prop=True),
    ]
def get_aggregates(self, date, delta):
    aggregates = [
        Count(),
        Aggregate('inspected', 'max', fname=False),
        Aggregate('complied', 'max', fname=False),
        Count('hazard_int', prop=True),
        Count('hazard_ext', prop=True),
        Count('hazard', prop=True),
        Count('hazard_both', prop=True),
        Count('inspected'),
        Count('complied', prop=True),
        Aggregate('inspection_to_compliance', ['mean', 'min', 'max']),
        Aggregate(lambda i: (date - i.init_date) / day,
                  ['mean', 'min', 'max'], name='from_inspection'),
        Aggregate(lambda i: (date - i.comply_date) / day,
                  ['mean', 'min', 'max'], name='from_compliance'),
    ]
    aggregates.extend([
        Count(lambda i, c=c: i['closure'] == c, name='closure_%s' % c, prop=True)
        for c in CLOSURE_CODES
    ])
    return aggregates
def get_aggregates(self, date, delta):
    kid_count = Aggregate('kid_id', 'nunique', name='kid_count', fname=False)
    aggregates = [
        Count(),
        Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),
        Aggregate(lambda t: t.bll.where(t.increase),
                  ['mean', 'median', 'max', 'min', 'std'], 'increase_bll'),
        Count(lambda t: t.bll <= 2, 'bll2', prop=True),
        # prevalences
        Fraction(Count(['first_bll6', 'first_bll10']), kid_count,
                 include_numerator=True, include_denominator=True),
    ]

    # incidences
    if delta != 'all':
        start_date = date - data.parse_delta(delta)
        no_bll6_count = Aggregate(
            lambda k: k.kid_id.where(
                (k.first_bll6_sample_date >= start_date).fillna(True)),
            'nunique', name='no_bll6_count', fname=False)
        no_bll10_count = Aggregate(
            lambda k: k.kid_id.where(
                (k.first_bll10_sample_date >= start_date).fillna(True)),
            'nunique', name='no_bll10_count', fname=False)
        aggregates.extend([
            no_bll6_count, no_bll10_count,
            Count('first_bll6') / no_bll6_count,
            Count('first_bll10') / no_bll10_count
        ])

    if delta == 'all':
        aggregates.extend([
            Aggregate(days('date', date), ['min', 'max'], 'days_since_test'),
            Aggregate([
                lambda t: (date - t.date.where(t.bll >= 6)) / day,
                lambda t: (date - t.date.where(t.bll >= 10)) / day
            ], ['min', 'max'], ['days_since_bll6', 'days_since_bll10'])
        ])

    return aggregates
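# The aggregates above rely on a module-level `day` constant and a
# `days(start, end)` helper that are assumed to be defined elsewhere in this
# package (they are not shown in this section). A minimal sketch of the
# assumed behavior: `day` is one day as a timedelta, and `days` returns a
# callable giving the elapsed days between two date columns, or between a
# column and a fixed date.
import numpy as np

day = np.timedelta64(1, 'D')  # assumed: one day, used to convert timedeltas to floats

def days(start, end):
    """Assumed helper: elapsed days from `start` to `end`, where each argument
    is either a column name or a fixed date/Timestamp."""
    def f(df):
        s = df[start] if isinstance(start, str) else start
        e = df[end] if isinstance(end, str) else end
        return (e - s) / day
    return f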
def get_aggregates(self, date, data):
    aggregates = [
        Count(),
        Count(KEYWORDS, prop=True),
        Count(STATUS[1], prop=True),
        Count([
            lambda v, k=k, s=s: v[k] & v[s]
            for k, s in product(KEYWORDS, STATUS[1])
        ], prop=True,
            name=['%s_%s' % p for p in product(KEYWORDS, STATUS[1])])
    ]
    return aggregates
def get_aggregates(self, date, delta):
    prenatal = self.inputs[0].get_result()
    aggregates = [
        Count(),
        Aggregate(days('visit_d', 'date_of_birth'), ['min', 'max'], 'visit'),
        Aggregate(list(select_regexes(prenatal, ['service_.*'])),
                  'sum', fname=False),
        Aggregate('preg_nbr_n', 'max', 'previous_pregnancies', fname=False),
        Aggregate('lv_brth_n', 'max', 'previous_births', fname=False),
        Aggregate('othr_trm_n', 'max', 'previous_terminations', fname=False),
        Aggregate(lambda p: p.smk3_mth_f == 'Y', 'any', 'smoked_3mo', fname=False),
        Aggregate('cig3_day_n', 'max', 'cigarettes_per_day', fname=False),
        Aggregate(lambda p: p.drk3_mth_f == 'Y', 'any', 'drank_3mo', fname=False),
        Aggregate('dr_dy_wk_n', 'max', 'days_drank_per_week', fname=False),
        Aggregate('drnk_day_n', 'max', 'drinks_per_day', fname=False),
    ]
    return aggregates
def aggregates(self):
    return [
        Count(),
        Aggregate('count', 'mean', 'assessments'),
        Aggregate(lambda a: a.land_value / 100000, 'mean', name='land_value'),
        Aggregate(['min_age', 'max_age'], ['min', 'mean', 'max']),
        # residential total value and average value
        Fraction(
            Aggregate(lambda a: a.total_value.where(a.residential > 0) / 100000,
                      'sum', 'residential_total_value', fname=False),
            Aggregate(lambda a: a.units.where(a.residential > 0),
                      'sum', name='residential_units', fname=False),
            include_numerator=True, include_denominator=True
        ),
        # non-residential total and average value
        Fraction(
            Aggregate(lambda a: a.total_value.where(a.residential == 0) / 100000,
                      'sum', 'non_residential_total_value', fname=False),
            Aggregate(lambda a: a.units.where(a.residential == 0),
                      'sum', name='non_residential_units', fname=False),
            include_numerator=True, include_denominator=True
        ),
        Aggregate('apartments', 'mean'),
        Aggregate('units', 'mean'),
        Aggregate(lambda a: a.rooms / a.units, 'mean', name='rooms_per_unit'),
        Aggregate(lambda a: a.beds / a.units, 'mean', name='beds_per_unit'),
        Aggregate(lambda a: a.baths / a.units, 'mean', name='baths_per_unit'),
        Proportion(lambda a: a.owner_occupied > 0, name='owner_occupied'),
        Proportion([lambda a, c=c: a[c] > 0 for c in CLASSES], name=CLASSES)
    ]
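# Illustration only: the first Fraction above is assumed to yield the total
# residential value (in $100k units), the residential unit count, and their
# ratio (average value per residential unit). The same arithmetic in plain
# pandas, with a hypothetical `address_id` grouping key, would look roughly
# like this:
def residential_value_per_unit(assessments):
    """Average residential value per unit, per address (hypothetical sketch)."""
    res = assessments[assessments['residential'] > 0]
    grouped = res.groupby('address_id')
    total_value = grouped['total_value'].sum() / 100000  # $100k units
    units = grouped['units'].sum()
    return total_value / units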
def aggregates(self):
    return [
        Count(),
        Aggregate('area', 'sum'),
        Aggregate(lambda b: b.area * b.stories, 'mean', 'volume'),
        Aggregate('years_built', [
            lambda y: np.nanmedian(np.concatenate(y.values)),
            lambda y: np.nanmean(np.concatenate(y.values)),
            lambda y: np.nanmin(np.concatenate(y.values)),
            lambda y: np.nanmax(np.concatenate(y.values)),
        ], fname=['median', 'mean', 'min', 'max']),
        Aggregate('address_count', 'sum'),
        # average proportion of sound building condition
        Proportion(['%s_prop' % c for c in CONDITIONS], 'condition_not_null',
                   name=CONDITIONS),
        Aggregate([lambda p, c=c: p['%s_prop' % c] > 0 for c in CONDITIONS],
                  'any', name=CONDITIONS),
        Aggregate('stories', 'mean'),
        Aggregate('units', 'sum'),
        Proportion('pre1978_prop', lambda i: i.pre1978_prop.notnull(),
                   denom_name='pre1978_not_null'),
    ]
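# Note on the 'years_built' aggregate above: each input row is assumed to hold
# a list of construction years (produced by an Aggregate('year_built',
# lambda l: list(l), ...) step elsewhere in this section), so each group is a
# Series of lists; np.concatenate flattens them before the nan-aware
# statistics are taken. A tiny standalone check of that flattening:
import numpy as np
import pandas as pd

years_built = pd.Series([[1905, 1950], [np.nan, 1978]])  # hypothetical input
flat = np.concatenate(years_built.values)                # [1905., 1950., nan, 1978.]
assert np.nanmin(flat) == 1905 and np.nanmax(flat) == 1978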
def get_aggregates(self, date, delta):
    aggregates = [
        Count(),
        Aggregate(days('visit_d', 'date_of_birth'), ['min', 'max'], 'visit'),
        Aggregate('serv_typ_c', lambda s: set(s), 'service', fname=False),
        Aggregate('preg_nbr_n', 'max', 'previous_pregnancies', fname=False),
        Aggregate('lv_brth_n', 'max', 'previous_births', fname=False),
        Aggregate('othr_trm_n', 'max', 'previous_terminations', fname=False),
        Aggregate(lambda p: p.smk3_mth_f == 'Y', 'any', 'smoked_3mo', fname=False),
        Aggregate('cig3_day_n', 'max', 'cigarettes_per_day', fname=False),
        Aggregate(lambda p: p.drk3_mth_f == 'Y', 'any', 'drank_3mo', fname=False),
        Aggregate('dr_dy_wk_n', 'max', 'days_drank_per_week', fname=False),
        Aggregate('drnk_day_n', 'max', 'drinks_per_day', fname=False),
        Aggregate('clinicid_i', lambda c: set(c), 'clinic', fname=False)
    ]
    return aggregates
def get_aggregates(self, date, delta):
    kid_count = Aggregate('kid_id', 'nunique', name='kid_count', fname=False)
    aggregates = [
        Count(),
        Aggregate('bll', ['mean', 'median', 'max', 'min', 'std']),
        Count(lambda t: t.bll <= 2, 'bll2', prop=True),
        Fraction(Count(['first_bll6', 'first_bll10']), kid_count,
                 include_numerator=True, include_denominator=True),
    ]

    if delta == 'all':
        aggregates.extend([
            Aggregate(days('date', date), ['min', 'max'], 'days_since_test'),
            Aggregate([
                lambda t: (date - t.date.where(t.bll >= 6)) / day,
                lambda t: (date - t.date.where(t.bll >= 10)) / day
            ], ['min', 'max'], ['days_since_bll6', 'days_since_bll10'])
        ])

    return aggregates
def get_aggregates(self, date, index, delta):
    if index == 'kid':
        return [
            Aggregate(['test_address_count', 'address_count', 'test_count'],
                      'max', fname=False),
            Aggregate(['max_bll'], 'max', fname=False),
            # Comment out this and all other wic aggregates because they can't
            # be lagged and they're not useful for predicting poisoning
            #Aggregate(lambda k: k.last_wic_date == k.address_wic_max_date,
            #          'any', 'last_wic_address', fname=False),
            #Aggregate(['address_wic_mother', 'address_wic_infant'], 'any',
            #          fname=False),
            #Aggregate([days('address_wic_max_date', date),
            #           days('address_wic_min_date', date),
            #           days('last_wic_date', date),
            #           days('first_wic_date', date)],
            #          ['max'], ['address_wic_min_date', 'address_wic_max_date',
            #                    'last_wic_date', 'first_wic_date'], fname=False)
        ]

    sample_2y = lambda k: ((k.last_sample_date - k.date_of_birth) / day
                           > 365 * 2) | (k.max_bll >= 6)
    counts = Count([np.float32(1), sample_2y], ['kid', 'kid_sample_2y'])

    aggregates = [
        counts,
        Aggregate(['test_address_count', 'test_count', 'address_count'],
                  ['median', 'mean', 'min', 'max']),
        Count([
            lambda k: k.address_test_min_date.notnull(),
            lambda k: k.first_sample_date.notnull()
        ], prop=True, name=['tested_here', 'tested_ever']),
        #Count(lambda k: k.first_wic_date.notnull(), prop=True, name='wic'),
        #Count([lambda k: k.address_wic_min_date.notnull() & k.address_test_min_date.notnull(),
        #       lambda k: k.address_wic_min_date.notnull() & k.first_sample_date.notnull()],
        #      name=['wic_tested_here', 'wic_tested_ever'],
        #      prop=lambda k: k.first_wic_date.notnull(), prop_name='wic'),
        Aggregate([
            days('address_min_date', 'address_max_date'),
            #days('address_wic_min_date', 'address_wic_max_date'),
            days('address_test_min_date', 'address_test_max_date')
        ], ['mean'], [
            'address_total_time',
            #'address_wic_time',
            'address_test_time'
        ]),
        # the first of these are kid level, not address-kid level; that means
        # kids get double counted when aggregated above the address level if
        # they lived in multiple addresses within, e.g., the same census
        # tract. oh well.
        Aggregate([
            'max_bll', 'avg_bll', 'cumulative_bll', 'avg_cumulative_bll',
            'mean_bll', 'address_max_bll', 'address_mean_bll'
        ], ['mean', 'median', 'min', 'max']),
        # ebll past, present, future, ever: count the number of kids who moved
        # into this address in the period defined by date and delta and who
        # were poisoned before, during, after, or ever relative to their time
        # living there
        Fraction(Count([
            lambda k: k.first_bll6_sample_date.notnull(),
            lambda k: k.first_bll10_sample_date.notnull()
        ], ['bll6_ever', 'bll10_ever']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date > k.address_max_date,
            lambda k: k.first_bll10_sample_date > k.address_max_date
        ], ['bll6_future', 'bll10_future']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date < k.address_min_date,
            lambda k: k.first_bll10_sample_date < k.address_min_date
        ], ['bll6_past', 'bll10_past']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date.between(
                k.address_min_date, k.address_max_date),
            lambda k: k.first_bll10_sample_date.between(
                k.address_min_date, k.address_max_date)
        ], ['bll6_present', 'bll10_present']), counts, include_numerator=True),
        Aggregate('last_name', 'nunique', fname='count', astype=str)
        # TODO: min_last_sample_age cutoffs
    ]

    if delta == 'all':
        aggregates.extend([
            #Aggregate(days('address_wic_min_date', date), ['min', 'max'],
            #          'days_since_wic'),
            Aggregate(days('date_of_birth', date), ['min', 'max', 'mean'],
                      'date_of_birth'),
        ])

    return aggregates
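# Illustration only: the Fraction(Count(...), counts, include_numerator=True)
# pattern above is assumed to produce, per group, the count of kids matching a
# condition alongside its share of the kid count. A self-contained pandas
# sketch of the "ever poisoned" case, with hypothetical column names:
import pandas as pd

def bll6_ever_share(kids):
    """Per-address count and share of kids ever poisoned (BLL >= 6)."""
    grouped = kids.groupby('address_id')
    bll6_ever = grouped['first_bll6_sample_date'].apply(lambda s: s.notnull().sum())
    kid_count = grouped.size()
    return pd.DataFrame({
        'bll6_ever_count': bll6_ever,
        'kid_count': kid_count,
        'bll6_ever_per_kid': bll6_ever / kid_count,
    })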
def get_aggregates(self, date, delta):
    return [
        Count(),
        Count(['event_code_' + e for e in event_codes], prop=True),
        Count(['event_res_code_' + e for e in event_res_codes], prop=True)
    ]
def get_aggregates(self, date, index, delta):
    if index == 'kid':
        return [
            Aggregate(['address_count', 'test_count'], 'max', fname=False),
            Aggregate(['max_bll'], 'max', fname=False),
            Aggregate(lambda k: k.last_wic_date == k.address_wic_max_date,
                      'any', 'last_wic_address', fname=False),
            Aggregate(['address_wic_mother', 'address_wic_infant'], 'any',
                      fname=False),
            Aggregate([
                days('address_wic_max_date', date),
                days('address_wic_min_date', date),
                days('last_wic_date', date),
                days('first_wic_date', date)
            ], ['max'], [
                'address_wic_min_date', 'address_wic_max_date',
                'last_wic_date', 'first_wic_date'
            ], fname=False)
        ]

    sample_2y = lambda k: ((k.last_sample_date - k.date_of_birth) / day
                           > 365 * 2) | (k.max_bll >= 6)
    counts = Count([np.float32(1), sample_2y], ['kid', 'kid_sample_2y'])

    aggregates = [
        counts,
        Aggregate(['address_count', 'test_count'],
                  ['median', 'mean', 'min', 'max']),
        Count([
            lambda k: k.address_test_min_date.notnull(),
            lambda k: k.first_sample_date.notnull(),
            lambda k: k.first_wic_date.notnull()
        ], prop=True, name=['tested_here', 'tested_ever', 'wic']),
        Count([
            lambda k: k.address_wic_min_date.notnull() &
                      k.address_test_min_date.notnull(),
            lambda k: k.address_wic_min_date.notnull() &
                      k.first_sample_date.notnull()
        ], name=['wic_tested_here', 'wic_tested_ever'],
            parent=lambda k: k.first_wic_date.notnull()),
        Aggregate([
            days('address_min_date', 'address_max_date'),
            days('address_wic_min_date', 'address_wic_max_date'),
            days('address_test_min_date', 'address_test_max_date')
        ], ['mean'], [
            'address_total_time', 'address_wic_time', 'address_test_time'
        ]),
        Aggregate(['max_bll', 'mean_bll', 'address_max_bll', 'address_mean_bll'],
                  ['mean', 'median', 'min', 'max']),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date.notnull(),
            lambda k: k.first_bll10_sample_date.notnull()
        ], ['bll6_ever', 'bll10_ever']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date > k.address_max_date,
            lambda k: k.first_bll10_sample_date > k.address_max_date
        ], ['bll6_future', 'bll10_future']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date < k.address_min_date,
            lambda k: k.first_bll10_sample_date < k.address_min_date
        ], ['bll6_past', 'bll10_past']), counts, include_numerator=True),
        Fraction(Count([
            lambda k: k.first_bll6_sample_date.between(
                k.address_min_date, k.address_max_date),
            lambda k: k.first_bll10_sample_date.between(
                k.address_min_date, k.address_max_date)
        ], ['bll6_present', 'bll10_present']), counts, include_numerator=True),
        Aggregate('last_name', 'nunique', fname='count', astype=str)
        # TODO: min_last_sample_age cutoffs
    ]

    if delta == 'all':
        aggregates.extend([
            Aggregate(days('address_wic_min_date', date), ['min', 'max'],
                      'days_since_wic'),
            Aggregate(days('date_of_birth', date), ['min', 'max', 'mean'],
                      'date_of_birth'),
        ])

    return aggregates
def get_aggregates(self, date, delta):
    return [
        Count(),
        Count('Arrest'),
        Count(lambda c: c['Primary Type'] == 'THEFT', 'theft', prop=True)
    ]
aggregates = [
    Aggregate('area', 'mean', fname=False),
    Aggregate('year_built', lambda l: list(l), name='years_built', fname=False),
    Aggregate(lambda b: (b.t_add1 - b.f_add1) / 2 + 1, 'max',
              name='address_count', fname=False),
    Aggregate('bldg_condi_not_null', 'any', name='condition_not_null',
              fname=False),
    Aggregate('stories', 'mean', fname=False),
    Aggregate('units', 'mean', fname=False),
    Fraction(Count(lambda b: b.year_built < 1978),
             Count(lambda b: b.year_built.notnull()), name='pre1978_prop'),
    Fraction(cond, Count(lambda b: b.bldg_condi.notnull()),
             name='{numerator}_prop')
]

engine = util.create_engine()

# read tables from db
building_components = pd.read_sql(
    'select * from buildings.building_components', engine)
buildings = pd.read_sql(
    """
    select ogc_fid id,