def reasonable_feature_values(df, control):
    """Filter df down to the samples whose feature values look "reasonable".

    One boolean mask is built per criterion (True means keep the
    observation), keyed by a human-readable description. The masks are
    handed to report_and_remove(df, masks), whose result is returned; the
    original documentation describes that result as a new DataFrame.

    control supplies max_sale_price, the exclusive upper bound on price.
    """
    def is_positive(column_name):
        'mask: value in the named column is > 0'
        return df[column_name] > 0

    def under_percentile(percentile, column_name):
        'mask: value in the named column is strictly below the given percentile'
        column = df[column_name]
        cutoff = column.quantile(percentile / 100.0)
        return column < cutoff

    # (description, keep-mask) pairs, in the order they are registered.
    # NOTE: a 'recording date present' criterion (non-null
    # layout.recording_date + '_deed') is intentionally disabled.
    criteria = [
        ('assessment total > 0', is_positive(layout.assessment_total)),
        ('assessment land > 0', is_positive(layout.assessment_land)),
        ('assessment improvement > 0',
         is_positive(layout.assessment_improvement)),
        ('baths > 0', is_positive(layout.building_baths)),
        ('effective year built >= year built',
         df[layout.year_built_effective] >= df[layout.year_built]),
        ('full price', layout.mask_full_price(df)),
        ('latitude known', layout.mask_gps_latitude_known(df)),
        ('longitude known', layout.mask_gps_longitude_known(df)),
        ('land size < 99th percentile',
         under_percentile(99, layout.lot_land_square_feet)),
        ('land size > 0', is_positive(layout.lot_land_square_feet)),
        ('living size < 99th percentile',
         under_percentile(99, layout.building_living_square_feet)),
        ('living square feet > 0',
         is_positive(layout.building_living_square_feet)),
        ('median household income > 0',
         is_positive(layout.census2000_median_household_income)),
        ('new or resale', layout.mask_new_or_resale(df)),
        ('one building', layout.mask_is_one_building(df)),
        ('one APN', layout.mask_is_one_parcel(df)),
        ('price > 0', is_positive(layout.price)),
        ('price < max', df[layout.price] < control.max_sale_price),
        ('rooms > 0', is_positive(layout.building_rooms)),
        ('resale or new construction',
         layout.mask_is_new_construction(df) | layout.mask_is_resale(df)),
        ('sale date present', layout.mask_sale_date_present(df)),
        ('sale date valid', layout.mask_sale_date_valid(df)),
        ('stories > 0', is_positive(layout.building_stories)),
        ('units == 1', df[layout.n_units] == 1),
        ('year_built > 0', is_positive(layout.year_built)),
    ]

    # Register the masks one at a time, in the listed order.
    masks = {}
    for description, keep in criteria:
        masks[description] = keep

    print('effects of reasonable values')
    return report_and_remove(df, masks)
def reasonable_feature_values(df, control):
    """Filter df down to the samples whose feature values look "reasonable".

    One boolean mask is built per criterion (True means keep the
    observation), keyed by a human-readable description. The masks are
    handed to report_and_remove(df, masks), whose result is returned; the
    original documentation describes that result as a new DataFrame.

    control supplies max_sale_price, the exclusive upper bound on price.
    """
    def is_positive(column_name):
        'mask: value in the named column is > 0'
        return df[column_name] > 0

    def under_percentile(percentile, column_name):
        'mask: value in the named column is strictly below the given percentile'
        column = df[column_name]
        cutoff = column.quantile(percentile / 100.0)
        return column < cutoff

    # (description, keep-mask) pairs, in the order they are registered.
    # NOTE: a 'recording date present' criterion (non-null
    # layout.recording_date + '_deed') is intentionally disabled.
    criteria = [
        ('assessment total > 0', is_positive(layout.assessment_total)),
        ('assessment land > 0', is_positive(layout.assessment_land)),
        ('assessment improvement > 0',
         is_positive(layout.assessment_improvement)),
        ('baths > 0', is_positive(layout.building_baths)),
        ('effective year built >= year built',
         df[layout.year_built_effective] >= df[layout.year_built]),
        ('full price', layout.mask_full_price(df)),
        ('latitude known', layout.mask_gps_latitude_known(df)),
        ('longitude known', layout.mask_gps_longitude_known(df)),
        ('land size < 99th percentile',
         under_percentile(99, layout.lot_land_square_feet)),
        ('land size > 0', is_positive(layout.lot_land_square_feet)),
        ('living size < 99th percentile',
         under_percentile(99, layout.building_living_square_feet)),
        ('living square feet > 0',
         is_positive(layout.building_living_square_feet)),
        ('median household income > 0',
         is_positive(layout.census2000_median_household_income)),
        ('new or resale', layout.mask_new_or_resale(df)),
        ('one building', layout.mask_is_one_building(df)),
        ('one APN', layout.mask_is_one_parcel(df)),
        ('price > 0', is_positive(layout.price)),
        ('price < max', df[layout.price] < control.max_sale_price),
        ('rooms > 0', is_positive(layout.building_rooms)),
        ('resale or new construction',
         layout.mask_is_new_construction(df) | layout.mask_is_resale(df)),
        ('sale date present', layout.mask_sale_date_present(df)),
        ('sale date valid', layout.mask_sale_date_valid(df)),
        ('stories > 0', is_positive(layout.building_stories)),
        ('units == 1', df[layout.n_units] == 1),
        ('year_built > 0', is_positive(layout.year_built)),
    ]

    # Register the masks one at a time, in the listed order.
    masks = {}
    for description, keep in criteria:
        masks[description] = keep

    print('effects of reasonable values')
    return report_and_remove(df, masks)
def add_features(df, control):
    """Append derived feature columns to df, in place.

    Columns are added on the right, in this order: the sale date as a
    datetime.date and as a yyyymm number; age and effective age at the sale
    date together with their squares; boolean indicator columns; and, for
    each of the 'census_tract' and 'zip5' prefixes, '<prefix>_has_any_*'
    columns that OR together existing '<prefix>_has_*' columns.

    df is modified through DataFrame.insert and nothing is returned.
    control is not used by this function.
    """
    def date_parts(yyyymmdd):
        'yyyymmdd --> (year, month, day)'
        yyyy = int(yyyymmdd / 10000)
        mmdd = yyyymmdd - yyyy * 10000
        mm = int(mmdd / 100)
        dd = mmdd - mm * 100
        return yyyy, mm, dd

    def as_python_date(yyyymmdd):
        'yyyymmdd --> datetime.date(year, month, day)'
        yyyy, mm, dd = date_parts(yyyymmdd)
        return datetime.date(int(yyyy), int(mm), int(dd))

    def as_year(yyyymmdd):
        'yyyymmdd --> year'
        return date_parts(yyyymmdd)[0]

    def as_yyyymm(yyyymmdd):
        'yyyymmdd --> year * 100 + month'
        yyyy, mm, _ = date_parts(yyyymmdd)
        return yyyy * 100 + mm

    def append_column(name, values):
        'insert values as the new right-most column of df, indexed by df.index'
        new_column = pd.Series(values, index=df.index)
        df.insert(len(df.columns), name, new_column)

    # sale-date related features
    sale_dates = df[layout.sale_date]
    append_column(layout.sale_date_python, sale_dates.apply(as_python_date))
    append_column(layout.yyyymm, sale_dates.apply(as_yyyymm))

    # age and similar fields
    # NOTE: these are ages at the sale date
    sale_years = sale_dates.apply(as_year)
    age = sale_years - df[layout.year_built]
    effective_age = sale_years - df[layout.year_built_effective]
    append_column(layout.age, age)
    append_column(layout.age2, age * age)
    append_column(layout.age_effective, effective_age)
    append_column(layout.age_effective2, effective_age * effective_age)

    # indicator features
    append_column(layout.is_new_construction,
                  layout.mask_is_new_construction(df))
    append_column(layout.is_resale, layout.mask_is_resale(df))
    append_column(layout.building_has_basement,
                  df[layout.building_basement_square_feet] > 0)
    append_column(layout.building_has_fireplace,
                  df[layout.building_fireplace_number] > 0)
    append_column(layout.has_parking, df[layout.parking_spaces] > 0)
    append_column(layout.has_pool, df[layout.pool_flag] == 'Y')

    # additional indicators aggregating certain PROPN codes
    def add_any_indicator(new_column_base, ored_column_bases):
        'for each prefix, OR the <prefix>_has_<base> columns into a new column'
        for prefix in ('census_tract', 'zip5'):
            stem = prefix + '_has_'
            combined = df[stem + ored_column_bases[0]]
            for base in ored_column_bases[1:]:
                combined = combined | df[stem + base]
            append_column(stem + new_column_base, combined)

    # Order matters: 'any_non_residential' reads the 'any_commercial' and
    # 'any_industrial' columns created by the two entries before it.
    aggregates = (
        ('any_commercial', (
            'commercial',
            'commercial_condominium',
        )),
        ('any_industrial', (
            'industrial',
            'industrial_light',
            'industrial_heavy',
        )),
        ('any_non_residential', (
            'amusement',
            'any_commercial',
            'financial_institution',
            'hotel',
            'any_industrial',
            'medical',
            'office_building',
            'parking',
            'retail',
            'service',
            'transport',
            'utilities',
            'warehouse',
        )),
    )
    for new_base, bases in aggregates:
        add_any_indicator(new_base, bases)
def add_features(df, control):
    """Append derived feature columns to df, in place.

    Columns are added on the right, in this order: the sale date as a
    datetime.date and as a yyyymm number; age and effective age at the sale
    date together with their squares; boolean indicator columns; and, for
    each of the 'census_tract' and 'zip5' prefixes, '<prefix>_has_any_*'
    columns that OR together existing '<prefix>_has_*' columns.

    df is modified through DataFrame.insert and nothing is returned.
    control is not used by this function.
    """
    def date_parts(yyyymmdd):
        'yyyymmdd --> (year, month, day)'
        yyyy = int(yyyymmdd / 10000)
        mmdd = yyyymmdd - yyyy * 10000
        mm = int(mmdd / 100)
        dd = mmdd - mm * 100
        return yyyy, mm, dd

    def as_python_date(yyyymmdd):
        'yyyymmdd --> datetime.date(year, month, day)'
        yyyy, mm, dd = date_parts(yyyymmdd)
        return datetime.date(int(yyyy), int(mm), int(dd))

    def as_year(yyyymmdd):
        'yyyymmdd --> year'
        return date_parts(yyyymmdd)[0]

    def as_yyyymm(yyyymmdd):
        'yyyymmdd --> year * 100 + month'
        yyyy, mm, _ = date_parts(yyyymmdd)
        return yyyy * 100 + mm

    def append_column(name, values):
        'insert values as the new right-most column of df, indexed by df.index'
        new_column = pd.Series(values, index=df.index)
        df.insert(len(df.columns), name, new_column)

    # sale-date related features
    sale_dates = df[layout.sale_date]
    append_column(layout.sale_date_python, sale_dates.apply(as_python_date))
    append_column(layout.yyyymm, sale_dates.apply(as_yyyymm))

    # age and similar fields
    # NOTE: these are ages at the sale date
    sale_years = sale_dates.apply(as_year)
    age = sale_years - df[layout.year_built]
    effective_age = sale_years - df[layout.year_built_effective]
    append_column(layout.age, age)
    append_column(layout.age2, age * age)
    append_column(layout.age_effective, effective_age)
    append_column(layout.age_effective2, effective_age * effective_age)

    # indicator features
    append_column(layout.is_new_construction,
                  layout.mask_is_new_construction(df))
    append_column(layout.is_resale, layout.mask_is_resale(df))
    append_column(layout.building_has_basement,
                  df[layout.building_basement_square_feet] > 0)
    append_column(layout.building_has_fireplace,
                  df[layout.building_fireplace_number] > 0)
    append_column(layout.has_parking, df[layout.parking_spaces] > 0)
    append_column(layout.has_pool, df[layout.pool_flag] == 'Y')

    # additional indicators aggregating certain PROPN codes
    def add_any_indicator(new_column_base, ored_column_bases):
        'for each prefix, OR the <prefix>_has_<base> columns into a new column'
        for prefix in ('census_tract', 'zip5'):
            stem = prefix + '_has_'
            combined = df[stem + ored_column_bases[0]]
            for base in ored_column_bases[1:]:
                combined = combined | df[stem + base]
            append_column(stem + new_column_base, combined)

    # Order matters: 'any_non_residential' reads the 'any_commercial' and
    # 'any_industrial' columns created by the two entries before it.
    aggregates = (
        ('any_commercial', (
            'commercial',
            'commercial_condominium',
        )),
        ('any_industrial', (
            'industrial',
            'industrial_light',
            'industrial_heavy',
        )),
        ('any_non_residential', (
            'amusement',
            'any_commercial',
            'financial_institution',
            'hotel',
            'any_industrial',
            'medical',
            'office_building',
            'parking',
            'retail',
            'service',
            'transport',
            'utilities',
            'warehouse',
        )),
    )
    for new_base, bases in aggregates:
        add_any_indicator(new_base, bases)