Ejemplo n.º 1
0
def reasonable_feature_values(df, control):
    '''Return the observations in df that have "reasonable" feature values.

    One boolean mask is built per criterion; a True entry means "keep this
    observation".  The masks are keyed by a human-readable description and
    handed, together with df, to report_and_remove.

    Args:
        df: DataFrame of observations, addressed through the column names
            held by layout (defined outside this function).
        control: object whose max_sale_price attribute is the exclusive
            upper bound on the sale price.

    Returns:
        The result of report_and_remove(df, masks) -- presumably df limited
        to the rows that satisfy every mask; confirm against that helper.
    '''

    def below(percentile, series):
        'mask that is True where series is strictly below its own percentile'
        return series < series.quantile(percentile / 100.0)

    def positive(column):
        'mask that is True where df[column] is strictly greater than zero'
        return df[column] > 0

    # set mask value in keep to True to keep the observation
    keep = {}
    keep['assessment total > 0'] = positive(layout.assessment_total)
    keep['assessment land > 0'] = positive(layout.assessment_land)
    keep['assessment improvement > 0'] = positive(layout.assessment_improvement)
    keep['baths > 0'] = positive(layout.building_baths)
    keep['effective year built >= year built'] = (
        df[layout.year_built_effective] >= df[layout.year_built])
    keep['full price'] = layout.mask_full_price(df)
    keep['latitude known'] = layout.mask_gps_latitude_known(df)
    keep['longitude known'] = layout.mask_gps_longitude_known(df)
    keep['land size < 99th percentile'] = below(
        99, df[layout.lot_land_square_feet])
    keep['land size > 0'] = positive(layout.lot_land_square_feet)
    keep['living size < 99th percentile'] = below(
        99, df[layout.building_living_square_feet])
    keep['living square feet > 0'] = positive(
        layout.building_living_square_feet)
    keep['median household income > 0'] = positive(
        layout.census2000_median_household_income)
    keep['new or resale'] = layout.mask_new_or_resale(df)
    keep['one building'] = layout.mask_is_one_building(df)
    keep['one APN'] = layout.mask_is_one_parcel(df)
    # disabled criterion, kept for reference:
    # keep['recording date present'] = ~df[layout.recording_date + '_deed'].isnull()  # ~ => not
    keep['price > 0'] = positive(layout.price)
    keep['price < max'] = df[layout.price] < control.max_sale_price
    keep['rooms > 0'] = positive(layout.building_rooms)
    keep['resale or new construction'] = (
        layout.mask_is_new_construction(df) | layout.mask_is_resale(df))
    keep['sale date present'] = layout.mask_sale_date_present(df)
    keep['sale date valid'] = layout.mask_sale_date_valid(df)
    keep['stories > 0'] = positive(layout.building_stories)
    keep['units == 1'] = df[layout.n_units] == 1
    keep['year_built > 0'] = positive(layout.year_built)

    # single-argument call form prints the same text on Python 2 and 3
    print('effects of reasonable values')
    return report_and_remove(df, keep)
Ejemplo n.º 2
0
def reasonable_feature_values(df, control):
    '''Return the observations in df that have "reasonable" feature values.

    Builds a dictionary mapping a description of each criterion to a boolean
    mask over df (True means the observation is kept), then delegates the
    reporting and the row removal to report_and_remove.

    Args:
        df: DataFrame of observations; columns are looked up through the
            names held by layout (defined outside this function).
        control: object whose max_sale_price attribute is the exclusive
            upper bound on the sale price.

    Returns:
        The result of report_and_remove(df, masks) -- presumably df limited
        to the rows that satisfy every mask; confirm against that helper.
    '''
    def below(percentile, series):
        'mask that is True where series is strictly below its own percentile'
        cutoff = series.quantile(percentile / 100.0)
        return series < cutoff

    land_size = df[layout.lot_land_square_feet]
    living_size = df[layout.building_living_square_feet]
    price = df[layout.price]

    # set mask value to True to keep the observation
    masks = {
        'assessment total > 0': df[layout.assessment_total] > 0,
        'assessment land > 0': df[layout.assessment_land] > 0,
        'assessment improvement > 0': df[layout.assessment_improvement] > 0,
        'baths > 0': df[layout.building_baths] > 0,
        'effective year built >= year built':
            df[layout.year_built_effective] >= df[layout.year_built],
        'full price': layout.mask_full_price(df),
        'latitude known': layout.mask_gps_latitude_known(df),
        'longitude known': layout.mask_gps_longitude_known(df),
        'land size < 99th percentile': below(99, land_size),
        'land size > 0': land_size > 0,
        'living size < 99th percentile': below(99, living_size),
        'living square feet > 0': living_size > 0,
        'median household income > 0':
            df[layout.census2000_median_household_income] > 0,
        'new or resale': layout.mask_new_or_resale(df),
        'one building': layout.mask_is_one_building(df),
        'one APN': layout.mask_is_one_parcel(df),
        # disabled criterion, kept for reference:
        # 'recording date present': ~df[layout.recording_date + '_deed'].isnull(),  # ~ => not
        'price > 0': price > 0,
        'price < max': price < control.max_sale_price,
        'rooms > 0': df[layout.building_rooms] > 0,
        'resale or new construction': (
            layout.mask_is_new_construction(df) |
            layout.mask_is_resale(df)
        ),
        'sale date present': layout.mask_sale_date_present(df),
        'sale date valid': layout.mask_sale_date_valid(df),
        'stories > 0': df[layout.building_stories] > 0,
        'units == 1': df[layout.n_units] == 1,
        'year_built > 0': df[layout.year_built] > 0,
    }

    # single-argument call form prints the same text on Python 2 and 3
    print('effects of reasonable values')
    return report_and_remove(df, masks)
Ejemplo n.º 3
0
def add_features(df, control):
    '''Append derived feature columns to df, modifying it in place.

    The new columns are added in this order: sale-date columns (a Python
    date and a yyyymm number), ages at the sale date and their squares,
    boolean indicator columns, and "any_*" indicators built by OR-ing
    existing '<prefix>_has_<code>' columns for each area prefix.

    Args:
        df: DataFrame to extend.  Each new column is inserted after the
            current last column with DataFrame.insert, which by default
            rejects a column name that already exists.
        control: not used by this function.

    Returns:
        None; df itself is mutated.
    '''
    def split(date):
        'yyyymmdd number --> (year, month, day)'
        year = int(date / 10000)
        md = date - year * 10000
        month = int(md / 100)
        day = md - month * 100
        return year, month, day

    def python_date(date):
        'yyyymmdd --> datetime.date(year, month, day)'
        year, month, day = split(date)
        return datetime.date(int(year), int(month), int(day))

    def year_of(date):
        'yyyymmdd --> year as an int'
        return split(date)[0]

    def yyyymm(date):
        'yyyymmdd --> year * 100 + month'
        year, month, _ = split(date)
        return year * 100 + month

    def append_column(name, values):
        'insert values, aligned to the index of df, as the new last column'
        df.insert(
            len(df.columns),
            name,
            pd.Series(values, index=df.index),
        )

    # create sale-date related features
    sale_date = df[layout.sale_date]
    append_column(layout.sale_date_python, sale_date.apply(python_date))
    append_column(layout.yyyymm, sale_date.apply(yyyymm))

    # create age and similar fields
    # NOTE: these are ages at the sale date
    sale_year = sale_date.apply(year_of)
    age = sale_year - df[layout.year_built]
    effective_age = sale_year - df[layout.year_built_effective]
    append_column(layout.age, age)
    append_column(layout.age2, age * age)
    append_column(layout.age_effective, effective_age)
    append_column(layout.age_effective2, effective_age * effective_age)

    # create indicator features
    append_column(layout.is_new_construction,
                  layout.mask_is_new_construction(df))
    append_column(layout.is_resale, layout.mask_is_resale(df))
    append_column(layout.building_has_basement,
                  df[layout.building_basement_square_feet] > 0)
    append_column(layout.building_has_fireplace,
                  df[layout.building_fireplace_number] > 0)
    append_column(layout.has_parking, df[layout.parking_spaces] > 0)
    append_column(layout.has_pool, df[layout.pool_flag] == 'Y')

    # create additional indicators aggregating certain PROPN codes
    def create(new_column_base, ored_column_bases):
        'for each area prefix, add <prefix>_has_<new_column_base> as the OR of the listed columns'
        for prefix in ('census_tract', 'zip5'):
            stem = prefix + '_has_'
            mask = df[stem + ored_column_bases[0]]
            for base in ored_column_bases[1:]:
                mask = mask | df[stem + base]
            append_column(stem + new_column_base, mask)

    create('any_commercial', (
        'commercial',
        'commercial_condominium',
    ))
    create('any_industrial', (
        'industrial',
        'industrial_light',
        'industrial_heavy',
    ))
    # must come last: it reads the any_commercial and any_industrial columns
    # created by the two calls above
    create('any_non_residential', (
        'amusement',
        'any_commercial',
        'financial_institution',
        'hotel',
        'any_industrial',
        'medical',
        'office_building',
        'parking',
        'retail',
        'service',
        'transport',
        'utilities',
        'warehouse',
    ))
Ejemplo n.º 4
0
def add_features(df, control):
    '''Append derived feature columns to df, modifying it in place.

    Columns are added in this order: sale-date columns (a Python date and a
    yyyymm number), ages at the sale date and their squares, boolean
    indicator columns, and "any_*" indicators built by OR-ing existing
    '<prefix>_has_<code>' columns for each area prefix.

    Args:
        df: DataFrame to extend.  Each new column is inserted after the
            current last column with DataFrame.insert, which by default
            rejects a column name that already exists.
        control: not used by this function.

    Returns:
        None; df itself is mutated.
    '''
    def split(date):
        'yyyymmdd number --> (year, month, day)'
        year = int(date / 10000)
        md = date - year * 10000
        month = int(md / 100)
        day = md - month * 100
        return year, month, day

    def python_date(date):
        'yyyymmdd --> datetime.date(year, month, day)'
        year, month, day = split(date)
        return datetime.date(int(year), int(month), int(day))

    def year_of(date):
        'yyyymmdd --> year as an int'
        return split(date)[0]

    def yyyymm(date):
        'yyyymmdd --> year * 100 + month'
        year, month, _ = split(date)
        return year * 100 + month

    def append_column(name, values):
        'insert values, aligned to the index of df, as the new last column'
        df.insert(len(df.columns),
                  name,
                  pd.Series(values, index=df.index),
                  )

    # create sale-date related features
    sale_date = df[layout.sale_date]
    append_column(layout.sale_date_python, sale_date.apply(python_date))
    append_column(layout.yyyymm, sale_date.apply(yyyymm))

    # create age and similar fields
    # NOTE: these are ages at the sale date
    sale_year = sale_date.apply(year_of)
    age = sale_year - df[layout.year_built]
    effective_age = sale_year - df[layout.year_built_effective]
    append_column(layout.age, age)
    append_column(layout.age2, age * age)
    append_column(layout.age_effective, effective_age)
    append_column(layout.age_effective2, effective_age * effective_age)

    # create indicator features
    append_column(layout.is_new_construction, layout.mask_is_new_construction(df))
    append_column(layout.is_resale, layout.mask_is_resale(df))
    append_column(layout.building_has_basement, df[layout.building_basement_square_feet] > 0)
    append_column(layout.building_has_fireplace, df[layout.building_fireplace_number] > 0)
    append_column(layout.has_parking, df[layout.parking_spaces] > 0)
    append_column(layout.has_pool, df[layout.pool_flag] == 'Y')

    # create additional indicators aggregating certain PROPN codes
    def create(new_column_base, ored_column_bases):
        'for each area prefix, add <prefix>_has_<new_column_base> as the OR of the listed columns'
        for prefix in ('census_tract', 'zip5'):
            stem = prefix + '_has_'
            mask = df[stem + ored_column_bases[0]]
            for base in ored_column_bases[1:]:
                mask = mask | df[stem + base]
            append_column(stem + new_column_base, mask)

    create('any_commercial', ('commercial', 'commercial_condominium',))
    create('any_industrial', ('industrial', 'industrial_light', 'industrial_heavy',))
    # must come last: it reads the any_commercial and any_industrial columns
    # created by the two calls above
    create('any_non_residential', ('amusement', 'any_commercial', 'financial_institution', 'hotel',
                                   'any_industrial', 'medical', 'office_building', 'parking',
                                   'retail', 'service', 'transport', 'utilities', 'warehouse',))