def test_scale_to_targets_from_table_clip_int(df, target_col):
    """Scale with a single 'mean' target row that sets clip bounds and int_result.

    The targets table holds one row for ``target_col``: target_value 1000,
    metric 'mean', no filter (NaN), clip_low 400, clip_high 999.99 and
    int_result True.  The scaled frame must keep the input's columns, and
    the target column must equal the expected integer series below.
    """
    target_spec = {
        'column_name': [target_col],
        'target_value': [1000],
        'target_metric': ['mean'],
        'filters': [np.nan],
        'clip_low': [400],
        'clip_high': [999.99],
        'int_result': [True],
    }
    targets = pd.DataFrame(target_spec)

    result = scl.scale_to_targets_from_table(df, targets)

    # Scaling must not add, drop or reorder columns.
    pdt.assert_index_equal(result.columns, df.columns)

    expected = pd.Series(
        [400, 400, 545, 727, 909, 1000, 1000, 1000, 1000, 1000])
    pdt.assert_series_equal(result[target_col], expected)
def test_scale_to_targets_from_table_clip_int(df, target_col):
    """Verify table-driven scaling when clipping and integer results are requested.

    One target row is supplied for ``target_col`` (metric 'mean', value
    1000, filters NaN, clip_low 400, clip_high 999.99, int_result True).
    The test asserts that the output columns match the input columns and
    that the scaled column equals the hard-coded expected values.
    """
    column_values = [
        ('column_name', [target_col]),
        ('target_value', [1000]),
        ('target_metric', ['mean']),
        ('filters', [np.nan]),
        ('clip_low', [400]),
        ('clip_high', [999.99]),
        ('int_result', [True]),
    ]
    targets = pd.DataFrame(dict(column_values))
    result = scl.scale_to_targets_from_table(df, targets)

    # The column set of the frame is expected to be untouched by scaling.
    pdt.assert_index_equal(result.columns, df.columns)

    expected_values = [400, 400, 545, 727, 909, 1000, 1000, 1000, 1000, 1000]
    pdt.assert_series_equal(result[target_col], pd.Series(expected_values))
def test_scale_to_targets_from_table(df, target_col):
    """Scale one column toward two filtered 'sum' targets given as a table.

    Two target rows are supplied for ``target_col``: a sum of 100 under
    the filter ``geo_id == "a",filter_col < 106`` and a sum of 1000 under
    ``geo_id == "b"``.  Clip bounds and int_result are NaN in both rows.
    The output must keep the input's columns and the target column must
    match the expected values (dtype is not compared).
    """
    unset = [np.nan, np.nan]
    target_spec = {
        'column_name': [target_col, target_col],
        'target_value': [100, 1000],
        'target_metric': ['sum', 'sum'],
        'filters': ['geo_id == "a",filter_col < 106', 'geo_id == "b"'],
        'clip_low': list(unset),
        'clip_high': list(unset),
        'int_result': list(unset),
    }
    targets = pd.DataFrame(target_spec)

    result = scl.scale_to_targets_from_table(df, targets)

    # Scaling must not add, drop or reorder columns.
    pdt.assert_index_equal(result.columns, df.columns)

    expected = pd.Series([
        11.11111111, 66.66666667,
        33.33333333, 133.33333333,
        55.55555556, 200,
        7, 266.66666667,
        9, 333.33333333,
    ])
    pdt.assert_series_equal(result[target_col], expected, check_dtype=False)
def test_scale_to_targets_from_table(df, target_col):
    """Verify table-driven scaling with two 'sum' targets on filtered subsets.

    The targets table has two rows for ``target_col`` with target sums of
    100 and 1000 and the filter strings ``geo_id == "a",filter_col < 106``
    and ``geo_id == "b"``; clip_low, clip_high and int_result are NaN.
    The test asserts unchanged columns and compares the scaled column to
    hard-coded values with ``check_dtype=False``.
    """
    n_targets = 2
    targets = pd.DataFrame({
        'column_name': [target_col] * n_targets,
        'target_value': [100, 1000],
        'target_metric': ['sum'] * n_targets,
        'filters': ['geo_id == "a",filter_col < 106', 'geo_id == "b"'],
        'clip_low': [np.nan] * n_targets,
        'clip_high': [np.nan] * n_targets,
        'int_result': [np.nan] * n_targets,
    })
    result = scl.scale_to_targets_from_table(df, targets)

    # The column set of the frame is expected to be untouched by scaling.
    pdt.assert_index_equal(result.columns, df.columns)

    expected_values = [
        11.11111111, 66.66666667, 33.33333333, 133.33333333, 55.55555556,
        200, 7, 266.66666667, 9, 333.33333333,
    ]
    pdt.assert_series_equal(
        result[target_col], pd.Series(expected_values), check_dtype=False)
'int_result': [np.nan] * len(targetvalues) }) targets_non_residential_sqft = pd.DataFrame({ 'column_name': ['non_residential_sqft'] * len(targetunits), 'target_value': targetunits.targetnonressqft.values, 'target_metric': ['sum'] * len(targetunits), 'filters': ('(non_residential_sqft > 0) & (taz == ' + pd.Series(targetunits.index.values).astype('str')) + ')', 'clip_low': [np.nan] * len(targetunits), 'clip_high': [np.nan] * len(targetunits), 'int_result': [np.nan] * len(targetunits) }) buildings2 = scl.scale_to_targets_from_table(buildings2, targets_residential_year_built) targets_non_residential_sqft['taz'] = targetunits.index.values targets_non_residential_sqft = targets_non_residential_sqft.set_index('taz') targets_non_residential_sqft['existing_nrsqft'] = buildings2.groupby( 'taz').non_residential_sqft.sum() targets_non_residential_sqft.target_value[ targets_non_residential_sqft.target_value < targets_non_residential_sqft. existing_nrsqft] = targets_non_residential_sqft.existing_nrsqft[ targets_non_residential_sqft.target_value < targets_non_residential_sqft.existing_nrsqft] del targets_non_residential_sqft['existing_nrsqft'] buildings2 = scl.scale_to_targets_from_table(buildings2, targets_non_residential_sqft) print buildings[buildings.building_sqft == 0].res_type.value_counts()
'target_metric': ['mean']*len(targetvalues), 'filters': ('(residential_units > 0) & (taz == ' + pd.Series(targetvalues.index.values).astype('str')) + ')', 'clip_low': [np.nan]*len(targetvalues), 'clip_high': [np.nan]*len(targetvalues), 'int_result': [np.nan]*len(targetvalues)}) targets_non_residential_sqft = pd.DataFrame( {'column_name': ['non_residential_sqft']*len(targetunits), 'target_value': targetunits.targetnonressqft.values, 'target_metric': ['sum']*len(targetunits), 'filters': ('(non_residential_sqft > 0) & (taz == ' + pd.Series(targetunits.index.values).astype('str')) + ')', 'clip_low': [np.nan]*len(targetunits), 'clip_high': [np.nan]*len(targetunits), 'int_result': [np.nan]*len(targetunits)}) buildings2 = scl.scale_to_targets_from_table(buildings2, targets_residential_year_built) buildings2 = scl.scale_to_targets_from_table(buildings2, targets_non_residential_sqft) print buildings[buildings.building_sqft == 0].res_type.value_counts() print len(buildings2[(buildings2.building_sqft == 0) & (buildings2.res_type=='other')]) # Post scaling bound-checking buildings2.year_built[buildings2.year_built > year_built_upper_bound] = year_built_upper_bound buildings2.year_built[buildings2.year_built < year_built_lower_bound] = year_built_lower_bound # COMPARE WITH TARGETS targetunits['sf'] = buildings2[buildings2.res_type == 'single'].groupby('taz').residential_units.sum() targetunits['mf'] = buildings2[buildings2.res_type == 'multi'].groupby('taz').residential_units.sum() targetunits['nrsqft'] = buildings2.groupby('taz').non_residential_sqft.sum()
# --- Residential price imputation -------------------------------------------
# `results`, `resbuildings`, `buildings`, `targetvalues`, `scl` and `smf` are
# defined earlier in this script.
#
# Predict with the previously fitted model and exponentiate the prediction.
# NOTE(review): presumably the residential model was fit on a log-transformed
# price (as the nonresidential formula below is) -- confirm against the fit.
sim_data = results.predict(resbuildings)
sim_data = np.exp(sim_data)
sim_data = pd.Series(sim_data, index=resbuildings.index)

# Initialise the price columns as floats so that writing float predictions
# into them does not rely on an implicit int -> float upcast.
buildings['res_price'] = 0.0
buildings['res_price_per_sqft'] = 0.0
buildings.loc[sim_data.index, 'res_price'] = sim_data

# Now that the regression equation is applied, scale residential prices to
# match the zonal target: one 'mean' target per index value of `targetvalues`,
# restricted to buildings with residential units in that taz.
n_targets = len(targetvalues)
targets_residential_price = pd.DataFrame({
    'column_name': ['res_price'] * n_targets,
    'target_value': targetvalues.salepr2010_av.values,
    'target_metric': ['mean'] * n_targets,
    'filters': ('(residential_units > 0) & (taz == '
                + pd.Series(targetvalues.index.values).astype('str')) + ')',
    'clip_low': [np.nan] * n_targets,
    'clip_high': [np.nan] * n_targets,
    'int_result': [np.nan] * n_targets,
})
buildings = scl.scale_to_targets_from_table(buildings,
                                            targets_residential_price)

# Price per square foot, only where both price and unit size are positive.
# A single .loc assignment replaces the chained
# `buildings.res_price_per_sqft[mask] = ...`, which may write to a temporary
# object instead of `buildings`; `&` replaces `*` for combining the masks.
has_price_and_size = (buildings.res_price > 0) & (buildings.sqft_per_unit > 0)
buildings.loc[has_price_and_size, 'res_price_per_sqft'] = (
    buildings.loc[has_price_and_size, 'res_price']
    / buildings.loc[has_price_and_size, 'sqft_per_unit'])

# --- Nonresidential price imputation ----------------------------------------
# Estimation sample: 'other' (non-residential) buildings that have a CoStar
# property type longer than two characters and a usable CoStar rent value.
has_costar_type = buildings.costar_property_type.str.len() > 2
is_nonres = buildings.res_type == 'other'
has_costar_rent = ~buildings.costar_rent.isin(
    ['', '-', 'Negotiable', 'Withheld'])

# .copy() makes the subset an independent frame, so adding a column below
# does not raise SettingWithCopyWarning.
nonresprice_estimation_dataset = buildings[
    has_costar_type & is_nonres & has_costar_rent].copy()
nonresprice_estimation_dataset['observed_costar_rent'] = (
    nonresprice_estimation_dataset.costar_rent.astype('float'))

# Adjacent string literals are concatenated into the same single formula
# string as before; it is only split across lines for readability.
specification = (
    'np.log(observed_costar_rent) ~ non_residential_sqft + targetnonressqft'
    ' + I(development_type_id == "OF") + I(development_type_id == "RT")'
    ' + I(year_built < 1940) + I(year_built > 1990) + year_built'
    ' + mean_income + mean_hhsize + mean_hhchildren + mean_numvehicles'
    ' + mf_sf_ratio + resdensity + empdensity + nr_res_ratio'
    ' + yearbuilt_av + yearbuilt_sd + stories'
    ' + I(county_id == 1) + I(county_id == 13) + I(county_id == 41)'
    ' + I(county_id == 55) + I(county_id == 85) + I(county_id == 81)'
    ' + I(county_id == 95) + I(county_id == 97)'
    ' + e11_10 + e21_10 + e22_10 + e23_10 + e3133_10 + e42_10 + e4445_10'
    ' + e4849_10 + e51_10 + e52_10 + e53_10 + e54_10 + e55_10 + e56_10'
    ' + e61_10 + e62_10 + e71_10 + e72_10 + e81_10 + e92_10 + etot_10'
)
model = smf.ols(formula=specification, data=nonresprice_estimation_dataset)
results = model.fit()
# Call form with a single argument prints the same text on Python 2 and 3.
print(results.summary())

# Apply the fitted model to every 'other' building with non-residential
# floor space; the response was log(rent), so exponentiate the prediction.
nonresbuildings = buildings[(buildings.res_type == 'other')
                            & (buildings.non_residential_sqft > 0)]
sim_data = results.predict(nonresbuildings)
sim_data = np.exp(sim_data)