def export_ala(parcels_out):
    df = parcels_out.to_frame()

    # Cast "land_use_type_id" to string for compatibility with other counties.
    df['land_use_type_id'] = df.land_use_type_id.astype(str)

    assert df.index.is_unique
    assert not df.index.hasnans
    df_to_db(df, 'attributes_ala', schema=staging)
targetunits['sf'] = buildings2[buildings2.res_type == 'single'].groupby(
    'taz').residential_units.sum()
targetunits['mf'] = buildings2[buildings2.res_type == 'multi'].groupby(
    'taz').residential_units.sum()
targetunits['nrsqft'] = buildings2.groupby('taz').non_residential_sqft.sum()

print targetunits[['sf', 'targetSF', 'mf', 'targetMF',
                   'nrsqft', 'targetnonressqft']].head()
targetunits[['sf', 'targetSF', 'mf', 'targetMF',
             'nrsqft', 'targetnonressqft']].sum()

# summary_output_path = loader.get_path('out/regeneration/summaries/built_space_summary.csv')
# targetunits[['sf','targetSF','mf','targetMF', 'nrsqft', 'targetnonressqft']].to_csv(summary_output_path)

targetunits = targetunits[['sf', 'targetSF', 'mf', 'targetMF',
                           'nrsqft', 'targetnonressqft']]
df_to_db(targetunits, 'summary_built_space', schema=loader.tables.public)

# EXPORT BUILDINGS TO DB
print 'Loading processed buildings to db'
df_to_db(buildings2, 'buildings', schema=loader.tables.public)

## Create geom_id (serves a similar purpose to joinnumA) on parcels, based on
## an integer representation of each geometry's hash.
idx = []
geom_hashes = []
for i, geom in parcels.geom.iteritems():
    idx.append(i)
    md5_hash = str(hashlib.md5(geom).hexdigest())
    geom_hashes.append(int(md5_hash[0:11], 16))

parcel_identifier = pd.Series(geom_hashes, index=idx)
parcels['geom_id'] = parcel_identifier
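# A worked example of the geom_id derivation above, using a made-up WKB hex
# string rather than a real parcel geometry (illustration only). The first
# 11 hex digits of the MD5 digest form a 44-bit integer, so geom_id always
# fits in a signed 64-bit / Postgres BIGINT column.
example_wkb = '0101000020F0080000'  # hypothetical geometry bytes
example_geom_id = int(hashlib.md5(example_wkb).hexdigest()[0:11], 16)
print example_geom_id          # deterministic for a given geometry
print example_geom_id < 2**44  # True: always within BIGINT range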
def export_cnc(parcels_out):
    df = parcels_out.to_frame()
    assert df.index.is_unique
    assert not df.index.hasnans
    df.res_type[df.res_type.isnull()] = ''
    df_to_db(df, 'attributes_cnc', schema=staging)
probabilities = np.ones(len(alternative_ids))
num_resunits = len(alts)
print 'There are %s residential units in TAZ %s' % (num_resunits, taz)
choices = unit_choice(chooser_ids, alternative_ids, probabilities)
# households_urbansim.building_id[np.in1d(households_urbansim.index.values, chooser_ids)] = choices.values
hh.loc[chooser_ids, 'building_id'] = choices
if num_hh > num_resunits:
    print 'Warning: number of households exceeds number of resunits in TAZ %s' % taz

targetunits['hh_allocated'] = pd.merge(
    hh, buildings, left_on='building_id',
    right_index=True).groupby('taz_x').size()

df = targetunits[['targetunits', 'hh', 'hh_allocated']]
df['occupancy'] = df.hh * 1.0 / df.targetunits
print df.head()

# summary_output_path = loader.get_path('out/regeneration/summaries/hh_summary.csv')
df_to_db(df, 'summary_hh', schema=loader.tables.public)


################
#####JOBS#######
################

sector_columns = []
for col in targetunits.columns:
    if col.startswith('e') and col.endswith('_10'):
        sector_columns.append(col)
emp_targets = targetunits[sector_columns]
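# unit_choice is imported from elsewhere in this codebase and not shown here.
# Below is a minimal sketch of the assumed behavior (randomly matching
# choosers to units without replacement, weighted by the given
# probabilities); an assumption, not the actual implementation.
def unit_choice_sketch(chooser_ids, alternative_ids, probabilities):
    p = probabilities / probabilities.sum()
    n = min(len(chooser_ids), len(alternative_ids))
    chosen = np.random.choice(alternative_ids, size=n, replace=False, p=p)
    # Any choosers beyond the number of available units are left unmatched.
    return pd.Series(chosen, index=chooser_ids[:n])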
def export_son(parcels_out):
    df = parcels_out.to_frame()
    assert df.index.is_unique
    assert not df.index.hasnans
    df_to_db(df, 'attributes_son', schema=staging)
import pandas as pd, numpy as np
import pandas.io.sql as sql
from pandas.io.excel import read_excel

from spandex.io import exec_sql, df_to_db
from spandex import TableLoader

loader = TableLoader()

## Read Redfin CSV and load to database
redfin_csv_path = loader.get_path('built/bldg/homeprices/redfin_03feb14.csv')
redfin = pd.read_csv(redfin_csv_path)
redfin.index.name = 'idx'
df_to_db(redfin, 'redfin', schema=loader.tables.staging)

## Lat/long to point geometry, with the right SRID
exec_sql("ALTER TABLE staging.redfin ADD COLUMN geom geometry;")
exec_sql("UPDATE staging.redfin SET geom = ST_GeomFromText('POINT(' || longitude || ' ' || latitude || ')', 4326);")
exec_sql("CREATE INDEX redfin_gidx on staging.redfin using gist (geom);")
exec_sql("SELECT UpdateGeometrySRID('staging', 'redfin', 'geom', 2768);")
exec_sql("UPDATE staging.redfin SET geom = ST_TRANSFORM(ST_SetSRID(geom, 4326), 2768);")

## Append the unique parcel identifier to the Redfin records
exec_sql("alter table staging.redfin add gid integer default 0;")
exec_sql("update staging.redfin set gid = a.gid from parcels a where st_within(staging.redfin.geom, a.geom);")

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)
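# Quick sanity check on the spatial join above (a sketch, not part of the
# original script): gid defaults to 0, so any Redfin point that fell outside
# every parcel still has gid = 0.
unmatched = db_to_df("SELECT count(*) AS n FROM staging.redfin WHERE gid = 0;")
print '%s Redfin records did not match a parcel.' % unmatched.n.values[0]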
for grouper in ['county_id', 'taz']:
    df = tf[[grouper, 'non_residential_sqft', 'residential_units']]
    df.dropna(subset=[grouper], inplace=True)
    if grouper == 'taz':
        df[grouper] = df[grouper].astype(int)
    df['count'] = 1
    summary = df.groupby(grouper).sum()
    if grouper == 'taz':
        summary['residential_units_target'] = targetunits
        taz_df = summary
    output_tablename = 'summary_{}'.format(grouper)
    df_to_db(summary, output_tablename, schema=loader.tables.public)

parcel_output_dir = loader.get_path('out/regeneration/summaries/parcels')

config = load_config()
db_config = dict(config.items('database'))

if 'taz_id' in db_to_df(
        "SELECT column_name FROM information_schema.columns WHERE table_name='parcel'"
        ).column_name.values:
    exec_sql("ALTER TABLE parcel RENAME COLUMN taz_id to zone_id;")

if 'parcel_acres' in db_to_df(
        "SELECT column_name FROM information_schema.columns WHERE table_name='parcel'"
        ).column_name.values:
    exec_sql("ALTER TABLE parcel RENAME COLUMN parcel_acres to acres;")

## Export parcel shapefile to output directory
os.system('pgsql2shp -f "%s" -h %s -u %s -P %s %s parcel' %
          (parcel_output_dir, db_config['host'], db_config['user'],
           db_config['password'], db_config['database']))
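# A sketch of a safer alternative to the os.system call above: subprocess
# with an argument list runs the same pgsql2shp invocation without a shell,
# which avoids quoting problems with passwords and paths.
import subprocess

subprocess.check_call([
    'pgsql2shp', '-f', parcel_output_dir,
    '-h', db_config['host'],
    '-u', db_config['user'],
    '-P', db_config['password'],
    db_config['database'], 'parcel',
])

## Export buildings as csv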
random_allocate_agents(jobs, buildings, zone_id_col_name='block_id',
                       capacity_col_name='job_spaces')

# EXPORT DEMAND AGENTS TO DB
if 'block_id' in jobs.columns:
    del jobs['block_id']
if 'mgra_id' in households.columns:
    del households['mgra_id']

for df in [jobs, households]:
    df['building_id'][df['building_id'].isnull()] = -1
    for col in df.columns:
        df[col] = df[col].astype('int')

df_to_db(jobs, 'jobs', schema=loader.tables.public)
df_to_db(households, 'households', schema=loader.tables.public)

# Export formatted buildings to db
if 'luz_id' in buildings.columns:
    del buildings['luz_id']
if 'mgra_id' in buildings.columns:
    del buildings['mgra_id']
if 'block_id' in buildings.columns:
    del buildings['block_id']
if 'job_spaces' in buildings.columns:
    del buildings['job_spaces']
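# random_allocate_agents is defined elsewhere in this repo; the sketch below
# shows the behavior implied by the call above (an assumption, not the actual
# implementation): within each zone, agents are randomly assigned to building
# ids repeated by each building's capacity.
def random_allocate_agents_sketch(agents, buildings, zone_id_col_name,
                                  capacity_col_name):
    for zone, bldgs in buildings.groupby(zone_id_col_name):
        # One slot per unit of capacity, shuffled for random assignment.
        slots = np.repeat(bldgs.index.values,
                          bldgs[capacity_col_name].astype(int).values)
        np.random.shuffle(slots)
        agent_idx = agents.index[agents[zone_id_col_name] == zone]
        n = min(len(agent_idx), len(slots))
        agents.loc[agent_idx[:n], 'building_id'] = slots[:n]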
def export(parcels_out):
    schema = loader.tables.sample
    df_to_db(parcels_out.to_frame(), 'parcels_out', schema=schema)
def csv_to_staging(path, table_name):
    """Loads csv to staging schema on database."""
    df = pd.read_csv(path)
    df.index.name = 'idx'
    print 'Loading %s.' % table_name
    df_to_db(df, table_name, schema=loader.tables.staging)
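# Example usage of the helper above (the Redfin path from this repo's data
# setup, shown purely for illustration):
# csv_to_staging(loader.get_path('built/bldg/homeprices/redfin_03feb14.csv'),
#                'redfin')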
costar = pd.read_excel(costar_xls_path)

costar_solano_path = loader.get_path('built/bldg/costar/2011/costar__clean2011_sol_020315.csv')
costar_sol = pd.read_csv(costar_solano_path)

# Subset both CoStar extracts to the same columns, then standardize names.
costar_cols = ['PropertyID', 'Building Name', 'Latitude', 'Longitude',
               'Rentable Building Area', 'Year Built', 'PropertyType',
               'Secondary Type', 'Total Available Space (SF)',
               'Number Of Elevators', 'Last Sale Date', 'Last Sale Price',
               'Average Weighted Rent', 'Number Of Stories']
costar2 = costar[costar_cols]
costar_sol2 = costar_sol[costar_cols]

renamed_cols = ['propertyid', 'building_name', 'latitude', 'longitude',
                'rentable_area', 'year_built', 'property_type',
                'secondary_type', 'available_space', 'elevators',
                'last_sale_date', 'last_sale_price', 'rent', 'stories']
costar2.columns = renamed_cols
costar_sol2.columns = renamed_cols

costar2 = pd.concat([costar2, costar_sol2])

for text_col in ['building_name', 'property_type', 'secondary_type',
                 'last_sale_date']:
    costar2[text_col] = costar2[text_col].fillna(' ')
    costar2[text_col] = costar2[text_col].str.encode('utf-8')

costar2.last_sale_date = costar2.last_sale_date.fillna(' ')
costar2.last_sale_price = costar2.last_sale_price.str.replace(",", "").astype('float')
costar2.stories = costar2.stories.fillna(0).astype('int32')
costar2.index.name = 'idx'
df_to_db(costar2, 'costar', schema=loader.tables.staging)

## Lat/long to point geometry, with the right SRID
lat_long_to_point_geometry('costar', 'staging', 'longitude', 'latitude', 'geom', 2768)

## Append the unique parcel identifier to the Costar records
append_parcel_identifier('costar', 'staging', 'geom', 'gid')


######## *IMPUTE* ########

print 'Start point-based impute.'

## Load dataframes for the imputation
parcels = db_to_df('select gid, year_built, sqft_per_unit, non_residential_sqft, stories, imputation_flag from parcels;').set_index('gid')
costar = db_to_df('select * from staging.costar;')
redfin = db_to_df('select gid, yearbuilt, sqft, lastsalepr, saleyear, hometype from staging.redfin;')
gov_buildings = db_to_df('select * from staging.public_bldgs;')
gov_buildings.sqft = gov_buildings.sqft.str.replace(',', '').astype('int')
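# lat_long_to_point_geometry and append_parcel_identifier come from this
# project's shared utils and are not shown here. Below is a minimal sketch of
# what they are assumed to run, mirroring the inline SQL used for the Redfin
# table earlier (names suffixed _sketch to flag that these are guesses):
def lat_long_to_point_geometry_sketch(tbl, schema, x_col, y_col, geom_col, srid):
    exec_sql("ALTER TABLE %s.%s ADD COLUMN %s geometry;"
             % (schema, tbl, geom_col))
    exec_sql("UPDATE %s.%s SET %s = ST_Transform(ST_GeomFromText("
             "'POINT(' || %s || ' ' || %s || ')', 4326), %s);"
             % (schema, tbl, geom_col, x_col, y_col, srid))
    exec_sql("CREATE INDEX %s_gidx ON %s.%s USING gist (%s);"
             % (tbl, schema, tbl, geom_col))

def append_parcel_identifier_sketch(tbl, schema, geom_col, id_col):
    exec_sql("ALTER TABLE %s.%s ADD %s integer DEFAULT 0;"
             % (schema, tbl, id_col))
    exec_sql("UPDATE %s.%s t SET %s = a.gid FROM parcels a "
             "WHERE ST_Within(t.%s, a.geom);"
             % (schema, tbl, id_col, geom_col))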
# (Continuation of a loop over df.dtypes.iteritems(), where each item is a
# (column name, dtype) pair.)
col_name = col[0]
col_type = col[1]
firstval = df[col_name].loc[0]
if firstval in (True, False):
    if type(firstval) == bool:
        df[col_name] = df[col_name].fillna(False)
if col_type == np.int64:
    df[col_name] = df[col_name].fillna(0)
elif col_type == np.float64:
    df[col_name] = df[col_name].fillna(0.0)
elif col_type == np.object:
    df[col_name] = df[col_name].fillna(' ')

if 'id' in df.columns:
    new_id_colname = tbl + '_id'
    df = df.rename(columns={'id': new_id_colname})

df_to_db(df, tbl, schema=staging)

# Load excel
# xls_path = loader.get_path('scheduled/scheduled_development.xlsx')
# df_to_db(df, 'scheduled_development', schema = staging)

## MarketPointe multifamily rents (some munging needed to load)
xls_path = loader.get_path('price/marketPointe.xlsx')
df = pd.read_excel(xls_path)
df = df.rename(columns={'$/Sqft': 'price_per_sqft',
                        '#Bldg': 'number_of_buildings',
                        '#Stories': 'stories',
                        'ZipCode': 'zipcode'})
df.index.name = 'idx'

# Collapse story ranges to their upper bound.
df.stories[df.stories == '2-3'] = 3
df.stories[df.stories == '2-4'] = 4
df.stories[df.stories == '3-4'] = 4
# Install PostGIS and create staging schema.
loader = TableLoader()
with loader.database.cursor() as cur:
    cur.execute("""
        CREATE EXTENSION IF NOT EXISTS postgis;
        CREATE SCHEMA IF NOT EXISTS staging;
    """)
loader.database.refresh()

# Load shapefiles specified above to the project database.
loader.load_shp_map(shapefiles)

# Fix invalid geometries and reproject.
staging = loader.tables.staging
conform_srids(loader.srid, schema=staging, fix=True)

# Load county land use code mapping.
csv = loader.get_path('built/parcel/2010/rtp13_processing_notes/lucodes.csv')
df = pd.read_csv(csv, dtype=str)
df.dropna(how='any', inplace=True,
          subset=['county_id', 'land_use_type_id', 'development_type_id'])
df.index.name = 'index'
df_to_db(df, 'lucodes', schema=staging)

# Add county land use code mapping unique constraint.
exec_sql("""
ALTER TABLE staging.lucodes ADD CONSTRAINT lucodes_unique
UNIQUE (county_id, land_use_type_id);
""")
def export_sfr(parcels_out):
    df = parcels_out.to_frame()
    assert df.index.is_unique
    assert not df.index.hasnans
    df_to_db(df, 'attributes_sfr', schema=staging)
    # (Continuation of the targets_residential_price table construction.)
    'clip_high': [np.nan] * len(targetvalues),
    'int_result': [np.nan] * len(targetvalues)})

buildings = scl.scale_to_targets_from_table(buildings, targets_residential_price)
buildings.res_price_per_sqft[(buildings.res_price > 0) &
                             (buildings.sqft_per_unit > 0)] = \
    buildings.res_price / buildings.sqft_per_unit

# Nonresidential price imputation
nonresprice_estimation_dataset = buildings[
    (buildings.costar_property_type.str.len() > 2) &
    (buildings.res_type == 'other') &
    (~buildings.costar_rent.isin(['', '-', 'Negotiable', 'Withheld']))]
nonresprice_estimation_dataset['observed_costar_rent'] = \
    nonresprice_estimation_dataset.costar_rent.astype('float')

specification = (
    'np.log(observed_costar_rent) ~ non_residential_sqft + targetnonressqft'
    ' + I(development_type_id == "OF") + I(development_type_id == "RT")'
    ' + I(year_built < 1940) + I(year_built > 1990) + year_built'
    ' + mean_income + mean_hhsize + mean_hhchildren + mean_numvehicles'
    ' + mf_sf_ratio + resdensity + empdensity + nr_res_ratio'
    ' + yearbuilt_av + yearbuilt_sd + stories'
    ' + I(county_id == 1) + I(county_id == 13) + I(county_id == 41)'
    ' + I(county_id == 55) + I(county_id == 85) + I(county_id == 81)'
    ' + I(county_id == 95) + I(county_id == 97)'
    ' + e11_10 + e21_10 + e22_10 + e23_10 + e3133_10 + e42_10 + e4445_10'
    ' + e4849_10 + e51_10 + e52_10 + e53_10 + e54_10 + e55_10 + e56_10'
    ' + e61_10 + e62_10 + e71_10 + e72_10 + e81_10 + e92_10 + etot_10')

model = smf.ols(formula=specification, data=nonresprice_estimation_dataset)
results = model.fit()
print results.summary()

nonresbuildings = buildings[(buildings.res_type == 'other') &
                            (buildings.non_residential_sqft > 0)]
sim_data = results.predict(nonresbuildings)
sim_data = np.exp(sim_data)  # back-transform from log space
sim_data = pd.Series(sim_data, index=nonresbuildings.index)

buildings['nonres_rent_per_sqft'] = 0
buildings.loc[sim_data.index, 'nonres_rent_per_sqft'] = sim_data

# summary_output_path = loader.get_path('out/regeneration/summaries/price_summary.csv')
price_summary = pd.DataFrame({
    'avg_nonres_rent_per_sqft':
        buildings[buildings.nonres_rent_per_sqft > 0].groupby('taz').nonres_rent_per_sqft.mean(),
    'avg_res_price_per_sqft':
        buildings[buildings.res_price_per_sqft > 0].groupby('taz').res_price_per_sqft.mean()})
df_to_db(price_summary, 'summary_price', schema=loader.tables.public)

## Now export back to the database
buildings2 = buildings[['parcel_id', 'county_id', 'land_use_type_id',
                        'res_type', 'improvement_value', 'year_assessed',
                        'year_built', 'building_sqft', 'non_residential_sqft',
                        'residential_units', 'sqft_per_unit', 'stories',
                        'development_type_id', 'taz', 'redfin_sale_price',
                        'redfin_sale_year', 'redfin_home_type',
                        'costar_property_type', 'costar_rent',
                        'nonres_rent_per_sqft', 'res_price_per_sqft']]
df_to_db(buildings2, 'buildings', schema=loader.tables.public)
def export_smt(parcels_out):
    df = parcels_out.to_frame()
    assert df.index.is_unique
    assert not df.index.hasnans
    df.res_type[df.res_type.isnull()] = ''
    df_to_db(df, 'attributes_smt', schema=staging)
# Post-scaling bound checking
buildings2.year_built[buildings2.year_built > year_built_upper_bound] = year_built_upper_bound
buildings2.year_built[buildings2.year_built < year_built_lower_bound] = year_built_lower_bound
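# A sketch of an equivalent, more idiomatic alternative to the two masked
# assignments above: pandas' clip applies the same upper and lower bounds.
buildings2['year_built'] = buildings2.year_built.clip(
    lower=year_built_lower_bound, upper=year_built_upper_bound)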