## Load parcels as dataframes for the imputation
parcels = db_to_df('select * from parcels;')
parcels = parcels.set_index('gid')

# Standardize the res_type field
parcels.res_type[parcels.res_type.isnull()] = 'other'
parcels.res_type[parcels.res_type == ''] = 'other'
parcels.res_type[np.in1d(
    parcels.res_type,
    ['FLATS', 'APTS', 'CONDO', 'SRO', 'LIVEWORK', 'mixed'])] = 'multi'
parcels.res_type[parcels.res_type == 'SINGLE'] = 'single'

# Load TAZ residential unit control totals and other zonal targets.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')

taz_controls_csv2 = loader.get_path('hh/tazsumm_redfin.csv')
targetvalues = pd.read_csv(taz_controls_csv2, index_col='taz')

nonres_sqft_zone = pd.DataFrame({
    'observed': parcels.groupby('taz').non_residential_sqft.sum(),
    'target': targetunits.targetnonressqft})

# For all employment points, translate to nonres-sqft by multiplying by 250.
# Filter out synthetic job-based buildings so that we keep only those that
# have no residential and have less than 500 existing sqft.
# For each TAZ, calculate the difference needed to match the aggregate target
# (see the sketch below).
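# A minimal sketch of the first and third comments above. The employment-points
# table name and its columns are assumptions; only the 250 sqft/job factor and
# the per-TAZ difference come from the notes above.
emp_points = db_to_df('select * from staging.employment_points;')  # hypothetical table name
synthetic_sqft_zone = emp_points.groupby('taz').size() * 250  # ~250 sqft per job
nonres_sqft_zone['difference'] = (nonres_sqft_zone.target -
                                  nonres_sqft_zone.observed.fillna(0))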
# Install PostGIS and create staging schema.
loader = TableLoader()
with loader.database.cursor() as cur:
    cur.execute("""
        CREATE EXTENSION IF NOT EXISTS postgis;
        CREATE SCHEMA IF NOT EXISTS staging;
    """)
loader.database.refresh()

# Load shapefiles specified above to the project database.
loader.load_shp_map(shapefiles)

# Fix invalid geometries and reproject.
staging = loader.tables.staging
conform_srids(loader.srid, schema=staging, fix=True)

# Load county land use code mapping.
csv = loader.get_path('built/parcel/2010/rtp13_processing_notes/lucodes.csv')
df = pd.read_csv(csv, dtype=str)
df.dropna(how='any', inplace=True,
          subset=['county_id', 'land_use_type_id', 'development_type_id'])
df.index.name = 'index'
df_to_db(df, 'lucodes', schema=staging)

# Add county land use code mapping unique constraint (see the sketch below).
exec_sql("""
ALTER TABLE staging.lucodes
ADD CONSTRAINT lucodes_unique
UNIQUE (county_id, land_use_type_id);
""")
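# Why the unique constraint matters: it guarantees exactly one
# development_type_id per (county_id, land_use_type_id) pair, so a later join
# from parcels to staging.lucodes is unambiguous. A sketch of such a join;
# the parcel column names here are assumptions, not confirmed by this script.
exec_sql("""
UPDATE parcels p
SET development_type_id = l.development_type_id
FROM staging.lucodes l
WHERE p.county_id = l.county_id
  AND p.land_use_type_id = l.land_use_type_id;
""")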
    chosen = np.random.choice(
        alternative_ids, size=n_to_choose, replace=False, p=probabilities)

    # if there are fewer available units than choosers we need to pick
    # which choosers get a unit
    if n_to_choose == n_available:
        chooser_ids = np.random.choice(
            chooser_ids, size=n_to_choose, replace=False)

    choices[chooser_ids] = chosen
    return choices

# Load TAZ-level synthetic population
hh_path = loader.get_path('hh/synth/hhFile.p2011s3a1.2010.csv')
hh = pd.read_csv(hh_path)
hh = hh[hh['HHT'] > 0]  # Filter out GQ households
hh = hh.set_index('HHID')
hh.index.name = 'household_id'
hh = hh.rename(columns={'TAZ': 'taz'})
hh['building_id'] = -1

# Get the taz-level dwelling unit controls, just for reference. This file
# also contains the employment totals by sector/zone.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')
targetunits['hh'] = hh.groupby('taz').size()
df = targetunits[['targetunits', 'hh']]
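# A rough sketch of the assignment step this sets up. Entirely hypothetical:
# `unit_choice` stands in for the allocation helper excerpted at the top of
# this script (its signature is not shown there), and `units` is assumed to be
# a table of residential units with a 'taz' column.
for taz in targetunits.index:
    chooser_ids = hh.index[hh.taz == taz].values
    alt_ids = units.index[units.taz == taz].values
    if len(chooser_ids) == 0 or len(alt_ids) == 0:
        continue
    probs = np.ones(len(alt_ids), dtype=float) / len(alt_ids)
    hh.loc[chooser_ids, 'building_id'] = unit_choice(chooser_ids, alt_ids, probs)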
fee_schedule = db_to_df(
    'select fee_schedule_id, development_type_id, '
    'development_fee_per_unit_space_initial from staging.fee_schedule')
parcel_fee_schedule = db_to_df(
    'select parcelid as parcel_id, fee_schedule_id, portion '
    'from staging.parcel_fee_schedule')

# Remove unnecessary id columns appended by spandex
for df in [buildings, jobs, households, assessor_transactions, zoning]:
    if 'id' in df.columns:
        del df['id']

zoning = zoning.set_index('zoning_id')

# Get OSM nodes and edges for Pandana
nodes_path = loader.get_path('travel/nodes.csv')
edges_path = loader.get_path('travel/edges.csv')
nodes = pd.read_csv(nodes_path).set_index('node_id')
edges = pd.read_csv(edges_path)
nodes.index.name = 'index'

# Building sqft per job
sqft_per_job = db_to_df('select * from staging.sqft_per_job_by_devtype;')
sqft_per_job = sqft_per_job[['luz_id', 'development_type_id', 'sqft_per_emp']]

# Get price datasets
costar = db_to_df('select * from public.costar')
if 'id' in costar.columns:
    del costar['id']

# Put tables in HDF5
"""Executes SQL query and returns DataFrame.""" conn = loader.database._connection return sql.read_frame(query, conn) ##Load parcels as dataframes for the imputation parcels = db_to_df('select * from parcels;') parcels = parcels.set_index('gid') #Standardize the res_type field parcels.res_type[parcels.res_type.isnull()] = 'other' parcels.res_type[parcels.res_type==''] = 'other' parcels.res_type[np.in1d(parcels.res_type, ['FLATS', 'APTS', 'CONDO', 'SRO', 'LIVEWORK', 'mixed'])] = 'multi' parcels.res_type[parcels.res_type=='SINGLE'] = 'single' # Load TAZ residential unit control totals and other zonal targets. taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv') targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454') taz_controls_csv2 = loader.get_path('hh/tazsumm_redfin.csv') targetvalues = pd.read_csv(taz_controls_csv2, index_col='taz') nonres_sqft_zone = pd.DataFrame({'observed':parcels.groupby('taz').non_residential_sqft.sum(), 'target':targetunits.targetnonressqft}) # For all employment points, translate to nonres-sqft by multiplying by 250. # Filter out synthetic job-based buildings so that we keep only those that have no residential and have less than 500 existing sqft. # For each TAZ, calculate the difference needed to match aggregate target. # If need to increment nrsqft upwards, sort synthetic buildings by sqft and take the top x that covers the needed difference # If no valid job points and non existing nonres-sqft, introduce a synthetic building in the TAZ- equal to the target, and put it on the biggest parcel. # Do same in the case of no parcels (and add synthetic parcel) # Scale to match
loader = TableLoader()

# Download PUMA 2000 geometry zip files
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    try:
        pumageom_file = urllib.URLopener()
        pumageom_file.retrieve(
            "http://www2.census.gov/geo/tiger/PREVGENZ/pu/p500shp/%s" % filename,
            os.path.join(loader.get_path('puma_geom'), filename))
        print 'Downloading %s' % filename
    except:
        continue

# Unzip and add prj file to PUMA 2000 geometry
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    filepath = os.path.join(loader.get_path('puma_geom'), filename)
    if os.path.exists(filepath):
        print 'Unzipping and adding prj to %s' % filename
assessor_transactions = db_to_df(
    'select * from assessor_transactions').set_index('building_id')
zoning = db_to_df('select * from zoning')
zoning_allowed_uses = db_to_df(
    'select zoning_id, development_type_id from zoning_allowed_uses')
fee_schedule = db_to_df(
    'select fee_schedule_id, development_type_id, '
    'development_fee_per_unit_space_initial from staging.fee_schedule')
parcel_fee_schedule = db_to_df(
    'select parcelid as parcel_id, fee_schedule_id, portion '
    'from staging.parcel_fee_schedule')

# Remove unnecessary id columns appended by spandex
for df in [buildings, jobs, households, assessor_transactions, zoning]:
    if 'id' in df.columns:
        del df['id']

zoning = zoning.set_index('zoning_id')

# Get OSM nodes and edges for Pandana
nodes_path = loader.get_path('travel/nodes.csv')
edges_path = loader.get_path('travel/edges.csv')
nodes = pd.read_csv(nodes_path).set_index('node_id')
edges = pd.read_csv(edges_path)
nodes.index.name = 'index'

# Building sqft per job
sqft_per_job = db_to_df('select * from staging.sqft_per_job_by_devtype;')
sqft_per_job = sqft_per_job[['luz_id', 'development_type_id', 'sqft_per_emp']]

# Get price datasets
costar = db_to_df('select * from public.costar')
if 'id' in costar.columns:
    del costar['id']

# Put tables in HDF5 (see the sketch below)
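# A sketch of the "put tables in HDF5" step. The output path and the exact set
# of tables written are assumptions based on the related SANDAG export script
# that stores to 'out/sandag.h5'.
h5_path = loader.get_path('out/sandag.h5')
store = pd.HDFStore(h5_path)
for name, table in [('buildings', buildings), ('jobs', jobs),
                    ('households', households), ('zoning', zoning),
                    ('assessor_transactions', assessor_transactions),
                    ('costar', costar), ('nodes', nodes), ('edges', edges)]:
    store[name] = table
store.close()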
import pandas as pd, numpy as np
import pandas.io.sql as sql
from pandas.io.excel import read_excel
from spandex.io import exec_sql, df_to_db
from spandex import TableLoader

loader = TableLoader()

## Read Redfin CSV and load to database
redfin_csv_path = loader.get_path('built/bldg/homeprices/redfin_03feb14.csv')
redfin = pd.read_csv(redfin_csv_path)
redfin.index.name = 'idx'
df_to_db(redfin, 'redfin', schema=loader.tables.staging)

## Lat/long to point geometry, with the right SRID
exec_sql("ALTER TABLE staging.redfin ADD COLUMN geom geometry;")
exec_sql("UPDATE staging.redfin SET geom = ST_GeomFromText("
         "'POINT(' || longitude || ' ' || latitude || ')', 4326);")
exec_sql("CREATE INDEX redfin_gidx ON staging.redfin USING gist (geom);")
# Switch the column's declared SRID to the target projection, then re-tag the
# lat/long points as 4326 and reproject them into 2768.
exec_sql("SELECT UpdateGeometrySRID('staging', 'redfin', 'geom', 2768);")
exec_sql("UPDATE staging.redfin SET geom = ST_Transform(ST_SetSRID(geom, 4326), 2768);")

## Append the unique parcel identifier to the Redfin records
exec_sql("ALTER TABLE staging.redfin ADD gid integer DEFAULT 0;")
exec_sql("UPDATE staging.redfin SET gid = a.gid FROM parcels a "
         "WHERE ST_Within(staging.redfin.geom, a.geom);")

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)
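# Example use of db_to_df: pull the Redfin records that matched a parcel,
# keyed by the parcel gid. The price and sqft column names here are
# assumptions about the Redfin CSV schema, not confirmed by this script.
redfin_on_parcels = db_to_df(
    'select gid, saleprice, sqft from staging.redfin where gid > 0;')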
    conn = loader.database._connection
    return sql.read_frame(query, conn)

loader = TableLoader()

# Download PUMA 2000 geometry zip files
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    try:
        pumageom_file = urllib.URLopener()
        pumageom_file.retrieve(
            "http://www2.census.gov/geo/tiger/PREVGENZ/pu/p500shp/%s" % filename,
            os.path.join(loader.get_path('puma_geom'), filename))
        print 'Downloading %s' % filename
    except:
        continue

# Unzip and add prj file to PUMA 2000 geometry
for i in range(73):
    if i < 10:
        filename = 'p50%s_d00_shp.zip' % i
    else:
        filename = 'p5%s_d00_shp.zip' % i
    filepath = os.path.join(loader.get_path('puma_geom'), filename)
    if os.path.exists(filepath):
        print 'Unzipping and adding prj to %s' % filename
import pandas.io.sql as sql

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

# Build parcels TableFrame.
loader = TableLoader()
table = loader.database.tables.public.parcels
tf = TableFrame(table, index_col='gid')

# Load TAZ residential unit control totals.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')['targetunits']

# Get CSV output file directory.
output_dir = loader.get_path('out/regeneration/summaries')

# Generate summary CSV by county and TAZ.
for grouper in ['county_id', 'taz']:
    df = tf[[grouper, 'non_residential_sqft', 'residential_units']]
    df.dropna(subset=[grouper], inplace=True)
    if grouper == 'taz':
        df[grouper] = df[grouper].astype(int)
    df['count'] = 1
    summary = df.groupby(grouper).sum()
    sr_grouped = df.groupby('gid')[attribute]
    if agg_function == 'median':
        var = sr_grouped.median()
    if agg_function == 'max':
        var = sr_grouped.max()
    if agg_function == 'sum':
        var = sr_grouped.sum()
    # Set bounds on valid values to use for imputation
    var = var[(var > lower_bound) & (var < upper_bound)]
    return var

######## *LOADING* ########

#### REDFIN
# Read Redfin CSV and load to database
csv_to_staging(loader.get_path('built/bldg/homeprices/redfin_03feb14.csv'), 'redfin')
# Lat/long to point geometry, with the right SRID
lat_long_to_point_geometry('redfin', 'staging', 'longitude', 'latitude', 'geom', 2768)
# Append the unique parcel identifier to the Redfin records
append_parcel_identifier('redfin', 'staging', 'geom', 'gid')

#### GOV BUILDINGS
# Read Gov Building CSV and load to database
csv_to_staging(loader.get_path('built/bldg/add_buildings1.csv'), 'public_bldgs')
# Lat/long to point geometry, with the right SRID
lat_long_to_point_geometry('public_bldgs', 'staging', 'x', 'y', 'geom', 2768)
# Append the unique parcel identifier to the Gov Building records
append_parcel_identifier('public_bldgs', 'staging', 'geom', 'gid')

#### COSTAR
costar_xls_path = loader.get_path('built/bldg/costar/2011/costar_allbayarea.xlsx')
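# csv_to_staging, lat_long_to_point_geometry, and append_parcel_identifier are
# defined elsewhere in this project. A minimal sketch of what they likely
# wrap, based on the inline Redfin load in the other loading script; the
# implementations (and the _sketch names) are assumptions, and they rely on
# the project's usual pd / df_to_db / exec_sql / loader imports.
def csv_to_staging_sketch(csv_path, table_name):
    df = pd.read_csv(csv_path)
    df.index.name = 'idx'
    df_to_db(df, table_name, schema=loader.tables.staging)

def lat_long_to_point_geometry_sketch(tbl, schema, x_col, y_col, geom_col, srid):
    exec_sql("ALTER TABLE %s.%s ADD COLUMN %s geometry;" % (schema, tbl, geom_col))
    exec_sql("UPDATE %s.%s SET %s = ST_Transform(ST_SetSRID("
             "ST_MakePoint(%s, %s), 4326), %s);"
             % (schema, tbl, geom_col, x_col, y_col, srid))

def append_parcel_identifier_sketch(tbl, schema, geom_col, parcel_id_col):
    exec_sql("ALTER TABLE %s.%s ADD %s integer DEFAULT 0;"
             % (schema, tbl, parcel_id_col))
    exec_sql("UPDATE %s.%s t SET %s = p.gid FROM parcels p "
             "WHERE ST_Within(t.%s, p.geom);"
             % (schema, tbl, parcel_id_col, geom_col))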
        'luz_controls/pecas_PriceAndSpaceQuantity.csv',
    'assessor_transactions': 'price/parcelTransactions.csv',
    'fee_schedule': 'proformaInputs/fees/fee_schedule.csv',
    'parcel_fee_schedule': 'proformaInputs/fees/parcel_fee_schedule.csv',
}

for tbl in csvs.iterkeys():
    print tbl
    csv = loader.get_path(csvs[tbl])
    df = pd.read_csv(csv)
    df.index.name = 'index'

    if df.isnull().sum().sum() > 0:
        # Fill missing values based on each column's inferred type.
        for col in df.dtypes.iteritems():
            col_name = col[0]
            col_type = col[1]
            firstval = df[col_name].loc[0]
            if firstval in (True, False):
                if type(firstval) == bool:
                    df[col_name] = df[col_name].fillna(False)
            if col_type == np.int64:
                df[col_name] = df[col_name].fillna(0)
            elif col_type == np.float64:
                df[col_name] = df[col_name].fillna(0.0)
            elif col_type == np.object:
import pandas as pd
from spandex import TableLoader
import pandas.io.sql as sql

loader = TableLoader()

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

## Export to HDF5: get path to the output file
h5_path = loader.get_path('out/regeneration/summaries/bayarea_v3.h5')

# Buildings
buildings = db_to_df('select * from building').set_index('building_id')
if 'id' in buildings.columns:
    del buildings['id']

# Map development_type_id codes to building_type_id codes.
buildings['building_type_id'] = 0
buildings.building_type_id[buildings.development_type_id == 1] = 1
buildings.building_type_id[buildings.development_type_id == 2] = 3
buildings.building_type_id[buildings.development_type_id == 5] = 12
buildings.building_type_id[buildings.development_type_id == 7] = 10
buildings.building_type_id[buildings.development_type_id == 9] = 5
buildings.building_type_id[buildings.development_type_id == 10] = 4
buildings.building_type_id[buildings.development_type_id == 13] = 8
buildings.building_type_id[buildings.development_type_id == 14] = 7
buildings.building_type_id[buildings.development_type_id == 15] = 9
buildings.building_type_id[buildings.development_type_id == 17] = 6
buildings.building_type_id[buildings.development_type_id == 24] = 16
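# The chained assignments above can equivalently be expressed as a lookup
# table; a sketch covering only the codes shown here (any development types
# mapped elsewhere in the original script would need to be added).
dev_to_building_type = {1: 1, 2: 3, 5: 12, 7: 10, 9: 5, 10: 4, 13: 8,
                        14: 7, 15: 9, 17: 6, 24: 16}
buildings['building_type_id'] = (buildings.development_type_id
                                 .map(dev_to_building_type)
                                 .fillna(0).astype(int))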
from spandex.utils import load_config
from spandex.io import exec_sql, df_to_db
import pandas.io.sql as sql

def db_to_df(query):
    """Executes SQL query and returns DataFrame."""
    conn = loader.database._connection
    return sql.read_frame(query, conn)

# Build parcels TableFrame.
loader = TableLoader()
table = loader.database.tables.public.parcels
tf = TableFrame(table, index_col='gid')

# Load TAZ residential unit control totals.
taz_controls_csv = loader.get_path('hh/taz2010_imputation.csv')
targetunits = pd.read_csv(taz_controls_csv, index_col='taz1454')['targetunits']

# Get CSV output file directory.
output_dir = loader.get_path('out/regeneration/summaries')

# Generate summary CSV by county and TAZ.
for grouper in ['county_id', 'taz']:
    df = tf[[grouper, 'non_residential_sqft', 'residential_units']]
    df.dropna(subset=[grouper], inplace=True)
    if grouper == 'taz':
        df[grouper] = df[grouper].astype(int)
    df['count'] = 1
    summary = df.groupby(grouper).sum()
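    # A sketch of how each summary might be written out and, for the TAZ
    # grouping, compared to the unit targets. The file naming and the os
    # import are assumptions, not taken from the original script.
    if grouper == 'taz':
        summary['targetunits'] = targetunits
    summary.to_csv(os.path.join(output_dir, '%s_summary.csv' % grouper))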
import pandas as pd
import pandas.io.sql as sql
import psycopg2
from spandex import TableLoader

loader = TableLoader()

conn_string = ("host='urbancanvas.cp2xwchuariu.us-west-2.rds.amazonaws.com' "
               "dbname='sandag_testing' user='******' password='******' port=5432")
conn = psycopg2.connect(conn_string)
cur = conn.cursor()

def uc_db_to_df(query):
    return sql.read_frame(query, conn)

parcels = uc_db_to_df(
    "select parcel_id, zoning_id, devtype_id as development_type_id from parcel "
    "where projects = '{1}' and valid_from = '{-infinity}';").set_index('parcel_id')

buildings = uc_db_to_df(
    "SELECT building_id, parcel_id, building_type_id as development_type_id, "
    "improvement_value, residential_units, non_residential_sqft, stories, "
    "year_built, residential_sqft, note FROM building "
    "where projects = '{1}' and valid_from = '{-infinity}';").set_index('building_id')

# Put tables in HDF5
h5_path = loader.get_path('out/sandag.h5')
store = pd.HDFStore(h5_path)

del store['buildings']
store['buildings'] = buildings

p_prev = store.parcels.copy()
p_prev['zoning_id'] = parcels.zoning_id
p_prev['development_type_id'] = parcels.development_type_id
del store['parcels']
store['parcels'] = p_prev

store.close()
tags = soup.find_all(href=re.compile("csv_h..\.zip"))
hpums_links = []
for t in tags:
    hpums_links.append(t['href'])

tags = soup.find_all(href=re.compile("csv_p..\.zip"))
ppums_links = []
for t in tags:
    ppums_links.append(t['href'])

pums_links = hpums_links + ppums_links

for pums_file in pums_links:
    print pums_file
    pums_file_dl = urllib.URLopener()
    pums_file_dl.retrieve(
        "http://www2.census.gov/acs2013_5yr/pums/%s" % pums_file,
        os.path.join(loader.get_path('pums'), pums_file))

for pums_file in pums_links:
    filepath = os.path.join(loader.get_path('pums'), pums_file)
    if os.path.exists(filepath):
        print 'Unzipping %s' % pums_file
        with zipfile.ZipFile(filepath, "r") as z:
            z.extractall(loader.get_path('pums'))

for pums_file in ['ss13husa.csv', 'ss13husb.csv', 'ss13husc.csv', 'ss13husd.csv',
                  'ss13pusa.csv', 'ss13pusb.csv', 'ss13pusc.csv', 'ss13pusd.csv']:
    print 'Processing %s' % pums_file
hpums_links = []
for t in tags:
    hpums_links.append(t['href'])

tags = soup.find_all(href=re.compile("csv_p..\.zip"))
ppums_links = []
for t in tags:
    ppums_links.append(t['href'])

pums_links = hpums_links + ppums_links

for pums_file in pums_links:
    print pums_file
    pums_file_dl = urllib.URLopener()
    pums_file_dl.retrieve(
        "http://www2.census.gov/acs2013_5yr/pums/%s" % pums_file,
        os.path.join(loader.get_path('pums'), pums_file))

for pums_file in pums_links:
    filepath = os.path.join(loader.get_path('pums'), pums_file)
    if os.path.exists(filepath):
        print 'Unzipping %s' % pums_file
        with zipfile.ZipFile(filepath, "r") as z:
            z.extractall(loader.get_path('pums'))

for pums_file in [
        'ss13husa.csv', 'ss13husb.csv', 'ss13husc.csv', 'ss13husd.csv',
        'ss13pusa.csv', 'ss13pusb.csv', 'ss13pusc.csv', 'ss13pusd.csv']:
    print 'Processing %s' % pums_file
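    # The processing step is truncated above; a plausible sketch of the rest
    # of the loop body, reading each large national PUMS file in chunks and
    # keeping only California records. 'ST' is the PUMS state-FIPS column, but
    # the chunked read, the filter, and the pandas-as-pd import this relies on
    # are assumptions, not the original script's logic.
    filepath = os.path.join(loader.get_path('pums'), pums_file)
    chunks = []
    for chunk in pd.read_csv(filepath, dtype=str, chunksize=50000):
        chunks.append(chunk[chunk['ST'].astype(int) == 6])  # 6 = California FIPS
    ca_pums = pd.concat(chunks)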