def parcels_geography(parcels):
    """Load the 2016 parcels-geography lookup, keyed by parcel_id.

    Joins human-readable jurisdiction names from the census-id crosswalk,
    hand-patches a few records, and normalizes pda ids to lower case.
    """
    geog = pd.read_csv(
        os.path.join(misc.data_dir(), "02_01_2016_parcels_geography.csv"),
        index_col="geom_id")
    geog = geom_id_to_parcel_id(geog, parcels)

    # this crosswalk maps juris id to a readable name
    name_by_census_id = pd.read_csv(
        os.path.join(misc.data_dir(), "census_id_to_name.csv"),
        index_col="census_id").name10
    geog["juris_name"] = geog.jurisdiction_id.map(name_by_census_id)

    # a handful of records need their jurisdiction patched by hand
    manual_juris = {
        2054504: "Marin County",
        2054505: "Santa Clara County",
        2054506: "Marin County",
        572927: "Contra Costa County",
    }
    for row_id, juris in manual_juris.items():
        geog.loc[row_id, "juris_name"] = juris

    # assert no empty juris values
    assert True not in geog.juris_name.isnull().value_counts()

    geog["pda_id"] = geog.pda_id.str.lower()
    # danville wasn't supposed to be a pda
    geog["pda_id"] = geog.pda_id.replace("dan1", np.nan)
    return geog
def parcels_geography(parcels, scenario, settings, policy):
    """Load the 2021 parcels-geography lookup keyed by parcel_id.

    Attaches jurisdiction names, normalizes PBA40 pda ids, and — depending
    on which geography list in `policy` contains `scenario` — builds the
    Draft Blueprint, Final Blueprint, or EIR versions of the
    pda/gg/tra/ppa/sesit (and coc) geography columns.

    NOTE(review): `settings` is accepted but never referenced in the body —
    presumably kept for injectable-signature compatibility; confirm with
    callers.
    """
    file = os.path.join(misc.data_dir(), "2021_02_25_parcels_geography.csv")
    print('Version of parcels_geography: {}'.format(file))
    df = pd.read_csv(file, index_col="geom_id")
    df = geom_id_to_parcel_id(df, parcels)

    # this will be used to map juris id to name
    juris_name = pd.read_csv(os.path.join(misc.data_dir(),
                                          "census_id_to_name.csv"),
                             index_col="census_id").name10

    df["juris_name"] = df.jurisdiction_id.map(juris_name)

    # hand-patch a few records whose jurisdiction_id has no census match
    df.loc[2054504, "juris_name"] = "Marin County"
    df.loc[2054505, "juris_name"] = "Santa Clara County"
    df.loc[2054506, "juris_name"] = "Marin County"
    df.loc[572927, "juris_name"] = "Contra Costa County"

    # assert no empty juris values
    assert True not in df.juris_name.isnull().value_counts()

    df['juris_trich'] = df.juris + '-' + df.trich_id

    df["pda_id_pba40"] = df.pda_id_pba40.str.lower()
    # danville wasn't supposed to be a pda
    df["pda_id_pba40"] = df.pda_id_pba40.replace("dan1", np.nan)

    # Add Draft Blueprint geographies: PDA, TRA, PPA, sesit
    if scenario in policy['geographies_db_enable']:
        df["pda_id_pba50"] = df.pda_id_pba50.str.lower()
        df["gg_id"] = df.gg_id.str.lower()
        df["tra_id"] = df.tra_id.str.lower()
        df['juris_tra'] = df.juris + '-' + df.tra_id
        df["ppa_id"] = df.ppa_id.str.lower()
        df['juris_ppa'] = df.juris + '-' + df.ppa_id
        df["sesit_id"] = df.sesit_id.str.lower()
        df['juris_sesit'] = df.juris + '-' + df.sesit_id
    # Use Final Blueprint geographies: PDA, TRA, PPA, sesit
    elif scenario in policy['geographies_fb_enable']:
        df["pda_id_pba50"] = df.pda_id_pba50_fb.str.lower()
        df["gg_id"] = df.fbp_gg_id.str.lower()
        df["tra_id"] = df.fbp_tra_id.str.lower()
        df['juris_tra'] = df.juris + '-' + df.tra_id
        df["ppa_id"] = df.fbp_ppa_id.str.lower()
        df['juris_ppa'] = df.juris + '-' + df.ppa_id
        df["sesit_id"] = df.fbp_sesit_id.str.lower()
        df['juris_sesit'] = df.juris + '-' + df.sesit_id
    # Use EIR geographies: TRA, PPA, sesit, CoC
    elif scenario in policy['geographies_eir_enable']:
        # EIR reuses the Final Blueprint pda column
        df["pda_id_pba50"] = df.pda_id_pba50_fb.str.lower()
        df["gg_id"] = df.eir_gg_id.str.lower()
        df["tra_id"] = df.eir_tra_id.str.lower()
        df['juris_tra'] = df.juris + '-' + df.tra_id
        df["ppa_id"] = df.eir_ppa_id.str.lower()
        df['juris_ppa'] = df.juris + '-' + df.ppa_id
        df["sesit_id"] = df.eir_sesit_id.str.lower()
        df['juris_sesit'] = df.juris + '-' + df.sesit_id
        df['coc_id'] = df.eir_coc_id.str.lower()
        df['juris_coc'] = df.juris + '-' + df.coc_id

    return df
def maz():
    """Load MAZ geography (one row per MAZ) and attach TAZ1454 zone ids."""
    geography = pd.read_csv(os.path.join(misc.data_dir(),
                                         "maz_geography.csv"))
    geography = geography.drop_duplicates('MAZ').set_index('MAZ')
    crosswalk = pd.read_csv(os.path.join(misc.data_dir(),
                                         "maz22_taz1454.csv"),
                            index_col='maz')
    # align on the MAZ index to bring over the 1454-zone-system id
    geography['taz1454'] = crosswalk.TAZ1454
    return geography
def maz():
    """Load MAZ geography (one row per MAZ) joined to TAZ1454 zone ids."""
    maz = pd.read_csv(os.path.join(misc.data_dir(), "maz_geography.csv"))
    # keep one row per MAZ and index by it
    maz = maz.drop_duplicates('MAZ').set_index('MAZ')
    taz1454 = pd.read_csv(os.path.join(misc.data_dir(),
                                       "maz22_taz1454.csv"),
                          index_col='maz')
    # align on the MAZ index to bring over the 1454-zone-system id
    maz['taz1454'] = taz1454.TAZ1454
    return maz
def parcels_geography(parcels):
    """2016 parcels-geography lookup keyed by parcel_id, with jurisdiction
    names joined in and pda ids lower-cased."""
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "02_01_2016_parcels_geography.csv"),
                     index_col="geom_id")
    df = geom_id_to_parcel_id(df, parcels)

    # this will be used to map juris id to name
    juris_name = pd.read_csv(os.path.join(misc.data_dir(),
                                          "census_id_to_name.csv"),
                             index_col="census_id").name10

    df["juris_name"] = df.jurisdiction_id.map(juris_name)

    # hand-patch a few records whose jurisdiction_id has no census match
    df.loc[2054504, "juris_name"] = "Marin County"
    df.loc[2054505, "juris_name"] = "Santa Clara County"
    df.loc[2054506, "juris_name"] = "Marin County"
    df.loc[572927, "juris_name"] = "Contra Costa County"

    # assert no empty juris values
    assert True not in df.juris_name.isnull().value_counts()

    df["pda_id"] = df.pda_id.str.lower()
    # danville wasn't supposed to be a pda
    df["pda_id"] = df.pda_id.replace("dan1", np.nan)
    return df
def taz_geography():
    """Load TAZ geography and label each zone with its subregion."""
    zones = pd.read_csv(os.path.join(misc.data_dir(), "taz_geography.csv"),
                        index_col="zone")
    districts = pd.read_csv(os.path.join(misc.data_dir(),
                                         "superdistricts.csv"),
                            index_col="number")
    # look up each zone's superdistrict to fetch its subregion id
    zones["subregion_id"] = districts.subregion.loc[
        zones.superdistrict].values
    subregion_names = {1: "Core", 2: "Urban", 3: "Suburban", 4: "Rural"}
    zones["subregion"] = zones.subregion_id.map(subregion_names)
    return zones
def taz_geography():
    """TAZ geography with subregion id and name joined from superdistricts."""
    tg = pd.read_csv(os.path.join(misc.data_dir(), "taz_geography.csv"),
                     index_col="zone")
    sr = pd.read_csv(os.path.join(misc.data_dir(), "superdistricts.csv"),
                     index_col="number")
    # look up each zone's superdistrict in sr to fetch its subregion id
    tg["subregion_id"] = sr.subregion.loc[tg.superdistrict].values
    tg["subregion"] = tg.subregion_id.map({
        1: "Core",
        2: "Urban",
        3: "Suburban",
        4: "Rural"
    })
    return tg
def superdistricts(scenario):
    """Return the superdistricts table, preferring a scenario-specific csv.

    Also records which sqft-per-job settings file was used as an orca
    injectable.
    """
    scenario_file = os.path.join(
        misc.data_dir(), ("superdistricts_s{}.csv").format(scenario))
    # scenarios could contain policies (eg telework) and/or other
    # modifications
    if os.path.isfile(scenario_file):
        sd = pd.read_csv(scenario_file, index_col="number")
        orca.add_injectable("sqft_per_job_settings", "for this scenario")
    else:
        # the default includes a telework assumption and SD adjustments
        sd = pd.read_csv(os.path.join(misc.data_dir(),
                                      "superdistricts.csv"),
                         index_col="number")
        orca.add_injectable("sqft_per_job_settings", "default")
    return sd
def parcels_geography(parcels):
    """2016 parcels-geography lookup keyed by parcel_id, with jurisdiction
    names joined from the census crosswalk."""
    geog = pd.read_csv(
        os.path.join(misc.data_dir(), "02_01_2016_parcels_geography.csv"),
        index_col="geom_id",
        dtype={'jurisdiction': 'str'})
    geog = geom_id_to_parcel_id(geog, parcels)

    # the census crosswalk supplies the readable jurisdiction name
    census_names = pd.read_csv(
        os.path.join(misc.data_dir(), "census_id_to_name.csv"),
        index_col="census_id").name10
    geog["juris_name"] = geog.jurisdiction_id.map(census_names)

    geog["pda_id"] = geog.pda_id.str.lower()
    return geog
def local_pois(settings):
    """Precompute walk-network distances to local points of interest.

    Writes local_poi_distances.csv (indexed by node_id) with the distance
    from every network node to the nearest BART station and to the
    Pacific Heights landmark.  Returns nothing.
    """
    # because of the aforementioned limit of one netowrk at a time for the
    # POIS, as well as the large amount of memory used, this is now a
    # preprocessing step
    n = make_network(
        settings['build_networks']['walk']['name'],
        "weight", 3000)

    n.init_pois(
        num_categories=1,
        max_dist=3000,
        max_pois=1)

    cols = {}

    locations = pd.read_csv(os.path.join(misc.data_dir(),
                                         'bart_stations.csv'))
    n.set_pois("tmp", locations.lng, locations.lat)
    # column [1] of nearest_pois is the distance to the single nearest poi
    cols["bartdist"] = n.nearest_pois(3000, "tmp", num_pois=1)[1]

    locname = 'pacheights'
    locs = orca.get_table('landmarks').local.query("name == '%s'" % locname)
    n.set_pois("tmp", locs.lng, locs.lat)
    cols["pacheights"] = n.nearest_pois(3000, "tmp", num_pois=1)[1]

    df = pd.DataFrame(cols)
    df.index.name = "node_id"
    df.to_csv('local_poi_distances.csv')
def zoning_baseline(parcels, zoning_lookup):
    """Baseline zoning joined to parcels, with building-type code columns
    renamed to the generic type1..type14 scheme."""
    zoning = pd.read_csv(
        os.path.join(misc.data_dir(), "2015_08_13_zoning_parcels.csv"),
        index_col="geom_id")
    zoning = pd.merge(zoning, zoning_lookup.to_frame(),
                      left_on="zoning_id", right_index=True)
    zoning = geom_id_to_parcel_id(zoning, parcels)

    # rename building-type code columns to their generic "typeN" names
    rename = {
        "HS": "type1", "HT": "type2", "HM": "type3", "OF": "type4",
        "HO": "type5", "IL": "type7", "IW": "type8", "IH": "type9",
        "RS": "type10", "RB": "type11", "MR": "type12", "MT": "type13",
        "ME": "type14",
    }
    zoning.columns = [rename.get(col, col) for col in zoning.columns]
    return zoning
def parcels(store):
    """Base-year parcels with NA-filling applied and an `sdem` flag for
    parcels slated for development by the development-projects list."""
    df = store['parcels']
    # zone 0 is not a real zone - push those parcels into zone 1
    df["zone_id"] = df.zone_id.replace(0, 1)

    fill_cfg = {
        "fill_nas": {
            "zone_id": {"how": "mode", "type": "int"},
            "shape_area": {"how": "median", "type": "float"},
        }
    }
    df = utils.table_reprocess(fill_cfg, df)

    # have to do it this way because otherwise it's a circular reference
    sdem = pd.read_csv(os.path.join(misc.data_dir(),
                                    "development_projects.csv"))

    # mark parcels that are going to be developed by the sdem
    df["sdem"] = df.geom_id.isin(sdem.geom_id).astype('int')
    return df
def parcels(store):
    """Base-year parcels with NA filling and an sdem development flag."""
    df = store['parcels']
    # zone 0 isn't a valid zone - reassign those parcels to zone 1
    df["zone_id"] = df.zone_id.replace(0, 1)
    cfg = {
        "fill_nas": {
            "zone_id": {
                "how": "mode",
                "type": "int"
            },
            "shape_area": {
                "how": "median",
                "type": "float"
            }
        }
    }
    df = utils.table_reprocess(cfg, df)
    # have to do it this way because otherwise it's a circular reference
    sdem = pd.read_csv(os.path.join(misc.data_dir(),
                                    "development_projects.csv"))
    # mark parcels that are going to be developed by the sdem
    df["sdem"] = df.geom_id.isin(sdem.geom_id).astype('int')
    return df
def parcels_geography(parcels):
    """2016 parcels-geography lookup keyed by parcel_id with jurisdiction
    names joined in and pda ids lower-cased."""
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "02_01_2016_parcels_geography.csv"),
                     index_col="geom_id",
                     dtype={'jurisdiction': 'str'})
    df = geom_id_to_parcel_id(df, parcels)
    # the census crosswalk supplies the readable jurisdiction name
    juris_name = pd.read_csv(os.path.join(misc.data_dir(),
                                          "census_id_to_name.csv"),
                             index_col="census_id").name10
    df["juris_name"] = df.jurisdiction_id.map(juris_name)
    df["pda_id"] = df.pda_id.str.lower()
    return df
def make_network(name, weight_col, max_distance):
    """Build and precompute a pandana Network from an HDF5 node/edge store.

    Parameters:
        name: hdf5 file name under the data dir, containing nodes/edges
        weight_col: edge column to use as the impedance
        max_distance: distance to precompute aggregations out to

    Fix: the HDFStore was previously left open for the life of the
    process; read the frames eagerly and close the file handle.
    """
    st = pd.HDFStore(os.path.join(misc.data_dir(), name), "r")
    try:
        # .nodes/.edges read the full frames into memory, so closing
        # the store afterwards is safe
        nodes, edges = st.nodes, st.edges
    finally:
        st.close()
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[[weight_col]])
    net.precompute(max_distance)
    return net
def make_network(name, weight_col, max_distance):
    """Build and precompute a pandana Network from the HDF5 store `name`.

    NOTE(review): the HDFStore is never closed, so the file handle stays
    open for the life of the process - confirm this is intentional.
    """
    st = pd.HDFStore(os.path.join(misc.data_dir(), name), "r")
    nodes, edges = st.nodes, st.edges
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[[weight_col]])
    net.precompute(max_distance)
    return net
def zoning_np(parcels_geography):
    """Merge the no-project zoning mods onto the parcels-geography table."""
    mods = pd.read_csv(os.path.join(misc.data_dir(),
                                    'zoning_mods_np.csv'))
    join_cols = ['jurisdiction', 'pda_id', 'tpp_id', 'exp_id']
    return pd.merge(parcels_geography.to_frame(), mods,
                    on=join_cols, how='left')
def development_projects(parcels, settings):
    """Development-pipeline projects cleaned for the developer model.

    Fix: the Python-2-only `print` statements are replaced with print()
    calls, which produce identical output under Python 2 (single
    argument) and are valid under Python 3.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "development_projects.csv"))

    for fld in ['residential_sqft', 'residential_price',
                'non_residential_price']:
        df[fld] = 0

    df["redfin_sale_year"] = 2012  # hedonic doesn't tolerate nans
    df["stories"] = df.stories.fillna(1)
    df["building_sqft"] = df.building_sqft.fillna(0)
    df["non_residential_sqft"] = df.non_residential_sqft.fillna(0)
    df["building_type_id"] = \
        df.building_type.map(settings["building_type_map2"])

    df = df.dropna(subset=["geom_id"])  # need a geom_id to link to parcel_id
    df = df.dropna(subset=["year_built"])  # need a year built to get built
    df["geom_id"] = df.geom_id.astype("int")
    df = df.query('residential_units != "rent"')
    df["residential_units"] = df.residential_units.astype("int")

    df = df.set_index("geom_id")
    df = geom_id_to_parcel_id(df, parcels).reset_index()  # use parcel id

    # we don't predict prices for schools and hotels right now
    df = df.query("building_type_id <= 4 or building_type_id >= 7")

    print("Describe of development projects")
    print(df[orca.get_table('buildings').local_columns].describe())

    return df
def build_networks(settings):
    """Build and precompute the pandana network named in settings.

    Fix: close the HDFStore after reading nodes/edges instead of leaking
    the file handle (the frames are read fully into memory, so closing
    is safe).
    """
    name = settings["build_networks"]["name"]
    st = pd.HDFStore(os.path.join(misc.data_dir(), name), "r")
    try:
        nodes, edges = st.nodes, st.edges
    finally:
        st.close()
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[["weight"]])
    net.precompute(settings["build_networks"]["max_distance"])
    return net
def local_pois(settings):
    """Precompute walk-network distances to local POIs and write them to
    local_poi_distances.csv (indexed by node_id).  Returns nothing."""
    # because of the aforementioned limit of one netowrk at a time for the
    # POIS, as well as the large amount of memory used, this is now a
    # preprocessing step
    n = make_network(
        settings['build_networks']['walk']['name'],
        "weight", 3000)

    n.init_pois(
        num_categories=1,
        max_dist=3000,
        max_pois=1)

    cols = {}

    locations = pd.read_csv(os.path.join(misc.data_dir(),
                                         'bart_stations.csv'))
    n.set_pois("tmp", locations.lng, locations.lat)
    # column [1] of nearest_pois is the distance to the single nearest poi
    cols["bartdist"] = n.nearest_pois(3000, "tmp", num_pois=1)[1]

    locname = 'pacheights'
    locs = orca.get_table('landmarks').local.query("name == '%s'" % locname)
    n.set_pois("tmp", locs.lng, locs.lat)
    cols["pacheights"] = n.nearest_pois(3000, "tmp", num_pois=1)[1]

    df = pd.DataFrame(cols)
    df.index.name = "node_id"
    df.to_csv('local_poi_distances.csv')
def build_networks(settings):
    """Build and precompute the pandana network named in settings.

    NOTE(review): the HDFStore is never closed - the file handle stays
    open for the life of the process.
    """
    name = settings['build_networks']['name']
    st = pd.HDFStore(os.path.join(misc.data_dir(), name), "r")
    nodes, edges = st.nodes, st.edges
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[["weight"]])
    net.precompute(settings['build_networks']['max_distance'])
    return net
def non_mandatory_accessibility():
    """Non-mandatory accessibility logsums indexed by taz + subzone letter."""
    fname = get_logsum_file('non_mandatory')
    df = pd.read_csv(os.path.join(misc.data_dir(), fname))
    # recode the numeric subzone codes to letter suffixes
    subzone_letters = {0: 'a', 1: 'b', 2: 'c'}
    for code, letter in subzone_letters.items():
        df.loc[df.subzone == code, 'subzone'] = letter
    df['taz_sub'] = df.taz.astype('str') + df.subzone
    return df.set_index('taz_sub')
def non_mandatory_accessibility():
    """Non-mandatory accessibility logsums indexed by taz + subzone letter."""
    fname = get_logsum_file('non_mandatory')
    df = pd.read_csv(os.path.join(
        misc.data_dir(), fname))
    # recode numeric subzones to letter suffixes
    df.loc[df.subzone == 0, 'subzone'] = 'c'  # no walk
    df.loc[df.subzone == 1, 'subzone'] = 'a'  # short walk
    df.loc[df.subzone == 2, 'subzone'] = 'b'  # long walk
    df['taz_sub'] = df.taz.astype('str') + df.subzone
    return df.set_index('taz_sub')
def zoning_baseline(parcels, zoning_lookup, settings):
    """Baseline zoning (2015-12-21 csv) joined to parcels by zoning_id.

    NOTE(review): `settings` is unused here - presumably kept for
    injectable-signature compatibility.
    """
    zoning = pd.read_csv(
        os.path.join(misc.data_dir(), "2015_12_21_zoning_parcels.csv"),
        index_col="geom_id")
    zoning = pd.merge(zoning, zoning_lookup.to_frame(),
                      left_on="zoning_id", right_index=True)
    return geom_id_to_parcel_id(zoning, parcels)
def zoning_baseline(parcels, zoning_lookup, settings):
    """Baseline zoning (2015-12-21 csv) joined to parcels by zoning_id.

    NOTE(review): `settings` is unused in the body - confirm it is only
    kept for injectable-signature compatibility.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "2015_12_21_zoning_parcels.csv"),
                     index_col="geom_id")
    df = pd.merge(df, zoning_lookup.to_frame(),
                  left_on="zoning_id", right_index=True)
    df = geom_id_to_parcel_id(df, parcels)
    return df
def load_network_addons(network, file_name='PugetSoundNetworkAddons.h5'):
    """Attach POI addon tables from an HDF5 store to network.addons.

    Each store key becomes an entry in network.addons: a frame indexed by
    node_id with a has_poi flag where the store had a row for that node.
    """
    store = pd.HDFStore(os.path.join(misc.data_dir(), file_name), "r")
    network.addons = {}
    # store keys come back like '/name'; strip the slashes for attr names
    attrs = [key.replace('/', '') for key in store.keys()]
    for attr in attrs:
        base = pd.DataFrame({"node_id": network.node_ids.values},
                            index=network.node_ids.values)
        pois = store[attr].drop_duplicates("node_id")
        pois["has_poi"] = np.ones(pois.shape[0], dtype="bool8")
        merged = pd.merge(base, pois, how='left', on="node_id")
        merged.set_index('node_id', inplace=True)
        network.addons[attr] = merged
def non_mandatory_accessibility():
    """Non-mandatory accessibility logsums indexed by taz + subzone letter;
    also records the file used as an orca injectable."""
    fname = get_logsum_file('non_mandatory')
    orca.add_injectable("nonmand_acc_file_2010", fname)
    df = pd.read_csv(os.path.join(misc.data_dir(), fname))
    # recode numeric subzones to letter suffixes
    df.loc[df.subzone == 0, 'subzone'] = 'c'  # no walk
    df.loc[df.subzone == 1, 'subzone'] = 'a'  # short walk
    df.loc[df.subzone == 2, 'subzone'] = 'b'  # long walk
    df['taz_sub'] = df.taz.astype('str') + df.subzone
    return df.set_index('taz_sub')
def craigslist():
    """Craigslist rental listings tagged with walk/drive network nodes."""
    listings = pd.read_csv(os.path.join(misc.data_dir(),
                                        "sfbay_craigslist.csv"))
    net = orca.get_injectable('net')
    listings['node_id'] = net['walk'].get_node_ids(listings['lon'],
                                                   listings['lat'])
    listings['tmnode_id'] = net['drive'].get_node_ids(listings['lon'],
                                                      listings['lat'])
    # fill nans -- missing bedrooms are mostly studio apts
    listings['bedrooms'] = listings.bedrooms.replace(np.nan, 1)
    listings['neighborhood'] = listings.neighborhood.replace(np.nan, '')
    return listings
def craigslist():
    """Craigslist rental listings with walk/drive network node ids."""
    df = pd.read_csv(os.path.join(misc.data_dir(), "sfbay_craigslist.csv"))
    net = orca.get_injectable('net')
    df['node_id'] = net['walk'].get_node_ids(df['lon'], df['lat'])
    df['tmnode_id'] = net['drive'].get_node_ids(df['lon'], df['lat'])
    # fill nans -- missing bedrooms are mostly studio apts
    df['bedrooms'] = df.bedrooms.replace(np.nan, 1)
    df['neighborhood'] = df.neighborhood.replace(np.nan, '')
    return df
def zoning_lookup():
    """Zoning lookup table de-duplicated on id and indexed by it.

    The raw csv repeats rows under slightly different names so that
    string matching against parcels works; the attributes of rows that
    share an id are identical, so keeping one row per id is safe for
    running in urbansim.
    """
    lookup = pd.read_csv(os.path.join(misc.data_dir(),
                                      "zoning_lookup.csv"))
    return lookup.drop_duplicates(subset='id').set_index('id')
def verify():
    """Open the base-year store, regenerate configs/data_structure.yaml
    from its layout, and return the open store."""
    hdf_store = pd.HDFStore(
        os.path.join(misc.data_dir(), "run4032_school_v2_baseyear.h5"),
        mode="r")
    schema_yaml = verify_data_structure.yaml_from_store(hdf_store)
    with open(r"configs/data_structure.yaml", "w") as out:
        out.write(schema_yaml)
    return hdf_store
def zoning_lookup(): df = pd.read_csv(os.path.join(misc.data_dir(), "zoning_lookup.csv")) # this part is a bit strange - we do string matching on the names of zoning # in order ot link parcels and zoning and some of the strings have small # differences, so we copy the row and have different strings for the same # lookup row. for now we drop duplicates of the id field in order to run # in urbansim (all the attributes of rows that share an id are the same - # only the name is different) df = df.drop_duplicates(subset='id').set_index('id') return df
def accessibilities_segmentation():
    """Population proportions per income / auto-sufficiency / AV segment,
    returned as a single-row frame of `prop` values."""
    fname = get_logsum_file('segmentation')
    df = pd.read_csv(os.path.join(misc.data_dir(), fname))
    df['AV'] = df['hasAV'].apply(
        lambda has_av: 'AV' if has_av == 1 else 'noAV')
    df['label'] = (df['incQ_label'] + '_' + df['autoSuff_label'] +
                   '_' + df['AV'])
    df = df.groupby('label').sum()
    # share of persons in each segment
    df['prop'] = df['num_persons'] / df['num_persons'].sum()
    return df[['prop']].transpose().reset_index(drop=True)
def verify():
    """Open the semcog base data store, regenerate
    configs/data_structure.yaml from its layout, and return the open
    store."""
    hdf_store = pd.HDFStore(os.path.join(misc.data_dir(),
                                         "all_semcog_data_02-02-18.h5"),
                            mode="r")
    new = verify_data_structure.yaml_from_store(hdf_store)
    with open(r"configs/data_structure.yaml", "w") as out:
        out.write(new)
    return hdf_store
def build_networks(parcels):
    """Build the sandag OSM network, register it as the orca 'net'
    injectable, and re-register parcels with a node_id column.

    Fix: close the HDFStore after reading nodes/edges instead of leaking
    the file handle (the frames are read fully into memory).
    """
    st = pd.HDFStore(os.path.join(misc.data_dir(), "osm_sandag.h5"), "r")
    try:
        nodes, edges = st.nodes, st.edges
    finally:
        st.close()
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[["weight"]])
    net.precompute(3000)
    orca.add_injectable("net", net)
    p = parcels.to_frame(parcels.local_columns)
    p['node_id'] = net.get_node_ids(p['x'], p['y'])
    orca.add_table("parcels", p)
def build_networks(parcels):
    """Build the sandag OSM network, register it as the orca 'net'
    injectable, and re-register parcels with a node_id column.

    NOTE(review): the HDFStore is never closed here.
    """
    st = pd.HDFStore(os.path.join(misc.data_dir(), "osm_sandag.h5"), "r")
    nodes, edges = st.nodes, st.edges
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[["weight"]])
    net.precompute(3000)
    orca.add_injectable("net", net)
    p = parcels.to_frame(parcels.local_columns)
    p['node_id'] = net.get_node_ids(p['x'], p['y'])
    orca.add_table("parcels", p)
def accessibilities_segmentation():
    """Population proportions per income / auto-sufficiency / AV segment,
    returned as a single-row frame of `prop` values."""
    fname = get_logsum_file('segmentation')
    df = pd.read_csv(os.path.join(
        misc.data_dir(), fname))
    df['AV'] = df['hasAV'].apply(lambda x: 'AV' if x == 1 else 'noAV')
    df['label'] = (df['incQ_label'] + '_' + df['autoSuff_label'] + '_' +
                   df['AV'])
    df = df.groupby('label').sum()
    # share of persons in each segment
    df['prop'] = df['num_persons'] / df['num_persons'].sum()
    df = df[['prop']].transpose().reset_index(drop=True)
    return df
def zoning_baseline(parcels, zoning_lookup, settings):
    """Hybrid PBA50 baseline zoning joined to parcels by zoning_id.

    NOTE(review): `settings` is unused here.
    """
    path = os.path.join(misc.data_dir(),
                        "2020_11_05_zoning_parcels_hybrid_pba50.csv")
    print('Version of zoning_parcels: {}'.format(path))
    zoning = pd.read_csv(path, index_col="geom_id")
    zoning = pd.merge(zoning, zoning_lookup.to_frame(),
                      left_on="zoning_id", right_index=True)
    return geom_id_to_parcel_id(zoning, parcels)
def load_network(precompute=None, file_name='PugetSoundNetwork.h5'):
    """Create a pandana Network from an OSM HDF5 file.

    Parameters:
        precompute: optional iterable of distances to precompute
            aggregations for
        file_name: hdf5 file under the data dir with nodes/edges tables

    Fix: close the HDFStore after reading nodes/edges instead of leaking
    the file handle; also PEP8 spacing around `=`.
    """
    # load OSM from hdf5 file
    store = pd.HDFStore(os.path.join(misc.data_dir(), file_name), "r")
    try:
        nodes = store.nodes
        edges = store.edges
    finally:
        store.close()
    nodes.index.name = "index"  # something that Synthicity wanted to fix
    # create the network
    net = pdna.Network(nodes["x"], nodes["y"], edges["from"], edges["to"],
                       edges[["distance"]])
    if precompute is not None:
        for dist in precompute:
            net.precompute(dist)
    return net
def development_projects(parcels, settings, scenario):
    """Development-pipeline "add"/"build" projects for a scenario, cleaned
    for the developer model and keyed by parcel.

    Fix: the Python-2-only `print` statements are replaced with print()
    calls, which produce identical output under Python 2 (single
    argument) and are valid under Python 3.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "development_projects.csv"))
    df = reprocess_dev_projects(df)

    df = df[df.action.isin(["add", "build"])]

    # this filters project by scenario
    colname = "scen%s" % scenario
    # df[colname] is 1s and 0s indicating whether to include it
    # this used to be an optional filter but now I'm going to require it so
    # that we don't accidentally include all the development projects since
    # we've started using scenario-based dev projects pretty extensively
    df = df[df[colname].astype('bool')]

    df = df.dropna(subset=['geom_id'])

    for fld in [
            'residential_sqft', 'residential_price', 'non_residential_price'
    ]:
        df[fld] = 0

    df["redfin_sale_year"] = 2012  # hedonic doesn't tolerate nans
    df["stories"] = df.stories.fillna(1)
    df["building_sqft"] = df.building_sqft.fillna(0)
    df["non_residential_sqft"] = df.non_residential_sqft.fillna(0)

    # collapse specialty office codes into plain office
    df["building_type"] = df.building_type.replace("HP", "OF")
    df["building_type"] = df.building_type.replace("GV", "OF")
    df["building_type"] = df.building_type.replace("SC", "OF")

    df["building_type_id"] = \
        df.building_type.map(settings["building_type_map2"])

    df = df.dropna(subset=["geom_id"])  # need a geom_id to link to parcel_id
    df = df.dropna(subset=["year_built"])  # need a year built to get built
    df["geom_id"] = df.geom_id.astype("int")
    df = df.query('residential_units != "rent"')
    df["residential_units"] = df.residential_units.fillna(0).astype("int")

    geom_id = df.geom_id
    df = df.set_index("geom_id")
    df = geom_id_to_parcel_id(df, parcels).reset_index()  # use parcel id
    df["geom_id"] = geom_id.values  # add it back again cause it goes away

    # we don't predict prices for schools and hotels right now
    df = df.query("building_type_id <= 4 or building_type_id >= 7")

    df["deed_restricted_units"] = 0

    print("Describe of development projects")
    print(df[orca.get_table('buildings').local_columns].describe())

    return df
def zoning_baseline(parcels, zoning_lookup, settings):
    """Baseline zoning joined to parcels, with building-type code columns
    renamed to generic "typeN" names via settings["building_type_map2"]."""
    zoning = pd.read_csv(
        os.path.join(misc.data_dir(), "2015_12_21_zoning_parcels.csv"),
        index_col="geom_id")
    zoning = pd.merge(zoning, zoning_lookup.to_frame(),
                      left_on="zoning_id", right_index=True)
    zoning = geom_id_to_parcel_id(zoning, parcels)

    # e.g. {"HS": "type1", ...} - rename any matching columns
    rename = {btype: "type%d" % type_id
              for btype, type_id in settings["building_type_map2"].items()}
    zoning.columns = [rename.get(col, col) for col in zoning.columns]
    return zoning
def zoning_baseline(parcels, zoning_lookup, settings):
    """Baseline zoning joined to parcels, with building-type code columns
    renamed to generic "typeN" names via settings["building_type_map2"]."""
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "2015_12_21_zoning_parcels.csv"),
                     index_col="geom_id")
    df = pd.merge(df, zoning_lookup.to_frame(),
                  left_on="zoning_id", right_index=True)
    df = geom_id_to_parcel_id(df, parcels)

    # rename building-type code columns (e.g. "HS") to "typeN"
    d = {k: "type%d" % v for k, v in settings["building_type_map2"].items()}
    df.columns = [d.get(x, x) for x in df.columns]

    return df
def development_projects(parcels, settings, scenario):
    """Development-pipeline "add"/"build" projects for a scenario, cleaned
    for the developer model and keyed by parcel.

    NOTE(review): uses Python-2 print statements - this version requires
    Python 2.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "development_projects.csv"))
    df = reprocess_dev_projects(df)

    df = df[df.action.isin(["add", "build"])]

    # this filters project by scenario
    colname = "scen%s" % scenario
    # df[colname] is 1s and 0s indicating whether to include it
    # this used to be an optional filter but now I'm going to require it so
    # that we don't accidentally include all the development projects since
    # we've started using scenario-based dev projects pretty extensively
    df = df[df[colname].astype('bool')]

    df = df.dropna(subset=['geom_id'])

    for fld in [
            'residential_sqft', 'residential_price', 'non_residential_price'
    ]:
        df[fld] = 0

    df["redfin_sale_year"] = 2012  # hedonic doesn't tolerate nans
    df["stories"] = df.stories.fillna(1)
    df["building_sqft"] = df.building_sqft.fillna(0)
    df["non_residential_sqft"] = df.non_residential_sqft.fillna(0)

    # collapse specialty office codes into plain office
    df["building_type"] = df.building_type.replace("HP", "OF")
    df["building_type"] = df.building_type.replace("GV", "OF")
    df["building_type"] = df.building_type.replace("SC", "OF")

    df["building_type_id"] = \
        df.building_type.map(settings["building_type_map2"])

    df = df.dropna(subset=["geom_id"])  # need a geom_id to link to parcel_id
    df = df.dropna(subset=["year_built"])  # need a year built to get built
    df["geom_id"] = df.geom_id.astype("int")
    df = df.query('residential_units != "rent"')
    df["residential_units"] = df.residential_units.fillna(0).astype("int")

    geom_id = df.geom_id
    df = df.set_index("geom_id")
    df = geom_id_to_parcel_id(df, parcels).reset_index()  # use parcel id
    df["geom_id"] = geom_id.values  # add it back again cause it goes away

    # we don't predict prices for schools and hotels right now
    df = df.query("building_type_id <= 4 or building_type_id >= 7")

    df["deed_restricted_units"] = 0

    print "Describe of development projects"
    print df[orca.get_table('buildings').local_columns].describe()

    return df
def costar(store, parcels):
    """CoStar commercial rent observations (office/retail/industrial)
    matched to their nearest parcel."""
    obs = pd.read_csv(os.path.join(misc.data_dir(),
                                   '2015_08_29_costar.csv'))
    obs["PropertyType"] = obs.PropertyType.replace("General Retail",
                                                   "Retail")
    obs = obs[obs.PropertyType.isin(["Office", "Retail", "Industrial"])]
    obs["costar_rent"] = obs["Average Weighted Rent"].astype('float')
    obs["year_built"] = obs["Year Built"].fillna(1980)
    obs = obs.dropna(subset=["costar_rent", "Latitude", "Longitude"])

    # now assign parcel id
    parcel_xy = parcels.to_frame(['x', 'y']).dropna(subset=['x', 'y'])
    obs["parcel_id"] = nearest_neighbor(
        parcel_xy, obs[['Longitude', 'Latitude']])
    return obs
def costar(store, parcels):
    """CoStar commercial rent observations (office/retail/industrial)
    matched to their nearest parcel."""
    df = pd.read_csv(os.path.join(misc.data_dir(), '2015_08_29_costar.csv'))
    df["PropertyType"] = df.PropertyType.replace("General Retail", "Retail")
    # keep only the modeled non-residential types
    df = df[df.PropertyType.isin(["Office", "Retail", "Industrial"])]
    df["costar_rent"] = df["Average Weighted Rent"].astype('float')
    df["year_built"] = df["Year Built"].fillna(1980)
    df = df.dropna(subset=["costar_rent", "Latitude", "Longitude"])

    # now assign parcel id
    df["parcel_id"] = nearest_neighbor(
        parcels.to_frame(['x', 'y']).dropna(subset=['x', 'y']),
        df[['Longitude', 'Latitude']]
    )
    return df
def update_sqftproforma(default_settings, yaml_file, proforma_uses, **kwargs):
    """Rebuild a sqftproforma settings object from the proforma_uses table.

    Derives uses, residential flags, price coefficients, and per-form
    building-type mixes, overwrites them onto `default_settings` in place,
    and refreshes the proforma reference dict.

    NOTE(review): **kwargs is accepted but never referenced in the body.
    """
    # extract uses
    blduses = proforma_uses[["building_type_id", "building_type_name",
                             "is_residential"]].drop_duplicates()
    # put uses into the same order as the config file
    blduses = pd.merge(pd.DataFrame({"uses": default_settings.uses}),
                       blduses, left_on="uses",
                       right_on="building_type_name")
    # store in a dictionary
    local_settings = {}
    local_settings["uses"] = blduses.uses.values
    local_settings["residential_uses"] = blduses.is_residential
    local_settings["residential_uses"].index = blduses.building_type_id

    # get coefficient file for modeling price
    coeffile = os.path.join(
        misc.data_dir(),
        "expected_sales_unit_price_component_model_coefficients.csv")
    coefs = pd.read_csv(coeffile)
    # attach building_type_id to each coefficient row via sub_model_id
    coefs = pd.merge(coefs,
                     proforma_uses[['building_type_name',
                                    "building_type_id"]].drop_duplicates(),
                     right_on="building_type_id", left_on="sub_model_id",
                     how="left")
    local_settings["price_coefs"] = coefs

    # Assemble forms
    forms = {}
    form_glut = {}
    form_density_type = {}
    for formid in np.unique(proforma_uses.template_id):
        subuse = proforma_uses[proforma_uses.template_id == formid]
        submerge = pd.merge(blduses, subuse, on='building_type_name',
                            how="left")
        form_name = subuse.description.values[0]
        # share of building sqft per use, converted to fractions
        forms[form_name] = submerge.percent_building_sqft.fillna(0).values/100.
        form_glut[form_name] = subuse.generic_land_use_type_id.values[0]
        form_density_type[form_name] = subuse.density_type.values[0]

    # Conversion similar to sqftproforma._convert_types()
    local_settings["res_ratios"] = {}
    for form in forms.keys():
        forms[form] /= forms[form].sum()  # normalize
        # fraction of the form's sqft that is in residential uses
        local_settings["res_ratios"][form] = pd.Series(
            forms[form][np.where(local_settings["residential_uses"])]).sum()

    all_default_settings = yaml_to_dict(None, yaml_file)
    local_settings["forms"] = forms
    local_settings["forms_df"] = pd.DataFrame(
        forms, index=local_settings["uses"]).transpose()
    local_settings["form_glut"] = form_glut
    local_settings["form_density_type"] = form_density_type
    local_settings["forms_to_test"] = None
    local_settings['percent_of_max_profit'] = all_default_settings.get(
        'percent_of_max_profit', 100)

    # overwrite attributes on the default settings object in place
    pf = default_settings
    for attr in local_settings.keys():
        setattr(pf, attr, local_settings[attr])
    pf.reference_dict = sqftproforma.SqFtProFormaReference(
        **pf.__dict__).reference_dict

    pf = update_sqftproforma_reference(pf)
    return pf
def taz_geography(superdistricts):
    """TAZ geography with subregion labels pulled from superdistricts."""
    zones = pd.read_csv(os.path.join(misc.data_dir(), "taz_geography.csv"),
                        index_col="zone")
    # we want "subregion" geography on the taz_geography table;
    # join it in from the superdistricts table via the superdistrict id
    zones["subregion_id"] = superdistricts.subregion.loc[
        zones.superdistrict].values
    subregion_names = {1: "Core", 2: "Urban", 3: "Suburban", 4: "Rural"}
    zones["subregion"] = zones.subregion_id.map(subregion_names)
    return zones
def zoning_scenario(parcels_geography, scenario, settings):
    """Scenario zoning mods merged onto parcels via zoningmodcat, with a
    boolean "typeN" column per building type the mod row adds."""
    mods = pd.read_csv(
        os.path.join(misc.data_dir(), 'zoning_mods_%s.csv' % scenario),
        dtype={'jurisdiction': 'str'})

    # flag which generic "typeN" building types each mod row adds
    type_names = {k: "type%d" % v
                  for k, v in settings["building_type_map2"].items()}
    for btype, type_name in type_names.items():
        mods[type_name] = mods.add_bldg.str.contains(btype)

    merged = pd.merge(parcels_geography.to_frame().reset_index(),
                      mods, on=['zoningmodcat'], how='left')
    return merged.set_index('parcel_id')
def taz_geography(superdistricts): tg = pd.read_csv( os.path.join(misc.data_dir(), "taz_geography.csv"), index_col="zone") # we want "subregion" geography on the taz_geography table # we have to go get it from the superdistricts table and join # using the superdistrcit id tg["subregion_id"] = \ superdistricts.subregion.loc[tg.superdistrict].values tg["subregion"] = tg.subregion_id.map({ 1: "Core", 2: "Urban", 3: "Suburban", 4: "Rural" }) return tg
def demolish_events(parcels, settings, scenario):
    """Development-project records that demolish (or rebuild on) parcels,
    filtered by the scenario column when present, keyed by parcel_id.

    NOTE(review): `settings` is unused here.
    """
    projects = pd.read_csv(os.path.join(misc.data_dir(),
                                        "development_projects.csv"))
    projects = reprocess_dev_projects(projects)

    # this filters project by scenario
    if scenario in projects:
        # projects[scenario] is 1s and 0s indicating whether to include it
        projects = projects[projects[scenario].astype('bool')]

    # keep demolish and build records
    projects = projects[projects.action.isin(["demolish", "build"])]

    projects = projects.dropna(subset=['geom_id'])
    projects = projects.set_index("geom_id")
    # use parcel id
    return geom_id_to_parcel_id(projects, parcels).reset_index()
def demolish_events(parcels, settings, scenario):
    """Development-project records that demolish or rebuild, filtered by
    the scenario column when present, keyed by parcel_id.

    NOTE(review): `settings` is unused in the body.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "development_projects.csv"))
    df = reprocess_dev_projects(df)

    # this filters project by scenario
    if scenario in df:
        # df[scenario] is 1s and 0s indicating whether to include it
        df = df[df[scenario].astype('bool')]

    # keep demolish and build records
    df = df[df.action.isin(["demolish", "build"])]

    df = df.dropna(subset=['geom_id'])
    df = df.set_index("geom_id")
    df = geom_id_to_parcel_id(df, parcels).reset_index()  # use parcel id

    return df
def maz_forecast_inputs(regional_demographic_forecast):
    """MAZ household-size share inputs, with missing 2010 shares filled
    from the regional demographic forecast."""
    rdf = regional_demographic_forecast.to_frame()
    # the csv contains literal '#DIV/0!' spreadsheet errors - treat as NaN
    mfi = pd.read_csv(os.path.join(misc.data_dir(),
                                   "maz_forecast_inputs.csv"),
                      index_col='MAZ').replace('#DIV/0!', np.nan)

    # apply regional share of hh by size to MAZs with no households in 2010
    mfi.loc[mfi.shrs1_2010.isnull(),
            'shrs1_2010'] = rdf.loc[rdf.year == 2010, 'shrs1'].values[0]
    mfi.loc[mfi.shrs2_2010.isnull(),
            'shrs2_2010'] = rdf.loc[rdf.year == 2010, 'shrs2'].values[0]
    mfi.loc[mfi.shrs3_2010.isnull(),
            'shrs3_2010'] = rdf.loc[rdf.year == 2010, 'shrs3'].values[0]
    # the fourth category here is missing the 'r' in the csv
    mfi.loc[mfi.shs4_2010.isnull(),
            'shs4_2010'] = rdf.loc[rdf.year == 2010, 'shrs4'].values[0]
    mfi[['shrs1_2010', 'shrs2_2010', 'shrs3_2010',
         'shs4_2010']] = mfi[['shrs1_2010', 'shrs2_2010',
                              'shrs3_2010', 'shs4_2010']].astype('float')
    return mfi
def zoning_scenario(parcels_geography, scenario, settings):
    """Scenario zoning mods with add-/drop- building-type flag columns,
    merged onto parcels via zoningmodcat."""
    scenario_zoning = pd.read_csv(os.path.join(
        misc.data_dir(), 'zoning_mods_%s.csv' % scenario),
        dtype={'jurisdiction': 'str'})

    # map building-type codes to their generic "typeN" names
    d = {k: "type%d" % v for k, v in settings["building_type_map2"].items()}

    # flag the types each mod row adds
    for k, v in d.items():
        scenario_zoning['add-' + v] = scenario_zoning.add_bldg.str.contains(k)

    # flag the types each mod row drops (drop_bldg may be non-string/NaN,
    # hence the astype(str))
    for k, v in d.items():
        scenario_zoning['drop-'+v] = scenario_zoning.drop_bldg.\
            astype(str).str.contains(k)

    return pd.merge(parcels_geography.to_frame().reset_index(),
                    scenario_zoning,
                    on=['zoningmodcat'],
                    how='left').set_index('parcel_id')
def maz_forecast_inputs(regional_demographic_forecast):
    """MAZ household-size share inputs, with missing 2010 shares filled
    from the regional demographic forecast."""
    rdf = regional_demographic_forecast.to_frame()
    # the csv contains literal '#DIV/0!' spreadsheet errors - treat as NaN
    mfi = pd.read_csv(os.path.join(misc.data_dir(),
                                   "maz_forecast_inputs.csv"),
                      index_col='MAZ').replace('#DIV/0!', np.nan)

    # apply regional share of hh by size to MAZs with no households in 2010
    mfi.loc[mfi.shrs1_2010.isnull(),
            'shrs1_2010'] = rdf.loc[rdf.year == 2010, 'shrs1'].values[0]
    mfi.loc[mfi.shrs2_2010.isnull(),
            'shrs2_2010'] = rdf.loc[rdf.year == 2010, 'shrs2'].values[0]
    mfi.loc[mfi.shrs3_2010.isnull(),
            'shrs3_2010'] = rdf.loc[rdf.year == 2010, 'shrs3'].values[0]
    # the fourth category here is missing the 'r' in the csv
    mfi.loc[mfi.shs4_2010.isnull(),
            'shs4_2010'] = rdf.loc[rdf.year == 2010, 'shrs4'].values[0]
    mfi[['shrs1_2010', 'shrs2_2010', 'shrs3_2010',
         'shs4_2010']] = mfi[['shrs1_2010', 'shrs2_2010',
                              'shrs3_2010', 'shs4_2010']].astype('float')
    return mfi
def zoning_scenario(parcels_geography, scenario, settings):
    """Read the zoning mods for a scenario, mark which building types are
    added (1) / dropped (0) per row, and merge onto parcels_geography on
    'zoningmodcat' (indexed by parcel_id)."""
    scenario_zoning = pd.read_csv(
        os.path.join(misc.data_dir(),
                     'zoning_mods_%s.csv' % scenario))

    # start every building type column as NaN (= "no change")
    for k in settings["building_type_map"].keys():
        scenario_zoning[k] = np.nan

    def add_drop_helper(col, val):
        # set `val` for every whitespace-separated building type
        # listed in column `col`
        # fix: Series.iteritems() was removed in pandas 2.0 —
        # use .items() (consistent with the other zoning_scenario
        # variant in this file)
        for ind, item in scenario_zoning[col].items():
            if not isinstance(item, str):
                continue
            for btype in item.split():
                scenario_zoning.loc[ind, btype] = val

    add_drop_helper("add_bldg", 1)
    add_drop_helper("drop_bldg", 0)

    return pd.merge(parcels_geography.to_frame().reset_index(),
                    scenario_zoning,
                    on=['zoningmodcat'],
                    how='left').set_index('parcel_id')
def zoning_scenario(parcels_geography, scenario, settings):
    """Read the zoning mods for a scenario, mark which building types are
    added (1) / dropped (0) per row, and merge onto parcels_geography on
    'zoningmodcat' (indexed by parcel_id)."""
    scenario_zoning = pd.read_csv(
        os.path.join(misc.data_dir(),
                     'zoning_mods_%s.csv' % scenario))

    # start every building type column as NaN (= "no change")
    for k in settings["building_type_map"].keys():
        scenario_zoning[k] = np.nan

    def add_drop_helper(col, val):
        # set `val` for every whitespace-separated building type
        # listed in column `col`
        # fix: Series.iteritems() was removed in pandas 2.0 —
        # use .items() (consistent with the other zoning_scenario
        # variant in this file)
        for ind, item in scenario_zoning[col].items():
            if not isinstance(item, str):
                continue
            for btype in item.split():
                scenario_zoning.loc[ind, btype] = val

    add_drop_helper("add_bldg", 1)
    add_drop_helper("drop_bldg", 0)

    return pd.merge(parcels_geography.to_frame().reset_index(),
                    scenario_zoning,
                    on=['zoningmodcat'],
                    how='left').set_index('parcel_id')
def zoning_scenario(parcels_geography, scenario, policy, mapping):
    """Read the zoning mods for a scenario, flag added/dropped building
    types per row, and merge onto parcels_geography using the join column
    appropriate to the scenario's geography era (indexed by parcel_id).

    Also registers an orca injectable 'ppa' noting whether the zoning mods
    file carries a 'ppa_id' column.
    """
    # scenarios 11/12/15 fall back to their base scenario's zoning mods
    # (scenario - 10) unless fr2 geographies are explicitly enabled
    if (scenario in ["11", "12", "15"]) and \
            (scenario not in policy["geographies_fr2_enable"]):
        scenario = str(int(scenario) - 10)

    scenario_zoning = pd.read_csv(
        os.path.join(misc.data_dir(), 'zoning_mods_%s.csv' % scenario))

    ppa_status = ("are included"
                  if "ppa_id" in scenario_zoning.columns
                  else "are not included")
    orca.add_injectable("ppa", ppa_status)

    # start every building type column as NaN (= "no change")
    for btype in mapping["building_type_map"].keys():
        scenario_zoning[btype] = np.nan

    def _mark_types(col, flag):
        # set `flag` for every whitespace-separated building type
        # listed in column `col`
        for row_ind, cell in scenario_zoning[col].items():
            if not isinstance(cell, str):
                continue
            for btype in cell.split():
                scenario_zoning.loc[row_ind, btype] = flag

    _mark_types("add_bldg", 1)
    _mark_types("drop_bldg", 0)

    # pick the parcels_geography column to join on, by scenario era
    if scenario in policy['geographies_fb_enable']:
        join_col = 'fbpzoningmodcat'
    elif scenario in policy['geographies_db_enable']:
        join_col = 'pba50zoningmodcat'
    elif 'zoninghzcat' in scenario_zoning.columns:
        join_col = 'zoninghzcat'
    else:
        join_col = 'zoningmodcat'

    return pd.merge(parcels_geography.to_frame().reset_index(),
                    scenario_zoning,
                    on=join_col,
                    how='left').set_index('parcel_id')
def taz2_forecast_inputs(regional_demographic_forecast):
    """Load TAZ2 forecast inputs, backfilling missing 2010 shares with
    the regional values.

    TAZ2s with no households in 2010 have null share columns (or
    '#DIV/0!' spreadsheet artifacts); those get the region-wide 2010
    share instead.
    """
    t2fi = pd.read_csv(
        os.path.join(misc.data_dir(), "taz2_forecast_inputs.csv"),
        index_col='TAZ').replace('#DIV/0!', np.nan)
    rdf = regional_demographic_forecast.to_frame()

    regional_2010 = rdf.loc[rdf.year == 2010]
    # shrw* = hh by number of workers, shra* = persons by age category,
    # shrn/shry = hh without/with children
    share_cols = ['shrw0', 'shrw1', 'shrw2', 'shrw3',
                  'shra1', 'shra2', 'shra3', 'shra4',
                  'shrn', 'shry']

    t2_cols = []
    for col in share_cols:
        t2_col = col + '_2010'
        t2_cols.append(t2_col)
        # fall back to the regional share where the TAZ2 value is missing
        t2fi.loc[t2fi[t2_col].isnull(), t2_col] = \
            regional_2010[col].values[0]

    t2fi[t2_cols] = t2fi[t2_cols].astype('float')
    return t2fi
def taz2_forecast_inputs(regional_demographic_forecast):
    """TAZ2 forecast inputs with regional 2010 fallbacks.

    Null shares (including '#DIV/0!' spreadsheet artifacts) mark TAZ2s
    with no households in 2010; these are filled from the regional
    demographic forecast's 2010 row.
    """
    t2fi = pd.read_csv(
        os.path.join(misc.data_dir(), "taz2_forecast_inputs.csv"),
        index_col='TAZ').replace('#DIV/0!', np.nan)
    rdf = regional_demographic_forecast.to_frame()
    base_year = rdf[rdf.year == 2010]

    def _fill_from_region(regional_col):
        # backfill <regional_col>_2010 where missing and return the
        # TAZ2 column name
        taz2_col = regional_col + '_2010'
        missing = t2fi[taz2_col].isnull()
        t2fi.loc[missing, taz2_col] = base_year[regional_col].values[0]
        return taz2_col

    filled = []
    # hh by number of workers
    filled += [_fill_from_region(c)
               for c in ('shrw0', 'shrw1', 'shrw2', 'shrw3')]
    # persons by age category
    filled += [_fill_from_region(c)
               for c in ('shra1', 'shra2', 'shra3', 'shra4')]
    # hh by presence of children (n = without, y = with)
    filled += [_fill_from_region(c) for c in ('shrn', 'shry')]

    t2fi[filled] = t2fi[filled].astype('float')
    return t2fi
def get_dev_projects_table(scenario, parcels):
    """Load the development projects list, filter it to the given
    scenario, drop/report projects whose geom_id is not in `parcels`,
    and attach parcel ids via geom_id_to_parcel_id.

    Returns the filtered projects with a fresh integer index, a
    'parcel_id' column from the geom_id mapping, and the original
    'geom_id' column restored.
    """
    df = pd.read_csv(os.path.join(misc.data_dir(),
                                  "development_projects.csv"))
    df = reprocess_dev_projects(df)

    # this filters projects by scenario: the scenario column (if present)
    # holds 1s and 0s indicating whether to include each project
    if scenario in df:
        df = df[df[scenario].astype('bool')]

    df = df.dropna(subset=['geom_id'])

    cnts = df.geom_id.isin(parcels.geom_id).value_counts()
    if False in cnts.index:
        # fix: was a Python 2 print statement; the rest of the file
        # uses the print() function
        print("%d MISSING GEOMIDS!" % cnts.loc[False])
    df = df[df.geom_id.isin(parcels.geom_id)]

    geom_id = df.geom_id  # save for later
    df = df.set_index("geom_id")
    df = geom_id_to_parcel_id(df, parcels).reset_index()  # use parcel id
    df["geom_id"] = geom_id.values  # add it back again cause it goes away

    return df