# reformat the projections table
def _reshape_projection(level_codes, value_name):
    """Collapse the projected rows for one school level into a long-format
    frame of (district, school_year, <value_name>)."""
    return projections[projections.projected.isin(level_codes)]\
        .drop(columns=['projected'])\
        .groupby('district')\
        .sum().reset_index()\
        .melt('district', var_name='school_year', value_name=value_name)

# the PS and IS reshapes were duplicated verbatim; share one helper
df_ps = _reshape_projection(ps_, 'ps')
df_is = _reshape_projection(is_, 'is')
projections = pd.merge(df_ps, df_is, on=['district', 'school_year'])

# reformat the subdistrict percentage table
pct['multiplier'] = pct.multiplier.astype(float)
pct = pct.groupby(['district', 'subdistrict', 'level'])\
    .multiplier.sum().unstack(fill_value=0).reset_index()\
    .rename(columns={'MS': 'is_multiplier', 'PS': 'ps_multiplier'})

# merge two tables and perform column transformation
df = pd.merge(pct, projections, how='outer', on=['district'])\
    .sort_values(by=['district', 'subdistrict'])

# keep only the starting year of the school_year label
df['school_year'] = df.school_year.apply(lambda x: x[:4])

# scale the district-level projections down to subdistrict level
df['ps'] = df['ps'] * df.ps_multiplier
df['is'] = df['is'] * df.is_multiplier

# NOTE(review): the outer merge can leave NaN in ps/is for districts present
# in only one table, and math.ceil(NaN) raises — assumes both tables cover
# the same districts; confirm upstream.
df['ps'] = df['ps'].apply(lambda x: math.ceil(x)).astype(int)
df['is'] = df['is'].apply(lambda x: math.ceil(x)).astype(int)

# export table to EDM_DATA
exporter(df=df, output_table=output_table, DDL=DDL)
# resolve input/output locations from the pipeline config
input_table = config['inputs'][0]
input_boundary = config['inputs'][1]
output_table = config['outputs'][0]['output_table']
DDL = config['outputs'][0]['DDL']

# Select every feature of the input table whose geometry touches or lies
# inside the union of the boundary table's geometries; rows without a
# geometry are dropped up front.
# NOTE(review): ST_Contains(b, a) implies ST_Intersects(b, a), so the
# ST_Contains test looks redundant — confirm before simplifying the SQL.
import_sql = f'''
    SELECT *,
        municipality_desc AS borough,
        wkb_geometry AS geom
    FROM {input_table} c
    WHERE wkb_geometry IS NOT NULL
    AND c.ogc_fid IN (
        SELECT a.ogc_fid
        FROM {input_table} a, (
            SELECT ST_Union(wkb_geometry) As wkb_geometry
            FROM {input_boundary}
        ) b
        WHERE ST_Contains(b.wkb_geometry, a.wkb_geometry)
        OR ST_Intersects(b.wkb_geometry, a.wkb_geometry)
    );
'''

# import data (geometry arrives in the 'geom' column)
df = gpd.GeoDataFrame.from_postgis(import_sql,
                                   con=recipe_engine,
                                   geom_col='geom')

os.system('echo "exporting table ..."')

# export table to EDM_DATA
exporter(df=df,
         output_table=output_table,
         DDL=DDL,
         sep='~',
         geo_column='geom')
# resolve pipeline configuration
config = load_config(Path(__file__).parent / 'config.json')
input_table = config['inputs'][0]  # ctpp_journey_to_work
output_table = config['outputs'][0]['output_table']  # ctpp_censustract_lookup
DDL = config['outputs'][0]['DDL']

# pull the raw journey-to-work records, renaming columns to the target schema
raw_query = f'''
    SELECT res_tract AS residential_geoid,
           work_tract AS work_geoid,
           mode,
           "totwork_16+" AS count,
           standard_error,
           workplace_state_county
    FROM {input_table}
'''
df = pd.read_sql(raw_query, con=recipe_engine)

# keep only records whose workplace county is inside the geography of interest
in_scope = df['workplace_state_county'].isin(geo_list)
df = df[in_scope]

# translate the mode code into its detailed definition (unknown codes -> '')
df['mode'] = df['mode'].apply(lambda x: MODE.get(x, ''))

# enforce the numeric dtypes expected downstream
df = df.astype({'count': 'int64', 'standard_error': 'double'})

# duplicate the mode column under its uppercase name
df['MODE'] = df['mode']

os.system('echo "exporting table ..."')

# export to EDM_DATA
exporter(df, output_table, DDL, sep='|')
        # a bare decimal fraction like ".5" needs a leading zero so the
        # later float() cast parses it consistently
        if num[0] == '.':
            return '0' + num
        else:
            return num
    # NOTE(review): bare except returns the input unchanged on ANY error
    # (e.g. NaN input has no [0]) — presumably deliberate best-effort;
    # confirm, then narrow to the specific exception types.
    except:
        return num


if __name__ == "__main__":
    # resolve pipeline configuration
    config = load_config(Path(__file__).parent / 'config.json')
    input_table = config['inputs'][0]
    output_table = config['outputs'][0]['output_table']
    DDL = config['outputs'][0]['DDL']

    # load geoid / variable / value / moe from the raw table
    df = pd.read_sql(f'''SELECT geoid, lineno AS variable, est AS value, moe
                         FROM {input_table}''', con=recipe_engine)

    # clean value and moe: strip thousands separators and the '+/-' prefix;
    # blank moe becomes NaN; add_zero pads bare ".x" fractions before the cast
    df['value'] = df['value'].apply(lambda x: x.replace(',', '')).astype(int)
    df['moe'] = df['moe'].apply(lambda x: x.replace(',', '').replace('+/-', ''))\
        .replace(r'^\s*$', np.nan, regex=True).apply(add_zero).astype(float)

    # conduct data etl
    df = etl(df)

    # export to EDM_DATA
    exporter(df, output_table, DDL)
con=recipe_engine, geom_col='wkb_geometry') # merge the 3 states shapefile together ct_shp = ct.append(ny).append(nj) # calculate the centroid for each census tract ct_shp['centroid'] = ct_shp['wkb_geometry'].centroid # rename the geoid column ct_shp.rename(columns={'GEOID': 'geoid'}, inplace=True) df = pd.read_sql(f''' SELECT residential_geoid, work_geoid FROM {ctpp_journey_to_work} ''', con=edm_engine) # find out the unique census tracts between residential_geoid and work_geoid geoid_list = pd.concat([df.residential_geoid, df.work_geoid]).unique().astype('str') # turn the unique census tract list into a dataframe geoid_df = pd.DataFrame({'geoid': geoid_list}) # merge the journey to work census tract list with shapefile df_geo = pd.merge(geoid_df, ct_shp[['geoid', 'centroid']], on='geoid') df_geo['geom'] = df_geo['centroid'].apply(lambda x: loads(dumps(x)).wkt) exporter(df_geo, output_table, DDL=DDL, geo_column='geom')
# geocode all records in parallel; the third argument to Pool.map is the
# chunksize (10000 records per task) to keep inter-process overhead low
with Pool(processes=cpu_count()) as pool:
    it = pool.map(geocode, records, 10000)

df = pd.DataFrame(it)

# stable per-row fingerprint derived from the full record contents
df['uid'] = df.apply(lambda x: hashlib.md5(x.to_string().encode()).hexdigest(), axis=1)

# non-numeric coordinates coerce to NaN so failed geocodes can be detected
df['geo_longitude'] = pd.to_numeric(df['geo_longitude'], errors='coerce')
df['geo_latitude'] = pd.to_numeric(df['geo_latitude'], errors='coerce')
df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.geo_longitude, df.geo_latitude))

# WKT string for export; a point built from a NaN longitude becomes NULL
df['geom'] = df['geometry'].apply(lambda x: None if np.isnan(x.xy[0]) else str(x))

# a record counts as a reject only when every geocoding output came back empty
geo_rejects = df[(df['geom'].isnull())&(df['geo_x_coord']=='')&(df['geo_from_x_coord'].isnull())&(df['geo_xy_coord']=='')]
print('Percent of records geocoded: ', (len(df)-len(geo_rejects))/len(df))

# Export unfiltered table to EDM_DATA
exporter(df=df, output_table=output_table_all, con=edm_engine,
         geo_column='geom', DDL=DDL,
         sql=get_sql(output_table_all, output_table_schema_all))

# Remove special ed cases
# NOTE(review): district '75' and org levels 'PK'/'3K' are excluded here —
# the special-ed semantics of these codes are inferred from the comment
# above; confirm against the source data dictionary.
df_filtered = df[(df['district']!='75')&(df.org_level!='PK')&(df.org_level!='3K')]

# Export filtered table to EDM_DATA
exporter(df=df_filtered, output_table=output_table, con=edm_engine,
         geo_column='geom', DDL=DDL,
         sql=get_sql(output_table, output_table_schema))
# import data: case-insensitive match on capacity projects only
df = pd.read_sql(f'''
    select * from {input_table}
    where type ~* 'Capacity Projects'
''', con=recipe_engine)

# perform column transformation
df = df.rename(columns={'projectid': 'project_dsf', 'schoolname': 'name'})

# derive the school organization level and the PS/IS/HS capacity splits
df['org_level'] = df['name'].apply(guess_org_level)
df['capacity'] = df['forecastcapacity'].apply(get_capacity_number).fillna(0).astype(int)
df['pct_ps'] = df['org_level'].apply(estimate_pct_ps)
df['pct_is'] = df['org_level'].apply(estimate_pct_is)
df['pct_hs'] = df['org_level'].apply(estimate_pct_hs)

# flag rows whose split percentages had to be guessed (org_level == 'PSIS');
# direct comparison replaces the redundant `True if ... else False` lambda
df['guessed_pct'] = df['org_level'] == 'PSIS'

# parse schedule and funding fields
df['start_date'] = df['constrstart'].apply(get_date)
df['planned_end_date'] = df['actualestcompletion'].apply(get_date)
df['total_est_cost'] = df['totalestcost'].apply(get_cost_number)
df['funding_current_budget'] = df['fundingreqd'].apply(get_fund_number)
df['funding_previous'] = df['previousappropriations'].apply(get_fund_number)

# share of the estimated cost that is already funded; guard against a zero
# total cost, which previously produced a division-by-zero — report NaN
# for such rows instead (NaN totals still propagate to NaN as before)
df['pct_funded'] = df.apply(
    lambda row: (row['funding_previous'] + row['funding_current_budget'])
                / row['total_est_cost'] if row['total_est_cost'] else float('nan'),
    axis=1)

# export table to EDM_DATA
exporter(df=df, output_table=output_table, con=edm_engine, DDL=DDL)