Esempio n. 1
0
    # Reformat the projections table into long format: one row per
    # (district, school_year), with separate PS and IS enrollment columns.
    # NOTE(review): ps_ / is_ appear to be collections of grade labels used to
    # split the 'projected' column into primary- vs intermediate-school rows
    # -- confirm against the enclosing function (not visible here).
    df_ps = projections[projections.projected.isin(ps_)].drop(columns=['projected'])\
                                                        .groupby('district')\
                                                        .sum().reset_index()\
                                                        .melt('district', var_name='school_year', value_name='ps')
    df_is = projections[projections.projected.isin(is_)].drop(columns=['projected'])\
                                                        .groupby('district')\
                                                        .sum().reset_index()\
                                                        .melt('district', var_name='school_year', value_name='is')

    # Combine the PS and IS long tables on (district, school_year).
    projections = pd.merge(df_ps, df_is, on=['district', 'school_year'])

    # Reformat the subdistrict percentage table: pivot 'level' (MS/PS) into
    # per-row multiplier columns, filling levels that are absent with 0.
    pct['multiplier'] = pct.multiplier.astype(float)
    pct = pct.groupby(['district', 'subdistrict', 'level'])\
             .multiplier.sum().unstack(fill_value=0).reset_index()\
             .rename(columns={'MS':'is_multiplier', 'PS':'ps_multiplier'})

    # Merge the two tables and perform column transformations.
    df = pd.merge(pct, projections, how='outer', on=['district'])\
           .sort_values(by=['district','subdistrict'])
    # Keep only the first 4 characters of school_year (presumably a
    # "YYYY-YY"-style label; keeps the starting year -- TODO confirm format).
    df['school_year'] = df.school_year.apply(lambda x: x[:4])
    # Scale district-level projections down to subdistrict level.
    df['ps'] = df['ps'] * df.ps_multiplier
    df['is'] = df['is'] * df.is_multiplier
    # Round up to whole students. NOTE(review): math.ceil raises on NaN, so
    # this assumes the outer merge produced no missing ps/is values -- verify.
    df['ps'] = df['ps'].apply(lambda x: math.ceil(x)).astype(int)
    df['is'] = df['is'].apply(lambda x: math.ceil(x)).astype(int)

    # export table to EDM_DATA
    exporter(df=df, output_table=output_table, DDL=DDL)
Esempio n. 2
0
    # Resolve configured input/output locations for this pipeline step.
    input_table = config['inputs'][0]
    input_boundary = config['inputs'][1]
    output_table = config['outputs'][0]['output_table']
    DDL = config['outputs'][0]['DDL']

    # Select every feature whose geometry is contained in, or intersects,
    # the union of the boundary table's geometries.
    # NOTE(review): ST_Contains(b, a) implies ST_Intersects(b, a), so the OR
    # makes the first predicate redundant -- confirm before simplifying.
    import_sql = f'''
            SELECT *, municipality_desc AS borough, wkb_geometry AS geom
            FROM {input_table} c
            WHERE wkb_geometry IS NOT NULL AND
            c.ogc_fid IN (
                SELECT a.ogc_fid FROM
                {input_table} a, (
                    SELECT ST_Union(wkb_geometry) As wkb_geometry
                    FROM {input_boundary}
                ) b
                WHERE ST_Contains(b.wkb_geometry, a.wkb_geometry)
                OR ST_Intersects(b.wkb_geometry, a.wkb_geometry)
            );
    '''
    # Import the selection as a GeoDataFrame, parsing 'geom' as the geometry
    # column.
    df = gpd.GeoDataFrame.from_postgis(import_sql,
                                       con=recipe_engine,
                                       geom_col='geom')

    os.system('echo "exporting table ..."')
    # Export table to EDM_DATA. NOTE(review): '~' separator presumably avoids
    # delimiter clashes with the data -- confirm exporter semantics.
    exporter(df=df,
             output_table=output_table,
             DDL=DDL,
             sep='~',
             geo_column='geom')
Esempio n. 3
0
    # Resolve configuration and input/output locations.
    config = load_config(Path(__file__).parent / 'config.json')
    input_table = config['inputs'][0]  #ctpp_journey_to_work
    output_table = config['outputs'][0][
        'output_table']  #ctpp_censustract_lookup

    DDL = config['outputs'][0]['DDL']

    # Load the raw tract-to-tract journey-to-work dataset.
    df = pd.read_sql(f'''
                    SELECT res_tract AS residential_geoid, 
                            work_tract AS work_geoid, mode, 
                            "totwork_16+" AS count, 
                            standard_error, 
                            workplace_state_county 
                    FROM {input_table}
                    ''',
                     con=recipe_engine)

    # Filter out records with workplaces outside geo_list.
    # NOTE(review): geo_list appears to be a module-level whitelist of
    # state+county codes -- confirm where it is defined.
    df = df[df['workplace_state_county'].isin(geo_list)]

    # Map the mode code to its detailed definition; unknown codes become ''.
    df['mode'] = df['mode'].apply(lambda x: MODE.get(x, ''))
    df['count'] = df['count'].astype('int64')
    # 'double' is the NumPy alias for float64.
    df['standard_error'] = df['standard_error'].astype('double')
    # Duplicate the mode column under an upper-case name.
    # NOTE(review): presumably required by the output DDL -- confirm.
    df['MODE'] = df['mode']

    os.system('echo "exporting table ..."')
    # export to EDM_DATA
    exporter(df, output_table, DDL, sep='|')
Esempio n. 4
0
        # Prepend a leading zero to bare-decimal strings ('.5' -> '0.5') so
        # they parse cleanly as floats downstream.
        if num[0] == '.':
            return '0' + num
        else:
            return num
    # NOTE(review): bare except silently returns the input unchanged on any
    # failure (e.g. empty string, NaN/float input) -- consider catching
    # (IndexError, TypeError) explicitly.
    except:
        return num


if __name__ == "__main__":
    # Resolve configured input/output locations for this pipeline step.
    config = load_config(Path(__file__).parent / 'config.json')
    input_table = config['inputs'][0]
    out_cfg = config['outputs'][0]
    output_table = out_cfg['output_table']
    DDL = out_cfg['DDL']

    # Pull the raw estimates from the recipe database.
    query = f'''SELECT geoid, 
                        lineno AS variable, 
                        est AS value, moe 
                        FROM {input_table}'''
    df = pd.read_sql(query, con=recipe_engine)

    # Clean 'value': strip thousands separators, then cast to int.
    df['value'] = df['value'].apply(lambda v: v.replace(',', '')).astype(int)

    # Clean 'moe': strip separators and the '+/-' prefix, blank out
    # whitespace-only strings, normalise bare-decimal strings via add_zero,
    # then cast to float.
    moe = df['moe'].apply(lambda v: v.replace(',', '').replace('+/-', ''))
    moe = moe.replace(r'^\s*$', np.nan, regex=True)
    df['moe'] = moe.apply(add_zero).astype(float)

    # conduct data etl
    df = etl(df)

    # export to EDM_DATA
    exporter(df, output_table, DDL)
Esempio n. 5
0
                                       con=recipe_engine,
                                       geom_col='wkb_geometry')

    # Merge the 3 state shapefiles together.
    # NOTE(review): ct/ny/nj are presumably per-state census-tract
    # GeoDataFrames loaded just above this fragment -- confirm. Also note
    # .append is deprecated in newer pandas in favour of pd.concat.
    ct_shp = ct.append(ny).append(nj)

    # Calculate the centroid for each census tract.
    ct_shp['centroid'] = ct_shp['wkb_geometry'].centroid

    # Rename the geoid column to match the lookup key used below.
    ct_shp.rename(columns={'GEOID': 'geoid'}, inplace=True)

    # Load the residential/work census-tract pairs for journey-to-work trips.
    df = pd.read_sql(f'''
                    SELECT residential_geoid, work_geoid
                    FROM {ctpp_journey_to_work}
                    ''',
                     con=edm_engine)

    # Find the unique census tracts across residential_geoid and work_geoid.
    geoid_list = pd.concat([df.residential_geoid,
                            df.work_geoid]).unique().astype('str')

    # Turn the unique census-tract list into a dataframe.
    geoid_df = pd.DataFrame({'geoid': geoid_list})

    # Merge the journey-to-work census-tract list with the shapefile
    # centroids (inner join: tracts without a shapefile match are dropped).
    df_geo = pd.merge(geoid_df, ct_shp[['geoid', 'centroid']], on='geoid')
    # Serialise each centroid to WKT text for export.
    # NOTE(review): loads(dumps(x)) looks like a shapely round-trip used to
    # normalise the geometry -- confirm which loads/dumps pair is imported.
    df_geo['geom'] = df_geo['centroid'].apply(lambda x: loads(dumps(x)).wkt)

    exporter(df_geo, output_table, DDL=DDL, geo_column='geom')
Esempio n. 6
0
    # Geocode all records in parallel; chunksize 10000 amortises IPC overhead.
    with Pool(processes=cpu_count()) as pool:
        it = pool.map(geocode, records, 10000)
    
    df = pd.DataFrame(it)
    # Stable per-row fingerprint: md5 of the fully stringified record.
    df['uid'] = df.apply(lambda x: hashlib.md5(x.to_string().encode()).hexdigest(), axis=1)
    # Coerce coordinates to numeric; unparseable values become NaN.
    df['geo_longitude'] = pd.to_numeric(df['geo_longitude'], errors='coerce')
    df['geo_latitude'] = pd.to_numeric(df['geo_latitude'], errors='coerce')
    df = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.geo_longitude, df.geo_latitude))
    # WKT-style string for export; NaN coordinates -> no geometry (None).
    # NOTE(review): x.xy[0] is a one-element coordinate sequence, so the
    # np.isnan truth test works only because its length is 1 -- fragile.
    df['geom'] = df['geometry'].apply(lambda x: None if np.isnan(x.xy[0]) else str(x))

    # A record counts as a geocoding reject only when every coordinate source
    # (geocoded geom, x-coord, from-x-coord, xy-coord) is empty or missing.
    geo_rejects = df[(df['geom'].isnull())&(df['geo_x_coord']=='')&(df['geo_from_x_coord'].isnull())&(df['geo_xy_coord']=='')]
    print('Percent of records geocoded: ', (len(df)-len(geo_rejects))/len(df))

    # Export unfiltered table to EDM_DATA
    exporter(df=df, 
            output_table=output_table_all, 
            con=edm_engine,
            geo_column='geom', 
            DDL=DDL,
            sql=get_sql(output_table_all, output_table_schema_all))

    # Remove special ed cases: drop district '75' rows and PK/3K org levels.
    df_filtered = df[(df['district']!='75')&(df.org_level!='PK')&(df.org_level!='3K')]

    # Export filtered table to EDM_DATA
    exporter(df=df_filtered, 
            output_table=output_table, 
            con=edm_engine,
            geo_column='geom', 
            DDL=DDL,
            sql=get_sql(output_table, output_table_schema))
Esempio n. 7
0
    # Import capacity-project rows; '~*' is Postgres' case-insensitive
    # POSIX-regex match.
    df = pd.read_sql(f'''
        select * from {input_table}
        where type ~* 'Capacity Projects' 
        ''', con=recipe_engine)

    # Perform column transformation: normalise identifier/name columns.
    df = df.rename(columns={'projectid': 'project_dsf', 
                            'schoolname': 'name'})

    # Derive organisation level from the school name, and a whole-number
    # capacity (missing capacities default to 0).
    df['org_level'] = df['name'].apply(guess_org_level)
    df['capacity'] = df['forecastcapacity'].apply(get_capacity_number).fillna(0).astype(int)
    # Estimated share of capacity by primary / intermediate / high school.
    df['pct_ps'] = df['org_level'].apply(estimate_pct_ps)
    df['pct_is'] = df['org_level'].apply(estimate_pct_is)
    df['pct_hs'] = df['org_level'].apply(estimate_pct_hs)
    # Flag rows whose level is the ambiguous 'PSIS' value
    # (split presumably estimated rather than known -- confirm).
    df['guessed_pct'] = df['org_level'].apply(lambda x: True if x == 'PSIS' else False)
    df['start_date'] = df['constrstart'].apply(get_date)
    df['planned_end_date'] = df['actualestcompletion'].apply(get_date)
    df['total_est_cost'] = df['totalestcost'].apply(get_cost_number)
    df['funding_current_budget'] = df['fundingreqd'].apply(get_fund_number)
    df['funding_previous'] = df['previousappropriations'].apply(get_fund_number)
    # Funded fraction = (previous + current budget) / total estimated cost.
    # NOTE(review): no guard for zero/NaN total_est_cost -- a zero-cost row
    # will raise or yield inf; confirm get_cost_number's output range.
    df['pct_funded'] = df.apply(lambda row: (row['funding_previous']\
                                +row['funding_current_budget'])\
                                    /row['total_est_cost'], axis=1)  

    # export table to EDM_DATA
    exporter(df=df, 
            output_table=output_table, 
            con=edm_engine, 
            DDL=DDL)