Ejemplo n.º 1
0
    df = pd.concat([process_df(file) for file in file_list])
    df.to_csv(bls_data_dict['appended zipcode'] + "appended_zip.csv", index = False)

def filter_bls_data(df, address_df):
    """
    Restrict BLS rows to zipcodes that appear in the parsed address data and
    attach the matching city.

    :param df: BLS dataframe with a 'zip' column (int or string)
    :param address_df: address dataframe with 'parsed_city' and 'parsed_addr_zip' columns
    :return: df inner-merged with the unique (parsed_city, parsed_addr_zip) pairs;
             rows whose zip is absent from address_df are dropped
    """
    # zero-pad zipcode to 5 digits (fix: the previous version prepended a stray
    # "_" here, which made every zip fail to match parsed_addr_zip in the merge —
    # elsewhere in this file zip merges against parsed_addr_zip with no prefix)
    df['zip'] = df['zip'].astype(str).str.pad(5, side="left", fillchar="0")
    # merge city from address data; inner join keeps only zips with a known city
    df = df.merge(address_df[['parsed_city', 'parsed_addr_zip']].drop_duplicates(),
                  how="inner", left_on="zip", right_on="parsed_addr_zip")
    # QC printouts: city coverage, plus zip counts per city
    print(df['parsed_city'].value_counts())
    print(df.groupby(['parsed_city']).agg(
        **{"num_zip": ('zip', 'count'), "num_unique_zip": ('zip', 'nunique')}))
    return df

if __name__ == "__main__":
    # process_raw_data()
    data_dict = make_data_dict(use_seagate=True)
    city_dict = {
        "stl": "",
        "sf": "^san francisco$",
        "seattle": "^seattle$",
        "sd": "^san diego$",
        "chicago" : "^chicago$",
        "baton_rouge": "^baton rouge$",
        "la": "^los angeles$",
        'philly': ""
    }
    def filter_df(df, city):
        """Keep rows whose parsed_city matches the `city` regex; NaN cities are
        treated as "" (so an empty pattern keeps every row)."""
        matches_city = df['parsed_city'].fillna("").str.contains(city)
        return df[matches_city]
    id. startDate endDate type dataframes

Main function takes in the cleaned business dataframe, makes misc business vars, converts the business dataframe into a panel, and
writes the business dataframe to csv
"""
import pandas as pd
import numpy as np
import math
import re
from helper_functions import write_to_log, WTL_TIME, fuzzy_merge, get_nearest_address, make_panel
from data_constants import make_data_dict, filePrefix
from name_parsing import parse_business
from clean_address_data import parallelize_dataframe
from typing import Union

data_dict = make_data_dict(use_seagate=False)


# function that determines if variable is a chain in a given year
def make_chain_var(df,
                   name_col='business_id',
                   time_col='year',
                   loc_col='num_locations'):
    """
    Function takes in a dataframe with a name and time column and returns columns containing:
        the number of observations with the same name and an indicator variable if the number of observations is
        greater than the threshold
    :param df: dataframe
    :param name_col: string type column of names
    :param time_col: usually is something like year, but technically can be any second variable to group on
    :param loc_col: name of num_observations column to be made
Ejemplo n.º 3
0
def make_qc_aggs(bls_df: pd.DataFrame, city: str, make_naics_aggs=False):
    """
    Write QC aggregates comparing BLS establishment counts against the scraped
    business-location counts for one city.

    Outputs (written under filePrefix + "/qc/"):
        bls_{city}_agg.csv            city x year comparison
        bls_{city}_zip_agg.csv        city x zip x year comparison
        bls_{city}_naics_agg.csv      city x naics x year (only if make_naics_aggs)
        bls_{city}_zip_naics_agg.csv  city x zip x naics x year (only if make_naics_aggs)

    :param bls_df: BLS dataframe with parsed_city, zip, year and est columns
        (plus a naics column when make_naics_aggs is truthy)
    :param city: key into the data dict; also embedded in the output filenames
    :param make_naics_aggs: when truthy, additionally produce naics-level aggregates
    """
    data_dict = make_data_dict(use_seagate=True)
    # Only request the naics column when it is actually needed. Fix: the
    # previous version grouped bus_df on 'naics' without ever loading it,
    # which raised a KeyError whenever make_naics_aggs was set.
    # (assumes business_locations.csv contains a 'naics' column — TODO confirm)
    usecols = ["year", "parsed_city", "parsed_addr_zip", "is_business",
               "cleaned_business_name", "cleaned_dba_name",
               "primary_cleaned_fullAddress"]
    if make_naics_aggs:
        usecols.append("naics")
    bus_df = pd.read_csv(
        data_dict['final'][city]['business_location'] + "business_locations.csv",
        usecols=usecols
    ).drop_duplicates(subset=["cleaned_business_name", "cleaned_dba_name",
                              "primary_cleaned_fullAddress", "year"])
    # row id used as a count key in the groupbys below
    bus_df = bus_df.assign(index=np.arange(bus_df.shape[0]))
    # drop records classified as individual people rather than businesses
    bus_df = bus_df[bus_df['is_business'] != "person"]
    # restrict BLS data to the cities and years covered by the business data
    bls_df = bls_df[(bls_df['parsed_city'].isin(bus_df['parsed_city']))
                    & (bls_df['year'].isin(bus_df['year']))]

    # city x year comparison
    bls_city_agg = (bls_df.groupby(['parsed_city', 'year'])
                    .agg(**{"num_establishments": ('est', 'sum')})
                    .reset_index())
    bus_df_city_agg = (bus_df.groupby(['parsed_city', 'year'])
                       .agg(**{"num_establishments": ('index', 'count')})
                       .reset_index())
    city_agg = pd.merge(bls_city_agg,
                        bus_df_city_agg,
                        how="outer",
                        suffixes=["_bls", "_business_loc"],
                        on=["parsed_city", "year"])

    # city x zip x year comparison (the zip column is named differently
    # in the two sources, hence left_on/right_on)
    bls_city_zip_agg = (bls_df.groupby(['parsed_city', 'zip', 'year'])
                        .agg(**{"num_establishments": ('est', 'sum')})
                        .reset_index())
    bus_df_city_zip_agg = (bus_df.groupby(['parsed_city', "parsed_addr_zip", 'year'])
                           .agg(**{"num_establishments": ('index', 'count')})
                           .reset_index())
    city_zip_agg = pd.merge(
        bls_city_zip_agg,
        bus_df_city_zip_agg,
        how="outer",
        suffixes=["_bls", "_business_loc"],
        left_on=["parsed_city", "year", 'zip'],
        right_on=["parsed_city", "year", 'parsed_addr_zip'])

    if make_naics_aggs:
        # city x naics x year comparison (merge joins on the named index levels)
        bls_city_naics_agg = (bls_df.groupby(['parsed_city', "naics", 'year'])
                              .agg(**{"num_establishments": ('est', 'sum')}))
        bus_df_city_naics_agg = (bus_df.groupby(['parsed_city', 'naics', 'year'])
                                 .agg(**{"num_establishments": 'size'}))
        city_naics_agg = pd.merge(bls_city_naics_agg,
                                  bus_df_city_naics_agg,
                                  how="outer",
                                  suffixes=["_bls", "_business_loc"],
                                  on=["parsed_city", "year", "naics"])
        # city x zip x naics x year comparison
        bls_city_zip_naics_agg = (bls_df.groupby(['parsed_city', 'zip', "naics", 'year'])
                                  .agg(**{"num_establishments": ('est', 'sum')}))
        bus_df_city_zip_naics_agg = (bus_df.groupby(
            ['parsed_city', "parsed_addr_zip", 'naics', 'year'])
            .agg(**{"num_establishments": 'size'}))
        city_zip_naics_agg = pd.merge(
            bls_city_zip_naics_agg,
            bus_df_city_zip_naics_agg,
            how="outer",
            suffixes=["_bls", "_business_loc"],
            left_on=["parsed_city", "year", "naics", 'zip'],
            right_on=["parsed_city", "year", "naics", 'parsed_addr_zip'])
        city_naics_agg.to_csv(filePrefix + f"/qc/bls_{city}_naics_agg.csv",
                              index=False)
        city_zip_naics_agg.to_csv(filePrefix + f"/qc/bls_{city}_zip_naics_agg.csv",
                                  index=False)

    city_agg.to_csv(filePrefix + f"/qc/bls_{city}_agg.csv", index=False)
    city_zip_agg.to_csv(filePrefix + f"/qc/bls_{city}_zip_agg.csv", index=False)