def test_compute_moving_avg_from_daily_data():
    """Moving-average column equals a hand-computed mean of the first window.

    Pulls the vaccination dataset from the Socrata API, runs
    compute_moving_avg_from_daily_data over it, then checks one zip code
    (60637): the value in the generated moving-average column at the end of
    the first full window must equal the plain mean of that window's daily
    values.
    """
    daily_vacc_data = soda_data.VACCINATION_DATA_OBJ
    response = socrata_api_requests.SocrataAPIClient(
        daily_vacc_data.request_url)
    daily_data_df = response.data_df
    col_to_avg = 'total_doses_daily'
    data_transformations.compute_moving_avg_from_daily_data(
        daily_data_df, 'zip_code', 'date', [col_to_avg])

    zip_rows = daily_data_df.loc[daily_data_df['zip_code'] == "60637"]
    # Expected value: plain mean over the first full window of daily values.
    true_avg = statistics.mean(
        zip_rows[col_to_avg][:data_transformations.MOVING_AVG_WINDOW])
    # The first complete window's average lands at position WINDOW - 1.
    # (Was hard-coded as [6:7].values[0], which silently breaks if
    # MOVING_AVG_WINDOW ever changes from 7.)
    avg_col = data_transformations.MOVING_AVG_COL_PREFIX + col_to_avg
    assert true_avg == zip_rows[avg_col].iloc[
        data_transformations.MOVING_AVG_WINDOW - 1]
def test_soda_data_groupby_query():
    """A SodaData object with group_by builds the expected SoQL URL.

    Also confirms the built URL is accepted by the live Socrata endpoint
    (HTTP 200).
    """
    query_obj = soda_data.SodaData(
        "Traffic Crashes - Crashes",
        "TRAFFIC_CRASHES",
        "85ca-t3if",
        ["COUNT(CRASH_RECORD_ID)", "CRASH_DATE"],
        group_by=['CRASH_DATE'],
        limit=100,
    )
    api_resp = socrata_api_requests.SocrataAPIClient(query_obj.request_url)

    expected_url = (
        "https://data.cityofchicago.org/resource/85ca-t3if.json"
        "?$query=SELECT COUNT(CRASH_RECORD_ID), "
        "CRASH_DATE GROUP BY CRASH_DATE LIMIT 100"
    )
    assert query_obj.request_url == expected_url
    assert api_resp.response.status_code == 200
db = dbclient.DBClient(db_path=dbclient.DB_PATH_TEST) # SOCRATA DATA PROCESS [data from https://data.cityofchicago.org] # 1. get SodaData obj (representing single dataset) from soda_data global const # 2. use SocrataAPIClient to get dataset, using SodaData.request_url # this returns a json that is converted to pandas dataframe # by default, all data values are of type str # 3. standardize # 4. compute weekly averages # 5. use dbclient to create sql table from the pandas df # Vaccinations data_obj = soda_data.VACCINATION_DATA_OBJ # 1 print(f" ##### making api request and create table for {data_obj.dataset_name} ####") print(f" sqlite table will be named {data_obj.sql_table_name}") api_resp = socrata_api_requests.SocrataAPIClient(data_obj.request_url) # 2 data_transformations.standardize_zip_code_col(api_resp.data_df, soda_data.VACC_ZIP_COL_NAME) # 3 data_transformations.standardize_date_col(api_resp.data_df, soda_data.VACC_DATE_COL_NAME) data_transformations.\ compute_moving_avg_from_daily_data(api_resp.data_df, data_transformations.STD_ZIP_COL_NAME, # should store this data_transformations.STD_DATE_COL_NAME, # this too data_obj.COLS_TO_AVG) # 4 db.create_table_from_pandas(api_resp.data_df, data_obj.sql_table_name) # 5 print(f" request url: {api_resp.request_url}") print(f" request headers {api_resp.header_fields}") print(f" request header dtypes {api_resp.header_dtypes}") print("~~~~ pandas df dtypes ~~~~") print(api_resp.data_df.dtypes) print("~~~~ sql table info ~~~~~") print(db.get_table_info(data_obj.sql_table_name))
def build_back2normal_db():
    """
    Builds database from various datasources

    For each source:
    1. get data from source (API, CSV, etc) and convert to pandas DataFrame
    2. standardize zip code col name and format
    3. standardize date col name and format
    4. compute 7 day moving average
    5. create table in sqlite db
    """
    # Rebuild from scratch: an existing db file is removed first.
    if os.path.exists(dbclient.DB_PATH):
        print("Deleting existing db and recreating with build_db_script\n")
        os.remove(dbclient.DB_PATH)
    db = dbclient.DBClient()

    # Vaccinations (Socrata API)
    vacc_data_obj = soda_data.VACCINATION_DATA_OBJ
    vacc_api_resp = socrata_api_requests.SocrataAPIClient(
        vacc_data_obj.request_url)
    _standardize_and_load(db, vacc_api_resp.data_df,
                          soda_data.VACC_ZIP_COL_NAME,
                          soda_data.VACC_DATE_COL_NAME,
                          vacc_data_obj.COLS_TO_AVG,
                          VACC_TBL)

    # DAILY COVID DATA BY ZIP from IDPH
    print("...Downloading daily Covid-19 data...")
    daily_covid_data = daily_case_data_by_zip.get_daily_covid_data_from_api()
    _standardize_and_load(db, daily_covid_data,
                          daily_case_data_by_zip.ZIP_COL_NAME,
                          daily_case_data_by_zip.DATE_COL_NAME,
                          daily_case_data_by_zip.COLS_TO_AVG,
                          CASE_TBL)

    # Ground truth foot traffic data
    daily_foot_traffic_data = \
        process_ground_truth_data.get_combined_ground_truth_data()
    _standardize_and_load(db, daily_foot_traffic_data,
                          process_ground_truth_data.ZIP_COL_NAME,
                          process_ground_truth_data.DATE_COL_NAME,
                          process_ground_truth_data.COLS_TO_AVG,
                          FOOT_TRAFF_TBL)

    # SOCRATA CRASH DATA (static CSV snapshot shipped with the repo)
    crash_file = os.path.join("core", "resources",
                              "zipcode_crash_data_1_1_2019-3_7_20201.csv")
    crash_data = pd.read_csv(crash_file)
    _standardize_and_load(db, crash_data,
                          soda_data.CRASH_ZIP_COL_NAME,
                          soda_data.CRASH_DATE_COL_NAME,
                          ['crash_count'],
                          CRASHES_TBL)

    # CENSUS Demographic Data: no date column, so only zip standardization
    # and table creation (no moving average).
    census_data = census_api_pull.get_census_data_from_api()
    data_transformations.standardize_zip_code_col(census_data,
                                                  census_api_pull.ZIP_COL_NAME)
    db.create_table_from_pandas(census_data, CENSUS_TBL)


def _standardize_and_load(db, df, zip_col, date_col, cols_to_avg, table_name):
    """Standardize zip/date cols in place, add moving averages, store table.

    Runs steps 2-5 of the pipeline documented in build_back2normal_db for a
    single daily-granularity DataFrame and writes it to `table_name`.
    """
    data_transformations.standardize_zip_code_col(df, zip_col)
    data_transformations.standardize_date_col(df, date_col)
    data_transformations.compute_moving_avg_from_daily_data(
        df,
        data_transformations.STD_ZIP_COL_NAME,
        data_transformations.STD_DATE_COL_NAME,
        cols_to_avg)
    db.create_table_from_pandas(df, table_name)
from core.data import data_transformations
from core.data.socrata import soda_data, socrata_api_requests

# Scratch script: fetch the first registered Socrata dataset, add per-zip
# moving-average columns, and print the rows for zip code 60637.
daily_vacc_data = soda_data.datasets[0]
response = socrata_api_requests.SocrataAPIClient(daily_vacc_data.request_url)
daily_data_df = response.data_df

data_transformations.compute_moving_avg_from_daily_data(
    daily_data_df, 'zip_code', 'date', ['total_doses_daily'])

# BUG FIX: the original indexed with ('zip_code' == "60637"), which compares
# two string literals and is the constant False — not a boolean mask.
# Compare the column to the value to build the mask.
print(daily_data_df[daily_data_df['zip_code'] == "60637"])