def test_config_file_schema():
    schema = yamale.make_schema(path_utils.path_to('schema_yaml'))
    source_files = os.listdir(path_utils.path_to('sources_dir'))
    for source_file in source_files:
        print(source_file)
        data = yamale.make_data(
            os.path.join(path_utils.path_to('sources_dir'), source_file))
        yamale.validate(schema, data, strict=True)
def test_spreadsheet_dates():
    _, subdirs, _ = next(os.walk(path_utils.path_to('spreadsheets_dir')))
    print(subdirs)
    for subdir in subdirs:
        try:
            sd_date = datetime.datetime.strptime(subdir, '%Y-%m-%d')
        except ValueError:
            print('Okay value error', subdir)
            assert False, 'The subdirectory name {} in {} must be formatted as a YYYY-MM-DD date.'.format(
                subdir, path_utils.path_to('spreadsheets_dir'))
def test_inputs_are_allowlisted():
    allowlist = config.read_allowlist()
    for source_file_name in os.listdir(path_utils.path_to('downloaded_dir')):
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist
    for source_file_name in os.listdir(path_utils.path_to('scraped_dir')):
        if source_file_name == 'spreadsheets':
            continue
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist
def test_intermediate_locations_unique():
    for f in LOCATIONS_INTERMEDIATE_FILES:
        loc_path = os.path.join(
            path_utils.path_to('locations_intermediate_dir'), f)
        locations_df = pd.read_csv(loc_path)
        location_duplicates = locations_df[
            locations_df['region_code'].duplicated(keep=False)]
        print(location_duplicates)
        assert location_duplicates.shape[0] == 0
def google_load_function(data_path, params):
    input_dir = os.path.dirname(data_path)
    if 'export_path' in params:
        export_dir = params['export_path']
    else:
        export_dir = os.path.join(path_utils.path_to('export_dir'),
                                  params['config_key'])
    print('input dir: ', input_dir)
    print('export dir: ', export_dir)
    for path, subdirs, files in os.walk(input_dir):
        print('path: ', path)
        for subdir in subdirs:
            print('subdir: ', subdir)
            export_subdir_path = os.path.join(
                export_dir,
                os.path.relpath(os.path.join(path, subdir), start=input_dir))
            if not os.path.exists(export_subdir_path):
                print('making subdir: ', export_subdir_path)
                os.makedirs(export_subdir_path)
        for file in files:
            file_path = os.path.join(path, file)
            print('file_path: ', file_path)
            rel_path = os.path.relpath(file_path, start=input_dir)
            print('rel path: ', rel_path)
            export_path = os.path.join(export_dir, rel_path)
            if os.path.basename(file).endswith('.csv'):
                export_utils.write_csv_with_open_covid_region_code_added(
                    file_path, export_path)
def join_mobility_region_codes(data_df, params):
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))

    iso1_data = data_df[data_df['country_region_code'].notna()
                        & data_df['sub_region_1'].isna()
                        & data_df['sub_region_2'].isna()
                        & data_df['metro_area'].isna()]
    iso2_data = data_df[data_df['iso_3166_2_code'].notna()
                        & data_df['census_fips_code'].isna()
                        & data_df['metro_area'].isna()]
    # Copy so the padded FIPS column can be added without a SettingWithCopyWarning.
    fips_data = data_df[data_df['census_fips_code'].notna()
                        & data_df['metro_area'].isna()].copy()

    iso1_locations = locations_df[locations_df['region_code_type'] == 'iso_3166-1']
    iso1_joined = iso1_data.merge(iso1_locations,
                                  left_on=['country_region_code'],
                                  right_on=['country_iso_3166-1_alpha-2'],
                                  how='left')

    iso2_locations = locations_df[locations_df['region_code_type'] == 'iso_3166-2']
    iso2_joined = iso2_data.merge(iso2_locations,
                                  left_on=['iso_3166_2_code'],
                                  right_on=['region_code'],
                                  how='left')

    fips_locations = locations_df[locations_df['region_code_type'] == 'fips_6-4']
    fips_data['padded_fips_code'] = fips_data['census_fips_code'].apply(
        lambda x: str(int(x)).zfill(5))
    fips_joined = fips_data.merge(fips_locations,
                                  left_on=['padded_fips_code'],
                                  right_on=['leaf_region_code'],
                                  how='left')

    joined_df = pd.concat([iso1_joined, iso2_joined, fips_joined])
    joined_df['census_fips_code'] = joined_df['padded_fips_code']
    return joined_df
def test_location_and_date_unique():
    for f in AGGREGATED_EXPORT_FILES:
        export_path = os.path.join(path_utils.path_to('export_dir'), f)
        exported_df = pd.read_csv(export_path)
        duplicates = exported_df[exported_df[[
            'open_covid_region_code', 'date'
        ]].duplicated(keep=False)]
        duplicate_info = duplicates[['open_covid_region_code', 'date']]
        print(duplicate_info)
        assert duplicates.shape[0] == 0
def join_on_keys(data_df, reg_params):
    mapping_keys = reg_params['mapping_keys']
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    if 'level_1_region_code' in reg_params:
        locations_df = locations_df[locations_df['level_1_region_code'] ==
                                    reg_params['level_1_region_code']]
    reversed_mapping_keys = {value: key for key, value in mapping_keys.items()}
    data_df = data_df.rename(columns=reversed_mapping_keys)
    data_df = data_df.merge(locations_df,
                            on=list(mapping_keys.keys()),
                            how='inner')
    return data_df
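# Illustrative sketch only (the column names below are hypothetical, not taken
# from a real source config): join_on_keys expects reg_params['mapping_keys']
# to map locations.csv column names to the corresponding column names in the
# source data, plus an optional 'level_1_region_code' used to pre-filter
# locations.csv before the merge.
#
#   example_reg_params = {
#       'mapping_keys': {'region_code': 'source_region_column'},
#       'level_1_region_code': 'US',
#   }
#   joined_df = join_on_keys(raw_df, example_reg_params)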
def join_nytimes_region_codes(data_df, params):
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    # Copy so the padded FIPS column can be added without a SettingWithCopyWarning.
    fips_data_df = data_df[data_df['fips'].notna()].copy()
    fips_locations = locations_df[locations_df['region_code_type'] == 'fips_6-4']
    fips_data_df['padded_fips_code'] = fips_data_df['fips'].apply(
        lambda x: str(int(x)).zfill(5))
    fips_data_joined = fips_data_df.merge(fips_locations,
                                          left_on=['padded_fips_code'],
                                          right_on=['leaf_region_code'],
                                          how='left')
    return fips_data_joined
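# A minimal, illustrative sketch (not part of the pipeline) of the zero-padding
# applied above: pandas reads the raw `fips` column as floats, so a value such
# as 6071.0 (San Bernardino County, CA) must become the 5-character string
# '06071' before it can match the `leaf_region_code` strings in locations.csv.
# The helper name below is hypothetical; the same padding is applied to
# `census_fips_code` in join_mobility_region_codes above.
def _pad_fips_example(raw_fips):
    return str(int(raw_fips)).zfill(5)

assert _pad_fips_example(6071.0) == '06071'
assert _pad_fips_example(36061) == '36061'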
def test_spreadsheet_subdirectory_contents():
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        dir_contents = os.listdir(subdir_path)
        print(subdir_path, dir_contents)
        assert len(dir_contents) == 1, \
            'The directory {} should contain exactly one file.'.format(subdir_path)
        print(dir_contents[0])
        assert dir_contents[0] == 'hospitalizations.xlsx', \
            'The file in directory {} should be named `hospitalizations.xlsx`.'.format(subdir_path)
def test_spreadsheet_tabs_against_allowlist():
    allowlist = config.read_allowlist()
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        hosp_file = os.path.join(subdir_path, 'hospitalizations.xlsx')
        xl = pd.ExcelFile(hosp_file)
        sheet_names = xl.sheet_names
        print('File: ', hosp_file)
        print('Sheet names in spreadsheet: ', sheet_names)
        print('Sheet names allowed in allowlist: ', allowlist)
        for sheet in sheet_names:
            assert sheet in allowlist, \
                'Spreadsheet {} contains a sheet name {} that is not on the allowlist.'.format(
                    hosp_file, sheet)
def test_spreadsheet_column_names_against_schema():
    allowed_data_columns = config.all_data_schema_columns()
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        hosp_file = os.path.join(subdir_path, 'hospitalizations.xlsx')
        xl = pd.ExcelFile(hosp_file)
        sheet_names = xl.sheet_names
        for s in sheet_names:
            df = xl.parse(s, nrows=2)
            print(df)
            columns_in_spreadsheet = list(df.columns)
            assert 'date' in columns_in_spreadsheet, \
                'Spreadsheet {} and sheet {} must have a column named "date".'.format(hosp_file, s)
            columns_in_spreadsheet.remove('date')
            print(columns_in_spreadsheet)
            print(allowed_data_columns)
            for column in columns_in_spreadsheet:
                assert column in allowed_data_columns, \
                    'Sheet {} in spreadsheet {} contains the column {}, which is not ' \
                    'a column name recognized by data.yaml'.format(s, hosp_file, column)
        'RUNNING WITHOUT THE ALLOWLIST! DO NOT MAKE A PULL REQUEST WITH THE OUTPUT!')

scraped = config.read_config(cc_by=True,
                             cc_by_sa=True,
                             google_tos=True,
                             cc_by_nc=True,
                             filter_by_fetch_method='SCRAPED',
                             filter_no_load_func=False,
                             filter_no_data=False,
                             filter_not_approved=args.allowlist)

spreadsheet_file = 'hospitalizations.xlsx'
most_recent_spreadsheet = path_utils.most_recent_subdir(
    path_utils.path_to('spreadsheets_dir'), spreadsheet_file)

if args.date:
    spreadsheet_date = str(args.date[0])
else:
    spreadsheet_date = str(most_recent_spreadsheet['date'])
spreadsheet_path = os.path.join(path_utils.path_to('spreadsheets_dir'),
                                spreadsheet_date, spreadsheet_file)
print('Fetching spreadsheet for date: ', spreadsheet_date)
print('Spreadsheet path: ', spreadsheet_path)

# This assumes that every data source with params['fetch']['method'] == 'SCRAPED'
# comes from a single spreadsheet.
# If that stops being the case, will need to update this.
for k in scraped:
    params = scraped[k]
def join_single_region_code(data_df, single_region_code):
    data_df['region_code'] = single_region_code
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    locations_df = locations_df[config.all_region_columns()]
    data_df = data_df.merge(locations_df, on=['region_code'])
    return data_df
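# Illustrative usage only (the region code below is hypothetical for any given
# source): a feed whose rows all belong to one region can be joined against
# locations.csv by stamping every row with that single region code.
#
#   hospitalizations_df = join_single_region_code(hospitalizations_df, 'IT')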
def test_source_files_are_allowlisted():
    allowlist = config.read_allowlist()
    for source_file_name in os.listdir(path_utils.path_to('sources_dir')):
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist
import streamlit as st
import pandas as pd
import os
import sys

import datacommons as dc
dc.set_api_key(os.environ['DATACOMMONS_API_KEY'])

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../', 'src/pipeline')
sys.path.append(PIPELINE_DIR)

import path_utils

locations_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'iso_3166_2_locations_raw.csv'))
aggregations_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'iso_3166_2_aggregations.csv'))
country_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_1_locations.csv'))
country_df = country_df[[
    'country_iso_3166-1_alpha-2', 'country_iso_3166-1_alpha-3',
    'country_iso_3166-1_numeric'
]]

st.subheader('Countries')
st.write(country_df)

aggregations_df = aggregations_df[[
import streamlit as st
import pandas as pd
import os
import sys

import datacommons as dc
dc.set_api_key(os.environ['DATACOMMONS_API_KEY'])

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../', 'src/pipeline')
sys.path.append(PIPELINE_DIR)

import path_utils

df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'us_state_and_county_fips_codes.csv'))
state_codes = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'us_states_and_numeric_codes.csv'))

################################################################################
##### Create table of fips codes with correct region_code formats         #####
################################################################################

st.write(state_codes)

merged = df.merge(state_codes,
                  how='left',
                  left_on=['state_code_fips'],
                  right_on=['numeric_code'])
st.write(merged)
    cc_by=True,
    cc_by_sa=True,
    cc_by_nc=True,
    google_tos=True,
    filter_not_approved=args.allowlist)
sources_cc_by = config.read_config(cc_by=True,
                                   cc_by_sa=False,
                                   cc_by_nc=False,
                                   google_tos=False,
                                   filter_not_approved=args.allowlist)
sources_cc_by_sa = config.read_config(cc_by=True,
                                      cc_by_sa=True,
                                      cc_by_nc=False,
                                      google_tos=False,
                                      filter_not_approved=args.allowlist)
sources_cc_by_nc = config.read_config(cc_by=True,
                                      cc_by_sa=False,
                                      cc_by_nc=True,
                                      google_tos=False,
                                      filter_not_approved=args.allowlist)
sources_google_tos = config.read_config(cc_by=False,
                                        cc_by_sa=False,
                                        cc_by_nc=False,
                                        google_tos=True,
                                        filter_not_approved=args.allowlist)

google_search_source = {
    'search_trends_symptoms_dataset':
        sources_google_tos['search_trends_symptoms_dataset']
}
google_mobility_source = {
    'google_mobility_reports':
        sources_google_tos['google_mobility_reports']
}

# Step 1: Write source docs
# sources_md contains every source, used to create the README.
doc_utils.write_sources(sources_all, path_utils.path_to('sources_md'))
# sources_cc_by_md is used to create aggregated license for cc-by.
doc_utils.write_sources(sources_cc_by, path_utils.path_to('sources_cc_by_md'))
# sources_cc_by_sa_md is used to create aggregated license for cc-by-sa.
doc_utils.write_sources(sources_cc_by_sa,
                        path_utils.path_to('sources_cc_by_sa_md'))
# sources_cc_by_nc_md is used to create aggregated license for cc-by-nc.
doc_utils.write_sources(sources_cc_by_nc,
                        path_utils.path_to('sources_cc_by_nc_md'))

# Step 2: Write the README (needs to happen after writing the source docs)
with open(path_utils.path_to('readme_md'), 'w') as outfile:
    with open(path_utils.path_to('about_md'), 'r') as infile:
        outfile.write(infile.read())
    outfile.write('\n\n## Data Sources\n')
    with open(path_utils.path_to('sources_md'), 'r') as infile:
    'ABW': 'Aruba',  # Netherlands: NL-AW Aruba (AW)
    'CUW': 'Curaçao',  # Netherlands: NL-CW Curaçao (CW)
    'SXM': 'Sint Maarten (Dutch part)',  # Netherlands: NL-SX Sint Maarten (SX)
    'ASM': 'American Samoa',  # United States: US-AS
    'GUM': 'Guam',  # United States: US-GU
    'MNP': 'Northern Mariana Islands',  # United States: US-MP
    'PRI': 'Puerto Rico',  # United States: US-PR
    'UMI': 'United States Minor Outlying Islands',  # United States: US-UM
    'VIR': 'United States Virgin Islands',  # United States: US-VI
}

st.write(len(regions_to_remove_from_iso1))

country_df = country_df[~country_df['region_code'].isin(
    regions_to_remove_from_iso1.keys())]

st.subheader('Countries without duplicate ISO-3166-1 / ISO-3166-2 regions')

################################################################################
##### Generate datacommons ids using the known format for the dcids       #####
################################################################################

country_df['datacommons_id'] = country_df.apply(
    lambda x: 'country/' + x['region_code'], axis=1)
st.write(country_df)
st.write(country_df.shape)

country_df.to_csv(os.path.join(
    path_utils.path_to('locations_intermediate_dir'),
    'iso_3166_1_locations.csv'),
                  index=False)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import streamlit as st
import pandas as pd
import os
import sys

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../../',
                            'src/pipeline')
sys.path.append(PIPELINE_DIR)

import path_utils

# Load FIPS codes as strings so you don't lose the leading zeros
fips_df = pd.read_csv(os.path.join(
    path_utils.path_to('locations_intermediate_dir'), 'fips_locations.csv'),
                      dtype=str)
iso_level_1_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_1_locations.csv'))
iso_level_2_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_2_locations.csv'))
other_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'other_locations.csv'))

concat_df = pd.concat([fips_df, iso_level_1_df, iso_level_2_df, other_df])
st.write(concat_df)

concat_df.to_csv(path_utils.path_to('locations_csv'), index=False)
print('Wrote concatenated locations file to %s.' %
      path_utils.path_to('locations_csv'))
def test_locations_unique():
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    location_duplicates = locations_df[locations_df['region_code'].duplicated(
        keep=False)]
    print(location_duplicates)
    assert location_duplicates.shape[0] == 0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=no-value-for-parameter

import streamlit as st
import sys
import os

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../', 'src/pipeline')
sys.path.append(PIPELINE_DIR)

import path_utils

sys.path.append(path_utils.path_to('main_dir'))
sys.path.append(path_utils.path_to('utils_dir'))

import pipeline_explorer

st.sidebar.markdown('# Select view:')
radio_selection = st.sidebar.radio('', ['Pipeline Explorer'])
st.sidebar.markdown('---')

if radio_selection == 'Pipeline Explorer':
    pipeline_explorer.pipeline()