Code Example #1
def test_config_file_schema():
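    # Validate every YAML file under sources_dir against the yamale schema in strict mode.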
    schema = yamale.make_schema(path_utils.path_to('schema_yaml'))
    source_files = os.listdir(path_utils.path_to('sources_dir'))
    for source_file in source_files:
        print(source_file)
        data = yamale.make_data(
            os.path.join(path_utils.path_to('sources_dir'), source_file))
        yamale.validate(schema, data, strict=True)
Code Example #2
def test_spreadsheet_dates():
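    # Every subdirectory of spreadsheets_dir must be named as a YYYY-MM-DD date.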
    _, subdirs, _ = next(os.walk(path_utils.path_to('spreadsheets_dir')))
    print(subdirs)
    for subdir in subdirs:
        try:
            sd_date = datetime.datetime.strptime(subdir, '%Y-%m-%d')
        except ValueError:
            print('Okay value error', subdir)
            assert False, 'The subdirectory name {} in {} must be formatted as a YYYY-MM-DD date.'.format(
                subdir, path_utils.path_to('spreadsheets_dir'))
Code Example #3
def test_inputs_are_allowlisted():
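    # Every downloaded or scraped input file (minus its extension) must be an allowlisted source key.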
    allowlist = config.read_allowlist()
    for source_file_name in os.listdir(path_utils.path_to('downloaded_dir')):
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist

    for source_file_name in os.listdir(path_utils.path_to('scraped_dir')):
        if source_file_name == 'spreadsheets':
            continue
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist
Code Example #4
def test_intermediate_locations_unique():
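    # region_code values must be unique within each intermediate locations file.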
    for f in LOCATIONS_INTERMEDIATE_FILES:
        loc_path = os.path.join(path_utils.path_to('locations_intermediate_dir'), f)
        locations_df = pd.read_csv(loc_path)
        location_duplicates = locations_df[locations_df['region_code'].duplicated(keep=False)]
        print(location_duplicates)
        assert location_duplicates.shape[0] == 0
Code Example #5
def google_load_function(data_path, params):
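    # Walk the input directory, recreate its subdirectory structure under the
    # export directory, and rewrite each CSV with the open_covid_region_code
    # column added.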
    input_dir = os.path.dirname(data_path)
    if 'export_path' in params:
        export_dir = params['export_path']
    else:
        export_dir = os.path.join(path_utils.path_to('export_dir'),
                                  params['config_key'])
    print('input dir: ', input_dir)
    print('export dir: ', export_dir)
    for path, subdirs, files in os.walk(input_dir):
        print('path: ', path)
        for subdir in subdirs:
            print('subdir: ', subdir)
            export_subdir_path = os.path.join(
                export_dir,
                os.path.relpath(os.path.join(path, subdir), start=input_dir))
            if not os.path.exists(export_subdir_path):
                print('making subdir: ', export_subdir_path)
                os.makedirs(export_subdir_path)
        for file in files:
            file_path = os.path.join(path, file)
            print('file_path: ', file_path)
            rel_path = os.path.relpath(file_path, start=input_dir)
            print('rel path: ', rel_path)
            export_path = os.path.join(export_dir, rel_path)
            if os.path.basename(file).endswith('.csv'):
                export_utils.write_csv_with_open_covid_region_code_added(
                    file_path, export_path)
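# Hypothetical call (the data path and config key below are illustrative only):
#   google_load_function('/tmp/data/google_mobility_reports/2020-01-01.csv',
#                        {'config_key': 'google_mobility_reports'})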
Code Example #6
def join_mobility_region_codes(data_df, params):
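    # Split mobility rows into country-level (ISO-3166-1), subdivision-level
    # (ISO-3166-2), and US county-level (FIPS) slices, join each slice against
    # the locations table, then re-concatenate them.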
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    iso1_data = data_df[data_df['country_region_code'].notna()
                        & data_df['sub_region_1'].isna()
                        & data_df['sub_region_2'].isna()
                        & data_df['metro_area'].isna()]
    iso2_data = data_df[data_df['iso_3166_2_code'].notna()
                        & data_df['census_fips_code'].isna()
                        & data_df['metro_area'].isna()]
    # Copy so that adding padded_fips_code below does not mutate a slice of data_df.
    fips_data = data_df[data_df['census_fips_code'].notna()
                        & data_df['metro_area'].isna()].copy()
    iso1_locations = locations_df[locations_df['region_code_type'] ==
                                  'iso_3166-1']
    iso1_joined = iso1_data.merge(iso1_locations,
                                  left_on=['country_region_code'],
                                  right_on=['country_iso_3166-1_alpha-2'],
                                  how='left')
    iso2_locations = locations_df[locations_df['region_code_type'] ==
                                  'iso_3166-2']
    iso2_joined = iso2_data.merge(iso2_locations,
                                  left_on=['iso_3166_2_code'],
                                  right_on=['region_code'],
                                  how='left')
    fips_locations = locations_df[locations_df['region_code_type'] ==
                                  'fips_6-4']
    fips_data['padded_fips_code'] = fips_data['census_fips_code'].apply(
        lambda x: str(int(x)).zfill(5))
    fips_joined = fips_data.merge(fips_locations,
                                  left_on=['padded_fips_code'],
                                  right_on=['leaf_region_code'],
                                  how='left')
    joined_df = pd.concat([iso1_joined, iso2_joined, fips_joined])
    joined_df['census_fips_code'] = joined_df['padded_fips_code']
    return joined_df
Code Example #7
def test_location_and_date_unique():
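    # (open_covid_region_code, date) pairs must be unique in each aggregated export file.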
    for f in AGGREGATED_EXPORT_FILES:
        export_path = os.path.join(path_utils.path_to('export_dir'), f)
        exported_df = pd.read_csv(export_path)
        duplicates = exported_df[exported_df[['open_covid_region_code', 'date']].duplicated(keep=False)]
        duplicate_info = duplicates[['open_covid_region_code', 'date']]
        print(duplicate_info)
        assert duplicates.shape[0] == 0
Code Example #8
def join_on_keys(data_df, reg_params):
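    # mapping_keys maps locations-table column names to the corresponding
    # data_df column names; rename data_df to the locations names and
    # inner-join on them.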
    mapping_keys = reg_params['mapping_keys']
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    if 'level_1_region_code' in reg_params:
        locations_df = locations_df[locations_df['level_1_region_code'] ==
                                    reg_params['level_1_region_code']]
    reversed_mapping_keys = {value: key for key, value in mapping_keys.items()}
    data_df = data_df.rename(columns=reversed_mapping_keys)
    data_df = data_df.merge(locations_df,
                            on=list(mapping_keys.keys()),
                            how='inner')
    return data_df
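# A hypothetical reg_params for this helper (column and value names below are
# illustrative only):
#   {'mapping_keys': {'region_code': 'state_code'}, 'level_1_region_code': 'US'}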
Code Example #9
def join_nytimes_region_codes(data_df, params):
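    # Zero-pad numeric FIPS codes to five digits and join them against the
    # locations table on leaf_region_code.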
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    # Copy so that adding padded_fips_code below does not mutate a slice of data_df.
    fips_data_df = data_df[data_df['fips'].notna()].copy()
    fips_locations = locations_df[locations_df['region_code_type'] ==
                                  'fips_6-4']
    fips_data_df['padded_fips_code'] = fips_data_df['fips'].apply(
        lambda x: str(int(x)).zfill(5))
    fips_data_joined = fips_data_df.merge(fips_locations,
                                          left_on=['padded_fips_code'],
                                          right_on=['leaf_region_code'],
                                          how='left')
    return fips_data_joined
Code Example #10
def test_spreadsheet_subdirectory_contents():
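    # Each dated subdirectory must contain exactly one file, named hospitalizations.xlsx.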
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        dir_contents = os.listdir(subdir_path)
        print(subdir_path, dir_contents)
        assert len(
            dir_contents
        ) == 1, 'The directory {} should contain exactly one file.'.format(
            subdir_path)
        print(dir_contents[0])
        assert dir_contents[0] == 'hospitalizations.xlsx', \
            'The file in directory {} should be named `hospitalizations.xlsx`.'.format(subdir_path)
Code Example #11
def test_spreadsheet_tabs_against_allowlist():
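    # Every sheet name in each hospitalizations spreadsheet must appear on the allowlist.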
    allowlist = config.read_allowlist()
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        hosp_file = os.path.join(subdir_path, 'hospitalizations.xlsx')
        xl = pd.ExcelFile(hosp_file)
        sheet_names = xl.sheet_names
        print('File: ', hosp_file)
        print('Sheet names in spreadsheet: ', sheet_names)
        print('Sheet names allowed in allowlist: ', allowlist)
        for sheet in sheet_names:
            assert sheet in allowlist, \
            "Spreadsheet {} contains a sheet name {} that is not on the allowlist.".format(hosp_file, sheet)
Code Example #12
def test_spreadsheet_column_names_against_schema():
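    # Every sheet must have a 'date' column, and all other columns must be defined in the data schema (data.yaml).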
    allowed_data_columns = config.all_data_schema_columns()
    dirpath, subdirs, filenames = next(
        os.walk(path_utils.path_to('spreadsheets_dir')))
    for subdir in subdirs:
        subdir_path = os.path.join(dirpath, subdir)
        hosp_file = os.path.join(subdir_path, 'hospitalizations.xlsx')
        xl = pd.ExcelFile(hosp_file)
        sheet_names = xl.sheet_names
        for s in sheet_names:
            df = xl.parse(s, nrows=2)
            print(df)
            columns_in_spreadsheet = list(df.columns)
            assert 'date' in columns_in_spreadsheet, \
                'Spreadsheet {} and sheet {} must have a column named "date".'.format(hosp_file, s)
            columns_in_spreadsheet.remove('date')
            print(columns_in_spreadsheet)
            print(allowed_data_columns)
            for column in columns_in_spreadsheet:
                assert column in allowed_data_columns, \
                    'Sheet {} in spreadsheet {} contains the column {}, which is not a column ' \
                    'name recognized by data.yaml'.format(s, hosp_file, column)
Code Example #13
# Presumably guarded on the allowlist flag, given the warning text below:
if not args.allowlist:
    print(
        'RUNNING WITHOUT THE ALLOWLIST! DO NOT MAKE A PULL REQUEST WITH THE OUTPUT!'
    )

scraped = config.read_config(cc_by=True,
                             cc_by_sa=True,
                             google_tos=True,
                             cc_by_nc=True,
                             filter_by_fetch_method='SCRAPED',
                             filter_no_load_func=False,
                             filter_no_data=False,
                             filter_not_approved=args.allowlist)

spreadsheet_file = 'hospitalizations.xlsx'

most_recent_spreadsheet = path_utils.most_recent_subdir(
    path_utils.path_to('spreadsheets_dir'), spreadsheet_file)
if args.date:
    spreadsheet_date = str(args.date[0])
else:
    spreadsheet_date = str(most_recent_spreadsheet['date'])
spreadsheet_path = os.path.join(path_utils.path_to('spreadsheets_dir'),
                                spreadsheet_date, spreadsheet_file)

print('Fetching spreadsheet for date: ', spreadsheet_date)
print('Spreadsheet path: ', spreadsheet_path)

# This assumes that every data source with params['fetch']['method'] == 'SCRAPED' comes from a single spreadsheet.
# If that stops being the case, this will need to be updated.

for k in scraped:
    params = scraped[k]
Code Example #14
def join_single_region_code(data_df, single_region_code):
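    # Tag every row with the given region_code, then join in the location columns for that region.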
    data_df['region_code'] = single_region_code
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    locations_df = locations_df[config.all_region_columns()]
    data_df = data_df.merge(locations_df, on=['region_code'])
    return data_df
Code Example #15
def test_source_files_are_allowlisted():
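    # Every source config file (minus its extension) must be an allowlisted source key.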
    allowlist = config.read_allowlist()
    for source_file_name in os.listdir(path_utils.path_to('sources_dir')):
        source_key = os.path.splitext(source_file_name)[0]
        assert source_key in allowlist
Code Example #16
import streamlit as st
import pandas as pd
import os
import sys
import datacommons as dc

dc.set_api_key(os.environ['DATACOMMONS_API_KEY'])

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../',
                            'src/pipeline')

sys.path.append(PIPELINE_DIR)

import path_utils

locations_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'iso_3166_2_locations_raw.csv'))
aggregations_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'iso_3166_2_aggregations.csv'))
country_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_1_locations.csv'))

country_df = country_df[[
    'country_iso_3166-1_alpha-2', 'country_iso_3166-1_alpha-3',
    'country_iso_3166-1_numeric'
]]
st.subheader('Countries')
st.write(country_df)
aggregations_df = aggregations_df[[
Code Example #17
import streamlit as st
import pandas as pd
import os
import sys

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../../',
                            'src/pipeline')

sys.path.append(PIPELINE_DIR)

import path_utils

# Load FIPS codes as strings so you don't lose the leading zeros
fips_df = pd.read_csv(os.path.join(
    path_utils.path_to('locations_intermediate_dir'), 'fips_locations.csv'),
                      dtype=str)
iso_level_1_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_1_locations.csv'))
iso_level_2_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'iso_3166_2_locations.csv'))
other_df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_intermediate_dir'),
                 'other_locations.csv'))

concat_df = pd.concat([fips_df, iso_level_1_df, iso_level_2_df, other_df])
st.write(concat_df)

concat_df.to_csv(path_utils.path_to('locations_csv'), index=False)
Code Example #18
import streamlit as st
import pandas as pd
import os
import sys
import datacommons as dc

dc.set_api_key(os.environ['DATACOMMONS_API_KEY'])

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../',
                            'src/pipeline')

sys.path.append(PIPELINE_DIR)

import path_utils

df = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'us_state_and_county_fips_codes.csv'))
state_codes = pd.read_csv(
    os.path.join(path_utils.path_to('locations_input_dir'),
                 'us_states_and_numeric_codes.csv'))

################################################################################
##### Create table of fips codes with correct region_code formats          #####
################################################################################

st.write(state_codes)
merged = df.merge(state_codes,
                  how='left',
                  left_on=['state_code_fips'],
                  right_on=['numeric_code'])
st.write(merged)
Code Example #19
sources_all = config.read_config(
    cc_by=True, cc_by_sa=True, cc_by_nc=True, google_tos=True, filter_not_approved=args.allowlist)
sources_cc_by = config.read_config(
    cc_by=True, cc_by_sa=False, cc_by_nc=False, google_tos=False, filter_not_approved=args.allowlist)
sources_cc_by_sa = config.read_config(
    cc_by=True, cc_by_sa=True, cc_by_nc=False, google_tos=False, filter_not_approved=args.allowlist)
sources_cc_by_nc = config.read_config(
    cc_by=True, cc_by_sa=False, cc_by_nc=True, google_tos=False, filter_not_approved=args.allowlist)
sources_google_tos = config.read_config(
    cc_by=False, cc_by_sa=False, cc_by_nc=False, google_tos=True, filter_not_approved=args.allowlist)
google_search_source = {'search_trends_symptoms_dataset': sources_google_tos['search_trends_symptoms_dataset']}
google_mobility_source = {'google_mobility_reports': sources_google_tos['google_mobility_reports']}

# Step 1: Write source docs

# sources_md contains every source, used to create the README.
doc_utils.write_sources(sources_all, path_utils.path_to('sources_md'))
# sources_cc_by_md is used to create aggregated license for cc-by.
doc_utils.write_sources(sources_cc_by, path_utils.path_to('sources_cc_by_md'))
# sources_cc_by_sa_md is used to create aggregated license for cc-by-sa.
doc_utils.write_sources(sources_cc_by_sa, path_utils.path_to('sources_cc_by_sa_md'))
# sources_cc_by_nc_md is used to create aggregated license for cc-by-nc.
doc_utils.write_sources(sources_cc_by_nc, path_utils.path_to('sources_cc_by_nc_md'))

# Step 2: Write the README (needs to happen after writing the source docs)

with open(path_utils.path_to('readme_md'), 'w') as outfile:
    with open(path_utils.path_to('about_md'), 'r') as infile:
        outfile.write(infile.read())

    outfile.write('\n\n## Data Sources\n')
    with open(path_utils.path_to('sources_md'), 'r') as infile:
Code Example #20
sources_google_tos = config.read_config(cc_by=False,
                                        cc_by_sa=False,
                                        cc_by_nc=False,
                                        google_tos=True,
                                        filter_not_approved=args.allowlist)
google_search_source = {
    'search_trends_symptoms_dataset':
    sources_google_tos['search_trends_symptoms_dataset']
}
google_mobility_source = {
    'google_mobility_reports': sources_google_tos['google_mobility_reports']
}

# Step 1: Write source docs

# sources_md contains every source, used to create the README.
doc_utils.write_sources(sources_all, path_utils.path_to('sources_md'))
# sources_cc_by_md is used to create aggregated license for cc-by.
doc_utils.write_sources(sources_cc_by, path_utils.path_to('sources_cc_by_md'))
# sources_cc_by_sa_md is used to create aggregated license for cc-by-sa.
doc_utils.write_sources(sources_cc_by_sa,
                        path_utils.path_to('sources_cc_by_sa_md'))
# sources_cc_by_nc_md is used to create aggregated license for cc-by-nc.
doc_utils.write_sources(sources_cc_by_nc,
                        path_utils.path_to('sources_cc_by_nc_md'))

# Step 2: Write the README (needs to happen after writing the source docs)

with open(path_utils.path_to('readme_md'), 'w') as outfile:
    with open(path_utils.path_to('about_md'), 'r') as infile:
        outfile.write(infile.read())
Code Example #21
# ISO-3166-1 region codes that also appear as ISO-3166-2 subdivisions of
# another country:
regions_to_remove_from_iso1 = {
    'ABW': 'Aruba',  # Netherlands: NL-AW Aruba (AW)
    'CUW': 'Curaçao',  # Netherlands: NL-CW Curaçao (CW)
    'SXM': 'Sint Maarten (Dutch part)',  # Netherlands: NL-SX Sint Maarten (SX)
    'ASM': 'American Samoa',  # United States: US-AS
    'GUM': 'Guam',  # United States: US-GU
    'MNP': 'Northern Mariana Islands',  # United States: US-MP
    'PRI': 'Puerto Rico',  # United States: US-PR
    'UMI': 'United States Minor Outlying Islands',  # United States: US-UM
    'VIR': 'United States Virgin Islands',  # United States: US-VI
}

st.write(len(regions_to_remove_from_iso1))
country_df = country_df[~country_df['region_code'].
                        isin(regions_to_remove_from_iso1.keys())]
st.subheader('Countries without duplicate ISO-3166-1 / ISO-3166-2 regions')

################################################################################
##### Generate datacommons ids using the known format for the dcids       ######
################################################################################

country_df['datacommons_id'] = country_df.apply(
    lambda x: 'country/' + x['region_code'], axis=1)

st.write(country_df)
st.write(country_df.shape)

country_df.to_csv(os.path.join(
    path_utils.path_to('locations_intermediate_dir'),
    'iso_3166_1_locations.csv'),
                  index=False)
Code Example #22
import streamlit as st
import pandas as pd
import os
import sys

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../../', 'src/pipeline')

sys.path.append(PIPELINE_DIR)

import path_utils


# Load FIPS codes as strings so you don't lose the leading zeros
fips_df = pd.read_csv(os.path.join(path_utils.path_to('locations_intermediate_dir'), 'fips_locations.csv'), dtype=str)
iso_level_1_df = pd.read_csv(os.path.join(path_utils.path_to('locations_intermediate_dir'), 'iso_3166_1_locations.csv'))
iso_level_2_df = pd.read_csv(os.path.join(path_utils.path_to('locations_intermediate_dir'), 'iso_3166_2_locations.csv'))
other_df = pd.read_csv(os.path.join(path_utils.path_to('locations_intermediate_dir'), 'other_locations.csv'))

concat_df = pd.concat([fips_df, iso_level_1_df, iso_level_2_df, other_df])
st.write(concat_df)

concat_df.to_csv(path_utils.path_to('locations_csv'), index=False)
print('Wrote concatenated locations file to %s.' % path_utils.path_to('locations_csv'))
Code Example #23
def test_locations_unique():
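    # region_code values must be unique in the combined locations file.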
    locations_df = pd.read_csv(path_utils.path_to('locations_csv'))
    location_duplicates = locations_df[locations_df['region_code'].duplicated(
        keep=False)]
    print(location_duplicates)
    assert location_duplicates.shape[0] == 0
Code Example #24
# pylint: disable=no-value-for-parameter

import streamlit as st
import sys
import os

PIPELINE_DIR = os.path.join(os.path.dirname(__file__), '../../',
                            'src/pipeline')

sys.path.append(PIPELINE_DIR)

import path_utils

sys.path.append(path_utils.path_to('main_dir'))
sys.path.append(path_utils.path_to('utils_dir'))

import pipeline_explorer

st.sidebar.markdown('# Select view:')
radio_selection = st.sidebar.radio('', ['Pipeline Explorer'])
st.sidebar.markdown('---')

if radio_selection == 'Pipeline Explorer':
    pipeline_explorer.pipeline()