Example #1
import logging
import os

import config
import path_utils

# `args` is assumed to come from an argparse parser defined earlier in the
# script (see the sketch after this example). The guard below is an assumed
# reconstruction: the excerpt begins inside an indented block, and the warning
# text implies it fires when the allowlist is disabled.
if not args.allowlist:
    logging.warning(
        'RUNNING WITHOUT THE ALLOWLIST! DO NOT MAKE A PULL REQUEST WITH THE OUTPUT!'
    )

# Read sources under every license category, keep only those fetched by
# scraping, and (when the allowlist flag is set) drop unapproved sources.
scraped = config.read_config(cc_by=True,
                             cc_by_sa=True,
                             google_tos=True,
                             cc_by_nc=True,
                             filter_by_fetch_method='SCRAPED',
                             filter_no_load_func=False,
                             filter_no_data=False,
                             filter_not_approved=args.allowlist)

spreadsheet_file = 'hospitalizations.xlsx'

most_recent_spreadsheet = path_utils.most_recent_subdir(
    path_utils.path_to('spreadsheets_dir'), spreadsheet_file)
if args.date:
    spreadsheet_date = str(args.date[0])
else:
    spreadsheet_date = str(most_recent_spreadsheet['date'])
spreadsheet_path = os.path.join(path_utils.path_to('spreadsheets_dir'),
                                spreadsheet_date, spreadsheet_file)

print('Fetching spreadsheet for date: ', spreadsheet_date)
print('Spreadsheet path: ', spreadsheet_path)

# This assumes that every data source with params['fetch']['method'] == 'SCRAPED'
# comes from a single spreadsheet. If that stops being the case, this will
# need to be updated.

for k in scraped:
    params = scraped[k]
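
Example #1 never shows how `args` is built. Below is a minimal argparse sketch consistent with the snippet's use of `args.allowlist` (a boolean) and `args.date[0]` (an optional single value); the flag names match the attributes used above, but the help text and defaults are assumptions.

import argparse

# Hypothetical CLI setup inferred from how Example #1 reads `args`; the
# original script's parser is not shown in the excerpt.
parser = argparse.ArgumentParser(
    description='Export scraped hospitalization data from a spreadsheet.')
parser.add_argument('--allowlist', action='store_true',
                    help='Only process sources approved in the allowlist.')
parser.add_argument('--date', nargs=1, default=None,
                    help='Snapshot date to use (e.g. 2020-06-15); defaults '
                         'to the most recent spreadsheet.')
args = parser.parse_args()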
Example #2
import os
import sys

import pandas as pd

CURRENT_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, '../../'))
PIPELINE_DIR = os.path.join(ROOT_DIR, 'src/pipeline')

sys.path.append(PIPELINE_DIR)

import config
import path_utils

# Keep only the data sources whose fetch method is 'SCRAPED'.
scraped = config.read_config(filter_by_fetch_method='SCRAPED')

spreadsheet_dir = os.path.join(ROOT_DIR, 'data/inputs/scraped/spreadsheets')
spreadsheet_file = 'hospitalizations.xlsx'

most_recent_spreadsheet = path_utils.most_recent_subdir(
    spreadsheet_dir, spreadsheet_file)
spreadsheet_path = most_recent_spreadsheet['path']
spreadsheet_date = str(most_recent_spreadsheet['date'])
# Uncomment to pin the export to a fixed snapshot instead of the newest one
# (note that spreadsheet_path above would still point at the newest file):
# spreadsheet_date = '2020-06-15'

# This assumes that every data source with params['fetch']['method'] == 'SCRAPED'
# comes from a single spreadsheet. If that stops being the case, this will
# need to be updated.

for k, params in scraped.items():
    # Each config key is also the sheet name within the spreadsheet.
    df = pd.read_excel(spreadsheet_path, sheet_name=k)
    # Resolve where this source's data for the given date should live.
    path_for_data = path_utils.path_to_data_for_date(params, spreadsheet_date)
    print('path for data: ', path_for_data)
    out_dir = path_for_data['dir']
    out_file = path_for_data['file']
    out_path = os.path.join(out_dir, out_file)
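
Both examples rely on `path_utils.most_recent_subdir` returning a dict with 'path' and 'date' keys for the newest dated subdirectory that contains the given file. Here is a minimal sketch of that assumed contract; the real path_utils implementation may differ.

import datetime
import os

def most_recent_subdir(base_dir, file_name):
    # Sketch only: scan ISO-date-named subdirectories of base_dir and return
    # the newest one containing file_name. Assumed to mirror the contract the
    # examples above depend on, not the actual path_utils code.
    best_date, best_path = None, None
    for entry in os.listdir(base_dir):
        try:
            entry_date = datetime.date.fromisoformat(entry)
        except ValueError:
            continue  # Ignore entries that are not ISO dates.
        candidate = os.path.join(base_dir, entry, file_name)
        if os.path.exists(candidate) and (best_date is None or entry_date > best_date):
            best_date, best_path = entry_date, candidate
    return {'date': best_date, 'path': best_path}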