# NOTE(review): whatever guards this warning lies outside the visible chunk;
# presumably it only fires when the allowlist is disabled -- confirm.
logging.warning(
    'RUNNING WITHOUT THE ALLOWLIST! DO NOT MAKE A PULL REQUEST WITH THE OUTPUT!'
)

# Read the config entries whose fetch method is 'SCRAPED'. The approval
# filter is driven by the command-line allowlist flag.
scraped = config.read_config(
    cc_by=True,
    cc_by_sa=True,
    google_tos=True,
    cc_by_nc=True,
    filter_by_fetch_method='SCRAPED',
    filter_no_load_func=False,
    filter_no_data=False,
    filter_not_approved=args.allowlist,
)

spreadsheet_file = 'hospitalizations.xlsx'
most_recent_spreadsheet = path_utils.most_recent_subdir(
    path_utils.path_to('spreadsheets_dir'), spreadsheet_file)

# A date given on the command line takes precedence over the date of the
# most recent spreadsheet subdirectory.
spreadsheet_date = str(
    args.date[0] if args.date else most_recent_spreadsheet['date'])
spreadsheet_path = os.path.join(
    path_utils.path_to('spreadsheets_dir'), spreadsheet_date, spreadsheet_file)

print('Fetching spreadsheet for date: ', spreadsheet_date)
print('Spreadsheet path: ', spreadsheet_path)

# Every data source with params['fetch']['method'] == 'SCRAPED' is assumed to
# live in this one spreadsheet. Revisit this loop if that ever changes.
for k in scraped:
    params = scraped[k]
# Locate the repository root relative to this script and make the pipeline
# modules importable before importing them.
CURRENT_DIR = os.path.dirname(__file__)
ROOT_DIR = os.path.abspath(os.path.join(CURRENT_DIR, '../../'))
PIPELINE_DIR = os.path.join(ROOT_DIR, 'src/pipeline')

sys.path.append(PIPELINE_DIR)

# These imports must come after PIPELINE_DIR is appended to sys.path.
import config
import path_utils

# Config entries whose fetch method is 'SCRAPED', keyed by data source.
scraped = config.read_config(filter_by_fetch_method='SCRAPED')

spreadsheet_dir = os.path.join(ROOT_DIR, 'data/inputs/scraped/spreadsheets')
spreadsheet_file = 'hospitalizations.xlsx'

most_recent_spreadsheet = path_utils.most_recent_subdir(
    spreadsheet_dir, spreadsheet_file)
spreadsheet_path = most_recent_spreadsheet['path']
# Use the date that belongs to the spreadsheet actually being read, so the
# extracted data is filed under the matching date rather than a fixed one.
spreadsheet_date = str(most_recent_spreadsheet['date'])

# Every data source with params['fetch']['method'] == 'SCRAPED' is assumed to
# live in this one spreadsheet. Revisit this loop if that ever changes.
for k in scraped:
    params = scraped[k]
    # Each config key doubles as the name of the sheet holding its data.
    df = pd.read_excel(spreadsheet_path, sheet_name=k)
    path_for_data = path_utils.path_to_data_for_date(params, spreadsheet_date)
    print('path for data: ', path_for_data)
    out_dir = path_for_data['dir']
    out_file = path_for_data['file']
    out_path = os.path.join(out_dir, out_file)