def dl_file(url, dir_parent, dir_file, file, ext='.csv', user=False, verify=True, unzip=False, ab_json_to_csv=False, mb_json_to_csv=False):
    """Download file (generic).

    Used to download most file types (when Selenium is not required).
    Some files are handled with file-specific code:

    - unzip=True and file='13100781' has unique code.
    - Each instance of ab_json_to_csv=True has unique code.
    - Each instance of mb_json_to_csv=True has unique code.

    Parameters:
    url (str): URL to download file from.
    dir_parent (str): The parent directory. Example: 'other/can'.
    dir_file (str): The file directory ('epidemiology-update').
    file (str): Output file name (excluding extension). Example: 'covid19'
    ext (str): Extension of the output file. Defaults to '.csv'.
    user (bool): Should the request impersonate a normal browser? Needed to access some data. Default: False.
    verify (bool): If False, requests will skip SSL verification. Default: True.
    unzip (bool): If True, this file requires unzipping. Default: False.
    ab_json_to_csv (bool): If True, this is an Alberta JSON file embedded in a webpage that should be converted to CSV. Default: False.
    mb_json_to_csv (bool): If True, this is a Manitoba JSON file that should be converted to CSV. Default: False.

    """
    global mode, download_log, success, failure, prefix

    ## set names with timestamp and file ext
    name = file + '_' + get_datetime('America/Toronto').strftime('%Y-%m-%d_%H-%M')
    full_name = os.path.join(dir_parent, dir_file, name + ext)

    ## download file
    try:
        ## some websites will reject the request unless you look like a normal web browser
        ## user is True provides a normal-looking user agent string to bypass this
        if user is True:
            headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"}
            req = requests.get(url, headers=headers, verify=verify)
        else:
            req = requests.get(url, verify=verify)

        ## check if request was successful
        if not req.ok:
            ## print failure
            print(background('Error downloading: ' + full_name, Colors.red))
            failure += 1
            ## write failure to log message
            download_log = download_log + 'Failure: ' + full_name + '\n'
        ## successful request: if mode == test, print success and end
        elif mode == 'test':
            ## print success and write to log
            download_log = download_log + 'Success: ' + full_name + '\n'
            print(color('Test download successful: ' + full_name, Colors.green))
            success += 1
        ## successful request: mode == prod, upload file
        else:
            if unzip:
                ## unzip data
                ## NOTE: keep tmpdir referenced until upload_file() completes;
                ## the temporary directory is removed when the object is collected
                tmpdir = tempfile.TemporaryDirectory()
                zpath = os.path.join(tmpdir.name, 'zip_file.zip')
                with open(zpath, mode='wb') as local_file:
                    local_file.write(req.content)
                with ZipFile(zpath, 'r') as zip_file:
                    zip_file.extractall(tmpdir.name)
                f_path = os.path.join(tmpdir.name, file + ext)
                if file == '13100781':
                    ## read CSV (informative columns only)
                    data = pd.read_csv(f_path, usecols=['REF_DATE', 'Case identifier number', 'Case information', 'VALUE'])
                    ## save original order of column values
                    col_order = data['Case information'].unique()
                    ## pivot long to wide
                    data = data.pivot(index=['REF_DATE', 'Case identifier number'], columns='Case information', values='VALUE').reset_index()
                    ## use original column order
                    data = data[['REF_DATE', 'Case identifier number'] + col_order.tolist()]
                    ## write CSV
                    data.to_csv(f_path, index=None, quoting=csv.QUOTE_NONNUMERIC)
            elif ab_json_to_csv:
                ## for Alberta JSON data only: extract JSON from webpage, convert JSON to CSV and save as temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                ## raw string: avoids invalid escape sequences (e.g. '\:') in the pattern
                data = re.search(r'(?<="data":)\[\[.*\]\]', req.text).group(0)
                if url == "https://www.alberta.ca/stats/covid-19-alberta-statistics.htm":
                    data = pd.read_json(data).transpose()
                    data = data.rename(columns={0: "", 1: "Date reported", 2: "Alberta Health Services Zone", 3: "Gender", 4: "Age group", 5: "Case status", 6: "Case type"})
                elif url == "https://www.alberta.ca/maps/covid-19-status-map.htm":
                    data = BeautifulSoup(data, features="html.parser")
                    data = data.get_text()  # strip HTML tags
                    ## this regex may need some tweaking if measures column changes in the future
                    data = re.sub(r"<\\/a><\\/li><\\/ul>", "", data)  # strip remaining tags
                    data = re.sub(r'(?<=") ', "", data)  # strip whitespace
                    data = re.sub(r' (?=")', "", data)  # strip whitespace
                    data = pd.read_json(data).transpose()
                    data = data.rename(columns={0: "", 1: "Region name", 2: "Region classification", 3: "Measures", 4: "Active case rate (per 100,000 population)", 5: "Active cases", 6: "Population"})
                elif url == "https://www.alberta.ca/schools/covid-19-school-status-map.htm":
                    data = re.sub(',"container":.*', "", data)  # strip remaining tags
                    data = pd.read_json(data).transpose()
                    data = data.rename(columns={0: "", 1: "Region name", 2: "School status", 3: "Schools details", 4: "num_ord"})
                    data['num_ord'] = data['num_ord'].astype(str).astype(int)  # convert to int
                    data[''] = data[''].astype(str).astype(int)  # convert to int
                    data = data.sort_values(by=['num_ord', ''])  # sort ascending by num_ord and first column (like CSV output on website)
                data = data.to_csv(None, quoting=csv.QUOTE_ALL, index=False)  # to match website output: quote all lines, don't terminate with new line
                with open(f_path, 'w') as local_file:
                    local_file.write(data[:-1])
            elif mb_json_to_csv:
                ## for Manitoba JSON data only: convert JSON to CSV and save as temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                data = pd.json_normalize(json.loads(req.content)['features'])
                ## BUG FIX: str.lstrip('attributes.') strips any leading characters
                ## from the set {a,t,r,i,b,u,e,s,.} (not the prefix string), which
                ## can mangle column names; strip only the literal prefix instead
                data.columns = data.columns.str.replace(r'^attributes\.', '', regex=True)
                ## replace timestamps with actual dates
                if 'Date' in data.columns:
                    data.Date = pd.to_datetime(data.Date / 1000, unit='s').dt.date
                data.to_csv(f_path, index=None)
            else:
                ## all other data: write contents to temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                with open(f_path, mode='wb') as local_file:
                    local_file.write(req.content)
            ## upload file
            s3_dir = os.path.join(dir_parent, dir_file)
            upload_file(full_name, f_path, s3_dir=s3_dir, s3_prefix=prefix)
    except Exception as e:
        ## print failure
        print(e)
        print(background('Error downloading: ' + full_name, Colors.red))
        failure += 1
        ## write failure to log message
        download_log = download_log + 'Failure: ' + full_name + '\n'