Example #1
## imports required by this excerpt; get_datetime, upload_file, background,
## color and Colors are helpers defined elsewhere in the source module
import os
import re
import csv
import json
import tempfile
from zipfile import ZipFile

import requests
import pandas as pd
from bs4 import BeautifulSoup

def dl_file(url,
            dir_parent,
            dir_file,
            file,
            ext='.csv',
            user=False,
            verify=True,
            unzip=False,
            ab_json_to_csv=False,
            mb_json_to_csv=False):
    """Download file (generic).

    Used to download most file types (when Selenium is not required). Some files are handled with file-specific code:

    - The combination unzip=True and file='13100781' has unique code.
    - Each instance of ab_json_to_csv=True has unique code.
    - Each instance of mb_json_to_csv=True has unique code.

    Parameters:
    url (str): URL to download file from.
    dir_parent (str): The parent directory. Example: 'other/can'.
    dir_file (str): The file directory. Example: 'epidemiology-update'.
    file (str): Output file name (excluding extension). Example: 'covid19'.
    ext (str): Extension of the output file. Defaults to '.csv'.
    user (bool): Should the request impersonate a normal browser? Needed to access some data. Default: False.
    verify (bool): If False, requests will skip SSL verification. Default: True.
    unzip (bool): If True, this file requires unzipping. Default: False.
    ab_json_to_csv (bool): If True, this is an Alberta JSON file embedded in a webpage that should be converted to CSV. Default: False.
    mb_json_to_csv (bool): If True, this is a Manitoba JSON file that should be converted to CSV. Default: False.

    """
    global mode, download_log, success, failure, prefix

    ## set names with timestamp and file ext
    name = file + '_' + get_datetime('America/Toronto').strftime(
        '%Y-%m-%d_%H-%M')
    full_name = os.path.join(dir_parent, dir_file, name + ext)

    ## download file
    try:
        ## some websites will reject the request unless you look like a normal web browser
        ## user=True supplies a normal-looking User-Agent string to bypass this
        if user is True:
            headers = {
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"
            }
            req = requests.get(url, headers=headers, verify=verify)
        else:
            req = requests.get(url, verify=verify)

        ## check if request was successful
        if not req.ok:
            ## print failure
            print(background('Error downloading: ' + full_name, Colors.red))
            failure += 1
            ## write failure to log message
            download_log = download_log + 'Failure: ' + full_name + '\n'
        ## successful request: if mode == test, print success and end
        elif mode == 'test':
            ## print success and write to log
            download_log = download_log + 'Success: ' + full_name + '\n'
            print(color('Test download successful: ' + full_name,
                        Colors.green))
            success += 1
        ## successful request: mode == prod, upload file
        else:
            if unzip:
                ## unzip data
                tmpdir = tempfile.TemporaryDirectory()
                zpath = os.path.join(tmpdir.name, 'zip_file.zip')
                with open(zpath, mode='wb') as local_file:
                    local_file.write(req.content)
                with ZipFile(zpath, 'r') as zip_file:
                    zip_file.extractall(tmpdir.name)
                f_path = os.path.join(tmpdir.name, file + ext)
                if file == '13100781':
                    ## read CSV (informative columns only)
                    data = pd.read_csv(f_path,
                                       usecols=[
                                           'REF_DATE',
                                           'Case identifier number',
                                           'Case information', 'VALUE'
                                       ])
                    ## save original order of column values
                    col_order = data['Case information'].unique()
                    ## pivot long to wide
                    data = data.pivot(
                        index=['REF_DATE', 'Case identifier number'],
                        columns='Case information',
                        values='VALUE').reset_index()
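                    ## e.g. two long rows ('2020-01-01', 1, 'Gender', 2) and
                    ## ('2020-01-01', 1, 'Age group', 3) become one wide row
                    ## with 'Gender' and 'Age group' columns (values here are
                    ## hypothetical)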
                    ## use original column order
                    data = data[['REF_DATE', 'Case identifier number'] +
                                col_order.tolist()]
                    ## write CSV
                    data.to_csv(f_path,
                                index=False,
                                quoting=csv.QUOTE_NONNUMERIC)
            elif ab_json_to_csv:
                ## for Alberta JSON data only: extract JSON from webpage, convert JSON to CSV and save as temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                data = re.search(r'(?<="data":)\[\[.*\]\]',
                                 req.text).group(0)
                if url == "https://www.alberta.ca/stats/covid-19-alberta-statistics.htm":
                    data = pd.read_json(data).transpose()
                    data = data.rename(
                        columns={
                            0: "",
                            1: "Date reported",
                            2: "Alberta Health Services Zone",
                            3: "Gender",
                            4: "Age group",
                            5: "Case status",
                            6: "Case type"
                        })
                elif url == "https://www.alberta.ca/maps/covid-19-status-map.htm":
                    data = BeautifulSoup(data, features="html.parser")
                    data = data.get_text()  # strip HTML tags
                    ## this regex may need some tweaking if measures column changes in the future
                    data = re.sub("<\\\/a><\\\/li><\\\/ul>", "",
                                  data)  # strip remaining tags
                    data = re.sub("(?<=\") ", "", data)  # strip whitespace
                    data = re.sub(" (?=\")", "", data)  # strip whitespace
                    data = pd.read_json(data).transpose()
                    data = data.rename(
                        columns={
                            0: "",
                            1: "Region name",
                            2: "Region classification",
                            3: "Measures",
                            4: "Active case rate (per 100,000 population)",
                            5: "Active cases",
                            6: "Population"
                        })
                elif url == "https://www.alberta.ca/schools/covid-19-school-status-map.htm":
                    data = re.sub(',"container":.*', "",
                                  data)  # strip remaining tags
                    data = pd.read_json(data).transpose()
                    data = data.rename(
                        columns={
                            0: "",
                            1: "Region name",
                            2: "School status",
                            3: "Schools details",
                            4: "num_ord"
                        })
                    data['num_ord'] = data['num_ord'].astype(str).astype(
                        int)  # convert to int
                    data[''] = data[''].astype(str).astype(
                        int)  # convert to int
                    data = data.sort_values(
                        by=['num_ord', '']
                    )  # sort ascending by num_ord and first column (like CSV output on website)
                data = data.to_csv(
                    None, quoting=csv.QUOTE_ALL, index=False
                )  # to match website output: quote all fields, don't end with a newline
                with open(f_path, 'w') as local_file:
                    local_file.write(data[:-1])
            elif mb_json_to_csv:
                ## for Manitoba JSON data only: convert JSON to CSV and save as temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                data = pd.json_normalize(json.loads(req.content)['features'])
                ## strip 'attributes.' prefix (lstrip strips a char set, not a prefix)
                data.columns = data.columns.str.replace(
                    r'^attributes\.', '', regex=True)
                ## replace timestamps with actual dates
                if 'Date' in data.columns:
                    data.Date = pd.to_datetime(data.Date,
                                               unit='ms').dt.date
                data.to_csv(f_path, index=False)
            else:
                ## all other data: write contents to temporary file
                tmpdir = tempfile.TemporaryDirectory()
                f_path = os.path.join(tmpdir.name, file + ext)
                with open(f_path, mode='wb') as local_file:
                    local_file.write(req.content)
            ## upload file
            s3_dir = os.path.join(dir_parent, dir_file)
            upload_file(full_name, f_path, s3_dir=s3_dir, s3_prefix=prefix)
    except Exception as e:
        ## print failure
        print(e)
        print(background('Error downloading: ' + full_name, Colors.red))
        failure += 1
        ## write failure to log message
        download_log = download_log + 'Failure: ' + full_name + '\n'
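
A minimal usage sketch (hypothetical URLs and values; assumes the project
helpers get_datetime, upload_file, background, color and Colors are defined
as in the surrounding module):

## initialize the module-level state dl_file expects
mode = 'test'  # 'test' exercises the request path without uploading
download_log = ''
success = 0
failure = 0
prefix = ''

## plain CSV download
dl_file('https://example.com/covid19.csv',
        'other/can', 'epidemiology-update', 'covid19')

## zipped StatCan table with file-specific handling (unzip=True)
dl_file('https://www150.statcan.gc.ca/n1/tbl/csv/13100781-eng.zip',
        'can/statcan', 'individual-level-cases', '13100781', unzip=True)

print(download_log)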