Example #1
def main():
    # Load current file
    df_source = pd.read_csv(OUTPUT_FILE)

    #  Get date
    driver = load_driver(DATA_URL)
    try:
        date = load_date(driver)
    except Exception as exc:
        raise Exception("Date not found!") from exc

    # Load dose 1 data
    url = "https://www.fhi.no/api/chartdata/api/99112"
    dix = json.loads(requests.get(url).content)
    df_dose1 = pd.DataFrame(dix, columns=["region", "people_vaccinated"])
    # Load dose 2 data
    url = "https://www.fhi.no/api/chartdata/api/99111"
    dix = json.loads(requests.get(url).content)
    df_dose2 = pd.DataFrame(dix, columns=["region", "people_fully_vaccinated"])
    # Drop the "Fylke" row from each table
    df_dose1 = df_dose1.loc[~(df_dose1["region"] == "Fylke")]
    df_dose2 = df_dose2.loc[~(df_dose2["region"] == "Fylke")]
    # Merge
    df = df_dose1.merge(df_dose2, on="region", how="left")

    # Process region column
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Add columns
    df.loc[:, "date"] = date
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "total_vaccinations"] = (
        df.loc[:, "people_fully_vaccinated"] + df.loc[:, "people_vaccinated"])

    # Add ISO codes
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Concat
    df_source = df_source.loc[~(df_source.loc[:, "date"] == date)]
    df = pd.concat([df, df_source])

    # Export
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    df = keep_min_date(df)
    cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    df[cols] = df[cols].astype("Int64").fillna(pd.NA)
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(country=COUNTRY,
                            url=DATA_URL_REFERENCE,
                            last_update=df["date"].max())
Example #2
def main():
    # Load current data
    df_source = pd.read_csv(OUTPUT_FILE)

    # Load data
    page_content = requests.get(DATA_URL, headers={
        'User-Agent': 'Custom'
    }).content
    soup = BeautifulSoup(page_content, "html.parser")

    # Get new data
    boxes = soup.findAll(class_="col-12 col-md-6 col-xl-4")
    new_data = []
    if len(boxes) == 3:
        for box in boxes:
            fields = box.findAll(class_="col-12")
            if len(fields) == 4:
                region = fields[0].text.strip()
                if "Vaccines administered" in fields[1].text:
                    total, regional = fields[1].findAll(class_="col-auto text-end")
                    dose_1, dose_2 = [
                        int(x.replace(",", ""))
                        for x in regional.text.strip().split("\n")
                    ]
                    new_data.append([region, dose_1, dose_2])
    df = pd.DataFrame(
        new_data,
        columns=["region", "people_vaccinated", "people_fully_vaccinated"])

    # Process
    df.loc[:, "total_vaccinations"] = (
        df.loc[:, "people_vaccinated"] + df.loc[:, "people_fully_vaccinated"])
    df.loc[:, "location"] = COUNTRY

    # Join with date
    url = "https://covid-vaccinatie.be/en/vaccines-administered.xlsx"
    df_dates = get_date(url)
    df = df.merge(df_dates, left_on="region", right_on="Region", how="left")

    # ISO
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)

    # Concat: drop rows already reported for these regions/dates
    region = df_dates.index.tolist()
    date = df_dates.date.tolist()
    df_source = df_source.loc[~(df_source["region"].isin(region)
                                & df_source["date"].isin(date))]
    df = pd.concat([df, df_source])
    df = df[[
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]]
    cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]

    # Avoid repeating reports
    df = keep_min_date(df)

    # Export
    df[cols] = df[cols].astype("Int64").fillna(pd.NA)
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(country=COUNTRY,
                            url=DATA_URL_REFERENCE,
                            last_update=df["date"].max())
Example #3
def main():
    # Load current data
    df_source = pd.read_csv(OUTPUT_FILE)

    # Locate newest pdf
    html_page = urllib.request.urlopen(DATA_URL)
    soup = BeautifulSoup(html_page, "html.parser")
    pdf_path = soup.find("a", text="Download her").get("href")  # Get path to newest pdf
    # Read all tables from the pdf; force dtype to str because of the thousand separator
    kwargs = {"pandas_options": {"dtype": str, "header": None}}
    dfs_from_pdf = tabula.read_pdf(pdf_path, pages="all", **kwargs)  # len(dfs_from_pdf) == 8 ?
    #date = datetime.strptime(pdf_path.split("-")[-2], "%d%m%Y").strftime("%Y-%m-%d")
    date = get_date(dfs_from_pdf)

    # Get preliminary dataframe (second table in the pdf, hardcoded)
    df = dfs_from_pdf[1]

    if df.shape != (11, 7):
        raise Exception("Shape of table changed!")
    if not all(region in df[0].tolist() for region in regions):
        raise Exception("Region missing!")

    # Drop non-data rows (first four and the last)
    df = df.drop([0, 1, 2, 3, len(df) - 1])
    # Rename columns
    df = df.rename(columns={
        0: "region",
        2: "people_vaccinated",
        4: "people_fully_vaccinated"
    })
    df = df.astype(str)

    # Remove the numeric 1000-separator ("." in Danish notation); "nan" becomes 0
    def del_separator(x):
        if x != "nan":
            return int(x.replace(".", ""))
        return 0

    df.loc[:, "people_vaccinated"] = (
        df.loc[:, "people_vaccinated"].apply(del_separator).astype("Int64"))
    df.loc[:, "people_fully_vaccinated"] = (
        df.loc[:, "people_fully_vaccinated"].apply(del_separator).astype("Int64"))

    # Process region column
    df.loc[:, "region"] = df.loc[:, "region"].replace(REGION_RENAMING)

    # Get new columns
    df.loc[:, "total_vaccinations"] = (
        df.loc[:, "people_vaccinated"] + df.loc[:, "people_fully_vaccinated"])
    df.loc[:, "location"] = COUNTRY
    df.loc[:, "date"] = date

    # Add ISO codes
    df = ISODB().merge(df, country_iso=COUNTRY_ISO)
    df.loc[df["region"] == "Others", "location_iso"] = COUNTRY_ISO

    # Concat
    df_source = df_source.loc[~(df_source.loc[:, "date"] == date)]
    df = pd.concat([df, df_source])

    # Export
    cols = [
        "location", "region", "date", "location_iso", "region_iso",
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    df = keep_min_date(df[cols])[cols]
    df = df.sort_values(by=["region", "date"])
    df.to_csv(OUTPUT_FILE, index=False)

    # Tracking
    update_country_tracking(country=COUNTRY,
                            url=DATA_URL_REFERENCE,
                            last_update=df["date"].max())