コード例 #1
0
ファイル: process.py プロジェクト: vonsturm/covid-19-data
def process_location(df: pd.DataFrame,
                     monotonic_check: bool = True) -> pd.DataFrame:
    """Clean one location's vaccination records into the standard layout.

    Steps, in order: parse ``date`` (day-first), keep only rows dated before
    today (local clock), add ``people_vaccinated`` and
    ``people_fully_vaccinated`` as all-NA columns when absent, cast the three
    count columns to nullable ``Int64``, select and order the output columns,
    sort by date, run ``country_df_sanity_checks``, strip surrounding
    whitespace from string cells, and render ``date`` as ``YYYY-MM-DD`` text.

    Args:
        df: Input frame. Must contain ``location``, ``date``, ``vaccine``,
            ``source_url`` and ``total_vaccinations``. It is not modified;
            every step works on a new frame.
        monotonic_check: Forwarded unchanged to ``country_df_sanity_checks``.

    Returns:
        A new DataFrame with the columns ``location``, ``date``, ``vaccine``,
        ``source_url``, ``total_vaccinations``, ``people_vaccinated`` and
        ``people_fully_vaccinated``, sorted by date.
    """
    # Only report up to previous day to avoid partial reporting
    df = df.assign(date=pd.to_datetime(df.date, dayfirst=True))
    df = df[df.date.dt.date < datetime.now().date()]

    # Default columns for second doses. ``assign``/``astype`` return new
    # frames, so nothing is written in place into the filtered frame above
    # (in-place column writes there can raise SettingWithCopyWarning).
    for optional_col in ("people_vaccinated", "people_fully_vaccinated"):
        if optional_col not in df:
            df = df.assign(**{optional_col: pd.NA})
            df = df.astype({optional_col: "Int64"})

    # Avoid decimals: go through float first so values such as 12.0 or "12.0"
    # end up as the integer 12, with missing entries becoming <NA>.
    count_cols = [
        "total_vaccinations", "people_vaccinated", "people_fully_vaccinated"
    ]
    df = df.assign(**{
        col: df[col].astype(float).astype("Int64") for col in count_cols
    })

    # Order columns and rows
    usecols = [
        "location", "date", "vaccine", "source_url", "total_vaccinations",
        "people_vaccinated", "people_fully_vaccinated"
    ]
    df = df[usecols].sort_values(by="date")

    # Sanity checks
    country_df_sanity_checks(df, monotonic_check=monotonic_check)

    # Strip. ``DataFrame.applymap`` was renamed to ``DataFrame.map`` in
    # pandas 2.1 and the old name is deprecated; use whichever is available.
    elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
    df = elementwise(_strip_if_str)

    # Date format
    df = df.assign(date=df.date.dt.strftime("%Y-%m-%d"))

    return df


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace if it is a string."""
    return value.strip() if isinstance(value, str) else value
コード例 #2
0
ファイル: gsheets.py プロジェクト: phat-ap/covid-19-data
def read_csv_and_check(filepath):
    """Load a location CSV, validate its ``date`` column and sanity-check it.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.
            The file must provide ``location`` and ``date`` columns.

    Returns:
        The loaded DataFrame with ``date`` parsed to datetimes.

    Raises:
        ValueError: If the file cannot be read, if ``date`` cannot be parsed
            as ``%Y-%m-%d``, or if the dates are not in non-decreasing order.
            The underlying exception, when there is one, is chained as
            ``__cause__``.
    """
    # Read. Dates are parsed in a separate step below so that a bad date can
    # be reported together with the location it belongs to.
    try:
        df = pd.read_csv(filepath)
    except Exception as err:
        raise ValueError(
            f"Check the spreadsheet corresponding to {filepath}"
        ) from err
    location = df.loc[:, "location"].unique()
    # Date check
    try:
        df = df.assign(date=pd.to_datetime(df["date"], format="%Y-%m-%d"))
    except Exception as err:
        raise ValueError(
            f"{location} -- Invalid date format! Should be %Y-%m-%d. Check {df.date}."
        ) from err
    # ``Series.is_monotonic`` was removed in pandas 2.0;
    # ``is_monotonic_increasing`` is the same (non-strict) check.
    if not df.date.is_monotonic_increasing:
        raise ValueError(f"{location} -- Check that date field is monotonically increasing.")
    # Checks
    country_df_sanity_checks(df)
    return df