Esempio n. 1
0
def __parse_df_from_2021_01_18(source: Source,
                               df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse the daily spreadsheet layout used from 2021-01-18 onwards.

    Rows are skipped until one whose first cell reads "region of residence"
    (optionally prefixed with "nhs "). Every later row, up to the
    "Data quality notes:" marker, yields three records for its location:
    1st dose, 2nd dose and the cumulative count.

    Args:
        source: Attached unchanged to every yielded record.
        df: Raw sheet. Must contain an "Unnamed: 0" column, which is dropped.

    Yields:
        ``Vaccinated`` records in the order DOSE_1, DOSE_2, ALL per location.

    Raises:
        ValueError: If a data row does not hold exactly three non-NaN values
            after the location cell.
        TypeError: If a data cell is not numeric or a location is not a
            string (raised by ``math.isnan`` / ``re.match``).
    """
    # One shared iterator: the second loop resumes right after the header row
    # that the first loop stopped on.
    rows = df.drop("Unnamed: 0", axis=1).iterrows()

    for _, (title, *_rest) in rows:
        if isinstance(title, str) and re.match(
                "^(nhs )?region of residence$", title.lower()):
            break

    for _, (location, *data) in rows:
        # isinstance (rather than an exact type comparison) also recognises
        # numpy float scalars, so a NaN location is always skipped instead of
        # reaching re.match below.
        if isinstance(location, float) and math.isnan(location):
            continue
        if location == "Data quality notes:":
            break

        # Blank spacer columns show up as NaN; exactly three figures remain.
        dose_1, dose_2, cumulative = (d for d in data if not math.isnan(d))

        if re.match(r"^(Total\d?|England)$", location):
            location = ALL_LOCATIONS
        else:
            location = Location(location)

        yield Vaccinated(source, dose_1,
                         Slice(location=location, dose=Dose.DOSE_1))
        yield Vaccinated(source, dose_2,
                         Slice(location=location, dose=Dose.DOSE_2))
        yield Vaccinated(source, cumulative,
                         Slice(location=location, dose=Dose.ALL))
Esempio n. 2
0
def parse(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Convert one loaded spreadsheet into ``Vaccinated`` records.

    The weekly publications dated 2020-12-31 and 2021-01-07 used one-off
    layouts that are not worth a dedicated parser, so their figures are
    hard-coded here and ``df`` is ignored for them. Every other source is
    routed to the parser matching ``source.period`` and, for daily data,
    ``source.data_date``.

    Raises:
        AssertionError: If ``source.period`` is neither "daily" nor "weekly".
    """
    period = source.period

    if period == "weekly":
        published = source.data_date
        if published == date(2021, 1, 7):
            counts = (438075, 13567, 654810, 6414)
        elif published == date(2020, 12, 31):
            counts = (261561, 0, 524439, 0)
        else:
            return __parse_df_weekly(source, df)

        # Same order as the counts above: under-80s 1st/2nd dose, then
        # over-80s 1st/2nd dose.
        slices = (
            Slice(Dose.DOSE_1, UNDER_80S, ALL_LOCATIONS),
            Slice(Dose.DOSE_2, UNDER_80S, ALL_LOCATIONS),
            Slice(Dose.DOSE_1, OVER_80S, ALL_LOCATIONS),
            Slice(Dose.DOSE_2, OVER_80S, ALL_LOCATIONS),
        )
        return [
            Vaccinated(source, count, slice_)
            for count, slice_ in zip(counts, slices)
        ]

    if period == "daily":
        uses_new_layout = source.data_date >= date(2021, 1, 18)
        parser = (__parse_df_from_2021_01_18
                  if uses_new_layout else __parse_df_earliest)
        return parser(source, df)

    raise AssertionError()
Esempio n. 3
0
def __parse_df_earliest(source: Source, df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse the earliest daily spreadsheet layout.

    After dropping the "Unnamed: 0" column, each row's first cell is treated
    as a title. A seven-word title containing " to " maps to ``Dose.ALL``;
    the "of which, 1st dose" / "of which, 2nd dose" titles map to the
    respective dose. Any other row is skipped. The reported figure is the
    second cell after the title.
    """
    dose_by_label = {
        "of which, 1st dose": Dose.DOSE_1,
        "of which, 2nd dose": Dose.DOSE_2,
    }

    trimmed = df.drop("Unnamed: 0", axis=1)
    for _, cells in trimmed.iterrows():
        title, *data = cells

        # Only plain-string titles can label a row of interest.
        if type(title) is not str:
            continue

        if " to " in title and len(title.split()) == 7:
            dose = Dose.ALL
        else:
            dose = dose_by_label.get(title.strip().lower())
            if dose is None:
                continue

        yield Vaccinated(source, data[1], Slice(dose=dose))
Esempio n. 4
0
def __parse_df_weekly(source: Source,
                      df: pd.DataFrame) -> Iterable[Vaccinated]:
    """Parse a weekly spreadsheet into one record per (dose, group, location).

    The table is cut out of the sheet between the "region of residence"
    header cell and the "data quality notes:" cell, and all-NaN rows and
    columns are removed. Row 0 of the result holds dose headers (merged
    cells are forward-filled), row 1 holds group headers, and column 0 holds
    locations. Cells that map to the same slice are summed.

    Args:
        source: Attached unchanged to every yielded record.
        df: Raw sheet as loaded by pandas.

    Yields:
        One ``Vaccinated`` per distinct slice, in order of first appearance.

    Raises:
        ValueError: If the start or end marker does not occur exactly once.
        AssertionError: If a column that is not ignored has a dose header
            that is not recognised (including a non-string header).
    """
    # Lower-cased fragments of dose headers whose columns hold no counts to
    # record: population estimates, precalculated percentages, dose summaries.
    ignored_dose_headers = (
        "population estimates",
        "% who have had at least 1 dose",
        "% who have had both doses",
        "total 1st doses",
        "total 2nd doses",
    )

    def is_start(cell) -> bool:
        return isinstance(cell, str) and cell.lower() in (
            "region of residence", "nhs region of residence")

    def is_end(cell) -> bool:
        return isinstance(cell, str) and cell.lower() == "data quality notes:"

    def is_nan(cell) -> bool:
        return isinstance(cell, float) and math.isnan(cell)

    a = df.to_numpy()

    # Trim. The tuple unpacking doubles as a check that each marker occurs
    # exactly once.
    (start_y, ), (start_x, ) = np.where(np.vectorize(is_start)(a))
    (end_y, ), (_, ) = np.where(np.vectorize(is_end)(a))
    a = a[start_y:end_y, start_x:]

    # Remove NaNs. The mask is computed once; dropping columns first does not
    # change the row count, so the row mask stays valid.
    is_nans = np.vectorize(is_nan)(a)
    a = a[:, ~np.all(is_nans, axis=0)]
    a = a[~np.all(is_nans, axis=1), :]

    # Fill in dose row: a merged dose header only appears in its first
    # column, so carry it forward over the NaN cells that follow.
    filled_in_doses = []
    current_dose = None
    for population in a[0, 1:]:
        if not is_nan(population):
            current_dose = population
        filled_in_doses.append(current_dose)
    a[0, 1:] = filled_in_doses

    if a.shape[0] <= 2:
        # Header rows only: there are no data rows to report.
        return

    # Classify every column once instead of once per cell; the dose and group
    # of a column do not depend on the row.
    columns = []
    for x in range(1, a.shape[1]):
        dose_header = a[0, x]
        group_header = a[1, x]

        if not isinstance(dose_header, str):
            raise AssertionError(
                f"Unexpected dose {dose_header} in source {source}")

        dose_key = dose_header.lower()
        if any(fragment in dose_key for fragment in ignored_dose_headers):
            continue
        if (isinstance(group_header, str)
                and "percent of all" in group_header.lower()):
            # Ignore percentage reports.
            continue

        is_dose_and_group_all = "cumulative total doses to date" in dose_key

        if dose_header in ("1st dose", "1st dose5"):
            dose = Dose.DOSE_1
        elif dose_header in ("2nd dose", "2nd dose5"):
            dose = Dose.DOSE_2
        elif is_dose_and_group_all:
            dose = Dose.ALL
        else:
            raise AssertionError(
                f"Unexpected dose {dose_header} in source {source}")

        if is_dose_and_group_all:
            group = ALL_AGES
        else:
            group = Group.from_csv_str(group_header)

        columns.append((x, dose, group))

    if not columns:
        return

    vaccinated_by_slice: DefaultDict[Slice, int] = defaultdict(int)

    for y in range(2, a.shape[0]):
        row_label = a[y, 0]
        if re.match(r"^Total\d?$", row_label):
            location = ALL_LOCATIONS
        else:
            location = Location(row_label)

        for x, dose, group in columns:
            vaccinated_by_slice[Slice(dose, group, location)] += a[y, x]

    for slice_, vaccinated in vaccinated_by_slice.items():
        yield Vaccinated(source, vaccinated, slice_)
Esempio n. 5
0
def add_extrapolations(vaccinated: List[Vaccinated]) -> Iterable[Vaccinated]:
    """Project cumulative 1st/2nd dose counts forward and append the inputs.

    For each of the 364 days after the latest ``source.real_date`` in
    ``vaccinated``, a daily budget of ``int(vaccination_rate / 7)`` doses is
    split between 2nd doses (owed 12 weeks after a 1st dose) and 1st doses,
    each capped by ``population.total_population()``. Two records marked
    ``extrapolated=True`` are yielded per simulated day (DOSE_1, then
    DOSE_2), and finally the original ``vaccinated`` records are yielded
    unchanged.

    Intermediate values are written to the Streamlit page via ``st.write``.

    Args:
        vaccinated: Records that must all be for ``ALL_LOCATIONS`` and
            ``ALL_AGES`` (asserted below).

    Raises:
        AssertionError: If any record is not for all locations / all ages,
            or if a simulated daily dose count turns negative.
        IndexError: If there is no ``Dose.DOSE_1`` record.
        ValueError: If ``vaccinated`` is empty (``max`` of an empty sequence).
    """
    import streamlit as st

    assert all(v.slice.location == ALL_LOCATIONS for v in vaccinated)
    assert all(v.slice.group == ALL_AGES for v in vaccinated)

    # Cumulative 1st-dose count keyed by real_date.
    dose_1_vaccinations = {
        v.source.real_date: v.vaccinated
        for v in vaccinated if v.slice.dose == Dose.DOSE_1
    }
    dose_1_vaccinations_dates = list(sorted(dose_1_vaccinations.keys()))
    # New 1st doses per date: the earliest date keeps its cumulative value,
    # every later date gets the difference to the preceding date.
    dose_1_new_vaccinations = {
        dose_1_vaccinations_dates[0]:
        dose_1_vaccinations[dose_1_vaccinations_dates[0]]
    }
    for d1, d2 in zip(dose_1_vaccinations_dates,
                      dose_1_vaccinations_dates[1:]):
        st.write(d1, d2, dose_1_vaccinations[d1], dose_1_vaccinations[d2])
        dose_1_new_vaccinations[
            d2] = dose_1_vaccinations[d2] - dose_1_vaccinations[d1]
    # Dates without an entry count as zero new 1st doses in the lookups below.
    dose_1_new_vaccinations = defaultdict(int, dose_1_new_vaccinations)
    st.write({str(k): v for k, v in dose_1_new_vaccinations.items()})

    # Weekly rate = (sum over all records at the latest date) minus (sum over
    # all records exactly one week earlier).
    # NOTE(review): the sums include every dose slice present at that date;
    # if Dose.ALL records are passed in alongside DOSE_1/DOSE_2 this would
    # double count - confirm against the caller.
    date_latest = max(v.source.real_date for v in vaccinated)
    this_week_vaccinations = sum(v.vaccinated for v in vaccinated
                                 if v.source.real_date == date_latest)
    last_week_vaccinations = sum(v.vaccinated for v in vaccinated
                                 if v.source.real_date == date_latest -
                                 timedelta(weeks=1))
    vaccination_rate = this_week_vaccinations - last_week_vaccinations
    st.write("last week", last_week_vaccinations)
    st.write("this week", this_week_vaccinations)
    st.write("vaccination rate", vaccination_rate)

    # Starting points for the simulation: cumulative counts at the latest date.
    # NOTE(review): if no matching record exists, next() raises StopIteration,
    # which surfaces as RuntimeError inside a generator (PEP 479).
    cumulative_dose_1_vaccinations = next(
        v.vaccinated for v in vaccinated
        if v.source.real_date == date_latest and v.slice.dose == Dose.DOSE_1)
    cumulative_dose_2_vaccinations = next(
        v.vaccinated for v in vaccinated
        if v.source.real_date == date_latest and v.slice.dose == Dose.DOSE_2)
    total_population = population.total_population()
    # Running backlog of 2nd doses that have fallen due but were not yet given.
    dose_2_vaccinations_required = 0
    for day in range(1, 365):
        current_date = date_latest + timedelta(days=day)
        # Daily budget of doses, from the weekly rate.
        new_vaccinations = int(vaccination_rate / 7)
        # 1st doses given 12 weeks before this day now need their 2nd dose.
        dose_2_vaccinations_required += dose_1_new_vaccinations[current_date -
                                                                timedelta(
                                                                    weeks=12)]

        # 2nd doses take priority, limited by the daily budget.
        dose_2_vaccinations = min(max(0, dose_2_vaccinations_required),
                                  new_vaccinations)
        # Note: from here on the name dose_1_vaccinations holds the day's
        # 1st-dose count (an int), no longer the dict built above.
        dose_1_vaccinations = new_vaccinations - dose_2_vaccinations
        # Never give more 1st doses than there are people left without one.
        dose_1_vaccinations = min(
            dose_1_vaccinations,
            total_population - cumulative_dose_1_vaccinations)
        # Budget left over after the cap goes to additional 2nd doses, again
        # capped by the population not yet given a 2nd dose.
        if dose_1_vaccinations + dose_2_vaccinations < new_vaccinations:
            dose_2_vaccinations += new_vaccinations - (dose_1_vaccinations +
                                                       dose_2_vaccinations)
            dose_2_vaccinations = min(
                dose_2_vaccinations,
                total_population - cumulative_dose_2_vaccinations)
        assert dose_1_vaccinations >= 0
        assert dose_2_vaccinations >= 0

        cumulative_dose_2_vaccinations += dose_2_vaccinations
        cumulative_dose_1_vaccinations += dose_1_vaccinations
        dose_2_vaccinations_required -= dose_2_vaccinations

        # NOTE(review): this stores (1st doses - 2nd doses) for the day, and
        # the value feeds the 12-week lookup above for later days. Confirm
        # the subtraction is intended rather than plain dose_1_vaccinations.
        dose_1_new_vaccinations[
            current_date] = dose_1_vaccinations - dose_2_vaccinations
        yield Vaccinated(
            source=Source("", current_date, current_date, "weekly"),
            slice=Slice(dose=Dose.DOSE_1),
            vaccinated=cumulative_dose_1_vaccinations,
            extrapolated=True,
        )
        yield Vaccinated(
            source=Source("", current_date, current_date, "weekly"),
            slice=Slice(dose=Dose.DOSE_2),
            vaccinated=cumulative_dose_2_vaccinations,
            extrapolated=True,
        )

    # The real records come after all extrapolated ones.
    yield from vaccinated
Esempio n. 6
0
def deaggregate_with_interpolation(
        aggregate: Vaccinated, dim: str,
        vaccinated: List[Vaccinated]) -> Iterable[Vaccinated]:
    """Split an aggregate record along ``dim`` using shares from weekly data.

    Weekly records that are broken down along ``dim`` and agree with
    ``aggregate`` on every other slice dimension are collected. The two
    weekly dates closest to ``aggregate.source.real_date`` are selected; for
    each value of ``dim`` seen on those dates, its share of the total is
    linearly interpolated between the two dates (clamped to the range of the
    two dates) and applied to ``aggregate.vaccinated``.

    If only one of the two dates has a non-zero total, that date's shares are
    used for both ends. If fewer than two distinct weekly dates are
    available, or both totals are zero, a message is printed and nothing is
    yielded.

    Args:
        aggregate: The record to split.
        dim: Name of the slice attribute to split along.
        vaccinated: Pool of records to take the weekly breakdown from.

    Yields:
        One ``Vaccinated`` per value of ``dim``, marked ``interpolated=True``
        and carrying ``aggregate.source``.

    Raises:
        AssertionError: If an interpolated count comes out negative.
    """
    other_dims = [d for d in __SLICE_DIMS if d != dim]

    vaccinated_weekly = [
        v for v in vaccinated
        if v.source.period == "weekly"
        and not getattr(v.slice, dim).is_all()
        and all(
            getattr(v.slice, other_dim) == getattr(aggregate.slice, other_dim)
            for other_dim in other_dims)
    ]

    target_date = aggregate.source.real_date
    weekly_dates = {v.source.real_date for v in vaccinated_weekly}

    # Interpolation needs two distinct dates. Counting samples is not enough:
    # several samples usually share a single date (one per value of `dim`).
    if len(weekly_dates) < 2:
        print(f"Failed to interpolate "
              f"{aggregate.slice} {target_date} "
              f"with {len(vaccinated_weekly)} samples "
              f"on {len(weekly_dates)} distinct dates")
        return

    # The two dates nearest to the target, then put in chronological order.
    nearest = sorted(weekly_dates,
                     key=lambda d: abs((d - target_date).days))[:2]
    dates = sorted(nearest)

    # Single pass: per-date totals and per-(date, value) counts, instead of
    # rescanning every sample for every value of `dim`.
    totals = {dates[0]: 0, dates[1]: 0}
    counts = {}
    dim_values = set()
    for v in vaccinated_weekly:
        when = v.source.real_date
        if when not in totals:
            continue
        value = getattr(v.slice, dim)
        dim_values.add(value)
        totals[when] += v.vaccinated
        counts[when, value] = counts.get((when, value), 0) + v.vaccinated

    if totals[dates[0]] == 0 and totals[dates[1]] == 0:
        # No vaccinations on either date: there is no share to apply.
        print(f"Failed to interpolate "
              f"{aggregate.slice} {target_date} "
              f"with no vaccinations on {dates[0]} or {dates[1]}")
        return

    def share(when, value):
        """Return value's share of the total on `when`, or None if total is 0."""
        total = totals[when]
        if total == 0:
            return None
        return counts.get((when, value), 0) / total

    # Position of the aggregate between the two dates, clamped to [0, 1].
    # NOTE(review): this uses source.data_date while the nearest dates above
    # are chosen by source.real_date - confirm the mix is intended.
    date_progress = (aggregate.source.data_date -
                     dates[0]).days / (dates[1] - dates[0]).days
    date_progress = max(0.0, min(1.0, date_progress))

    for dim_value in dim_values:
        ratio0 = share(dates[0], dim_value)
        ratio1 = share(dates[1], dim_value)
        # A date with a zero total has no defined shares; borrow the other's.
        if ratio0 is None:
            ratio0 = ratio1
        if ratio1 is None:
            ratio1 = ratio0

        ratio = ratio0 + (ratio1 - ratio0) * date_progress
        new_vaccinated = int(aggregate.vaccinated * ratio)
        assert new_vaccinated >= 0, (
            dim,
            dim_value,
            ratio,
            ratio0,
            ratio1,
            dates,
            aggregate.source.real_date,
        )
        yield Vaccinated(
            source=aggregate.source,
            vaccinated=new_vaccinated,
            slice=replace(aggregate.slice, **{dim: dim_value}),
            interpolated=True,
        )