Ejemplo n.º 1
0
    def simulate(self):
        """
        Run the simulation of the multi-phased ODE model with the registered phases.

        Raises:
            covsirphy.UnExecutedError: either tau value or phase information was not set
            ValueError: a parameter of the model is not registered for a phase

        Returns:
            pandas.DataFrame:
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Susceptible (int): the number of susceptible cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
        """
        # Tau and at least one phase are required in advance
        if self._tau is None:
            raise UnExecutedError(
                "ODEHandler.estimate_tau()",
                message="or specify tau when creating an instance of ODEHandler"
            )
        if not self._info_dict:
            raise UnExecutedError("ODEHandler.add()")
        # Each model parameter must be registered for each phase
        for param in self._model.PARAMETERS:
            for (phase, phase_dict) in self._info_dict.items():
                if param in phase_dict["param"]:
                    continue
                raise ValueError(
                    f"{param.capitalize()} is not registered for the {phase} phase."
                )
        # Delegate the calculation to the solver, passing phase settings in registered order
        solver = _MultiPhaseODESolver(self._model, self._first, self._tau)
        phase_settings = self._info_dict.values()
        return solver.simulate(*phase_settings)
Ejemplo n.º 2
0
    def legend(self,
               bbox_to_anchor=(0.5, -0.2),
               bbox_loc="lower center",
               ncol=None,
               **kwargs):
        """
        Add a legend to the axes and tighten the layout.

        Args:
            bbox_to_anchor (tuple(int or float, int or float)): distance of legend and plot
            bbox_loc (str): location of legend
            ncol (int or None): the number of columns that the legend has
            kwargs: keyword arguments of matplotlib.pyplot.legend()

        Raises:
            covsirphy.UnExecutedError: no variables were registered (plot was not performed)

        Note:
            When @ncol is None (or 0), one column is used if @bbox_loc includes "left";
            otherwise the number of plotted variables is used.
        """
        if not self._variables:
            raise UnExecutedError("LinePlot.plot()")
        # Decide the number of columns
        if ncol:
            n_columns = ncol
        elif "left" in bbox_loc:
            n_columns = 1
        else:
            n_columns = len(self._variables)
        n_columns = self._ensure_natural_int(n_columns, name="ncol")
        # Draw the legend
        self._ax.legend(
            bbox_to_anchor=bbox_to_anchor,
            loc=bbox_loc,
            borderaxespad=0,
            ncol=n_columns,
            **kwargs
        )
        plt.tight_layout()
Ejemplo n.º 3
0
    def _model_is_registered(self):
        """
        Confirm that an ODE model was registered to this instance.

        Raises:
            covsirphy.UnExecutedError: ODE model is not registered (self._model is None)
        """
        if self._model is not None:
            return
        raise UnExecutedError("PhaseUnit.set_ode(model)")
Ejemplo n.º 4
0
    def simulate(self):
        """
        Simulate the number of cases with the multi-phased ODE model.

        Raises:
            covsirphy.UnExecutedError: the ODE model was not set

        Returns:
            pandas.DataFrame:
                Index
                    reset index
                Columns
                    - Date (pandas.Timestamp): Observation date
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
                    - Susceptible (int): the number of susceptible cases

        Note:
            Deactivated phases will be included.

        Note:
            Un-registered phases will not be included.

        Note:
            If parameter set is not registered for the current phase and
            the previous phase has parameter set, this set will be used for the current phase.
        """
        # The ODE model is required (tau is passed to the handler as it is)
        if self._model is None:
            raise UnExecutedError(
                "PhaseTracker.estimate() or PhaseTracker.set_ode()")
        # Records of the phases: rows with phase ID 0 are dropped,
        # then missing values are forward-filled and the remaining empty rows removed
        phase_df = self._track_df.copy()
        phase_df = phase_df.loc[phase_df[self.ID] != 0]
        phase_df = phase_df.ffill().dropna()
        # The first and the last dates of each phase
        by_phase = phase_df.reset_index().groupby(self.ID)
        start_dates = by_phase.first()[self.DATE].sort_values()
        end_dates = by_phase.last()[self.DATE].sort_values()
        # Register the phases to ODEHandler
        handler = ODEHandler(self._model, phase_df.index.min(), tau=self._tau)
        parameters = self._model.PARAMETERS[:]
        initial_columns = [self.S, self.CI, self.F, self.R]
        for (start, end) in zip(start_dates, end_dates):
            param_dict = phase_df.loc[end, parameters].to_dict()
            # Initial values are taken from the records only when the phase ends by today
            y0_dict = None
            if end <= self._today:
                y0_dict = phase_df.loc[start, initial_columns].to_dict()
            handler.add(end, param_dict=param_dict, y0_dict=y0_dict)
        # Simulate and add the number of confirmed cases
        sim_df = handler.simulate()
        sim_df[self.C] = sim_df[[self.CI, self.F, self.R]].sum(axis=1)
        return sim_df.loc[:, self.SUB_COLUMNS]
Ejemplo n.º 5
0
    def simulate(self, y0_dict=None):
        """
        Perform simulation with the set/estimated parameter values.

        Args:
            y0_dict (dict or None): dictionary of initial values or None
                - key (str): variable name
                - value (float): initial value

        Raises:
            covsirphy.UnExecutedError: a value of the ODE settings (including tau) is None

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
                    - Variables of the model (int): Confirmed etc.

        Note:
            Simulation starts at the start date of the phase.
            Simulation end at the next date of the end date of the phase.

        Note:
            @y0_dict is not modified. The initial values registered to the phase
            take priority over @y0_dict, and variables of the model which are
            not included in either of them start from 0.
        """
        # Presumably raises when the model is not registered (defined outside this method)
        self._model_is_registered()
        # Initial values: work on a copy so that the caller's dictionary is never mutated
        y0_dict = dict(y0_dict or {})
        y0_dict.update(self.y0_dict)
        diff_set = set(self._model.VARIABLES) - y0_dict.keys()
        y0_dict.update({var: 0 for var in diff_set})
        # Conditions: all ODE settings must have values
        param_dict = self._ode_dict.copy()
        if None in param_dict.values():
            raise UnExecutedError("PhaseUnit.set_ode()")
        # Tau is not an ODE parameter, so separate it from the parameter set
        tau = param_dict.pop(self.TAU)
        last_date = self.tomorrow(self._end_date)
        # Simulation
        simulator = ODESimulator()
        simulator.add(
            model=self._model,
            step_n=self.steps(self._start_date, last_date, tau),
            population=self._population,
            param_dict=param_dict,
            y0_dict=y0_dict
        )
        # Dimensionalized values
        df = simulator.dim(tau=tau, start_date=self._start_date)
        df = self._model.restore(df)
        # Return day-level data: the first record of each day, up to the last date
        df = df.set_index(self.DATE).resample("D").first()
        df = df.loc[df.index <= self._ensure_date(last_date), :]
        return df.reset_index().loc[:, self.NLOC_COLUMNS]
Ejemplo n.º 6
0
    def estimate_tau(self, data, guess_quantile=0.5):
        """
        Choose the tau value [min] which gives the best score of the metric.

        Args:
            data (pandas.DataFrame):
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Susceptible(int): the number of susceptible cases
                    - Infected (int): the number of currently infected cases
                    - Fatal(int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
            guess_quantile (float): quantile to guess ODE parameter values for the candidates of tau

        Returns:
            int: estimated tau value [min]

        Raises:
            covsirphy.UnExecutedError: phase information was not set

        Note:
            ODE parameter for each tau value will be guessed by .guess() classmethod of the model.
            Tau value will be selected from the divisors of 1440 [min] and set to self.
        """
        # Arguments
        self._ensure_dataframe(data, name="data", columns=self.DSIFR_COLUMNS)
        records = data.loc[:, self.DSIFR_COLUMNS]
        if not self._info_dict:
            raise UnExecutedError("ODEHandler.add()")
        self._ensure_float(guess_quantile, name="quantile")
        # Score each candidate of tau, in parallel when two or more jobs are allowed
        scorer = functools.partial(
            self._score_tau, data=records, quantile=guess_quantile)
        candidates = self.divisors(1440)
        if self._n_jobs == 1:
            scores = list(map(scorer, candidates))
        else:
            with Pool(self._n_jobs) as pool:
                scores = pool.map(scorer, candidates)
        score_dict = dict(zip(candidates, scores))
        # Select the candidate with the best score (smallest or largest, depending on the metric)
        if Evaluator.smaller_is_better(metric=self._metric):
            select = min
        else:
            select = max
        self._tau = select(score_dict, key=score_dict.get)
        return self._tau
Ejemplo n.º 7
0
    def _cleaning(self):
        """
        Clean the raw data and return the records by date and province.
        This method overwrite super()._cleaning() method.

        Raises:
            covsirphy.UnExecutedError: the dictionary of variable names was not set

        Returns:
            pandas.DataFrame:
                Index
                    reset index
                Columns
                    - Date (pandas.Timestamp): Observation date
                    - Country (pandas.Category): country/region name
                    - Province (pandas.Category): province/prefecture/state name
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
        """
        if not self.var_dict:
            raise UnExecutedError("CountryData.set_variables()")
        # Copy the raw data with renamed columns
        cleaned = self._raw.copy().rename(columns=self.var_dict)
        # The renamed data must have date and the numbers of cases
        self._ensure_dataframe(
            cleaned,
            name="the raw data",
            columns=[self.DATE, self.C, self.F, self.R])
        # Records without dates cannot be used
        cleaned = cleaned.dropna(subset=[self.DATE])
        # Province: use the registered column, or a constant value when no column was given
        if self.province_col is None:
            cleaned[self.PROVINCE] = self._province or self.UNKNOWN
        else:
            cleaned = cleaned.rename(
                columns={self.province_col: self.PROVINCE})
        # Numbers of cases: un-parsable/missing values are regarded as 0
        value_cols = [self.C, self.F, self.R]
        for value_col in value_cols:
            cleaned[value_col] = pd.to_numeric(
                cleaned[value_col], errors="coerce")
        cleaned[value_cols] = cleaned[value_cols].fillna(0).astype(np.int64)
        cleaned[self.CI] = cleaned[self.C] - cleaned[self.F] - cleaned[self.R]
        # Total values by date and province
        cleaned[self.DATE] = pd.to_datetime(cleaned[self.DATE])
        group_keys = [self.DATE, self.PROVINCE]
        cleaned = cleaned.groupby(group_keys).sum().reset_index()
        # Country name and the order of columns
        cleaned[self.COUNTRY] = self._country
        cleaned = cleaned.loc[:, self.COLUMNS]
        # Area columns are converted to categories to reduce memory
        area_df = cleaned[self.AREA_COLUMNS].astype("category")
        cleaned[self.AREA_COLUMNS] = area_df
        return cleaned
Ejemplo n.º 8
0
    def track(self):
        """
        Return the daily parameter values of each country, combined with OxCGRT values.

        Raises:
            covsirphy.UnExecutedError: the model is not registered (self.model is None)

        Returns:
            pandas.DataFrame: parameter values
                Index
                    reset index
                Columns
                    - Country (str): country name
                    - Date (pd.Timestamp): date
                    - (float): model parameters
                    - (float): model day parameters
                    - Rt (float): reproduction number
                    - (float): OxCGRT values

        Note:
            Only the pairs of country and date which are included in both of
            the summary and the OxCGRT data will be returned (inner join).
        """
        if self.model is None:
            raise UnExecutedError("PolicyMeasures.estimate(model)")
        # Get parameter/Rt/data parameter value of each date
        # NOTE(review): DataFrame.replace() with an explicit None as the value has behaved
        # differently across pandas versions - confirm with the pinned pandas version
        df = self.summary().reset_index().replace(self.UNKNOWN, None)
        df[self.START] = pd.to_datetime(df[self.START],
                                        format=self.DATE_FORMAT)
        df[self.END] = pd.to_datetime(df[self.END], format=self.DATE_FORMAT)
        # One row per phase -> one row per date of the phase.
        # Rows are accessed by label because integer keys on a label-indexed Series
        # are treated as positions only by a deprecated pandas behavior.
        df[self.DATE] = df[[self.START, self.END]].apply(
            lambda row: pd.date_range(
                row[self.START], row[self.END]).tolist(),
            axis=1)
        df = df.explode(self.DATE)
        cols = [
            self.DATE, self.COUNTRY, *self.model.PARAMETERS,
            *self.model.DAY_PARAMETERS, self.RT
        ]
        # reindex() keeps the columns in this order (missing columns will be filled with NA)
        param_df = df.reindex(cols, axis=1)
        # OxCGRT: records of the registered countries
        oxcgrt_df = self.oxcgrt_data.cleaned()
        sel = oxcgrt_df[self.COUNTRY].isin(self._countries)
        oxcgrt_df = oxcgrt_df.loc[
            sel, [self.DATE, self.COUNTRY, *OxCGRTData.OXCGRT_VARS]]
        # Combine data
        return pd.merge(param_df,
                        oxcgrt_df,
                        how="inner",
                        on=[self.COUNTRY, self.DATE])
Ejemplo n.º 9
0
    def estimate(self, record_df=None, **kwargs):
        """
        Estimate the parameter values of the ODE model with the records of the phase.

        Args:
            record_df (pandas.DataFrame or None)
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
                    - any other columns will be ignored
            **kwargs: keyword arguments of Estimator.run()

        Raises:
            covsirphy.UnExecutedError: the records are empty

        Note:
            If @record_df is None, registered records will be used.

        Note:
            @kwargs is passed to both of the Estimator constructor and Estimator.run().
        """
        self._model_is_registered()
        # Records: registered records are used when the argument was not specified
        records = self._record_df.copy() if record_df is None else record_df
        if records.empty:
            raise UnExecutedError("PhaseUnit.record_df = ...",
                                  message="or specify @record_df argument")
        self._ensure_dataframe(
            records, name="record_df", columns=self.NLOC_COLUMNS)
        # Use the records from the start date to the end date of the phase (inclusive)
        first_date = self.date_obj(self.start_date)
        last_date = self.date_obj(self.end_date)
        observed = records[self.DATE]
        not_before_start = observed >= first_date
        not_after_end = observed <= last_date
        records = records.loc[not_before_start & not_after_end, :]
        # Parameter estimation of ODE model
        estimator = Estimator(
            records, self._model, self._population,
            **self._ode_dict, **kwargs)
        estimator.run(**kwargs)
        self._read_estimator(estimator, records)
        # Keep the estimator
        self._estimator = estimator
Ejemplo n.º 10
0
    def simulate(self, y0_dict=None):
        """
        Simulate ODE models with set/estimated parameter values.

        Args:
            y0_dict(dict[str, float] or None): dictionary of initial values of variables

        Raises:
            covsirphy.UnExecutedError: NameError was raised by the simulation of the phase series

        Returns:
            pandas.DataFrame
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Country (str): country/region name
                    - Province (str): province/prefecture/state name
                    - Variables of the model and dataset (int): Confirmed etc.
        """
        # Presumably raises when phases were not set (defined outside this method)
        self._ensure_phase_setting()
        try:
            return self._series.simulate(record_df=self.record_df,
                                         y0_dict=y0_dict)
        except NameError as err:
            # Chain explicitly so that the original error is shown as the direct cause
            raise UnExecutedError(".estimate()") from err
Ejemplo n.º 11
0
 def _ensure_phase_setting(self):
     """
     Confirm that phases were set to the series.

     Raises:
         covsirphy.UnExecutedError: the series of phases is empty
     """
     if self._series:
         return
     raise UnExecutedError(".trend() or .add()")
Ejemplo n.º 12
0
    def parse_range(self, dates=None, past_days=None, phases=None):
        """
        Parse the arguments and return the minimum date and the maximum date of the range.

        Args:
            dates (tuple(str or pandas.Timestamp or None, ) or None): start date and end date
            past_days (int or None): how many past days to use in calculation from today (property)
            phases (list[str] or None): phase names to use in calculation

        Raises:
            covsirphy.UnExecutedError: no phases were registered
            ValueError: @dates argument does not have exact two elements

        Returns:
            tuple(pandas.Timestamp, pandas.Timestamp): the minimum date and maximum date

        Note:
            When not specified (i.e. None was applied),
            the start date of the 0th phase will be used as the minimum date.

        Note:
            When not specified (i.e. None was applied),
            the end date of the last phase will be used as the maximum date.

        Note:
            When @past_days was specified, (today - @past_days, today) will be returned.

        Note:
            In @phases, 'last' means the last registered phase.

        Note:
            Priority is given in the order of @dates, @past_days, @phases.
        """
        if not self:
            raise UnExecutedError("PhaseTracker.define_phase()")
        # Table of phases: index=phase names, columns include Start/End.
        # Rows with phase ID 0 are removed and the remaining IDs are re-numbered from 0
        records = self._track_df.reset_index()
        records = records.loc[records[self.ID] != 0]
        codes, _ = records[self.ID].factorize()
        records[self.ID] = codes
        by_phase = records.groupby(self.ID)
        phase_df = by_phase.first().join(by_phase.last(), rsuffix="_last")
        phase_df = phase_df.rename(
            columns={self.DATE: self.START, f"{self.DATE}_last": self.END})
        phase_df.index = [self.num2str(num) for num in phase_df.index]
        # Default values: the whole range of the phases
        start_default = phase_df[self.START].min()
        end_default = phase_df[self.END].max()
        # 1st priority: @dates
        if dates is not None:
            if len(dates) != 2:
                raise ValueError(
                    f"@dates must be a tuple which has two elements, but {dates} was applied."
                )
            first_element, second_element = dates[0], dates[1]
            start = self._ensure_date(
                first_element,
                name="the first element of 'dates' argument",
                default=start_default)
            end = self._ensure_date(
                second_element,
                name="the second element of 'dates' argument",
                default=end_default)
            self._ensure_date_order(
                start, end, name="the second element of 'dates' argument")
            return (start, end)
        # 2nd priority: @past_days
        if past_days is not None:
            n_days = self._ensure_natural_int(past_days, name="past_days")
            return (self._today - timedelta(days=n_days), self._today)
        # Nothing was specified: return the default values
        if phases is None:
            return (start_default, end_default)
        # 3rd priority: @phases, collecting all dates covered by the selected phases
        self._ensure_list(phases, name="phases")
        covered_dates = []
        for phase in phases:
            phase_name = phase_df.index[-1] if phase == "last" else phase
            self._ensure_selectable(
                phase_name, phase_df.index.tolist(), name="phase")
            period = pd.date_range(
                phase_df.loc[phase_name, self.START],
                phase_df.loc[phase_name, self.END])
            covered_dates.extend(period.tolist())
        return (min(covered_dates), max(covered_dates))
Ejemplo n.º 13
0
    def estimate_params(self,
                        data,
                        quantiles=(0.1, 0.9),
                        check_dict=None,
                        study_dict=None,
                        **kwargs):
        """
        Estimate ODE parameter values of the all phases to minimize the score of the metric.

        Args:
            data (pandas.DataFrame):
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Susceptible(int): the number of susceptible cases
                    - Infected (int): the number of currently infected cases
                    - Fatal(int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
            quantiles (tuple(float, float)): quantiles to cut parameter range, like confidence interval
            check_dict (dict[str, object] or None): setting of validation
                - None means {"timeout": 180, "timeout_interation": 5, "tail_n": 4, "allowance": (0.99, 1.01)}
                - timeout (int): timeout of optimization
                - timeout_iteration (int): timeout of one iteration
                - tail_n (int): the number of iterations to decide whether score did not change for the last iterations
                - allowance (tuple(float, float)): the allowance of the max predicted values
            study_dict (dict[str, object] or None): setting of optimization study
                - None means {"pruner": "threshold", "upper": 0.5, "percentile": 50, "seed": 0}
                - pruner (str): kind of pruner (hyperband, median, threshold or percentile)
                - upper (float): works for "threshold" pruner, intermediate score is larger than this value, it prunes
                - percentile (float): works for "Percentile" pruner, the best intermediate value is in the bottom percentile among trials, it prunes
            kwargs: we can set arguments directly. E.g. timeout=180 for check_dict={"timeout": 180,...}

        Raises:
            covsirphy.UnExecutedError: either tau value or phase information was not set

        Returns:
            dict(str, object): setting of the phase (key: phase name)
                - Start (pandas.Timestamp): start date
                - End (pandas.Timestamp): end date
                - Rt (float): phase-dependent reproduction number
                - (str, float): estimated parameter values, including rho
                - (int or float): day parameters, including 1/beta [days]
                - {metric}: score with the estimated parameter values
                - Trials (int): the number of trials
                - Runtime (str): runtime of optimization

        Note:
            @check_dict and @study_dict are not modified; copies are used internally.
            All items of @kwargs are added to both of the copies.
        """
        print(f"\n<{self._model.NAME} model: parameter estimation>")
        print(f"Running optimization with {self._n_jobs} CPUs...")
        stopwatch = StopWatch()
        # Arguments
        self._ensure_dataframe(data, name="data", columns=self.DSIFR_COLUMNS)
        df = data.loc[:, self.DSIFR_COLUMNS]
        if not self._info_dict:
            raise UnExecutedError("ODEHandler.add()")
        if self._tau is None:
            raise UnExecutedError(
                "ODEHandler.estimate_tau()",
                message="or specify tau when creating an instance of ODEHandler"
            )
        # Arguments used in the old Estimator.
        # Copies are created so that the dictionaries given by the caller are never mutated.
        # NOTE(review): "timeout_interation" looks like a typo of "timeout_iteration";
        # kept as-is because the consumer of this key is not visible here - confirm
        check_dict = dict(check_dict or {
            "timeout": 180,
            "timeout_interation": 5,
            "tail_n": 4,
            "allowance": (0.99, 1.01)
        })
        check_dict.update(kwargs)
        study_dict = dict(study_dict or {
            "pruner": "threshold",
            "upper": 0.5,
            "percentile": 50,
            "seed": 0
        })
        study_dict.update(kwargs)
        # ODE parameter estimation of each phase, in parallel when two or more jobs are allowed
        est_f = functools.partial(self._estimate_params,
                                  data=df,
                                  quantiles=quantiles,
                                  check_dict=check_dict,
                                  study_dict=study_dict)
        phases = list(self._info_dict)
        if self._n_jobs == 1:
            est_dict_list = [est_f(ph) for ph in phases]
        else:
            with Pool(self._n_jobs) as p:
                est_dict_list = p.map(est_f, phases)
        # Register the estimated parameter values to the phases
        for (phase, est_dict) in zip(phases, est_dict_list):
            self._info_dict[phase]["param"] = {
                param: est_dict[param]
                for param in self._model.PARAMETERS
            }
        print(f"Completed optimization. Total: {stopwatch.stop_show()}")
        # Start/end dates of each phase, followed by all estimated values of the phase
        return {
            k: {
                self.START: self._info_dict[k][self.START],
                self.END: self._info_dict[k][self.END],
                **v
            }
            for (k, v) in zip(phases, est_dict_list)
        }