def simulate(self): """ Perform simulation with the multi-phased ODE model. Raises: covsirphy.UnExecutedError: either tau value or phase information was not set Returns: pandas.DataFrame: Index reset index Columns - Date (pd.Timestamp): Observation date - Susceptible (int): the number of susceptible cases - Infected (int): the number of currently infected cases - Fatal (int): the number of fatal cases - Recovered (int): the number of recovered cases """ if self._tau is None: raise UnExecutedError( "ODEHandler.estimate_tau()", message="or specify tau when creating an instance of ODEHandler" ) if not self._info_dict: raise UnExecutedError("ODEHandler.add()") combs = itertools.product(self._model.PARAMETERS, self._info_dict.items()) for (param, (phase, phase_dict)) in combs: if param not in phase_dict["param"]: raise ValueError( f"{param.capitalize()} is not registered for the {phase} phase." ) solver = _MultiPhaseODESolver(self._model, self._first, self._tau) return solver.simulate(*self._info_dict.values())
def legend(self, bbox_to_anchor=(0.5, -0.2), bbox_loc="lower center", ncol=None, **kwargs):
    """
    Set the legend of the figure.

    Args:
        bbox_to_anchor (tuple(int or float, int or float)): distance of the legend from the plot area
        bbox_loc (str): location of the legend
        ncol (int or None): the number of columns of the legend
        kwargs: keyword arguments of matplotlib.pyplot.legend()

    Raises:
        covsirphy.UnExecutedError: LinePlot.plot() was not performed
    """
    if not self._variables:
        raise UnExecutedError("LinePlot.plot()")
    ncol = self._ensure_natural_int(
        ncol or (1 if "left" in bbox_loc else len(self._variables)), name="ncol")
    self._ax.legend(bbox_to_anchor=bbox_to_anchor, loc=bbox_loc, borderaxespad=0, ncol=ncol, **kwargs)
    plt.tight_layout()
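
# --- Placement pattern wrapped by LinePlot.legend() ---
# A minimal, pure-matplotlib sketch (not covsirphy code) showing how bbox_to_anchor, loc and
# ncol interact: the legend is anchored below the axes with one column per plotted variable.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({"Confirmed": [1, 2, 4], "Recovered": [0, 1, 2]})
ax = df.plot()
ax.legend(bbox_to_anchor=(0.5, -0.2), loc="lower center", borderaxespad=0, ncol=len(df.columns))
plt.tight_layout()
plt.savefig("legend_demo.png")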
def _model_is_registered(self):
    """
    Ensure that the ODE model was set.

    Raises:
        covsirphy.UnExecutedError: ODE model is not registered
    """
    if self._model is None:
        raise UnExecutedError("PhaseUnit.set_ode(model)")
def simulate(self): """ Perform simulation with the multi-phased ODE model. Raises: covsirphy.UnExecutedError: either tau value or phase information was not set Returns: pandas.DataFrame: Index reset index Columns - Date (pandas.Timestamp): Observation date - Confirmed (int): the number of confirmed cases - Infected (int): the number of currently infected cases - Fatal (int): the number of fatal cases - Recovered (int): the number of recovered cases - Susceptible (int): the number of susceptible cases Note: Deactivated phases will be included. Note: Un-registered phases will not be included. Note: If parameter set is not registered for the current phase and the previous phase has parameter set, this set will be used for the current phase. """ # Model and tau must be set if self._model is None: raise UnExecutedError( "PhaseTracker.estimate() or PhaseTracker.set_ode()") # Get parameter sets and initial values record_df = self._track_df.copy() record_df = record_df.loc[record_df[self.ID] != 0].ffill().dropna() start_dates = record_df.reset_index().groupby( self.ID).first()[self.DATE].sort_values() end_dates = record_df.reset_index().groupby( self.ID).last()[self.DATE].sort_values() # Set-up ODEHandler handler = ODEHandler(self._model, record_df.index.min(), tau=self._tau) parameters = self._model.PARAMETERS[:] for (start, end) in zip(start_dates, end_dates): param_dict = record_df.loc[end, parameters].to_dict() if end <= self._today: y0_dict = record_df.loc[ start, [self.S, self.CI, self.F, self.R]].to_dict() else: y0_dict = None _ = handler.add(end, param_dict=param_dict, y0_dict=y0_dict) # Perform simulation sim_df = handler.simulate() sim_df[self.C] = sim_df[[self.CI, self.F, self.R]].sum(axis=1) return sim_df.loc[:, self.SUB_COLUMNS]
def simulate(self, y0_dict=None): """ Perform simulation with the set/estimated parameter values. Args: y0_dict (dict or None): dictionary of initial values or None - key (str): variable name - value (float): initial value Returns: pandas.DataFrame Index reset index Columns - Date (pd.Timestamp): Observation date - Confirmed (int): the number of confirmed cases - Infected (int): the number of currently infected cases - Fatal (int): the number of fatal cases - Recovered (int): the number of recovered cases - Variables of the model (int): Confirmed etc. Note: Simulation starts at the start date of the phase. Simulation end at the next date of the end date of the phase. """ self._model_is_registered() # Initial values y0_dict = y0_dict or {} y0_dict.update(self.y0_dict) diff_set = set(self._model.VARIABLES) - y0_dict.keys() y0_dict.update({var: 0 for var in diff_set}) # Conditions param_dict = self._ode_dict.copy() if None in param_dict.values(): raise UnExecutedError("PhaseUnit.set_ode()") tau = param_dict.pop(self.TAU) last_date = self.tomorrow(self._end_date) # Simulation simulator = ODESimulator() simulator.add( model=self._model, step_n=self.steps(self._start_date, last_date, tau), population=self._population, param_dict=param_dict, y0_dict=y0_dict ) # Dimensionalized values df = simulator.dim(tau=tau, start_date=self._start_date) df = self._model.restore(df) # Return day-level data df = df.set_index(self.DATE).resample("D").first() df = df.loc[df.index <= self._ensure_date(last_date), :] return df.reset_index().loc[:, self.NLOC_COLUMNS]
def estimate_tau(self, data, guess_quantile=0.5):
    """
    Select the tau value [min] which minimizes the score of the metric.

    Args:
        data (pandas.DataFrame):
            Index
                reset index
            Columns
                - Date (pandas.Timestamp): Observation date
                - Susceptible (int): the number of susceptible cases
                - Infected (int): the number of currently infected cases
                - Fatal (int): the number of fatal cases
                - Recovered (int): the number of recovered cases
        guess_quantile (float): quantile to guess ODE parameter values for the candidates of tau

    Returns:
        int: estimated tau value [min]

    Raises:
        covsirphy.UnExecutedError: phase information was not set

    Note:
        An ODE parameter set for each tau candidate will be guessed with the .guess() classmethod of the model.
        The tau value will be selected from the divisors of 1440 [min] and set to self.
    """
    self._ensure_dataframe(data, name="data", columns=self.DSIFR_COLUMNS)
    df = data.loc[:, self.DSIFR_COLUMNS]
    if not self._info_dict:
        raise UnExecutedError("ODEHandler.add()")
    # Calculate scores of the tau candidates
    self._ensure_float(guess_quantile, name="quantile")
    calc_f = functools.partial(self._score_tau, data=df, quantile=guess_quantile)
    divisors = self.divisors(1440)
    if self._n_jobs == 1:
        scores = [calc_f(candidate) for candidate in divisors]
    else:
        with Pool(self._n_jobs) as p:
            scores = p.map(calc_f, divisors)
    score_dict = dict(zip(divisors, scores))
    # Return the best tau value
    comp_f = {True: min, False: max}[Evaluator.smaller_is_better(metric=self._metric)]
    self._tau = comp_f(score_dict.items(), key=lambda x: x[1])[0]
    return self._tau
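
# --- Tau selection logic in miniature ---
# Candidates are the divisors of 1440 [min]; each is scored and the best one is kept according
# to whether the metric is smaller-is-better. score_tau() below is a stand-in for
# ODEHandler._score_tau(), which is not shown in this excerpt.
divisors = [candidate for candidate in range(1, 1441) if 1440 % candidate == 0]

def score_tau(tau):
    # placeholder scoring function (assumption): pretend 240 [min] fits best
    return abs(tau - 240)

score_dict = {tau: score_tau(tau) for tau in divisors}
smaller_is_better = True  # e.g. RMSLE
comp_f = min if smaller_is_better else max
best_tau = comp_f(score_dict.items(), key=lambda x: x[1])[0]
print(best_tau)  # 240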
def _cleaning(self):
    """
    Perform data cleaning of the raw data. This method overrides super()._cleaning().

    Raises:
        covsirphy.UnExecutedError: CountryData.set_variables() was not performed

    Returns:
        pandas.DataFrame:
            Index
                reset index
            Columns
                - Date (pandas.Timestamp): Observation date
                - Country (pandas.Category): country/region name
                - Province (pandas.Category): province/prefecture/state name
                - Confirmed (int): the number of confirmed cases
                - Infected (int): the number of currently infected cases
                - Fatal (int): the number of fatal cases
                - Recovered (int): the number of recovered cases
    """
    if not self.var_dict:
        raise UnExecutedError("CountryData.set_variables()")
    df = self._raw.copy()
    # Rename the columns
    df = df.rename(self.var_dict, axis=1)
    # Confirm that the expected columns are in the raw data
    expected_cols = [self.DATE, self.C, self.F, self.R]
    self._ensure_dataframe(df, name="the raw data", columns=expected_cols)
    # Remove rows without an observation date
    df = df.dropna(subset=[self.DATE])
    # Add the province column
    if self.province_col is not None:
        df = df.rename({self.province_col: self.PROVINCE}, axis=1)
    else:
        df[self.PROVINCE] = self._province or self.UNKNOWN
    # Values
    v_cols = [self.C, self.F, self.R]
    for col in v_cols:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df[v_cols] = df[v_cols].fillna(0).astype(np.int64)
    df[self.CI] = df[self.C] - df[self.F] - df[self.R]
    # Group by date and province
    df[self.DATE] = pd.to_datetime(df[self.DATE])
    df = df.groupby([self.DATE, self.PROVINCE]).sum().reset_index()
    # Add the country column
    df[self.COUNTRY] = self._country
    df = df.loc[:, self.COLUMNS]
    # Update data types to reduce memory usage
    df[self.AREA_COLUMNS] = df[self.AREA_COLUMNS].astype("category")
    return df
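
# --- The cleaning pattern on a toy raw frame ---
# A self-contained pandas sketch (hypothetical column names) of the steps above: rename columns,
# coerce counts to integers, derive Infected = Confirmed - Fatal - Recovered, then group by date.
import numpy as np
import pandas as pd

raw = pd.DataFrame({
    "date": ["2020-04-01", "2020-04-01"],
    "positive": [10, "5"], "death": [1, 0], "discharge": [2, None],
})
df = raw.rename(
    {"date": "Date", "positive": "Confirmed", "death": "Fatal", "discharge": "Recovered"}, axis=1)
for col in ["Confirmed", "Fatal", "Recovered"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")
df[["Confirmed", "Fatal", "Recovered"]] = df[["Confirmed", "Fatal", "Recovered"]].fillna(0).astype(np.int64)
df["Infected"] = df["Confirmed"] - df["Fatal"] - df["Recovered"]
df["Date"] = pd.to_datetime(df["Date"])
print(df.groupby("Date").sum(numeric_only=True).reset_index())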
def track(self):
    """
    Return a daily, long-format subset of the summary to track the history in each country.

    Raises:
        covsirphy.UnExecutedError: PolicyMeasures.estimate(model) was not performed

    Returns:
        pandas.DataFrame: parameter values
            Index
                reset index
            Columns
                - Country (str): country name
                - Date (pandas.Timestamp): date
                - (float): model parameters
                - (float): model day parameters
                - Rt (float): reproduction number
                - (float): OxCGRT values
    """
    if self.model is None:
        raise UnExecutedError("PolicyMeasures.estimate(model)")
    # Get the parameter/Rt values of each date
    df = self.summary().reset_index().replace(self.UNKNOWN, None)
    df[self.START] = pd.to_datetime(df[self.START], format=self.DATE_FORMAT)
    df[self.END] = pd.to_datetime(df[self.END], format=self.DATE_FORMAT)
    df[self.DATE] = df[[self.START, self.END]].apply(
        lambda x: pd.date_range(x[0], x[1]).tolist(), axis=1)
    df = df.explode(self.DATE)
    cols = [
        self.DATE, self.COUNTRY, *self.model.PARAMETERS, *self.model.DAY_PARAMETERS, self.RT]
    param_df = df.reindex(cols, axis=1)
    # OxCGRT indicators
    oxcgrt_df = self.oxcgrt_data.cleaned()
    sel = oxcgrt_df[self.COUNTRY].isin(self._countries)
    oxcgrt_df = oxcgrt_df.loc[sel, [self.DATE, self.COUNTRY, *OxCGRTData.OXCGRT_VARS]]
    # Combine the data
    return pd.merge(param_df, oxcgrt_df, how="inner", on=[self.COUNTRY, self.DATE])
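
# --- Expanding phase ranges to daily rows ---
# A pandas sketch (toy data) of the date-expansion step used in track(): each (Start, End)
# phase row becomes one row per calendar date via pandas.DataFrame.explode().
import pandas as pd

df = pd.DataFrame({"Country": ["Japan"], "Start": ["01Apr2020"], "End": ["03Apr2020"], "Rt": [1.2]})
df["Start"] = pd.to_datetime(df["Start"], format="%d%b%Y")
df["End"] = pd.to_datetime(df["End"], format="%d%b%Y")
df["Date"] = df[["Start", "End"]].apply(lambda x: pd.date_range(x[0], x[1]).tolist(), axis=1)
print(df.explode("Date")[["Country", "Date", "Rt"]])  # three daily rows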
def estimate(self, record_df=None, **kwargs):
    """
    Perform parameter estimation.

    Args:
        record_df (pandas.DataFrame or None):
            Index
                reset index
            Columns
                - Date (pandas.Timestamp): Observation date
                - Confirmed (int): the number of confirmed cases
                - Infected (int): the number of currently infected cases
                - Fatal (int): the number of fatal cases
                - Recovered (int): the number of recovered cases
                - any other columns will be ignored
        **kwargs: keyword arguments of Estimator.run()

    Note:
        If @record_df is None, registered records will be used.
    """
    self._model_is_registered()
    # Records
    if record_df is None:
        record_df = self._record_df.copy()
        if record_df.empty:
            raise UnExecutedError(
                "PhaseUnit.record_df = ...", message="or specify @record_df argument")
    self._ensure_dataframe(record_df, name="record_df", columns=self.NLOC_COLUMNS)
    # Limit records to the phase dates
    sta = self.date_obj(self.start_date)
    end = self.date_obj(self.end_date)
    series = record_df[self.DATE]
    record_df = record_df.loc[(series >= sta) & (series <= end), :]
    # Parameter estimation with the ODE model
    estimator = Estimator(record_df, self._model, self._population, **self._ode_dict, **kwargs)
    estimator.run(**kwargs)
    self._read_estimator(estimator, record_df)
    # Set the estimator
    self._estimator = estimator
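
# --- Clipping records to the phase window ---
# A minimal sketch (toy frame) of the date filtering performed before running the estimator:
# only rows whose Date falls inside the phase's [start, end] range are kept.
import pandas as pd

record_df = pd.DataFrame({"Date": pd.date_range("2020-04-01", periods=10), "Confirmed": range(10)})
sta, end = pd.Timestamp("2020-04-03"), pd.Timestamp("2020-04-07")
series = record_df["Date"]
print(record_df.loc[(series >= sta) & (series <= end), :])  # 5 rows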
def simulate(self, y0_dict=None): """ Simulate ODE models with set/estimated parameter values. Args: y0_dict(dict[str, float] or None): dictionary of initial values of variables Returns: pandas.DataFrame Index reset index Columns - Date (pd.Timestamp): Observation date - Country (str): country/region name - Province (str): province/prefecture/state name - Variables of the model and dataset (int): Confirmed etc. """ self._ensure_phase_setting() try: return self._series.simulate(record_df=self.record_df, y0_dict=y0_dict) except NameError: raise UnExecutedError(".estimate()")
def _ensure_phase_setting(self):
    """
    Ensure that phases were set.

    Raises:
        covsirphy.UnExecutedError: no phases are registered
    """
    if not self._series:
        raise UnExecutedError(".trend() or .add()")
def parse_range(self, dates=None, past_days=None, phases=None):
    """
    Parse the date range and return the minimum date and maximum date.

    Args:
        dates (tuple(str or pandas.Timestamp or None, str or pandas.Timestamp or None) or None): start date and end date
        past_days (int or None): how many past days to use in calculation from today (property)
        phases (list[str] or None): phase names to use in calculation

    Raises:
        covsirphy.UnExecutedError: no phases were registered
        ValueError: @dates argument does not have exactly two elements

    Returns:
        tuple(pandas.Timestamp, pandas.Timestamp): the minimum date and maximum date

    Note:
        When the start date is not specified (i.e. None was applied), the start date of the 0th phase will be used as the minimum date.

    Note:
        When the end date is not specified (i.e. None was applied), the end date of the last phase will be used as the maximum date.

    Note:
        When @past_days was specified, (today - @past_days, today) will be returned.

    Note:
        In @phases, 'last' means the last registered phase.

    Note:
        Priority is given in the order of @dates, @past_days, @phases.
    """
    if not self:
        raise UnExecutedError("PhaseTracker.define_phase()")
    # Get the list of phases: index=phase names, columns=Start/End
    track_df = self._track_df.reset_index()
    track_df = track_df.loc[track_df[self.ID] != 0]
    track_df[self.ID], _ = track_df[self.ID].factorize()
    first_df = track_df.groupby(self.ID).first()
    df = first_df.join(track_df.groupby(self.ID).last(), rsuffix="_last")
    df = df.rename(columns={self.DATE: self.START, f"{self.DATE}_last": self.END})
    df.index = [self.num2str(num) for num in df.index]
    # Get the default values
    start_default, end_default = df[self.START].min(), df[self.END].max()
    # Read @dates
    if dates is not None:
        if len(dates) != 2:
            raise ValueError(
                f"@dates must be a tuple which has two elements, but {dates} was applied.")
        start = self._ensure_date(
            dates[0], name="the first element of 'dates' argument", default=start_default)
        end = self._ensure_date(
            dates[1], name="the second element of 'dates' argument", default=end_default)
        self._ensure_date_order(start, end, name="the second element of 'dates' argument")
        return (start, end)
    # Read @past_days
    if past_days is not None:
        past_days = self._ensure_natural_int(past_days, name="past_days")
        return (self._today - timedelta(days=past_days), self._today)
    # No arguments were specified
    if phases is None:
        return (start_default, end_default)
    # Read @phases
    self._ensure_list(phases, name="phases")
    dates = []
    for phase in phases:
        phase_replaced = df.index[-1] if phase == "last" else phase
        self._ensure_selectable(phase_replaced, df.index.tolist(), name="phase")
        start = df.loc[phase_replaced, self.START]
        end = df.loc[phase_replaced, self.END]
        dates.extend(pd.date_range(start, end).tolist())
    return (min(dates), max(dates))
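
# --- The @past_days branch in isolation ---
# A small sketch of the priority rule: when @dates is None and @past_days is given,
# the returned range is (today - @past_days, today). "today" is fixed here for reproducibility.
from datetime import timedelta
import pandas as pd

today = pd.Timestamp("2020-12-31")
past_days = 60
print((today - timedelta(days=past_days), today))  # (2020-11-01, 2020-12-31)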
def estimate_params(self, data, quantiles=(0.1, 0.9), check_dict=None, study_dict=None, **kwargs):
    """
    Estimate the ODE parameter values of all phases to minimize the score of the metric.

    Args:
        data (pandas.DataFrame):
            Index
                reset index
            Columns
                - Date (pandas.Timestamp): Observation date
                - Susceptible (int): the number of susceptible cases
                - Infected (int): the number of currently infected cases
                - Fatal (int): the number of fatal cases
                - Recovered (int): the number of recovered cases
        quantiles (tuple(float, float)): quantiles to cut the parameter range, like a confidence interval
        check_dict (dict[str, object] or None): setting of validation
            - None means {"timeout": 180, "timeout_iteration": 5, "tail_n": 4, "allowance": (0.99, 1.01)}
            - timeout (int): timeout of optimization
            - timeout_iteration (int): timeout of one iteration
            - tail_n (int): the number of iterations used to decide whether the score did not change for the last iterations
            - allowance (tuple(float, float)): the allowance of the max predicted values
        study_dict (dict[str, object] or None): setting of the optimization study
            - None means {"pruner": "threshold", "upper": 0.5, "percentile": 50, "seed": 0}
            - pruner (str): kind of pruner (hyperband, median, threshold or percentile)
            - upper (float): for the "threshold" pruner, the trial is pruned when the intermediate score is larger than this value
            - percentile (float): for the "percentile" pruner, the trial is pruned when the best intermediate value is in the bottom percentile among trials
        kwargs: arguments of check_dict/study_dict can be set directly, e.g. timeout=180 instead of check_dict={"timeout": 180, ...}

    Raises:
        covsirphy.UnExecutedError: either tau value or phase information was not set

    Returns:
        dict(str, object): setting of the phase (key: phase name)
            - Start (pandas.Timestamp): start date
            - End (pandas.Timestamp): end date
            - Rt (float): phase-dependent reproduction number
            - (str, float): estimated parameter values, including rho
            - (int or float): day parameters, including 1/beta [days]
            - {metric}: score with the estimated parameter values
            - Trials (int): the number of trials
            - Runtime (str): runtime of optimization
    """
    print(f"\n<{self._model.NAME} model: parameter estimation>")
    print(f"Running optimization with {self._n_jobs} CPUs...")
    stopwatch = StopWatch()
    # Arguments
    self._ensure_dataframe(data, name="data", columns=self.DSIFR_COLUMNS)
    df = data.loc[:, self.DSIFR_COLUMNS]
    if not self._info_dict:
        raise UnExecutedError("ODEHandler.add()")
    if self._tau is None:
        raise UnExecutedError(
            "ODEHandler.estimate_tau()",
            message="or specify tau when creating an instance of ODEHandler")
    # Arguments used in the old Estimator
    check_dict = check_dict or {
        "timeout": 180, "timeout_iteration": 5, "tail_n": 4, "allowance": (0.99, 1.01)}
    check_dict.update(kwargs)
    study_dict = study_dict or {
        "pruner": "threshold", "upper": 0.5, "percentile": 50, "seed": 0}
    study_dict.update(kwargs)
    # ODE parameter estimation
    est_f = functools.partial(
        self._estimate_params, data=df, quantiles=quantiles,
        check_dict=check_dict, study_dict=study_dict)
    phases = list(self._info_dict.keys())
    if self._n_jobs == 1:
        est_dict_list = [est_f(ph) for ph in phases]
    else:
        with Pool(self._n_jobs) as p:
            est_dict_list = p.map(est_f, phases)
    for (phase, est_dict) in zip(phases, est_dict_list):
        self._info_dict[phase]["param"] = {
            param: est_dict[param] for param in self._model.PARAMETERS}
    print(f"Completed optimization. Total: {stopwatch.stop_show()}")
    return {
        k: {self.START: self._info_dict[k][self.START], self.END: self._info_dict[k][self.END], **v}
        for (k, v) in zip(phases, est_dict_list)}
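
# --- Hedged workflow sketch: estimate_tau() then estimate_params() ---
# Assumptions: ODEHandler and SIRF are importable from the covsirphy top level, the synthetic
# records below stand in for real observations, and only arguments confirmed by the docstrings
# above (guess_quantile, and timeout passed through **kwargs) are used.
import covsirphy as cs
import numpy as np
import pandas as pd

dates = pd.date_range("2020-04-01", "2020-06-01")
records_df = pd.DataFrame({
    "Date": dates,
    "Susceptible": np.linspace(1_000_000, 990_000, len(dates)).astype(int),
    "Infected": np.linspace(100, 5_000, len(dates)).astype(int),
    "Fatal": (np.arange(len(dates)) * 2).astype(int),
    "Recovered": (np.arange(len(dates)) * 50).astype(int),
})

handler = cs.ODEHandler(cs.SIRF, dates.min())
handler.add(dates.max(), y0_dict=records_df.iloc[0].drop("Date").to_dict())
tau = handler.estimate_tau(records_df, guess_quantile=0.5)
est_dict = handler.estimate_params(records_df, timeout=60)  # timeout goes into check_dict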