def trend(self, force=True, show_figure=False, **kwargs):
    """
    Split the records with trend analysis.

    Args:
        force (bool): if True, change points will be over-written
        show_figure (bool): if True, show the result as a figure
        kwargs: keyword arguments of covsirphy.TrendDetector(),
            .TrendDetector.sr() and .trend_plot()

    Returns:
        covsirphy.PhaseSeries

    Note:
        Phases are also registered when @force is False but the current
        series (self._series) is falsy.
    """
    detector = TrendDetector(
        data=self.record_df, area=self.area,
        **find_args(TrendDetector, **kwargs))
    # Perform S-R trend analysis
    detector.sr(**find_args(TrendDetector.sr, **kwargs))
    # Register phases
    if force or not self._series:
        self._series.clear(include_past=True)
        _, end_dates = detector.dates()
        # .add() is called only for its side effect, so use a plain loop
        # rather than building a throwaway list
        for end_date in end_dates:
            self._series.add(end_date=end_date)
    # Show S-R plane
    if show_figure:
        detector.show(**find_args(trend_plot, **kwargs))
    return self._series
def trend(self, force, show_figure, **kwargs):
    """
    Define past phases with S-R trend analysis.

    Args:
        force (bool): if True, change points will be over-written
        show_figure (bool): if True, show the result as a figure
        kwargs: keyword arguments of covsirphy.TrendDetector(),
            .TrendDetector.sr() and .trend_plot()

    Returns:
        covsirphy.PhaseTracker: self

    Note:
        When @force is False, the analysis is still performed but no phase
        is registered.
    """
    # Only the records up to self._today are analysed
    df = self._track_df.loc[:self._today].reset_index()[self.SUB_COLUMNS]
    detector = TrendDetector(
        data=df, area=self._area, **find_args(TrendDetector, **kwargs))
    # Perform S-R trend analysis
    detector.sr(**find_args(TrendDetector.sr, **kwargs))
    # Register phases
    if force:
        start_dates, end_dates = detector.dates()
        # define_phase() is called only for its side effect, so use a plain
        # loop rather than building a throwaway list
        for (start, end) in zip(start_dates, end_dates):
            self.define_phase(start, end)
    # Show S-R plane
    if show_figure:
        detector.show(**find_args(trend_plot, **kwargs))
    return self
def _colored_map(self, title, **kwargs):
    """
    Create global colored map to show the values.

    The figure is titled with @title and its directory is set to
    self._dirpath before plotting.

    Args:
        title (str): title of the figure
        kwargs: arguments of ColoredMap() and ColoredMap.plot()
            (arguments of matplotlib.pyplot.savefig() and
            geopandas.GeoDataFrame.plot() are also picked up)
    """
    init_kwargs = find_args([plt.savefig, ColoredMap], **kwargs)
    with ColoredMap(**init_kwargs) as colored_map:
        colored_map.title = title
        colored_map.directory = self._dirpath
        plot_kwargs = find_args(
            [gpd.GeoDataFrame.plot, ColoredMap.plot], **kwargs)
        colored_map.plot(**plot_kwargs)
def estimate(self, model, tau=None, **kwargs):
    """
    Perform parameter estimation for each phases and update parameter values.

    Args:
        model (covsirphy.ModelBase): ODE model
        tau (int or None): tau value [min] or None (to be estimated)
        kwargs: keyword arguments of ODEHandler(), ODEHandler.estimate_tau()
            and ODEHandler.estimate_params()

    Returns:
        int: applied or estimated tau value [min]

    Note:
        ODE parameter estimation will be done for all active phases, i.e. the
        records whose phase ID (self.ID column) is positive.

    Note:
        self._track_df, self._model and self._tau are updated in place.
    """
    self._ensure_subclass(model, ModelBase, name="model")
    self._ensure_tau(tau, accept_none=True)
    # Set-up ODEHandler
    data_df = self._track_df.reset_index()
    data_df = data_df.loc[data_df[self.ID] > 0].dropna(how="all", axis=0)
    handler = ODEHandler(
        model, data_df[self.DATE].min(), tau=tau,
        **find_args(ODEHandler, **kwargs))
    start_dates = data_df.groupby(self.ID).first()[self.DATE].sort_values()
    end_dates = data_df.groupby(self.ID).last()[self.DATE].sort_values()
    for (start, end) in zip(start_dates, end_dates):
        # Initial values of the phase: the first record on/after the start date
        y0_series = model.convert(
            data_df.loc[data_df[self.DATE] >= start], tau=None).iloc[0]
        handler.add(end, y0_dict=y0_series.to_dict())
    # Estimate tau value if necessary
    if tau is None:
        tau = handler.estimate_tau(
            data_df, **find_args(ODEHandler.estimate_tau, **kwargs))
    # Estimate ODE parameter values
    est_dict = handler.estimate_params(data_df, **kwargs)
    # Register phase information to self
    df = pd.DataFrame.from_dict(est_dict, orient="index")
    # Expand each phase to one row per date.
    # Access the row by label: integer keys on a string-labelled Series
    # (x[0], x[1]) rely on deprecated positional fallback in pandas.
    df[self.DATE] = df[[self.START, self.END]].apply(
        lambda x: pd.date_range(x[self.START], x[self.END]), axis=1)
    df = df.explode(self.DATE).drop(
        [self.START, self.END], axis=1).set_index(self.DATE)
    df.insert(0, self.ODE, model.NAME)
    # NOTE(review): column position 6 is hard-coded; presumably it places tau
    # right after the ODE parameter columns - confirm against the model.
    df.insert(6, self.TAU, tau)
    # Add new columns to the tracker, keeping the first-seen column order
    all_columns = [*self._track_df.columns.tolist(), *df.columns.tolist()]
    self._track_df = self._track_df.reindex(
        columns=sorted(set(all_columns), key=all_columns.index))
    self._track_df.update(df)
    # Set model and tau to self
    self._model = model
    self._tau = tau
    return tau
def subset(self, model=None, country=None, province=None, **kwargs):
    """
    Return the subset of dataset.

    The country name is resolved with self._model_to_area() and the request
    is delegated to the parent class's subset().

    Args:
        model (cs.ModelBase or None): the first ODE model
        country (str or None): country name
        province (str or None): province name
        kwargs: other keyword arguments of JHUData.subset()

    Returns:
        (pandas.DataFrame)
            Index
                reset index
            Columns
                - Date (pd.Timestamp): Observation date
                - Confirmed (int): the number of confirmed cases
                - Infected (int): the number of currently infected cases
                - Fatal (int): the number of fatal cases
                - Recovered (int): the number of recovered cases (> 0)
                - Susceptible (int): the number of susceptible cases, if calculated

    Note:
        If country is None, the name of the model will be used.
        If province is None, '-' will be used.
        If @population is not None, the number of susceptible cases will be calculated.
        Records with Recovered > 0 will be selected.
    """
    # Only the resolved country is used; @province is forwarded as given
    resolved_country, _ = self._model_to_area(
        model=model, country=country, province=province)
    parent_kwargs = find_args([super().subset], **kwargs)
    return super().subset(
        country=resolved_country, province=province, **parent_kwargs)
def estimate(self, data, **kwargs):
    """
    Estimate tau value [min] and ODE parameter values.

    Tau is estimated first with self.estimate_tau(), then the parameters are
    estimated with self.estimate_params().

    Args:
        data (pandas.DataFrame):
            Index
                reset index
            Columns
                - Date (pandas.Timestamp): Observation date
                - Susceptible (int): the number of susceptible cases
                - Infected (int): the number of currently infected cases
                - Fatal(int): the number of fatal cases
                - Recovered (int): the number of recovered cases
        kwargs: keyword arguments of ODEHandler.estimate_tau() and
            ODEHandler.estimate_params(); all of them are passed to
            estimate_params() as they are

    Raises:
        covsirphy.UnExecutedError: phase information was not set

    Returns:
        tuple(int, dict(str, dict[str, object]))
            - int: tau value [min]
            - dict(str, object): setting of the phase (key: phase name)
                - Start (pandas.Timestamp): start date
                - End (pandas.Timestamp): end date
                - Rt (float): phase-dependent reproduction number
                - (str, float): estimated parameter values, including rho
                - (int or float): day parameters, including 1/beta [days]
                - {metric}: score with the estimated parameter values
                - Trials (int): the number of trials
                - Runtime (str): runtime of optimization
    """
    tau_kwargs = find_args(self.estimate_tau, **kwargs)
    estimated_tau = self.estimate_tau(data, **tau_kwargs)
    phase_info = self.estimate_params(data, **kwargs)
    return estimated_tau, phase_info
def line_plot(df, title=None, filename=None, show_legend=True, **kwargs):
    """
    Wrapper function: show chronological change of the data.

    Args:
        df (pandas.DataFrame or pandas.Series): data to show
            Index
                Date (pandas.Timestamp)
            Columns
                variables to show
        title (str): title of the figure
        filename (str or None): filename to save the figure or None (display)
        show_legend (bool): whether show legend or not
        kwargs: keyword arguments of the following classes and methods.
            - covsirphy.LinePlot() and its methods,
            - matplotlib.pyplot.savefig(), matplotlib.pyplot.legend(),
            - pandas.DataFrame.plot()
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with LinePlot(filename=filename, **savefig_kwargs) as plotter:
        plotter.title = title
        plot_kwargs = find_args([LinePlot.plot, pd.DataFrame.plot], **kwargs)
        plotter.plot(data=df, **plot_kwargs)
        # Axis
        x_kwargs = find_args([LinePlot.x_axis], **kwargs)
        plotter.x_axis(**x_kwargs)
        y_kwargs = find_args([LinePlot.y_axis], **kwargs)
        plotter.y_axis(**y_kwargs)
        # Vertical/horizontal lines
        line_kwargs = find_args([LinePlot.line], **kwargs)
        plotter.line(**line_kwargs)
        # Legend
        if not show_legend:
            plotter.legend_hide()
        else:
            legend_kwargs = find_args([LinePlot.legend, plt.legend], **kwargs)
            plotter.legend(**legend_kwargs)
def trend_plot(df, title=None, filename=None, show_legend=True, **kwargs):
    """
    Wrapper function: draw a figure of the data with covsirphy.TrendPlot.

    Args:
        df (pandas.DataFrame): data to show
            Index
                x values
            Columns
                - column defined by @actual_col, actual values for y-axis
                - columns defined by @predicted_cols, predicted values for y-axis
        title (str): title of the figure
        filename (str or None): filename to save the figure or None (display)
        show_legend (bool): whether show legend or not
        kwargs: keyword arguments of the following classes and methods.
            - covsirphy.TrendPlot() and its methods,
            - matplotlib.pyplot.savefig() and matplotlib.pyplot.legend()

    Note:
        @actual_col (str) and @predicted_cols (list[str]) are not explicit
        parameters of this function; presumably they are accepted by
        TrendPlot.plot() and should be given via kwargs.
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with TrendPlot(filename=filename, **savefig_kwargs) as plotter:
        plotter.title = title
        plot_kwargs = find_args([TrendPlot.plot], **kwargs)
        plotter.plot(data=df, **plot_kwargs)
        # Axis
        x_kwargs = find_args([TrendPlot.x_axis], **kwargs)
        plotter.x_axis(**x_kwargs)
        y_kwargs = find_args([TrendPlot.y_axis], **kwargs)
        plotter.y_axis(**y_kwargs)
        # Vertical/horizontal lines
        line_kwargs = find_args([TrendPlot.line], **kwargs)
        plotter.line(**line_kwargs)
        # Legend
        if not show_legend:
            plotter.legend_hide()
        else:
            legend_kwargs = find_args([TrendPlot.legend, plt.legend], **kwargs)
            plotter.legend(**legend_kwargs)
def __init__(self, record_df, model, population, tau=None, **kwargs):
    """
    Set up the dataset, initial values and optimization settings.

    Args:
        record_df (covsirphy.JHUData or pandas.DataFrame): records; a JHUData
            instance is reduced with its .subset() method, a dataframe is
            restored with model.restore() when self.NLOC_COLUMNS are missing
        model (covsirphy.ModelBase): ODE model
        population (int): total population
        tau (int or None): tau value [min] or None
        kwargs: keyword arguments of JHUData.subset() and fixed values of the
            model parameters (names in model.PARAMETERS, None is ignored)
    """
    # Arguments
    self.population = self.ensure_population(population)
    self.model = self.ensure_subclass(model, ModelBase, name="model")
    # Dataset
    if not isinstance(record_df, JHUData):
        has_required = set(self.NLOC_COLUMNS).issubset(record_df.columns)
        if not has_required:
            record_df = model.restore(record_df)
        self.record_df = self.ensure_dataframe(
            record_df, name="record_df", columns=self.NLOC_COLUMNS)
    else:
        subset_kwargs = find_args(
            [JHUData.subset, record_df.subset], **kwargs)
        self.record_df = record_df.subset(
            population=population, **subset_kwargs)
    # Initial values: the first row of the tau-free dataset
    taufree = model.tau_free(self.record_df, population, tau=None)
    self.y0_dict = {
        name: taufree.loc[taufree.index[0], name] for name in model.VARIABLES
    }
    # Fixed parameter values
    param_names = set(model.PARAMETERS)
    self.fixed_dict = {
        name: value for (name, value) in kwargs.items()
        if name in param_names and value is not None
    }
    # For optimization
    optuna.logging.disable_default_handler()
    self.x = self.TS
    self.y_list = model.VARIABLES[:]
    # Variables with non-positive weights are excluded
    self.weight_dict = {
        name: weight
        for (name, weight) in zip(model.VARIABLES, model.WEIGHTS)
        if weight > 0
    }
    self.study = None
    self.total_trials = 0
    self.run_time = 0
    self.tau_candidates = self.divisors(1440)
    # Defined in parent class, but not used
    self.train_df = None
    # step_n will be defined in divide_minutes()
    self.step_n = None
    # tau value
    self.tau = self.ensure_tau(tau)
    if tau is None:
        self.taufree_df = pd.DataFrame()
    else:
        self.taufree_df = self.divide_minutes(tau)
def compare_plot(df, variables, groups, filename=None, **kwargs):
    """
    Wrapper function: show chronological change of the data.

    Args:
        df (pandas.DataFrame): data to show
            Index
                x values
            Columns
                y variables to show, "{variable}_{group}" for all combinations of variables and groups
        variables (list[str]): variables to compare
        groups (list[str]): the first group name and the second group name
        filename (str or None): filename to save the figure or None (display)
        kwargs: keyword arguments of matplotlib.pyplot.savefig()

    Note:
        Only the keyword arguments accepted by matplotlib.pyplot.savefig()
        are picked up from kwargs; the others are not used here.
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with ComparePlot(filename=filename, **savefig_kwargs) as plotter:
        plotter.plot(data=df, variables=variables, groups=groups)
def _split(X, y, delay, **kwargs):
    """
    Apply delay period to the X dataset.

    The index of X is shifted by @delay days, joined with y (inner join),
    smoothed with a rolling mean over @delay rows and then split into
    training/test datasets.

    Args:
        X (pandas.DataFrame): indicators with time index
        y (pandas.DataFrame): target values with time index
        delay (int): delay period [days]
        kwargs: keyword arguments of sklearn.model_selection.train_test_split(test_size=0.2, random_state=0)

    Returns:
        tuple(pandas.DataFrame): datasets with time index
            - X_train
            - X_test
            - y_train
            - y_test
            - X_target

    Note:
        If @seed is included in kwargs, this will be converted to @random_state.
    """
    options = dict(test_size=0.2, random_state=0)
    options.update(kwargs)
    # @seed takes precedence over @random_state
    if "seed" in options:
        options["random_state"] = options["seed"]
    options = find_args(train_test_split, **options)
    # Apply delay period to X
    shifted = X.copy()
    shifted.index = shifted.index + timedelta(days=delay)
    # Training/test data
    joined = shifted.join(y, how="inner")
    smoothed = joined.rolling(window=delay).mean()
    smoothed = smoothed.dropna().drop_duplicates()
    features = smoothed.loc[:, X.columns]
    targets = smoothed.loc[:, y.columns]
    parts = train_test_split(features, targets, **options)
    # X_target: shifted indicators after the last date of y
    last_date = y.index.max()
    X_target = shifted.loc[shifted.index > last_date]
    return tuple(parts) + (X_target,)
def score(self, variables=None, phases=None, y0_dict=None, **kwargs):
    """
    Evaluate accuracy of phase setting and parameter estimation of selected enabled phases.

    Args:
        variables (list[str] or None): variables to use in calculation
        phases (list[str] or None): phases to use in calculation
        y0_dict(dict[str, float] or None): dictionary of initial values of variables
        kwargs: keyword arguments of covsirphy.Evaluator.score()

    Returns:
        float: score with the specified metrics

    Note:
        If @variables is None, ["Infected", "Fatal", "Recovered"] will be used.
        "Confirmed", "Infected", "Fatal" and "Recovered" can be used in @variables.
        If @phases is None, all phases will be used.

    Note:
        Non-target phases are disabled temporarily during the calculation and
        are enabled again even when the calculation raises an error.
    """
    # Arguments
    variables = variables or [self.CI, self.F, self.R]
    variables = self._ensure_list(
        variables, self.VALUE_COLUMNS, name="variables")
    # Disable the non-target phases
    all_phases, _ = self.past_phases(phases=None)
    target_phases, _ = self.past_phases(phases=phases)
    ignored_phases = list(set(all_phases) - set(target_phases))
    if ignored_phases:
        self.disable(ignored_phases)
    try:
        # Get the number of cases
        rec_df, sim_df = self._compare_with_actual(
            variables=variables, y0_dict=y0_dict)
        # Calculate score
        evaluator = Evaluator(rec_df, sim_df)
        return evaluator.score(**find_args(Evaluator.score, **kwargs))
    finally:
        # Enable the disabled non-target phases, also on failure, so that an
        # error does not leave the phases disabled
        if ignored_phases:
            self.enable(ignored_phases)
def run(self, timeout=180, reset_n_max=3, timeout_iteration=5, tail_n=4,
        allowance=(0.99, 1.01), seed=0, pruner="threshold", upper=0.5,
        percentile=50, metric=None, metrics="RMSLE", **kwargs):
    """
    Run optimization.
    If the result satisfied the following conditions, optimization ends.
    - Score did not change in the last @tail_n iterations.
    - Monotonic increasing variables increases monotonically.
    - Predicted values are in the allowance when each actual value shows max value.

    Args:
        timeout (int): timeout of optimization
        reset_n_max (int): when monotonic variables do not increase
            monotonically, the study is reset; optimization stops instead of
            resetting once the study has been reset @reset_n_max - 1 times
        timeout_iteration (int): time-out of one iteration
        tail_n (int): the number of iterations to decide whether score did not change for the last iterations
        allowance (tuple(float, float)): the allowance of the predicted value
        seed (int or None): random seed of hyperparameter optimization
        pruner (str): hyperband, median, threshold or percentile
        upper (float): works for "threshold" pruner, intermediate score is larger than this value, it prunes
        percentile (float): works for "Percentile" pruner, the best intermediate value is in the bottom percentile among trials, it prunes
        metric (str or None): metric name or None (use @metrics)
        metrics (str): alias of @metric
        kwargs: keyword arguments of ModelBase.param_range()

    Note:
        @n_jobs was obsoleted because this does not work effectively in Optuna.

    Note:
        Please refer to covsirphy.Evaluator.score() for metric names
    """
    self._metric = metric or metrics
    self._param_range_dict = find_args(self.model.param_range, **kwargs)
    # The same settings are used for the first study and for every reset,
    # so that a reset does not drop the pruner configuration
    study_settings = {
        "seed": seed, "pruner": pruner, "upper": upper, "percentile": percentile,
    }
    # Create a study of optuna
    if self.study is None:
        self._init_study(**study_settings)
    reset_n = 0
    iteration_n = math.ceil(timeout / timeout_iteration)
    increasing_cols = [f"{v}{self.P}" for v in self.model.VARS_INCLEASE]
    stopwatch = StopWatch()
    scores = []
    for _ in range(iteration_n):
        # Perform optimization
        self.study.optimize(
            self._objective, n_jobs=1, timeout=timeout_iteration)
        # If score did not change in the last iterations, stop running
        tau, param_dict = self._param()
        scores.append(self._score(tau=tau, param_dict=param_dict))
        if len(scores) >= tail_n and len(set(scores[-tail_n:])) == 1:
            break
        # Create a table to compare observed/estimated values
        comp_df = self._compare(tau=tau, param_dict=param_dict)
        # Check monotonic variables
        if not all(comp_df[col].is_monotonic_increasing for col in increasing_cols):
            if reset_n == reset_n_max - 1:
                break
            # Initialize the study with the settings given by the caller
            self._init_study(**study_settings)
            reset_n += 1
            continue
        # Need additional trials when the values are not in allowance
        if self._is_in_allowance(comp_df, allowance):
            break
    # Calculate run-time and the number of trials
    # NOTE(review): assumes self.runtime was initialised before this call -
    # TODO confirm the attribute name used by the constructor
    self.runtime += stopwatch.stop()
    self.total_trials = len(self.study.trials)
def run(self, check_dict, study_dict):
    """
    Perform parameter estimation of the ODE model, not including tau.

    Args:
        check_dict (dict[str, object]): setting of validation
            - timeout (int): timeout of optimization (default: 180)
            - timeout_iteration (int): timeout of one iteration (default: 5)
            - tail_n (int): the number of iterations to decide whether score did not change for the last iterations (default: 4)
            - allowance (tuple(float, float)): the allowance of the max predicted values (default: (0.99, 1.01))
        study_dict (dict[str, object]): setting of optimization study
            - pruner (str): kind of pruner (hyperband, median, threshold or percentile) (default: "threshold")
            - upper (float): works for "threshold" pruner, intermediate score is larger than this value, it prunes (default: 0.5)
            - percentile (float): works for "Percentile" pruner, the best intermediate value is in the bottom percentile among trials, it prunes (default: 50)
            - seed: passed to the study initialization (default: 0)

    Returns:
        dict(str, object):
            - Rt (float): phase-dependent reproduction number
            - (dict(str, float)): estimated parameter values
            - (dict(str, int or float)): day parameters, including 1/beta [days]
            - {metric}: score with the estimated parameter values
            - Trials (int): the number of trials
            - Runtime (str): runtime of optimization

    Note:
        Please refer to covsirphy.Evaluator.score() for metric names.
    """
    # Validation settings
    timeout = check_dict.get("timeout", 180)
    timeout_iteration = check_dict.get("timeout_iteration", 5)
    tail_n = check_dict.get("tail_n", 4)
    allowance = check_dict.get("allowance", (0.99, 1.01))
    # Initialize optimization: defaults are overwritten by @study_dict
    study_kwargs = dict(pruner="threshold", upper=0.5, percentile=50, seed=0)
    study_kwargs.update(study_dict)
    study = self._init_study(**find_args(self._init_study, **study_kwargs))
    # The number of iterations
    iteration_n = math.ceil(timeout / timeout_iteration)
    stopwatch = StopWatch()
    # Optimization
    score_history = []
    param_dict = {}
    for _ in range(iteration_n):
        # Run iteration
        study.optimize(self._objective, n_jobs=1, timeout=timeout_iteration)
        param_dict = study.best_params.copy()
        score_history.append(self._score(**param_dict))
        # Stop when the score did not change in the last iterations,
        # or when max values are in the allowance
        unchanged = (
            len(score_history) >= tail_n
            and len(set(score_history[-tail_n:])) == 1
        )
        if unchanged or self._is_in_allowance(allowance, **param_dict):
            break
    # Summarize the result; the order of keys follows the order of insertion
    model_instance = self._model(self._population, **param_dict)
    result = {self.RT: model_instance.calc_r0()}
    result.update(param_dict)
    result.update(model_instance.calc_days_dict(self._tau))
    result[self._metric] = self._score(**param_dict)
    result[self.TRIALS] = len(study.trials)
    result[self.RUNTIME] = stopwatch.stop_show()
    return result