Ejemplo n.º 1
0
    def trend(self, force=True, show_figure=False, **kwargs):
        """
        Run S-R trend analysis on the records and register the detected phases.

        Args:
            force (bool): if True, registered change points will be over-written
            show_figure (bool): if True, show the result as a figure
            kwargs: keyword arguments of covsirphy.TrendDetector(), .TrendDetector.sr() and .trend_plot()

        Returns:
            covsirphy.PhaseSeries

        Note:
            Phases are also registered when @force is False but the current series is falsy.
        """
        detector_kwargs = find_args(TrendDetector, **kwargs)
        detector = TrendDetector(
            data=self.record_df, area=self.area, **detector_kwargs)
        # S-R trend analysis
        sr_kwargs = find_args(TrendDetector.sr, **kwargs)
        detector.sr(**sr_kwargs)
        # Replace the registered phases with the detected ones
        if force or not self._series:
            self._series.clear(include_past=True)
            _start_dates, end_dates = detector.dates()
            for end_date in end_dates:
                self._series.add(end_date=end_date)
        # Nothing to draw: finish here
        if not show_figure:
            return self._series
        # Show S-R plane
        plot_kwargs = find_args(trend_plot, **kwargs)
        detector.show(**plot_kwargs)
        return self._series
Ejemplo n.º 2
0
    def trend(self, force, show_figure, **kwargs):
        """
        Define past phases with S-R trend analysis.

        Args:
            force (bool): if True, the detected phases are registered with .define_phase()
            show_figure (bool): if True, show the result as a figure
            kwargs: keyword arguments of covsirphy.TrendDetector(), .TrendDetector.sr() and .trend_plot()

        Returns:
            covsirphy.PhaseTracker: self

        Note:
            Only the records up to self._today are analysed.
        """
        # Records up to today, restricted to the sub-columns
        records = self._track_df.loc[:self._today]
        records = records.reset_index()[self.SUB_COLUMNS]
        detector_kwargs = find_args(TrendDetector, **kwargs)
        detector = TrendDetector(
            data=records, area=self._area, **detector_kwargs)
        # S-R trend analysis
        sr_kwargs = find_args(TrendDetector.sr, **kwargs)
        detector.sr(**sr_kwargs)
        # Register one phase per (start, end) pair
        if force:
            start_dates, end_dates = detector.dates()
            for start, end in zip(start_dates, end_dates):
                self.define_phase(start, end)
        # Show S-R plane
        if show_figure:
            plot_kwargs = find_args(trend_plot, **kwargs)
            detector.show(**plot_kwargs)
        return self
Ejemplo n.º 3
0
    def _colored_map(self, title, **kwargs):
        """
        Draw a colored map with ColoredMap, using the directory registered in self.

        Args:
            title (str): title of the figure
            kwargs: keyword arguments of matplotlib.pyplot.savefig(), ColoredMap(),
                geopandas.GeoDataFrame.plot() and ColoredMap.plot()
        """
        init_kwargs = find_args([plt.savefig, ColoredMap], **kwargs)
        with ColoredMap(**init_kwargs) as cm:
            cm.title = title
            cm.directory = self._dirpath
            # Only the arguments accepted by the plotting functions are forwarded
            plot_kwargs = find_args(
                [gpd.GeoDataFrame.plot, ColoredMap.plot], **kwargs)
            cm.plot(**plot_kwargs)
Ejemplo n.º 4
0
    def estimate(self, model, tau=None, **kwargs):
        """
        Perform parameter estimation for each phases and update parameter values.

        Args:
            model (covsirphy.ModelBase): ODE model
            tau (int or None): tau value [min] or None (to be estimated)
            kwargs: keyword arguments of ODEHandler(), ODEHandler.estimate_tau() and .estimate_params()

        Returns:
            int: applied or estimated tau value [min]

        Note:
            Only the records whose self.ID value is larger than 0 are used.
            The estimated values are written to the tracking dataframe and
            @model and the tau value are saved to self.
        """
        self._ensure_subclass(model, ModelBase, name="model")
        self._ensure_tau(tau, accept_none=True)
        # Set-up ODEHandler
        data_df = self._track_df.reset_index()
        data_df = data_df.loc[data_df[self.ID] > 0].dropna(how="all", axis=0)
        handler = ODEHandler(model,
                             data_df[self.DATE].min(),
                             tau=tau,
                             **find_args(ODEHandler, **kwargs))
        # First/last dates of each ID group, paired in chronological order
        grouped = data_df.groupby(self.ID)
        start_dates = grouped.first()[self.DATE].sort_values()
        end_dates = grouped.last()[self.DATE].sort_values()
        for (start, end) in zip(start_dates, end_dates):
            # Initial values: first row of the converted records on/after the start date
            y0_series = model.convert(data_df.loc[data_df[self.DATE] >= start],
                                      tau=None).iloc[0]
            _ = handler.add(end, y0_dict=y0_series.to_dict())
        # Estimate tau value if necessary
        if tau is None:
            tau = handler.estimate_tau(
                data_df, **find_args(ODEHandler.estimate_tau, **kwargs))
        # Estimate ODE parameter values (all keyword arguments are forwarded as-is)
        est_dict = handler.estimate_params(data_df, **kwargs)
        # Register phase information to self
        df = pd.DataFrame.from_dict(est_dict, orient="index")
        # Expand each phase to one row per date.
        # Rows are accessed by label: positional access (x[0], x[1]) on a
        # label-indexed Series is deprecated in recent pandas versions.
        df[self.DATE] = df[[self.START, self.END]].apply(
            lambda x: pd.date_range(x[self.START], x[self.END]), axis=1)
        df = df.explode(self.DATE).drop([self.START, self.END],
                                        axis=1).set_index(self.DATE)
        df.insert(0, self.ODE, model.NAME)
        df.insert(6, self.TAU, tau)
        # Add the new columns to the tracking dataframe, keeping the first-seen order
        all_columns = [*self._track_df.columns.tolist(), *df.columns.tolist()]
        self._track_df = self._track_df.reindex(
            columns=sorted(set(all_columns), key=all_columns.index))
        self._track_df.update(df)
        # Set model and tau to self
        self._model = model
        self._tau = tau
        return tau
Ejemplo n.º 5
0
    def subset(self, model=None, country=None, province=None, **kwargs):
        """
        Return the subset of dataset, resolving the country name from the model if necessary.

        Args:
            model (cs.ModelBase or None): the first ODE model
            country (str or None): country name
            province (str or None): province name
            kwargs: other keyword arguments of the parent class's .subset()

        Returns:
            pandas.DataFrame: the value returned by the parent class's .subset().
                According to the original documentation:
                Index
                    reset index
                Columns
                    - Date (pd.Timestamp): Observation date
                    - Confirmed (int): the number of confirmed cases
                    - Infected (int): the number of currently infected cases
                    - Fatal (int): the number of fatal cases
                    - Recovered (int): the number of recovered cases (> 0)
                    - Susceptible (int): the number of susceptible cases, if calculated

        Note:
            The country name is determined with ._model_to_area();
            per the original documentation, the name of the model is used when @country is None.
            @province is passed to the parent method as given.
            Keyword arguments not accepted by the parent method are dropped.
        """
        area = self._model_to_area(
            model=model, country=country, province=province)
        # Only the resolved country name is used
        country, _ = area
        parent_subset = super().subset
        accepted_kwargs = find_args([parent_subset], **kwargs)
        return parent_subset(
            country=country, province=province, **accepted_kwargs)
Ejemplo n.º 6
0
    def estimate(self, data, **kwargs):
        """
        Estimate tau value [min] and then ODE parameter values.

        Args:
            data (pandas.DataFrame):
                Index
                    reset index
                Columns
                    - Date (pandas.Timestamp): Observation date
                    - Susceptible (int): the number of susceptible cases
                    - Infected (int): the number of currently infected cases
                    - Fatal(int): the number of fatal cases
                    - Recovered (int): the number of recovered cases
            kwargs: keyword arguments of ODEHandler.estimate_tau() and ODEHandler.estimate_params()

        Raises:
            covsirphy.UnExecutedError: phase information was not set
                (per the original documentation; raised by the called methods)

        Returns:
            tuple(int, dict(str, dict[str, object]))
                - int: tau value [min]
                - dict(str, object): setting of the phase (key: phase name)
                    - Start (pandas.Timestamp): start date
                    - End (pandas.Timestamp): end date
                    - Rt (float): phase-dependent reproduction number
                    - (str, float): estimated parameter values, including rho
                    - (int or float): day parameters, including 1/beta [days]
                    - {metric}: score with the estimated parameter values
                    - Trials (int): the number of trials
                    - Runtime (str): runtime of optimization

        Note:
            .estimate_tau() receives only the keyword arguments it accepts;
            .estimate_params() receives all of them.
        """
        # Step 1: tau value
        tau_kwargs = find_args(self.estimate_tau, **kwargs)
        tau = self.estimate_tau(data, **tau_kwargs)
        # Step 2: ODE parameter values
        return (tau, self.estimate_params(data, **kwargs))
Ejemplo n.º 7
0
def line_plot(df, title=None, filename=None, show_legend=True, **kwargs):
    """
    Wrapper function: show chronological change of the data with LinePlot.

    Args:
        df (pandas.DataFrame or pandas.Series): data to show
            Index
                Date (pandas.Timestamp)
            Columns
                variables to show
        title (str): title of the figure
        filename (str or None): filename to save the figure or None (display)
        show_legend (bool): whether show legend or not
        kwargs: keyword arguments of the following classes and methods.
            - covsirphy.LinePlot() and its methods,
            - matplotlib.pyplot.savefig(), matplotlib.pyplot.legend(),
            - pandas.DataFrame.plot()
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with LinePlot(filename=filename, **savefig_kwargs) as lp:
        lp.title = title
        plot_kwargs = find_args([LinePlot.plot, pd.DataFrame.plot], **kwargs)
        lp.plot(data=df, **plot_kwargs)
        # Axes first, then vertical/horizontal lines
        for method_name in ("x_axis", "y_axis", "line"):
            getattr(lp, method_name)(
                **find_args([getattr(LinePlot, method_name)], **kwargs))
        # Legend
        if not show_legend:
            lp.legend_hide()
        else:
            lp.legend(**find_args([LinePlot.legend, plt.legend], **kwargs))
Ejemplo n.º 8
0
def trend_plot(df, title=None, filename=None, show_legend=True, **kwargs):
    """
    Wrapper function: draw the data with TrendPlot.

    Args:
        df (pandas.DataFrame): data to show
            Index
                x values
            Columns
                - column defined by @actual_col, actual values for y-axis
                - columns defined by @predicted_cols, predicted values for y-axis
        title (str): title of the figure
        filename (str or None): filename to save the figure or None (display)
        show_legend (bool): whether show legend or not
        kwargs: keyword arguments of the following classes and methods.
            - covsirphy.TrendPlot() and its methods,
            - matplotlib.pyplot.savefig() and matplotlib.pyplot.legend()

    Note:
        @actual_col and @predicted_cols are not parameters of this function;
        presumably they are passed to TrendPlot.plot() via @kwargs.
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with TrendPlot(filename=filename, **savefig_kwargs) as tp:
        tp.title = title
        plot_kwargs = find_args([TrendPlot.plot], **kwargs)
        tp.plot(data=df, **plot_kwargs)
        # Axes first, then vertical/horizontal lines
        for method_name in ("x_axis", "y_axis", "line"):
            getattr(tp, method_name)(
                **find_args([getattr(TrendPlot, method_name)], **kwargs))
        # Legend
        if not show_legend:
            tp.legend_hide()
        else:
            tp.legend(**find_args([TrendPlot.legend, plt.legend], **kwargs))
Ejemplo n.º 9
0
 def __init__(self, record_df, model, population, tau=None, **kwargs):
     """
     Set up the records, initial values and optimization settings.

     Args:
         record_df (covsirphy.JHUData or pandas.DataFrame): records;
             a JHUData instance is reduced with its .subset() method,
             a dataframe is restored with model.restore() when it lacks
             some of self.NLOC_COLUMNS
         model (covsirphy.ModelBase): ODE model
         population: total population, validated with .ensure_population()
         tau (int or None): tau value [min]
         kwargs: keyword arguments of JHUData.subset() and fixed values of
             the model parameters (items whose value is None are ignored)
     """
     # Arguments
     self.population = self.ensure_population(population)
     self.model = self.ensure_subclass(model, ModelBase, name="model")
     # Dataset
     if isinstance(record_df, JHUData):
         subset_arg_dict = find_args([JHUData.subset, record_df.subset],
                                     **kwargs)
         self.record_df = record_df.subset(population=population,
                                           **subset_arg_dict)
     else:
         if not set(self.NLOC_COLUMNS).issubset(record_df.columns):
             record_df = model.restore(record_df)
         self.record_df = self.ensure_dataframe(record_df,
                                                name="record_df",
                                                columns=self.NLOC_COLUMNS)
     # Initial values: the first row of the tau-free records
     df = model.tau_free(self.record_df, population, tau=None)
     self.y0_dict = {k: df.loc[df.index[0], k] for k in model.VARIABLES}
     # Fixed parameter values
     # The parameter set is built once instead of once per keyword argument
     parameter_set = set(model.PARAMETERS)
     self.fixed_dict = {
         k: v
         for (k, v) in kwargs.items()
         if k in parameter_set and v is not None
     }
     # For optimization
     optuna.logging.disable_default_handler()
     self.x = self.TS
     self.y_list = model.VARIABLES[:]
     # Only the variables with positive weights are registered
     self.weight_dict = {
         v: p
         for (v, p) in zip(model.VARIABLES, model.WEIGHTS) if p > 0
     }
     self.study = None
     self.total_trials = 0
     self.run_time = 0
     self.tau_candidates = self.divisors(1440)
     # Defined in parent class, but not used
     self.train_df = None
     # step_n will be defined in divide_minutes()
     self.step_n = None
     # tau value
     self.tau = self.ensure_tau(tau)
     # Without a tau value, the tau-free dataset is left empty
     self.taufree_df = pd.DataFrame(
     ) if tau is None else self.divide_minutes(tau)
Ejemplo n.º 10
0
def compare_plot(df, variables, groups, filename=None, **kwargs):
    """
    Wrapper function: compare the variables of the groups with ComparePlot.

    Args:
        df (pandas.DataFrame): data to show
            Index
                x values
            Columns
                y variables to show, "{variable}_{group}" for all combinations of variables and groups
        variables (list[str]): variables to compare
        groups (list[str]): the first group name and the second group name
        filename (str or None): filename to save the figure or None (display)
        kwargs: keyword arguments of matplotlib.pyplot.savefig()

    Note:
        Keyword arguments which are not accepted by matplotlib.pyplot.savefig() are ignored.
    """
    savefig_kwargs = find_args(plt.savefig, **kwargs)
    with ComparePlot(filename=filename, **savefig_kwargs) as cp:
        cp.plot(data=df, variables=variables, groups=groups)
Ejemplo n.º 11
0
    def _split(X, y, delay, **kwargs):
        """
        Shift the X dataset by the delay period and split the data for training/test.

        Args:
            X (pandas.DataFrame): indicators with time index
            y (pandas.DataFrame): target values with time index
            delay (int): delay period [days]
            kwargs: keyword arguments of sklearn.model_selection.train_test_split(test_size=0.2, random_state=0)

        Returns:
            tuple(pandas.DataFrame): datasets with time index
                - X_train
                - X_test
                - y_train
                - y_test
                - X_target

        Note:
            If @seed is included in kwargs, this will be converted to @random_state.
            The joined data is smoothed with a rolling mean (window = @delay) before splitting.
            X_target is the delayed X whose dates are later than the last date of @y.
        """
        # Defaults, overwritten by the keyword arguments
        split_kwargs = {"test_size": 0.2, "random_state": 0, **kwargs}
        if "seed" in split_kwargs:
            split_kwargs["random_state"] = split_kwargs["seed"]
        split_kwargs = find_args(train_test_split, **split_kwargs)
        # Apply delay period to X
        X_delayed = X.copy()
        X_delayed.index = X_delayed.index + timedelta(days=delay)
        # Training/test data
        joined = X_delayed.join(y, how="inner")
        smoothed = joined.rolling(window=delay).mean()
        smoothed = smoothed.dropna().drop_duplicates()
        parts = train_test_split(
            smoothed.loc[:, X.columns], smoothed.loc[:, y.columns],
            **split_kwargs)
        # X_target
        is_future = X_delayed.index > y.index.max()
        X_target = X_delayed.loc[is_future]
        return (*parts, X_target)
Ejemplo n.º 12
0
    def score(self, variables=None, phases=None, y0_dict=None, **kwargs):
        """
        Evaluate accuracy of phase setting and parameter estimation of selected enabled phases.

        Args:
            variables (list[str] or None): variables to use in calculation
            phases (list[str] or None): phases to use in calculation
            y0_dict(dict[str, float] or None): dictionary of initial values of variables
            kwargs: keyword arguments of covsirphy.Evaluator.score()

        Returns:
            float: score with the specified metrics

        Note:
            If @variables is None (or empty), [self.CI, self.F, self.R] will be used.
            @variables is validated against self.VALUE_COLUMNS with ._ensure_list().
            If @phases is None, all phases will be used.
            Non-target phases are disabled only during the calculation:
            they are enabled again even when the calculation raises an error.
        """
        # Arguments
        variables = variables or [self.CI, self.F, self.R]
        variables = self._ensure_list(variables,
                                      self.VALUE_COLUMNS,
                                      name="variables")
        # Disable the non-target phases
        all_phases, _ = self.past_phases(phases=None)
        target_phases, _ = self.past_phases(phases=phases)
        ignored_phases = list(set(all_phases) - set(target_phases))
        if ignored_phases:
            self.disable(ignored_phases)
        try:
            # Get the number of cases
            rec_df, sim_df = self._compare_with_actual(variables=variables,
                                                       y0_dict=y0_dict)
            # Calculate score
            evaluator = Evaluator(rec_df, sim_df)
            return evaluator.score(**find_args(Evaluator.score, **kwargs))
        finally:
            # Enable the disabled non-target phases, whether scoring succeeded or not
            if ignored_phases:
                self.enable(ignored_phases)
Ejemplo n.º 13
0
    def run(self,
            timeout=180,
            reset_n_max=3,
            timeout_iteration=5,
            tail_n=4,
            allowance=(0.99, 1.01),
            seed=0,
            pruner="threshold",
            upper=0.5,
            percentile=50,
            metric=None,
            metrics="RMSLE",
            **kwargs):
        """
        Run optimization.
        If the result satisfied the following conditions, optimization ends.
        - Score did not change in the last @tail_n iterations.
        - Monotonic increasing variables increases monotonically.
        - Predicted values are in the allowance when each actual value shows max value.

        Args:
            timeout (int): timeout of optimization
            reset_n_max (int): if study was reset @reset_n_max times, will not be reset anymore
            timeout_iteration (int): time-out of one iteration (must not be zero)
            tail_n (int): the number of iterations to decide whether score did not change for the last iterations
            allowance (tuple(float, float)): the allowance of the predicted value
            seed (int or None): random seed of hyperparameter optimization
            pruner (str): hyperband, median, threshold or percentile
            upper (float): works for "threshold" pruner,
                intermediate score is larger than this value, it prunes
            percentile (float): works for "Percentile" pruner,
                the best intermediate value is in the bottom percentile among trials, it prunes
            metric (str or None): metric name or None (use @metrics)
            metrics (str): alias of @metric
            kwargs: keyword arguments of ModelBase.param_range()

        Note:
            @n_jobs was obsoleted because this does not work effectively in Optuna.

        Note:
            Please refer to covsirphy.Evaluator.score() for metric names

        Note:
            When the study is reset because of non-monotonic results,
            it is re-created with the same @seed, @pruner, @upper and @percentile.
        """
        self._metric = metric or metrics
        self._param_range_dict = find_args(self.model.param_range, **kwargs)
        # Settings of the study, shared by the first creation and the resets
        study_settings = {
            "seed": seed,
            "pruner": pruner,
            "upper": upper,
            "percentile": percentile,
        }
        # Create a study of optuna
        if self.study is None:
            self._init_study(**study_settings)
        reset_n = 0
        # The number of iterations which fit in the total timeout
        iteration_n = math.ceil(timeout / timeout_iteration)
        increasing_cols = [f"{v}{self.P}" for v in self.model.VARS_INCLEASE]
        stopwatch = StopWatch()
        scores = []
        for _ in range(iteration_n):
            # Perform optimization
            self.study.optimize(self._objective,
                                n_jobs=1,
                                timeout=timeout_iteration)
            # If score did not change in the last iterations, stop running
            tau, param_dict = self._param()
            scores.append(self._score(tau=tau, param_dict=param_dict))
            if len(scores) >= tail_n and len(set(scores[-tail_n:])) == 1:
                break
            # Create a table to compare observed/estimated values
            comp_df = self._compare(tau=tau, param_dict=param_dict)
            # Check monotonic variables
            mono_ok_list = [
                comp_df[col].is_monotonic_increasing for col in increasing_cols
            ]
            if not all(mono_ok_list):
                if reset_n == reset_n_max - 1:
                    break
                # Initialize the study, keeping the pruner settings requested by the caller
                self._init_study(**study_settings)
                reset_n += 1
                continue
            # Need additional trials when the values are not in allowance
            if self._is_in_allowance(comp_df, allowance):
                break
        # Calculate run-time and the number of trials
        self.runtime += stopwatch.stop()
        self.total_trials = len(self.study.trials)
Ejemplo n.º 14
0
    def run(self, check_dict, study_dict):
        """
        Perform parameter estimation of the ODE model, not including tau.

        Args:
            check_dict (dict[str, object]): setting of validation
                - timeout (int): timeout of optimization, 180 if omitted
                - timeout_iteration (int): timeout of one iteration, 5 if omitted
                - tail_n (int): the number of iterations to decide whether score did not change for the last iterations, 4 if omitted
                - allowance (tuple(float, float)): the allowance of the max predicted values, (0.99, 1.01) if omitted
            study_dict (dict[str, object]): setting of optimization study
                - pruner (str): kind of pruner (hyperband, median, threshold or percentile), "threshold" if omitted
                - upper (float): works for "threshold" pruner, intermediate score is larger than this value, it prunes; 0.5 if omitted
                - percentile (float): works for "Percentile" pruner, the best intermediate value is in the bottom percentile among trials, it prunes; 50 if omitted
                - seed: passed to the study initialization, 0 if omitted

        Returns:
            dict(str, object):
                - Rt (float): phase-dependent reproduction number
                - (dict(str, float)): estimated parameter values
                - (dict(str, int or float)): day parameters, including 1/beta [days]
                - {metric}: score with the estimated parameter values
                - Trials (int): the number of trials
                - Runtime (str): runtime of optimization

        Note:
            Please refer to covsirphy.Evaluator.score() for metric names.
        """
        # Settings of validation
        timeout = check_dict.get("timeout", 180)
        timeout_iteration = check_dict.get("timeout_iteration", 5)
        tail_n = check_dict.get("tail_n", 4)
        allowance = check_dict.get("allowance", (0.99, 1.01))
        # Initialize optimization: defaults are overwritten by @study_dict
        study_kwargs = dict(pruner="threshold", upper=0.5, percentile=50, seed=0)
        study_kwargs.update(study_dict)
        init_kwargs = find_args(self._init_study, **study_kwargs)
        study = self._init_study(**init_kwargs)
        # The number of iterations
        iteration_n = math.ceil(timeout / timeout_iteration)
        stopwatch = StopWatch()
        # Optimization
        scores = []
        param_dict = {}
        for _ in range(iteration_n):
            # Run iteration
            study.optimize(
                self._objective, n_jobs=1, timeout=timeout_iteration)
            param_dict = study.best_params.copy()
            scores.append(self._score(**param_dict))
            # Stop when the score did not change in the last iterations,
            # or when the max values are in the allowance
            unchanged = (
                len(scores) >= tail_n and len(set(scores[-tail_n:])) == 1)
            if unchanged or self._is_in_allowance(allowance, **param_dict):
                break
        # Summarize the result with the best parameter set
        model_instance = self._model(self._population, **param_dict)
        result = {self.RT: model_instance.calc_r0()}
        result.update(param_dict)
        result.update(model_instance.calc_days_dict(self._tau))
        result[self._metric] = self._score(**param_dict)
        result[self.TRIALS] = len(study.trials)
        result[self.RUNTIME] = stopwatch.stop_show()
        return result