Ejemplo n.º 1
0
    def _fit(self,
             endog,
             endog_end,
             min_ts_mean,
             min_ts_mean_window,
             include_holidays=False,
             min_ts_length=None,
             max_ft_freq=None,
             exog_data=None,
             optimize=None):
        """
        Fit an ARIMA model on the stationarized series, using Fourier-extrapolated features of the Kalman-smoothed
        series as seasonal regressors and, optionally, holiday columns as exogenous regressors.

        :param pandas.Series endog: The input time series. It is used through ``.index`` and ``.fillna`` below, so it
        must be a pandas Series (or behave like one) rather than a plain list.
        :param endog_end: Index label used as the upper bound when slicing ``exog_data`` to the training window.
        :param float min_ts_mean: The minimum mean value of the time series required for the model to run. For data that
        originated as integers (such as counts), the ARIMA model can behave erratically when the numbers are small. When
        this parameter is set, any time series whose mean value is less than this raises a ``ValueError`` rather than
        producing a mostly bogus anomaly. ``None`` disables the check.
        :param int min_ts_mean_window: The number of observations (anchored to the end of the time series) to use when
        applying the min_ts_mean rule. ``None`` performs the calculation on the entire time series.
        :param bool include_holidays: Whether to include holidays as exogenous variables in the regression. Holidays
        are defined in :class:`~luminaire.model.model_utils.LADsHolidays`. Holidays are silently dropped when
        ``exog_data`` is missing or when ``len(endog) + self.max_scoring_length`` does not exceed 385.
        :param int min_ts_length: Specifying the minimum required length of the time series for training
        :param int max_ft_freq: The maximum number of frequencies under consideration for the Fourier transformation.
        It is always capped at a quarter of the smoothed series length; ``None`` means only that cap applies.
        :param pandas.DataFrame exog_data: Exogenous regressors indexed like ``endog``; only read when
        ``include_holidays`` is true.
        :param bool optimize: Flag to identify whether called from hyperparameter optimization
        :return: A tuple ``(result, order)``. ``result`` is a dictionary with the keys ``model``, ``diff_order``,
        ``seasonal_feature_scoring`` and ``ext_training_features``; ``order`` is the ``(p, d, q)`` tuple where ``p`` and
        ``q`` are read from the fitted model (0 when the attribute is absent) and ``d`` is the differencing order.
        :raises ValueError: If the series is shorter than ``min_ts_length`` or its mean is below ``min_ts_mean``.
        """
        import numpy as np
        from pykalman import KalmanFilter
        import warnings
        # NOTE(review): this installs an "ignore" filter for the whole process, not just for this call.
        # Kept as-is because callers may rely on it; consider warnings.catch_warnings() instead.
        warnings.filterwarnings('ignore')

        p, q = self._params['p'], self._params['q']
        freq = self._params['freq']
        pred_len = self.max_scoring_length
        x_matrix_train = None
        x_matrix_score = None

        # Set exogenous (holiday) variables for input data. Holidays are only used when the data to build
        # them from was actually supplied and the series is long enough.
        if include_holidays and exog_data is not None and len(endog) + pred_len > 385:
            exog = exog_data.loc[endog.index.min():endog_end]
        else:
            include_holidays = False
            exog = None

        if min_ts_length is not None and len(endog) < min_ts_length:
            raise ValueError(
                'TimeSeries length less than minimum length specified')

        if min_ts_mean is not None:
            # Missing observations count as zeros when computing the mean.
            mean_window = endog if min_ts_mean_window is None else endog[-min_ts_mean_window:]
            if mean_window.fillna(0).mean() < min_ts_mean:
                raise ValueError('Metric values too small to model.')

        # Smoothing the given time series as a pre-processing for modeling seasonalities through Fourier
        # transformation
        kf = KalmanFilter()
        endog_smoothed, _ = kf.em(endog).smooth(endog)
        endog_smoothed = endog_smoothed[:, 0]

        endog, diff_order, _ = DataExploration._stationarizer(
            endog=pd.Series(endog), diff_min=0, diff_max=1, obs_incl=False)
        if diff_order:
            # Difference the smoothed series as well so it follows the stationarized series
            endog_smoothed = np.diff(endog_smoothed)

        # Keep only whole weekly (daily data) or daily (hourly data) cycles, anchored at the end of the series.
        # When fewer than one full cycle is available the slice start is -0, i.e. the series is left untouched.
        cycle_length = {'D': 7, 'H': 24}.get(freq)
        if cycle_length is not None:
            complete_cycle = len(endog) // cycle_length
            endog = endog[-(complete_cycle * cycle_length):]
            endog_smoothed = endog_smoothed[-(complete_cycle * cycle_length):]

        exog = exog.iloc[-len(endog):] if exog is not None else None

        if include_holidays:
            # Drop holiday columns that are zero over the whole training window
            exog = exog.loc[:, (exog != 0).any(axis=0)]
            ext_training_features = list(exog.columns)
        else:
            ext_training_features = None

        # Output container handed to _seasonal_arima; the fitted model is read back from its first element.
        stepwise_fit = []

        # Updating the user specified maximum number of frequencies to consider for the Fourier transformation
        # based on the length of the smoothed endogenous variable. A missing user value means that only the
        # length-based cap applies (min(None, ...) would raise a TypeError).
        length_cap = len(endog_smoothed) / 4
        if max_ft_freq is None:
            max_ft_freq = int(length_cap)
        else:
            max_ft_freq = int(min(max_ft_freq, length_cap))

        # Running the Fourier transformation extrapolating pred_len points ahead in future that are going to be
        # used for predicting
        if max_ft_freq > 0:
            x_matrix = self._fourier_extp(series=endog_smoothed,
                                          max_trun=(2 * max_ft_freq),
                                          forecast_period=pred_len)
            if not optimize and np.all(x_matrix[0] == x_matrix[0][0]):
                # The first extrapolated row is constant, so no seasonal features are used
                x_matrix_train = None
                x_matrix_score = None
                max_ft_freq = 0
            else:
                # The last pred_len columns are the extrapolated part used for scoring
                split_at = x_matrix.shape[1] - pred_len
                x_matrix_train = x_matrix[:, :split_at]
                x_matrix_score = x_matrix[:, split_at:]

        # d=0 because any differencing has already been applied by the stationarizer above
        self._seasonal_arima(endog=endog,
                             exog=exog,
                             p=p,
                             d=0,
                             q=q,
                             imodels=max_ft_freq,
                             include_holidays=include_holidays,
                             ift_matrix=x_matrix_train,
                             stepwise_fit=stepwise_fit,
                             optimize=optimize)
        model = stepwise_fit[0]

        seasonal_feature_scoring = x_matrix_score[
            0, :].tolist() if x_matrix_score is not None else None

        result = {
            'model': model,
            'diff_order': diff_order,
            'seasonal_feature_scoring': seasonal_feature_scoring,
            'ext_training_features': ext_training_features,
        }

        p_selected = model.k_ar if hasattr(model, 'k_ar') else 0
        d_selected = diff_order
        q_selected = model.k_ma if hasattr(model, 'k_ma') else 0
        order = (p_selected, d_selected, q_selected)

        return result, order
Ejemplo n.º 2
0
    def _training(self, data, **kwargs):
        """
        Fit a one-dimensional Kalman filter on the stationarized metric and summarize its one-step prediction.

        The column named by ``self._imputed_metric`` is taken from ``data``, stationarized through
        ``DataExploration._stationarizer``, and used to estimate the filter parameters with EM before filtering.
        The last filtered state is then passed to ``self._prediction_summary``.

        :param pandas.DataFrame data: Input time series to train on; must contain the ``self._imputed_metric`` column
        :param kwargs: Accepted for interface compatibility; not read in this method
        :return: On success, a dictionary with the keys ``model``, ``state_mean``, ``state_covariance``,
        ``transition_matrix``, ``prior_pred``, ``pred_covariance``, ``kalman_gain``, ``diff_order`` and
        ``last_data_points``. If a ``LinAlgError``, ``ValueError`` or ``LADFilteringModelError`` is raised while
        fitting, a dictionary with the single key ``ErrorMessage`` is returned instead.
        :raises ValueError: If ``data`` is None
        """

        from numpy.linalg import LinAlgError
        from pykalman import KalmanFilter
        import numpy as np

        if data is None:
            raise ValueError(
                'Not enough data to train due to recent change point')

        metric_series = data[self._imputed_metric]

        # Last (up to) two values of the series before any differencing
        trailing_points = metric_series[-2:].values.tolist()

        n_dims = 1
        identity_transition = [[1]]

        try:
            explorer = DataExploration()
            stationary, diff_order, _ = explorer._stationarizer(metric_series)

            kalman = KalmanFilter(transition_matrices=identity_transition,
                                  initial_state_mean=np.zeros(n_dims),
                                  n_dim_obs=n_dims)

            # Estimate the filter parameters with EM, then run the filter to get the hidden states
            # and their covariances
            state_means, state_covs = kalman.em(stationary).filter(stationary)

            latest_mean = state_means[:, 0][-1]
            latest_cov = state_covs[-1, :, :]

            # The observation matrix and both covariances are read from the filter after the EM step
            prior_pred, pred_covariance, kalman_gain = self._prediction_summary(
                state_mean=latest_mean,
                state_covariance=latest_cov,
                observation_covariance=kalman.observation_covariance,
                transition_covariance=kalman.transition_covariance,
                observation_matrix=kalman.observation_matrices,
                transition_matrix=identity_transition)

            return {
                'model': kalman,
                'state_mean': float(latest_mean),
                'state_covariance': latest_cov.tolist(),
                'transition_matrix': identity_transition,
                'prior_pred': float(prior_pred),
                'pred_covariance': pred_covariance.tolist(),
                'kalman_gain': kalman_gain.tolist(),
                'diff_order': diff_order,
                'last_data_points': trailing_points
            }

        except (LinAlgError, ValueError, LADFilteringModelError) as err:
            # Fitting failures are reported to the caller instead of being raised
            return {'ErrorMessage': str(err)}