def run(self, data, max_evals=50):
    """
    This function runs hyperparameter optimization for LAD batch outlier detection models

    :param list[list] data: Input time series.
    :param int max_evals: Number of iterations for hyperparameter optimization.
    :return: Optimal hyperparameters.
    :rtype: dict

    >>> data
    [[Timestamp('2020-01-01 00:00:00'), 1326.0], [Timestamp('2020-01-02 00:00:00'), 1552.0],
    [Timestamp('2020-01-03 00:00:00'), 1432.0], . . . , [Timestamp('2020-06-06 00:00:00'), 1747.0],
    [Timestamp('2020-06-07 00:00:00'), 1782.0]]
    >>> hopt_obj = HyperparameterOptimization(freq='D', detection_type='OutlierDetection')
    >>> hyper_params = hopt_obj.run(data=data, max_evals=5)
    >>> hyper_params
    {'LuminaireModel': 'LADStructuralModel', 'data_shift_truncate': 0, 'fill_rate': 0.8409249603686499,
    'include_holidays_exog': 1, 'is_log_transformed': 1, 'max_ft_freq': 3, 'p': 4, 'q': 3}
    """
    # Impute missing values only (no full preprocessing) before running the optimizer.
    exploration = DataExploration(freq=self.freq, detection_type=self.detection_type)
    imputed_data, profile_summary = exploration.profile(df=data, impute_only=True)

    # Bail out when profiling/imputation fails; otherwise hand off to the optimizer.
    if not profile_summary['success']:
        return None

    return self._optimize(data=imputed_data,
                          objective_part=self._objective_part,
                          max_evals=max_evals)
def _objective_part(self, data, smoothed_series, args):
    """
    This is the objective function that outputs the loss for a given set of hyperparameters
    for optimization through hyperopt

    :param pandas.DataFrame data: Input time series data
    :param list smoothed_series: Input time series after smoothing
    :param args: Hyperparameter trial values: args[0] is_log_transformed, args[1]
        data_shift_truncate, args[2] fill_rate, args[3] a dict with the model name
        ('model') and model-specific params ('param').
    :return: AUC based on observed (synthetic) and predicted anomalies
    :rtype: dict

    >>> data
    raw
    2016-01-02 1753421.0
    2016-01-03 1879108.0
    2016-01-04 1462725.0
    2016-01-05 1525162.0
    2016-01-06 1424264.0
    ... ...
    2018-10-24 1726884.0
    2018-10-25 1685651.0
    2018-10-26 1632952.0
    2018-10-27 1850912.0
    2018-10-28 2021929.0
    >>> {'loss': 1 - auc, 'status': STATUS_OK}
    {'loss': 0.3917824074074072, 'status': 'ok'}
    """
    import numpy as np
    import pandas as pd
    from sklearn.metrics import log_loss
    import copy

    # Unpack the trial's shared hyperparameters from the hyperopt args vector.
    is_log_transformed = args[0]
    data_shift_truncate = args[1]
    fill_rate = args[2]

    # Getting hyperparameters for lad structural model
    if args[3]['model'] == 'LADStructuralModel':
        max_ft_freq = args[3]['param']['max_ft_freq']
        include_holidays_exog = args[3]['param']['include_holidays_exog']
        p = args[3]['param']['p']
        q = args[3]['param']['q']

    ts_start = data.index.min()
    ts_end = data.index.max()
    max_ts_length = self.max_ts_length
    min_ts_length = self.min_ts_length
    freq = self.freq
    scoring_length = self.scoring_length

    # Split the series into a training window and a trailing scoring window of
    # `scoring_length` periods (e.g. "30" + "D" -> Timedelta('30D')).
    train_end = (
            pd.Timestamp(ts_end) - pd.Timedelta("{}".format(scoring_length) + freq)).to_pydatetime()
    score_start = (pd.Timestamp(train_end) + pd.Timedelta("1" + freq)).to_pydatetime()

    training_data = data.loc[ts_start:train_end]
    scoring_data = data.loc[score_start:ts_end]

    try:
        # Required data preprocessing before training and scoring
        de_obj = DataExploration(freq=self.freq, min_ts_length=self.min_ts_length,
                                 min_ts_mean=self.min_ts_mean,
                                 max_ts_length=self.max_ts_length,
                                 is_log_transformed=is_log_transformed,
                                 data_shift_truncate=data_shift_truncate,
                                 detection_type=self.detection_type,
                                 fill_rate=fill_rate)
        training_data, preprocess_summary = de_obj.profile(df=training_data)

        # Preprocessing may override the log-transform decision; use its verdict.
        is_log_transformed = preprocess_summary['is_log_transformed']

        # Getting De-noised smoothed series for generating synthetic anomalies
        smoothed_scoring_series = smoothed_series[-len(scoring_data):]

        # Accumulated synthetic anomaly labels and predicted anomaly probabilities
        # across the scoring window (and, for LADFiltering, across intensities).
        labels = []
        probs = []

        if args[3]['model'] == 'LADStructuralModel':
            # LAD structural training and scoring
            hyper_params = LADStructuralHyperParams(is_log_transformed=is_log_transformed,
                                                    max_ft_freq=max_ft_freq,
                                                    include_holidays_exog=include_holidays_exog,
                                                    p=p, q=q)
            lad_struct = LADStructuralModel(hyper_params.params, max_ts_length=max_ts_length,
                                            min_ts_length=min_ts_length, freq=freq)
            success, model_date, model = lad_struct.train(data=training_data, optimize=True,
                                                          **preprocess_summary)

            # Index into the smoothed series aligned with the scoring rows.
            scr_idx = 0

            obs = []
            preds = []
            # Scoring and anomaly classification for synthetic anomalies
            for i, row in scoring_data.iterrows():
                observed_value = row.raw
                obs.append(observed_value)
                result = model.score(observed_value, i)
                prediction = result['Prediction']
                preds.append(prediction)
                std_error = result['StdErr']
                observation = smoothed_scoring_series[scr_idx]
                scr_idx = scr_idx + 1
                # Injects synthetic anomalies around the smoothed observation and
                # classifies them against the model's prediction interval.
                anomaly_flags, anomaly_probabilities = self._synthetic_anomaly_check(
                    prediction=prediction,
                    std_error=std_error,
                    observation=observation)
                labels = labels + anomaly_flags
                probs = probs + anomaly_probabilities

            # Median absolute percentage error over the scoring window; feeds the cost.
            mdape = self._mdape(obs, preds)
        elif args[3]['model'] == 'LADFilteringModel':
            # LAD filtering training and scoring
            hyper_params = LADFilteringHyperParams(is_log_transformed=is_log_transformed)
            lad_filtering = LADFilteringModel(hyper_params.params, max_ts_length=max_ts_length,
                                              min_ts_length=min_ts_length, freq=freq)
            success, model_date, stable_model = lad_filtering.train(training_data,
                                                                    **preprocess_summary)
            # Scoring and anomaly classification for synthetic anomalies
            for prop in self.anomaly_intensity_list:
                anomaly_flags_list = []
                anomaly_probabilities_list = []
                # Deep-copy the trained model so each intensity's sequential scoring
                # starts from the same stable state.
                local_model = copy.deepcopy(stable_model)
                for i, row in scoring_data.iterrows():
                    # NOTE: np.random.uniform(0, 1, 1) returns a length-1 array; the
                    # scalar comparison below relies on elementwise broadcasting.
                    trial_prob = np.random.uniform(0, 1, 1)
                    observed_value = row.raw
                    synthetic_actual = observed_value
                    # With 40% probability, inject a synthetic anomaly of relative
                    # magnitude `prop` and label the point anomalous.
                    if trial_prob < 0.4:
                        synthetic_actual = observed_value + (prop * observed_value)
                        anomaly_flags_list.append(1)
                    else:
                        anomaly_flags_list.append(0)
                    # Score and carry the updated filtering state forward.
                    result, local_model = local_model.score(observed_value=observed_value,
                                                            pred_date=i,
                                                            synthetic_actual=synthetic_actual)
                    anomaly_probabilities_list.append(result['AnomalyProbability'])
                labels = labels + anomaly_flags_list
                probs = probs + anomaly_probabilities_list

        # Weight anomalous points (label 1) at 1/n and normal points at 2/n, so the
        # log loss emphasizes not missing true negatives.
        weights = ((1 - np.array(labels)) + 1) / float(len(labels))
        # `mdape` is only bound in the LADStructuralModel branch; the model check
        # guards the reference, and a falsy mdape falls through to pure log loss.
        if args[3]['model'] == 'LADStructuralModel' and mdape:
            cost = (0.5 * mdape) + (0.5 * log_loss(labels, probs, sample_weight=weights))
        else:
            cost = log_loss(labels, probs, sample_weight=weights)
    except Exception as e:
        # Deliberate best-effort: any failure (training, scoring, preprocessing)
        # reports a huge loss so hyperopt discards this hyperparameter combination.
        return {'loss': 1e100, 'status': STATUS_OK}

    return {'loss': cost, 'status': STATUS_OK}