def calib_detect(df, calib_params):
    """
    calib_detect seeks to find calibration events based on 2 conditions: persistence of a defined length, which
    often occurs when sensors are out of the water, during certain days of the week (M-F) and times of the day.
    The input data frame is not modified; all work is done on a deep copy.

    Arguments:
        df: data frame indexed by datetimes (the index must expose .dayofweek and .hour) with columns:
            'observed' of observed data
            'anomaly' booleans where True=1 corresponds to anomalies
            'persist_grp' (optional) with indices of persistent groups (output of the persistence function).
            When the column is absent, the groups are derived from runs of repeated 'observed' values.
        calib_params: parameters defined in the parameters file
            persist_high: longest length of the persistent group
            persist_low: shortest length of the persistent group
            hour_low: earliest hour for calibrations to have occurred
            hour_high: latest hour for calibrations to have occurred
    Returns:
        calib: data frame with a boolean column 'anomaly' indicating whether the conditions were met for a
            possible calibration event.
        calib_dates: datetimes for which the conditions were met for a possible calibration event.
    """
    if 'persist_grp' in df:
        temp = df[['observed', 'anomaly', 'persist_grp']].copy(deep=True)
    else:
        temp = df[['observed', 'anomaly']].copy(deep=True)
        temp['persist_grp'] = (temp['observed'].diff(1) == 0)
        temp['persist_grp'] = anomaly_utilities.anomaly_events(temp['persist_grp'], 0, 1)

    # Series.max() yields NaN for an empty column instead of raising like the builtin max().
    max_grp = temp['persist_grp'].max()
    n_groups = 0 if pd.isna(max_grp) else int(max_grp)

    for i in range(1, n_groups + 1):
        # The groups come from diff() == 0, which marks the second and later points of a run of repeated
        # values; pull the row immediately before the group into it so the whole run is counted.
        leads_into_group = (temp['persist_grp'].shift(-1) == i).to_numpy()
        if leads_into_group.any():
            temp.loc[temp.index[leads_into_group][0], 'persist_grp'] = i
        in_group = temp['persist_grp'] == i
        group_len = int(in_group.sum())
        if calib_params['persist_low'] <= group_len <= calib_params['persist_high']:
            # Single .loc assignment: chained indexing may write to a temporary copy and be lost.
            temp.loc[in_group, 'anomaly'] = True

    # Keep only flagged points that fall Monday-Friday (dayofweek 0-4) inside the allowed hours.
    dayofweek = temp.index.dayofweek
    hour = temp.index.hour
    in_window = ((dayofweek <= 4)
                 & (hour >= calib_params['hour_low'])
                 & (hour <= calib_params['hour_high']))

    calib = pd.DataFrame(index=temp.index)
    calib['anomaly'] = in_window & temp['anomaly'].to_numpy(dtype=bool)
    calib_dates = calib[calib['anomaly']].index

    return calib, calib_dates
def group_size(df):
    """
    group_size determines the size of the largest consecutive group of anomalous points.

    Arguments:
        df: data frame with column 'anomaly'.
    Returns:
        size: length of the largest consecutive group of anomalous points; 0 when no group is found.
    """
    temp = df[['anomaly']].copy(deep=True)
    # anomaly_events is called with wf=0, sf=1; the code below treats its output as a group number per
    # row, with 0 meaning "not in any group".
    temp['value_grp'] = anomaly_utilities.anomaly_events(temp['anomaly'], 0, 1)

    groups = temp['value_grp']
    # One pass over the column instead of one boolean mask per group id.
    sizes = groups[groups > 0].value_counts()
    if sizes.empty:
        return 0

    return int(sizes.max())
def calib_overlap(sensor_names, input_array, calib_params):
    """
    calib_overlap seeks to identify calibration events by identifying where overlaps occur between multiple
    sensors. Calls calib_persist_detect on each sensor to identify events with a defined persistence length
    during certain days of the week (M-F) and hours of the day.
    NOTE(review): calib_persist_detect is not defined in this part of the file - confirm it exists.

    Arguments:
        sensor_names: list of sensors to be considered for overlap.
        input_array: array of data frames, keyed by sensor name, each with columns:
            'observed' of observed data
            'anomaly' booleans where True=1 corresponds to anomalies
            'persist_grp' with indices of persistent groups (output of the persistence function)
        calib_params: parameters defined in the parameters file
            persist_high: longest length of the persistent group
            persist_low: shortest length of the persistent group
            hour_low: earliest hour for calibrations to have occurred
            hour_high: latest hour for calibrations to have occurred
    Returns:
        all_calib: dictionary of data frames (one for each sensor) of booleans indicating whether the
            conditions were met for a possible calibration event.
        all_calib_dates: dictionary of datetimes (one for each sensor) for which the conditions were met for
            a possible calibration event.
        df_all_calib: data frame with columns for each sensor's observations, each sensor's calibration
            booleans ('<sensor>_calib') and numbered events ('<sensor>_event'), and a column 'all_calib'
            that indicates if the conditions were met for all sensors.
        calib_dates_overlap: datetimes for which the conditions were met for a possible calibration event for
            all sensors.
    """
    all_calib = {}
    all_calib_dates = {}

    # The first sensor's index is used as the index of the combined frame.
    df_all_calib = pd.DataFrame(index=input_array[sensor_names[0]].index)
    df_all_calib['all_calib'] = True

    for snsr in sensor_names:
        calib_col = snsr + '_calib'
        event_col = snsr + '_event'

        all_calib[snsr], all_calib_dates[snsr] = calib_persist_detect(input_array[snsr], calib_params)

        df_all_calib[snsr] = input_array[snsr]['observed']
        df_all_calib[calib_col] = all_calib[snsr]['anomaly']
        df_all_calib[event_col] = anomaly_utilities.anomaly_events(df_all_calib[calib_col], wf=1, sf=1)

        # A row stays True only while every sensor processed so far has a nonzero event there.
        still_overlapping = df_all_calib['all_calib'] & (df_all_calib[event_col] != 0)
        df_all_calib['all_calib'] = still_overlapping

    calib_dates_overlap = df_all_calib.index[df_all_calib['all_calib']]

    return all_calib, all_calib_dates, df_all_calib, calib_dates_overlap
def persistence(df, length, output_grp=False):
    """
    persistence adds an anomalous label in the data frame if data repeat for specified length.
    The input data frame is modified in place (and also returned).

    Arguments:
        df: data frame with a column 'raw' of raw data and a boolean column 'anomaly' (typically output of
            range_check)
        length: duration of persistent/repeated values to be flagged
        output_grp: boolean to indicate whether the persistent group numbers should be output as a column
            'persist_grp' in the original dataframe.
    Returns:
        df: dataframe with column 'anomaly' modified and, when output_grp is True, added column
            'persist_grp' that indexes points as part of persistent groups
        persist_count: total number of persistent points in the data frame
    """
    temp = df[['raw', 'anomaly']].copy(deep=True)
    # A point is persistent when it equals the point immediately before it.
    temp['persist_grp'] = (temp['raw'].diff(1) == 0)
    temp['persist_grp'] = anomaly_utilities.anomaly_events(temp['persist_grp'], 0, 1)

    groups = temp['persist_grp']
    # Count every group in one pass, then flag the members of the groups that are long enough.
    sizes = groups[groups > 0].value_counts()
    long_groups = sizes[sizes >= length].index
    # Single .loc assignment: chained indexing may write to a temporary copy and be lost.
    temp.loc[groups.isin(long_groups), 'anomaly'] = True

    persist_count = int((groups != 0).sum())

    df['anomaly'] = temp['anomaly']
    if output_grp:
        df['persist_grp'] = temp['persist_grp']

    return df, persist_count
def LSTM_detect_multivar(sensor_array, sensors, params, LSTM_params, model_type, name='',
                         rules=False, plots=False, summary=True, compare=False,
                         model_output=True, model_save=True):
    """
    LSTM_detect_multivar runs the multivariate LSTM workflow for a set of sensors: optional rules based
    detection, model creation, dynamic threshold determination, anomaly detection, event widening and
    numbering, and optional comparison against labels and plotting.

    Arguments:
        sensor_array: data frames keyed by sensor name, each with columns 'raw', 'observed' and 'anomaly'.
            'labeled_anomaly' is read when compare is True and 'labeled_event' is read when plots is True.
        sensors: list of sensor names to include in the model.
        params: per-sensor parameters keyed by sensor name. Keys 'window_sz', 'alpha', 'threshold_min' and
            'widen' are read; the rules branch reads max_range, min_range and persist as attributes.
        LSTM_params: LSTM parameters; 'time_steps' is read here and the whole object is passed to the
            model builder.
        model_type: ModelType.VANILLA or ModelType.BIDIRECTIONAL.
        name: passed through to the model builder.
        rules: if True, run range check, persistence, group size and interpolation on each sensor first.
        plots: if True, show training loss, threshold and results plots.
        summary: passed through to the model builder.
        compare: if True, compare detections to labeled anomalies and print metrics.
        model_output: passed through to the model builder.
        model_save: passed through to the model builder.
    Returns:
        ModelWorkflow object with attributes sensor_array, df_observed, df_raw, df_anomaly, model,
        threshold, detections and all_data, plus metrics and e_metrics when compare is True.
    Raises:
        ValueError: if model_type is neither ModelType.VANILLA nor ModelType.BIDIRECTIONAL.
    """
    print('\nProcessing LSTM multivariate ' + str(model_type) + ' detections.')

    # RULES BASED DETECTION #
    if rules:
        size = dict()
        for snsr in sensors:
            # NOTE(review): this branch reads params[snsr] by attribute while the rest of the function
            # uses key access - confirm the params type supports both.
            sensor_array[snsr], r_c = rules_detect.range_check(
                sensor_array[snsr], params[snsr].max_range, params[snsr].min_range)
            sensor_array[snsr], p_c = rules_detect.persistence(
                sensor_array[snsr], params[snsr].persist)
            size[snsr] = rules_detect.group_size(sensor_array[snsr])
            sensor_array[snsr] = rules_detect.interpolate(sensor_array[snsr])
            print(snsr + ' maximum detected group length = ' + str(size[snsr]))
        print('Rules based detection complete.\n')

    # Create new data frames with raw and observed (after applying rules) and preliminary anomaly
    # detections for selected sensors
    df_raw = pd.DataFrame(index=sensor_array[sensors[0]].index)
    df_observed = pd.DataFrame(index=sensor_array[sensors[0]].index)
    df_anomaly = pd.DataFrame(index=sensor_array[sensors[0]].index)
    for snsr in sensors:
        df_raw[snsr + '_raw'] = sensor_array[snsr]['raw']
        df_observed[snsr + '_obs'] = sensor_array[snsr]['observed']
        df_anomaly[snsr + '_anom'] = sensor_array[snsr]['anomaly']
    print('Raw data shape: ' + str(df_raw.shape))
    print('Observed data shape: ' + str(df_observed.shape))
    print('Initial anomalies data shape: ' + str(df_anomaly.shape))

    # MODEL CREATION #
    ts = LSTM_params['time_steps']
    if model_type == ModelType.VANILLA:
        model = modeling_utilities.LSTM_multivar(
            df_observed, df_anomaly, df_raw, LSTM_params, summary, name, model_output, model_save)
        # Results are aligned with the rows after the first time_steps rows.
        window = slice(ts, None)
    elif model_type == ModelType.BIDIRECTIONAL:
        model = modeling_utilities.LSTM_multivar_bidir(
            df_observed, df_anomaly, df_raw, LSTM_params, summary, name, model_output, model_save)
        # Results are aligned with the rows between the first and last time_steps rows.
        window = slice(ts, -ts)
    else:
        # Fail here with a clear message instead of an unbound 'model' further down.
        raise ValueError('Unsupported model_type: ' + str(model_type))
    print('multivariate ' + str(model_type) + ' LSTM model complete.\n')

    # Plot Metrics and Evaluate the Model
    if plots:
        plt.figure()
        plt.plot(model.history.history['loss'], label='Training Loss')
        plt.plot(model.history.history['val_loss'], label='Validation Loss')
        plt.legend()
        plt.show()

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    residuals = pd.DataFrame(model.test_residuals)
    residuals.columns = sensors
    predictions = pd.DataFrame(model.predictions)
    predictions.columns = sensors
    observed = df_observed.iloc[window]
    residuals.index = observed.index
    predictions.index = observed.index

    threshold = dict()
    detections = dict()
    for snsr in sensors:
        threshold[snsr] = anomaly_utilities.set_dynamic_threshold(
            residuals[snsr], params[snsr]['window_sz'], params[snsr]['alpha'],
            params[snsr]['threshold_min'])
        threshold[snsr].index = residuals.index
        detections[snsr] = anomaly_utilities.detect_anomalies(
            observed[snsr + '_obs'], predictions[snsr], residuals[snsr], threshold[snsr], summary=True)
        if plots:
            plt.figure()
            # 'sensors' is a list, so the sensor name itself is passed (not sensors[snsr]).
            anomaly_utilities.plt_threshold(residuals[snsr], threshold[snsr], snsr)
            plt.show()
    print('Threshold determination complete.')

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    all_data = dict()
    for snsr in sensors:
        # Copy the slice so the new columns are written to an independent frame.
        all_data[snsr] = sensor_array[snsr].iloc[window].copy()
        all_data[snsr]['detected_anomaly'] = detections[snsr]['anomaly']
        all_data[snsr]['all_anomalies'] = all_data[snsr].eval('detected_anomaly or anomaly')
        all_data[snsr]['detected_event'] = anomaly_utilities.anomaly_events(
            all_data[snsr]['all_anomalies'], params[snsr]['widen'])

    # COMPARE AND DETERMINE METRICS #
    if compare:
        metrics = dict()
        e_metrics = dict()
        for snsr in sensors:
            all_data[snsr]['labeled_event'] = anomaly_utilities.anomaly_events(
                all_data[snsr]['labeled_anomaly'], params[snsr]['widen'])
            anomaly_utilities.compare_events(all_data[snsr], params[snsr]['widen'])
            metrics[snsr] = anomaly_utilities.metrics(all_data[snsr])
            e_metrics[snsr] = anomaly_utilities.event_metrics(all_data[snsr])

            # OUTPUT RESULTS #
            print('\nModel type: LSTM multivariate ' + str(model_type))
            print('Sensor: ' + snsr)
            anomaly_utilities.print_metrics(metrics[snsr])
            print('Event based calculations:')
            anomaly_utilities.print_metrics(e_metrics[snsr])
            print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        for snsr in sensors:
            plt.figure()
            # NOTE(review): 'labeled_event' is read from sensor_array[snsr], which this function never
            # writes to - it must already be present in the input.
            anomaly_utilities.plt_results(
                raw=sensor_array[snsr]['raw'],
                predictions=detections[snsr]['prediction'],
                labels=sensor_array[snsr]['labeled_event'],
                detections=all_data[snsr]['detected_event'],
                sensor=snsr)
            plt.show()

    workflow = ModelWorkflow()
    workflow.sensor_array = sensor_array
    workflow.df_observed = df_observed
    workflow.df_raw = df_raw
    workflow.df_anomaly = df_anomaly
    workflow.model = model
    workflow.threshold = threshold
    workflow.detections = detections
    workflow.all_data = all_data
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
def ARIMA_detect(df, sensor, params, rules=False, plots=False, summary=True, compare=False,
                 suppress_warnings=True):
    """
    ARIMA_detect runs the ARIMA workflow for a single sensor: optional rules based detection, model
    creation, dynamic threshold determination, anomaly detection, event widening and numbering, and
    optional comparison against labels and plotting. Columns are added to df in place.

    Arguments:
        df: data frame with columns 'raw', 'observed' and 'anomaly'. 'labeled_anomaly' is read when
            compare is True and 'labeled_event' is read when plots is True.
        sensor: sensor name, used in printed messages and plots.
        params: parameters for the sensor. Keys 'pdq', 'window_sz', 'alpha', 'threshold_min' and 'widen'
            are read, plus 'max_range', 'min_range' and 'persist' when rules is True.
        rules: if True, run range check, persistence, group size and interpolation first.
        plots: if True, show threshold and results plots.
        summary: passed through to the model builder.
        compare: if True, compare detections to labeled anomalies and print metrics.
        suppress_warnings: passed through to the model builder.
    Returns:
        ModelWorkflow object with attributes df, model_fit, threshold and detections, plus metrics and
        e_metrics when compare is True.
    """
    print('\nProcessing ARIMA detections.')

    # RULES BASED DETECTION #
    if rules:
        # persistence returns (df, count) and other callers in this file unpack range_check the same
        # way, so keep only the data frame; a bare data frame return is accepted as well.
        checked = rules_detect.range_check(df, params['max_range'], params['min_range'])
        df = checked[0] if isinstance(checked, tuple) else checked
        persisted = rules_detect.persistence(df, params['persist'])
        df = persisted[0] if isinstance(persisted, tuple) else persisted
        size = rules_detect.group_size(df)
        df = rules_detect.interpolate(df)
        print(sensor + ' rules based detection complete. Longest detected group = ' + str(size))

    # MODEL CREATION #
    [p, d, q] = params['pdq']
    model_fit, residuals, predictions = modeling_utilities.build_arima_model(
        df['observed'], p, d, q, summary, suppress_warnings)
    print(sensor + ' ARIMA model complete.')

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    threshold = anomaly_utilities.set_dynamic_threshold(
        residuals[0], params['window_sz'], params['alpha'], params['threshold_min'])
    threshold.index = residuals.index
    if plots:
        plt.figure()
        anomaly_utilities.plt_threshold(residuals, threshold, sensor)
        plt.show()
    print('Threshold determination complete.')
    detections = anomaly_utilities.detect_anomalies(
        df['observed'], predictions, residuals, threshold, summary=True)

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    df['detected_anomaly'] = detections['anomaly']
    df['all_anomalies'] = df.eval('detected_anomaly or anomaly')
    df['detected_event'] = anomaly_utilities.anomaly_events(df['all_anomalies'], params['widen'])

    if compare:
        df['labeled_event'] = anomaly_utilities.anomaly_events(df['labeled_anomaly'], params['widen'])

        # DETERMINE METRICS #
        anomaly_utilities.compare_events(df, params['widen'])
        metrics = anomaly_utilities.metrics(df)
        e_metrics = anomaly_utilities.event_metrics(df)

        # OUTPUT RESULTS #
        print('Model type: ARIMA')
        print('Sensor: ' + sensor)
        anomaly_utilities.print_metrics(metrics)
        print('Event based calculations:')
        anomaly_utilities.print_metrics(e_metrics)
        print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        plt.figure()
        # NOTE(review): 'labeled_event' is only created above when compare is True; otherwise it must
        # already be present in df.
        anomaly_utilities.plt_results(
            raw=df['raw'],
            predictions=detections['prediction'],
            labels=df['labeled_event'],
            detections=df['detected_event'],
            sensor=sensor)
        plt.show()

    workflow = ModelWorkflow()
    workflow.df = df
    workflow.model_fit = model_fit
    workflow.threshold = threshold
    workflow.detections = detections
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
def LSTM_detect_univar(df, sensor, params, LSTM_params, model_type, name='',
                       rules=False, plots=False, summary=True, compare=False,
                       model_output=True, model_save=True):
    """
    LSTM_detect_univar runs the univariate LSTM workflow for a single sensor: optional rules based
    detection, model creation, dynamic threshold determination, anomaly detection, event widening and
    numbering, and optional comparison against labels and plotting.

    Arguments:
        df: data frame with columns 'raw', 'observed' and 'anomaly'. 'labeled_anomaly' is read when
            compare is True and 'labeled_event' is read when plots is True.
        sensor: sensor name, used in printed messages and plots.
        params: parameters for the sensor. Keys 'window_sz', 'alpha', 'threshold_min' and 'widen' are
            read, plus 'max_range', 'min_range' and 'persist' when rules is True.
        LSTM_params: LSTM parameters; 'time_steps' is read here and the whole object is passed to the
            model builder.
        model_type: ModelType.VANILLA or ModelType.BIDIRECTIONAL.
        name: passed through to the model builder.
        rules: if True, run range check, persistence, group size and interpolation first.
        plots: if True, show training loss, threshold and results plots.
        summary: passed through to the model builder.
        compare: if True, compare detections to labeled anomalies and print metrics.
        model_output: passed through to the model builder.
        model_save: passed through to the model builder.
    Returns:
        ModelWorkflow object with attributes df, model, threshold, detections and df_anomalies, plus
        metrics and e_metrics when compare is True.
    Raises:
        ValueError: if model_type is neither ModelType.VANILLA nor ModelType.BIDIRECTIONAL.
    """
    print('\nProcessing LSTM univariate ' + str(model_type) + ' detections.')

    # RULES BASED DETECTION #
    if rules:
        # persistence returns (df, count) and other callers in this file unpack range_check the same
        # way, so keep only the data frame; a bare data frame return is accepted as well.
        checked = rules_detect.range_check(df, params['max_range'], params['min_range'])
        df = checked[0] if isinstance(checked, tuple) else checked
        persisted = rules_detect.persistence(df, params['persist'])
        df = persisted[0] if isinstance(persisted, tuple) else persisted
        size = rules_detect.group_size(df)
        df = rules_detect.interpolate(df)
        print(sensor + ' rules based detection complete. Maximum detected group length = ' + str(size))

    # MODEL CREATION #
    ts = LSTM_params['time_steps']
    if model_type == ModelType.VANILLA:
        model = modeling_utilities.LSTM_univar(df, LSTM_params, summary, name, model_output, model_save)
        # Results are aligned with the rows after the first time_steps rows.
        window = slice(ts, None)
    elif model_type == ModelType.BIDIRECTIONAL:
        model = modeling_utilities.LSTM_univar_bidir(df, LSTM_params, summary, name, model_output, model_save)
        # Results are aligned with the rows between the first and last time_steps rows.
        window = slice(ts, -ts)
    else:
        # Fail here with a clear message instead of an unbound 'model' further down.
        raise ValueError('Unsupported model_type: ' + str(model_type))
    print(sensor + ' ' + str(model_type) + ' LSTM model complete.')
    if plots:
        plt.figure()
        plt.plot(model.history.history['loss'], label='Training Loss')
        plt.plot(model.history.history['val_loss'], label='Validation Loss')
        plt.legend()
        plt.show()

    # DETERMINE THRESHOLD AND DETECT ANOMALIES #
    threshold = anomaly_utilities.set_dynamic_threshold(
        model.test_residuals[0], params['window_sz'], params['alpha'], params['threshold_min'])
    threshold.index = df.iloc[window].index
    residuals = pd.DataFrame(model.test_residuals)
    residuals.index = threshold.index
    if plots:
        plt.figure()
        anomaly_utilities.plt_threshold(residuals, threshold, sensor)
        plt.show()
    observed = df[['observed']].iloc[window]
    print('Threshold determination complete.')
    detections = anomaly_utilities.detect_anomalies(
        observed, model.predictions, model.test_residuals, threshold, summary=True)

    # WIDEN AND NUMBER ANOMALOUS EVENTS #
    # Copy the slice so the new columns are written to an independent frame.
    df_anomalies = df.iloc[window].copy()
    df_anomalies['detected_anomaly'] = detections['anomaly']
    df_anomalies['all_anomalies'] = df_anomalies.eval('detected_anomaly or anomaly')
    df_anomalies['detected_event'] = anomaly_utilities.anomaly_events(
        df_anomalies['all_anomalies'], params['widen'])

    if compare:
        df_anomalies['labeled_event'] = anomaly_utilities.anomaly_events(
            df_anomalies['labeled_anomaly'], params['widen'])

        # DETERMINE METRICS #
        anomaly_utilities.compare_events(df_anomalies, params['widen'])
        metrics = anomaly_utilities.metrics(df_anomalies)
        e_metrics = anomaly_utilities.event_metrics(df_anomalies)

        # OUTPUT RESULTS #
        print('Model type: LSTM univariate ' + str(model_type))
        print('Sensor: ' + sensor)
        anomaly_utilities.print_metrics(metrics)
        print('Event based calculations:')
        anomaly_utilities.print_metrics(e_metrics)
        print('Model report complete\n')

    # GENERATE PLOTS #
    if plots:
        plt.figure()
        # NOTE(review): 'labeled_event' is read from df, which this function never writes to - it must
        # already be present in the input.
        anomaly_utilities.plt_results(
            raw=df['raw'],
            predictions=detections['prediction'],
            labels=df['labeled_event'],
            detections=df_anomalies['detected_event'],
            sensor=sensor)
        plt.show()

    workflow = ModelWorkflow()
    workflow.df = df
    workflow.model = model
    workflow.threshold = threshold
    workflow.detections = detections
    workflow.df_anomalies = df_anomalies
    if compare:
        workflow.metrics = metrics
        workflow.e_metrics = e_metrics

    return workflow
minimum=site_params[site][snsr]['min_range']) sensor_array[snsr], persist_count[snsr] = rules_detect.persistence( df=sensor_array[snsr], length=site_params[site][snsr]['persist'], output_grp=True) sensor_array[snsr] = rules_detect.add_labels(df=sensor_array[snsr], value=-9999) sensor_array[snsr] = rules_detect.interpolate(df=sensor_array[snsr]) # s = rules_detect.group_size(df=sensor_array[snsr]) # size.append(s) # print(str(snsr) + ' longest detected group = ' + str(size)) # metrics for rules based detection # for snsr in sensor_array: df_rules_metrics = sensor_array[snsr] df_rules_metrics['labeled_event'] = anomaly_utilities.anomaly_events( anomaly=df_rules_metrics['labeled_anomaly'], wf=0) df_rules_metrics['detected_event'] = anomaly_utilities.anomaly_events( anomaly=df_rules_metrics['anomaly'], wf=0) anomaly_utilities.compare_events(df=df_rules_metrics, wf=0) rules_metrics[snsr] = anomaly_utilities.metrics(df=df_rules_metrics) print('\nRules based metrics') print('Sensor: ' + snsr) anomaly_utilities.print_metrics(df=rules_metrics[snsr]) del (df_rules_metrics) print('Rules based detection complete.\n') #### Detect Calibration Events ######################################### calib_sensors = sensors[1:4]