def do_ad(df, alpha=0.005, max_anoms=0.1, only_last=None, longterm=False, e_value=False, direction='both'):
    """
    Run anomaly detection on a dataframe and return the anomalous timestamps.

    Every tuning parameter is forwarded unchanged to ``detect_ts``.

    :param df: dataframe handed to ``detect_ts`` as-is
    :param alpha: see pyculiarity documentation for the meaning of these parameters
    :param max_anoms:
    :param only_last:
    :param longterm:
    :param e_value:
    :param direction:
    :return: the ``.values`` of the ``timestamp`` column of the detected anomalies
    """
    detection = detect_ts(
        df,
        alpha=alpha,
        max_anoms=max_anoms,
        only_last=only_last,
        longterm=longterm,
        e_value=e_value,
        direction=direction,
    )
    anomalies = detection['anoms']
    return anomalies['timestamp'].values
def detect_anomaly():
    """
    Detect anomalies in ``1.csv`` and print the second column of the result.

    Reads the ``timestamp`` and ``count`` columns, runs a two-sided detection
    restricted to the last day, and prints ``results['anoms'].iloc[:, 1]``.
    """
    twitter_example_data = pd.read_csv('1.csv', usecols=['timestamp', 'count'])
    results = detect_ts(twitter_example_data, max_anoms=0.02,
                        direction='both', only_last='day')
    # Function-call form: the print statement is a SyntaxError on Python 3,
    # while print(<single expression>) works on both Python 2 and 3.
    print(results['anoms'].iloc[:, 1])
def twitter_anomaly_detection(data_path):
    """
    Detect anomalies in the data at ``data_path`` and plot them.

    Draws the series, the rows labelled as anomalies (``label == 1``) and the
    detected anomalies, then saves the figure under ``img_path`` and shows it.

    :param data_path: location passed to ``get_data``
    """
    data = get_data(data_path)

    # Anomaly detection
    data['date'] = data['timestamp'].apply(datetime.fromtimestamp)
    anomalies = detect_ts(data[['date', 'value']], max_anoms=0.001, direction='both')
    print(anomalies)

    # Time conversion
    plt.plot(pd.to_datetime(data['date']), data['value'], '-')

    # Scatter plot of the labelled (actual) anomalies
    labelled = data['label'] == 1
    labelled_dates = data.loc[labelled]['date']
    labelled_values = data.loc[labelled]['value']
    plt.scatter(pd.to_datetime(labelled_dates), labelled_values, c='b', linewidths=3)

    # Plot the detection results
    detected = anomalies['anoms']
    detected_dates = pd.to_datetime(detected['timestamp'])
    plt.plot(
        pd.to_datetime(detected_dates, format="%Y%m%d %H:%M:%S"),
        anomalies['anoms']['anoms'],
        'ro')
    plt.grid(True)

    # Save the figure
    plt.savefig(img_path + "twitter_anomaly_detection.png", dpi=1000)
    plt.show()
def detect_bw(cell):
    """
    Detect anomalies for one cell column of ``df_concat_kpi_bw_T``.

    Gaps in the cell's series are forward- then backward-filled. ``detect_ts``
    is tried first; if it raises, ``sesd.seasonal_esd`` is used as a fallback.

    :param cell: name of the column in ``df_concat_kpi_bw_T`` to analyse
    :return: dataframe of anomalies with an added ``cell`` column
    """
    # Prepare the data once, on an explicit copy, so the fill does not write
    # into a slice of the shared source frame.
    example_data = df_concat_kpi_bw_T[['timestamp', cell]].copy()
    example_data[cell] = example_data[cell].ffill().bfill()

    try:
        # Hand detect_ts its own copy so the fallback below always sees the
        # prepared frame exactly as built above.
        results = detect_ts(example_data.copy(), max_anoms=0.09, alpha=0.001,
                            direction='both', only_last=None)
        results['anoms']['cell'] = cell
        return results['anoms'].reset_index(drop=True)
    except Exception:
        # Deliberate best-effort fallback; unlike a bare ``except`` this does
        # not swallow KeyboardInterrupt / SystemExit.
        series = example_data[cell]
        results = sesd.seasonal_esd(series, periodicity=20, hybrid=True,
                                    max_anomalies=int(len(series) * 0.05))
        tmp = example_data.loc[results].copy()
        tmp.columns = ['timestamp', 'anoms']
        tmp['cell'] = cell
        return tmp
def test_handling_of_leading_trailing_nas(self):
    """Detection still works when the first ten and the last counts are NaN."""
    # list(range(...)) is required on Python 3, where range + list raises.
    for i in list(range(10)) + [len(self.raw_data) - 1]:
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
        self.raw_data.at[i, 'count'] = np.nan
    results = detect_ts(self.raw_data, max_anoms=0.02,
                        direction='both', plot=False)
    eq_(len(results['anoms'].columns), 2)
    eq_(len(results['anoms'].iloc[:, 1]), 131)
def test_both_directions_with_plot(self):
    """Two-sided detection over the last day: expects 2 columns and 25 rows."""
    anoms = detect_ts(
        self.raw_data,
        direction='both',
        only_last='day',
        max_anoms=0.02,
        plot=False,
    )['anoms']
    eq_(len(anoms.columns), 2)
    eq_(len(anoms.iloc[:, 1]), 25)
def test_both_directions_e_value_threshold_med_max(self):
    """Two-sided detection with ``med_max`` threshold and e_value: 3 columns, 4 rows."""
    anoms = detect_ts(
        self.raw_data,
        direction='both',
        threshold="med_max",
        e_value=True,
        max_anoms=0.02,
    )['anoms']
    eq_(len(anoms.columns), 3)
    eq_(len(anoms.iloc[:, 1]), 4)
def test_both_directions_with_plot(self):
    """Two-sided detection over the last day: expects 2 columns and 21 rows."""
    options = dict(max_anoms=0.02, direction='both', only_last='day', plot=False)
    detected = detect_ts(self.raw_data, **options)
    anoms = detected['anoms']
    eq_(len(anoms.columns), 2)
    eq_(len(anoms.iloc[:, 1]), 21)
def test_both_directions_e_value_threshold_med_max(self):
    """Two-sided detection with ``med_max`` threshold and e_value: 3 columns, 4 rows."""
    options = dict(max_anoms=0.02, direction='both',
                   threshold="med_max", e_value=True)
    detected = detect_ts(self.raw_data, **options)
    anoms = detected['anoms']
    eq_(len(anoms.columns), 3)
    eq_(len(anoms.iloc[:, 1]), 4)
def test_both_directions_e_value_longterm(self):
    """Two-sided long-term detection with e_value: expects 3 columns and 131 rows."""
    anoms = detect_ts(
        self.raw_data,
        direction='both',
        longterm=True,
        e_value=True,
        max_anoms=0.02,
        plot=False,
    )['anoms']
    eq_(len(anoms.columns), 3)
    eq_(len(anoms.iloc[:, 1]), 131)
def test_handling_of_leading_trailing_nas(self):
    """Blank the first ten and the last counts, then expect 2 columns and 131 rows."""
    last_row = len(self.raw_data) - 1
    for row in [*range(10), last_row]:
        self.raw_data.at[row, 'count'] = np.nan

    anoms = detect_ts(self.raw_data, direction='both',
                      max_anoms=0.02, plot=False)['anoms']
    eq_(len(anoms.columns), 2)
    eq_(len(anoms.iloc[:, 1]), 131)
def test_both_directions_e_value_longterm(self):
    """Two-sided long-term detection with e_value: expects 3 columns and 114 rows."""
    options = dict(max_anoms=0.02, direction='both', longterm=True,
                   plot=False, e_value=True)
    detected = detect_ts(self.raw_data, **options)
    anoms = detected['anoms']
    eq_(len(anoms.columns), 3)
    eq_(len(anoms.iloc[:, 1]), 114)
def predict(self,data=None):
    """
    Run ``detect_ts`` on ``data`` using the options stored on this instance.

    Stores the anomalies' index in ``self.anomaly_idx`` and their ``anoms``
    column in ``self.anom_val``.

    :param data: input forwarded to ``detect_ts``
    :return: the ``anoms`` entry of the ``detect_ts`` result
    """
    detector_options = dict(
        max_anoms=self.max_anoms,
        direction=self.direction,
        alpha=self.alpha,
        only_last=self.only_last,
        threshold=self.threshold,
        e_value=self.e_value,
        longterm=self.longterm,
        piecewise_median_period=self.piecewise_median_period,
        custom_period=self.custom_period,
        use_period=self.use_period,
    )
    anoms = detect_ts(data, **detector_options)['anoms']
    self.anomaly_idx = anoms.index
    self.anom_val = anoms['anoms']
    return anoms
def test_check_midnight_date_format(self):
    """The ``anoms`` and ``expected_value`` columns must have the same length."""
    csv_path = os.path.join(self.path, 'midnight_test_data.csv')
    data = pd.read_csv(csv_path, usecols=['date', 'value'])
    data.date = date_format(data.date, "%Y-%m-%d %H:%M:%S")

    detected = detect_ts(data, max_anoms=0.2, threshold=None,
                         direction='both', plot=False,
                         only_last="day", e_value=True)
    anoms = detected['anoms']
    eq_(len(anoms.anoms), len(anoms.expected_value))
def filter_outliers(data):
    """
    Filter outliers.

    Runs ``detect_ts`` on ``data``, drops every row whose ``timestamp`` was
    reported as an anomaly, and renames ``value`` -> ``y`` and
    ``timestamp`` -> ``ds``.

    :param data: dataframe with ``timestamp`` and ``value`` columns
    :return: the filtered, renamed dataframe
    """
    from pyculiarity import detect_ts

    # NOTE(review): alpha=1000 looks unusually large for a parameter named
    # alpha - confirm this is intended; it is left unchanged here.
    anoms = detect_ts(data, max_anoms=0.10, alpha=1000,
                      direction='both', only_last=None)["anoms"]
    outliers_arr = anoms["timestamp"].array

    # Vectorised membership test instead of a per-row ``not in`` scan.
    keep = ~data["timestamp"].isin(outliers_arr)
    data = data[keep]
    return data.rename(columns={"value": "y", "timestamp": "ds"})
def detect_anoms(dataframe):
    """
    Run anomaly detection on the predicted prices.

    :param dataframe: dataframe with 'timestamp' and 'pred_price' columns
    :return: the ``anoms`` entry of the ``detect_ts`` result
    """
    price_series = dataframe[['timestamp', 'pred_price']].reset_index(drop=True)
    detection = detect_ts(
        price_series,
        max_anoms=0.3,
        alpha=0.001,
        direction='both',
        only_last=None,
        longterm=True,
        verbose=True,
        piecewise_median_period_weeks=3,
    )
    return detection['anoms']
def detect_ts_online(df_smooth, window_size, stop):
    """
    Check whether the newest point of a sliding window is an anomaly.

    The window covers rows ``max(0, stop - window_size)`` up to (excluding)
    ``stop`` of ``df_smooth``. The newest point counts as anomalous when its
    timestamp equals the timestamp of the last anomaly reported by
    ``detect_ts``.

    :param df_smooth: dataframe with a ``timestamp`` column
    :param window_size: number of rows in the window
    :param stop: positional end (exclusive) of the window
    :return: tuple ``(is_anomaly, run_time)`` where ``run_time`` is the
        wall-clock duration of the ``detect_ts`` call in seconds
    """
    first = max(0, stop - window_size)
    window_df = df_smooth.iloc[first:stop, :]

    started = time.time()
    results = detect_ts(window_df, alpha=0.05, max_anoms=0.02, only_last=None,
                        longterm=False, e_value=False, direction='both')
    run_time = time.time() - started

    anoms = results['anoms']
    if anoms.shape[0] <= 0:
        return False, run_time

    newest = window_df['timestamp'].tail(1).values[0]
    last_flagged = anoms.tail(1)['timestamp'].values[0]
    return bool(newest == last_flagged), run_time
def detect_outlier_peculiarity(self, frame, yColumn, max_anoms = 0.05, alpha = 0.001, direction='both', printFigure=True):
    """
    Detect outliers in one column of ``frame`` with ``detect_ts``.

    A ``timestamp`` column (seconds since 1970-01-01) is derived from the
    ``date`` column of a copy of ``frame``; the caller's frame is not modified.

    :param frame: dataframe with a ``date`` column and the ``yColumn`` column
    :param yColumn: name of the column to analyse
    :param max_anoms: forwarded to ``detect_ts``
    :param alpha: forwarded to ``detect_ts``
    :param direction: forwarded to ``detect_ts``
    :param printFigure: when true, plot, save and show the detected anomalies
    :return: the ``detect_ts`` result, or ``[]`` if ``detect_ts`` raised
    """
    def plotOutliers(data, results, columnName):
        # Convert on a private copy so we never write into a slice of the
        # caller's frame.
        data = data.copy()
        # NOTE(review): 'timestamp' holds epoch seconds; to_datetime without a
        # unit treats numbers as nanoseconds - confirm whether unit='s' is wanted.
        data['timestamp'] = pd.to_datetime(data['timestamp'])
        anoms = results['anoms']

        f, ax = plt.subplots(2, 1, sharex=True)
        ax[0].plot(data['timestamp'], data[columnName], 'b')
        ax[0].plot(anoms.index, anoms['anoms'], 'ro')
        ax[0].set_title('Detected Anomalies')
        ax[1].set_xlabel('Time Stamp')
        ax[0].set_ylabel(columnName)
        ax[1].plot(anoms.index, anoms['anoms'], 'b')
        ax[1].set_ylabel('Anomaly Magnitude')
        figTitle = columnName + " - Outliers using TwitterDetector"
        plt.savefig(figTitle+".png")
        plt.show()

    frame = frame.copy()
    frame['timestamp'] = (frame['date'] - datetime(1970,1,1)).dt.total_seconds()
    twoColumnsFrame = frame[['timestamp', yColumn]]

    try:
        # Honour the caller's settings; they used to be accepted but replaced
        # by hard-coded values identical to the defaults.
        results = detect_ts(twoColumnsFrame, max_anoms=max_anoms, alpha=alpha, direction=direction)
    except Exception:
        # Best effort: callers receive an empty list when detection fails.
        return []

    if printFigure:
        plotOutliers(twoColumnsFrame, results, yColumn)
    return results
def get_anomolies(data):
    """
    Run a positive-direction, long-term ``detect_ts`` pass over ``data``.

    :param data: input forwarded to ``detect_ts``
    :return: the unmodified ``detect_ts`` result
    """
    detection = detect_ts(
        data,
        direction='pos',
        longterm=True,
        only_last=None,
        alpha=0.01,
        max_anoms=0.01,
    )
    return detection
from datetime import time
import numpy as np

plt.style.use('ggplot')

__author__ = 'Raj Shanmuganathan'

if __name__ == '__main__':
    # Load the raw readings and build the (timestamp, value) frame for detect_ts.
    rawdata = pd.read_csv('/Users/rshanm200/Workbench/Anamoly_detection/data/newrawdata1.csv', usecols=['datetime', 'online'])
    rawdata['timestamp'] = pd.to_datetime(rawdata['datetime'], format='%Y-%m-%d %H:%M:%S')
    # Nanoseconds since the epoch -> whole seconds.
    rawdata['timestamp'] = rawdata['timestamp'].astype(np.int64) // 10**9
    # Missing readings are treated as 0.
    rawdata['value'] = rawdata['online'].apply(lambda x: 0 if pd.isna(x) else x)
    rawdata = rawdata.drop(['datetime', 'online'], axis=1)
    print(rawdata)

    results = detect_ts(rawdata, max_anoms=0.01, alpha=0.05, direction='both', piecewise_median_period_weeks=10, granularity='hr')
    print(results)

    # Format the data nicely: convert the epoch seconds back to datetimes for
    # plotting. The conversion previously landed in results['timestamp'] (a
    # stray key on the result) and omitted unit='s', so the integers would
    # have been read as nanoseconds.
    rawdata['timestamp'] = pd.to_datetime(rawdata['timestamp'], unit='s')

    # Make a nice plot.
    f, ax = plt.subplots(2, 1, sharex=True)
    ax[0].plot(rawdata['timestamp'], rawdata['value'], 'b')
    ax[0].plot(results['anoms'].index, results['anoms']['anoms'], 'ro')
    ax[0].set_title('Detected Anomalies')
    ax[1].set_xlabel('Time Stamp')
    ax[0].set_ylabel('Count')
    ax[1].plot(results['anoms'].index, results['anoms']['anoms'], 'b')
    ax[1].set_ylabel('Anomaly Magnitude')
def run(data, window=14 * 24):
    """
    Score the newest sensor reading for anomalies over a sliding window.

    The reading is appended to the stored dataframe and running averages are
    updated. Once at least ``window`` rows exist, ``detect_ts`` is run per
    sensor on the last ``window`` rows of its ``<sensor>_avg`` column. If the
    newest row is reported as an anomaly, ``<sensor>_an`` is set to True on
    the current row. The dataframe is saved in every case.

    :param data: JSON string whose ``data`` field is passed to ``pd.read_json``
    :param window: number of most recent rows forming the sliding window
    :return: ``json.dumps`` of the current row's ``to_json()`` string
    """
    # set some parameters for the AD algorithm
    alpha = 0.1
    max_anoms = 0.05
    only_last = None  # alternative, we can set this to 'hr' or 'day'

    data = pd.read_json(json.loads(data)['data'])
    sensors = ['volt', 'pressure', 'vibration', 'rotate']

    # load dataframe
    df = load_df(data)

    # add current sensor readings to data frame, also adds fields for anomaly
    # detection results
    df = append_data(df, data, sensors)

    # calculate running averages
    running_avgs(df, sensors)

    # note timestamp so that we can update the correct row of the dataframe later
    timestamp = data['timestamp'].values[0]
    is_current = df['timestamp'] == timestamp

    # explicit copy of the current (also last) row, so flagging it below never
    # writes into a view of df
    current_row = df.loc[is_current, :].copy()

    # determine how many sensor readings we already have
    rows = df.shape[0]

    # not enough rows for the sliding window yet: store and report no anomalies
    if rows < window:
        save_df(df)
        return json.dumps(current_row.to_json())

    # first row of the data frame that falls into the sliding window
    start_row = rows - window

    # whether we detected an anomaly in any of the sensors after this reading
    detected_an_anomaly = False

    for column in sensors:
        # ``.ix`` was removed in pandas 1.0; positional slicing selects exactly
        # the last ``window`` rows. Copy so renaming columns is safe.
        df_s = df.iloc[start_row:rows][['timestamp', column + "_avg"]].copy()

        # pyculiarity expects two columns with particular names
        df_s.columns = ['timestamp', 'value']

        # find anomalies
        results = detect_ts(df_s, max_anoms=max_anoms, alpha=alpha,
                            direction='both', e_value=False, only_last=only_last)

        # left-join so every window row carries its anomaly value (NaN if none)
        df_s = df_s.merge(results['anoms'], on='timestamp', how='left')

        # Only the newest reading is flagged: without it, the other readings
        # in this window may not have been anomalies.
        if not np.isnan(df_s.tail(1)['anoms'].iloc[0]):
            current_row.loc[current_row.index[0], column + '_an'] = True
            detected_an_anomaly = True

    # It's only necessary to update the current row in the data frame if we
    # detected an anomaly
    if detected_an_anomaly:
        df.loc[is_current, :] = current_row
    save_df(df)

    return json.dumps(current_row.to_json())
def run(rawdata, window=14 * 24):
    """
    Run anomaly detection on one telemetry record, then score it with ``model``.

    Four sensor values are taken from ``json_data[0][8:16:2]``, stamped with
    the current local time and appended to the stored dataframe. Once at least
    ``window`` rows exist, ``detect_ts`` is run per sensor on the last
    ``window`` rows; the sensor slots in the input are then replaced by 1.0/0.0
    anomaly flags before ``model.predict`` is called.

    :param rawdata: JSON string with a ``data`` field holding a list of rows
    :param window: number of most recent rows forming the sliding window
    :return: JSON string: ``{"result": [0]}`` while the window is not yet
        filled, ``{"result": <prediction list>}`` on success, or
        ``{"error": <message>}`` if anything raised
    """
    try:
        # set some parameters for the AD algorithm
        alpha = 0.1
        max_anoms = 0.05
        only_last = None  # alternative, we can set this to 'hr' or 'day'

        json_data = json.loads(rawdata)['data']

        # this is the beginning of anomaly detection code
        # TODO: the anomaly detection service expected one row of a pd.DataFrame
        # w/ a timestamp and machine id, but here we only get a list of values.
        # We therefore create a time stamp ourselves and build a data frame the
        # anomaly detection code can understand.
        # Eventually, we want this to be harmonized!
        timestamp = time.strftime("%m/%d/%Y %H:%M:%S", time.localtime())
        machineID = 1  # TODO scipy.random.choice(100)
        telemetry_data = json_data[0][8:16:2]

        sensors = ['volt', 'pressure', 'vibration', 'rotate']

        data_dict = {'timestamp': [timestamp], 'machineID': [machineID]}
        # Index explicitly so a short telemetry slice still raises (and is
        # reported through the error path) instead of being silently truncated.
        for i, sensor in enumerate(sensors):
            data_dict[sensor] = [telemetry_data[i]]

        telemetry_df = pd.DataFrame(data=data_dict)
        telemetry_df['timestamp'] = pd.to_datetime(telemetry_df['timestamp'])

        # load dataframe
        df = load_df(telemetry_df)

        # add current sensor readings to data frame, also adds fields for
        # anomaly detection results
        df = append_data(df, telemetry_df, sensors)

        # TODO: running averages are disabled for now, because we are dealing
        # with pre-processed data

        # note timestamp so that we can update the correct row later
        timestamp = df['timestamp'].max()
        is_current = df['timestamp'] == timestamp

        # explicit copy of the current (also last) row, so flagging it below
        # never writes into a view of df
        current_row = df.loc[is_current, :].copy()

        # determine how many sensor readings we already have
        rows = df.shape[0]

        # not enough rows for the sliding window yet: store and report no anomalies
        if rows < window:
            save_df(df)
            return json.dumps({"result": [0]})

        # first row of the data frame that falls into the sliding window
        start_row = rows - window

        # whether we detected an anomaly in any of the sensors after this reading
        detected_an_anomaly = False
        anom_list = []

        for column in sensors:
            # ``.ix`` was removed in pandas 1.0; positional slicing selects
            # exactly the last ``window`` rows. Copy so renaming is safe.
            df_s = df.iloc[start_row:rows][['timestamp', column + "_avg"]].copy()

            # pyculiarity expects two columns with particular names
            df_s.columns = ['timestamp', 'value']

            # find anomalies
            results = detect_ts(df_s, max_anoms=max_anoms, alpha=alpha,
                                direction='both', e_value=False,
                                only_last=only_last)

            # left-join so every window row carries its anomaly value (NaN if none)
            df_s = df_s.merge(results['anoms'], on='timestamp', how='left')

            # Only the newest reading is flagged: without it, the other
            # readings in this window may not have been anomalies.
            if not np.isnan(df_s.tail(1)['anoms'].iloc[0]):
                current_row.loc[current_row.index[0], column + '_an'] = True
                detected_an_anomaly = True
                anom_list.append(1.0)
            else:
                anom_list.append(0.0)

        # It's only necessary to update the current row in the data frame if
        # we detected an anomaly
        if detected_an_anomaly:
            df.loc[is_current, :] = current_row
        save_df(df)

        # replace the raw sensor slots with the per-sensor anomaly flags
        json_data[0][8:16:2] = anom_list
        # this is the end of anomaly detection code

        data = np.array(json_data)
        result = model.predict(data)
        prediction_dc.collect(result)
        print("saving prediction data" + time.strftime("%H:%M:%S"))
    except Exception as e:
        # Service boundary: report the failure to the caller as JSON.
        result = str(e)
        return json.dumps({"error": result})

    return json.dumps({"result": result.tolist()})
import pandas as pd
import matplotlib
import datetime

matplotlib.style.use('ggplot')

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # Run the detector on the raw CSV columns first.
    example_data = pd.read_csv('db_test_data.csv',
                               usecols=['time_stamp', 'temp'])
    results = detect_ts(example_data, direction='both', granularity='day',
                        alpha=0.001, max_anoms=0.05)

    # Convert the time stamps for plotting.
    example_data['time_stamp'] = pd.to_datetime(example_data['time_stamp'])
    # NOTE: set_index returns a new frame; the result is discarded here, so
    # example_data keeps its original index.
    example_data.set_index('time_stamp', drop=True)

    # Upper panel: series plus anomalies; lower panel: anomalies only.
    f, ax = plt.subplots(2, 1, sharex=True)
    upper, lower = ax[0], ax[1]
    upper.plot(example_data['time_stamp'], example_data['temp'], 'b')
    upper.plot(results['anoms'].index, results['anoms']['anoms'], 'ro')
    upper.set_title('Detected Anomalies')
    lower.set_xlabel('Time Stamp')
    upper.set_ylabel('Count')
    lower.plot(results['anoms'].index, results['anoms']['anoms'], 'b')
matplotlib.style.use('ggplot') __author__ = 'willmcginnis' if __name__ == '__main__': # first run the models twitter_example_data = pd.read_csv('../tests/raw_data.csv', usecols=['timestamp', 'count']) print(twitter_example_data['timestamp'].values[:10]) twitter_example_data['timestamp'] = twitter_example_data['timestamp'].map( lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp( )) print(twitter_example_data['timestamp'].values[:10]) results = detect_ts(twitter_example_data, max_anoms=0.05, alpha=0.001, direction='both', verbose=True) print(results['anoms']['timestamp'].values[:10]) # format the twitter data nicely twitter_example_data['timestamp'] = pd.to_datetime( twitter_example_data['timestamp']) twitter_example_data.set_index('timestamp', drop=True) twitter_example_data.to_csv('raw.csv', index=False) results['anoms'].to_csv('results.csv', index=False) # make a nice plot f, ax = plt.subplots(2, 1, sharex=True) ax[0].plot(twitter_example_data['timestamp'],
from pyculiarity import detect_ts
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import datetime

matplotlib.style.use('ggplot')

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # Run the detector on the raw CSV columns first.
    example_data = pd.read_csv('db_test_data.csv',
                               usecols=['time_stamp', 'temp'])
    results = detect_ts(example_data, direction='both', granularity='day',
                        alpha=0.001, max_anoms=0.05)

    # Convert the time stamps for plotting.
    example_data['time_stamp'] = pd.to_datetime(example_data['time_stamp'])
    # NOTE: set_index returns a new frame; the result is discarded here, so
    # example_data keeps its original index.
    example_data.set_index('time_stamp', drop=True)

    # Upper panel: series plus anomalies; lower panel: anomalies only.
    f, ax = plt.subplots(2, 1, sharex=True)
    upper, lower = ax[0], ax[1]
    upper.plot(example_data['time_stamp'], example_data['temp'], 'b')
    upper.plot(results['anoms'].index, results['anoms']['anoms'], 'ro')
    upper.set_title('Detected Anomalies')
    lower.set_xlabel('Time Stamp')
    upper.set_ylabel('Count')
    lower.plot(results['anoms'].index, results['anoms']['anoms'], 'b')
    lower.set_ylabel('Anomaly Magnitude')
    plt.show()
def test_handling_of_middle_nas(self):
    """Run detect_ts on data whose middle ``count`` value has been set to NaN."""
    # Floor division keeps the row label an integer; ``/`` yields a float on
    # Python 3, which is not a valid label for the middle row.
    middle = len(self.raw_data) // 2
    self.raw_data.at[middle, 'count'] = np.nan
    detect_ts(self.raw_data, max_anoms=0.02, direction='both')
def test_handling_of_middle_nas(self):
    """Run detect_ts on data whose middle ``count`` value has been set to NaN."""
    # Floor division keeps the row label an integer; ``/`` yields a float on
    # Python 3, which is not a valid label for the middle row.
    middle = len(self.raw_data) // 2
    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
    self.raw_data.at[middle, 'count'] = np.nan
    detect_ts(self.raw_data, max_anoms=0.02, direction='both')
import sys
import os

# Make the parent directory of this file importable.
curPath = os.path.abspath(os.path.dirname(__file__))
rootPath = os.path.split(curPath)[0]
sys.path.append(rootPath)

from pyculiarity import detect_ts
import pandas as pd

twitter_example_data = pd.read_csv(
    '/Users/mac/IdeaProjects/AIOps/src/python/pyculiarity/raw_data.csv',
    usecols=['timestamp', 'count'])
results = detect_ts(twitter_example_data, max_anoms=0.02,
                    direction='both', only_last='day')
print(str(results['anoms']['anoms']))

# Append the anomalies to the result file; the context manager guarantees the
# handle is flushed and closed (it was previously left open).
with open(
        "/Users/mac/IdeaProjects/AIOps/src/python/pyculiarity/result.csv",
        'a') as resultfile:
    resultfile.write(str(results['anoms']['anoms']))
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
import datetime

matplotlib.style.use('ggplot')

__author__ = 'willmcginnis'

if __name__ == '__main__':
    # first run the models
    twitter_example_data = pd.read_csv('../tests/raw_data.csv', usecols=['timestamp', 'count'])
    print(twitter_example_data['timestamp'].values[:10])

    # Convert the timestamp strings to float epoch seconds.
    twitter_example_data['timestamp'] = twitter_example_data['timestamp'].map(
        lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S').timestamp())
    print(twitter_example_data['timestamp'].values[:10])

    results = detect_ts(twitter_example_data, max_anoms=0.05, alpha=0.001, direction='both', verbose=True)
    print(results['anoms']['timestamp'].values[:10])

    # Format the twitter data nicely. The column holds epoch *seconds*, so the
    # unit must be given; without it the numbers are read as nanoseconds.
    # NOTE(review): .timestamp() above interprets the naive strings in local
    # time, while unit='s' yields UTC wall-clock values - confirm the intended
    # timezone handling.
    twitter_example_data['timestamp'] = pd.to_datetime(twitter_example_data['timestamp'], unit='s')
    twitter_example_data.to_csv('raw.csv', index=False)
    results['anoms'].to_csv('results.csv', index=False)

    # make a nice plot
    f, ax = plt.subplots(2, 1, sharex=True)
    ax[0].plot(twitter_example_data['timestamp'], twitter_example_data['count'], 'b')
    ax[0].plot(results['anoms'].index, results['anoms']['anoms'], 'ro')
    ax[0].set_title('Detected Anomalies')
    ax[1].set_xlabel('Time Stamp')