def rank_aggregation(func, method, td_lag):
    """Rank intervals by aggregating their scores over several embedding dimensions.

    The detector is run once for every embedding dimension in 3..20. For each
    detected interval the per-dimension scores are stored in one vector
    (dimensions that did not report the interval keep a score of 0). The
    intervals are then ordered with ``KemenyCompare`` and receive their
    inverse rank as the new detection score.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series passed to
            ``maxdiv.maxdiv``.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Forwarded to ``maxdiv.maxdiv`` as ``td_lag``.

    Returns:
        Tuple ``(detections, 0)``; ``detections`` is the result of
        ``maxdiv.find_max_regions`` applied to the re-scored intervals. The
        second element is always 0 because no single embedding dimension is
        selected.
    """
    embedding_dims = range(3, 21)

    # Collect scores for all intervals with various embedding dimensions
    scores_by_interval = {}
    for column, dim in enumerate(embedding_dims):
        found = maxdiv.maxdiv(func['ts'],
                              method=method,
                              mode='I_OMEGA',
                              extint_min_len=20,
                              extint_max_len=100,
                              num_intervals=None,
                              overlap_th=1.0,
                              td_dim=dim,
                              td_lag=td_lag)
        for start, end, score in found:
            interval = (start, end)
            if interval not in scores_by_interval:
                # One slot per embedding dimension, zero where not detected
                scores_by_interval[interval] = np.zeros(len(embedding_dims))
            scores_by_interval[interval][column] = score

    # Sort detections by Approximate Kemeny Rank Aggregation
    # (an interval is preferred over another one if the majority of rankers does so)
    ordered = sorted(scores_by_interval,
                     key=lambda intvl: KemenyCompare(scores_by_interval, intvl),
                     reverse=True)

    # Assign inverse rank as detection score
    total = len(ordered)
    ranked = [(start, end, total - position)
              for position, (start, end) in enumerate(ordered)]

    return maxdiv.find_max_regions(ranked), 0
def td_from_mi(func, method, td_lag):
    """Derive the embedding dimension from the candidate with minimum mutual information.

    Candidates ``c`` range over ``2 .. int(0.05 * length) - 1``; each is rated
    by ``mutual_information(ts, 2, c - 1)``. The best candidate is
    integer-divided by ``td_lag`` to obtain the embedding dimension.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[1]``
            is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.

    Raises:
        ValueError: If the candidate range is empty (``min`` of an empty
            sequence), i.e. ``int(0.05 * length) <= 2``.
    """
    series = func['ts']
    candidate_limit = int(0.05 * series.shape[1])

    # Determine Time Lag with minimum Mutual Information
    best_candidate = min(
        range(2, candidate_limit),
        key=lambda candidate: mutual_information(series, 2, candidate - 1))
    # NOTE(review): this becomes 0 when td_lag exceeds the best candidate —
    # confirm that maxdiv.maxdiv accepts td_dim=0.
    k = best_candidate // td_lag

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def td_from_length_scale(func, method, td_lag, factor=0.3):
    """Derive the embedding dimension from the value returned by ``length_scale``.

    The dimension is ``round(factor * ls / td_lag)``, clamped to the range
    ``[1, 0.05 * length]`` and truncated to an integer.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[1]``
            is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        factor: Multiplier applied to the length scale before dividing by
            ``td_lag``.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    series = func['ts']

    # Determine Length Scale of Gaussian Process
    ls = length_scale(series)

    # Set Embedding Dimension: at least 1, at most 5% of the series length
    upper_bound = 0.05 * series.shape[1]
    scaled = round(factor * ls / td_lag)
    k = int(max(1, min(upper_bound, scaled)))

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def runOnDataset(dataset, params):
    """Run the detector on every annotated record of a dataset.

    Record names are the base names (without extension) of all ``*.csv``
    files matched by ``ROOT_DIR + dataset + '/*.csv'``. Records for which
    ``readECG`` reports no anomalies are skipped.

    Args:
        dataset: Name of the dataset directory, appended to ``ROOT_DIR``.
        params: Keyword arguments forwarded to ``maxdiv.maxdiv``.

    Returns:
        Tuple ``(detections, numAnomalies)``: a list with one
        ``classifyDetections`` result per processed record, and the total
        number of anomalies over those records.
    """
    csv_paths = glob(ROOT_DIR + dataset + '/*.csv')
    records = [os.path.splitext(os.path.basename(path))[0] for path in csv_paths]

    detections = []
    numAnomalies = 0
    for record in records:
        ecg, timesteps, anomalies = readECG('{}/{}'.format(dataset, record))
        if len(anomalies) == 0:
            # Nothing to evaluate against for this record
            continue

        print('Running detector on {}/{}'.format(dataset, record))
        sys.stdout.flush()

        found = maxdiv.maxdiv(ecg, useLibMaxDiv=True, **params)
        detections.append(classifyDetections(found, timesteps, anomalies))
        numAnomalies += len(anomalies)

    return detections, numAnomalies
def find_best_k(func, method, td_lag):
    """Search embedding dimensions 3..20 for the one with the best Average Precision.

    Ties in Average Precision are broken by the higher AUC. This uses the
    ground truth in ``func['gt']`` to pick the dimension.

    Args:
        func: Mapping with the time series under ``'ts'`` and the ground
            truth under ``'gt'``.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Forwarded to ``maxdiv.maxdiv`` as ``td_lag``.

    Returns:
        Tuple ``(regions_best, k_best)``: the detections obtained with the
        winning embedding dimension, and that dimension.
    """
    series = func['ts']

    # Find embedding dimension which maximizes AP
    k_best = 0
    ap_best = 0.0
    auc_best = 0.0
    regions_best = []
    for k in range(3, 21):
        detections = maxdiv.maxdiv(series,
                                   method=method,
                                   mode='I_OMEGA',
                                   extint_min_len=20,
                                   extint_max_len=100,
                                   num_intervals=None,
                                   td_dim=k,
                                   td_lag=td_lag)
        cur_ap = eval.average_precision([func['gt']], [detections])
        cur_auc = eval.auc(func['gt'], detections, series.shape[1])

        nothing_chosen_yet = (k_best == 0)
        higher_ap = (cur_ap > ap_best)
        same_ap_higher_auc = (cur_ap == ap_best) and (cur_auc > auc_best)
        if nothing_chosen_yet or higher_ap or same_ap_higher_auc:
            k_best = k
            ap_best = cur_ap
            auc_best = cur_auc
            regions_best = detections

    return regions_best, k_best
def td_from_ce_gradient(func, method, td_lag, th=0.001):
    """Choose the embedding dimension where conditional entropy stops decreasing steeply.

    ``conditional_entropy(ts, d, td_lag)`` is evaluated for
    ``d = 1 .. int(0.05 * length / td_lag) - 1``. The two-step decrease
    ``ce[i] - ce[i + 2]`` is computed for every ``i``; the first index whose
    decrease is at most ``th`` (or, if there is none, the index of the
    smallest decrease) plus 2 becomes the embedding dimension.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[1]``
            is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        th: Threshold on the two-step decrease.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    series = func['ts']
    dim_limit = int(0.05 * series.shape[1] / td_lag)

    # Determine embedding dimension based on the steepness of decrease of
    # conditional entropy
    ce = np.array([conditional_entropy(series, d, td_lag)
                   for d in range(1, dim_limit)])
    # np.convolve flips the kernel, so dce[i] == ce[i] - ce[i + 2]
    dce = np.convolve(ce, [-1, 0, 1], 'valid')

    flat_positions = np.where(dce <= th)[0]
    if flat_positions.size > 0:
        first_index = flat_positions[0]
    else:
        first_index = dce.argmin()
    k = first_index + 2

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def td_from_mi_gradient(func, method, td_lag, th=0.15):
    """Choose the embedding dimension where mutual information stops decreasing steeply.

    ``mutual_information(ts, 2, d)`` is evaluated for
    ``d = 1 .. int(0.05 * length) - 1``. The two-step decrease
    ``mi[i] - mi[i + 2]`` is compared against ``th`` scaled by
    ``ts.shape[0]``; the first index at or below the threshold (or the index
    of the smallest decrease if there is none) plus 3, integer-divided by
    ``td_lag``, becomes the embedding dimension.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[0]``
            scales the threshold and ``shape[1]`` is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        th: Threshold on the two-step decrease, before scaling.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    series = func['ts']
    scaled_th = th * series.shape[0]
    lag_limit = int(0.05 * series.shape[1])

    # Determine Time Lag based on the steepness of decrease of mutual information
    mi = np.array([mutual_information(series, 2, d)
                   for d in range(1, lag_limit)])
    # np.convolve flips the kernel, so dmi[i] == mi[i] - mi[i + 2]
    dmi = np.convolve(mi, [-1, 0, 1], 'valid')

    flat_positions = np.where(dmi <= scaled_th)[0]
    if flat_positions.size > 0:
        first_index = flat_positions[0]
    else:
        first_index = dmi.argmin()
    k = (first_index + 3) // td_lag

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def td_from_relative_ce(func, method, td_lag, th=0.005):
    """Choose the embedding dimension from conditional entropy normalized by its first value.

    ``conditional_entropy(ts, d, td_lag)`` is evaluated for
    ``d = 1 .. int(0.05 * length / td_lag) - 1`` and divided by the value at
    ``d = 1``. The first index whose two-step decrease
    ``rce[i] - rce[i + 2]`` is at most ``th`` (or the index of the smallest
    decrease if there is none) plus 2 becomes the embedding dimension.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[1]``
            is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        th: Threshold on the two-step decrease of the normalized values.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    series = func['ts']
    dim_limit = int(0.05 * series.shape[1] / td_lag)

    # Determine embedding dimension based on "normalized" conditional entropy
    rce = np.array([conditional_entropy(series, d, td_lag)
                    for d in range(1, dim_limit)])
    rce /= rce[0]
    # np.convolve flips the kernel, so drce[i] == rce[i] - rce[i + 2]
    drce = np.convolve(rce, [-1, 0, 1], 'valid')

    flat_positions = np.where(drce <= th)[0]
    if flat_positions.size > 0:
        first_index = flat_positions[0]
    else:
        first_index = drce.argmin()
    k = first_index + 2

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def td_from_relative_mi(func, method, td_lag, th=0.05):
    """Choose the embedding dimension from mutual information normalized by its first value.

    ``mutual_information(ts, 2, d)`` is evaluated for
    ``d = 1 .. int(0.05 * length) - 1`` and divided by the value at
    ``d = 1``. The first index whose two-step decrease
    ``rmi[i] - rmi[i + 2]`` is at most ``th`` (or the index of the smallest
    decrease if there is none) plus 3, integer-divided by ``td_lag``,
    becomes the embedding dimension.

    Args:
        func: Mapping whose ``'ts'`` entry is the time series; ``shape[1]``
            is used as its length.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        th: Threshold on the two-step decrease of the normalized values.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    series = func['ts']
    lag_limit = int(0.05 * series.shape[1])

    # Determine Time Lag based on "normalized" Mutual Information
    rmi = np.array([mutual_information(series, 2, d)
                    for d in range(1, lag_limit)])
    rmi /= rmi[0]
    # np.convolve flips the kernel, so drmi[i] == rmi[i] - rmi[i + 2]
    drmi = np.convolve(rmi, [-1, 0, 1], 'valid')

    flat_positions = np.where(drmi <= th)[0]
    if flat_positions.size > 0:
        first_index = flat_positions[0]
    else:
        first_index = drmi.argmin()
    k = (first_index + 3) // td_lag

    # Detect regions
    detections = maxdiv.maxdiv(series,
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def td_from_false_neighbors(func, method, td_lag, Rtol=1.0, Ntol=0.001):
    """Choose the embedding dimension with a false-nearest-neighbors style criterion.

    For k = 1 .. int(0.05 * n), every pair of time steps (i, j) with i < j is
    checked: the distance between the samples lagged by ``k * td_lag``
    (clamped at index 0) is compared with the distance accumulated so far for
    that pair. Pairs whose ratio exceeds ``Rtol ** 2`` are counted. The search
    stops once the count changes little over two steps, relative to the change
    over the first two steps.

    Args:
        func: Mapping whose ``'ts'`` entry is a 2-d array; its shape is
            unpacked as ``(d, n)`` and ``n`` is used as the number of time steps.
        method: Forwarded to ``maxdiv.maxdiv`` as ``method``.
        td_lag: Time lag; also forwarded to ``maxdiv.maxdiv``.
        Rtol: Ratio threshold; it is squared before use.
        Ntol: Relative tolerance for the stopping criterion.

    Returns:
        Tuple ``(detections, k)`` with the detector output and the chosen
        embedding dimension.
    """
    # NOTE: d is not used below.
    d, n = func['ts'].shape
    # presumably squared because calc_distance_matrix returns squared distances — TODO confirm
    Rtol2 = Rtol * Rtol

    # Determine embedding dimension based on false nearest neighbors
    dist = maxdiv_util.calc_distance_matrix(func['ts'])
    # Running sum of distances per pair (i, j); grows with every k
    cumdist = dist.copy()
    fnn = []  # number of counted pairs for each k tried so far
    max_k = int(0.05 * func['ts'].shape[1])
    # NOTE(review): if max_k is 0 this loop never runs and k is unbound at the
    # maxdiv call below (NameError).
    for k in range(1, max_k + 1):
        cur_fnn = 0
        for i in range(n - 1):
            for j in range(i + 1, n):
                # Indices of the samples k * td_lag steps earlier, clamped to 0.
                # NOTE: id shadows the builtin of the same name inside this function.
                id = max(0, i - k * td_lag)
                jd = max(0, j - k * td_lag)
                # NOTE(review): no guard against cumdist[i, j] == 0 — confirm
                # that distinct time steps never have zero distance.
                if dist[id, jd] / cumdist[i, j] > Rtol2:
                    cur_fnn += 1
                cumdist[i, j] += dist[id, jd]
        fnn.append(cur_fnn)
        # Stop when the count changed (relatively) little over the last two steps
        if (len(fnn) >= 3) and (abs(fnn[-3] - fnn[-1]) <= Ntol * abs(fnn[0] - fnn[2])):
            # Step back two dimensions before leaving the loop
            k -= 2
            break

    # Detect regions
    detections = maxdiv.maxdiv(func['ts'],
                               method=method,
                               mode='I_OMEGA',
                               extint_min_len=20,
                               extint_max_len=100,
                               num_intervals=None,
                               td_dim=k,
                               td_lag=td_lag)
    return detections, k
def text2feat(text):
    """Stack the features of all sentences vertically into one array.

    Args:
        text: Iterable of sentences; each one is passed to ``sent_feat``.

    Returns:
        The result of ``np.vstack`` over the per-sentence features, in the
        order of ``text``.

    Raises:
        ValueError: If ``text`` is empty (``np.vstack`` needs at least one array).
    """
    # np.vstack requires a real sequence: passing a generator was deprecated
    # in NumPy 1.16 and raises TypeError in current releases.
    return np.vstack([sent_feat(s) for s in text])


if __name__ == '__main__':
    # Optional positional arguments: <min-len> <max-len> <num-foreign>
    minLen = int(sys.argv[1]) if len(sys.argv) > 1 else MIN_LEN
    maxLen = int(sys.argv[2]) if len(sys.argv) > 2 else MAX_LEN
    numForeign = int(sys.argv[3]) if len(sys.argv) > 3 else NUM_FOREIGN

    text, gt = makeMixedText(minLen, maxLen, numForeign)
    feat = text2feat(text)

    # Only the detector call itself is timed
    start = time.time()
    intervals = maxdiv(feat.T,
                       method='gaussian_global_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numForeign * 2)
    stop = time.time()

    print(
        'The search for anomalous paragraphs in a text of {} sentences took {} seconds.'
        .format(len(text), stop - start))
    printDetectedParagraphs(text, intervals)
# Evaluate every method for several values of l (20, 35, ..., 200) and collect
# one mean AUC and one Average Precision per method and value.
aucs = { method : [] for method in methods }
aps = { method : [] for method in methods }
ratios = []
for l in range(20, 201, 15):
    print(l)
    # m samples for this l; each sample unpacks as (time series, ground truth).
    # presumably l is the anomaly length (see the axis labels below) — TODO confirm
    funcs = [sample_gp_with_meanshift(n, l) for i in range(m)]
    for method in methods:
        auc = []
        regions = []
        for i in range(m):
            gp, ygt = funcs[i]
            regions.append(maxdiv.maxdiv(gp, method = method, num_intervals = 5,
                                         extint_min_len = 20, extint_max_len = 220,
                                         kernelparameters={'kernel_sigma_sq': args.kernel_sigma_sq},
                                         **parameters))
            auc.append(eval.auc(ygt, regions[-1], n))
        # AUC is averaged over the samples; AP is computed once over all samples
        aucs[method].append(np.mean(auc))
        aps[method].append(eval.average_precision([ygt for gp, ygt in funcs], regions))
    # x-coordinate for the plots: l relative to n
    ratios.append(float(l) / n)

# Plot results
fig_auc = plt.figure()
sp_auc = fig_auc.add_subplot(111, xlabel = 'Length of anomaly / Length of time series', ylabel = 'AUC')
fig_ap = plt.figure()
sp_ap = fig_ap.add_subplot(111, xlabel = 'Length of anomaly / Length of time series', ylabel = 'Average Precision')
# One curve per method in each of the two figures
for method in methods:
    sp_auc.plot(ratios, aucs[method], marker = 'x', label = method)
    sp_ap.plot(ratios, aps[method], marker = 'x', label = method)
method = sys.argv[1] if len(sys.argv) > 1 else 'parzen' propmeth = sys.argv[2] if len(sys.argv) > 2 else 'hotellings_t' # Load data data, dates = read_hpw_csv('HPW_2012_41046.csv') data = preproc.normalize_time_series(data) # Detect if method in ['hotellings_t', 'kde']: if method == 'kde': scores = baselines_noninterval.pointwiseKDE(preproc.td(data)) else: scores = baselines_noninterval.hotellings_t(preproc.td(data)) regions = baselines_noninterval.pointwiseScoresToIntervals(scores, 24) elif method == 'gaussian_cov_ts': regions = maxdiv.maxdiv(data, 'gaussian_cov', mode = 'TS', preproc = 'td', proposals = propmeth, extint_min_len = 24, extint_max_len = 170, num_intervals = 5) else: regions = maxdiv.maxdiv(data, method, mode = 'I_OMEGA', preproc = 'td', proposals = propmeth, extint_min_len = 24, extint_max_len = 170, num_intervals = 5) # Console output print('-- Ground Truth --') for name, (a, b) in HURRICANE_GT.items(): print('{:{}s}: {!s} - {!s}'.format(name, max(len(n) for n in HURRICANE_GT.keys()), a, b - datetime.timedelta(days = 1))) print('\n-- Detected Intervals ({} with {} proposals) --'.format(method, propmeth)) for a, b, score in regions: print('{!s} - {!s} (Score: {})'.format(dates[a], dates[b-1], score)) # Plot ygt = [(datetime_diff(a, dates[0]), datetime_diff(b, dates[0])) for a, b in HURRICANE_GT.values()] eval.plotDetections(data, regions, ygt,
propparams = {'useMAD': useMAD, 'sd_th': sd_th} if not filtered: propparams['filter'] = None for ftype in data: gts = [] cur_regions = [] for func in data[ftype]: gts.append(func['gt']) cur_regions.append( maxdiv.maxdiv(func['ts'], method=METHOD, mode=MODE, preproc='normalize', td_dim=6, td_lag=2, num_intervals=None, extint_min_len=20, extint_max_len=100, proposals=propmeth, proposalparameters=propparams)) aps.append(eval.average_precision(gts, cur_regions)) ygts += gts regions += cur_regions ap[id][sd_th] = eval.average_precision(ygts, regions) mean_ap[id][sd_th] = np.mean(aps) # Print results as table hdiv_len = 5 + sum(len(lbl) + 3 for lbl in labels.values()) # length of horizontal divider
# Measure runtime of various methods for different lengths of time series times = {method: [] for method in METHODS} lengths = [] for n in range(25, 1001, 25): print('-- n = {} --'.format(n)) for method in METHODS: times[method].append(0.0) for i in range(m): gp = sample_gp_with_meanshift(n) for method in METHODS: start_time = time.time() maxdiv.maxdiv(gp, method=method, preproc=PREPROC, mode=MODE) stop_time = time.time() times[method][-1] += stop_time - start_time lengths.append(n) for method in METHODS: times[method][-1] /= m # Plot results markers = ['x', 'o', '*', 'v', '^', '<', '>'] fig = plt.figure() sp = fig.add_subplot(111, xlabel='Length of time series', ylabel='Runtime in seconds') for i, (method, t) in enumerate(times.items()): sp.plot(lengths, t, marker=markers[i % len(markers)], label=method)
# Load the data set and normalize it
data, dates = read_hpw_csv('HPW_2012_41046.csv')
data = preproc.normalize_time_series(data)

# Detect
if method in ['hotellings_t', 'kde']:
    # Point-wise baselines: score the output of preproc.td(data), then convert
    # the scores to intervals.
    if method == 'kde':
        scores = baselines_noninterval.pointwiseKDE(preproc.td(data))
    else:
        scores = baselines_noninterval.hotellings_t(preproc.td(data))
    # presumably 24 is the minimum interval length, matching extint_min_len below — TODO confirm
    regions = baselines_noninterval.pointwiseScoresToIntervals(scores, 24)
elif method == 'gaussian_cov_ts':
    # 'gaussian_cov_ts' is a local alias: the 'gaussian_cov' method with mode 'TS'
    regions = maxdiv.maxdiv(data,
                            'gaussian_cov',
                            mode='TS',
                            td_dim=3,
                            td_lag=1,
                            proposals=propmeth,
                            extint_min_len=24,
                            extint_max_len=72,
                            num_intervals=5)
else:
    # Any other method name is passed through unchanged with mode 'I_OMEGA'
    regions = maxdiv.maxdiv(data,
                            method,
                            mode='I_OMEGA',
                            td_dim=3,
                            td_lag=1,
                            proposals=propmeth,
                            extint_min_len=24,
                            extint_max_len=72,
                            num_intervals=5)
sys.stderr.write('- {} -\n'.format(labels[id])) sys.stderr.flush() aucs = [] regions = [] ygts = [] for i, func in enumerate(data[ftype]): time_start = time.time() regions.append( maxdiv.maxdiv( func['ts'], useLibMaxDiv=True, method=method, preproc=preproc, mode=mode, td_dim=args.td_dim if preproc is not None else 1, kernelparameters={ 'kernel_sigma_sq': args.kernel_sigma_sq }, **parameters)) time_stop = time.time() ygts.append(func['gt']) aucs.append( eval.auc(func['gt'], regions[-1], func['ts'].shape[1])) if preproc is None: if func['ts'].shape[1] not in times: times[func['ts'].shape[1]] = { m: [] for m in METHODS
if __name__ == '__main__':
    # Command line: <word2vec-model> [<min-len> [<max-len> [<num-intervals>]]]
    if len(sys.argv) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(sys.argv[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        # sys.exit() instead of the site-provided exit(), which is meant for
        # the interactive shell and is missing e.g. under `python -S`.
        # The exit status is unchanged (0).
        sys.exit()

    model = sys.argv[1]
    minLen = int(sys.argv[2]) if len(sys.argv) > 2 else MIN_LEN
    maxLen = int(sys.argv[3]) if len(sys.argv) > 3 else MAX_LEN
    numIntervals = int(sys.argv[4]) if len(sys.argv) > 4 else NUM_INTERVALS

    # Load the text and convert it to a feature matrix using the given model
    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    # Only the detector call itself is timed
    start = time.time()
    intervals = maxdiv(feat,
                       method='gaussian_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()

    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))
    textutils.printDetectedParagraphs(text, intervals)
(window_sd**2)) gauss_window2 = np.exp(-0.5 * ((np.arange(0.0, ts_len) - window_center2)**2) / (window_sd**2)) ts = mean + np.random.randn(ts_len) * sd ts1 = gauss_window1 * ts + 0.1 * np.random.randn(ts_len) ts2 = (gauss_window1 + gauss_window2) * ts + 0.1 * np.random.randn(ts_len) gt = [(window_center1 - 3 * window_sd, window_center1 + 3 * window_sd + 1), (window_center2 - 3 * window_sd, window_center2 + 3 * window_sd + 1)] # Apply MDI Gaussian on different scenarios print('--- OMEGA_I on single extremum ---') det = maxdiv(ts1.reshape((1, ts_len)), 'gaussian_cov', None, mode='OMEGA_I', extint_min_len=10, extint_max_len=8 * window_sd, preproc='td') plotDetections(ts1.reshape((1, ts_len)), det, [gt[0]], silent=False) print('--- I_OMEGA on single extremum ---') det = maxdiv(ts1.reshape((1, ts_len)), 'gaussian_cov', None, mode='I_OMEGA', extint_min_len=10, extint_max_len=8 * window_sd, preproc='td') plotDetections(ts1.reshape((1, ts_len)), det, [gt[0]], silent=False)
# Measure runtimes and write them to timing.csv with open('timing.csv', 'w') as outFile: outFile.write( 'Length,Gaussian (Python),KDE (Python),Gaussian (libmaxdiv),KDE (libmaxdiv)\n' ) for i, n in enumerate(N): gps = sample_gp(n) start = time() maxdiv.maxdiv(gps, 'gaussian_cov', None, 'dense', useLibMaxDiv=False, mode='I_OMEGA', preproc='td', extint_min_len=min_len, extint_max_len=max_len) stop = time() times[i, 0] = stop - start start = time() maxdiv.maxdiv(gps, 'parzen', None, 'dense', useLibMaxDiv=False, mode='I_OMEGA', preproc='td',