def rank_aggregation(func, method, td_lag):
    """Detect intervals by aggregating rankings over several embedding dimensions.

    MaxDiv is run once for every embedding dimension from 3 to 20. The score
    each run assigns to an interval is recorded, the intervals are ordered
    via `KemenyCompare` sort keys (approximate Kemeny rank aggregation), and
    every interval is finally re-scored with its inverse rank.

    Args:
        func: Mapping whose 'ts' entry is the time series handed to
            `maxdiv.maxdiv`.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag forwarded to `maxdiv.maxdiv`.

    Returns:
        Tuple `(regions, 0)` where `regions` is the result of
        `maxdiv.find_max_regions` on the re-scored intervals. The second
        element is always 0.
    """
    embedding_dims = range(3, 21)

    # One score vector per interval; slot `col` holds the score obtained with
    # embedding dimension `embedding_dims[col]` (0 if the run did not report
    # that interval).
    scores_by_interval = {}
    for col, dim in enumerate(embedding_dims):
        found = maxdiv.maxdiv(func['ts'],
                              method=method,
                              mode='I_OMEGA',
                              extint_min_len=20,
                              extint_max_len=100,
                              num_intervals=None,
                              overlap_th=1.0,
                              td_dim=dim,
                              td_lag=td_lag)
        for a, b, score in found:
            interval = (a, b)
            row = scores_by_interval.get(interval)
            if row is None:
                row = scores_by_interval[interval] = np.zeros(
                    len(embedding_dims))
            row[col] = score

    # Approximate Kemeny rank aggregation: an interval is preferred over
    # another one if the majority of rankers prefers it.
    ranked = sorted(
        scores_by_interval.keys(),
        key=lambda intvl: KemenyCompare(scores_by_interval, intvl),
        reverse=True)

    # The best-ranked interval receives the highest score.
    total = len(ranked)
    rescored = [(a, b, total - pos) for pos, (a, b) in enumerate(ranked)]

    return maxdiv.find_max_regions(rescored), 0
def td_from_mi(func, method, td_lag):
    """Run MaxDiv with an embedding dimension derived from mutual information.

    Candidate values 2 .. int(0.05 * func['ts'].shape[1]) - 1 are tried; the
    one whose `mutual_information(func['ts'], 2, candidate - 1)` is smallest
    is integer-divided by `td_lag` to obtain the embedding dimension.

    Args:
        func: Mapping whose 'ts' entry is the time series (its second axis
            length bounds the candidates).
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag; also the divisor applied to the
            selected candidate.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']

    # Candidate with minimum mutual information
    candidates = range(2, int(0.05 * series.shape[1]))
    best = min(candidates,
               key=lambda cand: mutual_information(series, 2, cand - 1))
    k = best // td_lag

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
def td_from_length_scale(func, method, td_lag, factor=0.3):
    """Run MaxDiv with an embedding dimension derived from a length scale.

    The dimension is `round(factor * ls / td_lag)`, where `ls` is the value
    returned by `length_scale(func['ts'])`, capped from above by 5% of the
    length of the second axis of the series and from below by 1.

    Args:
        func: Mapping whose 'ts' entry is the time series.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag.
        factor: Multiplier applied to the length scale.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']

    # Length scale of the series (a Gaussian process length scale according
    # to the original comment)
    ls = length_scale(series)

    # Clamp the proposed dimension into [1, 5% of the series length]
    upper_bound = 0.05 * series.shape[1]
    proposed = round(factor * ls / td_lag)
    k = int(max(1, min(upper_bound, proposed)))

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
Example #4
0
def runOnDataset(dataset, params):
    """Run the MaxDiv detector on every annotated record of a dataset.

    All `*.csv` files found under `ROOT_DIR + dataset` are treated as
    records. Records for which `readECG` reports no anomalies are skipped.

    Args:
        dataset: Dataset directory name, appended to `ROOT_DIR`.
        params: Keyword arguments forwarded to `maxdiv.maxdiv`.

    Returns:
        Tuple `(detections, numAnomalies)`: one `classifyDetections` result
        per processed record, and the total number of anomalies across those
        records.
    """
    csv_pattern = ROOT_DIR + dataset + '/*.csv'
    records = [
        os.path.splitext(os.path.basename(path))[0]
        for path in glob(csv_pattern)
    ]

    results = []
    anomaly_total = 0
    for record in records:
        ecg, timesteps, anomalies = readECG('{}/{}'.format(dataset, record))
        if len(anomalies) == 0:
            # Nothing to evaluate against for this record
            continue

        print('Running detector on {}/{}'.format(dataset, record))
        sys.stdout.flush()

        raw = maxdiv.maxdiv(ecg, useLibMaxDiv=True, **params)
        results.append(classifyDetections(raw, timesteps, anomalies))
        anomaly_total += len(anomalies)

    return results, anomaly_total
def find_best_k(func, method, td_lag):
    """Pick the embedding dimension that maximizes Average Precision.

    MaxDiv is run for every embedding dimension from 3 to 20 and evaluated
    against the ground truth in `func['gt']`. The dimension with the highest
    Average Precision wins; ties are broken by the higher AUC.

    Args:
        func: Mapping with the time series under 'ts' and the ground truth
            under 'gt'.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag.

    Returns:
        Tuple `(detections, k)` with the detections of the winning run and
        its embedding dimension.
    """
    series = func['ts']
    truth = func['gt']

    k_best = 0
    ap_best = 0.0
    auc_best = 0.0
    regions_best = []
    for dim in range(3, 21):
        found = maxdiv.maxdiv(series,
                              method=method,
                              mode='I_OMEGA',
                              extint_min_len=20,
                              extint_max_len=100,
                              num_intervals=None,
                              td_dim=dim,
                              td_lag=td_lag)
        cur_ap = eval.average_precision([truth], [found])
        cur_auc = eval.auc(truth, found, series.shape[1])

        nothing_yet = k_best == 0
        higher_ap = cur_ap > ap_best
        tie_with_higher_auc = (cur_ap == ap_best) and (cur_auc > auc_best)
        if nothing_yet or higher_ap or tie_with_higher_auc:
            k_best = dim
            ap_best = cur_ap
            auc_best = cur_auc
            regions_best = found

    return regions_best, k_best
def td_from_ce_gradient(func, method, td_lag, th=0.001):
    """Run MaxDiv with an embedding dimension chosen from the decrease of
    conditional entropy.

    `conditional_entropy(func['ts'], d, td_lag)` is evaluated for
    d = 1 .. int(0.05 * length / td_lag) - 1. The first position where the
    decrease over two steps drops to `th` or below determines the dimension;
    if there is none, the position with the smallest difference is used.

    Args:
        func: Mapping whose 'ts' entry is the time series.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag.
        th: Threshold on the two-step decrease of conditional entropy.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']
    dim_limit = int(0.05 * series.shape[1] / td_lag)
    ce = np.array([
        conditional_entropy(series, dim, td_lag)
        for dim in range(1, dim_limit)
    ])

    # With the kernel [-1, 0, 1], element i of the 'valid' convolution is
    # ce[i] - ce[i + 2], i.e. the decrease over two steps.
    dce = np.convolve(ce, [-1, 0, 1], 'valid')

    flat_positions = np.flatnonzero(dce <= th)
    position = flat_positions[0] if flat_positions.size > 0 else dce.argmin()
    k = position + 2

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
def td_from_mi_gradient(func, method, td_lag, th=0.15):
    """Run MaxDiv with an embedding dimension chosen from the decrease of
    mutual information.

    `mutual_information(func['ts'], 2, d)` is evaluated for
    d = 1 .. int(0.05 * length) - 1. The first position where the decrease
    over two steps drops to the threshold or below is selected (or the
    position of the smallest difference if there is none), shifted by 3 and
    integer-divided by `td_lag`.

    Args:
        func: Mapping whose 'ts' entry is the time series.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag; also the divisor applied to the
            selected position.
        th: Threshold on the decrease; it is multiplied by the size of the
            first axis of the series before use.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']
    threshold = th * series.shape[0]

    lag_limit = int(0.05 * series.shape[1])
    mi = np.array([
        mutual_information(series, 2, lag) for lag in range(1, lag_limit)
    ])

    # With the kernel [-1, 0, 1], element i of the 'valid' convolution is
    # mi[i] - mi[i + 2], i.e. the decrease over two steps.
    dmi = np.convolve(mi, [-1, 0, 1], 'valid')

    flat_positions = np.flatnonzero(dmi <= threshold)
    position = flat_positions[0] if flat_positions.size > 0 else dmi.argmin()
    k = (position + 3) // td_lag

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
def td_from_relative_ce(func, method, td_lag, th=0.005):
    """Run MaxDiv with an embedding dimension chosen from the decrease of
    normalized conditional entropy.

    `conditional_entropy(func['ts'], d, td_lag)` is evaluated for
    d = 1 .. int(0.05 * length / td_lag) - 1 and divided by its first value.
    The first position where the decrease over two steps drops to `th` or
    below determines the dimension; if there is none, the position with the
    smallest difference is used.

    Args:
        func: Mapping whose 'ts' entry is the time series.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag.
        th: Threshold on the two-step decrease of the normalized values.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']
    dim_limit = int(0.05 * series.shape[1] / td_lag)
    rce = np.array([
        conditional_entropy(series, dim, td_lag)
        for dim in range(1, dim_limit)
    ])
    # Normalize in place relative to the first value
    rce /= rce[0]

    # With the kernel [-1, 0, 1], element i of the 'valid' convolution is
    # rce[i] - rce[i + 2], i.e. the decrease over two steps.
    drce = np.convolve(rce, [-1, 0, 1], 'valid')

    flat_positions = np.flatnonzero(drce <= th)
    position = flat_positions[0] if flat_positions.size > 0 else drce.argmin()
    k = position + 2

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
def td_from_relative_mi(func, method, td_lag, th=0.05):
    """Run MaxDiv with an embedding dimension chosen from the decrease of
    normalized mutual information.

    `mutual_information(func['ts'], 2, d)` is evaluated for
    d = 1 .. int(0.05 * length) - 1 and divided by its first value. The first
    position where the decrease over two steps drops to `th` or below is
    selected (or the position of the smallest difference if there is none),
    shifted by 3 and integer-divided by `td_lag`.

    Args:
        func: Mapping whose 'ts' entry is the time series.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag; also the divisor applied to the
            selected position.
        th: Threshold on the two-step decrease of the normalized values.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']
    lag_limit = int(0.05 * series.shape[1])
    rmi = np.array([
        mutual_information(series, 2, lag) for lag in range(1, lag_limit)
    ])
    # Normalize in place relative to the first value
    rmi /= rmi[0]

    # With the kernel [-1, 0, 1], element i of the 'valid' convolution is
    # rmi[i] - rmi[i + 2], i.e. the decrease over two steps.
    drmi = np.convolve(rmi, [-1, 0, 1], 'valid')

    flat_positions = np.flatnonzero(drmi <= th)
    position = flat_positions[0] if flat_positions.size > 0 else drmi.argmin()
    k = (position + 3) // td_lag

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k
def td_from_false_neighbors(func, method, td_lag, Rtol=1.0, Ntol=0.001):
    """Run MaxDiv with an embedding dimension chosen by a false-nearest-
    neighbors style criterion.

    For k = 1 .. int(0.05 * n), every pair (i, j) with i < j is compared:
    the distance between the samples shifted back by `k * td_lag` (clamped
    at index 0) is divided by the distance accumulated so far for the pair,
    and the pair counts as a false neighbor if that ratio exceeds `Rtol**2`.
    The search stops as soon as the change in the count over the last two
    steps is at most `Ntol` times the change over the first two steps; in
    that case the dimension two steps earlier is used.

    Args:
        func: Mapping whose 'ts' entry is a 2-d time series; the length `n`
            is taken from its second axis.
        method: Method name forwarded to `maxdiv.maxdiv`.
        td_lag: Time-delay embedding lag.
        Rtol: Ratio tolerance; it is squared before the comparison
            (presumably because the distance matrix holds squared
            distances - TODO confirm against `calc_distance_matrix`).
        Ntol: Relative tolerance on the change of the false-neighbor count.

    Returns:
        Tuple `(detections, k)` with the MaxDiv output and the embedding
        dimension that was used.
    """
    series = func['ts']
    _, n = series.shape
    ratio_limit = Rtol * Rtol

    # Determine embedding dimension based on false nearest neighbors
    dist = maxdiv_util.calc_distance_matrix(series)
    cumdist = dist.copy()
    fnn = []
    max_k = int(0.05 * series.shape[1])
    for k in range(1, max_k + 1):

        shift = k * td_lag
        false_count = 0
        for i in range(n - 1):
            prev_i = max(0, i - shift)
            for j in range(i + 1, n):
                prev_j = max(0, j - shift)
                added = dist[prev_i, prev_j]
                if added / cumdist[i, j] > ratio_limit:
                    false_count += 1
                # Accumulate the contribution of the new embedding coordinate
                cumdist[i, j] += added
        fnn.append(false_count)

        if len(fnn) >= 3:
            recent_change = abs(fnn[-3] - fnn[-1])
            initial_change = abs(fnn[0] - fnn[2])
            if recent_change <= Ntol * initial_change:
                # The count has levelled off: go back to where it started to
                k -= 2
                break

    # Detect regions
    found = maxdiv.maxdiv(series,
                          method=method,
                          mode='I_OMEGA',
                          extint_min_len=20,
                          extint_max_len=100,
                          num_intervals=None,
                          td_dim=k,
                          td_lag=td_lag)
    return found, k

def text2feat(text):
    """Stack the feature representations of all sentences in `text`.

    Args:
        text: Iterable of sentences; each one is passed to `sent_feat`.

    Returns:
        The array produced by `np.vstack` over the per-sentence results of
        `sent_feat`, in the order of `text`.
    """
    # np.vstack requires a real sequence: passing a generator has been
    # deprecated since NumPy 1.16 and is rejected by newer releases, so the
    # features are materialized into a list first.
    return np.vstack([sent_feat(s) for s in text])


# Script entry point: builds a mixed text, searches it for anomalous
# paragraphs with MaxDiv and prints the detected paragraphs.
if __name__ == '__main__':

    # Optional positional command line arguments override the module defaults
    minLen = int(sys.argv[1]) if len(sys.argv) > 1 else MIN_LEN
    maxLen = int(sys.argv[2]) if len(sys.argv) > 2 else MAX_LEN
    numForeign = int(sys.argv[3]) if len(sys.argv) > 3 else NUM_FOREIGN

    # `gt` is returned alongside the text but is not used below
    text, gt = makeMixedText(minLen, maxLen, numForeign)

    # One feature row per sentence (see text2feat)
    feat = text2feat(text)

    # Time the detection only; the features are transposed before they are
    # handed to maxdiv, and twice as many intervals as `numForeign` are
    # requested.
    start = time.time()
    intervals = maxdiv(feat.T,
                       method='gaussian_global_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numForeign * 2)
    stop = time.time()
    print(
        'The search for anomalous paragraphs in a text of {} sentences took {} seconds.'
        .format(len(text), stop - start))

    printDetectedParagraphs(text, intervals)
Example #12
0
# Evaluate every method for a range of values of `l` (the second argument of
# sample_gp_with_meanshift; per the axis labels below, the length of the
# anomaly). One mean AUC and one Average Precision value is collected per
# method and per value of `l`.
aucs = { method : [] for method in methods }
aps = { method : [] for method in methods }
ratios = []
for l in range(20, 201, 15):
    
    print(l)
    
    # Draw `m` samples once so that all methods are evaluated on the same data;
    # each sample is a (time series, ground truth) pair
    funcs = [sample_gp_with_meanshift(n, l) for i in range(m)]
    
    for method in methods:
        auc = []
        regions = []
        
        for i in range(m):
            gp, ygt = funcs[i]
            regions.append(maxdiv.maxdiv(gp, method = method, num_intervals = 5, extint_min_len = 20, extint_max_len = 220,
                                         kernelparameters={'kernel_sigma_sq': args.kernel_sigma_sq}, **parameters))
            auc.append(eval.auc(ygt, regions[-1], n))
        
        # AUC is averaged over the samples, Average Precision is computed
        # jointly over all samples
        aucs[method].append(np.mean(auc))
        aps[method].append(eval.average_precision([ygt for gp, ygt in funcs], regions))
    
    # x-axis value of the plots below
    ratios.append(float(l) / n)

# Plot results
fig_auc = plt.figure()
sp_auc = fig_auc.add_subplot(111, xlabel = 'Length of anomaly / Length of time series', ylabel = 'AUC')
fig_ap = plt.figure()
sp_ap = fig_ap.add_subplot(111, xlabel = 'Length of anomaly / Length of time series', ylabel = 'Average Precision')
for method in methods:
    sp_auc.plot(ratios, aucs[method], marker = 'x', label = method)
    sp_ap.plot(ratios, aps[method], marker = 'x', label = method)
Example #13
0
    method = sys.argv[1] if len(sys.argv) > 1 else 'parzen'
    propmeth = sys.argv[2] if len(sys.argv) > 2 else 'hotellings_t'

    # Load data
    data, dates = read_hpw_csv('HPW_2012_41046.csv')
    data = preproc.normalize_time_series(data)
    
    # Detect
    if method in ['hotellings_t', 'kde']:
        if method == 'kde':
            scores = baselines_noninterval.pointwiseKDE(preproc.td(data))
        else:
            scores = baselines_noninterval.hotellings_t(preproc.td(data))
        regions = baselines_noninterval.pointwiseScoresToIntervals(scores, 24)
    elif method == 'gaussian_cov_ts':
        regions = maxdiv.maxdiv(data, 'gaussian_cov', mode = 'TS', preproc = 'td', proposals = propmeth,
                                extint_min_len = 24, extint_max_len = 170, num_intervals = 5)
    else:
        regions = maxdiv.maxdiv(data, method, mode = 'I_OMEGA', preproc = 'td', proposals = propmeth,
                                extint_min_len = 24, extint_max_len = 170, num_intervals = 5)
    
    # Console output
    print('-- Ground Truth --')
    for name, (a, b) in HURRICANE_GT.items():
        print('{:{}s}: {!s} - {!s}'.format(name, max(len(n) for n in HURRICANE_GT.keys()), a, b - datetime.timedelta(days = 1)))
    print('\n-- Detected Intervals ({} with {} proposals) --'.format(method, propmeth))
    for a, b, score in regions:
        print('{!s} - {!s} (Score: {})'.format(dates[a], dates[b-1], score))
    
    # Plot
    ygt = [(datetime_diff(a, dates[0]), datetime_diff(b, dates[0])) for a, b in HURRICANE_GT.values()]
    eval.plotDetections(data, regions, ygt,
Example #14
0
            propparams = {'useMAD': useMAD, 'sd_th': sd_th}
            if not filtered:
                propparams['filter'] = None

            for ftype in data:
                gts = []
                cur_regions = []
                for func in data[ftype]:
                    gts.append(func['gt'])
                    cur_regions.append(
                        maxdiv.maxdiv(func['ts'],
                                      method=METHOD,
                                      mode=MODE,
                                      preproc='normalize',
                                      td_dim=6,
                                      td_lag=2,
                                      num_intervals=None,
                                      extint_min_len=20,
                                      extint_max_len=100,
                                      proposals=propmeth,
                                      proposalparameters=propparams))
                aps.append(eval.average_precision(gts, cur_regions))
                ygts += gts
                regions += cur_regions

            ap[id][sd_th] = eval.average_precision(ygts, regions)
            mean_ap[id][sd_th] = np.mean(aps)

# Print results as table
hdiv_len = 5 + sum(len(lbl) + 3
                   for lbl in labels.values())  # length of horizontal divider
Example #15
0
# Measure runtime of various methods for different lengths of time series
# (wall-clock time of maxdiv.maxdiv, averaged over `m` sampled series per length)
times = {method: [] for method in METHODS}
lengths = []
for n in range(25, 1001, 25):

    print('-- n = {} --'.format(n))

    # Start a new accumulator slot for this length
    for method in METHODS:
        times[method].append(0.0)

    for i in range(m):
        # Every method is timed on the same sampled series
        gp = sample_gp_with_meanshift(n)
        for method in METHODS:
            start_time = time.time()
            maxdiv.maxdiv(gp, method=method, preproc=PREPROC, mode=MODE)
            stop_time = time.time()
            times[method][-1] += stop_time - start_time

    lengths.append(n)
    # Turn the accumulated sum into the mean runtime per call
    for method in METHODS:
        times[method][-1] /= m

# Plot results
markers = ['x', 'o', '*', 'v', '^', '<', '>']
fig = plt.figure()
sp = fig.add_subplot(111,
                     xlabel='Length of time series',
                     ylabel='Runtime in seconds')
for i, (method, t) in enumerate(times.items()):
    # Cycle through the marker list if there are more methods than markers
    sp.plot(lengths, t, marker=markers[i % len(markers)], label=method)
Example #16
0
    data, dates = read_hpw_csv('HPW_2012_41046.csv')
    data = preproc.normalize_time_series(data)

    # Detect
    if method in ['hotellings_t', 'kde']:
        if method == 'kde':
            scores = baselines_noninterval.pointwiseKDE(preproc.td(data))
        else:
            scores = baselines_noninterval.hotellings_t(preproc.td(data))
        regions = baselines_noninterval.pointwiseScoresToIntervals(scores, 24)
    elif method == 'gaussian_cov_ts':
        regions = maxdiv.maxdiv(data,
                                'gaussian_cov',
                                mode='TS',
                                td_dim=3,
                                td_lag=1,
                                proposals=propmeth,
                                extint_min_len=24,
                                extint_max_len=72,
                                num_intervals=5)
    else:
        regions = maxdiv.maxdiv(data,
                                method,
                                mode='I_OMEGA',
                                td_dim=3,
                                td_lag=1,
                                proposals=propmeth,
                                extint_min_len=24,
                                extint_max_len=72,
                                num_intervals=5)
                sys.stderr.write('- {} -\n'.format(labels[id]))
                sys.stderr.flush()

                aucs = []
                regions = []
                ygts = []

                for i, func in enumerate(data[ftype]):
                    time_start = time.time()
                    regions.append(
                        maxdiv.maxdiv(
                            func['ts'],
                            useLibMaxDiv=True,
                            method=method,
                            preproc=preproc,
                            mode=mode,
                            td_dim=args.td_dim if preproc is not None else 1,
                            kernelparameters={
                                'kernel_sigma_sq': args.kernel_sigma_sq
                            },
                            **parameters))
                    time_stop = time.time()

                    ygts.append(func['gt'])
                    aucs.append(
                        eval.auc(func['gt'], regions[-1], func['ts'].shape[1]))
                    if preproc is None:
                        if func['ts'].shape[1] not in times:
                            times[func['ts'].shape[1]] = {
                                m: []
                                for m in METHODS
Example #18
0
# Script entry point: searches the words of 'english-kjv.txt' for anomalous
# paragraphs with MaxDiv and prints the detected paragraphs.
if __name__ == '__main__':

    # The model argument is mandatory; print the usage and quit without it
    if len(sys.argv) < 2:
        print(
            'Usage: {} <word2vec-model> [<min-len = {}> [<max-len = {}> [<num-intervals = {}>]]]'
            .format(sys.argv[0], MIN_LEN, MAX_LEN, NUM_INTERVALS))
        exit()

    # Remaining positional arguments are optional and fall back to the
    # module defaults
    model = sys.argv[1]
    minLen = int(sys.argv[2]) if len(sys.argv) > 2 else MIN_LEN
    maxLen = int(sys.argv[3]) if len(sys.argv) > 3 else MAX_LEN
    numIntervals = int(sys.argv[4]) if len(sys.argv) > 4 else NUM_INTERVALS

    # Convert the words into a feature matrix using the given model
    text = genesis.words(fileids='english-kjv.txt')
    feat = textutils.text2mat(text, model)

    # Time the detection only
    start = time.time()
    intervals = maxdiv(feat,
                       method='gaussian_cov',
                       mode='TS',
                       extint_min_len=minLen,
                       extint_max_len=maxLen,
                       num_intervals=numIntervals)
    stop = time.time()
    print(
        'The search for anomalous paragraphs in a text of {} words took {} seconds.'
        .format(len(text), stop - start))

    textutils.printDetectedParagraphs(text, intervals)
Example #19
0
                       (window_sd**2))
# Second Gaussian-shaped window, centered at window_center2
gauss_window2 = np.exp(-0.5 * ((np.arange(0.0, ts_len) - window_center2)**2) /
                       (window_sd**2))
# Random base series; ts1 is modulated by the first window only, ts2 by the
# sum of both windows. Both get additional noise with standard deviation 0.1.
ts = mean + np.random.randn(ts_len) * sd
ts1 = gauss_window1 * ts + 0.1 * np.random.randn(ts_len)
ts2 = (gauss_window1 + gauss_window2) * ts + 0.1 * np.random.randn(ts_len)

# Ground-truth intervals: three standard deviations around each window
# center; the `+ 1` suggests an exclusive end index.
gt = [(window_center1 - 3 * window_sd, window_center1 + 3 * window_sd + 1),
      (window_center2 - 3 * window_sd, window_center2 + 3 * window_sd + 1)]

# Apply MDI Gaussian on different scenarios
# (the series is reshaped to a single row of length ts_len before detection)
print('--- OMEGA_I on single extremum ---')
det = maxdiv(ts1.reshape((1, ts_len)),
             'gaussian_cov',
             None,
             mode='OMEGA_I',
             extint_min_len=10,
             extint_max_len=8 * window_sd,
             preproc='td')
plotDetections(ts1.reshape((1, ts_len)), det, [gt[0]], silent=False)

# Same data and settings, only the mode differs
print('--- I_OMEGA on single extremum ---')
det = maxdiv(ts1.reshape((1, ts_len)),
             'gaussian_cov',
             None,
             mode='I_OMEGA',
             extint_min_len=10,
             extint_max_len=8 * window_sd,
             preproc='td')
plotDetections(ts1.reshape((1, ts_len)), det, [gt[0]], silent=False)
Example #20
0
    # Measure runtimes and write them to timing.csv
    with open('timing.csv', 'w') as outFile:
        outFile.write(
            'Length,Gaussian (Python),KDE (Python),Gaussian (libmaxdiv),KDE (libmaxdiv)\n'
        )

        for i, n in enumerate(N):
            gps = sample_gp(n)

            start = time()
            maxdiv.maxdiv(gps,
                          'gaussian_cov',
                          None,
                          'dense',
                          useLibMaxDiv=False,
                          mode='I_OMEGA',
                          preproc='td',
                          extint_min_len=min_len,
                          extint_max_len=max_len)
            stop = time()
            times[i, 0] = stop - start

            start = time()
            maxdiv.maxdiv(gps,
                          'parzen',
                          None,
                          'dense',
                          useLibMaxDiv=False,
                          mode='I_OMEGA',
                          preproc='td',