# Example n. 1
# 0
def train(cluster, data, nu, membership):
    """Train a ClusterSvdd with one primal-SGD SVDD model per cluster.

    Args:
        cluster: number of clusters (one SVDD model each).
        data: data matrix passed straight to ClusterSvdd.fit
            (samples in columns, judging by the sibling functions).
        nu: outlier-fraction parameter shared by all SVDDs.
        membership: initial cluster assignment, one entry per sample.

    Returns:
        (svdd, cinds): the fitted ClusterSvdd and the final cluster indices.
    """
    svdds = [SvddPrimalSGD(nu) for _ in range(cluster)]
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40)
    # print() is valid in both Python 2 and 3 (the bare statement is Py2-only)
    print(cinds)
    return svdd, cinds
def train(cluster, data, nu, membership, use_primal=True):
    """Train a ClusterSvdd with one SVDD model per cluster.

    Args:
        cluster: number of clusters (one SVDD model each).
        data: data matrix passed straight to ClusterSvdd.fit.
        nu: outlier-fraction parameter shared by all SVDDs.
        membership: initial cluster assignment, one entry per sample.
        use_primal: if True use primal SGD SVDDs, otherwise dual QP SVDDs
            with an RBF kernel (width parameter 0.4).

    Returns:
        (svdd, cinds): the fitted ClusterSvdd and the final cluster indices.
    """
    if use_primal:
        svdds = [SvddPrimalSGD(nu) for _ in range(cluster)]
    else:
        svdds = [SvddDualQP('rbf', 0.4, nu) for _ in range(cluster)]
    svdd = ClusterSvdd(svdds, nu=nu)
    cinds = svdd.fit(data,
                     init_membership=membership,
                     max_svdd_iter=10000,
                     max_iter=40)
    # print() is valid in both Python 2 and 3 (the bare statement is Py2-only)
    print(cinds)
    return svdd, cinds
def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=True):
    """Sweep (nu, k) on synthetic gaussian data and save mean/std ARIs.

    For each repetition a fresh dataset is generated and permuted, then a
    ClusterSvdd is trained for every (nu, k) combination and the clustering
    quality (adjusted Rand index) is measured on the held-out test split.

    Args:
        res_filename: path for the np.savez result archive.
        nus: iterable of outlier-fraction parameters to sweep.
        ks: iterable of cluster counts to sweep.
        outlier_frac: anomaly fraction requested from generate_data().
        reps: number of independent repetitions.
        num_train: number of training samples.
        num_test: number of test samples.
        use_primal: if True use primal SGD SVDDs, otherwise dual QP SVDDs
            with an RBF kernel (width 10.0).
    """
    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # draw a fresh gaussian sample (plus outliers) and shuffle the columns
        data, y = generate_data(num_train + num_test, outlier_frac=outlier_frac)
        inds = np.random.permutation(range(num_test + num_train))
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # one random initialization shared by all nu settings for this k
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                if use_primal:
                    svdds = [SvddPrimalSGD(nus[i]) for _ in range(ks[k])]
                else:
                    svdds = [SvddDualQP('rbf', 10.0, nus[i]) for _ in range(ks[k])]
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                _, classes = svdd.predict(data[:, test].copy())
                # score clustering quality on the nominal (label >= 0) samples only
                inds = np.where(y[test] >= 0)[0]
                aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])

    # print() works in both Python 2 and 3 (bare print statement is Py2-only)
    print(aris)
    print('')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print(maris)
    print(saris)
    np.savez(res_filename, maris=maris, saris=saris, outlier_frac=outlier_frac,
             ntrain=num_train, ntest=num_test, reps=reps, nus=nus)
# Example n. 4
# 0
def evaluate(res_filename,
             dataset,
             nus,
             ks,
             outlier_frac,
             reps,
             num_train,
             num_val,
             num_test,
             use_kernels=False):
    """Sweep (nu, k) on a real dataset; select on validation, report on test.

    For each repetition a fresh permutation of the dataset is drawn, a
    ClusterSvdd is trained per (nu, k) combination, and clustering quality
    (adjusted Rand index) plus anomaly-detection quality (ROC AUC) are
    measured on both the test and the validation split.  Model selection
    (best nu / best k) uses the validation numbers; the reported numbers
    come from the corresponding test cells.

    Args:
        res_filename: unused in this variant — kept for interface
            compatibility with the sibling evaluate() functions.
        dataset: name passed through to load_data_set().
        nus: iterable of outlier-fraction parameters to sweep.
        ks: iterable of cluster counts to sweep.
        outlier_frac: anomaly fraction requested from the loader.
        reps: number of independent repetitions.
        num_train: training set size (includes the validation slice).
        num_val: validation slice size (taken from the end of training).
        num_test: test set size.
        use_kernels: if True use dual QP SVDDs with an RBF kernel
            (width 20.0), otherwise primal SGD SVDDs.

    Returns:
        (res, res_stds): length-4 vectors [SVDD AUC, ClusterSVDD AUC,
        k-means ARI, ClusterSVDD ARI] with matching standard deviations.
    """
    train = np.array(range(num_train - num_val), dtype='i')
    val = np.array(range(num_train - num_val, num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))
    val_aris = np.zeros((reps, len(nus), len(ks)))
    val_aucs = np.zeros((reps, len(nus), len(ks)))

    for n in range(reps):
        # fresh permutation of the dataset for every repetition
        inds = np.random.permutation(range(num_test + num_train))
        data, y = load_data_set(dataset, num_train + num_test, outlier_frac,
                                inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # one random initialization shared by all nu settings for this k
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                if use_kernels:
                    svdds = [SvddDualQP('rbf', 20.0, nus[i]) for _ in range(ks[k])]
                else:
                    svdds = [SvddPrimalSGD(nus[i]) for _ in range(ks[k])]
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(),
                         init_membership=membership[train])

                # ---- test split ----
                scores, classes = svdd.predict(data[:, test].copy())
                ari = metrics.cluster.adjusted_rand_score(y[test], classes)
                if nus[i] < 1.0:
                    # restrict the ARI to samples predicted nominal (score <= 0)
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(
                        y[test[inds]], classes[inds])
                aris[n, i, k] = ari

                # ...and anomaly detection accuracy (labels < 0 are anomalies)
                fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0.,
                                                         dtype='i'),
                                                scores,
                                                pos_label=1)
                aucs[n, i, k] = metrics.auc(fpr, tpr)

                # ---- validation split (same measurements) ----
                scores, classes = svdd.predict(data[:, val].copy())
                ari = metrics.cluster.adjusted_rand_score(y[val], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(
                        y[val[inds]], classes[inds])
                val_aris[n, i, k] = ari

                fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0.,
                                                         dtype='i'),
                                                scores,
                                                pos_label=1)
                val_aucs[n, i, k] = metrics.auc(fpr, tpr)

    # print() works in both Python 2 and 3 (bare print statement is Py2-only)
    print('---------------------------------------------------')
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print('(Test) ARI:')
    print(maris)
    print(saris)

    val_maris = np.mean(val_aris, axis=0)
    val_saris = np.std(val_aris, axis=0)
    print('(Val) ARI:')
    print(val_maris)
    print(val_saris)

    print('---------------------------------------------------')
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print('(Test) AUC:')
    print(maucs)
    print(saucs)

    val_maucs = np.mean(val_aucs, axis=0)
    val_saucs = np.std(val_aucs, axis=0)
    print('(Val) AUC:')
    print(val_maucs)
    print(val_saucs)
    print('---------------------------------------------------')

    res = np.zeros(4)
    res_stds = np.zeros(4)

    # best plain SVDD: pick nu by validation AUC (column 0 is assumed k=1)
    svdd_ind = np.argmax(val_maucs[:, 0])
    print('SVDD best AUC={0}'.format(maucs[svdd_ind, 0]))
    csvdd_ind = np.argmax(val_maucs)
    i1, i2 = np.unravel_index(csvdd_ind, maucs.shape)
    print('ClusterSVDD best AUC={0}'.format(maucs[i1, i2]))
    res[0] = maucs[svdd_ind, 0]
    res_stds[0] = saucs[svdd_ind, 0]
    res[1] = maucs[i1, i2]
    res_stds[1] = saucs[i1, i2]

    # best k-means result: pick k by validation ARI (row 0 is assumed nu=1.0,
    # i.e. no outlier rejection, which makes ClusterSVDD behave like k-means)
    km_ind = np.argmax(val_maris[0, :])
    print('k-means best ARI={0}'.format(maris[0, km_ind]))
    csvdd_ind = np.argmax(val_maris)
    i1, i2 = np.unravel_index(csvdd_ind, maris.shape)
    print('ClusterSVDD best ARI={0}'.format(maris[i1, i2]))
    res[2] = maris[0, km_ind]
    res_stds[2] = saris[0, km_ind]
    res[3] = maris[i1, i2]
    res_stds[3] = saris[i1, i2]
    print('---------------------------------------------------')

    return res, res_stds
def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test):
    """Sweep (nu, k) on structured synthetic data; save ARI and Hamming loss.

    Besides clustering quality (adjusted Rand index) this variant also
    measures structured-prediction accuracy via a normalized Hamming loss
    between true and predicted state sequences.

    Args:
        res_filename: path for the np.savez result archive.
        nus: iterable of outlier-fraction parameters to sweep.
        ks: iterable of cluster counts to sweep.
        outlier_frac: anomaly fraction requested from generate_data().
        reps: number of independent repetitions.
        num_train: number of training samples.
        num_test: number of test samples.
    """
    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')

    aris = np.zeros((reps, len(nus), len(ks)))
    loss = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # draw a fresh structured sample (3 clusters, 3 dims) and shuffle
        X, S, y = generate_data(num_train + num_test,
                                cluster=3,
                                outlier_frac=outlier_frac,
                                dims=3,
                                plot=False)
        inds = np.random.permutation(range(num_test + num_train))
        data = preprocess_training_data(X, S, inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        # print() works in both Python 2 and 3 (bare statement is Py2-only)
        print(data)
        print(y)
        for k in range(len(ks)):
            # one random initialization shared by all nu settings for this k
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = [SvddPrimalSGD(nus[i]) for _ in range(ks[k])]
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train], init_membership=membership[train])

                # time the (expensive) structured prediction step
                stime = time.time()
                pred_phis, true_states, pred_states = preprocess_test_data(
                    svdd, X, S, inds[num_train:])
                _, classes = svdd.predict(pred_phis)
                print('---------------- TIME')
                print(time.time() - stime)
                print('----------------')

                # clustering quality on the nominal (label >= 0) samples only
                ninds = np.where(y[test] >= 0)[0]
                aris[n, i, k] = metrics.cluster.adjusted_rand_score(
                    y[test[ninds]], classes[ninds])
                # structured prediction accuracy
                loss[n, i, k] = hamming_loss(true_states, pred_states)
                print(loss[n, i, k])

    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print('ARI')
    print(maris)
    print(saris)

    mloss = np.mean(loss, axis=0)
    sloss = np.std(loss, axis=0)
    print('Normalized Hamming Distance')
    print(mloss)
    print(sloss)

    np.savez(res_filename,
             maris=maris,
             saris=saris,
             mloss=mloss,
             sloss=sloss,
             outlier_frac=outlier_frac,
             ntrain=num_train,
             ntest=num_test,
             reps=reps,
             nus=nus)
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD

if __name__ == '__main__':
    nu = 0.15  # outlier fraction

    # generate raw training data
    # NOTE(review): 2 x 1000 matrix — features in rows, samples in columns
    Dtrain = np.random.randn(2, 1000)
    # scale so the largest absolute entry is 1
    Dtrain /= np.max(np.abs(Dtrain))

    # train dual svdd ('linear' kernel; second argument is a kernel/model
    # parameter — presumably unused for the linear kernel, verify in SvddDualQP)
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)

    # train primal svdd via stochastic gradient descent
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)

    # print solutions: objective value (pobj) and squared radius (radius2)
    print('\n  dual-svdd: obj={0}  T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0}  T={1}.\n'.format(psvdd.pobj, psvdd.radius2))

    # generate test data grid covering roughly [-2, 2] x [-2, 2]
    delta = 0.1
    x = np.arange(-2.0 - delta, 2.0 + delta, delta)
    y = np.arange(-2.0 - delta, 2.0 + delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    # flatten the grid into a 2 x (sx*sy) matrix of 2-D test points
    Xf = np.reshape(X, (1, sx * sy))
    Yf = np.reshape(Y, (1, sx * sy))
    Dtest = np.append(Xf, Yf, axis=0)
# Example n. 7
# 0
    # NOTE(review): fragment of a larger function — `num_train`, `num_val`,
    # `num_test`, `k`, `nu`, `nuu`, `run`, `num_class`, `use_kernels`,
    # `compute_accuracy` and `data_generator` come from the enclosing scope
    # that is not visible here.
    train = np.array(range(num_train), dtype='i')
    val = np.array(range(num_train, num_train + num_val), dtype='i')
    test = np.array(range(num_train + num_val, num_train + num_val + num_test),
                    dtype='i')

    # sample a labeled dataset; transpose puts samples into columns
    dg = data_generator()
    # NOTE(review): `inds` is not used below — presumably consumed later in
    # the enclosing function, or dead code; confirm against the full source.
    inds = np.random.permutation(range(num_test + num_train + num_val))
    data, y = dg.sample_with_label(int(num_test + num_train + num_val))
    data = data.T
    # random initial cluster assignment, one SVDD model per cluster
    membership = np.random.randint(0, k, y.size)
    svdds = list()
    for l in range(k):
        if use_kernels:
            svdds.append(SvddDualQP('rbf', 20.0, nu))
        else:
            svdds.append(SvddPrimalSGD(nu))
    svdd = ClusterSvdd(svdds)
    svdd.fit(data[:, train].copy(),
             max_iter=100,
             max_svdd_iter=100000,
             init_membership=membership[train])
    # persist the trained model, then reload it (pickle round-trip)
    file_name = 'csvdd_' + str(k) + 'c_nu0' + str(nuu) + '_ring_' + str(
        run) + '.sav'
    pickle.dump(svdd, open(file_name, 'wb'))
    svdd = pickle.load(open(file_name, 'rb'))
    # test error
    print(data.shape, test[-1])
    scores, classes = svdd.predict(data[:, test].copy())

    # clustering quality and classification accuracy on the test split
    ari = metrics.cluster.adjusted_rand_score(y[test], classes)
    test_acc = compute_accuracy(y[test], classes, num_class)
def evaluate(res_filename, nus, sigmas, ks, reps, ntrain, ntest, nval,
             use_kernels, anom_frac):
    """Sweep (nu, k) with RBF-width model selection on validation AUC.

    For every (nu, k) cell the kernel width sigma is chosen by training one
    model per sigma and keeping the one with the best validation AUC; the
    corresponding test AUC is stored.

    Args:
        res_filename: path for the np.savez result archive.
        nus: iterable of outlier-fraction parameters to sweep.
        sigmas: iterable of RBF kernel widths to select among.
        ks: iterable of cluster counts to sweep.
        reps: number of independent repetitions.
        ntrain: training set size (includes the validation slice).
        ntest: test set size.
        nval: validation slice size (taken from the end of training).
        use_kernels: if True use dual QP SVDDs with an RBF kernel; otherwise
            primal SGD SVDDs (note: the primal model ignores sigma, so the
            sigma loop then retrains identical models — kept for parity).
        anom_frac: anomaly fraction requested from generate_data().
    """
    train = np.array(range(ntrain - nval), dtype='i')
    val = np.array(range(ntrain - nval, ntrain), dtype='i')
    test = np.array(range(ntrain, ntrain + ntest), dtype='i')
    aucs = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # draw a fresh gaussian sample (plus outliers) and shuffle the columns
        data, y = generate_data(ntrain + ntest, outlier_frac=anom_frac)
        inds = np.random.permutation(range(ntest + ntrain))
        data = data[:, inds]
        y = y[inds]
        for i in range(len(nus)):
            for k in range(len(ks)):
                # one random initialization shared by all sigma settings
                membership = np.random.randint(0, ks[k], y.size)

                max_auc = -1.0
                max_val_auc = -1.0
                for sigma in sigmas:
                    # build cluster svdd
                    if use_kernels:
                        svdds = [SvddDualQP('rbf', sigma, nus[i])
                                 for _ in range(ks[k])]
                    else:
                        svdds = [SvddPrimalSGD(nus[i]) for _ in range(ks[k])]

                    svdd = ClusterSvdd(svdds)
                    svdd.fit(data[:, train], init_membership=membership[train])
                    # model selection on the validation split
                    scores_val, _ = svdd.predict(data[:, val])
                    fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0.,
                                                             dtype='i'),
                                                    scores_val,
                                                    pos_label=1)
                    curr_auc = metrics.auc(fpr, tpr)
                    if curr_auc >= max_val_auc:
                        # keep the test accuracy of the best validation model
                        scores, _ = svdd.predict(data[:, test])
                        fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0.,
                                                                 dtype='i'),
                                                        scores,
                                                        pos_label=1)
                        max_auc = metrics.auc(fpr, tpr)
                        max_val_auc = curr_auc
                aucs[n, i, k] = max_auc
    # means and standard deviations over repetitions
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    # print() works in both Python 2 and 3 (bare statement is Py2-only)
    print('AUCs')
    print(maucs)
    print('Stds')
    print(saucs)
    # save results
    np.savez(res_filename,
             maucs=maucs,
             saucs=saucs,
             outlier_frac=nus,
             ntrain=ntrain,
             ntest=ntest,
             reps=reps,
             nus=nus,
             ks=ks,
             sigmas=sigmas)
# Example n. 9
# 0
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD


if __name__ == '__main__':
    nu = 0.15  # outlier fraction

    # generate raw training data
    # NOTE(review): 2 x 1000 matrix — features in rows, samples in columns
    Dtrain = np.random.randn(2, 1000)
    # scale so the largest absolute entry is 1
    Dtrain /= np.max(np.abs(Dtrain))

    # train dual svdd ('linear' kernel; second argument is a kernel/model
    # parameter — presumably unused for the linear kernel, verify in SvddDualQP)
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)

    # train primal svdd via stochastic gradient descent
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)

    # print solutions: objective value (pobj) and squared radius (radius2)
    print('\n  dual-svdd: obj={0}  T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0}  T={1}.\n'.format(psvdd.pobj, psvdd.radius2))

    # generate test data grid covering roughly [-2, 2] x [-2, 2]
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    # flatten the grid into a 2 x (sx*sy) matrix of 2-D test points
    Xf = np.reshape(X,(1, sx*sy))
    Yf = np.reshape(Y,(1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)
def evaluate(nu,
             k,
             data,
             y,
             train,
             test,
             use_kernel=False,
             kparam=0.1,
             plot=False):
    """Train one ClusterSvdd and report test ARI and AUC, optionally plotting.

    Args:
        nu: outlier-fraction parameter shared by all SVDDs.
        k: number of clusters (one SVDD model each).
        data: data matrix with samples in columns.
        y: labels; normal classes are positive (1, 2, 3, ...),
            anomalies are negative (collapsed to -1 for scoring).
        train: column indices of the training split.
        test: column indices of the test split.
        use_kernel: if True use dual QP SVDDs with an RBF kernel.
        kparam: RBF kernel width (only used when use_kernel is True).
        plot: if True scatter the data and draw each SVDD's ball.

    Returns:
        (ari, auc): adjusted Rand index and ROC AUC on the test split.
    """
    # fix the initialization for all methods
    membership = np.random.randint(0, k, y.size)
    if use_kernel:
        svdds = [SvddDualQP('rbf', kparam, nu) for _ in range(k)]
    else:
        svdds = [SvddPrimalSGD(nu) for _ in range(k)]

    svdd = ClusterSvdd(svdds)
    svdd.fit(data[:, train].copy(),
             max_iter=60,
             init_membership=membership[train])
    scores, classes = svdd.predict(data[:, test].copy())

    # print() works in both Python 2 and 3 (bare statement is Py2-only)
    print(y[test])
    # numpy fancy indexing returns a copy, so the caller's y is untouched
    true_lbl = y[test]
    true_lbl[true_lbl < 0] = -1  # convert outliers to single outlier class
    ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    if nu < 1.0:
        # mark samples the model rejects (positive score) as outliers, too
        classes[scores > 0.] = -1
        ari = metrics.cluster.adjusted_rand_score(true_lbl, classes)
    print('ARI=', ari)

    # anomaly detection accuracy (labels < 0 are anomalies)
    fpr, tpr, _ = metrics.roc_curve(y[test] < 0., scores, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    print('AUC=', auc)

    if plot:
        plt.figure(1)
        # anomalies in green, nominal samples in red
        anom_inds = np.where(y == -1)[0]
        plt.plot(data[0, anom_inds], data[1, anom_inds], '.g', markersize=2)
        nom_inds = np.where(y != -1)[0]
        plt.plot(data[0, nom_inds], data[1, nom_inds], '.r', markersize=6)

        # draw each SVDD's center and ball (only models exposing a center 'c')
        an = np.linspace(0, 2 * np.pi, 100)
        for l in range(k):
            r = np.sqrt(svdd.svdds[l].radius2)
            if hasattr(svdd.svdds[l], 'c'):
                plt.plot(svdd.svdds[l].c[0],
                         svdd.svdds[l].c[1],
                         'xb',
                         markersize=6,
                         linewidth=2,
                         alpha=0.7)
                plt.plot(r * np.sin(an) + svdd.svdds[l].c[0],
                         r * np.cos(an) + svdd.svdds[l].c[1],
                         '-b',
                         linewidth=2,
                         alpha=0.7)
        plt.show()
    return ari, auc