def train(cluster, data, nu, membership): svdds = [] for c in range(cluster): svdds.append(SvddPrimalSGD(nu)) svdd = ClusterSvdd(svdds, nu=nu) cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40) print cinds return svdd, cinds
def train(cluster, data, nu, membership, use_primal=True): svdds = [] for c in range(cluster): if use_primal: svdds.append(SvddPrimalSGD(nu)) else: svdds.append(SvddDualQP('rbf', 0.4, nu)) svdd = ClusterSvdd(svdds, nu=nu) cinds = svdd.fit(data, init_membership=membership, max_svdd_iter=10000, max_iter=40) print cinds return svdd, cinds
def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test, use_primal=True): train = np.array(range(num_train), dtype='i') test = np.array(range(num_train, num_train + num_test), dtype='i') aris = np.zeros((reps, len(nus), len(ks))) for n in range(reps): # generate new gaussians data, y = generate_data(num_train + num_test, outlier_frac=outlier_frac) inds = np.random.permutation(range(num_test + num_train)) data = data[:, inds] y = y[inds] for k in range(len(ks)): # fix the initialization for all methods membership = np.random.randint(0, ks[k], y.size) for i in range(len(nus)): svdds = list() for l in range(ks[k]): if use_primal: svdds.append(SvddPrimalSGD(nus[i])) else: svdds.append(SvddDualQP('rbf', 10.0, nus[i])) svdd = ClusterSvdd(svdds) svdd.fit(data[:, train].copy(), init_membership=membership[train]) _, classes = svdd.predict(data[:, test].copy()) # evaluate clustering abilities inds = np.where(y[test] >= 0)[0] aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds]) print aris print '' maris = np.mean(aris, axis=0) saris = np.std(aris, axis=0) print np.mean(aris, axis=0) print np.std(aris, axis=0) np.savez(res_filename, maris=maris, saris=saris, outlier_frac=outlier_frac, ntrain=num_train, ntest=num_test, reps=reps, nus=nus)
def evaluate(res_filename, dataset, nus, ks, outlier_frac, reps, num_train, num_val, num_test, use_kernels=False):
    """Evaluate ClusterSVDD on a real data set over a (nu, k) grid.

    For each repetition the data set is re-loaded/shuffled; for every
    (nu, k) pair a ClusterSvdd is trained and scored on both the test and
    the validation split with ARI (clustering) and AUC (anomaly detection).
    After all repetitions, validation scores are used to select the best
    configurations and the corresponding TEST scores are reported.

    Returns
    -------
    (res, res_stds) : 4-vectors holding [best SVDD AUC, best ClusterSVDD
        AUC, best k-means ARI, best ClusterSVDD ARI] and their stds.
        (Row/column meaning relies on the caller's grid layout — see the
        inline "assume col 0 is k=1" comments.)
    """
    # index arrays: validation split is carved out of the training block
    train = np.array(range(num_train - num_val), dtype='i')
    val = np.array(range(num_train - num_val, num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')
    # result tensors: reps x len(nus) x len(ks)
    aris = np.zeros((reps, len(nus), len(ks)))
    aucs = np.zeros((reps, len(nus), len(ks)))
    val_aris = np.zeros((reps, len(nus), len(ks)))
    val_aucs = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        # data, y = generate_data(num_train + num_test, outlier_frac=outlier_frac)
        inds = np.random.permutation(range(num_test + num_train))
        # NOTE(review): only the first num_train permuted indices are passed
        # to the loader — presumably to control which samples may be
        # outliers; confirm against load_data_set's contract.
        data, y = load_data_set(dataset, num_train + num_test, outlier_frac, inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    if use_kernels:
                        svdds.append(SvddDualQP('rbf', 20.0, nus[i]))
                    else:
                        svdds.append(SvddPrimalSGD(nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train].copy(), init_membership=membership[train])
                # test error
                scores, classes = svdd.predict(data[:, test].copy())
                # evaluate clustering abilities
                # inds = np.where((y[test] >= 0))[0]
                # aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[test[inds]], classes[inds])
                ari = metrics.cluster.adjusted_rand_score(y[test], classes)
                if nus[i] < 1.0:
                    # for nu < 1 restrict ARI to predicted inliers (score <= 0)
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(
                        y[test[inds]], classes[inds])
                aris[n, i, k] = ari
                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1)
                aucs[n, i, k] = metrics.auc(fpr, tpr)
                # validation error (same protocol as the test split)
                scores, classes = svdd.predict(data[:, val].copy())
                # evaluate clustering abilities
                # inds = np.where((y[val] >= 0))[0]
                # val_aris[n, i, k] = metrics.cluster.adjusted_rand_score(y[val[inds]], classes[inds])
                ari = metrics.cluster.adjusted_rand_score(y[val], classes)
                if nus[i] < 1.0:
                    inds = np.where(scores <= 0.)[0]
                    ari = metrics.cluster.adjusted_rand_score(
                        y[val[inds]], classes[inds])
                val_aris[n, i, k] = ari
                # ...and anomaly detection accuracy
                fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores, pos_label=1)
                val_aucs[n, i, k] = metrics.auc(fpr, tpr)
    # ---- summary statistics over repetitions ----
    print '---------------------------------------------------'
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print '(Test) ARI:'
    print np.mean(aris, axis=0)
    print np.std(aris, axis=0)
    val_maris = np.mean(val_aris, axis=0)
    val_saris = np.std(val_aris, axis=0)
    print '(Val) ARI:'
    print val_maris
    print val_saris
    print '---------------------------------------------------'
    maucs = np.mean(aucs, axis=0)
    saucs = np.std(aucs, axis=0)
    print '(Test) AUC:'
    print np.mean(aucs, axis=0)
    print np.std(aucs, axis=0)
    val_maucs = np.mean(val_aucs, axis=0)
    val_saucs = np.std(val_aucs, axis=0)
    print '(Val) AUC:'
    print val_maucs
    print val_saucs
    print '---------------------------------------------------'
    # ---- model selection on validation, report test numbers ----
    res = np.zeros(4)
    res_stds = np.zeros(4)
    # best svdd result (assume col 0 is k=1)
    svdd_ind = np.argmax(val_maucs[:, 0])
    print 'SVDD best AUC={0}'.format(maucs[svdd_ind, 0])
    csvdd_ind = np.argmax(val_maucs)
    i1, i2 = np.unravel_index(csvdd_ind, maucs.shape)
    print 'ClusterSVDD best AUC={0}'.format(maucs[i1, i2])
    res[0] = maucs[svdd_ind, 0]
    res_stds[0] = saucs[svdd_ind, 0]
    res[1] = maucs[i1, i2]
    res_stds[1] = saucs[i1, i2]
    # best svdd result (assume col 0 is k=1)
    # NOTE(review): row 0 is treated as the k-means baseline — presumably
    # nus[0] == 1.0 turns ClusterSVDD into k-means; confirm with callers.
    km_ind = np.argmax(val_maris[0, :])
    print 'k-means best ARI={0}'.format(maris[0, km_ind])
    csvdd_ind = np.argmax(val_maris)
    i1, i2 = np.unravel_index(csvdd_ind, maris.shape)
    print 'ClusterSVDD best ARI={0}'.format(maris[i1, i2])
    res[2] = maris[0, km_ind]
    res_stds[2] = saris[0, km_ind]
    res[3] = maris[i1, i2]
    res_stds[3] = saris[i1, i2]
    print '---------------------------------------------------'
    return res, res_stds
def evaluate(res_filename, nus, ks, outlier_frac, reps, num_train, num_test):
    """Evaluate ClusterSVDD on structured data: clustering ARI plus
    structured-prediction quality (normalized Hamming distance).

    For each repetition new structured data (X, S, y) is generated, mapped
    to feature vectors via preprocess_training_data, and a ClusterSvdd with
    primal-SGD components is trained for every (nu, k) pair. Test samples
    are decoded with preprocess_test_data and scored with ARI (nominal
    samples only) and hamming_loss. Means/stds are written via np.savez.
    """
    train = np.array(range(num_train), dtype='i')
    test = np.array(range(num_train, num_train + num_test), dtype='i')
    # result tensors: reps x len(nus) x len(ks)
    aris = np.zeros((reps, len(nus), len(ks)))
    loss = np.zeros((reps, len(nus), len(ks)))
    for n in range(reps):
        # generate new gaussians
        X, S, y = generate_data(num_train + num_test, cluster=3, outlier_frac=outlier_frac, dims=3, plot=False)
        inds = np.random.permutation(range(num_test + num_train))
        # build training features from the (shuffled) structured inputs
        data = preprocess_training_data(X, S, inds[:num_train])
        data = data[:, inds]
        y = y[inds]
        # debug output of the shuffled data/labels
        print data
        print y
        for k in range(len(ks)):
            # fix the initialization for all methods
            membership = np.random.randint(0, ks[k], y.size)
            for i in range(len(nus)):
                svdds = list()
                for l in range(ks[k]):
                    svdds.append(SvddPrimalSGD(nus[i]))
                svdd = ClusterSvdd(svdds)
                svdd.fit(data[:, train], init_membership=membership[train])
                # time the structured test-set decoding step
                stime = time.time()
                pred_phis, true_states, pred_states = preprocess_test_data(
                    svdd, X, S, inds[num_train:])
                _, classes = svdd.predict(pred_phis)
                print '---------------- TIME'
                print time.time() - stime
                print '----------------'
                # evaluate clustering abilities (nominal samples only)
                ninds = np.where(y[test] >= 0)[0]
                aris[n, i, k] = metrics.cluster.adjusted_rand_score(
                    y[test[ninds]], classes[ninds])
                # evaluate structured prediction accuracy
                loss[n, i, k] = hamming_loss(true_states, pred_states)
                print loss[n, i, k]
    # summary statistics over repetitions
    maris = np.mean(aris, axis=0)
    saris = np.std(aris, axis=0)
    print 'ARI'
    print np.mean(aris, axis=0)
    print np.std(aris, axis=0)
    mloss = np.mean(loss, axis=0)
    sloss = np.std(loss, axis=0)
    print 'Normalized Hamming Distance'
    print np.mean(loss, axis=0)
    print np.std(loss, axis=0)
    np.savez(res_filename, maris=maris, saris=saris, mloss=mloss, sloss=sloss,
             outlier_frac=outlier_frac, ntrain=num_train, ntest=num_test, reps=reps, nus=nus)
from ClusterSVDD.svdd_dual_qp import SvddDualQP
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD

# Demo script: train a dual-QP SVDD and a primal-SGD SVDD on the same
# random 2-d data and compare their objectives/radii, then build a test
# grid (Dtest).  NOTE(review): the script appears to continue past this
# excerpt (Dtest is built but not yet used here).
if __name__ == '__main__':
    nu = 0.15  # outlier fraction
    # generate raw training data: 2 x 1000 gaussian samples, scaled into [-1, 1]
    Dtrain = np.random.randn(2, 1000)
    Dtrain /= np.max(np.abs(Dtrain))
    # train dual svdd (linear kernel; 0.1 is the kernel parameter argument)
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)
    # train primal svdd
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)
    # print solutions (primal objective and squared radius of each model)
    print('\n dual-svdd: obj={0} T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0} T={1}.\n'.format(psvdd.pobj, psvdd.radius2))
    # generate test data grid covering [-2-delta, 2+delta]^2
    delta = 0.1
    x = np.arange(-2.0 - delta, 2.0 + delta, delta)
    y = np.arange(-2.0 - delta, 2.0 + delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    # flatten the grid into a 2 x (sx*sy) sample matrix
    Xf = np.reshape(X, (1, sx * sy))
    Yf = np.reshape(Y, (1, sx * sy))
    Dtest = np.append(Xf, Yf, axis=0)
# NOTE(review): fragment of a larger script — `num_train`, `num_val`,
# `num_test`, `k`, `nu`, `nuu`, `run`, `use_kernels`, `num_class`,
# `data_generator` and `compute_accuracy` are defined outside this span.
# Split indices: [train | val | test] laid out contiguously.
train = np.array(range(num_train), dtype='i')
val = np.array(range(num_train, num_train + num_val), dtype='i')
test = np.array(range(num_train + num_val, num_train + num_val + num_test), dtype='i')
dg = data_generator()
# NOTE(review): `inds` is computed but never applied to `data`/`y` in the
# visible span — either the shuffle is missing or it happens later; verify.
inds = np.random.permutation(range(num_test + num_train + num_val))
data, y = dg.sample_with_label(int(num_test + num_train + num_val))
data = data.T  # samples are expected column-wise below (data[:, idx])
# random initial cluster assignment shared by all configurations
membership = np.random.randint(0, k, y.size)
svdds = list()
for l in range(k):
    if use_kernels:
        svdds.append(SvddDualQP('rbf', 20.0, nu))
    else:
        svdds.append(SvddPrimalSGD(nu))
svdd = ClusterSvdd(svdds)
svdd.fit(data[:, train].copy(), max_iter=100, max_svdd_iter=100000, init_membership=membership[train])
# persist the trained model, then reload it (round-trip sanity check).
# NOTE(review): file handles from open() are never closed here — consider
# `with open(...)` blocks; also `nuu` vs `nu` in the file name — confirm.
file_name = 'csvdd_' + str(k) + 'c_nu0' + str(nuu) + '_ring_' + str(
    run) + '.sav'
pickle.dump(svdd, open(file_name, 'wb'))
svdd = pickle.load(open(file_name, 'rb'))
# test error
print(data.shape, test[-1])
scores, classes = svdd.predict(data[:, test].copy())
ari = metrics.cluster.adjusted_rand_score(y[test], classes)
test_acc = compute_accuracy(y[test], classes, num_class)
def evaluate(res_filename, nus, sigmas, ks, reps, ntrain, ntest, nval, use_kernels, anom_frac): train = np.array(range(ntrain - nval), dtype='i') val = np.array(range(ntrain - nval, ntrain), dtype='i') test = np.array(range(ntrain, ntrain + ntest), dtype='i') aucs = np.zeros((reps, len(nus), len(ks))) for n in range(reps): # generate new gaussians data, y = generate_data(ntrain + ntest, outlier_frac=anom_frac) inds = np.random.permutation(range(ntest + ntrain)) data = data[:, inds] y = y[inds] for i in range(len(nus)): for k in range(len(ks)): # fix the initialization for all methods membership = np.random.randint(0, ks[k], y.size) max_auc = -1.0 max_val_auc = -1.0 for sigma in sigmas: # build cluster svdd svdds = list() for l in range(ks[k]): if use_kernels: svdds.append(SvddDualQP('rbf', sigma, nus[i])) else: svdds.append(SvddPrimalSGD(nus[i])) svdd = ClusterSvdd(svdds) svdd.fit(data[:, train], init_membership=membership[train]) scores_val, _ = svdd.predict(data[:, val]) # test on validation data fpr, tpr, _ = metrics.roc_curve(np.array(y[val] < 0., dtype='i'), scores_val, pos_label=1) curr_auc = metrics.auc(fpr, tpr) if curr_auc >= max_val_auc: # store test data accuracy scores, _ = svdd.predict(data[:, test]) fpr, tpr, _ = metrics.roc_curve(np.array(y[test] < 0., dtype='i'), scores, pos_label=1) max_auc = metrics.auc(fpr, tpr) max_val_auc = curr_auc aucs[n, i, k] = max_auc # means and standard deviations maucs = np.mean(aucs, axis=0) saucs = np.std(aucs, axis=0) print 'AUCs' print np.mean(aucs, axis=0) print 'Stds' print np.std(aucs, axis=0) # save results np.savez(res_filename, maucs=maucs, saucs=saucs, outlier_frac=nus, ntrain=ntrain, ntest=ntest, reps=reps, nus=nus, ks=ks, sigmas=sigmas)
from ClusterSVDD.svdd_primal_sgd import SvddPrimalSGD

# Demo script: compare a dual-QP SVDD against a primal-SGD SVDD on random
# 2-d data, then build a test grid (Dtest).
# NOTE(review): SvddDualQP is used below but its import is not visible in
# this span — confirm it is imported earlier in the file. The script also
# appears to continue past this excerpt (Dtest is built but unused here).
if __name__ == '__main__':
    nu = 0.15  # outlier fraction
    # generate raw training data: 2 x 1000 gaussian samples, scaled into [-1, 1]
    Dtrain = np.random.randn(2, 1000)
    Dtrain /= np.max(np.abs(Dtrain))
    # train dual svdd (linear kernel; 0.1 is the kernel parameter argument)
    svdd = SvddDualQP('linear', 0.1, nu)
    svdd.fit(Dtrain)
    # train primal svdd
    psvdd = SvddPrimalSGD(nu)
    psvdd.fit(Dtrain, max_iter=1000, prec=1e-4)
    # print solutions (primal objective and squared radius of each model)
    print('\n dual-svdd: obj={0} T={1}.'.format(svdd.pobj, svdd.radius2))
    print('primal-svdd: obj={0} T={1}.\n'.format(psvdd.pobj, psvdd.radius2))
    # generate test data grid covering [-2-delta, 2+delta]^2
    delta = 0.1
    x = np.arange(-2.0-delta, 2.0+delta, delta)
    y = np.arange(-2.0-delta, 2.0+delta, delta)
    X, Y = np.meshgrid(x, y)
    (sx, sy) = X.shape
    # flatten the grid into a 2 x (sx*sy) sample matrix
    Xf = np.reshape(X,(1, sx*sy))
    Yf = np.reshape(Y,(1, sx*sy))
    Dtest = np.append(Xf, Yf, axis=0)
def evaluate(nu, k, data, y, train, test, use_kernel=False, kparam=0.1, plot=False): # fix the initialization for all methods membership = np.random.randint(0, k, y.size) svdds = list() for l in range(k): if use_kernel: svdds.append(SvddDualQP('rbf', kparam, nu)) else: svdds.append(SvddPrimalSGD(nu)) svdd = ClusterSvdd(svdds) svdd.fit(data[:, train].copy(), max_iter=60, init_membership=membership[train]) scores, classes = svdd.predict(data[:, test].copy()) # normal classes are positive (e.g. 1,2,3,..) anomalous class is -1 print y[test] true_lbl = y[test] true_lbl[true_lbl < 0] = -1 # convert outliers to single outlier class ari = metrics.cluster.adjusted_rand_score(true_lbl, classes) if nu < 1.0: classes[scores > 0.] = -1 ari = metrics.cluster.adjusted_rand_score(true_lbl, classes) print 'ARI=', ari fpr, tpr, _ = metrics.roc_curve(y[test] < 0., scores, pos_label=1) auc = metrics.auc( fpr, tpr, ) print 'AUC=', auc if plot: plt.figure(1) anom_inds = np.where(y == -1)[0] plt.plot(data[0, anom_inds], data[1, anom_inds], '.g', markersize=2) nom_inds = np.where(y != -1)[0] plt.plot(data[0, nom_inds], data[1, nom_inds], '.r', markersize=6) an = np.linspace(0, 2 * np.pi, 100) for l in range(k): r = np.sqrt(svdd.svdds[l].radius2) if hasattr(svdd.svdds[l], 'c'): plt.plot(svdd.svdds[l].c[0], svdd.svdds[l].c[1], 'xb', markersize=6, linewidth=2, alpha=0.7) plt.plot(r * np.sin(an) + svdd.svdds[l].c[0], r * np.cos(an) + svdd.svdds[l].c[1], '-b', linewidth=2, alpha=0.7) plt.show() return ari, auc