test = alt_df.iloc[14 + os:24 + os]
test = test.append(rand_signs)
#print(test)
#test=pd.DataFrame(test.append(alt_df.iloc[888]))
#for i in range(1,21):
#    test=test.append(alt_df.iloc[i*30])

test = test.drop("Writer_no", axis=1)
test = test.drop("Sample_no", axis=1)

print(test.shape)

# In[376]:

clf = svm.OneClassSVM(nu=best_nu, kernel="rbf", gamma=best_gamma)
clf.fit(data)
preds = clf.predict(test)
print(preds)

pdf1 = clf.decision_function(test[0:10])
pdf2 = clf.decision_function(test[10:])
pdf = clf.decision_function(test)

# ## Probability Density Function of Real Signatures

# In[377]:

pd.DataFrame(pdf1).plot(kind="density", figsize=(5, 5))
plt.show()
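For comparison, the decision scores of the appended random signatures (pdf2, computed above) can be plotted the same way; a minimal sketch:

pd.DataFrame(pdf2).plot(kind="density", figsize=(5, 5))
plt.show()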
Example #2
    # read binary data
    feature_folder = os.listdir(filepath)
    length = len(feature_folder)
    feature_folder.sort(key=lambda i: int(re.match(r'(\d+)', i).group()))
    for id in feature_folder:
        filepath_ = os.path.join(filepath, id)
        # read all bytes into a string
        with open(filepath_, "rb") as f:
            s = f.read()
        (n, c, l, h, w) = array.array("i", s[:20])
        feature_vec = np.array(array.array("f", s[20:]))
        li.append(feature_vec)
X_train = np.array(li)
# fit the model
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
clf.fit(X_train)
# predict() returns -1 (abnormal) or 1 (normal): 1 means the point belongs to this cluster, -1 means it does not.
y_pred_train = clf.predict(X_train)
print(y_pred_train)
normal = X_train[y_pred_train == 1]
abnormal = X_train[y_pred_train == -1]
print(normal)
print(abnormal)
print(normal.shape)
print(abnormal.shape)

print("labels_true")
print(labels_true)

plt.plot(normal[:, 0], normal[:, 1], 'bx')
Example #3
from sklearn import svm

xx, yy = np.meshgrid(np.linspace(-10, 10, 500), np.linspace(-10, 10, 500))
# Generate train data
X = 0.3 * np.random.randn(100, 2)
X_train = np.r_[X + 5, X - 5]
X_train1 = np.r_[X, X]
# Generate some regular novel observations
X = 0.3 * np.random.randn(20, 2)
X_test = np.r_[X + 5, X - 5]
X_test1 = np.r_[X, X]
# Generate some abnormal novel observations
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = svm.OneClassSVM(nu=0.1, kernel="linear", gamma=0.1)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_test1 = clf.predict(X_test1)
y_pred_outliers = clf.predict(X_outliers)
n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

# plot the line, the points, and the nearest vectors to the plane
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("Novelty Detection")
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), 0, 7), cmap=plt.cm.PuBu)
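The upstream scikit-learn novelty-detection demo that this snippet follows goes on to draw the learned frontier and scatter the three point sets; a sketch along those lines, assuming the variables defined above:

a = plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
plt.contourf(xx, yy, Z, levels=[0, Z.max()], colors='palevioletred')
b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white', s=20, edgecolors='k')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='blueviolet', s=20, edgecolors='k')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='gold', s=20, edgecolors='k')
plt.axis('tight')
plt.show()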
Example #4
def train(train_set):
    clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
    clf.fit(train_set)

    return clf
Example #5
import matplotlib.font_manager
from scipy import stats

from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM":
    svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                    kernel="rbf",
                    gamma=0.1),
    "robust covariance estimator":
    EllipticEnvelope(contamination=.1),
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
Example #6
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2)
# split X_train into normal and outlier rows

X_train_normal = X_train[X_train['label_filled'] == 0].drop("label_filled",
                                                            axis=1,
                                                            inplace=False)
#X_train_outliers = X_train[X_train['label_filled'] == 1].drop("label_filled",axis=1, inplace=False)
X_test = X_test.drop("label_filled", axis=1, inplace=False)
X_train = X_train.drop("label_filled", axis=1, inplace=False)

print("Load data done.")

#print(X_train.shape,X_train_normal.shape,X_train_outliers.shape)
#fit model
clf = svm.OneClassSVM(nu=0.05, kernel="rbf", gamma="auto")  # nu is an upper bound on the fraction of outliers
clf.fit(X_train_normal)

#predict
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

print(y_test[:10])
print(y_pred_test[:10])

# map predicted labels from (1, -1) to (0, 1)
y_pred_train = np.where(y_pred_train > 0, 0, 1)
y_pred_test = np.where(y_pred_test > 0, 0, 1)
#
#print result
print("train data classification report: ")
Example #7
def OC_SVM_linear(dataset, model_type, class_number, hyper_para):

	_, _, relu, mean, cov, imagenet_mean, imagenet_std, _ = get_fuv(hyper_para, model_type)

	if(hyper_para.verbose==True):
		print('Loading dataset '+dataset+'...')

	train_data, test_data, test_label = load_dataset(dataset, class_number, imagenet_mean, imagenet_std, hyper_para)

	if(hyper_para.verbose==True):
		print(dataset+' dataset loaded.')

	no_train_data = np.shape(train_data.numpy())[0]
	no_test_data  = np.shape(test_data.numpy())[0]

	### choose one network which produces D dimensional features
	model = choose_network(model_type, hyper_para.pre_trained_flag)

	### training on gpu
	if(hyper_para.gpu_flag):
		relu.cuda()
		model.cuda()
	
	model.eval()
	relu.eval()

	if(hyper_para.verbose==True):
		print('Extracting training features...')

	train_features = np.memmap('../../temp_files/train_features_temp.bin', dtype='float32', mode='w+', shape=(no_train_data,hyper_para.D))
	train_features = torch.from_numpy(train_features)

	for i in range(no_train_data):
		train_features[i:(i+1)] = (model(torch.autograd.Variable(train_data[i:(i+1)].cuda().contiguous().float(), volatile=True)).float()).data.cpu()
	train_data = None

	if(hyper_para.verbose==True):
		print('Features extracted.')

	if(hyper_para.verbose==True):
		print('Training one class SVM with linear kernel...')
			
	# train one-class svm
	oc_svm_clf = svm.OneClassSVM(kernel='linear', nu=float(hyper_para.N))
	# oc_svm_clf.fit(train_features)
	oc_svm_clf.fit(train_features.numpy())

	if(hyper_para.verbose==True):
		print('One class SVM with Linear kernel trained.')

	## test on the test set
	test_features = np.memmap('../../temp_files/test_features_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,hyper_para.D))
	test_scores   = np.memmap('../../temp_files/test_scores_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,1))
	test_features = torch.from_numpy(test_features)

	k=0
	mean_kwn = np.zeros( (no_test_data,1) )
	for j in range(no_test_data):
		temp = (model(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float(), volatile=True)).float())
		test_features[k:(k+1)] = temp.data.cpu()
		temp 				   = np.reshape((temp).data.cpu().numpy(), (1, hyper_para.D))
		test_scores[k:(k+1)]   = oc_svm_clf.decision_function(temp)[0]
		
		k = k+1

	test_features  = test_features.numpy()
	train_features = train_features.numpy()

	fpr, tpr, thresholds = metrics.roc_curve(test_label, test_scores)
	
	area_under_curve = metrics.auc(fpr, tpr)
	
	joblib.dump(oc_svm_clf,'../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number)+'/'+model_type+'_OCSVMlin_'+str(hyper_para.N)+'.pkl')

	scipy.io.savemat('../../save_folder/results/'+dataset+'/'+str(class_number) +'/'+ model_type+'_OCSVMlin_'+str(hyper_para.N)+'.mat',
													{ 'train_features':train_features, 'test_features':test_features, 'test_label':test_label, 'test_scores':test_scores    })

	return area_under_curve
Example #8
import numpy as np
from sklearn import svm
import cv2
import calHistogramOpticalFlow as chof

train_path = "/home/kun/data/UCSD/UCSDped1/Train/train_encoder_feature/train_encoder_patch_104.npy"
test_path = "/home/kun/data/UCSD/UCSDped1/Test/test_encoder_feature/test_encoder_patch_104.npy"

# train_path = "UCSD/UCSDped1/train/train_feature/train_patch_90.npy"
# test_path = "UCSD/UCSDped1/test/test_feature/test_patch_90.npy"

train_x = np.load(train_path)
test_x = np.load(test_path)

clf = svm.OneClassSVM(nu=0.01, kernel='rbf', gamma=0.6)
clf.fit(train_x)

y_pred_train = clf.predict(train_x)
y_pred_test = clf.predict(test_x)

n_error_train = y_pred_train[y_pred_train == -1].size
n_error_test = y_pred_test[y_pred_test == -1].size

print(n_error_train)

# for i in range(y_pred_test.shape[0]):
#     if y_pred_test[i] == -1:
#         file = i / 195 + 1
#         frame = i % 195 + 1
#         image_path = "/home/kun/data/UCSD/UCSDped1/Test/Test%03d/%03d.tif"%(file, frame)
Example #9
X = X.transpose()

# Remove rows with 0 (NA) for wetCode
X_train = X[X[:, -1] != 0]

# Remove non-finite values
X_train = X_train[np.isfinite(X_train).all(axis=1)]

# Split into variables (X) and class (y)
y_train = X_train[:, -1]
X_train = X_train[:, 0:-1]

# Train the one-class SVM classifier
print('define clf')
# clf= svm.OneClassSVM(kernel='rbf',nu=0.2,gamma='auto',verbose=False)
clf = svm.OneClassSVM(kernel='poly', nu=0.1, gamma='auto', verbose=True)

print('fit clf')
clf.fit(X_train, y_train)

# Set NaN values to 0
X = np.where(np.isfinite(X), X, 0)

# Apply classification
print('apply classification ')
predictClass = clf.predict(X[:, 0:-1])

# Write out data to RAT
print('write RAT')
rat.writeColumn(ratDataset, 'predictClass', predictClass)
ratDataset = None
Example #10
Xtest.append(dataset_task1[int(0.6*dataset_task1.shape[0])+1:dataset_task1.shape[0]])
Xtrain.append(dataset_task2[0:int(0.6*dataset_task2.shape[0])])
Xtest.append(dataset_task2[int(0.6*dataset_task2.shape[0])+1:dataset_task2.shape[0]])




for param_kernel in kernel:
    save_path = f'{path}/{param_kernel}/'
    for param_nu in nu:
        for param_gamma in gamma:            
            if(param_kernel!='poly'):
                print(f'kernel={param_kernel} - gamma={param_gamma} - nu={param_nu}')
                clfs = []
                for i in range(2):
                    clfs.append(svm.OneClassSVM(nu=param_nu, kernel=param_kernel, gamma=param_gamma))
                    clfs[i].fit(Xtrain[i])
                    pkl_filename = f'{save_path}/svm_model_3seq_T{i}.pkl'
                    with open(pkl_filename, 'wb') as file:
                        pickle.dump(clfs[i], file)
                conf_matrix = compute_confusion_matrix(clfs)
                fig = plt.gcf()
                ax = plt.subplot()
                sns.heatmap(conf_matrix, annot=True, ax=ax, cmap="YlGnBu")
                ax.xaxis.set_ticklabels(['T1', 'T2'])
                ax.yaxis.set_ticklabels(['T1', 'T2'])
                plt.savefig(f'{save_path}/FINAL_n{param_nu}_r{param_gamma}.png')
                plt.clf()                
            else:
                for param_degree in degree:
                    print(f'kernel={param_kernel} - gamma={param_gamma} - nu={param_nu} - d={param_degree}')
                    clfs = []
Example #11
    def GetLabel(self, X):
        '''------------------OSVM--------------------------'''

        from sklearn import svm
        # use the same dataset

        clf = svm.OneClassSVM(nu=0.05, kernel="rbf", gamma=0.1)
        clf.fit(X)

        svm.OneClassSVM(cache_size=200,
                        coef0=0.0,
                        degree=3,
                        gamma=0.1,
                        kernel='rbf',
                        max_iter=-1,
                        nu=0.05,
                        random_state=None,
                        shrinking=True,
                        tol=0.001,
                        verbose=False)

        osvm = clf.predict(X)

        # inliers are labeled 1, outliers are labeled -1
        normal = X[osvm == 1]
        abnormal = X[osvm == -1]
        '''---------------------IForest--------------------------'''
        from sklearn.ensemble import IsolationForest
        data = pd.DataFrame(X, columns=["Price", "Time"])
        # train isolation forest
        model = IsolationForest(contamination=0.1)
        model.fit(data)
        data['IForest'] = pd.Series(model.predict(data))

        # visualization
        '''---------------------KNN--------------------------'''
        # train kNN detector
        from pyod.models.knn import KNN
        clf_name = 'KNN'
        clf = KNN()
        clf.fit(X)
        # get the prediction labels and outlier scores of the training data
        ss = clf.labels_  # binary labels (0: inliers, 1: outliers)
        #y_train_scores = clf.decision_scores_  # raw outlier scores

        data['OSVM'] = osvm
        data['KNN'] = ss

        # Convert KNN's 0/1 labels to match the other detectors' 1/-1 convention
        data.loc[(data.KNN == 0), 'KNN'] = '1'
        data.loc[(data.KNN == 1), 'KNN'] = '-1'

        #
        data['KNN'] = data['KNN'].astype(int)
        data['OSVM'] = data['OSVM'].astype(int)
        data['IForest'] = data['IForest'].astype(int)
        #data['RES']=data['RES'].astype(int)

        #
        data['RES'] = data.OSVM + data.IForest + data.KNN
        data.dtypes
        # Majority vote: after the relabelling below, RES is 1 only when
        # OSVM, IForest and KNN all voted +1 (i.e. the sum was 3)
        data.loc[data.RES == 1, 'RES'] = 0
        data.loc[data.RES == 3, 'RES'] = 1
        data.loc[data.RES == -3, 'RES'] = 0
        data.loc[data.RES == -1, 'RES'] = 0

        x = data.iloc[:, [0, 1]].values
        y = data.iloc[:, [5]].values
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            x, y, test_size=0.3, random_state=100)

        return self.X_train, self.X_test, self.y_train, self.y_test
Example #12
def sample_svdd(x_train,
                outlier_fraction=0.001,
                kernel_s=2,
                maxiter=1000,
                sample_size=10,
                resample_n=3,
                stop_tol=1e-6,
                n_iter=30,
                iter_history=True,
                seed=2513646):
    """
    Perform sampling based approximate svdd.
    Input Parameters:
        x_train : input data to train, must be a two-dim numpy array 
        kernel_s: the bandwidth for the Gaussian kernel; the kernel is
                  assumed to be of the form exp(-||x - y||^2 / (2 * kernel_s^2))
        sample_size: the size of each random sample 
        resample_n: take these many samples in each iteration, and merge the union of their support vectors with the
                    master, the method documented in the paper corresponds to resample_n = 1
        stop_tol: the tolerance value to detect convergence
        n_iter: the radius and center must be close to each other for this many consecutive iterations
                for convergence to be declared
        iter_history: flag to determine whether convergence history will be stored
        seed: seed value for the random number generator    
    Output:
        The output is a named tuple. If the output is denoted by res then:
            res.IterHist: a named tuple containing the iteration history
                res.IterHist.niter_ : number of iterations till convergence
                res.IterHist.radius_history_ : the iteration history for the radius
                res.IterHist.center_history_: the iteration history of the center
                res.IterHist.converged_ : convergence status flag
            res.Params: a named tuple containing the output parameters of the suggested SVDD 
                res.Params.sv_: the indices of the fitted support vectors
                res.Params.center_: final center point
                res.Params.radius_ : final radius
            res.OneClassSVM:
                A sklearn.svm.OneClassSVM instance corresponding to the result. Can be used for scoring.                                
    """

    # Only matrix input allowed
    if len(x_train.shape) != 2:
        print("ERROR: invalid x_train input found, expecting a matrix")
        raise ValueError

    #sanity checks
    if maxiter <= 0:
        print("ERROR: maxiter must be positive integer")
        raise ValueError

    nobs = x_train.shape[0]

    if nobs <= sample_size:
        print(
            "ERROR: sample size must be strictly smaller than number of observations in input data"
        )
        raise ValueError

    # convert kernel_s to gamma
    gamma, nu = 0.5 / (kernel_s * kernel_s), outlier_fraction
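    # e.g. kernel_s = 2 (the default) gives gamma = 0.5 / (2 * 2) = 0.125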

    if not np.isfinite(gamma) or not np.isfinite(nu) or (nu < 0) or (nu > 1):
        print("ERROR: Invalid kernel_s or outlier_fraction input")
        raise ValueError

    #if negative seed is provided use a system chosen seed
    np.random.seed(seed=seed if seed >= 0 else None)

    if iter_history:
        radius_history, center_history = np.empty(maxiter + 1), list()

    clf = None
    sv_ind_prev, radius_prev, center_prev = _do_one_class_svm_random(
        gamma, nu, x_train, sample_size)

    if iter_history:
        radius_history[0] = radius_prev
        center_history.append(center_prev)

    i, converged, iter_n = 0, 0, 0
    while i < maxiter:
        if converged: break

        sv_ind_local = _do_one_class_svm_random(gamma,
                                                nu,
                                                x_train,
                                                sample_size,
                                                compute_rc=False)
        for dummy1 in range(resample_n - 1):
            sv_ind_locals = _do_one_class_svm_random(gamma,
                                                     nu,
                                                     x_train,
                                                     sample_size,
                                                     compute_rc=False)
            sv_ind_local = np.union1d(sv_ind_locals, sv_ind_local)

        sv_ind_merge = np.union1d(sv_ind_local, sv_ind_prev)
        sv_ind_master, radius_master, center_master = _do_one_class_svm_sample(
            gamma, nu, x_train, sv_ind_merge)

        if iter_history:
            radius_history[i + 1] = radius_master
            center_history.append(center_master)

        iter_n = iter_n + 1 if np.fabs(
            radius_master -
            radius_prev) <= stop_tol * np.fabs(radius_prev) else 0
        if iter_n >= n_iter:
            converged = 1
        else:
            sv_ind_prev, center_prev, radius_prev = sv_ind_master, center_master, radius_master
        i += 1

    if iter_history:
        radius_history = radius_history[0:i + 1]
    niter = i + 1

    SampleSVDDRes = namedtuple("SampleSVDDRes", "Params  IterHist OneClassSVM")
    SampleSVDDParams = namedtuple("SampleSVDDParams", "sv_ center_ radius_")
    SampleSVDDIterHist = namedtuple(
        "SampleSVDDIterHist",
        "niter_ radius_history_ center_history_ converged_")

    params = SampleSVDDParams(sv_ind_master, center_master, radius_master)

    iterhist = None
    if iter_history:
        iterhist = SampleSVDDIterHist(niter, radius_history, center_history,
                                      converged)

    nsv = sv_ind_master.shape[0]
    clf = svm.OneClassSVM(gamma=gamma, nu=nu if nu * nsv > 1 else 1. / nsv)
    clf.fit(x_train[sv_ind_master, ...])

    return SampleSVDDRes(params, iterhist, clf)
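A minimal usage sketch for sample_svdd (toy data; only names defined above are assumed in scope):

X = np.random.randn(1000, 2)
res = sample_svdd(X, outlier_fraction=0.01, kernel_s=1.0, sample_size=20)
labels = res.OneClassSVM.predict(X)  # +1 inside the description, -1 outside
print(res.Params.radius_, res.IterHist.niter_)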
Example #13
    def run_main():
        import matplotlib.pyplot as plt
        import time

        # create donut-shaped data.
        def one_donut(rmin, rmax, origin, nobs):
            """
                rmin: inner radius
                rmax: outer radius
                origin: origin
                nobs: number of observations in the data
            """
            r = np.sqrt(rmin * rmin + (rmax - rmin) *
                        (rmax + rmin) * np.random.ranf(nobs))
            theta = 2 * np.pi * np.random.ranf(nobs)
            res = np.array([(r_ * np.cos(theta_), r_ * np.sin(theta_))
                            for r_, theta_ in zip(r, theta)])
            return res + origin

        seed = 24215125
        np.random.seed(seed)

        #store time taken by the two methods
        tsample, tfull = list(), list()

        #run the method over data sets of these sizes
        dsize_list = [5000, 10000, 100000, 500000, 1000000, 1250000, 2000000]

        #this will take about 10mins to run
        for ndat in dsize_list:

            #parameters of the two donuts
            r_min1, r_max1, origin1, nobs1 = 3, 5, (0, 0), int(np.floor(0.75 * ndat))
            r_min2, r_max2, origin2, nobs2 = 2, 4, (10, 10), ndat - nobs1

            #create the training data
            test_data = np.append(one_donut(r_min1, r_max1, origin1, nobs1),
                                  one_donut(r_min2, r_max2, origin2, nobs2),
                                  axis=0)

            print('the test data has {0} observations'.format(
                test_data.shape[0]))

            #parameters of the training SVDD. Tweak for performance/accuracy.
            outlier_fraction, kernel_s = 0.0001, 1.3
            sample_size, resample_n, n_iter = 10, 1, 10
            stop_tol, maxiter = 1e-4, 5000

            #train using sampling svdd
            start = time.time()
            result = sample_svdd(test_data,
                                 outlier_fraction=outlier_fraction,
                                 kernel_s=kernel_s,
                                 resample_n=resample_n,
                                 maxiter=maxiter,
                                 sample_size=sample_size,
                                 stop_tol=stop_tol,
                                 n_iter=n_iter,
                                 iter_history=True,
                                 seed=seed)
            end = time.time()
            tsample.append(end - start)
            print(
                "sample svdd took {0} seconds to train, iteration history stored"
                .format(end - start))
            radius_history = result.IterHist.radius_history_
            sv_indices = result.Params.sv_

            #train using full svdd
            start = time.time()
            clf1 = svm.OneClassSVM(
                nu=outlier_fraction
                if test_data.shape[0] * outlier_fraction > 1 else 1. /
                test_data.shape[0],
                kernel="rbf",
                gamma=0.5 / (kernel_s * kernel_s))
            clf1.fit(test_data)
            end = time.time()
            tfull.append(end - start)
            print("full svdd took {0} seconds to train".format(end - start))

            #plot the support vectors
            plt.figure(1)
            plt.grid(True)
            plt.title('Support Vectors (Sampling Method)')
            plt.scatter(test_data[sv_indices, 0], test_data[sv_indices, 1])
            plt.show()

            plt.figure(2)
            plt.grid(True)
            plt.title('Support Vectors (Full SVDD)')
            plt.scatter(clf1.support_vectors_[..., 0],
                        clf1.support_vectors_[..., 1])
            plt.show()

            plt.figure(3)
            plt.title('Iteration History for Sampling Method')
            plt.plot(radius_history)
            plt.show()

            #create a 200 x 200 grid on the bounding rectangle of the training data
            # for scoring
            ngrid = 200
            max_x, max_y = np.amax(test_data, axis=0)
            min_x, min_y = np.amin(test_data, axis=0)

            x_ = np.linspace(min_x, max_x, ngrid)
            y_ = np.linspace(min_y, max_y, ngrid)

            x, y = np.meshgrid(x_, y_)

            score_data = np.array([(x1, y1)
                                   for x1, y1 in zip(x.ravel(), y.ravel())])

            # the OneClassSVM result corresponding to the sampling method
            clf2 = result.OneClassSVM

            scores1 = clf1.predict(score_data)
            scores2 = clf2.predict(score_data)

            #plot the scored data
            plt.figure(4)
            p2 = np.where(scores2 == 1)
            plt.grid(True)
            plt.title(
                "Scoring Results : Inside Points Colored green (using sampling svdd)"
            )
            plt.scatter(score_data[p2, 0],
                        score_data[p2, 1],
                        color='g',
                        s=0.75)
            plt.show()

            plt.figure(5)
            p1 = np.where(scores1 == 1)
            plt.grid(True)
            plt.title(
                "Scoring Results : Inside Points Colored (using full svdd)")
            plt.scatter(score_data[p1, 0],
                        score_data[p1, 1],
                        color='g',
                        s=0.75)
            plt.show()

        plt.figure(6)
        plt.grid(True)
        plt.title(
            "Sampling SVDD Performance. Sample Size {0}".format(sample_size))
        plt.xlabel("Input Data Size")
        plt.ylabel("Time Taken (in seconds)")
        plt.plot(dsize_list, tsample)

        plt.figure(7)
        plt.grid(True)
        plt.title("Full SVDD Performance")
        plt.xlabel("Input Data Size")
        plt.ylabel("Time Taken (in seconds)")
        plt.plot(dsize_list, tfull)
Example #14
##############

data.to_csv('with_nbhd.csv', index=False)

##############
# MODEL
##############

sample_columns = ['DAY', 'HOUR', 'WEATHER', 'NBHD', 'SEVERITYCODE']
sample_data = data[sample_columns]

# OneClassSVM

svm_data = sample_data[sample_data.SEVERITYCODE == 3].drop('SEVERITYCODE', axis=1)

model = svm.OneClassSVM()
%time model.fit(svm_data)

%time pd.value_counts(model.predict(svm_data))

%time pd.value_counts(model.predict(sample_data[sample_data.SEVERITYCODE < 3]))

preds = model.predict(sample_data[sample_data.SEVERITYCODE < 3])

predicted_dangerous = data[data.SEVERITYCODE < 3][preds == 1]
predicted_dangerous.groupby('DAY').size()
predicted_dangerous.groupby('NBHD').size()
pd_percent = predicted_dangerous.groupby('HOUR').size() / predicted_dangerous.groupby('HOUR').size().sum() * 100

sns.lineplot(x=list(range(1,23)), y=pd_percent)
Example #15
def osvmClassification(nu, x_train_p, x_test, y_train, y_test):
    clf = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=0.1)
    clf.fit(x_train_p)
    y_pred = clf.predict(x_test)
    accuracy = np.sum(y_pred == y_test) / len(y_pred)
    return clf, accuracy
Example #16
y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape = train_model(
    model_svm, to_remove, x_class, y_class)
get_scores(y_pred_train_reshape, y_pred_test_reshape, y_train_reshape,
           y_test_reshape)

model_rfc = RandomForestClassifier(n_estimators=100,
                                   random_state=0,
                                   class_weight='balanced')
y_pred_train_reshape, y_pred_test_reshape, y_train_reshape, y_test_reshape = train_model(
    model_rfc, to_remove, x_class, y_class)
get_scores(y_pred_train_reshape, y_pred_test_reshape, y_train_reshape,
           y_test_reshape)

# Now let's do unsupervised learning: here we don't need to subsample or split into train/test sets
# one-class SVM anomalies
clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_model(clf, x_class, y_class)
fit_model(clf, x, y)

# Split into anomaly and normal examples
clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_novelty_model(clf, x_class, y_class)

clf = svm.OneClassSVM(nu=.1, kernel='rbf', gamma=.1)
fit_novelty_model(clf, x, y)

clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
fit_model_loc(clf, x_class, y_class)
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
fit_model_loc(clf, x, y)
Example #17
    def _build_classifiers(self):
        # Train on first 20% of images and create empty lists of features
        to_analyse = int(len(self.all_imgs) * 0.2)
        hu_feas = []
        areas = []
        lengths = []
        initial_areas = []
        initial_lengths = []
        initial_hu_feas = []

        # Create empty lists of colour features if using colour
        if self.use_colour:
            colors_r = []
            colors_g = []
            colors_b = []

        # For all images and masks
        for index, (mask, img_f) in enumerate(
                list(zip(self.panel_masks[:], self.all_imgs[:]))):
            # Read respective image and create empty lists of features for this image
            img = imread(img_f)
            hu_feas_labelled = []
            areas_labelled = []
            lengths_labelled = []
            # Crop image to just the panel, get region properties of objects in the panel
            img = self.panel.get_bbox_image(img)
            c_label, c_rprops = simple_label_next_frame(
                self.panel_labels, self.panel_regionprops, mask)

            # For each seed found in the panel
            for idx, rp in enumerate(c_rprops):
                # If colour is being used, append the respective colour channel lists with rgb values from
                # the colour histogram function
                if self.use_colour:
                    r, g, b = self.generate_color_histogram(img, rp)
                    colors_r.append(r)
                    colors_g.append(g)
                    colors_b.append(b)

                # Append the features of the seed (Hu moments, area, major axis length, minor axis length, minor/major
                # axis length ratio) to a list of seed features
                hu_feas.append(rp.moments_hu)
                hu_feas_labelled.append(np.hstack((rp.moments_hu, rp.label)))
                areas.append(rp.area)
                areas_labelled.append([rp.area, rp.label])
                lengths.append([
                    rp.minor_axis_length, rp.major_axis_length,
                    float(rp.minor_axis_length + 1.0) /
                    float(rp.major_axis_length + 1.0)
                ])
                lengths_labelled.append(
                    np.hstack(([
                        rp.minor_axis_length, rp.major_axis_length,
                        float(rp.minor_axis_length + 1.0) /
                        float(rp.major_axis_length + 1.0)
                    ], rp.label)))
            # Append the list of seed features for that image to a list of all images' seed features
            initial_areas.append(np.array(areas_labelled))
            initial_lengths.append(np.array(lengths_labelled))
            initial_hu_feas.append(np.array(hu_feas_labelled))

        areas = np.vstack(areas)
        hu_feas = np.vstack(hu_feas)
        lengths = np.vstack(lengths)
        if self.use_delta:
            self.delta_area = np.zeros((areas.shape[0], 1))
            self.delta_hu_feas = np.zeros((hu_feas.shape[0], 7))
            self.delta_lengths = np.zeros((lengths.shape[0], 3))
            counter = 0
            # For i in total number of images
            for i in range(len(initial_areas)):
                # For j in largest seed label
                for j in range(np.max(initial_areas[i][:, 1])):
                    # If first image
                    if i == 0:
                        # If seed label is present in current image array
                        if np.isin(j + 1, initial_areas[i][:, 1]):
                            id = j + 1
                            if np.isin(id, initial_areas[i + 1][:, 1]):
                                curr_arr = initial_areas[i][:, 1]
                                curr = np.argwhere(curr_arr == id)
                                next_arr = initial_areas[i + 1][:, 1]
                                next = np.argwhere(next_arr == id)
                                # As the delta for the first image is undefined, set it to the difference between the first
                                # and second image
                                self.delta_area[counter, 0] = np.abs(
                                    initial_areas[i + 1][next, 0] -
                                    initial_areas[i][curr, 0])
                                self.delta_lengths[counter, :] = np.abs(
                                    initial_lengths[i + 1][next, :3] -
                                    initial_lengths[i][curr, :3])
                                self.delta_hu_feas[counter, :] = np.abs(
                                    initial_hu_feas[i + 1][next, :7] -
                                    initial_hu_feas[i][curr, :7])
                                counter += 1
                    else:
                        # If seed label is present in current image array
                        if np.isin(j + 1, initial_areas[i][:, 1]):
                            id = j + 1
                            # Get indices of same seed in previous and current image array
                            curr_arr = initial_areas[i][:, 1]
                            curr = np.argwhere(curr_arr == id)
                            prev_arr = initial_areas[i - 1][:, 1]
                            prev = np.argwhere(prev_arr == id)
                            if curr.size != prev.size:
                                # If a seed disappears or a new one appears, set its delta to the mean of the other seeds
                                self.delta_area[counter, 0] = np.mean(
                                    self.delta_area[0:counter, 0])
                                self.delta_lengths[counter, :] = np.mean(
                                    self.delta_lengths[0:counter, :])
                                self.delta_hu_feas[counter, :] = np.mean(
                                    self.delta_hu_feas[0:counter, :])
                                counter += 1
                            else:
                                # Create delta features i.e. seed feature from this image - seed feature from previous image
                                self.delta_area[counter, 0] = np.abs(
                                    initial_areas[i][curr, 0] -
                                    initial_areas[i - 1][prev, 0])
                                self.delta_lengths[counter, :] = np.abs(
                                    initial_lengths[i][curr, :3] -
                                    initial_lengths[i - 1][prev, :3])
                                self.delta_hu_feas[counter, :] = np.abs(
                                    initial_hu_feas[i][curr, :7] -
                                    initial_hu_feas[i - 1][prev, :7])
                                counter += 1

        # Get the number of seeds to train on
        to_analyse = sum(item.shape[0]
                         for item in initial_areas[:to_analyse])
        # Create array containing seed features from all images
        if self.use_delta:
            self.all_data = np.hstack([
                hu_feas, self.delta_hu_feas, areas, self.delta_area, lengths,
                self.delta_lengths
            ])
        else:
            self.all_data = np.hstack([hu_feas, areas, lengths])
        # Create training data for one class SVM
        if self.use_delta:
            hu_feas = np.hstack([
                hu_feas[:to_analyse, :], self.delta_hu_feas[:to_analyse, :],
                areas[:to_analyse, :], self.delta_area[:to_analyse, :],
                lengths[:to_analyse, :], self.delta_lengths[:to_analyse, :]
            ])  #added in area and delta area.
        else:
            hu_feas = np.hstack([
                hu_feas[:to_analyse, :], areas[:to_analyse, :],
                lengths[:to_analyse, :]
            ])
        if self.use_colour:
            color_feas = np.hstack([
                np.vstack(colors_r),
                np.vstack(colors_g),
                np.vstack(colors_b)
            ])

        # Normalise the hu features and the delta mean i.e. z = (x-mu)/sigma
        self.hu_feas_mu = hu_feas.mean(axis=0)
        self.hu_feas_stds = hu_feas.std(axis=0)
        hu_feas = (hu_feas - self.hu_feas_mu) / (self.hu_feas_stds + 1e-9)

        # Train a one class SVM on the hu features
        self.clf_hu = svm.OneClassSVM(nu=0.03, kernel="rbf", gamma=0.001)
        self.clf_hu.fit(hu_feas)

        # If using colour, normalise the colour histograms i.e. z = (x-mu)/sigma
        if self.use_colour:
            self.color_feas_mu = color_feas.mean(axis=0)
            self.color_feas_stds = color_feas.std(axis=0)
            color_feas = (color_feas -
                          self.color_feas_mu) / (self.color_feas_stds + 1e-9)
            # Train a one class SVM on the colour features
            self.clf_color = svm.OneClassSVM(nu=0.03,
                                             kernel="rbf",
                                             gamma=0.001)
            self.clf_color.fit(color_feas)
Example #18
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[1.5, .3],
               **blobs_params)[0],
    4. * (make_moons(n_samples=n_samples, noise=.05, random_state=0)[0] -
Example #19
def choose_classifier(dataset, class_number, model_type, model, classifier, D, hyper_para, train_data, test_data, test_label, no_train_data, no_test_data, inm, relu, m, s):

	if(hyper_para.verbose==True):
		print('Extracting features.....')

	train_features = np.memmap('../../temp_files/train_features_temp.bin', dtype='float32', mode='w+', shape=(no_train_data,hyper_para.D))
	train_features = torch.from_numpy(train_features)

	for i in range(no_train_data):
		temp = model(torch.autograd.Variable(train_data[i:(i+1)].cuda().contiguous().float())).float()
		temp = temp.view(1,1,hyper_para.D)
		temp = inm(temp)
		temp = relu(temp.view(hyper_para.D))
		train_features[i:(i+1)] = temp.data.cpu()
	train_data = None

	if(hyper_para.verbose==True):
		print('Features extracted.')

	## test on the test set
	test_features = np.memmap('../../temp_files/test_features_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,hyper_para.D))
	test_scores   = np.memmap('../../temp_files/test_scores_temp.bin', dtype='float32', mode='w+', shape=(no_test_data,1))
	test_features = torch.from_numpy(test_features)

	if(hyper_para.verbose==True):
		print('Computing test scores and AUC....')

	area_under_curve=0.0
	if(hyper_para.classifier_type=='OC_CNN'):
		test_scores   = torch.from_numpy(test_scores)
		k=0
		print(np.shape(test_features))
		start = time.time()
		for j in range(no_test_data):
			temp = model(AddNoise(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float()), hyper_para.sigma1)).float()
			temp = temp.view(1,1,hyper_para.D)
			temp = inm(temp)
			temp = temp.view(hyper_para.D)
			
			test_features[k:(k+1)] = temp.data.cpu()
			test_scores[k:(k+1)]   = classifier(relu(temp)).data.cpu()[1]
			# print(classifier(relu(temp)).data.cpu())
			
			k = k+1
		end = time.time()
		print(end-start)
		test_scores    = test_scores.numpy()
		test_features  = test_features.numpy()
		train_features = train_features.numpy()

		test_scores = (test_scores-np.min(test_scores))/(np.max(test_scores)-np.min(test_scores))

	elif(hyper_para.classifier_type=='OC_SVM_linear'):
		# train one-class svm
		oc_svm_clf = svm.OneClassSVM(kernel='linear', nu=float(hyper_para.N))
		oc_svm_clf.fit(train_features.numpy())
		k=0
		mean_kwn = np.zeros( (no_test_data,1) )
		for j in range(no_test_data):
			temp = model(torch.autograd.Variable(test_data[j:(j+1)].cuda().contiguous().float())).float()
			temp = temp.view(1,1,hyper_para.D)
			temp = inm(temp)
			temp = temp.view(hyper_para.D)			
			test_features[k:(k+1)] = temp.data.cpu()
			temp 				   = np.reshape(relu(temp).data.cpu().numpy(), (1, hyper_para.D))
			test_scores[k:(k+1)]   = oc_svm_clf.decision_function(temp)[0]

			k = k+1

		test_features  = test_features.numpy()
		train_features = train_features.numpy()

		joblib.dump(oc_svm_clf,'../../save_folder/saved_models/'+dataset+'/classifier/'+str(class_number) +'/'+
																				model_type+'_OCCNNlin'    +'_'+
																				str(hyper_para.iterations)+'_'+
																				str(hyper_para.lr)		  +'_'+
																				str(hyper_para.sigma)	  +'_'+
																				str(hyper_para.N)         +'.pkl')

	fpr, tpr, thresholds = metrics.roc_curve(test_label, test_scores)
	# without this, the function would return the 0.0 it was initialised with
	area_under_curve = metrics.auc(fpr, tpr)

	if(hyper_para.verbose==True):
		print('Test scores and AUC computed.')

	return area_under_curve, train_features, test_scores, test_features
Example #20
def getTrainedSVM(trainingData, nuu, g):
    model = svm.OneClassSVM(nu=nuu, kernel='rbf', gamma=g)
    #trainingData = np.reshape(trainingData, (1, len(trainingData)))
    model.fit(trainingData)
    return model
Example #21
print(np.array(NegativeTest).shape)

print("数据处理完成")

start = 72
# gamma = 0.001,0.01,0.1,1,10,100
gamma = 100
for nu in [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    print("     ")
    print("     ")
    print("C = {}".format(nu))
    print("     ")
    print("     ")
    # training
    # clf = SVC(kernel="rbf", C = 1.0, gamma= "auto")
    clf = svm.OneClassSVM(kernel="rbf", gamma=gamma, nu=nu)
    clf.fit(PositiveTrain)
    Modalpath = modal_path + os.sep + r"model" + str(gamma) + "n" + str(
        nu) + r".plk"
    joblib.dump(clf, Modalpath)
    TestPredict = clf.predict(TestData)
    print("训练完成")
    # evaluation
    print("Mean accuracy on the given test set and labels:")
    n = 0
    for i in range(len(TestPredict)):
        if TestLabel[i] == TestPredict[i]:
            n += 1
    accuracy = n / (len(TestPredict))
    print(accuracy)
    print("混淆矩阵:")
Example #22
X_misaligned = misaligned_blobs(samples=n_inliers, sd=cluster_sd)

## 6: Whole dataset
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]

# define the true data labels: y_true
y_true = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)
# label 1 as inliers, -1 as outliers


# Define algorithm to be compared -------------------------------
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    (
        "One-Class SVM",
        svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma="scale"),
    ),
    (
        "Isolation Forest",
        IsolationForest(
            n_estimators=500,
            behaviour="new",
            contamination=outliers_fraction,
            random_state=42,
        ),
    ),
    (
        "Local Outlier Factor",
        LocalOutlierFactor(
            n_neighbors=35, contamination=outliers_fraction, novelty=False
        ),
Example #23
def runDetection(outliers, inliers, X, outs, plot=True, outliersNb=10.):
    outliers_fraction = outliersNb / X.shape[0]
    rng = np.random.RandomState(69)
    clusters_separation = [0]  #, 1, 2]

    # the various anomaly detection tools to compare
    classifiers = {
        "One-Class SVM":
        svm.OneClassSVM(nu=0.95 * outliers_fraction, kernel="rbf", gamma=0.1),
        "Isolation Forest":
        IsolationForest(n_estimators=500,
                        max_samples='auto',
                        bootstrap=False,
                        contamination=outliers_fraction,
                        random_state=rng)
    }

    if (plot):
        classifiers["Robust covariance"] = EllipticEnvelope(
            contamination=outliers_fraction)

    # Compare given classifiers under given settings
    xx, yy = np.meshgrid(np.linspace(-0.2, 1.3, 100),
                         np.linspace(-0.2, 1.9, 100))

    # Fit the problem with varying cluster separation
    for i, offset in enumerate(clusters_separation):
        np.random.seed(69)

        # Fit the model
        plt.figure(figsize=(10.8, 3.6))

        for i, (clf_name, clf) in enumerate(classifiers.items()):
            # fit the data and tag outliers
            clf.fit(X)
            scores_pred = clf.decision_function(X)
            threshold = stats.scoreatpercentile(scores_pred,
                                                100 * outliers_fraction)
            y_pred = clf.predict(X)

            X_out_idx = np.where(y_pred == -1)[0]

            print(clf_name)

            #if (plot):
            print "True outliers     :", outs
            print "Outliers detected :", X_out_idx

            # Compute the confusion matrix by hand
            FP = len(np.intersect1d(outs, X_out_idx))
            FN = len(X_out_idx) - FP

            V = X.shape[0] - len(X_out_idx)
            VN = len(outs) - FP
            VP = V - VN

            n_errors = (VN + FN)

            print "Matrice de confusion"
            print " _________________________________", "\n"  \
                  "| P\R      Outliers    Inliers     |","\n"  \
                  "| -------------------------------- |","\n"  \
                  "| Outliers ", " "*4, FP, " "*8, FN, " "*4, "|","\n"  \
                  "| -------------------------------- |","\n"  \
                  "| Inliers  ", " "*4, VN, " "*7, VP, " "*3, "|","\n"  \
                  "|_________________________________ |","\n"  \

            if (plot):
                # plot the levels lines and the points
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
                Z = Z.reshape(xx.shape)
                subplot = plt.subplot(1, 3, i + 1)

                subplot.contourf(xx,
                                 yy,
                                 Z,
                                 levels=np.linspace(Z.min(), threshold, 7),
                                 cmap=plt.cm.Blues_r)

                a = subplot.contour(xx,
                                    yy,
                                    Z,
                                    levels=[threshold],
                                    linewidths=2,
                                    colors='red')

                subplot.contourf(xx,
                                 yy,
                                 Z,
                                 levels=[threshold, Z.max()],
                                 colors='orange')

                b = subplot.scatter(inliers[:, 0], inliers[:, 1], c='white')
                c = subplot.scatter(outliers[:, 0], outliers[:, 1], c='black')

                subplot.axis('tight')

                subplot.legend(
                    [a.collections[0], b, c], [
                        'learned decision function', 'true inliers',
                        'true outliers'
                    ],
                    prop=matplotlib.font_manager.FontProperties(size=11),
                    loc='upper left')

                subplot.set_title("%d. %s (errors: %d)" %
                                  (i + 1, clf_name, n_errors))
                subplot.set_xlim((-0.2, 1.3))
                subplot.set_ylim((-0.2, 1.9))

        if (plot):
            plt.subplots_adjust(0.04, 0.1, 0.96, 0.92, 0.1, 0.26)

    if (plot):
        plt.show()
Example #24
    def train(self, GridSearch=True, **kwargs):

        if self.data._X_train.ndim > 2:
            X_train_shape = self.data._X_train.shape
            X_train = self.data._X_train.reshape(X_train_shape[0],
                                                 np.prod(X_train_shape[1:]))
        else:
            X_train = self.data._X_train

        print("Starting training...")
        self.start_clock()

        if self.loss == 'SVC':

            if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
                self.get_kernel_matrix(kernel=self.kernel,
                                       which_set='train',
                                       **kwargs)
                self.svm.fit(self.K_train, self.data._y_train)
            else:
                self.svm.fit(X_train, self.data._y_train)

        if self.loss == 'OneClassSVM':

            if self.kernel in ('DegreeKernel', 'WeightedDegreeKernel'):
                self.get_kernel_matrix(kernel=self.kernel,
                                       which_set='train',
                                       **kwargs)
                self.svm.fit(self.K_train)
            else:

                if GridSearch and self.kernel == 'rbf':

                    # use grid search cross-validation to select gamma
                    print("Using GridSearchCV for hyperparameter selection...")

                    # sample small hold-out set from test set for hyperparameter selection. Save as val set.
                    n_val_set = int(0.1 * self.data.n_test)
                    n_test_out = 0
                    n_test_norm = 0
                    n_val_out = 0
                    n_val_norm = 0
                    while (n_test_out == 0) | (n_test_norm == 0) | (
                            n_val_out == 0) | (n_val_norm == 0):
                        perm = np.random.permutation(self.data.n_test)
                        self.data._X_val = self.data._X_test[perm[:n_val_set]]
                        self.data._y_val = self.data._y_test[perm[:n_val_set]]
                        # only accept small test set if AUC can be computed on val and test set
                        n_test_out = np.sum(
                            self.data._y_test[perm[:n_val_set]])
                        n_test_norm = np.sum(
                            self.data._y_test[perm[:n_val_set]] == 0)
                        n_val_out = np.sum(self.data._y_test[perm[n_val_set:]])
                        n_val_norm = np.sum(
                            self.data._y_test[perm[n_val_set:]] == 0)

                    self.data._X_test = self.data._X_test[perm[n_val_set:]]
                    self.data._y_test = self.data._y_test[perm[n_val_set:]]
                    self.data.n_val = len(self.data._y_val)
                    self.data.n_test = len(self.data._y_test)

                    self.diag['val']['scores'] = np.zeros(
                        (len(self.data._y_val), 1))
                    self.diag['test']['scores'] = np.zeros(
                        (len(self.data._y_test), 1))

                    cv_auc = 0.0
                    cv_acc = 0

                    for gamma in np.logspace(-10, -1, num=10, base=2):

                        # train on selected gamma
                        self.cv_svm = svm.OneClassSVM(kernel='rbf',
                                                      nu=Cfg.svm_nu,
                                                      gamma=gamma)
                        self.cv_svm.fit(X_train)

                        # predict on small hold-out set
                        self.predict(which_set='val')

                        # save model if AUC on hold-out set improved
                        if self.diag['val']['auc'] > cv_auc:
                            self.svm = self.cv_svm
                            self.nu = Cfg.svm_nu
                            self.gamma = gamma
                            cv_auc = self.diag['val']['auc']
                            cv_acc = self.diag['val']['acc']

                    # save results of best cv run
                    self.diag['val']['auc'] = cv_auc
                    self.diag['val']['acc'] = cv_acc

                else:
                    # if rbf-kernel, re-initialize svm with gamma minimizing the
                    # numerical error
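                    # (note: with gamma = 1 / max_dist**2, gamma * d**2 <= 1 for
                    #  all training pairs, so kernel entries stay >= exp(-1))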
                    if self.kernel == 'rbf':
                        gamma = 1 / (np.max(pairwise_distances(X_train))**2)
                        self.svm = svm.OneClassSVM(kernel='rbf',
                                                   nu=Cfg.svm_nu,
                                                   gamma=gamma)

                    self.svm.fit(X_train)

                    self.nu = Cfg.svm_nu
                    self.gamma = gamma

        self.stop_clock()
        self.train_time = self.clocked
Example #25
# In[6]:

params = np.array(df.values[:, 1:], dtype="float64")
params = scale(params)

# In[7]:

X = PCA(n_components=2).fit_transform(params)
num = X.shape[0]
OUTLIER_FRACTION = 0.01

# In[8]:

clf = svm.OneClassSVM(kernel="rbf")
clf.fit(X)

# In[9]:

dist_to_border = clf.decision_function(X).ravel()
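# points scoring at or below the OUTLIER_FRACTION quantile are flagged as outliers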
threshold = stats.scoreatpercentile(dist_to_border, 100 * OUTLIER_FRACTION)
is_inlier = dist_to_border > threshold

# In[10]:

xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - OUTLIER_FRACTION) * num)
n_outliers = int(OUTLIER_FRACTION * num)
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
Example #26
def plot_species_distribution(
        species=["bradypus_variegatus_0", "microryzomys_minutus_0"]):
    """
    Plot the species distribution.
    """
    if len(species) > 2:
        print(
            "Note: when more than two species are provided, only "
            "the first two will be used")

    t0 = time()

    # Load the compressed data
    data = fetch_species_distributions()

    # Set up the data grid
    xgrid, ygrid = construct_grids(data)

    # The grid in x,y coordinates
    X, Y = np.meshgrid(xgrid, ygrid[::-1])

    # create a bunch for each species
    BV_bunch = create_species_bunch(species[0], data.train, data.test,
                                    data.coverages, xgrid, ygrid)
    MM_bunch = create_species_bunch(species[1], data.train, data.test,
                                    data.coverages, xgrid, ygrid)

    # background points (grid coordinates) for evaluation
    np.random.seed(13)
    background_points = np.c_[
        np.random.randint(low=0, high=data.Ny, size=10000),
        np.random.randint(low=0, high=data.Nx, size=10000)].T

    # We'll make use of the fact that coverages[6] has measurements at all
    # land points.  This will help us decide between land and water.
    land_reference = data.coverages[6]

    # Fit, predict, and plot for each species.
    for i, species in enumerate([BV_bunch, MM_bunch]):
        print "_" * 80
        print "Modeling distribution of species '%s'" % species.name

        # Standardize features
        mean = species.cov_train.mean(axis=0)
        std = species.cov_train.std(axis=0)
        train_cover_std = (species.cov_train - mean) / std

        # Fit OneClassSVM
        print " - fit OneClassSVM ... ",
        clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma=0.5)
        clf.fit(train_cover_std)
        print "done. "

        # Plot map of South America
        pl.subplot(1, 2, i + 1)
        if basemap:
            print " - plot coastlines using basemap"
            m = Basemap(projection='cyl',
                        llcrnrlat=Y.min(),
                        urcrnrlat=Y.max(),
                        llcrnrlon=X.min(),
                        urcrnrlon=X.max(),
                        resolution='c')
            m.drawcoastlines()
            m.drawcountries()
        else:
            print " - plot coastlines from coverage"
            pl.contour(X,
                       Y,
                       land_reference,
                       levels=[-9999],
                       colors="k",
                       linestyles="solid")
            pl.xticks([])
            pl.yticks([])

        print " - predict species distribution"

        # Predict species distribution using the training data
        Z = np.ones((data.Ny, data.Nx), dtype=np.float64)

        # We'll predict only for the land points.
        idx = np.where(land_reference > -9999)
        coverages_land = data.coverages[:, idx[0], idx[1]].T

        pred = clf.decision_function((coverages_land - mean) / std)[:, 0]
        Z *= pred.min()
        Z[idx[0], idx[1]] = pred

        levels = np.linspace(Z.min(), Z.max(), 25)
        Z[land_reference == -9999] = -9999

        # plot contours of the prediction
        pl.contourf(X, Y, Z, levels=levels, cmap=pl.cm.Reds)
        pl.colorbar(format='%.2f')

        # scatter training/testing points
        pl.scatter(species.pts_train['dd long'],
                   species.pts_train['dd lat'],
                   s=2**2,
                   c='black',
                   marker='^',
                   label='train')
        pl.scatter(species.pts_test['dd long'],
                   species.pts_test['dd lat'],
                   s=2**2,
                   c='black',
                   marker='x',
                   label='test')
        pl.legend()
        pl.title(species.name)
        pl.axis('equal')

        # Compute AUC with respect to background points.  True absences are
        # unknown, so randomly sampled background points act as
        # pseudo-absences: the AUC measures how well the model ranks
        # presence records above random locations.
        pred_background = Z[background_points[0], background_points[1]]
        pred_test = clf.decision_function((species.cov_test - mean) / std).ravel()
        scores = np.r_[pred_test, pred_background]
        y = np.r_[np.ones(pred_test.shape), np.zeros(pred_background.shape)]
        fpr, tpr, thresholds = metrics.roc_curve(y, scores)
        roc_auc = metrics.auc(fpr, tpr)
        pl.text(-35, -70, "AUC: %.3f" % roc_auc, ha="right")
        print("\n Area under the ROC curve : %f" % roc_auc)

    print "\ntime elapsed: %.2fs" % (time() - t0)
Beispiel #27
from numpy.testing import assert_array_equal
from sklearn import svm


def test_oneclass_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf = svm.OneClassSVM(gamma=1).fit(X_train)
    assert_array_equal(clf.score_samples([[2., 2.]]),
                       clf.decision_function([[2., 2.]]) + clf.offset_)
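
This short test pins down the documented relationship between OneClassSVM's two scoring APIs: score_samples returns the raw score, while decision_function shifts it by the learned offset_ so that zero becomes the inlier/outlier boundary. In other words, thresholding score_samples at clf.offset_ and thresholding decision_function at 0 flag exactly the same points.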
Beispiel #28
        tX[d, int(w)] = float(cnts[n]) / total

# count total number of anomalies
Danom = 0.0
for a0, anomlbl in enumerate(anomlist):
    a = [1 for x in lbllist if anomlbl in x]
    Danom += float(len(a))

nulist = np.arange(1e-5, 0.4, 0.05)
F1score = np.zeros(len(nulist))
# start with an empty results file before the per-nu runs append to it
fpres = open('results_indv.txt', 'w')
fpres.write('')
fpres.close()
for n1, nu in enumerate(nulist):
    # train svm
    clf = svm.OneClassSVM(nu=nu, kernel="linear")
    clf.fit(trX)

    # test svm
    #pred_test = clf.predict(tX)
    anom_score = clf.decision_function(tX).ravel()  # 1-D in modern scikit-learn

    # ascending sort: most anomalous (lowest-scoring) documents first
    anom_sorted = np.argsort(anom_score)

    # compute rec, prec
    recall = np.zeros(TopN)
    precision = np.zeros(TopN)

    tp = 0.0
    for i, ind in enumerate(anom_sorted[0:TopN]):
        doclbl = lbllist[ind]
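
The snippet above is cut off right after fetching each document's label. A hedged sketch of the top-N precision/recall bookkeeping it appears to be building, reusing the fragment's own names (lbllist holds per-document label strings, anomlist the substrings marking anomalies, Danom the total anomaly count) — an assumption-based completion, not the original code:

    tp = 0.0
    for i, ind in enumerate(anom_sorted[0:TopN]):
        doclbl = lbllist[ind]
        # count a true positive if any anomaly label occurs in this document
        if any(anomlbl in doclbl for anomlbl in anomlist):
            tp += 1.0
        recall[i] = tp / Danom         # share of all anomalies recovered so far
        precision[i] = tp / (i + 1.0)  # share of flagged documents that are anomalies
    # plausibly the per-nu F1 stored in F1score (small epsilon avoids 0/0)
    F1score[n1] = (2 * precision[-1] * recall[-1]
                   / (precision[-1] + recall[-1] + 1e-12))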
Beispiel #29
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix)

# load_image and data_split are project-specific helpers defined elsewhere.
# Assumed display names for the confusion-matrix axes; the original fragment
# uses LABELS without defining it.
LABELS = ['anomaly', 'normal']


def main(args):
    path = '/media/joshua/Data/python_codes/fingerprinting/internship_experiments/AnomalyDetectionUsingAutoencoder-master/data/'  ##single/all'
    #path =  '/media/joshua/Data/python_codes/fingerprinting/internship_experiments/AnomalyDetectionUsingAutoencoder-master/ID_data/single/'
    intruder = [0]  #4#
    intr = [0]
    snr = '0_1db'
    X0, y0 = load_image(path + snr, args.size, args.comp_vector,
                        args.cartesian, args.window, args.trainProp)

    #X0,y0 = shuffle(X0,y0)
    #unique_elements, counts_elements = np.unique(y0, return_counts=True)
    #print(np.asarray((unique_elements, counts_elements)))

    [X_train, y_train], [X_val, y_val], [X_tes, y_tes] = data_split(
        X0, y0, args.trainProp, intruder)

    # Rebuild the test set grouped by class label 0..5 (same selection as
    # the original chained np.where/np.in1d calls, just written as a loop).
    X_test = np.concatenate([X_tes[np.in1d(y_tes, c)] for c in range(6)],
                            axis=0)

    y_test = np.concatenate([y_tes[np.in1d(y_tes, c)] for c in range(6)],
                            axis=0)

    scaler = StandardScaler()

    if args.dim == 2:
        s0, s1, s2 = X_train.shape[0], X_train.shape[1], X_train.shape[2]
        X_train = X_train.reshape(s0 * s1, s2)
        X_train = scaler.fit_transform(X_train)
        X_train = X_train.reshape(s0, s1, s2)

        s0, s1, s2 = X_test.shape[0], X_test.shape[1], X_test.shape[2]
        X_test = X_test.reshape(s0 * s1, s2)
        X_test = scaler.transform(X_test)
        X_test = X_test.reshape(s0, s1, s2)

        # Use the train-fitted scaler here as well; refitting on the
        # validation set (as the original did) leaks validation statistics.
        s0, s1, s2 = X_val.shape[0], X_val.shape[1], X_val.shape[2]
        X_val = X_val.reshape(s0 * s1, s2)
        X_val = scaler.transform(X_val)
        X_val = X_val.reshape(s0, s1, s2)
    elif args.dim == 1:
        X_train = scaler.fit_transform(X_train)
        # transform (not fit_transform): keep the train statistics for
        # val/test, otherwise X_test ends up scaled with validation stats
        X_val = scaler.transform(X_val)
        X_test = scaler.transform(X_test)
        print(X_train.min(), X_train.max(), X_test.min(), X_test.max())

    ### Take PCA to reduce feature space dimensionality
    ##pca = PCA(n_components=3, whiten=True)
    ##pca = pca.fit(X_train)
    ##print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))
    ##X_train = pca.transform(X_train)
    ##X_test = pca.transform(X_test)
    ###xval = pca.transform(xval)

    ## Train one-class SVM and Isolation Forest detectors and obtain predictions
    oc_svm_clf = svm.OneClassSVM(gamma=0.001, kernel='rbf',
                                 nu=0.08)  # Obtained using grid search
    if_clf = IsolationForest(contamination=0.08,
                             max_features=1.0,
                             max_samples=0.4,
                             n_estimators=40)  # Obtained using grid search

    oc_svm_clf.fit(X_train)
    if_clf.fit(X_train)

    oc_svm_preds = oc_svm_clf.predict(X_test)
    if_preds = if_clf.predict(X_test)

    # Map the multi-class labels to the detectors' +/-1 convention, assuming
    # the intruder classes are the anomalies (-1) and the rest normal (+1);
    # without this, comparing 0-5 labels against +/-1 predictions is meaningless.
    y_test = np.where(np.in1d(y_test, intruder), -1, 1)

    # calculate accuracy metrics
    print("OC-SVM accuracy: ", accuracy_score(y_test, oc_svm_preds))
    print("IF accuracy: ", accuracy_score(y_test, if_preds))

    df = pd.DataFrame({
        'Labels': np.ravel(y_test),
        'Clusters': np.ravel(oc_svm_preds)
    })
    df2 = pd.DataFrame({
        'Labels': np.ravel(y_test),
        'Clusters': np.ravel(if_preds)
    })

    ct = pd.crosstab(df['Labels'], df['Clusters'])
    ct2 = pd.crosstab(df2['Labels'], df2['Clusters'])
    print(ct)
    print(ct2)
    # classification_report expects y_true first, then y_pred
    print(
        classification_report(df['Labels'],
                              df['Clusters'],
                              target_names=['anomaly', 'normal']))
    print(
        classification_report(df2['Labels'],
                              df2['Clusters'],
                              target_names=['anomaly', 'normal']))

    # y_true first: rows of the matrix are true classes, columns predictions
    conf_matrix = confusion_matrix(df.Labels, df.Clusters)

    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix,
                xticklabels=LABELS,
                yticklabels=LABELS,
                annot=True,
                fmt="d")
    plt.title("One class SVM Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

    conf_matrix = confusion_matrix(df2.Labels, df2.Clusters)

    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix,
                xticklabels=LABELS,
                yticklabels=LABELS,
                annot=True,
                fmt="d")
    plt.title("Isolation forest Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()
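
A side note on the two hyperparameters above: nu upper-bounds the fraction of training points treated as errors by OneClassSVM, while contamination directly sets IsolationForest's prediction threshold, so matching them (0.08 here) makes the two +/-1 outputs roughly comparable. A quick self-contained check on toy data (illustrative only, not part of the pipeline above):

import numpy as np
from sklearn import svm
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(500, 2)
frac = 0.08
svm_flagged = (svm.OneClassSVM(nu=frac, gamma='scale').fit(X).predict(X) == -1).mean()
if_flagged = (IsolationForest(contamination=frac, random_state=0).fit(X).predict(X) == -1).mean()
print(svm_flagged, if_flagged)  # both should land near 0.08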
Beispiel #30
print(train_data.shape)
print(test_data.shape)

# train_target = np.ones([train_data.shape[0]])
test_target = np.append(np.ones([test_good_data.shape[0]], dtype=int),
                        -np.ones([test_bad_data.shape[0]], dtype=int))

best_nu = 0
best_gamma = 0
best_auc = 0
best_model = None

for i in range(1, 20):
    for j in range(1, 20):
        nu = i * 0.01
        gamma = j * 0.01
        model = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=gamma)  
        model.fit(train_data)

        # values_preds = model.predict(train_data)  
        # values_targs = train_target
        # f1_train = 100 * metrics.f1_score(values_targs, values_preds)

        values_preds = model.predict(test_data)
        values_targs = test_target
        auc_test = 100 * metrics.roc_auc_score(values_targs, values_preds)
        print("nu = %.2f, gamma = %.2f, auc = %.2f" % (nu, gamma, auc_test))
        if best_auc < auc_test:
            best_nu = nu
            best_gamma = gamma
            best_auc = auc_test
            best_model = model
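
One caveat on this search: nu and gamma are selected by AUC on the same test set that would then be reported, so best_auc is optimistically biased; ranking by decision_function scores rather than hard +/-1 predictions also gives a better-resolved ROC. A minimal sketch of a cleaner protocol, reusing this snippet's train_data, test_data and test_target (the split names sel_*/rep_* are hypothetical):

from sklearn.model_selection import train_test_split

# hold out half of the labeled test pool for final reporting
sel_data, rep_data, sel_target, rep_target = train_test_split(
    test_data, test_target, test_size=0.5, stratify=test_target, random_state=0)

best = None
for i in range(1, 20):
    for j in range(1, 20):
        model = svm.OneClassSVM(nu=i * 0.01, kernel='rbf', gamma=j * 0.01)
        model.fit(train_data)
        # continuous scores give a better-resolved ROC than +/-1 predictions
        auc_sel = metrics.roc_auc_score(sel_target, model.decision_function(sel_data))
        if best is None or auc_sel > best[0]:
            best = (auc_sel, model)

# report once, on data never used for selection
print("held-out AUC = %.3f" % metrics.roc_auc_score(
    rep_target, best[1].decision_function(rep_data)))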