コード例 #1
0
class svm_model():
    """Minimal wrapper that trains and queries a one-class SVM."""

    def train(self, X, ker):
        """Create a new ``OneClassSVM`` with kernel ``ker`` and fit it on ``X``.

        The estimator is stored on ``self.model`` before it is fitted.
        """
        # ``random_state`` is intentionally not passed: OneClassSVM ignored it
        # and scikit-learn later removed the argument, so passing it raises
        # TypeError on current releases.
        self.model = OneClassSVM(kernel=ker, shrinking=True)
        self.model.fit(X)

    def predict(self, X):
        """Return ``self.model.predict(X)``; ``train`` must be called first."""
        return self.model.predict(X)
コード例 #2
0
ファイル: dc161012.py プロジェクト: maybe-jkfirst/Data-Mining
def _five_number_summary(series):
	"""Return one [max, 75th pct, median, 25th pct, min] row per series."""
	return np.array([
		[np.max(s), np.percentile(s, 75), np.median(s), np.percentile(s, 25), np.min(s)]
		for s in series
	])


def _random_series(count, high, min_len, max_len):
	"""Return ``count`` random integer series of varying length as an object array."""
	# The series have different lengths, so they are stored in a 1-D object
	# array; calling np.array() on such ragged input raises ValueError on
	# recent NumPy releases.
	series = np.empty(count, dtype=object)
	for idx in range(count):
		series[idx] = np.random.randint(0, high, size=np.random.randint(min_len, max_len))
	return series


def main():
	"""Fit a one-class SVM on box-plot summaries of random series and score a test set.

	1000 random series (20-149 values in [0, 5000)) are reduced to five
	box-plot statistics each and used to fit ``OneClassSVM``. Ten much longer
	series with a wider value range are then summarised the same way and
	passed to ``predict``.
	"""
	n = 1000
	data = _random_series(n, 5000, 20, 150)

	# making all the data into 5 dimensions
	# howto : boxplot
	# (An alternative pairwise-DTW distance featurisation was sketched here
	# previously as dead code and has been removed.)
	x = _five_number_summary(data)
	y = [0] * len(data)

	# build model with x
	model = OneClassSVM()
	model.fit(x)

	# create test dataset
	test = _random_series(10, 10000, 20000, 30000)

	# transform test dataset
	x = _five_number_summary(test)
	y = [0] * len(test)

	# predict test dataset
	pred = model.predict(x)

	'''
コード例 #3
0
ファイル: svm.py プロジェクト: vishnu-locket/orange3
 def fit(self, X, Y, W):
     """Fit a one-class SVM on ``X`` and wrap it in ``OneClassSVMClassifier``.

     ``Y`` is accepted for interface compatibility but is not used. When
     ``W`` is not None it is flattened and used as per-sample weights.
     """
     clf = OneClassSVM(kernel=self.kernel, degree=self.degree,
                       gamma=self.gamma, coef0=self.coef0, tol=self.tol,
                       nu=self.nu, shrinking=self.shrinking,
                       cache_size=self.cache_size, max_iter=self.max_iter)
     if W is not None:
         # Pass the weights by keyword: the second positional parameter of
         # OneClassSVM.fit is the ignored ``y``, so positional weights would
         # be silently dropped.
         return OneClassSVMClassifier(clf.fit(X, sample_weight=W.reshape(-1)))
     return OneClassSVMClassifier(clf.fit(X))
コード例 #4
0
ファイル: learn.py プロジェクト: cmcneil/openepoc
class Cluster(object):
    """Novelty detector for one named cluster of EmoPacket recordings.

    Raw sessions are converted to feature vectors, reduced with PCA and used
    to train a one-class SVM that judges whether a new bin is novel.
    """

    def __init__(self, name):
        """Create an empty cluster; ``name`` is used as the pickle file stem."""
        self.name = name
        self.raw_dataset = []
        self.dataset = []
        self.dataset_red = []

    def get_featurevec(self, data):
        """Split ``data`` into staggered bins and extract features per bin.

        Takes a sequence of EmoPackets and returns a list holding one
        ``dsp.get_features`` result per bin.
        """
        stride = int(dsp.SAMPLE_RATE * dsp.STAGGER)
        size = int(dsp.BIN_SIZE * dsp.SAMPLE_RATE)
        # Floor division keeps the bin count an int on Python 3 as well;
        # true division would hand ``range`` a float.
        num_bins = len(data) // stride - int(dsp.BIN_SIZE / dsp.STAGGER) + 1
        return [dsp.get_features(data[i * stride:i * stride + size])
                for i in range(num_bins)]

    def add_data(self, raw):
        """Append the feature vectors of ``raw`` (a list of EmoPackets).

        Only ``self.dataset`` is extended; no retraining happens here.
        """
        self.dataset.extend(self.get_featurevec(raw))

    def extract_features(self):
        """Rebuild ``self.dataset`` from every session in ``self.raw_dataset``."""
        self.dataset = []
        for sess in self.raw_dataset:
            self.dataset.extend(self.get_featurevec(sess))

    def reduce_dim(self, NDIM=5):
        """Fit PCA with ``NDIM`` components and store the reduced dataset."""
        X = np.array(self.dataset)
        # NOTE(review): RandomizedPCA is not available in recent scikit-learn
        # releases -- confirm the pinned version before upgrading.
        self.pca = RandomizedPCA(n_components=NDIM).fit(X)
        self.dataset_red = self.pca.transform(X)

    def train(self):
        """Train a fresh one-class SVM on the reduced dataset."""
        self.svm = OneClassSVM()
        self.svm.fit(self.dataset_red)

    def is_novel(self, pt):
        """Classify the first bin of ``pt`` (an array of EmoPackets).

        Returns the SVM prediction for that bin, then adds the bin to the
        reduced dataset and retrains. ``reduce_dim`` and ``train`` must have
        been called beforehand.
        """
        # The original referenced an undefined name ``data`` here; the
        # argument is ``pt``. A single sample is shaped (1, n_features).
        # assumes dsp.get_features returns a flat feature vector -- TODO confirm
        features = np.array(self.get_featurevec(pt)[0]).reshape(1, -1)
        X = self.pca.transform(features)
        ans = self.svm.predict(X)
        # ``dataset_red`` is an ndarray after ``reduce_dim``, so it has no
        # ``append``; stack the new row instead.
        self.dataset_red = np.vstack([self.dataset_red, X])
        self.train()
        return ans

    def save(self):
        """Pickle this object to ``data/<name>.pkl`` next to this module."""
        this_dir, this_filename = os.path.split(__file__)
        DATA_PATH = os.path.join(this_dir, "data", self.name+'.pkl')
        # The context manager closes the file even if pickling fails.
        with open(DATA_PATH, "wb") as dumpfile:
            pickle.dump(self, dumpfile, pickle.HIGHEST_PROTOCOL)
コード例 #5
0
    def runClassifier(self, _driverId, numComponents=0):
        """Fit a one-class SVM on all stored feature vectors and label them.

        Reads ``self.featuresHash`` and stores its keys in ``self.ids``. If
        ``self.runDimRed`` is set, the features first go through
        ``self.dimRed`` with ``numComponents``. ``self.label`` becomes a list
        of 0/1 ints: 1 where the decision score exceeds the
        ``self.outliers_fraction`` percentile of all scores. ``_driverId`` is
        not used.
        """
        # Materialise the dict views: on Python 3 they are not sequences and
        # cannot be passed to the estimator as a sample matrix.
        X = list(self.featuresHash.values())
        self.ids = list(self.featuresHash.keys())
        if self.runDimRed:
            X = self.dimRed(X, numComponents)

        clf = OCSVM(nu=self.nu, gamma=self.gamma)
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred, 100 * self.outliers_fraction)
        # A concrete list (not a lazy ``map`` object) so the labels can be
        # indexed and iterated more than once.
        self.label = [int(flag) for flag in y_pred > threshold]
コード例 #6
0
def select_best_support_vectors(data, nu=0.01, all_gammas=2.0 ** np.arange(-10, 10, 1)):
    """Pick the gamma whose outlier and support-vector fractions are closest to ``nu``.

    For every gamma a ``OneClassSVM`` is fitted on ``data``; the error is the
    sum of squared differences between ``nu`` and (a) the fraction of
    training points predicted as -1 and (b) the fraction of support vectors.

    Returns a tuple ``(best_gamma, all_errors)`` with one error per gamma.
    """
    # The default grid uses a float base: an integer 2 raised to negative
    # integer powers raises ValueError in NumPy when the function is defined.
    sample_count = len(data)
    all_errors = []
    for gamma in all_gammas:
        clf = OneClassSVM(nu=nu, gamma=gamma)
        clf.fit(data)
        prediction = clf.predict(data)
        out_of_class_count = np.sum(prediction == -1)
        support_vectors_count = len(clf.support_vectors_)
        error = (float(out_of_class_count) / sample_count - nu) ** 2
        error += (float(support_vectors_count) / sample_count - nu) ** 2
        all_errors.append(error)
    index = np.argmin(all_errors)
    return all_gammas[index], all_errors
コード例 #7
0
ファイル: embedding.py プロジェクト: gianlucacorrado/EDeN
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix,
                                    y=None,
                                    labels=None,
                                    density_colormap='Blues',
                                    instance_colormap='YlOrRd'):
    """Draw a 2-D scatter of the instances over a one-class-SVM score map.

    The matrix is standardised, a ``OneClassSVM`` is fitted on it and its
    decision scores over a regular mesh are drawn as filled contours. The
    instances are scattered on top, coloured by ``y`` (white when ``y`` is
    None) and, when ``labels`` is given, annotated with ``labels[row]``.
    """
    from sklearn.preprocessing import scale
    low_dimension_data_matrix = scale(low_dimension_data_matrix)
    first_axis = low_dimension_data_matrix[:, 0]
    second_axis = low_dimension_data_matrix[:, 1]

    # Mesh step: the finer of the two axis spans divided into ``step_num``.
    step_num = 50
    h = min((first_axis.max() - first_axis.min()) / step_num,
            (second_axis.max() - second_axis.min()) / step_num)
    # Pad the data extent by ten mesh steps on every side.
    b = h * 10
    x_min = first_axis.min() - b
    x_max = first_axis.max() + b
    y_min = second_axis.min() - b
    y_max = second_axis.max() + b
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    mesh_points = np.c_[xx.ravel(), yy.ravel()]

    # One-class model used as a density estimate.
    from sklearn.svm import OneClassSVM
    gamma = max(x_max - x_min, y_max - y_min)
    clf = OneClassSVM(gamma=gamma, nu=0.1)
    clf.fit(low_dimension_data_matrix)

    # Score every mesh point [x_min, x_max] x [y_min, y_max].
    if hasattr(clf, "decision_function"):
        score_matrix = clf.decision_function(mesh_points)
    else:
        score_matrix = clf.predict_proba(mesh_points)[:, 1]
    # Contour levels span the observed score range.
    levels = np.linspace(min(score_matrix), max(score_matrix), 40)
    score_matrix = score_matrix.reshape(xx.shape)

    if y is None:
        y = 'white'

    plt.contourf(xx, yy, score_matrix,
                 cmap=plt.get_cmap(density_colormap), alpha=0.9, levels=levels)
    plt.scatter(first_axis, second_axis,
                alpha=.5,
                s=70,
                edgecolors='gray',
                c=y,
                cmap=plt.get_cmap(instance_colormap))

    # Text label on each instance.
    if labels is not None:
        for row in range(low_dimension_data_matrix.shape[0]):
            position = (low_dimension_data_matrix[row, 0],
                        low_dimension_data_matrix[row, 1])
            plt.annotate(labels[row], xy=position, xytext=(0, 0),
                         textcoords='offset points')
コード例 #8
0
ファイル: svm.py プロジェクト: bondarchukYV/AD
def svm(data, fraction=0.05, kernel='poly', degree=3, gamma=0, coeff=0):
    """Return the 1-based row numbers of ``data`` flagged as anomalies.

    A ``OneClassSVM`` is fitted on ``data`` with ``nu=fraction`` and the rows
    it predicts as -1 are reported.

    Returns an integer array of shape ``(k, 1)`` holding the 1-based indices
    of the anomalous rows.
    """
    # ``gamma=0`` used to mean "1 / n_features"; scikit-learn now spells that
    # 'auto' and rejects or degenerates on 0.
    model = OneClassSVM(kernel=kernel, degree=degree,
                        gamma=gamma if gamma else 'auto',
                        nu=fraction, coef0=coeff)
    model.fit(data)

    labels = np.ravel(model.predict(data))
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)
    # Keep only the rows the model considers outliers (label -1); inliers
    # (label +1) are dropped with a single boolean mask rather than by
    # deleting rows while iterating.
    return numeration[labels == -1]
コード例 #9
0
ファイル: feature_eng.py プロジェクト: CharlieDaniels/Rally
def outlier_detect(data_frame):
    """Return the ``hourly`` values of the rows a one-class SVM flags as outliers.

    The model is fitted on the five count columns of ``data_frame`` and the
    rows predicted as -1 are selected.
    """
    # pandas to numpy - digestible by scikit
    columns = ['blm_tag_count','protest_count','justice_count','riot_count','breathe_count']
    features = data_frame[columns].values

    clf = OneClassSVM(nu=0.008, gamma=0.05)
    clf.fit(features)
    y_pred = clf.predict(features)

    # Plain boolean mask: wrapping it in a list makes NumPy treat it as a
    # 2-D index, which fails on a 1-D array.
    mask = y_pred == -1
    oak_array = np.asarray(data_frame.hourly)
    return list(oak_array[mask])
コード例 #10
0
def select_best_outlier_fraction_cross_val(data, nu=0.05, all_gammas=2.0 ** np.linspace(-10, 10, 50), folds_count=7):
    """Cross-validate gamma so outlier and support-vector fractions match ``nu``.

    For each gamma the model is fitted on every training fold; the fold error
    is the squared gap between ``nu`` and the test-fold outlier fraction plus
    the squared gap between ``nu`` and the support-vector fraction. Errors are
    averaged over ``folds_count``.

    Returns ``(best_index, all_errors)`` where ``best_index`` is the position
    in ``all_gammas`` with the smallest mean error.
    """
    # The default grid uses a float base and 50 log-spaced exponents: the
    # earlier ``2 ** np.arange(-10, 10, 50)`` produced a single exponent and
    # raised ValueError (integer to a negative integer power) in NumPy.
    all_errors = []
    kf_iterator = KFold(len(data), n_folds=folds_count)
    for gamma in all_gammas:
        error = 0
        for train, test in kf_iterator:
            train_data = data[train,:]
            test_data = data[test,:]
            clf = OneClassSVM(nu=nu, gamma=gamma)
            clf.fit(train_data)
            prediction = clf.predict(test_data)
            outlier_fraction = np.mean(prediction == -1)
            # NOTE(review): the support-vector count is divided by len(data),
            # not by the training-fold size -- confirm this is intended.
            error += (nu - outlier_fraction) ** 2 + (float(clf.support_vectors_.shape[0]) / len(data) - nu) ** 2
        all_errors.append(error / folds_count)
    # Select over the per-gamma errors; taking argmin of the last scalar
    # ``error`` always returned 0.
    best_index = np.argmin(all_errors)
    return int(best_index), all_errors
コード例 #11
0
class OneClassSVMDetector(BaseOutlier):
    """Outlier detector that fits a one-class SVM on a single series of values."""

    @staticmethod
    def get_attributes():
        """Return the default value (or list of choices) for each attribute."""
        # NOTE(review): 'rbf' appears twice in the kernel list -- presumably
        # the first entry marks the default; confirm against the consumer.
        return {
            "nu":0.1,
            "kernel":['rbf','linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
            "gamma":0.1,
        }

    def __init__(self,nu=0.1,kernel='rbf',gamma=0.1):
        """Store the SVM hyper-parameters; no model is built until ``fit``."""
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma

    def fit(self,data=None):
        """Fit the SVM on ``data`` treated as a single feature column.

        ``data`` is kept on ``self.data``. pandas input is interpolated in
        place before fitting. Returns ``self``.
        """
        self.data = data
        self.check_finite(data)
        if self._is_using_pandas(data):
            self.data.interpolate(inplace=True)
        self.clf = OneClassSVM(nu=self.nu, kernel=self.kernel, gamma=self.gamma)
        # np.asarray makes this work for pandas objects too, which do not
        # provide ``reshape``.
        self.clf.fit(np.asarray(data).reshape(-1, 1))
        return self

    def predict(self, X_test):
        """Return a DataFrame of the points predicted as outliers (-1).

        Timestamps and values are looked up in ``self.data`` (the data given
        to ``fit``) at the positions flagged in ``X_test``. The index and
        values are also stored on ``self.anomaly_idx`` and ``self.anom_val``.
        """
        y_pred = self.clf.predict(np.asarray(X_test).reshape(-1, 1))

        # Flat integer positions; np.where would return a 1-tuple instead.
        outlier_idx = np.flatnonzero(y_pred == -1)
        d = {
            'timestamp': self.data.index[outlier_idx],
            'anoms': self.data.iloc[outlier_idx]
        }
        anoms = pd.DataFrame(d)
        self.anomaly_idx = anoms.index
        self.anom_val = anoms['anoms']
        return anoms

    def fit_predict(self, data=None):
        """Fit on ``data`` and return the anomalies found in the same data."""
        self.fit(data)
        return self.predict(data)

    def plot(self):
        """Plot the fitted data with detected anomalies in red; return the figure."""
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(1, 1)
        ax.plot(self.data, 'b')
        ax.plot(self.anomaly_idx, self.anom_val, 'ro')
        ax.set_title('Detected Anomalies')
        ax.set_ylabel('Count')
        f.tight_layout()
        return f
コード例 #12
0
def cross_validate():
    """Train a one-class SVM on TARGET == 0 rows and evaluate it on a balanced hold-out.

    Reads ``./data/train.csv``, holds out as many TARGET == 0 rows as there
    are TARGET == 1 rows, trains on the remaining zeros and prints the
    confusion matrix and accuracy on the hold-out plus all ones.
    """
    #for tinkering with the model
    #read data
    all_df = pd.read_csv('./data/train.csv',index_col = 'ID')

    #split data
    zeros_df = all_df[all_df.TARGET == 0]
    ones_df = all_df[all_df.TARGET == 1]

    # Random mask selecting exactly len(ones_df) zero rows for testing.
    num_ones = ones_df.shape[0]
    msk = np.random.permutation(len(zeros_df)) < num_ones

    train_df = zeros_df[~msk]
    test_df = pd.concat([zeros_df[msk], ones_df])

    train_X = np.array(train_df.drop('TARGET', axis = 1))

    test_X = np.array(test_df.drop('TARGET',axis = 1))
    test_Y = np.array(test_df.TARGET) #true target values

    #init svm
    print('training svm')
    my_svm = OneClassSVM(verbose = True)
    my_svm.fit(train_X)

    #predict
    print('predicting')
    raw_predictions = my_svm.predict(test_X)
    # The SVM answers +1 (inlier) / -1 (outlier). It was trained on
    # TARGET == 0 rows, so map inliers to 0 and outliers to 1 before
    # comparing with the true targets.
    predictions = np.where(raw_predictions == 1, 0, 1)

    conf_matrix = confusion_matrix(test_Y, predictions, labels=[0, 1])
    print('confusion matrix:')
    print(pd.DataFrame(conf_matrix,columns = [0,1]))

    print('accuracy:')
    print(np.mean(test_Y == predictions))
コード例 #13
0
  def find_anomaly(label1, label2, winsize):
    """Print the 20 windows of channel ``label1 - label2`` with the most extreme scores.

    The difference of the two electrodes is split into windows, six summary
    features are computed per window and a polynomial one-class SVM is
    fitted on them. Windows are ranked by the absolute value of their
    decision score.
    """
    print("Find anomaly in channel", label1 + '-' + label2 + '...', file=sys.stderr)
    print("-"*80)
    print("Channel [" + label1 + '-' + label2 + ']')
    print("-"*80)

    # find difference
    electrode1 = eeg.chan_lab.index(label1)
    electrode2 = eeg.chan_lab.index(label2)
    wave = eeg.X[electrode1] - eeg.X[electrode2]

    print("Splitting into windows...", file=sys.stderr)
    # array_split needs an integer section count (true division yields a
    # float) and at least one section.
    window_count = max(1, int(len(wave) / eeg.sample_rate / winsize))
    wave_windows = np.array_split(wave, window_count)

    print("Extracting features...", file=sys.stderr)
    def extract_features(wave_window):
      """Return [max, min, std, sum, sum of positives, sum of absolutes]."""
      window = np.asarray(wave_window)
      return [window.max(), window.min(), np.std(window), window.sum(),
              window[window > 0].sum(), np.abs(window).sum()]

    # A list comprehension, not ``map``: on Python 3 np.array(map(...))
    # wraps the iterator object instead of building the feature matrix.
    Examples = np.array([extract_features(window) for window in wave_windows])

    print("Training model, assuming no more than", CONTAMINATION, "anomaly...", file=sys.stderr)
    od = OneClassSVM(nu=CONTAMINATION, kernel='poly', gamma=0.05, max_iter=100000)
    od.fit(Examples)

    # Flatten so the code works whether the scores come back 1-D or (n, 1).
    decisions = np.ravel(od.decision_function(Examples))

    print("Most likely windows with anomaly:")
    # find most likely windows, in desc order
    largest_indices = np.argsort(-np.absolute(decisions))[:20]
    for large_index in largest_indices:
      print(large_index*winsize/60, "min (score:", decisions[large_index], ")")

    sys.stdout.flush()
コード例 #14
0
ファイル: cluster.py プロジェクト: sandialabs/BioCompoundML
 def determine_test_similarity(self, model):
     """Fit one RBF one-class SVM per entry of ``model``.

     ``model`` is indexed with the integers ``0 .. len(model) - 1``.
     Returns two dicts keyed by that index: the estimator objects and the
     values returned by their ``fit`` calls.
     """
     estimators = {}
     fit_results = {}
     for position in range(len(model)):
         detector = OneClassSVM(kernel='rbf', nu=0.1, gamma=.023)
         estimators[position] = detector
         fit_results[position] = detector.fit(model[position])
     return estimators, fit_results
コード例 #15
0
ファイル: engshut_utils.py プロジェクト: papart/mypm
def plot_scatter(X_dict, y_dict, col1, col2, max_error, max_filled_gap, insens,
        f_colors = ['yellow', 'red', 'blue'], nu=0.98, high=0.95):
    """Scatter ``col1`` against ``col2`` for all planes and overlay decision aids.

    Points selected by ``get_mask_norm`` are drawn in light grey; points
    selected by ``get_mask_pref`` are drawn per plane (planes with at least
    one failure) in ``f_colors``. Green lines mark the upper quantile
    thresholds from ``QuantileBinarizer`` and a red line is drawn from the
    linear one-class SVM coefficients.

    ``nu`` is the one-class SVM ``nu`` and ``high`` the upper quantile of the
    binarizer. ``max_filled_gap`` is accepted but not used.
    """
    planes = sorted(X_dict.keys())
    planes_with_failures = sorted(key for key in X_dict.keys() if y_dict[key].sum() > 0)

    # Honour the ``nu`` / ``high`` arguments; they were previously ignored
    # in favour of hard-coded copies of their default values.
    ocsvm = OneClassSVM(kernel='linear', nu=nu)
    X_train = pd.concat({plane: X_dict[plane][[col1, col2]].dropna()
                         for plane in planes_with_failures})
    ocsvm.fit(X_train.values)

    qb = QuantileBinarizer(low=0.0, high=high, each_side=False)
    qb.fit(X_train)

    mask_pref = pd.concat(
        {plane: get_mask_pref(y_dict[plane], max_error) for plane in planes}, axis=0)
    mask_norm = pd.concat(
        {plane: get_mask_norm(y_dict[plane], max_error, insens) for plane in planes}, axis=0)

    fig = plt.figure(figsize=(15,15), dpi=100)
    plt.xlabel(col1)
    plt.ylabel(col2)

    # Concatenate once instead of once per axis.
    all_points = pd.concat(X_dict)
    plot_norm = plt.scatter(all_points[col1].loc[mask_norm],
                            all_points[col2].loc[mask_norm],
                            c='lightgrey', zorder=1, s=6)
    plot_pref = []
    for i, plane in enumerate(planes_with_failures):
        plane_mask = get_mask_pref(y_dict[plane], max_error)
        # Cycle through the colours so more planes than colours cannot
        # raise IndexError.
        plot_pref.append(plt.scatter(X_dict[plane][col1].loc[plane_mask],
                                     X_dict[plane][col2].loc[plane_mask],
                                     c=f_colors[i % len(f_colors)], zorder=2, s=30))
    x_min, x_max, y_min, y_max = plt.axis('tight')

    plt.axvline(qb._thresholds[col1]['high'], c='green')
    plt.axhline(qb._thresholds[col2]['high'], c='green')
    # NOTE(review): the line uses (intercept - w0 * x) / w1; confirm the
    # intercept sign convention of the installed scikit-learn.
    w0, w1 = ocsvm.coef_[0][0], ocsvm.coef_[0][1]
    plot_line = plt.plot([x_min, x_max],
                         [(ocsvm.intercept_ - w0 * x_min) / w1,
                          (ocsvm.intercept_ - w0 * x_max) / w1],
                         c='red')

    # # plt.legend((plot_norm, plot_pref), ('No-failure', 'Pre-failure'),
    # #            scatterpoints=1, loc='upper right', ncol=1)
    # #plt.savefig('./scatter/pair_group_of_fours3.png')
コード例 #16
0
ファイル: AnomalyDetector.py プロジェクト: NcoderA/518Project
 def predict_header_features(self, pkt_featurizer):
     """Score one packet's header features against its group's training data.

     The group's stored feature dicts are vectorised and scaled (without
     centring), a one-class SVM is fitted on them and the packet's own
     features are classified. Returns ``(result, distance)``: the SVM
     prediction and decision value, or ``(0, 0)`` when a KeyError is raised
     while doing so.
     """
     group_id = pkt_featurizer.pkt_type
     features = pkt_featurizer.features
     arrival_time = pkt_featurizer.arrival_time
     try:
         dict_vectorizer = DictVectorizer()
         dict_vectorizer.fit(self.training_data[group_id])
         train_matrix = dict_vectorizer.transform(self.training_data[group_id])
         packet_matrix = dict_vectorizer.transform(features)

         std_scaler = preprocessing.StandardScaler(with_mean=False)
         train_matrix = std_scaler.fit_transform(train_matrix)
         packet_matrix = std_scaler.transform(packet_matrix)

         detector = OneClassSVM()
         detector.fit(train_matrix)
         result = detector.predict(packet_matrix)
         distance = detector.decision_function(packet_matrix)
     except KeyError:
         result, distance = 0, 0
     return result, distance
コード例 #17
0
class TwoStage(object):
    """Classifier that relabels samples outside the known data as "zother".

    A one-class SVM decides whether a sample resembles the training data; a
    random forest assigns the class. Samples the SVM rejects are labelled
    "zother".
    """

    def __init__(self, *args, **kwargs):
        """Build the scaler, the one-class gate and the forest classifier."""
        super(TwoStage, self).__init__(*args, **kwargs)
        self._oneCls = OneClassSVM(nu=NU, gamma=GAMMA)
        self._clf = RandomForestClassifier(n_estimators=30)
        self._scaler = StandardScaler()

    def fit(self, data, labels):
        """Fit the scaler, the one-class gate and the classifier; return ``self``."""
        sdata = self._scaler.fit_transform(data)
        self._oneCls.fit(sdata)
        self._clf.fit(sdata, labels)
        return self

    def predict(self, data):
        """Return ``(labels, classes)`` for ``data``.

        ``labels`` holds the forest's prediction per sample, replaced by
        "zother" where the one-class SVM predicts -1. ``classes`` is the
        forest's class list with "zother" appended.
        """
        sdata = self._scaler.transform(data)
        is_known_cls = self._oneCls.predict(sdata)
        # Object dtype stores "zother" whole: a fixed-width string array
        # would silently truncate it and a numeric array would reject it.
        cls = self._clf.predict(sdata).astype(object)
        cls[is_known_cls == -1] = "zother"
        classes = list(self._clf.classes_) + ["zother"]
        return cls, classes
コード例 #18
0
ファイル: estimators.py プロジェクト: Patechoc/labs-untested
class NoveltySeparator(BaseEstimator):
    """Regressor that routes samples through a novelty detector.

    A one-class SVM splits samples into inliers and outliers and a separate
    ``Ridge`` model is trained and applied for each group.
    """

    def get_params(self, deep=True):
        """Return an empty dict: this estimator exposes no parameters."""
        return {}

    def fit(self, X, y):
        """Fit the detector and both per-group regressors; return ``self``."""
        # lets treat users spending something in the rest of the month as outliers
        inliers = (y - X[:, 0]) < 0.1

        self.detector = OneClassSVM(nu=0.05, cache_size=2000, verbose=True)

        # training only on inliers
        print("Training detector")
        self.detector.fit(X[inliers])
        results = self.detector.predict(X).reshape(X.shape[0])
        # From here on the groups are the detector's predictions, not the
        # heuristic above.
        inliers = results == 1
        outliers = results == -1

        print("Training estimators")
        self.est_inliers = Ridge(alpha=0.05)
        self.est_outliers = Ridge(alpha=0.05)
        self.est_inliers.fit(X[inliers], y[inliers])
        # Fit the outlier model on the outliers; the inlier model used to be
        # refitted here, leaving ``est_outliers`` untrained.
        self.est_outliers.fit(X[outliers], y[outliers])
        return self

    def predict(self, X):
        """Predict with the inlier or outlier regressor as chosen by the detector."""
        y = np.zeros(X.shape[0])

        labels = self.detector.predict(X).reshape(X.shape[0])
        inliers = labels == 1
        outliers = labels == -1

        # Skip empty groups: the regressors reject zero-sample input.
        if inliers.any():
            y[inliers] = self.est_inliers.predict(X[inliers])
        if outliers.any():
            y[outliers] = self.est_outliers.predict(X[outliers])

        return y
コード例 #19
0
ファイル: AnomalyDetector.py プロジェクト: NcoderA/518Project
 def predict_pkt_length_features(self, pkt_featurizer):
     """Return whether DBSCAN treats the packet's length as noise for its group.

     The packet length is appended to the group's stored lengths and
     clustered with DBSCAN; the result is True when the new point gets the
     noise label (-1). A one-class SVM is also fitted on the scaled stored
     lengths, but its prediction is only used for plotting. On KeyError or
     IndexError the exception is printed and 0 is returned.
     """
     group_id = pkt_featurizer.pkt_type
     try:
         dbscan = DBSCAN()
         pkt_lengths = np.array(list(self.pkt_lengths[group_id])+[pkt_featurizer.len_bytes]).reshape(-1,1)
         labels = dbscan.fit_predict(pkt_lengths)
         # The packet under test is the last point that was appended.
         dbscan_prediction = labels[-1] == -1
         if self.plot:
             self.plot_1d_dbscan(pkt_lengths, labels, range(len(pkt_lengths)), self.pkt_lengths_fig_dbscan,
                                 "", "Pkt Length", "Pkt Length DBSCAN Clustering - Anomalous Pkts in Black")
         one_class_svm = OneClassSVM()
         scaler = preprocessing.StandardScaler()
         pkt_lengths_scaled = scaler.fit_transform(np.array(self.pkt_lengths[group_id]).reshape(-1,1))
         features_scaled = scaler.transform(np.array(pkt_featurizer.len_bytes).reshape(1,-1))
         one_class_svm.fit(pkt_lengths_scaled)
         svm_prediction = one_class_svm.predict(features_scaled)
         if self.plot and len(pkt_lengths_scaled) > 2:
             self.plot_1d_svm(self.pkt_lengths[group_id], one_class_svm, range(len(self.pkt_lengths[group_id])), scaler, self.pkt_lengths_fig_svm,
                              "Pkt", "Pkt Length", "Pkt Length One Class SVM Classification")
     except (KeyError, IndexError) as e:
         # Function-call form prints the same text on Python 2 and is valid
         # syntax on Python 3.
         print(e)
         dbscan_prediction = 0
     return dbscan_prediction
コード例 #20
0
def slice_probability_space_selection(data, nu=0.05, all_gammas=2 ** np.linspace(-10, 10, 50),
    rho=0.05, outlier_distribution = np.random.rand, folds_count=7):
    """Score each gamma by cross-validated inlier and synthetic-outlier error.

    For every gamma and fold a ``OneClassSVM`` is fitted on the training
    fold. The fold error adds the share of test points predicted as -1 and
    the share of synthetic outliers (drawn from ``outlier_distribution``,
    centred and scaled by ``8 * std(data)``) predicted as +1, each weighted
    with ``rho`` as in the expressions below.

    Returns ``(index, all_errors)``: the position of the smallest mean error
    in ``all_gammas`` and the list of mean errors.
    """
    folds = KFold(len(data), n_folds=folds_count)
    sample_count = len(data)
    all_errors = []
    for gamma in all_gammas:
        total = 0.0
        for train, test in folds:
            detector = OneClassSVM(nu=nu, gamma=gamma)
            detector.fit(data[train,:])

            # Share of held-out points rejected by the model.
            rejected = np.mean(detector.predict(data[test,:]) == -1)
            inlier_term = rejected / (1 + rho) / sample_count

            # Synthetic outliers spread around the origin.
            outliers = outlier_distribution(*data.shape) - 0.5
            outliers *= 8 * np.std(data)
            accepted = np.mean(detector.predict(outliers) == 1)
            outlier_term = accepted * rho / (1 + rho) / len(outliers)

            total += inlier_term + outlier_term
        all_errors.append(total / folds_count)
    return int(np.argmin(all_errors)), all_errors
コード例 #21
0
	def remove_outliers_SVM(self):
		"""Drop the rows a one-class SVM scores below its decision boundary.

		An RBF ``OneClassSVM`` is fitted on ``self.DataArray``; rows with a
		negative decision score are removed from both ``self.DataArray`` and
		``self.TargetArray``. The data shape is printed before and after.
		"""
		print("Running SVM to remove outliers...")

		svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1)
		svm.fit(self.DataArray)
		# Flatten so the comparison works whether scores are 1-D or (n, 1).
		decision = np.ravel(svm.decision_function(self.DataArray))

		# If a value is below the decision hyperplane, eliminate it
		_indices = np.flatnonzero(decision < 0)
		print(self.DataArray.shape)
		self.DataArray = np.delete(self.DataArray, _indices, axis=0)
		self.TargetArray = np.delete(self.TargetArray, _indices, axis=0)
		print(self.DataArray.shape)
コード例 #22
0
	def oneClass(self):
		"""Fit a default ``OneClassSVM`` on ``self.arr`` and return its predictions.

		The predictions for ``self.arr`` were previously computed and thrown
		away; they are now returned. Callers that ignore the return value are
		unaffected.
		"""
		model = OneClassSVM()
		model.fit(self.arr)
		return model.predict(self.arr)
コード例 #23
0
ファイル: Training.py プロジェクト: su18/Marcus
    def get_model(self):
        """Grid-search nu/gamma for a one-class SVM and pickle the best-F1 model.

        Reads the training, non-XSS and XSS sample files, evaluates every
        (gamma, nu) combination of the grid on the two test sets, prints
        precision/recall/F1 for each, then refits with the parameters that
        gave the highest F1, writes the model to
        ``ModuleTrain/cache/model.pkl`` under ``self.root_path`` and sets
        ``self.complete`` to True.
        """
        start = datetime.now()
        print("Start at {}".format(start.strftime("%Y/%m/%d %H:%M:%S")))
        train_example = []
        xss_example = []
        non_xss_example = []

        # Read the training set (curated XSS payloads)
        self.read_txt(self.train_path, train_example)
        # Read the normal (benign) request samples
        self.read_txt(self.test_none_xss_path, non_xss_example)
        # Read the attack request samples
        self.read_txt(self.test_xss_path, xss_example)
        # Vectorize the training samples
        # NOTE(review): ``fit_vector`` is read as an attribute and
        # ``train_example`` is never passed to the vectorizer here --
        # presumably TfIdfVector loads the training data itself; confirm.
        tf_idf_vector = TfIdfVector()
        train_vector = tf_idf_vector.fit_vector
        # Vectorize the attack and benign test samples
        # ("normal" here is the class the model is trained on: XSS payloads)
        test_normal_vector = tf_idf_vector.transform(xss_example)
        test_abnormal_vector = tf_idf_vector.transform(non_xss_example)
        y = [1] * (len(train_example))
        # Grid of nu and gamma values to sweep
        grid = {
            'gamma': np.logspace(-8, 1, 10),
            'nu': np.linspace(0.01, 0.20, 20)
        }
        # Kernel function (rbf, linear, poly)
        kernel = 'rbf'
        # Best precision, recall and F1 values recorded so far
        max_F1 = 0
        max_Re = 0
        max_Pr = 0
        # gamma at which the best precision, recall and F1 were reached
        gamma_r_F1 = 0.01
        gamma_r_Re = 0.01
        gamma_r_Pr = 0.01
        # nu at which the best precision, recall and F1 were reached
        nu_r_F1 = 0
        nu_r_Re = 0
        nu_r_Pr = 0
        svdd = OneClassSVM(kernel=kernel)
        # zero_count / re_gamma: after four zero-score results recorded for
        # the same gamma, the remaining nu values for that gamma are skipped.
        zero_count = 0
        re_gamma = 0
        total_loop = len(ParameterGrid(grid))
        process_count = 0
        for z in ParameterGrid(grid):
            process_count += 1
            if re_gamma == z.get('gamma'):
                if zero_count >= 4:
                    continue
            else:
                zero_count = 0
            svdd.set_params(**z)
            svdd.fit(train_vector, y)
            k = svdd.get_params()

            # Evaluate on attack request samples
            f = svdd.predict(test_normal_vector)
            TP = f.tolist().count(1)  # True positive
            FN = f.tolist().count(-1)  # False Negative

            # Evaluate on non-attack samples
            f = svdd.predict(test_abnormal_vector)
            FP = f.tolist().count(1)  # False positive
            Precision = 0 if TP == 0 else (TP / (TP + FP))  # Precision
            Recall = 0 if TP == 0 else (TP / (TP + FN))  # Recall
            if Recall == 0 or Precision == 0:
                F1_score = 0
                zero_count += 1
                re_gamma = k.get('gamma')
            else:
                F1_score = 2 * Precision * Recall / (Precision + Recall
                                                     )  # F1 value

            if F1_score > max_F1:
                max_F1 = F1_score
                nu_r_F1 = k.get('nu')
                gamma_r_F1 = k.get('gamma')

            if Recall > max_Re:
                max_Re = Recall
                nu_r_Re = k.get('nu')
                gamma_r_Re = k.get('gamma')

            if Precision > max_Pr:
                max_Pr = Precision
                nu_r_Pr = k.get('nu')
                gamma_r_Pr = k.get('gamma')

            print(
                "========================== [{}] ===========================".
                format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))
            print(
                "nu: ",
                k.get('nu'),
                'gamma',
                k.get('gamma'),
            )
            print("Precision: {}%".format(Precision * 100))
            print("Recall: {}%".format(Recall * 100))
            print("F1 score: {}".format(F1_score))
        print("========================== [{}] ===========================".
              format(datetime.now().strftime("%Y/%m/%d %H:%M:%S")))

        print(
            "MAX Precision:  {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Pr, nu_r_Pr, gamma_r_Pr))
        print(
            "MAX Recall:     {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_Re, nu_r_Re, gamma_r_Re))
        print(
            "MAX F1:         {:^20.6f}When Current nu: {:^20.6f} and gamma: {:0.8f}"
            .format(max_F1, nu_r_F1, gamma_r_F1))
        total_second = datetime.now() - start
        print("Cost {}s.".format(total_second.total_seconds()))
        # Refit with the best-F1 parameters and persist the model.
        # NOTE(review): if no combination ever scored F1 > 0, nu_r_F1 is
        # still its initial 0 here -- confirm the estimator accepts that.
        with open(os.path.join(self.root_path, "ModuleTrain/cache/model.pkl"),
                  'wb') as file:
            svdd.set_params(kernel=kernel, nu=nu_r_F1, gamma=gamma_r_F1)
            svdd.fit(train_vector, y)
            pickle.dump(svdd, file)
        self.complete = True
コード例 #24
0
ファイル: novelty_detection.py プロジェクト: johnrobinsn/mHTM
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
	ntrials=10, verbose=True, seed=123456789):
	"""
	Run a single experiment, locally.
	
	@param pct_noise: The percentage of noise to add to the dataset.
	
	@param noverlap_bits: The number of bits the base class should overlap
	with the novelty class.
	
	@param exp_name: The name of the experiment.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param verbose: If True print the results.
	
	@param seed: The random seed to use.
	
	@return: A tuple containing the percentage errors for the SP's training
	and testing results and the SVM's training and testing results,
	respectively.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
	clf_th = 0.5
	log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
		'novelty_experiments', exp_name)
	
	# Configure the SP
	config = {
		'ninputs': 100,
		'trim': 1e-4,
		'disable_boost': True,
		'seed': seed,
		'pct_active': None,
		'random_permanence': True,
		'pwindow': 0.5,
		
		'global_inhibition': True,
		
		'ncolumns': 200,
		'nactive': 50,
		
		
		'nsynapses': 75,
		'seg_th': 15,
		
		'syn_th': 0.5,
		
		'pinc': 0.001,
		'pdec': 0.001,
		
		'nepochs': 10,
		
		'log_dir': log_dir
	}
	
	# Seed numpy
	np.random.seed(seed)
	
	# Create the base dataset
	x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
	x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]
	
	# Create the outlier dataset
	base_indexes = set(np.where(x_ds.base_class == 1)[0])
	choices = [x for x in xrange(nbits) if x not in base_indexes]
	outlier_base = np.zeros(nbits, dtype='bool')
	outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
		False)] = 1
	outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
	y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
	y_te = y_ds.data
	
	if verbose:
		print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
			* x_ds.base_class.astype('i')).sum() / 40.)
		print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
			outlier_base.astype('i')).sum() / 40.)
		print 'Overlap between two classes: {0}'.format(np.dot(
			x_ds.base_class.astype('i'), outlier_base.astype('i')))
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te = metrics.compute_uniqueness(y_te)
	o_y_te = metrics.compute_overlap(y_te)
	c_y_te = 1 - metrics.compute_distance(y_te)
	
	# Initialize the overall results
	sp_x_results = np.zeros(ntrials)
	sp_y_results = np.zeros(ntrials)
	svm_x_results = np.zeros(ntrials)
	svm_y_results = np.zeros(ntrials)
	
	# Iterate across the trials:
	for i in xrange(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = sp.predict(y_te)
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
		o_sp_y_te = metrics.compute_overlap(sp_y_te)
		c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
		sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
		sp._log_stats('Input Novelty Class Test Correlation', c_y_te)	
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
		sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
		sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)
		
		# Print the results
		fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{5:2.4f}'
		if verbose:
			print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
			print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
				u_sp_x_te, u_sp_y_te)
			print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr, o_sp_x_te,
				o_sp_y_te)
			print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te, c_sp_x_tr,
				c_sp_x_te, c_sp_y_te)
		
		# Get average representation of the base class
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp_base_to_y_te = 0.
		o_sp_base_to_y_te = 0.
		c_sp_base_to_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the sums
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
			o_sp_base_to_y_te += metrics.compute_overlap(yt)
			c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		u_sp_base_to_y_te /= ntest
		o_sp_base_to_y_te /= ntest
		c_sp_base_to_y_te /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		sp._log_stats('Base Train to Novelty Test Uniqueness',
			u_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Correlation',
			c_sp_base_to_y_te)
		
		# Print the results
		if verbose:
			print '\nDescription\tx_tr->x_te\tx_tr->y_te'
			print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
				u_sp_base_to_y_te)
			print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
				o_sp_base_to_y_te)
			print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te,
				c_sp_base_to_y_te)
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
			100
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the accuracy
			xo = metrics.compute_overlap(xt)
			yo = metrics.compute_overlap(yt)
			if xo >= clf_th: clf_x_te += 1
			if yo < clf_th: clf_y_te += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		sp_x_results[i] = 100 - clf_x_te
		sp_y_results[i] = 100 - clf_y_te
		svm_x_results[i] = 100 - svm_x_te
		svm_y_results[i] = 100 - svm_y_te
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SP % Correct Novelty Class', clf_y_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		sp._log_stats('SVM % Correct Novelty Class', svm_y_te)
		
		# Print the results
		if verbose:
			print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
			print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
			print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
			print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results
コード例 #25
0
ファイル: bench_ocsvm.py プロジェクト: ngoix/OCRF
            # X = X[indices]
            # y = y[indices]

            X_train = X[:n_samples_train, :]
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('OneClassSVM processing...')
            model = OneClassSVM(cache_size=500)
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower,the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if fit_time + predict_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]
        if run_lof_svm == 0:
            lof_scores = iso_scores
            osvm_scores = iso_scores
        elif j == 0:
            print('\n******LOF*******\n')
            start = time.time()
            lof = LocalOutlierFactor()
            lof.fit(X)
            end = time.time()
            time_all[j, 1] = end - start
            lof_scores = lof.negative_outlier_factor_

            print('\n******1-class SVM*******\n')
            start = time.time()
            osvm = OneClassSVM(kernel='rbf')
            osvm.fit(X)
            end = time.time()
            time_all[j, 2] = end - start
            osvm_scores = osvm.score_samples(X)

        print('\n******Our Algo*******\n')
        start = time.time()
        #n_samples = int(t1/50)
        n_samples = 100
        kwargs = {
            'max_depth': 10,
            'n_trees': 50,
            'max_samples': n_samples,
            'max_buckets': 3,
            'epsilon': 0.1,
            'sample_axis': 1,
コード例 #27
0
ファイル: main.py プロジェクト: shanyaanand/OCSVM
import numpy as np
from utils import Get_training_data, Get_testing_data
from sklearn.decomposition import KernelPCA
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt

# Load the training data and prepare an 8-component RBF kernel PCA.
X = Get_training_data()
transformer = KernelPCA(n_components=8, kernel='rbf')

# Project every element of X and pool the projected rows into one list.
# NOTE(review): the KernelPCA is re-fitted on each element of X, so every
# element gets its own projection -- confirm this is intended.
X_pca = []
for x in X:
    print(np.array(x).shape)
    X_pca.extend(transformer.fit_transform(x))
X_pca = np.array(X_pca)
print(X_pca.shape)

# Visual check of the first two projected components.
plt.scatter(X_pca[:, 0], X_pca[:, 1], label="train data")
plt.show()

# Fit the one-class SVM on the pooled projections and show its predictions
# for those same training points.
clf = OneClassSVM(gamma='auto')
clf.fit(X_pca)
print(clf.predict(X_pca))
コード例 #28
0
def base_experiment(pct_noise=0.15, noverlap_bits=0, exp_name='1-1',
	ntrials=10, verbose=True, seed=123456789):
	"""
	Run a single experiment, locally.
	
	@param pct_noise: The percentage of noise to add to the dataset.
	
	@param noverlap_bits: The number of bits the base class should overlap
	with the novelty class.
	
	@param exp_name: The name of the experiment.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param verbose: If True print the results.
	
	@param seed: The random seed to use.
	
	@return: A tuple containing the percentage errors for the SP's training
	and testing results and the SVM's training and testing results,
	respectively.
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
	clf_th = 0.5
	log_dir = os.path.join(os.path.expanduser('~'), 'scratch',
		'novelty_experiments', exp_name)
	
	# Configure the SP
	config = {
		'ninputs': 100,
		'trim': 1e-4,
		'disable_boost': True,
		'seed': seed,
		'pct_active': None,
		'random_permanence': True,
		'pwindow': 0.5,
		
		'global_inhibition': True,
		
		'ncolumns': 200,
		'nactive': 50,
		
		
		'nsynapses': 75,
		'seg_th': 15,
		
		'syn_th': 0.5,
		
		'pinc': 0.001,
		'pdec': 0.001,
		
		'nepochs': 10,
		
		'log_dir': log_dir
	}
	
	# Seed numpy
	np.random.seed(seed)
	
	# Create the base dataset
	x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
	x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]
	
	# Create the outlier dataset
	base_indexes = set(np.where(x_ds.base_class == 1)[0])
	choices = [x for x in xrange(nbits) if x not in base_indexes]
	outlier_base = np.zeros(nbits, dtype='bool')
	outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
		False)] = 1
	outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
	y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
	y_te = y_ds.data
	
	if verbose:
		print "\nBase class' test noise: {0:2.2f}".format(1 - (np.mean(x_te, 0)
			* x_ds.base_class.astype('i')).sum() / 40.)
		print "Outlier's class noise: {0:2.2f}".format(1 - (np.mean(y_te, 0) *
			outlier_base.astype('i')).sum() / 40.)
		print 'Overlap between two classes: {0}'.format(np.dot(
			x_ds.base_class.astype('i'), outlier_base.astype('i')))
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te = metrics.compute_uniqueness(y_te)
	o_y_te = metrics.compute_overlap(y_te)
	c_y_te = 1 - metrics.compute_distance(y_te)
	
	# Initialize the overall results
	sp_x_results = np.zeros(ntrials)
	sp_y_results = np.zeros(ntrials)
	svm_x_results = np.zeros(ntrials)
	svm_y_results = np.zeros(ntrials)
	
	# Iterate across the trials:
	for i in xrange(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		config['log_dir'] = '{0}-{1}'.format(log_dir, i + 1)
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = sp.predict(y_te)
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
		o_sp_y_te = metrics.compute_overlap(sp_y_te)
		c_sp_y_te = 1 - metrics.compute_distance(sp_y_te)
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
		sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
		sp._log_stats('Input Novelty Class Test Correlation', c_y_te)	
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
		sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)
		sp._log_stats('SP Novelty Class Test Correlation', c_sp_y_te)
		
		# Print the results
		fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{5:2.4f}'
		if verbose:
			print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
			print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
				u_sp_x_te, u_sp_y_te)
			print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr, o_sp_x_te,
				o_sp_y_te)
			print fmt_s.format('Correlation', c_x_tr, c_x_te, c_y_te, c_sp_x_tr,
				c_sp_x_te, c_sp_y_te)
		
		# Get average representation of the base class
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp_base_to_y_te = 0.
		o_sp_base_to_y_te = 0.
		c_sp_base_to_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the sums
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
			o_sp_base_to_y_te += metrics.compute_overlap(yt)
			c_sp_base_to_y_te += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		u_sp_base_to_y_te /= ntest
		o_sp_base_to_y_te /= ntest
		c_sp_base_to_y_te /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		sp._log_stats('Base Train to Novelty Test Uniqueness',
			u_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)
		sp._log_stats('Base Train to Novelty Test Correlation',
			c_sp_base_to_y_te)
		
		# Print the results
		if verbose:
			print '\nDescription\tx_tr->x_te\tx_tr->y_te'
			print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(u_sp_base_to_x_te,
				u_sp_base_to_y_te)
			print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(o_sp_base_to_x_te,
				o_sp_base_to_y_te)
			print 'Correlation:\t{0:2.4f}\t{1:2.4f}'.format(c_sp_base_to_x_te,
				c_sp_base_to_y_te)
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
			100
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = 0.
		for x, y in zip(sp_x_te, sp_y_te):
			# Refactor
			xt = np.vstack((sp_base_result, x))
			yt = np.vstack((sp_base_result, y))
			
			# Compute the accuracy
			xo = metrics.compute_overlap(xt)
			yo = metrics.compute_overlap(yt)
			if xo >= clf_th: clf_x_te += 1
			if yo < clf_th: clf_y_te += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		sp_x_results[i] = 100 - clf_x_te
		sp_y_results[i] = 100 - clf_y_te
		svm_x_results[i] = 100 - svm_x_te
		svm_y_results[i] = 100 - svm_y_te
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SP % Correct Novelty Class', clf_y_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		sp._log_stats('SVM % Correct Novelty Class', svm_y_te)
		
		# Print the results
		if verbose:
			print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
			print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
			print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
			print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results
# Report how the fitted search object `gscv` (created before this fragment)
# performs on the held-out evaluation set.
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, gscv.predict(X_test)
print(classification_report(y_true, y_pred))
print(confusion_matrix(y_test, y_pred))
print()

#%%

# Novelty detection by One Class SVM with optimized hyperparameter
from my_library import optimize_gamma
# NOTE(review): this value is overwritten two lines below by optimize_gamma();
# the lookup only has an effect if 'gamma' is missing from best_params_.
optgamma = gscv.best_params_['gamma']
# Candidate gammas: 2**-20, 2**-19, ..., 2**0.
range_g = 2**np.arange(-20, 1, dtype=float)
optgamma = optimize_gamma(X_train, range_g)
# The kernel comes from the grid search, gamma from optimize_gamma().
clf = OneClassSVM(nu=0.003, kernel=gscv.best_params_['kernel'], gamma=optgamma)
clf.fit(X_train)

y_pred = gscv.predict(X_test)  # prediction

from my_library import ad_knn
# Applicability Domain (inside: +1, outside: -1)
ad_svm = clf.predict(X_test)  # outliers = -1
# NOTE(review): rebinding `ad_knn` replaces the imported function with its
# result, so the function cannot be called again without re-importing it.
ad_knn = ad_knn(X_train, X_test)

# Column-wise stack: prediction, truth, kNN AD flag, SVM AD flag, then the
# columns of X_test.
results = np.c_[y_pred, y_test, ad_knn, ad_svm, X_test]

# Six column names are given: A=y_pred, B=y_test, C=ad_knn, D=ad_svm, and
# E/F for X_test -- assumes X_test has exactly two columns; TODO confirm.
df = pd.DataFrame(results, columns=list('ABCDEF'))
df_knn = df[df.C == -1]
df_svm = df[df.D == -1]
# Show the samples on which the two applicability-domain flags disagree.
print('AD svm =/= AD knn')
print(df[df.C != df.D])
コード例 #30
0
ファイル: ocr.py プロジェクト: BorutFlis/OCR
            #print(tokens[token_start:token_finish])
            #print(sims_flat[most_similar])
            #print(p_values[0][f_dict_keys.index(f)])
            return_dict[f]=0.75*sims_flat[most_similar]+0.25*p_values[0][f_dict_keys.index(f)]
        return return_dict


if __name__ == "__main__":
    # Script entry point: build the validator, evaluate several text
    # representations with a one-class SVM, then attach a model fitted on
    # the sum representation to the validator instance.
    ocr_inst=OcrValidation()
    #ocr_inst.setup_model()
    #ocr_inst.feature_dict.pop("Sign-off")
    sum_rep=rp.SumRepresentation(ocr_inst.vocabulary,ocr_inst.feature_dict)
    cvec=CountVectorizer(vocabulary=ocr_inst.vocabulary,binary=True)
    # Use a context manager so the pickle file handle is always closed.
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # "doc2vec.p" from a trusted source.
    with open("doc2vec.p","rb") as d2v_file:
        d2v_train=pickle.load(d2v_file)
    d2v=rp.Doc2Vec(d2v_train)
    ocr_inst.doc2vec=d2v
    ocr_inst.exemplar_vec=ocr_inst.doc2vec.model.infer_vector([ocr_inst.texts[1]])
    model=OneClassSVM(nu=0.05)
    ocr_inst.evaluate_mulitple([cvec,sum_rep,d2v],[model])

    # Fit the model on the first 75 texts, encoded with the sum
    # representation, and store everything on the instance.
    train_set=sum_rep.fit_transform(ocr_inst.texts[:75])
    ocr_inst.train_set=train_set
    model_sum=OneClassSVM(nu=0.05)
    model_sum.fit(train_set)
    ocr_inst.model=model_sum
    ocr_inst.representation=sum_rep
    ocr_inst.representation=sum_rep




コード例 #31
0
ファイル: oneclass.py プロジェクト: itsfareast/RSCar
                           '../data/label.csv')
tst_feature = np.asarray(tstset['feature'])
#load label with taxi==0, revert label to be taxi==1
tst_label = np.asarray(tstset['label'])

featmean = np.mean(tr_feature, axis=0)
featstd = np.std(tr_feature, axis=0)
tr_feature -= featmean
tr_feature /= featstd

tst_feature -= featmean
tst_feature /= featstd

#model = RandomForestClassifier(n_estimators=20,criterion='entropy')
model = OneClassSVM()
model.fit(tr_feature)

tr_accuracy = np.mean(model.predict(tr_feature) == tr_label)
tst_res = model.predict(tst_feature) == tst_label
tst_accuracy = np.mean(tst_res)
print tst_res
tst_pred = model.predict(tst_feature)
print tst_pred
proba = map(lambda x: max(x), tst_pred)

tst_log = []
for each in zip(tst_res, proba):
    tst_log.append({'p': each[1], 'acc': each[0]})
records = sorted(tst_log, key=operator.itemgetter('p'), reverse=True)
for i in range(1, len(records)):
    r = map(lambda x: x['acc'], records[0:i])
    print('Data Loaded, {} pristine vectors'.format(data_length))

    idx = np.arange(0, data_length)
    np.random.shuffle(idx)

    X_train = pristine_emb[idx[:int(args.train_prop * data_length)]]
    X_test = pristine_emb[idx[int(args.train_prop * data_length):]]

    print('Starting training on {} train vectors with {} test vectors'.format(
        X_train.shape[0], X_test.shape[0]))
    classifier = OneClassSVM(nu=0.0001,
                             kernel='rbf',
                             gamma=0.5 / 2048,
                             cache_size=2000,
                             verbose=True)
    classifier.fit(X_train)
    print('Finishing training')

    y_pred_train = classifier.predict(X_train)
    y_pred_test = classifier.predict(X_test)
    y_pred_outliers = classifier.predict(forged_emb)
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size
    n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

    print('Error train: {}/{} --> {}%'.format(
        n_error_train, X_train.shape[0],
        100 * n_error_train / X_train.shape[0]))
    print('Error test: {}/{} --> {}%'.format(
        n_error_test, X_test.shape[0], 100 * n_error_test / X_test.shape[0]))
    print('Error forged: {}/{} --> {}%'.format(
コード例 #33
0
def remove_outliers(features,
                    max_fraction=0.1,
                    min_fraction=0.25,
                    verbose=False):
    """
    Drop rows that one-class SVMs repeatedly flag as outliers.

    A grid of nu/gamma settings is tried with an RBF one-class SVM fitted
    on all rows. A setting only counts if it flags less than `max_fraction`
    of the rows. For each counted setting, every flagged row receives one
    vote. Rows whose share of votes over the counted settings reaches
    `min_fraction` are removed.

    :param features: Table of subjects; rows are dropped from it in place
    :param max_fraction: A setting is ignored unless it flags a smaller
        fraction of rows than this
    :param min_fraction: Minimum share of counted settings that must flag a
        subject for it to be removed
    :param verbose: If True, print how many outliers are removed
    :return: The same `features` object after the outliers were dropped
    """
    X, _ = util.get_xy(features,
                       target_column='diagnosis',
                       exclude_columns=['age', 'gender', 'diagnosis'])

    votes = {}
    accepted_settings = 0

    for nu in np.linspace(0.01, 1.0, num=20):
        for gamma in [2**x for x in range(-15, 4, 2)]:

            # Fit on all rows and predict on the same rows
            svm = OneClassSVM(kernel='rbf', gamma=gamma, nu=nu)
            svm.fit(X)
            predictions = svm.predict(X)

            # Positions of the rows predicted as -1 (outlier)
            flagged = [pos for pos in range(len(predictions))
                       if predictions[pos] == -1]

            # Skip settings that flag too large a share of the rows
            if float(len(flagged)) / len(predictions) >= max_fraction:
                continue

            # Give one vote to every flagged subject
            accepted_settings += 1
            for pos in flagged:
                subject = features.index[pos]
                votes[subject] = votes.get(subject, 0) + 1

    # Keep the subjects flagged by a large enough share of the settings
    outliers = [
        subject for subject in votes
        if votes[subject] / float(accepted_settings) >= min_fraction
    ]

    if verbose:
        print('Removing {} outliers...'.format(len(outliers)))
    features.drop(outliers, axis=0, inplace=True)

    return features
コード例 #34
0
# Collect the RELEVANT_LAYER_NAME activations of new_model for the training
# images and write them to the CSV path stored in addr.
train_activations = get_activations(new_model,
                                    get_inputs(train_path + '/*.jpg'),
                                    RELEVANT_LAYER_NAME)
train_activations.to_csv(addr['ocs_train_activations_name'])
# Same for the test inputs built from the two classes.
test_activations = get_activations(
    new_model, get_test_inputs(test_path, class1Name, class2Name),
    RELEVANT_LAYER_NAME)
test_activations.to_csv(addr['ocs_test_activations_name'])
# Ground truth: shock_len entries of +1 followed by nonshock_len entries of -1.
# NOTE(review): assumes the test activations are in that same order -- confirm.
y_true = [1.] * shock_len + [-1.] * nonshock_len
# Exhaustive search over kernel, nu and gamma; one model is fitted per setting.
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    for nu in np.linspace(0.1, 0.9, num=9):
        for gamma in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]:
            # NOTE(review): res_list is not used in the visible code.
            res_list = []
            # The loop variable is the reciprocal of the gamma actually used.
            ocs = OneClassSVM(nu=nu, kernel=kernel, gamma=1.0 / (gamma))
            ocs.fit(train_activations)

            y_pred = ocs.predict(test_activations)
            y_scores = ocs.decision_function(test_activations)

            # NOTE(review): precision, recall and thresholds are computed but
            # not used in the visible code.
            precision, recall, thresholds = precision_recall_curve(
                y_true, y_scores)
            # Prints the loop value of gamma, i.e. 1 / (gamma passed to the SVM).
            print(kernel, nu, gamma)
            print(metrics.accuracy_score(y_true, y_pred),
                  metrics.precision_score(y_true, y_pred),
                  metrics.recall_score(y_true, y_pred))

import gc
gc.collect()
# Restore stdout; presumably it was redirected earlier in the script.
sys.stdout = oldStdout
コード例 #35
0
ファイル: oneClassAbraxas.py プロジェクト: cmab92/pyAbraxas
                label=1)

# Read the data set, dump the train/test split to "anomaly.pkl" and load it
# back again.
oc.readDataSet(equalLength=False, checkData=False)
oc.dumpTeTrData(dumpName="anomaly.pkl")

TrainFeat, TrainLabel, TestFeat, TestLabel = oc.loadTeTrDump(
    dumpName="anomaly.pkl")

# Merge the loaded test and train parts so the data can be re-split by label.
data = np.concatenate([TestFeat, TrainFeat])
label = np.concatenate([TestLabel, TrainLabel])

# Label 0 is treated as normal data, label 1 as anomalous data.
normal = data[label == 0]
anomal = data[label == 1]

# Train on the first two thirds of the normal samples, test on the rest.
# NOTE(review): relies on true division; under Python 2 integer division
# 2 / 3 is 0 and the training slice would be empty.
training = normal[0:int(2 / 3 * len(normal))]
test = normal[int(2 / 3 * len(normal))::]

from sklearn.svm import OneClassSVM

model = OneClassSVM(kernel='linear')

model.fit(training)
# Held-out normal samples: a prediction of -1 is reported as a false
# negative, +1 as a true positive. Values are fractions of the samples.
preds = model.predict(test)
preds = np.reshape(preds, len(preds))
print("False Negatives: ", np.sum(preds == -1) / len(preds))
print("True Positives: ", np.sum(preds == 1) / len(preds))
# Anomalous samples: a prediction of +1 is reported as a false positive,
# -1 as a true negative.
preds = model.predict(anomal)
preds = np.reshape(preds, len(preds))
print("False Positives: ", np.sum(preds == 1) / len(preds))
print("True Negatives: ", np.sum(preds == -1) / len(preds))
コード例 #36
0
def outlier_SVM(df):
    """Return the rows of `df` that an RBF one-class SVM predicts as -1.

    The detector (gamma=0.005, nu=0.05) is fitted on `df` and then used to
    predict on the same `df`.
    """
    detector = OneClassSVM(kernel = 'rbf', gamma = 0.005, nu = 0.05)
    flagged = detector.fit(df).predict(df) == -1
    return df[flagged]
コード例 #37
0
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.svm import OneClassSVM

X_dist, time = get_events_time(disturbed_sequences[:])

print("======== SEQUENCES =======")
# Evaluation set: the regular test sequences followed by the disturbed ones.
sequence = X_test[:] + X_dist[:]

# Ground truth uses the same encoding as OneClassSVM.predict: +1 for inliers
# and -1 for outliers. Labelling outliers as 0 would never match the
# predictions nor the pos_label=-1 passed to f1_score below.
label = len(X_test[:]) * [1] + len(X_dist[:]) * [-1]

# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
# fit on majority class
model.fit(X_train)
# detect outliers in the test set
yhat = model.predict(sequence)

# calculate score (the outlier class, -1, is the positive class)
score = f1_score(label, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
# saveobj(f_history,h)

print("####### End tranining ########")

# %%
# model = load_model(f_current_model)
# param = loadobj(f_current_config)
# NOTE: `model` is rebound here; the one-class SVM above is no longer
# reachable under this name.
model = load_model(f_model)
param = loadobj(f_config)
コード例 #38
0
ファイル: qc.py プロジェクト: davecash75/APPIAN
def _OneClassSVM(X):
    """Fit an RBF one-class SVM (nu=0.1, gamma=0.1) on X and return its
    predictions for that same X."""
    detector = OneClassSVM(nu=0.1, kernel="rbf", gamma=0.1)
    return detector.fit(X).predict(X)
コード例 #39
0
    print(Indx,file=fRank)
fMaxAcc = -100.0
sMAXparam=''
clfMax = ''
idxmax = 0
for indx in (range(arTraining_std.shape[1]),):#Indx):
    for kernel in ['rbf']:#,'sigmoid']:#['poly', 'rbf', 'sigmoid']:
        for nu in [0.0000001,0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.15,0.2,0.25,0.3,0.35,0.4]:
            for gam in [0.000001,0.00001,0.0001,0.001,0.01,0.1]:#,0.3,0.5]:#,1,10,100]:
                #print ('nu=',nu,'gamma=',gam,'kernel=',kernel)
                param = str(nu)+' '+str(gam) + ' '+ kernel
                clsf.set_params(**{'nu':nu,'gamma':gam,'kernel':kernel})
        #gsC    lsf = GridSearchCV(clsf,dParams,cv=tCVIndxs,scoring='scorer')
                #print('Training The Model')
                arTr = arTraining_std[:,indx]
                clsf.fit(arTr)
        #cls    f.fit(arCVData,arCVLab)
                #print('Prediction')
                arV = arValidation_std[:,indx]
                y_pre = clsf.predict(arV)
                param = param + ' ' + str(indx).replace('\n','')
                #print (accuracy_score(y_valid_ref,y_pre))
                fCurAcc = f1_score(y_valid_ref,y_pre)#accuracy_score(y_valid_ref,y_pre)
                xIn = accuracy_score(y_valid_ref[:iNumInClass],y_pre[:iNumInClass]) + math.exp(-100)
                #print('x=',x)
                fscoreIn = math.log(xIn)
                fscoreOut = 0.0
                for sPhone in dPhoneIndx:
                    iStart,iEnd = dPhoneIndx[sPhone]
                    x = accuracy_score(y_valid_ref[iStart:iEnd],y_pre[iStart:iEnd])
                    fscoreTemp = math.log(x+math.exp(-100))
コード例 #40
0
def eval(cfg, model, train_dataset, val_dataset, criterion, publisher="test"):
    """Score the model's encoder features with a one-class SVM.

    Encoder outputs ("global features") are collected for ``train_dataset``
    and ``val_dataset``.  A one-class SVM is fitted on the training features
    and used to label the validation features; the chamfer-distance
    reconstruction loss is averaged over the validation samples whose label
    equals the label of ``train_dataset[0]``.

    Side effects: writes ``train_reconstruction.ply``, ``train_input.ply``,
    ``test_reconstruction.ply``, ``test_input.ply`` (first sample of the last
    batch of each loader) and ``vis_embed.png`` into the working directory.

    Note: the function name shadows the built-in ``eval``; it is kept because
    callers depend on it.

    Args:
        cfg: configuration object; ``batch_size``, ``nworkers`` and ``device``
            are read from it.
        model: network exposing ``eval()``, ``encoder`` and ``decoder``.
        train_dataset: dataset yielding ``(inputs, target)`` pairs; the target
            of item 0 defines the "true" class.
        val_dataset: dataset yielding ``(inputs, target)`` pairs.
        criterion: mapping holding a ``"chamfer_distance"`` callable that
            returns two distance tensors.
        publisher: unused; kept for interface compatibility.

    Returns:
        tuple: ``(acc, dist_loss)`` where ``acc`` is the percentage of
        validation samples whose SVM label (+1/-1) matches the relabelled
        ground truth, and ``dist_loss`` is the mean chamfer loss over the
        true-class validation samples (NaN if there are none).
    """
    model.eval()

    # get global features using a training dataset
    train_loader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              num_workers=cfg.nworkers,
                              pin_memory=True)
    train_loader = tqdm(train_loader, ncols=100, desc="get train GF")
    train_global_features = []
    with torch.no_grad():
        for inputs, _ in train_loader:

            inputs = inputs.to(cfg.device, non_blocking=True)
            # keep the first 3 channels after swapping axes 1 and 2
            # inputs.shape: (Batch_size, num_channels, num_points)
            inputs = torch.transpose(inputs, 1, 2)[:, :3]

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # add a global feature to a list
            train_global_features.append(PytorchTools.t2n(outputs))

        train_global_features = np.concatenate(
            train_global_features, axis=0)  # shape (num_train_data, 1024)

        # get reconstructions for ply data (uses the LAST training batch)
        reconstructions = model.decoder(outputs)
        # save reconstructions as ply
        rgb = np.full((reconstructions.shape[1], 3), 255, dtype=np.int32)
        xyz = PytorchTools.t2n(reconstructions[0])
        write_ply("train_reconstruction.ply", xyz, rgb)
        inputs = torch.transpose(inputs, 1, 2)
        gt_xyz = PytorchTools.t2n(inputs[0])
        write_ply("train_input.ply", gt_xyz, rgb)

    # get global features using a eval dataset
    val_loader = DataLoader(val_dataset,
                            batch_size=cfg.batch_size,
                            num_workers=cfg.nworkers,
                            pin_memory=True)
    val_loader = tqdm(val_loader, ncols=100, desc="get eval GF")
    val_global_features = []
    eval_labels = []
    loss_list = []
    with torch.no_grad():
        for inputs, targets in val_loader:

            inputs = inputs.to(cfg.device, non_blocking=True)
            # inputs.shape: (Batch_size, num_channels, num_points)
            inputs = torch.transpose(inputs, 1, 2)[:, :3]

            # model encoder processing
            outputs, _, _ = model.encoder(inputs)

            # get reconstructions for loss of true data
            reconstructions = model.decoder(outputs)

            # compute loss (per-sample: mean over axis 1 of each direction)
            inputs = torch.transpose(inputs, 1, 2)
            dist1, dist2 = criterion["chamfer_distance"](inputs,
                                                         reconstructions)
            dist1 = np.mean(PytorchTools.t2n(dist1), axis=1)
            dist2 = np.mean(PytorchTools.t2n(dist2), axis=1)
            dist_loss = dist1 + dist2

            # add dist_losses to a list
            loss_list.append(dist_loss)

            # add a global feature to a list
            val_global_features.append(PytorchTools.t2n(outputs))

            # get eval labels
            eval_labels.append(targets)

        val_global_features = np.concatenate(
            val_global_features, axis=0)  # shape (num_eval_data, 1024)
        eval_labels = np.squeeze(np.concatenate(eval_labels, axis=0),
                                 axis=-1)  # shape (num_data)
        loss_list = np.concatenate(loss_list, axis=0)

        # save reconstructions as ply (first sample of the LAST eval batch)
        rgb = np.full((reconstructions.shape[1], 3), 255, dtype=np.int32)
        xyz = PytorchTools.t2n(reconstructions[0])
        write_ply("test_reconstruction.ply", xyz, rgb)
        gt_xyz = PytorchTools.t2n(inputs[0])
        write_ply("test_input.ply", gt_xyz, rgb)

    # use one class classification
    classifier = OneClassSVM(kernel='rbf', nu=0.1, gamma='auto')
    classifier.fit(train_global_features)
    pred_labels = classifier.predict(val_global_features)

    # visualize data using embeddings (original class labels, before the
    # +1/-1 relabelling below)
    write_tsne("vis_embed.png", val_global_features, eval_labels)

    # get training data label
    _, true_label = train_dataset[0]
    # Compute the true-class mask ONCE, before any relabelling, so the second
    # assignment cannot re-match entries that were just rewritten to -1.
    is_true = np.asarray(eval_labels == true_label, dtype=bool)
    # convert eval labels other than true labels to -1
    eval_labels[~is_true] = -1
    # convert true labels to 1
    eval_labels[is_true] = 1

    # get loss of true data: select with the boolean mask. Indexing with the
    # +1/-1 label array itself would pick loss_list[1] and loss_list[-1].
    dist_loss = np.mean(loss_list[is_true])
    # get a accuracy
    acc = np.mean(pred_labels == eval_labels) * 100

    return acc, dist_loss
コード例 #41
0
# Split the rows by the existing `normal1` flag (1 vs. -1) and plot both
# groups on the same axes.
data_nor = data_pre[data_pre.normal1 == 1]
data_abn = data_pre[data_pre.normal1 == -1]

ax = pyplot.gca()
data_nor.plot(x='timestamp_int', y='value', ax=ax, color='blue', marker='o')
data_abn.plot(kind='scatter', x='timestamp_int', y='value', ax=ax, marker='x', color='r')

pyplot.show()

data_pre = data_pre.drop(['timestamp'], axis=1)

# NOTE: despite the variable name this is a StandardScaler, not a min-max
# scaler; the name is kept because later code may reference it.
# NOTE(review): only 'timestamp' was dropped, so the scaled matrix still
# contains the `normal1` column as a feature - confirm that is intended.
min_max_scaler = preprocessing.StandardScaler()
np_scaled = min_max_scaler.fit_transform(data_pre)
# train one class SVM
model = OneClassSVM(nu=0.95 * 0.01)
data = pandas.DataFrame(np_scaled)
model.fit(data)

# Attach the predictions using data_pre's own index. A bare Series would get
# a fresh 0..n-1 index and pandas would align on it, producing NaN or
# misplaced rows whenever data_pre is not indexed 0..n-1.
data_pre['normal2'] = pandas.Series(model.predict(data), index=data_pre.index)
# Recode the SVM output: +1 -> 0, -1 -> 1.
data_pre['normal2'] = data_pre['normal2'].map({1: 0, -1: 1})
print(data_pre['normal2'].value_counts())

fig, ax = pyplot.subplots()

# rows recoded to 1 (SVM output -1)
a = data_pre.loc[data_pre['normal2'] == 1, ['timestamp_int', 'value']]

ax.plot(data_pre['timestamp_int'], data_pre['value'], color='blue', marker='.', linestyle=' ')
ax.scatter(a['timestamp_int'], a['value'], color='red', marker='x')
pyplot.show()
コード例 #42
0
class OneClassSVM(object):
    """One-class SVM detector that min-max scales its input first.

    Thin wrapper around the ``SVM`` estimator referenced by this module
    (presumably scikit-learn's ``OneClassSVM`` imported under that name -
    confirm against the file's imports).  ``fit`` builds a ``'minmax'``
    transformer with ``get_transformer`` and trains the estimator on the
    transformed data; ``predict`` applies the same transformer and converts
    the estimator's signed output into 0/1 labels.
    """

    def __init__(self,
                 kernel='rbf',
                 gamma='scale',
                 tol=0.001,
                 nu=0.5,
                 shrinking=True,
                 max_iter=1000):
        """
        Unsupervised Outlier Detection.
        Arguments
        ---------
            kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, optional (default='rbf').
                Specifies the kernel type to be used in the algorithm.
                It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or a callable
            gamma : {'scale', 'auto'} or float, default='scale'
                Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
            tol : float, default=1e-3
                Tolerance for stopping criterion
            nu : float, default=0.5
                An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors.
                Should be in the interval (0, 1]. By default 0.5 will be taken
            shrinking : bool, default=True
                Forwarded unchanged to the underlying estimator.
            max_iter : int, default=1000
                Hard limit on iterations within solver, or -1 for no limit.
        Reference
        ---------
            For more information, please visit https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html
        """
        self.model = SVM(kernel=kernel,
                         gamma=gamma,
                         tol=tol,
                         nu=nu,
                         shrinking=shrinking,
                         max_iter=max_iter)
        # Set by fit(); predict() refuses to run while this is still None.
        self.transformer = None

    def fit(self, x):
        """ Fit the transformer and the underlying estimator
        Arguments
        ---------
            x: array-like, the event count matrix of shape num_instances-by-num_events
                (any trailing dimensions are flattened per instance)
        """

        print('OneClassSVM Fit')
        # Accept lists/tuples as well as ndarrays, then flatten each instance.
        x = np.asarray(x)
        x = x.reshape((len(x), -1))

        self.transformer = get_transformer(x, 'minmax')
        x = self.transformer.transform(x)

        self.model.fit(x)

    def predict(self, x):
        """ Label instances using the fitted transformer and estimator
        Arguments
        ---------
            x: array-like, the input event count matrix
        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,);
                0 where the estimator's prediction is positive, 1 otherwise
        Raises
        ------
            RuntimeError: if called before fit()
        """
        print('OneClassSVM Predict')
        if self.transformer is None:
            raise RuntimeError('OneClassSVM.predict() called before fit()')
        x = np.asarray(x)
        x = x.reshape((len(x), -1))
        x = self.transformer.transform(x)

        y_pred = self.model.predict(x)

        # Map positive predictions to 0 and everything else to 1.
        y_pred = np.where(y_pred > 0, 0, 1)
        return y_pred
コード例 #43
0
ファイル: tica_kde.py プロジェクト: steven-albanese/MSMs
# Suffix of the tICA result file to load.
regularization_string = "_012"

X0 = dataset.dataset("./tica/tica%d%s.h5" %
                     (tica_lagtime, regularization_string))

# presumably keeps the first 2 components of every sequence - confirm
# against featurizer.FirstSlicer
slicer = featurizer.FirstSlicer(2)
X = slicer.transform(X0)

# Stack all sequences into one 2-D array of frames.
Xf = np.concatenate(X)

hexbin(Xf[:, 0], Xf[:, 1], bins='log')

# Fit the one-class SVM on every 100th frame only.
Xf_train = Xf[::100]

svm = OneClassSVM()
svm.fit(Xf_train)

# Use the public import path; `sklearn.neighbors.kde` is a private module
# that newer scikit-learn releases no longer expose.
kde = sklearn.neighbors.KernelDensity()
kde.fit(Xf)

# One score per sequence. Built eagerly: on Python 3 `map` is lazy, so the
# previous `map(lambda ...)` never evaluated anything.
scores = [kde.score(traj) for traj in X]

# Frames inside a fixed rectangle of the first two coordinates.
ind0 = (Xf[:, 0] > 0.75) & (Xf[:, 0] < 0.92) & (Xf[:, 1] > 0.63) & (Xf[:, 1] <
                                                                    1.10)
Xf0 = Xf[ind0]
Xf0.shape

# Second density model, fitted on the selected region only.
kde0 = sklearn.neighbors.KernelDensity()
kde0.fit(Xf0)

scores = [kde0.score(traj) for traj in X]
コード例 #44
0
ファイル: explore_district.py プロジェクト: jjardel/bd-bq
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )
ax.set_zlim( [ 0., 1. ] )
ax.set_xlim( [ 0., 500. ] )
ax.set_ylim( [ 0., 1. ] )

fig.show()

# 41 evenly spaced points with the duplicate endpoint dropped:
# 40 angles from 0 to 351 degrees in 9-degree steps
angles = np.linspace( 0, 360, 41 )[ :-1 ]
rotanimate( ax, angles, 'movie.gif', delay = 20, width = 6., height = 5. )

# do outlier search using one-class SVM
# Standardise the first FEATURE (column 0). Rows are samples here: the model
# is fitted on `data` and the plots below index it as data[ rows, column ].
# The previous data[ 0, : ] rescaled the three values of the first sample.
data[ :, 0 ] = preprocessing.scale( data[ :, 0 ] )

model = OneClassSVM( gamma = .001, nu = .1 )
fit = model.fit( data )
preds = model.predict( data )

# row indices labelled +1 and -1 by the model
inlier = np.where( preds == 1. )[ 0 ]
outlier = np.where( preds == -1. )[ 0 ]

fig = plt.figure()
ax = fig.add_subplot( 111, projection = '3d' )
ax.scatter( data[ inlier, 0 ], data[ inlier, 1 ], data[ inlier, 2 ], c = 'b' )
ax.scatter( data[ outlier, 0 ], data[ outlier, 1 ], data[ outlier, 2 ], c = 'k' )
ax.set_xlabel( '$P^2/A$' )
ax.set_ylabel( 'Margin' )
ax.set_zlabel( 'Similarity of Neighboring Districts' )

ax.set_ylim( [0., 1 ] )
ax.set_zlim( [ 0., 1. ] )
コード例 #45
0
# In[4]:

# Bar chart of the label counts, saved to img.png.
y_satellite.iloc[:, 0].value_counts().plot.bar()
plt.savefig('img.png')
plt.show()

# In[5]:

y_satellite[0].value_counts()

# In[6]:

# `nu` must be a plain scalar. y_satellite is a DataFrame, so
# y_satellite.mean() is a Series; take the mean of its first column instead.
# It does not depend on gamma, so it is computed once outside the loop.
outlier_fraction = float(y_satellite.iloc[:, 0].mean())

gamma_values, err_values_gamma = [], []
for g in np.linspace(0.0000015, 0.00015, 10):
    onesvm = OneClassSVM(nu=outlier_fraction, gamma=g)
    onesvm.fit(satellite)
    yhat = onesvm.predict(satellite)
    # Recode the SVM output to match the labels: +1 -> 0, -1 -> 1.
    yhat = ((yhat - 1) * -1) / 2
    acc = accuracy_score(y_satellite, yhat)
    err = 1 - acc
    gamma_values.append(g)
    err_values_gamma.append(err)

# In[7]:

# Error rate as a function of gamma.
plt.subplots(figsize=(10, 5))
plt.plot(gamma_values, err_values_gamma, 'o-')
plt.xlabel('gamma')
plt.ylabel('error')
plt.show()
コード例 #46
0
ファイル: Classifier.py プロジェクト: landauof/Porjecton
def one_class_svm(n, g):
    """Train an RBF one-class SVM and return its accuracy on held-out rows.

    Reads the CSV at the module-level ``selected_features_path``, drops its
    first column, prints the per-``Class`` row counts, fits the model on the
    first 50 rows and evaluates it on all remaining rows.  A prediction of
    -1 is counted as class 1 and any other prediction as class 0 before
    comparing with the target column (column ``number_of_features``).

    Args:
        n: value passed as ``nu`` to the one-class SVM.
        g: value passed as ``gamma`` to the one-class SVM.

    Returns:
        float: accuracy on the validation rows, as a built-in ``float``.
    """
    data_set = pandas.read_csv(selected_features_path)
    data_set.pop(data_set.columns[0])

    # class distribution
    print(data_set.groupby('Class').size())

    # Fixed split: the first 50 rows train the model, the rest validate it.
    array = data_set.values
    X = array[:, 0:number_of_features]
    Y = array[:, number_of_features]
    X_train, X_validation = X[:50], X[50:]
    Y_validation = Y[50:]

    model = OneClassSVM(nu=n, kernel='rbf', gamma=g)
    model.fit(X_train)

    # Recode the SVM output: -1 -> 1, anything else -> 0.
    preds = model.predict(X_validation)
    correct_preds = [1 if pred == -1 else 0 for pred in preds]
    print('\n')

    correct_targs = list(Y_validation)

    # Compute the accuracy once and reuse it for printing and returning.
    res = metrics.accuracy_score(correct_targs, correct_preds)
    print("accuracy: ", res)

    return np.float64(res).item()
コード例 #47
0
def classifier(data):
    """Flag outlier records with a one-class SVM and report them.

    Each record contributes three features, in this column order:
    ``coverage`` (0), ``num_exons`` (1), ``distance_to_next`` (2).  The
    features are standardised, an RBF one-class SVM is fitted on a random
    sample of 20000 rows, and a random sample of 2000 rows is plotted to
    ``DistanceVCoverage.pdf`` (blue where the prediction is 1, red
    otherwise).  The model is then applied to every record: the ``id`` of
    each record predicted as -1 is printed to stdout, and summary counts are
    written to stderr.

    Python 2 code (print statements).  ``random``, ``sys``, ``subplots``,
    ``ylabel``, ``xlabel``, ``savefig`` and ``mean`` must already be in the
    module namespace.

    Arguments:
        data: indexable sequence of mappings with keys ``coverage``,
            ``num_exons``, ``distance_to_next`` and ``id``.

    Returns:
        None.
    """
    # NOTE(review): EllipticEnvelope and load_boston are imported but never
    # used in this function.
    from sklearn.covariance import EllipticEnvelope
    from sklearn.svm import OneClassSVM
    from sklearn.datasets import load_boston
    from sklearn import preprocessing
    # Get data

    # Define "classifiers" to be used
    # NOTE(review): legend1 / legend2 are never used below.
    legend1 = {}
    legend2 = {}
    # `evaluation` and `X` are built from the same three fields, so after
    # scaling they hold identical values.
    evaluation = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data] 
    X = [[val["coverage"], val["num_exons"], val["distance_to_next"]] for val in data]  
    X = preprocessing.scale(X)
    evaluation = preprocessing.scale(evaluation)
    # Learn a frontier for outlier detection with several classifiers
    # NOTE(review): random.sample presumably raises ValueError when fewer than
    # 20000 (resp. 2000) rows are available - confirm inputs are always larger.
    sample = random.sample(X, 20000)
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(evaluation, 2000)
    print >> sys.stderr, "fitting data"    
    clf.fit(sample)
    print >> sys.stderr, "predicting data"
    Y = clf.predict(test)
    print >> sys.stderr, "plotting data"
    fig, axes = subplots()
    
    # One scatter call per sampled row, coloured by the predicted label.
    # NOTE(review): x is column 2 (distance_to_next) and y is column 1
    # (num_exons), but the axis labels below say "coverage" / "distance" -
    # confirm which columns were meant to be plotted.
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][2], test[i][1], c=color)
    #ylim([50,2000]) #num exons
    ylabel("distance")
    #xlim([3,10])
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    # A second figure is created, but the plotting code for it is disabled
    # (kept below as a string literal), so nothing is drawn on or saved from it.
    fig, axes = subplots()
    """
    for i in range(len(test)):
        if Y[i] == 1:
            color = 'blue'
        else:
            color = 'red'
        axes.scatter(test[i][1], test[i][0], c=color)
    #xlim([0,10]) #num exons
    xlabel("number of exons")
    #ylim([3,15])
    ylabel("coverage")
    savefig("ExonsvsCoverage.pdf")
    """
    # Predict every record; row i of `evaluation` corresponds to data[i].
    full_test = clf.predict(evaluation)
    # num_exons of the records predicted -1 (novel) and of all others (regular)
    novel, regular = [],[]
    for i in range(len(full_test)):
        result = full_test[i]
        if result == -1:
            print data[i]["id"]
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])
    # Restrict to records with more than one exon for the mean comparison.
    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    print >> sys.stderr, "novel, regular"
    print >> sys.stderr, len(novel), len(regular)
    print >> sys.stderr, mean(multi_exon_novel), mean(multi_exon_regular), len(multi_exon_novel), len(multi_exon_regular)
コード例 #48
0
def base_experiment(config,
                    pct_noise=0.15,
                    noverlap_bits=0,
                    ntrials=10,
                    verbose=False,
                    seed=123456789):
    """
    Run a single novelty-detection experiment, locally.

    A base-class dataset (800 train / 200 test samples of 100 bits) and a
    200-sample novelty dataset are generated once.  For each of ``ntrials``
    seeds an ``SPRegion`` is fitted on the base training data and a linear
    one-class SVM is fitted on the same raw training data.  The SP side
    classifies a sample as base class when its overlap with the thresholded
    mean SP training output is at least 0.5.  Per-trial error percentages
    (100 - % correct) for both methods on both test sets are pickled, as a
    tuple ``(sp_x_results, sp_y_results, svm_x_results, svm_y_results)``, to
    ``results.pkl`` inside ``config['log_dir']``.

    Python 2 code (print statements, xrange, cPickle).

    @param config: The configuration parameters. ``config['log_dir']`` is
    read (and created if missing) and ``config['seed']`` is overwritten on
    every trial, so the caller's dict is mutated.

    @param pct_noise: The percentage of noise to add to the dataset.

    @param noverlap_bits: The number of bits the base class should overlap
    with the novelty class.

    @param ntrials: The number of times to repeat the experiment.

    @param verbose: If True print the results.

    @param seed: The random seed to use.

    @return: None; results are written to disk and logged through the SP.
    """

    # Base parameters
    ntrain, ntest = 800, 200
    nsamples, nbits, pct_active = ntest + ntrain, 100, 0.4
    # Minimum overlap for the SP-based classifier to accept a sample as base class
    clf_th = 0.5

    # Build the directory, if needed
    base_dir = config['log_dir']
    if not os.path.exists(base_dir): os.makedirs(base_dir)

    # Seed numpy
    np.random.seed(seed)

    # Create the base dataset
    x_ds = SPDataset(nsamples, nbits, pct_active, pct_noise, seed=seed)
    x_tr, x_te = x_ds.data[:ntrain], x_ds.data[ntrain:]

    # Create the outlier dataset
    # The novelty pattern takes (nactive - noverlap_bits) bits from outside
    # the base class plus noverlap_bits bits from inside it.
    base_indexes = set(np.where(x_ds.base_class == 1)[0])
    choices = [x for x in xrange(nbits) if x not in base_indexes]
    outlier_base = np.zeros(nbits, dtype='bool')
    outlier_base[np.random.choice(choices, x_ds.nactive - noverlap_bits,
                                  False)] = 1
    outlier_base[np.random.permutation(list(base_indexes))[:noverlap_bits]] = 1
    y_ds = SPDataset(ntest, nbits, pct_active, pct_noise, outlier_base, seed)
    y_te = y_ds.data

    if verbose:
        # NOTE(review): the divisor 40. looks like nbits * pct_active
        # (100 * 0.4) hard-coded - presumably equal to x_ds.nactive; confirm.
        print "\nBase class' test noise: {0:2.2f}".format(
            1 - (np.mean(x_te, 0) * x_ds.base_class.astype('i')).sum() / 40.)
        print "Outlier's class noise: {0:2.2f}".format(
            1 - (np.mean(y_te, 0) * outlier_base.astype('i')).sum() / 40.)
        print 'Overlap between two classes: {0}'.format(
            np.dot(x_ds.base_class.astype('i'), outlier_base.astype('i')))

    # Metrics
    metrics = SPMetrics()

    # Get the metrics for the datasets
    u_x_tr = metrics.compute_uniqueness(x_tr)
    o_x_tr = metrics.compute_overlap(x_tr)
    u_x_te = metrics.compute_uniqueness(x_te)
    o_x_te = metrics.compute_overlap(x_te)
    u_y_te = metrics.compute_uniqueness(y_te)
    o_y_te = metrics.compute_overlap(y_te)

    # Initialize the overall results
    sp_x_results = np.zeros(ntrials)
    sp_y_results = np.zeros(ntrials)
    svm_x_results = np.zeros(ntrials)
    svm_y_results = np.zeros(ntrials)

    # Iterate across the trials:
    for i, seed2 in enumerate(generate_seeds(ntrials, seed)):
        # Create the SP
        # (this overwrites the 'seed' entry of the caller's config dict)
        config['seed'] = seed2
        sp = SPRegion(**config)

        # Fit the SP
        sp.fit(x_tr)

        # Get the SP's output
        sp_x_tr = sp.predict(x_tr)
        sp_x_te = sp.predict(x_te)
        sp_y_te = sp.predict(y_te)

        # Get the metrics for the SP's results
        u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
        o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
        u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
        o_sp_x_te = metrics.compute_overlap(sp_x_te)
        u_sp_y_te = metrics.compute_uniqueness(sp_y_te)
        o_sp_y_te = metrics.compute_overlap(sp_y_te)

        # Log all of the metrics
        sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
        sp._log_stats('Input Base Class Train Overlap', o_x_tr)
        sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
        sp._log_stats('Input Base Class Test Overlap', o_x_te)
        sp._log_stats('Input Novelty Class Test Uniqueness', u_y_te)
        sp._log_stats('Input Novelty Class Test Overlap', o_y_te)
        sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
        sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
        sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
        sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
        sp._log_stats('SP Novelty Class Test Uniqueness', u_sp_y_te)
        sp._log_stats('SP Novelty Class Test Overlap', o_sp_y_te)

        # Print the results
        fmt_s = '{0}:\t{1:2.4f}\t{2:2.4f}\t{3:2.4f}\t{4:2.4f}\t{5:2.4f}\t{6:2.4f}'
        if verbose:
            print '\nDescription\tx_tr\tx_te\ty_te\tsp_x_tr\tsp_x_te\tsp_y_te'
            print fmt_s.format('Uniqueness', u_x_tr, u_x_te, u_y_te, u_sp_x_tr,
                               u_sp_x_te, u_sp_y_te)
            print fmt_s.format('Overlap', o_x_tr, o_x_te, o_y_te, o_sp_x_tr,
                               o_sp_x_te, o_sp_y_te)

        # Get average representation of the base class
        # Threshold the column means: >= 0.5 becomes 1, then everything still
        # below 1 becomes 0 (the order of these two assignments matters).
        sp_base_result = np.mean(sp_x_tr, 0)
        sp_base_result[sp_base_result >= 0.5] = 1
        sp_base_result[sp_base_result < 1] = 0

        # Averaged results for each metric type
        u_sp_base_to_x_te = 0.
        o_sp_base_to_x_te = 0.
        u_sp_base_to_y_te = 0.
        o_sp_base_to_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            # Pair the base representation with one test sample so the
            # metrics compare exactly two rows.
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the sums
            u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
            o_sp_base_to_x_te += metrics.compute_overlap(xt)
            u_sp_base_to_y_te += metrics.compute_uniqueness(yt)
            o_sp_base_to_y_te += metrics.compute_overlap(yt)
        u_sp_base_to_x_te /= ntest
        o_sp_base_to_x_te /= ntest
        u_sp_base_to_y_te /= ntest
        o_sp_base_to_y_te /= ntest

        # Log the results
        sp._log_stats('Base Train to Base Test Uniqueness', u_sp_base_to_x_te)
        sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
        sp._log_stats('Base Train to Novelty Test Uniqueness',
                      u_sp_base_to_y_te)
        sp._log_stats('Base Train to Novelty Test Overlap', o_sp_base_to_y_te)

        # Print the results
        if verbose:
            print '\nDescription\tx_tr->x_te\tx_tr->y_te'
            print 'Uniqueness:\t{0:2.4f}\t{1:2.4f}'.format(
                u_sp_base_to_x_te, u_sp_base_to_y_te)
            print 'Overlap:\t{0:2.4f}\t{1:2.4f}'.format(
                o_sp_base_to_x_te, o_sp_base_to_y_te)

        # Create an SVM
        clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)

        # Evaluate the SVM's performance
        # (the SVM is trained on the raw inputs, not on the SP's output)
        # % of base test samples predicted +1 and % of novelty samples predicted -1
        clf.fit(x_tr)
        svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
         100
        svm_y_te = len(np.where(clf.predict(y_te) == -1)[0]) / float(ntest) * \
         100

        # Perform classification using overlap as the feature
        # -- The overlap must be above 50%
        clf_x_te = 0.
        clf_y_te = 0.
        for x, y in zip(sp_x_te, sp_y_te):
            # Refactor
            xt = np.vstack((sp_base_result, x))
            yt = np.vstack((sp_base_result, y))

            # Compute the accuracy
            xo = metrics.compute_overlap(xt)
            yo = metrics.compute_overlap(yt)
            if xo >= clf_th: clf_x_te += 1
            if yo < clf_th: clf_y_te += 1
        clf_x_te = (clf_x_te / ntest) * 100
        clf_y_te = (clf_y_te / ntest) * 100

        # Store the results as errors
        sp_x_results[i] = 100 - clf_x_te
        sp_y_results[i] = 100 - clf_y_te
        svm_x_results[i] = 100 - svm_x_te
        svm_y_results[i] = 100 - svm_y_te

        # Log the results
        sp._log_stats('SP % Correct Base Class', clf_x_te)
        sp._log_stats('SP % Correct Novelty Class', clf_y_te)
        sp._log_stats('SVM % Correct Base Class', svm_x_te)
        sp._log_stats('SVM % Correct Novelty Class', svm_y_te)

        # Print the results
        if verbose:
            print '\nSP Base Class Detection     : {0:2.2f}%'.format(clf_x_te)
            print 'SP Novelty Class Detection  : {0:2.2f}%'.format(clf_y_te)
            print 'SVM Base Class Detection    : {0:2.2f}%'.format(svm_x_te)
            print 'SVM Novelty Class Detection : {0:2.2f}%'.format(svm_y_te)

    # Save the results
    with open(os.path.join(base_dir, 'results.pkl'), 'wb') as f:
        cPickle.dump(
            (sp_x_results, sp_y_results, svm_x_results, svm_y_results), f,
            cPickle.HIGHEST_PROTOCOL)
コード例 #49
0
ファイル: my_refine2d.py プロジェクト: g5v991x/emtest
def main():
	"""Iteratively classify particles from a similarity matrix.

	Each iteration loads the current similarity matrix, standardises every
	row, reduces it with PCA, keeps the fraction of points with the highest
	one-class-SVM decision values, clusters the kept points with k-means and
	writes the labels to ``clsmx_NN.hdf`` (dropped points are labelled -1).
	It then runs ``e2classaverage.py`` and ``e2simmx.py`` through ``run``;
	the similarity matrix written by the latter is the input of the next
	iteration.  ``test_pca_NN`` is also written on every iteration.
	"""

	usage="refine2d using simmx information "
	parser = EMArgumentParser(usage=usage,version=EMANVERSION)
	parser.add_argument("--ptcls", type=str,help="particle file", default=None)
	parser.add_argument("--simmx", type=str,help="simmx", default=None)
	parser.add_argument("--npca", type=int,help="number of pca factors", default=10)
	parser.add_argument("--niter", type=int,help="number of iterations", default=5)
	parser.add_argument("--outlier", type=float,help="outlier fraction", default=0.1)
	parser.add_argument("--ncls", type=int,help="number of centers", default=128)
	parser.add_argument("--nref", type=int,help="number of references", default=32)
	(options, args) = parser.parse_args()
	logid=E2init(sys.argv)

	simmxfile=options.simmx
	for itr in range(options.niter):
		### start from the simmx
		# Single-argument print(...) behaves the same on Python 2 and 3.
		print("Pre-processing simmx")
		e=EMData(simmxfile)
		pts=e.numpy().T.copy()
		# Standardise every row to zero mean and unit standard deviation.
		for i in range(len(pts)):
			pts[i]-=np.mean(pts[i])
			pts[i]/=np.std(pts[i])
		# `np.float` was only an alias of the built-in float and has been
		# removed from NumPy; the built-in gives the same dtype.
		pts=pts.astype(float).copy()

		print("Doing PCA")
		pca=PCA(options.npca)
		pts_pca=pca.fit_transform(pts)
		# `bs` aliases `pts_pca`, so the in-place division also rescales the
		# array that is saved below.
		bs=pts_pca
		bs/=np.std(bs)
		print("{} {}".format(bs.shape, pts.shape))
		np.savetxt("test_pca_{:02d}".format(itr),pts_pca)

		print("Removing outliers")
		outliers_fraction=options.outlier
		svm=OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
		svm.fit(bs)
		y_pred = svm.decision_function(bs).ravel()
		# Keep the (1 - outlier fraction) points with the highest scores.
		nkeep=int(len(bs)*(1-outliers_fraction))
		st=np.argsort(y_pred)[::-1]
		st=st[:nkeep]

		print("Clustering")
		ncnt=options.ncls
		centroids,_ = kmeans(bs[st],ncnt)
		l,_ = vq(bs[st],centroids)

		# -1 marks points dropped as outliers; kept points get their cluster id.
		labels=np.zeros(len(bs))-1
		labels[st]=l

		print("Class averaging")
		e=EMData(1,len(labels))
		for i, label in enumerate(labels):
			e.set_value_at(0,i,label)
		clsmxfile="clsmx_{:02d}.hdf".format(itr)
		e.write_image(clsmxfile)

		clsout="classes_{:02d}.hdf".format(itr)
		run("e2classaverage.py --input={} --classmx={} --output={} --force --center xform.center --iter=5 --align=rotate_translate_flip:maxshift=32 --averager=mean --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize --parallel=thread:12".format(options.ptcls,clsmxfile,clsout))

		# The similarity matrix written here feeds the next iteration.
		simmxfile="simmx_{:02d}.hdf".format(itr)
		run("e2simmx.py {} {} {} --align rotate_translate_flip --aligncmp ccc --cmp ccc --saveali --parallel thread:12".format(options.ptcls, clsout, simmxfile))


	E2end(logid)
コード例 #50
0
# Scale the X0 samples with the already-fitted scaler.
X0_outliers_n = scaler.transform(X0)

# Join X1_test_n and X0_outliers_n into X_TEST_n
X_TEST_n = np.concatenate((X1_test_n, X0_outliers_n))

# Join Y1_test and Y0 (same order as the features above)
Y_TEST = np.concatenate((Y1_test, Y0))

# NOTE(review): assuming PCA is scikit-learn's, a float n_components keeps
# enough components to explain that fraction (95%) of the variance - confirm.
# The reducer is fitted on the training data only and then applied to both sets.
pca = PCA(n_components=0.95)
reducer = pca.fit(X1_train_n)
X1_train_n_reduced = reducer.transform(X1_train_n)
X_TEST_n_reduced = reducer.transform(X_TEST_n)

clf = OneClassSVM(gamma='auto', nu=0.5)

# Train on the reduced X1 training data only.
clf.fit(X1_train_n_reduced)

Y1_pred_train = clf.predict(X1_train_n_reduced)
Y_pred_TEST = clf.predict(X_TEST_n_reduced)

# EVALUATION

# TRAIN SET

# confusion matrix

confmat = confusion_matrix(y_true=Y1_train, y_pred=Y1_pred_train)

# Draw the confusion matrix as a small shaded grid.
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.5)
for i in range(confmat.shape[0]):
コード例 #51
0
ファイル: em_bench.py プロジェクト: ngoix/EMMV_benchmarks
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_test)
    print('OneClassSVM processing...')
    ocsvm = OneClassSVM()
    ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
    s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max,
                                               volume_support,
                                               s_unif_iforest,
                                               s_X_iforest, n_generated)

    auc_lof, em_lof, amax_lof = em(t, t_max, volume_support,
                                   s_unif_lof, s_X_lof, n_generated)

    auc_ocsvm, em_ocsvm, amax_ocsvm = em(t, t_max, volume_support,
                                         s_unif_ocsvm, s_X_ocsvm,
コード例 #52
0
ファイル: svm.py プロジェクト: overgter/Wearable-Biometrics
class SVMClassifier:
    """One-class SVM trained on a single label and scored against others.

    ``train`` fits the SVM on feature vectors that all share one label and
    remembers that label as ``user_id``; ``test`` counts how the SVM's +1/-1
    predictions agree with "label equals ``user_id``".
    """

    def __init__(self, nu=0.1, kernel="rbf", gamma=0.1):
        """Store the hyper-parameters and build the underlying OneClassSVM."""
        self.nu = nu
        self.kernel = kernel
        self.gamma = gamma
        self.svm = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)

    def get_parameters_string(self):
        """Return ``"<kernel>_<nu>_<gamma>"``."""
        return "_".join([self.kernel, str(self.nu), str(self.gamma)])

    def print_details(self):
        """Print the text returned by ``get_details``."""
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(self.get_details())

    def get_details(self):
        """Return a multi-line, human-readable summary of the parameters."""
        separator = "--------------------------\n"
        return "".join([
            separator,
            "Classifier type: SVM\n",
            "Parameters\n",
            "Kernel :" + self.kernel + "\n",
            "nu :" + str(self.nu) + "\n",
            "gamma :" + str(self.gamma) + "\n",
            separator,
        ])

    def train(self, dataset):
        """Fit the SVM on ``dataset.feature_vectors``.

        All vectors must carry the same ``label``.  If they do not, a message
        is printed and nothing is fitted.  Returns None in both cases.
        """
        vectors = dataset.feature_vectors
        first_label = vectors[0].label
        if any(vector.label != first_label for vector in vectors):
            print("Training set vectors should be of the same label!!!")
            return None
        self.user_id = first_label
        self.svm.fit([vector.values for vector in vectors])

    def test(self, dataset):
        """Return ``[[tp, tn, fp, fn]]`` for ``dataset.feature_vectors``.

        The expected label of a vector is 1 when its ``label`` equals the
        trained ``user_id`` and -1 otherwise.  Predictions other than +1 or
        -1 are not counted.
        """
        vectors = dataset.feature_vectors
        # each row is a labeled_sample
        samples = [vector.values for vector in vectors]
        labels = [1 if vector.label == self.user_id else -1
                  for vector in vectors]
        predictions = self.svm.predict(samples)

        # Tally the four confusion-matrix cells in one pass.
        tp = tn = fp = fn = 0
        for expected, predicted in zip(labels, predictions):
            if expected == 1 and predicted == 1:
                tp += 1
            elif expected == -1 and predicted == -1:
                tn += 1
            elif expected == -1 and predicted == 1:
                fp += 1
            elif expected == 1 and predicted == -1:
                fn += 1
        return [[tp, tn, fp, fn]]
コード例 #53
0
    org_dataset_label = org_dataset[len(org_dataset.columns) - 1]
    data_0 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 0]
    data_1 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 1]
    data_2 = org_dataset.loc[org_dataset[len(org_dataset.columns) - 1] == 2]
    majority_class = data_2.append(data_0)

    org_dataset_X = np.array(org_dataset_features)
    org_dataset_y = np.ravel(np.array(org_dataset_label))

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        org_dataset_X, org_dataset_y, test_size=0.30)

    #OC SVM

    clf = OneClassSVM()
    clf.fit(majority_class)
    preds = clf.predict(org_dataset)
    org = np.array(org_dataset)

    #subset1 and larger SPLIT
    i = 0
    count_1 = 0
    count_2 = 0
    subset1_X = []
    subset1_y = []
    larger_X = []
    larger_y = []
    larger = []
    for i in list(range(0, len(preds))):
        if preds[i] == -1:
            count_1 = count_1 + 1
コード例 #54
0
ファイル: OCSVM.py プロジェクト: emconneilly/mqp
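# Fits a one-class SVM on the training data, saves the fitted model, and writes its predictions and recall to CSV
# args: training data file, data file to predict on, label file (its path is also used as the prefix of every output file)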

import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score, precision_score, recall_score
import sys
from joblib import dump, load
import csv

# Load the three comma-separated inputs named on the command line.
# NOTE(review): sys.argv[3] is loaded as the label file AND reused below as
# the prefix of every output file -- confirm that this is the intended
# command line.
train = np.loadtxt(sys.argv[1], delimiter=",")
data = np.loadtxt(sys.argv[2], delimiter=",")
labels = np.loadtxt(sys.argv[3], delimiter=",")

# Fit the one-class SVM on the training samples only.
clf = OneClassSVM(kernel='rbf', gamma='scale')
clf.fit(train)

# Persist the fitted model, then score the second data set.
dump(clf, sys.argv[3] + '.joblib')
predicted = clf.predict(data)

# Results of self test.
# The `with` blocks guarantee the files are closed even if writing or the
# metric computation raises.
# NOTE(review): `[predicted]` writes the whole prediction array as a single
# CSV cell (its string form) -- confirm this is the desired layout.
with open(sys.argv[3] + 'selfResults.csv', "w+") as selfResults:
    writer = csv.writer(selfResults)
    writer.writerow([predicted])

# Results of self test recall.
with open(sys.argv[3] + 'selfRecall.csv', "w+") as selfRecall:
    writer = csv.writer(selfRecall)
    writer.writerow([recall_score(labels, predicted)])
コード例 #55
0
ファイル: rop_svm.py プロジェクト: yifanlu/ropdetect
import numpy as np
from rop_dataextract import *
from sklearn.svm import OneClassSVM
import sys

# Settings forwarded to aggrTimeseries() (imported from rop_dataextract).
MAX_EVENT_COUNTERS = 4
TIME_DELTA = 10000
CLUSTER_POINTS = 32
TRAIN_POINTS = 100000
TEST_POINTS = -1

# One-class SVM with the library's default hyper-parameters.
svm = OneClassSVM()

# Training / test set names are derived from the command line.
train_set, test_set = getSetNames(sys.argv)

# Single-argument print(...) produces identical output under Python 2 and
# also parses under Python 3, unlike the bare print statement.
print("aggregating data...")
obs = aggrTimeseries(train_set, TRAIN_POINTS, CLUSTER_POINTS, MAX_EVENT_COUNTERS, TIME_DELTA)
print(len(obs))

print("fitting model...")
svm.fit(obs)

print("aggregating test...")
test = aggrTimeseries(test_set, TEST_POINTS, CLUSTER_POINTS, MAX_EVENT_COUNTERS, TIME_DELTA)

print("testing...")
prediction = svm.predict(test)
# Sum of the predicted labels, followed by the number of test observations.
print(sum(prediction))
print(len(prediction))
コード例 #56
0
# The split count is requested but not stored, exactly as in the original.
kf.get_n_splits(X)

# Candidate hyper-parameters for a randomized search (the search itself is
# disabled below, so this mapping is currently unused).
param_dist = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
    nu=stats.uniform(.0, .99),
    shrinking=[True, False],
)

n_inter = 20
# clf = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_inter, cv=5, scoring="accuracy")

print(kf)
for train_index, test_index in kf.split(X):
    print("Rodada")
    # Slice features and labels for this fold.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]

    # Refit on the training part, then score both parts of the fold.
    clf = clf.fit(X_train, y_train)
    y_pred_train, y_pred_test = clf.predict(X_train), clf.predict(X_test)

    # Count the samples predicted as -1 in each part.
    n_error_train = y_pred_train[y_pred_train == -1].size
    n_error_test = y_pred_test[y_pred_test == -1].size

    print("Train error: {:d}".format(n_error_train))
    print("Test error: {:d}".format(n_error_test))

# Report the elapsed wall-clock time since `start` (set earlier in the script).
end = time.time()

print("It took: %.2f seconds" % (end - start))
コード例 #57
0
def classifier(data, n_train=20000, n_test=2000):
    """Flag outlying records with a one-class SVM and report exon statistics.

    Each record contributes the feature triple
    (coverage, num_exons, distance_to_next), standardised with
    ``preprocessing.scale``.  A one-class SVM (rbf kernel, nu=0.1) is fitted
    on a random sample of the rows, a second random sample is plotted to
    "DistanceVCoverage.pdf", and finally every record is classified.  The
    "id" of each record predicted as -1 is printed to stdout and summary
    counts are written to stderr.

    Args:
        data: Indexable sequence of mappings providing the keys "coverage",
            "num_exons", "distance_to_next" and "id".
        n_train: Number of rows sampled for fitting (clamped to the number
            of records available).
        n_test: Number of rows sampled for the scatter plot (clamped
            likewise).

    Returns:
        None.
    """
    from sklearn.svm import OneClassSVM
    from sklearn import preprocessing

    # One scaled feature matrix serves both fitting and evaluation.
    features = preprocessing.scale(
        [[val["coverage"], val["num_exons"], val["distance_to_next"]]
         for val in data])
    # random.sample needs a real sequence; an ndarray is rejected by recent
    # Python versions, so sample from a list of its rows instead.
    rows = list(features)

    # Learn a frontier for outlier detection.
    sample = random.sample(rows, min(n_train, len(rows)))
    clf = OneClassSVM(nu=.1, kernel='rbf')
    test = random.sample(rows, min(n_test, len(rows)))
    sys.stderr.write("fitting data\n")
    clf.fit(sample)
    sys.stderr.write("predicting data\n")
    Y = clf.predict(test)
    sys.stderr.write("plotting data\n")

    # Column 0 is coverage and column 2 is distance_to_next, which is what
    # the axis labels and the output file name describe.  Predictions of 1
    # are drawn blue, everything else red; one scatter call draws them all.
    _fig, axes = subplots()
    colors = ['blue' if label == 1 else 'red' for label in Y]
    axes.scatter([row[0] for row in test], [row[2] for row in test],
                 c=colors)
    ylabel("distance")
    xlabel("coverage")
    savefig("DistanceVCoverage.pdf")

    # Classify every record; rows of `features` are in the order of `data`.
    full_test = clf.predict(features)
    novel, regular = [], []
    for i, result in enumerate(full_test):
        if result == -1:
            print(data[i]["id"])
            novel.append(data[i]["num_exons"])
        else:
            regular.append(data[i]["num_exons"])
    multi_exon_novel = [val for val in novel if val > 1]
    multi_exon_regular = [val for val in regular if val > 1]
    sys.stderr.write("novel, regular\n")
    sys.stderr.write("%s %s\n" % (len(novel), len(regular)))
    sys.stderr.write("%s %s %s %s\n" % (
        mean(multi_exon_novel), mean(multi_exon_regular),
        len(multi_exon_novel), len(multi_exon_regular)))
コード例 #58
0
def base_experiment(config, ntrials=1, seed=123456789):
	"""
	Run a single novelty-detection experiment, locally.
	
	An SP is fitted on samples of MNIST digit 0 (the base class) and then
	evaluated on held-out digit 0 samples and on samples of the digits 1-9
	(the novelty classes). A linear one-class SVM is evaluated on the same
	data for comparison. All intermediate metrics are logged through the SP.
	
	@param config: The configuration parameters to use for the SP. Its
	'seed' entry is overwritten in place on every trial.
	
	@param ntrials: The number of times to repeat the experiment.
	
	@param seed: The random seed to use.
	
	@return: A tuple (sp_x_results, sp_y_results, svm_x_results,
	svm_y_results) of percentage errors. The "x" entries are arrays of
	length ntrials for the base class test set. The "y" entries are lists
	with one array of length ntrials per novelty class (digits 1-9, in that
	order).
	"""
	
	# Base parameters
	ntrain, ntest = 800, 200
	clf_th = 0.5
	
	# Seed numpy
	np.random.seed(seed)
	
	# Get the data: digit 0 is the base class, digits 1-9 are the outliers
	(tr_x, tr_y), (te_x, te_y) = load_mnist()
	tr_x_0 = np.random.permutation(tr_x[tr_y == 0])
	x_tr = tr_x_0[:ntrain]
	x_te = tr_x_0[ntrain:ntrain + ntest]
	outliers = [np.random.permutation(tr_x[tr_y == i])[:ntest] for i in
		range(1, 10)]
	
	# Metrics
	metrics = SPMetrics()
	
	# Get the metrics for the datasets
	u_x_tr = metrics.compute_uniqueness(x_tr)
	o_x_tr = metrics.compute_overlap(x_tr)
	c_x_tr = 1 - metrics.compute_distance(x_tr)
	u_x_te = metrics.compute_uniqueness(x_te)
	o_x_te = metrics.compute_overlap(x_te)
	c_x_te = 1 - metrics.compute_distance(x_te)
	u_y_te, o_y_te, c_y_te = [], [], []
	for outlier in outliers:
		u_y_te.append(metrics.compute_uniqueness(outlier))
		o_y_te.append(metrics.compute_overlap(outlier))
		c_y_te.append(1 - metrics.compute_distance(outlier))
	
	# Initialize the overall results
	# -- The "y" results are indexed as [novelty class][trial]
	sp_x_results = np.zeros(ntrials)
	sp_y_results = [np.zeros(ntrials) for _ in range(9)]
	svm_x_results = np.zeros(ntrials)
	svm_y_results = [np.zeros(ntrials) for _ in range(9)]
	
	# Iterate across the trials:
	for nt in range(ntrials):
		# Make a new seed
		seed2 = np.random.randint(1000000)
		config['seed'] = seed2
		
		# Create the SP
		sp = SPRegion(**config)
		
		# Fit the SP
		sp.fit(x_tr)
		
		# Get the SP's output
		sp_x_tr = sp.predict(x_tr)
		sp_x_te = sp.predict(x_te)
		sp_y_te = [sp.predict(outlier) for outlier in outliers]
		
		# Get the metrics for the SP's results
		u_sp_x_tr = metrics.compute_uniqueness(sp_x_tr)
		o_sp_x_tr = metrics.compute_overlap(sp_x_tr)
		c_sp_x_tr = 1 - metrics.compute_distance(sp_x_tr)
		u_sp_x_te = metrics.compute_uniqueness(sp_x_te)
		o_sp_x_te = metrics.compute_overlap(sp_x_te)
		c_sp_x_te = 1 - metrics.compute_distance(sp_x_te)
		u_sp_y_te, o_sp_y_te, c_sp_y_te = [], [], []
		for y in sp_y_te:
			u_sp_y_te.append(metrics.compute_uniqueness(y))
			o_sp_y_te.append(metrics.compute_overlap(y))
			c_sp_y_te.append(1 - metrics.compute_distance(y))
		
		# Log all of the metrics
		sp._log_stats('Input Base Class Train Uniqueness', u_x_tr)
		sp._log_stats('Input Base Class Train Overlap', o_x_tr)
		sp._log_stats('Input Base Class Train Correlation', c_x_tr)
		sp._log_stats('Input Base Class Test Uniqueness', u_x_te)
		sp._log_stats('Input Base Class Test Overlap', o_x_te)
		sp._log_stats('Input Base Class Test Correlation', c_x_te)
		sp._log_stats('SP Base Class Train Uniqueness', u_sp_x_tr)
		sp._log_stats('SP Base Class Train Overlap', o_sp_x_tr)
		sp._log_stats('SP Base Class Train Correlation', c_sp_x_tr)
		sp._log_stats('SP Base Class Test Uniqueness', u_sp_x_te)
		sp._log_stats('SP Base Class Test Overlap', o_sp_x_te)
		sp._log_stats('SP Base Class Test Correlation', c_sp_x_te)
		for i, (a, b, c, d, e, f) in enumerate(zip(u_y_te, o_y_te, c_y_te,
			u_sp_y_te, o_sp_y_te, c_sp_y_te), 1):
			sp._log_stats('Input Novelty Class {0} Uniqueness'.format(i), a)
			sp._log_stats('Input Novelty Class {0} Overlap'.format(i), b)
			sp._log_stats('Input Novelty Class {0} Correlation'.format(i), c)
			sp._log_stats('SP Novelty Class {0} Uniqueness'.format(i), d)
			sp._log_stats('SP Novelty Class {0} Overlap'.format(i), e)
			sp._log_stats('SP Novelty Class {0} Correlation'.format(i), f)
		
		# Get average representation of the base class
		# -- Entries >= 0.5 become 1; everything still below 1 becomes 0
		sp_base_result = np.mean(sp_x_tr, 0)
		sp_base_result[sp_base_result >= 0.5] = 1
		sp_base_result[sp_base_result < 1] = 0
		
		# Averaged results for each metric type
		# -- Each test sample is paired with the base representation
		u_sp_base_to_x_te = 0.
		o_sp_base_to_x_te = 0.
		c_sp_base_to_x_te = 0.
		u_sp, o_sp, c_sp = np.zeros(9), np.zeros(9), np.zeros(9)
		for i, x in enumerate(sp_x_te):
			xt = np.vstack((sp_base_result, x))
			u_sp_base_to_x_te += metrics.compute_uniqueness(xt)
			o_sp_base_to_x_te += metrics.compute_overlap(xt)
			c_sp_base_to_x_te += 1 - metrics.compute_distance(xt)
			
			for j, yi in enumerate(sp_y_te):
				yt = np.vstack((sp_base_result, yi[i]))
				u_sp[j] += metrics.compute_uniqueness(yt)
				o_sp[j] += metrics.compute_overlap(yt)
				c_sp[j] += 1 - metrics.compute_distance(yt)
		u_sp_base_to_x_te /= ntest
		o_sp_base_to_x_te /= ntest
		c_sp_base_to_x_te /= ntest
		for i in range(9):
			u_sp[i] /= ntest
			o_sp[i] /= ntest
			c_sp[i] /= ntest
		
		# Log the results
		sp._log_stats('Base Train to Base Test Uniqueness',
			u_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Overlap', o_sp_base_to_x_te)
		sp._log_stats('Base Train to Base Test Correlation', c_sp_base_to_x_te)
		for i, j in enumerate(range(1, 10)):
			sp._log_stats('Base Train to Novelty {0} Uniqueness'.format(j),
				u_sp[i])
			sp._log_stats('Base Train to Novelty {0} Overlap'.format(j),
				o_sp[i])
			sp._log_stats('Base Train to Novelty {0} Correlation'.format(j),
				c_sp[i])
		
		# Create an SVM
		clf = OneClassSVM(kernel='linear', nu=0.1, random_state=seed2)
		
		# Evaluate the SVM's performance
		# -- Base samples should be predicted as 1, outliers as -1
		clf.fit(x_tr)
		svm_x_te = len(np.where(clf.predict(x_te) == 1)[0]) / float(ntest) * \
			100
		svm_y_te = np.array([len(np.where(clf.predict(outlier) == -1)[0]) /
			float(ntest) * 100 for outlier in outliers])
		
		# Perform classification using overlap as the feature
		# -- The overlap must be above 50%
		clf_x_te = 0.
		clf_y_te = np.zeros(9)
		for i, x in enumerate(sp_x_te):
			xt = np.vstack((sp_base_result, x))
			xo = metrics.compute_overlap(xt)
			if xo >= clf_th: clf_x_te += 1
			
			for j, yi in enumerate(sp_y_te):
				yt = np.vstack((sp_base_result, yi[i]))
				yo = metrics.compute_overlap(yt)
				if yo < clf_th: clf_y_te[j] += 1
		clf_x_te = (clf_x_te / ntest) * 100
		clf_y_te = (clf_y_te / ntest) * 100
		
		# Store the results as errors
		# -- Novelty results are written per class so that the
		#    [class][trial] layout holds for any number of trials
		sp_x_results[nt] = 100 - clf_x_te
		svm_x_results[nt] = 100 - svm_x_te
		for j in range(9):
			sp_y_results[j][nt] = 100 - clf_y_te[j]
			svm_y_results[j][nt] = 100 - svm_y_te[j]
		
		# Log the results
		sp._log_stats('SP % Correct Base Class', clf_x_te)
		sp._log_stats('SVM % Correct Base Class', svm_x_te)
		for i, j in enumerate(range(1, 10)):
			sp._log_stats('SP % Correct Novelty Class {0}'.format(j),
				clf_y_te[i])
			sp._log_stats('SVM % Correct Novelty Class {0}'.format(j),
				svm_y_te[i])
		sp._log_stats('SP % Mean Correct Novelty Class', np.mean(clf_y_te))
		sp._log_stats('SVM % Mean Correct Novelty Class', np.mean(svm_y_te))
		sp._log_stats('SP % Adjusted Score', (np.mean(clf_y_te) * clf_x_te) /
			100)
		sp._log_stats('SVM % Adjusted Score', (np.mean(svm_y_te) * svm_x_te) /
			100)
	
	return sp_x_results, sp_y_results, svm_x_results, svm_y_results