Example #1
class SVMDetector:
    # Only the training() function changes; everything else stays the same.

    def __init__(self, subjects):
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects

    def training(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=26)
        self.clf.fit(self.train)

    def testing(self):
        self.u_scores = list(-self.clf.decision_function(self.test_genuine))
        self.i_scores = list(-self.clf.decision_function(self.test_imposter))

    # Feature columns span the fixed password ".tie5Roanl" (CMU keystroke dataset).
    def evaluate(self):
        eers = []

        for subject in self.subjects:
            genuine_user_data = data.loc[data.subject == subject,
                                         "H.period":"UD.l.Return"]
            imposter_data = data.loc[data.subject != subject, :]

            self.train = genuine_user_data[-1:]
            self.test_genuine = genuine_user_data[:20]
            self.test_imposter = imposter_data.groupby("subject"). \
                head(20).loc[:, "H.period":"UD.l.Return"]

            self.training()
            self.testing()

            eers.append(evaluateEER(self.u_scores, self.i_scores))
            break  # evaluate only the first subject
        return np.mean(eers)
Example #2
def odd_evaluate():
    X_test = RTM.y_sampler.X_test
    X_train = RTM.y_sampler.X_train
    label_test = RTM.y_sampler.label_test
    #one-class SVM
    clf = OneClassSVM(gamma='auto').fit(X_train)
    score_svm = clf.decision_function(X_test)  #lower, more abnormal
    pr_oneclassSVM = precision_at_K(score_svm, label_test)
    #Isolation Forest
    clf = IsolationForest()
    clf.fit(X_train)
    score_if = clf.decision_function(X_test)  #lower, more abnormal
    pr_iso_forest = precision_at_K(score_if, label_test)
    #Roundtrip
    py = RTM.estimate_py_with_IS(X_test,
                                 epoch,
                                 sd_y=best_sd,
                                 scale=best_scale,
                                 sample_size=sample_size,
                                 log=True,
                                 save=False)
    pr_Roundtrip = precision_at_K(py, label_test)
    print("The precision at K of Roundtrip model is %.4f" % pr_Roundtrip)
    print("The precision at K of One-class SVM is %.4f" % pr_oneclassSVM)
    print("The precision at K of Isolation forest is %.4f" % pr_iso_forest)
class SVMDetector(Detector):
    def training(self):
        self.clf = OneClassSVM(kernel='rbf', gamma=26)
        self.clf.fit(self.train)

    def testing(self):
        self.user_scores = -self.clf.decision_function(self.test_genuine)
        self.imposter_scores = -self.clf.decision_function(self.test_imposter)
        self.user_scores = list(self.user_scores)
        self.imposter_scores = list(self.imposter_scores)
Example #4
def get_predictions(X, y, ktype):
    '''Use a one-class SVM to get irregularity predictions from the inputs.

    Parameters:
        X : DataFrame
            input features
        y : Series
            labels
        ktype : str
            SVM kernel type

    Returns:
        result : DataFrame
            containing the centre, its predicted label using the built-in
            decision function, the predictive score, and the true label
    '''
    svm = OneClassSVM(kernel=ktype, gamma='auto').fit(X)
    y_pred = svm.predict(X)
    y_pred = ((1 - y_pred) / 2).astype(int)
    y_score = -svm.decision_function(X)
    result = pd.DataFrame({
        'centre': X.index,
        'pred': y_pred,
        'score': y_score,
        'anomalous': y
    })

    return result
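
A minimal usage sketch of get_predictions above; the synthetic data, column names, and kernel choice are illustrative assumptions, not part of the original snippet:

import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM

# Hypothetical data: 100 inliers plus 5 rows shifted far from the bulk.
rng = np.random.RandomState(0)
X = pd.DataFrame(rng.randn(105, 3), columns=['f1', 'f2', 'f3'])
X.iloc[100:] += 6.0
y = pd.Series([0] * 100 + [1] * 5)

result = get_predictions(X, y, ktype='rbf')
print(result.sort_values('score', ascending=False).head())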
Example #5
class OCSVM():
    def __init__(self, kernel='rbf', nu=.01, gamma=.01):
        self.svm = OneClassSVM(nu=nu, gamma=gamma, kernel=kernel)

    def predict(self, X):
        X_prime = self.mapping(X)
        return self.svm.predict(X_prime)

    def fit(self, X):
        X_prime = self.mapping(X)
        return self.svm.fit(X_prime)

    def decision_function(self, X):
        X_prime = self.mapping(X)
        return self.svm.decision_function(X_prime)

    def mapping(self, X):
        X = np.array(X)
        clutter1_com = X[:, -3:]
        clutter2_com = X[:, -6:-3]
        obj_com = X[:, -12:-9]
        gripper_com = X[:, -15:-12]

        X_prime = np.hstack((clutter1_com, clutter2_com, obj_com, gripper_com))
        return X_prime
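
mapping() above carves four 3-D centre-of-mass blocks out of the last 15 columns, so inputs need at least 15 features. A quick sketch with synthetic data (the shapes and hyperparameters are assumptions):

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_train = rng.randn(200, 15)  # >= 15 columns so the slices line up

detector = OCSVM(kernel='rbf', nu=0.05, gamma=0.1)
detector.fit(X_train)
print(detector.predict(rng.randn(5, 15)))            # 1 = inlier, -1 = outlier
print(detector.decision_function(rng.randn(5, 15)))  # signed margin distances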
def _raw_ocsvm_experiment(dataset_load_fn, dataset_name, single_class_ind):
    (x_train, y_train), (x_test, y_test) = dataset_load_fn()

    x_train = x_train.reshape((len(x_train), -1))
    x_test = x_test.reshape((len(x_test), -1))

    x_train_task = x_train[y_train.flatten() == single_class_ind]
    if dataset_name in ['cats-vs-dogs']:  # OC-SVM is quadratic in the number of examples, so subsample the training set
        subsample_inds = np.random.choice(len(x_train_task), 5000, replace=False)
        x_train_task = x_train_task[subsample_inds]

    pg = ParameterGrid({'nu': np.linspace(0.1, 0.9, num=9),
                        'gamma': np.logspace(-7, 2, num=10, base=2)})

    results = Parallel(n_jobs=6)(
        delayed(_train_ocsvm_and_score)(d, x_train_task, y_test.flatten() == single_class_ind, x_test)
        for d in pg)

    best_params, best_auc_score = max(zip(pg, results), key=lambda t: t[-1])
    best_ocsvm = OneClassSVM(**best_params).fit(x_train_task)
    scores = best_ocsvm.decision_function(x_test)
    labels = y_test.flatten() == single_class_ind

    res_file_name = '{}_raw-oc-svm_{}_{}.npz'.format(dataset_name,
                                                     get_class_name_from_index(single_class_ind, dataset_name),
                                                     datetime.now().strftime('%Y-%m-%d-%H%M'))
    res_file_path = os.path.join(RESULTS_DIR, dataset_name, res_file_name)
    save_roc_pr_curve_data(scores, labels, res_file_path)
Example #7
def test_score_samples_estimators():
    """Check the values of score_samples methods derived from sklearn.

    Check that the values are the same as sklearn's decision_function methods.
    This only concerns OCSVM and IsolationForest.
    """

    X = np.random.randn(50, 2)

    clf1 = IsolationForest(random_state=88)
    clf1.fit(X)

    clf2 = ensemble.IsolationForest(random_state=88)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X), clf2.decision_function(X))

    nu = 0.4
    sigma = 3.0
    gamma = 1. / (2. * sigma**2)
    clf1 = OCSVM(sigma=sigma, nu=nu)
    clf1.fit(X)

    clf2 = OneClassSVM(gamma=gamma, nu=nu)
    clf2.fit(X)

    assert_array_equal(clf1.score_samples(X),
                       clf2.decision_function(X).ravel())
def dist_ocsvm(X_train, X_test, gamma=0.1):
    """
    Calculation of data density by OCSVM

    Parameters
    ----------
    X_train : array-like, shape = [n_samples, n_features]
        X training data

    X_test : array-like, shape = [n_samples, n_features]
        X test data

    gamma : float
        RBF kernel coefficient

    Returns
    -------
    array-like, shape = [n_samples]
        data density calculated by OCSVM
    """
    clf = OneClassSVM(nu=0.003, kernel="rbf", gamma=gamma)
    clf.fit(X_train)
    func = clf.decision_function(X_test)
    func = func.ravel()
    dens = abs(func - max(func))
    # Normalization: dens = 0 ~ 1
    dens = dens / max(dens)
    return dens
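
A small usage sketch for dist_ocsvm; the random data and gamma value are assumptions:

import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(42)
X_train = rng.randn(300, 2)
X_test = rng.randn(50, 2)

dens = dist_ocsvm(X_train, X_test, gamma=0.5)
print(dens.min(), dens.max())  # densities are normalized into [0, 1]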
Example #9
class OCSVM():
    def __init__(self, kernel='rbf', nu=.01, gamma=.01):
        self.svm = OneClassSVM(nu=nu, gamma=gamma, kernel=kernel)

    def predict(self, X):
        X_prime = self.mapping(X)
        return self.svm.predict(X_prime)

    def fit(self, X):
        X_prime = self.mapping(X)
        return self.svm.fit(X_prime)

    def decision_function(self, X):
        X_prime = self.mapping(X)
        return self.svm.decision_function(X_prime)

    def mapping(self, X):
        X = np.array(X)
        clutter1_com = X[:, -3:]
        clutter2_com = X[:, -6:-3]
        obj_com = X[:, -12:-9]
        gripper_com = X[:, -15:-12]

        diff1 = clutter1_com - obj_com
        diff2 = clutter2_com - obj_com

        norm1 = np.linalg.norm(diff1, axis=1)
        norm2 = np.linalg.norm(diff2, axis=1)

        X_prime = np.array([norm1, norm2]).T
        X_prime = np.hstack((X_prime, gripper_com))
        return X_prime
Example #10
class OCSVM(AnomalyDetector):
    """
        Anomaly detector based on one-class SVM
    """
    def __init__(self, kernel="rbf"):
        self._model = OneClassSVM(gamma='scale', kernel=kernel)
        # self._thresholds = None
        # TODO: try gamma="auto"

    def learn(self, data):
        self._model.fit(data)

    def get_score(self, data, epoch=None):
        assert len(data) == 1, "len(data) = " + str(len(data))
        return self._model.decision_function(data)

    def anomalies_have_high_score(self):
        return True

    def predict(self, data, obs):
        return self._model.predict(obs) == -1

    def get_memory_size(self):
        return 0

    def save(self, filename):
        joblib.dump(self._model, filename)

    def load(self, filename):
        self._model = joblib.load(filename)
Example #11
    def support_vectors(self, X, n, **kwargs):
        model = OneClassSVM(gamma=X.shape[1], nu=1 / X.shape[0])
        model.fit(X)
        sv = model.support_vectors_
        distance_from_hyperplane = model.decision_function(sv).reshape(-1)
        idx = np.argsort(np.abs(distance_from_hyperplane))[:n]
        return sv[idx, :]
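
support_vectors() is written as a method; a minimal harness (the class name here is hypothetical) that reuses it verbatim on random data:

import numpy as np
from sklearn.svm import OneClassSVM

class SupportVectorPicker:
    def support_vectors(self, X, n, **kwargs):
        model = OneClassSVM(gamma=X.shape[1], nu=1 / X.shape[0])
        model.fit(X)
        sv = model.support_vectors_
        distance_from_hyperplane = model.decision_function(sv).reshape(-1)
        idx = np.argsort(np.abs(distance_from_hyperplane))[:n]
        return sv[idx, :]

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
picker = SupportVectorPicker()
print(picker.support_vectors(X, n=3).shape)  # at most (3, 5)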
Example #12
def occ_onehot(pokemon, combats):
    training, labels = get_occ_feature_matrix_onehot(pokemon, combats)
    training = np.array(training)
    labels = np.array(labels)
    kf = KFold(n_splits=5)
    clf = OneClassSVM()
    all_scores = list()
    for train_index, test_index in kf.split(training):
        X_train, X_test = training[train_index], training[test_index]
        Y_test = np.ones(len(test_index))
        clf.fit(X_train)
        y_pred = clf.predict(X_test)
        prob_pos = clf.decision_function(X_test)
        prob_pos = (prob_pos - prob_pos.min()) / (prob_pos.max() -
                                                  prob_pos.min())
        y_pred = [0 if x == -1 else 1 for x in y_pred]
        Y_test = [int(x) for x in Y_test]
        scores = [
            precision_score(Y_test, y_pred),
            recall_score(Y_test, y_pred),
            f1_score(Y_test, y_pred)
        ]
        all_scores.append(scores)
    print(all_scores)
    with open('scores_occ_oneoht.pickle', 'wb') as fh:
        pickle.dump(all_scores, fh)
Example #13
class OCSVM(object):
    def __init__(self, file_name, config):
        self.dataset = config.dataset
        self.file_name = file_name

        self.x_dim = config.x_dim

        self.kernel = config.kernel
        self.degree = config.degree
        self.gamma = config.gamma
        self.coef0 = config.coef0
        self.tol = config.tol
        self.nu = config.nu

        self.pid = config.pid

        self.model = OneClassSVM(kernel=self.kernel, degree=self.degree, gamma=self.gamma, coef0=self.coef0,
                                 tol=self.tol, nu=self.nu)

    def fit(self, train_input, train_label, test_input, test_label):
        # Perform fit on X and returns labels for X.
        # Returns -1 for outliers and 1 for inliers.
        y_pred = self.model.fit_predict(train_input)
        decision_function = self.model.decision_function(train_input)

        ocsvm_output = OCSVMOutput(y_hat=y_pred, decision_function=decision_function)

        return ocsvm_output
Example #14
    def ocsvm(self, X_train, kernel=None, gamma=None, nu=None):
        """
        Train OCSVM model from Sklearn

        Parameters
        __________
        X_train: scaled training data
        kernel: kernel funcs: 'linear', 'poly', 'rbf', 'sigmoid'
        gamma: kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        nu: regularization parameter in the interval (0, 1]

        Returns
        ________
        Anomaly scores
        """
        model = OCSVM(kernel=kernel, gamma=gamma, nu=nu)
        model.fit(X_train)

        # Predict raw anomaly score
        labels = model.predict(X_train)  # Outlier labels (-1 or 1)
        labels = (labels.max() -
                  labels) // 2  # rescaled labels (1: outliers, 0: inliers)
        ocsvm_anomaly_scores = model.decision_function(
            X_train) * -1  # Outlier scores
        ocsvm_anomaly_scores = self.min_max_scaler(ocsvm_anomaly_scores)
        return ocsvm_anomaly_scores, labels
Example #15
def oneClassSVM(dataset):
    classifier = OneClassSVM(nu=outlierFraction, gamma=0.03)
    classifier.fit(dataset)
    # ravel() works for both the old (n, 1) and the current (n,) output shapes
    predScore = classifier.decision_function(dataset).ravel()
    pred = classifier.predict(dataset)
    outlierRows = [i for i in range(len(pred)) if pred[i] == -1]
    return predScore, outlierRows
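
oneClassSVM() reads a module-level outlierFraction; a usage sketch that supplies one (the value and data are assumptions):

import numpy as np
from sklearn.svm import OneClassSVM

outlierFraction = 0.05  # global the function expects (assumed value)

rng = np.random.RandomState(1)
dataset = np.vstack([rng.randn(95, 2), rng.randn(5, 2) + 5.0])

predScore, outlierRows = oneClassSVM(dataset)
print(len(outlierRows), "rows flagged as outliers")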
Example #16
def _cae_ocsvm_experiment(dataset_load_fn, dataset_name, single_class_ind,
                          gpu_q):
    # gpu_to_use = gpu_q.get()
    # os.environ["CUDA_VISIBLE_DEVICES"] = gpu_to_use

    (x_train, y_train), (x_test, y_test) = dataset_load_fn()

    print('data_shape', x_train.shape)

    n_channels = x_train.shape[get_channels_axis()]
    input_side = x_train.shape[2]  # channel side will always be at shape[2]
    enc = conv_encoder(input_side, n_channels)
    dec = conv_decoder(input_side, n_channels)
    # print(input_side)
    # print(dec.summary())
    x_in = Input(shape=x_train.shape[1:])
    x_rec = dec(enc(x_in))
    cae = Model(x_in, x_rec)
    cae.compile('adam', 'mse')

    x_train_task = x_train[y_train.flatten() == single_class_ind]
    x_test_task = x_test[y_test.flatten() == single_class_ind]  # just for visual monitoring
    cae.fit(x=x_train_task,
            y=x_train_task,
            batch_size=128,
            epochs=200,
            validation_data=(x_test_task, x_test_task))

    x_train_task_rep = enc.predict(x_train_task, batch_size=128)
    x_train_task_rep_temp = x_train_task_rep  # fallback when no subsampling happens
    if dataset_name in LARGE_DATASET_NAMES:  # OC-SVM is quadratic in the number of examples, so subsample the training set
        subsample_inds = np.random.choice(len(x_train_task_rep),
                                          2500,
                                          replace=False)
        x_train_task_rep_temp = x_train_task_rep[subsample_inds]

    x_test_rep = enc.predict(x_test, batch_size=128)
    pg = ParameterGrid({
        'nu': np.linspace(0.1, 0.9, num=9),
        'gamma': np.logspace(-7, 2, num=10, base=2)
    })

    results = Parallel(n_jobs=PARALLEL_N_JOBS)(delayed(
        _train_ocsvm_and_score)(d, x_train_task_rep_temp, y_test.flatten() ==
                                single_class_ind, x_test_rep) for d in pg)

    best_params, best_auc_score = max(zip(pg, results), key=lambda t: t[-1])
    print(best_params)
    best_ocsvm = OneClassSVM(**best_params).fit(x_train_task_rep)
    scores = best_ocsvm.decision_function(x_test_rep)
    labels = y_test.flatten() == single_class_ind

    res_file_name = '{}_cae-oc-svm_{}_{}.npz'.format(
        dataset_name, get_class_name_from_index(single_class_ind,
                                                dataset_name),
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M'))
    res_file_path = os.path.join(RESULTS_DIR, dataset_name, res_file_name)
    save_roc_pr_curve_data(scores, labels, res_file_path)
    def find_anomaly(label1, label2, winsize):
        print("Find anomaly in channel",
              label1 + '-' + label2 + '...',
              file=sys.stderr)
        print("-" * 80)
        print("Channel [" + label1 + '-' + label2 + ']')
        print("-" * 80)

        # find difference
        electrode1 = eeg.chan_lab.index(label1)
        electrode2 = eeg.chan_lab.index(label2)
        wave = eeg.X[electrode1] - eeg.X[electrode2]

        print("Splitting into windows...", file=sys.stderr)
        # np.array_split needs an integer section count
        wave_windows = np.array_split(
            wave, int(len(wave) / eeg.sample_rate / winsize))

        print("Extracting features...", file=sys.stderr)

        def extract_features(wave_window):
            max_val = max(wave_window)
            min_val = min(wave_window)
            stdev = np.std(wave_window)
            sum_val = sum(wave_window)
            sum_pos_val = sum([x for x in wave_window if x > 0])
            sum_abs_val = sum([abs(x) for x in wave_window])
            return [max_val, min_val, stdev, sum_val, sum_pos_val, sum_abs_val]

        Examples = np.array(list(map(extract_features, wave_windows)))  # list() needed on Python 3

        print("Training model, assuming no more than",
              CONTAMINATION,
              "anomaly...",
              file=sys.stderr)
        od = OneClassSVM(nu=CONTAMINATION,
                         kernel='poly',
                         gamma=0.05,
                         max_iter=100000)
        od.fit(Examples)

        decisions = od.decision_function(Examples).ravel()

        print("Most likely windows with anomaly:")
        # find most likely windows, in desc order
        largest_indices = np.argsort((-np.absolute(decisions)).ravel())[:20]
        for large_index in largest_indices:
            print(large_index * winsize / 60, "min (score:",
                  decisions[large_index], ")")

        sys.stdout.flush()
Example #18
def ocsvm(features_train, features_test):
    # One Class Support Vector Machines
    # fit the model
    ocsvm = OneClassSVM().fit(features_train)
    # predict
    start = time.time()
    anomalyScores = ocsvm.decision_function(features_test)
    test_runtime = time.time() - start
    return anomalyScores, test_runtime
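
The helper above expects time and OneClassSVM to be imported already; a self-contained usage sketch with synthetic features:

import time
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
features_train = rng.randn(500, 8)
features_test = rng.randn(100, 8)

anomalyScores, test_runtime = ocsvm(features_train, features_test)
print('scored %d rows in %.4f s' % (len(anomalyScores), test_runtime))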
def main():
    print('------------01')
    iris = load_iris()
    pca = PCA(n_components=2)
    data = pca.fit_transform(iris.data)
    print(type(data))
    print(data)
    # nu sets the expected outlier fraction; predict() returns 1 for inliers, -1 for outliers.
    ocsvm = OneClassSVM(nu=0.1, gamma="auto")
    ocsvm.fit(data)
    preds = ocsvm.predict(data)
    print(preds)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu)
    plt.show()

    print('------------02A')
    x = np.linspace(-5, 5, 500)
    y = np.linspace(-1.5, 1.5, 250)
    X, Y = np.meshgrid(x, y)
    print('X.ravel():')
    print(X.ravel())
    print(X.shape)
    print(Y.shape)
    z1 = np.array([X.ravel(), Y.ravel()])
    print(z1.shape)
    z2 = ocsvm.decision_function(np.array([X.ravel(), Y.ravel()]).T)
    print(z2.shape)
    # (250, 500)
    # (250, 500)
    # (2, 125000)
    # (125000,)
    # (250, 500)
    print(z2.reshape(X.shape).shape)
    df = ocsvm.decision_function(np.array([X.ravel(),
                                           Y.ravel()]).T).reshape(X.shape)
    plt.scatter(data[:, 0], data[:, 1], c=preds, cmap=plt.cm.RdBu, alpha=0.8)
    r = max([abs(df.min()), abs(df.max())])
    print('------------02B')
    print(df.min())
    print(max([abs(df.min()), abs(df.max())]))
    print(df)
    plt.contourf(X, Y, df, 10, vmin=-r, vmax=r, cmap=plt.cm.RdBu, alpha=.5)
    plt.show()
Example #20
    def __generate_probas(self, samples, resolution, affinity_matrix, number_of_questions):
        print(
            f"📞 Looks like there's a probability distribution ({self.name}) that wants to phone in an expert (that's "
            f"you)\n"
        )
        clf = OneClassSVM(kernel='precomputed')
        samples_and_weights = {0: 0.5}
        for nq in range(number_of_questions):
            indices = list(samples_and_weights.keys())
            if nq == 0:
                idx = np.random.choice(range(1, len(samples)))
            else:
                preds = clf.decision_function(affinity_matrix[:, indices])
                idx = [i for i, _ in sorted(enumerate(preds), key=lambda x: x[1]) if i not in samples_and_weights][
                    0]
            sample = samples[idx]

            print('Score the sample below with a number between 0 and 1 (higher is better)\n')
            print(sample)
            weight = float(input('Score: '))
            assert 0 <= weight <= 1

            samples_and_weights[idx] = weight
            indices = list(samples_and_weights.keys())
            clf.fit(
                affinity_matrix[indices, :][:, indices],
                sample_weight=list(samples_and_weights.values())
            )

        indices = list(samples_and_weights.keys())
        preds = clf.decision_function(affinity_matrix[:, indices])
        scores = KernelDiscretizedMethod.discretized_scores(
            resolution,
            samples,
            affinity_matrix,
            lambda mask, _idx: preds[mask].mean())

        Z = logsumexp([s for s in scores.values()])

        return {idx: s - Z for idx, s in scores.items()}
Example #21
def run(opt):
    output = {}
    cname = opt.cname
    datatrain, test_loader = get_loader(opt, classname=opt.cname)
    model = create_model(opt)
    opt.load = False
    # model.setup(opt)

    model = vgg_face_dag(weights_path='vgg_face_dag.pth')
    model.eval()

    f = []
    g = []
    tlbl = []
    cnt = 0

    for data, lbl in datatrain:
        cnt += 1
        code, c = model(data)
        if cnt == 1:
            f = code.view(code.size(0), -1).detach().cpu().numpy().tolist()
        else:
            f += code.view(code.size(0), -1).detach().cpu().numpy().tolist()
    output['train'] = f

    cnt = 0
    for data, lbl in test_loader:
        cnt += 1
        code, c = model(data)
        if cnt == 1:
            tlbl = lbl.cpu().numpy().tolist()
            g = code.view(code.size(0), -1).detach().cpu().numpy().tolist()
        else:
            g += code.view(code.size(0), -1).detach().cpu().numpy().tolist()
            tlbl += lbl.cpu().numpy().tolist()
    output['test'] = g
    output['lbl'] = tlbl
    print(len(tlbl))
    print(len(g))
    clf = OneClassSVM(gamma='auto')
    clf.fit(output['train'])
    scores = clf.decision_function(output['test'])
    print(sklearn.metrics.roc_auc_score(output['lbl'], scores))
Example #22
def one_class_svm_core(x_train, x_test, y_test, x_test_names, version=0):
    clf = OneClassSVM()
    print('svm begin...')
    start = time.time()
    clf.fit(x_train)
    joblib.dump(clf, 'one_class_model.m')
    print('begin compute', time.time() - start)
    y_distance = clf.decision_function(x_test)
    y_score = (np.clip(y_distance, -1, 1) + 1) / 2
    print('svm complete..')
    show_result(y_score, y_test, x_test_names, version=version)
class OneClsSVM(AbstractDetector):
    name = "OneClassSVM"
    data_type = "REAL"

    def compute_scores(self, dataframe: pd.DataFrame, classes: np.ndarray):
        bin_dataframe = dataframe._binarize_categorical_values()

        self.clf = OneClassSVM(**self.settings)
        self.clf.fit(bin_dataframe.values)
        self.values = -self.clf.decision_function(bin_dataframe.values)
        return self
def evaluate_features(train, test, labels, dataset, model=''):
    params = svm_parameters_dict(dataset)
    clf = OneClassSVM(kernel='rbf',
                      gamma=params[1],
                      nu=params[0],
                      verbose=True)
    clf.fit(train)
    decision_f = clf.decision_function(test)
    new_decision_f = filter_scores(decision_f, dataset, params[2])
    _auc = calc_auc(labels, new_decision_f)
    print "Area under ROC: ", _auc
    def runClassifier(self, _driverId, numComponents=0):
        X = list(self.featuresHash.values())
        self.ids = list(self.featuresHash.keys())
        if self.runDimRed:
            X = self.dimRed(X, numComponents)

        clf = OCSVM(nu=self.nu, gamma=self.gamma)
        clf.fit(X)
        y_pred = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(y_pred, 100 * self.outliers_fraction)
        self.label = y_pred > threshold
        self.label = list(map(int, self.label))
Example #26
def SVM(person):
    # Consider current subject as real and rest as fake
    realUser_data = data.loc[data.subject == person, "H.period":"H.Return"]

    if len(realUser_data) == 0:
        print("No data found for the given user")
        return 0

    real_train = np.array((realUser_data[:200]).values)

    # True test set (200 records)
    real_test = np.array((realUser_data[200:]).values)

    fakeUser_data = data.loc[data.subject != person, :]

    # False set (250 records, 5 per fake user, 50 fake users in all)
    fake_test = np.array(
        (fakeUser_data.groupby("subject").head(5).loc[:, "H.period":"H.Return"]
         ).values)

    clf = OneClassSVM(kernel='rbf', gamma=26)
    clf.fit(real_train)

    realUser_scores = []  # real user score
    fakeUser_scores = []  # imposter user score

    # Calculate score for real user test data
    realUser_scores = list(-clf.decision_function(real_test))

    # Calculate score for fake user test data
    fakeUser_scores = list(-clf.decision_function(fake_test))

    # true label
    labels = [0] * len(realUser_scores) + [1] * len(fakeUser_scores)

    Equal_err_rate = Calc_equal_err_rate(realUser_scores, fakeUser_scores,
                                         labels)
    print("Equal err rate:: ", Equal_err_rate)

    return Equal_err_rate
Example #28
def SVM():
    eers = []
    false_negative = 0.0
    false_positive = 0.0

    for subject in subjects:

        user_scores = []
        imposter_scores = []

        imposter_data = data.loc[data.subject != subject, :]
        train = data.loc[data.subject == subject,
                         "H.period":"H.Return"][:200].values
        test_genuine = data.loc[data.subject == subject,
                                "H.period":"H.Return"][200:].values
        imposter = imposter_data.groupby("subject").head(
            5).loc[:, "H.period":"H.Return"].values
        clf = OneClassSVM(kernel='rbf', gamma=26)
        clf.fit(train)
        user_scores = -clf.decision_function(test_genuine)
        imposter_scores = -clf.decision_function(imposter)
        user_scores = list(user_scores)
        imposter_scores = list(imposter_scores)

        standard_deviation = np.std(user_scores)
        mean_standard = np.mean(user_scores)

        for score in user_scores:
            if score > mean_standard + standard_deviation or score < mean_standard - standard_deviation:
                false_positive = false_positive + 1
        # checking for false positives
        for score in imposter_scores:
            if score < mean_standard + standard_deviation and score > mean_standard - standard_deviation:
                false_negative = false_negative + 1

        eers.append(Calc_equal_err_rate(user_scores, imposter_scores))

    return (np.mean(eers), false_positive / (51 * 200),
            false_negative / (51 * 250))
Example #29
def ocsvm(X, y, percentage=None, params=None, sh_params=None):

    normalizer = Normalizer(norm="l1")
    X = normalizer.fit_transform(X)

    cf = OneClassSVM(**(params or {}))
    cf.fit(X)
    anomalyscores = -cf.decision_function(X)

    ap = average_precision_score(y, anomalyscores)
    auc = roc_auc_score(y, anomalyscores)

    return ap, auc
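
A possible call pattern for this ocsvm helper; the labels and hyperparameters below are illustrative assumptions:

import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.svm import OneClassSVM
from sklearn.metrics import average_precision_score, roc_auc_score

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(200, 4), rng.randn(10, 4) + 4.0])
y = np.array([0] * 200 + [1] * 10)  # 1 marks the injected anomalies

ap, auc = ocsvm(X, y, params={'nu': 0.05, 'gamma': 'scale'})
print('AP = %.3f, AUC = %.3f' % (ap, auc))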
Example #30
def embed_dat_matrix_two_dimensions(low_dimension_data_matrix,
                                    y=None,
                                    labels=None,
                                    density_colormap='Blues',
                                    instance_colormap='YlOrRd'):
    from sklearn.preprocessing import scale
    low_dimension_data_matrix = scale(low_dimension_data_matrix)
    # make mesh
    x_min, x_max = low_dimension_data_matrix[:, 0].min(), low_dimension_data_matrix[:, 0].max()
    y_min, y_max = low_dimension_data_matrix[:, 1].min(), low_dimension_data_matrix[:, 1].max()
    step_num = 50
    h = min((x_max - x_min) / step_num, (y_max - y_min) / step_num)  # step size in the mesh
    b = h * 10  # border size
    x_min, x_max = low_dimension_data_matrix[:, 0].min() - b, low_dimension_data_matrix[:, 0].max() + b
    y_min, y_max = low_dimension_data_matrix[:, 1].min() - b, low_dimension_data_matrix[:, 1].max() + b
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # induce a one class model to estimate densities
    from sklearn.svm import OneClassSVM
    gamma = max(x_max - x_min, y_max - y_min)
    clf = OneClassSVM(gamma=gamma, nu=0.1)
    clf.fit(low_dimension_data_matrix)

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max] x [y_min, y_max].
    if hasattr(clf, "decision_function"):
        score_matrix = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    else:
        score_matrix = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    levels = np.linspace(min(score_matrix), max(score_matrix), 40)
    score_matrix = score_matrix.reshape(xx.shape)

    if y is None:
        y = 'white'

    plt.contourf(xx, yy, score_matrix, cmap=plt.get_cmap(density_colormap), alpha=0.9, levels=levels)
    plt.scatter(low_dimension_data_matrix[:, 0], low_dimension_data_matrix[:, 1],
                alpha=.5,
                s=70,
                edgecolors='gray',
                c=y,
                cmap=plt.get_cmap(instance_colormap))
    # labels
    if labels is not None:
        for id in range(low_dimension_data_matrix.shape[0]):
            label = labels[id]
            x = low_dimension_data_matrix[id, 0]
            y = low_dimension_data_matrix[id, 1]
            plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
def SVM_score(S):
    X = np.array(S)
    clf = OneClassSVM(kernel='linear')
    clf.fit(X)
    # Negate so that higher scores mean more anomalous.
    scores = -clf.decision_function(X)
    return scores
    def remove_outliers_SVM(self):
        ## Remove outliers using a OneClassSVM method

        print("Running SVM to remove outliers...")

        svm = OneClassSVM(kernel='rbf', nu=0.1, degree=3, verbose=1)
        fit = svm.fit(self.DataArray)
        decision = svm.decision_function(self.DataArray)
        _indices = []

        # If a value is below the decision hyperplane, eliminate it
        for i in range(len(decision)):
            if decision[i] < 0:
                _indices.append(i)
        print(self.DataArray.shape)
        self.DataArray = np.delete(self.DataArray, _indices, axis=0)
        self.TargetArray = np.delete(self.TargetArray, _indices, axis=0)
        print(self.DataArray.shape)
Example #35
    def predict_header_features(self, pkt_featurizer):
        group_id = pkt_featurizer.pkt_type
        features = pkt_featurizer.features
        arrival_time = pkt_featurizer.arrival_time
        try:
            vectorizer = DictVectorizer()
            vectorizer.fit(self.training_data[group_id])
            training_data_vectorized = vectorizer.transform(self.training_data[group_id])
            features_vectorized = vectorizer.transform(features)
            scaler = preprocessing.StandardScaler(with_mean=False)
            training_data_vectorized = scaler.fit_transform(training_data_vectorized)
            features_vectorized = scaler.transform(features_vectorized)
            classifier = OneClassSVM()
            classifier.fit(training_data_vectorized)
            result = classifier.predict(features_vectorized)
            distance = classifier.decision_function(features_vectorized)
        except KeyError:
            result = 0
            distance = 0
        return result, distance
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM

df = pd.read_csv('kddcup_for_elki_100000.csv', header=None, index_col=False)
labelix = df.shape[1]-1

labels = df[labelix]
df = df.drop(labelix, axis=1)

svm = OneClassSVM(kernel='rbf', gamma=1.0/df.shape[0], tol=0.001, nu=0.5, shrinking=True, cache_size=80)
svm = svm.fit(df.values)

scores = svm.decision_function(df.values).flatten()
maxvalue = np.max(scores)
scores = maxvalue - scores

output = pd.DataFrame()

# perform reverse sort
sort_ix = np.argsort(scores)[::-1]

output['labels'] =  labels[sort_ix]
output['outlier_scores'] =  scores[sort_ix]

output.to_csv('outlier_scores.csv', header=None, index=None)
Example #37
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
    lof = LocalOutlierFactor(n_neighbors=20, novelty=True)  # novelty=True exposes decision_function for new data
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_test)
    print('OneClassSVM processing...')
    ocsvm = OneClassSVM()
    ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
    s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max,
                                               volume_support,
                                               s_unif_iforest,
                                               s_X_iforest, n_generated)

    auc_lof, em_lof, amax_lof = em(t, t_max, volume_support,
                                   s_unif_lof, s_X_lof, n_generated)

    auc_ocsvm, em_ocsvm, amax_ocsvm = em(t, t_max, volume_support,
                                         s_unif_ocsvm, s_X_ocsvm,
                                         n_generated)
Example #38
def select_candidates(X, h, objective_function, verbose=False,
                      cov_computation_method=empirical_covariance):
    """Finds the best pure subset of observations to compute MCD from it.

    The purpose of this function is to find the best sets of h
    observations with respect to a minimization of their covariance
    matrix determinant. Equivalently, it removes n_samples-h
    observations to construct what we call a pure data set (i.e. not
    containing outliers). The list of the observations of the pure
    data set is referred to as the `support`.

    Starting from a support estimated with a Parzen density estimator,
    the pure data set is found by the c_step procedure introduced by
    Rousseeuw and Van Driessen in [1].

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
      Data (sub)set in which we look for the h purest observations
    h: int, [(n + p + 1)/2] < h < n
      The number of samples the pure data set must contain.
    objective_function: callable
      Function used to assess the quality of a candidate subset.

    See
    ---
    `c_step` function

    Returns
    -------
    best_location: array-like, shape (n_features,)
      The location estimate computed from the best support found in the
      data set (`X`)
    best_covariance: array-like, shape (n_features, n_features)
      The covariance estimate computed from the best support
    best_support: array-like, shape (n_samples,)
      The best support found in the data set (`X`)

    Notes
    -----
    References:
    [1] A Fast Algorithm for the Minimum Covariance Determinant Estimator,
        1999, American Statistical Association and the American Society
        for Quality, TECHNOMETRICS

    """
    n_samples, n_features = X.shape

    from sklearn.metrics.pairwise import euclidean_distances
    from sklearn.svm import OneClassSVM
    pairwise_distances = np.ravel(euclidean_distances(X))
    delta = sp.stats.scoreatpercentile(pairwise_distances, 10)
    gamma = 0.01 / delta
    clf = OneClassSVM(kernel='rbf', gamma=gamma)
    clf.fit(X)
    # integer division keeps the slice index an int on Python 3
    in_support = np.argsort(
        -np.ravel(clf.decision_function(X)))[-(n_samples // 2):]
    support = np.zeros(n_samples, dtype=bool)
    support[in_support] = True
    location = X[support].mean(0)
    covariance = cov_computation_method(X[support])
    initial_estimates = (location, covariance)
    best_location, best_covariance, _, best_support = c_step(
        X, h, objective_function, initial_estimates, verbose=verbose,
        cov_computation_method=cov_computation_method)

    return best_location, best_covariance, best_support
Example #39
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('OneClassSVM processing...')
            model = OneClassSVM(cache_size=500)
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower, the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if fit_time + predict_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]

            # cluster: old version of scipy -> interpol1d needs sorted x_input
            arg_sorted = recall_.argsort()
            recall_ = recall_[arg_sorted]
Example #40
    def decision_function(self, data):
        return -OneClassSVM.decision_function(self, data)
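
The override above flips the sign so that larger values mean more anomalous; a minimal subclass sketch (the class name is an assumption):

import numpy as np
from sklearn.svm import OneClassSVM

class FlippedOCSVM(OneClassSVM):
    def decision_function(self, data):
        # Negate the parent score: higher now means more anomalous.
        return -OneClassSVM.decision_function(self, data)

rng = np.random.RandomState(0)
X = rng.randn(100, 2)
clf = FlippedOCSVM(gamma='scale').fit(X)
print(clf.decision_function(X[:5]))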
Example #41
def main():
	
	usage="refine2d using simmx information "
	parser = EMArgumentParser(usage=usage,version=EMANVERSION)
	parser.add_argument("--ptcls", type=str,help="particle file", default=None)
	parser.add_argument("--simmx", type=str,help="simmx", default=None)
	parser.add_argument("--npca", type=int,help="number of pca factors", default=10)
	parser.add_argument("--niter", type=int,help="number of iterations", default=5)
	parser.add_argument("--outlier", type=float,help="outlier fraction", default=0.1)
	parser.add_argument("--ncls", type=int,help="number of centers", default=128)
	parser.add_argument("--nref", type=int,help="number of references", default=32)
	(options, args) = parser.parse_args()
	logid=E2init(sys.argv)
	
	simmxfile=options.simmx
	for itr in range(options.niter):
		### start from the simmx
		print "Pre-processing simmx"
		e=EMData(simmxfile)
		pts=e.numpy().T.copy()
		for i in range(len(pts)):
			pts[i]-=np.mean(pts[i])
			pts[i]/=np.std(pts[i])
		pts = pts.astype(float).copy()
		#e=from_numpy(pts.T.copy())
		#e.write_image("simmx_tmp.hdf")
		#exit()
		
		print "Doing PCA"
		(nptcl, ncls) = pts.shape;
		#nfac=options.npca
		pca=PCA(options.npca)
		pts_pca=pca.fit_transform(pts)
		bs=pts_pca
		bs/=np.std(bs)
		print(bs.shape, pts.shape)
		np.savetxt("test_pca_{:02d}".format(itr),pts_pca)
		
		print "Removing outliers"
		outliers_fraction=options.outlier
		svm=OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
		svm.fit(bs)
		y_pred = svm.decision_function(bs).ravel()
		nkeep=int(len(bs)*(1-outliers_fraction))
		st=np.argsort(y_pred)[::-1]
		st=st[:nkeep]
		
		print "Clustering"
		ncnt=options.ncls
		centroids,_ = kmeans(bs[st],ncnt)
		l,_ = vq(bs[st],centroids)
		
		labels=np.zeros(len(bs))-1
		labels[st]=l
		
		print "Class averaging"
		e=EMData(1,len(labels))
		for i in range(len(labels)):
			e.set_value_at(0,i,labels[i])
		clsmxfile="clsmx_{:02d}.hdf".format(itr)
		e.write_image(clsmxfile)
		
		clsout="classes_{:02d}.hdf".format(itr)
		run("e2classaverage.py --input={} --classmx={} --output={} --force --center xform.center --iter=5 --align=rotate_translate_flip:maxshift=32 --averager=mean --keep=.6 --cmp=ccc --aligncmp=ccc --normproc=normalize --parallel=thread:12".format(options.ptcls,clsmxfile,clsout))
		
		simmxfile="simmx_{:02d}.hdf".format(itr)
		run("e2simmx.py {} {} {} --align rotate_translate_flip --aligncmp ccc --cmp ccc --saveali --parallel thread:12".format(options.ptcls, clsout, simmxfile))
	

	E2end(logid)