def print_accuracy(train_arr, test_arr, trader_id):
    """Fit a one-class SVM per series and print train/test scores for a trader.

    For each paired (train, test) series, a fresh OCSVM is trained on the
    training column.  The training line prints ``100 - percent flagged as
    outlier``; the test line prints the raw flagged percentage.

    Parameters
    ----------
    train_arr : list of sequences
        Training series, one per feature/stream.
    test_arr : list of sequences
        Test series, parallel to ``train_arr``.
    trader_id :
        Identifier echoed in the printed report.
    """
    # Nothing to report when either collection is empty.
    if len(train_arr) == 0 or len(test_arr) == 0:
        return
    for idx in range(len(train_arr)):
        n_train = len(train_arr[idx])
        n_test = len(test_arr[idx])
        # Skip series with no observations on either side.
        if n_train == 0 or n_test == 0:
            continue
        # Reshape each 1-D series into a single-feature column matrix.
        train_col = np.array([train_arr[idx]]).T
        test_col = np.array([test_arr[idx]]).T
        model = OCSVM()
        model.fit(train_col)
        flagged = model.predict(train_col)
        print("TRAINING ACCURACY for TRADER", trader_id, ":",
              100 - (sum(flagged) * 100 / n_train))
        # NOTE(review): this prints the flagged percentage itself, matching
        # the original behaviour.
        flagged = model.predict(test_col)
        print("TESTING ACCURACY: ", sum(flagged) * 100 / n_test)
def detect_outliers(lst):
    """Detect outliers in a list of numpy arrays.

    Parameters
    ----------
    lst : List
        List of 1-D numpy arrays, one sample each.

    Returns
    -------
    inlier_idx : List
        Indices into ``lst`` of the inliers.
    outlier_idx : List
        Indices into ``lst`` of the outliers.
    """
    # BUG FIX: an empty input previously crashed — ``clf.fit`` cannot fit on
    # no data and the percentage log below divides by ``len(lst)``.
    if len(lst) == 0:
        return [], []
    clf = OCSVM(verbose=0)
    clf.fit(lst)
    inlier_idx = []
    outlier_idx = []
    for index, data in enumerate(lst):
        y = clf.predict(data.reshape(1, -1))
        if y:  # y == 1 marks an outlier
            logger.debug('Found outlier: {0}'.format(index))
            outlier_idx.append(index)
        else:
            inlier_idx.append(index)
    logger.info('{:.0%} are outliers'.format(len(outlier_idx) / len(lst)))
    return inlier_idx, outlier_idx
class TestOCSVM(unittest.TestCase):
    """Unit tests for the OCSVM detector fitted on synthetic data."""

    def setUp(self):
        self.n_train = 200
        self.n_test = 100
        self.contamination = 0.1
        self.roc_floor = 0.8
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_parameters(self):
        # Every fitted attribute must exist and be populated.
        fitted_attrs = ['decision_scores_', 'labels_', 'threshold_',
                        '_mu', '_sigma', 'support_', 'support_vectors_',
                        'dual_coef_', 'intercept_']
        for attr in fitted_attrs:
            assert (hasattr(self.clf, attr) and
                    getattr(self.clf, attr) is not None)
        # 'coef_' is only available for the linear kernel, so it is not
        # checked here.

    def test_train_scores(self):
        # One decision score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # Unknown probability-conversion methods must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # Unknown scoring names raise NotImplementedError.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # assert the rank ordering follows the decision scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores),
                        atol=3.5)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the rank ordering follows the decision scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores),
                        atol=3.5)
        # normalized ranks lie in [0, 1]
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
n_test=n_test, n_features=2, contamination=contamination, random_state=42) # train one_class_svm detector clf_name = 'OneClassSVM' clf = OCSVM() clf.fit(X_train) # get the prediction labels and outlier scores of the training data y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores # get the prediction on the test data y_test_pred = clf.predict(X_test) # outlier labels (0 or 1) y_test_scores = clf.decision_function(X_test) # outlier scores # evaluate and print the results print("\nOn Training Data:") evaluate_print(clf_name, y_train, y_train_scores) print("\nOn Test Data:") evaluate_print(clf_name, y_test, y_test_scores) # visualize the results visualize(clf_name, X_train, y_train, X_test, y_test, y_train_pred,
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
# from pyod.models.mcd import MCD

# Fit two outlier detectors on the same training set:
#   clf1 — PCA with feature standardisation, assuming 20% contamination;
#   clf2 — a polynomial-kernel one-class SVM with the same contamination.
# NOTE(review): `train_set`, `test_set` and `pred_train_set` come from
# earlier in the file (outside this fragment); presumably feature rows plus
# 0/1 ground-truth labels — verify against the surrounding code.
clf1=PCA(standardization = True,contamination=0.2)
# clf1 = MCD(assume_centered = True)
clf2=OCSVM(kernel = 'poly',nu = 0.25,degree =2,contamination =0.2)
# clf2 = OCSVM(kernel = 'linear',nu =0.02)

clf1.fit(train_set)
clf2.fit(train_set)

# Binary predictions (1 = outlier) from both detectors on train and test data.
y_pred_train_pca=clf1.predict(train_set)
y_pred_test_pca=clf1.predict(test_set)
y_pred_train_ocsvm=clf2.predict(train_set)
y_pred_test_ocsvm=clf2.predict(test_set)
print(clf1.explained_variance_)
# print(y_pred_test_pca,y_pred_test_ocsvm)

# Count, per detector, how many predicted outliers (label 1) match the
# ground truth on the training set.
train_pca_correct=0
train_ocsvm_correct=0
print("TRAIN SET")
for i in range(len(pred_train_set)):
    # print("Actual:",pred_train_set[i],"PCA",y_pred_train_pca[i],"OCSVM",y_pred_train_ocsvm[i])
    if pred_train_set[i]==y_pred_train_pca[i] and pred_train_set[i]==1:
        train_pca_correct+=1
    if pred_train_set[i]==y_pred_train_ocsvm[i] and y_pred_train_ocsvm[i]==1:
        train_ocsvm_correct+=1

# Counters for the test set (tallied later in the file).
test_pca_correct=0
test_ocsvm_correct=0
class TestOCSVM(unittest.TestCase):
    """Unit tests for the OCSVM detector fitted on synthetic data."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination, random_state=42)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # FIX: the nose-style ``assert_true`` helper is deprecated/removed;
        # use plain ``assert`` like the sibling test class in this file.
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, '_mu') and
                self.clf._mu is not None)
        assert (hasattr(self.clf, '_sigma') and
                self.clf._sigma is not None)
        assert (hasattr(self.clf, 'support_') and
                self.clf.support_ is not None)
        assert (hasattr(self.clf, 'support_vectors_') and
                self.clf.support_vectors_ is not None)
        assert (hasattr(self.clf, 'dual_coef_') and
                self.clf.dual_coef_ is not None)
        assert (hasattr(self.clf, 'intercept_') and
                self.clf.intercept_ is not None)
        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')

    def test_train_scores(self):
        # One decision score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # Unknown probability-conversion methods must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_fit_predict_score(self):
        self.clf.fit_predict_score(self.X_test, self.y_test)
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='roc_auc_score')
        self.clf.fit_predict_score(self.X_test, self.y_test,
                                   scoring='prc_n_score')
        # Unknown scoring names raise NotImplementedError.
        with assert_raises(NotImplementedError):
            self.clf.fit_predict_score(self.X_test, self.y_test,
                                       scoring='something')

    def test_predict_rank(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test)
        # assert the rank ordering follows the decision scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
        assert_array_less(-0.1, pred_ranks)

    def test_predict_rank_normalized(self):
        pred_scores = self.clf.decision_function(self.X_test)
        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
        # assert the rank ordering follows the decision scores
        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
        # normalized ranks lie in [0, 1]
        assert_array_less(pred_ranks, 1.01)
        assert_array_less(-0.1, pred_ranks)

    def tearDown(self):
        pass
class TestOCSVM(unittest.TestCase):
    """Unit tests for the OCSVM detector fitted on synthetic data."""

    def setUp(self):
        self.n_train = 100
        self.n_test = 50
        self.contamination = 0.1
        self.roc_floor = 0.6
        self.X_train, self.y_train, self.X_test, self.y_test = generate_data(
            n_train=self.n_train, n_test=self.n_test,
            contamination=self.contamination)

        self.clf = OCSVM()
        self.clf.fit(self.X_train)

    def test_sklearn_estimator(self):
        check_estimator(self.clf)

    def test_parameters(self):
        # BUG FIX: the original wrapped each check in
        # ``if not hasattr(...): self.assertRaises(AttributeError, '<msg>')``.
        # assertRaises tries to *call* its second argument, so passing a
        # message string raised TypeError instead of reporting the missing
        # attribute — and when the attribute existed, nothing was asserted
        # at all.  Assert the fitted attributes directly instead.
        assert (hasattr(self.clf, 'decision_scores_') and
                self.clf.decision_scores_ is not None)
        assert (hasattr(self.clf, 'labels_') and
                self.clf.labels_ is not None)
        assert (hasattr(self.clf, 'threshold_') and
                self.clf.threshold_ is not None)
        assert (hasattr(self.clf, 'support_') and
                self.clf.support_ is not None)
        assert (hasattr(self.clf, 'support_vectors_') and
                self.clf.support_vectors_ is not None)
        assert (hasattr(self.clf, 'dual_coef_') and
                self.clf.dual_coef_ is not None)
        # only available for linear kernel
        # if not hasattr(self.clf, 'coef_') or self.clf.coef_ is None:
        #     self.assertRaises(AttributeError, 'coef_ is not set')
        assert (hasattr(self.clf, 'intercept_') and
                self.clf.intercept_ is not None)

    def test_train_scores(self):
        # One decision score per training sample.
        assert_equal(len(self.clf.decision_scores_), self.X_train.shape[0])

    def test_prediction_scores(self):
        pred_scores = self.clf.decision_function(self.X_test)
        # check score shapes
        assert_equal(pred_scores.shape[0], self.X_test.shape[0])
        # check performance
        assert_greater(roc_auc_score(self.y_test, pred_scores),
                       self.roc_floor)

    def test_prediction_labels(self):
        pred_labels = self.clf.predict(self.X_test)
        assert_equal(pred_labels.shape, self.y_test.shape)

    def test_prediction_proba_linear(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='linear')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_unify(self):
        pred_proba = self.clf.predict_proba(self.X_test, method='unify')
        assert_greater_equal(pred_proba.min(), 0)
        assert_less_equal(pred_proba.max(), 1)

    def test_prediction_proba_parameter(self):
        # Unknown probability-conversion methods must be rejected.
        with assert_raises(ValueError):
            self.clf.predict_proba(self.X_test, method='something')

    def test_fit_predict(self):
        pred_labels = self.clf.fit_predict(self.X_train)
        assert_equal(pred_labels.shape, self.y_train.shape)

    def test_evaluate(self):
        self.clf.fit_predict_evaluate(self.X_test, self.y_test)

    def tearDown(self):
        pass
# Per-trader anomaly detection: fit three one-class SVMs (volume-only,
# price-only, joint volume+price) on one trader's transactions and collect
# the timestamps each model flags as malicious.
# NOTE(review): this fragment references ``i``, ``d_transaction``,
# ``d_attack`` and ``data_vol`` defined outside the visible chunk — it
# appears to sit inside a loop over trader ids.  Confirm that ``data_vol``
# is re-initialised per trader just above this block, otherwise volumes
# accumulate across traders.
data_price = []
data_vol_price = []
mal_t_stamps1 = []   # timestamps flagged by the volume-only model
mal_t_stamps2 = []   # timestamps flagged by the price-only model
mal_t_stamps12 = []  # timestamps flagged by the joint model
clf1 = OCSVM()
clf2 = OCSVM()
clf12 = OCSVM()
# Each transaction j is indexed as: j[0] timestamp key, j[1] price, j[2] volume
# (inferred from the append targets — verify against the loader).
for j in d_transaction[i]:
    data_vol.append(j[2])
    data_price.append(j[1])
    data_vol_price.append([j[2], j[1]])
clf1.fit(np.array([data_vol]).T)     # volume-only model
clf2.fit(np.array([data_price]).T)   # price-only model
clf12.fit(np.array(data_vol_price))  # joint (volume, price) model
for j in d_transaction[i]:
    p1 = clf1.predict(np.array(j[2]).reshape(1, -1))
    # BUG FIX: clf2 was fitted on prices (j[1]) but originally predicted on
    # the volume j[2]; feed it the price instead.
    p2 = clf2.predict(np.array(j[1]).reshape(1, -1))
    p3 = clf12.predict(np.array([j[2], j[1]]).T.reshape(1, -1))
    if p1 == 1:
        mal_t_stamps1.append(j[0])
    if p2 == 1:
        mal_t_stamps2.append(j[0])
    if p3 == 1:
        mal_t_stamps12.append(j[0])
# Report how many flagged timestamps coincide with the known attack set.
s = set(d_attack[i])
print("TRADER", i, "VOL", len(s & set(mal_t_stamps1)), "OUT OF",
      len(mal_t_stamps1), "PRICE", len(s & set(mal_t_stamps2)), "OUT OF",
      len(mal_t_stamps2), "VOL AND PRICE", len(s & set(mal_t_stamps12)),
      "OUT OF", len(mal_t_stamps12))
y = data['s'] #划分测试集和训练集 X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33) #使用pyod中的OCSVM算法拟合数据 clf_name = 'OCSVM' clf = OCSVM() clf.fit(X_train) #预测得到由0和1组成的数组,1表示离群点,0表示飞离群点 y_train_pred = clf.labels_ # binary labels (0: inliers, 1: outliers) y_train_scores = clf.decision_scores_ # raw outlier scores,The outlier scores of the training data. #预测样本是不是离群点,返回0和1 的数组 y_test_pred = clf.predict(X_test) y_test_scores = clf.decision_function( X_test) # outlier scores,The anomaly score of the input samples. #使用sklearn中的roc_auc_score方法得到auc值,即roc曲线下面的面积 try: sumAuc_train += sklearn.metrics.roc_auc_score(y_train, y_train_scores, average='macro') sumAuc_test += sklearn.metrics.roc_auc_score(y_test, y_test_scores, average='macro') #s=precision_score(y_train, y_train_scores, average='macro') i += 1 print(sumAuc_train, sumAuc_test) except ValueError:
def main():
    """Run an embedding + outlier-detection baseline and log val/test metrics.

    Parses CLI arguments, optionally embeds the graph, fits the chosen
    anomaly detector (OCSVM / IForest / PCA / AutoEncoder) on the training
    mask, then evaluates on the validation and test masks.
    """
    parser = argparse.ArgumentParser(description='baseline')
    register_data_args(parser)
    parser.add_argument("--mode", type=str, default='A',
                        choices=['A', 'AX', 'X'],
                        help="dropout probability")
    parser.add_argument("--seed", type=int, default=-1,
                        help="random seed, -1 means dont fix seed")
    parser.add_argument(
        "--emb-method", type=str, default='DeepWalk',
        help="embedding methods: DeepWalk, Node2Vec, LINE, SDNE, Struc2Vec")
    parser.add_argument("--ad-method", type=str, default='OCSVM',
                        help="embedding methods: PCA,OCSVM,IF,AE")
    args = parser.parse_args()

    # Fix all RNGs when a seed is supplied.
    if args.seed != -1:
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    logging.basicConfig(
        filename="./log/baseline.log",
        filemode="a",
        format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
        level=logging.INFO)
    logger = logging.getLogger('baseline')

    datadict = emb_dataloader(args)

    # BUG FIX: dur1 was only assigned on the embedding path, so running with
    # --mode X crashed with NameError at the training-time print below.
    dur1 = 0.0
    if args.mode == 'X':
        # Raw node features only.
        data = datadict['features']
    else:
        t0 = time.time()
        embeddings = embedding(args, datadict)
        dur1 = time.time() - t0
        if args.mode == 'A':
            # Embeddings only.
            data = embeddings
        if args.mode == 'AX':
            # Embeddings concatenated with raw features.
            data = np.concatenate((embeddings, datadict['features']), axis=1)
    logger.debug(f'data shape: {data.shape}')

    # Select the anomaly detector.
    if args.ad_method == 'OCSVM':
        clf = OCSVM(contamination=0.1)
    if args.ad_method == 'IF':
        clf = IForest(n_estimators=100, contamination=0.1, n_jobs=-1,
                      behaviour="new")
    if args.ad_method == 'PCA':
        clf = PCA(contamination=0.1)
    if args.ad_method == 'AE':
        clf = AutoEncoder(contamination=0.1)

    t1 = time.time()
    clf.fit(data[datadict['train_mask']])
    dur2 = time.time() - t1
    # Typo fixed: "traininig" -> "training".
    print('training time:', dur1 + dur2)

    logger.info('\n')
    logger.info('\n')
    logger.info(
        f'Parameters dataset:{args.dataset} datamode:{args.mode} ad-method:{args.ad_method} emb-method:{args.emb_method}'
    )
    logger.info('-------------Evaluating Validation Results--------------')
    t2 = time.time()
    y_pred_val = clf.predict(data[datadict['val_mask']])
    y_score_val = clf.decision_function(data[datadict['val_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_val,
                                                            y_score_val,
                                                            val=True)
    dur3 = time.time() - t2
    print('infer time:', dur3)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )
    logger.info('-------------Evaluating Test Results--------------')
    y_pred_test = clf.predict(data[datadict['test_mask']])
    y_score_test = clf.decision_function(data[datadict['test_mask']])
    auc, ap, f1, acc, precision, recall = baseline_evaluate(datadict,
                                                            y_pred_test,
                                                            y_score_test,
                                                            val=False)
    logger.info(f'AUC:{round(auc,4)},AP:{round(ap,4)}')
    logger.info(
        f'f1:{round(f1,4)},acc:{round(acc,4)},pre:{round(precision,4)},recall:{round(recall,4)}'
    )