def run_svm(scaled_df, n_trials=5, train_size=5000):
    """Grid-search an SVC over C/gamma and report train/test metrics.

    Runs ``n_trials`` independent train/test splits. For each split a
    GridSearchCV (multi-metric, refit=False) scores every (C, gamma)
    combination; the best setting per metric (accuracy, f1, roc_auc) is
    refit on the training split and evaluated on both splits.

    Parameters
    ----------
    scaled_df : pandas.DataFrame
        Feature columns in all but the last position, target in ``scaled_df.y``.
    n_trials : int, optional
        Number of independent split/search trials (default 5).
    train_size : int, optional
        Rows used for the training split in each trial (default 5000).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Per-trial train and test scores with columns
        ``['accuracy', 'f1', 'auc']``.
    """
    # Initiate classifier and the C / gamma search grid.
    clf = SVC()
    C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
    # 'auto' means 1 / n_features
    gamma_list = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 'auto']
    search_params = {'C': C_list, 'gamma': gamma_list}

    # Metrics to score during the grid search.
    metrics = ['accuracy', 'f1', 'roc_auc']

    raw_train_arr = []
    raw_test_arr = []

    for _ in range(n_trials):
        # Fresh train/test split per trial.
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # Multi-metric search; refit=False because we refit manually per metric.
        search_results = GridSearchCV(clf, search_params, scoring=metrics,
                                      refit=False)
        search_results.fit(X_train, y_train)

        # Collect mean CV scores alongside the parameter grid.
        results = pd.DataFrame(search_results.cv_results_['params'])
        results['mean_accuracy'] = search_results.cv_results_[
            'mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_['mean_test_roc_auc']

        # Best row per metric. idxmax is deterministic on ties (first
        # occurrence), unlike an unstable descending sort.
        opt_acc_inf = results.loc[results['mean_accuracy'].idxmax()]
        opt_f1_inf = results.loc[results['mean_f1'].idxmax()]
        opt_auc_inf = results.loc[results['mean_auc'].idxmax()]

        # Refit one classifier per metric with its best hyper-parameters.
        opt_acc_clf = SVC(C=opt_acc_inf.C, gamma=opt_acc_inf.gamma)
        opt_f1_clf = SVC(C=opt_f1_inf.C, gamma=opt_f1_inf.gamma)
        opt_auc_clf = SVC(C=opt_auc_inf.C, gamma=opt_auc_inf.gamma)
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)

        # Score each optimal classifier on its own metric, train and test.
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train, opt_auc_clf.predict(X_train))
        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))

        raw_train_arr.append(
            [train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])

    # A list of n_trials 3-element rows already has shape (n_trials, 3);
    # no reshape needed (the old hard-coded reshape(5, 3) broke for other
    # trial counts).
    raw_train_df = pd.DataFrame(data=np.array(raw_train_arr),
                                columns=['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data=np.array(raw_test_arr),
                               columns=['accuracy', 'f1', 'auc'])

    return raw_train_df, raw_test_df
class CumlSVMFitter(FitterBase):
    """GPU SVC fitter (cuML/cuDF) with hyperopt-based hyper-parameter search.

    Wraps a cuML ``SVC``: single train/eval runs, hyperopt ``fmin`` search
    on one split or k folds, and k-fold out-of-fold / test prediction.
    """

    def __init__(self, label='label', metric='error', opt: SVMOpt = None,
                 max_eval=100):
        """Store label column, metric name, search space and eval budget.

        Parameters
        ----------
        label : str
            Name of the target column in the supplied DataFrames.
        metric : str
            Metric key understood by ``self.get_loss`` (e.g. 'error', 'auc').
        opt : SVMOpt, optional
            Hyper-parameter search space; a default ``SVMOpt()`` is used
            when omitted.
        max_eval : int
            Maximum number of hyperopt evaluations.
        """
        super(CumlSVMFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = SVMOpt()
        # Most recently fitted classifier; set by train().
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        """Fit an SVC on train_df and return the loss on eval_df.

        Uses ``params`` when given, otherwise the previously found
        ``self.opt_params``. Leaves the fitted model in ``self.clf``.
        """
        # Move both frames to the GPU.
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train = train_df.drop(columns=[self.label])
        y_train = train_df[self.label]
        x_eval = eval_df.drop(columns=[self.label])
        y_eval = eval_df[self.label]
        # Deep-copy so hyperopt's dict (or the caller's) is never mutated.
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = SVC(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        return self.get_loss(y_pred=preds, y=y_eval)

    def search(self, train_df, eval_df):
        """Hyperopt search on a single train/eval split.

        Stores the best parameter dict in ``self.opt_params``.
        """
        self.opt_params = dict()

        def train_impl(params):
            # Objective: fit with candidate params, score on the eval split.
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                # Non-AUC metrics expect hard integer class labels.
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt), algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        """Hyperopt search scored by mean loss across k folds.

        Stores the best parameter dict in ``self.opt_params``.
        """
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None,
                     drop_test_y=True):
        """K-fold training: out-of-fold train predictions + averaged test
        predictions + per-fold losses.

        Returns
        -------
        (cudf.Series, cudf.Series, list)
            Out-of-fold positive-class probabilities for ``train_data``,
            fold-averaged probabilities for ``test_data``, and per-fold
            losses.
        """
        acc_result = list()
        # Out-of-fold slots; every row is overwritten by exactly one fold.
        train_pred = cudf.Series(np.zeros(train_data.shape[0]))
        # Running-sum accumulator — MUST start at zero. The previous
        # np.empty() left uninitialized memory in the sum, corrupting the
        # averaged test predictions.
        test_pred = cudf.Series(np.zeros(test_data.shape[0]))
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            # Positive-class probability for the held-out fold rows.
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Accumulate this fold's test predictions; averaged below.
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
# CLI: load pickled features/labels, fit an SVC, print test predictions.
ap = argparse.ArgumentParser()
ap.add_argument("--data", default="data.pickle", help='Path to data')
ap.add_argument("--folder", default="matching_out/")
ap.add_argument("--models_out", default="svm_gpu.pickle")
args = ap.parse_args()

# SECURITY NOTE: unpickling executes arbitrary code — only load trusted
# files. Use a context manager so the handle is always closed (the old
# pickle.loads(open(...).read()) leaked it).
with open(args.folder + args.data, "rb") as f:
    data = pickle.load(f)

# Encode the labels as integers.
le = LabelEncoder()
labels = le.fit_transform(data["labels"])
print("Encoder: ", labels)

X = np.array(data['data'])
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

svclassifier = SVC()
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)
print(y_pred)
print(type(y_pred))

#with open(args.folder + args.models_out, 'wb') as f:
#    pickle.dump(svclassifier, f)

#from sklearn.metrics import classification_report, confusion_matrix
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))
X_train = scaler.transform(X_train) X_test = scaler.transform(X_test) if detector == 'LayerMFS': gamma = 0.1 if attack_method == 'cw': C = 1 else: C = 10 else: C = 10 gamma = 0.01 clf = SVC(probability=True, C=C, gamma=gamma) else: clf = LogisticRegression() #normal case clf.fit(X_train, y_train) #save classifier filename = './data/detectors/LR_' + attack_method + '_' + detector + '_' + mode + '_' + net + '.sav' pickle.dump(clf, open(filename, 'wb')) print('Evaluating classifier...') prediction = clf.predict(X_test) prediction_pr = clf.predict_proba(X_test)[:, 1] benign_rate = 0 benign_guesses = 0 ad_guesses = 0 ad_rate = 0 for i in range(len(prediction)): if prediction[i] == 0: