def run_svm(scaled_df, n_trials=5, train_size=5000):
    """Grid-search an SVM over C/gamma and score the per-metric optima.

    For each of ``n_trials`` independent train/test splits, a
    GridSearchCV scores every (C, gamma) pair on accuracy, F1 and
    ROC-AUC (no refit).  The best configuration *per metric* is then
    refit on the training split and scored on both splits.

    Parameters
    ----------
    scaled_df : pandas.DataFrame
        Feature columns first; binary target is the last column and is
        also accessible as attribute ``y``.
    n_trials : int, optional
        Number of independent split/search trials (default 5, matching
        the previous hard-coded behavior).
    train_size : int, optional
        Rows in each training split (default 5000, as before).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Per-trial train and test scores with columns
        ['accuracy', 'f1', 'auc'].
    """
    clf = SVC()
    # 'auto' means gamma = 1 / n_features
    search_params = {
        'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
        'gamma': [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 'auto'],
    }
    metrics = ['accuracy', 'f1', 'roc_auc']

    raw_train_arr = []
    raw_test_arr = []
    for _ in range(n_trials):
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=train_size)

        # Multi-metric search; refit=False because we refit manually below.
        search_results = GridSearchCV(clf, search_params, scoring=metrics,
                                      refit=False)
        search_results.fit(X_train, y_train)

        results = pd.DataFrame(search_results.cv_results_['params'])
        results['mean_accuracy'] = search_results.cv_results_['mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_['mean_test_roc_auc']

        # idxmax replaces the sort-descending-then-head pattern (O(n) vs
        # O(n log n)); ties were resolved arbitrarily before as well.
        opt_acc_inf = results.loc[results['mean_accuracy'].idxmax()]
        opt_f1_inf = results.loc[results['mean_f1'].idxmax()]
        opt_auc_inf = results.loc[results['mean_auc'].idxmax()]

        # Refit one classifier per metric with its best hyper-parameters.
        opt_acc_clf = SVC(C=opt_acc_inf.C, gamma=opt_acc_inf.gamma).fit(X_train, y_train)
        opt_f1_clf = SVC(C=opt_f1_inf.C, gamma=opt_f1_inf.gamma).fit(X_train, y_train)
        opt_auc_clf = SVC(C=opt_auc_inf.C, gamma=opt_auc_inf.gamma).fit(X_train, y_train)

        raw_train_arr.append([
            opt_acc_clf.score(X_train, y_train),
            f1_score(y_train, opt_f1_clf.predict(X_train)),
            roc_auc_score(y_train, opt_auc_clf.predict(X_train)),
        ])
        raw_test_arr.append([
            opt_acc_clf.score(X_test, y_test),
            f1_score(y_test, opt_f1_clf.predict(X_test)),
            roc_auc_score(y_test, opt_auc_clf.predict(X_test)),
        ])

    # The list-of-lists is already (n_trials, 3); no reshape needed.
    cols = ['accuracy', 'f1', 'auc']
    raw_train_df = pd.DataFrame(data=np.array(raw_train_arr), columns=cols)
    raw_test_df = pd.DataFrame(data=np.array(raw_test_arr), columns=cols)
    return raw_train_df, raw_test_df
# Script entry point: load pickled feature data, integer-encode the string
# labels, train a baseline SVM and print its test-set predictions.
ap = argparse.ArgumentParser()
ap.add_argument("--data", default="data.pickle", help='Path to data')
ap.add_argument("--folder", default="matching_out/")
ap.add_argument("--models_out", default="svm_gpu.pickle")
args = ap.parse_args()

# Context manager closes the handle even if unpickling fails (the previous
# pickle.loads(open(...).read()) leaked the file object).
# NOTE(review): unpickling executes arbitrary code -- only run on trusted files.
with open(args.folder + args.data, "rb") as f:
    data = pickle.load(f)

# Encode the labels
le = LabelEncoder()
labels = le.fit_transform(data["labels"])
print("Encoder: ", labels)

X = np.array(data['data'])
y = labels

# Hold out 20% of the samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

svclassifier = SVC()
svclassifier.fit(X_train, y_train)

y_pred = svclassifier.predict(X_test)
print(y_pred)
print(type(y_pred))
# Benchmark-harness section: configure, fit and time a cuML C-SVM classifier.
# NOTE(review): X_train/X_test/y_train/y_test, params and bench are defined
# earlier in the file, outside this chunk.

# Default gamma mirrors sklearn's 'auto': 1 / n_features.
if params.gamma is None:
    params.gamma = 1.0 / X_train.shape[1]

# Kernel-cache size is derived from the training-set size; reported in MB.
cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0],
                                                max_cache=params.max_cache_size)
params.cache_size_mb = cache_size_bytes / 1024**2

# assumes y_train is a single-column DataFrame -- TODO confirm against caller
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create our C-SVM classifier
clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
          cache_size=params.cache_size_mb, tol=params.tol,
          gamma=params.gamma)

# Time fit and predict
fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train,
                                          params=params)
# Number of support vectors found by the fit.
params.sv_len = clf.support_.shape[0]

# Prediction is timed on the *training* set.
# NOTE(review): arguments to accuracy_score are (prediction, truth) --
# harmless if bench.accuracy_score is symmetric, but confirm.
predict_time, y_pred = bench.measure_function_time(
    clf.predict, X_train, params=params)
train_acc = 100 * bench.accuracy_score(y_pred, y_train)

# Test accuracy is computed outside the timed region.
y_pred = clf.predict(X_test)
test_acc = 100 * bench.accuracy_score(y_pred, y_test)

# data=[X_train, X_train]: both timed stages (fit and predict) ran on the
# training set, so the same array is reported for each stage.
bench.print_output(library='cuml', algorithm='svc',
                   stages=['training', 'prediction'], params=params,
                   functions=['SVM.fit', 'SVM.predict'],
                   times=[fit_time, predict_time],
                   accuracy_type='accuracy[%]',
                   accuracies=[train_acc, test_acc],
                   data=[X_train, X_train], alg_instance=clf)
# Create our C-SVM classifier clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter, cache_size=params.cache_size_mb, tol=params.tol, gamma=params.gamma) columns = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'kernel', 'cache_size_mb', 'C', 'sv_len', 'n_classes', 'accuracy', 'time') # Time fit and predict fit_time, _ = measure_function_time(clf.fit, X_train, y_train, params=params) params.sv_len = clf.support_.shape[0] y_pred = clf.predict(X_train) train_acc = 100 * accuracy_score(y_pred, y_train) predict_time, y_pred = measure_function_time(clf.predict, X_test, params=params) test_acc = 100 * accuracy_score(y_pred, y_test) print_output(library='cuml', algorithm='svc', stages=['training', 'prediction'], columns=columns, params=params, functions=['SVM.fit', 'SVM.predict'], times=[fit_time, predict_time], accuracy_type='accuracy[%]',
class CumlSVMFitter(FitterBase):
    """Hyperopt-driven fitter around cuML's SVC.

    Wraps single-split training, hyper-parameter search (single split or
    k-fold) and k-fold out-of-fold prediction.  ``label`` is the target
    column name; ``metric`` selects how ``FitterBase.get_loss`` scores
    predictions (only 'auc' is treated specially below).
    """

    def __init__(self, label='label', metric='error', opt: SVMOpt = None, max_eval=100):
        super(CumlSVMFitter, self).__init__(label, metric, max_eval)
        # Fall back to the default search space when none is supplied.
        self.opt = opt if opt is not None else SVMOpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        """Fit an SVC on ``train_df`` and return the loss on ``eval_df``.

        When ``params`` is None the hyper-parameters found by a previous
        search (``self.opt_params``) are used; calling train() before any
        search therefore raises AttributeError.
        """
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train = train_df.drop(columns=[self.label]), train_df[self.label]
        x_eval, y_eval = eval_df.drop(columns=[self.label]), eval_df[self.label]
        use_params = deepcopy(self.opt_params if params is None else params)
        # NOTE(review): train_k_fold calls predict_proba on this model; cuML's
        # SVC only supports that with probability=True at construction --
        # confirm SVMOpt includes it.
        self.clf = SVC(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        return self.get_loss(y_pred=preds, y=y_eval)

    def search(self, train_df, eval_df):
        """Tune hyper-parameters with hyperopt on a single train/eval split."""
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            if self.metric != 'auc':
                # Hard integer labels for thresholded metrics.
                y_pred = y_pred.astype(int)
            # NOTE(review): positional order here is (truth, prediction),
            # while train() passes y_pred=/y= keywords the other way round --
            # confirm get_loss's signature before relying on an asymmetric
            # metric such as AUC.
            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        """Tune hyper-parameters with hyperopt, scoring by k-fold mean loss."""
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
                if self.metric != 'auc':
                    y_pred = y_pred.astype(int)
                # NOTE(review): positional (truth, prediction) -- see search().
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold, asdict(self.opt),
                               algo=tpe.suggest, max_evals=self.max_eval)

    def train_k_fold(self, k_fold, train_data, test_data, params=None,
                     drop_test_y=True):
        """Out-of-fold train predictions plus fold-averaged test predictions.

        Returns (train_pred, test_pred, acc_result): positive-class
        probabilities for every training row (predicted by the fold model
        that did not see it), test probabilities averaged over the k fold
        models, and the per-fold losses.
        """
        acc_result = list()
        # Every entry of train_pred is overwritten by exactly one fold below,
        # but zero-initialize anyway for determinism.
        train_pred = cudf.Series(np.zeros(train_data.shape[0]))
        # BUGFIX: was np.empty -- accumulating with += on top of
        # uninitialized memory corrupted the averaged test predictions.
        test_pred = cudf.Series(np.zeros(test_data.shape[0]))
        dtest = test_data.drop(columns=self.label) if drop_test_y else test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            # Out-of-fold probability of the positive class.
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            if self.metric != 'auc':
                y_pred = y_pred.astype(int)
            # NOTE(review): positional (truth, prediction) -- see search().
            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            # Accumulate each fold's test predictions; averaged below.
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
C = 10 else: C = 10 gamma = 0.01 clf = SVC(probability=True, C=C, gamma=gamma) else: clf = LogisticRegression() #normal case clf.fit(X_train, y_train) #save classifier filename = './data/detectors/LR_' + attack_method + '_' + detector + '_' + mode + '_' + net + '.sav' pickle.dump(clf, open(filename, 'wb')) print('Evaluating classifier...') prediction = clf.predict(X_test) prediction_pr = clf.predict_proba(X_test)[:, 1] benign_rate = 0 benign_guesses = 0 ad_guesses = 0 ad_rate = 0 for i in range(len(prediction)): if prediction[i] == 0: benign_guesses += 1 if y_test[i] == 0: benign_rate += 1 else: ad_guesses += 1 if y_test[i] == 1: ad_rate += 1