Example #1
# Imports assumed by this snippet; SVC is presumably cuml's drop-in estimator
# (the subject of these examples), though sklearn.svm.SVC behaves the same here.
from cuml.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV


def test_gridsearchCV():
    iris = load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    clf = GridSearchCV(SVC(), parameters)
    clf.fit(iris.data, iris.target)
    assert clf.best_params_['kernel'] == 'rbf'
    assert clf.best_params_['C'] == 10
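A short follow-up sketch, reusing clf from test_gridsearchCV() above; best_score_ and cv_results_ are standard GridSearchCV attributes:

print(clf.best_score_)                     # mean CV score of the best candidate
print(clf.cv_results_['mean_test_score'])  # mean CV score for each kernel/C pair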
Example #2
# Imports assumed by this snippet; sklearn's SVC matches the original pipeline
# example, and cuml's SVC can be swapped in.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


def test_pipeline():
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)
    assert score > 0.8
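A design note on the example above: because StandardScaler sits inside the Pipeline, its statistics are learned from the training split only, avoiding leakage into the test score. A small follow-up sketch, reusing the names from test_pipeline():

print(pipe.named_steps['scaler'].mean_[:3])  # per-feature means fitted on X_train
print(pipe.predict(X_test[:5]))              # new data is scaled automatically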
Example #3
    def _train_predict(self, X: pd.DataFrame, y: pd.DataFrame,
                       X_test: pd.DataFrame, predictors: List[str],
                       train_idx: np.ndarray, valid_idx: np.ndarray,
                       seed: int):
        _params = self._get_default_params()
        _params.update(self.params)

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        target_cols = y_valid.columns.tolist()

        pred_valid = np.zeros_like(y_valid).astype(float)
        preds = np.zeros(shape=(X_test.shape[0], y_train.shape[1]))

        # Loop over each multilabel target column
        for idx, target_col in tqdm(enumerate(target_cols),
                                    total=len(target_cols)):
            # cuml's SVC calls CalibratedClassifierCV(n_folds=5) internally,
            # so at least 5 positive samples are required.
            if y_train[target_col].sum() < 5:
                logger.info(f'{target_col} has fewer than 5 positive samples; '
                            'falling back to AllZerosClassifier')
                clf = AllZerosClassifier()
            else:
                clf = SVC(**_params)
                clf.fit(X_train[predictors].values,
                        y_train[target_col].values.astype(int),
                        convert_dtype=False)
            pred_valid[:, idx] = clf.predict_proba(
                X_valid[predictors].values)[:, 1]
            preds[:, idx] = clf.predict_proba(X_test[predictors].values)[:, 1]

        return preds, pred_valid
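The method above relies on AllZerosClassifier, which is defined elsewhere in the originating project. A minimal sketch consistent with how it is used here (predict_proba must return a two-column array whose positive-class column is all zeros) could look like this:

import numpy as np


class AllZerosClassifier:
    """Hypothetical stub for targets with too few positive samples."""

    def fit(self, X, y=None):
        return self

    def predict_proba(self, X):
        # Column 0 is P(negative) = 1, column 1 is P(positive) = 0, matching
        # the [:, 1] indexing used in _train_predict above.
        proba = np.zeros((X.shape[0], 2), dtype=float)
        proba[:, 0] = 1.0
        return proba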
Example #4
# Imports assumed by this snippet; SVC is presumably cuml's (sklearn's is
# interchangeable for the C/gamma parameters used here).
import numpy as np
import pandas as pd
from cuml.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split


def run_svm(scaled_df):
    # Initialize the classifier and the grids of C and gamma values
    clf = SVC()
    C_list = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]

    # 'auto' means 1/(n_features)
    gamma_list = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 'auto']

    search_params = {'C': C_list, 'gamma': gamma_list}

    # Set metrics
    metrics = ['accuracy', 'f1', 'roc_auc']

    raw_train_arr = []
    raw_test_arr = []

    # Over five trials
    for i in range(5):

        # Train test split data
        X_train, X_test, y_train, y_test = train_test_split(
            scaled_df.iloc[:, :-1], scaled_df.y, train_size=5000)

        # Init GridSearch
        search_results = GridSearchCV(clf,
                                      search_params,
                                      scoring=metrics,
                                      refit=False)

        # Run GridSearch
        search_results.fit(X_train, y_train)

        # Get results
        results = pd.DataFrame(search_results.cv_results_['params'])

        results['mean_accuracy'] = search_results.cv_results_[
            'mean_test_accuracy']
        results['mean_f1'] = search_results.cv_results_['mean_test_f1']
        results['mean_auc'] = search_results.cv_results_['mean_test_roc_auc']

        # Get optimal classifier using results dataframe
        opt_acc_inf = results.sort_values(by='mean_accuracy',
                                          ascending=False).iloc[0]
        opt_f1_inf = results.sort_values(by='mean_f1', ascending=False).iloc[0]
        opt_auc_inf = results.sort_values(by='mean_auc',
                                          ascending=False).iloc[0]

        # Init optimal classifiers
        opt_acc_clf = SVC(C=opt_acc_inf.C, gamma=opt_acc_inf.gamma)
        opt_f1_clf = SVC(C=opt_f1_inf.C, gamma=opt_f1_inf.gamma)
        opt_auc_clf = SVC(C=opt_auc_inf.C, gamma=opt_auc_inf.gamma)

        # Fit to train
        opt_acc_clf.fit(X_train, y_train)
        opt_f1_clf.fit(X_train, y_train)
        opt_auc_clf.fit(X_train, y_train)

        # Get train and test metrics
        train_score_acc = opt_acc_clf.score(X_train, y_train)
        train_score_f1 = f1_score(y_train, opt_f1_clf.predict(X_train))
        train_score_auc = roc_auc_score(y_train, opt_auc_clf.predict(X_train))

        test_score_acc = opt_acc_clf.score(X_test, y_test)
        test_score_f1 = f1_score(y_test, opt_f1_clf.predict(X_test))
        test_score_auc = roc_auc_score(y_test, opt_auc_clf.predict(X_test))

        # Append to results
        raw_train_arr.append(
            [train_score_acc, train_score_f1, train_score_auc])
        raw_test_arr.append([test_score_acc, test_score_f1, test_score_auc])

    raw_train_arr = np.array(raw_train_arr).reshape(5, 3)
    raw_test_arr = np.array(raw_test_arr).reshape(5, 3)

    raw_train_df = pd.DataFrame(data=raw_train_arr,
                                columns=['accuracy', 'f1', 'auc'])
    raw_test_df = pd.DataFrame(data=raw_test_arr,
                               columns=['accuracy', 'f1', 'auc'])

    # Return results
    return raw_train_df, raw_test_df
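A hedged usage sketch for run_svm: the slicing above implies scaled_df holds already-scaled features in every column but the last, plus a label column named y, and train_size=5000 implies more than 5000 rows. The data below is illustrative only:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=10000, n_features=20, random_state=0)
scaled_df = pd.DataFrame(StandardScaler().fit_transform(X))
scaled_df['y'] = y  # run_svm reads features via iloc[:, :-1] and labels via .y

train_df, test_df = run_svm(scaled_df)
print(test_df.mean())  # average accuracy / f1 / auc across the five trials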
Example #5
# Imports assumed by this snippet.
import argparse
import pickle

import numpy as np
from cuml.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ap = argparse.ArgumentParser()

ap.add_argument("--data", default="data.pickle", help='Path to data')
ap.add_argument("--folder", default="matching_out/")
ap.add_argument("--models_out", default="svm_gpu.pickle")
args = ap.parse_args()

with open(args.folder + args.data, "rb") as f:
    data = pickle.load(f)
# Encode the labels
le = LabelEncoder()
labels = le.fit_transform(data["labels"])
print("Encoded labels: ", labels)

X = np.array(data['data'])

y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

svclassifier = SVC()

svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
print(y_pred)
print(type(y_pred))
#with open(args.folder + args.models_out, 'wb') as f:
#    pickle.dump(svclassifier, f)
#from sklearn.metrics import classification_report, confusion_matrix
#print(confusion_matrix(y_test,y_pred))
#print(classification_report(y_test,y_pred))
Example #6
# Fragment of a GPU benchmarking script; `bench` and `parser` are defined by
# the surrounding harness and are not shown in this excerpt.
params = bench.parse_args(parser)

X_train, X_test, y_train, y_test = bench.load_data(params)

if params.gamma is None:
    params.gamma = 1.0 / X_train.shape[1]

cache_size_bytes = bench.get_optimal_cache_size(
    X_train.shape[0], max_cache=params.max_cache_size)
params.cache_size_mb = cache_size_bytes / 1024**2
params.n_classes = y_train[y_train.columns[0]].nunique()

clf = SVC(C=params.C,
          kernel=params.kernel,
          cache_size=params.cache_size_mb,
          tol=params.tol,
          gamma=params.gamma,
          probability=params.probability,
          degree=params.degree)

fit_time, _ = bench.measure_function_time(clf.fit,
                                          X_train,
                                          y_train,
                                          params=params)

if params.probability:
    state_predict = 'predict_proba'
    metric_type = 'log_loss'
    clf_predict = clf.predict_proba

    def metric_call(x, y):
        # Truncated in the original listing; given metric_type = 'log_loss'
        # above, the body presumably computes log loss via the harness's
        # helper (an assumption, not confirmed by this excerpt).
        return bench.log_loss(x, y)
Example #7
# Fragment of the same benchmarking harness as Example #6; `bench` and
# `parser` are defined elsewhere.
params = bench.parse_args(parser)

# Load data
X_train, X_test, y_train, y_test = bench.load_data(params)

if params.gamma is None:
    params.gamma = 1.0 / X_train.shape[1]

cache_size_bytes = bench.get_optimal_cache_size(X_train.shape[0],
                                                max_cache=params.max_cache_size)
params.cache_size_mb = cache_size_bytes / 1024**2
params.n_classes = y_train[y_train.columns[0]].nunique()

# Create our C-SVM classifier
clf = SVC(C=params.C, kernel=params.kernel, max_iter=params.maxiter,
          cache_size=params.cache_size_mb, tol=params.tol,
          gamma=params.gamma)

# Time fit and predict
fit_time, _ = bench.measure_function_time(clf.fit, X_train, y_train, params=params)
params.sv_len = clf.support_.shape[0]

predict_time, y_pred = bench.measure_function_time(
    clf.predict, X_train, params=params)
train_acc = 100 * bench.accuracy_score(y_pred, y_train)

y_pred = clf.predict(X_test)
test_acc = 100 * bench.accuracy_score(y_pred, y_test)

bench.print_output(library='cuml', algorithm='svc',
                   stages=['training', 'prediction'], params=params,
                   # The remaining keyword arguments (timed functions and the
                   # accuracy metrics) are truncated in the original listing;
                   # these two are a plausible completion.
                   functions=['SVC.fit', 'SVC.predict'],
                   times=[fit_time, predict_time])
Example #8
# This fragment begins mid-way through an if/elif chain that builds a model
# for each supported algorithm name.
            per_run_time_limit=alg.run_time,
            resampling_strategy=alg.sampling_strategy,
            resampling_strategy_arguments={'folds': alg.folds}
        )
    else:
        model = classification.AutoSklearnClassifier(
            time_left_for_this_task=alg.task_time,
            per_run_time_limit=alg.run_time
        )
    warn_not_gpu_support(alg)
elif alg.name == 'SupportVectorMachines':
    if NVIDIA_RAPIDS_ENABLED:
        from cuml.svm import SVC
    else:
        from sklearn.svm import SVC
    model = SVC(**alg.input_variables.__dict__)
elif alg.name == 'GaussianNaiveBayes':
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB(**alg.input_variables.__dict__)
    warn_not_gpu_support(alg)
elif alg.name == 'LogisticRegression':
    if NVIDIA_RAPIDS_ENABLED:
        # NVIDIA RAPIDS version should be higher than v0.13
        # if version.parse(cuml.__version__) > version.parse("0.13"):
        from cuml.linear_model import LogisticRegression
    else:
        from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(**alg.input_variables.__dict__)
elif alg.name == 'AdaBoost' and alg.type == 'classification':
    from sklearn.ensemble import AdaBoostClassifier
    model = AdaBoostClassifier(**alg.input_variables.__dict__)
Example #9
# Imports assumed by this snippet; FitterBase and SVMOpt are project-local
# definitions not shown here.
from copy import deepcopy
from dataclasses import asdict

import cudf
import numpy as np
from cuml.svm import SVC
from hyperopt import fmin, tpe


class CumlSVMFitter(FitterBase):
    def __init__(self,
                 label='label',
                 metric='error',
                 opt: SVMOpt = None,
                 max_eval=100):
        super(CumlSVMFitter, self).__init__(label, metric, max_eval)
        if opt is not None:
            self.opt = opt
        else:
            self.opt = SVMOpt()
        self.clf = None

    def train(self, train_df, eval_df, params=None):
        train_df, eval_df = cudf.DataFrame(train_df), cudf.DataFrame(eval_df)
        x_train, y_train = train_df.drop(columns=[self.label]), train_df[self.label]
        x_eval, y_eval = eval_df.drop(columns=[self.label]), eval_df[self.label]
        if params is None:
            use_params = deepcopy(self.opt_params)
        else:
            use_params = deepcopy(params)
        self.clf = SVC(**use_params)
        self.clf.fit(X=x_train, y=y_train)
        preds = self.clf.predict(X=x_eval)
        output = self.get_loss(y_pred=preds, y=y_eval)

        return output

    def search(self, train_df, eval_df):
        self.opt_params = dict()

        def train_impl(params):
            self.train(train_df, eval_df, params)
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            return self.get_loss(eval_df[self.label], y_pred)

        self.opt_params = fmin(train_impl,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def search_k_fold(self, k_fold, data):
        self.opt_params = dict()

        def train_impl_nfold(params):
            loss = list()
            for train_id, eval_id in k_fold.split(data):
                train_df = data.iloc[train_id, :]
                eval_df = data.iloc[eval_id, :]
                self.train(train_df, eval_df, params)
                if self.metric == 'auc':
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label]))
                else:
                    y_pred = self.clf.predict(
                        eval_df.drop(columns=[self.label])).astype(int)
                loss.append(self.get_loss(eval_df[self.label], y_pred))
            return np.mean(loss)

        self.opt_params = fmin(train_impl_nfold,
                               asdict(self.opt),
                               algo=tpe.suggest,
                               max_evals=self.max_eval)

    def train_k_fold(self,
                     k_fold,
                     train_data,
                     test_data,
                     params=None,
                     drop_test_y=True):
        acc_result = list()
        train_pred = cudf.Series(np.zeros(train_data.shape[0]))
        # test_pred accumulates fold predictions with +=, so it must start at
        # zero (np.empty would fold uninitialized values into the average).
        test_pred = cudf.Series(np.zeros(test_data.shape[0]))
        if drop_test_y:
            dtest = test_data.drop(columns=self.label)
        else:
            dtest = test_data
        for train_id, eval_id in k_fold.split(train_data):
            train_df = train_data.iloc[train_id, :]
            eval_df = train_data.iloc[eval_id, :]
            self.train(train_df, eval_df, params)
            train_pred[eval_id] = self.clf.predict_proba(
                eval_df.drop(columns=self.label)).iloc[:, 1].values
            if self.metric == 'auc':
                y_pred = self.clf.predict(eval_df.drop(columns=[self.label]))
            else:
                y_pred = self.clf.predict(
                    eval_df.drop(columns=[self.label])).astype(int)

            acc_result.append(self.get_loss(eval_df[self.label], y_pred))
            test_pred += self.clf.predict_proba(dtest).iloc[:, 1]
        test_pred /= k_fold.n_splits
        return train_pred, test_pred, acc_result
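A usage sketch under stated assumptions: FitterBase and SVMOpt come from the originating project, train_data and test_data are DataFrames with a 'label' column, and the tuned parameters enable probability output (train_k_fold calls predict_proba):

from sklearn.model_selection import KFold

fitter = CumlSVMFitter(label='label', metric='auc', max_eval=20)
kf = KFold(n_splits=5, shuffle=True, random_state=0)

fitter.search_k_fold(kf, train_data)  # hyperopt search over the SVMOpt space
train_pred, test_pred, scores = fitter.train_k_fold(kf, train_data, test_data)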
Example #10
# Imports assumed by this snippet; `detector`, `net`, `attack_method`, `mode`,
# and the train/test arrays come from earlier in the original script, and the
# cuml SVC import stays conditional as in the original.
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

if (detector in ('LayerMFS', 'LayerPFS') and net == 'cif100'
        and attack_method in ('cw', 'df')):
    from cuml.svm import SVC
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    if detector == 'LayerMFS':
        gamma = 0.1
        if attack_method == 'cw':
            C = 1
        else:
            C = 10
    else:
        C = 10
        gamma = 0.01
    clf = SVC(probability=True, C=C, gamma=gamma)
else:
    clf = LogisticRegression()  # normal case

clf.fit(X_train, y_train)

# Save the classifier
filename = './data/detectors/LR_' + attack_method + '_' + detector + '_' + mode + '_' + net + '.sav'
pickle.dump(clf, open(filename, 'wb'))

print('Evaluating classifier...')
prediction = clf.predict(X_test)
prediction_pr = clf.predict_proba(X_test)[:, 1]

benign_rate = 0
benign_guesses = 0