def test_eigenpro_classification_conflict_data():
    """Smoke test: fitting and predicting must not raise when every
    sample appears twice with contradictory labels."""
    features, labels = make_classification(random_state=1)
    # Stack the dataset on top of itself; the second copy carries the
    # flipped label (1 - y), so each sample conflicts with its twin.
    X = np.concatenate([features, features])
    y = np.concatenate([labels, 1 - labels])
    model = EigenProClassifier(kernel="linear", n_epoch=5, random_state=1)
    # No assertion on the output: the test passes if nothing raises.
    model.fit(X, y).predict(X)
def test_eigenpro_classification_duplicate_data():
    """
    Check that the classifier reproduces the training labels when the
    generated dataset contains repeated (duplicated) features.
    """
    X, y = make_classification(n_features=200, n_repeated=50, random_state=1)
    classifier = EigenProClassifier(
        kernel="rbf", n_epoch=60, gamma=0.002, random_state=1
    )
    fitted = classifier.fit(X, y)
    prediction = fitted.predict(X)
    # Predictions on the training set must match the labels it was fit on.
    assert_allclose(prediction, y, rtol=5e-3)
# Benchmark excerpt: compare EigenProClassifier against SVC on growing
# prefixes of the training set.  `x_train`, `y_train`, `x_test`, `y_test`
# and `rng` are defined outside this excerpt.
# Result accumulators -- presumably filled by code past the end of this
# excerpt; nothing below appends to them.
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []

train_sizes = [2000, 5000, 10000, 20000, 50000]

# The same gamma is handed to both estimators.
gamma = 0.005
for train_size in train_sizes:
    # Fresh estimators are built for every training size.
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=3,
                gamma=gamma,
                n_components=30,
                subsample_size=1000,
                random_state=rng,
            ),
        ),
        ("SupportVector", SVC(C=5, gamma=gamma)),
    ]:
        # Wall-clock time to fit on the first `train_size` samples.
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        # Wall-clock time to predict the whole test set.
        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        # Misclassification rate on the test set, as a percentage.
        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
# Tests for FastKernelClassification


@pytest.mark.parametrize(
    "data, estimator",
    [
        # Test rbf kernel
        (
            gen_classification({
                "n_samples": 10,
                "hypercube": False
            }),
            EigenProClassifier(
                batch_size=9,
                kernel="rbf",
                gamma=0.08,
                n_epoch=100,
                random_state=1,
            ),
        ),
        # Test laplacian kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="laplace", n_epoch=100, gamma=0.003, random_state=1),
        ),
        # Test cauchy kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="cauchy", n_epoch=100, gamma=0.005, random_state=1),
# Example no. 5
# 0
# Benchmark excerpt: compare EigenProClassifier against SVC as the number
# of features grows.  `feature_counts`, `train_size`, `test_size`, `gamma`,
# `rng` and `eig_fit_times` are defined outside this excerpt.
for n_features in feature_counts:
    # Generate one synthetic dataset per feature count, large enough to be
    # split into a training part and a test part.
    x, y = make_classification(
        n_samples=train_size + test_size,
        n_features=n_features,
        random_state=rng,
    )

    # First `train_size` rows are for training, the remainder for testing.
    x_train = x[:train_size]
    y_train = y[:train_size]
    x_test = x[train_size:]
    y_test = y[train_size:]
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(n_epoch=2,
                               gamma=gamma,
                               n_components=400,
                               random_state=rng),
        ),
        ("SupportVector", SVC(gamma=gamma, random_state=rng)),
    ]:
        # Wall-clock time to fit on the training split.
        stime = time()
        estimator.fit(x_train, y_train)
        fit_t = time() - stime

        # Wall-clock time to predict the test split.
        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        # Misclassification rate on the test split, as a percentage.
        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
        if name == "EigenPro":
            eig_fit_times.append(fit_t)
# Example no. 6
# 0
# Benchmark excerpt: compare EigenProClassifier against SVC on growing
# prefixes of the training set, parameterised by a kernel bandwidth.
# `x_train`, `y_train`, `x_test` and `rng` are defined outside this excerpt.
# Result accumulators -- presumably filled by code not shown here.
svc_pred_times = []
svc_err = []

train_sizes = [500, 1000, 2000]

print("Train Sizes: " + str(train_sizes))

bandwidth = 5.0

# Fit models to data
for train_size in train_sizes:
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=2, bandwidth=bandwidth, random_state=rng
            ),
        ),
        (
            "SupportVector",
            # SVC takes gamma rather than a bandwidth; 1 / (2 * bandwidth^2)
            # is presumably meant to give it the same kernel width as
            # EigenPro -- confirm against the EigenPro kernel definition.
            SVC(
                C=5, gamma=1.0 / (2 * bandwidth * bandwidth), random_state=rng
            ),
        ),
    ]:
        # Wall-clock time to fit on the first `train_size` samples.
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        # Start of the prediction timing; the elapsed time is not computed
        # within this excerpt.
        stime = time()
        y_pred_test = estimator.predict(x_test)
    # NOTE(review): this indented code looks like the body of a separate
    # per-feature-count loop whose header is not visible here; it relies on
    # `n_features`, `test_size`, `bandwidth` and `rng` being defined
    # elsewhere -- confirm where it belongs.
    # Generate a synthetic dataset large enough for a train/test split.
    x, y = make_classification(
        n_samples=train_size + test_size,
        n_features=n_features,
        random_state=rng,
    )

    # First `train_size` rows are for training, the remainder for testing.
    x_train = x[:train_size]
    y_train = y[:train_size]
    x_test = x[train_size:]
    y_test = y[train_size:]
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=2,
                bandwidth=bandwidth,
                n_components=400,
                random_state=rng,
            ),
        ),
        (
            "SupportVector",
            # gamma derived from the bandwidth as 1 / (2 * bandwidth^2).
            SVC(gamma=1.0 / (2 * bandwidth * bandwidth), random_state=rng),
        ),
    ]:
        # Wall-clock time to fit on the training split.
        stime = time()
        estimator.fit(x_train, y_train)
        fit_t = time() - stime

        # Wall-clock time to predict the test split.
        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime
# Example no. 8
# 0
# Benchmark excerpt: compare EigenProClassifier against SVC on growing
# prefixes of the training set.  `x_train`, `y_train`, `x_test`, `y_test`
# and `rng` are defined outside this excerpt.
# Per-training-size results: fit time, prediction time and test error.
# Only the EigenPro appends are visible in this excerpt.
eig_fit_times = []
eig_pred_times = []
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []

train_sizes = [500, 1000, 2000, 5000, 10000, 20000, 40000, 60000]

# The same gamma is handed to both estimators.
gamma = 0.02
# Fit models to data
for train_size in train_sizes:
    # Fresh estimators are built for every training size.
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(n_epoch=2, gamma=gamma, random_state=rng),
        ),
        ("SupportVector", SVC(C=5, gamma=gamma, random_state=rng)),
    ]:
        # Wall-clock time to fit on the first `train_size` samples.
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        # Wall-clock time to predict the whole test set.
        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        # Misclassification rate on the test set, as a percentage.
        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
        if name == "EigenPro":
            eig_fit_times.append(fit_t)
            eig_pred_times.append(pred_t)
# Example no. 9
# 0
    ).fit(X, y).predict(X)


# Tests for FastKernelClassification


@pytest.mark.parametrize(
    "data, estimator",
    [
        # Test rbf kernel
        (
            gen_classification({"n_samples": 10, "hypercube": False}),
            EigenProClassifier(
                batch_size=9,
                kernel="rbf",
                bandwidth=2.5,
                n_epoch=100,
                random_state=1,
            ),
        ),
        # Test laplacian kernel
        (
            gen_classification({}),
            EigenProClassifier(
                kernel="laplace", n_epoch=100, bandwidth=13, random_state=1
            ),
        ),
        # Test cauchy kernel
        (
            gen_classification({}),
            EigenProClassifier(
# Benchmark excerpt: compare EigenProClassifier against SVC on growing
# prefixes of the training set, parameterised by a kernel bandwidth.
# `x_train`, `y_train`, `x_test`, `y_test` and `rng` are defined outside
# this excerpt.
# Result accumulators -- presumably filled by code past the end of this
# excerpt; nothing below appends to them.
eig_err = []
svc_fit_times = []
svc_pred_times = []
svc_err = []

train_sizes = [2000, 5000, 10000, 20000, 50000]

bandwidth = 10.0
for train_size in train_sizes:
    # Fresh estimators are built for every training size.
    for name, estimator in [
        (
            "EigenPro",
            EigenProClassifier(
                n_epoch=3,
                bandwidth=bandwidth,
                n_components=30,
                subsample_size=1000,
                random_state=rng,
            ),
        ),
        # SVC takes gamma rather than a bandwidth; it is derived here as
        # 1 / (2 * bandwidth^2).
        ("SupportVector", SVC(C=5, gamma=1.0 / (2 * bandwidth * bandwidth))),
    ]:
        # Wall-clock time to fit on the first `train_size` samples.
        stime = time()
        estimator.fit(x_train[:train_size], y_train[:train_size])
        fit_t = time() - stime

        # Wall-clock time to predict the whole test set.
        stime = time()
        y_pred_test = estimator.predict(x_test)
        pred_t = time() - stime

        # Misclassification rate on the test set, as a percentage.
        err = 100.0 * np.sum(y_pred_test != y_test) / len(y_test)
def _mlp_params(learning_rate):
    """Return the MLP keyword arguments shared by every task type.

    Only the learning-rate schedule varies: the plain ``'MLP'`` entry uses
    ``'constant'`` and the grid-searched entry uses ``'adaptive'``.
    """
    return dict(
        hidden_layer_sizes=(30, 30),
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate=learning_rate,
        learning_rate_init=0.001,
        power_t=0.5,
        max_iter=200,
        shuffle=True,
        random_state=None,
        tol=0.0001,
        verbose=False,
        warm_start=False,
        momentum=0.9,
        nesterovs_momentum=True,
        early_stopping=False,
        validation_fraction=0.1,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-08,
        n_iter_no_change=10,
    )


def _eigenpro_params(kernel, n_components, **extra):
    """Return the EigenPro keyword arguments shared by every task type.

    ``extra`` carries the options that only some entries set (currently
    ``bandwidth`` for the polynomial variant).
    """
    params = dict(
        batch_size="auto",
        n_epoch=10,
        n_components=n_components,
        subsample_size="auto",
        kernel=kernel,
        gamma=None,
        degree=2,
        coef0=1,
        kernel_params=None,
        random_state=None,
    )
    params.update(extra)
    return params


def instanciate_estimators(clf_type, classifiers, clf_seed, y=None, **kw):
    """Build the requested estimators for a given learning task.

    Parameters
    ----------
    clf_type : {'binary', 'multiclass', 'regression'}
        Kind of task; selects classifier or regressor variants and the
        scoring metric returned by ``get_score_metric``.
    classifiers : iterable of str
        Names of the estimators to return.  Valid names are
        'L2RegularizedLinearModel', 'GradientBoosting', 'LGBM', 'XGB',
        'MLP', 'MLPGridSearchCV', 'EigenProPolynomial',
        'EigenProGaussian160', 'EigenProGaussian1000' and 'NystroemRidgeCV'.
    clf_seed : int or None
        Value written to ``random_state`` of every returned estimator that
        exposes that parameter (for grid searches, of the wrapped
        estimator), and used directly for the Nystroem step.
    y : array-like, optional
        Target values.  Used to print class-balance information for
        classification tasks and to set XGBoost's ``num_class`` for
        'multiclass', where it is therefore required.
    **kw
        Accepted for call-site compatibility and ignored.

    Returns
    -------
    list
        The estimators named in ``classifiers``, in the same order.

    Raises
    ------
    ValueError
        If ``clf_type`` is not recognized, or if ``clf_type`` is
        'multiclass' and ``y`` is None.
    KeyError
        If ``classifiers`` contains an unknown estimator name.
    """
    score_metric, _ = get_score_metric(clf_type)

    # Hyper-parameter grids for the grid-searched entries.
    param_grid_LGBM = {
        'learning_rate': [0.1, .05, .5],
        'num_leaves': [7, 15, 31]
    }
    param_grid_XGB = {'learning_rate': [0.1, .05, .3], 'max_depth': [3, 6, 9]}
    param_grid_MLP = {
        'learning_rate_init': [.001, .0005, .005],
        'hidden_layer_sizes': [(30, ), (50, ), (100, ), (30, 30), (50, 50),
                               (100, 100)]
    }
    param_grid_EigenProGaussian = {'bandwidth': [1, 5, 25]}
    n_components_eigenpro = 160
    param_grid_nystroem_ridgecv = {
        'kernel_approx__n_components': [1000, 3000],
        'kernel_approx__degree': [2, 3],
    }

    # Settings shared by both logistic-regression based linear models.
    logistic_kw = dict(max_iter=100,
                       solver='sag',
                       penalty='l2',
                       n_jobs=1,
                       cv=3,
                       multi_class='multinomial')

    # Per-task settings: which estimator classes to use and the few keyword
    # arguments that differ between tasks.
    if clf_type == 'binary':
        # y is only needed for this informational message, so the declared
        # default y=None is tolerated here.
        if y is not None:
            print(('Fraction by class: True: %0.2f; False: %0.2f' %
                   (list(y).count(True) / len(y),
                    list(y).count(False) / len(y))))
        linear_cls = linear_model.LogisticRegressionCV
        linear_kw = dict(logistic_kw, class_weight='balanced')
        boosting_cls = ensemble.GradientBoostingClassifier
        lgbm_cls, lgbm_kw = LGBMClassifier, {'is_unbalance': True}
        xgb_cls, xgb_kw = XGBClassifier, {}
        mlp_cls = MLPClassifier
        eigenpro_cls, eigenpro_poly_kw = EigenProClassifier, {'bandwidth': 5}
    elif clf_type == 'multiclass':
        if y is None:
            # XGBoost's num_class is derived from y, so it cannot be omitted.
            raise ValueError(
                "y is required when clf_type is 'multiclass'")
        labels = list(y)
        print('fraction of the most frequent class:',
              max([labels.count(x) for x in set(labels)]) / len(labels))
        linear_cls = linear_model.LogisticRegressionCV
        linear_kw = logistic_kw
        boosting_cls = ensemble.GradientBoostingClassifier
        lgbm_cls, lgbm_kw = LGBMClassifier, {}
        xgb_cls = XGBClassifier
        xgb_kw = {
            'objective': 'multi:softmax',
            'num_class': len(np.unique(y)),
        }
        mlp_cls = MLPClassifier
        # NOTE(review): unlike the binary and regression variants, the
        # multiclass polynomial EigenPro model sets no bandwidth; this is
        # kept as-is -- confirm whether it is intentional.
        eigenpro_cls, eigenpro_poly_kw = EigenProClassifier, {}
    elif clf_type == 'regression':
        linear_cls = linear_model.RidgeCV
        linear_kw = {'cv': 3}
        boosting_cls = ensemble.GradientBoostingRegressor
        lgbm_cls, lgbm_kw = LGBMRegressor, {}
        xgb_cls, xgb_kw = XGBRegressor, {}
        mlp_cls = MLPRegressor
        eigenpro_cls, eigenpro_poly_kw = EigenProRegressor, {'bandwidth': 5}
    else:
        raise ValueError("{} not recognized".format(clf_type))

    def grid_search(estimator, param_grid):
        # Every grid search uses 3-fold CV and the task's score metric.
        return GridSearchCV(estimator=estimator,
                            param_grid=param_grid,
                            cv=3,
                            scoring=metrics.make_scorer(score_metric))

    clfs = {
        'L2RegularizedLinearModel':
        linear_cls(**linear_kw),
        'GradientBoosting':
        boosting_cls(n_estimators=100),
        'LGBM':
        grid_search(lgbm_cls(n_estimators=100, n_jobs=1, **lgbm_kw),
                    param_grid_LGBM),
        'XGB':
        grid_search(xgb_cls(n_estimators=100, n_jobs=1, **xgb_kw),
                    param_grid_XGB),
        'MLP':
        mlp_cls(**_mlp_params('constant')),
        'MLPGridSearchCV':
        grid_search(mlp_cls(**_mlp_params('adaptive')), param_grid_MLP),
        'EigenProPolynomial':
        eigenpro_cls(**_eigenpro_params("polynomial",
                                        n_components_eigenpro,
                                        **eigenpro_poly_kw)),
        'EigenProGaussian160':
        grid_search(
            eigenpro_cls(**_eigenpro_params("gaussian",
                                            n_components_eigenpro)),
            param_grid_EigenProGaussian),
        'EigenProGaussian1000':
        grid_search(eigenpro_cls(**_eigenpro_params("gaussian", 1000)),
                    param_grid_EigenProGaussian),
        # Nystroem kernel approximation followed by the task's linear model;
        # n_components is supplied by the grid search.
        'NystroemRidgeCV':
        grid_search(
            Pipeline([
                ('kernel_approx',
                 Nystroem(kernel="polynomial",
                          n_components=None,
                          random_state=clf_seed,
                          degree=2)),
                ('classifier', linear_cls(**linear_kw)),
            ]), param_grid_nystroem_ridgecv),
    }

    selected = [clfs[clf] for clf in classifiers]
    for clf in selected:
        # Search wrappers expose the wrapped model as `.estimator`; seed that
        # one when present, otherwise seed the estimator itself.  An explicit
        # check is used instead of catching AttributeError so that errors
        # raised inside get_params/set_params are not silently masked.
        target = getattr(clf, 'estimator', None)
        if target is None or not hasattr(target, 'get_params'):
            target = clf
        if 'random_state' in target.get_params():
            target.set_params(random_state=clf_seed)
    return selected