def test_eigenpro_regression_conflict_data():
    """Make sure the regressor doesn't crash when conflicting
    data is given"""
    X, y = make_regression(random_state=1)
    y = np.reshape(y, (-1, 1))
    y = np.hstack([y, y + 2])
    # Make sure we don't throw an error when fitting or predicting
    EigenProRegressor(kernel="linear", n_epoch=5, gamma=0.5,
                      random_state=1).fit(X, y).predict(X)


def test_eigenpro_regression_duplicate_data():
    """Test the performance when some data is repeated"""
    X, y = make_regression(random_state=1)
    X, y = np.concatenate([X, X]), np.concatenate([y, y])
    prediction = (EigenProRegressor(kernel="rbf",
                                    n_epoch=100,
                                    gamma=0.02,
                                    random_state=1).fit(X, y).predict(X))
    assert_allclose(prediction, y, rtol=5e-3)


@pytest.mark.parametrize(
    "estimator, data, params, err_msg",
    [
        (
            EigenProRegressor,
            gen_regression({}),
            {"gamma": -1},
            "gamma should be positive, was -1",
        ),
    ],
)
def test_parameter_validation(estimator, data, params, err_msg):
    X, y = data
    with pytest.raises(ValueError, match=err_msg):
        estimator(**params).fit(X, y)


@pytest.mark.parametrize(
    "data, estimator",
    [
        # Test rbf kernel
        (
            gen_regression({}),
            EigenProRegressor(kernel="rbf", n_epoch=100, random_state=1),
        ),
        # Test laplacian kernel
        (
            gen_regression({}),
            EigenProRegressor(
                kernel="laplace", n_epoch=100, gamma=0.008, random_state=1),
        ),
        # Test cauchy kernel
        (
            gen_regression({}),
            EigenProRegressor(
                kernel="cauchy",
                n_epoch=100,
                gamma=0.005,
                subsample_size=1000,
                random_state=1,
            ),
        ),
    ],
)
def test_eigenpro_regression_accuracy(data, estimator):
    """Check that each kernel configuration fits the toy data closely."""
    X, y = data
    prediction = estimator.fit(X, y).predict(X)
    assert_allclose(prediction, y, rtol=5e-3)


def instanciate_estimators(clf_type, classifiers, clf_seed, y=None, **kw):
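    """Build and return the estimators named in `classifiers`.

    `clf_type` must be 'binary', 'multiclass' or 'regression'; every selected
    estimator exposing a `random_state` parameter is seeded with `clf_seed`.
    """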

    score_metric, _ = get_score_metric(clf_type)
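    # Hyper-parameter grids explored by the GridSearchCV-wrapped models below.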
    param_grid_LGBM = {
        'learning_rate': [0.1, .05, .5],
        'num_leaves': [7, 15, 31]
    }
    param_grid_XGB = {'learning_rate': [0.1, .05, .3], 'max_depth': [3, 6, 9]}
    param_grid_MLP = {
        'learning_rate_init': [.001, .0005, .005],
        'hidden_layer_sizes': [(30, ), (50, ), (100, ), (30, 30), (50, 50),
                               (100, 100)]
    }
    param_grid_EigenProGaussian = {'bandwidth': [1, 5, 25]}
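    # Shared by the EigenProPolynomial and EigenProGaussian160 models; the
    # Gaussian1000 variants hard-code n_components=1000 below.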
    n_components_eigenpro = 160
    param_grid_nystroem_ridgecv = {
        'kernel_approx__n_components': [1000, 3000],
        'kernel_approx__degree': [2, 3],
    }
    if clf_type == 'binary':
        print('Fraction by class: True: %0.2f; False: %0.2f' %
              (list(y).count(True) / len(y), list(y).count(False) / len(y)))
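        # 'balanced' reweights classes inversely to their frequencies.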
        cw = 'balanced'
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.LogisticRegressionCV(class_weight=cw,
                                              max_iter=100,
                                              solver='sag',
                                              penalty='l2',
                                              n_jobs=1,
                                              cv=3,
                                              multi_class='multinomial'),
            'GradientBoosting':
            ensemble.GradientBoostingClassifier(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMClassifier(n_estimators=100,
                                                  n_jobs=1,
                                                  is_unbalance=True),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBClassifier(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPClassifier(hidden_layer_sizes=(30, 30),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          power_t=0.5,
                          max_iter=200,
                          shuffle=True,
                          random_state=None,
                          tol=0.0001,
                          verbose=False,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPClassifier(hidden_layer_sizes=(30, 30),
                                                 activation='relu',
                                                 solver='adam',
                                                 alpha=0.0001,
                                                 batch_size='auto',
                                                 learning_rate='adaptive',
                                                 learning_rate_init=0.001,
                                                 power_t=0.5,
                                                 max_iter=200,
                                                 shuffle=True,
                                                 random_state=None,
                                                 tol=0.0001,
                                                 verbose=False,
                                                 warm_start=False,
                                                 momentum=0.9,
                                                 nesterovs_momentum=True,
                                                 early_stopping=False,
                                                 validation_fraction=0.1,
                                                 beta_1=0.9,
                                                 beta_2=0.999,
                                                 epsilon=1e-08,
                                                 n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            EigenProClassifier(batch_size="auto",
                               n_epoch=10,
                               n_components=n_components_eigenpro,
                               subsample_size="auto",
                               kernel="polynomial",
                               bandwidth=5,
                               gamma=None,
                               degree=2,
                               coef0=1,
                               kernel_params=None,
                               random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=EigenProClassifier(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=EigenProClassifier(batch_size="auto",
                                                      n_epoch=10,
                                                      n_components=1000,
                                                      subsample_size="auto",
                                                      kernel="gaussian",
                                                      gamma=None,
                                                      degree=2,
                                                      coef0=1,
                                                      kernel_params=None,
                                                      random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([
                ('kernel_approx',
                 Nystroem(kernel="polynomial",
                          n_components=None,
                          random_state=clf_seed,
                          degree=2)),
                ('classifier',
                 linear_model.LogisticRegressionCV(class_weight=cw,
                                                   max_iter=100,
                                                   solver='sag',
                                                   penalty='l2',
                                                   n_jobs=1,
                                                   cv=3,
                                                   multi_class='multinomial'))
            ]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }

    elif clf_type == 'multiclass':
        print('fraction of the most frequent class:',
              max([list(y).count(x) for x in set(list(y))]) / len(list(y)))
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.LogisticRegressionCV(penalty='l2',
                                              n_jobs=1,
                                              cv=3,
                                              multi_class='multinomial',
                                              solver='sag',
                                              max_iter=100),
            'GradientBoosting':
            ensemble.GradientBoostingClassifier(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMClassifier(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBClassifier(n_estimators=100,
                                                 n_jobs=1,
                                                 objective='multi:softmax',
                                                 num_class=len(np.unique(y))),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPClassifier(hidden_layer_sizes=(30, 30),
                          activation='relu',
                          solver='adam',
                          alpha=0.0001,
                          batch_size='auto',
                          learning_rate='constant',
                          learning_rate_init=0.001,
                          power_t=0.5,
                          max_iter=200,
                          shuffle=True,
                          random_state=None,
                          tol=0.0001,
                          verbose=False,
                          warm_start=False,
                          momentum=0.9,
                          nesterovs_momentum=True,
                          early_stopping=False,
                          validation_fraction=0.1,
                          beta_1=0.9,
                          beta_2=0.999,
                          epsilon=1e-08,
                          n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPClassifier(hidden_layer_sizes=(30, 30),
                                                 activation='relu',
                                                 solver='adam',
                                                 alpha=0.0001,
                                                 batch_size='auto',
                                                 learning_rate='adaptive',
                                                 learning_rate_init=0.001,
                                                 power_t=0.5,
                                                 max_iter=200,
                                                 shuffle=True,
                                                 random_state=None,
                                                 tol=0.0001,
                                                 verbose=False,
                                                 warm_start=False,
                                                 momentum=0.9,
                                                 nesterovs_momentum=True,
                                                 early_stopping=False,
                                                 validation_fraction=0.1,
                                                 beta_1=0.9,
                                                 beta_2=0.999,
                                                 epsilon=1e-08,
                                                 n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            EigenProClassifier(batch_size="auto",
                               n_epoch=10,
                               n_components=n_components_eigenpro,
                               subsample_size="auto",
                               kernel="polynomial",
                               gamma=None,
                               degree=2,
                               coef0=1,
                               kernel_params=None,
                               random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=EigenProClassifier(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=EigenProClassifier(batch_size="auto",
                                                      n_epoch=10,
                                                      n_components=1000,
                                                      subsample_size="auto",
                                                      kernel="gaussian",
                                                      gamma=None,
                                                      degree=2,
                                                      coef0=1,
                                                      kernel_params=None,
                                                      random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([
                ('kernel_approx',
                 Nystroem(kernel="polynomial",
                          n_components=None,
                          random_state=clf_seed,
                          degree=2)),
                ('classifier',
                 linear_model.LogisticRegressionCV(penalty='l2',
                                                   n_jobs=1,
                                                   cv=3,
                                                   multi_class='multinomial',
                                                   solver='sag',
                                                   max_iter=100))
            ]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }
    elif clf_type == 'regression':
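        # Same model keys as the classification branches, mapped to their
        # regressor counterparts.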
        clfs = {
            'L2RegularizedLinearModel':
            linear_model.RidgeCV(cv=3),
            'GradientBoosting':
            ensemble.GradientBoostingRegressor(n_estimators=100),
            'LGBM':
            GridSearchCV(estimator=LGBMRegressor(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_LGBM,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'XGB':
            GridSearchCV(estimator=XGBRegressor(n_estimators=100, n_jobs=1),
                         param_grid=param_grid_XGB,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'MLP':
            MLPRegressor(hidden_layer_sizes=(30, 30),
                         activation='relu',
                         solver='adam',
                         alpha=0.0001,
                         batch_size='auto',
                         learning_rate='constant',
                         learning_rate_init=0.001,
                         power_t=0.5,
                         max_iter=200,
                         shuffle=True,
                         random_state=None,
                         tol=0.0001,
                         verbose=False,
                         warm_start=False,
                         momentum=0.9,
                         nesterovs_momentum=True,
                         early_stopping=False,
                         validation_fraction=0.1,
                         beta_1=0.9,
                         beta_2=0.999,
                         epsilon=1e-08,
                         n_iter_no_change=10),
            'MLPGridSearchCV':
            GridSearchCV(estimator=MLPRegressor(hidden_layer_sizes=(30, 30),
                                                activation='relu',
                                                solver='adam',
                                                alpha=0.0001,
                                                batch_size='auto',
                                                learning_rate='adaptive',
                                                learning_rate_init=0.001,
                                                power_t=0.5,
                                                max_iter=200,
                                                shuffle=True,
                                                random_state=None,
                                                tol=0.0001,
                                                verbose=False,
                                                warm_start=False,
                                                momentum=0.9,
                                                nesterovs_momentum=True,
                                                early_stopping=False,
                                                validation_fraction=0.1,
                                                beta_1=0.9,
                                                beta_2=0.999,
                                                epsilon=1e-08,
                                                n_iter_no_change=10),
                         param_grid=param_grid_MLP,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProPolynomial':
            EigenProRegressor(batch_size="auto",
                              n_epoch=10,
                              n_components=n_components_eigenpro,
                              subsample_size="auto",
                              kernel="polynomial",
                              bandwidth=5,
                              gamma=None,
                              degree=2,
                              coef0=1,
                              kernel_params=None,
                              random_state=None),
            'EigenProGaussian160':
            GridSearchCV(estimator=EigenProRegressor(
                batch_size="auto",
                n_epoch=10,
                n_components=n_components_eigenpro,
                subsample_size="auto",
                kernel="gaussian",
                gamma=None,
                degree=2,
                coef0=1,
                kernel_params=None,
                random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'EigenProGaussian1000':
            GridSearchCV(estimator=EigenProRegressor(batch_size="auto",
                                                     n_epoch=10,
                                                     n_components=1000,
                                                     subsample_size="auto",
                                                     kernel="gaussian",
                                                     gamma=None,
                                                     degree=2,
                                                     coef0=1,
                                                     kernel_params=None,
                                                     random_state=None),
                         param_grid=param_grid_EigenProGaussian,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
            'NystroemRidgeCV':
            GridSearchCV(estimator=Pipeline([('kernel_approx',
                                              Nystroem(kernel="polynomial",
                                                       n_components=None,
                                                       random_state=clf_seed,
                                                       degree=2)),
                                             ('classifier',
                                              linear_model.RidgeCV(cv=3))]),
                         param_grid=param_grid_nystroem_ridgecv,
                         cv=3,
                         scoring=metrics.make_scorer(score_metric)),
        }
    else:
        raise ValueError("{} not recognized".format(clf_type))

    clfs = [clfs[clf] for clf in classifiers]
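    # Seed every selected model that supports it: GridSearchCV wrappers expose
    # the wrapped model as `.estimator`; plain estimators are seeded directly.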
    for clf in clfs:
        try:
            if 'random_state' in clf.estimator.get_params():
                clf.estimator.set_params(random_state=clf_seed)
        except AttributeError:
            if 'random_state' in clf.get_params():
                clf.set_params(random_state=clf_seed)
    return clfs