def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
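A minimal, self-contained sketch of the same pickle round-trip idea, using StandardScaler as a stand-in transformer (any fitted transformer would do):

import pickle

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=30, random_state=0)
scaler = StandardScaler().fit(X)
X_before = scaler.transform(X)

# serialize, then deserialize, and check the output is unchanged
restored = pickle.loads(pickle.dumps(scaler))
np.testing.assert_array_almost_equal(X_before, restored.transform(X))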
Example #2
def test_set_random_state():
    lda = LDA()
    tree = DecisionTreeClassifier()
    # LDA doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert_equal(tree.random_state, 3)
def test_various_scoring_on_tuples_learners(estimator, build_dataset,
                                            with_preprocessor):
  """Tests that scikit-learn's scoring returns something finite,
  for other scoring than default scoring. (List of scikit-learn's scores can be
  found in sklearn.metrics.scorer). For each type of output (predict,
  predict_proba, decision_function), we test a bunch of scores.
  We only test on pairs learners because quadruplets don't have a y argument.
  """
  input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
  estimator = clone(estimator)
  estimator.set_params(preprocessor=preprocessor)
  set_random_state(estimator)

  # scores that need a predict function: every tuples learner should have a
  # predict function (whether the pair is of positive samples or negative
  # samples)
  for scoring in ['accuracy', 'f1']:
    check_score_is_finite(scoring, estimator, input_data, labels)
  # scores that need a predict_proba:
  if hasattr(estimator, "predict_proba"):
    for scoring in ['neg_log_loss', 'brier_score']:
      check_score_is_finite(scoring, estimator, input_data, labels)
  # scores that need a decision_function: every tuples learner should have a
  # decision function (the metric between points)
  for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']:
    check_score_is_finite(scoring, estimator, input_data, labels)
def test_get_metric_works_does_not_raise(estimator, build_dataset):
  """Tests that the metric returned by get_metric does not raise errors (or
  warnings) similarly to the distance functions in scipy.spatial.distance"""
  input_data, labels, _, X = build_dataset()
  model = clone(estimator)
  set_random_state(model)
  model.fit(input_data, labels)
  metric = model.get_metric()

  list_test_get_metric_doesnt_raise = [(X[0], X[1]),
                                       (X[0].tolist(), X[1].tolist()),
                                       (X[0][None], X[1][None])]

  for u, v in list_test_get_metric_doesnt_raise:
    with pytest.warns(None) as record:
      metric(u, v)
    assert len(record) == 0

  # Test that the scalar case works
  model.transformer_ = np.array([3.1])
  metric = model.get_metric()
  for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
    with pytest.warns(None) as record:
      metric(u, v)
    assert len(record) == 0
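Note that pytest.warns(None), used above, was removed in pytest 7; a sketch of an equivalent "no warnings were emitted" assertion using only the standard library:

import warnings

def assert_no_warnings(func, *args, **kwargs):
    # record every warning, then assert the list stayed empty
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        result = func(*args, **kwargs)
    assert len(record) == 0, [str(w.message) for w in record]
    return result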
Example #5
def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert_equal(tree.random_state, 3)
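set_random_state is used in nearly every check above; a simplified sketch of what such a helper typically does (the scikit-learn helper also walks nested parameters, which this top-level-only sketch omits):

def set_random_state_sketch(estimator, random_state=0):
    # if the estimator exposes a random_state parameter, pin it to a seed;
    # otherwise silently do nothing (hence the LDA "smoke test" above)
    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)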
Example #6
def check_regressors_int(name, Regressor, X, y):
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Regressor()
        regressor_2 = Regressor()
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in ('_PLS', 'PLSCanonical', 'PLSRegression'):
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y_.astype(np.float64))  # np.float was removed in NumPy 1.24
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)
def check_classifiers_input_shapes(name, Classifier):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=1)
    X = StandardScaler().fit_transform(X)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_fast_parameters(classifier)
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        classifier.fit(X, y[:, np.newaxis])
    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join([str(w_x) for w_x in w]))
    assert_equal(len(w), 1, msg)
    assert_array_equal(y_pred, classifier.predict(X))
Example #8
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter='regressor')
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = StandardScaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in (CCA,):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        set_random_state(reg1)
        set_random_state(reg2)

        if Reg in (_PLS, PLSCanonical, PLSRegression):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        reg1.fit(X, y_)
        pred1 = reg1.predict(X)
        reg2.fit(X, y_.astype(np.float64))  # np.float was removed in NumPy 1.24
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)
Example #9
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter="classifier")

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
Example #10
def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter="regressor")
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    for name, Regressor in regressors:
        if name in dont_test or name in ("CCA"):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            regressor_1 = Regressor()
            regressor_2 = Regressor()
        set_random_state(regressor_1)
        set_random_state(regressor_2)

        if name in ("_PLS", "PLSCanonical", "PLSRegression"):
            y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        regressor_1.fit(X, y_)
        pred1 = regressor_1.predict(X)
        regressor_2.fit(X, y_.astype(np.float64))  # np.float was removed in NumPy 1.24
        pred2 = regressor_2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)
Example #11
def check_regressors_train(name, Regressor, X, y):
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in ('PLSCanonical', 'PLSRegression', 'CCA'):
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        assert_greater(regressor.score(X, y_), 0.5)
def check_regressors_train(name, Regressor):
    X, y = _boston_subset()
    y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()  # X is already scaled; the scaler needs 2D input
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    set_fast_parameters(regressor)
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        print(regressor)
        assert_greater(regressor.score(X, y_), 0.5)
Example #13
def check_estimators_overwrite_params(name, Estimator, X, y):
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if hasattr(estimator, 'batch_size'):
        # FIXME
        # for MiniBatchDictLearning
        estimator.batch_size = 1

    if name in ['GaussianRandomProjection',
                'SparseRandomProjection']:
        # Due to the jl lemma and very few samples, the number
        # of components of the random matrix projection will be
        # greater than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        estimator = Estimator(n_components=1)
    elif name == "SelectKBest":
        estimator = Estimator(k=1)

    set_random_state(estimator)

    params = estimator.get_params()
    estimator.fit(X, y)
    new_params = estimator.get_params()
    for k, v in params.items():
        assert_false(np.any(new_params[k] != v),
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, k, v, new_params[k]))
def check_class_weight_classifiers(name, Classifier):
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        assert_greater(np.mean(y_pred == 0), 0.89)
def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)
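The clustering check scores labels with adjusted_rand_score, which is invariant to label permutation; a quick self-contained demo:

from sklearn.metrics import adjusted_rand_score

# identical partitions score 1.0 even though the label names are swapped
assert adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]) == 1.0
# a partition with no agreement beyond chance scores <= 0
assert adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]) <= 0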
def check_class_weight_auto_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    if hasattr(classifier, "n_iter"):
        # This is a very small dataset; the default n_iter is likely to
        # prevent convergence
        classifier.set_params(n_iter=1000)
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='auto')
    coef_auto = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    mean_weight = (1. / 3 + 1. / 2) / 2
    class_weight = {
        1: 1. / 3 / mean_weight,
        -1: 1. / 2 / mean_weight,
    }
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_array_almost_equal(coef_auto, coef_manual)
def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    X_pd = pd.DataFrame(X)
    sampler = Sampler()
    if isinstance(Sampler(), SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]

    elif isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]

    else:
        samplers = [Sampler()]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
        assert_allclose(y_res_pd, y_res)
def check_pipeline_consistency(name, Estimator):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)
    funcs = ["score", "fit_transform"]
    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_array_almost_equal(result, result_pipe)
def check_class_weight_balanced_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = np.array([1, 1, 1, -1, -1])

    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    if hasattr(classifier, "n_iter"):
        # This is a very small dataset; the default n_iter is likely to
        # prevent convergence
        classifier.set_params(n_iter=1000)
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='balanced')
    coef_balanced = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    n_samples = len(y)
    n_classes = float(len(np.unique(y)))

    class_weight = {1: n_samples / (np.sum(y == 1) * n_classes),
                    -1: n_samples / (np.sum(y == -1) * n_classes)}
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_array_almost_equal(coef_balanced, coef_manual)
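The manual reweighting above is the 'balanced' heuristic n_samples / (n_classes * count(class)); scikit-learn also exposes it via compute_class_weight, as this small sketch shows:

import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([1, 1, 1, -1, -1])
classes = np.unique(y)  # array([-1, 1])
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
# n_samples=5, n_classes=2: weight(-1) = 5 / (2 * 2) = 1.25,
#                           weight(1)  = 5 / (2 * 3) ~ 0.833
print(dict(zip(classes, weights)))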
Example #20
def check_estimators_pickle(name, Estimator):
    """Test that we can pickle all estimators"""
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)

    # some estimators can't handle feature values below 0
    X -= X.min()

    # some estimators only take multioutputs
    y = multioutput_estimator_convert_y_2d(name, y)

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()

    set_random_state(estimator)
    set_fast_parameters(estimator)
    estimator.fit(X, y)

    result = dict()
    for method in check_methods:
        if hasattr(estimator, method):
            result[method] = getattr(estimator, method)(X)

    # pickle and unpickle!
    pickled_estimator = pickle.dumps(estimator)
    unpickled_estimator = pickle.loads(pickled_estimator)

    for method in result:
        unpickled_result = getattr(unpickled_estimator, method)(X)
        assert_array_almost_equal(result[method], unpickled_result)
def check_target_type(name, Estimator):
    X = np.random.random((20, 2))
    y = np.linspace(0, 1, 20)
    estimator = Estimator()
    set_random_state(estimator)
    with warns(UserWarning, match='should be of types'):
        estimator.fit(X, y)
Example #22
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is a possible RandomState instance, but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(joblib.hash(new_value), joblib.hash(original_value),
                     "Estimator %s should not change or mutate "
                     "the parameter %s from %s to %s during fit."
                     % (name, param_name, original_value, new_value))
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non-negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        set_random_state(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
Example #24
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA':
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)

    set_random_state(estimator)

    params = estimator.get_params()
    estimator.fit(X, y)
    new_params = estimator.get_params()
    for k, v in params.items():
        assert_false(np.any(new_params[k] != v),
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, k, v, new_params[k]))
def check_regressors_int(name, Regressor):
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Regressor()
        regressor_2 = Regressor()
    set_fast_parameters(regressor_1)
    set_fast_parameters(regressor_2)
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y_.astype(np.float64))  # np.float was removed in NumPy 1.24
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)
Example #26
def check_supervised_y_2d(name, Estimator):
    if "MultiTask" in name:
        # These only work on 2d, so this test makes no sense
        return
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    # fit
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    set_random_state(estimator)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        estimator.fit(X, y[:, np.newaxis])
    y_pred_2d = estimator.predict(X)
    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join([str(w_x) for w_x in w]))
    if name not in MULTI_OUTPUT:
        # check that we warned if we don't support multi-output
        assert_greater(len(w), 0, msg)
        assert_true("DataConversionWarning('A column-vector y"
                    " was passed when a 1d array was expected" in msg)
    assert_array_almost_equal(y_pred.ravel(), y_pred_2d.ravel())
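The DataConversionWarning asserted above originates in scikit-learn's validation helpers; a minimal sketch reproducing it in isolation with column_or_1d(y, warn=True):

import warnings

import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.utils.validation import column_or_1d

y = np.arange(10) % 3
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always", DataConversionWarning)
    y_1d = column_or_1d(y[:, np.newaxis], warn=True)  # column vector in, 1d out
assert y_1d.shape == (10,)
assert any(issubclass(w_x.category, DataConversionWarning) for w_x in w)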
Example #27
def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('PLSCanonical', 'PLSRegression', 'CCA',
                    'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception as exc:
            succeeded = False
            print ("Transformer %s doesn't predict the same value "
                   "after pickling" % name)
            raise exc

    assert_true(succeeded)
def _check_transformer(name, Transformer, X, y):
    n_samples, n_features = np.asarray(X).shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)

    if name == "KernelPCA":
        transformer.remove_zero_eig = False

    set_fast_parameters(transformer)

    # fit

    if name in CROSS_DECOMPOSITION:
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, 'transform'):
        if name in CROSS_DECOMPOSITION:
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(
                    x_pred, x_pred2, 2,
                    "fit_transform and transform outcomes not consistent in %s"
                    % Transformer)
                assert_array_almost_equal(
                    x_pred, x_pred3, 2,
                    "consecutive fit_transform outcomes not consistent in %s"
                    % Transformer)
        else:
            assert_array_almost_equal(
                X_pred, X_pred2, 2,
                "fit_transform and transform outcomes not consistent in %s"
                % Transformer)
            assert_array_almost_equal(
                X_pred, X_pred3, 2,
                "consecutive fit_transform outcomes not consistent in %s"
                % Transformer)

        # raises error on malformed input for transform
        if hasattr(X, 'T'):
            # If it's not an array, it does not have a 'T' property
            assert_raises(ValueError, transformer.transform, X.T)
def _check_transformer(name, Transformer, X, y):
    if name in ("CCA", "LocallyLinearEmbedding", "KernelPCA") and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only depending
        # on numpy & scipy and/or maybe generate a test dataset that does not
        # cause such unstable behaviors.
        msg = name + " is non deterministic on 32bit Python"
        raise SkipTest(msg)
    n_samples, n_features = np.asarray(X).shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit

    if name in CROSS_DECOMPOSITION:
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, "transform"):
        if name in CROSS_DECOMPOSITION:
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(
                    x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer
                )
                assert_array_almost_equal(
                    x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer
                )
        else:
            assert_array_almost_equal(
                X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer
            )
            assert_array_almost_equal(
                X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer
            )

        # raises error on malformed input for transform
        if hasattr(X, "T"):
            # If it's not an array, it does not have a 'T' property
            assert_raises(ValueError, transformer.transform, X.T)
Example #30
def check_transformer(name, Transformer, X, y):
    n_samples, n_features = X.shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)
    if hasattr(transformer, "compute_importances"):
        transformer.compute_importances = True

    if name == "SelectKBest":
        # SelectKBest has a default of k=10
        # which is more features than we have.
        transformer.k = 1
    elif name in ["GaussianRandomProjection", "SparseRandomProjection"]:
        # Due to the jl lemma and very few samples, the number
        # of components of the random matrix projection will be greater
        # than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        transformer.n_components = 1
    elif name == "MiniBatchDictionaryLearning":
        transformer.set_params(n_iter=5)  # default = 1000

    elif name == "KernelPCA":
        transformer.remove_zero_eig = False

    # fit

    if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, "transform"):
        if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(x_pred, x_pred2, 2, "fit_transform not correct in %s" % Transformer)
                assert_array_almost_equal(x_pred3, x_pred2, 2, "fit_transform not correct in %s" % Transformer)
        else:
            assert_array_almost_equal(X_pred, X_pred2, 2, "fit_transform not correct in %s" % Transformer)
            assert_array_almost_equal(X_pred3, X_pred2, 2, "fit_transform not correct in %s" % Transformer)

        # raises error on malformed input for transform
        assert_raises(ValueError, transformer.transform, X.T)
Example #31
    def _sample(self):
        random_state = check_random_state(self.random_state)
        if self.sampler is None:
            self.sampler_ = RandomUnderSampler(return_indices=True,
                                               random_state=random_state)
        else:
            if not hasattr(self.sampler, 'return_indices'):
                raise ValueError("'sampler' needs to return the indices of "
                                 "the samples selected. Provide a sampler "
                                 "which has an attribute 'return_indices'.")
            self.sampler_ = clone(self.sampler)
            self.sampler_.set_params(return_indices=True)
            set_random_state(self.sampler_, random_state)

        _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
        # shuffle the indices since the sampler packs them by class
        random_state.shuffle(self.indices_)
Example #32
def check_classifiers_input_shapes(name, Classifier, X, y):
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        classifier.fit(X, y[:, np.newaxis])
    assert_equal(len(w), 1)
    assert_array_equal(y_pred, classifier.predict(X))
Example #33
def check_non_transformer_estimators_n_iter(name,
                                            estimator,
                                            multi_output=False):
    # Check that all iterative solvers run for more than one iteration

    iris = load_iris()
    X, y_ = iris.data, iris.target

    if multi_output:
        y_ = y_[:, np.newaxis]

    set_random_state(estimator, 0)
    if name == 'AffinityPropagation':
        estimator.fit(X)
    else:
        estimator.fit(X, y_)
    assert_greater(estimator.n_iter_, 0)
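The n_iter_ convention being checked here, shown on a concrete iterative solver (a small usage sketch):

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=200, random_state=0).fit(X, y)
# for LogisticRegression, n_iter_ is an array (one entry per solver run)
# and every entry must be positive after fitting
assert (clf.n_iter_ > 0).all()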
Example #34
    def _sample(self):
        random_state = check_random_state(self.random_state)
        if self.sampler is None:
            self.sampler_ = RandomUnderSampler(random_state=random_state)
        else:
            self.sampler_ = clone(self.sampler)
            # FIXME: Remove in 0.6
            if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
                set_random_state(self.sampler_, random_state)

        self.sampler_.fit_resample(self.X, self.y)
        if not hasattr(self.sampler_, 'sample_indices_'):
            raise ValueError("'sampler' needs to have an attribute "
                             "'sample_indices_'.")
        self.indices_ = self.sampler_.sample_indices_
        # shuffle the indices since the sampler packs them by class
        random_state.shuffle(self.indices_)
Example #35
def test_get_metric_equivalent_to_explicit_mahalanobis(estimator,
                                                       build_dataset):
    """Tests that using the get_metric method of mahalanobis metric learners is
  equivalent to explicitly calling scipy's mahalanobis metric
  """
    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    metric = model.get_metric()
    n_features = X.shape[1]
    a, b = (rng.randn(n_features), rng.randn(n_features))
    expected_dist = mahalanobis(a[None],
                                b[None],
                                VI=model.get_mahalanobis_matrix())
    assert_allclose(metric(a, b), expected_dist, rtol=1e-15)
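For reference, scipy's mahalanobis computes sqrt((u - v)^T VI (u - v)); a tiny cross-check against a manual NumPy computation (with the identity matrix it reduces to the Euclidean distance):

import numpy as np
from scipy.spatial.distance import mahalanobis

rng = np.random.RandomState(42)
u, v = rng.randn(3), rng.randn(3)
VI = np.eye(3)  # "inverse covariance"; identity gives the Euclidean distance
d_scipy = mahalanobis(u, v, VI)
d_manual = np.sqrt((u - v).dot(VI).dot(u - v))
assert np.isclose(d_scipy, d_manual)
assert np.isclose(d_scipy, np.linalg.norm(u - v))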
Example #36
def test_score_pairs_dim(estimator, build_dataset):
    # scoring of 3D arrays should return 1D array (several tuples),
    # and scoring of 2D arrays (one tuple) should return an error (like
    # scikit-learn's error when scoring 1D arrays)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    tuples = np.array(list(product(X, X)))
    assert model.score_pairs(tuples).shape == (tuples.shape[0], )
    context = make_context(estimator)
    msg = ("3D array of formed tuples expected{}. Found 2D array "
           "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n"
           .format(context, tuples[1]))
    with pytest.raises(ValueError) as raised_error:
        model.score_pairs(tuples[1])
    assert str(raised_error.value) == msg
Example #37
def test_class_weight_auto_classifies():
    # test that class_weight="auto" improves f1-score
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue

            if name.startswith("RidgeClassifier"):
                # RidgeClassifier behaves unexpectedly
                # FIXME!
                continue

            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue

            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

            classifier.set_params(class_weight='auto')
            classifier.fit(X_train, y_train)
            y_pred_auto = classifier.predict(X_test)
            assert_greater(f1_score(y_test, y_pred_auto),
                           f1_score(y_test, y_pred))
Example #38
def test_check_input_pairs_learners_invalid_y(estimator, build_dataset,
                                              with_preprocessor):
  """checks that the only allowed labels for learning pairs are +1 and -1"""
  input_data, labels, _, X = build_dataset()
  wrong_labels_list = [labels + 0.5,
                       np.random.RandomState(42).randn(len(labels)),
                       np.random.RandomState(42).choice([0, 1],
                                                        size=len(labels))]
  model = clone(estimator)
  set_random_state(model)

  expected_msg = ("When training on pairs, the labels (y) should contain "
                  "only values in [-1, 1]. Found an incorrect value.")

  for wrong_labels in wrong_labels_list:
    with pytest.raises(ValueError) as raised_error:
      model.fit(input_data, wrong_labels)
    assert str(raised_error.value) == expected_msg
Example #39
def test_predict_monotonous(estimator, build_dataset, with_preprocessor):
    """Test that there is a threshold distance separating points labeled as
  similar and points labeled as dissimilar """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    pairs_train, pairs_test, y_train, y_test = train_test_split(
        input_data, labels)
    estimator.fit(pairs_train, y_train)
    distances = estimator.score_pairs(pairs_test)
    predictions = estimator.predict(pairs_test)
    min_dissimilar = np.min(distances[predictions == -1])
    max_similar = np.max(distances[predictions == 1])
    assert max_similar <= min_dissimilar
    separator = np.mean([min_dissimilar, max_similar])
    assert (predictions[distances > separator] == -1).all()
    assert (predictions[distances < separator] == 1).all()
Example #40
def check_estimators_dtypes(name, estimator_orig):
    rnd = np.random.RandomState(0)
    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
    X_train_64 = X_train_32.astype(np.float64)
    X_train_int_64 = X_train_32.astype(np.int64)
    X_train_int_32 = X_train_32.astype(np.int32)
    y = (X_train_int_64[:, 0] < 1).astype(int)

    methods = ["predict", "transform", "decision_function", "predict_proba"]

    for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:
        estimator = clone(estimator_orig)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in methods:
            if hasattr(estimator, method):
                getattr(estimator, method)(X_train)
Example #41
def check_estimators_fit_returns_self(name,
                                      estimator_orig,
                                      readonly_memmap=False):
    """Check if self is returned when calling fit"""
    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
    y = (y > 1).astype(int)
    # some want non-negative input
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig)

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if readonly_memmap:
        X, y = create_memmap_backed_data([X, y])

    set_random_state(estimator)
    assert estimator.fit(X, y) is estimator
Example #42
def check_class_weight_classifiers(name, Classifier, X_train, y_train, X_test,
                                   y_test):
    n_centers = len(np.unique(y_train))

    if n_centers == 2:
        class_weight = {0: 1000, 1: 0.0001}
    else:
        class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

    with warnings.catch_warnings(record=True):
        classifier = Classifier(class_weight=class_weight)
    if hasattr(classifier, "n_iter"):
        classifier.set_params(n_iter=100)

    set_random_state(classifier)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    assert_greater(np.mean(y_pred == 0), 0.9)
Example #43
def check_fit_score_takes_y(name, Estimator):
    # check that all estimators accept an optional y
    # in fit and score so they can be used in pipelines
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func(X, y)
            args = inspect.getfullargspec(func).args  # getargspec was removed in Python 3.11
            assert_true(args[2] in ["y", "Y"])
def test_cross_validation_manual_vs_scikit(estimator, build_dataset,
                                           with_preprocessor):
    """Tests that if we make a manual cross-validation, the result will be the
  same as scikit-learn's cross-validation (some code for generating the
  folds is taken from scikit-learn).
  """
    if any(hasattr(estimator, method) for method in ["predict", "score"]):
        input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        set_random_state(estimator)
        n_splits = 3
        kfold = KFold(shuffle=False, n_splits=n_splits)
        n_samples = input_data.shape[0]
        fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=int)  # np.int was removed in NumPy 1.24
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        scores, predictions = [], np.zeros(input_data.shape[0])
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            current = stop
            test_slice = slice(start, stop)
            train_mask = np.ones(input_data.shape[0], bool)
            train_mask[test_slice] = False
            y_train, y_test = labels[train_mask], labels[test_slice]
            estimator.fit(*remove_y_quadruplets(
                estimator, input_data[train_mask], y_train))
            if hasattr(estimator, "score"):
                scores.append(
                    estimator.score(*remove_y_quadruplets(
                        estimator, input_data[test_slice], y_test)))
            if hasattr(estimator, "predict"):
                predictions[test_slice] = estimator.predict(
                    input_data[test_slice])
        if hasattr(estimator, "score"):
            assert all(scores == cross_val_score(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
        if hasattr(estimator, "predict"):
            assert all(predictions == cross_val_predict(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
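The manual fold-size arithmetic above mirrors what KFold(shuffle=False) does: the first n_samples % n_splits folds get one extra sample. A quick self-contained cross-check:

import numpy as np
from sklearn.model_selection import KFold

n_samples, n_splits = 10, 3
fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=int)
fold_sizes[:n_samples % n_splits] += 1  # [4, 3, 3]: first fold gets the extra

current = 0
splits = KFold(n_splits=n_splits).split(np.zeros((n_samples, 1)))
for fold_size, (_, test_idx) in zip(fold_sizes, splits):
    # each KFold test block is exactly the contiguous slice we computed
    assert np.array_equal(test_idx, np.arange(current, current + fold_size))
    current += fold_size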
Example #45
def check_samplers_multiclass_ova(name, Sampler):
    # Check that a multiclass target leads to the same results as OVA encoding
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    y_ova = label_binarize(y, classes=np.unique(y))  # classes is keyword-only in recent scikit-learn
    sampler = Sampler()
    # FIXME: in 0.6 set the random_state for all
    if name not in DONT_HAVE_RANDOM_STATE:
        set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    if issubclass(Sampler, BaseEnsembleSampler):
        for batch_y, batch_y_ova in zip(y_res, y_res_ova):
            assert type_of_target(batch_y_ova) == type_of_target(y_ova)
            assert_allclose(batch_y, batch_y_ova.argmax(axis=1))
    else:
        assert type_of_target(y_res_ova) == type_of_target(y_ova)
        assert_allclose(y_res, y_res_ova.argmax(axis=1))
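The OVA comparison relies on a one-hot round trip: label_binarize followed by argmax over columns recovers the original class indices. A tiny demo:

import numpy as np
from sklearn.preprocessing import label_binarize

y = np.array([0, 2, 1, 2, 0])
y_ova = label_binarize(y, classes=np.unique(y))  # shape (5, 3), one-hot rows
assert np.array_equal(y_ova.argmax(axis=1), y)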
Example #46
def test_estimators_overwrite_params():
    # test whether any classifier overwrites its init parameters during fit
    for est_type in ["classifier", "regressor", "transformer"]:
        estimators = all_estimators(type_filter=est_type)
        X, y = make_blobs(random_state=0, n_samples=9)
        # some want non-negative input
        X -= X.min()
        for name, Estimator in estimators:
            if (name in dont_test
                    or name in ['CCA', '_CCA', 'PLSCanonical',
                                'PLSRegression',
                                'PLSSVD', 'GaussianProcess']):
                # FIXME!
                # in particular GaussianProcess!
                continue
            with warnings.catch_warnings(record=True):
                # catch deprecation warnings
                estimator = Estimator()

            if hasattr(estimator, 'batch_size'):
                # FIXME
                # for MiniBatchDictLearning
                estimator.batch_size = 1

            if name in ['GaussianRandomProjection',
                        'SparseRandomProjection']:
                # Due to the jl lemma and very few samples, the number
                # of components of the random matrix projection will be
                # greater than the number of features.
                # So we impose a smaller number (avoid "auto" mode)
                estimator = Estimator(n_components=1)

            set_random_state(estimator)

            params = estimator.get_params()
            estimator.fit(X, y)
            new_params = estimator.get_params()
            for k, v in params.items():
                assert_false(np.any(new_params[k] != v),
                             "Estimator %s changes its parameter %s"
                             " from %s to %s during fit."
                             % (name, k, v, new_params[k]))
Example #47
def check_estimators_dtypes(name, Estimator):
    rnd = np.random.RandomState(0)
    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
    X_train_64 = X_train_32.astype(np.float64)
    X_train_int_64 = X_train_32.astype(np.int64)
    X_train_int_32 = X_train_32.astype(np.int32)
    y = X_train_int_64[:, 0]
    y = multioutput_estimator_convert_y_2d(name, y)
    for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:
        with warnings.catch_warnings(record=True):
            estimator = Estimator()
        set_fast_parameters(estimator)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in ["predict", "transform", "decision_function",
                       "predict_proba"]:
            if hasattr(estimator, method):
                getattr(estimator, method)(X_train)
Example #48
def check_estimators_not_an_array(name, Estimator, X, y):
    if name in ('CCA', '_PLS', 'PLSCanonical', 'PLSRegression'):
        raise SkipTest
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Estimator()
        regressor_2 = Estimator()
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    y_ = NotAnArray(np.asarray(y))

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y)
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)
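NotAnArray is a test helper; a sketch of what such a wrapper typically looks like (an object that is not an ndarray but that np.asarray() can still convert via __array__):

import numpy as np

class NotAnArraySketch:
    """An object that is not an ndarray but that np.asarray() can convert."""

    def __init__(self, data):
        self.data = np.asarray(data)

    def __array__(self, dtype=None):
        return self.data if dtype is None else self.data.astype(dtype)

assert np.array_equal(np.asarray(NotAnArraySketch([0, 1, 2])), [0, 1, 2])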
Example #49
def test_get_metric_raises_error(estimator, build_dataset):
  """Tests that the metric returned by get_metric raises errors similar to
  the distance functions in scipy.spatial.distance"""
  input_data, labels, _, X = build_dataset()
  model = clone(estimator)
  set_random_state(model)
  model.fit(input_data, labels)
  metric = model.get_metric()

  # cases that should raise: vectors with different dimensions, 2D arrays, and
  # vectors of the same dimension but incompatible with what the metric
  # learner was trained on
  list_test_get_metric_raises = [(X[0].tolist() + [5.2], X[1]),
                                 (X[0:4], X[1:5]),
                                 (X[0].tolist() + [5.2],
                                  X[1].tolist() + [7.2])]

  for u, v in list_test_get_metric_raises:
    with pytest.raises(ValueError):
      metric(u, v)
Example #50
def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [
            c for c in classifiers
            if 'class_weight' in c[1]().get_params().keys()
        ]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=.5,
                                                            random_state=0)
        for name, Clf in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.startswith("RidgeClassifier"):
                # RidgeClassifier shows unexpected behavior
                # FIXME!
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                clf = Clf(class_weight=class_weight)
            if hasattr(clf, "n_iter"):
                clf.set_params(n_iter=100)

            set_random_state(clf)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)
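The semantics being checked: an extreme class_weight should pull virtually all predictions toward the heavily weighted class, even on very noisy data. A minimal sketch with LogisticRegression (any classifier exposing class_weight behaves the same way):

import numpy as np
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression

X, y = make_blobs(centers=2, random_state=0, cluster_std=20)
clf = LogisticRegression(class_weight={0: 1000, 1: 0.0001}).fit(X, y)
print(np.mean(clf.predict(X) == 0))  # close to 1.0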
Example #51
def test_accuracy_toy_example(estimator, build_dataset):
    """Test that the default scoring for triplets (accuracy) works on some
  toy example"""
    triplets, _, _, X = build_dataset(with_preprocessor=False)
    estimator = clone(estimator)
    set_random_state(estimator)
    estimator.fit(triplets)
    # We take the first two points and build 4 regularly spaced points on the
    # line they define, so that it is easy to build triplets of different
    # similarities.
    X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4

    triplets_test = np.array([[X_test[0], X_test[2], X_test[1]],
                              [X_test[1], X_test[3], X_test[0]],
                              [X_test[1], X_test[2], X_test[3]],
                              [X_test[3], X_test[0], X_test[2]]])
    # we force the transformation to be identity so that we control what it does
    estimator.components_ = np.eye(X.shape[1])
    assert estimator.score(triplets_test) == 0.25
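For reference, the triplets accuracy asserted above is the fraction of triplets (anchor, near, far) whose anchor really is closer to the "near" point than to the "far" one; under the identity transformation it reduces to plain Euclidean comparisons, as in this sketch:

import numpy as np

def triplet_accuracy(triplets):
    # triplets has shape (n_triplets, 3, n_features): (anchor, near, far)
    d_near = np.linalg.norm(triplets[:, 0] - triplets[:, 1], axis=1)
    d_far = np.linalg.norm(triplets[:, 0] - triplets[:, 2], axis=1)
    return np.mean(d_near < d_far)

On the four triplets built above, exactly one satisfies d_near < d_far, hence the expected score of 0.25.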
Example #52
def check_transformer_n_iter(name, estimator):
    if name in CROSS_DECOMPOSITION:
        # Check using default data
        X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]]
        y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]

    else:
        X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                           random_state=0, n_features=2, cluster_std=0.1)
        X -= X.min() - 0.1
    set_random_state(estimator, 0)
    estimator.fit(X, y_)

    # Cross-decomposition estimators return one n_iter_ per component.
    if name in CROSS_DECOMPOSITION:
        for iter_ in estimator.n_iter_:
            assert_greater(iter_, 1)
    else:
        assert_greater(estimator.n_iter_, 1)
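Iterative transformers expose how many iterations were actually run through the fitted n_iter_ attribute, which is what the assertions above inspect; for example (FastICA chosen purely as an illustration):

import numpy as np
from sklearn.decomposition import FastICA

rng = np.random.RandomState(0)
ica = FastICA(random_state=0).fit(rng.rand(30, 3))
print(ica.n_iter_)  # greater than 1 on non-trivial data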
Example #53
def check_samplers_sparse(name, Sampler):
    # check that sparse matrices can be passed through the sampler and lead to
    # the same results as dense ones
    X, y = make_classification(n_samples=1000,
                               n_classes=3,
                               n_informative=4,
                               weights=[0.2, 0.3, 0.5],
                               random_state=0)
    X_sparse = sparse.csr_matrix(X)
    if isinstance(Sampler(), SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]
    elif isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    elif isinstance(Sampler(), ClusterCentroids):
        # set the KMeans algorithm to 'full' since it supports both sparse and
        # dense inputs
        samplers = [
            Sampler(random_state=0,
                    voting='soft',
                    estimator=KMeans(random_state=1, algorithm='full'))
        ]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y)
        X_res, y_res = sampler.fit_resample(X, y)
        if not isinstance(sampler, BaseEnsembleSampler):
            assert sparse.issparse(X_res_sparse)
            assert_allclose(X_res_sparse.A, X_res)
            assert_allclose(y_res_sparse, y_res)
        else:
            for x_sp, x, y_sp, y in zip(X_res_sparse, X_res, y_res_sparse,
                                        y_res):
                assert sparse.issparse(x_sp)
                assert_allclose(x_sp.A, x)
                assert_allclose(y_sp, y)
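The dense/sparse equivalence asserted above can be reproduced directly with any imbalanced-learn sampler; a minimal sketch with RandomOverSampler:

import numpy as np
from scipy import sparse
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(weights=[0.8, 0.2], random_state=0)
sampler = RandomOverSampler(random_state=0)
X_res, y_res = sampler.fit_resample(X, y)
X_res_sp, y_res_sp = sampler.fit_resample(sparse.csr_matrix(X), y)
assert sparse.issparse(X_res_sp)
np.testing.assert_allclose(X_res_sp.toarray(), X_res)
np.testing.assert_allclose(y_res_sp, y_res)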
Example #54
def test_get_metric_is_independent_from_metric_learner(estimator,
                                                       build_dataset):
    """Tests that the get_metric method returns a function that is independent
  from the original metric learner"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)

    # we fit the metric learner on it and then we compute the metric on some
    # points
    model.fit(input_data, labels)
    metric = model.get_metric()
    score = metric(X[0], X[1])

    # then we refit the estimator on another dataset
    model.fit(np.sin(input_data), labels)

    # we recompute the distance between the two points: it should be the same
    score_bis = metric(X[0], X[1])
    assert score_bis == score
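Why refitting does not change an already-returned metric: a plausible implementation (an assumption for illustration, not necessarily metric-learn's actual code) returns a closure over a copy of the learned components, so a later fit rebinds the estimator's attribute without touching that copy:

import numpy as np

def get_metric(components):
    L = np.copy(components)  # a refit replaces the estimator's attribute,
                             # but cannot mutate this private copy
    def metric(u, v):
        return np.linalg.norm(L.dot(np.asarray(u) - np.asarray(v)))
    return metric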
Example #55
def test_score_pairs_pairwise(estimator, build_dataset):
    # Computing pairwise scores should return a Euclidean distance matrix.
    input_data, labels, _, X = build_dataset()
    n_samples = 20
    X = X[:n_samples]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y_quadruplets(estimator, input_data, labels))

    pairwise = model.score_pairs(np.array(list(product(X, X))))\
        .reshape(n_samples, n_samples)

    check_is_distance_matrix(pairwise)

    # a necessary condition for Euclidean distance matrices (see
    # https://en.wikipedia.org/wiki/Euclidean_distance_matrix): the rank of
    # the squared distance matrix is at most min(X.shape) + 2
    assert np.linalg.matrix_rank(pairwise**2) <= min(X.shape) + 2

    # assert that this distance is coherent with pdist on embeddings
    assert_array_almost_equal(squareform(pairwise), pdist(model.transform(X)))
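check_is_distance_matrix is a test-suite helper; a minimal sketch of the properties it plausibly verifies (the standard requirements for a distance matrix — an assumption, not the helper's exact code):

import numpy as np

def check_is_distance_matrix(D):
    assert np.allclose(D, D.T)           # symmetry
    assert np.allclose(np.diag(D), 0.)   # zero self-distances
    assert (D >= -1e-10).all()           # non-negativity
    # triangle inequality: D[i, j] <= D[i, k] + D[k, j] for all i, k, j
    assert (D[:, None, :] <= D[:, :, None] + D[None, :, :] + 1e-10).all()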
Example #56
def check_multiprocessing_idempotent(Estimator):
    # Check that running an estimator on a single process is no different from
    # running it on multiple processes. We also check that we can set n_jobs=-1
    # to make use of all CPUs. The test is not strictly necessary, though, as
    # we rely on joblib for parallelization and can trust that it works as
    # expected.
    estimator = _construct_instance(Estimator)
    params = estimator.get_params()

    if "n_jobs" in params:
        results = dict()
        args = dict()

        # run on a single process
        estimator = _construct_instance(Estimator)
        estimator.set_params(n_jobs=1)
        set_random_state(estimator)
        args["fit"] = _make_args(estimator, "fit")
        estimator.fit(*args["fit"])

        # compute and store results
        for method in NON_STATE_CHANGING_METHODS:
            if hasattr(estimator, method):
                args[method] = _make_args(estimator, method)
                results[method] = getattr(estimator, method)(*args[method])

        # run on multiple processes, reusing the same input arguments
        estimator = _construct_instance(Estimator)
        estimator.set_params(n_jobs=-1)
        set_random_state(estimator)
        estimator.fit(*args["fit"])

        # compute and compare results
        for method in results:
            if hasattr(estimator, method):
                result = getattr(estimator, method)(*args[method])
                _assert_array_equal(
                    results[method],
                    result,
                    err_msg="Results are not equal for n_jobs=1 and "
                    "n_jobs=-1",
                )
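The same idempotence can be checked directly against a scikit-learn estimator; a minimal sketch with RandomForestClassifier (chosen only because it exposes n_jobs):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
pred_1 = RandomForestClassifier(n_jobs=1, random_state=0).fit(X, y).predict(X)
pred_all = RandomForestClassifier(n_jobs=-1, random_state=0).fit(X, y).predict(X)
np.testing.assert_array_equal(pred_1, pred_all)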
Example #57
def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA':
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that recursively introspects any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is a possible RandomState instance, but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(
            joblib.hash(new_value), joblib.hash(original_value),
            "Estimator %s should not change or mutate "
            "the parameter %s from %s to %s during fit." %
            (name, param_name, original_value, new_value))
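joblib.hash is used (rather than the builtin hash) because it computes a content checksum for arbitrary nested objects, including numpy arrays, which are not hashable with the builtin:

import numpy as np
import joblib

a = np.arange(5)
b = np.arange(5)
assert joblib.hash(a) == joblib.hash(b)  # equal by content
b[0] = 42
assert joblib.hash(a) != joblib.hash(b)  # any mutation changes the checksum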
Example #58
def test_array_like_inputs(estimator, build_dataset, with_preprocessor):
    """Test that metric-learners can have as input (of all functions that are
    applied on data) any array-like object."""
    input_data, labels, preprocessor, X = build_dataset(with_preprocessor)

    # we subsample the data for the test to be more efficient
    input_data, _, labels, _ = train_test_split(input_data,
                                                labels,
                                                train_size=20)
    X = X[:10]

    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    input_variants, label_variants = generate_array_like(input_data, labels)
    for input_variant in input_variants:
        for label_variant in label_variants:
            estimator.fit(
                *remove_y_quadruplets(estimator, input_variant, label_variant))
        if hasattr(estimator, "predict"):
            estimator.predict(input_variant)
        if hasattr(estimator, "predict_proba"):
            estimator.predict_proba(input_variant)  # anticipation in case some
            # time we have that, or if ppl want to contribute with new algorithms
            # it will be checked automatically
        if hasattr(estimator, "decision_function"):
            estimator.decision_function(input_variant)
        if hasattr(estimator, "score"):
            for label_variant in label_variants:
                estimator.score(*remove_y_quadruplets(estimator, input_variant,
                                                      label_variant))

    X_variants, _ = generate_array_like(X)
    for X_variant in X_variants:
        estimator.transform(X_variant)

    pairs = np.array([[X[0], X[1]], [X[0], X[2]]])
    pairs_variants, _ = generate_array_like(pairs)
    for pairs_variant in pairs_variants:
        estimator.score_pairs(pairs_variant)
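generate_array_like is a helper from metric-learn's test suite; a rough stand-in (an assumption about what it produces, for illustration only) would yield the usual array-like containers for both the data and the labels:

import numpy as np

def generate_array_like(data, labels=None):
    data = np.asarray(data)
    data_variants = [data, data.tolist(), tuple(map(tuple, data))]
    label_variants = [labels]
    if labels is not None:
        labels = np.asarray(labels)
        label_variants = [labels, labels.tolist(), tuple(labels)]
    return data_variants, label_variants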
Example #59
def test_singular_array_init_or_prior_strictpd(estimator, build_dataset, w0):
    """Tests that when using a custom array init (or prior), it returns the
    appropriate error if it is singular, for algorithms
    that need a strictly PD prior or init (see
    https://github.com/scikit-learn-contrib/metric-learn/issues/202 and
    https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment
    -492332451)
    """
    matrices_to_set = []
    if hasattr(estimator, 'init'):
        matrices_to_set.append('init')
    if hasattr(estimator, 'prior'):
        matrices_to_set.append('prior')

    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    for param in matrices_to_set:
        model = clone(estimator)
        set_random_state(model)

        P = ortho_group.rvs(X.shape[1], random_state=rng)
        w = np.abs(rng.randn(X.shape[1]))
        w[0] = w0
        M = P.dot(np.diag(w)).dot(P.T)
        if hasattr(model, 'init'):
            model.set_params(init=M)
        if hasattr(model, 'prior'):
            model.set_params(prior=M)
        if not hasattr(model, 'prior') and not hasattr(model, 'init'):
            raise RuntimeError(
                "Neither prior nor init could be set in the model.")
        msg = ("You should provide a strictly positive definite "
               "matrix as `{}`. This one is not definite. Try another"
               " {}, or an algorithm that does not "
               "require the {} to be strictly positive definite.".format(
                   *(param, ) * 3))
        with pytest.raises(LinAlgError) as raised_err:
            model.fit(input_data, labels)
        assert str(raised_err.value) == msg
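Why w0 = 0 triggers the LinAlgError above: M = P · diag(w) · Pᵀ has the entries of w as eigenvalues, so zeroing one of them yields a matrix that is positive semi-definite but not strictly positive definite:

import numpy as np
from scipy.stats import ortho_group

P = ortho_group.rvs(3, random_state=42)
w = np.array([0., 1., 2.])
M = P.dot(np.diag(w)).dot(P.T)
print(np.linalg.matrix_rank(M))  # 2: rank-deficient, hence not strictly PD
# np.linalg.cholesky(M) would typically raise numpy.linalg.LinAlgError here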
Example #60
def check_classifiers_input_shapes(name, Classifier):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=1)
    X = StandardScaler().fit_transform(X)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_fast_parameters(classifier)
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        classifier.fit(X, y[:, np.newaxis])
    assert_equal(len(w), 1)
    assert_array_equal(y_pred, classifier.predict(X))
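The DataConversionWarning asserted above comes from scikit-learn's column_or_1d validation, which fires whenever a classifier that ravels its y is given a column vector; a minimal sketch (SVC is used purely as an example of such a classifier):

import warnings
import numpy as np
from sklearn.datasets import load_iris
from sklearn.exceptions import DataConversionWarning
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always", DataConversionWarning)
    SVC().fit(X, y[:, np.newaxis])
assert len(w) == 1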