Example #1
import numpy as np
from numpy.testing import assert_allclose, assert_equal
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

# estimator under test; import path assumed from the freediscovery package
from freediscovery.neighbors import NearestNeighborRanker


def test_nearest_neighbor_ranker(n_categories):
    # check that we get sensible results with respect to
    # 1-NN classification (supervised, with both positive
    # and negative samples)
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neighbors import NearestNeighbors
    np.random.seed(0)

    n_samples = 110
    n_features = 10

    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)  # L2-normalize rows in place
    index = np.arange(n_samples, dtype='int')
    y = np.random.randint(0, n_categories, size=(n_samples, ))
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train = X[index_train]
    X_test = X[index_test]

    rk = NearestNeighborRanker()
    rk.fit(X_train, y_train)
    # choose a batch size smaller than n_samples
    y_pred, idx_pred = rk.kneighbors(X_test, batch_size=90)

    assert y_pred.shape == (X_test.shape[0], n_categories)
    # scores are cosine similarities, hence bounded in [-1, 1]
    assert y_pred.min() >= -1 and y_pred.max() <= 1
    assert idx_pred.shape == (X_test.shape[0], n_categories)

    # positive scores correspond to positive documents

    # assert_equal((y_pred > 0), y_train[idx])

    cl = KNeighborsClassifier(n_neighbors=1,
                              algorithm='brute',
                              metric='cosine')
    cl.fit(X_train, y_train)

    y_ref_cl = cl.predict(X_test)

    # make sure we get the same predicted labels as KNeighborsClassifier
    label_pred = np.argmax(y_pred, axis=1)
    assert_equal(label_pred, y_ref_cl)

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='cosine')
    nn.fit(X_train)
    S_ref_nn, idx_ref_nn = nn.kneighbors(X_test)

    assert_equal(idx_pred[range(len(label_pred)), label_pred], idx_ref_nn[:, 0])
    # kneighbors returns cosine distances; the ranker returns similarities
    assert_allclose(np.max(y_pred, axis=1)[:, None], 1 - S_ref_nn)
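
The last assertion relies on scikit-learn's convention that cosine distance equals one minus cosine similarity, which is why the ranker's similarity scores are compared against 1 - S_ref_nn. A minimal standalone sketch of that identity, using only NumPy and scikit-learn (no freediscovery code):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

rng = np.random.RandomState(0)
A = rng.rand(5, 3)
B = rng.rand(4, 3)

# scikit-learn defines cosine_distances(A, B) as 1 - cosine_similarity(A, B),
# which is exactly the relationship checked by the final assert above
np.testing.assert_allclose(cosine_distances(A, B), 1 - cosine_similarity(A, B))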
Example #2
import os

import joblib


def test_nearest_neighbor_ranker_is_picklable():
    mod = NearestNeighborRanker()

    mod.fit([[0, 1], [1, 0]], [0, 1])

    try:
        # cache_dir is assumed to be a writable directory set up by the test module
        tmp_file = os.path.join(cache_dir, 'tmp_NearestNeighborRanker.pkl')
        joblib.dump(mod, tmp_file)

        # loading back must succeed; an unpickling error would fail the test here
        mod2 = joblib.load(tmp_file)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
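
A pickling round-trip test is often strengthened by checking that the restored estimator behaves identically. A minimal sketch of that pattern, using a stock scikit-learn estimator as a stand-in for NearestNeighborRanker and tempfile instead of the test module's cache_dir fixture:

import os
import tempfile

import joblib
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = [[0, 1], [1, 0]]
y = [0, 1]
mod = KNeighborsClassifier(n_neighbors=1).fit(X, y)

with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_file = os.path.join(tmp_dir, 'model.pkl')
    joblib.dump(mod, tmp_file)
    mod2 = joblib.load(tmp_file)

    # the round-tripped model should make exactly the same predictions
    np.testing.assert_array_equal(mod.predict(X), mod2.predict(X))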
Example #3
    def _build_estimator(Y_train, method, cv, cv_scoring, cv_n_folds,
                         random_state=None, **options):
        # note: Y_train is not used by any of the branches below
        if cv:
            # from sklearn.model_selection import StratifiedKFold
            # cv_obj = StratifiedKFold(n_splits=cv_n_folds, shuffle=False)
            # temporary hack: pass the number of folds directly, since
            # constructing the CV object here causes pickling issues (to be fixed)
            cv_obj = cv_n_folds
        else:
            cv_obj = None

        _rename_main_thread()

        if method == 'LinearSVC':
            from sklearn.svm import LinearSVC
            if cv is None:
                cmod = LinearSVC(random_state=random_state, **options)
            else:
                try:
                    from freediscovery_extra import make_linearsvc_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_linearsvc_cv_model(cv_obj, cv_scoring, **options)
        elif method == 'LogisticRegression':
            from sklearn.linear_model import LogisticRegression
            if cv is None:
                cmod = LogisticRegression(random_state=random_state, **options)
            else:
                try:
                    from freediscovery_extra import make_logregr_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_logregr_cv_model(cv_obj, cv_scoring, **options)
        elif method == 'NearestCentroid':
            cmod = NearestCentroidRanker()
        elif method == 'NearestNeighbor':
            cmod = NearestNeighborRanker()
        elif method == 'xgboost':
            try:
                import xgboost as xgb
            except ImportError:
                raise OptionalDependencyMissing('xgboost')
            if cv is None:
                try:
                    from freediscovery_extra import make_xgboost_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_xgboost_model(cv_obj, cv_scoring, **options)
            else:
                try:
                    from freediscovery_extra import make_xgboost_cv_model
                except ImportError:
                    raise OptionalDependencyMissing('freediscovery_extra')
                cmod = make_xgboost_cv_model(cv, cv_obj, cv_scoring, **options)
        elif method == 'MLPClassifier':
            if cv is not None:
                raise NotImplementedFD('CV not supported with MLPClassifier')
            from sklearn.neural_network import MLPClassifier
            cmod = MLPClassifier(solver='adam', hidden_layer_sizes=10,
                                 max_iter=200, activation='identity',
                                 verbose=0,
                                 random_state=random_state)
        else:
            raise WrongParameter('Method {} not implemented!'.format(method))
        return cmod
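
A hypothetical call to this dispatcher, sketched as a direct function call; in the project it is a method, so it would be reached through its defining class, with _rename_main_thread and the custom exceptions in scope. Extra keyword arguments (here C) are forwarded verbatim to the underlying scikit-learn estimator:

import numpy as np

y_train = np.array([0, 1, 0, 1])
X_train = np.array([[0.0, 1.0], [1.0, 0.0], [0.1, 0.9], [0.9, 0.1]])

# cv=None selects the plain (non cross-validated) LogisticRegression branch
cmod = _build_estimator(y_train, method='LogisticRegression',
                        cv=None, cv_scoring=None, cv_n_folds=5,
                        random_state=42, C=1.0)
cmod.fit(X_train, y_train)
print(cmod.predict(X_train))  # expected: [0 1 0 1]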