def test_nearest_neighbor_ranker(n_categories):
    # n_categories is supplied by pytest parametrization (not shown in this
    # excerpt)
    # check that we have sensible results with respect to
    # NN1 binary classification (supervised, with both positive
    # and negative samples)
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.neighbors import NearestNeighbors

    np.random.seed(0)

    n_samples = 110
    n_features = 10

    X = np.random.rand(n_samples, n_features)
    normalize(X, copy=False)
    index = np.arange(n_samples, dtype='int')
    y = np.random.randint(0, n_categories, size=(n_samples,))
    index_train, index_test, y_train, y_test = train_test_split(index, y)
    X_train = X[index_train]
    X_test = X[index_test]

    rk = NearestNeighborRanker()
    rk.fit(X_train, y_train)
    # choose a batch size smaller than n_samples
    y_pred, idx_pred = rk.kneighbors(X_test, batch_size=90)

    assert y_pred.shape == (X_test.shape[0], n_categories)
    # scores are in [-1, 1] as we are using cosine similarities
    assert y_pred.min() >= -1 and y_pred.max() <= 1
    assert idx_pred.shape == (X_test.shape[0], n_categories)
    # positive scores correspond to positive documents
    # assert_equal((y_pred > 0), y_train[idx])

    cl = KNeighborsClassifier(n_neighbors=1, algorithm='brute',
                              metric='cosine')
    cl.fit(X_train, y_train)
    y_ref_cl = cl.predict(X_test)

    # make sure we predict the same labels as KNeighborsClassifier
    label_pred = np.argmax(y_pred, axis=1)
    assert_equal(label_pred, y_ref_cl)

    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='cosine')
    nn.fit(X_train)
    S_ref_nn, idx_ref_nn = nn.kneighbors(X_test)
    # the index of the best-scoring category must be the overall nearest
    # neighbor; NearestNeighbors returns cosine distances, so the
    # corresponding similarity is 1 - distance
    assert_equal(idx_pred[range(len(label_pred)), label_pred],
                 idx_ref_nn[:, 0])
    assert_allclose(np.max(y_pred, axis=1)[:, None], 1 - S_ref_nn)
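# Usage sketch (not part of the test suite): how the two arrays returned by
# NearestNeighborRanker.kneighbors can be turned into a ranked list of
# documents. It assumes only the API exercised in the test above
# (fit(X, y), then kneighbors(X) -> (similarities, indices)); the helper
# name rank_by_nearest_neighbor is hypothetical.
def rank_by_nearest_neighbor(rk, X_test):
    import numpy as np
    y_pred, idx_pred = rk.kneighbors(X_test)
    # best-matching category for each test document
    best_cat = np.argmax(y_pred, axis=1)
    # cosine similarity to the closest training document of that category
    best_score = y_pred[np.arange(len(best_cat)), best_cat]
    # documents sorted from most to least relevant
    order = np.argsort(best_score)[::-1]
    return order, best_cat[order], best_score[order]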
def test_nearest_neighbor_ranker_is_picklable():
    mod = NearestNeighborRanker()
    mod.fit([[0, 1], [1, 0]], [0, 1])
    # build the path before the try block, so that the finally clause
    # cannot fail with a NameError if os.path.join raises
    tmp_file = os.path.join(cache_dir, 'tmp_NearestNeighborRanker.pkl')
    try:
        joblib.dump(mod, tmp_file)
        # the test passes if the estimator can be loaded back without error
        mod2 = joblib.load(tmp_file)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
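# A stricter variant of the pickling test (a sketch, not in the original
# suite): besides loading without error, the round-tripped ranker should
# return the same scores and indices as the original. It assumes the
# kneighbors API shown in test_nearest_neighbor_ranker above, and uses
# pytest's built-in tmpdir fixture instead of cache_dir.
def test_nearest_neighbor_ranker_pickle_roundtrip(tmpdir):
    from numpy.testing import assert_allclose, assert_equal

    X = [[0, 1], [1, 0]]
    mod = NearestNeighborRanker()
    mod.fit(X, [0, 1])

    tmp_file = str(tmpdir.join('ranker.pkl'))
    joblib.dump(mod, tmp_file)
    mod2 = joblib.load(tmp_file)

    # predictions must be identical before and after the roundtrip
    s1, i1 = mod.kneighbors(X)
    s2, i2 = mod2.kneighbors(X)
    assert_allclose(s1, s2)
    assert_equal(i1, i2)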
def _build_estimator(Y_train, method, cv, cv_scoring, cv_n_folds,
                     random_state=None, **options):
    if cv:
        # from sklearn.cross_validation import StratifiedKFold
        # cv_obj = StratifiedKFold(n_splits=cv_n_folds, shuffle=False)
        # temporary hack (otherwise there are pickling issues);
        # this needs to be fixed
        cv_obj = cv_n_folds
    else:
        cv_obj = None

    _rename_main_thread()

    if method == 'LinearSVC':
        from sklearn.svm import LinearSVC
        if cv is None:
            cmod = LinearSVC(random_state=random_state, **options)
        else:
            try:
                from freediscovery_extra import make_linearsvc_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_linearsvc_cv_model(cv_obj, cv_scoring, **options)
    elif method == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        if cv is None:
            cmod = LogisticRegression(random_state=random_state, **options)
        else:
            try:
                from freediscovery_extra import make_logregr_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_logregr_cv_model(cv_obj, cv_scoring, **options)
    elif method == 'NearestCentroid':
        cmod = NearestCentroidRanker()
    elif method == 'NearestNeighbor':
        cmod = NearestNeighborRanker()
    elif method == 'xgboost':
        try:
            import xgboost  # noqa: F401  (only check that it is installed)
        except ImportError:
            raise OptionalDependencyMissing('xgboost')
        if cv is None:
            try:
                from freediscovery_extra import make_xgboost_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_xgboost_model(cv_obj, cv_scoring, **options)
        else:
            try:
                from freediscovery_extra import make_xgboost_cv_model
            except ImportError:
                raise OptionalDependencyMissing('freediscovery_extra')
            cmod = make_xgboost_cv_model(cv, cv_obj, cv_scoring, **options)
    elif method == 'MLPClassifier':
        if cv is not None:
            raise NotImplementedFD('CV not supported with MLPClassifier')
        from sklearn.neural_network import MLPClassifier
        cmod = MLPClassifier(solver='adam', hidden_layer_sizes=10,
                             max_iter=200, activation='identity',
                             verbose=0, random_state=random_state)
    else:
        raise WrongParameter('Method {} not implemented!'.format(method))
    return cmod
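if __name__ == '__main__':
    # Minimal demonstration of the factory above (a sketch, not part of the
    # library; _build_estimator is an internal helper and the exact call
    # signature used by the surrounding code is not shown in this excerpt).
    # Any extra keyword arguments (here C=10.0) are forwarded verbatim to
    # the underlying scikit-learn constructor via **options.
    import numpy as np

    rng = np.random.RandomState(42)
    X_train = rng.rand(20, 5)
    y_train = rng.randint(0, 2, size=20)

    cmod = _build_estimator(y_train, method='LogisticRegression',
                            cv=None, cv_scoring=None, cv_n_folds=None,
                            random_state=42, C=10.0)
    cmod.fit(X_train, y_train)  # a regular scikit-learn estimator
    print(cmod.decision_function(X_train[:3]))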