def test_unique_label():
    """Smoke-test ``categorization_score`` with a single reference label.

    Every reference label is 0 and the predicted values are seeded random
    floats; the test passes as long as the call does not raise.
    """
    np.random.seed(10)

    shape = ground_truth.file_path.values.shape
    single_class_labels = np.zeros(shape).astype(int)
    doc_ids = np.arange(len(single_class_labels), dtype='int')
    random_predictions = np.random.rand(*shape)

    categorization_score(doc_ids, single_class_labels,
                         doc_ids, random_predictions)
def test_categorization(use_lsi, method, cv):
    """Fit, predict and score a categorizer for one parameter combination.

    Parameters
    ----------
    use_lsi : bool
        When true the categorizer is built on top of ``lsi.mid``, otherwise
        on top of ``vect_uuid``.
    method : str
        Name of the categorization method passed to ``cat.fit``.
    cv : object
        Cross-validation setting passed to ``cat.fit``.

    The test is skipped on Circle CI for the slow ``cv == 'fast'``
    combinations, when ``xgboost`` cannot be imported, and when ``fit``
    raises ``OptionalDependencyMissing``.  A ``WrongParameter`` error is
    tolerated (the test returns) for 'NearestNeighbor' and 'NearestCentroid'
    and re-raised for every other method.
    """
    if ('CIRCLECI' in os.environ and cv == 'fast'
            and method in ['LinearSVC', 'xgboost']):
        raise SkipTest  # Circle CI is too slow and timesout

    if method == 'xgboost':
        try:
            # Imported only to probe availability; the name is not used.
            import xgboost  # noqa: F401
        except ImportError:
            raise SkipTest

    if not use_lsi:
        uuid = vect_uuid
    else:
        uuid = lsi.mid

    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=uuid,
                              cv_n_folds=2)
    cat.fe.db_.filenames_ = cat.fe.filenames_
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

    try:
        model, Y_train = cat.fit(
            index,
            ground_truth.is_relevant.values,
            method=method,
            cv=cv)
    except OptionalDependencyMissing:
        raise SkipTest
    except WrongParameter:
        if method in ['NearestNeighbor', 'NearestCentroid']:
            return
        else:
            raise

    Y_pred, md = cat.predict()
    X_pred = np.arange(cat.fe.n_samples_, dtype='int')
    idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

    scores = categorization_score(idx_gt,
                                  ground_truth.is_relevant.values,
                                  X_pred, np.argmax(Y_pred, axis=1))

    assert cat.get_params() is not None

    n_classes = len(np.unique(ground_truth.is_relevant.values))
    assert Y_pred.shape == (cat.fe.n_samples_, n_classes)

    if method == 'NearestNeighbor':
        assert md.shape == Y_pred.shape
    else:
        assert md is None

    # The score checks fail for these methods for some reason so far, so
    # they are skipped -- but without returning early, so that the fitted
    # model is still removed below.
    if method not in ['xgboost', 'ensemble-stacking']:
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.68)

    cat.delete()
def test_categorization_score():
    """Check ``categorization_score`` values and reference-order invariance.

    Precision and recall are compared against fixed expected values, then
    the same reference (id, label) pairs are supplied in a different order
    and the average precision must be unchanged.
    """
    predicted_ids = [1, 2, 3, 4, 5, 6]
    predicted_labels = [1, 1, -1, -1, -1, 1]

    reference_ids = [10, 5, 3, 2, 6]
    reference_labels = [0, 1, 0, 1, 1]

    baseline = categorization_score(reference_ids, reference_labels,
                                    predicted_ids, predicted_labels)

    assert_allclose(baseline['precision'], 1.0)
    assert_allclose(baseline['recall'], 0.66666666, rtol=1e-4)

    # make sure permutations don't affect the result:
    # same pairs as above with the entries for ids 2 and 3 swapped
    permuted_ids = [10, 5, 2, 3, 6]
    permuted_labels = [0, 1, 1, 0, 1]

    permuted = categorization_score(permuted_ids, permuted_labels,
                                    predicted_ids, predicted_labels)

    assert baseline['average_precision'] == permuted['average_precision']
def test_features_hashing(use_hashing, use_lsi, method):
    """Check that models work both with and without feature hashing.

    Parameters
    ----------
    use_hashing : bool
        Forwarded to ``FeatureVectorizer.setup``.
    use_lsi : bool
        Selects whether the downstream model is built on the LSI model
        (``lsi.mid``) or directly on the vectorizer dataset id.
    method : str
        One of 'Categorization', 'DuplicateDetection' or 'Clustering'.

    Raises
    ------
    SkipTest
        When an optional dependency needed by the selected model is missing.
    ValueError
        When ``method`` is not one of the supported names.
    """
    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            categorizer_name = 'NearestNeighbor'
        else:
            parent_id = uuid
            categorizer_name = 'LogisticRegression'

        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(
                index,
                ground_truth.is_relevant.values,
                method=categorizer_name)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        # Only checks that the query runs; the result is not inspected.
        dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                clustering_name = 'birch'
            else:
                parent_id = uuid
                clustering_name = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir,
                                     parent_id=parent_id)
            cluster = getattr(cat, clustering_name)
            # The calls below are exercised for errors only; their return
            # values are not inspected.
            cluster(2)
            cat._get_htree(cat.pipeline.data)
            cat.compute_labels(n_top_words=10)
        else:
            # With hashing enabled, constructing the clustering wrapper is
            # expected to raise NotImplementedError.
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError('Unknown method: {}'.format(method))