def test_categorization_score():
    idx = [1, 2, 3, 4, 5, 6]
    y = [1, 1, -1, -1, -1, 1]
    idx_ref = [10, 5, 3, 2, 6]
    y_ref = [0, 1, 0, 1, 1]

    scores = categorization_score(idx_ref, y_ref, idx, y)

    assert_allclose(scores['precision'], 1.0)
    assert_allclose(scores['recall'], 0.66666666, rtol=1e-4)

    # make sure permutations don't affect the result
    idx_ref2 = [10, 5, 2, 3, 6]
    y_ref2 = [0, 1, 1, 0, 1]
    scores2 = categorization_score(idx_ref2, y_ref2, idx, y)
    assert scores['average_precision'] == scores2['average_precision']
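
For reference, here is a minimal sketch of a scoring helper with the same contract (an illustration under stated assumptions, not the library's actual implementation). The property this test exercises is that labels are aligned on the intersection of the two index arrays, which makes the result invariant to permutations of either input:

def categorization_score_sketch(idx_ref, y_ref, idx, y):
    import numpy as np
    from sklearn import metrics

    # Align the two label arrays on the document indices they share;
    # np.intersect1d sorts the common indices, so input order is irrelevant.
    _, ref_pos, pred_pos = np.intersect1d(
        np.asarray(idx_ref), np.asarray(idx), return_indices=True)
    y_true = np.asarray(y_ref)[ref_pos]
    y_score = np.asarray(y)[pred_pos]
    # assumption: a positive decision score marks a document as relevant
    y_pred = (y_score > 0).astype(int)
    return {'precision': metrics.precision_score(y_true, y_pred),
            'recall': metrics.recall_score(y_true, y_pred),
            'average_precision': metrics.average_precision_score(y_true, y_score)}

On the arrays above this sketch yields a precision of 1.0 and a recall of 2/3, matching the assertions.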
def test_unique_label():
    """Check that testing works with only one label in the training test"""
    np.random.seed(10)
    Nshape = ground_truth.file_path.values.shape
    is_relevant = np.zeros(Nshape).astype(int)

    idx = np.arange(len(is_relevant), dtype='int')

    # smoke test: should not raise even though only one label is present
    scores = categorization_score(idx, is_relevant, idx,
                                  np.random.rand(*Nshape))
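
The next test takes use_lsi, method and cv as arguments; the pytest.mark.parametrize decorator that supplies them is not part of this excerpt. A plausible reconstruction, inferred only from the values the body references (the actual grid may differ):

@pytest.mark.parametrize(
    'use_lsi, method, cv',
    [(False, 'LogisticRegression', 'fast'),
     (False, 'LinearSVC', 'fast'),
     (False, 'xgboost', 'fast'),
     (True, 'NearestNeighbor', 'fast'),
     (True, 'ensemble-stacking', 'fast')])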
def test_categorization(use_lsi, method, cv):

    if 'CIRCLECI' in os.environ and cv == 'fast' and method in [
            'LinearSVC', 'xgboost'
    ]:
        raise SkipTest  # Circle CI is too slow and times out

    if method == 'xgboost':
        try:
            import xgboost
        except ImportError:
            raise SkipTest

    if not use_lsi:
        uuid = vect_uuid
    else:
        uuid = lsi.mid

    cat = _CategorizerWrapper(cache_dir=cache_dir,
                              parent_id=uuid,
                              cv_n_folds=2)
    index = cat.fe.db._search_filenames(ground_truth.file_path.values)

    try:
        model, Y_train = cat.train(index,
                                   ground_truth.is_relevant.values,
                                   method=method,
                                   cv=cv)
    except OptionalDependencyMissing:
        raise SkipTest

    Y_pred, md = cat.predict()
    X_pred = np.arange(cat.fe.n_samples_, dtype='int')
    idx_gt = cat.fe.db._search_filenames(ground_truth.file_path.values)

    scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                  X_pred, np.argmax(Y_pred, axis=1))

    assert cat.get_params() is not None

    assert Y_pred.shape == (cat.fe.n_samples_,
                            len(np.unique(ground_truth.is_relevant.values)))

    if method == 'NearestNeighbor':
        assert md.shape == Y_pred.shape
    else:
        assert md is None

    if method in ['xgboost', 'ensemble-stacking']:
        # these methods still fail the score assertions below for some reason
        return
    assert_allclose(scores['precision'], 1, rtol=0.5)
    assert_allclose(scores['recall'], 1, rtol=0.68)
    cat.delete()
Example #4
def test_lsi():
    basename = os.path.dirname(__file__)

    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(
        data_dir, file_pattern=r'.*\d.txt', n_features=n_features
    )  # TODO unused variable (overwritten on the next line)
    uuid, filenames = fe.transform()
    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = LSI(cache_dir=cache_dir, dsid=uuid)
    lsi_res, exp_var = lsi.transform(n_components=100)  # TODO unused variables
    lsi_id = lsi.mid
    assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
    assert lsi.get_path(lsi_id) is not None
    assert lsi._load_pars(lsi_id) is not None
    lsi.load(lsi_id)

    idx_gt = lsi.fe.search(ground_truth.index.values)
    idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

    for accumulate in ['nearest-max', 'centroid-max']:
        # TODO also test: 'nearest-diff', 'nearest-combine', 'stacking'
        _, Y_train, Y_pred, ND_train = lsi.predict(
            idx_gt, ground_truth.is_relevant.values, accumulate=accumulate)
        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      idx_all, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.3)

    lsi.list_models()
    lsi.delete()
Example #5
    seed_filenames = ds['seed_filenames']
    seed_y = ds['seed_y']
    ground_truth_file = ds['ground_truth_file']  # (optional)

    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 50001,
               'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2"
              }

    fe = FeatureVectorizer(cache_dir=cache_dir)

    uuid = fe.preprocess(**fe_opts)
    uuid, filenames = fe.transform()

    seed_index = fe.search(seed_filenames)

    cat = Categorizer(cache_dir=cache_dir, dsid=uuid)
    cat.train(seed_index, seed_y)

    predictions = cat.predict()

    gt = parse_ground_truth_file(ground_truth_file)
    idx_ref = cat.fe.search(gt.index.values)
    idx_res = np.arange(cat.fe.n_samples_, dtype='int')

    scores = categorization_score(idx_ref, gt.is_relevant.values,
                                  idx_res, predictions)

    print('    => Test scores: MAP = {average_precision:.3f}, '
          'ROC-AUC = {roc_auc:.3f}'.format(**scores))
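
The snippet above is the body of a longer example script: ds, cache_dir and data_dir are defined earlier and are not part of this excerpt. A hypothetical ds mapping, showing only the expected structure (file names and labels are placeholders):

ds = {
    'seed_filenames': ['0001.txt', '0002.txt'],    # placeholder document names
    'seed_y': [1, 0],                              # 1 = relevant, 0 = not relevant
    'ground_truth_file': 'ground_truth_file.txt',  # optional
}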
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern=r'.*\d.txt',
                         n_features=n_features,
                         use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(
        n_components=100)  # TODO unused variables
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir,
                                  parent_id=parent_id,
                                  cv_n_folds=2)
        index = cat.fe.db._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.train(index,
                                       ground_truth.is_relevant.values,
                                       method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels, htree = cm(2)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)

    else:
        raise ValueError('unknown method: {}'.format(method))
Example #7
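
As with the previous examples, the parametrize decorator for this variant is missing from the excerpt; the branches in the body suggest something like the following (hypothetical):

@pytest.mark.parametrize(
    'use_hashing, method',
    [(False, 'Categorization'), (True, 'Categorization'),
     (False, 'LSI'), (True, 'LSI'),
     (False, 'DuplicateDetection'), (True, 'DuplicateDetection'),
     (False, 'Clustering'), (True, 'Clustering')])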
def test_features_hashing(use_hashing, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern=r'.*\d.txt',
                         n_features=n_features,
                         use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    if method == 'Categorization':
        cat = Categorizer(cache_dir=cache_dir, dsid=uuid, cv_n_folds=2)
        index = cat.fe.search(ground_truth.index.values)

        try:
            coefs, Y_train = cat.train(
                index,
                ground_truth.is_relevant.values,
            )
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.search(ground_truth.index.values)

        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      X_pred, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.5)
        cat.delete()
    elif method == 'LSI':
        lsi = LSI(cache_dir=cache_dir, dsid=uuid)
        lsi_res, exp_var = lsi.transform(
            n_components=100)  # TODO unused variables
        lsi_id = lsi.mid
        assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
        assert lsi.get_path(lsi_id) is not None
        assert lsi._load_pars(lsi_id) is not None
        lsi.load(lsi_id)

        idx_gt = lsi.fe.search(ground_truth.index.values)
        idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

        for accumulate in ['nearest-max', 'centroid-max']:
            # TODO also test: 'nearest-diff', 'nearest-combine', 'stacking'
            _, Y_train, Y_pred, ND_train = lsi.predict(
                idx_gt, ground_truth.is_relevant.values, accumulate=accumulate)
            scores = categorization_score(idx_gt,
                                          ground_truth.is_relevant.values,
                                          idx_all, Y_pred)
            assert_allclose(scores['precision'], 1, rtol=0.5)
            assert_allclose(scores['recall'], 1, rtol=0.3)
    elif method == 'DuplicateDetection':
        dd = DuplicateDetection(cache_dir=cache_dir, dsid=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            cat = Clustering(cache_dir=cache_dir, dsid=uuid)
            cm = getattr(cat, 'k_means')
            labels, htree = cm(2, lsi_components=20)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                Clustering(cache_dir=cache_dir, dsid=uuid)

    else:
        raise ValueError('unknown method: {}'.format(method))