Example #1
import os
from unittest import SkipTest

import numpy as np
import pytest
from numpy.testing import assert_allclose

# NOTE: the snippet as scraped lacks its imports; the module paths below
# assume the pre-1.0 (flat) FreeDiscovery layout and may need adjusting.
from freediscovery.text import FeatureVectorizer
from freediscovery.lsi import LSI
from freediscovery.categorization import Categorizer
from freediscovery.cluster import Clustering
from freediscovery.dupdet import DuplicateDetection
from freediscovery.io import parse_ground_truth_file
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing
from freediscovery.tests.run_suite import check_cache

# data_dir is defined at module level in the original test file; this value
# mirrors the setup of Example #2 and is an assumption.
basename = os.path.dirname(__file__)
data_dir = os.path.join(basename, "..", "..", "data", "ds_001", "raw")


# Assumed parametrization (not part of the scraped snippet): the test body
# below handles these use_hashing / method combinations.
@pytest.mark.parametrize('use_hashing', [False, True])
@pytest.mark.parametrize('method',
                         ['Categorization', 'LSI',
                          'DuplicateDetection', 'Clustering'])
def test_features_hashing(use_hashing, method):
    # check that models work both with and without hashing

    cache_dir = check_cache()

    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir,
                         file_pattern=r'.*\d.txt',
                         n_features=n_features,
                         use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    if method == 'Categorization':
        cat = Categorizer(cache_dir=cache_dir, dsid=uuid, cv_n_folds=2)
        index = cat.fe.search(ground_truth.index.values)

        try:
            coefs, Y_train = cat.train(
                index,
                ground_truth.is_relevant.values,
            )
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.search(ground_truth.index.values)

        scores = categorization_score(idx_gt, ground_truth.is_relevant.values,
                                      X_pred, Y_pred)
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.5)
        cat.delete()
    elif method == 'LSI':
        lsi = LSI(cache_dir=cache_dir, dsid=uuid)
        lsi_res, exp_var = lsi.transform(
            n_components=100)  # lsi_res and exp_var are unused below
        lsi_id = lsi.mid
        assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
        assert lsi.get_path(lsi_id) is not None
        assert lsi._load_pars(lsi_id) is not None
        lsi.load(lsi_id)

        idx_gt = lsi.fe.search(ground_truth.index.values)
        idx_all = np.arange(lsi.fe.n_samples_, dtype='int')

        for accumulate in ['nearest-max', 'centroid-max']:
            # other options ('nearest-diff', 'nearest-combine', 'stacking')
            # are not exercised here
            _, Y_train, Y_pred, ND_train = lsi.predict(
                idx_gt, ground_truth.is_relevant.values, accumulate=accumulate)
            scores = categorization_score(idx_gt,
                                          ground_truth.is_relevant.values,
                                          idx_all, Y_pred)
            assert_allclose(scores['precision'], 1, rtol=0.5)
            assert_allclose(scores['recall'], 1, rtol=0.3)
    elif method == 'DuplicateDetection':
        dd = DuplicateDetection(cache_dir=cache_dir, dsid=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            cat = Clustering(cache_dir=cache_dir, dsid=uuid)
            cm = getattr(cat, 'k_means')
            labels, htree = cm(2, lsi_components=20)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                Clustering(cache_dir=cache_dir, dsid=uuid)

    else:
        raise ValueError
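
The very loose `rtol` values above are a lower-bound idiom rather than a
closeness check: `assert_allclose(score, 1, rtol=0.5)` passes for any score in
[0.5, 1.5], so for a metric bounded by 1 it simply enforces score >= 0.5. A
minimal sketch of that idiom (plain NumPy, no FreeDiscovery needed):

from numpy.testing import assert_allclose

# assert_allclose checks abs(actual - desired) <= atol + rtol * abs(desired);
# with desired=1, atol=0 (the default) and rtol=0.5 that accepts any value in
# [0.5, 1.5], i.e. a ">= 0.5" floor for a score bounded by 1 (rtol=0.3
# likewise enforces recall >= 0.7 above).
assert_allclose(0.7, 1, rtol=0.5)      # passes: 0.7 is above the floor
try:
    assert_allclose(0.4, 1, rtol=0.5)  # raises: below the 0.5 floor
except AssertionError:
    print("score below tolerance")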
Example #2
import itertools
from pathlib import Path

# NOTE: FeatureVectorizer is used below but was missing from the scraped
# imports; in the engine layout it is assumed to live in
# freediscovery.engine.vectorizer.
from freediscovery.engine.vectorizer import FeatureVectorizer
from freediscovery.engine.categorization import _CategorizerWrapper
from freediscovery.engine.lsi import _LSIWrapper
from freediscovery.io import parse_ground_truth_file
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing, WrongParameter
from freediscovery.tests.run_suite import check_cache

basename = Path(__file__).parent

cache_dir = check_cache()

EPSILON = 1e-4

data_dir = basename / ".." / ".." / "data" / "ds_001" / "raw"

ground_truth = parse_ground_truth_file(
    str(data_dir / ".." / "ground_truth_file.txt"))

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
lsi.fit_transform(n_components=6)

_test_cases = itertools.product([False, True], [
    "LinearSVC", "LogisticRegression", 'xgboost', "NearestNeighbor",
    "NearestCentroid"
], [None, 'fast'])

# 'MLPClassifier' and 'ensemble-stacking' are not supported in production
# at the moment
_test_cases = filter(lambda x: not (x[1].startswith("Nearest") and x[2]),
                     _test_cases)
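
The scraped snippet ends here; in a module like this, the filtered
_test_cases iterable would typically feed a parametrized test. A minimal
sketch of that pattern follows (the import, the parameter names, and the test
body are assumptions, not part of the original):

import pytest  # assumed; not among the scraped imports

@pytest.mark.parametrize('use_lsi, method, solver', _test_cases)
def test_categorize(use_lsi, method, solver):
    # hypothetical body: fit a _CategorizerWrapper on the fixtures built
    # above, then score its predictions against ground_truth
    ...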