def test_load_20newsgroups_dataset(name):
    if sys.version_info < (3, 0):
        raise SkipTest
    md, training_set, dataset = load_dataset(name, cache_dir=cache_dir)

    response_ref = {
        "document_id": 'int',
        "file_path": "str",
        "internal_id": "int"
    }
    if '20_newsgroups_' in name or 'treclegal09' in name:
        response_ref["category"] = "str"

    assert dict2type(md) == {'data_dir': 'str', 'name': 'str'}

    assert dict2type(dataset[0]) == response_ref
    assert dict2type(training_set[1]) == response_ref

    categories = sorted(list({row['category'] for row in dataset}))
    for categories_sel in \
            [[categories[0]],
             [categories[1]],
             [categories[0], categories[1]],
             [categories[1]],
             categories]:

        md, training_set, dataset = load_dataset(name,
                                                 cache_dir=cache_dir,
                                                 categories=categories_sel)

        for resp in [training_set, dataset]:

            assert dict2type(resp[0]) == response_ref
            result_fields = list({el['category'] for el in resp})

            # the opposite is not always true (e.g. for small training sets)
            for key in result_fields:
                assert key in categories_sel

        training_set = pd.DataFrame(training_set)
        dataset = pd.DataFrame(dataset)
        if name == 'treclegal09_2k_subset':
            if categories_sel == ['positive']:
                assert dataset.shape[0] == 12
            elif categories_sel == categories:
                assert dataset.shape[0] == 2465
                assert (training_set.category == 'positive').sum() == 5
        elif name == '20_newsgroups_micro':
            if categories_sel == ['comp.graphics']:
                assert dataset.shape[0] == 3
            elif categories_sel == categories:
                assert dataset.shape[0] == 7
                assert training_set.shape[0] == 4
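
# The assertions above rely on a dict2type helper (together with sys, SkipTest,
# pd, load_dataset and cache_dir, which are assumed to be module-level imports
# or fixtures of the original test module). Its implementation is not shown
# here; a minimal sketch of the assumed behaviour, replacing every value in a
# (possibly nested) dict by the name of its type, could be:
def dict2type(obj):
    """Hypothetical sketch; the real freediscovery helper may differ."""
    if isinstance(obj, dict):
        return {key: dict2type(val) for key, val in obj.items()}
    return type(obj).__name__

# e.g. dict2type({'document_id': 3, 'file_path': '/tmp/a.txt'})
# -> {'document_id': 'int', 'file_path': 'str'}
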
def test_load_dataset():
    try:
        from unittest.mock import patch, MagicMock
    except ImportError:
        raise SkipTest


    cache_dir = check_cache()
    m = MagicMock()
    m2 = MagicMock()
    with patch.dict("sys.modules", requests=m, tarfile=m2):
        res = load_dataset(verbose=False, force=True, cache_dir=cache_dir,
                           load_ground_truth=False, verify_checksum=False)
    assert sorted(res.keys()) == sorted([#"ground_truth_file", "seed_non_relevant_files",
                                         #"seed_relevant_files",
                                         "base_dir", "data_dir"])
Example #3
import numpy as np

from freediscovery.datasets import load_dataset
from freediscovery.text import FeatureVectorizer
from freediscovery.categorization import Categorizer
from freediscovery.tests.run_suite import check_cache
from freediscovery.io import parse_ground_truth_file
from freediscovery.utils import categorization_score

dataset_name = "treclegal09_2k_subset"     # see list of available datasets

cache_dir = check_cache()

if __name__ == '__main__':

    ds = load_dataset(dataset_name, load_ground_truth=True, cache_dir=cache_dir)


    # To use a custom dataset, simply specify the following variables
    data_dir = ds['data_dir']
    seed_filenames = ds['seed_filenames']
    seed_y = ds['seed_y']
    ground_truth_file = ds['ground_truth_file']  # (optional)

    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 0, 'binary': 0, 'n_features': 50001,
               'analyzer': 'word', 'ngram_range': (1, 1), "norm": "l2"
              }

    fe = FeatureVectorizer(cache_dir=cache_dir)
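
    # The original example is truncated at this point. A sketch of the next
    # step, mirroring the FeatureVectorizer calls shown in Example #4 below
    # (an assumption, not part of the original snippet):
    uuid = fe.preprocess(data_dir,
                         n_features=fe_opts['n_features'],
                         use_hashing=False,
                         use_idf=True, stop_words='english')
    uuid, filenames = fe.transform()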
Example #4
import pandas as pd

from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import _ClusteringWrapper
from freediscovery.lsi import _LSIWrapper
from freediscovery.datasets import load_dataset
from freediscovery.tests.run_suite import check_cache
from time import time

pd.options.display.float_format = '{:,.3f}'.format

dataset_name = "treclegal09_2k_subset"
cache_dir = check_cache()


print("0. Load Dataset")

md, training_set, dataset = load_dataset(dataset_name, cache_dir=cache_dir)


print("\n1. Feature extraction (non hashed)\n")

n_features = 30000
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(md['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()




n_clusters = 10
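
# The example declares n_clusters but stops before the clustering step, and the
# signatures of freediscovery's _LSIWrapper / _ClusteringWrapper are not shown
# above. As a generic illustration only (plain scikit-learn, not the
# freediscovery API), the LSI + clustering combination could be sketched as:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

toy_corpus = ["graphics card rendering", "hockey game score",
              "rendering engine shaders", "baseball season opener"]

lsi_kmeans = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    TruncatedSVD(n_components=2),     # LSI: low-rank projection of the TF-IDF matrix
    KMeans(n_clusters=2, n_init=10),  # cluster documents in the reduced LSI space
)
cluster_labels = lsi_kmeans.fit_predict(toy_corpus)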