import sys

import pandas as pd
import pytest
from unittest import SkipTest

from freediscovery.datasets import load_dataset
from freediscovery.tests.run_suite import check_cache
from freediscovery.utils import dict2type   # maps dict values to type names (import path assumed)

cache_dir = check_cache()


# Parametrization assumed from the two dataset names checked in the test body.
@pytest.mark.parametrize('name', ['20_newsgroups_micro', 'treclegal09_2k_subset'])
def test_load_20newsgoups_dataset(name):
    if sys.version_info < (3, 0):
        raise SkipTest
    md, training_set, dataset = load_dataset(name, cache_dir=cache_dir)

    response_ref = {"document_id": "int",
                    "file_path": "str",
                    "internal_id": "int"}

    if '20_newsgroups_' in name or 'treclegal09' in name:
        response_ref["category"] = "str"

    assert dict2type(md) == {'data_dir': 'str', 'name': 'str'}
    assert dict2type(dataset[0]) == response_ref
    assert dict2type(training_set[1]) == response_ref

    categories = sorted({row['category'] for row in dataset})

    for categories_sel in [[categories[0]],
                           [categories[1]],
                           [categories[0], categories[1]],
                           categories]:
        md, training_set, dataset = load_dataset(name, cache_dir=cache_dir,
                                                 categories=categories_sel)

        for resp in [training_set, dataset]:
            assert dict2type(resp[0]) == response_ref
            result_fields = list({el['category'] for el in resp})
            # the opposite is not always true (e.g. for small training sets)
            for key in result_fields:
                assert key in categories_sel

        training_set = pd.DataFrame(training_set)
        dataset = pd.DataFrame(dataset)

        if name == 'treclegal09_2k_subset':
            if categories_sel == ['positive']:
                assert dataset.shape[0] == 12
            elif categories_sel == categories:
                assert dataset.shape[0] == 2465
                assert (training_set.category == 'positive').sum() == 5
        elif name == '20_newsgroups_micro':
            if categories_sel == ['comp.graphics']:
                assert dataset.shape[0] == 3
            elif categories_sel == categories:
                assert dataset.shape[0] == 7
                assert training_set.shape[0] == 4
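# The assertions above compare each returned row against a schema of type
# names. A rough, hypothetical re-implementation of such a dict2type-style
# helper is sketched below for illustration only; the real helper ships with
# freediscovery and may differ.
def _dict2type_sketch(d):
    """Return a dict mapping every key to the name of its value's type."""
    return {key: type(value).__name__ for key, value in d.items()}

# e.g. _dict2type_sketch({'document_id': 4, 'file_path': '/tmp/doc.txt'})
# returns {'document_id': 'int', 'file_path': 'str'}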
def test_load_dataset():
    try:
        from unittest.mock import patch, MagicMock
    except ImportError:
        raise SkipTest
    cache_dir = check_cache()

    m = MagicMock()
    m2 = MagicMock()
    with patch.dict("sys.modules", requests=m, tarfile=m2):
        res = load_dataset(verbose=False, force=True, cache_dir=cache_dir,
                           load_ground_truth=False, verify_checksum=False)

    assert sorted(res.keys()) == sorted([  # "ground_truth_file",
                                           # "seed_non_relevant_files",
                                           # "seed_relevant_files",
                                           "base_dir",
                                           "data_dir"])
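# The test above never touches the network: patch.dict temporarily inserts
# mock objects into sys.modules, so any ``import requests`` / ``import
# tarfile`` executed while the block is active resolves to the mocks. A
# minimal, self-contained illustration of that pattern (not part of the test
# suite; the names below are purely illustrative):
def _mock_modules_demo():
    from unittest.mock import MagicMock, patch

    def download(url):
        import requests          # resolved at call time via sys.modules
        return requests.get(url)

    fake_requests = MagicMock()
    with patch.dict("sys.modules", requests=fake_requests):
        download("http://example.com")   # no real HTTP request is made
    fake_requests.get.assert_called_once_with("http://example.com")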
import numpy as np

from freediscovery.datasets import load_dataset
from freediscovery.text import FeatureVectorizer
from freediscovery.categorization import Categorizer
from freediscovery.tests.run_suite import check_cache
from freediscovery.io import parse_ground_truth_file
from freediscovery.utils import categorization_score

dataset_name = "treclegal09_2k_subset"   # see list of available datasets
cache_dir = check_cache()

if __name__ == '__main__':
    ds = load_dataset(dataset_name, load_ground_truth=True, cache_dir=cache_dir)

    # To use a custom dataset, simply specify the following variables
    data_dir = ds['data_dir']
    seed_filenames = ds['seed_filenames']
    seed_y = ds['seed_y']
    ground_truth_file = ds['ground_truth_file']   # (optional)

    fe_opts = {'data_dir': data_dir,
               'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
               'use_idf': 1, 'sublinear_tf': 0, 'binary': 0,
               'n_features': 50001, 'analyzer': 'word',
               'ngram_range': (1, 1), 'norm': 'l2'}

    fe = FeatureVectorizer(cache_dir=cache_dir)
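    # A possible next step (illustrative sketch only): run the feature
    # extraction with the options defined above. Only the preprocess()
    # arguments that also appear in the clustering example further down are
    # used here; whether preprocess() accepts every key of fe_opts directly
    # is an assumption.
    uuid = fe.preprocess(data_dir,
                         n_features=fe_opts['n_features'],
                         use_idf=fe_opts['use_idf'],
                         stop_words=fe_opts['stop_words'])
    uuid, filenames = fe.transform()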
import pandas as pd

from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import _ClusteringWrapper
from freediscovery.lsi import _LSIWrapper
from freediscovery.datasets import load_dataset
from freediscovery.tests.run_suite import check_cache
from time import time

pd.options.display.float_format = '{:,.3f}'.format

dataset_name = "treclegal09_2k_subset"
cache_dir = check_cache()

print("0. Load Dataset")

md, training_set, dataset = load_dataset(dataset_name, cache_dir=cache_dir)

print("\n1. Feature extraction (non hashed)\n")

n_features = 30000
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(md['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()

n_clusters = 10
import pandas as pd

from freediscovery.text import FeatureVectorizer
from freediscovery.cluster import _ClusteringWrapper
from freediscovery.lsi import _LSIWrapper
from freediscovery.datasets import load_dataset
from freediscovery.tests.run_suite import check_cache
from time import time

pd.options.display.float_format = '{:,.3f}'.format

dataset_name = "treclegal09_2k_subset"
cache_dir = check_cache()

print("0. Load Dataset")

ds = load_dataset(dataset_name, cache_dir=cache_dir)

print("\n1. Feature extraction (non hashed)\n")

n_features = 30000
fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(ds['data_dir'],
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()

n_clusters = 10
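# The next step in this example is LSI followed by k-means clustering. The
# project wraps that pipeline behind the _LSIWrapper / _ClusteringWrapper
# classes imported above, whose exact method signatures are not shown here,
# so the sketch below demonstrates the same idea with plain scikit-learn on a
# toy corpus instead; it is not the FreeDiscovery API.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans

docs = ["payment due on the invoice", "the invoice was paid late",
        "meeting scheduled for monday", "monday meeting moved to tuesday"]
X = TfidfVectorizer(stop_words='english').fit_transform(docs)
X_lsi = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X_lsi)
# labels separates the invoice-related from the meeting-related documents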