def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # the dataset id returned by preprocess() is re-emitted by transform()
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    assert_equal(fe._pars['filenames'], filenames)

    for low, high, step in [(0, 1, 1), (0, 4, 1), (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [filenames[idx] for idx in idx_slice]
        idx0 = fe.search(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    with pytest.raises(KeyError):
        fe.search(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
def test_feature_extraction_storage():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    db = pd.read_pickle(os.path.join(cache_dir, 'ediscovery_cache',
                                     uuid, 'db'))
    assert 'file_path' not in db.columns
def fd_setup(**fe_options):
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=True,
                    stop_words='english', **fe_options)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')
    return cache_dir, uuid, fe.filenames_, fe
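# Illustrative sketch only: a minimal consumer of the fd_setup() helper
# above. The test name and assertions are hypothetical; the accessors it
# uses (n_samples_, _load_features, delete) are the same ones exercised
# by the other tests in this suite.
def test_fd_setup_sketch():
    cache_dir, uuid, filenames, fe = fd_setup()
    # one ingested document per returned filename
    assert fe.n_samples_ == len(filenames)
    X = fe._load_features(uuid)
    assert X.shape[0] == len(filenames)
    fe.delete()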
def test_search_filenames(use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    assert fe.db_ is not None

    for low, high, step in [(0, 1, 1), (0, 4, 1), (3, 1, -1)]:
        idx_slice = list(range(low, high, step))
        filenames_slice = [fe.filenames_[idx] for idx in idx_slice]
        idx0 = fe.db_._search_filenames(filenames_slice)
        assert_equal(idx0, idx_slice)
        assert_equal(filenames_slice, fe[idx0])

    with pytest.raises(NotFound):
        fe.db_._search_filenames(['DOES_NOT_EXIST.txt'])

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.list_datasets()
def fd_setup(**fe_options):
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # the dataset id returned by preprocess() is re-emitted by transform()
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  n_features=n_features, use_hashing=True,
                  stop_words='english', **fe_options)
    uuid, filenames = fe.transform()
    return cache_dir, uuid, filenames, fe
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    dsid = fe.setup(n_features=n_features, use_hashing=False,
                    stop_words='english', min_df=0.1, max_df=0.9)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, use_hashing=True,
                         stop_words='english')
    uuid, filenames = fe.transform()
    return cache_dir, uuid, filenames, fe
def test_email_parsing():
    data_dir = os.path.join(basename, "..", "data",
                            "fedora-devel-list-2008-October")
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    email_md = fe.parse_email_headers()
    assert len(fe.filenames_) == len(email_md)

    fe.delete()
def test_lsi():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features)
    uuid, filenames = fe.transform()

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = LSI(cache_dir=cache_dir, dsid=uuid)
    lsi_res, exp_var = lsi.transform(n_components=100)
    lsi_id = lsi.mid
    assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid
    assert lsi.get_path(lsi_id) is not None
    assert lsi._load_pars(lsi_id) is not None
    lsi.load(lsi_id)

    mask = ground_truth.is_relevant.values == 1
    for accumulate in ['nearest-max', 'centroid-max']:
        # 'nearest-diff', 'nearest-combine', 'stacking' are not tested
        _, X_train, Y_train_val, Y_train, X_pred, Y_pred, ND_train = \
            lsi.predict(ground_truth.index.values[mask],
                        ground_truth.index.values[~mask],
                        accumulate=accumulate)
        scores = classification_score(ground_truth.index.values,
                                      ground_truth.is_relevant.values,
                                      X_pred, Y_pred)
        # yield assert_allclose, scores['precision_score'], 1
        # yield assert_allclose, scores['recall_score'], 1

    lsi.list_models()
    lsi.delete()
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")
    n_features = 110000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # the dataset id returned by preprocess() is re-emitted by transform()
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  n_features=n_features, use_hashing=False,
                  stop_words='english', min_df=0.1, max_df=0.9)
    dsid, filenames = fe.transform()

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid)
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, filenames, lsi.mid
def test_lsi(): basename = os.path.dirname(__file__) cache_dir = check_cache() data_dir = os.path.join(basename, "..", "data", "ds_001", "raw") n_components = 5 fe = FeatureVectorizer(cache_dir=cache_dir) uuid = fe.preprocess(data_dir, file_pattern='.*\d.txt') uuid, filenames = fe.transform() lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid) lsi_res, exp_var = lsi.fit_transform( n_components=n_components) # TODO unused variables assert lsi_res.components_.shape == (n_components, fe.n_features_) assert lsi._load_pars() is not None lsi._load_model() # test pipeline lsi.list_models() lsi.delete()
def test_lsi(): basename = os.path.dirname(__file__) cache_dir = check_cache() data_dir = os.path.join(basename, "..", "data", "ds_001", "raw") n_features = 110000 fe = FeatureVectorizer(cache_dir=cache_dir) uuid = fe.preprocess( data_dir, file_pattern='.*\d.txt', n_features=n_features ) # TODO unused variable (overwritten on the next line) uuid, filenames = fe.transform() ground_truth = parse_ground_truth_file( os.path.join(data_dir, "..", "ground_truth_file.txt")) lsi = LSI(cache_dir=cache_dir, dsid=uuid) lsi_res, exp_var = lsi.transform(n_components=100) # TODO unused variables lsi_id = lsi.mid assert lsi.get_dsid(fe.cache_dir, lsi_id) == uuid assert lsi.get_path(lsi_id) is not None assert lsi._load_pars(lsi_id) is not None lsi.load(lsi_id) idx_gt = lsi.fe.search(ground_truth.index.values) idx_all = np.arange(lsi.fe.n_samples_, dtype='int') for accumulate in ['nearest-max', 'centroid-max']: #'nearest-diff', 'nearest-combine', 'stacking']: _, Y_train, Y_pred, ND_train = lsi.predict( idx_gt, ground_truth.is_relevant.values, accumulate=accumulate) scores = categorization_score(idx_gt, ground_truth.is_relevant.values, idx_all, Y_pred) assert_allclose(scores['precision'], 1, rtol=0.5) assert_allclose(scores['recall'], 1, rtol=0.3) lsi.list_models() lsi.delete()
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(analyzer=analyzer, ngram_range=ngram_range,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
def test_feature_extraction_cyrillic(use_hashing):
    data_dir = os.path.join(basename, "..", "data", "ds_002", "raw")
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)

    filenames = fe.filenames_
    fe._filenames = None
    filenames2 = fe.filenames_

    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    fe.delete()
def test_feature_extraction_tokenization(analyzer, ngram_range, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         analyzer=analyzer, ngram_range=ngram_range,
                         use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    fe.delete()
def test_feature_extraction_nfeatures(n_features, use_idf, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')
    use_idf = (use_idf == 'IDF')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_idf=use_idf,
                    use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()

    assert res2.shape[1] == fe.n_features_
    fe.delete()
def test_feature_extraction(analyzer, stop_words, ngram_range, use_idf,
                            sublinear_tf, binary, use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # the dataset id returned by preprocess() is re-emitted by transform()
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  n_features=n_features, analyzer=analyzer,
                  stop_words=stop_words, ngram_range=ngram_range,
                  use_idf=use_idf, binary=binary,
                  use_hashing=use_hashing, sublinear_tf=sublinear_tf)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    fe.delete()
def test_feature_extraction_weighting(use_idf, sublinear_tf, binary,
                                      use_hashing):
    cache_dir = check_cache()
    use_idf = (use_idf == 'IDF')
    sublinear_tf = (sublinear_tf == 'sublinear TF')
    binary = (binary == 'binary')
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(use_idf=use_idf, binary=binary,
                    use_hashing=use_hashing, sublinear_tf=sublinear_tf)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    res2 = fe._load_features(uuid)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()

    assert_allclose(normalize(res2).data, res2.data)  # data is l2 normalized

    fe.delete()
def test_threading():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir=data_dir)
    fe.parse_email_headers()

    cat = _EmailThreadingWrapper(cache_dir=cache_dir, parent_id=uuid)

    tree = cat.thread()
    cat.get_params()

    tree_ref = [{
        'id': 0, 'parent': None, 'children': [
            {'id': 1, 'children': [], 'parent': 0},
            {'id': 2, 'parent': 0, 'children': [
                {'id': 3, 'children': [], 'parent': 2},
                {'id': 4, 'children': [], 'parent': 2}]}
        ]}]

    assert [el.to_dict() for el in tree] == tree_ref
    assert len(fe.filenames_) == sum([el.size for el in tree])
    assert len(fe.filenames_) == 5
    assert len(tree[0].flatten()) == 5
def test_feature_extraction_weighting(use_idf, sublinear_tf, binary,
                                      use_hashing):
    cache_dir = check_cache()
    use_idf = (use_idf == 'IDF')
    sublinear_tf = (sublinear_tf == 'sublinear TF')
    binary = (binary == 'binary')
    use_hashing = (use_hashing == 'hashed')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         use_idf=use_idf, binary=binary,
                         use_hashing=use_hashing,
                         sublinear_tf=sublinear_tf)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()
    fe.delete()
def test_feature_extraction_nfeatures(n_features, use_idf, use_hashing):
    cache_dir = check_cache()
    use_hashing = (use_hashing == 'hashed')
    use_idf = (use_idf == 'IDF')

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, use_idf=use_idf,
                         use_hashing=use_hashing)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)
    assert np.isfinite(res2.data).all()

    assert res2.shape[1] == fe.n_features_
    fe.delete()
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing, WrongParameter
from .run_suite import check_cache


basename = os.path.dirname(__file__)

cache_dir = check_cache()

EPSILON = 1e-4

data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

fe = FeatureVectorizer(cache_dir=cache_dir)
vect_uuid = fe.setup()
fe.ingest(data_dir, file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid)
lsi.fit_transform(n_components=6)

ground_truth = parse_ground_truth_file(
    os.path.join(data_dir, "..", "ground_truth_file.txt"))

_test_cases = itertools.product(
    [False, True],
    ["LinearSVC", "LogisticRegression", 'xgboost',
     "NearestNeighbor", "NearestCentroid"],
    [None, 'fast'])
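# Illustrative sketch only: one way the module-level `_test_cases` above
# could drive a parametrized categorization test. The body mirrors the
# _CategorizerWrapper usage in test_features_hashing(); the test name and
# assertions are hypothetical, and the `cv` parameter is accepted but not
# exercised here.
@pytest.mark.parametrize('use_lsi, method, cv', _test_cases)
def test_categorization_sketch(use_lsi, method, cv):
    parent_id = lsi.mid if use_lsi else vect_uuid
    cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                              cv_n_folds=2)
    index = cat.fe.db_._search_filenames(ground_truth.file_path.values)
    try:
        cat.fit(index, ground_truth.is_relevant.values, method=method)
    except OptionalDependencyMissing:
        raise SkipTest
    Y_pred, md = cat.predict()
    # one prediction row per ingested document
    assert Y_pred.shape[0] == cat.fe.n_samples_
    cat.delete()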
from freediscovery.categorization import Categorizer
from freediscovery.io import parse_ground_truth_file
from freediscovery.utils import classification_score
from freediscovery.exceptions import OptionalDependencyMissing
from ..utils import _silent
from .run_suite import check_cache


basename = os.path.dirname(__file__)

cache_dir = check_cache()

data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

n_features = 20000

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                     n_features=n_features, binary=True,
                     use_idf=False, norm=None)
uuid, filenames = fe.transform()

ground_truth = parse_ground_truth_file(
    os.path.join(data_dir, "..", "ground_truth_file.txt"))


@pytest.mark.parametrize(
    'method, cv',
    itertools.product(
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing
    cache_dir = check_cache()
    n_features = 20000

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid)
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)

            terms = cat.compute_labels(n_top_words=10)
        else:
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError
def test_feature_extraction(analyzer, stop_words, ngram_range, use_idf,
                            sublinear_tf, binary, use_hashing):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                         n_features=n_features, analyzer=analyzer,
                         stop_words=stop_words, ngram_range=ngram_range,
                         use_idf=use_idf, binary=binary,
                         use_hashing=use_hashing,
                         sublinear_tf=sublinear_tf)
    uuid, filenames = fe.transform()

    filenames2, res2 = fe.load(uuid)
    assert_equal(filenames2, filenames)
    assert isinstance(res2, np.ndarray) or scipy.sparse.issparse(res2), \
        "not an array {}".format(res2)

    fe.search(['0.7.47.117435.txt'])
    fe.search(['DOES_NOT_EXIST.txt'])
    fe.list_datasets()

    assert np.isfinite(res2.data).all()

    if not use_hashing:
        n_top_words = 5
        terms = fe.query_features([2, 3, 5], n_top_words=n_top_words)
        assert len(terms) == n_top_words

    fe.delete()
from freediscovery.lsi import _LSIWrapper
from freediscovery.categorization import _CategorizerWrapper
from freediscovery.io import parse_ground_truth_file
from freediscovery.metrics import categorization_score
from freediscovery.exceptions import OptionalDependencyMissing
from .run_suite import check_cache


basename = os.path.dirname(__file__)

cache_dir = check_cache()

EPSILON = 1e-4

data_dir = os.path.join(basename, "..", "data", "ds_001", "raw")

fe = FeatureVectorizer(cache_dir=cache_dir)
vect_uuid = fe.preprocess(data_dir, file_pattern=r'.*\d.txt')
vect_uuid, filenames = fe.transform()

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid)
lsi.fit_transform(n_components=6)

ground_truth = parse_ground_truth_file(
    os.path.join(data_dir, "..", "ground_truth_file.txt"))

_test_cases = itertools.product(
    [False, True],
    ["LinearSVC", "LogisticRegression", 'xgboost',
     "NearestNeighbor", "NearestCentroid"],
def test_sampling_filenames():
    cache_dir = check_cache()

    fe_pars = {'binary': True, 'norm': None, 'sublinear_tf': False}

    fe = FeatureVectorizer(cache_dir=cache_dir)
    # the dataset id returned by preprocess() is re-emitted by transform()
    fe.preprocess(data_dir, file_pattern=r'.*\d.txt',
                  use_hashing=True, **fe_pars)
    uuid, filenames = fe.transform()
    fnames, X = fe.load(uuid)

    # don't use any sampling
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=None)
    fnames_s, X_s = fes.load(uuid)
    pars = fe._load_pars()
    assert_array_equal(fnames, fnames_s)
    assert_array_equal(X.data, X_s.data)
    assert fes.n_samples_ == len(fnames)

    # reverse the filename order
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=fnames[::-1])
    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames[::-1], fnames_s)
    assert_array_equal(X[::-1, :].data, X_s.data)
    for key in pars:
        if key == 'filenames':
            assert pars[key][::-1] == pars_s[key]
        else:
            assert pars[key] == pars_s[key]

    # repeat twice the filenames
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=(fnames + fnames))
    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames + fnames, fnames_s)
    assert_array_equal(X.data, X_s[:len(fnames)].data)
    assert_array_equal(X.data, X_s[len(fnames):].data)
    assert fes.n_samples_ == len(fnames) * 2
    # for key in pars:
    #     assert pars[key] == pars_s[key]

    # downsample the filenames
    N = len(fnames) // 2
    np.random.seed(1)
    idx = np.random.choice(fe.n_samples_, size=(N,))
    fnames_s_in = np.array(fnames)[idx].tolist()
    fes = _FeatureVectorizerSampled(cache_dir=cache_dir, dsid=uuid,
                                    sampling_filenames=fnames_s_in)
    assert fes.sampling_index is not None
    fnames_s, X_s = fes.load(uuid)
    pars_s = fes._load_pars_sampled()
    assert_array_equal(fnames_s_in, fnames_s)
    assert_array_equal(X[idx].data, X_s.data)
    assert fes.n_samples_ == N

    fe.delete()
def test_df_filtering(use_hashing, min_df, max_df):
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.preprocess(data_dir, use_hashing=use_hashing,
                         min_df=min_df, max_df=max_df)
    uuid, filenames = fe.transform()
    _, X = fe.load(uuid)

    fe2 = FeatureVectorizer(cache_dir=cache_dir)
    uuid2 = fe2.preprocess(data_dir, use_hashing=use_hashing)
    uuid2, filenames = fe2.transform()
    _, X2 = fe2.load(uuid2)

    if use_hashing:
        # min/max_df does not affect the number of features
        assert X.shape[1] == X2.shape[1]
    else:
        # min/max_df removes some features
        assert X.shape[1] < X2.shape[1]

    fe.delete()
pd.options.display.float_format = '{:,.3f}'.format

data_dir = "../freediscovery_shared/tar_fd_benchmark"
# relative path between this file and the FreeDiscovery source folder
examples_to_server_path = "../"
BASE_URL = "http://localhost:5001/api/v0"  # FreeDiscovery server URL


# # 1. Feature extraction (non hashed)

# In[2]:

n_features = 30000
cache_dir = '/tmp/'

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess("../" + data_dir + '/data',
                     n_features=n_features, use_hashing=False,
                     use_idf=True, stop_words='english')
uuid, filenames = fe.transform()


# # 2. Document Clustering (LSI + K-Means)

# In[4]:

cat = Clustering(cache_dir=cache_dir, dsid=uuid)

n_clusters = 10
n_top_words = 6
lsi_components = 50
ds = load_dataset(dataset_name, load_ground_truth=True, cache_dir=cache_dir)

# To use a custom dataset, simply specify the following variables
data_dir = ds['data_dir']
seed_filenames = ds['seed_filenames']
seed_y = ds['seed_y']
ground_truth_file = ds['ground_truth_file']  # (optional)

fe_opts = {'data_dir': data_dir,
           'stop_words': 'english', 'chunk_size': 2000, 'n_jobs': -1,
           'use_idf': 1, 'sublinear_tf': 0, 'binary': 0,
           'n_features': 50001, 'analyzer': 'word',
           'ngram_range': (1, 1), "norm": "l2"}

fe = FeatureVectorizer(cache_dir=cache_dir)
uuid = fe.preprocess(**fe_opts)
uuid, filenames = fe.transform()

seed_index = fe.search(seed_filenames)

cat = Categorizer(cache_dir=cache_dir, dsid=uuid)
cat.train(seed_index, seed_y)
predictions = cat.predict()

gt = parse_ground_truth_file(ground_truth_file)

idx_ref = cat.fe.search(gt.index.values)
idx_res = np.arange(cat.fe.n_samples_, dtype='int')