def test_custom_mid():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    mid_orig = "sklds"

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                      mid=mid_orig, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    lsi._load_features()
    assert lsi.mid == mid_orig

    # re-using an existing mid in write mode is not allowed
    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                          mid=mid_orig, mode='w')
        lsi.fit_transform(n_components=2, alpha=1.0)

    # while overwriting it in force-write mode works
    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                      mid=mid_orig, mode='fw')
    lsi.fit_transform(n_components=2, alpha=1.0)

    # an invalid mid is rejected
    with pytest.raises(WrongParameter):
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid,
                          mid='?', mode='fw')
        lsi.fit_transform(n_components=2, alpha=1.0)
def fd_setup():
    basename = os.path.dirname(__file__)
    cache_dir = check_cache()
    np.random.seed(1)
    data_dir = os.path.join(basename, "..", "..", "data", "ds_001", "raw")
    n_features = 110000
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    dsid = fe.setup(n_features=n_features, use_hashing=False,
                    stop_words='english', min_df=0.1, max_df=0.9)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=dsid, mode='w')
    lsi.fit_transform(n_components=6)
    return cache_dir, dsid, fe.filenames_, lsi.mid
def test_lsi_remove_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2
@pytest.mark.parametrize('kind', ['regular', 'semantic'])
def test_search_wrapper(kind):
    # check for syntax errors etc in the wrapper;
    # any value other than 'semantic' exercises the plain (non-LSI) search
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    vect_uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    if kind == 'semantic':
        lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
        lsi.fit_transform(n_components=20)
        parent_id = lsi.mid
    else:
        parent_id = vect_uuid

    sw = _SearchWrapper(cache_dir=cache_dir, parent_id=parent_id)
    dist = sw.search("so that I can reserve a room")
    assert dist.shape == (fe.n_samples_,)
    # document 1 found by
    # grep -rn "so that I can reserve a room"
    # freediscovery/data/ds_001/raw/
    assert dist.argmax() == 1
def test_lsi():
    cache_dir = check_cache()

    n_components = 2
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=n_components, alpha=1.0)
    # the effective number of LSI components is adjusted internally to the
    # size of this small dataset, hence 5 rather than n_components
    assert lsi_res.components_.shape[0] == 5
    assert lsi_res.components_.shape[1] == fe.n_features_
    assert lsi._load_pars() is not None
    lsi._load_model()
    X_lsi = lsi._load_features()

    # the LSI features are L2-normalized
    assert_allclose(normalize(X_lsi), X_lsi)

    lsi.list_models()
    lsi.delete()
def test_lsi_append_documents():
    cache_dir = check_cache()
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    # re-ingest the same documents under new document_ids
    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0]*2
    # the LSI features of the original documents are left unchanged
    assert_equal(X_lsi_new[:n_samples], X_lsi)
def test_features_hashing(use_hashing, use_lsi, method):
    # check that models work both with and without hashing
    cache_dir = check_cache()

    n_features = 20000
    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup(n_features=n_features, use_hashing=use_hashing)
    fe.ingest(data_dir, file_pattern=r'.*\d.txt')

    ground_truth = parse_ground_truth_file(
        os.path.join(data_dir, "..", "ground_truth_file.txt"))

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=100)
    assert lsi._load_pars() is not None
    lsi._load_model()

    if method == 'Categorization':
        if use_lsi:
            parent_id = lsi.mid
            method = 'NearestNeighbor'
        else:
            parent_id = uuid
            method = 'LogisticRegression'
        cat = _CategorizerWrapper(cache_dir=cache_dir, parent_id=parent_id,
                                  cv_n_folds=2)
        cat.fe.db_.filenames_ = cat.fe.filenames_
        index = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        try:
            coefs, Y_train = cat.fit(index, ground_truth.is_relevant.values,
                                     method=method)
        except OptionalDependencyMissing:
            raise SkipTest

        Y_pred, md = cat.predict()
        X_pred = np.arange(cat.fe.n_samples_, dtype='int')
        idx_gt = cat.fe.db_._search_filenames(ground_truth.file_path.values)

        scores = categorization_score(idx_gt,
                                      ground_truth.is_relevant.values,
                                      X_pred, np.argmax(Y_pred, axis=1))
        assert_allclose(scores['precision'], 1, rtol=0.5)
        assert_allclose(scores['recall'], 1, rtol=0.7)
        cat.delete()
    elif method == 'DuplicateDetection':
        dd = _DuplicateDetectionWrapper(cache_dir=cache_dir, parent_id=uuid)
        try:
            dd.fit()
        except ImportError:
            raise SkipTest
        cluster_id = dd.query(distance=10)
    elif method == 'Clustering':
        if not use_hashing:
            if use_lsi:
                parent_id = lsi.mid
                method = 'birch'
            else:
                parent_id = uuid
                method = 'k_means'
            cat = _ClusteringWrapper(cache_dir=cache_dir, parent_id=parent_id)
            cm = getattr(cat, method)
            labels = cm(2)

            htree = cat._get_htree(cat.pipeline.data)
            terms = cat.compute_labels(n_top_words=10)
        else:
            # clustering is not implemented on hashed features
            with pytest.raises(NotImplementedError):
                _ClusteringWrapper(cache_dir=cache_dir, parent_id=uuid)
    else:
        raise ValueError
def append(self, dataset_definition, data_dir=None):
    """ Add some documents to the dataset

    This is by no means an efficient operation; processing all the
    files at once might be more suitable in most occasions.
    """
    from freediscovery.engine.lsi import _LSIWrapper
    dsid_dir = self.dsid_dir
    db_old = self.db_.data
    internal_id_offset = db_old.internal_id.max()

    db_extra = DocumentIndex.from_list(dataset_definition, data_dir,
                                       internal_id_offset + 1, dsid_dir)
    db_new = db_extra.data

    vect = self.vect_
    tfidf = self.tfidf_

    filenames_new = list(db_new.file_path.values)

    # write down the new features file
    X_new_raw = vect.transform(filenames_new)
    X_new = tfidf.transform(X_new_raw)
    X_old = self._load_features()
    # keep the existing documents first so that the row order matches the
    # concatenated filenames and database written below
    X = scipy.sparse.vstack((X_old, X_new))
    joblib.dump(X, str(dsid_dir / 'features'))

    # write down the new filenames file
    filenames_old = list(self.filenames_)
    filenames = filenames_old + filenames_new
    data_dir = DocumentIndex._detect_data_dir(filenames)
    self._pars['data_dir'] = data_dir
    self._filenames = [os.path.relpath(el, data_dir) for el in filenames]
    with (dsid_dir / 'filenames').open('wb') as fh:
        pickle.dump(self._filenames, fh)
    del db_new['file_path']

    # write down the new pars file
    self._pars = self.pars_
    self._pars['n_samples'] = len(filenames)
    with (dsid_dir / 'pars').open('wb') as fh:
        pickle.dump(self._pars, fh)

    # write down the new database file
    db = pd.concat((db_old, db_new))
    if 'file_path' in db.columns:
        del db['file_path']
    db.to_pickle(str(dsid_dir / 'db'))
    self._db = DocumentIndex(self.pars_['data_dir'], db)

    # find all existing LSI models and update them as well
    if (dsid_dir / 'lsi').exists():
        for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
            lsi_obj = _LSIWrapper(cache_dir=self.cache_dir, mid=lsi_id)
            lsi_obj.append(X_new)

    # remove all trained models for this dataset
    for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
        if (dsid_dir / model_type).exists():
            for mid in os.listdir(str(dsid_dir / model_type)):
                shutil.rmtree(str(dsid_dir / model_type / mid))
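# A minimal usage sketch for the append() method above, mirroring the pattern
# exercised in test_lsi_append_documents. The helper name
# `_example_append_documents` is hypothetical, and it assumes `fe` is a
# FeatureVectorizer whose dataset has already been ingested from `data_dir`.
def _example_append_documents(fe, data_dir):
    # build a dataset_definition: one dict per document with an absolute
    # 'file_path' and a 'document_id' that does not clash with existing ids
    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 100
    dataset_definition = docs[['file_path', 'document_id']].to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    # append the documents; existing LSI models are updated and other
    # trained models for this dataset are invalidated
    fe.append(dataset_definition)
    return fe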
basename = Path(__file__).parent
cache_dir = check_cache()
EPSILON = 1e-4

data_dir = basename / ".." / ".." / "data" / "ds_001" / "raw"

ground_truth = parse_ground_truth_file(
    str(data_dir / ".." / "ground_truth_file.txt"))

fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
vect_uuid = fe.setup()
fe.ingest(str(data_dir), file_pattern=r'.*\d.txt')

lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=vect_uuid, mode='w')
lsi.fit_transform(n_components=6)

_test_cases = itertools.product(
    [False, True],
    ["LinearSVC", "LogisticRegression", 'xgboost',
     "NearestNeighbor", "NearestCentroid"],
    [None, 'fast'])

# 'MLPClassifier' and 'ensemble-stacking' are not supported in production
# at the moment
_test_cases = filter(lambda x: not (x[1].startswith("Nearest") and x[2]),
                     _test_cases)


@pytest.mark.parametrize('use_lsi, method, cv', _test_cases)
def test_categorization(use_lsi, method, cv):