# Tests for DocumentIndex / FeatureVectorizer ingestion.
# NOTE: the imports and module-level fixtures below (data_dir, cache_dir,
# fnames_in, fnames_in_abs, check_cache) are assumptions reconstructed from
# how the tests use them; adjust to the actual test package layout.
import os

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_equal, assert_array_equal
from pandas.util.testing import assert_frame_equal

from freediscovery.ingestion import DocumentIndex   # assumed import path
from freediscovery.text import FeatureVectorizer    # assumed import path
from freediscovery.exceptions import NotFound       # assumed import path
from .run_suite import check_cache                  # assumed import path

basename = os.path.dirname(__file__)
# assumed location of the test corpus
data_dir = os.path.join(basename, '..', 'data', 'ds_001', 'raw')
fnames_in = sorted(os.listdir(data_dir))
fnames_in_abs = [os.path.join(data_dir, el) for el in fnames_in]
cache_dir = check_cache()


def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 3},
                          {'internal_id': 1},
                          {'internal_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{'internal_id': 1, 'a': 2},
                          {'internal_id': 2, 'b': 4},
                          {'internal_id': 1, 'a': 3}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id',
                               'a', 'b']))

    # search by file_path
    query = pd.DataFrame([{'file_path': '0.7.6.28637.txt'},
                          {'file_path': '0.7.47.117435.txt'}])
    sres = dbi.search(query)
    query_res = [dbi.data.file_path.values.tolist().index(el)
                 for el in query.file_path.values]
    assert_array_equal(query_res, sres.internal_id)

def test_search_document_id():
    md = [{'file_path': '/test', 'document_id': 2},
          {'file_path': '/test2', 'document_id': 1},
          {'file_path': '/test3', 'document_id': 7},
          {'file_path': '/test8', 'document_id': 9},
          {'file_path': '/test9', 'document_id': 4}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'internal_id': 1}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure we use the internal_id first
    query = pd.DataFrame([{'internal_id': 1, 'document_id': 2},
                          {'internal_id': 2, 'document_id': 2},
                          {'internal_id': 1, 'document_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])

    query = pd.DataFrame([{'document_id': 4},
                          {'document_id': 9},
                          {'document_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [4, 3, 0])

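# The key precedence the two tests above rely on (internal_id wins over
# document_id; a plain document_id query falls back to matching on that
# column) can be pictured as a pandas merge. This is a minimal sketch of the
# presumed mechanism, not DocumentIndex's actual implementation.
def _search_sketch(data, query):
    """Return rows of ``data`` matching ``query`` on the best key available."""
    for key in ['internal_id', 'document_id']:  # assumed precedence order
        if key in query.columns:
            # a left merge preserves the query's row order
            return query[[key]].merge(data, on=key, how='left')
    raise ValueError('no usable key in the query')
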
# parametrize values inferred from the n_fields branches in the test body
@pytest.mark.parametrize('n_fields', [1, 2, 3])
def test_ingestion_metadata(n_fields):
    metadata = []
    for idx, fname in enumerate(fnames_in_abs):
        el = {'file_path': fname}
        if n_fields >= 2:
            el['document_id'] = 'a' + str(idx + 100)
        if n_fields >= 3:
            el['rendition_id'] = 1
        metadata.append(el)

    dbi = DocumentIndex.from_list(metadata)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data

    if n_fields in [1, 2]:
        columns_ref = sorted(['file_path', 'document_id', 'internal_id'])
    elif n_fields == 3:
        columns_ref = sorted(['file_path', 'document_id', 'rendition_id',
                              'internal_id'])

    assert_array_equal(sorted(db.columns.values), columns_ref)
    assert_array_equal([os.path.normpath(os.path.join(data_dir_res, el))
                        for el in filenames],
                       [os.path.join(data_dir_res, el)
                        for el in db.file_path.values])

def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')
    fe.remove([dataset_definition[2], dataset_definition[4]])

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that internal_id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()

def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir)
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']] \
        .to_dict(orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that internal_id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()

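# Both tests above assert the same invariant: after any remove/append the
# internal_id column is renumbered to 0, 1, ..., n-1. A sketch of how such a
# renumbering can be done with pandas (an illustration of the invariant, not
# the actual FeatureVectorizer code):
def _renumber_internal_id(df):
    """Reset row labels and reassign contiguous internal ids."""
    df = df.reset_index(drop=True)
    df['internal_id'] = np.arange(len(df))
    return df
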
def test_ingestion_pickling():
    from sklearn.externals import joblib

    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)

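# The test above only checks that dump/load do not raise. A natural
# extension, assuming DocumentIndex keeps its table in a ``data`` DataFrame
# as the other tests suggest, would be to also verify that the round trip
# preserves the index contents, e.g.:
#
#     assert_frame_equal(db.data, db2.data)
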
def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)

    query = pd.DataFrame([{'file_path': 'DOES_NOT_EXISTS'},
                          {'file_path': '0.7.6.28637.txt'}])
    with pytest.raises(NotFound):
        dbi.search(query)

def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values, ['file_path', 'internal_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal([os.path.normpath(el) for el in filenames],
                       [os.path.join(data_dir_res, el)
                        for el in db.file_path.values])

def test_bad_search_document_rendition_id():
    md = [{'file_path': '/test', 'document_id': 0, 'rendition_id': 0},
          {'file_path': '/test2', 'document_id': 0, 'rendition_id': 1},
          {'file_path': '/test3', 'document_id': 1, 'rendition_id': 0},
          {'file_path': '/test8', 'document_id': 2, 'rendition_id': 0},
          {'file_path': '/test9', 'document_id': 3, 'rendition_id': 0}]
    # can always index with internal_id
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)

    # a query mixing internal_id rows and document_id rows cannot be
    # fully resolved
    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'document_id': 1}])
    with pytest.raises(NotFound):
        dbi.search(query)

# parametrize values inferred from the string comparison in the test body;
# the second value is an arbitrary label meaning "don't return file_path"
@pytest.mark.parametrize('return_file_path',
                         ['return_file_path', 'dont_return_file_path'])
def test_ingestion_render(return_file_path):

    def _process_results(rd):
        rd = pd.DataFrame(rd)
        if return_file_path:
            assert 'file_path' in rd.columns
            del rd['file_path']
        return rd

    # make it a binary variable
    return_file_path = (return_file_path == 'return_file_path')

    md = [{'file_path': '/test', 'document_id': 2},
          {'file_path': '/test2', 'document_id': 1},
          {'file_path': '/test3', 'document_id': 7},
          {'file_path': '/test8', 'document_id': 9},
          {'file_path': '/test9', 'document_id': 4}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)

    query = pd.DataFrame([{'a': 2, 'internal_id': 3},
                          {'a': 4, 'internal_id': 1}])
    res = pd.DataFrame([{'a': 2, 'internal_id': 3, 'document_id': 9},
                        {'a': 4, 'internal_id': 1, 'document_id': 1}])

    rd = dbi.render_dict(query, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd, res)

    rd = dbi.render_dict(return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd.loc[[0]],
                       pd.DataFrame([{'internal_id': 0, 'document_id': 2}]))
    assert len(rd) == len(md)

    rd = dbi.render_list(res, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id', 'a'])
    assert_frame_equal(pd.DataFrame(rd),
                       pd.DataFrame([{'a': 2, 'internal_id': 3,
                                      'document_id': 9},
                                     {'a': 4, 'internal_id': 1,
                                      'document_id': 1}]))

    rd = dbi.render_list()
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id'])

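# As exercised above, render_dict/render_list behave like "look the query up
# against the index, optionally drop file_path, and export the result". A
# rough sketch of the presumed semantics (illustration only, not the actual
# library code): render_dict ~ orient='records', render_list ~ orient='list'.
def _render_sketch(data, query=None, orient='records',
                   return_file_path=False):
    res = data if query is None else query.merge(data, how='left')
    if not return_file_path and 'file_path' in res.columns:
        res = res.drop('file_path', axis=1)
    return res.to_dict(orient=orient)
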
def test_search_document_rendition_id():
    md = [{'file_path': '/test', 'document_id': 0, 'rendition_id': 0},
          {'file_path': '/test2', 'document_id': 0, 'rendition_id': 1},
          {'file_path': '/test3', 'document_id': 1, 'rendition_id': 0},
          {'file_path': '/test8', 'document_id': 2, 'rendition_id': 0},
          {'file_path': '/test9', 'document_id': 3, 'rendition_id': 0}]
    # can always index with internal_id
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'internal_id': 1}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id',
                               'rendition_id']))

    # the document_id alone is not sufficient to uniquely index documents
    # in this case
    query = pd.DataFrame([{'document_id': 0},
                          {'document_id': 1},
                          {'document_id': 2}])
    with pytest.raises(ValueError):
        dbi.search(query)

    query = pd.DataFrame([{'document_id': 0, 'rendition_id': 0},
                          {'document_id': 1, 'rendition_id': 0},
                          {'document_id': 2, 'rendition_id': 0}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [0, 2, 3])
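

# Why document_id 0 raises ValueError above: it matches two renditions, so a
# one-row query would come back as two rows. A minimal sketch of such an
# ambiguity check with pandas (an illustration of the idea, not the actual
# DocumentIndex implementation):
def _strict_lookup(data, query, keys):
    res = query[keys].merge(data, on=keys, how='left')
    if len(res) != len(query):
        raise ValueError('ambiguous query: some rows matched '
                         'multiple documents')
    return res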