def test_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(
        orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] * 2
    assert fe.db_.data.shape[0] == db.data.shape[0] * 2
    assert len(fe.filenames_) == len(filenames) * 2

    dbn = fe.db_.data
    assert_equal(dbn.iloc[:n_samples]['document_id'].values,
                 dbn.iloc[n_samples:]['document_id'].values - 10)
    # check that the internal_id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples * 2

    fe.delete()

def test_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    X = fe._load_features(uuid)
    db = fe.db_.data
    filenames = fe.filenames_
    n_samples = len(fe.filenames_)

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')

    fe.remove([dataset_definition[2], dataset_definition[4]])
    X_new = fe._load_features(uuid)
    assert X_new.shape[0] == X.shape[0] - 2
    assert fe.db_.data.shape[0] == db.shape[0] - 2
    assert len(fe.filenames_) == len(filenames) - 2

    dbn = fe.db_.data
    assert_equal(db.iloc[[0, 1, 3, 5]]['document_id'].values,
                 dbn['document_id'].values)
    # check that the internal_id is contiguous
    assert (np.diff(dbn.internal_id.values) == 1).all()

    # check that the number of samples is consistent
    del fe._pars
    assert fe.n_samples_ == n_samples - 2

    fe.delete()

def test_ingestion_pickling():
    from sklearn.externals import joblib

    db = DocumentIndex.from_folder(data_dir)
    fname = os.path.join(cache_dir, 'document_index')
    # check that db is picklable
    joblib.dump(db, fname)
    db2 = joblib.load(fname)
    os.remove(fname)

def test_search_document_id():
    md = [{'file_path': '/test', 'document_id': 2},
          {'file_path': '/test2', 'document_id': 1},
          {'file_path': '/test3', 'document_id': 7},
          {'file_path': '/test8', 'document_id': 9},
          {'file_path': '/test9', 'document_id': 4}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'internal_id': 1}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure we use the internal_id first
    query = pd.DataFrame([{'internal_id': 1, 'document_id': 2},
                          {'internal_id': 2, 'document_id': 2},
                          {'internal_id': 1, 'document_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])

    query = pd.DataFrame([{'document_id': 4},
                          {'document_id': 9},
                          {'document_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [4, 3, 0])

def test_search_not_found():
    dbi = DocumentIndex.from_folder(data_dir)
    query = pd.DataFrame([{'file_path': "DOES_NOT_EXISTS"},
                          {'file_path': "0.7.6.28637.txt"}])
    with pytest.raises(NotFound):
        dbi.search(query)

def remove(self, dataset_definition):
    """Remove some documents from the dataset

    This is by no means an efficient operation; processing
    all the files at once is more suitable in most occasions.
    """
    dsid_dir = self.dsid_dir
    db_old = self.db_.data
    query = pd.DataFrame(dataset_definition)
    res = self.db_.search(query, drop=False)
    del_internal_id = res.internal_id.values
    internal_id_mask = ~np.in1d(db_old.internal_id.values, del_internal_id)

    # write down the new features file
    X_old = self._load_features()
    X = X_old[internal_id_mask, :]
    joblib.dump(X, str(dsid_dir / 'features'))

    # write down the new filenames file
    filenames = list(np.array(self.filenames_)[internal_id_mask])
    with (dsid_dir / 'filenames').open('wb') as fh:
        pickle.dump(filenames, fh)
    self._filenames = filenames

    # write down the new database file
    db = db_old.iloc[internal_id_mask].copy()
    # create a new contiguous internal_id
    db['internal_id'] = np.arange(db.shape[0], dtype='int')
    self._db = DocumentIndex(self.pars_['data_dir'], db)
    if 'file_path' in db.columns:
        del db['file_path']
    db.to_pickle(str(dsid_dir / 'db'))

    # write down the new pars file
    self._pars = self.pars_
    self._pars['n_samples'] = len(filenames)
    with (dsid_dir / 'pars').open('wb') as fh:
        pickle.dump(self._pars, fh)

    # find all existing LSI models and update them as well
    if (dsid_dir / 'lsi').exists():
        for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
            _fname = dsid_dir / 'lsi' / lsi_id / 'data'
            if _fname.exists():
                X_lsi_old = joblib.load(str(_fname))
                X_lsi = X_lsi_old[internal_id_mask]
                joblib.dump(X_lsi, str(_fname))

    # remove all trained models for this dataset
    for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
        if (dsid_dir / model_type).exists():
            for mid in os.listdir(str(dsid_dir / model_type)):
                shutil.rmtree(str(dsid_dir / model_type / mid))

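# Usage sketch for `remove` (hypothetical cache directory and ids, assuming
# the dataset was previously ingested):
#
#   fe = FeatureVectorizer(cache_dir='/tmp/fd_cache', dsid=uuid)
#   fe.remove([{'document_id': 2}, {'document_id': 4}])
#
# Features, filenames, the database and any LSI projections are all filtered
# with the same `internal_id_mask`, while categorizer / clustering /
# duplicate-detection / threading models are deleted outright, since they
# would no longer be consistent with the reduced dataset.
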
def test_ingestion_base_dir():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data
    assert data_dir_res == os.path.normpath(data_dir)
    assert_array_equal(db.columns.values,
                       ['file_path', 'internal_id', 'document_id'])
    assert_array_equal(db.file_path.values, fnames_in)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])

@property
def db_(self):
    """ DocumentIndex """
    if not hasattr(self, '_db') or self._db is None:
        dsid = self.dsid
        if self.cache_dir is None:
            raise InitException('cache_dir is None: cannot load from cache!')
        dsid_dir = self.cache_dir / dsid
        if not dsid_dir.exists():
            raise DatasetNotFound('dsid {} not found!'.format(dsid))
        data = pd.read_pickle(str(dsid_dir / 'db'))
        self._db = DocumentIndex(self.pars_['data_dir'], data)
    return self._db

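# `db_` lazily restores the pickled document index from
# `<cache_dir>/<dsid>/db` on first access and memoizes it in `self._db`,
# e.g. (hypothetical dataset id):
#
#   fe = FeatureVectorizer(cache_dir=cache_dir, dsid=uuid)  # no disk access yet
#   df = fe.db_.data  # first access triggers pd.read_pickle
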
def test_search_2fields():
    dbi = DocumentIndex.from_folder(data_dir)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 3},
                          {'internal_id': 1},
                          {'internal_id': 2}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [3, 1, 2])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    # make sure that if we have some additional field,
    # we still use the internal_id
    query = pd.DataFrame([{'internal_id': 1, 'a': 2},
                          {'internal_id': 2, 'b': 4},
                          {'internal_id': 1, 'a': 3}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(sorted(sres.columns),
                       sorted(['internal_id', 'file_path', 'document_id']))

    sres = dbi.search(query, drop=False)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'a', 'b']))

    query = pd.DataFrame([{'file_path': "0.7.6.28637.txt"},
                          {'file_path': "0.7.47.117435.txt"}])
    sres = dbi.search(query)
    query_res = [dbi.data.file_path.values.tolist().index(el)
                 for el in query.file_path.values]
    assert_array_equal(query_res, sres.internal_id)

def test_lsi_remove_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()

    docs = DocumentIndex.from_folder(data_dir).data
    dataset_definition = docs[['document_id']].to_dict(orient='records')

    fe.remove([dataset_definition[2], dataset_definition[4]])
    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] - 2

def test_lsi_append_documents():
    cache_dir = check_cache()

    fe = FeatureVectorizer(cache_dir=cache_dir, mode='w')
    uuid = fe.setup()
    fe.ingest(data_dir)

    lsi = _LSIWrapper(cache_dir=cache_dir, parent_id=uuid, mode='w')
    lsi_res, exp_var = lsi.fit_transform(n_components=2, alpha=1.0)
    X_lsi = lsi._load_features()
    n_samples = fe.n_samples_

    docs = DocumentIndex.from_folder(data_dir).data
    docs['document_id'] += 10
    dataset_definition = docs[['file_path', 'document_id']].to_dict(
        orient='records')
    for row in dataset_definition:
        row['file_path'] = os.path.join(data_dir, row['file_path'])
    fe.append(dataset_definition)

    X_lsi_new = lsi._load_features()
    assert X_lsi_new.shape[0] == X_lsi.shape[0] * 2
    # the appended documents are the same files, so their LSI representation
    # must match that of the original batch
    assert_equal(X_lsi_new[:n_samples], X_lsi_new[n_samples:])

# (parametrization reconstructed from the branches in the test body)
@pytest.mark.parametrize('n_fields', [1, 2, 3])
def test_ingestion_metadata(n_fields):
    metadata = []
    for idx, fname in enumerate(fnames_in_abs):
        el = {'file_path': fname}
        if n_fields >= 2:
            el['document_id'] = 'a' + str(idx + 100)
        if n_fields >= 3:
            el['rendition_id'] = 1
        metadata.append(el)

    dbi = DocumentIndex.from_list(metadata)
    dbi._make_relative_paths()
    data_dir_res, filenames, db = dbi.data_dir, dbi.filenames_, dbi.data

    if n_fields in [1, 2]:
        columns_ref = sorted(['file_path', 'document_id', 'internal_id'])
    elif n_fields == 3:
        columns_ref = sorted(
            ['file_path', 'document_id', 'rendition_id', 'internal_id'])

    assert_array_equal(sorted(db.columns.values), columns_ref)
    assert_array_equal(
        [os.path.normpath(os.path.join(data_dir_res, el)) for el in filenames],
        [os.path.join(data_dir_res, el) for el in db.file_path.values])

def test_bad_search_document_rendition_id():
    md = [{'file_path': '/test', 'document_id': 0, 'rendition_id': 0},
          {'file_path': '/test2', 'document_id': 0, 'rendition_id': 1},
          {'file_path': '/test3', 'document_id': 1, 'rendition_id': 0},
          {'file_path': '/test8', 'document_id': 2, 'rendition_id': 0},
          {'file_path': '/test9', 'document_id': 3, 'rendition_id': 0}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    # can always index with internal_id
    dbi = DocumentIndex.from_list(md)
    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'document_id': 1}])
    with pytest.raises(NotFound):
        dbi.search(query)

def append(self, dataset_definition, data_dir=None):
    """Add some documents to the dataset

    This is by no means an efficient operation; processing
    all the files at once is more suitable in most occasions.
    """
    from freediscovery.engine.lsi import _LSIWrapper
    dsid_dir = self.dsid_dir
    db_old = self.db_.data
    internal_id_offset = db_old.internal_id.max()
    db_extra = DocumentIndex.from_list(dataset_definition, data_dir,
                                       internal_id_offset + 1, dsid_dir)
    db_new = db_extra.data

    vect = self.vect_
    tfidf = self.tfidf_
    filenames_new = list(db_new.file_path.values)

    # write down the new features file
    X_new_raw = vect.transform(filenames_new)
    X_new = tfidf.transform(X_new_raw)
    X_old = self._load_features()
    X = scipy.sparse.vstack((X_old, X_new))
    joblib.dump(X, str(dsid_dir / 'features'))

    # write down the new filenames file
    filenames_old = list(self.filenames_)
    filenames = filenames_old + filenames_new
    data_dir = DocumentIndex._detect_data_dir(filenames)
    self._pars['data_dir'] = data_dir
    self._filenames = [os.path.relpath(el, data_dir) for el in filenames]
    with (dsid_dir / 'filenames').open('wb') as fh:
        pickle.dump(self._filenames, fh)
    del db_new['file_path']

    # write down the new pars file
    self._pars = self.pars_
    self._pars['n_samples'] = len(filenames)
    with (dsid_dir / 'pars').open('wb') as fh:
        pickle.dump(self._pars, fh)

    # write down the new database file
    db = pd.concat((db_old, db_new))
    if 'file_path' in db.columns:
        del db['file_path']
    db.to_pickle(str(dsid_dir / 'db'))
    self._db = DocumentIndex(self.pars_['data_dir'], db)

    # find all existing LSI models and update them as well
    if (dsid_dir / 'lsi').exists():
        for lsi_id in os.listdir(str(dsid_dir / 'lsi')):
            lsi_obj = _LSIWrapper(cache_dir=self.cache_dir, mid=lsi_id)
            lsi_obj.append(X_new)

    # remove all trained models for this dataset
    for model_type in ['categorizer', 'dupdet', 'cluster', 'threading']:
        if (dsid_dir / model_type).exists():
            for mid in os.listdir(str(dsid_dir / model_type)):
                shutil.rmtree(str(dsid_dir / model_type / mid))

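# Usage sketch for `append` (hypothetical path and ids): the new documents
# are vectorized with the already-fitted `vect_` / `tfidf_` models, so the
# vocabulary is not recomputed:
#
#   fe = FeatureVectorizer(cache_dir=cache_dir, dsid=uuid)
#   fe.append([{'file_path': '/data/new_doc.txt', 'document_id': 101}])
#
# LSI models are extended in place via `_LSIWrapper.append`, while the other
# trained models (categorizer, dupdet, cluster, threading) are removed.
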
def ingest(self, data_dir=None, file_pattern='.*', dir_pattern='.*',
           dataset_definition=None, vectorize=True,
           document_id_generator='indexed_file_path'):
    """Perform data ingestion

    Parameters
    ----------
    data_dir : str
        path to the data directory (used only if metadata not provided),
        default: None
    file_pattern : str
        regular expression used to filter the ingested file names
    dir_pattern : str
        regular expression used to filter the subdirectories
    dataset_definition : list of dicts
        a list of dictionaries with keys
        ['file_path', 'document_id', 'rendition_id']
        describing the data ingestion (this overwrites data_dir)
    vectorize : bool (default: True)
    document_id_generator : str
        method used to generate the document_id
    """
    dsid_dir = self.cache_dir / self.dsid
    if (dsid_dir / 'db').exists():
        raise ValueError('Dataset {} already vectorized!'
                         .format(self.dsid))
    db_list = list(sorted(dsid_dir.glob('db*')))
    if len(db_list) == 0:
        internal_id_offset = -1
    elif len(db_list) >= 1:
        internal_id_offset = int(db_list[-1].name[3:])

    pars = self.pars_
    if pars.get('column_ids', None) is not None:
        if dataset_definition is None:
            raise ValueError("CSV files can only be provided using "
                             "the `dataset_definition` parameter")
        if len(dataset_definition) > 1:
            raise ValueError("Only one CSV can be provided at a time")
        file_path = dataset_definition[0]['file_path']
        X = pd.read_csv(file_path, sep=pars['column_separator'],
                        header=None)
        dataset_definition = [
            {'file_path': f"{file_path}:{idx}", 'document_id': idx}
            for idx in range(len(X))]
        db = DocumentIndex.from_list(
            dataset_definition, data_dir, internal_id_offset + 1,
            dsid_dir, document_id_generator=document_id_generator)
    elif dataset_definition is not None:
        db = DocumentIndex.from_list(
            dataset_definition, data_dir, internal_id_offset + 1,
            dsid_dir, document_id_generator=document_id_generator)
    elif data_dir is not None:
        db = DocumentIndex.from_folder(
            data_dir, file_pattern, dir_pattern, internal_id_offset + 1,
            document_id_generator=document_id_generator)
    else:
        db = None

    if db is not None:
        data_dir = db.data_dir
        batch_suffix = '.{:09}'.format(db.data.internal_id.iloc[-1])
        self._filenames = db.data.file_path.values.tolist()
        del db.data['file_path']
        db.data.to_pickle(str(dsid_dir / ('db' + batch_suffix)))
        with (dsid_dir / ('filenames' + batch_suffix)).open('wb') as fh:
            pickle.dump(self._filenames, fh)
        self._db = db

    if vectorize:
        db_list = list(sorted(dsid_dir.glob('db*')))
        filenames_list = list(sorted(dsid_dir.glob('filenames*')))
        if len(db_list) == 0:
            raise ValueError('No ingested files found!')
        if len(db_list) == 1:
            with filenames_list[0].open('rb') as fh:
                filenames_concat = pickle.load(fh)
        elif len(db_list) >= 2:
            # accumulate the different batches into a single
            # filenames file
            filenames_concat = []
            for fname in filenames_list:
                with fname.open('rb') as fh:
                    filenames_concat += pickle.load(fh)

        if self.pars_['data_dir'] is None:
            data_dir = DocumentIndex._detect_data_dir(filenames_concat)
            self._pars['data_dir'] = data_dir
        else:
            data_dir = self._pars['data_dir']

        self._filenames = [os.path.relpath(el, data_dir)
                           for el in filenames_concat]
        with (dsid_dir / 'filenames').open('wb') as fh:
            pickle.dump(self._filenames, fh)
        for fname in filenames_list:
            fname.unlink()

        # save the database
        if len(db_list) == 1:
            db_list[0].rename(dsid_dir / 'db')
            self.db_.filenames_ = self._filenames
            self.db_.data['file_path'] = self._filenames
        elif len(db_list) >= 2:
            db_concat = []
            for fname in db_list:
                db_concat.append(pd.read_pickle(str(fname)))
            db_new = pd.concat(db_concat, axis=0)
            db_new.filenames_ = self._filenames
            db_new.set_index('internal_id', drop=False, inplace=True)
            self._db = DocumentIndex(data_dir, db_new)
            if 'file_path' in db_new.columns:
                del db_new['file_path']
            db_new.to_pickle(str(dsid_dir / 'db'))

        # save the parameters
        self._pars['n_samples'] = len(self._filenames)
        self._pars['data_dir'] = data_dir
        with (dsid_dir / 'pars').open('wb') as fh:
            pickle.dump(self._pars, fh)
        self.transform()

        if (dsid_dir / 'raw').exists():
            shutil.rmtree(str(dsid_dir / 'raw'))

    if db is None and not vectorize:
        raise ValueError('At least one of data_dir, dataset_definition, '
                         'vectorize parameters must be provided!')
    return

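# Usage sketch for `ingest` (hypothetical paths). Batches can be staged with
# `vectorize=False` and vectorized in a final call, since each batch is
# stored as a suffixed `db.*` / `filenames.*` pair until then:
#
#   fe = FeatureVectorizer(cache_dir='/tmp/fd_cache', mode='w')
#   uuid = fe.setup()
#   fe.ingest(data_dir='/data/batch1', vectorize=False)
#   fe.ingest(data_dir='/data/batch2', vectorize=False)
#   fe.ingest(vectorize=True)  # concatenate the batches, compute features
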
def test_search_document_rendition_id():
    md = [{'file_path': '/test', 'document_id': 0, 'rendition_id': 0},
          {'file_path': '/test2', 'document_id': 0, 'rendition_id': 1},
          {'file_path': '/test3', 'document_id': 1, 'rendition_id': 0},
          {'file_path': '/test8', 'document_id': 2, 'rendition_id': 0},
          {'file_path': '/test9', 'document_id': 3, 'rendition_id': 0}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    # can always index with internal_id
    dbi = DocumentIndex.from_list(md)
    dbi._make_relative_paths()

    query = pd.DataFrame([{'internal_id': 1},
                          {'internal_id': 2},
                          {'internal_id': 1}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [1, 2, 1])
    assert_array_equal(
        sorted(sres.columns),
        sorted(['internal_id', 'file_path', 'document_id', 'rendition_id']))

    # the internal_id is not sufficient to fully index documents in this case
    query = pd.DataFrame([{'document_id': 0},
                          {'document_id': 1},
                          {'document_id': 2}])
    with pytest.raises(ValueError):
        sres = dbi.search(query)

    query = pd.DataFrame([{'document_id': 0, 'rendition_id': 0},
                          {'document_id': 1, 'rendition_id': 0},
                          {'document_id': 2, 'rendition_id': 0}])
    sres = dbi.search(query)
    assert_equal(sres.internal_id.values, [0, 2, 3])

def load_dataset(name='20_newsgroups_3categories', cache_dir='/tmp',
                 verbose=False, verify_checksum=False,
                 document_id_generation='squared', categories=None):
    """Download a benchmark dataset.

    The currently supported datasets are listed below,

    1. TREC 2009 legal collection

       - `treclegal09_2k_subset`  :  2 400 documents,   2 MB
       - `treclegal09_20k_subset` : 20 000 documents,  30 MB
       - `treclegal09_37k_subset` : 37 000 documents,  55 MB
       - `treclegal09`            : 700 000 documents, 1.2 GB

       The ground truth files for categorization are adapted from
       the TAR Toolkit.

    2. Fedora mailing list (2009-2009)

       - `fedora_ml_3k_subset`

    3. The 20 newsgroups dataset

       - `20_newsgroups_3categories`: only the ['comp.graphics',
         'rec.sport.baseball', 'sci.space'] categories

    If you encounter any issues for downloads with this function,
    you can also manually download and extract the required dataset to
    ``cache_dir`` (the download url is ``http://r0h.eu/d/<name>.tar.gz``),
    then re-run this function to get the required metadata.

    Parameters
    ----------
    name : str, default='20_newsgroups_3categories'
        the name of the dataset file to load
    cache_dir : str, default='/tmp/'
        root directory where to save the download
    verbose : bool, default=False
        print download progress
    verify_checksum : bool, default=False
        verify the checksum of the downloaded archive
    document_id_generation : str, default='squared'
        specifies how the document_id is computed from the internal_id;
        must be one of ``['identity', 'squared']``
        (``'identity'`` means ``document_id = internal_id``)
    categories : str
        select a subsection of the dataset, ``default='all'``

    Returns
    -------
    metadata : dict
        a dictionary containing metadata corresponding to the dataset
    training_set : {dict, None}
        a list of dictionaries for the training set
    test_set : dict
        a list of dictionaries for the test set
    """
    from freediscovery.engine.ingestion import DocumentIndex
    from freediscovery.io import parse_ground_truth_file

    if name not in IR_DATASETS:
        raise ValueError('Dataset name {} not known!'.format(name))

    valid_fields = ['document_id', 'internal_id', 'file_path', 'category']

    has_categories = '20_newsgroups_' in name or 'treclegal09' in name

    # make sure we don't have "ediscovery_cache" in the path
    cache_dir = _normalize_cachedir(cache_dir)
    cache_dir = os.path.dirname(cache_dir)
    outdir = os.path.join(cache_dir, name)
    fname = outdir

    db = IR_DATASETS[name]

    if '20_newsgroups_' in name:
        if db['url'].endswith('.pkl.xz'):
            import lzma
            fname = name + '.pkl.xz'
            opener = lzma.open
        else:
            fname = name + '.pkl'
            opener = open
        with opener(os.path.join(INTERNAL_DATA_DIR, fname), 'rb') as fh:
            twenty_news = pickle.load(fh)

    # Download the dataset if it doesn't exist
    if not os.path.exists(outdir):
        if '20_newsgroups_' in name:
            os.mkdir(outdir)
            for idx, doc in enumerate(twenty_news.data):
                with open(os.path.join(outdir,
                                       '{:05}.txt'.format(idx)), 'wt') as fh:
                    fh.write(doc)
        else:
            outdir = _get_file(str(fname), db['url'], extract=True,
                               cache_dir=str(cache_dir))
            print('Downloaded {} dataset to {}'.format(name, outdir))

    if 'treclegal09' in name or 'fedora_ml' in name:
        data_dir = os.path.join(outdir, 'data')
    else:
        data_dir = outdir
    md = {'data_dir': str(data_dir), 'name': name}

    di = DocumentIndex.from_folder(str(data_dir))
    di._make_relative_paths()
    training_set = None

    if 'treclegal09' in name:
        negative_files, positive_files = _load_erdm_ground_truth(outdir)
        ground_truth_file = os.path.join(outdir, "ground_truth_file.txt")
        gt = parse_ground_truth_file(str(ground_truth_file))
        res = di.search(gt, drop=False)
        di.data['category'] = res.is_relevant
        di.data['category'] = di.data['category'].apply(
            lambda x: 'positive' if x == 1 else 'negative')
        di.data['is_train'] = False
        res = di.search(
            pd.DataFrame({'file_path': positive_files + negative_files}))
        di.data.loc[res.internal_id.values, 'is_train'] = True
    elif '20_newsgroups_' in name:
        di.data['category'] = np.array(
            twenty_news.target_names)[twenty_news.target]
        di.data['is_train'] = ['-train' in el
                               for el in twenty_news.filenames]

    if categories is not None and has_categories:
        mask = di.data.category.isin(categories)
        di.data = di.data[mask]
        di.data['internal_id'] = np.arange(len(di.data['internal_id']))
        di.data.set_index('internal_id', drop=False, inplace=True)

    di.data['document_id'] = _compute_document_id(di.data['internal_id'],
                                                  document_id_generation)
    di.data = di.data.astype('object')

    if has_categories:
        mask = di.data['is_train']
        training_set = di.render_dict(di.data[mask], return_file_path=True)
        training_set = filter_dict(training_set, valid_fields)
        if name == '20_newsgroups_3categories':
            # make a smaller training set
            random.seed(999998)
            training_set = random.sample(
                training_set,
                min(len(training_set), di.data.shape[0] // 5))

    dataset = di.render_dict(return_file_path=True)
    dataset = filter_dict(dataset, valid_fields)

    return md, training_set, dataset

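# Usage sketch:
#
#   md, training_set, dataset = load_dataset('20_newsgroups_3categories',
#                                            cache_dir='/tmp')
#   print(md['data_dir'], len(dataset))
#
# `training_set` is None for datasets without ground-truth categories
# (e.g. `fedora_ml_3k_subset`).
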
# (parametrization values assumed; only the 'return_file_path' label
# enables the file_path output in the test body)
@pytest.mark.parametrize('return_file_path',
                         ['return_file_path', 'dont_return_file_path'])
def test_ingestion_render(return_file_path):
    def _process_results(rd):
        rd = pd.DataFrame(rd)
        if return_file_path:
            assert 'file_path' in rd.columns
            del rd['file_path']
        return rd

    # make it a binary variable
    return_file_path = (return_file_path == 'return_file_path')

    md = [{'file_path': '/test', 'document_id': 2},
          {'file_path': '/test2', 'document_id': 1},
          {'file_path': '/test3', 'document_id': 7},
          {'file_path': '/test8', 'document_id': 9},
          {'file_path': '/test9', 'document_id': 4}]
    for idx, el in enumerate(md):
        el['internal_id'] = idx

    dbi = DocumentIndex.from_list(md)

    query = pd.DataFrame([{'a': 2, 'internal_id': 3},
                          {'a': 4, 'internal_id': 1}])
    res = pd.DataFrame([{'a': 2, 'internal_id': 3, 'document_id': 9},
                        {'a': 4, 'internal_id': 1, 'document_id': 1}])

    rd = dbi.render_dict(query, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd, res)

    rd = dbi.render_dict(return_file_path=return_file_path)
    rd = _process_results(rd)
    assert_frame_equal(rd.loc[[0]],
                       pd.DataFrame([{'internal_id': 0, 'document_id': 2}]))
    assert len(rd) == len(md)

    rd = dbi.render_list(res, return_file_path=return_file_path)
    rd = _process_results(rd)
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id', 'a'])
    assert_frame_equal(pd.DataFrame(rd),
                       pd.DataFrame([{'a': 2, 'internal_id': 3,
                                      'document_id': 9},
                                     {'a': 4, 'internal_id': 1,
                                      'document_id': 1}]),
                       check_like=True)

    rd = dbi.render_list()
    assert sorted(rd.keys()) == sorted(['internal_id', 'document_id'])