def save_file(self, path, link_ok=True, sep=None):
    """Persist this run to *path*.

    Fast path: when ``link_ok`` is true and a pre-existing TREC-format file
    is recorded in ``self._data['trec']``, just symlink it into place.
    Otherwise materialize the run by writing ``(qid, '0', did, score)``
    rows via ``plaintext.write_sv``.

    Args:
        path (str): destination file path.
        link_ok (bool): allow symlinking an existing TREC file instead of
            rewriting the data.
        sep (str, optional): column separator; defaults to ``self._sep``.
    """
    # BUG FIX: the original tested os.path.exists(self._data['file']) while
    # symlinking self._data['trec'] — that could create a dangling symlink
    # (or skip a perfectly valid fast path). Check the actual link source.
    if link_ok and 'trec' in self._data and os.path.exists(self._data['trec']):
        os.symlink(self._data['trec'], path)
    else:
        it = ((qid, '0', did, score) for qid, did, score in iter(self))
        plaintext.write_sv(path, it, sep=(sep or self._sep))
def write_sample_dict(sample_dict, file):
    """Write a nested sample dict to *file* as space-separated values.

    ``sample_dict`` maps ``qid -> did -> (cat, rel)``; each entry becomes
    one ``qid "0" did cat rel`` record.
    """
    records = (
        (qid, "0", did, cat, rel)
        for qid, docs in sample_dict.items()
        for did, (cat, rel) in docs.items()
    )
    plaintext.write_sv(file, records, sep=' ')
def write_run_dict(file, run_dict, runid='run'):
    """Writes a query-document run dictionary to the given file

    Args:
        file (str|Stream) file path (str) or stream (Stream) to read to
        run_dict (dict<str<dict<str,float>>) run scores of format {qid: {docid: score}}
        runid (str, optional) run name to output (optional)
    """
    def run_iter():
        for qid in run_dict:
            # rank by descending score, ties broken by ascending docid
            ranked = sorted(run_dict[qid].items(), key=lambda kv: (-kv[1], kv[0]))
            for rank, (docid, score) in enumerate(ranked, start=1):
                yield qid, 'Q0', docid, rank, score, runid
    plaintext.write_sv(file, run_iter(), sep=' ')
def _init_collection(self, collection, force=False):
    """Download and initialize one wikIR collection ('1k' or '59k').

    Builds the document indices/docstore and extracts the train/dev/test
    query, qrels, and baseline-BM25 files from the upstream zip into the
    dataset directory. A no-op when everything is already present, unless
    ``force`` is set.

    Args:
        collection (str): which collection, '1k' or '59k'.
        force (bool): rebuild indices and rewrite files even if present.

    Raises:
        ValueError: for an unsupported ``collection`` value.
    """
    base_path = util.path_dataset(self)
    # Indices + docstore to build for the requested collection size.
    if collection == '1k':
        idxs = [self.index1k, self.index1k_stem, self.docstore1k]
    elif collection == '59k':
        idxs = [self.index59k, self.index59k_stem, self.docstore59k]
    else:
        raise ValueError(f'unsupported collection {collection}')
    # Mapping of zip-member path -> local output path for each artifact
    # kind; upstream zip uses training/validation/test, locally renamed
    # to train/dev/test.
    query_files = {
        f'wikIR{collection}/training/queries.csv': os.path.join(base_path, f'train.{collection}.queries'),
        f'wikIR{collection}/validation/queries.csv': os.path.join(base_path, f'dev.{collection}.queries'),
        f'wikIR{collection}/test/queries.csv': os.path.join(base_path, f'test.{collection}.queries')
    }
    qrels_files = {
        f'wikIR{collection}/training/qrels': os.path.join(base_path, f'train.{collection}.qrels'),
        f'wikIR{collection}/validation/qrels': os.path.join(base_path, f'dev.{collection}.qrels'),
        f'wikIR{collection}/test/qrels': os.path.join(base_path, f'test.{collection}.qrels')
    }
    theirbm25_files = {
        f'wikIR{collection}/training/BM25.res': os.path.join(base_path, f'train.{collection}.theirbm25'),
        f'wikIR{collection}/validation/BM25.res': os.path.join(base_path, f'dev.{collection}.theirbm25'),
        f'wikIR{collection}/test/BM25.res': os.path.join(base_path, f'test.{collection}.theirbm25')
    }
    # Early exit: everything already built/extracted and no force rebuild.
    if not force and \
            all(i.built() for i in idxs) and \
            all(os.path.exists(f) for f in query_files.values()) and \
            all(os.path.exists(f) for f in qrels_files.values()) and \
            all(os.path.exists(f) for f in theirbm25_files.values()):
        return
    # NOTE(review): presumably prompts the user to accept the data usage
    # agreement before downloading — confirm against _confirm_dua.
    if not self._confirm_dua():
        return
    # _SOURCES[collection] is the download location for this collection's
    # zip archive (downloaded to a temp file for the duration of the block).
    with util.download_tmp(_SOURCES[collection]) as f:
        with zipfile.ZipFile(f) as zipf:
            # Build all indices in one parallel pass over the documents.
            doc_iter = self._init_iter_collection(zipf, collection)
            self._init_indices_parallel(idxs, doc_iter, force)
            # Queries: CSV in the zip, rewritten as qid<TAB>text locally.
            for zqueryf, queryf in query_files.items():
                if force or not os.path.exists(queryf):
                    with zipf.open(zqueryf) as f, open(queryf, 'wt') as out:
                        f = io.TextIOWrapper(f)  # zip members open in binary; wrap for text
                        f.readline()  # head
                        for qid, text in plaintext.read_sv(f, ','):
                            plaintext.write_tsv(out, [[qid, text]])
            # Qrels: TSV in the zip, rewritten space-separated locally.
            for zqrelf, qrelf in qrels_files.items():
                if force or not os.path.exists(qrelf):
                    with zipf.open(zqrelf) as f, open(qrelf, 'wt') as out:
                        f = io.TextIOWrapper(f)
                        plaintext.write_sv(out, plaintext.read_tsv(f), ' ')
            # Upstream BM25 baseline runs: copied verbatim as bytes.
            for zbm25, bm25 in theirbm25_files.items():
                if force or not os.path.exists(bm25):
                    with zipf.open(zbm25) as f, open(bm25, 'wb') as out:
                        out.write(f.read())