Example #1
0
 def save_file(self, path, link_ok=True, sep=None):
     """
     Saves this object's records to ``path``.

     When linking is allowed and the data records a 'trec' source file that
     still exists on disk, ``path`` is created as a symlink to that file.
     Otherwise the records are written out as separated values with columns
     (qid, '0', did, score).

     Args:
         path (str) destination path
         link_ok (bool, optional) allow symlinking to the existing 'trec' file
         sep (str, optional) column separator; falls back to self._sep
     """
     # Check the same path that gets linked. Previously the 'trec' key was
     # tested but the existence check looked up self._data['file'], which
     # raises KeyError when only 'trec' is present and otherwise validates
     # a different path than the symlink target.
     if link_ok and 'trec' in self._data and os.path.exists(
             self._data['trec']):
         os.symlink(self._data['trec'], path)
     else:
         # Iterating self yields (qid, did, score) triples; a constant '0'
         # column is inserted after the query id.
         it = ((qid, '0', did, score) for qid, did, score in iter(self))
         plaintext.write_sv(path, it, sep=(sep or self._sep))
Example #2
0
def write_sample_dict(sample_dict, file):
    """
    Writes a nested sample dictionary to the given file, space-separated

    Each output row is (qid, "0", docid, cat, rel).

    Args:
        sample_dict (dict) samples of format {qid: {docid: (cat, rel)}}
        file file argument handed straight to plaintext.write_sv
    """
    def _rows():
        for qid, per_doc in sample_dict.items():
            # Each value must unpack into exactly two fields.
            for did, (cat, rel) in per_doc.items():
                yield qid, "0", did, cat, rel

    plaintext.write_sv(file, _rows(), sep=' ')
Example #3
0
def write_run_dict(file, run_dict, runid='run'):
    """
    Writes a query-document run dictionary to the given file

    Rows are emitted as (qid, 'Q0', docid, rank, score, runid), separated by
    single spaces. Within each query, documents are ranked from 1 by
    descending score, with ties broken by ascending docid.

    Args:
        file (str|Stream) file path (str) or stream (Stream) to write to
        run_dict (dict<str,dict<str,float>>) run scores of format {qid: {docid: score}}
        runid (str, optional) run name to output (optional)
    """
    def _ranked_rows():
        for qid, scores in run_dict.items():
            ordered = sorted(scores.items(),
                             key=lambda pair: (-pair[1], pair[0]))
            for rank, (docid, score) in enumerate(ordered, start=1):
                yield qid, 'Q0', docid, rank, score, runid

    plaintext.write_sv(file, _ranked_rows(), sep=' ')
Example #4
0
    def _init_collection(self, collection, force=False):
        """
        Builds the indices and per-split files for one wikIR collection.

        For the chosen collection this makes sure the three index/docstore
        objects are built and that the queries, qrels and BM25 result files
        for the train/dev/test splits exist under the dataset directory.
        If everything is already present (and ``force`` is false) it returns
        without downloading anything.

        Output files are written to a temporary sibling (``<path>.tmp``) and
        moved into place only once complete, so an interrupted run cannot
        leave a truncated file that later runs would treat as finished.

        Args:
            collection (str) which collection to initialise: '1k' or '59k'
            force (bool, optional) rebuild/overwrite even if outputs exist

        Raises:
            ValueError: if ``collection`` is not a supported name
        """
        import shutil  # local import: only used to stream the BM25 files

        base_path = util.path_dataset(self)
        if collection == '1k':
            idxs = [self.index1k, self.index1k_stem, self.docstore1k]
        elif collection == '59k':
            idxs = [self.index59k, self.index59k_stem, self.docstore59k]
        else:
            raise ValueError(f'unsupported collection {collection}')

        # (directory name inside the archive, local split name)
        splits = (('training', 'train'), ('validation', 'dev'),
                  ('test', 'test'))

        def targets(member, suffix):
            # Maps each archive member to its destination path, one per split.
            return {
                f'wikIR{collection}/{zdir}/{member}':
                os.path.join(base_path, f'{split}.{collection}.{suffix}')
                for zdir, split in splits
            }

        query_files = targets('queries.csv', 'queries')
        qrels_files = targets('qrels', 'qrels')
        theirbm25_files = targets('BM25.res', 'theirbm25')

        outputs = [
            *query_files.values(),
            *qrels_files.values(),
            *theirbm25_files.values(),
        ]
        if not force and \
           all(i.built() for i in idxs) and \
           all(os.path.exists(p) for p in outputs):
            return

        # NOTE(review): presumably a data-usage-agreement prompt; nothing is
        # downloaded unless it returns a truthy value.
        if not self._confirm_dua():
            return

        # NOTE(review): text members are decoded, and text outputs encoded,
        # with the platform default encoding (unchanged from before).
        # Consider pinning an explicit encoding once the readers of these
        # files are confirmed to agree.
        def convert_queries(raw, out):
            # CSV with a header line -> TSV rows of (qid, text).
            text = io.TextIOWrapper(raw)
            text.readline()  # head
            for qid, query in plaintext.read_sv(text, ','):
                plaintext.write_tsv(out, [[qid, query]])

        def convert_qrels(raw, out):
            # Tab-separated qrels -> space-separated qrels.
            text = io.TextIOWrapper(raw)
            plaintext.write_sv(out, plaintext.read_tsv(text), ' ')

        def copy_bytes(raw, out):
            # Verbatim copy, streamed in chunks rather than read in one go.
            shutil.copyfileobj(raw, out)

        def extract(zipf, member, dest, mode, convert):
            # Writes one archive member to dest via a temporary file.
            if not force and os.path.exists(dest):
                return
            partial = f'{dest}.tmp'
            finished = False
            try:
                with zipf.open(member) as raw, open(partial, mode) as out:
                    convert(raw, out)
                os.replace(partial, dest)
                finished = True
            finally:
                # Do not leave a half-written temporary file behind.
                if not finished and os.path.exists(partial):
                    os.remove(partial)

        with util.download_tmp(_SOURCES[collection]) as archive:
            with zipfile.ZipFile(archive) as zipf:
                doc_iter = self._init_iter_collection(zipf, collection)
                self._init_indices_parallel(idxs, doc_iter, force)

                for member, dest in query_files.items():
                    extract(zipf, member, dest, 'wt', convert_queries)

                for member, dest in qrels_files.items():
                    extract(zipf, member, dest, 'wt', convert_qrels)

                for member, dest in theirbm25_files.items():
                    extract(zipf, member, dest, 'wb', copy_bytes)