Beispiel #1
0
    def init(self, force=False):
        """Build whatever is missing for this dataset subset from its data files.

        The raw records come from every ``datafile*.tsv`` in the subset
        directory. ``self.index``, ``self.index_stem`` and ``self.doc_store``
        are built from the ``doc`` records, and ``queries.tsv`` is written
        from the ``query`` records. Each of these is skipped when it is
        already built / already exists, unless ``force`` is set. Afterwards
        the method loops until ``qrels.txt`` exists.

        Whenever required input files are missing, the user is asked via
        ``util.confirm`` to provide them; a negative answer terminates the
        process with exit status 1.

        Args:
            force: If true, rebuild the indices, the document store and the
                query file even when they are already present.
        """
        base_dir = os.path.join(util.path_dataset(self), self.subset)
        query_file = os.path.join(base_dir, 'queries.tsv')
        qrels_file = os.path.join(base_dir, 'qrels.txt')

        if self.subset == 'dummy':
            # The dummy subset is wired up automatically by linking the
            # bundled files. os.symlink fails when the parent directory is
            # missing, so make sure it exists first.
            os.makedirs(base_dir, exist_ok=True)
            datafile = os.path.join(base_dir, 'datafile.tsv')
            if not os.path.exists(datafile):
                os.symlink(os.path.abspath('etc/dummy_datafile.tsv'), datafile)
            if not os.path.exists(qrels_file):
                os.symlink(os.path.abspath('etc/dummy_qrels.txt'), qrels_file)

        def doc_consumer(store):
            """Return a callable that feeds the 'doc' records to store.build."""
            def consume(records):
                return store.build(
                    indices.RawDoc(did, txt)
                    for kind, did, txt in records if kind == 'doc')
            return consume

        def write_queries(records):
            """Write the 'query' records to the query file as (qid, text) rows."""
            return plaintext.write_tsv(query_file, (
                (qid, txt) for kind, qid, txt in records if kind == 'query'))

        # Every entry is a consumer that needs its own view of the record
        # stream read from the data files.
        needs_datafile = [
            doc_consumer(store)
            for store in (self.index, self.index_stem, self.doc_store)
            if force or not store.built()
        ]
        if force or not os.path.exists(query_file):
            needs_datafile.append(write_queries)

        if needs_datafile:
            df_glob = os.path.join(base_dir, 'datafile*.tsv')
            # glob() yields files in a filesystem-dependent order; sorting
            # makes the order in which records are read reproducible.
            datafiles = sorted(glob(df_glob))
            while not datafiles:
                confirmed = util.confirm(
                    f'No data files found. Please move/link data files to {df_glob}.\n'
                    'Data files should contain both queries and documents in the '
                    'following format (one per line):\n'
                    '[query|doc] [TAB] [qid/did] [TAB] [text]')
                if not confirmed:
                    sys.exit(1)
                datafiles = sorted(glob(df_glob))
            # from_iterable creates each file's reader only once the previous
            # file has been exhausted.
            main_iter = itertools.chain.from_iterable(
                plaintext.read_tsv(df) for df in datafiles)
            main_iter = tqdm(main_iter, desc='reading datafiles')
            # One iterator per consumer; each consumer runs in its own
            # util.CtxtThread context, and all contexts are exited together
            # when the ExitStack closes.
            iters = util.blocking_tee(main_iter, len(needs_datafile))
            with contextlib.ExitStack() as stack:
                for consumer, records in zip(needs_datafile, iters):
                    stack.enter_context(
                        util.CtxtThread(functools.partial(consumer, records)))

        # Keep asking until the qrels file has been provided.
        while not os.path.exists(qrels_file):
            confirmed = util.confirm(
                f'No qrels file found. Please move/link qrels file to {qrels_file}.\n'
                'Qrels file should be in the TREC format:\n'
                '[qid] [SPACE] Q0 [SPACE] [did] [SPACE] [score]')
            if not confirmed:
                sys.exit(1)
Beispiel #2
0
 def _confirm_dua(self):
     """Return whether the agreement text in ``self.DUA`` has been accepted.

     The answer is cached in ``self._has_confirmed_dua``, so the user is
     asked at most once per instance; an earlier refusal is returned as-is
     rather than overwritten. When ``self.DUA`` is ``None`` there is
     nothing to accept and the result is ``True``.

     Returns:
         The cached or freshly obtained confirmation result.
     """
     if self._has_confirmed_dua is None:
         if self.DUA is None:
             # Nothing to agree to: treat as confirmed.
             self._has_confirmed_dua = True
         else:
             self._has_confirmed_dua = util.confirm(
                 self.DUA.format(ds_path=util.path_dataset(self)))
     return self._has_confirmed_dua