コード例 #1
0
ファイル: __init__.py プロジェクト: crystina-z/capreolus
    def _create_index(self):
        """Build the Anserini index for this module's collection.

        Runs Anserini's ``IndexCollection`` in a Java subprocess, writing the
        index to ``self.get_index_path()`` and forwarding the tool's output to
        the module logger via ``Anserini.filter_and_log_anserini_output``.
        Positions, document vectors and transformed documents are stored only
        when the collection is not flagged as a large collection.

        Raises:
            RuntimeError: if the Java process exits with a non-zero status.
        """
        outdir = self.get_index_path()
        collection_path, document_type, generator_type = self["collection"].get_path_and_types()
        anserini_fat_jar = Anserini.get_fat_jar()

        # Build argv as a list instead of splitting a formatted string, so a
        # path that contains whitespace reaches Java as a single argument.
        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name='IndexCollection'",
            "io.anserini.index.IndexCollection",
            "-collection",
            document_type,
            "-generator",
            generator_type,
            "-threads",
            MAX_THREADS,
            "-input",
            collection_path,
            "-index",
            outdir,
        ]
        if not self["collection"].is_large_collection:
            cmd += ["-storePositions", "-storeDocvectors", "-storeTransformedDocs"]
        cmd += ["-stemmer", self.cfg["stemmer"]]
        if self.cfg["indexstops"]:
            cmd.append("-keepStopwords")
        # The previous string formatting stringified every value; keep doing so.
        cmd = [str(arg) for arg in cmd]

        logger.info("building index %s", outdir)
        logger.debug(" ".join(cmd))

        # Make sure the directory that will contain the index exists. (The
        # basename of the path would instead create a stray directory relative
        # to the current working directory.)
        os.makedirs(os.path.dirname(os.path.abspath(outdir)), exist_ok=True)

        # The context manager closes the pipe and reaps the child even if
        # forwarding a log line raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")
コード例 #2
0
ファイル: anserini.py プロジェクト: larryli1999/capreolus
    def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
        """Run Anserini's ``SearchCollection`` over a topics file.

        The search is skipped when a ``done`` marker file already exists under
        ``output_base_path``; the marker is written after a successful run.

        Args:
            topicsfn: path to the topics file passed to ``-topics``.
            anserini_param_str: extra command-line options; split on
                whitespace and appended to the command.
            output_base_path: directory that receives the ``searcher`` output
                file and the ``done`` marker.

        Raises:
            IOError: if ``topicsfn`` does not exist.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
            return

        # create index if it does not exist. the call returns immediately if the index does exist.
        self.index.create_index()

        os.makedirs(output_base_path, exist_ok=True)
        output_path = os.path.join(output_base_path, "searcher")

        index_path = self.index.get_index_path()
        anserini_fat_jar = Anserini.get_fat_jar()
        stemmer = self.index.config["stemmer"]
        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "TsvString",
            "-index",
            index_path,
            "-topics",
            topicsfn,
            "-output",
            output_path,
            "-inmem",
            "-threads",
            str(MAX_THREADS),
            "-stemmer",
            # a missing stemmer setting is passed to Anserini as the literal "none"
            "none" if stemmer is None else stemmer,
        ] + anserini_param_str.split()

        # stopword handling follows the index configuration
        if self.index.config["indexstops"]:
            cmd += ["-keepStopwords"]

        logger.info("Anserini writing runs to %s", output_path)
        logger.debug(cmd)

        # The context manager closes the pipe and reaps the child even if
        # forwarding a log line raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")

        # Marker checked at the top of this method to skip repeated searches.
        with open(donefn, "wt") as donef:
            print("done", file=donef)
コード例 #3
0
    def _create_index(self):
        """Build the Anserini index for ``self.collection``.

        Runs Anserini's ``IndexCollection`` in a Java subprocess, writing the
        index to ``self.get_index_path()`` and forwarding the tool's output to
        the module logger via ``Anserini.filter_and_log_anserini_output``.
        Positions, document vectors and contents are stored only when the
        collection is not flagged as a large collection.

        Raises:
            RuntimeError: if the Java process exits with a non-zero status.
        """
        outdir = self.get_index_path()
        collection_path, document_type, generator_type = self.collection.get_path_and_types()
        anserini_fat_jar = Anserini.get_fat_jar()
        stemmer = self.config["stemmer"]

        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name='IndexCollection'",
            "io.anserini.index.IndexCollection",
            "-collection",
            document_type,
            "-generator",
            generator_type,
            "-threads",
            str(MAX_THREADS),
            "-input",
            collection_path,
            "-index",
            outdir,
            "-stemmer",
            # a missing stemmer setting is passed to Anserini as the literal "none"
            "none" if stemmer is None else stemmer,
        ]

        if self.config["indexstops"]:
            cmd += ["-keepStopwords"]

        if not self.collection.is_large_collection:
            cmd += [
                "-storePositions",
                "-storeDocvectors",
                "-storeContents",
            ]

        logger.info("building index %s", outdir)
        logger.debug(cmd)

        # Make sure the directory that will contain the index exists. (The
        # basename of the path would instead create a stray directory relative
        # to the current working directory.)
        os.makedirs(os.path.dirname(os.path.abspath(outdir)), exist_ok=True)

        # The context manager closes the pipe and reaps the child even if
        # forwarding a log line raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")
コード例 #4
0
ファイル: __init__.py プロジェクト: crystina-z/capreolus
    def _anserini_query_from_file(self, topicsfn, anserini_param_str,
                                  output_base_path):
        """Run Anserini's ``SearchCollection`` over a topics file.

        The search is skipped when a ``done`` marker file already exists under
        ``output_base_path``; the marker is written after a successful run.

        Args:
            topicsfn: path to the topics file passed to ``-topics``.
            anserini_param_str: extra command-line options; split on
                whitespace and appended to the command.
            output_base_path: directory that receives the ``searcher`` output
                file and the ``done`` marker.

        Raises:
            IOError: if ``topicsfn`` does not exist.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(
                f"skipping Anserini SearchCollection call because path already exists: {donefn}"
            )
            return

        # create index if it does not exist. the call returns immediately if the index does exist.
        self["index"].create_index()

        os.makedirs(output_base_path, exist_ok=True)
        output_path = os.path.join(output_base_path, "searcher")

        # add stemmer and stop options to match underlying index
        indexopts = ["-stemmer", self["index"].cfg["stemmer"]]
        if self["index"].cfg["indexstops"]:
            # NOTE(review): flag spelling kept exactly as before; confirm it
            # against the SearchCollection options of the bundled Anserini jar.
            indexopts.append("-keepstopwords")

        index_path = self["index"].get_index_path()
        anserini_fat_jar = Anserini.get_fat_jar()

        # Build argv as a list instead of splitting a formatted string, so a
        # path that contains whitespace reaches Java as a single argument.
        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "Trec",
            "-index",
            index_path,
            *indexopts,
            "-topics",
            topicsfn,
            "-output",
            output_path,
            "-inmem",
            "-threads",
            MAX_THREADS,
        ]
        # The previous string formatting stringified every value; keep doing so.
        cmd = [str(arg) for arg in cmd]
        # Extra options are still split on whitespace, as before.
        cmd += str(anserini_param_str).split()

        logger.info("Anserini writing runs to %s", output_path)
        logger.debug(" ".join(cmd))

        # The context manager closes the pipe and reaps the child even if
        # forwarding a log line raises.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")

        # Marker checked at the top of this method to skip repeated searches.
        with open(donefn, "wt") as donef:
            print("done", file=donef)