예제 #1
0
    def _create_index(self):
        """Build the Anserini index for this module's collection.

        Runs ``io.anserini.index.IndexCollection`` from the Anserini fat jar in a
        Java subprocess and passes every line of its standard output to
        ``Anserini.filter_and_log_anserini_output``. Positions, document vectors
        and transformed documents are only stored when the collection is not
        flagged as a large collection.

        Raises:
            RuntimeError: if the Java process exits with a non-zero status.
        """
        outdir = self.get_index_path()
        collection_path, document_type, generator_type = self["collection"].get_path_and_types()

        # Build an argument vector instead of splitting a formatted string, so that
        # paths containing whitespace reach Java as single arguments.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name='IndexCollection'",
            "io.anserini.index.IndexCollection",
            "-collection",
            str(document_type),
            "-generator",
            str(generator_type),
            "-threads",
            str(MAX_THREADS),
            "-input",
            str(collection_path),
            "-index",
            str(outdir),
        ]
        if not self["collection"].is_large_collection:
            cmd += ["-storePositions", "-storeDocvectors", "-storeTransformedDocs"]
        cmd += ["-stemmer", str(self.cfg["stemmer"])]
        if self.cfg["indexstops"]:
            cmd.append("-keepStopwords")

        logger.info("building index %s", outdir)
        logger.debug(cmd)

        # Make sure the directory that will contain the index exists. Using
        # os.path.basename() here would create a stray directory relative to the
        # current working directory rather than the index's parent.
        parent_dir = os.path.dirname(outdir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # The context manager closes the stdout pipe and waits for the process.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")
예제 #2
0
    def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
        """Run Anserini's SearchCollection on a topics file unless results already exist.

        Args:
            topicsfn: path of the topics file passed to ``-topics`` (read with the
                ``TsvString`` topic reader).
            anserini_param_str: whitespace-separated extra SearchCollection arguments.
            output_base_path: directory that receives the ``searcher`` output and
                the ``done`` marker file.

        Raises:
            IOError: if ``topicsfn`` does not exist.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        # A "done" marker from an earlier successful run means there is nothing to do.
        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
            return

        # Build the index first if needed; per the original note, this call
        # returns immediately when the index already exists.
        self.index.create_index()

        os.makedirs(output_base_path, exist_ok=True)
        run_file = os.path.join(output_base_path, "searcher")

        index_dir = self.index.get_index_path()
        jvm_args = [
            "java",
            "-classpath",
            Anserini.get_fat_jar(),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
        ]

        # Anserini is told "none" when no stemmer is configured for the index.
        stemmer = self.index.config["stemmer"]
        if stemmer is None:
            stemmer = "none"

        search_args = [
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "TsvString",
            "-index",
            index_dir,
            "-topics",
            topicsfn,
            "-output",
            run_file,
            "-inmem",
            "-threads",
            str(MAX_THREADS),
            "-stemmer",
            stemmer,
        ]

        argv = jvm_args + search_args + anserini_param_str.split()
        if self.index.config["indexstops"]:
            argv.append("-keepStopwords")

        logger.info("Anserini writing runs to %s", run_file)
        logger.debug(argv)

        proc = subprocess.Popen(argv, stdout=subprocess.PIPE, universal_newlines=True)

        # Anserini is chatty: let the helper drop DEBUG lines and log the rest.
        for output_line in proc.stdout:
            Anserini.filter_and_log_anserini_output(output_line, logger)

        proc.wait()
        if proc.returncode != 0:
            raise RuntimeError("command failed")

        # Record success so later calls can skip the search.
        with open(donefn, "wt") as marker:
            marker.write("done\n")
예제 #3
0
    def _create_index(self):
        """Build the Anserini index for ``self.collection``.

        Runs ``io.anserini.index.IndexCollection`` from the Anserini fat jar in a
        Java subprocess and passes every line of its standard output to
        ``Anserini.filter_and_log_anserini_output``. Positions, document vectors
        and contents are only stored when the collection is not flagged as a
        large collection.

        Raises:
            RuntimeError: if the Java process exits with a non-zero status.
        """
        outdir = self.get_index_path()
        collection_path, document_type, generator_type = self.collection.get_path_and_types()
        anserini_fat_jar = Anserini.get_fat_jar()

        # Anserini is told "none" when no stemmer is configured.
        stemmer = self.config["stemmer"]
        if stemmer is None:
            stemmer = "none"

        cmd = [
            "java",
            "-classpath",
            anserini_fat_jar,
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name='IndexCollection'",
            "io.anserini.index.IndexCollection",
            "-collection",
            document_type,
            "-generator",
            generator_type,
            "-threads",
            str(MAX_THREADS),
            "-input",
            collection_path,
            "-index",
            outdir,
            "-stemmer",
            stemmer,
        ]

        if self.config["indexstops"]:
            cmd += ["-keepStopwords"]

        if not self.collection.is_large_collection:
            cmd += [
                "-storePositions",
                "-storeDocvectors",
                "-storeContents",
            ]

        logger.info("building index %s", outdir)
        logger.debug(cmd)

        # Make sure the directory that will contain the index exists. Using
        # os.path.basename() here would create a stray directory relative to the
        # current working directory rather than the index's parent.
        parent_dir = os.path.dirname(outdir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # The context manager closes the stdout pipe and waits for the process.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")
예제 #4
0
    def _query_index(self):
        """Run one BM25 search with Anserini's SearchCollection.

        Reads the index path, run directory and topics from this object and its
        collection, and takes ``b``, ``k1``, ``stemmer``, ``indexstops`` and
        ``maxthreads`` from ``self.pipeline_config``. Output is written to
        ``<run_path>/searcher``.

        Raises:
            ValueError: if the collection's topic type has no known topic reader.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        index = self.index.index_path
        outdir = self.run_path
        topics = self.collection.config["topics"]["path"]
        document_type = self.collection.config["topics"]["type"]

        # Fail with a clear message instead of hitting an unbound local later on.
        if document_type == "trec":
            topic_reader = "Trec"
        elif document_type == "ClueWeb12Collection":
            topic_reader = "Webxml"
        else:
            raise ValueError(f"unsupported topic type: {document_type}")

        # Build an argument vector (no shell) so that paths and config values are
        # passed to Java verbatim, without shell interpretation.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            topic_reader,
            "-index",
            str(index),
            # add stemmer and stop options to match underlying index
            "-stemmer",
            str(self.pipeline_config["stemmer"]),
        ]
        if self.pipeline_config["indexstops"]:
            cmd.append("-keepstopwords")
        cmd += [
            "-topics",
            str(topics),
            "-output",
            f"{outdir}/searcher",
            "-inmem",
            "-threads",
            str(self.pipeline_config["maxthreads"]),
            "-bm25",
            "-b",
            str(self.pipeline_config["b"]),
            "-k1",
            str(self.pipeline_config["k1"]),
        ]

        logger.info("writing runs to %s", outdir)
        logger.debug(cmd)
        os.makedirs(outdir, exist_ok=True)
        retcode = subprocess.call(cmd)
        if retcode != 0:
            raise RuntimeError("command failed")
예제 #5
0
파일: rm3.py 프로젝트: ghazalehnt/capreolus
    def _query_index(self):
        """Grid-search BM25 and RM3 parameters in one SearchCollection call.

        The grids are derived from ``bmax``, ``k1max``, ``ftmax``/``ftstep`` and
        ``fdmax``/``fdstep`` in ``self.pipeline_config``; the original-query
        weight grid is fixed to 0.0 through 0.9 in steps of 0.1. Every value of
        every grid is passed to Anserini, with output going to
        ``<run_path>/run``.

        Raises:
            AssertionError: if the collection's topic type is not ``"trec"``.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        index = self.index.index_path
        outdir = self.run_path
        topics = self.collection.config["topics"]["path"]
        assert self.collection.config["topics"]["type"] == "trec"

        bs = np.around(np.arange(0.1, self.pipeline_config["bmax"] + 0.1, 0.1), 1)
        k1s = np.around(np.arange(0.1, self.pipeline_config["k1max"] + 0.1, 0.1), 1)
        ows = np.around(np.arange(0.0, 1.0, 0.1), 1)
        fts = np.arange(1, self.pipeline_config["ftmax"] + self.pipeline_config["ftstep"], self.pipeline_config["ftstep"])
        fds = np.arange(1, self.pipeline_config["fdmax"] + self.pipeline_config["fdstep"], self.pipeline_config["fdstep"])

        grid_size = len(bs) * len(k1s) * len(ows) * len(fts) * len(fds)
        logger.warning("performing grid search over %s parameter combinations", grid_size)

        # Build an argument vector (no shell) so that paths and config values are
        # passed to Java verbatim, without shell interpretation.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "Trec",
            "-index",
            str(index),
            # add stemmer and stop options to match underlying index
            "-stemmer",
            str(self.pipeline_config["stemmer"]),
        ]
        if self.pipeline_config["indexstops"]:
            cmd.append("-keepstopwords")
        cmd += [
            "-topics",
            str(topics),
            "-output",
            f"{outdir}/run",
            "-inmem",
            "-threads",
            str(self.pipeline_config["maxthreads"]),
            "-bm25",
        ]
        # Each option is followed by all of its grid values as separate arguments.
        cmd += ["-b", *(str(x) for x in bs)]
        cmd += ["-k1", *(str(x) for x in k1s)]
        cmd.append("-rm3")
        cmd += ["-rm3.originalQueryWeight", *(str(x) for x in ows)]
        cmd += ["-rm3.fbTerms", *(str(x) for x in fts)]
        cmd += ["-rm3.fbDocs", *(str(x) for x in fds)]

        logger.info("writing runs to %s", outdir)
        logger.debug(cmd)
        os.makedirs(outdir, exist_ok=True)
        retcode = subprocess.call(cmd)
        if retcode != 0:
            raise RuntimeError("command failed")
예제 #6
0
파일: rm3.py 프로젝트: ghazalehnt/capreolus
    def _query_index(self):
        """Run one BM25+RM3 search with Anserini's SearchCollection.

        ``b``, ``k1``, ``ow`` (original query weight), ``ft`` (feedback terms)
        and ``fd`` (feedback docs) are read from ``self.pipeline_config``, along
        with ``stemmer``, ``indexstops`` and ``maxthreads``. Output is written to
        ``<run_path>/run``.

        Raises:
            AssertionError: if the collection's topic type is not ``"trec"``.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        index = self.index.index_path
        outdir = self.run_path
        topics = self.collection.config["topics"]["path"]
        assert self.collection.config["topics"]["type"] == "trec"

        # Build an argument vector (no shell) so that paths and config values are
        # passed to Java verbatim, without shell interpretation.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "Trec",
            "-index",
            str(index),
            # add stemmer and stop options to match underlying index
            "-stemmer",
            str(self.pipeline_config["stemmer"]),
        ]
        if self.pipeline_config["indexstops"]:
            cmd.append("-keepstopwords")
        cmd += [
            "-topics",
            str(topics),
            "-output",
            f"{outdir}/run",
            "-inmem",
            "-threads",
            str(self.pipeline_config["maxthreads"]),
            "-bm25",
            "-b",
            str(self.pipeline_config["b"]),
            "-k1",
            str(self.pipeline_config["k1"]),
            "-rm3",
            "-rm3.originalQueryWeight",
            str(self.pipeline_config["ow"]),
            "-rm3.fbTerms",
            str(self.pipeline_config["ft"]),
            "-rm3.fbDocs",
            str(self.pipeline_config["fd"]),
        ]

        logger.info("writing runs to %s", outdir)
        logger.debug(cmd)
        os.makedirs(outdir, exist_ok=True)
        retcode = subprocess.call(cmd)
        if retcode != 0:
            raise RuntimeError("command failed")
예제 #7
0
파일: rm3.py 프로젝트: ghazalehnt/capreolus
    def _query_index(self):
        """Run BM25+RM3 searches for a fixed list of RM3 settings.

        One SearchCollection process is started per distinct
        ``(fbTerms, fbDocs, originalQueryWeight)`` setting, with BM25 fixed at
        ``k1=0.9`` and ``b=0.4``. Each run is written to
        ``<run_path>/run_<fbTerms>_<fbDocs>_<originalQueryWeight>``.

        Raises:
            AssertionError: if the collection's topic type is not ``"trec"``.
            RuntimeError: if any Java process exits with a non-zero status.
        """
        index = self.index.index_path
        outdir = self.run_path
        topics = self.collection.config["topics"]["path"]
        assert self.collection.config["topics"]["type"] == "trec"

        # from https://github.com/castorini/anserini/blob/master/src/main/python/rerank/scripts/export_robust04_dataset.py#L28
        # The referenced list repeats (47, 9, 0.3); only the distinct settings need a run.
        best_rm3_parameters = ((47, 9, 0.3), (26, 8, 0.3))
        k1 = 0.9
        b = 0.4

        # Arguments shared by every run; built as an argument vector (no shell) so
        # that paths and config values are passed to Java verbatim.
        base_cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "Trec",
            "-index",
            str(index),
            # add stemmer and stop options to match underlying index
            "-stemmer",
            str(self.pipeline_config["stemmer"]),
        ]
        if self.pipeline_config["indexstops"]:
            base_cmd.append("-keepstopwords")
        base_cmd += ["-topics", str(topics)]

        os.makedirs(outdir, exist_ok=True)
        for fbterms, fbdocs, origw in best_rm3_parameters:
            cmd = base_cmd + [
                "-output",
                f"{outdir}/run_{fbterms}_{fbdocs}_{origw}",
                "-inmem",
                "-threads",
                str(self.pipeline_config["maxthreads"]),
                "-bm25",
                "-b",
                str(b),
                "-k1",
                str(k1),
                "-rm3",
                "-rm3.fbTerms",
                str(fbterms),
                "-rm3.fbDocs",
                str(fbdocs),
                "-rm3.originalQueryWeight",
                str(origw),
            ]
            logger.info("writing searcher to %s", outdir)
            logger.debug(cmd)
            retcode = subprocess.call(cmd)
            if retcode != 0:
                raise RuntimeError("command failed")
예제 #8
0
    def _build_index(self, config):
        """Build the Anserini index for ``self.collection`` at ``self.index_path``.

        Positions, document vectors and transformed documents are only stored
        when the collection is not flagged as a large collection.

        Args:
            config: mapping read for the ``indexstops``, ``stemmer`` and
                ``maxthreads`` options.

        Raises:
            RuntimeError: if the Java process exits with a non-zero status.
        """
        outdir = self.index_path
        indir = self.collection.config["documents"]["path"]

        document_type = self.collection.config["documents"]["type"]
        if document_type == "trec":
            ctype = "TrecCollection"
        elif document_type == "trecweb":
            ctype = "TrecwebCollection"
        else:
            # For clueweb12, document_type in yaml is the same as anserini - ClueWeb12Collection
            ctype = document_type

        # Build an argument vector (no shell) so that paths and config values are
        # passed to Java verbatim, without shell interpretation.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=IndexCollection",
            "io.anserini.index.IndexCollection",
            "-collection",
            str(ctype),
            "-generator",
            "JsoupGenerator",
            "-threads",
            str(config["maxthreads"]),
            "-input",
            str(indir),
            "-index",
            str(outdir),
        ]
        if not self.collection.is_large_collection:
            cmd += ["-storePositions", "-storeDocvectors", "-storeTransformedDocs"]
        cmd += ["-stemmer", str(config["stemmer"])]
        if config["indexstops"]:
            cmd.append("-keepStopwords")

        logger.info("building index %s", outdir)
        logger.debug(cmd)

        # Make sure the directory that will contain the index exists. Using
        # os.path.basename() here would create a stray directory relative to the
        # current working directory rather than the index's parent.
        parent_dir = os.path.dirname(outdir)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        retcode = subprocess.call(cmd)
        if retcode != 0:
            raise RuntimeError("command failed")
예제 #9
0
    def _anserini_query_from_file(self, topicsfn, anserini_param_str,
                                  output_base_path):
        """Run Anserini's SearchCollection on a topics file unless results already exist.

        Args:
            topicsfn: path of the topics file passed to ``-topics`` (read with the
                ``Trec`` topic reader).
            anserini_param_str: whitespace-separated extra SearchCollection arguments.
            output_base_path: directory that receives the ``searcher`` output and
                the ``done`` marker file.

        Raises:
            IOError: if ``topicsfn`` does not exist.
            RuntimeError: if the Java process exits with a non-zero status.
        """
        if not os.path.exists(topicsfn):
            raise IOError(f"could not find topics file: {topicsfn}")

        donefn = os.path.join(output_base_path, "done")
        if os.path.exists(donefn):
            logger.debug(
                f"skipping Anserini SearchCollection call because path already exists: {donefn}"
            )
            return

        # create index if it does not exist. the call returns immediately if the index does exist.
        self["index"].create_index()

        os.makedirs(output_base_path, exist_ok=True)
        output_path = os.path.join(output_base_path, "searcher")

        index_cfg = self["index"].cfg
        index_path = self["index"].get_index_path()

        # Build an argument vector instead of splitting a formatted string, so that
        # paths containing whitespace reach Java as single arguments.
        cmd = [
            "java",
            "-classpath",
            str(Anserini.get_fat_jar()),
            "-Xms512M",
            "-Xmx31G",
            "-Dapp.name=SearchCollection",
            "io.anserini.search.SearchCollection",
            "-topicreader",
            "Trec",
            "-index",
            str(index_path),
            # add stemmer and stop options to match underlying index
            "-stemmer",
            str(index_cfg["stemmer"]),
        ]
        if index_cfg["indexstops"]:
            cmd.append("-keepstopwords")
        cmd += [
            "-topics",
            str(topicsfn),
            "-output",
            output_path,
            "-inmem",
            "-threads",
            str(MAX_THREADS),
        ]
        # Only the caller-supplied parameter string is split on whitespace.
        cmd += anserini_param_str.split()

        logger.info("Anserini writing runs to %s", output_path)
        logger.debug(cmd)

        # The context manager closes the stdout pipe and waits for the process.
        with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
            # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
            for line in app.stdout:
                Anserini.filter_and_log_anserini_output(line, logger)

        if app.returncode != 0:
            raise RuntimeError("command failed")

        # Record success so later calls can skip the search.
        with open(donefn, "wt") as donef:
            print("done", file=donef)
예제 #10
0
def trec_index(request, tmpdir):
    """
    Build an index based on sample data and create an AnseriniIndex instance based on it

    The index is written to ``<tmpdir>/index`` by running Anserini's
    IndexCollection on the "dummy" collection directory, and the returned
    AnseriniIndex has already been opened.

    Raises:
        RuntimeError: if the indexing command exits with a non-zero status.
    """
    indir = os.path.join(COLLECTIONS["dummy"].basepath, "dummy")
    outdir = os.path.join(tmpdir, "index")
    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=IndexCollection io.anserini.index.IndexCollection  -collection TrecCollection -generator JsoupGenerator -threads 1 -input {indir} -index {outdir} -storeTransformedDocs"

    # Stop here if indexing failed, rather than opening a missing or partial index below.
    if os.system(cmd) != 0:
        raise RuntimeError("command failed")

    collection = Collection(dummy_collection_config())
    anserini_index = AnseriniIndex(collection, outdir,
                                   os.path.join(tmpdir, "index_cache"))
    anserini_index.open()
    return anserini_index
예제 #11
0
### set missing environment variables to safe defaults ###
# Variables that are already present in the environment are never overwritten.
if os.environ.get("GENSIM_DATA_DIR") is None:
    os.environ["GENSIM_DATA_DIR"] = (constants["CACHE_BASE_PATH"] / "gensim").as_posix()

if os.environ.get("NLTK_DATA") is None:
    os.environ["NLTK_DATA"] = (constants["CACHE_BASE_PATH"] / "nltk").as_posix()

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

import jnius_config
from capreolus.utils.common import Anserini

# Put the Anserini fat jar on the classpath used by jnius.
# NOTE(review): presumably this must run before any module imports `jnius` — confirm.
jnius_config.set_classpath(Anserini.get_fat_jar())

### convenience imports
# note: order is important to avoid circular imports
from capreolus.utils.loginit import get_logger
from capreolus.benchmark import Benchmark
from capreolus.collection import Collection
from capreolus.index import Index
from capreolus.searcher import Searcher
from capreolus.extractor import Extractor
from capreolus.reranker import Reranker
from capreolus.tokenizer import Tokenizer
from capreolus.trainer import Trainer
from capreolus.task import Task