def _query_index(self):
    # Grid-search BM25+RM3: issue a single Anserini SearchCollection call that
    # sweeps every combination of b, k1, originalQueryWeight, fbTerms, and fbDocs.
    index = self.index.index_path
    outdir = self.run_path
    topics = self.collection.config["topics"]["path"]
    # only TREC-format topic files are supported here
    assert self.collection.config["topics"]["type"] == "trec"

    # build the parameter grids; np.around trims float-step artifacts (e.g. 0.30000000000000004)
    bs = np.around(np.arange(0.1, self.pipeline_config["bmax"] + 0.1, 0.1), 1)
    k1s = np.around(np.arange(0.1, self.pipeline_config["k1max"] + 0.1, 0.1), 1)
    ows = np.around(np.arange(0.0, 1.0, 0.1), 1)
    fts = np.arange(1, self.pipeline_config["ftmax"] + self.pipeline_config["ftstep"], self.pipeline_config["ftstep"])
    fds = np.arange(1, self.pipeline_config["fdmax"] + self.pipeline_config["fdstep"], self.pipeline_config["fdstep"])
    grid_size = len(bs) * len(k1s) * len(ows) * len(fts) * len(fds)
    logger.warning("performing grid search over %s parameter combinations", grid_size)

    # each option is passed to Anserini as a space-separated list of values,
    # so the whole grid runs in one SearchCollection invocation
    bstr = " ".join(str(x) for x in bs)
    k1str = " ".join(str(x) for x in k1s)
    owstr = " ".join(str(x) for x in ows)
    ftstr = " ".join(str(x) for x in fts)
    fdstr = " ".join(str(x) for x in fds)

    # add stemmer and stop options to match underlying index
    indexopts = f"-stemmer {self.pipeline_config['stemmer']}"
    if self.pipeline_config["indexstops"]:
        indexopts += " -keepstopwords"

    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=SearchCollection io.anserini.search.SearchCollection -topicreader Trec -index {index} {indexopts} -topics {topics} -output {outdir}/run -inmem -threads {self.pipeline_config['maxthreads']} -bm25 -b {bstr} -k1 {k1str} -rm3 -rm3.originalQueryWeight {owstr} -rm3.fbTerms {ftstr} -rm3.fbDocs {fdstr}"
    logger.info("writing runs to %s", outdir)
    logger.debug(cmd)
    os.makedirs(outdir, exist_ok=True)
    retcode = subprocess.call(cmd, shell=True)
    if retcode != 0:
        raise RuntimeError("command failed")
def _query_index(self):
    """Run Anserini SearchCollection with the single BM25 (b, k1) setting from the pipeline config.

    The topic reader is chosen from the collection's topics type; the run is
    written to ``{outdir}/searcher``.

    Raises:
        ValueError: if the topics type is not one this method knows how to read.
        RuntimeError: if the Anserini subprocess exits with a nonzero status.
    """
    index = self.index.index_path
    outdir = self.run_path
    topics = self.collection.config["topics"]["path"]

    document_type = self.collection.config["topics"]["type"]
    if document_type == "trec":
        topic_reader = "Trec"
    elif document_type == "ClueWeb12Collection":
        topic_reader = "Webxml"
    else:
        # BUG FIX: previously fell through with topic_reader unbound, producing a
        # confusing NameError; fail with an explicit message instead
        raise ValueError(f"unsupported topics type: {document_type}")

    bs = [self.pipeline_config["b"]]
    k1s = [self.pipeline_config["k1"]]
    bstr = " ".join(str(x) for x in bs)
    k1str = " ".join(str(x) for x in k1s)

    # add stemmer and stop options to match underlying index
    indexopts = f"-stemmer {self.pipeline_config['stemmer']}"
    if self.pipeline_config["indexstops"]:
        indexopts += " -keepstopwords"

    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=SearchCollection io.anserini.search.SearchCollection -topicreader {topic_reader} -index {index} {indexopts} -topics {topics} -output {outdir}/searcher -inmem -threads {self.pipeline_config['maxthreads']} -bm25 -b {bstr} -k1 {k1str}"
    logger.info("writing runs to %s", outdir)
    logger.debug(cmd)
    os.makedirs(outdir, exist_ok=True)
    retcode = subprocess.call(cmd, shell=True)
    if retcode != 0:
        raise RuntimeError("command failed")
def _create_index(self):
    """Build the Anserini index, streaming the indexer's output through our logger.

    Large collections are indexed without the extra -store* structures.

    Raises:
        RuntimeError: if the indexing subprocess exits with a nonzero status.
    """
    outdir = self.get_index_path()
    stops = "-keepStopwords" if self.cfg["indexstops"] else ""
    collection_path, document_type, generator_type = self["collection"].get_path_and_types()
    anserini_fat_jar = Anserini.get_fat_jar()

    if self["collection"].is_large_collection:
        cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name='IndexCollection' io.anserini.index.IndexCollection -collection {document_type} -generator {generator_type} -threads {MAX_THREADS} -input {collection_path} -index {outdir} -stemmer {self.cfg['stemmer']} {stops}"
    else:
        cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name='IndexCollection' io.anserini.index.IndexCollection -collection {document_type} -generator {generator_type} -threads {MAX_THREADS} -input {collection_path} -index {outdir} -storePositions -storeDocvectors -storeTransformedDocs -stemmer {self.cfg['stemmer']} {stops}"

    logger.info("building index %s", outdir)
    logger.debug(cmd)
    # BUG FIX: was os.makedirs(os.path.basename(outdir), ...), which created a
    # directory named after outdir's last path component in the current working
    # directory. Ensure the index directory's parent exists instead; Anserini
    # creates the index directory itself.
    os.makedirs(os.path.dirname(outdir) or ".", exist_ok=True)

    app = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, universal_newlines=True)
    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    for line in app.stdout:
        Anserini.filter_and_log_anserini_output(line, logger)
    app.wait()
    if app.returncode != 0:
        raise RuntimeError("command failed")
def _query_index(self):
    # Run BM25+RM3 with the single parameter point taken from the pipeline config
    # and write the resulting run to {outdir}/run.
    cfg = self.pipeline_config
    index = self.index.index_path
    outdir = self.run_path
    topics = self.collection.config["topics"]["path"]
    assert self.collection.config["topics"]["type"] == "trec"

    # each Anserini option takes a space-separated value list; with exactly one
    # value per option this is just str() of the configured value
    bstr, k1str, owstr, ftstr, fdstr = (str(cfg[key]) for key in ("b", "k1", "ow", "ft", "fd"))

    # add stemmer and stop options to match underlying index
    indexopts = f"-stemmer {cfg['stemmer']}"
    if cfg["indexstops"]:
        indexopts += " -keepstopwords"

    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = (
        f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=SearchCollection "
        f"io.anserini.search.SearchCollection -topicreader Trec -index {index} {indexopts} "
        f"-topics {topics} -output {outdir}/run -inmem -threads {cfg['maxthreads']} "
        f"-bm25 -b {bstr} -k1 {k1str} -rm3 -rm3.originalQueryWeight {owstr} "
        f"-rm3.fbTerms {ftstr} -rm3.fbDocs {fdstr}"
    )
    logger.info("writing runs to %s", outdir)
    logger.debug(cmd)
    os.makedirs(outdir, exist_ok=True)
    if subprocess.call(cmd, shell=True) != 0:
        raise RuntimeError("command failed")
def _query_index(self):
    """Run BM25+RM3 once per known-good RM3 parameter setting.

    Each (fbTerms, fbDocs, originalQueryWeight) tuple produces one Anserini
    SearchCollection run named ``run_<fbTerms>_<fbDocs>_<origw>`` under outdir.

    Raises:
        RuntimeError: if any Anserini subprocess exits with a nonzero status.
    """
    index = self.index.index_path
    outdir = self.run_path
    topics = self.collection.config["topics"]["path"]
    assert self.collection.config["topics"]["type"] == "trec"

    # from https://github.com/castorini/anserini/blob/master/src/main/python/rerank/scripts/export_robust04_dataset.py#L28
    # NOTE(review): the original set literal listed (47, 9, 0.3) four times; since sets
    # deduplicate, only the two distinct tuples below were ever run. The repeats look
    # like a copy/paste error -- confirm against the referenced script whether other
    # distinct parameter settings were intended.
    best_rm3_parameters = {(47, 9, 0.3), (26, 8, 0.3)}
    # fixed BM25 parameters used with every RM3 setting
    k1 = 0.9
    b = 0.4

    # add stemmer and stop options to match underlying index
    indexopts = f"-stemmer {self.pipeline_config['stemmer']}"
    if self.pipeline_config["indexstops"]:
        indexopts += " -keepstopwords"

    anserini_fat_jar = Anserini.get_fat_jar()
    for fbterms, fbdocs, origw in best_rm3_parameters:
        cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=SearchCollection io.anserini.search.SearchCollection -topicreader Trec -index {index} {indexopts} -topics {topics} -output {outdir}/run_{fbterms}_{fbdocs}_{origw} -inmem -threads {self.pipeline_config['maxthreads']} -bm25 -b {b} -k1 {k1} -rm3 -rm3.fbTerms {fbterms} -rm3.fbDocs {fbdocs} -rm3.originalQueryWeight {origw}"
        logger.info("writing searcher to %s", outdir)
        logger.debug(cmd)
        os.makedirs(outdir, exist_ok=True)
        retcode = subprocess.call(cmd, shell=True)
        if retcode != 0:
            raise RuntimeError("command failed")
def _build_index(self, config):
    """Build an Anserini index over the collection's documents.

    Args:
        config: dict providing indexstops, maxthreads, and stemmer settings.

    Raises:
        RuntimeError: if the indexing subprocess exits with a nonzero status.
    """
    outdir = self.index_path
    stops = "-keepStopwords" if config["indexstops"] else ""

    indir = self.collection.config["documents"]["path"]
    document_type = self.collection.config["documents"]["type"]
    # map our yaml document types onto Anserini collection class names
    if document_type == "trec":
        ctype = "TrecCollection"
    elif document_type == "trecweb":
        ctype = "TrecwebCollection"
    else:
        # For clueweb12, document_type in yaml is the same as anserini - ClueWeb12Collection
        ctype = document_type

    anserini_fat_jar = Anserini.get_fat_jar()
    # large collections omit the -store* structures
    if self.collection.is_large_collection:
        cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name='IndexCollection' io.anserini.index.IndexCollection -collection {ctype} -generator JsoupGenerator -threads {config['maxthreads']} -input {indir} -index {outdir} -stemmer {config['stemmer']} {stops}"
    else:
        cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name='IndexCollection' io.anserini.index.IndexCollection -collection {ctype} -generator JsoupGenerator -threads {config['maxthreads']} -input {indir} -index {outdir} -storePositions -storeDocvectors -storeTransformedDocs -stemmer {config['stemmer']} {stops}"

    logger.info("building index %s", outdir)
    logger.debug(cmd)
    # BUG FIX: was os.makedirs(os.path.basename(outdir), ...), which created a
    # directory named after outdir's last path component in the current working
    # directory. Ensure the index directory's parent exists instead; Anserini
    # creates the index directory itself.
    os.makedirs(os.path.dirname(outdir) or ".", exist_ok=True)
    retcode = subprocess.call(cmd, shell=True)
    if retcode != 0:
        raise RuntimeError("command failed")
def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
    """Run Anserini SearchCollection over the topics in topicsfn.

    Results are written to ``{output_base_path}/searcher``. A ``done`` marker file
    makes the call a no-op on subsequent invocations. The underlying index is
    created first if it does not already exist.
    """
    if not os.path.exists(topicsfn):
        raise IOError(f"could not find topics file: {topicsfn}")

    donefn = os.path.join(output_base_path, "done")
    if os.path.exists(donefn):
        logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
        return

    # create index if it does not exist. the call returns immediately if the index does exist.
    self.index.create_index()

    os.makedirs(output_base_path, exist_ok=True)
    output_path = os.path.join(output_base_path, "searcher")
    index_path = self.index.get_index_path()
    anserini_fat_jar = Anserini.get_fat_jar()

    # Anserini expects the literal string "none" when no stemmer is configured
    stemmer = self.index.config["stemmer"]
    if stemmer is None:
        stemmer = "none"

    cmd = ["java", "-classpath", anserini_fat_jar, "-Xms512M", "-Xmx31G"]
    cmd += ["-Dapp.name=SearchCollection", "io.anserini.search.SearchCollection"]
    cmd += ["-topicreader", "TsvString", "-index", index_path]
    cmd += ["-topics", topicsfn, "-output", output_path]
    cmd += ["-inmem", "-threads", str(MAX_THREADS), "-stemmer", stemmer]
    cmd += anserini_param_str.split()
    if self.index.config["indexstops"]:
        cmd.append("-keepStopwords")

    logger.info("Anserini writing runs to %s", output_path)
    logger.debug(cmd)

    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    for line in proc.stdout:
        Anserini.filter_and_log_anserini_output(line, logger)
    proc.wait()
    if proc.returncode != 0:
        raise RuntimeError("command failed")

    with open(donefn, "wt") as donef:
        print("done", file=donef)
def _create_index(self):
    """Build the Anserini index, streaming the indexer's output through our logger.

    Extra -store* structures are added only for non-large collections.

    Raises:
        RuntimeError: if the indexing subprocess exits with a nonzero status.
    """
    outdir = self.get_index_path()
    collection_path, document_type, generator_type = self.collection.get_path_and_types()
    anserini_fat_jar = Anserini.get_fat_jar()

    cmd = [
        "java",
        "-classpath",
        anserini_fat_jar,
        "-Xms512M",
        "-Xmx31G",
        "-Dapp.name='IndexCollection'",
        "io.anserini.index.IndexCollection",
        "-collection",
        document_type,
        "-generator",
        generator_type,
        "-threads",
        str(MAX_THREADS),
        "-input",
        collection_path,
        "-index",
        outdir,
        "-stemmer",
        # Anserini expects the literal string "none" when no stemmer is configured
        "none" if self.config["stemmer"] is None else self.config["stemmer"],
    ]

    if self.config["indexstops"]:
        cmd += ["-keepStopwords"]

    # extra index structures are skipped for large collections
    if not self.collection.is_large_collection:
        cmd += [
            "-storePositions",
            "-storeDocvectors",
            "-storeContents",
        ]

    logger.info("building index %s", outdir)
    logger.debug(cmd)
    # BUG FIX: was os.makedirs(os.path.basename(outdir), ...), which created a
    # directory named after outdir's last path component in the current working
    # directory. Ensure the index directory's parent exists instead; Anserini
    # creates the index directory itself.
    os.makedirs(os.path.dirname(outdir) or ".", exist_ok=True)

    app = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    for line in app.stdout:
        Anserini.filter_and_log_anserini_output(line, logger)
    app.wait()
    if app.returncode != 0:
        raise RuntimeError("command failed")
def trec_index(request, tmpdir):
    """
    Build an index based on sample data and create an AnseriniIndex instance based on it
    """
    indir = os.path.join(COLLECTIONS["dummy"].basepath, "dummy")
    outdir = os.path.join(tmpdir, "index")
    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=IndexCollection io.anserini.index.IndexCollection -collection TrecCollection -generator JsoupGenerator -threads 1 -input {indir} -index {outdir} -storeTransformedDocs"
    # BUG FIX: os.system's exit status was previously ignored, so a failed indexing
    # run silently produced a broken fixture; fail fast instead
    if os.system(cmd) != 0:
        raise RuntimeError("command failed")

    collection = Collection(dummy_collection_config())
    anserini_index = AnseriniIndex(collection, outdir, os.path.join(tmpdir, "index_cache"))
    anserini_index.open()
    return anserini_index
def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
    """Run Anserini SearchCollection over the topics in topicsfn.

    Results go to ``{output_base_path}/searcher``; a ``done`` marker file makes
    repeated calls no-ops. The underlying index is created first if needed.
    """
    if not os.path.exists(topicsfn):
        raise IOError(f"could not find topics file: {topicsfn}")

    donefn = os.path.join(output_base_path, "done")
    if os.path.exists(donefn):
        logger.debug(f"skipping Anserini SearchCollection call because path already exists: {donefn}")
        return

    # create index if it does not exist. the call returns immediately if the index does exist.
    self["index"].create_index()

    os.makedirs(output_base_path, exist_ok=True)
    output_path = os.path.join(output_base_path, "searcher")

    # add stemmer and stop options to match underlying index
    index_cfg = self["index"].cfg
    indexopts = f"-stemmer {index_cfg['stemmer']}"
    if index_cfg["indexstops"]:
        indexopts += " -keepstopwords"

    index_path = self["index"].get_index_path()
    anserini_fat_jar = Anserini.get_fat_jar()
    cmd = (
        f"java -classpath {anserini_fat_jar} -Xms512M -Xmx31G -Dapp.name=SearchCollection "
        f"io.anserini.search.SearchCollection -topicreader Trec -index {index_path} {indexopts} "
        f"-topics {topicsfn} -output {output_path} -inmem -threads {MAX_THREADS} {anserini_param_str}"
    )
    logger.info("Anserini writing runs to %s", output_path)
    logger.debug(cmd)

    proc = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE, universal_newlines=True)
    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    for line in proc.stdout:
        Anserini.filter_and_log_anserini_output(line, logger)
    proc.wait()
    if proc.returncode != 0:
        raise RuntimeError("command failed")

    with open(donefn, "wt") as donef:
        print("done", file=donef)
### set missing environment variables to safe defaults ### if "GENSIM_DATA_DIR" not in os.environ: os.environ["GENSIM_DATA_DIR"] = (constants["CACHE_BASE_PATH"] / "gensim").as_posix() if "NLTK_DATA" not in os.environ: os.environ["NLTK_DATA"] = (constants["CACHE_BASE_PATH"] / "nltk").as_posix() if "TOKENIZERS_PARALLELISM" not in os.environ: os.environ["TOKENIZERS_PARALLELISM"] = "false" import jnius_config from capreolus.utils.common import Anserini jnius_config.set_classpath(Anserini.get_fat_jar()) ### convenience imports # note: order is important to avoid circular imports from capreolus.utils.loginit import get_logger from capreolus.benchmark import Benchmark from capreolus.collection import Collection from capreolus.index import Index from capreolus.searcher import Searcher from capreolus.extractor import Extractor from capreolus.reranker import Reranker from capreolus.tokenizer import Tokenizer from capreolus.trainer import Trainer from capreolus.task import Task