def _create_index(self):
    """Build the Anserini index for this module's collection.

    Runs ``io.anserini.index.IndexCollection`` from the Anserini fat jar in a
    Java subprocess, writing the index to ``self.get_index_path()``. Output
    from the subprocess is passed line by line to
    ``Anserini.filter_and_log_anserini_output``.

    For collections flagged ``is_large_collection`` the ``-storePositions``,
    ``-storeDocvectors`` and ``-storeTransformedDocs`` options are omitted.

    Raises:
        RuntimeError: if the Java process exits with a non-zero status.
    """
    outdir = self.get_index_path()
    collection_path, document_type, generator_type = self["collection"].get_path_and_types()
    anserini_fat_jar = Anserini.get_fat_jar()

    # A stemmer of None must be spelled "none" on the command line rather
    # than being formatted as the string "None".
    stemmer = self.cfg["stemmer"]
    stemmer_arg = "none" if stemmer is None else str(stemmer)

    # Build the argument vector directly instead of splitting a formatted
    # string, so that paths containing whitespace stay a single argument.
    # No shell is involved, so the app name is given without quote characters.
    cmd = [
        "java",
        "-classpath",
        str(anserini_fat_jar),
        "-Xms512M",
        "-Xmx31G",
        "-Dapp.name=IndexCollection",
        "io.anserini.index.IndexCollection",
        "-collection",
        str(document_type),
        "-generator",
        str(generator_type),
        "-threads",
        str(MAX_THREADS),
        "-input",
        str(collection_path),
        "-index",
        str(outdir),
    ]
    if not self["collection"].is_large_collection:
        cmd += ["-storePositions", "-storeDocvectors", "-storeTransformedDocs"]
    cmd += ["-stemmer", stemmer_arg]
    if self.cfg["indexstops"]:
        cmd.append("-keepStopwords")

    logger.info("building index %s", outdir)
    logger.debug(" ".join(cmd))

    # Create the directory that will contain the index. (basename() would
    # instead create a stray directory relative to the working directory.)
    parent_dir = os.path.dirname(outdir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
        for line in app.stdout:
            Anserini.filter_and_log_anserini_output(line, logger)

    # Leaving the with-block closes the pipe and waits for the process.
    if app.returncode != 0:
        raise RuntimeError(f"command failed with exit code {app.returncode}")
def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
    """Run Anserini's SearchCollection over a topics file.

    The search is skipped when a ``done`` marker file already exists under
    ``output_base_path``. Otherwise the index is created if needed, the
    search runs in a Java subprocess that writes to
    ``<output_base_path>/searcher``, and the ``done`` marker is written once
    the process exits successfully.

    Args:
        topicsfn: path to the topics file (passed with ``-topicreader TsvString``).
        anserini_param_str: extra SearchCollection arguments as one string;
            it is split on whitespace and appended to the command.
        output_base_path: directory for the run output and the ``done`` marker.

    Raises:
        IOError: if ``topicsfn`` does not exist.
        RuntimeError: if the Java process exits with a non-zero status.
    """
    if not os.path.exists(topicsfn):
        raise IOError(f"could not find topics file: {topicsfn}")

    donefn = os.path.join(output_base_path, "done")
    if os.path.exists(donefn):
        logger.debug("skipping Anserini SearchCollection call because path already exists: %s", donefn)
        return

    # create index if it does not exist. the call returns immediately if the index does exist.
    self.index.create_index()

    os.makedirs(output_base_path, exist_ok=True)
    output_path = os.path.join(output_base_path, "searcher")

    index_path = self.index.get_index_path()
    anserini_fat_jar = Anserini.get_fat_jar()

    # Stemmer and stopword options are taken from the index config so the
    # query processing matches how the index was built.
    stemmer = self.index.config["stemmer"]
    cmd = [
        "java",
        "-classpath",
        anserini_fat_jar,
        "-Xms512M",
        "-Xmx31G",
        "-Dapp.name=SearchCollection",
        "io.anserini.search.SearchCollection",
        "-topicreader",
        "TsvString",
        "-index",
        index_path,
        "-topics",
        topicsfn,
        "-output",
        output_path,
        "-inmem",
        "-threads",
        str(MAX_THREADS),
        "-stemmer",
        "none" if stemmer is None else stemmer,
    ] + anserini_param_str.split()

    # NOTE(review): confirm this flag's exact spelling against the
    # SearchCollection options of the bundled Anserini version.
    if self.index.config["indexstops"]:
        cmd += ["-keepStopwords"]

    logger.info("Anserini writing runs to %s", output_path)
    logger.debug(cmd)

    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
        for line in app.stdout:
            Anserini.filter_and_log_anserini_output(line, logger)

    # Leaving the with-block closes the pipe and waits for the process.
    if app.returncode != 0:
        raise RuntimeError(f"command failed with exit code {app.returncode}")

    # Only mark the run as complete after a successful exit.
    with open(donefn, "wt") as donef:
        print("done", file=donef)
def _create_index(self):
    """Build the Anserini index for this module's collection.

    Runs ``io.anserini.index.IndexCollection`` from the Anserini fat jar in a
    Java subprocess, writing the index to ``self.get_index_path()``. Output
    from the subprocess is passed line by line to
    ``Anserini.filter_and_log_anserini_output``.

    ``-storePositions``, ``-storeDocvectors`` and ``-storeContents`` are only
    passed when the collection is not flagged ``is_large_collection``.

    Raises:
        RuntimeError: if the Java process exits with a non-zero status.
    """
    outdir = self.get_index_path()
    collection_path, document_type, generator_type = self.collection.get_path_and_types()
    anserini_fat_jar = Anserini.get_fat_jar()

    stemmer = self.config["stemmer"]
    cmd = [
        "java",
        "-classpath",
        anserini_fat_jar,
        "-Xms512M",
        "-Xmx31G",
        # The argument list is not interpreted by a shell, so quote
        # characters here would become part of the property value.
        "-Dapp.name=IndexCollection",
        "io.anserini.index.IndexCollection",
        "-collection",
        document_type,
        "-generator",
        generator_type,
        "-threads",
        str(MAX_THREADS),
        "-input",
        collection_path,
        "-index",
        outdir,
        "-stemmer",
        "none" if stemmer is None else stemmer,
    ]

    if self.config["indexstops"]:
        cmd += ["-keepStopwords"]

    if not self.collection.is_large_collection:
        cmd += [
            "-storePositions",
            "-storeDocvectors",
            "-storeContents",
        ]

    logger.info("building index %s", outdir)
    logger.debug(cmd)

    # Create the directory that will contain the index. (basename() would
    # instead create a stray directory relative to the working directory.)
    parent_dir = os.path.dirname(outdir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
        for line in app.stdout:
            Anserini.filter_and_log_anserini_output(line, logger)

    # Leaving the with-block closes the pipe and waits for the process.
    if app.returncode != 0:
        raise RuntimeError(f"command failed with exit code {app.returncode}")
def _anserini_query_from_file(self, topicsfn, anserini_param_str, output_base_path):
    """Run Anserini's SearchCollection over a topics file.

    The search is skipped when a ``done`` marker file already exists under
    ``output_base_path``. Otherwise the index is created if needed, the
    search runs in a Java subprocess that writes to
    ``<output_base_path>/searcher``, and the ``done`` marker is written once
    the process exits successfully.

    Args:
        topicsfn: path to the topics file (passed with ``-topicreader Trec``).
        anserini_param_str: extra SearchCollection arguments as one string;
            it is split on whitespace and appended to the command.
        output_base_path: directory for the run output and the ``done`` marker.

    Raises:
        IOError: if ``topicsfn`` does not exist.
        RuntimeError: if the Java process exits with a non-zero status.
    """
    if not os.path.exists(topicsfn):
        raise IOError(f"could not find topics file: {topicsfn}")

    donefn = os.path.join(output_base_path, "done")
    if os.path.exists(donefn):
        logger.debug("skipping Anserini SearchCollection call because path already exists: %s", donefn)
        return

    # create index if it does not exist. the call returns immediately if the index does exist.
    self["index"].create_index()

    os.makedirs(output_base_path, exist_ok=True)
    output_path = os.path.join(output_base_path, "searcher")

    # add stemmer and stop options to match underlying index
    # A stemmer of None must be spelled "none" on the command line rather
    # than being formatted as the string "None".
    stemmer = self["index"].cfg["stemmer"]
    indexopts = ["-stemmer", "none" if stemmer is None else str(stemmer)]
    if self["index"].cfg["indexstops"]:
        indexopts.append("-keepstopwords")

    index_path = self["index"].get_index_path()
    anserini_fat_jar = Anserini.get_fat_jar()

    # Build the argument vector directly instead of splitting a formatted
    # string, so that paths containing whitespace stay a single argument.
    # Only the free-form parameter string is split on whitespace.
    cmd = [
        "java",
        "-classpath",
        str(anserini_fat_jar),
        "-Xms512M",
        "-Xmx31G",
        "-Dapp.name=SearchCollection",
        "io.anserini.search.SearchCollection",
        "-topicreader",
        "Trec",
        "-index",
        str(index_path),
        *indexopts,
        "-topics",
        str(topicsfn),
        "-output",
        str(output_path),
        "-inmem",
        "-threads",
        str(MAX_THREADS),
        *str(anserini_param_str).split(),
    ]

    logger.info("Anserini writing runs to %s", output_path)
    logger.debug(" ".join(cmd))

    # Anserini output is verbose, so ignore DEBUG log lines and send other output through our logger
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True) as app:
        for line in app.stdout:
            Anserini.filter_and_log_anserini_output(line, logger)

    # Leaving the with-block closes the pipe and waits for the process.
    if app.returncode != 0:
        raise RuntimeError(f"command failed with exit code {app.returncode}")

    # Only mark the run as complete after a successful exit.
    with open(donefn, "wt") as donef:
        print("done", file=donef)