name = "anserini" def __init__(self, index, stemmer="none", keepstops=False, use_cache=True): super().__init__(index) self.params = {"stemmer": stemmer, "keepstops": keepstops} if use_cache: self.initialize_cache(self.params) self.create() def create(self): stemmer = self.params["stemmer"] keepstops = self.params["keepstops"] if keepstops: emptyjchar = autoclass("org.apache.lucene.analysis.CharArraySet").EMPTY_SET self.analyzer = autoclass("io.anserini.analysis.EnglishStemmingAnalyzer")(stemmer, emptyjchar) else: self.analyzer = autoclass("io.anserini.analysis.EnglishStemmingAnalyzer")(stemmer) self._tokenize = autoclass("io.anserini.analysis.AnalyzerUtils").tokenize def tokenize(self, s): if not s: return [] return self._tokenize(self.analyzer, s).toArray() import_component_modules("tokenizer")
for name, cls in Index.ALL.items(): if name in index_path: return cls return None @staticmethod def config(): raise NotImplementedError("config method must be provided by subclass") @classmethod def register(cls, subcls): return register_component_module(cls, subcls) def exists(self): return os.path.exists(os.path.join(self.index_path, "done")) def _build_index(self, config): raise NotImplementedError def create(self, config): if self.exists(): return self._build_index(config) with open(os.path.join(self.index_path, "done"), "wt") as donef: print("done", file=donef) import_component_modules("index")
# NOTE(review): function-interior chunk — the enclosing function's signature
# (supplying `metric`, `valid_metrics`, `cut_points`, `dev`, `test`) sits
# above this view. It grid-searches an interpolation weight (alpha) on the
# dev split and applies the best alpha to both dev and test runs.

    # Metrics without a cutoff, unioned with every metric@cutoff combination
    # for the cutoff-style metrics ("*_cut" and "P").
    expected_metrics = {m for m in valid_metrics if not m.endswith("_cut") and m != "P"} | {
        m + "_" + str(cutoff) for cutoff in cut_points for m in valid_metrics if m.endswith("_cut") or m == "P"
    }
    if metric in ["ndcg", "ndcg_cut"]:
        # ndcg requests are normalized to the cutoff-20 variant.
        mkey = "ndcg_cut_20"
    elif metric in expected_metrics:
        mkey = metric
    else:
        raise RuntimeError("requested metric %s is not one of the supported metrics: %s" % (metric, sorted(expected_metrics)))

    # Mean of the selected metric over all queries in a run.
    avg_metric = lambda run_metrics: np.mean([qid[mkey] for qid in run_metrics.values()])

    # Dev and test query sets must be disjoint so the alpha chosen on dev is
    # not tuned on test queries.
    assert len(set(dev["qrels"].keys()).intersection(test["qrels"].keys())) == 0

    dev_eval = pytrec_eval.RelevanceEvaluator(dev["qrels"], valid_metrics)
    best_metric, best_alpha = -np.inf, None
    # Grid search alpha in [0, 1] with step 0.05; the 1.001 upper bound makes
    # np.arange include alpha = 1.0.
    for alpha in np.arange(0, 1.001, 0.05):
        run_metrics = dev_eval.evaluate(
            Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), alpha)
        )
        mavgp = avg_metric(run_metrics)
        if mavgp > best_metric:
            best_metric = mavgp
            best_alpha = alpha

    # Re-run interpolation on both splits with the dev-selected alpha.
    test_run = Searcher.interpolate_runs(test["reranker"], test["searcher"], test["qrels"].keys(), best_alpha)
    dev_run = Searcher.interpolate_runs(dev["reranker"], dev["searcher"], dev["qrels"].keys(), best_alpha)
    return (best_alpha, test_run, dev_run)


import_component_modules("searcher")
# NOTE(review): class-interior chunk — the enclosing class header and the
# opening of the generator function (the loop providing `qid`, `reldocs`,
# `negdocs`, and the initial `batch`) sit above this view.
            # Sample one relevant and one non-relevant doc for this query.
            posdocid = random.choice(reldocs[qid])
            negdocid = random.choice(negdocs[qid])
            features = self.get_features({"qid": qid, "posdocid": posdocid, "negdocid": negdocid})
            if features is None:
                # Skip (qid, pos, neg) triples the extractor could not featurize.
                logger.warning("got none features: qid=%s posid=%s negid=%s", qid, posdocid, negdocid)
                continue
            # Accumulate each feature column into the current batch.
            for k, v in features.items():
                batch[k].append(v)
            # Emit a full batch and start accumulating a fresh one.
            if len(batch["qid"]) == self.pipeline_config["batch"]:
                yield self.prepare_batch(batch)
                batch = defaultdict(list)

        return genf()

    def prepare_batch(self, batch):
        # Subclass hook for post-processing a batch; identity by default.
        return batch

    @staticmethod
    def config():
        # Subclasses must define their own config.
        raise NotImplementedError("config method must be provided by subclass")

    @classmethod
    def register(cls, subcls):
        # Register subcls as an available implementation of this component.
        return register_component_module(cls, subcls)


import_component_modules("benchmark")