Example #1
class CDSSM(Reranker):
    """Yelong Shen, Xiaodong He, Jianfeng Gao, Li Deng, and Grégoire Mesnil. 2014. A Latent Semantic Model with Convolutional-Pooling Structure for Information Retrieval. In CIKM'14."""

    module_name = "CDSSM"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("nkernel", 3, "kernel dimension in conv"),
        ConfigOption("nfilter", 1, "number of filters in conv"),
        ConfigOption("nhiddens", 30, "hidden layer dimension for ffw layer"),
        ConfigOption("windowsize", 3, "number of query/document words to concatenate before conv"),
        ConfigOption("dropoutrate", 0, "dropout rate for conv"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CDSSM_class(self.extractor, self.config)

        return self.model

    def score(self, d):
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [self.model(pos_sentence, query_sentence).view(-1), self.model(neg_sentence, query_sentence).view(-1)]

    def test(self, data):
        query_sentence, pos_sentence = data["query"], data["posdoc"]

        return self.model(pos_sentence, query_sentence).view(-1)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)
Example #2
class TFParade(Reranker):
    """
    TensorFlow implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "parade"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        self.model = TFParade_Class(self.extractor, self.config)
        return self.model
Example #3
class DSSM(Reranker):
    """Po-Sen Huang, Xiaodong He, Jianfeng Gao, Li Deng, Alex Acero, and Larry Heck. 2013. Learning deep structured semantic models for web search using clickthrough data. In CIKM'13."""

    module_name = "DSSM"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="bagofwords"),
        Dependency(key="trainer", module="trainer", name="pytorch", default_config_overrides={"lr": 0.0001}),
    ]
    config_spec = [
        ConfigOption(
            "nhiddens",
            "56",
            "list of hidden layer sizes (eg '56 128'), where the i'th value indicates the output size of the i'th layer",
        )
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DSSM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
Example #4
class TFBERTMaxP(Reranker):
    """
    TensorFlow implementation of BERT-MaxP.

    Deeper Text Understanding for IR with Contextual Neural Language Modeling. Zhuyun Dai and Jamie Callan. SIGIR 2019.
    https://arxiv.org/pdf/1905.09217.pdf
    """

    module_name = "TFBERTMaxP"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="bertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base-msmarco, or HuggingFace supported models",
        ),
        ConfigOption("aggregation", "max"),
        ConfigOption("hidden_dropout_prob", 0.1, "The dropout probability of BERT-like model's hidden layers."),
    ]

    def build_model(self):
        self.model = TFBERTMaxP_Class(self.extractor, self.config)
        return self.model
Example #5
class CEDRKNRM(Reranker):
    """
    PyTorch implementation of CEDR-KNRM.
    Equivalent to BERT-KNRM when cls=None.

    CEDR: Contextualized Embeddings for Document Ranking
    Sean MacAvaney, Andrew Yates, Arman Cohan, and Nazli Goharian. SIGIR 2019.
    https://arxiv.org/pdf/1904.07094
    """

    module_name = "CEDRKNRM"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained",
            "electra-base",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, electra-base, or electra-base-msmarco",
        ),
        ConfigOption("mus",
                     [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
                     "mus",
                     value_type="floatlist"),
        ConfigOption("sigma", 0.1, "sigma"),
        ConfigOption("gradkernels", True, "tune mus and sigmas"),
        ConfigOption(
            "hidden_dropout_prob", 0.1,
            "The dropout probability of BERT-like model's hidden layers."),
        ConfigOption("simmat_layers",
                     "0..12,1",
                     "Layer outputs to include in similarity matrix",
                     value_type="intlist"),
        ConfigOption(
            "combine_hidden", 1024,
            "Hidden size to use with combination FC layer (0 to disable)"),
        ConfigOption("cls", "avg", "Handling of CLS token: avg, max, or None"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = CEDRKNRM_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"],
                       d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"],
                       d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"],
                          d["pos_seg"]).view(-1)
Example #6
class Reranker(ModuleBase):
    """Base class for Reranker modules. The purpose of a Reranker is to predict relevance scores for input documents. Rerankers are generally supervised methods implemented in PyTorch or TensorFlow.

    Modules should provide:
        - a ``build_model`` method that initializes the model used
        - a ``score`` and a ``test`` method that take a representation created by an :class:`~capreolus.extractor.Extractor` module as input and return document scores
        - a ``load_weights`` and a ``save_weights`` method, if the base class' PyTorch methods cannot be used
    """

    module_type = "reranker"
    dependencies = [
        Dependency(key="extractor", module="extractor", name="embedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    def add_summary(self, summary_writer, niter):
        """
        Write custom visualizations/data specific to this reranker to the summary_writer
        """
        for name, weight in self.model.named_parameters():
            summary_writer.add_histogram(name, weight.data.cpu(), niter)
            # summary_writer.add_histogram(f'{name}.grad', weight.grad, niter)

    def save_weights(self, weights_fn, optimizer):
        if not os.path.exists(os.path.dirname(weights_fn)):
            os.makedirs(os.path.dirname(weights_fn))

        d = {
            k: v
            for k, v in self.model.state_dict().items()
            if ("embedding.weight" not in k and "_nosave_" not in k)
        }
        with open(weights_fn, "wb") as outf:
            pickle.dump(d, outf, protocol=-1)

        optimizer_fn = weights_fn.as_posix() + ".optimizer"
        with open(optimizer_fn, "wb") as outf:
            pickle.dump(optimizer.state_dict(), outf, protocol=-1)

    def load_weights(self, weights_fn, optimizer):
        with open(weights_fn, "rb") as f:
            d = pickle.load(f)

        cur_keys = set(k for k in self.model.state_dict().keys()
                       if not ("embedding.weight" in k or "_nosave_" in k))
        missing = cur_keys - set(d.keys())
        if len(missing) > 0:
            raise RuntimeError(
                "loading state_dict with keys that do not match current model: %s"
                % missing)

        self.model.load_state_dict(d, strict=False)

        optimizer_fn = weights_fn.as_posix() + ".optimizer"
        with open(optimizer_fn, "rb") as f:
            optimizer.load_state_dict(pickle.load(f))
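To make the contract above concrete, the sketch below shows roughly how a trainer could drive these methods with a pairwise hinge loss. This is a simplified illustration, not the capreolus trainer; the batch keys depend on the configured extractor, and the real loop and loss live in the trainer modules.

import torch

def train_step(reranker, batch, optimizer):
    # batch is the dict produced by the reranker's extractor (e.g., "query", "posdoc", "negdoc")
    reranker.model.train()
    optimizer.zero_grad()
    pos_scores, neg_scores = reranker.score(batch)  # two (batch,) tensors of document scores
    loss = torch.nn.functional.relu(1.0 - pos_scores + neg_scores).mean()  # pairwise hinge loss
    loss.backward()
    optimizer.step()
    return loss.item()

def predict_step(reranker, batch):
    reranker.model.eval()
    with torch.no_grad():
        return reranker.test(batch)  # scores for the positive/candidate documents only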
Example #7
class TK(Reranker):
    """Sebastian Hofstätter, Markus Zlabinger, and Allan Hanbury. 2019. TU Wien @ TREC Deep Learning '19 -- Simple Contextualization for Re-ranking. In TREC '19."""

    module_name = "TK"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption(
            "scoretanh", False,
            "use a tanh on the prediction as in the paper (True) or no nonlinearity (False)"
        ),
        ConfigOption(
            "singlefc", True,
            "use a single fully connected layer as in the paper (True) or two fully connected layers (False)"
        ),
        ConfigOption("projdim", 32),
        ConfigOption("ffdim", 100),
        ConfigOption("numlayers", 2),
        ConfigOption("numattheads", 10),
        ConfigOption("alpha", 0.5),
        ConfigOption("usemask", False),
        ConfigOption("usemixer", False),
        ConfigOption(
            "finetune", False,
            "fine tune the embedding layer"),  # TODO check save when True
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = TK_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
Example #8
class ConvKNRM(Reranker):
    """Zhuyun Dai, Chenyan Xiong, Jamie Callan, and Zhiyuan Liu. 2018. Convolutional Neural Networks for Soft-Matching N-Grams in Ad-hoc Search. In WSDM'18."""

    module_name = "ConvKNRM"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption("maxngram", 3, "maximum ngram length considered"),
        ConfigOption(
            "crossmatch", True,
            "match query and document ngrams of different lengths (e.g., bigram vs. unigram)"
        ),
        ConfigOption("filters", 128,
                     "number of filters used in convolutional layers"),
        ConfigOption(
            "scoretanh", False,
            "use a tanh on the prediction as in the paper (True) or no nonlinearity (False)"
        ),
        ConfigOption(
            "singlefc", True,
            "use a single fully connected layer as in the paper (True) or two fully connected layers (False)"
        ),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = ConvKNRM_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)
Example #9
class TutorialTask(Task):
    module_name = "tutorial"
    config_spec = [
        ConfigOption("optimize", "map",
                     "metric to maximize on the validation set")
    ]
    dependencies = [
        Dependency(key="benchmark",
                   module="benchmark",
                   name="nf",
                   provide_this=True,
                   provide_children=["collection"]),
        Dependency(key="searcher1", module="searcher", name="BM25RM3"),
        Dependency(key="searcher2", module="searcher", name="SDM"),
    ]

    commands = ["run"] + Task.help_commands
    default_command = "run"

    def run(self):
        output_dir = self.get_results_path()

        # read the title queries from the chosen benchmark's topic file
        results1 = self.searcher1.query_from_file(
            self.benchmark.get_topics_file(), output_dir / "searcher1")
        results2 = self.searcher2.query_from_file(
            self.benchmark.get_topics_file(), output_dir / "searcher2")
        searcher_results = [results1, results2]

        # using the benchmark's folds, which each contain train/validation/test queries,
        # choose the best run in `output_dir` for the fold based on the validation queries
        # and return metrics calculated on the test queries
        best_results = evaluator.search_best_run(
            searcher_results,
            self.benchmark,
            primary_metric=self.config["optimize"],
            metrics=evaluator.DEFAULT_METRICS)

        for fold, path in best_results["path"].items():
            shortpath = "..." + path[-40:]
            logger.info("fold=%s best run: %s", fold, shortpath)

        logger.info("cross-validated results when optimizing for '%s':",
                    self.config["optimize"])
        for metric, score in sorted(best_results["score"].items()):
            logger.info("%15s: %0.4f", metric, score)

        return best_results
Example #10
class AxiomaticSemanticMatching(Searcher, AnseriniSearcherMixIn):
    """ Anserini BM25 with Axiomatic query expansion. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "axiomatic"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption("k1",
                     0.9,
                     "controls term saturation",
                     value_type="floatlist"),
        ConfigOption("b",
                     0.4,
                     "controls document length normalization",
                     value_type="floatlist"),
        ConfigOption("r", 20, value_type="intlist"),
        ConfigOption("n", 30, value_type="intlist"),
        ConfigOption("beta", 0.4, value_type="floatlist"),
        ConfigOption("top", 20, value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        hits = str(config["hits"])

        anserini_param_str = "-axiom -axiom.deterministic -axiom.r {0} -axiom.n {1} -axiom.beta {2} -axiom.top {3}".format(
            *[list2str(config[k], " ") for k in ["r", "n", "beta", "top"]])
        anserini_param_str += " -bm25 -bm25.k1 {0} -bm25.b {1} ".format(
            *[list2str(config[k], " ") for k in ["k1", "b"]])
        anserini_param_str += f" -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
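The two parameter spellings mentioned in the docstring are equivalent: "0.4..1,0.2" denotes a range from 0.4 to 1.0 in steps of 0.2, i.e. the same values as "0.4,0.6,0.8,1.0". A rough illustration of expanding either form into floats (this is a sketch of the notation only, not the actual capreolus parser):

def expand_floatlist(spec):
    """Illustrative only: '0.4,0.6,0.8,1.0' or '0.4..1,0.2' -> [0.4, 0.6, 0.8, 1.0]."""
    if ".." in spec:
        start_s, rest = spec.split("..")
        end_s, step_s = rest.split(",")
        start, end, step = float(start_s), float(end_s), float(step_s)
        values, v = [], start
        while v <= end + 1e-9:  # tolerate float rounding at the endpoint
            values.append(round(v, 10))
            v += step
        return values
    return [float(x) for x in spec.split(",")]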
Example #11
class MSMarcoPassageKeywords(MSMarcoPassage):
    module_name = "msmarcopsg_keywords"
    dependencies = MSMarcoPassage.dependencies + [
        Dependency(
            key="tokenizer",
            module="tokenizer",
            name="anserini",
            default_config_overrides={
                "keepstops": False,
                "stemmer": "none"
            },  # don't keepStops, don't stem
        ),
    ]

    topic_file = MSMarcoPassage.data_dir / "topics.keyword.txt"

    def download_if_missing(self):
        super().download_if_missing()
        full_topic_file = super().topic_file
        assert full_topic_file.exists()
        if self.topic_file.exists():
            return

        title = load_trec_topics(full_topic_file)["title"]
        with open(self.topic_file, "w") as f:
            for qid, full_topic in title:
                kw_topic = self.tokenizer.tokenize(full_topic)
                f.write(topic_to_trectxt(qid, kw_topic))
Example #12
class BM25Grid(Searcher, AnseriniSearcherMixIn):
    """ Deprecated. BM25 with a grid search for k1 and b. Search is from 0.1 to bmax/k1max in 0.1 increments """

    module_name = "BM25Grid"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption(
            "k1max", 1.0,
            "maximum k1 value to include in grid search (starting at 0.1)"),
        ConfigOption(
            "bmax", 1.0,
            "maximum b value to include in grid search (starting at 0.1)"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        bs = np.around(np.arange(0.1, config["bmax"] + 0.1, 0.1), 1)
        k1s = np.around(np.arange(0.1, config["k1max"] + 0.1, 0.1), 1)
        bstr = " ".join(str(x) for x in bs)
        k1str = " ".join(str(x) for x in k1s)
        hits = config["hits"]
        anserini_param_str = f"-bm25 -bm25.b {bstr} -bm25.k1 {k1str} -hits {hits}"

        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
Example #13
class DirichletQL(Searcher, AnseriniSearcherMixIn):
    """ Anserini QL with Dirichlet smoothing. This searcher's parameters can also be specified as lists indicating parameters to grid search (e.g., ``"0.4,0.6,0.8,1.0"`` or ``"0.4..1,0.2"``). """

    module_name = "DirichletQL"
    dependencies = [Dependency(key="index", module="index", name="anserini")]

    config_spec = [
        ConfigOption("mu", 1000, "smoothing parameter", value_type="intlist"),
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        """
        Runs Dirichlet QL search. Takes a query from the topic files, and fires it against the index
        Args:
            topicsfn: Path to a topics file
            output_path: Path where the results of the search (i.e the run file) should be stored

        Returns: Path to the run file where the results of the search are stored

        """
        mustr = list2str(config["mu"], delimiter=" ")
        hits = config["hits"]
        anserini_param_str = f"-qld -qld.mu {mustr} -hits {hits}"
        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])

        return output_path
Example #14
class CovidAbstract(IRDBenchmark):
    module_name = "covidabstract"
    query_type = "title"
    ird_dataset_names = ["cord19/trec-covid"]
    dependencies = [
        Dependency(key="collection", module="collection", name="covidabstract")
    ]
    fold_file = PACKAGE_PATH / "data" / "covid_random_folds.json"
Example #15
class DUET(Reranker):
    """Bhaskar Mitra, Fernando Diaz, and Nick Craswell. 2017. Learning to Match using Local and Distributed Representations of Text for Web Search. In WWW'17."""

    module_name = "DUET"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("nfilters", 10,
                     "number of filters for both local and distrbuted model"),
        ConfigOption("lmhidden", 30,
                     "ffw hidden layer dimension for local model"),
        ConfigOption("nhidden", 699,
                     "ffw hidden layer dimension for local model"),
        ConfigOption(
            "idfweight", True,
            "whether to weight each query word with its idf value in local model"
        ),
        ConfigOption("dropoutrate", 0.5, "dropout probability"),
        ConfigOption("activation", "relu",
                     "ffw layer activation: tanh or relu"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = DUET_class(self.extractor, self.config)
        return self.model

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return [
            self.model(pos_sentence, query_sentence, query_idf).view(-1),
            self.model(neg_sentence, query_sentence, query_idf).view(-1),
        ]

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]
        return self.model(pos_sentence, query_sentence, query_idf).view(-1)
Example #16
class HINT(Reranker):
    """Yixing Fan, Jiafeng Guo, Yanyan Lan, Jun Xu, Chengxiang Zhai, and Xueqi Cheng. 2018. Modeling Diverse Relevance Patterns in Ad-hoc Retrieval. In SIGIR'18."""

    module_name = "HINT"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption("spatialGRU", 2),
        ConfigOption("LSTMdim", 6),
        ConfigOption("kmax", 10)
    ]

    def score(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence, neg_sentence = d["posdoc"], d["negdoc"]
        return self.model(query_sentence, query_idf, pos_sentence,
                          neg_sentence)

    def test(self, d):
        query_idf = d["query_idf"]
        query_sentence = d["query"]
        pos_sentence = d["posdoc"]

        return self.model.test_forward(query_sentence, query_idf, pos_sentence)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)

    def build_model(self):
        if not hasattr(self, "model"):
            config = dict(self.config)
            config.update(self.extractor.config)
            config["batch"] = self.trainer.config["batch"]
            self.model = HiNT_main(self.extractor, config)

        return self.model
Example #17
class Gov2(IRDBenchmark):
    module_name = "gov2"
    query_type = "title"
    ird_dataset_names = [
        "gov2/trec-tb-2004", "gov2/trec-tb-2005", "gov2/trec-tb-2006"
    ]
    dependencies = [
        Dependency(key="collection", module="collection", name="gov2")
    ]
    fold_file = PACKAGE_PATH / "data" / "gov2_maxp_folds.json"
Example #18
class DeepTileBar(Reranker):
    """Zhiwen Tang and Grace Hui Yang. 2019. DeepTileBars: Visualizing Term Distribution for Neural Information Retrieval. In AAAI'19."""

    module_name = "DeepTileBar"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="deeptiles"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    config_spec = [
        ConfigOption("passagelen", 30),
        ConfigOption("numberfilter", 3),
        ConfigOption("lstmhiddendim", 3),
        ConfigOption("linearhiddendim1", 32),
        ConfigOption("linearhiddendim2", 16),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            config = copy.copy(dict(self.config))
            config["batch"] = self.trainer.config["batch"]
            self.model = DeepTileBar_class(self.extractor, config)

        return self.model

    def score(self, d):
        pos_tile_matrix = torch.cat(
            [d["posdoc"][i] for i in range(len(d["qid"]))])  # 32 x
        neg_tile_matrix = torch.cat(
            [d["negdoc"][i] for i in range(len(d["qid"]))])
        return self.model(pos_tile_matrix, neg_tile_matrix)

    def test(self, d):
        qids = d["qid"]
        pos_sentence = d["posdoc"]
        pos_tile_matrix = torch.cat(
            [pos_sentence[i] for i in range(len(qids))])

        return self.model.test_forward(pos_tile_matrix)

    def zero_grad(self, *args, **kwargs):
        self.model.zero_grad(*args, **kwargs)
Example #19
class Genomics(IRDBenchmark):
    module_name = "genomics"
    query_type = "text"
    ird_dataset_names = [
        "highwire/trec-genomics-2006", "highwire/trec-genomics-2007"
    ]
    dependencies = [
        Dependency(key="collection", module="collection", name="highwire")
    ]
    fold_file = PACKAGE_PATH / "data" / "genomics_5folds.json"
Example #20
class Birch(Reranker):
    module_name = "birch"

    config_spec = [
        ConfigOption("topk", 3, "top k scores to use"),
        ConfigOption(
            "hidden", 0,
            "size of hidden layer or 0 to take the weighted sum of the topk"),
        ConfigOption("finetune", False, "fine-tune the BERT model"),
        ConfigOption(
            "pretrained", "msmarco_mb",
            "pretrained Birch model to load: mb, msmarco_mb, or car_mb"),
    ]
    dependencies = [
        Dependency(
            key="extractor",
            module="extractor",
            name="bertpassage",
            default_config_overrides={
                "tokenizer": {
                    "pretrained": "bert-large-uncased"
                }
            },
        ),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]

    def build_model(self):
        self.model = Birch_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_seg"],
                       d["pos_mask"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_seg"],
                       d["neg_mask"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_seg"],
                          d["pos_mask"]).view(-1)
Example #21
class PTParade(Reranker):
    """
    PyTorch implementation of PARADE.

    PARADE: Passage Representation Aggregation for Document Reranking.
    Canjia Li, Andrew Yates, Sean MacAvaney, Ben He, and Yingfei Sun. arXiv 2020.
    https://arxiv.org/pdf/2008.09093.pdf
    """

    module_name = "ptparade"

    dependencies = [
        Dependency(key="extractor",
                   module="extractor",
                   name="pooledbertpassage"),
        Dependency(key="trainer", module="trainer", name="pytorch"),
    ]
    config_spec = [
        ConfigOption(
            "pretrained", "bert-base-uncased",
            "Pretrained model: bert-base-uncased, bert-base-msmarco, or electra-base-msmarco"
        ),
        ConfigOption("aggregation", "transformer"),
    ]

    def build_model(self):
        if not hasattr(self, "model"):
            self.model = PTParade_Class(self.extractor, self.config)
        return self.model

    def score(self, d):
        return [
            self.model(d["pos_bert_input"], d["pos_mask"],
                       d["pos_seg"]).view(-1),
            self.model(d["neg_bert_input"], d["neg_mask"],
                       d["neg_seg"]).view(-1),
        ]

    def test(self, d):
        return self.model(d["pos_bert_input"], d["pos_mask"],
                          d["pos_seg"]).view(-1)
Example #22
class GOV2Benchmark(Benchmark):
    module_name = "gov2benchmark"
    query_type = "title"

    qrel_file = DATA_PATH / module_name / "qrels.gov2.txt"
    topic_file = DATA_PATH / module_name / "topics.gov2.701-750.751-800.801-850.txt"
    fold_file = DATA_PATH / module_name / "gov2.json"
    dependencies = [
        Dependency(key="collection",
                   module="collection",
                   name="gov2collection")
    ]
Example #23
class TFKNRM(Reranker):
    """TensorFlow implementation of KNRM.

    Chenyan Xiong, Zhuyun Dai, Jamie Callan, Zhiyuan Liu, and Russell Power. 2017. End-to-End Neural Ad-hoc Ranking with Kernel Pooling. In SIGIR'17.
    """

    module_name = "TFKNRM"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="slowembedtext"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption("gradkernels", True, "backprop through mus and sigmas"),
        ConfigOption("finetune", False, "fine tune the embedding layer"),  # TODO check save when True
    ]

    def build_model(self):
        self.model = TFKNRM_Class(self.extractor, self.config)

        return self.model
Example #24
class TFVanillaBERT(Reranker):
    """
    TensorFlow implementation of Vanilla BERT.
    Input is of the form [CLS] sentence A [SEP] sentence B [SEP].
    The "score" of a query (sentence A) - document (sentence B) pair is the probability that the document is relevant
    to the query. This is computed by a linear classifier attached to BERT's last layer, using logits[1] as the score.
    """

    module_name = "TFVanillaBERT"

    dependencies = [
        Dependency(key="extractor", module="extractor", name="bertpassage"),
        Dependency(key="trainer", module="trainer", name="tensorflow"),
    ]
    config_spec = [
        ConfigOption("pretrained", "bert-base-uncased",
                     "pretrained model to load")
    ]

    def build_model(self):
        self.model = TFVanillaBert_Class(self.extractor, self.config)
        return self.model
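The scoring scheme in the docstring is ordinary sequence-pair classification. A minimal sketch of the same idea using HuggingFace transformers directly (independent of TFVanillaBert_Class and the bertpassage extractor; the example strings are placeholders):

from transformers import BertTokenizerFast, TFBertForSequenceClassification

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# [CLS] query [SEP] document [SEP]; logits[:, 1] serves as the relevance score
inputs = tokenizer("neural ranking models", "BERT rerankers score query-document pairs.",
                   return_tensors="tf", truncation=True, max_length=256)
score = model(inputs).logits[:, 1]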
Example #25
class Robust04Yang19(Benchmark):
    """Robust04 benchmark using the folds from Yang et al. [1]

    [1] Wei Yang, Kuang Lu, Peilin Yang, and Jimmy Lin. 2019. Critically Examining the "Neural Hype": Weak Baselines and the Additivity of Effectiveness Gains from Neural Ranking Models. SIGIR 2019.
    """

    module_name = "robust04.yang19"
    dependencies = [
        Dependency(key="collection", module="collection", name="robust04")
    ]
    qrel_file = PACKAGE_PATH / "data" / "qrels.robust2004.txt"
    topic_file = PACKAGE_PATH / "data" / "topics.robust04.301-450.601-700.txt"
    fold_file = PACKAGE_PATH / "data" / "rob04_yang19_folds.json"
    query_type = "title"
Example #26
class ANTIQUE(Benchmark):
    """A Non-factoid Question Answering Benchmark from Hashemi et al. [1]

    [1] Helia Hashemi, Mohammad Aliannejadi, Hamed Zamani, and W. Bruce Croft. 2020. ANTIQUE: A non-factoid question answering benchmark. ECIR 2020.
    """

    module_name = "antique"
    dependencies = [
        Dependency(key="collection", module="collection", name="antique")
    ]
    qrel_file = PACKAGE_PATH / "data" / "qrels.antique.txt"
    topic_file = PACKAGE_PATH / "data" / "topics.antique.txt"
    fold_file = PACKAGE_PATH / "data" / "antique.json"
    query_type = "title"
    relevance_level = 2
Example #27
class GovIndex(AnseriniIndex):
    module_name = "gov2index"
    path = ""  # store the anserini index
    dependencies = [
        Dependency(key="collection",
                   module="collection",
                   name="gov2collection")
    ]

    def get_index_path(self):
        return self.path

    def exists(self):
        return True

    def get_doc(self, doc_id):
        return self.collection.get_doc(doc_id)
Example #28
class Robust04(Benchmark):
    """Robust04 benchmark using the title folds from Huston and Croft. [1] Each of these is used as the test set.
    Given the remaining four folds, we split them into the same train and dev sets used in recent work. [2]

    [1] Samuel Huston and W. Bruce Croft. 2014. Parameters learned in the comparison of retrieval models using term dependencies. Technical Report.

    [2] Sean MacAvaney, Andrew Yates, Arman Cohan, Nazli Goharian. 2019. CEDR: Contextualized Embeddings for Document Ranking. SIGIR 2019.
    """

    module_name = "robust04"
    dependencies = [
        Dependency(key="collection", module="collection", name="robust04")
    ]
    qrel_file = PACKAGE_PATH / "data" / "qrels.robust2004.txt"
    topic_file = PACKAGE_PATH / "data" / "topics.robust04.301-450.601-700.txt"
    fold_file = PACKAGE_PATH / "data" / "rob04_cedr_folds.json"
    query_type = "title"
Example #29
class CDS(IRDBenchmark):
    module_name = "cds"
    ird_dataset_names = [
        "pmc/v1/trec-cds-2014", "pmc/v1/trec-cds-2015", "pmc/v2/trec-cds-2016"
    ]
    dependencies = [
        Dependency(key="collection", module="collection", name="cds")
    ]
    fold_file = PACKAGE_PATH / "data" / "cds_5folds.json"
    query_type = "summary"
    query_types = {}  # maps qid to its query type: diagnosis, treatment, or test

    def build(self):
        self.topics  # accessing the property triggers loading of topics and query_types

    def ird_load_qrels(self):
        qrels = {}
        for name in self.ird_dataset_names:
            year = name.split("-")[-1]
            assert len(year) == 4

            dataset = ir_datasets.load(name)
            for qrel in dataset.qrels_iter():
                qid = year + qrel.query_id
                qrels.setdefault(qid, {})
                qrels[qid][qrel.doc_id] = max(qrel.relevance,
                                              qrels[qid].get(qrel.doc_id, -1))

        return qrels

    def ird_load_topics(self):
        topics = {}
        field = "description" if self.query_type == "desc" else self.query_type

        for name in self.ird_dataset_names:
            year = name.split("-")[-1]
            assert len(year) == 4

            dataset = ir_datasets.load(name)
            for query in dataset.queries_iter():
                qid = year + query.query_id
                topics[qid] = getattr(query, field).replace("\n", " ")
                self.query_types[qid] = query.type

        return {self.query_type: topics}
Example #30
class INL2(Searcher, AnseriniSearcherMixIn):
    """ Anserini I(n)L2 scoring model. This searcher does not support list parameters. """

    module_name = "INL2"
    dependencies = [Dependency(key="index", module="index", name="anserini")]
    config_spec = [
        ConfigOption(
            "c", 0.1
        ),  # array input of this parameter is not support by anserini.SearchCollection
        ConfigOption("hits", 1000, "number of results to return"),
        ConfigOption("fields", "title"),
    ]

    def _query_from_file(self, topicsfn, output_path, config):
        anserini_param_str = "-inl2 -inl2.c {0} -hits {1}".format(
            config["c"], config["hits"])
        self._anserini_query_from_file(topicsfn, anserini_param_str,
                                       output_path, config["fields"])
        return output_path