def test_dssm_unigram(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test DSSM: one tiny training iteration must produce a dev.best checkpoint."""
    dummy_benchmark = DummyBenchmark()
    config = {
        "nhiddens": "56",
        "trainer": {"niters": 1, "itersize": 4, "batch": 2},
    }
    reranker = DSSM(config, provide={"index": dummy_index, "benchmark": dummy_benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], dummy_benchmark.topics[dummy_benchmark.query_type]
    )
    reranker.build_model()

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, dummy_benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, dummy_benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        dummy_benchmark.qrels,
        "map",
    )

    # A successful training run checkpoints the best dev-metric weights here.
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
# Example #2
def test_train_sampler(monkeypatch, tmpdir):
    """Verify TrainTripletSampler yields fixed-size, correctly-encoded batches via a torch DataLoader."""
    benchmark = DummyBenchmark()
    extractor = EmbedText(
        {"tokenizer": {"keepstops": True}}, provide={"collection": benchmark.collection, "benchmark": benchmark}
    )
    judgments = benchmark.qrels.copy()
    sampler = TrainTripletSampler()
    sampler.prepare(judgments, judgments, extractor)

    # Stub out feature extraction so every triplet has a known, constant encoding.
    def mock_id2vec(*args, **kwargs):
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)
    loader = torch.utils.data.DataLoader(sampler, batch_size=32)
    expected = {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}
    for batch_idx, batch in enumerate(loader):
        for key, vec in expected.items():
            assert len(batch[key]) == 32
            # Spot-check the first and a late element of each batch field.
            assert np.array_equal(batch[key][0], vec)
            assert np.array_equal(batch[key][30], vec)

        # Just making sure that the dataloader can do multiple iterations
        if batch_idx > 3:
            break
def test_birch(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test Birch: build the model and run one tiny training iteration."""
    benchmark = DummyBenchmark()
    reranker = Birch(
        {"trainer": {"niters": 1, "itersize": 2, "batch": 2}},
        provide={"index": dummy_index, "benchmark": benchmark},
    )

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()
    # Birch reads first-stage scores from this attribute; supply dummy values.
    reranker.searcher_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )
def test_tfvanillabert(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test TFVanillaBERT end to end with the TensorFlow trainer."""
    benchmark = DummyBenchmark({"collection": {"name": "dummy"}})

    extractor_cfg = {
        "name": "bertpassage",
        "usecache": False,
        "maxseqlen": 32,
        "numpassages": 1,
        "passagelen": 15,
        "stride": 5,
        "index": {"name": "anserini", "indexstops": False, "stemmer": "porter", "collection": {"name": "dummy"}},
    }
    trainer_cfg = {
        "name": "tensorflow",
        "batch": 1,
        "niters": 1,
        "itersize": 2,
        "lr": 0.001,
        "validatefreq": 1,
        "usecache": False,
        "tpuname": None,
        "tpuzone": None,
        "storage": None,
        "boardname": "default",
        "loss": "pairwise_hinge_loss",
    }
    reranker = TFVanillaBERT(
        {"pretrained": "bert-base-uncased", "extractor": extractor_cfg, "trainer": trainer_cfg},
        provide=benchmark,
    )

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()
    reranker.bm25_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )
def test_tk(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test the TK reranker: one tiny training iteration must produce dev.best."""

    def stub_embeddings(*args, **kwargs):
        # Avoid downloading real embeddings: a single zero <pad> vector suffices.
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", stub_embeddings)

    benchmark = DummyBenchmark()
    tk_config = {
        "gradkernels": True,
        "scoretanh": False,
        "singlefc": True,
        "projdim": 32,
        "ffdim": 100,
        "numlayers": 2,
        "numattheads": 4,
        "alpha": 0.5,
        "usemask": False,
        "usemixer": True,
        "finetune": True,
        "trainer": {"niters": 1, "itersize": 4, "batch": 2},
    }
    reranker = TK(tk_config, provide={"index": dummy_index, "benchmark": benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )

    # A successful training run checkpoints the best dev-metric weights here.
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_CDSSM(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test CDSSM: build the model and run one tiny training iteration."""

    def stub_embeddings(*args, **kwargs):
        # Avoid downloading real embeddings: a single zero <pad> vector suffices.
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", stub_embeddings)

    benchmark = DummyBenchmark()
    cdssm_config = {
        "nkernel": 3,
        "nfilter": 1,
        "nhiddens": 30,
        "windowsize": 3,
        "dropoutrate": 0,
        "trainer": {"niters": 1, "itersize": 2, "batch": 1},
    }
    reranker = CDSSM(cdssm_config, provide={"index": dummy_index, "benchmark": benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()
    # Supply dummy first-stage scores read via this attribute.
    reranker.searcher_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )
def test_knrm_pytorch(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test the pytorch KNRM reranker; one training iteration must write dev.best."""

    def stub_load_embeddings(self):
        # Avoid downloading real embeddings: a single zero <pad> vector suffices.
        self.embeddings = np.zeros((1, 50))
        self.stoi = {"<pad>": 0}
        self.itos = {v: k for k, v in self.stoi.items()}

    monkeypatch.setattr(EmbedText, "_load_pretrained_embeddings", stub_load_embeddings)

    benchmark = DummyBenchmark()
    knrm_config = {
        "gradkernels": True,
        "scoretanh": False,
        "singlefc": True,
        "finetune": False,
        "trainer": {"niters": 1, "itersize": 4, "batch": 2},
    }
    reranker = KNRM(knrm_config, provide={"index": dummy_index, "benchmark": benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )

    # A successful training run checkpoints the best dev-metric weights here.
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_deeptilebars(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test DeepTileBar; one tiny training iteration must write dev.best."""

    def stub_magnitude(*args, **kwargs):
        # Avoid downloading real embeddings: an empty Magnitude object suffices.
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb", stub_magnitude)
    benchmark = DummyBenchmark()
    dtb_config = {
        "name": "DeepTileBar",
        "passagelen": 30,
        "numberfilter": 3,
        "lstmhiddendim": 3,
        "linearhiddendim1": 32,
        "linearhiddendim2": 16,
        "trainer": {"niters": 1, "itersize": 4, "batch": 2},
    }
    reranker = DeepTileBar(dtb_config, provide={"index": dummy_index, "benchmark": benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )

    # A successful training run checkpoints the best dev-metric weights here.
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_HINT(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    """Smoke-test HINT; one tiny training iteration must write dev.best."""

    def stub_embeddings(*args, **kwargs):
        # Avoid downloading real embeddings: a single zero <pad> vector suffices.
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", stub_embeddings)

    benchmark = DummyBenchmark()
    hint_config = {
        "spatialGRU": 2,
        "LSTMdim": 6,
        "kmax": 10,
        "trainer": {"niters": 1, "itersize": 2, "batch": 1},
    }
    reranker = HINT(hint_config, provide={"index": dummy_index, "benchmark": benchmark})

    reranker.extractor.preprocess(
        ["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type]
    )
    reranker.build_model()

    run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_sampler = TrainTripletSampler()
    train_sampler.prepare(run, benchmark.qrels, reranker.extractor)
    dev_sampler = PredSampler()
    dev_sampler.prepare(run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker,
        train_sampler,
        Path(tmpdir) / "train",
        dev_sampler,
        Path(tmpdir) / "dev",
        benchmark.qrels,
        "map",
    )

    # A successful training run checkpoints the best dev-metric weights here.
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_tf_find_cached_tf_records(monkeypatch, dummy_index):
    """Check that TensorflowTrainer locates cached tf records with enough samples.

    A first TFKNRM (batch=2, itersize=4 -> 8 samples) writes records; lookups for
    up to 8 samples succeed while 24 fails. A second TFKNRM (batch=6 -> 24 samples)
    then writes larger records, after which lookups for 24 and 18 both succeed.
    """

    def fake_magnitude_embedding(*args, **kwargs):
        # Avoid downloading real embeddings: a single zero <pad> vector suffices.
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings",
                        fake_magnitude_embedding)

    reranker = TFKNRM(
        {
            "gradkernels": True,
            "finetune": False,
            "trainer": {"niters": 1, "itersize": 4, "batch": 2},
        },
        provide={"index": dummy_index},
    )
    benchmark = DummyBenchmark()

    reranker.extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"],
                                  benchmark.topics[benchmark.query_type])

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, reranker.extractor)

    required_samples = 8
    reranker.trainer.convert_to_tf_train_record(reranker, train_dataset)
    assert reranker.trainer.find_cached_tf_records(
        train_dataset, required_samples) is not None
    assert reranker.trainer.find_cached_tf_records(
        train_dataset, required_samples - 4) is not None
    assert reranker.trainer.find_cached_tf_records(train_dataset, 24) is None

    reranker = TFKNRM(
        {
            "gradkernels": True,
            "finetune": False,
            "trainer": {"niters": 1, "itersize": 4, "batch": 6},
        },
        provide={"index": dummy_index},
    )
    reranker.extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"],
                                  benchmark.topics[benchmark.query_type])
    # Fix: prepare with the *new* reranker's extractor, not the stale extractor
    # from the first reranker above (it had been preprocessed, but the intent
    # here is to exercise the second reranker's pipeline end to end).
    train_dataset.prepare(train_run, benchmark.qrels, reranker.extractor)
    reranker.trainer.convert_to_tf_train_record(reranker, train_dataset)
    assert reranker.trainer.find_cached_tf_records(train_dataset,
                                                   24) is not None
    assert reranker.trainer.find_cached_tf_records(train_dataset,
                                                   18) is not None
def test_tf_get_tf_dataset(monkeypatch):
    """Round-trip TF train records: convert a sampler to tfrecord files, read them
    back, and verify the decoded batches match the stubbed feature vectors."""
    benchmark = DummyBenchmark()
    extractor = SlowEmbedText(
        {
            "maxdoclen": 4,
            "maxqlen": 4,
            "tokenizer": {
                "keepstops": True
            }
        },
        provide={"collection": benchmark.collection})
    training_judgments = benchmark.qrels.copy()
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(training_judgments, training_judgments, extractor)

    # The trainer only reads `.extractor` from the reranker, so a namedtuple stub suffices.
    reranker = collections.namedtuple("reranker",
                                      "extractor")(extractor=extractor)

    def mock_id2vec(*args, **kwargs):
        # Fix: use np.int64/np.float32 rather than the np.long/np.float aliases,
        # which were deprecated in NumPy 1.20 and removed in 1.24. These dtypes
        # also match the tf.int64/tf.float32 assertions below.
        return {
            "query": np.array([1, 2, 3, 4], dtype=np.int64),
            "posdoc": np.array([1, 1, 1, 1], dtype=np.int64),
            "negdoc": np.array([2, 2, 2, 2], dtype=np.int64),
            "qid": "1",
            "posdocid": "posdoc1",
            "negdocid": "negdoc1",
            "query_idf": np.array([0.1, 0.1, 0.2, 0.1], dtype=np.float32),
        }

    monkeypatch.setattr(SlowEmbedText, "id2vec", mock_id2vec)
    trainer = TensorflowTrainer({
        "name": "tensorflow",
        "batch": 2,
        "niters": 2,
        "itersize": 16,
        "lr": 0.001,
        "validatefreq": 1,
        "usecache": False,
        "tpuname": None,
        "tpuzone": None,
        "storage": None,
    })

    tf_record_filenames = trainer.convert_to_tf_train_record(
        reranker, train_dataset)
    for filename in tf_record_filenames:
        assert os.path.isfile(filename)

    tf_record_dataset = trainer.load_tf_train_records_from_file(
        reranker, tf_record_filenames, 2)
    dataset = tf_record_dataset

    # Each decoded batch packs (posdoc, negdoc, query, query_idf) in that order.
    for idx, data_and_label in enumerate(dataset):
        batch, _ = data_and_label
        tf.debugging.assert_equal(
            batch[0],
            tf.convert_to_tensor(np.array([[1, 1, 1, 1], [1, 1, 1, 1]]),
                                 dtype=tf.int64))
        tf.debugging.assert_equal(
            batch[1],
            tf.convert_to_tensor(np.array([[2, 2, 2, 2], [2, 2, 2, 2]]),
                                 dtype=tf.int64))
        tf.debugging.assert_equal(
            batch[2],
            tf.convert_to_tensor(np.array([[1, 2, 3, 4], [1, 2, 3, 4]]),
                                 dtype=tf.int64))
        tf.debugging.assert_equal(
            batch[3],
            tf.convert_to_tensor(np.array([[0.1, 0.1, 0.2, 0.1],
                                           [0.1, 0.1, 0.2, 0.1]]),
                                 dtype=tf.float32))