import collections
import os
from pathlib import Path

import numpy as np
import tensorflow as tf
import torch
from pymagnitude import Magnitude

# NOTE: DummyBenchmark, the rerankers (DSSM, Birch, TFVanillaBERT, TK, CDSSM,
# KNRM, DeepTileBar, HINT, TFKNRM), the extractors (EmbedText, SlowEmbedText,
# DeepTileExtractor), the samplers (TrainTripletSampler, PredSampler), and
# TensorflowTrainer are assumed to be imported from the capreolus package.


def test_dssm_unigram(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    benchmark = DummyBenchmark()
    reranker = DSSM(
        {"nhiddens": "56", "trainer": {"niters": 1, "itersize": 4, "batch": 2}},
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_train_sampler(monkeypatch, tmpdir):
    benchmark = DummyBenchmark()
    extractor = EmbedText(
        {"tokenizer": {"keepstops": True}}, provide={"collection": benchmark.collection, "benchmark": benchmark}
    )
    training_judgments = benchmark.qrels.copy()
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(training_judgments, training_judgments, extractor)

    def mock_id2vec(*args, **kwargs):
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}

    monkeypatch.setattr(EmbedText, "id2vec", mock_id2vec)
    dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
    for idx, batch in enumerate(dataloader):
        assert len(batch["query"]) == 32
        assert len(batch["posdoc"]) == 32
        assert len(batch["negdoc"]) == 32
        assert np.array_equal(batch["query"][0], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["query"][30], np.array([1, 2, 3, 4]))
        assert np.array_equal(batch["posdoc"][0], np.array([1, 1, 1, 1]))
        assert np.array_equal(batch["posdoc"][30], np.array([1, 1, 1, 1]))
        assert np.array_equal(batch["negdoc"][0], np.array([2, 2, 2, 2]))
        assert np.array_equal(batch["negdoc"][30], np.array([2, 2, 2, 2]))

        # Just making sure that the dataloader can do multiple iterations
        if idx > 3:
            break
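# Illustrative sketch (not part of the original suite): the shape assertions in
# test_train_sampler rely on torch's default collate_fn stacking dict-of-ndarray
# samples along a new batch dimension. _FixedTripletDataset is a hypothetical
# stand-in for TrainTripletSampler that always yields the same triplet.
class _FixedTripletDataset(torch.utils.data.Dataset):
    def __len__(self):
        return 64

    def __getitem__(self, idx):
        return {"query": np.array([1, 2, 3, 4]), "posdoc": np.array([1, 1, 1, 1]), "negdoc": np.array([2, 2, 2, 2])}


def test_default_collate_stacks_dict_fields():
    batch = next(iter(torch.utils.data.DataLoader(_FixedTripletDataset(), batch_size=32)))
    for key in ("query", "posdoc", "negdoc"):
        # each field becomes a (batch_size, feature_len) tensor
        assert batch[key].shape == (32, 4)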
def test_birch(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    benchmark = DummyBenchmark()
    reranker = Birch(
        {"trainer": {"niters": 1, "itersize": 2, "batch": 2}},
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()
    reranker.searcher_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
def test_tfvanillabert(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    benchmark = DummyBenchmark({"collection": {"name": "dummy"}})
    reranker = TFVanillaBERT(
        {
            "pretrained": "bert-base-uncased",
            "extractor": {
                "name": "bertpassage",
                "usecache": False,
                "maxseqlen": 32,
                "numpassages": 1,
                "passagelen": 15,
                "stride": 5,
                "index": {
                    "name": "anserini",
                    "indexstops": False,
                    "stemmer": "porter",
                    "collection": {"name": "dummy"},
                },
            },
            "trainer": {
                "name": "tensorflow",
                "batch": 1,
                "niters": 1,
                "itersize": 2,
                "lr": 0.001,
                "validatefreq": 1,
                "usecache": False,
                "tpuname": None,
                "tpuzone": None,
                "storage": None,
                "boardname": "default",
                "loss": "pairwise_hinge_loss",
            },
        },
        provide=benchmark,
    )
    reranker.extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()
    reranker.bm25_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, reranker.extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, reranker.extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, "map"
    )
def test_tk(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    def fake_magnitude_embedding(*args, **kwargs):
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    reranker = TK(
        {
            "gradkernels": True,
            "scoretanh": False,
            "singlefc": True,
            "projdim": 32,
            "ffdim": 100,
            "numlayers": 2,
            "numattheads": 4,
            "alpha": 0.5,
            "usemask": False,
            "usemixer": True,
            "finetune": True,
            "trainer": {"niters": 1, "itersize": 4, "batch": 2},
        },
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_CDSSM(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    def fake_magnitude_embedding(*args, **kwargs):
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    reranker = CDSSM(
        {
            "nkernel": 3,
            "nfilter": 1,
            "nhiddens": 30,
            "windowsize": 3,
            "dropoutrate": 0,
            "trainer": {"niters": 1, "itersize": 2, "batch": 1},
        },
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()
    reranker.searcher_scores = {"301": {"LA010189-0001": 2, "LA010189-0002": 1}}

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
def test_knrm_pytorch(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    def fake_load_embeddings(self):
        self.embeddings = np.zeros((1, 50))
        self.stoi = {"<pad>": 0}
        self.itos = {v: k for k, v in self.stoi.items()}

    monkeypatch.setattr(EmbedText, "_load_pretrained_embeddings", fake_load_embeddings)
    benchmark = DummyBenchmark()
    reranker = KNRM(
        {
            "gradkernels": True,
            "scoretanh": False,
            "singlefc": True,
            "finetune": False,
            "trainer": {"niters": 1, "itersize": 4, "batch": 2},
        },
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_deeptilebars(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "_get_pretrained_emb", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    reranker = DeepTileBar(
        {
            "name": "DeepTileBar",
            "passagelen": 30,
            "numberfilter": 3,
            "lstmhiddendim": 3,
            "linearhiddendim1": 32,
            "linearhiddendim2": 16,
            "trainer": {"niters": 1, "itersize": 4, "batch": 2},
        },
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_HINT(dummy_index, tmpdir, tmpdir_as_cache, monkeypatch):
    def fake_magnitude_embedding(*args, **kwargs):
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", fake_magnitude_embedding)
    benchmark = DummyBenchmark()
    reranker = HINT(
        {"spatialGRU": 2, "LSTMdim": 6, "kmax": 10, "trainer": {"niters": 1, "itersize": 2, "batch": 1}},
        provide={"index": dummy_index, "benchmark": benchmark},
    )
    extractor = reranker.extractor
    metric = "map"
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    reranker.build_model()

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    dev_dataset = PredSampler()
    dev_dataset.prepare(train_run, benchmark.qrels, extractor)

    reranker.trainer.train(
        reranker, train_dataset, Path(tmpdir) / "train", dev_dataset, Path(tmpdir) / "dev", benchmark.qrels, metric
    )
    assert os.path.exists(Path(tmpdir) / "train" / "dev.best")
def test_tf_find_cached_tf_records(monkeypatch, dummy_index):
    def fake_magnitude_embedding(*args, **kwargs):
        return np.zeros((1, 8), dtype=np.float32), {0: "<pad>"}, {"<pad>": 0}

    monkeypatch.setattr(SlowEmbedText, "_load_pretrained_embeddings", fake_magnitude_embedding)
    reranker = TFKNRM(
        {"gradkernels": True, "finetune": False, "trainer": {"niters": 1, "itersize": 4, "batch": 2}},
        provide={"index": dummy_index},
    )
    extractor = reranker.extractor
    benchmark = DummyBenchmark()
    extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])

    train_run = {"301": ["LA010189-0001", "LA010189-0002"]}
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(train_run, benchmark.qrels, extractor)

    # with niters=1, itersize=4, and batch=2, conversion caches 1 * 4 * 2 = 8 samples
    required_samples = 8
    reranker.trainer.convert_to_tf_train_record(reranker, train_dataset)

    # the cache can serve requests for up to 8 samples, so 24 cannot be satisfied yet
    assert reranker.trainer.find_cached_tf_records(train_dataset, required_samples) is not None
    assert reranker.trainer.find_cached_tf_records(train_dataset, required_samples - 4) is not None
    assert reranker.trainer.find_cached_tf_records(train_dataset, 24) is None

    # re-converting with batch=6 caches 1 * 4 * 6 = 24 samples, so larger requests succeed
    reranker = TFKNRM(
        {"gradkernels": True, "finetune": False, "trainer": {"niters": 1, "itersize": 4, "batch": 6}},
        provide={"index": dummy_index},
    )
    reranker.extractor.preprocess(["301"], ["LA010189-0001", "LA010189-0002"], benchmark.topics[benchmark.query_type])
    train_dataset.prepare(train_run, benchmark.qrels, extractor)
    reranker.trainer.convert_to_tf_train_record(reranker, train_dataset)

    assert reranker.trainer.find_cached_tf_records(train_dataset, 24) is not None
    assert reranker.trainer.find_cached_tf_records(train_dataset, 18) is not None
def test_tf_get_tf_dataset(monkeypatch):
    benchmark = DummyBenchmark()
    extractor = SlowEmbedText(
        {"maxdoclen": 4, "maxqlen": 4, "tokenizer": {"keepstops": True}},
        provide={"collection": benchmark.collection},
    )
    training_judgments = benchmark.qrels.copy()
    train_dataset = TrainTripletSampler()
    train_dataset.prepare(training_judgments, training_judgments, extractor)
    reranker = collections.namedtuple("reranker", "extractor")(extractor=extractor)

    def mock_id2vec(*args, **kwargs):
        # np.long and np.float were removed from recent numpy; use explicit dtypes
        return {
            "query": np.array([1, 2, 3, 4], dtype=np.int64),
            "posdoc": np.array([1, 1, 1, 1], dtype=np.int64),
            "negdoc": np.array([2, 2, 2, 2], dtype=np.int64),
            "qid": "1",
            "posdocid": "posdoc1",
            "negdocid": "negdoc1",
            "query_idf": np.array([0.1, 0.1, 0.2, 0.1], dtype=np.float32),
        }

    monkeypatch.setattr(SlowEmbedText, "id2vec", mock_id2vec)
    trainer = TensorflowTrainer(
        {
            "name": "tensorflow",
            "batch": 2,
            "niters": 2,
            "itersize": 16,
            "lr": 0.001,
            "validatefreq": 1,
            "usecache": False,
            "tpuname": None,
            "tpuzone": None,
            "storage": None,
        }
    )

    tf_record_filenames = trainer.convert_to_tf_train_record(reranker, train_dataset)
    for filename in tf_record_filenames:
        assert os.path.isfile(filename)

    dataset = trainer.load_tf_train_records_from_file(reranker, tf_record_filenames, 2)
    for idx, data_and_label in enumerate(dataset):
        batch, _ = data_and_label
        # each record round-trips as (posdoc, negdoc, query, query_idf)
        tf.debugging.assert_equal(batch[0], tf.convert_to_tensor(np.array([[1, 1, 1, 1], [1, 1, 1, 1]]), dtype=tf.int64))
        tf.debugging.assert_equal(batch[1], tf.convert_to_tensor(np.array([[2, 2, 2, 2], [2, 2, 2, 2]]), dtype=tf.int64))
        tf.debugging.assert_equal(batch[2], tf.convert_to_tensor(np.array([[1, 2, 3, 4], [1, 2, 3, 4]]), dtype=tf.int64))
        tf.debugging.assert_equal(
            batch[3], tf.convert_to_tensor(np.array([[0.1, 0.1, 0.2, 0.1], [0.1, 0.1, 0.2, 0.1]]), dtype=tf.float32)
        )
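# Minimal, self-contained sketch (not the capreolus implementation) of the
# TFRecord round trip that convert_to_tf_train_record and
# load_tf_train_records_from_file perform above: serialize feature dicts to
# disk, then parse them back into fixed-shape batches.
def test_tfrecord_roundtrip_sketch(tmpdir):
    filename = str(Path(tmpdir) / "sketch.tfrecord")
    with tf.io.TFRecordWriter(filename) as writer:
        for _ in range(4):
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={"query": tf.train.Feature(int64_list=tf.train.Int64List(value=[1, 2, 3, 4]))}
                )
            )
            writer.write(example.SerializeToString())

    # parsing spec must match the feature name and length used when writing
    feature_spec = {"query": tf.io.FixedLenFeature([4], tf.int64)}
    dataset = (
        tf.data.TFRecordDataset(filename)
        .map(lambda record: tf.io.parse_single_example(record, feature_spec))
        .batch(2)
    )
    for batch in dataset:
        tf.debugging.assert_equal(
            batch["query"], tf.convert_to_tensor(np.array([[1, 2, 3, 4], [1, 2, 3, 4]]), dtype=tf.int64)
        )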