Example 1
    def get_most_relevant_doc_based_on_config(config, query_string, target_index):
        """
        1. Instantiate various classes based on config
        2. Get the most relevant doc
        """
        # We still need to init a pipeline because it pre-processes some config params, and we rely on that to
        # construct paths etc.
        config = config.copy()  # because we end up modifying config
        pipeline = Pipeline(config)
        pipeline.initialize(config)
        path_dict = pipeline.get_paths(config)
        index_path = target_index
        index_class = Index.get_index_from_index_path(index_path)
        index = index_class(pipeline.collection, index_path, None)  # TODO: Pass a proper index_key
        model_class = Reranker.ALL[config["reranker"]]
        tokenizer = NeuralQueryView.get_tokenizer(pipeline, config, index_class.name)
        embedding_holder = EmbeddingHolder.get_instance(config.get("embeddings", "glove6b"))
        trained_weight_path = path_dict["trained_weight_path"]
        config = NeuralQueryView.add_model_required_params_to_config(config, embedding_holder)

        return NeuralQueryView.do_query(
            config,
            query_string,
            pipeline,
            index,
            tokenizer,
            embedding_holder,
            model_class,
            trained_weight_path=trained_weight_path,
        )
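
A hypothetical call to the helper above, just to show the argument and return shapes; in the original it appears to be defined inside a class, so the bare call and every value below are placeholders rather than repository code.

# Hypothetical usage sketch (placeholder values throughout). The config keys mirror
# other examples in this listing; per the do_query test further below, the result
# is a list of {"doc_id", "doc", "relevance"} dicts.
config = {"collection": "robust04", "reranker": "KNRM", "embeddings": "glove6b"}
results = get_most_relevant_doc_based_on_config(config, "hello world", "/path/to/index")
for entry in results:
    print(entry["doc_id"], entry["relevance"])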
Example 2
def test_get_parameters_to_module():
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")

    parameters_to_module = pipeline.get_parameters_to_module(ex)
    assert parameters_to_module == {
        "collection": "module",
        "index": "module",
        "searcher": "module",
        "benchmark": "module",
        "reranker": "module",
        "expid": "stateless",
        "earlystopping": "stateless",
        "predontrain": "stateless",
        "fold": "stateless",
        "maxdoclen": "pipeline",
        "maxqlen": "pipeline",
        "batch": "pipeline",
        "niters": "pipeline",
        "itersize": "pipeline",
        "gradacc": "pipeline",
        "lr": "pipeline",
        "seed": "pipeline",
        "sample": "pipeline",
        "softmaxloss": "pipeline",
        "dataparallel": "pipeline",
    }
Example 3
def _train(_config):
    pipeline_config = _config
    early_stopping = pipeline_config["earlystopping"]
    pipeline = Pipeline(pipeline_config)
    pipeline.initialize(pipeline_config)
    reranker = pipeline.reranker
    benchmark = pipeline.benchmark
    fold = benchmark.folds.get(pipeline.cfg["fold"], None)
    datagen = benchmark.training_tuples(fold["train_qids"])
    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_path = os.path.join(run_path, "weights")

    prepare_batch = functools.partial(
        _prepare_batch_with_strings,
        device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))
    batches_per_epoch = pipeline_config["itersize"] // pipeline_config["batch"]
    batches_per_step = pipeline_config.get("gradacc", 1)

    optimizer = reranker.get_optimizer()
    best_accuracy = 0

    for niter in range(pipeline.cfg["niters"]):
        reranker.model.train()
        reranker.next_iteration()

        for bi, data in enumerate(datagen):
            data = prepare_batch(data)

            tag_scores = reranker.score(data)
            loss = pipeline.lossf(tag_scores[0], tag_scores[1],
                                  pipeline.cfg["batch"])
            loss.backward()

            if bi % batches_per_step == 0:
                optimizer.step()
                optimizer.zero_grad()

            if (bi + 1) % batches_per_epoch == 0:
                break

        if early_stopping:
            current_accuracy = max(evaluate_pipeline(pipeline))
            if current_accuracy > best_accuracy:
                logger.debug(
                    "Current accuracy: {0} is greater than best so far: {1}".
                    format(current_accuracy, best_accuracy))
                best_accuracy = current_accuracy
                reranker.save(os.path.join(weight_path, "dev"))

    # TODO: Do early stopping to return the best instance of the reranker
    if early_stopping:
        reranker.load(os.path.join(weight_path, "dev"))

    return pipeline
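
The inner loop above implements gradient accumulation: the optimizer steps only once every "gradacc" batches, so several small batches contribute to a single update. Below is a minimal self-contained PyTorch sketch of that pattern, independent of the pipeline classes; scaling the loss by the accumulation count is this sketch's convention, the loop above does not rescale.

import torch

# Minimal gradient-accumulation sketch (not capreolus code): gradients from
# accum_steps batches are summed before a single optimizer step, emulating a
# larger effective batch size.
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()
accum_steps = 4  # plays the role of the "gradacc" config value

optimizer.zero_grad()
for bi in range(16):
    x, y = torch.randn(2, 8), torch.randn(2, 1)
    loss = loss_fn(model(x), y) / accum_steps  # scale so the summed gradients match one big batch
    loss.backward()
    if (bi + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()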
Example 4
def test_get_module_to_class():
    pipeline = Pipeline({})
    module_choices = {"reranker": "KNRM"}  # default is PACRR

    module2class = pipeline.get_module_to_class({})
    assert module2class["collection"].__class__ == Collection
    assert module2class["index"].__class__ == AnseriniIndex.__class__
    assert module2class["searcher"].__class__ == BM25Grid.__class__
    assert module2class["benchmark"].__class__ == Robust04Benchmark.__class__
    assert module2class["reranker"].__class__ == PACRR.__class__

    module2class = pipeline.get_module_to_class(module_choices)
    assert module2class["reranker"].__class__ == KNRM.__class__
Example 5
def test_check_for_invalid_keys():
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")
    pipeline.check_for_invalid_keys()

    pipeline.parameters_to_module["foo_bar"] = "reranker"

    with pytest.raises(ValueError):
        pipeline.check_for_invalid_keys()
Example 6
def test_query_view_get_most_relevant_doc(trec_index, anserini_tokenizer,
                                          embedding_holder, mocker):
    @property
    def mock_qrels(collection, *args):
        collection._qrels = {"q_s1": {"doc_1": "LA010189-0001"}}
        return collection._qrels

    @property
    def mock_topics(collection, *args):
        collection._topics = {"title": {"q_1": "Dummy Doc"}}
        return collection._topics

    mocker.patch.object(Collection, "qrels", mock_qrels)
    mocker.patch.object(Collection, "topics", mock_topics)

    query_string = "world"
    _, docs = BM25View.do_query(query_string, trec_index, 5)
    all_tokens = NeuralQueryView.get_tokens_from_docs_and_query(
        anserini_tokenizer, docs, query_string)
    embedding_holder.create_indexed_embedding_layer_from_tokens(all_tokens)

    config = {
        "maxdoclen": 10,
        "maxqlen": 5,
        "gradkernels": True,
        "singlefc": True,
        "scoretanh": False,
        "pad_token": 0,
        "batch": 3,
    }

    pipeline = Pipeline(dict())
    model_class = KNRM

    result_dicts = NeuralQueryView.do_query(config, query_string, pipeline,
                                            trec_index, anserini_tokenizer,
                                            embedding_holder, model_class)

    expected = [
        {
            "doc_id": "LA010189-0002",
            "doc": "Dummy Dummy Dummy Hello world, greetings from outer space!",
            "relevance": 0,
        },
        {
            "doc_id": "LA010189-0001",
            "doc": "Dummy Dummy Dummy Hello world, greetings from outer space!",
            "relevance": 0,
        },
    ]
    assert {tuple(x.items()) for x in result_dicts} == {tuple(x.items()) for x in expected}
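
The qrels/topics mocks above replace class-level properties wholesale. Below is a minimal standalone sketch of that pytest-mock pattern, outside of capreolus; Thing and its property are made up for illustration.

# Requires pytest and pytest-mock (which provides the "mocker" fixture).
class Thing:
    @property
    def value(self):
        return "real"

def test_patched_property(mocker):
    # Passing a property object as the replacement swaps the class attribute
    # for the duration of the test, exactly like the qrels/topics patches above.
    mocker.patch.object(Thing, "value", property(lambda self: "fake"))
    assert Thing().value == "fake"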
Example 7
def test_get_parameters_to_module_including_missing_and_extractors():
    """
        Calls Pipeline.__init__() which in turn calls
        1. self.get_parameters_to_module
        2. get_parameters_to_module_for_missing_parameters
        3. get_parameters_to_module_for_feature_parameters
    """
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")

    # parameters_to_module, parameter_types = pipeline.get_parameters_to_module_for_missing_parameters(ex)

    assert pipeline.parameters_to_module == {
        "collection": "module",
        "benchmark": "module",
        "reranker": "module",
        "expid": "stateless",
        "predontrain": "stateless",
        "earlystopping": "stateless",
        "maxdoclen": "pipeline",
        "maxqlen": "pipeline",
        "batch": "pipeline",
        "niters": "pipeline",
        "itersize": "pipeline",
        "gradacc": "pipeline",
        "lr": "pipeline",
        "seed": "pipeline",
        "sample": "pipeline",
        "softmaxloss": "pipeline",
        "dataparallel": "pipeline",
        # AnseriniIndex specific config
        "stemmer": "index",
        "indexstops": "index",
        # BM25Grid specific config
        "index": "module",
        # Robust04Benchmark specific config
        "fold": "stateless",
        "searcher": "module",
        "rundocsonly": "benchmark",
        # PACRR specific config
        "mingram": "reranker",
        "maxgram": "reranker",
        "nfilters": "reranker",
        "idf": "reranker",
        "kmax": "reranker",
        "combine": "reranker",
        "nonlinearity": "reranker",
        # EmbedText specific config
        "embeddings": "extractor",
        "keepstops": "extractor",
    }
Example 8
def test_get_parameter_types(mocker):
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")

    def mock_config(method_that_generates_input_dict):
        input_dict = method_that_generates_input_dict()

        # Just messing with the types to make sure that get_parameter_types does what it should
        input_dict.update({"index": None, "niters": True})
        return lambda: input_dict

    mocker.patch.object(ex, "config", mock_config)
    parameter_types = pipeline.get_parameter_types(ex)
    assert parameter_types == {
        "pipeline": type("string"),  # "pipeline" key is added by the method
        "collection": type("robust04"),
        "earlystopping": forced_types[type(True)],
        "index": forced_types[type(None)],
        "searcher": type("bm25grid"),
        "benchmark": type("robust04.title.wsdm20demo"),
        "reranker": type("PACRR"),
        "expid": type("debug"),
        "predontrain": forced_types[type(True)],
        "fold": type("s1"),
        "maxdoclen": type(800),
        "maxqlen": type(4),
        "batch": type(32),
        "niters": forced_types[type(True)],
        "itersize": type(4096),
        "gradacc": type(1),
        "lr": type(0.001),
        "seed": type(123_456),
        "sample": type("simple"),
        "softmaxloss": forced_types[type(True)],
        "dataparallel": type("none"),
    }
Example 9
def train_pipeline(pipeline_config, data_sources=None, early_stopping=False):
    pipeline = Pipeline(pipeline_config)
    # Ugly hack
    pipeline_config["earlystopping"] = early_stopping
    collection_name = pipeline_config["collection"]
    validate_datasources(data_sources)
    if data_sources is not None and data_sources.get("qrels") is not None:
        COLLECTIONS[collection_name].set_qrels(data_sources["qrels"])
    if data_sources is not None and data_sources.get("topics") is not None:
        COLLECTIONS[collection_name].set_topics(data_sources["topics"])
    if data_sources is not None and data_sources.get("documents") is not None:
        COLLECTIONS[collection_name].set_documents(data_sources["documents"])

    pipeline.ex.main(_train)

    run = pipeline.ex.run(config_updates=pipeline_config)
    return run.result
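
A hypothetical call to train_pipeline, just to illustrate the shape of its arguments; the config values echo the test examples in this listing, and the data_sources payloads are placeholders since the exact structures expected by set_qrels / set_topics / set_documents are not shown here.

# Hypothetical usage sketch (not from the repository).
pipeline_config = {
    "collection": "robust04",
    "reranker": "KNRM",
    "benchmark": "dummy",
    "niters": 1,
    "itersize": 1,
    "batch": 1,
}
data_sources = {
    "qrels": {"q_1": {"LA010189-0001": 1}},          # placeholder qrels payload
    "topics": {"q_1": "hello world"},                 # placeholder topics payload
    "documents": {"LA010189-0001": "Hello world, greetings from outer space!"},
}
result = train_pipeline(pipeline_config, data_sources=data_sources, early_stopping=True)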
Example 10
def test_convknrm(monkeypatch, tmpdir):
    monkeypatch.setenv("CAPREOLUS_RESULTS",
                       str(os.path.join(tmpdir, "results")))
    monkeypatch.setenv("CAPREOLUS_CACHE", str(os.path.join(tmpdir, "cache")))

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    pipeline = Pipeline({
        "reranker": "ConvKNRM",
        "niters": 1,
        "benchmark": "dummy",
        "itersize": 1,
        "batch": 1
    })
    pipeline.ex.main(train.train)
    monkeypatch.setattr(train, "pipeline", pipeline)
    monkeypatch.setattr(EmbedText, "get_magnitude_embeddings",
                        fake_magnitude_embedding)
    pipeline.ex.run(
        config_updates={
            "reranker": "ConvKNRM",
            "niters": 1,
            "benchmark": "dummy",
            "itersize": 1,
            "batch": 1
        })
    logger.info("Base path is {0}".format(pipeline.base_path))

    config_files = search_files_or_folders_in_directory(
        pipeline.base_path, "config.json")
    assert len(config_files) == 1
    config_file = json.load(open(config_files[0], "rt"))
    assert config_file["reranker"] == "ConvKNRM"
    assert config_file["niters"] == 1

    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_dir = os.path.join(run_path, "weights")
    weight_file = search_files_or_folders_in_directory(weight_dir, "dev")
    assert len(weight_file) == 1
Example 11
def test_deeptilebar(monkeypatch, tmpdir):
    monkeypatch.setenv("CAPREOLUS_RESULTS",
                       str(os.path.join(tmpdir, "results")))
    monkeypatch.setenv("CAPREOLUS_CACHE", str(os.path.join(tmpdir, "cache")))

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "get_magnitude_embeddings",
                        fake_magnitude_embedding)
    pipeline = Pipeline({
        "reranker": "DeepTileBar",
        "niters": 1,
        "benchmark": "dummy",
        "itersize": 1,
        "batch": 1,
        "passagelen": "3"
    })
    pipeline.ex.main(train.train)
    monkeypatch.setattr(train, "pipeline", pipeline)
    pipeline.ex.run(
        config_updates={
            "reranker": "DeepTileBar",
            "niters": 1,
            "benchmark": "dummy",
            "itersize": 1,
            "batch": 1
        })

    config_files = search_files_or_folders_in_directory(
        pipeline.base_path, "config.json")
    assert len(config_files) == 1
    config_file = json.load(open(config_files[0], "rt"))
    assert config_file["reranker"] == "DeepTileBar"
    assert config_file["niters"] == 1

    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_dir = os.path.join(run_path, "weights")
    weight_file = search_files_or_folders_in_directory(weight_dir, "dev")
    assert len(weight_file) == 1
Example 12
import os
import sys
import time

import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import ttest_rel

curr_file_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(curr_file_dir)

from capreolus.reranker.common import pair_hinge_loss, pair_softmax_loss
from capreolus.utils.loginit import get_logger
from capreolus.pipeline import Pipeline, cli_module_choice, modules
from capreolus.searcher import Searcher

logger = get_logger(__name__)  # pylint: disable=invalid-name
plt.switch_backend("agg")

pipeline = Pipeline(
    {module: cli_module_choice(sys.argv, module)
     for module in modules})
pipeline.ex.logger = logger


@pipeline.ex.main
def train(_config, _run):
    pipeline.initialize(_config)
    reranker = pipeline.reranker
    benchmark = pipeline.benchmark
    logger.debug("initialized pipeline with results path: %s",
                 pipeline.reranker_path)
    post_pipeline_init_time = time.time()
    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    logger.info("initialized pipeline with results path: %s", run_path)
    post_pipeline_init_time = time.time()
Example 13
import sys  # used by the __main__ block below; the excerpt omits the original imports


def parse_sacred_command(args, default=None):
    # Signature reconstructed from the call site in __main__ below; the parameter
    # names come from the body, and the default return value of None is an assumption.
    if len(args) == 1:
        return default

    if "with" in args:
        # if "with" appears, the command should appear immediately before it
        index = args.index("with") - 1
    else:
        # there is no "with", so command must be the last argument
        index = len(args) - 1

    # if index points to the program name, no command was provided
    if index == 0:
        return default

    # index points to the command name
    return args[index]


if __name__ == "__main__":
    task_command_str = parse_sacred_command(sys.argv)
    task_command_path = task_command_str.split(".")
    task = task_command_path[0]
    command = ".".join(task_command_path[1:])

    rewritten_args = list(sys.argv)
    task_index = rewritten_args.index(task_command_str)
    rewritten_args[task_index] = command

    pipeline = Pipeline(task, rewritten_args)
    pipeline.run()
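
A hypothetical walk-through of the parsing above; the task and command names are made up for illustration.

# Hypothetical input: "rank.searcher" plays the role of a "task.command" token.
argv = ["run.py", "rank.searcher", "with", "reranker=KNRM"]
assert parse_sacred_command(argv) == "rank.searcher"
# The __main__ block would then split this into task = "rank" and command = "searcher",
# and rewrite argv to ["run.py", "searcher", "with", "reranker=KNRM"] before
# constructing Pipeline(task, rewritten_args).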