def get_most_relevant_doc_based_on_config(config, query_string, target_index):
    """
    1. Instantiate various classes based on config
    2. Get the most relevant doc
    """
    # We still need to init a pipeline because it pre-processes some config params, and we rely on that to
    # construct paths etc.
    config = config.copy()  # because we end up modifying config
    pipeline = Pipeline(config)
    pipeline.initialize(config)
    path_dict = pipeline.get_paths(config)
    index_path = target_index
    index_class = Index.get_index_from_index_path(index_path)
    index = index_class(pipeline.collection, index_path, None)  # TODO: Pass a proper index_key
    model_class = Reranker.ALL[config["reranker"]]
    tokenizer = NeuralQueryView.get_tokenizer(pipeline, config, index_class.name)
    embedding_holder = EmbeddingHolder.get_instance(config.get("embeddings", "glove6b"))
    trained_weight_path = path_dict["trained_weight_path"]
    config = NeuralQueryView.add_model_required_params_to_config(config, embedding_holder)

    return NeuralQueryView.do_query(
        config,
        query_string,
        pipeline,
        index,
        tokenizer,
        embedding_holder,
        model_class,
        trained_weight_path=trained_weight_path,
    )
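
# Illustrative usage sketch, not from the original source: the config values and the index
# path below are hypothetical placeholders; only the "reranker" and "embeddings" keys are
# read directly by get_most_relevant_doc_based_on_config() above, and the remaining keys
# are pre-processed by Pipeline.
#
# example_config = {"collection": "robust04", "reranker": "KNRM", "embeddings": "glove6b"}
# results = get_most_relevant_doc_based_on_config(example_config, "hubble space telescope", "/path/to/index")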
def test_get_parameters_to_module():
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")
    parameters_to_module = pipeline.get_parameters_to_module(ex)

    assert parameters_to_module == {
        "collection": "module",
        "index": "module",
        "searcher": "module",
        "benchmark": "module",
        "reranker": "module",
        "expid": "stateless",
        "earlystopping": "stateless",
        "predontrain": "stateless",
        "fold": "stateless",
        "maxdoclen": "pipeline",
        "maxqlen": "pipeline",
        "batch": "pipeline",
        "niters": "pipeline",
        "itersize": "pipeline",
        "gradacc": "pipeline",
        "lr": "pipeline",
        "seed": "pipeline",
        "sample": "pipeline",
        "softmaxloss": "pipeline",
        "dataparallel": "pipeline",
    }
def _train(_config):
    pipeline_config = _config
    early_stopping = pipeline_config["earlystopping"]
    pipeline = Pipeline(pipeline_config)
    pipeline.initialize(pipeline_config)
    reranker = pipeline.reranker
    benchmark = pipeline.benchmark
    fold = benchmark.folds.get(pipeline.cfg["fold"], None)
    datagen = benchmark.training_tuples(fold["train_qids"])
    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_path = os.path.join(run_path, "weights")
    prepare_batch = functools.partial(
        _prepare_batch_with_strings, device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    )
    batches_per_epoch = pipeline_config["itersize"] // pipeline_config["batch"]
    batches_per_step = pipeline_config.get("gradacc", 1)
    optimizer = reranker.get_optimizer()
    best_accuracy = 0

    for niter in range(pipeline.cfg["niters"]):
        reranker.model.train()
        reranker.next_iteration()

        for bi, data in enumerate(datagen):
            data = prepare_batch(data)
            tag_scores = reranker.score(data)
            loss = pipeline.lossf(tag_scores[0], tag_scores[1], pipeline.cfg["batch"])
            loss.backward()

            if bi % batches_per_step == 0:
                optimizer.step()
                optimizer.zero_grad()

            if (bi + 1) % batches_per_epoch == 0:
                break

        if early_stopping:
            current_accuracy = max(evaluate_pipeline(pipeline))
            if current_accuracy > best_accuracy:
                logger.debug("Current accuracy: {0} is greater than best so far: {1}".format(current_accuracy, best_accuracy))
                best_accuracy = current_accuracy
                reranker.save(os.path.join(weight_path, "dev"))

    # TODO: Do early stopping to return the best instance of the reranker
    if early_stopping:
        reranker.load(os.path.join(weight_path, "dev"))

    return pipeline
def test_get_module_to_class():
    pipeline = Pipeline({})
    module_choices = {"reranker": "KNRM"}  # default is PACRR
    module2class = pipeline.get_module_to_class({})

    assert module2class["collection"].__class__ == Collection
    assert module2class["index"].__class__ == AnseriniIndex.__class__
    assert module2class["searcher"].__class__ == BM25Grid.__class__
    assert module2class["benchmark"].__class__ == Robust04Benchmark.__class__
    assert module2class["reranker"].__class__ == PACRR.__class__

    module2class = pipeline.get_module_to_class(module_choices)
    assert module2class["reranker"].__class__ == KNRM.__class__
def test_check_for_invalid_keys():
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")
    pipeline.check_for_invalid_keys()

    pipeline.parameters_to_module["foo_bar"] = "reranker"
    with pytest.raises(ValueError):
        pipeline.check_for_invalid_keys()
def test_query_view_get_most_relevant_doc(trec_index, anserini_tokenizer, embedding_holder, mocker):
    @property
    def mock_qrels(collection, *args):
        collection._qrels = {"q_s1": {"doc_1": "LA010189-0001"}}
        return collection._qrels

    @property
    def mock_topics(collection, *args):
        collection._topics = {"title": {"q_1": "Dummy Doc"}}
        return collection._topics

    mocker.patch.object(Collection, "qrels", mock_qrels)
    mocker.patch.object(Collection, "topics", mock_topics)

    query_string = "world"
    _, docs = BM25View.do_query("world", trec_index, 5)
    all_tokens = NeuralQueryView.get_tokens_from_docs_and_query(anserini_tokenizer, docs, query_string)
    embedding_holder.create_indexed_embedding_layer_from_tokens(all_tokens)
    config = {
        "maxdoclen": 10,
        "maxqlen": 5,
        "gradkernels": True,
        "singlefc": True,
        "scoretanh": False,
        "pad_token": 0,
        "batch": 3,
    }
    pipeline = Pipeline(dict())
    model_class = KNRM
    result_dicts = NeuralQueryView.do_query(
        config, query_string, pipeline, trec_index, anserini_tokenizer, embedding_holder, model_class
    )
    expected = [
        {"doc_id": "LA010189-0002", "doc": "Dummy Dummy Dummy Hello world, greetings from outer space!", "relevance": 0},
        {"doc_id": "LA010189-0001", "doc": "Dummy Dummy Dummy Hello world, greetings from outer space!", "relevance": 0},
    ]
    assert set([tuple(x.items()) for x in result_dicts]) == set([tuple(x.items()) for x in expected])
def test_get_parameters_to_module_including_missing_and_extractors():
    """
    Calls Pipeline.__init__() which in turn calls
    1. self.get_parameters_to_module
    2. get_parameters_to_module_for_missing_parameters
    3. get_parameters_to_module_for_feature_parameters
    """
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")
    # parameters_to_module, parameter_types = pipeline.get_parameters_to_module_for_missing_parameters(ex)

    assert pipeline.parameters_to_module == {
        "collection": "module",
        "benchmark": "module",
        "reranker": "module",
        "expid": "stateless",
        "predontrain": "stateless",
        "earlystopping": "stateless",
        "maxdoclen": "pipeline",
        "maxqlen": "pipeline",
        "batch": "pipeline",
        "niters": "pipeline",
        "itersize": "pipeline",
        "gradacc": "pipeline",
        "lr": "pipeline",
        "seed": "pipeline",
        "sample": "pipeline",
        "softmaxloss": "pipeline",
        "dataparallel": "pipeline",
        # AnseriniIndex specific config
        "stemmer": "index",
        "indexstops": "index",
        # BM25Grid specific config
        "index": "module",
        # Robust04Benchmark specific config
        "fold": "stateless",
        "searcher": "module",
        "rundocsonly": "benchmark",
        # PACRR specific config
        "mingram": "reranker",
        "maxgram": "reranker",
        "nfilters": "reranker",
        "idf": "reranker",
        "kmax": "reranker",
        "combine": "reranker",
        "nonlinearity": "reranker",
        # EmbedText specific config
        "embeddings": "extractor",
        "keepstops": "extractor",
    }
def test_get_parameter_types(mocker):
    pipeline = Pipeline({})
    ex = sacred.Experiment("capreolus")

    def mock_config(method_that_generates_input_dict):
        input_dict = method_that_generates_input_dict()
        # Just messing with the types to make sure that get_parameter_types does what it should
        input_dict.update({"index": None, "niters": True})
        return lambda: input_dict

    mocker.patch.object(ex, "config", mock_config)
    parameter_types = pipeline.get_parameter_types(ex)

    assert parameter_types == {
        "pipeline": type("string"),  # the "pipeline" key is added by the method
        "collection": type("robust04"),
        "earlystopping": forced_types[type(True)],
        "index": forced_types[type(None)],
        "searcher": type("bm25grid"),
        "benchmark": type("robust04.title.wsdm20demo"),
        "reranker": type("PACRR"),
        "expid": type("debug"),
        "predontrain": forced_types[type(True)],
        "fold": type("s1"),
        "maxdoclen": type(800),
        "maxqlen": type(4),
        "batch": type(32),
        "niters": forced_types[type(True)],
        "itersize": type(4096),
        "gradacc": type(1),
        "lr": type(0.001),
        "seed": type(123_456),
        "sample": type("simple"),
        "softmaxloss": forced_types[type(True)],
        "dataparallel": type("none"),
    }
def train_pipeline(pipeline_config, data_sources=None, early_stopping=False):
    pipeline = Pipeline(pipeline_config)

    # Ugly hack
    pipeline_config["earlystopping"] = early_stopping
    collection_name = pipeline_config["collection"]
    validate_datasources(data_sources)

    if data_sources is not None and data_sources.get("qrels") is not None:
        COLLECTIONS[collection_name].set_qrels(data_sources["qrels"])
    if data_sources is not None and data_sources.get("topics") is not None:
        COLLECTIONS[collection_name].set_topics(data_sources["topics"])
    if data_sources is not None and data_sources.get("documents") is not None:
        COLLECTIONS[collection_name].set_documents(data_sources["documents"])

    pipeline.ex.main(_train)
    run = pipeline.ex.run(config_updates=pipeline_config)
    return run.result
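
# Illustrative call sketch, not from the original source: the data_sources keys mirror the
# ones checked above ("qrels", "topics", "documents"), but the nested structures, ids, and
# config values are hypothetical placeholders.
#
# data_sources = {
#     "qrels": {"q_1": {"LA010189-0001": 1}},
#     "topics": {"q_1": "hubble space telescope"},
#     "documents": {"LA010189-0001": "Hello world, greetings from outer space!"},
# }
# result = train_pipeline(
#     {"collection": "robust04", "reranker": "KNRM", "niters": 1},
#     data_sources=data_sources,
#     early_stopping=True,
# )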
def test_convknrm(monkeypatch, tmpdir):
    monkeypatch.setenv("CAPREOLUS_RESULTS", str(os.path.join(tmpdir, "results")))
    monkeypatch.setenv("CAPREOLUS_CACHE", str(os.path.join(tmpdir, "cache")))

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    pipeline = Pipeline({"reranker": "ConvKNRM", "niters": 1, "benchmark": "dummy", "itersize": 1, "batch": 1})
    pipeline.ex.main(train.train)
    monkeypatch.setattr(train, "pipeline", pipeline)
    monkeypatch.setattr(EmbedText, "get_magnitude_embeddings", fake_magnitude_embedding)
    pipeline.ex.run(config_updates={"reranker": "ConvKNRM", "niters": 1, "benchmark": "dummy", "itersize": 1, "batch": 1})
    logger.info("Base path is {0}".format(pipeline.base_path))

    config_files = search_files_or_folders_in_directory(pipeline.base_path, "config.json")
    assert len(config_files) == 1
    config_file = json.load(open(config_files[0], "rt"))
    assert config_file["reranker"] == "ConvKNRM"
    assert config_file["niters"] == 1

    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_dir = os.path.join(run_path, "weights")
    weight_file = search_files_or_folders_in_directory(weight_dir, "dev")
    assert len(weight_file) == 1
def test_deeptilebar(monkeypatch, tmpdir):
    monkeypatch.setenv("CAPREOLUS_RESULTS", str(os.path.join(tmpdir, "results")))
    monkeypatch.setenv("CAPREOLUS_CACHE", str(os.path.join(tmpdir, "cache")))

    def fake_magnitude_embedding(*args, **kwargs):
        return Magnitude(None)

    monkeypatch.setattr(DeepTileExtractor, "get_magnitude_embeddings", fake_magnitude_embedding)
    pipeline = Pipeline(
        {"reranker": "DeepTileBar", "niters": 1, "benchmark": "dummy", "itersize": 1, "batch": 1, "passagelen": "3"}
    )
    pipeline.ex.main(train.train)
    monkeypatch.setattr(train, "pipeline", pipeline)
    pipeline.ex.run(config_updates={"reranker": "DeepTileBar", "niters": 1, "benchmark": "dummy", "itersize": 1, "batch": 1})

    config_files = search_files_or_folders_in_directory(pipeline.base_path, "config.json")
    assert len(config_files) == 1
    config_file = json.load(open(config_files[0], "rt"))
    assert config_file["reranker"] == "DeepTileBar"
    assert config_file["niters"] == 1

    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    weight_dir = os.path.join(run_path, "weights")
    weight_file = search_files_or_folders_in_directory(weight_dir, "dev")
    assert len(weight_file) == 1
import os
import sys
import time

import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.stats import ttest_rel

curr_file_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(curr_file_dir)

from capreolus.reranker.common import pair_hinge_loss, pair_softmax_loss
from capreolus.utils.loginit import get_logger
from capreolus.pipeline import Pipeline, cli_module_choice, modules
from capreolus.searcher import Searcher

logger = get_logger(__name__)  # pylint: disable=invalid-name
plt.switch_backend("agg")

pipeline = Pipeline({module: cli_module_choice(sys.argv, module) for module in modules})
pipeline.ex.logger = logger


@pipeline.ex.main
def train(_config, _run):
    pipeline.initialize(_config)
    reranker = pipeline.reranker
    benchmark = pipeline.benchmark
    logger.debug("initialized pipeline with results path: %s", pipeline.reranker_path)
    post_pipeline_init_time = time.time()
    run_path = os.path.join(pipeline.reranker_path, pipeline.cfg["fold"])
    logger.info("initialized pipeline with results path: %s", run_path)
    post_pipeline_init_time = time.time()
if len(args) == 1:
    return default

if "with" in args:
    # if "with" appears, the command should appear immediately before it
    index = args.index("with") - 1
else:
    # there is no "with", so command must be the last argument
    index = len(args) - 1

# if index points to the program name, no command was provided
if index == 0:
    return default

# index points to the command name
return args[index]


if __name__ == "__main__":
    task_command_str = parse_sacred_command(sys.argv)
    task_command_path = task_command_str.split(".")
    task = task_command_path[0]
    command = ".".join(task_command_path[1:])
    rewritten_args = list(sys.argv)
    task_index = rewritten_args.index(task_command_str)
    rewritten_args[task_index] = command
    pipeline = Pipeline(task, rewritten_args)
    pipeline.run()
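
# Illustrative behaviour sketch, not from the original source: assuming the fragment above is
# the body of the parse_sacred_command(args, default) helper invoked in __main__, the task and
# command strings below are hypothetical but the behaviour follows directly from the code.
#
# parse_sacred_command(["run.py", "rerank.train", "with", "reranker=KNRM"])  # -> "rerank.train"
# parse_sacred_command(["run.py", "rerank.train"])                           # -> "rerank.train"
# parse_sacred_command(["run.py"])                                           # -> default
#
# In __main__, "rerank.train" would then be split into task "rerank" and command "train" before
# the rewritten argv is handed to Pipeline.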