Example #1
def cli(port, workdir, dataset, debug):
    """Runs an experiment"""
    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)

    bm25 = BM25()

    # Sets the working directory and the name of the xp
    with experiment(workdir, "bm25", port=port) as xp:
        # Anserini is Java-based, so JAVA_HOME must be set
        xp.setenv("JAVA_HOME", os.environ["JAVA_HOME"])
        ds = prepare_dataset(dataset)

        # Index the collection
        documents = ds.documents
        index = IndexCollection(
            documents=documents,
            storePositions=True,
            storeDocvectors=True,
            storeContents=True,
            threads=CPU_COUNT,
        ).submit()

        # Search with BM25
        bm25_retriever = AnseriniRetriever(k=1500, index=index,
                                           model=bm25).tag("model", "bm25")

        bm25_eval = Evaluate(dataset=ds, retriever=bm25_retriever).submit()

    print("BM25 results on TREC 1")
    print(bm25_eval.results.read_text())
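The cli functions in these examples take their parameters from the command line, presumably via decorators that are not shown in the snippets. A minimal sketch of how such an entry point could be wired up, assuming the click library is used (the option names and defaults below are assumptions):

import click


@click.command()
@click.option("--debug", is_flag=True, help="Enable debug logging")
@click.option("--port", type=int, default=12345, help="Port of the experimaestro server")
@click.option("--workdir", type=click.Path(), required=True, help="Experiment working directory")
@click.argument("dataset")
def cli(port, workdir, dataset, debug):
    """Runs an experiment"""
    ...  # body as in Example #1


if __name__ == "__main__":
    cli()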
Example #2
def process(processors, debug, gpu, port, workdir, max_epoch, batch_size,
            grad_acc_batch):
    """Runs an experiment"""
    logging.info("Running pipeline")

    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)
    info = Information()
    info.device = device = Device(gpu=gpu)

    # Sets the working directory and the name of the xp
    with experiment(workdir, "neural-ir", port=port) as xpm:
        # Misc settings
        random = Random()
        assert "JAVA_HOME" in os.environ, "JAVA_HOME should be defined (to call anserini)"
        xpm.setenv("JAVA_HOME", os.environ["JAVA_HOME"])

        # Prepare the embeddings
        info.device = device

        for processor in processors:
            processor(info)

        assert info.datasets, "No dataset was selected"
        assert info.rankers, "No model was selected"

        for train, val, test in info.datasets:

            # Search and evaluate with BM25
            bm25_search = (SearchCollection(index=test.index,
                                            topics=test.assessed_topics.topics,
                                            model=BM25()).tag("model",
                                                              "bm25").submit())
            bm25_eval = TrecEval(assessments=test.assessed_topics.assessments,
                                 run=bm25_search).submit()

            # Train and evaluate with each model
            for ranker in info.rankers:
                # Train with OpenNIR DRMM model
                predictor = Reranker(device=device, batch_size=batch_size)
                trainer = PointwiseTrainer(device=device,
                                           grad_acc_batch=grad_acc_batch)
                learner = Learner(trainer=trainer,
                                  random=random,
                                  ranker=ranker,
                                  valid_pred=predictor,
                                  train_dataset=train,
                                  val_dataset=val,
                                  max_epoch=tag(max_epoch))
                model = learner.submit()

                # Evaluate the neural model
                evaluate = Evaluate(dataset=test,
                                    model=model,
                                    predictor=predictor).submit()

        xpm.wait()

        print(f"Results for DRMM\n{evaluate.results.read_text()}\n")
        print(f"Results for BM25\n{bm25_eval.results.read_text()}\n")
Example #3
def cli(port, workdir, debug):
    """Runs an experiment"""
    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)

    bm25 = BM25()

    # Sets the working directory and the name of the xp
    with experiment(workdir, "index", port=port) as xp:
        train_ds = RandomDataset()
Example #4
    def __enter__(self) -> experiment:
        if self.clean_workdir:
            self.workdir = TemporaryDirectory(prefix="xpm", suffix=self.name)
            workdir = self.workdir.__enter__()
        else:
            workdir = self.workdir

        self.experiment = experiment(workdir, self.name, port=self.port)
        self.experiment.__enter__()

        # Set some useful environment variables
        self.experiment.workspace.launcher.setenv(
            "PYTHONPATH", str(Path(__file__).parents[2]))
        self.timeout.__enter__()

        logging.info("Created new temporary experiment (%s)", workdir)
        return self.experiment
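This __enter__ comes from a helper that wraps experiment with an optional temporary working directory and a timeout. A sketch of how such a helper might be used; the class name TemporaryExperiment and its constructor arguments are hypothetical, not taken from the snippet:

# Hypothetical usage; TemporaryExperiment and its arguments are assumed names
with TemporaryExperiment("bm25-test", port=12345) as xp:
    # Tasks submitted here run in a temporary workspace that is deleted when
    # the context exits (the clean_workdir case above)
    ...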
Example #5
    def on_button_clicked(self, b):
        with self.output:
            if experiment.CURRENT:
                # An experiment is already running: stop it
                try:
                    experiment.CURRENT.__exit__(None, None, None)
                except Exception:
                    print("Error while stopping experimaestro")
                self.current = experiment.CURRENT
            else:
                # No experiment running: start one on localhost
                self.current = experiment(self.name,
                                          self.name,
                                          host="localhost",
                                          port=self.port).__enter__()
                for key, value in self.environment.items():
                    self.current.setenv(key, value)
                if self.hook:
                    self.hook(self)

        self.refresh()
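on_button_clicked is written as a Jupyter widget callback: pressing a button stops the current experiment if one is running, or starts a new one otherwise. A minimal sketch of attaching such a handler with ipywidgets; `ui` stands for an instance of the (not shown) class that defines on_button_clicked and is an assumption here:

import ipywidgets as widgets
from IPython.display import display

# `ui` is assumed to be an instance of the class defining on_button_clicked,
# exposing the `output` widget that the handler writes into
button = widgets.Button(description="Start / stop experiment")
button.on_click(ui.on_button_clicked)
display(button, ui.output)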
Example #6
def process(processors, debug, gpu, port, workdir, max_epoch, batch_size):
    """Runs an experiment"""
    logging.info("Running pipeline")

    logging.getLogger().setLevel(logging.DEBUG if debug else logging.INFO)
    info = Information()
    info.device = device = Device(gpu=gpu)

    # Sets the working directory and the name of the xp
    with experiment(workdir, "neural-ir", port=port) as xpm:
        # Misc settings
        # Prepare the embeddings
        info.device = device

        for processor in processors:
            processor(info)

        assert info.trainer, "No trainer was selected"
        assert info.train_sampler, "No train sampler was selected"
        assert info.dev, "No dev dataset was selected"
        assert info.test, "No test dataset was selected"

        random_scorer = RandomScorer(random=info.random).tag("model", "random")

        # Retrieve the top 1000
        topK = 1000
        # 1000 documents used for cross-validation
        valtopK = 100

        def get_retriever(index, scorer, topk=topK):
            base_retriever = AnseriniRetriever(k=topk,
                                               index=index,
                                               model=info.basemodel)
            return TwoStageRetriever(retriever=base_retriever,
                                     scorer=scorer,
                                     batchsize=batch_size)

        val_index, test_index = [
            info.index(c.documents) for c in (info.dev, info.test)
        ]

        # Search and evaluate with BM25
        bm25_retriever = AnseriniRetriever(k=topK,
                                           index=test_index,
                                           model=info.basemodel).tag(
                                               "model", "bm25")
        bm25_eval = Evaluate(dataset=info.test,
                             retriever=bm25_retriever).submit()

        # Performance of random
        random_eval = Evaluate(dataset=info.test,
                               retriever=get_retriever(
                                   test_index, random_scorer)).submit()

        # Train and evaluate with each model
        for scorer in info.scorers:
            # Train with OpenNIR DRMM model
            # predictor = Reranker(device=device, batch_size=batch_size)

            trainer = info.trainer
            validation = Validation(dataset=info.dev,
                                    retriever=get_retriever(
                                        val_index, scorer, valtopK))

            learner = Learner(
                trainer=trainer,
                random=info.random,
                scorer=scorer,
                max_epoch=tag(max_epoch),
                validation=validation,
            )
            model = learner.submit()

            # Evaluate the neural model
            evaluate = Evaluate(dataset=info.test,
                                retriever=get_retriever(test_index,
                                                        model)).submit()

        xpm.wait()

        print(f"===")
        print(f"Results for BM25\n{bm25_eval.results.read_text()}\n")
        print(f"Results for DRMM\n{evaluate.results.read_text()}\n")
        print(f"Results for random\n{random_eval.results.read_text()}\n")
Example #7
import importlib
import sys
from experimaestro import experiment

if __name__ == "__main__":
    wspath, module, functionname = sys.argv[1:]
    print("Importing", module)
    f = getattr(importlib.import_module(module), functionname)
    with experiment(wspath, "restart") as xp:
        f(xp)
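The launcher above expects three command-line arguments (workspace path, module name, function name) and calls the named function with the experiment it just opened. A minimal sketch of a matching target module; the module and function names (myproject.pipeline, run) are hypothetical:

# myproject/pipeline.py (hypothetical), invoked as:
#   python restart.py /path/to/workspace myproject.pipeline run
from experimaestro import experiment


def run(xp: experiment):
    """Called by the launcher with the experiment it opened"""
    # Re-submit the pipeline's tasks against `xp` here
    ...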