Example #1
0
    def init(self, rank, root, experiment, name):
        """Set up the run directory and attach the logging helpers.

        The run path is ``<root>/<experiment>/<self.script>/<name>`` with
        ``root`` made absolute; ``self.script`` is read here and must already
        be set. Processes with ``rank < 1`` create that directory or, when it
        already exists, ask on stdin whether it may be overwritten. Every
        process then calls ``distributed.barrier(rank)`` before a ``Logger``
        is created and its methods are re-exported on this object.

        Args:
            rank: Process rank; values below 1 perform the directory setup.
            root: Root directory under which all experiments live.
            experiment: Experiment name; must not contain ``/``.
            name: Run name; must not contain ``/``.

        Raises:
            AssertionError: If ``experiment`` or ``name`` contains ``/``, or
                if the run path already exists and the answer read from stdin
                is anything other than ``yes``.
        """
        # Both values become single path components below.
        assert '/' not in experiment, experiment
        assert '/' not in name, name

        self.experiments_root = os.path.abspath(root)
        self.experiment = experiment
        self.name = name
        self.path = os.path.join(self.experiments_root, self.experiment, self.script, self.name)

        if rank < 1:
            if os.path.exists(self.path):
                print('\n\n')
                print_message("It seems that ", self.path, " already exists.")
                print_message("Do you want to overwrite it? \t yes/no \n")

                # TODO: This should timeout and exit (i.e., fail) given no response for 60 seconds.

                response = input()
                if response.strip() != 'yes':
                    # Abort with an explicit raise rather than an assert on a
                    # condition already known to be false: an assert is
                    # removed under `python -O`, which would let the run carry
                    # on into the existing directory. The exception type and
                    # payload match what the assert produced.
                    raise AssertionError(self.path)
            else:
                create_directory(self.path)

        # All ranks wait here, so the directory decision is made before any
        # process goes on to create its logger.
        distributed.barrier(rank)

        self._logger = Logger(rank, self)

        # Re-export the logger's methods so callers can use them directly on
        # this object.
        self._log_args = self._logger._log_args
        self.warn = self._logger.warn
        self.info = self._logger.info
        self.info_all = self._logger.info_all
        self.log_metric = self._logger.log_metric
        self.log_new_artifact = self._logger.log_new_artifact
Example #2
0
    def __init__(self, rank, run):
        """Bind the logger to a run and, on the main process, prepare its outputs.

        Args:
            rank: Process rank; ``-1`` and ``0`` are treated as the main process.
            run: Run object whose ``path`` attribute is the parent of the
                ``logs/`` directory.
        """
        self.rank, self.run = rank, run
        self.is_main = rank in (-1, 0)
        self.logs_path = os.path.join(run.path, "logs/")

        # Non-main processes only record the attributes above.
        if not self.is_main:
            return

        self._init_mlflow()
        self.initialized_tensorboard = False
        create_directory(self.logs_path)
Example #3
0
def main():
    """Precompute ColBERT document representations and write them to a new index.

    Parses the command-line arguments, then, inside ``Run.context()``:

    1. requires that ``<index_root>/<index_name>`` does not exist yet;
    2. lets ranks below 1 create the index root and index directories;
    3. runs ``CollectionEncoder.encode()`` on every process;
    4. has ranks below 1 dump the input arguments to ``metadata.json``.

    The processes meet at ``distributed.barrier`` between those phases.

    Raises:
        AssertionError: If the index path already exists.
    """
    random.seed(12345)

    parser = Arguments(description='Precomputing document representations with ColBERT.')
    parser.add_model_parameters()
    parser.add_model_inference_parameters()
    parser.add_indexing_input()

    # The chunk size is expressed in GiBs.
    chunksize_options = dict(dest='chunksize', default=6.0, required=False, type=float)
    parser.add_argument('--chunksize', **chunksize_options)

    args = parser.parse()

    def sync():
        # The rank is looked up on every call, exactly as the inline barrier
        # calls did.
        distributed.barrier(args.rank)

    with Run.context():
        args.index_path = os.path.join(args.index_root, args.index_name)

        # Refuse to write into an index that is already there.
        assert not os.path.exists(args.index_path), args.index_path

        # Every rank has checked the path before any rank creates it.
        sync()

        if args.rank < 1:
            for directory in (args.index_root, args.index_path):
                create_directory(directory)

        sync()

        encoder = CollectionEncoder(args,
                                    process_idx=max(0, args.rank),
                                    num_processes=args.nranks)
        encoder.encode()

        sync()

        # Save metadata.
        if args.rank < 1:
            destination = os.path.join(args.index_path, 'metadata.json')
            print_message("Saving (the following) metadata to", destination, "..")
            print(args.input_arguments)

            with open(destination, 'w') as handle:
                ujson.dump(args.input_arguments.__dict__, handle)

        sync()
Example #4
0
    def index(self, iterator):
        """Encode the documents from ``iterator`` into a new index and build its FAISS index.

        Every document must support item assignment and carry a ``docno``
        key. As the encoder consumes the stream, each document is tagged in
        place with a consecutive integer ``docid`` (starting at 0) and its
        ``docno`` is recorded. The recorded docnos are pickled to
        ``docnos.pkl.gz`` inside the index directory.

        Side effects: sets ``self.args.generator``, ``self.colbert`` and
        ``self.checkpoint``, and fills in ``self.args.partitions`` when it is
        ``None``.

        Args:
            iterator: Iterable of documents to index.

        Raises:
            AssertionError: If ``self.args.index_path`` already exists when
                indexing starts, or does not exist after encoding.
        """
        from timeit import default_timer as timer
        starttime = timer()
        assert not os.path.exists(self.args.index_path), self.args.index_path

        # Imported once, after the precondition check; used by both the
        # progress bar below and the docnos writer further down.
        import pyterrier as pt

        docnos = []

        def convert_gen(documents):
            # Show a progress bar only when the collection size is known.
            if self.num_docs is not None:
                documents = pt.tqdm(documents,
                                    total=self.num_docs,
                                    desc="encoding",
                                    unit="d")
            for docid, document in enumerate(documents):
                document["docid"] = docid
                docnos.append(document['docno'])
                yield document

        self.args.generator = convert_gen(iterator)
        # NOTE(review): the trailing 0 and 1 are presumably process_idx and
        # num_processes (single-process encoding) - confirm against
        # CollectionEncoder_Generator.
        ceg = CollectionEncoder_Generator(self.prepend_title, self.args, 0, 1)
        create_directory(self.args.index_root)
        create_directory(self.args.index_path)
        ceg.encode()
        self.colbert = ceg.colbert
        self.checkpoint = ceg.checkpoint

        assert os.path.exists(self.args.index_path), self.args.index_path
        num_embeddings = sum(load_doclens(self.args.index_path))
        print("#> num_embeddings =", num_embeddings)

        docnos_path = os.path.join(self.args.index_path, "docnos.pkl.gz")
        with pt.io.autoopen(docnos_path, "wb") as f:
            pickle.dump(docnos, f)

        if self.args.partitions is None:
            # Default: the smallest power of two >= 8 * sqrt(num_embeddings).
            self.args.partitions = 1 << math.ceil(
                math.log2(8 * math.sqrt(num_embeddings)))
            warn("You did not specify --partitions!")
            # Pass one pre-formatted string. If `warn` is `warnings.warn`, a
            # second positional argument is taken as the warning category and
            # an int there raises TypeError; a print-style helper yields the
            # same text either way.
            warn("Default computation chooses {} partitions (for {} embeddings)"
                 .format(self.args.partitions, num_embeddings))
        index_faiss(self.args)
        print("#> Faiss encoding complete")
        endtime = timer()
        print("#> Indexing complete, Time elapsed %0.2f seconds" %
              (endtime - starttime))
Example #5
0
    save_metadata(f'{args.output}.meta', args)

    print('\n\n', args, '\n\n')
    print(args.output)
    print_message("#> Done.")


if __name__ == "__main__":
    # Script entry point: parse the arguments, make sure the output file does
    # not exist yet, prepare its parent directory and hand over to main().
    random.seed(12345)

    parser = ArgumentParser(description='.')

    # Input / Output Arguments
    parser.add_argument('--metric', dest='metric', required=True,
                        type=str)  # e.g., success.20
    parser.add_argument('--paths',
                        dest='paths',
                        required=True,
                        type=str,
                        nargs='+')
    parser.add_argument('--output', dest='output', required=True, type=str)

    args = parser.parse_args()

    # Split on '.', e.g. "success.20" -> ["success", "20"].
    args.metric = args.metric.split('.')

    # Never overwrite an existing output file.
    assert not os.path.exists(args.output), args.output

    # A bare filename such as "out.tsv" has an empty dirname. There is nothing
    # to create in that case, and passing '' on would presumably fail inside
    # create_directory (os.makedirs('') raises FileNotFoundError) - its
    # implementation is not visible here.
    output_directory = os.path.dirname(args.output)
    if output_directory:
        create_directory(output_directory)

    main(args)