def get_args():
    options = Options()
    options.add_index_options()
    args = options.parse()

    coarse = 'hnsw' if args.hnsw else 'flat'
    args.index_name = f'{args.num_clusters}_{coarse}_{args.fine_quant}{"_first" if args.first_passage else ""}'
    if args.index_filter != -1e8:  # other than default
        args.index_name = args.index_name + f'_ft{int(args.index_filter)}'
    args.index_dir = os.path.join(args.dump_dir, 'start', args.index_name)

    args.quantizer_path = os.path.join(args.index_dir, args.quantizer_path)
    args.trained_index_path = os.path.join(args.index_dir,
                                           args.trained_index_path)
    args.inv_path = os.path.join(args.index_dir, args.inv_path)

    args.subindex_dir = os.path.join(args.index_dir, args.subindex_name)
    if args.dump_paths is None:
        args.index_path = os.path.join(args.index_dir, args.index_path)
        args.idx2id_path = os.path.join(args.index_dir, args.idx2id_path)
    else:
        args.dump_paths = [
            os.path.join(args.dump_dir, args.phrase_dir, path)
            for path in args.dump_paths.split(',')
        ]
        args.index_path = os.path.join(args.subindex_dir,
                                       '%d.faiss' % args.offset)
        args.idx2id_path = os.path.join(args.subindex_dir,
                                        '%d.hdf5' % args.offset)

    logger.info(f"Creating {args.index_name}...")
    return args
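
# Illustration (not from the original script): with num_clusters=1048576, no HNSW,
# fine_quant='OPQ96', first_passage off, and the default index_filter, get_args()
# produces index_name '1048576_flat_OPQ96' and
# index_dir == os.path.join(dump_dir, 'start', '1048576_flat_OPQ96'),
# which matches the default index_name used by the reader class below.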
Example 2
    def __init__(self,
                 load_dir,
                 dump_dir,
                 index_name='start/1048576_flat_OPQ96',
                 device='cuda',
                 verbose=False,
                 **kwargs):
        print(
            "This could take up to 15 mins depending on the file reading speed of HDD/SSD"
        )

        # Turn off loggers
        if not verbose:
            logging.getLogger("densephrases").setLevel(logging.WARNING)
            logging.getLogger("transformers").setLevel(logging.WARNING)

        # Get default options
        options = Options()
        options.add_model_options()
        options.add_index_options()
        options.add_retrieval_options()
        options.add_data_options()
        self.args = options.parse()

        # Set options
        self.args.load_dir = load_dir
        self.args.dump_dir = dump_dir
        self.args.cache_dir = os.environ['CACHE_DIR']
        self.args.index_name = index_name
        self.args.cuda = True if device == 'cuda' else False
        self.args.__dict__.update(kwargs)

        # Load encoder
        self.set_encoder(load_dir, device)

        # Load MIPS
        self.mips = load_phrase_index(self.args, ignore_logging=not verbose)

        # Others
        self.truecase = TrueCaser(
            os.path.join(os.environ['DATA_DIR'], self.args.truecase_path))
        print("Loading DensePhrases Completed!")
    out_file = os.path.join(
        os.environ['SAVE_DIR'], os.path.basename(args.load_dir), 'pred',
        os.path.splitext(os.path.basename(pred_path))[0] +
        f'_{"sent" if args.return_sent else "psg"}-top{args.psg_top_k}{"_mark" if args.mark_phrase else ""}.json'
    )
    logger.info(f"dump to {out_file}")
    json.dump(my_target, open(out_file, 'w'), indent=4)

    # Call subprocess for evaluation
    command = f'python scripts/postprocess/recall.py --k_values 1,5,20,100 --results_file {out_file} --ans_fn string'
    subprocess.run(command.split(' '))
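    # Note: command.split(' ') assumes out_file contains no whitespace; building the
    # argument list explicitly (or using shlex.split) would be more robust.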


if __name__ == '__main__':
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_index_options()
    options.add_retrieval_options()
    options.add_data_options()
    options.add_question_type_options()
    args = options.parse()

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    # Set wandb
Example 4
    # Annotate for L_doc
    if 'doc' in args.label_strat.split(','):
        p_targets = [[
            any(phrase['title'][0].lower() == tit.lower() for tit in title)
            for phrase in phrase_group
        ] for phrase_group, title in zip(phrase_groups, titles)]
        p_targets = [[ii if val else None for ii, val in enumerate(target)]
                     for target in p_targets]
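        # Illustration (toy values, not from the original): if only the phrases at
        # positions 0 and 2 of a group have a first title matching a gold title, that
        # group's p_targets entry becomes [0, None, 2, None].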

    return start_vecs, end_vecs, targets, p_targets


if __name__ == '__main__':
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_index_options()
    options.add_retrieval_options()
    options.add_data_options()
    options.add_qsft_options()
    args = options.parse()

    # Seed for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)

    if args.run_mode == 'train_query':
Example 5
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config, unused_kwargs = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.pretrained_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        output_hidden_states=False,
        return_unused_kwargs=True
    )
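    # With return_unused_kwargs=True, from_pretrained returns a (config, unused_kwargs)
    # tuple, hence the two-value unpacking above.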
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.pretrained_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Dump parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"`
    # will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Create phrase vectors
    if args.do_dump:
        assert args.load_dir
        model, tokenizer, config = load_encoder(device, args, phrase_only=True)

        args.draft = False
        dump_phrases(args, model, tokenizer, filter_only=args.filter_only)
Example 6
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    if args.local_rank != -1:
        if args.local_rank > 0:
            logger.setLevel(logging.WARN)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Set wandb
    if args.do_train or args.do_eval:
        wandb.init(project="DensePhrases (single)",
                   mode="online" if args.wandb else "disabled")
        wandb.config.update(args)
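        # With mode="disabled", wandb.init makes the subsequent wandb.config/wandb.log
        # calls no-ops, so logging below stays safe even without --wandb.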

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Initialize or load encoder
    model, tokenizer, config = load_encoder(device, args)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"`
    # will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    global_step = 1
    tr_loss = 99999
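    # These start as sentinel defaults so the wandb.log call in the evaluation branch
    # below still has values to report when --do_train is not set.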

    if args.do_train:
        # Load pre-trained cross encoder
        if args.lambda_kl > 0 and args.do_train:
            cross_encoder = torch.load(os.path.join(args.teacher_dir,
                                                    "pytorch_model.bin"),
                                       map_location=torch.device('cpu'))
            new_qd = {
                n[len('bert') + 1:]: p
                for n, p in cross_encoder.items() if 'bert' in n
            }
            new_linear = {
                n[len('qa_outputs') + 1:]: p
                for n, p in cross_encoder.items() if 'qa_outputs' in n
            }
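            # The slicing above drops the 'bert.' / 'qa_outputs.' prefixes so the
            # teacher checkpoint keys match the state_dict layout of the plain
            # AutoModel and nn.Linear modules created below.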
            qd_config, unused_kwargs = AutoConfig.from_pretrained(
                args.pretrained_name_or_path,
                cache_dir=args.cache_dir if args.cache_dir else None,
                return_unused_kwargs=True)
            qd_pretrained = AutoModel.from_pretrained(
                args.pretrained_name_or_path,
                config=qd_config,
                cache_dir=args.cache_dir if args.cache_dir else None,
            )
            model.cross_encoder = qd_pretrained
            model.cross_encoder.load_state_dict(new_qd)
            model.qa_outputs = torch.nn.Linear(config.hidden_size, 2)
            model.qa_outputs.load_state_dict(new_linear)
            logger.info(f'Distill with teacher model {args.teacher_dir}')

        # Train model
        model.to(args.device)
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False,
                                                skip_no_answer=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if (args.do_train) and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        # Remove teacher before saving
        if args.lambda_kl > 0:
            del model.cross_encoder
            del model.qa_outputs

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model

        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model.load_state_dict(
            backward_compat(
                torch.load(os.path.join(args.output_dir, "pytorch_model.bin"),
                           map_location=torch.device('cpu'))))
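        # Assumption: backward_compat appears to remap legacy checkpoint key names to
        # the current parameter layout before the state dict is loaded.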
        tokenizer = AutoTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

        # Set load_dir to trained model
        args.load_dir = args.output_dir
        logger.info(f'Will load {args.load_dir} that was trained.')

    # Test filter
    if args.do_filter_test:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        filter_test(args, model, tokenizer)

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        result, _ = evaluate(args, model, tokenizer, prefix='final')
        result = dict((k + "_final", v) for k, v in result.items())
        wandb.log(
            {
                "Eval EM": result['exact_final'],
                "Eval F1": result['f1_final'],
                "loss": tr_loss
            },
            step=global_step,
        )
        logger.info("Results: {}".format(result))