Example 1
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()
    config, unused_kwargs = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.pretrained_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        output_hidden_states=False,
        return_unused_kwargs=True
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.pretrained_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Dump parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"`
    # will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Create phrase vectors
    if args.do_dump:
        assert args.load_dir
        model, tokenizer, config = load_encoder(device, args, phrase_only=True)

        args.draft = False
        dump_phrases(args, model, tokenizer, filter_only=args.filter_only)
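
Both examples call set_seed(args) before any model is built; that helper comes from the DensePhrases utilities and is not shown here. Below is a minimal sketch of what such a seeding helper typically looks like, assuming an args.seed option and the usual random/NumPy/PyTorch generators (the real implementation may differ):

import random

import numpy as np
import torch


def set_seed(args):
    # Seed every RNG that can affect data order or initialization (assumes args.seed exists).
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        # Also seed all visible CUDA devices for multi-GPU runs.
        torch.cuda.manual_seed_all(args.seed)
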
Example 2
def main():
    # See options in densephrases.options
    options = Options()
    options.add_model_options()
    options.add_data_options()
    options.add_rc_options()
    args = options.parse()

    # Quiet the logger on non-primary ranks in distributed training
    if args.local_rank > 0:
        logger.setLevel(logging.WARN)

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Set wandb
    if args.do_train or args.do_eval:
        wandb.init(project="DensePhrases (single)",
                   mode="online" if args.wandb else "disabled")
        wandb.config.update(args)

    # Load config, tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Initialize or load encoder
    model, tokenizer, config = load_encoder(device, args)

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"`
    # will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    # Placeholder values, overwritten by train() when args.do_train is set
    global_step = 1
    tr_loss = 99999

    if args.do_train:
        # Load pre-trained cross encoder
        if args.lambda_kl > 0 and args.do_train:
            cross_encoder = torch.load(os.path.join(args.teacher_dir,
                                                    "pytorch_model.bin"),
                                       map_location=torch.device('cpu'))
            new_qd = {
                n[len('bert') + 1:]: p
                for n, p in cross_encoder.items() if 'bert' in n
            }
            new_linear = {
                n[len('qa_outputs') + 1:]: p
                for n, p in cross_encoder.items() if 'qa_outputs' in n
            }
            qd_config, unused_kwargs = AutoConfig.from_pretrained(
                args.pretrained_name_or_path,
                cache_dir=args.cache_dir if args.cache_dir else None,
                return_unused_kwargs=True)
            qd_pretrained = AutoModel.from_pretrained(
                args.pretrained_name_or_path,
                config=qd_config,
                cache_dir=args.cache_dir if args.cache_dir else None,
            )
            model.cross_encoder = qd_pretrained
            model.cross_encoder.load_state_dict(new_qd)
            model.qa_outputs = torch.nn.Linear(config.hidden_size, 2)
            model.qa_outputs.load_state_dict(new_linear)
            logger.info(f'Distill with teacher model {args.teacher_dir}')

        # Train model
        model.to(args.device)
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False,
                                                skip_no_answer=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Save the trained model and the tokenizer
    if (args.do_train) and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        # Remove teacher before saving
        if args.lambda_kl > 0:
            del model.cross_encoder
            del model.qa_outputs

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, "module") else model

        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model.load_state_dict(
            backward_compat(
                torch.load(os.path.join(args.output_dir, "pytorch_model.bin"),
                           map_location=torch.device('cpu'))))
        tokenizer = AutoTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        model.to(args.device)

        # Set load_dir to trained model
        args.load_dir = args.output_dir
        logger.info(f'Will load the trained model from {args.load_dir}.')

    # Test filter
    if args.do_filter_test:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        filter_test(args, model, tokenizer)

    # Evaluation
    if args.do_eval and args.local_rank in [-1, 0]:
        assert args.load_dir
        model, tokenizer, _ = load_encoder(device, args)
        result, _ = evaluate(args, model, tokenizer, prefix='final')
        result = dict((k + "_final", v) for k, v in result.items())
        wandb.log(
            {
                "Eval EM": result['exact_final'],
                "Eval F1": result['f1_final'],
                "loss": tr_loss
            },
            step=global_step,
        )
        logger.info("Results: {}".format(result))