import glob


def init_pytorch_model(model_name, tf_checkpoint_path):
    # Resolve the configuration class registered for this model in the TFMODELS table.
    config_name = TFMODELS[model_name][1]
    config_module = __import__("transformers", fromlist=[config_name])
    model_config = getattr(config_module, config_name)

    # Use a *config.json sitting next to the TF checkpoint if present; otherwise fall back to the default config.
    parent_path = tf_checkpoint_path.rpartition('/')[0]
    config_path = glob.glob(parent_path + "/*config.json")
    config = model_config() if len(config_path) == 0 else model_config.from_json_file(str(config_path[0]))

    if TFMODELS[model_name][2] == "":
        # No specific model class registered: let AutoModelForPreTraining pick one from the config.
        from transformers import AutoModelForPreTraining
        init_model = AutoModelForPreTraining.from_config(config)
    else:
        # Instantiate the registered model class by name.
        model_category_name = TFMODELS[model_name][2]
        module = __import__("transformers", fromlist=[model_category_name])
        model_category = getattr(module, model_category_name)
        init_model = model_category(config)
    return config, init_model
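
A minimal usage sketch. The TFMODELS registry is not shown in this example, so the entry below is a hypothetical one mapping a model name to (TF checkpoint prefix, config class name, model class name); the checkpoint path is illustrative only.

TFMODELS = {
    # hypothetical entry: (tf checkpoint prefix, transformers config class, transformers model class)
    "bert-base": ("bert", "BertConfig", "BertForPreTraining"),
}

config, model = init_pytorch_model("bert-base", "/path/to/tf_ckpt/model.ckpt")
print(type(model).__name__)  # BertForPreTraining, built from the default BertConfig if no *config.json is found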
Example #2
def prepare_model_and_optimizer(args, device):
    global_step = 0
    args.resume_step = 0
    checkpoint = None
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # Propagate the dense-sequence-output option into the model config.
    config.dense_seq_output = args.dense_seq_output
    if args.model_name_or_path:
        model = AutoModelForPreTraining.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForPreTraining.from_config(config)
    ## Load from Pyt checkpoint - either given as init_checkpoint, or picked up from output_dir if found
    #if args.init_checkpoint is not None or found_resume_checkpoint(args):
    #    # Prepare model
    #    #model = BertForPreTraining(config)
    #    model = BertForPreTrainingSegmented(config)

    #    # for k,v in model.state_dict().items():
    #    #     print(f'model-k,len(v)={k}, {v.numel()}')

    #    #model = BertForPretraining(config)
    #    if args.init_checkpoint is None: # finding checkpoint in output_dir
    #        assert False, "code path not tested with cuda graphs"
    #        checkpoint_str = "phase2_ckpt_*.pt" if args.phase2 else "phase1_ckpt_*.pt"
    #        model_names = [f for f in glob.glob(os.path.join(args.output_dir, checkpoint_str))]
    #        global_step = max([int(x.split('.pt')[0].split('_')[-1].strip()) for x in model_names])
    #        args.resume_step = global_step #used for throughput computation

    #        resume_init_checkpoint = os.path.join(args.output_dir, checkpoint_str.replace("*", str(global_step)))
    #        print("Setting init checkpoint to %s - which is the latest in %s" %(resume_init_checkpoint, args.output_dir))
    #        checkpoint=torch.load(resume_init_checkpoint, map_location="cpu")
    #    else:
    #        checkpoint=torch.load(args.init_checkpoint, map_location="cpu")["model"]
    param_optimizer = list(model.named_parameters())

    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay_rate},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = Lamb(optimizer_grouped_parameters, lr=args.learning_rate, betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2), fused=True)

    # Derive the warmup length from warmup_proportion when no explicit step count is given.
    if args.warmup_steps == 0:
        warmup_steps = int(args.max_steps * args.warmup_proportion)
        warmup_start = 0
    else:
        warmup_steps = args.warmup_steps
        warmup_start = args.start_warmup_step
    
    lr_scheduler = LinearWarmupPolyDecayScheduler(optimizer, start_warmup_steps=warmup_start, warmup_steps=warmup_steps,
                                                  total_steps=args.max_steps, end_learning_rate=0.0, degree=1.0)
    #if found_resume_checkpoint(args):
    #    assert False, "code path not tested with cuda graphs"
    #    optimizer.load_state_dict(checkpoint['optimizer']) #restores m,v states (only if resuming checkpoint, not for init_checkpoint and init_tf_checkpoint for now)
    return model, optimizer, lr_scheduler, checkpoint, global_step
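
A minimal usage sketch with a hypothetical argparse-style namespace. It assumes Lamb, LinearWarmupPolyDecayScheduler, CONFIG_MAPPING, logger, AutoConfig, and AutoModelForPreTraining are available in the surrounding training script; the field names simply mirror the attributes read above, and the values are illustrative.

from types import SimpleNamespace

args = SimpleNamespace(
    config_name=None,
    model_name_or_path="bert-base-uncased",  # illustrative pretrained checkpoint
    model_type="bert",
    dense_seq_output=False,
    weight_decay_rate=0.01,
    learning_rate=4e-4,
    opt_lamb_beta_1=0.9,
    opt_lamb_beta_2=0.999,
    warmup_steps=0,
    warmup_proportion=0.01,
    start_warmup_step=0,
    max_steps=100000,
)

model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device="cpu")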