def get_model(args):
    """Create the GPT2 model, move it to the GPU and wrap it for training.

    Args:
        args: options object; the fields read here are ``num_layers``,
            ``vocab_size``, ``hidden_size``, ``num_attention_heads``,
            ``hidden_dropout``, ``attention_dropout``,
            ``max_position_embeddings``, ``checkpoint_activations``,
            ``checkpoint_num_layers`` and ``fp16``.

    Returns:
        The model wrapped in ``DDP`` (and in ``FP16_Module`` underneath when
        ``args.fp16`` is set).
    """
    print_rank_0('building GPT2 model ...')

    # ``hidden_dropout`` feeds both the embedding and the output dropout.
    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=False,
    )
    model = GPT2Model(**gpt2_kwargs)

    # Report the parameter count only from data-parallel rank 0.
    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        message = ' > number of parameters on model parallel rank {}: {}'
        print(message.format(mp_rank, param_count), flush=True)

    # Move the parameters onto the currently selected CUDA device.
    model.cuda(torch.cuda.current_device())

    # Optional fp16 wrapper.
    if args.fp16:
        model = FP16_Module(model)

    # Distributed wrapper is applied unconditionally.
    return DDP(model)
Beispiel #2
0
def get_model(tokenizer, args):
    """Create the BERT model, move it to the GPU and apply fp16/DDP wrappers.

    Args:
        tokenizer: passed straight through to ``BertModel``.
        args: options object; the fields read here are ``fp16``,
            ``fp32_embedding``, ``fp32_tokentypes``, ``fp32_layernorm`` and
            ``world_size`` (it is also handed to ``BertModel``).

    Returns:
        The model, wrapped in ``FP16_Module`` when ``args.fp16`` is set and
        in ``DDP`` when ``args.world_size > 1``.
    """
    print('building BERT model ...')
    model = BertModel(tokenizer, args)
    param_count = sum(p.nelement() for p in model.parameters())
    print(' > number of parameters: {}'.format(param_count), flush=True)

    # Move the parameters onto the currently selected CUDA device.
    model.cuda(torch.cuda.current_device())

    if args.fp16:
        print("fp16 mode")
        model = FP16_Module(model)

        # Selected sub-modules are converted back with ``.float()`` on request.
        if args.fp32_embedding:
            embeddings = model.module.model.bert.embeddings
            for table_name in ('word_embeddings',
                               'position_embeddings',
                               'token_type_embeddings'):
                getattr(embeddings, table_name).float()

        if args.fp32_tokentypes:
            model.module.model.bert.embeddings.token_type_embeddings.float()

        if args.fp32_layernorm:
            # Matched by substring on the registered module name.
            for module_name, submodule in model.named_modules():
                if 'LayerNorm' in module_name:
                    submodule.float()

    # The distributed wrapper is only added when world_size exceeds 1.
    if args.world_size > 1:
        model = DDP(model)

    return model
Beispiel #3
0
def get_model(args):
    """Create the GPT2 model and wrap it with the selected DDP implementation.

    Args:
        args: options object; the fields read here are ``num_layers``,
            ``vocab_size``, ``hidden_size``, ``num_attention_heads``,
            ``hidden_dropout``, ``attention_dropout``,
            ``max_position_embeddings``, ``mem_length``,
            ``checkpoint_activations``, ``checkpoint_num_layers``,
            ``transformer_xl``, ``fp16`` and, if present, ``deepspeed``.

    Returns:
        The model wrapped in either ``PyTorchDistributedDataParallel`` or
        ``DistributedDataParallel`` from the ``model`` package, depending on
        the module-level ``USE_TORCH_DDP`` flag.
    """
    print_rank_0('building GPT2 model ...')

    # ``hidden_dropout`` feeds both the embedding and the output dropout.
    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        max_memory_length=args.mem_length,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
        relative_encoding=args.transformer_xl,
    )
    model = GPT2Model(**gpt2_kwargs)

    # Report the parameter count only from data-parallel rank 0.
    if mpu.get_data_parallel_rank() == 0:
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        message = ' > number of parameters on model parallel rank {}: {}'
        print(message.format(mp_rank, param_count), flush=True)

    # Halve before the GPU copy so models that do not fit in full precision
    # do not run out of memory (DeepSpeed + fp16 only).
    if getattr(args, "deepspeed", False) and args.fp16:
        model.half()

    # Move the parameters onto the currently selected CUDA device.
    model.cuda(torch.cuda.current_device())

    # Optional fp16 wrapper.
    if args.fp16:
        model = FP16_Module(model)

    # Pick the distributed wrapper; imports stay local to this function.
    if not USE_TORCH_DDP:
        from model import DistributedDataParallel as DDP
        return DDP(model)

    from model import PyTorchDistributedDataParallel as DDP
    device = torch.cuda.current_device()
    return DDP(model,
               device_ids=[device],
               output_device=device,
               process_group=mpu.get_data_parallel_group())
Beispiel #4
0
def setup_model_and_optim(args, train_data, tokenizer):
    """Build the model, optimizer, learning-rate schedulers and loss.

    Args:
        args: options object. Fields read here include ``model``,
            ``data_size``, ``cuda``, ``load``, ``load_optim``,
            ``no_weight_norm``, ``fp16``, ``optim``, ``lr``, ``loss_scale``,
            ``dynamic_loss_scale``, ``constant_decay``, ``train_iters``,
            ``epochs``, ``warmup``, ``stlr_cut_frac``, ``decay_style`` and
            ``world_size``, plus the model-specific size options.
        train_data: training data loader, or ``None``. Schedulers are only
            created when this is not ``None``.
        tokenizer: only used for the transformer model, to look up the
            padding id via ``command_name_map['pad'].Id``.

    Returns:
        Tuple ``(model, optim, LR, LR_Warmer, criterion)``. ``LR`` is ``None``
        when ``train_data`` is ``None``; ``LR_Warmer`` is ``None`` when
        ``train_data`` is ``None`` or ``args.warmup`` is 0.
    """
    global rnn_model

    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(
            ntokens,
            args.decoder_embed_dim,
            padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        # Expose the RNN model through the module-level ``rnn_model``.
        rnn_model = model

    print('* number of parameters: %d' %
          sum(p.nelement() for p in model.parameters()))
    if args.cuda:
        model.cuda()

    # Restore model weights and, on request, optimizer and RNG state.
    # ``optim_sd`` stays None unless an optimizer checkpoint was loaded, so
    # the later steps never touch an undefined name.
    optim_sd = None
    if args.load:
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            checkpoint_dir = os.path.dirname(args.load)
            optim_sd = torch.load(os.path.join(checkpoint_dir, 'optim.pt'),
                                  map_location='cpu')
            rng = torch.load(os.path.join(checkpoint_dir, 'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except (RuntimeError, KeyError):
            # Presumably the checkpoint was saved with weight norm applied:
            # retry with weight norm in place, then strip it again.
            _apply_weight_norm_to(model)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        _apply_weight_norm_to(model)

    # Look the optimizer class up by name instead of eval()-ing a string
    # built from a command-line option.
    # NOTE(review): the optimizer always comes from args.optim; a
    # slanted-triangular schedule does not force Adam here - confirm that
    # this is intended.
    optim_cls = getattr(torch.optim, args.optim)
    if args.fp16:
        model = FP16_Module(model)
        optim = FP16_Optimizer(optim_cls(model.parameters(), lr=args.lr),
                               static_loss_scale=args.loss_scale,
                               dynamic_loss_scale=args.dynamic_loss_scale)
    else:
        optim = optim_cls(model.parameters(), lr=args.lr)

    if optim_sd is not None:
        optim.load_state_dict(optim_sd)

    # Learning-rate schedulers; both stay None without training data so the
    # return statement is always well defined.
    LR = None
    LR_Warmer = None
    if train_data is not None:
        num_iters = args.constant_decay or args.train_iters * args.epochs

        init_step = -1
        if optim_sd is not None:
            # TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] %
                                                   len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim,
                                     cut_frac=args.stlr_cut_frac,
                                     num_iters=num_iters)
        else:
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iter,
                             num_iters=num_iters,
                             decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # Wrap the model for distributed training.
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduce=False)
    return model, optim, LR, LR_Warmer, criterion


def _apply_weight_norm_to(model):
    """Apply weight norm to ``model.rnn`` when it exists, else to ``model``."""
    target = model.rnn if hasattr(model, 'rnn') else model
    apply_weight_norm(target, hook_child=False)
Beispiel #5
0
    if args.constant_decay:
        num_iters = args.constant_decay
    else:
        num_iters = args.train_iters * args.epochs

    init_step = -1
    if args.load_optim:
        init_step = optim_sd['iter'] - optim_sd['skipped_iter']
        train_data.batch_sampler.start_iter = (optim_sd['iter'] %
                                               len(train_data)) + 1

    LR = LinearLR(optim, num_iters, last_iter=init_step)

# Wrap the model for distributed training; the DDP wrapper is only applied
# when args.world_size is greater than 1.
if args.world_size > 1:
    model = DDP(model)

# Loss function, built with nn.CrossEntropyLoss's default arguments.
criterion = nn.CrossEntropyLoss()

###############################################################################
# Training code
###############################################################################

# get_batch subdivides the source data into chunks of length args.seq_length.
# If source is equal to the example output of the data loading example, with
# a seq_length limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivision of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the data loader. The chunks are along dimension 0, corresponding