Example #1
def get_learning_rate_scheduler(optimizer, args):

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.max_steps
    num_iters = max(1, num_iters)
    init_step = 0
    warmup_iter = args.warmup * num_iters
    plateau_iter = warmup_iter + args.plateau * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=args.lr,
        warmup_iter=warmup_iter,
        plateau_iter=plateau_iter,
        total_iters=num_iters,
        decay_style=args.lr_decay_style,
        last_iter=init_step,
        min_lr=args.min_lr,
        use_checkpoint_lr_scheduler=args.load_partial or args.load_full,
        override_lr_scheduler=False,
    )

    return lr_scheduler
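
Note on the arithmetic above: args.warmup and args.plateau are fractions of the decay horizon, so the iteration counts handed to AnnealingLR are derived rather than passed in directly. A minimal sketch with made-up argument values (the names mirror the example, the numbers are purely illustrative):

# Hypothetical values, chosen only to show how the counts are derived.
lr_decay_iters = None   # fall back to max_steps when unset
max_steps = 100_000
warmup = 0.01           # fraction of num_iters spent in linear warmup
plateau = 0.05          # fraction of num_iters held at peak lr after warmup

num_iters = max(1, lr_decay_iters if lr_decay_iters is not None else max_steps)
warmup_iter = warmup * num_iters                   # 1000.0
plateau_iter = warmup_iter + plateau * num_iters   # 6000.0
print(num_iters, warmup_iter, plateau_iter)        # 100000 1000.0 6000.0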
Example #2
def main():
    model = Linear(10, 10)
    optimizer = Adam(model.parameters())
    lr_scheduler = AnnealingLR(optimizer,
                               start_lr=0.00015,
                               warmup_iter=3000,
                               num_iters=300000,
                               decay_style='cosine',
                               decay_ratio=0.1)
    steps = np.arange(0, 400000, 10, dtype=np.int64)
    rates = []
    for step in steps:
        lr_scheduler.num_iters = step
        rates.append(lr_scheduler.get_lr())
    print(rates)
    plt.plot(steps, rates)
    plt.savefig("lr.pdf", format='pdf')
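
Example #2 (and the full script in Example #8) probes the schedule by setting num_iters directly and reading get_lr() at every step, which is a handy way to plot the warmup-plus-cosine shape before training. For reference, here is a self-contained sketch of a linear-warmup cosine curve with a floor at decay_ratio * start_lr; it is an assumption about the general shape only, not the library's actual AnnealingLR formula.

import math

def sketch_lr(step, start_lr=0.00015, warmup_iter=3000,
              num_iters=300000, decay_ratio=0.1):
    """Illustrative warmup + cosine curve; not the real AnnealingLR math."""
    if step < warmup_iter:
        # Linear warmup from 0 up to start_lr.
        return start_lr * step / max(1, warmup_iter)
    # Cosine decay from start_lr down to decay_ratio * start_lr, then hold.
    min_lr = decay_ratio * start_lr
    progress = min(1.0, (step - warmup_iter) / max(1, num_iters - warmup_iter))
    return min_lr + 0.5 * (start_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

Plotting sketch_lr over the same 0 to 400000 range gives a curve with the same broad shape as the one the example saves to lr.pdf.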
Example #3
def get_learning_rate_scheduler(optimizer, args):
    """Build the learning rate scheduler."""

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.train_iters * args.epochs
    init_step = -1
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(optimizer,
                               start_lr=args.lr,
                               warmup_iter=warmup_iter,
                               num_iters=num_iters,
                               decay_style=args.lr_decay_style,
                               last_iter=init_step)

    return lr_scheduler
Example #4
def get_learning_rate_scheduler(optimizer, args):
    """Build the learning rate scheduler."""

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.train_iters
    if args.finetune:
        num_iters = num_iters // args.gradient_accumulation_steps
    num_iters = max(1, num_iters)
    init_step = -1
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(optimizer,
                               start_lr=args.lr,
                               warmup_iter=warmup_iter,
                               num_iters=num_iters - warmup_iter,
                               decay_style=args.lr_decay_style,
                               last_iter=init_step,
                               decay_ratio=args.lr_decay_ratio)

    return lr_scheduler
Example #5
def get_learning_rate_scheduler(optimizer, args):
    """Build the learning rate scheduler."""

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.train_iters
    num_iters = max(1, num_iters)
    init_step = -1
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=args.lr,
        warmup_iter=warmup_iter,
        num_iters=num_iters,
        decay_style=args.lr_decay_style,
        last_iter=init_step,
        min_lr=args.min_lr,
        use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=args.override_lr_scheduler)

    return lr_scheduler
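
Example #1 hard-codes override_lr_scheduler=False and ties use_checkpoint_lr_scheduler to whether a checkpoint is loaded, while Example #5 forwards both flags from the command line. In Megatron-style schedulers these flags decide whose values win when a resumed checkpoint carries its own scheduler state; the helper below is a hypothetical sketch of that precedence, not the verified AnnealingLR implementation.

def resolve_scheduler_value(current_value, checkpoint_value, name,
                            override_lr_scheduler, use_checkpoint_lr_scheduler):
    """Hypothetical helper: pick which value to keep when resuming."""
    if override_lr_scheduler:
        # Values from the current args win over whatever the checkpoint stored.
        return current_value
    if use_checkpoint_lr_scheduler:
        # Checkpoint state wins; continue with the saved schedule.
        return checkpoint_value
    # Neither flag set: refuse to silently diverge from the checkpoint.
    assert current_value == checkpoint_value, \
        f"{name} changed from {checkpoint_value} to {current_value}"
    return current_value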
Example #6
def setup_model_and_optim(args, train_data, tokenizer):
    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(
            ntokens,
            args.decoder_embed_dim,
            padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        global rnn_model
        rnn_model = model
    # Schedulers are filled in below once the training horizon is known.
    LR = None
    LR_Warmer = None
    print('* number of parameters: %d' %
          sum([p.nelement() for p in model.parameters()]))
    if args.cuda:
        model.cuda()

    optim = None
    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            # Restore the optimizer and RNG state saved next to the checkpoint;
            # optim_sd is consumed further below by optim.load_state_dict().
            optim_sd = torch.load(os.path.join(os.path.dirname(args.load),
                                               'optim.pt'),
                                  map_location='cpu')
            rng = torch.load(os.path.join(os.path.dirname(args.load),
                                          'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except Exception:
            # The checkpoint may store weight-normalized parameters; apply
            # weight norm, load the state dict, then strip it again.
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn, hook_child=False)
            else:
                apply_weight_norm(model, hook_child=False)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        if hasattr(model, 'rnn'):
            apply_weight_norm(model.rnn, hook_child=False)
        else:
            apply_weight_norm(model, hook_child=False)

    if optim is None:
        # optim_choice pairs slanted-triangular schedules with Adam and
        # otherwise uses the optimizer requested on the command line.
        optim_choice = 'Adam' if args.stlr_cut_frac else args.optim
        optim_cls = getattr(torch.optim, optim_choice)
        if args.fp16:
            model = FP16_Module(model)
            optim = optim_cls(model.parameters(), lr=args.lr)
            optim = FP16_Optimizer(optim,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale)
        else:
            optim = optim_cls(model.parameters(), lr=args.lr)

    if args.load_optim:
        optim.load_state_dict(optim_sd)

    # add linear learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs

        init_step = -1
        if args.load_optim:
            #TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] %
                                                   len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim,
                                     cut_frac=args.stlr_cut_frac,
                                     num_iters=num_iters)
        else:
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iter,
                             num_iters=num_iters,
                             decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduction='none')
    return model, optim, LR, LR_Warmer, criterion
Example #7
def get_model_and_optim(args, train_data):
    if args.use_softmax:
        args.report_no_thresholding = True
    ntokens = args.data_size
    concat_pools = args.concat_max, args.concat_min, args.concat_mean
    if args.model == 'transformer':
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=None,
            nhid=None,
            nlayers=None,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=None,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    else:
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=args.emsize,
            nhid=args.nhid,
            nlayers=args.nlayers,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=args.all_layers,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()
    # load char embedding and recurrent encoder for featurization
    if args.load is not None and args.load != '':
        with open(args.load, 'rb') as f:
            sd = torch.load(f, map_location='cpu')
            if 'sd' in sd:
                sd = sd['sd']

        if not args.load_finetuned:
            if 'lm_encoder' in sd:
                sd = sd['lm_encoder']
            try:
                model.lm_encoder.load_state_dict(sd)
            except Exception:
                # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
                if hasattr(model.lm_encoder, 'rnn'):
                    apply_weight_norm(model.lm_encoder.rnn)
                else:
                    apply_weight_norm(model.lm_encoder)
                model.lm_encoder.load_state_dict(sd)
                remove_weight_norm(model)
        else:
            model.load_state_dict(sd)

    if args.thresh_test_preds:
        model.set_thresholds(
            pd.read_csv(args.thresh_test_preds,
                        header=None).values.squeeze(), args.double_thresh,
            args.dual_thresh and not args.joint_binary_train)

    optims = {'adam': 'Adam', 'sgd': 'SGD'}

    optim = getattr(torch.optim, optims[args.optim.lower()])(
        model.parameters(), lr=args.lr)
    iters_per_epoch = len(train_data)
    num_iters = iters_per_epoch * args.epochs

    assert not (args.stlr_cut_frac and args.cos_cut_frac)
    if args.stlr_cut_frac is not None:
        LR = SlantedTriangularLR(optim,
                                 max_val=args.lr,
                                 cut_frac=args.stlr_cut_frac,
                                 num_iters=num_iters)
    elif args.cos_cut_frac is not None:
        LR = AnnealingLR(optim,
                         start_lr=args.lr,
                         warmup_iter=int(args.cos_cut_frac * num_iters),
                         num_iters=num_iters,
                         decay_style='cosine')
    elif args.decay_style is not None:
        warmup_iters = int(args.warmup_epochs * iters_per_epoch)
        if args.decay_epochs == -1:
            decay_iters = int(args.epochs * iters_per_epoch)
        else:
            decay_iters = int(args.decay_epochs * iters_per_epoch)
        if args.decay_style == 'constant':
            #TODO: implement
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'linear':
            #TODO: implement
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'cosine':
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'exponential':
            #TODO: implement
            LR = ConstantLR(optim, lr=args.lr)
        else:
            LR = ConstantLR(optim, lr=args.lr)
    else:
        LR = ConstantLR(optim, lr=args.lr)
    return model, optim, LR
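
The two setup functions above only construct the scheduler objects; advancing them during training is not shown. The loop below is a minimal sketch that assumes the returned scheduler exposes a step() method (the examples here only demonstrate the num_iters attribute and get_lr(), so treat step() as an assumption) and that train_data yields (text, labels) batches, which is likewise hypothetical.

def train_one_epoch(model, optim, LR, criterion, train_data, cuda=False):
    model.train()
    for text, labels in train_data:
        if cuda:
            text, labels = text.cuda(), labels.cuda()
        optim.zero_grad()
        loss = criterion(model(text), labels).mean()  # criterion built with reduction='none'
        loss.backward()
        optim.step()
        LR.step()  # advance the learning-rate schedule by one iteration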
Example #8
# b = torch.arange(2) * 1000
# h = torch.arange(3) * 100
# pos_seq = torch.arange(9, -1, -1)
# query = torch.arange(7) * 10
# s = pos_seq.unsqueeze(0) + query.unsqueeze(1)
# s = b.view(-1, 1, 1, 1) + h.view(1, -1, 1, 1) + s
# s = GPT2ParallelSelfAttention._rel_shift(s)
# print(s)

from torch.nn import Linear
from torch.optim import Adam
from learning_rates import AnnealingLR
import matplotlib.pyplot as plt
import numpy as np

model = Linear(10, 10)
optimizer = Adam(model.parameters())
lr_scheduler = AnnealingLR(optimizer,
                           start_lr=0.00015,
                           warmup_iter=3000,
                           num_iters=300000,
                           decay_style='cosine',
                           decay_ratio=0.1)
steps = np.arange(0, 400000, 10, dtype=np.int64)
rates = []
for step in steps:
    lr_scheduler.num_iters = step
    rates.append(lr_scheduler.get_lr())
print(rates)
plt.plot(steps, rates)
plt.savefig("lr.pdf", format='pdf')