def get_learning_rate_scheduler(optimizer, args):
    """Build the learning rate scheduler."""
    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.max_steps
    num_iters = max(1, num_iters)
    init_step = 0
    warmup_iter = args.warmup * num_iters
    plateau_iter = warmup_iter + args.plateau * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=args.lr,
        warmup_iter=warmup_iter,
        plateau_iter=plateau_iter,
        total_iters=num_iters,
        decay_style=args.lr_decay_style,
        last_iter=init_step,
        min_lr=args.min_lr,
        use_checkpoint_lr_scheduler=args.load_partial or args.load_full,
        override_lr_scheduler=False,
    )
    return lr_scheduler

from torch.nn.modules import Linear
from torch.optim import Adam

from learning_rates import AnnealingLR

import matplotlib.pyplot as plt
import numpy as np


def main():
    model = Linear(10, 10)
    optimizer = Adam(model.parameters())
    lr_scheduler = AnnealingLR(optimizer,
                               start_lr=0.00015,
                               warmup_iter=3000,
                               num_iters=300000,
                               decay_style='cosine',
                               decay_ratio=0.1)
    steps = np.arange(0, 400000, 10, dtype=np.int64)
    rates = []
    for step in steps:
        # Sample the schedule by setting the scheduler's internal step counter
        # and reading back the learning rate it would produce at that step.
        lr_scheduler.num_iters = step
        rates.append(lr_scheduler.get_lr())
    print(rates)
    plt.plot(steps, rates)
    plt.savefig("lr.pdf", format='pdf')

def get_learning_rate_scheduler(optimizer, args): """Build the learning rate scheduler.""" # Add linear learning rate scheduler. if args.lr_decay_iters is not None: num_iters = args.lr_decay_iters else: num_iters = args.train_iters * args.epochs init_step = -1 warmup_iter = args.warmup * num_iters lr_scheduler = AnnealingLR(optimizer, start_lr=args.lr, warmup_iter=warmup_iter, num_iters=num_iters, decay_style=args.lr_decay_style, last_iter=init_step) return lr_scheduler
def get_learning_rate_scheduler(optimizer, args): """Build the learning rate scheduler.""" # Add linear learning rate scheduler. if args.lr_decay_iters is not None: num_iters = args.lr_decay_iters else: num_iters = args.train_iters if args.finetune: num_iters = num_iters // args.gradient_accumulation_steps num_iters = max(1, num_iters) init_step = -1 warmup_iter = args.warmup * num_iters lr_scheduler = AnnealingLR(optimizer, start_lr=args.lr, warmup_iter=warmup_iter, num_iters=num_iters - warmup_iter, decay_style=args.lr_decay_style, last_iter=init_step, decay_ratio=args.lr_decay_ratio) return lr_scheduler
def get_learning_rate_scheduler(optimizer, args): """Build the learning rate scheduler.""" # Add linear learning rate scheduler. if args.lr_decay_iters is not None: num_iters = args.lr_decay_iters else: num_iters = args.train_iters num_iters = max(1, num_iters) init_step = -1 warmup_iter = args.warmup * num_iters lr_scheduler = AnnealingLR( optimizer, start_lr=args.lr, warmup_iter=warmup_iter, num_iters=num_iters, decay_style=args.lr_decay_style, last_iter=init_step, min_lr=args.min_lr, use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler, override_lr_scheduler=args.override_lr_scheduler) return lr_scheduler
def setup_model_and_optim(args, train_data, tokenizer):
    """Build the language model, its optimizer, and the LR schedule/warmup for pretraining."""
    ntokens = args.data_size
    if args.model.lower() == 'transformer':
        embed_tokens = m.Embedding(ntokens, args.decoder_embed_dim,
                                   padding_idx=tokenizer.command_name_map['pad'].Id)
        model = m.TransformerModel(m.DecoderPreprocessor(args, embed_tokens),
                                   m.TransformerDecoder(args, embed_tokens))
    else:
        model = m.RNNModel(args.model, ntokens, args.emsize, args.nhid,
                           args.nlayers, args.dropout, args.tied)
        global rnn_model
        rnn_model = model
    LR_Warmer = None
    print('* number of parameters: %d' % sum([p.nelement() for p in model.parameters()]))
    if args.cuda:
        model.cuda()

    optim = None
    if args.load is not None and args.load != '':
        sd = torch.load(args.load, map_location='cpu')
        if args.load_optim:
            #optim_sd = torch.load(os.path.join(os.path.dirname(args.load), 'optim.pt'), map_location='cpu')
            rng = torch.load(os.path.join(os.path.dirname(args.load), 'rng.pt'))
            torch.cuda.set_rng_state(rng[0])
            torch.set_rng_state(rng[1])
        try:
            model.load_state_dict(sd)
        except:
            # if the state dict was saved with weight norm applied, apply and remove
            # weight norm on the model so the parameter names line up while loading
            if hasattr(model, 'rnn'):
                apply_weight_norm(model.rnn, hook_child=False)
            else:
                apply_weight_norm(model, hook_child=False)
            model.load_state_dict(sd)
            remove_weight_norm(model)

    if not args.no_weight_norm:
        if hasattr(model, 'rnn'):
            apply_weight_norm(model.rnn, hook_child=False)
        else:
            apply_weight_norm(model, hook_child=False)

    if optim is None:
        optim_choice = 'Adam' if args.stlr_cut_frac else args.optim
        if args.fp16:
            model = FP16_Module(model)
            optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)
            optim = FP16_Optimizer(optim,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale)
        else:
            optim = eval('torch.optim.' + args.optim)(model.parameters(), lr=args.lr)

    if args.load_optim:
        optim.load_state_dict(optim_sd)

    # add linear learning rate scheduler
    if train_data is not None:
        if args.constant_decay:
            num_iters = args.constant_decay
        else:
            num_iters = args.train_iters * args.epochs

        init_step = -1
        if args.load_optim:
            #TODO: this no longer makes sense given the new data loaders
            init_step = optim_sd['iter'] - optim_sd['skipped_iter']
            train_data.batch_sampler.start_iter = (optim_sd['iter'] % len(train_data)) + 1

        warmup_iter = args.warmup * num_iters

        if args.stlr_cut_frac is not None:
            LR = SlantedTriangularLR(optim, cut_frac=args.stlr_cut_frac, num_iters=num_iters)
        else:
            LR = AnnealingLR(optim, start_lr=args.lr, warmup_iter=warmup_iter,
                             num_iters=num_iters, decay_style=args.decay_style)

        if args.warmup != 0:
            LR_Warmer = WarmupLR(optim, warmup_iter, last_iter=init_step)

    # wrap model for distributed training
    if args.world_size > 1:
        model = DDP(model)

    criterion = nn.CrossEntropyLoss(reduce=False)
    return model, optim, LR, LR_Warmer, criterion

def get_model_and_optim(args, train_data):
    """Build the sentiment classifier, its optimizer, and the learning rate schedule."""
    if args.use_softmax:
        args.report_no_thresholding = True
    ntokens = args.data_size
    concat_pools = args.concat_max, args.concat_min, args.concat_mean
    if args.model == 'transformer':
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=None,
            nhid=None,
            nlayers=None,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=None,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    else:
        model = M.SentimentClassifier(
            model_type=args.model,
            ntoken=ntokens,
            ninp=args.emsize,
            nhid=args.nhid,
            nlayers=args.nlayers,
            classifier_hidden_layers=args.classifier_hidden_layers,
            dropout=args.classifier_dropout,
            all_layers=args.all_layers,
            concat_pools=concat_pools,
            get_lm_out=args.aux_lm_loss,
            args=args,
        )
    if args.cuda:
        model.cuda()

    if args.fp16:
        model.half()
    # load char embedding and recurrent encoder for featurization
    if args.load is not None and args.load != '':
        with open(args.load, 'rb') as f:
            sd = x = torch.load(f, 'cpu')
            if 'sd' in sd:
                sd = sd['sd']
        if not args.load_finetuned:
            if 'lm_encoder' in sd:
                sd = sd['lm_encoder']
            try:
                model.lm_encoder.load_state_dict(sd)
            except:
                # if state dict has weight normalized parameters apply and remove weight norm to model while loading sd
                if hasattr(model.lm_encoder, 'rnn'):
                    apply_weight_norm(model.lm_encoder.rnn)
                else:
                    apply_weight_norm(model.lm_encoder)
                model.lm_encoder.load_state_dict(sd)
                remove_weight_norm(model)
        else:
            model.load_state_dict(sd)

    if args.thresh_test_preds:
        model.set_thresholds(
            pd.read_csv(args.thresh_test_preds, header=None).values.squeeze(),
            args.double_thresh,
            args.dual_thresh and not args.joint_binary_train)

    optims = {'adam': 'Adam', 'sgd': 'SGD'}
    optim = eval('torch.optim.' + optims[args.optim.lower()])(
        model.parameters(), lr=args.lr)

    iters_per_epoch = len(train_data)
    num_iters = iters_per_epoch * args.epochs

    assert not (args.stlr_cut_frac and args.cos_cut_frac)
    if args.stlr_cut_frac is not None:
        LR = SlantedTriangularLR(optim,
                                 max_val=args.lr,
                                 cut_frac=args.stlr_cut_frac,
                                 num_iters=num_iters)
    elif args.cos_cut_frac is not None:
        LR = AnnealingLR(optim,
                         start_lr=args.lr,
                         warmup_iter=int(args.cos_cut_frac * num_iters),
                         num_iters=num_iters,
                         decay_style='cosine')
    elif args.decay_style is not None:
        warmup_iters = int(args.warmup_epochs * iters_per_epoch)
        if args.decay_epochs == -1:
            decay_iters = int(args.epochs * iters_per_epoch)
        else:
            decay_iters = int(args.decay_epochs * iters_per_epoch)
        if args.decay_style == 'constant':
            #TODO: implement
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'linear':
            #TODO: implement
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'cosine':
            LR = AnnealingLR(optim,
                             start_lr=args.lr,
                             warmup_iter=warmup_iters,
                             num_iters=decay_iters + warmup_iters,
                             decay_style=args.decay_style)
        elif args.decay_style == 'exponential':
            #TODO: implement
            LR = ConstantLR(optim, lr=args.lr)
        else:
            LR = ConstantLR(optim, lr=args.lr)
    else:
        LR = ConstantLR(optim, lr=args.lr)
    return model, optim, LR

# b = torch.arange(2) * 1000
# h = torch.arange(3) * 100
# pos_seq = torch.arange(9, -1, -1)
# query = torch.arange(7) * 10
# s = pos_seq.unsqueeze(0) + query.unsqueeze(1)
# s = b.view(-1, 1, 1, 1) + h.view(1, -1, 1, 1) + s
# s = GPT2ParallelSelfAttention._rel_shift(s)
# print(s)

from torch.nn.modules import Linear
from torch.optim import Adam

from learning_rates import AnnealingLR

import matplotlib.pyplot as plt
import numpy as np

model = Linear(10, 10)
optimizer = Adam(model.parameters())
lr_scheduler = AnnealingLR(optimizer,
                           start_lr=0.00015,
                           warmup_iter=3000,
                           num_iters=300000,
                           decay_style='cosine',
                           decay_ratio=0.1)
steps = np.arange(0, 400000, 10, dtype=np.int64)
rates = []
for step in steps:
    lr_scheduler.num_iters = step
    rates.append(lr_scheduler.get_lr())
print(rates)
plt.plot(steps, rates)
plt.savefig("lr.pdf", format='pdf')
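

# Illustrative extension (not in the original script): overlay the cosine
# schedule for a couple of decay_ratio values on one figure. It reuses only
# the API already exercised above (the AnnealingLR constructor, the num_iters
# attribute, and get_lr()); the helper name plot_decay_ratios, the ratio
# values, and the output filename are assumptions for illustration.
def plot_decay_ratios(ratios=(0.1, 0.5), total_iters=300000, warmup=3000):
    steps = np.arange(0, 400000, 10, dtype=np.int64)
    plt.figure()
    for ratio in ratios:
        sched = AnnealingLR(Adam(Linear(10, 10).parameters()),
                            start_lr=0.00015,
                            warmup_iter=warmup,
                            num_iters=total_iters,
                            decay_style='cosine',
                            decay_ratio=ratio)
        rates = []
        for step in steps:
            sched.num_iters = step
            rates.append(sched.get_lr())
        plt.plot(steps, rates, label='decay_ratio=%s' % ratio)
    plt.legend()
    plt.savefig("lr_decay_ratios.pdf", format='pdf')


# plot_decay_ratios()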