# resume agent weights and the epoch counter from the checkpoint
agent.load_state_dict(checkpoint['agent'])
start_epoch = checkpoint['epoch'] + 1
print('loaded agent from', args.load)

if args.parallel:
    agent = nn.DataParallel(agent)
    seg_model = nn.DataParallel(seg_model)

seg_model.eval().cuda()
agent.cuda()

optimizer = optim.Adam(agent.parameters(), lr=args.lr, weight_decay=args.wd)
lr_scheduler = utils.LrScheduler(optimizer, args.lr, args.lr_decay_ratio, args.epoch_step)

for epoch in range(start_epoch, start_epoch + args.max_epochs + 1):
    lr_scheduler.adjust_learning_rate(epoch)

    # curriculum: train one more block every 10 epochs, capped at num_blocks
    if args.cl_step < num_blocks:
        args.cl_step = 1 + epoch // 10
    else:
        args.cl_step = num_blocks

    print('training the last %d blocks ...' % args.cl_step)
    train(epoch)

    # evaluate every 10 epochs, without tracking gradients
    with torch.no_grad():
        if epoch != 0 and epoch % 10 == 0:
            test(epoch)
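# NOTE: utils.LrScheduler used above is a project-specific helper whose source is
# not shown here. Below is a minimal sketch of the step-decay behaviour the loop
# appears to assume (scale the learning rate by lr_decay_ratio every epoch_step
# epochs). The class name, arguments, and exact decay rule are assumptions, not
# the project's actual implementation.
class LrScheduler:
    def __init__(self, optimizer, base_lr, lr_decay_ratio, epoch_step):
        self.optimizer = optimizer
        self.base_lr = base_lr
        self.lr_decay_ratio = lr_decay_ratio
        self.epoch_step = epoch_step

    def adjust_learning_rate(self, epoch):
        # e.g. lr = base_lr * lr_decay_ratio ** (epoch // epoch_step)
        lr = self.base_lr * (self.lr_decay_ratio ** (epoch // self.epoch_step))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr
        return lr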
vocab_size = args.max_vocab_size
model = BiLSTM(vocab_size=vocab_size, embed_dim=args.embed_dim, hidden_dim=args.hidden_dim,
               num_tags=len(tag2idx), embed_matrix=embed_mat)

# from_logits=True (the model outputs raw logits) is more numerically stable
# than applying a softmax in the model and passing probabilities to the loss
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

#%% Define optimizer
total_steps = len(list(train_batches)) * args.epochs
warm_steps = int(total_steps * args.warm_frac)
# project helpers (implementations not shown): presumably warm up over
# warm_steps, then decay over the remainder of total_steps
lr_scheduler = utils.LrScheduler(args.lr, warm_steps, total_steps)
wd_scheduler = utils.WdScheduler(1e-2, warm_steps, total_steps)

## Adam optimizer
# optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)

## AdamW optimizer (decoupled weight decay); weight_decay is assigned after
## construction so the callable can reference optimizer.iterations
optimizer = tfa.optimizers.AdamW(learning_rate=lr_scheduler, weight_decay=lambda: None)
optimizer.weight_decay = lambda: wd_scheduler(optimizer.iterations)

#%%
# Batches arrive with different shapes - relax shapes to avoid unnecessary retracing
@tf.function(experimental_relax_shapes=True)