def configure_optimizers(self):
    if FLAGS.optim == 'adam':
        self.optimizer = AdamW(self.model.parameters(), lr=FLAGS.lr, weight_decay=1e-5)
    elif FLAGS.optim == 'sm3':
        self.optimizer = SM3(self.model.parameters(), lr=FLAGS.lr, momentum=0.0)
    else:
        self.optimizer = Novograd(self.model.parameters(), lr=FLAGS.lr, weight_decay=1e-3)

    scheduler = []
    if FLAGS.sched:
        self.plateau_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer,
            patience=FLAGS.sched_patience,
            factor=FLAGS.sched_factor,
            min_lr=FLAGS.sched_min_lr,
            verbose=1)
        scheduler = [self.plateau_scheduler]
    self.warmup_optimizer_step(0)
    return [self.optimizer]
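# The ReduceLROnPlateau instance above is kept on self but never stepped in this hook.
# A minimal sketch of how such a plateau scheduler is typically driven after validation,
# assuming a `val_loss` metric; the hook name below is illustrative, not from the snippet.
def on_validation_end(self, val_loss):
    if FLAGS.sched:
        # lowers the learning rate once val_loss stops improving for sched_patience epochs
        self.plateau_scheduler.step(val_loss)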
def get_optimizer(opt, params):
    # large_lr_layers = list(map(id, model.module._fc.parameters()))
    # small_lr_layers = filter(lambda p: id(p) not in large_lr_layers, model.module.parameters())
    if opt.optimizer == 'sgd':
        optimizer = optim.SGD(params, lr=opt.lr, momentum=0.9, nesterov=True)
        # optimizer = torch.optim.SGD([
        #     {"params": model.module._fc.parameters()},
        #     {"params": small_lr_layers, "lr": opt.lr / 10}
        # ], lr=opt.lr, momentum=0.9, weight_decay=1e-4)
    elif opt.optimizer == 'adam':
        optimizer = optim.Adam(params, lr=opt.lr)
    elif opt.optimizer == 'radam':
        optimizer = RAdam(params, lr=opt.lr)
    elif opt.optimizer == 'adamw':
        optimizer = AdamW(params, lr=opt.lr)
        # optimizer = torch.optim.AdamW([
        #     {"params": model.module._fc.parameters()},
        #     {"params": small_lr_layers, "lr": opt.lr / 10}
        # ], lr=opt.lr, weight_decay=5e-4)
    elif opt.optimizer == 'rms':
        # optimizer = optim.RMSprop([
        #     {"params": model.module._fc.parameters()},
        #     {"params": small_lr_layers, "lr": opt.lr / 10}
        # ], lr=opt.lr, momentum=0.9, weight_decay=1e-4)
        optimizer = optim.RMSprop(params, lr=opt.lr, momentum=0.9)
    elif opt.optimizer == 'novograd':
        optimizer = NovoGrad(params, lr=opt.lr, grad_averaging=True)

    if opt.lookahead:
        optimizer = Lookahead(optimizer, k=6, alpha=0.6)
    return optimizer
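# A minimal usage sketch for get_optimizer above; the argparse-style `opt` namespace
# and the toy model are illustrative assumptions, not part of the original snippet.
from types import SimpleNamespace
import torch.nn as nn

opt = SimpleNamespace(optimizer='adamw', lr=3e-4, lookahead=False)
model = nn.Linear(128, 10)  # stand-in for a real backbone
optimizer = get_optimizer(opt, model.parameters())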
def worker(proc_id, gpu_ranks, args, model):
    if args.dist_train:
        # multiple GPU mode
        rank = gpu_ranks[proc_id] % args.world_size
        gpu_id = gpu_ranks[proc_id] % args.device_count
    elif args.single_gpu:
        # single GPU mode
        rank = None
        gpu_id = proc_id
    else:
        # CPU mode
        rank = None
        gpu_id = None

    if args.dist_train:
        train_loader = LOADERS[args.target](args, args.dataset_path, args.batch_size, rank, args.world_size, True)
    else:
        train_loader = LOADERS[args.target](args, args.dataset_path, args.batch_size, 0, 1, True)

    if gpu_id is not None:
        torch.cuda.set_device(gpu_id)
        model.cuda(gpu_id)

    # build optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.total_steps * args.warmup,
                                     t_total=args.total_steps)

    if args.dist_train:
        # initialize multiprocessing distributed training environment
        dist.init_process_group(backend=args.backend,
                                init_method=args.master_ip,
                                world_size=args.world_size,
                                rank=rank)
        model = DistributedDataParallel(model, device_ids=[gpu_id])
        print("Worker {} is training ... ".format(rank))
    else:
        print("Worker is training ...")

    TRAINERS[args.target](args, gpu_id, rank, train_loader, model, optimizer, scheduler)
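# The per-step update order inside TRAINERS[args.target] is not shown above. A minimal
# sketch of how an AdamW + WarmupLinearSchedule pair is usually driven, assuming a model
# call that returns a scalar loss; the function and argument names are illustrative.
def train_step(model, batch, optimizer, scheduler):
    loss = model(*batch)      # forward pass producing a scalar loss
    loss.backward()           # accumulate gradients
    optimizer.step()          # AdamW parameter update
    scheduler.step()          # advance the warmup / linear-decay schedule
    optimizer.zero_grad()     # clear gradients for the next step
    return loss.item()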
def train(dataset, embedding, tokenizer, entity_linker, min_count, max_word_length, max_entity_length,
          batch_size, patience, learning_rate, weight_decay, warmup_epochs, dropout_prob, use_gpu, use_word):
    if use_gpu:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    data = generate_features(dataset, tokenizer, entity_linker, min_count, max_word_length, max_entity_length)
    word_vocab = data['word_vocab']
    entity_vocab = data['entity_vocab']
    train_data_loader = DataLoader(data['train'], shuffle=True, batch_size=batch_size)
    dev_data_loader = DataLoader(data['dev'], shuffle=False, batch_size=batch_size)

    dim_size = embedding.syn0.shape[1]
    word_embedding = np.random.uniform(low=-0.05, high=0.05, size=(len(word_vocab), dim_size))
    word_embedding[0] = np.zeros(dim_size)
    for word, index in word_vocab.items():
        try:
            word_embedding[index] = embedding.get_word_vector(word)
        except KeyError:
            continue

    entity_embedding = np.random.uniform(low=-0.05, high=0.05, size=(len(entity_vocab), dim_size))
    entity_embedding[0] = np.zeros(dim_size)
    for entity, index in entity_vocab.items():
        try:
            entity_embedding[index] = embedding.get_entity_vector(entity)
        except KeyError:
            continue

    model = NABoE(word_embedding, entity_embedding, len(dataset.label_names), dropout_prob, use_word)
    optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay,
                      warmup=warmup_epochs * len(train_data_loader))
    model.to(device)

    epoch = 0
    best_val_acc = 0.0
    best_weights = None
    num_epochs_without_improvement = 0
    while True:
        with tqdm(train_data_loader) as pbar:
            model.train()
            for batch in pbar:
                args = {k: v.to(device) for k, v in batch.items() if k != 'label'}
                logits = model(**args)
                loss = F.cross_entropy(logits, batch['label'].to(device))
                loss.backward()
                optimizer.step()
                model.zero_grad()
                pbar.set_description(f'epoch: {epoch} loss: {loss.item():.8f}')

        epoch += 1
        val_acc = evaluate(model, dev_data_loader, device, 'dev')[0]
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_weights = {k: v.to('cpu').clone() for k, v in model.state_dict().items()}
            num_epochs_without_improvement = 0
        else:
            num_epochs_without_improvement += 1
            if num_epochs_without_improvement >= patience:
                model.load_state_dict(best_weights)
                break

    test_data_loader = DataLoader(data['test'], shuffle=False, batch_size=batch_size)
    return evaluate(model, test_data_loader, device, 'test')
dis_loss = nn.BCEWithLogitsLoss(reduction='mean')

# group parameters so bias/LayerNorm-style weights are excluded from weight decay;
# match by substring, since named_parameters() yields dotted paths such as "encoder.layer.0.bias"
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.01
}, {
    'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay_rate': 0.0
}]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr)
# optimizer = apex.optimizers.FusedLAMB(optimizer_grouped_parameters, lr=args.lr)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
model = DistributedDataParallel(model)

log_dir = os.path.join(args.save_path, 'logs')
model_dir = os.path.join(args.save_path, 'models')
if not os.path.exists(log_dir):
    mkdir_p(log_dir)
if not os.path.exists(model_dir):
    mkdir_p(model_dir)
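# amp.initialize above only wraps the model and optimizer; the backward pass has to go
# through apex's loss scaler. A minimal per-step sketch under that assumption; the
# function name and the input/target handling are illustrative, not from the snippet.
def dis_step(model, optimizer, inputs, targets):
    logits = model(inputs)
    loss = dis_loss(logits, targets)
    optimizer.zero_grad()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # scaled backward for O1 mixed precision
    optimizer.step()
    return loss.item()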
def train(args):
    device = torch.device('cuda') if args.use_gpu else torch.device('cpu')

    #### Load data
    # create the data and its corresponding datasets and dataloader
    train_data, num_labels = create_data(args.train, 'train')
    dev_data = create_data(args.dev, 'valid')

    train_dataset = BertDataset(train_data, args)
    dev_dataset = BertDataset(dev_data, args)

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=args.batch_size,
                                  collate_fn=train_dataset.collate_fn)
    dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=args.batch_size,
                                collate_fn=dev_dataset.collate_fn)

    #### Init model
    config = {
        'hidden_dropout_prob': args.hidden_dropout_prob,
        'num_labels': num_labels,
        'hidden_size': 768,
        'data_dir': '.',
        'option': args.option
    }
    config = SimpleNamespace(**config)

    # initialize the Sentence Classification Model
    model = BertSentClassifier(config)
    model = model.to(device)

    lr = args.lr
    ## specify the optimizer
    optimizer = AdamW(model.parameters(), lr=lr)
    best_dev_acc = 0

    ## run for the specified number of epochs
    for epoch in range(args.epochs):
        model.train()
        train_loss = 0
        num_batches = 0
        for step, batch in enumerate(
                tqdm(train_dataloader, desc=f'train-{epoch}', disable=TQDM_DISABLE)):
            b_ids, b_type_ids, b_mask, b_labels, b_sents = (
                batch[0]['token_ids'], batch[0]['token_type_ids'],
                batch[0]['attention_mask'], batch[0]['labels'], batch[0]['sents'])

            b_ids = b_ids.to(device)
            b_mask = b_mask.to(device)
            b_labels = b_labels.to(device)

            optimizer.zero_grad()
            logits = model(b_ids, b_mask)
            # note: F.nll_loss expects log-probabilities, so the classifier head is
            # expected to apply log_softmax to its outputs
            loss = F.nll_loss(logits, b_labels.view(-1), reduction='sum') / args.batch_size
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            num_batches += 1

        train_loss = train_loss / num_batches

        train_acc, train_f1, *_ = model_eval(train_dataloader, model, device)
        dev_acc, dev_f1, *_ = model_eval(dev_dataloader, model, device)

        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            save_model(model, optimizer, args, config, args.filepath)

        print(f"epoch {epoch}: train loss :: {train_loss :.3f}, "
              f"train acc :: {train_acc :.3f}, dev acc :: {dev_acc :.3f}")
if args.model == 'resnet':
    model = ResNet18(num_classes=10)

args.model += f"_{args.optimizer}"
if args.do_scheduler:
    args.model += "_cosine"
model.to(device)

if args.optimizer == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
if args.optimizer == 'adamw':
    optimizer = AdamW(model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.001)
if args.optimizer == 'novograd':
    optimizer = NovoGrad(model.parameters(), lr=0.01, betas=(0.95, 0.98), weight_decay=0.001)

train_monitor = TrainingMonitor(file_dir='./png', arch=args.model)

if args.do_scheduler:
    lr_scheduler = CosineAnnealingLR(optimizer, epochs * len(loaders['train']), 1e-4)

for epoch in range(1, epochs + 1):
    if args.do_scheduler:
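        # (excerpt truncated here) hedged continuation, an assumption rather than the
        # original code: log the current LR, then step the cosine schedule once per
        # batch to match its horizon of epochs * len(loaders['train'])
        print(f"epoch {epoch} lr: {lr_scheduler.get_last_lr()[0]:.6f}")
    for inputs, targets in loaders['train']:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = F.cross_entropy(model(inputs), targets)
        loss.backward()
        optimizer.step()
        if args.do_scheduler:
            lr_scheduler.step()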
def train(args, network, train_itr, dev_itr):
    logger.info("Start training.")
    num_train_steps = int(args.max_epoch * len(train_itr) / args.gradient_accumulation_steps)
    logger.info("Num update steps {}!".format(num_train_steps))

    start_epoch, best_result = 1, 0.0
    metrics = {key: AverageMeter() for key in ['loss', 'f1', 'em']}

    def reset_metrics():
        for metric in metrics.values():
            metric.reset()

    def update_metrics(result):
        for key in metrics.keys():
            metrics[key].update(result[key])

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in network.bert.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.bert_weight_decay,
        'lr': args.bert_learning_rate
    }, {
        'params': [
            p for n, p in network.bert.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0,
        'lr': args.bert_learning_rate
    }, {
        'params': [
            p for n, p in network.named_parameters()
            if not n.startswith("bert.")
        ],
        "weight_decay": args.weight_decay,
        "lr": args.learning_rate
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      warmup=args.warmup,
                      t_total=num_train_steps,
                      max_grad_norm=args.grad_clipping,
                      schedule=args.warmup_schedule)

    update_cnt, step = 0, 0
    train_start = datetime.now()
    save_prefix = os.path.join(args.save_dir, "checkpoint_best")

    for epoch in range(start_epoch, args.max_epoch + 1):
        logger.info('Start epoch {}'.format(epoch))
        reset_metrics()
        for batch in train_itr:
            step += 1
            network.train()
            output_dict = network(**batch)
            loss = output_dict["loss"]
            if args.gradient_accumulation_steps > 1:
                loss /= args.gradient_accumulation_steps
            loss.backward()

            current_metrics = network.get_metrics(True)
            current_metrics['loss'] = output_dict["loss"]
            update_metrics(current_metrics)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                update_cnt += 1
                if update_cnt % (args.log_per_updates * args.gradient_accumulation_steps) == 0 or update_cnt == 1:
                    logger.info(
                        "QDGAT train: step:{0:6} loss:{1:.5f} f1:{2:.5f} em:{3:.5f} left:{4}".format(
                            update_cnt, metrics['loss'].avg, metrics['f1'].avg, metrics['em'].avg,
                            str((datetime.now() - train_start) / (update_cnt + 1) *
                                (num_train_steps - update_cnt - 1)).split('.')[0]))
                    reset_metrics()

        if args.do_eval:
            eval_loss, eval_f1, eval_em = evaluate(args, network, dev_itr)
            logger.info("Epoch {} eval result, loss {}, f1 {}, em {}.".format(epoch, eval_loss, eval_f1, eval_em))
        if args.do_eval and eval_f1 > best_result:
            save(args, network, optimizer, save_prefix, epoch, best_result)
            best_result = eval_f1
            logger.info("Best eval F1 {} at epoch {}".format(best_result, epoch))

    logger.info("Train cost {}s.".format((datetime.now() - train_start).seconds))