def validate(valid_loader, model, epoch, cur_step):
    stats = utils.SumMeter()
    losses = utils.AverageMeter()

    model.eval()
    with torch.no_grad():
        for step, (X, y) in enumerate(valid_loader):
            X = X.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)
            N = X.size(0)

            logits = model(X)
            loss = model.module.criterion(logits, y)

            truePos, trueNeg, falsePos, falseNeg = utils.accuracy(logits, y)
            losses.update(loss.item(), N)
            stats.update(truePos, trueNeg, falsePos, falseNeg)

            if step % config.print_freq == 0 or step == len(valid_loader) - 1:
                logger.info(
                    "Valid: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                    "Acc/MCC ({acc:.1%}, {mcc:.3})".format(
                        epoch + 1, config.epochs, step, len(valid_loader) - 1,
                        losses=losses, acc=stats.accuracy(), mcc=stats.MCC()))

    writer.add_scalar('val/loss', losses.avg, cur_step)

    logger.info("Valid: [{:2d}/{}] Final MCC {:.4}".format(
        epoch + 1, config.epochs, stats.MCC()))

    return stats.accuracy()
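# The validate() loop above leans on utils.SumMeter exposing update()/accuracy()/MCC().
# Below is a minimal sketch of that interface, assuming a binary-classification
# confusion matrix and the standard Matthews correlation coefficient; the actual
# utils.SumMeter in this repo may differ.
import math

class SumMeter:
    """Hypothetical sketch: accumulates confusion-matrix counts across batches."""

    def __init__(self):
        self.tp = self.tn = self.fp = self.fn = 0

    def update(self, truePos, trueNeg, falsePos, falseNeg):
        self.tp += truePos
        self.tn += trueNeg
        self.fp += falsePos
        self.fn += falseNeg

    def accuracy(self):
        total = self.tp + self.tn + self.fp + self.fn
        return (self.tp + self.tn) / total if total else 0.0

    def MCC(self):
        # Matthews correlation coefficient over the accumulated counts.
        denom = math.sqrt((self.tp + self.fp) * (self.tp + self.fn) *
                          (self.tn + self.fp) * (self.tn + self.fn))
        return (self.tp * self.tn - self.fp * self.fn) / denom if denom else 0.0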
#########################################################################################
print('<train.py> Resume from a checkpoint or create logs directory.')
utils.ensure_dirname(cf.log_dir, override=not cf.resume)
# if cf.resume:
#     exp_logger = load_checkpoint(model.module, optimizer, cf.resume)
# else:
exp_logger = utils.Experiment(os.path.basename(cf.log_dir))
meters = {
    'loss': utils.AvgMeter(),
    'acc1': utils.AvgMeter(),
    'acc5': utils.AvgMeter(),
    'batch_time': utils.AvgMeter(),
    'data_time': utils.AvgMeter(),
    'epoch_time': utils.SumMeter(),
}
for split in vqa.data['qa'].keys():
    exp_logger.add_meters(split, meters)
exp_logger.info['model_params'] = utils.params_count(model)
# print('Model has {} parameters'.format(exp_logger.info['model_params']))

print('<train.py> Start training...')
max_step = None
if cf.debug:
    # max_step = 5
    print('<train.py>: You are in debugging mode...')
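# The meters dict above assumes utils.AvgMeter tracks a running (weighted) average
# read back via .avg, matching the losses.update(loss.item(), N) / losses.avg usage
# in the train/validate loops. A minimal sketch under that assumption; the real
# utils.AvgMeter may carry extra state.
class AvgMeter:
    """Hypothetical sketch: weighted running average."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val          # last observed value
        self.sum += val * n     # weighted sum
        self.count += n

    @property
    def avg(self):
        return self.sum / self.count if self.count else 0.0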
def train(train_loader, valid_loader, model, architect, w_optim, alpha_optim, lr, epoch):
    stats = utils.SumMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)
    writer.add_scalar('train/lr', lr, cur_step)

    model.train()

    for step, ((trn_X, trn_y), (val_X, val_y)) in enumerate(zip(train_loader, valid_loader)):
        if not torch.cuda.is_available():
            logger.info('no gpu device available')
            sys.exit(1)

        trn_X, trn_y = trn_X.to(device, non_blocking=True), trn_y.to(device, non_blocking=True)
        val_X, val_y = val_X.to(device, non_blocking=True), val_y.to(device, non_blocking=True)
        N = trn_X.size(0)

        # phase 2. architect step (alpha)
        alpha_optim.zero_grad()
        architect.unrolled_backward(trn_X, trn_y, val_X, val_y, lr, w_optim, logger)
        alpha_optim.step()

        # phase 1. child network step (w)
        w_optim.zero_grad()
        logits = model(trn_X)
        loss = model.module.criterion(logits, trn_y)
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.module.weights(), config.w_grad_clip)
        w_optim.step()

        truePos, trueNeg, falsePos, falseNeg = utils.accuracy(logits, trn_y)
        losses.update(loss.item(), N)
        stats.update(truePos, trueNeg, falsePos, falseNeg)

        if step % config.print_freq == 0 or step == len(train_loader) - 1:
            logger.info(
                "Train: [{:2d}/{}] Step {:03d}/{:03d} Loss {losses.avg:.3f} "
                "Acc/MCC ({acc:.1%}, {mcc:.3})".format(
                    epoch + 1, config.epochs, step, len(train_loader) - 1,
                    losses=losses, acc=stats.accuracy(), mcc=stats.MCC()))

        writer.add_scalar('train/loss', loss.item(), cur_step)
        cur_step += 1

    logger.info("Train: [{:2d}/{}] Final MCC {:.4}".format(
        epoch + 1, config.epochs, stats.MCC()))
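# Both train() and validate() unpack four confusion counts from utils.accuracy().
# A minimal sketch of such a helper, assuming two-class logits of shape (N, 2)
# and integer labels in {0, 1}; the repo's utils.accuracy may be more general.
import torch

def accuracy(logits, target):
    """Hypothetical sketch: per-batch confusion counts for a two-class problem."""
    pred = logits.argmax(dim=1)
    truePos = ((pred == 1) & (target == 1)).sum().item()
    trueNeg = ((pred == 0) & (target == 0)).sum().item()
    falsePos = ((pred == 1) & (target == 0)).sum().item()
    falseNeg = ((pred == 0) & (target == 1)).sum().item()
    return truePos, trueNeg, falsePos, falseNeg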
def train(train_loader, valid_loader, model, architect, w_optim, alpha_optim, lr, epoch, logger, args):
    stats = utils.SumMeter()
    losses = utils.AverageMeter()

    cur_step = epoch * len(train_loader)

    model.train()

    for step, ((input, target), (val_X, val_y)) in enumerate(zip(train_loader, valid_loader)):
        if args.gpu is not None:
            input = input.cuda(args.gpu, non_blocking=True)
            target = target.cuda(args.gpu, non_blocking=True)
            val_X = val_X.cuda(args.gpu, non_blocking=True)
            val_y = val_y.cuda(args.gpu, non_blocking=True)

        # phase 2. architect step (alpha)
        alpha_optim.zero_grad()
        try:
            architect.unrolled_backward(input, target, val_X, val_y, lr, w_optim, logger)
        except Exception as e:
            # log and continue if the unrolled step fails
            logger.info("problem: {}".format(e))
        alpha_optim.step()

        # phase 1. child network step (w)
        w_optim.zero_grad()
        output = model(input)
        loss = model.criterion(output, target)
        loss.backward()
        # gradient clipping
        nn.utils.clip_grad_norm_(model.weights(), args.w_grad_clip)
        w_optim.step()

        acc = getAccuracy(output, target, args)

        if step % args.print_freq == 0 or step == len(train_loader) - 1:
            logger.info("Epoch: %d, Batch: %d/%d, Loss: %.3f, acc: %.3f" %
                        (epoch + 1, step, len(train_loader) - 1, loss.item(), acc))

        cur_step += 1
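# This variant logs a single scalar from getAccuracy(output, target, args).
# A minimal sketch assuming plain top-1 accuracy; `args` is kept only for
# signature parity and is unused in this hypothetical version.
import torch

def getAccuracy(output, target, args):
    """Hypothetical sketch: top-1 accuracy of a batch as a float in [0, 1]."""
    with torch.no_grad():
        pred = output.argmax(dim=1)
        return (pred == target).float().mean().item()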