def train(self, epoch, trainloader, print_every=100):
    """Run one training epoch over ``trainloader`` and record the mean batch loss.

    Every 10th epoch (for epoch > 0) the learning rate is adjusted via the
    module-level ``adjust_lr`` helper before iterating batches.

    Parameters
    ----------
    epoch : int
        Current epoch index (used for LR adjustment and progress prints).
    trainloader : torch.utils.data.DataLoader
        Yields ``(train_data, train_labels)`` batches.
    print_every : int, optional
        Print progress every ``print_every`` batches.

    Side effects: steps ``self.optimizer`` and appends the epoch's mean
    loss to ``self.train_loss``.
    """
    loss_batch = 0
    if epoch % 10 == 0 and epoch > 0:
        adjust_lr(self.optimizer, self.lr)
    for b_idx, (train_data, train_labels) in enumerate(trainloader):
        if self.use_gpu and str(self.device) == 'cuda:0':
            train_data = train_data.cuda(non_blocking=True)
            train_labels = train_labels.cuda()
        # Forward pass.  FIX: invoke the module directly instead of
        # ``self.model.forward(...)`` so that registered forward/backward
        # hooks are honoured (calling .forward() bypasses __call__).
        train_preds = self.model(train_data)
        loss = self.model.loss(train_preds, train_labels)
        if self.l2:
            # Optional L2 penalty folded into the loss.
            loss = self.l2_regularization(loss, self.l2)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        if b_idx % print_every == 0:
            # FIX: format the Python scalar, not the tensor object.
            print('Train Epoch: {0} [{1}/{2} ({3:.0f}%)]\t Loss {4:.6f}'.
                  format(epoch, b_idx * len(train_data),
                         len(trainloader.dataset),
                         100. * b_idx / len(trainloader), loss.item()))
        loss_batch += loss.item()
    loss_batch /= len(trainloader)
    self.train_loss.append(loss_batch)
# One clean-training epoch: forward pass, criterion loss, apex-amp scaled
# backward, optimizer step; accumulates per-batch loss and top-1 accuracy
# fractions, then logs epoch means (accuracy as a percentage) via
# logger.log_train with tag "clean_training".
# NOTE(review): the cyclic-schedule branch passes step index
# `epoch * batch_idx`, which is 0 for the entire first epoch and repeats
# values across epochs -- presumably `epoch * len(tr_loader) + batch_idx`
# was intended; confirm against utils.adjust_lr.
# NOTE(review): the collapsed formatting makes it ambiguous whether the
# non-cyclic `utils.adjust_lr(..., epoch)` call sits inside or after the
# batch loop -- verify indentation in the canonical source.
def train(epoch, model, criterion, opt, scheduler, tr_loader, device, logger, schdl_type='cyclic'): model.train() ep_loss = 0 ep_acc = 0 print( '[INFO][TRAINING][clean_training] \t Epoch {} started.'.format(epoch)) for batch_idx, (inpt, targets) in enumerate(tqdm(tr_loader)): inpt, targets = inpt.to(device), targets.to(device) output = model(inpt) loss = criterion(output, targets) opt.zero_grad() with amp.scale_loss(loss, opt) as scaled_loss: scaled_loss.backward() opt.step() ep_loss += loss.item() ep_acc += (output.max(1)[1] == targets).sum().item() / len(targets) if schdl_type == 'cyclic': utils.adjust_lr(opt, scheduler, logger, epoch * batch_idx) if schdl_type != 'cyclic': utils.adjust_lr(opt, scheduler, logger, epoch) logger.log_train(epoch, ep_loss / len(tr_loader), (ep_acc / len(tr_loader)) * 100, "clean_training")
def adjust_lr(self, ep):
    """Decay the ReID optimizer's per-group learning rates for epoch ``ep``.

    Delegates to ``utils.adjust_lr`` with the configured base rates for the
    two feature-extraction parameter groups and the FC weight/bias groups.
    """
    base_rates = [
        cfg.pre_reid_ft_base_lr,
        cfg.pre_reid_ft_base_lr,
        cfg.pre_reid_fc_weight_base_lr,
        cfg.pre_reid_fc_bias_base_lr,
    ]
    utils.adjust_lr(
        param_groups=self.optimReID.param_groups,
        base_lrs=base_rates,
        decay_epochs=cfg.pre_reid_lr_decay_epochs,
        epoch=ep,
        verbose=True,
    )
# One adversarial-training epoch: for each batch, computes a PGD
# perturbation `delta` (pgd.train_pgd with hyper-parameters from cnfg),
# trains on `inpt + delta` with apex-amp scaled backward, and logs epoch
# mean loss/accuracy via logger.log_train with tag "pgd_training".
# NOTE(review): same cyclic-schedule step index concern as the clean
# variant -- `epoch * batch_idx` is 0 for the whole first epoch; verify.
# NOTE(review): `print('ce ba', len(tr_loader))` looks like leftover debug
# output -- consider removing.
def train(epoch, model, criterion, opt, scheduler, cnfg, tr_loader, device, logger, schdl_type='cyclic'): model.train() ep_loss = 0 ep_acc = 0 print( '[INFO][TRAINING][clean_training] \t Epoch {} started.'.format(epoch)) for batch_idx, (inpt, targets) in enumerate(tqdm(tr_loader)): inpt, targets = inpt.to(device), targets.to(device) l_limit, u_limit = pgd.get_limits(device) delta = pgd.train_pgd(model, device, criterion, inpt, targets, epsilon=cnfg['pgd']['epsilon'], alpha=cnfg['pgd']['alpha'], iter=cnfg['pgd']['iter'], opt=opt, restart=cnfg['pgd']['restarts'], d_init=cnfg['pgd']['delta-init'], l_limit=l_limit, u_limit=u_limit) output = model(inpt + delta) loss = criterion(output, targets) opt.zero_grad() with amp.scale_loss(loss, opt) as scaled_loss: scaled_loss.backward() opt.step() ep_loss += loss.item() ep_acc += (output.max(1)[1] == targets).sum().item() / len(targets) if schdl_type == 'cyclic': utils.adjust_lr(opt, scheduler, logger, epoch * batch_idx) if schdl_type != 'cyclic': utils.adjust_lr(opt, scheduler, logger, epoch) print('ce ba', len(tr_loader)) logger.log_train(epoch, ep_loss / len(tr_loader), (ep_acc / len(tr_loader)) * 100, "pgd_training")
# One optimizer step driven by the gradient-estimator wrapper (gvar):
# zero grads, compute loss/gradients via gvar.grad(self.niters), and if the
# estimator in use flipped (gest_used changed during grad()), re-adjust the
# learning rate and reset state before stepping the optimizer.
# Returns the loss produced by gvar.grad.
# NOTE(review): assigns `self.gest_used` from `gvar.gest_used` -- looks like
# a cached copy on this object; confirm against the class definition.
def step(self, profiler): gvar = self.gvar opt = self.opt self.optimizer.zero_grad() pg_used = gvar.gest_used loss = gvar.grad(self.niters) if gvar.gest_used != pg_used: logging.info('Optimizer reset.') self.gest_used = gvar.gest_used utils.adjust_lr(self, opt) self.reset() self.optimizer.step() profiler.toc('optim') profiler.end() return loss
# One epoch of mixup training with label smoothing: adjusts the LR for this
# epoch, applies mixup_data (alpha=0.4) to each batch, computes the mixed
# label-smoothing cross-entropy, tracks top-1/top-5 correct counts, prints
# per-batch progress, and appends an epoch summary to `log_txt`.
# NOTE(review): per-batch losses are divided by len(trainloader) as they are
# accumulated, so `train_loss` ends the epoch holding the mean batch loss --
# intentional but subtle.
# NOTE(review): criterion_cls from criterion_list is bound but the mixup
# branch uses CrossEntropyLoss_label_smooth instead (plain-CE line is
# commented out); criterion_div is bound but unused in this snippet.
def train(epoch, criterion_list, optimizer): train_loss = 0. train_loss_cls = 0. train_loss_div = 0. top1_num = 0 top5_num = 0 total = 0 lr = adjust_lr(optimizer, epoch, args) start_time = time.time() criterion_cls = criterion_list[0] criterion_div = criterion_list[1] net.train() for batch_idx, (input, target) in enumerate(trainloader): batch_start_time = time.time() input = input.cuda() target = target.cuda() input, targets_a, targets_b, lam = mixup_data(input, target, 0.4) logit = net(input) #loss_cls = criterion_cls(logit, target) loss_cls = mixup_criterion(CrossEntropyLoss_label_smooth, logit, targets_a, targets_b, lam) loss = loss_cls optimizer.zero_grad() loss.backward() optimizer.step() train_loss += loss.item() / len(trainloader) train_loss_cls += loss_cls.item() / len(trainloader) top1, top5 = correct_num(logit, target, topk=(1, 5)) top1_num += top1 top5_num += top5 total += target.size(0) print('Epoch:{},batch_idx:{}/{}'.format(epoch, batch_idx, len(trainloader)), 'acc:', top1_num.item() / total, 'duration:', time.time()-batch_start_time) print('Epoch:{}\t lr:{:.5f}\t duration:{:.3f}' '\n train_loss:{:.5f}\t train_loss_cls:{:.5f}' '\n top1_acc: {:.4f} \t top5_acc:{:.4f}' .format(epoch, lr, time.time() - start_time, train_loss, train_loss_cls, (top1_num/total).item(), (top5_num/total).item())) with open(log_txt, 'a+') as f: f.write('Epoch:{}\t lr:{:.5f}\t duration:{:.3f}' '\ntrain_loss:{:.5f}\t train_loss_cls:{:.5f}' '\ntop1_acc: {:.4f} \t top5_acc:{:.4f} \n' .format(epoch, lr, time.time() - start_time, train_loss, train_loss_cls, (top1_num/total).item(), (top5_num/total).item()))
# Variant of the gvar optimizer step with periodic "online snapshot" work
# for the NUQ (quantized-gradient) estimator: at iterations listed in
# opt.g_osnap_iter (two initial iters plus an `every` period, once past
# opt.gvar_start) it refreshes the quantizer's mean/variance statistics and,
# for adaptive methods (amq/alq/alq_nb/amq_nb), re-learns quantization
# levels; then performs the same grad/adjust-lr/step sequence as the base
# `step`.  Returns the loss from gvar.grad.
# NOTE(review): the bare `print(self.niters)` looks like leftover debug
# output.
def step(self, profiler): gvar = self.gvar opt = self.opt model = self.model self.optimizer.zero_grad() # Frequent snaps inits = list(map(int, opt.g_osnap_iter.split(',')[0:2])) every = int(opt.g_osnap_iter.split(',')[-1]) if (((self.niters - opt.gvar_start) % every == 0 or self.niters in inits) and self.niters >= opt.gvar_start): print(self.niters) if opt.g_estim == 'nuq' and opt.nuq_method != 'none': stats = gvar.gest.snap_online_mean(model) if opt.nuq_parallel == 'ngpu': for qdq in gvar.gest.qdq: qdq.set_mean_variance(stats) else: gvar.gest.qdq.set_mean_variance(stats) if opt.nuq_method == 'amq' or opt.nuq_method == 'alq' or opt.nuq_method == 'alq_nb' or opt.nuq_method == 'amq_nb': if opt.nuq_parallel == 'ngpu': for qdq in gvar.gest.qdq: qdq.update_levels() else: gvar.gest.qdq.update_levels() pg_used = gvar.gest_used loss = gvar.grad(self.niters) if gvar.gest_used != pg_used: logging.info('Optimizer reset.') self.gest_used = gvar.gest_used utils.adjust_lr(self, opt) self.reset() self.optimizer.step() profiler.toc('optim') profiler.end() return loss
# Second-stage fusion training for vessel segmentation: the two frozen
# front-end nets (thick/thin) produce predictions under torch.no_grad(),
# and only `fusion_net` is trained on (image first channel, thick_pred,
# thin_pred).  Logs images/loss/lr to visdom and tensorboard each step,
# applies (presumably polynomial, via `power`) LR decay once per epoch, and
# returns the fusion net.
# NOTE(review): `adjust_lr` is called after the epoch's batches, so the
# decayed rate takes effect from the next epoch -- confirm that is intended.
def train_second_stage(viz, writer, dataloader, front_net_thick, front_net_thin, fusion_net, optimizer, base_lr, criterion, device, power, epoch, num_epochs=100): dt_size = len(dataloader.dataset) epoch_loss = 0 step = 0 for sample in dataloader: step += 1 img = sample[0].to(device) gt = sample[1].to(device) with torch.no_grad(): thick_pred = front_net_thick(img) thin_pred= front_net_thin(img) # zero the parameter gradients optimizer.zero_grad() # forward fusion_pred = fusion_net(img[:, :1, :, :], thick_pred, thin_pred) viz.img(name="images", img_=img[0, :, :, :]) viz.img(name="labels", img_=gt[0, :, :, :]) viz.img(name="prediction", img_=fusion_pred[0, :, :, :]) loss = criterion(fusion_pred, gt) loss.backward() optimizer.step() epoch_loss += loss.item() # loss of the current batch niter = epoch * len(dataloader) + step writer.add_scalars("train_loss", {"train_loss": loss.item()}, niter) print("%d / %d, train loss: %0.4f" % (step, (dt_size - 1) // dataloader.batch_size + 1, loss.item())) viz.plot("train loss", loss.item()) # record the current lr current_lr = get_lr(optimizer) viz.plot("learning rate", current_lr) writer.add_scalars("learning_rate", {"lr": current_lr}, niter) print("epoch %d loss: %0.4f" % (epoch, epoch_loss)) print("current learning rate: %f" % current_lr) adjust_lr(optimizer, base_lr, epoch, num_epochs, power=power) return fusion_net
# First-stage training: a single net predicts thick and thin vessel maps
# jointly; the loss is the (unweighted) sum of the thin and thick criterion
# terms.  Logs images/loss/lr to visdom and tensorboard each step, decays
# the LR once per epoch via adjust_lr (power schedule), and returns the net.
# NOTE(review): sample[1] (the full ground truth used in the second stage)
# is skipped here -- indices 2/3 are the thin/thick targets; verify against
# the dataset definition.
def train_first_stage(viz, writer, dataloader, net, optimizer, base_lr, thin_criterion, thick_criterion, device, power, epoch, num_epochs=100): dt_size = len(dataloader.dataset) epoch_loss = 0 step = 0 for sample in dataloader: step += 1 img = sample[0].to(device) thin_gt = sample[2].to(device) thick_gt = sample[3].to(device) # zero the parameter gradients optimizer.zero_grad() # forward thick_pred, thin_pred, _ = net(img) viz.img(name="images", img_=img[0, :, :, :]) viz.img(name="thin labels", img_=thin_gt[0, :, :, :]) viz.img(name="thick labels", img_=thick_gt[0, :, :, :]) viz.img(name="thin prediction", img_=thin_pred[0, :, :, :]) viz.img(name="thick prediction", img_=thick_pred[0, :, :, :]) loss = thin_criterion(thin_pred, thin_gt) + thick_criterion(thick_pred, thick_gt) # terms could be weighted loss.backward() optimizer.step() epoch_loss += loss.item() # loss of the current batch niter = epoch * len(dataloader) + step writer.add_scalars("train_loss", {"train_loss": loss.item()}, niter) print("%d / %d, train loss: %0.4f" % (step, (dt_size - 1) // dataloader.batch_size + 1, loss.item())) viz.plot("train loss", loss.item()) # record the current lr current_lr = get_lr(optimizer) viz.plot("learning rate", current_lr) writer.add_scalars("learning_rate", {"lr": current_lr}, niter) print("epoch %d loss: %0.4f" % (epoch, epoch_loss)) print("current learning rate: %f" % current_lr) adjust_lr(optimizer, base_lr, epoch, num_epochs, power=power) return net
# One epoch of seq2seq "instructor" training: decays the LR, computes the
# teacher-forced loss against inst[:, 1:] (instruction shifted by one),
# clips gradients at 0.1, prints progress every args.log_interval batches,
# and saves a per-epoch checkpoint.  Uses module-level `model`, `optimizer`,
# `train_loader`, `loss_fn`, and `args`.
# NOTE(review): `loss.data[0]` is pre-0.4 PyTorch; on modern versions this
# raises for 0-dim tensors -- `loss.item()` is the current spelling.  Left
# as-is since the file targets the old Variable API throughout.
# NOTE(review): `ref_obj` is hard-coded to None and `raw_data` is unused.
def train(epoch): model.train() adjust_lr(optimizer, epoch, args.lr, decay_rate=0.2) for batch_idx, data in enumerate(train_loader): raw_data = data[-1] data = [Variable(_, requires_grad=False).cuda() for _ in data[:-1]] prev_canvas, final_canvas, inst, target_obj, act = data ref_obj = None optimizer.zero_grad() loss = loss_fn( model(inst, prev_canvas, final_canvas, (target_obj, ref_obj, True)), inst[:, 1:]) loss.backward() clip_gradient(optimizer, 0.1) optimizer.step() if batch_idx % args.log_interval == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f})'.format( epoch, batch_idx * args.batch_size, len(train_loader.dataset), 100. * batch_idx / len(train_loader), loss.data[0])) torch.save( model.state_dict(), 'instructor_seq2seq_target_canvas_concat/model_{}.pth'.format(epoch))
# One epoch of training mixing a supervised seq2seq loss with a REINFORCE
# policy-gradient term: policy_loss = sum(-log_prob * reward) using buffers
# the model populates during its forward pass (saved_log_probs / rewards),
# which are cleared after each step.  Gradients are clipped at 0.1; model
# and optimizer checkpoints are saved every epoch.
# NOTE(review): `loss.data[0]` is pre-0.4 PyTorch (see the sibling `train`);
# `loss.item()` is the modern spelling.
# NOTE(review): the printed loss excludes the policy term -- confirm that
# is intended for monitoring.
def train(epoch): model.train() adjust_lr(optimizer, epoch, args.lr, decay_rate=0.2) for batch_idx, data in enumerate(train_loader): raw_data = data[-1] data = [Variable(_, requires_grad=False).cuda() for _ in data[:-1]] prev_canvas, inst, next_obj, final_canvas, ref_obj = data optimizer.zero_grad() loss = loss_fn( model(inst, prev_canvas, final_canvas, (next_obj, ref_obj, True)), inst[:, 1:]) policy_loss = (-model.saved_log_probs * model.rewards).sum() # policy_loss = 0 # for i in range(len(model.saved_log_probs)): # policy_loss += (-model.saved_log_probs[i] * model.rewards[:, i]).sum() (loss + policy_loss).backward() clip_gradient(optimizer, 0.1) optimizer.step() # del model.saved_log_probs[:] model.saved_log_probs = None model.sampled_actions = None model.rewards = None if batch_idx % args.log_interval == 0: print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f})'.format( epoch, batch_idx * args.batch_size, len(train_loader.dataset), 100. * batch_idx / len(train_loader), loss.data[0])) # torch.save(model.state_dict(), 'models_topdown_3att_att64_hardatt/model_{}.pth'.format(epoch)) # torch.save(optimizer.state_dict(), 'models_topdown_3att_att64_hardatt/optimizer_{}.pth'.format(epoch)) torch.save( model.state_dict(), 'models_topdown_3att_att64_content_planning_absmix_reinforce_running_baseline/model_{}.pth' .format(epoch)) torch.save( optimizer.state_dict(), 'models_topdown_3att_att64_content_planning_absmix_reinforce_running_baseline/optimizer_{}.pth' .format(epoch))
# FRAGMENT(review): begins mid-statement -- the leading '}]' closes a
# per-layer param-group list built above this chunk.  Builds a Nesterov SGD
# optimizer, then runs an iteration loop: step-decay LR every iteration
# (lr_step_size=1000), grab a (batch, label) slice from X/Y via grab_batch,
# forward/backward/step, and keep running loss/accuracy tallies.
# NOTE(review): `loss.data.cpu().numpy()[0]` is pre-0.4 PyTorch indexing of
# a 0-dim tensor; modern spelling is `loss.item()`.
}] else: param_groups = model.parameters() optimizer = torch.optim.SGD(param_groups, lr=lr, momentum=0.9, weight_decay=5e-4, nesterov=True) print('Start Training.') loss_record = 0. acc_record = 0. best_acc = 0. bat_start = 0 for it in range(n_iters): print('\rExtractor | iter %05d' % (it + 1), end='') adjust_lr(it, lr, optimizer, lr_step_size=1000) ''' Grab a batch from X and Y. ''' batch, label, bat_start = grab_batch(X, Y, bat_start, batch_size) x = Variable(torch.from_numpy(batch.astype(float)).float().cuda()) y = Variable(torch.from_numpy(label).long().cuda()) ''' Feedforward and Backward. ''' outputs = model(x) loss = criterion(outputs, y) optimizer.zero_grad() loss.backward() optimizer.step() loss_record += loss.data.cpu().numpy()[0] oo = outputs.data.cpu().numpy() oc = np.argmax(oo, axis=1) acc = np.sum(oc == label) / batch_size
# FRAGMENT(review): script-level resume + training-loop prologue.  Restores
# model/optimizer state and best top-1/top-5 records when opt.resume is set,
# then per epoch applies milestone LR decay (gamma=0.1) and trains, tracking
# a rolling window of the 100 latest top-1/top-5 errors in top1_hist /
# top5_hist (index iter_total % 100).
# NOTE(review): `np.float` is removed in NumPy >= 1.24; `float(top1)` is the
# modern spelling.
dict_best_top5 = {'Epoch': 0, 'Top5': 100.} if opt.resume: state_dict = torch.load(opt.path_model) model.load_state_dict(state_dict['state_dict']) optim.load_state_dict(state_dict['optimizer']) dict_best_top1.update({'Epoch': opt.epoch_top1, 'Top1': opt.top1}) dict_best_top5.update({'Epoch': opt.epoch_top5, 'Top5': opt.top5}) st = datetime.now() iter_total = 0 top1_hist = list(100 for i in range(100)) top5_hist = list(100 for i in range(100)) # to see 100 latest top5 error for epoch in range(opt.epoch_recent, opt.epochs): adjust_lr(optim, epoch, opt.lr, milestones=milestones, gamma=0.1) list_loss = list() model.train() for input, label in tqdm(data_loader): iter_total += 1 input, label = input.to(device), label.to(device) output = model(input) loss = criterion(output, label) optim.zero_grad() loss.backward() optim.step() top1, top5 = cal_top1_and_top5(output, label) top1_hist[iter_total % 100] = np.float(top1)
# Per-GPU worker for multi-process distributed DenseNet-121 training:
# computes a global rank from node_rank/gpus, joins the process group,
# wraps the model in DistributedDataParallel, builds ImageFolder train/val
# loaders (DistributedSampler on train; per-process batch size and worker
# count divided by args.gpus), then per epoch: set_epoch on the sampler,
# adjust_lr, train, validate, and checkpoint the best top-1 model.
# NOTE(review): `cudnn.backend = True` is almost certainly a typo for
# `cudnn.benchmark = True` -- the assignment as written sets an unused
# attribute and has no effect.
# NOTE(review): `densenet.cuda()` with no device index plus DDP without
# device_ids relies on each process's visible device being set externally
# (the commented-out per-gpu lines suggest this was in flux) -- verify.
# NOTE(review): the tensorboard branch logs `losses.avg`/`top1.avg`, but
# `train(...)`'s return value is discarded here -- these names must come
# from elsewhere or this raises NameError; verify against the full file.
def main_worker(gpu, args): """ @param: gpu - index of the gpu on a single node, here its range is [0, args.gpus-1] """ # IMPORTANT: we need to set the random seed in each process so that the models are initialized with the same weights # Reference: https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html # torch.cuda.manual_seed(args.seed) # for distributed training, rank needs to be global rank among all processes rank = args.node_rank * args.gpus + gpu dist.init_process_group(backend=args.dist_backend, \ init_method=args.dist_url, \ world_size=args.world_size, \ rank=rank) # build model densenet = DenseNet121(in_channels=3, growth_rate=args.growth_rate, \ compression_rate=args.compression_rate, \ num_classes=args.num_classes) # torch.cuda.device(gpu) # densenet.cuda(gpu) densenet.cuda() # densenet = nn.parallel.DistributedDataParallel(densenet, device_ids=[gpu]) densenet = nn.parallel.DistributedDataParallel(densenet) # Reference: https://github.com/pytorch/examples/blob/master/imagenet/main.py normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_transform = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ]) trainset = torchvision.datasets.ImageFolder(root=os.path.join( args.dataset_root, 'train'), transform=train_transform) val_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize ]) valset = torchvision.datasets.ImageFolder(root=os.path.join( args.dataset_root, 'val'), transform=val_transform) train_sampler = torch.utils.data.distributed.DistributedSampler( trainset, num_replicas=args.world_size, rank=rank) args.batch_size = int(args.batch_size / args.gpus) args.num_workers = int(args.num_workers / args.gpus) train_data = torch.utils.data.DataLoader( trainset, batch_size=args.batch_size, shuffle=False, # when sampler is specified, shuffle should be False 
num_workers=args.num_workers, pin_memory=True, sampler=train_sampler) val_data = torch.utils.data.DataLoader(valset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=True) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(densenet.parameters(), lr=args.lr, momentum=args.momentum, \ weight_decay=args.weight_decay) global best_prec1 # Reference: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2 # this is useful for cudnn finding optimal set of algorithms for particular configurations # and accelerate training when the input sizes do not change over iteration. cudnn.backend = True for epoch in range(args.epochs): train_sampler.set_epoch(epoch) adjust_lr(args, optimizer, epoch) train(densenet, train_data, criterion, optimizer, epoch, args) if args.tensorboard: log_value('train_loss', losses.avg, epoch) log_value('train_acc', top1.avg, epoch) # validate the model every epoch prec1 = validate(args, val_data, densenet, criterion, epoch) is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': densenet.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict() }, is_best)
# FRAGMENT(review): begins mid-call -- the leading keyword arguments
# (alpha/eps/weight_decay/momentum) belong to an optimizer constructor
# (RMSprop-style, defined above this chunk).  The visible part runs
# keypoint training: per-epoch gamma LR decay, then for each batch a
# two-head forward (heatmaps out1_1, x/y offsets out1_2) with a heatmap
# loss plus offset losses masked by the label heatmap (offsets split at
# config.num_kpt channels into x and y halves).
# NOTE(review): `evaluate(config, model)` before the loop appears to be a
# baseline evaluation of the untrained/loaded model -- confirm.
alpha=0.99, eps=1e-8, weight_decay=0.0, momentum=0.0) max_pck = config.max_pck weights = [ config.s * config.s * config.num_kpt, config.s * config.s * config.num_kpt * 2 ] evaluate(config, model) for epoch in range(config.start_epoch, config.end_epoch): model = model.train() lr = adjust_lr(optimizer, epoch, config.decay, config.lr_gamma) batch_loss, batch_hloss, batch_xloss, batch_yloss, batch_acc, batch = 0., 0., 0., 0., 0., 0. for (idx, (img, label, offset)) in enumerate(trainloader): if config.cuda: img = img.cuda() label = label.cuda() offset = offset.cuda() img = img.float() out1_1, out1_2 = model(img) optimizer.zero_grad() heat_loss = heat_criterion(out1_1, label) offx_loss = offset_criterion(out1_2[:, :config.num_kpt] * label, offset[:, :config.num_kpt]) offy_loss = offset_criterion(out1_2[:, config.num_kpt:] * label, offset[:, config.num_kpt:])
# Refined per-GPU DDP worker (cf. the sibling main_worker): uses local_rank
# with torch.cuda.set_device, seeds all GPUs, pins the model and criterion
# to local_rank, DDP-wraps with explicit device_ids/output_device, divides
# batch size and workers by ngpus, then per epoch: sampler.set_epoch,
# adjust_lr, train (returning losses/top1/top5 meters), optional
# tensorboard logging, validation, and best-top1 checkpointing.
# NOTE(review): `cudnn.backend = True` is almost certainly a typo for
# `cudnn.benchmark = True` -- as written it has no effect (the surrounding
# comment describes the benchmark flag).
def main_worker(local_rank, ngpus, args): best_prec1 = .0 dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url) print(f'local_rank: {local_rank}\n') torch.cuda.set_device(local_rank) # IMPORTANT: we need to set the random seed in each process so that the models are initialized with the same weights # Reference: https://yangkky.github.io/2019/07/08/distributed-pytorch-tutorial.html # torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # build model densenet = DenseNet121(in_channels=3, growth_rate=args.growth_rate, \ compression_rate=args.compression_rate, \ num_classes=args.num_classes) densenet.cuda(local_rank) densenet = nn.parallel.DistributedDataParallel(densenet, \ device_ids=[local_rank], \ output_device=local_rank) # Reference: https://github.com/pytorch/examples/blob/master/imagenet/main.py normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_transform = transforms.Compose([ transforms.RandomResizedCrop(args.image_width), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize ]) trainset = torchvision.datasets.ImageFolder(root=os.path.join( args.dataset_root, 'train'), transform=train_transform) val_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(args.image_width), transforms.ToTensor(), normalize ]) valset = torchvision.datasets.ImageFolder(root=os.path.join( args.dataset_root, 'val'), transform=val_transform) # num_replicas: int, Number of processes participating in distributed training. By default, world_size is retrieved from the current distributed group. 
train_sampler = torch.utils.data.distributed.DistributedSampler( trainset, num_replicas=ngpus) batch_size = args.batch_size // ngpus num_workers = args.num_workers // ngpus train_data = DataLoader( trainset, batch_size=batch_size, shuffle=False, # when sampler is specified, shuffle should be False num_workers=num_workers, pin_memory=True, sampler=train_sampler) val_data = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True) criterion = nn.CrossEntropyLoss().cuda(local_rank) optimizer = optim.SGD(densenet.parameters(), lr=args.lr, momentum=args.momentum, \ weight_decay=args.weight_decay) # Reference: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/2 # this is useful for cudnn finding optimal set of algorithms for particular configurations # and accelerate training when the input sizes do not change over iteration. cudnn.backend = True for epoch in range(args.epochs): train_sampler.set_epoch(epoch) adjust_lr(args, optimizer, epoch) losses, top1, top5 = train(densenet, train_data, criterion, optimizer, epoch, local_rank, args) if args.tensorboard: log_value('train_loss', losses.avg, epoch) log_value('top1_acc', top1.avg, epoch) log_value('top5_acc', top5.avg, epoch) # validate the model every epoch prec1 = validate(args, val_data, densenet, criterion, epoch) is_best = prec1.avg > best_prec1 best_prec1 = max(prec1.avg, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': densenet.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict() }, is_best)
# Class-incremental training driver: splits the label list into increments
# of num_inc classes, and for each increment reloads the stored model with
# a widened head, trains on replay-pool data concatenated with new-class
# data for num_epochs, evaluates on all classes seen so far, refills the
# replay pool (cap/num_now_classes exemplars per class), and checkpoints.
# NOTE(review): the default device='gpu' is not a valid torch device string
# ('cuda' expected) -- callers must pass a real device or .to() will raise.
# NOTE(review): `sum_loss += loss` accumulates the graph-attached tensor,
# retaining every batch's autograd graph for the epoch; `loss.item()` (or
# `.detach()`) would avoid that.
# NOTE(review): the loop variable `num_inc` shadows the parameter of the
# same name, and `optimizer` is rebound from the 'adam' string to the Adam
# instance -- intentional but fragile; non-'adam' values leave `optimizer`
# as a string.
# NOTE(review): `acc_best = acc` overwrites on every epoch, so acc_best
# tracks the LAST epoch's accuracy, not the best -- confirm intent.
def train(dataname, num_dims, num_inc=2, cap=1000, lr=0.001, name_log='default', num_epochs=60, batch_size=64, device='gpu', lr_schedule=False, optimizer='adam', dir_data='./', num_workers=4): dir_root = os.path.join(dir_data, dataname) dir_model = os.path.join(dir_root, 'model') dir_logs = os.path.join(dir_root, 'logs', name_log) logging.info("start training.") logging.info("delete present data pool") if os.path.exists(os.path.join(dir_root, 'data_pool.pkl')): os.remove(os.path.join(dir_root, 'data_pool.pkl')) with open(os.path.join(dir_root, 'classes.txt'), 'r') as f: label_list = f.readlines()[0].strip().split(' ') logging.info("total number of classes is {}".format(len(label_list))) print(label_list) label_sep_list = [label_list[i:i+num_inc] for i in range(0, len(label_list), num_inc)] num_total_classes = len(label_list) num_now_classes = 0 # load model model = BenchMark(num_dims, 2) save_model(model, dir_model) logging.info("define the representer model") # define DataPool data_pool = DataPool(dir_data=dir_data, cap=cap, dataname=dataname) for num_inc, label_sep in enumerate(label_sep_list): num_now_classes += len(label_sep) acc_best = 0 # load stored model trained using old classes's data model = load_model(model, num_now_classes, dir_model) model.train() model = model.to(device) logging.info("reload the old model.") # define logger logger = SummaryWriter(dir_logs) # dataloader of old and new datasets train_dataset_old = data_pool.load_data_pool() train_dataset_new = load_data(dir_data, dataname, 'train', label_sep) train_dataset = concat_datasets([train_dataset_old, train_dataset_new]) train_dataloader = load_dataloader(train_dataset, batch_size, num_workers) test_dataset = load_data(dir_data, dataname, 'test', data_pool.classes+label_sep) test_dataloader = load_dataloader(test_dataset, 16, num_workers) # define loss function criterion = nn.CrossEntropyLoss() if optimizer == 'adam': optimizer = optim.Adam(model.parameters(), lr=lr) # train the representer 
for epoch in range(num_epochs): model.train() sum_loss = 0 # lr schedule if lr_schedule: adjust_lr(optimizer, lr, epoch) for i, (train_batch, label_batch) in enumerate(train_dataloader): train_batch = train_batch.to(device) label_batch = label_batch.to(device) output_batch = model(train_batch) loss = criterion(output_batch, label_batch) sum_loss += loss optimizer.zero_grad() loss.backward() optimizer.step() acc = acc_cal(model, test_dataloader) acc_best = acc logging.info("Classes: {}/{}, Epoch: {}/{}, Loss: {:.4f}, Acc: {:.4f}".format(num_now_classes, num_total_classes, epoch+1, num_epochs, sum_loss.data, acc)) logger.add_scalars('data/Classes_{}'.format(num_now_classes), {'loss': sum_loss.data}, epoch) logger.add_scalars('data/Classes_{}'.format(num_now_classes), {'acc': acc}, epoch) logger.add_scalars('data/acc_incremental', {'acc_incremental': acc_best}, num_inc) # save samples to data pool num_everyclass = int(data_pool.cap/num_now_classes) data_pool.add_data(model, train_dataset_new, num_everyclass, device) # save model save_model(model, dir_model) logger.close()
# FRAGMENT(review): tail of a scribble-supervised saliency `train` function
# (steps both encoder and decoder optimizers, prints the component losses
# every 10 steps, visualizes sigmoid predictions, and saves per-epoch
# Encoder/Decoder checkpoints) followed by the script-level driver loop,
# which decays both optimizers' learning rates each epoch before calling
# train.  The enclosing function's header is above this chunk.
optimizer_Encoder.step() optimizer_Decoder.step() if i % 10 == 0 or i == total_step: print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], loss: {:0.4f}, sal1 loss: {:0.4f}, edge loss: {:0.4f}, sal2 loss: {:0.4f}, prior loss: {:0.4f}'. format(datetime.now(), epoch, args.epoch, i, total_step, loss.data, sal_loss1.data, edge_loss.data, sal_loss2.data, pri_loss.data)) visualize_prediction(torch.sigmoid(sal1), './show/', "sal1") visualize_prediction(torch.sigmoid(edge_map), './show/', "edge") visualize_prediction(torch.sigmoid(sal2), './show/', "sal2") save_path = 'save_models/finetune/' if not os.path.exists(save_path): os.makedirs(save_path) if epoch % 1 == 0: torch.save(Encoder.state_dict(), save_path + 'scribble_Encoder' + '_%d' % epoch + '.pth') torch.save(Decoder.state_dict(), save_path + 'scribble_Decoder' + '_%d' % epoch + '.pth') print("start training!!!") for epoch in range(1, args.epoch+1): adjust_lr(optimizer_Encoder, epoch, args.decay_rate, args.decay_epoch) adjust_lr(optimizer_Decoder, epoch, args.decay_rate, args.decay_epoch) train(train_dataloader, Encoder, Decoder, optimizer_Encoder, optimizer_Decoder, epoch)
# Generic classifier training/evaluation loop: per epoch, iterates train
# batches under tqdm (loss + running-mean display), advances the LR via the
# given scheduler or utils.adjust_lr when scheduler is None, then validates.
# For binary problems (config.num_class == 2) it computes confusion matrix,
# per-class AP / mAP, FOR, and precision-at-recall metrics and tracks
# per-metric best epochs; otherwise only accuracy/AP/confusion matrix.
# Whole-model checkpoints are saved every epoch; optionally writes a
# summary log file at the end.
# NOTE(review): `np.int` is removed in NumPy >= 1.24;
# `(1 - ng_y_true).astype(int)` is the modern spelling.
# NOTE(review): in the 2-class branch, best_ok_ap/best_ng_ap are only
# updated when mAP improves, so the printed "BEST" line mixes maxima taken
# at different epochs -- confirm intent.
# NOTE(review): `batch_loss` is computed each iteration but never used
# after the loop.
def trainer(model, optimizer, criterion, scheduler, train_loader, val_loader, tqdm_length, log_flag=False): best_acc = 0.5 best_ap = 0 best_FOR = 0 best_ok_ap = 0 best_ng_ap = 0 best_ap_epoch = [] best_acc_epoch = [] best_FOR_epoch = [] save_names = [] for epoch in range(config.max_epoch): batch_avg_loss = 0 bar = tqdm(enumerate(train_loader), total=tqdm_length) for ii, (data, label) in bar: image = data.cuda() target = label.cuda() optimizer.zero_grad() logits = model(image) loss = criterion(logits, target) loss.backward() optimizer.step() cur_loss = loss.item() batch_avg_loss += cur_loss cur_lr = optimizer.state_dict()["param_groups"][0]["lr"] batch_loss = batch_avg_loss / (ii + 1) bar.set_description(f'{epoch} loss:{cur_loss:.2e} lr:{cur_lr:.2e}') if scheduler is None: utils.adjust_lr(optimizer, epoch) else: scheduler.step() val_accuracy, y_true, y_score = val(model, val_loader) if config.num_class == 2: # confusion matrix confusion_matrix = metrics.confusion_matrix( y_true, np.argmax(y_score, 1)) # AP # ok_val_ap, ng_val_ap, mAP = utils.get_AP_metric(y_true, y_score) mulit_class_ap = ClassifierEvalMulticlass.compute_ap( y_true, y_score) ng_val_ap = mulit_class_ap[0] ok_val_ap = mulit_class_ap[1] mAP = (ng_val_ap + ok_val_ap) / 2 # FOR final_metric_dict = utils.get_FOR_metric(y_true, y_score) ok_y_score = y_score[:, 1] ok_p_at_r = ClassifierEvalBinary.compute_p_at_r( y_true, ok_y_score, 1) ng_y_true = np.array(y_true).astype("bool") ng_y_true = (1 - ng_y_true).astype(np.int) ng_y_score = y_score[:, 0] ng_p_at_r = ClassifierEvalBinary.compute_p_at_r( ng_y_true, ng_y_score, 1) print( f'Acc: {val_accuracy:.2f}\t OK_AP:{ok_val_ap:.2f}\t NG_AP: {ng_val_ap:.2f}\t mAP: {mAP:.2f}' ) print( f'BEST Acc: {best_acc:.2f}\t OK_AP: {best_ok_ap:.2f}\t NG_AP: {best_ng_ap:.2f}\t mAP: {best_ap:.2f}' ) print(confusion_matrix) print(mulit_class_ap) print(f'ok_p_at_r: {ok_p_at_r}, ng_p_at_r: {ng_p_at_r}') print(final_metric_dict) save_path = f'./checkpoints/{config.model["name"]}' 
save_name = f'{epoch}_acc_{val_accuracy:.4f}_p@r_{ng_p_at_r}_FOR_{final_metric_dict["FOR"]:.4F}.pth' save_names.append(save_name) if not os.path.exists(save_path): os.makedirs(save_path) torch.save(model, f'{save_path}/{save_name}') if final_metric_dict['FOR'] > best_FOR: best_FOR = final_metric_dict['FOR'] best_FOR_epoch.append(epoch) if val_accuracy > best_acc: best_acc = val_accuracy best_acc_epoch.append(epoch) if mAP > best_ap: best_ap = mAP best_ap_epoch.append(epoch) best_ok_ap = max(ok_val_ap, best_ok_ap) best_ng_ap = max(ng_val_ap, best_ng_ap) else: mulit_class_ap = ClassifierEvalMulticlass.compute_ap( y_true, y_score) confusion_matrix = metrics.confusion_matrix( y_true, np.argmax(y_score, 1)) save_path = f'./checkpoints/{config.model["name"]}' epoch_index = epoch + 1 save_name = f'{epoch_index:03d}_acc_{val_accuracy:.4f}.pth' save_names.append(save_name) if not os.path.exists(save_path): os.makedirs(save_path) torch.save(model, f'{save_path}/{save_name}') print(val_accuracy) print(mulit_class_ap) print(confusion_matrix) if log_flag: cur_time = time.strftime('%m%d_%H_%M') log_file_name = f"{config.model['name']}_{cur_time}.txt" utils.write_log(log_file_name, best_FOR_epoch, best_acc_epoch, best_ap_epoch, save_names)
# FRAGMENT(review): tail of a saliency-generator training loop -- the
# leading `align_corners=True)` closes an interpolate/upsample call made
# above this chunk.  Visible logic: structure_loss on the generator output,
# backward + optimizer step, visualization hooks, loss tracking only when
# the multi-scale `rate == 1`, periodic progress prints, per-epoch LR decay
# via adjust_lr, and a checkpoint save gated by `epoch % opt.epoch == 0`
# (i.e. effectively only at the final configured epoch).
align_corners=True) pred_post_init = generator.forward(images) sal_loss = structure_loss(pred_post_init, gts) sal_loss.backward() generator_optimizer.step() visualize_prediction_init(torch.sigmoid(pred_post_init)) visualize_gt(gts) if rate == 1: loss_record.update(sal_loss.data, opt.batchsize) if i % 10 == 0 or i == total_step: print( '{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], Gen Loss: {:.4f}' .format(datetime.now(), epoch, opt.epoch, i, total_step, loss_record.show())) adjust_lr(generator_optimizer, opt.lr_gen, epoch, opt.decay_rate, opt.decay_epoch) save_path = 'models/Resnet/' if not os.path.exists(save_path): os.makedirs(save_path) if epoch % opt.epoch == 0: torch.save(generator.state_dict(), save_path + 'Model' + '_%d' % epoch + '_gen.pth')
# CTC-CRF chunk-model training entry point: parses args, prepares the
# checkpoint dir and CSV log, initializes the denominator-LM environment,
# builds the chunk model (optionally resumed) and a frozen regularization
# model, then loops forever: shuffled per-category pickled datasets with a
# jittered chunk size for training, a full CV pass for validation, model
# checkpointing, CSV/figure logging, and loss-driven LR decay until the LR
# falls below args.stop_lr.
# BUG(review): `logger.info('args:\n' + ...)` runs BEFORE the assignment
# `logger = init_logging(...)` later in this function.  Because `logger`
# is assigned somewhere in the function body, Python treats it as a local
# throughout, so the early call raises UnboundLocalError.  Fix: move the
# `logger = init_logging(...)` line above the first `logger.info` use (or
# log the args via a module-level logger).
# NOTE(review): the LR-decay block updates `prev_cv_loss` only when the
# epoch improved, then calls adjust_lr with both cv_loss and prev_cv_loss;
# the ordering is load-bearing -- do not reorder.
def train(): args = parse_args() args_msg = [ ' %s: %s' % (name, value) for (name, value) in vars(args).items() ] logger.info('args:\n' + '\n'.join(args_msg)) ckpt_path = "models_chunk_twin_context" os.system("mkdir -p {}".format(ckpt_path)) logger = init_logging("chunk_model", "{}/train.log".format(ckpt_path)) csv_file = open(args.csv_file, 'w', newline='') csv_writer = csv.writer(csv_file) csv_writer.writerow(header) batch_size = args.batch_size device = torch.device("cuda:0") reg_weight = args.reg_weight ctc_crf_base.init_env(args.den_lm_fst_path, gpus) model = CAT_Chunk_Model(args.feature_size, args.hdim, args.output_unit, args.dropout, args.lamb, reg_weight) lr = args.origin_lr optimizer = optim.Adam(model.parameters(), lr=lr) epoch = 0 prev_cv_loss = np.inf if args.checkpoint: checkpoint = torch.load(args.checkpoint) epoch = checkpoint['epoch'] lr = checkpoint['lr'] prev_cv_loss = checkpoint['cv_loss'] model.load_state_dict(checkpoint['model']) model.cuda() model = nn.DataParallel(model) model.to(device) reg_model = CAT_RegModel(args.feature_size, args.hdim, args.output_unit, args.dropout, args.lamb) loaded_reg_model = torch.load(args.regmodel_checkpoint) reg_model.load_state_dict(loaded_reg_model) reg_model.cuda() reg_model = nn.DataParallel(reg_model) reg_model.to(device) prev_epoch_time = timeit.default_timer() model.train() reg_model.eval() while True: # training stage epoch += 1 gc.collect() if epoch > 2: cate_list = list(range(1, args.cate, 1)) random.shuffle(cate_list) else: cate_list = range(1, args.cate, 1) for cate in cate_list: pkl_path = args.tr_data_path + "/" + str(cate) + ".pkl" if not os.path.exists(pkl_path): continue tr_dataset = SpeechDatasetMemPickel(pkl_path) jitter = random.randint(-args.jitter_range, args.jitter_range) chunk_size = args.default_chunk_size + jitter tr_dataloader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=PadCollateChunk(chunk_size)) train_chunk_model(model, reg_model, 
tr_dataloader, optimizer, epoch, chunk_size, TARGET_GPUS, args, logger) # cv stage model.eval() cv_losses_sum = [] cv_cls_losses_sum = [] count = 0 cate_list = range(1, args.cate, 1) for cate in cate_list: pkl_path = args.dev_data_path + "/" + str(cate) + ".pkl" if not os.path.exists(pkl_path): continue cv_dataset = SpeechDatasetMemPickel(pkl_path) cv_dataloader = DataLoader(cv_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=PadCollateChunk( args.default_chunk_size)) validate_count = validate_chunk_model(model, reg_model, cv_dataloader, epoch, cv_losses_sum, cv_cls_losses_sum, args, logger) count += validate_count cv_loss = np.sum(np.asarray(cv_losses_sum)) / count cv_cls_loss = np.sum(np.asarray(cv_cls_losses_sum)) / count # save model save_ckpt( { 'cv_loss': cv_loss, 'model': model.module.state_dict(), 'optimizer': optimizer.state_dict(), 'lr': lr, 'epoch': epoch }, epoch < args.min_epoch or cv_loss <= prev_cv_loss, ckpt_path, "model.epoch.{}".format(epoch)) csv_row = [ epoch, (timeit.default_timer() - prev_epoch_time) / 60, lr, cv_loss ] prev_epoch_time = timeit.default_timer() csv_writer.writerow(csv_row) csv_file.flush() plot_train_figure(args.csv_file, args.figure_file) if epoch < args.min_epoch or cv_loss <= prev_cv_loss: prev_cv_loss = cv_loss lr = adjust_lr(optimizer, args.origin_lr, lr, cv_loss, prev_cv_loss, epoch, args.min_epoch) if (lr < args.stop_lr): print("rank {} lr is too slow, finish training".format(args.rank), datetime.datetime.now(), flush=True) break model.train() ctc_crf_base.release_env(gpus)
def main():
    """Driver: set up logging/seeds/loaders, build model + gvar optimizer,
    optionally resume from a checkpoint, then train until max_iters or the
    train() callback signals termination (ecode == -1)."""
    opt = get_opt()

    # Mirror all log output to a file under logger_name and to stdout.
    tb_logger.configure(opt.logger_name, flush_secs=5, opt=opt)
    logfname = os.path.join(opt.logger_name, 'log.txt')
    logging.basicConfig(filename=logfname,
                        format='%(asctime)s %(message)s',
                        level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
    logging.info(str(opt.d))

    torch.manual_seed(opt.seed)
    if opt.cuda:
        # TODO: remove deterministic
        torch.backends.cudnn.deterministic = True
        torch.cuda.manual_seed(opt.seed)
        np.random.seed(opt.seed)
        # helps with wide-resnet by reducing memory and time 2x
        cudnn.benchmark = True

    train_loader, test_loader, train_test_loader = get_loaders(opt)

    # Derive iterations-per-epoch from the dataset when not given explicitly.
    if opt.epoch_iters == 0:
        opt.epoch_iters = int(
            np.ceil(1. * len(train_loader.dataset) / opt.batch_size))
    opt.maxiter = opt.epoch_iters * opt.epochs
    # g_epoch: gvar-related schedule points are given in epochs — convert
    # them to iteration counts here.
    if opt.g_epoch:
        opt.gvar_start *= opt.epoch_iters
        opt.g_optim_start = (opt.g_optim_start * opt.epoch_iters) + 1

    model = models.init_model(opt)

    # OptimizerFactory wraps the optimizer plus gradient-variance bookkeeping
    # (exposes .gvar, .niters, .epoch used below).
    optimizer = OptimizerFactory(model, train_loader, tb_logger, opt)
    epoch = 0
    save_checkpoint = utils.SaveCheckpoint()

    # optionally resume from a checkpoint
    if not opt.noresume:
        model_path = os.path.join(opt.logger_name, opt.ckpt_name)
        if os.path.isfile(model_path):
            print("=> loading checkpoint '{}'".format(model_path))
            checkpoint = torch.load(model_path)
            best_prec1 = checkpoint['best_prec1']
            optimizer.gvar.load_state_dict(checkpoint['gvar'])
            optimizer.niters = checkpoint['niters']
            epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            save_checkpoint.best_prec1 = best_prec1
            print("=> loaded checkpoint '{}' (epoch {}, best_prec {})".format(
                model_path, epoch, best_prec1))
        else:
            print("=> no checkpoint found at '{}'".format(model_path))

    # An explicit --niters overrides the epochs-based budget.
    if opt.niters > 0:
        max_iters = opt.niters
    else:
        max_iters = opt.epochs * opt.epoch_iters

    # NOTE(review): untrain() presumably perturbs the model before training
    # starts — confirm against its definition.
    if opt.untrain_steps > 0:
        untrain(model, optimizer.gvar, opt)

    # train() advances optimizer.niters internally; returning -1 requests
    # an early stop.
    while optimizer.niters < max_iters:
        optimizer.epoch = epoch
        utils.adjust_lr(optimizer, opt)
        ecode = train(tb_logger, epoch, train_loader, model, optimizer,
                      opt, test_loader, save_checkpoint, train_test_loader)
        if ecode == -1:
            break
        epoch += 1
    tb_logger.save_log()
def cross_train():
    """Domain-adaptation training loop (source -> target).

    Trains a shared-backbone model with a classification loss on the source
    domain plus an adaptation loss (MMD / CORAL) between source and target
    features.  Periodically evaluates on the target test set, logs to
    TensorBoard, and saves the best model to disk.
    """
    # Basic parameters (all pulled from the global FLAG config object)
    gpus = FLAG.gpus
    batch_size = FLAG.batch_size
    epoches = FLAG.epoch
    init_lr = FLAG.lr
    LOG_INTERVAL = 10
    TEST_INTERVAL = 2
    source_name = FLAG.source
    target_name = FLAG.target
    model_name = FLAG.arch
    adapt_mode = FLAG.adapt_mode
    l2_decay = 5e-4

    # Loading dataset (LT variant vs. default split)
    if FLAG.isLT:
        source_train,target_train,target_test,classes = cross_dataset_LT(FLAG)
    else:
        source_train,target_train,target_test,classes = my_cross_dataset(FLAG)
    # drop_last=True on the training loaders keeps source/target batch
    # shapes equal for the adaptation loss.
    source_train_loader = torch.utils.data.DataLoader(dataset=source_train,batch_size=batch_size,
                            shuffle=True,num_workers=8,drop_last=True)
    target_train_loader = torch.utils.data.DataLoader(dataset=target_train,batch_size=batch_size,
                            shuffle=True,num_workers=8,drop_last=True)
    target_test_loader = torch.utils.data.DataLoader(dataset=target_test,batch_size=batch_size,
                            shuffle=False,num_workers=8)

    # Define model and matching adaptation loss.
    # NOTE(review): on an unknown adapt_mode this only prints a warning and
    # falls through — cross_model is then undefined and the code crashes
    # later; consider raising ValueError here instead.
    if adapt_mode == 'ddc':
        cross_model = models.DDCNet(FLAG)
        #adapt_loss_function = mmd_linear
        adapt_loss_function = mmd_rbf_noaccelerate
        #print(model)
    elif adapt_mode == 'coral':
        cross_model = models.DeepCoral(FLAG)
        adapt_loss_function = CORAL
    elif adapt_mode == 'mmd':
        cross_model = models.DDCNet(FLAG)
        adapt_loss_function = mmd_linear
    else:
        print('The adaptive model name is wrong !')

    # Multi-GPU: 'gpus' arrives as a comma-separated string, e.g. "0,1".
    if len(gpus)>1:
        gpus = gpus.split(',')
        gpus = [int(v) for v in gpus]
        cross_model = nn.DataParallel(cross_model,device_ids=gpus)
    cross_model.to(DEVICE)

    # Define Optimizer: the backbone trains at init_lr/10, the classifier
    # head at the full init_lr (standard fine-tuning split).
    if len(gpus)>1:
        optimizer = optim.SGD([{'params':cross_model.module.sharedNet.parameters()},
                        {'params':cross_model.module.cls_fc.parameters(),'lr':init_lr}],
                        lr=init_lr/10,momentum=0.9,weight_decay=l2_decay)
    else:
        optimizer = optim.SGD([{'params':cross_model.sharedNet.parameters()},
                        {'params':cross_model.cls_fc.parameters(),'lr':init_lr}],
                        lr=init_lr/10,momentum=0.9,weight_decay=l2_decay)
    #print(optimizer.param_groups)

    # Classification loss on the (labelled) source domain.
    criterion = torch.nn.CrossEntropyLoss()

    # Best target-test accuracy seen so far.
    best_result = 0.0

    # Model store
    model_dir = os.path.join('./cross_models/',adapt_mode+'-'+source_name+'2'+target_name+'-'+model_name)
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    # Tensorboard configuration
    log_dir = os.path.join('./cross_logs/',adapt_mode+'-'+source_name+'2'+target_name+'-'+model_name)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
    writer = SummaryWriter(logdir=log_dir)

    for epoch in range(1,epoches+1):
        cross_model.train()
        len_source_loader= len(source_train_loader)
        len_target_loader = len(target_train_loader)
        iter_source = iter(source_train_loader)
        iter_target = iter(target_train_loader)
        # Iterate for the length of the SHORTER loader; which_dataset marks
        # whether the target loader is the shorter one.
        if len_target_loader <= len_source_loader:
            iter_num = len_target_loader
            which_dataset = True
        else:
            iter_num = len_source_loader
            which_dataset = False

        # Adaptive learning rate
        optimizer = adjust_lr(optimizer,epoch,FLAG)
        writer.add_scalar('data/SharedNet lr',optimizer.param_groups[0]['lr'],epoch)
        running_loss = 0.0
        for i in range(1,iter_num+1):
            # Draw one batch from each domain, restarting the longer
            # loader's iterator when its counter wraps around.
            if which_dataset:
                target_data,_ = next(iter_target)
                if i % len_target_loader == 0:
                    iter_source = iter(source_train_loader)
                    source_data,source_label = next(iter_source)
                else:
                    source_data,source_label = next(iter_source)
            else:
                source_data,source_label = next(iter_source)
                if i % len_source_loader == 0:
                    iter_target = iter(target_train_loader)
                    target_data,_ = next(iter_target)

            input_source_data,input_source_label = source_data.to(DEVICE),source_label.to(DEVICE).squeeze()
            input_target_data = target_data.to(DEVICE)

            optimizer.zero_grad()
            # Model returns (source logits, source features, target features).
            label_source_pred,source_output,target_output = cross_model(input_source_data,
                                                                        input_target_data)
            loss_adapt = adapt_loss_function(source_output,target_output)
            loss_cls = criterion(label_source_pred,input_source_label)
            # lambda_1 ramps from 0 to ~1 over training (DANN-style schedule),
            # weighting the adaptation loss progressively higher.
            lambda_1 = 2 / (1 + math.exp(-10 * (epoch) / epoches)) - 1
            loss = loss_cls + lambda_1 * loss_adapt

            if i%5 ==0:
                # NOTE(review): n_iter uses len_target_loader even when the
                # source loader drives the epoch — confirm intended.
                n_iter = (epoch-1)*len_target_loader+i
                writer.add_scalar('data/adapt loss',loss_adapt,n_iter)
                writer.add_scalar('data/cls loss',loss_cls,n_iter)
                writer.add_scalar('data/total loss',loss,n_iter)
            #print(optimizer.param_groups[0]['lr'])
            loss.backward()
            optimizer.step()

            # Print statistics every LOG_INTERVAL mini-batches.
            running_loss += loss.item()
            if i%LOG_INTERVAL == 0:
                print('Epoch:[{}/{}],Batch:[{}/{}] loss: {}'.format(epoch,epoches,i,len_target_loader,running_loss/LOG_INTERVAL))
                running_loss = 0

        if epoch%TEST_INTERVAL ==0:
            # Evaluate on the target test set every TEST_INTERVAL epochs.
            acc_test,class_corr,class_total=cross_test(cross_model,target_test_loader,epoch)
            # log test acc
            writer.add_scalar('data/test accuracy',acc_test,epoch)
            # Store the best model
            if acc_test>best_result:
                model_path = os.path.join(model_dir,
                    '{}-{}-{}-epoch_{}-accval_{}.pth'.format(source_name,target_name,model_name,epoch,round(acc_test,3)))
                torch.save(cross_model,model_path)
                # log per-class results to CSV.
                # NOTE(review): the chained `log_path = model_path = ...`
                # silently rebinds model_path to the .csv path — looks
                # unintended, though nothing reads model_path afterwards.
                log_path = model_path = os.path.join(model_dir,
                    '{}-{}-{}-epoch_{}-accval_{}.csv'.format(source_name,target_name,model_name,epoch,round(acc_test,3)))
                log_to_csv(log_path,classes,class_corr,class_total)
                best_result = acc_test
            else:
                print('The results in this epoch cannot exceed the best results !')
    writer.close()
def main():
    """Train an LSTM pose-prediction model on pre-extracted skeleton data.

    Loads ``data/class1_data.pkl``, splits it into train/val/test by fixed
    indices, trains for ``num_epoch`` epochs with a step-decayed SGD schedule,
    and checkpoints the best-validation-loss model to disk.
    """
    # ---- data ----
    div1, div2 = 800, 900  # train = [:800], val = [800:900], test = rest
    batch_size = 20
    num_workers = 0
    data_path = 'data/class1_data.pkl'
    # ---- train hyper-parameters ----
    num_epoch = 100
    lr = 1e-3
    lr_step = 50
    # BUGFIX: `momentum` was commented out but still passed to optim.SGD
    # below, which raised NameError at runtime.
    momentum = 0.9
    weight_decay = 1e-3
    # ---- model ----
    hidden_dim = 32  # make it a number smaller than feature_dim
    model_check = 'model/checkpoint.pth.tar'
    model_best = 'model/bestmodel.pth.tar'
    # ---- result ----
    print_freq = 20
    loss_best = 1e5
    # --------------------------------------------------------------------------
    # prepare dataset
    data, (len_seq, num_frame, num_joint, num_coor) = make_dataset(data_path)
    # data[:div1] for training, data[div1:div2] for validation,
    # and the rest for testing
    train_set = data[:div1]
    val_set = data[div1:div2]
    # only the train_loader shuffles
    train_loader = DataLoader(dataset=train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=num_workers)
    val_loader = DataLoader(dataset=val_set,
                            batch_size=1,
                            shuffle=False,
                            num_workers=num_workers)
    # --------------------------------------------------------------------------
    # model settings
    feature_dim = num_joint * num_coor  # 16 * 3 = 48
    model = LSTMpred(feature_dim, hidden_dim)
    print(model)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=lr,
                          momentum=momentum,
                          weight_decay=weight_decay)
    # --------------------------------------------------------------------------
    # run
    for epoch in range(num_epoch):
        # Step-decay the learning rate every lr_step epochs.
        adjust_lr(lr, optimizer, epoch, lr_step)

        print('Epoch: {0}/{1} [training stage]'.format(epoch, num_epoch))
        train(train_loader, model, criterion, optimizer, print_freq)

        print('Epoch: {0}/{1} [validation stage]'.format(epoch, num_epoch))
        loss = val(val_loader, model, criterion, print_freq)

        # Checkpoint every epoch; flag (and separately save) the best model.
        is_best = loss < loss_best
        loss_best = min(loss_best, loss)
        save_checkpoint(
            {
                'epoch': epoch,
                'arch': 'LSTMpred',
                'state_dict': model.state_dict(),
                'loss_best': loss_best,
                'optimizer': optimizer.state_dict(),
            }, is_best, model_check, model_best)
loss1 = CE(atts, gts) loss2 = CE(dets, gts) loss = loss1 + loss2 loss.backward() clip_gradient(optimizer, opt.clip) optimizer.step() if i % 400 == 0 or i == total_step: print( '{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], Loss1: {:.4f} Loss2: {:0.4f}' .format(datetime.now(), epoch, opt.epoch, i, total_step, loss1.data, loss2.data)) if opt.is_ResNet: save_path = 'models/CPD_Resnet/' else: save_path = 'models/CPD_VGG/' if not os.path.exists(save_path): os.makedirs(save_path) if (epoch + 1) % 5 == 0: torch.save(model.state_dict(), save_path + opt.trainset + '_w.pth' + '.%d' % epoch) print("Let's go!") for epoch in range(1, opt.epoch): adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch) train(train_loader, model, optimizer, epoch)
if opt.cuda: train_loader = get_loader(image_root, gt_root, batchsize=opt.batchsize, num_workers=3, pin_memory=True) model.cuda() else: train_loader = get_loader(image_root, gt_root, batchsize=opt.batchsize, num_workers=3, pin_memory=False) total_step = len(train_loader) params = model.parameters() optimizer = torch.optim.Adam(params, lr=opt.lr) crit = torch.nn.BCEWithLogitsLoss() print("Let's go!") for epoch in range(1, opt.epoch + 1): model.train() adjust_lr(optimizer, opt.lr, epoch) for i, pack in enumerate(train_loader, start=1): optimizer.zero_grad() # Load data images, gts = pack images = Variable(images) gts = Variable(gts) if opt.cuda: images = images.cuda() gts = gts.cuda() # Forward res = model(images) # Merge losses loss = crit(res, gts) # Backward and update loss.backward()