def learn(self, epoch, dataloader, init=True):
    self.train()
    loss_curve = {loss: [] for loss in self.loss_names}
    self.acc_reset_mnist()
    bar = ProgressBar()
    for data in bar(dataloader):
        data_var = [to_tensor(_, self.opt.device) for _ in data]
        self.set_input(data_var, init)
        self.optimize_parameters(init)
        for loss in self.loss_names:
            loss_curve[loss].append(getattr(self, 'loss_' + loss).detach().item())
        self.acc_update_mnist()
    self.loss_msg = '[Train][{}] Loss:'.format(epoch)
    for loss in self.loss_names:
        self.loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss]))
    self.acc_msg = '[Train][{}] Acc: source {:.3f} ({}/{}) target {:.3f} ({}/{})'.format(
        epoch, self.acc_source, self.hit_source, self.cnt_source,
        self.acc_target, self.hit_target, self.cnt_target)
    self.print_log()
    for lr_scheduler in self.lr_schedulers:
        lr_scheduler.step()
def learn(self, epoch, dataloader, init=True): self.epoch = epoch self.train() loss_curve = { loss: [] for loss in self.loss_names } acc_curve = [] if init: for data in dataloader: x_seq, y_seq = [d[0][None, :, :] for d in data], [d[1][None, :] for d in data] x_seq = torch.cat(x_seq, 0).cuda() y_seq = torch.cat(y_seq, 0).cuda() self.set_input(input=(x_seq, y_seq)) self.optimize_parameters(init) for loss in self.loss_names: loss_curve[loss].append(getattr(self, 'loss_' + loss).item()) acc_curve.append(self.g_seq.eq(self.y_seq).to(torch.float).mean(-1, keepdim=True)) loss_msg = '[Train][{}] Loss:'.format(epoch) for loss in self.loss_names: loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss])) acc = to_np(torch.cat(acc_curve, 1).mean(-1)) acc_msg = '[Train][{}] Acc: {:.2f} {}'.format(epoch, acc.mean(), np.around(acc, decimals=2)) print(loss_msg) print(acc_msg) else: dataloader, continual_dataloader = dataloader for data_1, data_2 in zip(dataloader, continual_dataloader): x_seq, y_seq = [d[0][None, :, :] for d in data_1], [d[1][None, :] for d in data_1] x_seq = torch.cat(x_seq, 0).cuda() y_seq = torch.cat(y_seq, 0).cuda() x_rpy, y_rpy = [d[0][None, :, :] for d in data_2], [d[1][None, :] for d in data_2] x_rpy = torch.cat(x_rpy, 0).cuda() y_rpy = torch.cat(y_rpy, 0).cuda() self.set_input([x_seq, y_seq, x_rpy, y_rpy], init) self.optimize_parameters(init) for loss in self.loss_names: loss_curve[loss].append(getattr(self, 'loss_' + loss).detach().item()) acc_curve.append(self.g_tgt.eq(self.y_tgt).to(torch.float).mean(-1, keepdim=True)) loss_msg = '[Train][{}] Loss:'.format(epoch) for loss in self.loss_names: loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss])) acc = to_np(torch.cat(acc_curve, 1).mean(-1)) acc_msg = '[Train][{}] Acc: {:.2f} {}'.format(epoch, acc.mean(), np.around(acc, decimals=2)) print(loss_msg) print(acc_msg) for lr_scheduler in self.lr_schedulers: lr_scheduler.step()
def train_model(model, criterion, optimizer, dataload, lr_scheduler):
    num_epochs = args.num_epochs
    loss_record = []
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        dataset_size = len(dataload.dataset)
        epoch_loss = 0
        step = 0  # number of minibatches
        for x, y in dataload:  # iterate over the dataset in minibatches (batch_size=4)
            optimizer.zero_grad()  # zero the gradients (dw, db, ...) before every minibatch
            inputs = x.to(device)
            labels = y.to(device)
            outputs = model(inputs)  # forward pass
            outputs = outputs.squeeze()
            loss = criterion(outputs, labels)  # compute the loss
            loss.backward()  # backward pass: compute the gradients
            # print(lr_scheduler.get_lr()[0])
            optimizer.step()
            lr_scheduler.step()  # advance the schedule; every Optimizer implements step() to update all its parameters
            epoch_loss += loss.item()
            loss_record.append(loss.item())
            step += 1
            print("%d/%d,train_loss:%0.3f" % (step, dataset_size // dataload.batch_size, loss.item()))
        print("epoch %d loss:%0.3f" % (epoch, epoch_loss))
    loss_data = pd.DataFrame(data=loss_record)
    loss_data.to_csv(args.loss_record)
    plt.plot(loss_data)
    torch.save(model.state_dict(), args.weight)  # save all of the model's weights
    plt.show()
    return model
def test_cosine_decay_function() -> None:
    """
    Tests Cosine lr decay function at (pi/2) and verifies if the value is correct.
    """
    config = DummyModel(l_rate_scheduler=LRSchedulerType.Cosine,
                        num_epochs=10,
                        min_l_rate=0.0)
    # create lr scheduler
    test_epoch = 5
    lr_scheduler, _ = _create_lr_scheduler_and_optimizer(config)
    for _ in range(test_epoch):
        lr_scheduler.step()
    assert lr_scheduler.get_last_lr()[0] == 0.5 * config.l_rate
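# A minimal, self-contained sketch of the same half-way-point check using
# torch.optim.lr_scheduler.CosineAnnealingLR directly, not the project's
# _create_lr_scheduler_and_optimizer helper. base_lr is an arbitrary
# illustrative value; math.isclose is used because the cosine term is only
# approximately zero in floating point.
import math

import torch
from torch.optim.lr_scheduler import CosineAnnealingLR

base_lr = 0.1
opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=base_lr)
sched = CosineAnnealingLR(opt, T_max=10, eta_min=0.0)

for _ in range(5):
    opt.step()    # keep the optimizer/scheduler call order PyTorch expects
    sched.step()

# lr(t) = eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2,
# so at t = T_max / 2 the learning rate is ~0.5 * base_lr.
assert math.isclose(sched.get_last_lr()[0], 0.5 * base_lr, rel_tol=1e-6)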
def train(train_loader, model, optimizer, lr_scheduler, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() # switch to train mode model.train() end = time.time() for i, (data, target) in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) target = target.cuda(non_blocking=True) data = data.cuda() output = model(data) loss = criterion(output, target) acc1, acc5 = accuracy(output, target, topk=(1, 5)) losses.update(loss.item(), data.size(0)) top1.update(acc1.item(), data.size(0)) top5.update(acc5.item(), data.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() # adjust lr lr = lr_scheduler.step() for pg in optimizer.param_groups: pg["lr"] = lr # impose L1 penalty to BN factors if args.sparsity != 0: for m in model.modules(): if isinstance(m, nn.BatchNorm2d): m.weight.grad.data.add_(args.sparsity*torch.sign(m.weight.data)) # L1 optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() lr = optimizer.param_groups[0]["lr"] if i % args.print_freq == 0: logger.info('Epoch[{0}/{1}] Iter[{2}/{3}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Train Loss {loss.val:.3f} ({loss.avg:.3f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t' 'LR: {lr:.4f}'.format( epoch, args.epochs, i, len(train_loader), batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, lr=lr)) return losses.avg
def train(training_model, n_epochs, optim, lr_scheduler, model_dir,
          target_device, train_loader, test_loader):
    tlog('Training the model...')
    tlog('working on {}'.format(target_device))
    best_accuracy = 0.  # determines whether we save a copy of the model
    saved_model_filename = None
    for epoch in range(n_epochs):
        # train for one epoch, printing every 10 iterations
        train_one_epoch(training_model, optim, train_loader, target_device,
                        epoch, print_freq=10)
        # update the learning rate
        lr_scheduler.step()
        # evaluate on the test dataset
        x = evaluate(training_model, test_loader, device=target_device)
        print(x)
    saved_model_filename = save_model(training_model, args.model_dir)
    return (saved_model_filename, 1)
def learn(self, epoch, dataloader): self.epoch = epoch self.stage = epoch // self.epoch_in_a_stage self.train() loss_curve = {loss: [] for loss in self.loss_names} acc_curve = [] for data in dataloader: x_seq, y_seq, idx_seq = [d[0][None, :, :] for d in data ], [d[1][None, :] for d in data ], [d[2][None, :] for d in data] x_seq = torch.cat(x_seq, 0).to(self.device) y_seq = torch.cat(y_seq, 0).to(self.device) idx_seq = torch.cat(idx_seq, 0).to(self.device) self.set_input(input=(x_seq, y_seq, idx_seq)) self.optimize_parameters() for loss in self.loss_names: loss_curve[loss].append(getattr(self, 'loss_' + loss).item()) acc_curve.append( self.g_seq.eq(self.y_seq).to(torch.float).mean(-1, keepdim=True)) loss_msg = '[Train][{}] Loss:'.format(epoch) for loss in self.loss_names: loss_msg += ' {} {:.3f}'.format(loss, np.mean(loss_curve[loss])) acc = to_np(torch.cat(acc_curve, 1).mean(-1)) acc_msg = '[Train][{}] Accuracy: total average {:.1f}, in each domain {}'.format( epoch, acc.mean() * 100, np.around(acc * 100, decimals=1)) if (epoch + 1) % 10 == 0: print(loss_msg) print(acc_msg) with open(self.train_log, 'a') as f: f.write(loss_msg + "\n" + acc_msg + "\n") for lr_scheduler in self.lr_schedulers: lr_scheduler.step()
def train_one_epoch(model, optimizer, lr_scheduler, data_loader, epoch):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    print("Start Train ...")
    model.train()
    losses = []
    accur = []
    for data, target in data_loader:
        data = data.permute(0, 3, 1, 2).to(device)
        targets = target.permute(0, 3, 1, 2).to(device)
        outputs = model(data)
        # binarize the predictions at 0.5 before computing the Dice score
        out_cut = np.copy(outputs.data.cpu().numpy())
        out_cut[np.nonzero(out_cut < 0.5)] = 0.0
        out_cut[np.nonzero(out_cut >= 0.5)] = 1.0
        train_dice = dice_metric(out_cut, targets.data.cpu().numpy())
        loss = bce_dice_loss(outputs, targets)
        losses.append(loss.item())
        accur.append(train_dice)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if lr_scheduler is not None:
            lr_scheduler.step()
    print("Epoch [%d]" % (epoch))
    print("Mean loss on train:", np.array(losses).mean(),
          "Mean DICE on train:", np.array(accur).mean())
    return np.array(losses).mean(), np.array(accur).mean()
def train_step(train_loader, model, criterion, optimizer, epoch, lr_scheduler): print(f'epoch {epoch}') batch_time = AverageMeter() losses = AverageMeter() avg_score = AverageMeter() model.train() num_steps = min(len(train_loader), MAX_STEPS_PER_EPOCH) print(f'total batches: {num_steps}') end = time.time() lr = None for i, data in enumerate(train_loader): input_ = data['image'] target = data['target'] batch_size, _, _, _ = input_.shape output = model(input_.cuda()) loss = criterion(output, target.cuda()) confs, predicts = torch.max(output.detach(), dim=1) avg_score.update(GAP(predicts, confs, target)) losses.update(loss.data.item(), input_.size(0)) optimizer.zero_grad() loss.backward() optimizer.step() lr_scheduler.step() lr = optimizer.param_groups[0]['lr'] batch_time.update(time.time() - end) end = time.time() if i % LOG_FREQ == 0: print(f'{epoch} [{i}/{num_steps}]\t' f'time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' f'loss {losses.val:.4f} ({losses.avg:.4f})\t' f'GAP {avg_score.val:.4f} ({avg_score.avg:.4f})' + str(lr)) print(f' * average GAP on train {avg_score.avg:.4f}')
def train(epoch):
    print('\nEpoch: %d' % epoch)
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    lr_scheduler.step()  ############ added
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        progress_bar(batch_idx, len(trainloader),
                     'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (train_loss / (batch_idx + 1), 100. * correct / total,
                        correct, total))
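# Note: PyTorch 1.1+ expects optimizer.step() to run before lr_scheduler.step();
# stepping the scheduler at the top of the epoch (as above) skips the initial
# learning-rate value and triggers a UserWarning. A minimal sketch of the
# recommended per-epoch ordering, with a stand-in model/optimizer/scheduler:
import torch

net = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(net.parameters(), lr=0.1)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

for epoch in range(2):
    for _ in range(5):  # stand-in for the trainloader loop
        optimizer.zero_grad()
        loss = net(torch.randn(4, 10)).sum()
        loss.backward()
        optimizer.step()   # update parameters first
    scheduler.step()       # then advance the schedule once per epoch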
        loss_hist.append(float(loss))
        epoch_loss.append(float(loss))

        print(json.dumps({
            "epoch_num": epoch_num,
            "iter_num": iter_num,
            "img_num": iter_num * args.batch_size,
            "cls_loss": float(cls_loss),
            "reg_loss": float(reg_loss),
            "loss_hist": float(np.mean(loss_hist)),
            "elapsed": time() - start_time,
        }))
        sys.stdout.flush()

        del cls_loss
        del reg_loss

    print('Evaluating dataset')
    if args.dataset == 'coco':
        coco_eval.evaluate_coco(dataset_val, retinanet)
    elif args.dataset == 'csv' and args.csv_val is not None:
        _ = csv_eval.evaluate(dataset_val, retinanet)

    lr_scheduler.step(np.mean(epoch_loss))
    torch.save(retinanet.module,
               '{}_retinanet_{}.pt'.format(args.dataset, epoch_num))

retinanet.eval()
torch.save(retinanet, 'model_final.pt')
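# For reference: lr_scheduler.step(np.mean(epoch_loss)) above is the
# metric-driven form used by ReduceLROnPlateau, which lowers the LR only when
# the monitored value stops improving. A minimal sketch with a made-up loss
# sequence (the optimizer and its single parameter are placeholders):
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau

optimizer = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

# The loss plateaus, so after `patience` epochs without improvement the LR is
# multiplied by `factor`.
for epoch_loss in [1.0, 0.8, 0.7, 0.7, 0.7, 0.7, 0.7]:
    scheduler.step(epoch_loss)

print(optimizer.param_groups[0]['lr'])  # 0.1 -> 0.01 once the plateau is detected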
def train_epoch(self, epoch, model, dataloader, optimizer, lr_scheduler, grad_normalizer=None, prefix="train"): model.train() _timer = Timer() lossMeter = LossMeter() perfMeter = PerfMeter() for i, (imgs, labels) in enumerate(dataloader): _timer.tic() # zero the parameter gradients optimizer.zero_grad() if self.cfg.HALF: imgs = imgs.half() if len(self.device) > 1: out = data_parallel(model, (imgs, labels, prefix), device_ids=self.device, output_device=self.device[0]) else: imgs = imgs.cuda() labels = [label.cuda() for label in labels] if isinstance( labels, list) else labels.cuda() out = model(imgs, labels, prefix) if not isinstance(out, tuple): losses, performances = out, None else: losses, performances = out if losses["all_loss"].sum().requires_grad: if self.cfg.GRADNORM is not None: grad_normalizer.adjust_losses(losses) grad_normalizer.adjust_grad(model, losses) else: losses["all_loss"].sum().backward() optimizer.step() self.n_iters_elapsed += 1 _timer.toc() lossMeter.__add__(losses) if performances is not None and all(performances): perfMeter.put(performances) if (i + 1) % self.cfg.N_ITERS_TO_DISPLAY_STATUS == 0: avg_losses = lossMeter.average() template = "[epoch {}/{}, iter {}, lr {}] Total train loss: {:.4f} " "(ips = {:.2f} )\n" "{}" self.logger.info( template.format( epoch, self.cfg.N_MAX_EPOCHS, i, round(get_current_lr(optimizer), 6), avg_losses["all_loss"], self.batch_size * self.cfg.N_ITERS_TO_DISPLAY_STATUS / _timer.total_time, "\n".join([ "{}: {:.4f}".format(n, l) for n, l in avg_losses.items() if n != "all_loss" ]), )) if self.cfg.TENSORBOARD: tb_step = int((epoch * self.n_steps_per_epoch + i) / self.cfg.N_ITERS_TO_DISPLAY_STATUS) # Logging train losses [ self.tb_writer.add_scalar(f"loss/{prefix}_{n}", l, tb_step) for n, l in avg_losses.items() ] lossMeter.clear() del imgs, labels, losses, performances lr_scheduler.step() if self.cfg.TENSORBOARD and len(perfMeter): avg_perf = perfMeter.average() [ self.tb_writer.add_scalar(f"performance/{prefix}_{k}", v, epoch) for k, v in avg_perf.items() ] if self.cfg.TENSORBOARD_WEIGHT and False: for name, param in model.named_parameters(): layer, attr = os.path.splitext(name) attr = attr[1:] self.tb_writer.add_histogram("{}/{}".format(layer, attr), param, epoch)
def train(train_loader, model, optimizer, lr_scheduler, epoch): batch_time = AverageMeter() data_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() if args.use_dali: train_loader_len = int(np.ceil(train_loader._size/args.batch_size)) else: train_loader_len = len(train_loader) # switch to train mode model.train() end = time.time() for i, data in enumerate(train_loader): # measure data loading time data_time.update(time.time() - end) if args.use_dali: target = torch.cat([i["label"].to(torch.device('cuda:0')) for i in data], dim=0) data = torch.cat([i["data"].to(torch.device('cuda:0')) for i in data], dim=0) target = target.cuda().squeeze().long() else: data, target = data data = data.cuda() target = target.cuda() output = model(data) loss = criterion(output, target) acc1, acc5 = accuracy(output, target, topk=(1, 5)) losses.update(loss.item(), data.size(0)) top1.update(acc1.item(), data.size(0)) top5.update(acc5.item(), data.size(0)) # compute gradient and do SGD step optimizer.zero_grad() loss.backward() lr = lr_scheduler.step() for pg in optimizer.param_groups: pg["lr"] = lr # impose L1 penalty to BN factors if args.sparsity != 0: for m in model.modules(): if isinstance(m, nn.BatchNorm2d): m.weight.grad.data.add_(args.sparsity*torch.sign(m.weight.data)) # L1 optimizer.step() # measure elapsed time batch_time.update(time.time() - end) end = time.time() lr = optimizer.param_groups[0]["lr"] if i % args.print_freq == 0: logger.info('Epoch[{0}/{1}] Iter[{2}/{3}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Data Time {data_time.val:.3f} ({data_time.avg:.3f})\t' 'Train Loss {loss.val:.3f} ({loss.avg:.3f})\t' 'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t' 'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t' 'LR: {lr:.4f}'.format( epoch, args.epochs, i, train_loader_len, batch_time=batch_time, data_time=data_time, loss=losses, top1=top1, top5=top5, lr=lr)) if args.use_dali: train_loader.reset() return losses.avg
def train(opt): """ dataset preparation """ logging.info("dataset preparation ...") dataset = Dateloader(opt.data_path, mode="train", dataset=opt.Datasets) data_loader = torch.utils.data.DataLoader(dataset, batch_size=opt.batch_size, shuffle=True, num_workers=opt.num_workers, drop_last=True, pin_memory=True) dataset_val = Dateloader(opt.data_path, mode="test", dataset=opt.Datasets) data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=opt.batch_size, shuffle=False, num_workers=opt.num_workers) logging.info('| Building net...') model = create_model(opt.Backbone, opt.num_classes) model = torch.nn.DataParallel(model) cudnn.benchmark = True optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9, weight_decay=2e-5) # optimizer = optim.Adam(model.parameters(), lr=opt.lr, weight_decay=opt.weight_decay) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[80, 130, 170, 200, 230, 250], gamma=0.1) CEloss = nn.CrossEntropyLoss() best_acc = 0 for epoch in range(opt.epoch_iter): model.train() epoch_loss = 0 lr_scheduler.step() epoch_time = time.time() for i, (image, gt) in enumerate(data_loader): start_time = time.time() inputs, labels = image.cuda(), gt.cuda() optimizer.zero_grad() outputs = model(inputs) loss = CEloss(outputs, labels) epoch_loss += loss.item() loss.backward() optimizer.step() logging.info('Epoch is [{}/{}], mini-batch is [{}/{}], time consumption is {:.8f}, batch_loss is {:.8f}'.format( \ epoch + 1, opt.epoch_iter, i + 1, int(len(data_loader)), time.time() - start_time, loss.item())) if epoch > 1: validate(data_loader_val, model, CEloss) best_acc = test(epoch, model, data_loader_val, best_acc) model.train() logging.info( "----------------------------------------------------------") logging.info(" best_acc: {:.3f}".format(best_acc)) logging.info(" lr: {:.3f}".format( optimizer.param_groups[0]['lr'])) logging.info( "----------------------------------------------------------") logging.info('epoch_loss is {:.8f}, epoch_time is {:.8f}'.format( epoch_loss / int(len(data_loader)), time.time() - epoch_time)) logging.info(time.asctime(time.localtime(time.time())))
def train_model(model, criterion, optimizer, lr_scheduler, num_epochs=5): start = time.time() #deepcopy needed for references best_model = copy.deepcopy(model.state_dict()) best_acc = 0.0 train_losses = [] train_acc = [] dev_losses = [] dev_acc = [] for epoch in range(num_epochs): print('Epoch {}/{}'.format(epoch, num_epochs - 1)) print('-' * 10) for phase in ['train', 'dev']: if phase == 'train': lr_scheduler.step() #Toggle 'train' mode for model. model.train() else: #Toggle 'eval' mode for model model.eval() running_loss = 0.0 running_corrects = 0 for inputs, labels in dataloaders[phase]: inputs = inputs.to(device) labels = labels.to(device) # reset gradients optimizer.zero_grad() with torch.set_grad_enabled(phase == 'train'): outputs = model(inputs) _, preds = torch.max(outputs, 1) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'train': loss.backward() optimizer.step() # accumulate running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / dataset_sizes[phase] epoch_acc = running_corrects.double() / dataset_sizes[phase] if phase == 'train': train_losses.append(epoch_loss) train_acc.append(epoch_acc) elif phase == 'dev': dev_losses.append(epoch_loss) dev_acc.append(epoch_acc) print('{} Loss: {:.4f} Acc: {:.4f}'.format( phase, epoch_loss, epoch_acc)) #keep updating best model if phase == 'dev' and epoch_acc > best_acc: best_acc = epoch_acc best_model = copy.deepcopy(model.state_dict()) print() time_elapsed = time.time() - start print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best dev Acc: {:4f}'.format(best_acc)) # load best model weights model.load_state_dict(best_model) return model, train_losses, train_acc, dev_losses, dev_acc
    else:
        val_loss = evaluate(val_data, eval_batch_size)
        print('-' * 89)
        print('| end of epoch {:3d} | lr {:5.2f} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f} | valid bpc {:8.3f}'.format(
                  epoch, lr, (time.time() - epoch_start_time), val_loss,
                  math.exp(val_loss), val_loss / math.log(2)))
        print('-' * 89)

        if val_loss < stored_loss:
            model_save(args.save)
            print('Saving model (new best validation)')
            stored_loss = val_loss

        if args.optimizer == 'adam':
            lr_scheduler.step(val_loss)

        if args.optimizer == 'sgd' and 't0' not in optimizer.param_groups[0] and (
                len(best_val_loss) > args.nonmono and
                val_loss > min(best_val_loss[:-args.nonmono])):
            print('Switching to ASGD')
            optimizer = Sparse_ASGD(model.parameters(), lr=args.lr, t0=0,
                                    lambd=0., weight_decay=args.wdecay)
            mask.optimizer = optimizer
            mask.update_optimizer_mask()

        if args.sparse and ('t0' not in optimizer.param_groups[0]):
            mask.at_end_of_epoch(epoch)

        best_val_loss.append(val_loss)

    print("PROGRESS: {}%".format((epoch / args.epochs) * 100))
def train(model_ft, criterion, optimizer_ft, train_generator, val_generator, regularize=False, n_epochs=20, lr_scheduler=None): start_time = time.time() # Current time data_actual = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") # Interesting metrics to keep loss_train = [] acc_train = [] loss_val = [] acc_val = [] best_val_acc = 0. # Main loop for epoch in range(n_epochs): # Train model_ft.train() cont = 0 running_loss = 0.0 running_corrects = 0 if lr_scheduler: lr_scheduler.step() for rgbs, labels in training_generator: cont += 1 # Get items from generator if torch.cuda.is_available(): inputs = rgbs.cuda() labels = labels.cuda() else: inputs = rgbs # Clean grads optimizer_ft.zero_grad() #Forward outs = model_ft(inputs) _, preds = torch.max(outs, 1) loss = criterion(outs, labels) # Track losses + correct predictions running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) loss.backward() optimizer_ft.step() # Get avg loss + accuracies in % epoch_loss = running_loss / dataset.__len__() epoch_acc = running_corrects.double().detach() / dataset.__len__() print('{} Loss: {:.4f} Acc: {:.4f}'.format('Train epoch ' + str(epoch), epoch_loss, epoch_acc)) loss_train.append(epoch_loss) #.data.cpu().numpy()[0]) acc_train.append(epoch_acc.data.cpu().numpy()) # Val model_ft.eval() cont = 0 running_loss = 0.0 running_corrects = 0 predicts = [] val_labels = [] for rgbs, labels in val_generator: cont += 1 val_labels += list(labels.numpy()) # Get items from generator if torch.cuda.is_available(): inputs = rgbs.cuda() labels = labels.cuda() else: inputs = rgbs # Clean grads optimizer_ft.zero_grad() #Forward outs = model_ft(inputs) _, preds = torch.max(outs, 1) predicts += list(preds.cpu().numpy()) loss = criterion(outs, labels) loss.backward() optimizer_ft.step() running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) epoch_loss = running_loss / dataset_val.__len__() epoch_acc = running_corrects.double().detach() / dataset_val.__len__() epoch_acc = epoch_acc.data.cpu().numpy() print('{} Loss: {:.4f} Acc: {:.4f}'.format('Val epoch ' + str(epoch), epoch_loss, epoch_acc)) loss_val.append(epoch_loss) #.data.cpu().numpy()) acc_val.append(epoch_acc) # Save model and early stop? if epoch_acc > best_val_acc: best_model_wts = copy.deepcopy(model_ft.state_dict()) best_predicts = predicts best_labels = val_labels torch.save(best_model_wts, 'attention_resnet_' + data_actual) results = {} loss = {} acc = {} # losses loss['train'] = np.array(loss_train) loss['val'] = np.array(loss_val) # accuracies acc['train'] = np.array(acc_train) acc['val'] = np.array(acc_val) results['losses'] = loss results['acc'] = acc print("--- %s seconds ---" % (time.time() - start_time)) return results, best_labels, best_predicts, data_actual
def main(): # --- parse parameters --- for i in np.arange(1, len(sys.argv), 1): [key, val] = sys.argv[i].split('=', 1) if key in [ 'd', 'nIneq', 'randseed', 'nEpochs', 'Ktrain', 'Kval', 'Ktest' ]: args[key] = int(val) elif key in ['bound']: args[key] = float(val) elif key in ['nunits']: args[key] = [int(s) for s in val.split(',')] elif key in ['videofilename', 'datafilename']: if val == 'None': args[key] = None else: args[key] = val else: print('WARNING: invalid input option {0:s}'.format(key)) if args['nunits'] is None: args['nunits'] = [args['d'], 4 * args['d'], 16 * args['d'], args['d']] # check if cuda available args['useCuda'] = False #torch.cuda.is_available() print('CUDA enabled: {0:}'.format(args['useCuda'])) # seed rng np.random.seed(args['randseed']) # --- generate inequalities to make convex set --- print('Making data...') #ineq = linearIneqTestData.makeRandomData(args['d'], args['nIneq']) #ineq = linearIneqTestData.makeSimplePolygonData() ineq = linearIneqTestData.makeSimpleTriangleData() print('done.') # --- generate point/projected point pairs --- print('Making training/validation/test sets:') print('Training set...') dataTrain = linearIneqTestData.makePointProjectionPairs( ineq, args['Ktrain'], args['bound']) trainDataset = ProjectionDataset(dataTrain['P'], dataTrain['Pproj']) print('Validation set...') dataVal = linearIneqTestData.makePointProjectionPairs( ineq, args['Kval'], args['bound']) valDataset = ProjectionDataset(dataVal['P'], dataVal['Pproj']) print('Test set...') dataTest = linearIneqTestData.makePointProjectionPairs( ineq, args['Ktest'], args['bound']) testDataset = ProjectionDataset(dataTest['P'], dataTest['Pproj']) print('done.') #linearIneqTestData.plot(ineq, P=dataTrain['P'], Pproj=dataTrain['Pproj'], Pproj_hat=None, # showplot=True, savefile="traindata.png") # --- train network --- print('Constructing network...') model = Network() if args['useCuda']: model.cuda() print(model) print('done.') print('Making optimizer...') optimizer = torch.optim.SGD(model.parameters(), 0.05, momentum=0.9, weight_decay=1e-4) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[225, 240]) if args['useCuda']: criterion = torch.nn.SmoothL1Loss().cuda() # huber loss else: criterion = torch.nn.SmoothL1Loss() print('done.') print('Training...') Pproj_hat = np.zeros((args['d'], args['Kval'], args['nEpochs'])) errs = np.zeros((args['nEpochs'])) losses = np.zeros((args['nEpochs'])) for epoch in range(args['nEpochs']): # train one epoch currentLR = optimizer.param_groups[0]['lr'] train(trainDataset, model, criterion, optimizer) lr_scheduler.step() # evaluate on validation set errs[epoch], Pproj_hat[:, :, epoch] = validate(valDataset, model, criterion) #linearIneqTestData.plot(ineq, P=dataTrain['P'], Pproj=dataTrain['Pproj'], Pproj_hat=None, # showplot=False, savefile=None) print('Epoch {0:d}/{1:d}\tlr = {2:.5e}\tmean l2 err = {3:.7f}'.format( epoch + 1, args['nEpochs'], currentLR, errs[epoch])) print('Training ({0:d} epochs) complete!'.format(args['nEpochs'])) # --- save results on training/eval set --- print('Generating output files:') if args['videofilename'] is None: print('Video creation disabled.') else: print('Making video...') linearIneqTestData.makevideo(ineq, dataTest['P'], dataTest['Pproj'], Pproj_hat, savefile=args['videofilename'] + '.mp4', errs=errs) print('done.') if args['datafilename'] is None: print('Data output disabled.') else: print('Saving results...') saveTestResults(trainDataset, model, args['datafilename'] + '_train.mat') 
        saveTestResults(valDataset, model, args['datafilename'] + '_val.mat')
        saveTestResults(testDataset, model, args['datafilename'] + '_test.mat')
        print('done.')
    print('Output complete!')
def _train_one_epoch(self, train_loader, batch_size=0, epoch=0, print_freq=1,
                     multi_scale=False, img_size=(512, 512), grid_min=None,
                     grid_max=None, grid_size=32, random_size=64,
                     device=torch.device('cuda'), warmup=False):
    self.model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    lr_scheduler = None
    if epoch == 0 and warmup:
        # For the first epoch (epoch=0) enable warmup training, i.e. ramp the
        # learning rate up over the first iterations.
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(train_loader) - 1)
        lr_scheduler = utils.warmup_lr_scheduler(self.optimizer, warmup_iters,
                                                 warmup_factor)
        random_size = 1

    enable_amp = 'cuda' in device.type
    scale = amp.GradScaler(enabled=enable_amp)

    lr_now = 0.
    loss_mean = torch.zeros(4).to(device)  # mean losses
    batch_size = len(train_loader)  # number of batches
    for i, (images, targets, paths, _, _) in enumerate(
            metric_logger.log_every(train_loader, print_freq, header)):
        # count_batch counts every batch since epoch 0
        count_batch = i + batch_size * epoch  # number of integrated batches (since train start)

        images = images.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)

        # Multi-Scale
        # The labels are already relative coordinates, so rescaling the images
        # does not change the label values.
        # Every `random_size` (64) images, randomly change the input image size.
        if multi_scale:
            images, img_size = self.random_size(images, img_size,
                                                count_batch % random_size == 0,
                                                grid_min, grid_max, grid_size)

        # Mixed-precision context manager; it is a no-op on the CPU.
        with amp.autocast(enabled=enable_amp):
            # loss: compute_loss
            loss_dict = self.loss(self.model(images), targets)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purpose
            loss_dict_reduced = utils.reduce_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            loss_items = torch.cat((loss_dict_reduced["box_loss"],
                                    loss_dict_reduced["obj_loss"],
                                    loss_dict_reduced["class_loss"],
                                    losses_reduced)).detach()
            loss_mean = (loss_mean * i + loss_items) / (i + 1)  # update mean losses

            if not torch.isfinite(losses_reduced):
                print('WARNING: non-finite loss, ending training ', loss_dict_reduced)
                print("training image path: {}".format(",".join(paths)))
                sys.exit(1)

            losses *= 1. / random_size  # scale loss

        # backward
        scale.scale(losses).backward()
        # optimize
        # Update the weights once every `random_size` (64) images.
        if count_batch % random_size == 0:
            scale.step(self.optimizer)
            scale.update()
            self.optimizer.zero_grad()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        lr_now = self.optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=lr_now)

        if count_batch % random_size == 0 and lr_scheduler is not None:
            # warmup training is used for the first epoch only
            self.optimizer.step()
            lr_scheduler.step()

    return loss_mean, lr_now
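# A minimal sketch of the AMP + gradient-accumulation pattern used above
# (scale the loss, accumulate gradients, step the optimizer every `accum`
# batches). The model, data, and `accum` value are placeholders, and a CUDA
# device is assumed for autocast/GradScaler.
import torch
from torch.cuda import amp

model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = amp.GradScaler(enabled=True)
accum = 4  # stand-in for `random_size`

for i in range(16):
    x = torch.randn(8, 16, device='cuda')
    y = torch.randn(8, 4, device='cuda')
    with amp.autocast(enabled=True):
        loss = torch.nn.functional.mse_loss(model(x), y) / accum  # scale the loss
    scaler.scale(loss).backward()        # gradients accumulate across iterations
    if (i + 1) % accum == 0:
        scaler.step(optimizer)           # unscales gradients, then optimizer.step()
        scaler.update()
        optimizer.zero_grad()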
def train_cls(dataloader, val_dataloader, model_root, net, args): net.train() start_epoch = 1 optimizer = torch.optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, nesterov=True, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=LambdaLR(args.maxepoch, start_epoch, args.decay_epoch).step) train_loss, step_cnt, batch_count = 0.0, 0, 0 best_acc = 0.50 for epoc_num in np.arange(start_epoch, args.maxepoch + 1): for batch_idx, (batch_data, gt_classes, true_num, bboxes) in enumerate(dataloader): im_data = batch_data.cuda().float() im_label = gt_classes.cuda().long() num_data = true_num.cuda().long() im_label = im_label.view(-1, 1) train_pred, assignments = net(im_data, im_label, true_num=num_data) vecloss = net.loss loss = torch.mean(vecloss) n_data = im_data.size()[0] num_sample = im_data.size()[0] train_loss_val = loss.data.cpu().item() train_loss += train_loss_val # backward optimizer.zero_grad() loss.backward() optimizer.step() step_cnt += 1 batch_count += 1 train_loss /= step_cnt print((' epoch {}, loss: {}, learning rate: {:.5f}'. \ format(epoc_num, train_loss, optimizer.param_groups[0]['lr']))) net.eval() total_pred, total_gt = [], [] for val_data, val_label, val_num, val_boxes in val_dataloader: val_data = val_data.cuda().float() val_num = val_num.cuda().long() val_pred_pro, assignments = net(val_data, true_num=val_num) val_pred_pro = val_pred_pro.cpu() _, cls_labels = torch.topk(val_pred_pro, 1, dim=1) cls_labels = cls_labels.data.cpu().numpy()[:, 0] total_pred.extend(cls_labels.tolist()) total_gt.extend(val_label.tolist()) precision, recall, fscore, support = score(total_gt, total_pred) con_mat = confusion_matrix(total_gt, total_pred) # print(' p: {}\n r: {}\n f1: {} \n'.format(precision, recall, fscore)) # print('confusion matrix:') # print(con_mat) cls_acc = np.trace(con_mat) * 1.0 / np.sum(con_mat) print("\n Current classification accuracy is: {:.4f}".format(cls_acc)) train_loss, step_cnt = 0, 0 net.train() lr_scheduler.step() if epoc_num % args.save_freq == 0 and cls_acc >= best_acc and epoc_num >= args.maxepoch - 10: save_model_name = 'epoch-{}-acc-{:.3f}.pth'.format( str(epoc_num).zfill(3), cls_acc) torch.save(net.state_dict(), os.path.join(model_root, save_model_name)) print('Model saved as {}'.format(save_model_name)) best_acc = cls_acc
def main():
    for epoch in range(1, epochs + 1):
        scheduler.step()
        train(epoch)
        torch.save(model.state_dict(), '%sage_epoch_%d.pth' % (outf, epoch))
for step in range(start_step, n_steps):
    if es.early_stop:
        break
    data, target, meta = next(iter(train_loader))
    step_loss, step_precision = train_triplet_step(data, target, model, device,
                                                   optimizer, miner)
    print('Train Step: {} Precision@1: {:.4f}\tLoss: {:.6f}'.format(
        step, step_precision, step_loss), flush=True)

    if step % args.val_freq == 0:
        total_loss, acc_dict, embedding_list, target_list = representation(
            model, device, validation_loader)
        lr_scheduler.step(total_loss)
        es(total_loss, step, model.state_dict(), output_dir / 'model.pt')
        save_checkpoint(
            model, optimizer, lr_scheduler,
            train_loader.sampler.state_dict(train_loader._infinite_iterator),
            step + 1, es, torch.random.get_rng_state())

_, acc_dict, embedding_list, target_list = representation(model, device, test_loader)
_, acc_dict_aug, embedding_list_aug, target_list_aug = representation(
    model, device, test_loader_aug)

results = {}
acc_calc = AccuracyCalculator()
for m, embedding, target in zip(['unaug', 'aug'],
global_step = int(checkpoint["global_step"]) global_start_time = time.time() if not PREDICT_ONLY: print("Training...") criterion = nn.CrossEntropyLoss() optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE) if CHECKPOINT_NAME != None: optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) # model, optimizer = amp.initialize(model, optimizer, opt_level="O1") lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=LR_STEP, gamma=LR_FACTOR) if USE_PARALLEL: print("[Using all the available GPUs]") model = nn.DataParallel(model, device_ids=[0, 1]) for epoch in range(epoch, NUM_EPOCHS + 1): print('-' * 50) train(train_loader, model, criterion, optimizer, epoch, lr_scheduler, tensorboard, label_encoder) eval(val_loader, train_loader, model, tensorboard, epoch) lr_scheduler.step() if has_time_run_out(): break
def main(): global args, best_prec1 args = parser.parse_args() # Check the save_dir exists or not if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) #model.cuda() # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) print(list(checkpoint.keys())) #args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) #print("=> loaded checkpoint '{}' (epoch {})" # .format(args.evaluate, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) print('Making train_loader...') train_loader = torch.utils.data.DataLoader(datasets.CIFAR10( root='./data', train=True, transform=transforms.Compose([ transforms.RandomHorizontalFlip(), transforms.RandomCrop(32, 4), transforms.ToTensor(), normalize, ]), download=True), batch_size=args.batch_size, shuffle=True, num_workers=args.workers, pin_memory=True) print('Making val_loader...') val_loader = torch.utils.data.DataLoader(datasets.CIFAR10( root='./data', train=False, transform=transforms.Compose([ transforms.ToTensor(), normalize, ])), batch_size=128, shuffle=False, num_workers=args.workers, pin_memory=True) # define loss function (criterion) and pptimizer #criterion = nn.CrossEntropyLoss().cuda() criterion = nn.CrossEntropyLoss() if args.half: model.half() criterion.half() print('Making optimizer...') optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.MultiStepLR( optimizer, milestones=[100, 150], last_epoch=args.start_epoch - 1) if args.arch in ['resnet1202', 'resnet110']: # for resnet1202 original paper uses lr=0.01 for first 400 minibatches for warm-up # then switch back. In this implementation it will correspond for first epoch. for param_group in optimizer.param_groups: param_group['lr'] = args.lr * 0.1 if args.evaluate: print('Evaluating...') validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): # train for one epoch print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr'])) train(train_loader, model, criterion, optimizer, epoch) lr_scheduler.step() # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) if epoch > 0 and epoch % args.save_every == 0: save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th')) save_checkpoint( { 'state_dict': model.state_dict(), 'best_prec1': best_prec1, }, is_best, filename=os.path.join(args.save_dir, 'model.th'))
def fit(self, dataset_train, nb_epochs=10, batch_size=64, optimizer=None, lr=0.001, lr_step_size=0, dataset_val=None): if self._crayon_exp is None and self.crayon_exp_name is not None: self._crayon_exp = get_crayon_experiment(self.crayon_exp_name) if optimizer is None: optimizer = torch.optim.Adam(self.model.parameters(), lr=lr) lr_scheduler = None if lr_step_size != 0: lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=0.5) data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=torch.cuda.is_available()) phases = ['train', ] data_loaders = [data_loader_train] if dataset_val is not None: data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size=batch_size, shuffle=False, num_workers=1, pin_memory=torch.cuda.is_available()) phases.append('val') data_loaders.append(data_loader_val) model = cuda(self.model) loss_fn = cuda(self.loss) j = 1 loss_best = np.inf for epoch in range(nb_epochs): for phase, data_loader in zip(phases, data_loaders): if phase == 'train': model.train(True) else: model.train(False) if phase == 'train' and lr_scheduler is not None: lr_scheduler.step() pbar_desc = f'Epoch {epoch}, {phase}' pbar = tqdm(total=len(data_loader.dataset), desc=pbar_desc, postfix={f'loss_{phase}': 0}, ncols=120) running_loss = 0.0 for j, (inputs, targets) in enumerate(data_loader, 1): volatile = phase == 'val' inputs = variable(inputs, volatile=volatile) targets = variable(targets, volatile=volatile) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = model(inputs) loss = loss_fn(outputs, targets) if phase == 'train': loss.backward() optimizer.step() batch_loss = loss.data[0] running_loss += batch_loss pbar.update(inputs.size(0)) pbar.set_postfix(**{f'loss_{phase}': batch_loss}) if self._crayon_exp is not None: self._crayon_exp.add_scalar_value(f'loss_batch/{phase}', batch_loss) lr = optimizer.param_groups[0]['lr'] self._crayon_exp.add_scalar_value(f'learning_rate', lr) del loss del outputs del targets epoch_loss = running_loss / j pbar.set_postfix(**{f'loss_{phase}': epoch_loss}) pbar.close() if self._crayon_exp is not None: self._crayon_exp.add_scalar_value(f'loss_epoch/{phase}', epoch_loss) if phase == 'val' and epoch_loss < loss_best and self.checkpoint_filename is not None: save_weights(model, self.checkpoint_filename) loss_best = epoch_loss
def train_model(output_path, model, dataloaders, dataset_sizes, criterion, optimizer, num_epochs=5, scheduler=None, lr=0.1): if not os.path.exists('models/' + str(output_path)): os.makedirs('models/' + str(output_path)) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") since = time.time() best_model_wts = copy.deepcopy(model.state_dict()) best_acc = 0.0 best = 0 for epoch in range(num_epochs): top1 = AverageMeter() top5 = AverageMeter() losses = AverageMeter() print('Epoch {}/{}'.format(epoch + 1, num_epochs)) print('-' * 10) # Each epoch has a training and validation phase for phase in ['train', 'val']: if phase == 'train': if scheduler != None: scheduler.step() model.train() # Set model to training mode else: model.eval() # Set model to evaluate mode running_loss = 0.0 running_corrects = 0 # Iterate over data. for i, (inputs, labels) in enumerate(dataloaders[phase]): inputs = inputs.to(device) labels = labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward # track history if only in train with torch.set_grad_enabled(phase == 'train'): outputs = model(inputs) _, preds = torch.max(outputs, 1) acc1, acc5 = accuracy(outputs, labels, topk=(1, 5)) loss = criterion(outputs, labels) # backward + optimize only if in training phase if phase == 'train': losses.update(loss.item(), inputs.size(0)) top1.update(acc1[0], inputs.size(0)) top5.update(acc5[0], inputs.size(0)) loss.backward() optimizer.step() # statistics running_loss += loss.item() * inputs.size(0) running_corrects += torch.sum(preds == labels.data) # print("\rIteration: {}/{}, Loss: {}.".format(i+1, len(dataloaders[phase]), loss.item() * inputs.size(0)), end="") sys.stdout.flush() print( '\rLoss {loss.val:.4f} ({loss.avg:.4f}) Acc@1 {top1.val:.3f} ({top1.avg:.3f}) Acc@5 {top5.val:.3f} ({top5.avg:.3f})' .format(loss=losses, top1=top1, top5=top5), end="") # print( (i+1)*100. / len(dataloaders[phase]), "% Complete" ) epoch_loss = running_loss / dataset_sizes[phase] epoch_acc = running_corrects.double() / dataset_sizes[phase] if phase == 'train': avg_loss = epoch_loss t_acc = epoch_acc else: val_loss = epoch_loss val_acc = epoch_acc # print('{} Loss: {:.4f} Acc: {:.4f}'.format( # phase, epoch_loss, epoch_acc)) # deep copy the model if phase == 'val' and epoch_acc > best_acc: best_acc = epoch_acc best = epoch + 1 best_model_wts = copy.deepcopy(model.state_dict()) print('Train Loss: {:.4f} Acc: {:.4f}'.format(avg_loss, t_acc)) print('Val Loss: {:.4f} Acc: {:.4f}'.format(val_loss, val_acc)) print() # torch.save(model.state_dict(), './models/' + str(output_path) + '/model_{}_epoch.pth'.format(epoch+1)) torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': epoch_loss, }, './models/' + str(output_path) + '/model_{}_epoch.pth'.format(epoch + 1)) time_elapsed = time.time() - since print('Training complete in {:.0f}m {:.0f}s'.format( time_elapsed // 60, time_elapsed % 60)) print('Best Validation Accuracy: {}, Epoch: {}'.format(best_acc, best))
def learning_process(train_loader, network, criterion, test_loader, all_outputs_test, all_labels_test, mode, optimizer=None, start_epoch=0, lr_scheduler=lr_scheduler): vis = visdom.Visdom() r_loss = [] iterations = [] total_iteration = 0 loss_plot = vis.line(Y=np.zeros(1), X=np.zeros(1)) number_of_epochs = 0 name_prefix_for_saved_model = '' if mode == params.mode_classification: number_of_epochs = params.number_of_epochs_for_classification name_prefix_for_saved_model = params.name_prefix_for_saved_model_for_classification if mode == params.mode_representation: number_of_epochs = params.number_of_epochs_for_representation name_prefix_for_saved_model = params.name_prefix_for_saved_model_for_representation for epoch in range( start_epoch, number_of_epochs): # loop over the dataset multiple times pr = cProfile.Profile() pr.enable() lr_scheduler.step(epoch=epoch) print('current_learning_rate =', optimizer.param_groups[0]['lr']) print(datetime.datetime.now()) running_loss = 0.0 i = 0 # for representation we need clever sampling which should change every epoch # if mode == params.mode_representation: # train_loader, test_loader, \ # train_loader_for_classification, test_loader_for_classification = cifar.download_CIFAR100() for i, data in enumerate(train_loader, 0): # print('i = ', i) # get the inputs # inputs are [torch.FloatTensor of size 4x3x32x32] # labels are [torch.LongTensor of size 4] # here 4 is a batch size and 3 is a number of channels in the input images # 32x32 is a size of input image inputs, labels = data # wrap them in Variable inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda()) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = network(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics current_batch_loss = loss.data[0] if i % params.skip_step == 0: # print every 2000 mini-batches print('[ephoch %d, itteration in the epoch %5d] loss: %.30f' % (epoch + 1, i + 1, current_batch_loss)) r_loss.append(current_batch_loss) iterations.append(total_iteration + i) options = dict(legend=['loss for' + mode]) loss_plot = vis.line( Y=np.array(r_loss), X=np.array(iterations), # , update='append', win=loss_plot, opts=options) if epoch % 10 == 0: # print the train accuracy at every epoch # to see if it is enough to start representation training # or we should proceed with classification if mode == params.mode_classification: accuracy = test.test_for_classification( test_loader=test_loader, network=network) if mode == params.mode_representation: # we should recalculate all outputs before the evaluation because our network changed during the trainig all_outputs_test, all_labels_test = metric_learning_utils.get_all_outputs_and_labels( test_loader, network) recall_at_k = test.full_test_for_representation( k=params.k_for_recall, all_outputs=all_outputs_test, all_labels=all_labels_test) utils.save_checkpoint(network=network, optimizer=optimizer, filename=name_prefix_for_saved_model + '-%d' % epoch, epoch=epoch) total_iteration = total_iteration + i print('total_iteration = ', total_iteration) pr.disable() # s = io.FileIO('profiler-statistic') s = io.StringIO() sortby = 'tottime' ps = pstats.Stats(pr, stream=s).sort_stats(sortby) # ps.print_stats() # print(s.getvalue()) print('Finished Training')
def k_train_fcn(k_fold, model, batch_size, max_iterations, save_dir='./logs', eval_every=50, checkpoint_every=1000, mode='reg', config=None): train_weight = torch.tensor([1, 200, 3500, 20000], dtype=torch.float).to(config['DEVICE']) mse_loss = torch.nn.MSELoss().to(config['DEVICE']) cls_loss = cross_entropy2d save_dir += datetime.now().strftime("_%m_%d_%H_%M") if not os.path.exists(save_dir): os.makedirs(save_dir) data_gen = DataGenerator_FCN(global_config['DATA_PATH'], k_fold, batch_size, config['IN_LEN'], config['OUT_LEN'], config['IN_LEN'] + config['OUT_LEN'], config=config) writer = SummaryWriter(os.path.join(save_dir, 'train_logs')) for k in range(1, k_fold + 1): k_model, optimizer, lr_scheduler = model() data_gen.set_k(k) train_loss = 0.0 train_acc = 0.0 train_f1 = 0.0 train_csi = np.zeros((len(global_config['LEVEL_BUCKET']) + 1, ), dtype=np.float32) train_count = 0 i_batch = 0 best_val_loss = np.inf pbar = tqdm(range(1, max_iterations + 1)) for itera in pbar: n_train_batch = data_gen.n_train_batch() pbar_b = tqdm( np.random.choice(data_gen.n_train_batch(), 10000)) #range(data_gen.n_train_batch())) for b in pbar_b: pbar.set_description("Fold %d Training at batch %d / %d" % (k, i_batch, n_train_batch)) train_data, train_label = data_gen.get_train(b) #train_data, train_label, train_label_cat = data_gen.get_train(b) k_model.train() optimizer.zero_grad() output = k_model(train_data) # print(train_label.size()) output = output[:, 0] # loss = None # if mode == 'reg': loss = mse_loss(output, train_label[:, 0]) # elif mode == 'seg': # loss = cls_loss(output, train_label_cat, weight=train_weight) # elif mode == 'reg_multi': # loss = mse_loss(output, train_label) # loss += cls_loss(output, train_label_cat, weight=train_weight) # else: # raise Exception('wrong mode') loss.backward() # torch.nn.utils.clip_grad_value_(k_model.parameters(), clip_value=50.0) optimizer.step() lr_scheduler.step() train_loss += loss.item() # pred_numpy = output.cpu().max(1)[1].detach().numpy().flatten() # label_numpy = train_label_cat.cpu().numpy().flatten() # train_acc += accuracy_score(label_numpy, pred_numpy) # train_f1 += f1_score(label_numpy, pred_numpy, average='macro', zero_division=1) # train_csi += fp_fn_image_csi_muti_reg(pred_numpy, label_numpy) train_csi += fp_fn_image_csi_muti_reg( dbz_mm(output.detach().cpu().numpy()), dbz_mm(train_label[:, 0].detach().cpu().numpy())) train_count += 1 if i_batch % eval_every == 0: val_loss = 0.0 val_acc = 0.0 val_f1 = 0.0 val_csi = np.zeros( (len(global_config['LEVEL_BUCKET']) + 1, ), dtype=np.float32) val_count = 0 with torch.no_grad(): k_model.eval() n_val_batch = data_gen.n_val_batch() for ib_val, b_val in enumerate( np.random.choice(n_val_batch, 20)): #range(n_val_batch) val_data, val_label = data_gen.get_val(b_val) # val_data, val_label, val_label_cat = data_gen.get_val(b_val) output = k_model(val_data) output = output[:, 0] loss = mse_loss(output, val_label[:, 0]) # loss = None # if mode == 'reg': # loss = mse_loss(output, val_label) # elif mode == 'seg': # loss = cls_loss(output, val_label_cat, weight=train_weight) # elif mode == 'reg_multi': # loss = mse_loss(output, val_label) # loss += cls_loss(output, val_label_cat, weight=train_weight) val_loss += loss.item() # pred_numpy = output.cpu().max(1)[1].detach().numpy().flatten() # label_numpy = val_label_cat.cpu().numpy().flatten() # val_acc += accuracy_score(label_numpy, pred_numpy) # val_f1 += f1_score(label_numpy, pred_numpy, average='macro', zero_division=1) val_csi += fp_fn_image_csi_muti_reg( 
dbz_mm(output.detach().cpu().numpy()), dbz_mm(val_label[:, 0].detach().cpu().numpy())) val_count += 1 pbar.set_description( "Fold %d Validating at batch %d / %d" % (k, ib_val, 20)) train_loss /= train_count train_f1 /= train_count train_acc /= train_count train_csi /= train_count val_loss /= val_count val_f1 /= val_count val_acc /= val_count val_csi /= val_count writer.add_scalars('loss/' + str(k), { 'train': train_loss, 'valid': val_loss }, i_batch) # writer.add_scalars('f1/'+str(k), { # 'train': train_f1, # 'valid': val_f1 # }, i_batch) # writer.add_scalars('acc/'+str(k), { # 'train': train_acc, # 'valid': val_acc # }, i_batch) for i in range(train_csi.shape[0]): writer.add_scalars('csi_' + str(i) + '/' + str(k), { 'train': train_csi[i], 'valid': val_csi[i] }, i_batch) # writer.add_image('result/val', torch.tensor(cv2.cvtColor(np.array(output.cpu().max(1)[1].detach().numpy() / 4 * 255, dtype=np.uint8)[0,:,:,None], cv2.COLOR_GRAY2RGB).swapaxes(0,2)), i_batch) # writer.add_image('result/gt', torch.tensor(cv2.cvtColor(np.array(val_label_cat.cpu().numpy()[0, 0] / 4 * 255, dtype=np.uint8)[:,:,None], cv2.COLOR_GRAY2RGB).swapaxes(0,2)), i_batch) writer.add_image( 'result/val', torch.tensor( cv2.cvtColor( np.array( dbz_mm(output.detach().cpu().numpy()) / 60 * 255, dtype=np.uint8)[0, :, :, None], cv2.COLOR_GRAY2RGB).swapaxes(0, 2)), i_batch) writer.add_image( 'result/gt', torch.tensor( cv2.cvtColor( np.array( dbz_mm(val_label[:, 0].cpu().numpy()) / 60 * 255, dtype=np.uint8)[0, :, :, None], cv2.COLOR_GRAY2RGB).swapaxes(0, 2)), i_batch) train_loss = 0.0 train_acc = 0.0 train_f1 = 0.0 train_count = 0 train_csi = 0.0 if val_loss <= best_val_loss: torch.save( k_model.state_dict(), os.path.join( save_dir, 'model_f{}_i{}_best.pth'.format(k, i_batch))) best_val_loss = val_loss if i_batch % checkpoint_every == 0: torch.save( k_model.state_dict(), os.path.join(save_dir, 'model_f{}_i{}.pth'.format(k, i_batch))) i_batch += 1 try: torch.save( k_model.state_dict(), os.path.join(save_dir, 'model_f{}_i{}.pth'.format(k, i_batch))) except: print('cannot save model') writer.close()
def train(args): model, model_file = create_model(args.encoder_type, work_dir=args.work_dir, ckp=args.ckp) model = model.cuda() loaders = get_train_val_loaders(batch_size=args.batch_size) #optimizer = RAdam([ # {'params': model.decoder.parameters(), 'lr': args.lr}, # {'params': model.encoder.parameters(), 'lr': args.lr / 10.}, #]) if args.optim_name == 'RAdam': optimizer = RAdam(model.parameters(), lr=args.lr) elif args.optim_name == 'Adam': optimizer = optim.Adam(model.parameters(), lr=args.lr) elif args.optim_name == 'SGD': optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=args.lr) #model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0) if torch.cuda.device_count() > 1: model = DataParallel(model) if args.lrs == 'plateau': lr_scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=args.factor, patience=args.patience, min_lr=args.min_lr) else: lr_scheduler = CosineAnnealingLR(optimizer, args.t_max, eta_min=args.min_lr) best_metrics = 0. best_key = 'dice' print( 'epoch | lr | % | loss | avg | loss | dice | best | time | save |' ) if not args.no_first_val: val_metrics = validate(args, model, loaders['valid']) print( 'val | | | | | {:.4f} | {:.4f} | {:.4f} | | |' .format(val_metrics['loss'], val_metrics['dice'], val_metrics['dice'])) best_metrics = val_metrics[best_key] if args.val: return model.train() #if args.lrs == 'plateau': # lr_scheduler.step(best_metrics) #else: # lr_scheduler.step() train_iter = 0 for epoch in range(args.num_epochs): train_loss = 0 current_lr = get_lrs(optimizer) bg = time.time() for batch_idx, data in enumerate(loaders['train']): train_iter += 1 img, targets = data[0].cuda(), data[1].cuda() batch_size = img.size(0) outputs = model(img) loss = _reduce_loss(criterion(outputs, targets)) (loss).backward() #with amp.scale_loss(loss*batch_size, optimizer) as scaled_loss: # scaled_loss.backward() if batch_idx % 4 == 0: optimizer.step() optimizer.zero_grad() train_loss += loss.item() print('\r {:4d} | {:.6f} | {:06d}/{} | {:.4f} | {:.4f} |'.format( epoch, float(current_lr[0]), args.batch_size * (batch_idx + 1), loaders['train'].num, loss.item(), train_loss / (batch_idx + 1)), end='') if train_iter > 0 and train_iter % args.iter_val == 0: save_model(model, model_file + '_latest') val_metrics = validate(args, model, loaders['valid']) _save_ckp = '' if val_metrics[best_key] > best_metrics: best_metrics = val_metrics[best_key] save_model(model, model_file) _save_ckp = '*' print(' {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format( val_metrics['loss'], val_metrics['dice'], best_metrics, (time.time() - bg) / 60, _save_ckp)) model.train() if args.lrs == 'plateau': lr_scheduler.step(best_metrics) else: lr_scheduler.step() current_lr = get_lrs(optimizer)
def train(model, optimizer, lr_scheduler, dataloaders, device, epochs):
    generator = model[0]
    discriminator = model[1]
    optimizer_G = optimizer[0]
    optimizer_D = optimizer[1]
    for e in range(epochs):
        for x, y in tqdm(dataloaders['train']):
            generator.train()
            discriminator.train()
            valid = torch.ones((x.shape[0], 1), requires_grad=False)
            fake = torch.zeros((x.shape[0], 1), requires_grad=False)
            sampled_latent = torch.tensor(
                np.random.normal(0, 1, (x.shape[0], latent_dim)),
                dtype=torch.float32).to(device=device)
            x = x.to(device=device)
            valid = valid.to(device=device)
            fake = fake.to(device=device)

            generated_imgs = generator(sampled_latent)
            ge_ = discriminator(generated_imgs)
            gt_ = discriminator(x)

            gen_loss = nn.BCELoss()(ge_, valid)
            optimizer_G.zero_grad()
            gen_loss.backward()
            optimizer_G.step()

            dis_loss = (nn.BCELoss()(discriminator(generated_imgs.detach()), fake)
                        + nn.BCELoss()(gt_, valid)) / 2
            optimizer_D.zero_grad()
            dis_loss.backward()
            optimizer_D.step()

        if lr_scheduler:
            lr_scheduler.step()

        print('epoch %d, gen loss = %f, dis loss = %f' %
              (e, gen_loss.item(), dis_loss.item()))
        logging.info('epoch %d, gen loss = %f, dis loss = %f' %
                     (e, gen_loss.item(), dis_loss.item()))
        sample(model, device, e)
        writer.add_scalars("loss", {
            "GEN": gen_loss.item(),
            "DIS": dis_loss.item()
        }, e)
        save_model(save_dir='model_checkpoint', file_name="check_point_G",
                   model=generator, optimizer=optimizer_G, lr_scheduler=lr_scheduler)
        save_model(save_dir='model_checkpoint', file_name="check_point_D",
                   model=discriminator, optimizer=optimizer_D, lr_scheduler=lr_scheduler)

    save_model(save_dir='model_checkpoint', file_name=task_name + "_G",
               model=generator, optimizer=optimizer_G, lr_scheduler=lr_scheduler)
    save_model(save_dir='model_checkpoint', file_name=task_name + "_D",
               model=discriminator, optimizer=optimizer_D, lr_scheduler=lr_scheduler)
    return model