def main():
    """Render the autograd graph of a ResNet-50 ("deconv3" decoder) with torchviz.

    Parses the command line, builds the network on the GPU, pushes one random
    4-channel 192x256 input through it and hands the mean of the output to
    ``torchviz.make_dot``.  The resulting graph is rendered in SVG format via
    ``render("resnet50.gv", "resnet50_render", view=True)``.

    Requires a CUDA device, since both the model and the input are moved
    with ``.cuda()``.
    """
    # The parsed namespace is not used below; the call is kept so that
    # command-line handling still happens first.
    parser.parse_args()

    import torch
    from torch.autograd import Variable

    net = ResNet(50, "deconv3", in_channels=4, image_shape=(192, 256)).cuda()

    # A single random sample shaped (1, channels, *image_shape).
    noise = torch.randn(1, net.in_channels, *net.image_shape).cuda()
    dummy_input = Variable(noise)
    output = net(dummy_input)

    # make_dot gets a scalar (the output mean) plus the named parameters so
    # that graph nodes can be labelled with parameter names.
    named_params = dict(net.named_parameters())
    model_graph = torchviz.make_dot(output.mean(), named_params)
    model_graph.format = "svg"
    model_graph.render("resnet50.gv", "resnet50_render", view=True)
def _build_model(args, train_loader):
    """Instantiate the network selected by ``args.arch``.

    The caller is responsible for moving the returned model to the GPU.

    Args:
        args: parsed options; ``arch`` and ``modality`` are always read,
            ``decoder`` and ``pretrained`` only for the ResNet variants.
        train_loader: training data loader; ``dataset.output_size`` is read
            only for the ResNet variants.

    Raises:
        ValueError: if ``args.arch`` is not one of the supported names.
    """
    in_channels = len(args.modality)
    arch = args.arch
    if arch == 'resnet50':
        return ResNet(layers=50, decoder=args.decoder,
                      output_size=train_loader.dataset.output_size,
                      in_channels=in_channels, pretrained=args.pretrained)
    if arch == 'resnet18':
        return ResNet(layers=18, decoder=args.decoder,
                      output_size=train_loader.dataset.output_size,
                      in_channels=in_channels, pretrained=args.pretrained)
    if arch == 'SmallNet':
        return SmallNet()
    if arch == 'UNET':
        return DepthCompletionNet(args)
    if arch == 'DRNSeg':
        return DRNSeg("drn_d_22", 1, pretrained_model=None, pretrained=False)
    if arch == 'ERF':
        return ERF()
    raise ValueError("unsupported architecture '{}'".format(arch))


def main():
    """Entry point: evaluate a saved model, resume training, or train from scratch.

    Behaviour depends on the module-level ``args``:

    * ``args.evaluate`` (path): load that checkpoint, run ``validate`` once
      on the validation loader and return.
    * ``args.resume`` (path): restore args, model, optimizer, epoch and best
      result from the checkpoint and continue training.
    * otherwise: build a fresh model for ``args.arch`` and an Adam optimizer
      over its trainable parameters.

    During training, each epoch adjusts the learning rate, trains, validates,
    updates ``best.txt`` / ``comparison_best.png`` when the RMSE improves,
    and saves a checkpoint.

    Rebinds the globals ``args``, ``best_result``, ``output_directory``,
    ``train_csv`` and ``test_csv``.

    Raises:
        AssertionError: if the evaluate/resume checkpoint path is not a file.
        ValueError: if ``args.arch`` or ``args.criterion`` is not supported.
    """
    global args, best_result, output_directory, train_csv, test_csv

    start_epoch = 0

    # evaluation mode
    if args.evaluate:
        assert os.path.isfile(args.evaluate), \
            "=> no best model found at '{}'".format(args.evaluate)
        print("=> loading best model '{}'".format(args.evaluate))
        # NOTE: torch.load unpickles arbitrary objects (the checkpoint stores
        # the whole model) -- only load checkpoints from trusted sources.
        # NOTE(review): recent PyTorch releases default to weights_only=True,
        # which rejects such checkpoints -- confirm the targeted version.
        checkpoint = torch.load(args.evaluate)
        output_directory = os.path.dirname(args.evaluate)
        args = checkpoint['args']
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        print("=> loaded best model (epoch {})".format(checkpoint['epoch']))
        _, val_loader = create_data_loaders(args)
        args.evaluate = True
        validate(val_loader, model, checkpoint['epoch'], write_to_file=False)
        return

    # optionally resume from a checkpoint
    elif args.resume:
        chkpt_path = args.resume
        assert os.path.isfile(chkpt_path), \
            "=> no checkpoint found at '{}'".format(chkpt_path)
        print("=> loading checkpoint '{}'".format(chkpt_path))
        # NOTE: same trust caveat as above -- the checkpoint is unpickled.
        checkpoint = torch.load(chkpt_path)
        args = checkpoint['args']
        start_epoch = checkpoint['epoch'] + 1
        best_result = checkpoint['best_result']
        model = checkpoint['model']
        optimizer = checkpoint['optimizer']
        output_directory = os.path.dirname(os.path.abspath(chkpt_path))
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))
        train_loader, val_loader = create_data_loaders(args)
        args.resume = True

    # create new model
    else:
        train_loader, val_loader = create_data_loaders(args)
        print("=> creating Model ({}-{}) ...".format(args.arch, args.decoder))
        model = _build_model(args, train_loader)
        print("=> model created.")

        # Move to the GPU before building the optimizer, as torch.optim
        # recommends.
        model = model.cuda()

        # Collect the trainable parameters for every architecture. Earlier
        # this list was only built for some of them, so the ResNet variants
        # failed with a NameError when the optimizer was created.
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.Adam(trainable_params, lr=args.lr,
                                     weight_decay=args.weight_decay)

    # define loss functions
    if args.criterion == 'l2':
        criterion = criteria.MaskedMSELoss().cuda()
    elif args.criterion == 'l1':
        criterion = criteria.MaskedL1Loss().cuda()
    else:
        raise ValueError(
            "unsupported criterion '{}'".format(args.criterion))
    smoothloss = criteria.SmoothnessLoss().cuda()
    photometric_loss = criteria.PhotometricLoss().cuda()

    # create results folder, if not already exists
    # NOTE(review): this overwrites the directory derived from the checkpoint
    # path in the resume branch -- confirm that is intended.
    output_directory = utils.get_output_directory(args)
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    train_csv = os.path.join(output_directory, 'train.csv')
    test_csv = os.path.join(output_directory, 'test.csv')
    best_txt = os.path.join(output_directory, 'best.txt')

    # create new csv files with only header (kept as-is when resuming)
    if not args.resume:
        for csv_path in (train_csv, test_csv):
            with open(csv_path, 'w') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()

    for epoch in range(start_epoch, args.epochs):
        utils.adjust_learning_rate(optimizer, epoch, args.lr)
        # train for one epoch
        train(train_loader, model, criterion, smoothloss, photometric_loss,
              optimizer, epoch)
        # evaluate on validation set
        result, img_merge = validate(val_loader, model, epoch)

        # remember best rmse and save checkpoint
        is_best = result.rmse < best_result.rmse
        if is_best:
            best_result = result
            with open(best_txt, 'w') as txtfile:
                txtfile.write("epoch={}\nmse={:.3f}\nrmse={:.3f}\nabsrel={:.3f}\nlg10={:.3f}\nmae={:.3f}\ndelta1={:.3f}\nt_gpu={:.4f}\n".
                              format(epoch, result.mse, result.rmse, result.absrel, result.lg10, result.mae, result.delta1, result.gpu_time))
            if img_merge is not None:
                img_filename = os.path.join(output_directory, 'comparison_best.png')
                utils.save_image(img_merge, img_filename)

        utils.save_checkpoint({
            'args': args,
            'epoch': epoch,
            'arch': args.arch,
            'model': model,
            'best_result': best_result,
            'optimizer': optimizer,
        }, is_best, epoch, output_directory)
def train(k, epochs):
    """Train ``ResNet(k=k)`` for ``epochs`` epochs and persist weights and losses.

    Uses Adam (lr=1e-4) with cross-entropy loss over the module-level
    ``train_loader`` / ``val_loader``.  When ``use_gpu`` is set, the model and
    batches are moved to CUDA.  When ``use_horovod`` is set, parameters and
    optimizer state are broadcast from rank 0, the optimizer is wrapped in
    ``hvd.DistributedOptimizer`` (Adasum), and the per-epoch losses are passed
    through ``average_loss``.

    Args:
        k: value forwarded to ``ResNet(k=...)`` and used in the output file
            names.
        epochs: number of passes over the training data.

    Returns:
        dict with keys ``'epoch'``, ``'train'`` and ``'val'``, each a list with
        one entry per epoch.  Losses are summed over batches, not averaged
        per batch.

    Side effects:
        Prints one line per epoch, saves the model ``state_dict`` and the loss
        dict under ``models/modelsdata/``.
    """
    model = ResNet(k=k)
    opt = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    if use_gpu:
        model.to('cuda')

    if use_horovod:
        # broadcast parameters and optimizer state from root device to other devices
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(opt, root_rank=0)

        # Wraps the optimizer for multi-GPU operation
        opt = hvd.DistributedOptimizer(
            opt, named_parameters=model.named_parameters(), op=hvd.Adasum)

    loss_dict = {'epoch': [], 'train': [], 'val': []}

    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0

        # train block
        # Training-mode behaviour (BatchNorm batch statistics, dropout) must
        # be re-enabled every epoch because validation switches it off.
        model.train()
        for img_batch, labels_batch in train_loader:
            if use_gpu:
                img_batch = img_batch.to('cuda')
                labels_batch = labels_batch.to('cuda')

            pred = model(img_batch)

            opt.zero_grad()
            loss = criterion(pred, labels_batch)
            loss.backward()
            opt.step()

            train_loss += loss.item()

        # val block
        # Evaluate in eval mode so validation neither updates BatchNorm
        # running statistics nor applies dropout.
        model.eval()
        with torch.no_grad():
            for img_batch, labels_batch in val_loader:
                if use_gpu:
                    img_batch = img_batch.to('cuda')
                    labels_batch = labels_batch.to('cuda')

                pred = model(img_batch)
                loss = criterion(pred, labels_batch)

                val_loss += loss.item()

        if use_horovod:
            train_loss = average_loss(train_loss, 'avg_train_loss')
            val_loss = average_loss(val_loss, 'avg_val_loss')

        loss_dict['epoch'].append(epoch + 1)
        loss_dict['train'].append(train_loss)
        loss_dict['val'].append(val_loss)

        # Report the values just appended for this epoch.
        print(",".join([
            "{}:{:.2f}".format(key, val[-1])
            for key, val in loss_dict.items()
        ]))

    torch.save(model.state_dict(),
               "models/modelsdata/ResNet18_Cifar10_d{}.ckpt".format(k))
    save_obj(loss_dict,
             "models/modelsdata/losses/ResNet18_Cifar10_d{}".format(k))
    return loss_dict