def main(args):
    """Run inference on a Cityscapes subset and save label-ID prediction images.

    Builds ``Net(NUM_CLASSES)`` wrapped in ``DataParallel``, copies the matching
    entries of the checkpoint into it, and for every batch writes the prediction
    of the first image of the batch under ``./save_results/``.

    Args:
        args: Parsed options. This function reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``, ``num_workers``
            and ``batch_size``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = Net(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint entries whose names exist in ``model``; skip the others.

        Used instead of ``model.load_state_dict`` so that a checkpoint with
        extra keys still loads.
        """
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
        return model

    # In CPU mode, remap the checkpoint's storages to CPU; otherwise torch.load
    # tries to restore tensors on the CUDA device they were saved from and
    # fails on a machine without a GPU.
    map_location = (lambda storage, loc: storage) if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    dataset = cityscapes(args.datadir, input_transform_cityscapes,
                         target_transform_cityscapes, subset=args.subset)
    loader = DataLoader(dataset, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=False)

    for step, (images, _labels, filename, _filename_gt) in enumerate(loader):
        if not args.cpu:
            images = images.cuda()

        with torch.no_grad():
            outputs = model(images)

        # NOTE(review): only element 0 of each batch is converted and saved, so
        # a batch_size above 1 drops the remaining images - confirm intended.
        label = outputs[0].max(0)[1].byte().cpu().data
        label_cityscapes = cityscapes_trainIds2labelIds(label.unsqueeze(0))

        # Keep the directory layout found after "leftImg8bit/" in the input path.
        filenameSave = "./save_results/" + filename[0].split("leftImg8bit/")[1]
        os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
        label_cityscapes.save(filenameSave)

        print(step, filenameSave)
def main(args):
    """Run inference on a Cityscapes subset and save colorized predictions.

    Builds ``Net(NUM_CLASSES)`` wrapped in ``DataParallel``, copies the matching
    entries of the checkpoint into it, and for every batch writes the colorized
    prediction of the first image of the batch under ``./save_color/``.

    Args:
        args: Parsed options. This function reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``, ``num_workers``,
            ``batch_size`` and ``visualize``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = Net(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint entries whose names exist in ``model``; skip the others.

        Both key sets are printed first so that name mismatches between the
        checkpoint and the model can be spotted.
        """
        own_state = model.state_dict()
        for key in own_state:
            print(key)
        for key in state_dict:
            print(key)
        print('-----------')
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
        return model

    # In CPU mode, remap the checkpoint's storages to CPU; otherwise torch.load
    # tries to restore tensors on the CUDA device they were saved from.
    map_location = (lambda storage, loc: storage) if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    dataset = cityscapes(args.datadir, input_transform_cityscapes,
                         subset=args.subset)
    loader = DataLoader(dataset, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=False)

    # For visualizer:
    # must launch in other window "python3.6 -m visdom.server -port 8097"
    # and access localhost:8097 to see it
    # NOTE(review): the handle is created but nothing is drawn with it below.
    if args.visualize:
        vis = visdom.Visdom()

    with torch.no_grad():
        for step, (images, filename) in enumerate(loader):
            # Follow the model's device: it is only moved to the GPU when
            # --cpu is not set, so the input must not be moved unconditionally.
            if not args.cpu:
                images = images.cuda()

            outputs = model(images)

            # NOTE(review): only element 0 of each batch is colorized and saved.
            label = outputs[0].cpu().max(0)[1].data.byte()
            label_color = Colorize()(label.unsqueeze(0))

            # Keep the directory layout found after "leftImg8bit/" in the path.
            filenameSave = "./save_color/" + filename[0].split("leftImg8bit/")[1]
            os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
            label_save = ToPILImage()(label_color)
            label_save.save(filenameSave)

            print(step, filenameSave)
def main(args):
    """Run ERFNet on a Cityscapes subset and save colorized predictions.

    Builds ``ERFNet(NUM_CLASSES)`` wrapped in ``DataParallel``, copies the
    matching entries of the checkpoint into it, and for every batch writes the
    colorized prediction of the first image of the batch under
    ``./save_color/``. With ``args.visualize`` the colorized image is also sent
    to a visdom server.

    Args:
        args: Parsed options. This function reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``, ``num_workers``,
            ``batch_size`` and ``visualize``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = ERFNet(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint entries whose names exist in ``model``; skip the others.

        Used instead of ``model.load_state_dict`` so that a checkpoint with
        extra keys still loads.
        """
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
        return model

    # In CPU mode, remap the checkpoint's storages to CPU; otherwise torch.load
    # tries to restore tensors on the CUDA device they were saved from.
    map_location = (lambda storage, loc: storage) if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    dataset = cityscapes(args.datadir, input_transform_cityscapes,
                         target_transform_cityscapes, subset=args.subset)
    loader = DataLoader(dataset, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=False)

    # For visualizer:
    # must launch in other window "python3.6 -m visdom.server -port 8097"
    # and access localhost:8097 to see it
    if args.visualize:
        vis = visdom.Visdom()

    for step, (images, _labels, filename, _filename_gt) in enumerate(loader):
        if not args.cpu:
            images = images.cuda()

        with torch.no_grad():
            outputs = model(images)

        # NOTE(review): only element 0 of each batch is colorized and saved.
        label = outputs[0].max(0)[1].byte().cpu().data
        label_color = Colorize()(label.unsqueeze(0))

        # Keep the directory layout found after "leftImg8bit/" in the path.
        filenameSave = "./save_color/" + filename[0].split("leftImg8bit/")[1]
        os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
        label_save = ToPILImage()(label_color)
        label_save.save(filenameSave)

        if args.visualize:
            vis.image(label_color.numpy())
        print(step, filenameSave)
def train(args, model, enc=False):
    """Train ``model`` on Cityscapes, validating and checkpointing every epoch.

    Each epoch runs one pass over the training loader, one pass over the
    validation loader, saves a resumable checkpoint through
    ``save_checkpoint``, optionally snapshots the weights, and appends one row
    to the automated log file.

    Args:
        args: Parsed options. This function reads ``datadir``, ``height``,
            ``num_workers``, ``batch_size``, ``cuda``, ``savedir``, ``resume``,
            ``num_epochs``, ``visualize``, ``steps_plot``, ``port``,
            ``iouTrain``, ``iouVal``, ``steps_loss`` and ``epochs_save``.
        model: Network to optimize; called as ``model(inputs, only_encode=enc)``.
        enc: When true, use the encoder class weights, forward with
            ``only_encode=True`` and write to the ``*_encoder`` / ``*_enc``
            output files.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: If ``args.datadir`` does not exist, or if
            ``args.resume`` is set and no checkpoint file is found.
    """
    best_acc = 0

    # TODO: calculate weights by processing dataset histogram (now its being
    # set by hand from the torch values).
    encoder_weights = (
        2.3653597831726, 4.4237880706787, 2.9691488742828, 5.3442072868347,
        5.2983593940735, 5.2275490760803, 5.4394111633301, 5.3659925460815,
        3.4170460700989, 5.2414722442627, 4.7376127243042, 5.2286224365234,
        5.455126285553, 4.3019247055054, 5.4264230728149, 5.4331531524658,
        5.433765411377, 5.4631009101868, 5.3947434425354,
    )
    decoder_weights = (
        2.8149201869965, 6.9850029945374, 3.7890393733978, 9.9428062438965,
        9.7702074050903, 9.5110931396484, 10.311357498169, 10.026463508606,
        4.6323022842407, 9.5608062744141, 7.8698215484619, 9.5168733596802,
        10.373730659485, 6.6616044044495, 10.260489463806, 10.287888526917,
        10.289801597595, 10.405355453491, 10.138095855713,
    )
    weight = torch.ones(NUM_CLASSES)
    for class_idx, value in enumerate(encoder_weights if enc else decoder_weights):
        weight[class_idx] = value
    # Class 19 gets zero weight, so it does not contribute to the loss.
    weight[19] = 0

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train', 50)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val', 100)
    print(len(dataset_train))

    loader = DataLoader(dataset_train, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers,
                            batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    criterion = CrossEntropyLoss2d(weight)

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the header only once so that resumed runs keep appending rows.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu:
    # https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    # https://github.com/pytorch/pytorch/issues/1893
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Polynomial decay of the learning rate over the configured epoch count.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    def _plot_batch(prefix, inputs, outputs, targets, epoch, step):
        """Send input, prediction and target of the batch's first sample to the board."""
        start_time_plot = time.time()
        suffix = f'(epoch: {epoch}, step: {step})'
        board.image(inputs[0].cpu().data, f'{prefix}input {suffix}')
        # The model output may be a list (original comment: "merge gpu
        # tensors"); in that case the first entry is used.
        prediction = outputs[0][0] if isinstance(outputs, list) else outputs[0]
        board.image(color_transform(prediction.cpu().max(0)[1].data.unsqueeze(0)),
                    f'{prefix}output {suffix}')
        board.image(color_transform(targets[0].cpu().data), f'{prefix}target {suffix}')
        print("Time to paint images: ", time.time() - start_time_plot)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = images
            targets = labels
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            # targets[:, 0] drops the second axis before the loss is computed.
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                _plot_batch('', inputs, outputs, targets, epoch, step)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate after each epoch of training.
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # The former Variable(..., volatile=True) no longer disables autograd;
        # torch.no_grad() is what keeps validation from building graphs.
        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = images
                targets = labels
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to calculate TP, FP and FN for iou estimation.
                if doIouVal:
                    iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                    _plot_batch('VAL ', inputs, outputs, targets, epoch, step)
                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                          "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Remember best valIoU and save checkpoint. Without an IoU value the
        # negated validation loss is used as the score instead.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # Snapshot every `epochs_save` epochs. The previous condition tested
        # `step`, the index of the last validation batch, so whether a snapshot
        # was written did not depend on the epoch at all.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            best_txt_name = "/best_encoder.txt" if enc else "/best.txt"
            with open(savedir + best_txt_name, "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT
        # Epoch  Train-loss  Test-loss  Train-IoU  Test-IoU  learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (
                epoch, average_epoch_loss_train, average_epoch_loss_val,
                iouTrain, iouVal, usedLr))

    return model
def main(args):
    """Run inference on a Cityscapes subset and save one output channel as a heatmap.

    Builds ``Net`` on top of a ``resnet18`` backbone, wraps it in
    ``DataParallel``, copies the matching checkpoint entries into it, and for
    every batch writes channel 26 of the first sample's output, rescaled by its
    own minimum and maximum, under ``./save_color/``.

    Args:
        args: Parsed options. This function reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``, ``num_workers``,
            ``batch_size`` and ``visualize``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    resnet = resnet18(pretrained=True, efficient=False, use_bn=True)
    model = Net(resnet, size=(512, 1024 * 4), num_classes=NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint entries into ``model`` after dropping 7 leading characters.

        Model and checkpoint key names are printed side by side first, and every
        checkpoint entry that finds no match is reported.
        """
        own_state = model.state_dict()
        for own_key, ckpt_key in zip(own_state.keys(), state_dict.keys()):
            print(own_key, ' ', ckpt_key)
        print('-----------')
        for name, param in state_dict.items():
            # NOTE(review): presumably strips a 7-character "module." prefix;
            # the model here is itself wrapped in DataParallel, so confirm the
            # stripped names really match the keys of own_state.
            name = name[7:]
            if name not in own_state:
                print('{} not in own_state'.format(name))
                continue
            own_state[name].copy_(param)
        return model

    # In CPU mode, remap the checkpoint's storages to CPU; otherwise torch.load
    # tries to restore tensors on the CUDA device they were saved from.
    map_location = (lambda storage, loc: storage) if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    dataset = cityscapes(args.datadir, input_transform_cityscapes,
                         subset=args.subset)
    loader = DataLoader(dataset, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=False)

    # For visualizer:
    # must launch in other window "python3.6 -m visdom.server -port 8097"
    # and access localhost:8097 to see it
    # NOTE(review): the handle is created but nothing is drawn with it below.
    if args.visualize:
        vis = visdom.Visdom()

    with torch.no_grad():
        for step, (images, filename) in enumerate(loader):
            # Follow the model's device: it is only moved to the GPU when
            # --cpu is not set, so the input must not be moved unconditionally.
            if not args.cpu:
                images = images.cuda()

            outputs = model(images)

            # Rescale the selected channel with its own min and max; the
            # epsilon guards against division by zero on a constant map.
            heatmap = outputs[0, 26, :, :].cpu().data
            lowest = heatmap.min()
            highest = heatmap.max()
            heatmap = (heatmap - lowest) / (highest - lowest + 1e-6)
            heatmap = heatmap.unsqueeze(0)

            # Keep the directory layout found after "leftImg8bit/" in the path.
            filenameSave = "./save_color/" + filename[0].split("leftImg8bit/")[1]
            os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
            label_save = ToPILImage()(heatmap)
            label_save.save(filenameSave)

            print(step, filenameSave)
def main(args):
    """Evaluate ERFNet on a Cityscapes subset and print per-class and mean IoU.

    Builds ``ERFNet(NUM_CLASSES)`` wrapped in ``DataParallel``, copies the
    matching checkpoint entries into it, feeds every batch's predictions and
    labels to an ``iouEval`` accumulator, and prints the resulting scores
    together with the elapsed time.

    Args:
        args: Parsed options. This function reads ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``, ``num_workers``
            and ``batch_size``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = ERFNet(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint entries whose names exist in ``model``; report the others."""
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                print(name, " not loaded")
                continue
            own_state[name].copy_(param)
        return model

    # In CPU mode, remap the checkpoint's storages to CPU; otherwise torch.load
    # tries to restore tensors on the CUDA device they were saved from.
    map_location = (lambda storage, loc: storage) if args.cpu else None
    model = load_my_state_dict(
        model, torch.load(weightspath, map_location=map_location))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    dataset = cityscapes(args.datadir, input_transform_cityscapes,
                         target_transform_cityscapes, subset=args.subset)
    loader = DataLoader(dataset, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    for step, (images, labels, filename, _filename_gt) in enumerate(loader):
        if not args.cpu:
            images = images.cuda()
            labels = labels.cuda()

        # Replaces Variable(images, volatile=True): the volatile flag no longer
        # disables autograd, torch.no_grad() does.
        with torch.no_grad():
            outputs = model(images)

        iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

        filenameSave = filename[0].split("leftImg8bit/")[1]
        print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = []
    for i in range(iou_classes.size(0)):
        iouStr = getColorEntry(iou_classes[i]) + '{:0.2f}'.format(
            iou_classes[i] * 100) + '\033[0m'
        iou_classes_str.append(iouStr)

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    print("Per-Class IoU:")
    # Display names in class-index order, spelled exactly as printed before.
    class_names = (
        "Road", "sidewalk", "building", "wall", "fence", "pole",
        "traffic light", "traffic sign", "vegetation", "terrain", "sky",
        "person", "rider", "car", "truck", "bus", "train", "motorcycle",
        "bicycle",
    )
    for class_idx, class_name in enumerate(class_names):
        print(iou_classes_str[class_idx], class_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train():
    """Train a SqueezeDet-family detector with TensorFlow 1.x.

    Selects the network and its config from ``FLAGS.net`` (and, for
    ``squeezeDet``, from ``FLAGS.dataset``), writes per-layer model statistics
    to ``model_metrics.txt`` in ``FLAGS.train_dir``, optionally starts
    background threads that feed the model's input queue, and then runs
    ``FLAGS.max_steps`` training steps with periodic summaries and checkpoints.

    Raises:
        AssertionError: If ``FLAGS.dataset`` or ``FLAGS.net`` is not one of the
            supported values, or if the total loss becomes NaN.
    """
    assert FLAGS.dataset == 'KITTI' or FLAGS.dataset == 'cityscapes', \
        'Currently only support KITTI & cityscapes dataset'

    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    with tf.Graph().as_default():

        assert FLAGS.net == 'vgg16' or FLAGS.net == 'resnet50' \
            or FLAGS.net == 'squeezeDet' or FLAGS.net == 'squeezeDet+', \
            'Selected neural net architecture not supported: {}'.format(FLAGS.net)

        # NOTE(review): only the 'squeezeDet' branch looks at FLAGS.dataset;
        # the other networks always use their KITTI config - confirm intended.
        if FLAGS.net == 'vgg16':
            mc = kitti_vgg16_config()
            mc.IS_TRAINING = True
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = VGG16ConvDet(mc)
        elif FLAGS.net == 'resnet50':
            mc = kitti_res50_config()
            mc.IS_TRAINING = True
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = ResNet50ConvDet(mc)
        elif FLAGS.net == 'squeezeDet':
            # The two conditions below were missing their trailing colons.
            if FLAGS.dataset == 'cityscapes':
                mc = cityscapes_squeezeDet_config()
            elif FLAGS.dataset == 'KITTI':
                mc = kitti_squeezeDet_config()
            mc.IS_TRAINING = True
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = SqueezeDet(mc)
        elif FLAGS.net == 'squeezeDet+':
            mc = kitti_squeezeDetPlus_config()
            mc.IS_TRAINING = True
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = SqueezeDetPlus(mc)

        # NOTE(review): the cityscapes reader is used even when FLAGS.dataset
        # is 'KITTI' - confirm intended.
        imdb = cityscapes(FLAGS.image_set, FLAGS.data_path, mc)

        # save model size, flops, activations by layers
        metrics_path = os.path.join(FLAGS.train_dir, 'model_metrics.txt')
        metric_sections = (
            ('Number of parameter by layer:\n', model.model_size_counter),
            ('\nActivation size by layer:\n', model.activation_counter),
            ('\nNumber of flops by layer:\n', model.flop_counter),
        )
        with open(metrics_path, 'w') as f:
            for header, counter in metric_sections:
                f.write(header)
                count = 0
                for c in counter:
                    f.write('\t{}: {}\n'.format(c[0], c[1]))
                    count += c[1]
                f.write('\ttotal: {}\n'.format(count))
        print('Model statistics saved to {}.'.format(metrics_path))

        def _load_data(load_to_placeholder=True):
            """Read one batch from ``imdb`` and build the matching feed dict.

            Args:
                load_to_placeholder: Key the feed dict by the model's ``ph_*``
                    tensors when true, otherwise by its non-``ph_`` input
                    tensors.

            Returns:
                Tuple ``(feed_dict, image_per_batch, label_per_batch,
                bbox_per_batch)``.
            """
            # read batch input
            image_per_batch, label_per_batch, box_delta_per_batch, \
                aidx_per_batch, bbox_per_batch = imdb.read_batch()

            label_indices, bbox_indices, box_delta_values, mask_indices, \
                box_values = [], [], [], [], []
            aidx_set = set()
            num_discarded_labels = 0
            num_labels = 0

            for i in range(len(label_per_batch)):  # batch_size
                for j in range(len(label_per_batch[i])):  # number of annotations
                    num_labels += 1
                    # Keep one annotation per (image, anchor) pair; later
                    # annotations that map to the same anchor are discarded.
                    if (i, aidx_per_batch[i][j]) not in aidx_set:
                        aidx_set.add((i, aidx_per_batch[i][j]))
                        label_indices.append(
                            [i, aidx_per_batch[i][j], label_per_batch[i][j]])
                        mask_indices.append([i, aidx_per_batch[i][j]])
                        bbox_indices.extend(
                            [[i, aidx_per_batch[i][j], k] for k in range(4)])
                        box_delta_values.extend(box_delta_per_batch[i][j])
                        box_values.extend(bbox_per_batch[i][j])
                    else:
                        num_discarded_labels += 1

            if mc.DEBUG_MODE:
                print('Warning: Discarded {}/({}) labels that are assigned to the same '
                      'anchor'.format(num_discarded_labels, num_labels))

            if load_to_placeholder:
                image_input = model.ph_image_input
                input_mask = model.ph_input_mask
                box_delta_input = model.ph_box_delta_input
                box_input = model.ph_box_input
                labels = model.ph_labels
            else:
                image_input = model.image_input
                input_mask = model.input_mask
                box_delta_input = model.box_delta_input
                box_input = model.box_input
                labels = model.labels

            feed_dict = {
                image_input: image_per_batch,
                input_mask: np.reshape(
                    sparse_to_dense(
                        mask_indices, [mc.BATCH_SIZE, mc.ANCHORS],
                        [1.0] * len(mask_indices)),
                    [mc.BATCH_SIZE, mc.ANCHORS, 1]),
                box_delta_input: sparse_to_dense(
                    bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4],
                    box_delta_values),
                box_input: sparse_to_dense(
                    bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4],
                    box_values),
                labels: sparse_to_dense(
                    label_indices,
                    [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES],
                    [1.0] * len(label_indices)),
            }

            return feed_dict, image_per_batch, label_per_batch, bbox_per_batch

        def _enqueue(sess, coord):
            """Keep enqueueing batches until the coordinator requests a stop."""
            try:
                while not coord.should_stop():
                    feed_dict, _, _, _ = _load_data()
                    sess.run(model.enqueue_op, feed_dict=feed_dict)
                    if mc.DEBUG_MODE:
                        print("added to the queue")
                if mc.DEBUG_MODE:
                    print("Finished enqueue")
            except Exception as e:
                # Hand the failure to the coordinator so training stops too.
                coord.request_stop(e)

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()

        # Initialize first, restore second: running the initializer after
        # saver.restore would overwrite the restored checkpoint values.
        init = tf.global_variables_initializer()
        sess.run(init)

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        coord = tf.train.Coordinator()

        if mc.NUM_THREAD > 0:
            enq_threads = []
            for _ in range(mc.NUM_THREAD):
                enq_thread = threading.Thread(target=_enqueue, args=[sess, coord])
                enq_thread.start()
                enq_threads.append(enq_thread)

        threads = tf.train.start_queue_runners(coord=coord, sess=sess)
        run_options = tf.RunOptions(timeout_in_ms=60000)

        # xrange is kept: this file is written for Python 2.
        for step in xrange(FLAGS.max_steps):
            if coord.should_stop():
                sess.run(model.FIFOQueue.close(cancel_pending_enqueues=True))
                coord.request_stop()
                coord.join(threads)
                break

            start_time = time.time()

            if step % FLAGS.summary_step == 0:
                # Summary steps feed a freshly read batch directly so that the
                # same images can be drawn with their detections.
                feed_dict, image_per_batch, label_per_batch, bbox_per_batch = \
                    _load_data(load_to_placeholder=False)
                op_list = [
                    model.train_op, model.loss, summary_op, model.det_boxes,
                    model.det_probs, model.det_class, model.conf_loss,
                    model.bbox_loss, model.class_loss
                ]
                _, loss_value, summary_str, det_boxes, det_probs, det_class, \
                    conf_loss, bbox_loss, class_loss = sess.run(
                        op_list, feed_dict=feed_dict)

                _viz_prediction_result(
                    model, image_per_batch, bbox_per_batch, label_per_batch,
                    det_boxes, det_class, det_probs)
                image_per_batch = bgr_to_rgb(image_per_batch)
                viz_summary = sess.run(
                    model.viz_op,
                    feed_dict={model.image_to_show: image_per_batch})

                summary_writer.add_summary(summary_str, step)
                summary_writer.add_summary(viz_summary, step)
                summary_writer.flush()

                print('conf_loss: {}, bbox_loss: {}, class_loss: {}'.
                      format(conf_loss, bbox_loss, class_loss))
            else:
                if mc.NUM_THREAD > 0:
                    # Input comes from the queue filled by the enqueue threads.
                    _, loss_value, conf_loss, bbox_loss, class_loss = sess.run(
                        [model.train_op, model.loss, model.conf_loss,
                         model.bbox_loss, model.class_loss],
                        options=run_options)
                else:
                    feed_dict, _, _, _ = _load_data(load_to_placeholder=False)
                    _, loss_value, conf_loss, bbox_loss, class_loss = sess.run(
                        [model.train_op, model.loss, model.conf_loss,
                         model.bbox_loss, model.class_loss],
                        feed_dict=feed_dict)

            duration = time.time() - start_time

            assert not np.isnan(loss_value), \
                'Model diverged. Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \
                'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss)

            if step % 10 == 0:
                num_images_per_step = mc.BATCH_SIZE
                images_per_sec = num_images_per_step / duration
                sec_per_batch = float(duration)
                format_str = ('%s: step %d, loss = %.2f (%.1f images/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    images_per_sec, sec_per_batch))
                sys.stdout.flush()

            # Save the model checkpoint periodically.
            if step % FLAGS.checkpoint_step == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
def train(args, model, enc=False): best_acc = 0 #TODO: calculate weights by processing dataset histogram (now its being set by hand from the torch values) #create a loder to run all images and calculate histogram of labels, then create weight array using class balancing weight = torch.ones(NUM_CLASSES) if (enc): weight[0] = 4.38133159 weight[1] = 1.29574148 else: weight[0] = 4.40513628 weight[1] = 1.293674 weight_new = torch.ones(NUM_CLASSES) if (enc): weight_new[0] = 2.35864154 weight_new[1] = 1.59490491 else: weight_new[0] = 2.81672577 weight_new[1] = 1.74484367 assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded" co_transform = MyCoTransform(enc, augment=True, height=args.height)#1024) co_transform_val = MyCoTransform(enc, augment=False, height=args.height)#1024) if args.apex: dataset_train = apex(args.datadir, co_transform, 'train') dataset_val = apex(args.datadir, co_transform_val, 'val') else: dataset_train = cityscapes(args.datadir, co_transform, 'train') dataset_val = cityscapes(args.datadir, co_transform_val, 'val') loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True) loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False) if args.cuda: weight = weight.cuda() weight_new = weight_new.cuda() if args.weighted: criterion = CrossEntropyLoss2d(weight) elif args.weighted_new: criterion = CrossEntropyLoss2d(weight_new) else: criterion = CrossEntropyLoss2d() print(type(criterion)) savedir = args.savedir if (enc): automated_log_path = savedir + "/automated_log_encoder.txt" modeltxtpath = savedir + "/model_encoder.txt" else: automated_log_path = savedir + "/automated_log.txt" modeltxtpath = savedir + "/model.txt" if (not os.path.exists(automated_log_path)): #dont add first line if it exists with open(automated_log_path, "a") as myfile: 
myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate") with open(modeltxtpath, "w") as myfile: myfile.write(str(model)) #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4 #https://github.com/pytorch/pytorch/issues/1893 #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4) ## scheduler 1 optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4) ## scheduler 2 start_epoch = 1 if args.resume: #Must load weights, optimizer, epoch and best value. if enc: filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar' else: filenameCheckpoint = savedir + '/checkpoint.pth.tar' assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder" checkpoint = torch.load(filenameCheckpoint) start_epoch = checkpoint['epoch'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) best_acc = checkpoint['best_acc'] print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch'])) #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler ## scheduler 1 lambda1 = lambda epoch: pow((1-((epoch-1)/args.num_epochs)),0.9) ## scheduler 2 scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1) ## scheduler 2 if args.visualize and args.steps_plot > 0: board = Dashboard(args.port) for epoch in range(start_epoch, args.num_epochs+1): print("----- TRAINING - EPOCH", epoch, "-----") scheduler.step(epoch) ## scheduler 2 epoch_loss = [] time_train = [] doIouTrain = args.iouTrain doIouVal = args.iouVal if (doIouTrain): iouEvalTrain = iouEval(NUM_CLASSES, args.ignoreindex) usedLr = 0 for param_group in optimizer.param_groups: print("LEARNING RATE: ", param_group['lr']) usedLr = float(param_group['lr']) model.train() for step, (images, labels) in enumerate(loader): start_time = time.time() #print (labels.size()) 
#print (np.unique(labels.numpy())) #print("labels: ", np.unique(labels[0].numpy())) #labels = torch.ones(4, 1, 512, 1024).long() if args.cuda: images = images.cuda() labels = labels.cuda() inputs = Variable(images) targets = Variable(labels) outputs = model(inputs, only_encode=enc) #print("targets", np.unique(targets[:, 0].cpu().data.numpy())) optimizer.zero_grad() loss = criterion(outputs, targets[:, 0]) loss.backward() optimizer.step() epoch_loss.append(loss.data[0]) time_train.append(time.time() - start_time) if (doIouTrain): #start_time_iou = time.time() iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data) #print ("Time to add confusion matrix: ", time.time() - start_time_iou) #print(outputs.size()) if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0: start_time_plot = time.time() image = inputs[0].cpu().data #image[0] = image[0] * .229 + .485 #image[1] = image[1] * .224 + .456 #image[2] = image[2] * .225 + .406 #print("output", np.unique(outputs[0].cpu().max(0)[1].data.numpy())) board.image(image, f'input (epoch: {epoch}, step: {step})') if isinstance(outputs, list): #merge gpu tensors board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)), f'output (epoch: {epoch}, step: {step})') else: board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)), f'output (epoch: {epoch}, step: {step})') board.image(color_transform(targets[0].cpu().data), f'target (epoch: {epoch}, step: {step})') print ("Time to paint images: ", time.time() - start_time_plot) if args.steps_loss > 0 and step % args.steps_loss == 0: average = sum(epoch_loss) / len(epoch_loss) print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})', "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size)) average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss) iouTrain = 0 if (doIouTrain): iouTrain, iou_classes = iouEvalTrain.getIoU() iouStr = getColorEntry(iouTrain)+'{:0.2f}'.format(iouTrain*100) + 
'\033[0m' print ("EPOCH IoU on TRAIN set: ", iouStr, "%") #Validate on 500 val images after each epoch of training print("----- VALIDATING - EPOCH", epoch, "-----") model.eval() epoch_loss_val = [] time_val = [] if (doIouVal): iouEvalVal = iouEval(NUM_CLASSES, args.ignoreindex) for step, (images, labels) in enumerate(loader_val): start_time = time.time() if args.cuda: images = images.cuda() labels = labels.cuda() inputs = Variable(images, volatile=True) #volatile flag makes it free backward or outputs for eval targets = Variable(labels, volatile=True) outputs = model(inputs, only_encode=enc) loss = criterion(outputs, targets[:, 0]) epoch_loss_val.append(loss.data[0]) time_val.append(time.time() - start_time) #Add batch to calculate TP, FP and FN for iou estimation if (doIouVal): #start_time_iou = time.time() iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data) #print ("Time to add confusion matrix: ", time.time() - start_time_iou) if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0: start_time_plot = time.time() image = inputs[0].cpu().data board.image(image, f'VAL input (epoch: {epoch}, step: {step})') if isinstance(outputs, list): #merge gpu tensors board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)), f'VAL output (epoch: {epoch}, step: {step})') else: board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)), f'VAL output (epoch: {epoch}, step: {step})') board.image(color_transform(targets[0].cpu().data), f'VAL target (epoch: {epoch}, step: {step})') print ("Time to paint images: ", time.time() - start_time_plot) if args.steps_loss > 0 and step % args.steps_loss == 0: average = sum(epoch_loss_val) / len(epoch_loss_val) print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})', "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size)) average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val) #scheduler.step(average_epoch_loss_val, epoch) ## 
scheduler 1 # update lr if needed iouVal = 0 if (doIouVal): iouVal, iou_classes = iouEvalVal.getIoU() iouStr = getColorEntry(iouVal)+'{:0.2f}'.format(iouVal*100) + '\033[0m' print ("EPOCH IoU on VAL set: ", iouStr, "%") # remember best valIoU and save checkpoint if iouVal == 0: current_acc = -average_epoch_loss_val else: current_acc = iouVal is_best = current_acc > best_acc best_acc = max(current_acc, best_acc) if enc: filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar' filenameBest = savedir + '/model_best_enc.pth.tar' else: filenameCheckpoint = savedir + '/checkpoint.pth.tar' filenameBest = savedir + '/model_best.pth.tar' save_checkpoint({ 'epoch': epoch + 1, 'arch': str(model), 'state_dict': model.state_dict(), 'best_acc': best_acc, 'optimizer' : optimizer.state_dict(), }, is_best, filenameCheckpoint, filenameBest) #SAVE MODEL AFTER EPOCH if (enc): filename = f'{savedir}/model_encoder-{epoch:03}.pth' filenamebest = f'{savedir}/model_encoder_best.pth' else: filename = f'{savedir}/model-{epoch:03}.pth' filenamebest = f'{savedir}/model_best.pth' if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0: torch.save(model.state_dict(), filename) print(f'save: {filename} (epoch: {epoch})') if (is_best): torch.save(model.state_dict(), filenamebest) print(f'save: {filenamebest} (epoch: {epoch})') if (not enc): with open(savedir + "/best.txt", "w") as myfile: myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal)) else: with open(savedir + "/best_encoder.txt", "w") as myfile: myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal)) #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU) #Epoch Train-loss Test-loss Train-IoU Test-IoU learningRate with open(automated_log_path, "a") as myfile: myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr )) return(model) #return model (convenience for encoder-decoder 
training)
def main(args):
    """Run inference on a Cityscapes subset and save colorized predictions.

    Builds the ResNet-18 backed ``Net``, copies matching tensors from the
    checkpoint at ``args.loadDir + args.loadWeights`` into it, runs every
    batch of the selected subset through the network and writes the colorized
    arg-max prediction of the first image of each batch below
    ``./save_color/``. Forward-pass timings are collected and summarised at
    the end.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``loadDir``,
            ``loadModel``, ``loadWeights``, ``cpu``, ``datadir``, ``subset``,
            ``num_workers``, ``batch_size`` and ``visualize``.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    use_cuda = not args.cpu

    resnet = resnet18(pretrained=True, efficient=False, use_bn=True)
    model = Net(resnet, size=(512, 1024 * 4), num_classes=NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if use_cuda:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint tensors into ``model``, skipping unknown keys."""
        own_state = model.state_dict()
        # Debug aid: print model keys next to checkpoint keys.
        for a, b in zip(own_state.keys(), state_dict.keys()):
            print(a, ' ', b)
        print('-----------')
        for name, param in state_dict.items():
            # Drop the first 7 characters of every checkpoint key.
            # NOTE(review): presumably this strips a "module." prefix; since
            # the model itself is wrapped in DataParallel, confirm that the
            # stripped names really match own_state.
            name = name[7:]
            if name not in own_state:
                print('{} not in own_state'.format(name))
                continue
            own_state[name].copy_(param)
        return model

    # Map the checkpoint onto the CPU when running without CUDA so that a
    # checkpoint saved from GPU tensors can still be deserialised.
    state_dict = torch.load(weightspath,
                            map_location=None if use_cuda else 'cpu')
    model = load_my_state_dict(model, state_dict)
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    loader = DataLoader(cityscapes(args.datadir,
                                   input_transform_cityscapes,
                                   subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    # For visualizer:
    # must launch in other window "python3.6 -m visdom.server -port 8097"
    # and access localhost:8097 to see it
    if args.visualize:
        vis = visdom.Visdom()

    time_all = []
    with torch.no_grad():
        for step, (images, filename) in enumerate(loader):
            if use_cuda:
                images = images.cuda()
                # CUDA work is queued asynchronously; drain the queue so the
                # timer below measures only the forward pass.
                torch.cuda.synchronize()

            start_time = time.time()
            outputs = model(images)
            if use_cuda:
                # Wait for the forward kernels to finish before stopping the
                # clock.
                torch.cuda.synchronize()
            fwt = time.time() - start_time
            time_all.append(fwt)

            # Per-pixel arg-max over the first axis of the first output.
            label = outputs[0].cpu().max(0)[1].data.byte()
            label_color = Colorize()(label.unsqueeze(0))

            filenameSave = "./save_color/" + filename[0].split(
                "leftImg8bit/")[1]
            os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
            label_save = ToPILImage()(label_color)
            label_save.save(filenameSave)

            print(step, filenameSave)

    total_time = sum(time_all)
    if time_all and total_time > 0:
        # NOTE: the printed value is the reciprocal of the mean forward time
        # (forward passes per second); the label is kept unchanged so
        # existing log parsing keeps working.
        print("Forward time per img (Mean: %.4f)" %
              (1 / (total_time / len(time_all))))
def main(args, get_dataset):
    """Evaluate the multi-head network on a Cityscapes-style subset.

    Loads the checkpoint at ``args.loadDir + args.loadWeights``, runs every
    batch through the model, accumulates IoU / accuracy statistics in a
    ``FusedIoU`` evaluator, saves a colorized prediction per batch below
    ``./save_color/`` and finally logs per-class and mean accuracy / IoU.

    The ground-truth image is opened for ``filename[0]`` only, so the loop
    evaluates one image per batch.

    Args:
        args: Parsed command-line namespace. Attributes read here: ``loadDir``,
            ``loadModel``, ``loadWeights``, ``cpu``, ``em_dim``, ``resnet``,
            ``datadir``, ``subset``, ``num_workers``, ``batch_size`` and
            ``is_fuse``.
        get_dataset: Object exposing ``num_labels``; that value is passed to
            ``Net`` and indexed with ``'MAP'`` for the evaluator.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    use_cuda = not args.cpu

    NUM_CLASSES = get_dataset.num_labels
    model = Net(NUM_CLASSES, args.em_dim, args.resnet)
    model = torch.nn.DataParallel(model)
    if use_cuda:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy checkpoint tensors into ``model``, skipping unknown keys."""
        own_state = model.state_dict()
        # Debug aid: dump the model keys followed by the checkpoint keys.
        for a in own_state.keys():
            print(a)
        for a in state_dict.keys():
            print(a)
        print('-----------')
        for name, param in state_dict.items():
            if name not in own_state:
                continue
            own_state[name].copy_(param)
        return model

    # Map the checkpoint onto the CPU when running without CUDA so that a
    # checkpoint saved from GPU tensors can still be deserialised.
    state_dict = torch.load(weightspath,
                            map_location=None if use_cuda else 'cpu')
    model = load_my_state_dict(model, state_dict)
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    loader = DataLoader(cityscapes(args.datadir,
                                   input_transform_cityscapes,
                                   subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    log_dir = "./save_eval_iou/"
    os.makedirs(log_dir, exist_ok=True)
    logger = create_logger(log_dir)

    fused_iou_eval = FusedIoU(NUM_CLASSES['MAP'], WILDPASS_IN_MAP_ID,
                              REMAP_MAP_IDD, WILDPASS_REMAP_ID, logger)

    with torch.set_grad_enabled(False):
        step = 0
        for step, (images, filename) in enumerate(loader):
            if use_cuda:
                images = images.cuda()

            outputs = model(images, enc=False)
            pred_MAP = outputs['MAP'].data.cpu().numpy()
            pred_IDD = outputs['IDD20K'].data.cpu().numpy()

            # --- evaluate
            label = np.asarray(
                Image.open(filename[0].replace('/leftImg8bit/', '/gtFine/')))
            label = np.expand_dims(label, axis=0)

            if args.is_fuse:
                outputs = fused_iou_eval.fuse(label, pred_MAP, pred_IDD,
                                              filename)
            else:
                # only MAP
                pred_ids = np.argmax(pred_MAP, axis=1)
                fused_iou_eval.add_batch(pred_ids, label)
                # The raw model output is a dict and cannot be indexed with
                # [None, ...] below, so colorize the id map of the first
                # image instead.
                # NOTE(review): assumes fuse() likewise returns a single
                # 2-D id map -- TODO confirm.
                outputs = pred_ids[0]

            outputs = torch.from_numpy(outputs[None, ...])
            label_color = Colorize()(outputs)

            filenameSave = "./save_color/" + filename[0].split(
                "leftImg8bit/")[1]
            os.makedirs(os.path.dirname(filenameSave), exist_ok=True)
            label_save = ToPILImage()(label_color)
            label_save.save(filenameSave)

            print(step, filenameSave)
            print('processed step {}, file {}.'.format(
                step, filename[0].split("leftImg8bit/val/cs/")[1]))

    logger.info('========== evaluator using onehot ==========')
    miou, iou, macc, acc = fused_iou_eval.get_iou()

    logger.info('-----------Acc of each classes-----------')
    for i, c_acc in enumerate(acc):
        # Fall back to the numeric index when the name table does not line
        # up with the result vector.
        name = WILDPASS_IN_MAP_NAME[i] if len(acc) == len(
            WILDPASS_IN_MAP_NAME) else str(i)
        logger.info('ID= {:>10s}: {:.2f}'.format(name, c_acc * 100.0))
    logger.info("Acc of {} images :{:.2f}".format(str(step + 1),
                                                  macc * 100.0))

    logger.info('-----------IoU of each classes-----------')
    for i, c_iou in enumerate(iou):
        name = WILDPASS_IN_MAP_NAME[i] if len(iou) == len(
            WILDPASS_IN_MAP_NAME) else str(i)
        logger.info('ID= {:>10s}: {:.2f}'.format(name, c_iou * 100.0))
    logger.info("mIoU of {} images :{:.2f}".format(str(step + 1),
                                                   miou * 100.0))
def train(args, rmodel, model, enc=False):
    """Validate the restoration + segmentation pipeline once per epoch.

    Despite its name, this variant performs no parameter updates: for every
    epoch it steps both LR schedulers, prints the learning rates and then
    runs the validation split through ``rmodel`` (restoration) and ``model``
    (segmentation) with three feedback iterations, reporting the loss and,
    when ``args.iouVal`` is set, the IoU.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``datadir``, ``height``, ``num_workers``, ``batch_size``,
            ``cuda``, ``savedir``, ``num_epochs``, ``iouVal`` and
            ``steps_loss``.
        rmodel: Restoration network, called as
            ``rmodel(inputs, flag=..., r_fb1=..., r_fb2=...)``.
        model: Segmentation network, called as
            ``model(x, only_encode=enc)``.
        enc: Forwarded to ``model`` as ``only_encode`` and used to pick the
            log file names.

    Returns:
        The ``model`` argument.
    """
    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    savedir = '/home/shyam.nandan/NewExp/F_erfnet_pytorch_ours_w_gt_v2_multiply/save/' + args.savedir  #change path
    # Make sure the log directory exists before files are opened inside it.
    os.makedirs(savedir, exist_ok=True)

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # The optimizers are only needed to drive the schedulers and to report
    # the learning rates; no optimizer step is taken in this function.
    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))  ## restoration

    start_epoch = 1
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)  ## restoration

    # Focal-loss settings used for the validation loss.
    val_gamma = 0
    val_alpha = 1

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step()
        rscheduler.step()

        doIouVal = args.iouVal

        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])

        # NOTE(review): only `model` is switched to eval mode; `rmodel` keeps
        # whatever mode the caller left it in -- confirm this is intended.
        model.eval()
        epoch_loss_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # Disable autograd for the whole validation pass.
        with torch.no_grad():
            for step, (timages, images, labels,
                       filename) in enumerate(loader_val):
                if args.cuda:
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                # Initial pass without feedback.
                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)

                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                # Three feedback iterations: the change between the two most
                # recent segmentation outputs is fed back to the restoration
                # network together with its previous output.
                for num_feedback in range(3):
                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())
                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                outputs = outs
                del outs, tminus_outs, tplus_outs
                gc.collect()

                loss = CB_iFl(outputs,
                              targets[:, 0],
                              weight,
                              gamma=val_gamma,
                              alpha=val_alpha)
                # .item() extracts the Python number from the loss tensor.
                epoch_loss_val.append(loss.item())

                if doIouVal:
                    predictions = outputs.max(1)[1].unsqueeze(1).data

                    # Per-batch evaluator (for the optional per-image file
                    # name below) plus the running evaluator for the epoch.
                    iouEvalVal_img = iouEval(NUM_CLASSES)
                    iouEvalVal_img.addBatch(predictions, targets.data)
                    iouEvalVal.addBatch(predictions, targets.data)

                    label_color = Colorize()(
                        outputs[0].max(0)[1].byte().cpu().data.unsqueeze(0))
                    label_save = ToPILImage()(label_color)

                    filenameSave = '../save_color_restored_joint_afl_CBFL/' + filename[
                        0].split('/')[-2]
                    im_iou, _ = iouEvalVal_img.getIoU()

                    os.makedirs(filenameSave, exist_ok=True)

                    #Uncomment to save output
                    #label_save.save(filenameSave+ '/' + str(" %6.4f " %im_iou[0].data.numpy()) + '_' + filename[0].split('/')[-1])

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ',
                          step)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print(iouVal, iou_classes, iouStr)

    return (model)
def train(args, rmodel, model, enc=False):
    """Jointly train the restoration and segmentation networks.

    Every training step first refines the restored image with three feedback
    iterations, updating ``rmodel`` with an L1 loss against the reference
    image after each one, and then updates ``model`` with the summed
    ``CB_iFl`` loss over the three segmentation outputs. After each epoch the
    validation split is evaluated, a checkpoint is written, the best weights
    are saved and a row is appended to the automated log.

    Args:
        args: Parsed command-line namespace. Attributes read here:
            ``datadir``, ``height``, ``num_workers``, ``batch_size``,
            ``cuda``, ``savedir``, ``num_epochs``, ``iouTrain``, ``iouVal``,
            ``steps_loss`` and ``epochs_save``.
        rmodel: Restoration network, called as
            ``rmodel(inputs, flag=..., r_fb1=..., r_fb2=...)``.
        model: Segmentation network, called as
            ``model(x, only_encode=enc)``.
        enc: Forwarded to ``model`` as ``only_encode``.

    Returns:
        The ``model`` argument after training.
    """
    best_acc = 0

    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    rcriterion = torch.nn.L1Loss()

    savedir = '/home/shyam.nandan/NewExp/final_code/save/' + args.savedir
    # Make sure the output directory exists before files are opened inside it.
    os.makedirs(savedir, exist_ok=True)

    automated_log_path = savedir + "/automated_log.txt"
    modeltxtpath = savedir + "/model.txt"

    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    # Focal-loss settings, one entry per feedback iteration. Defined once up
    # front because validation reuses the first entry.
    Gamma = [0, 0.1, 0.2]
    Alpha = [1, 1, 1]

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # NOTE(review): the schedulers are stepped at the start of the epoch,
        # before any optimizer step; which learning rate this yields for the
        # first epoch depends on the installed PyTorch version -- confirm.
        scheduler.step()
        rscheduler.step()

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        rusedLr = 0
        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])
            rusedLr = float(param_group['lr'])

        # NOTE(review): only `model` is switched between train and eval mode;
        # `rmodel` keeps whatever mode the caller left it in -- confirm.
        model.train()
        for step, (timages, images, labels) in enumerate(loader):
            start_time = time.time()

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            inputs = timages
            itargets = images
            targets = labels

            # Initial pass without feedback.
            ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
            outs = model(ss_inputs, only_encode=enc)

            tminus_outs = outs.detach()
            tplus_outs = outs.detach()

            outputs = []
            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()

                # Restoration step: the change between the two most recent
                # segmentation outputs is fed back together with the previous
                # restored image.
                ss_inputs = rmodel(inputs,
                                   flag=1,
                                   r_fb1=(tplus_outs - tminus_outs),
                                   r_fb2=ss_inputs.detach())
                loss = rcriterion(ss_inputs, itargets)
                loss.backward()
                roptimizer.step()

                optimizer.zero_grad()
                roptimizer.zero_grad()

                # The segmentation network sees a detached restored image,
                # so its loss does not back-propagate into rmodel.
                outs = model(ss_inputs.detach(), only_encode=enc)
                outputs.append(outs)

                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            del outs, tminus_outs, tplus_outs
            gc.collect()

            # Segmentation loss summed over all feedback iterations.
            loss = 0.0
            for i, o in enumerate(outputs):
                loss += CB_iFl(o, targets[:, 0], weight, gamma=Gamma[i], alpha=Alpha[i])

            loss.backward()
            optimizer.step()

            # .item() extracts the Python number from the loss tensor.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                # `outputs` is a list; score the final feedback iteration,
                # which is also what validation evaluates.
                iouEvalTrain.addBatch(outputs[-1].max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print('loss: ', average, 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print ("EPOCH IoU on TRAIN set: ", iouStr, "%")

        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # Disable autograd for the whole validation pass.
        with torch.no_grad():
            for step, (timages, images, labels) in enumerate(loader_val):
                start_time = time.time()

                if args.cuda:
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)

                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                for num_feedback in range(3):
                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())
                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                del ss_inputs, tplus_outs, tminus_outs
                outputs = outs

                loss = CB_iFl(outputs, targets[:, 0], weight, gamma=Gamma[0], alpha=Alpha[0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print ("EPOCH IoU on VAL set: ", iouStr, "%")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        filenameBest = savedir + '/model_best.pth.tar'

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        # f-string so the epoch number really ends up in the file name.
        filename = f'{savedir}/model-{epoch:03}.pth'
        filenamebest = savedir + '/model_best.pth'

        # Periodic snapshot every `epochs_save` epochs (keyed on the epoch
        # counter, not on the last validation batch index).
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(filename, epoch)

        if is_best:
            torch.save(model.state_dict(), filenamebest)
            torch.save(rmodel.state_dict(), savedir + '/rmodel_best.pth')
            print(filenamebest,epoch)
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr ))

    return(model)