# NOTE(review): this chunk begins mid-way through a parser.add_argument(...)
# call whose opening lies outside this view; only its trailing
# help='path to train dataset') fragment was visible here.
parser.add_argument('--train_save', type=str, default='PraNet_Res2Net')
opt = parser.parse_args()

# ---- build model ----
# torch.cuda.set_device(0)  # pick your GPU device if needed
model = PraNet().cuda()

# ---- flops / params inspection (left disabled in the original) ----
# from utils.utils import CalParams
# x = torch.randn(1, 3, 352, 352).cuda()
# CalParams(lib, x)

optimizer = torch.optim.Adam(model.parameters(), opt.lr)

# Dataset layout: <train_path>/images/ and <train_path>/masks/
image_root = '{}/images/'.format(opt.train_path)
gt_root = '{}/masks/'.format(opt.train_path)

train_loader = get_loader(image_root, gt_root,
                          batchsize=opt.batchsize,
                          trainsize=opt.trainsize)
total_step = len(train_loader)

print("#" * 20, "Start Training", "#" * 20)

# One pass per epoch: decay the learning rate, then run a full epoch.
for epoch in range(1, opt.epoch):
    adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch)
    train(train_loader, model, optimizer, epoch)
# Only the HarDNet-68 and HarDNet-85 backbones are available.
assert opt.hardnet in [68, 85], "We support two kind of backbone [HarDNet68, HarDNet85]"
model = HarDMSEG(arch=opt.hardnet).cuda()

# ---- flops / params inspection (left disabled in the original) ----
# from utils.utils import CalParams
# x = torch.randn(1, 3, 352, 352).cuda()
# CalParams(lib, x)

params = model.parameters()
optimizer = (torch.optim.Adam(params, opt.lr)
             if opt.optimizer == 'Adam'
             else torch.optim.SGD(params, opt.lr, weight_decay=1e-4, momentum=0.9))
print(optimizer)

# Dataset layout: <train_path>/images/ and <train_path>/masks/
image_root = '{}/images/'.format(opt.train_path)
gt_root = '{}/masks/'.format(opt.train_path)
print("Dataset root: " + image_root)

train_loader = get_loader(image_root, gt_root,
                          batchsize=opt.batchsize,
                          trainsize=opt.trainsize,
                          num_workers=opt.num_workers,
                          augmentation=opt.augmentation)
total_step = len(train_loader)

print("#" * 20, "Start Training", "#" * 20)

# Hard-coded decay schedule: rate 0.1, decay epoch 200.
for epoch in range(1, opt.epoch):
    adjust_lr(optimizer, opt.lr, epoch, 0.1, 200)
    train(train_loader, model, optimizer, epoch, opt.test_path)
# ---- flops / params inspection (left disabled in the original) ----
# from utils.utils import CalParams
# x = torch.randn(1, 3, 352, 352).cuda()
# CalParams(lib, x)

# NOTE(review): `model` and `opt` are defined earlier in this file,
# outside the portion visible here.
params = model.parameters()
if opt.optimizer != 'Adam':
    optimizer = torch.optim.SGD(params, opt.lr, weight_decay=1e-4, momentum=0.9)
else:
    optimizer = torch.optim.Adam(params, opt.lr)
print(optimizer)

# Dataset layout: <train_path>/images/ and <train_path>/masks/
image_root = '{}/images/'.format(opt.train_path)
gt_root = '{}/masks/'.format(opt.train_path)

train_loader = get_loader(image_root, gt_root,
                          batchsize=opt.batchsize,
                          trainsize=opt.trainsize,
                          augmentation=opt.augmentation)
total_step = len(train_loader)

print("#" * 20, "Start Training", "#" * 20)

for epoch in range(1, opt.epoch):
    # LR decay was left disabled (commented out) in the original:
    # adjust_lr(optimizer, opt.lr, epoch, 0.1, 200)
    train(train_loader, model, optimizer, epoch, opt.test_path)
parser.add_argument('--train_save', type=str, default='PraNet_Res2Net')
parser.add_argument('--json_file', type=str,
                    default='/data0/zzhang/new_polyp_annotation_01_03/train.json')
opt = parser.parse_args()

# ---- build model ----
# torch.cuda.set_device(0)  # pick your GPU device if needed
model = PraNet().cuda()

# ---- flops / params inspection (left disabled in the original) ----
# from utils.utils import CalParams
# x = torch.randn(1, 3, 352, 352).cuda()
# CalParams(lib, x)

optimizer = torch.optim.Adam(model.parameters(), opt.lr)

# Image and mask roots are given directly and samples are selected
# through a JSON annotation file.
image_root = opt.train_path
gt_root = opt.gt_path
train_loader = get_loader(image_root, gt_root, opt.json_file,
                          batchsize=opt.batchsize,
                          trainsize=opt.trainsize)
total_step = len(train_loader)

print("#" * 20, "Start Training", "#" * 20)

for epoch in range(1, opt.epoch):
    adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch)
    train(train_loader, model, optimizer, epoch)
from model import Encoder, Decoder else: from model_gru import Encoder, Decoder crop_size = args.crop_size transform = transforms.Compose([ transforms.RandomCrop(crop_size), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) dct = Dictionary() dct.load_dict(args.vocab_path) print(f'Dict Size {len(dct)}') data_loader = get_loader(args.image_dir, args.caption_path, dct, transform, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) def train(dct_size, embed_size=256, hidden_size=512, epochs=10, num_layers=1, save_step=1000, lr=0.001, model_save='model/'): encoder = Encoder(embed_size=embed_size).to(device) decoder = Decoder(embed_size=embed_size, hidden_size=hidden_size, dct_size=len(dct), num_layers=num_layers).to(device) criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=lr) for epoch in range(epochs): print(f'epoch {epoch+1}/{epochs}: ') for i, (images, captions, lengths) in enumerate(tqdm(data_loader)): # Set mini-batch dataset
def main(opt):
    """Build data loaders, model, optimizer, LR scheduler and loss function
    from ``opt``, then run the training loop.

    Args:
        opt: parsed command-line options; read for data paths, batch sizes,
            model/optimizer/scheduler choices and epoch count.
            ``opt.criterion`` is set here as a side effect and presumably
            consumed by the train functions — TODO confirm.
    """
    # cudnn.benchmark = True
    device = select_device(opt.device, batch_size=opt.batchsize)
    # device = torch.device('cuda')
    # print(opt)

    # ---- data loading ----
    train_loader = get_loader(opt.train_data, opt.batchsize, shuffle=True,
                              num_workers=4, pin_memory=False)
    # FIX: val_loader used to be undefined when opt.val_data was falsy,
    # making the train()/parnet_train() calls below raise NameError.
    val_loader = None
    if opt.val_data:
        val_loader = get_loader(opt.val_data, opt.val_bs, shuffle=False,
                                num_workers=1, pin_memory=False)

    # ---- model ----
    model = build_model(opt)
    model = torch.nn.DataParallel(model)
    model = model.to(device)

    # ---- pretrained weights ----
    if opt.preweight_path:
        model.load_state_dict(torch.load(opt.preweight_path))

    # ---- optimizer ----
    if opt.optimizer_type == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    elif opt.optimizer_type == 'SGD':
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=0.9,
                                    weight_decay=1e-8, nesterov=True)

    # ---- learning-rate decay policy ----
    # FIX: scheduler used to be undefined for unrecognized opt.lr_scheduler
    # values, crashing later at scheduler.step(); default to None and guard.
    scheduler = None
    if opt.lr_scheduler == 'CosineAnnealingWarmRestarts':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=50, T_mult=2, eta_min=0, last_epoch=-1)
    elif opt.lr_scheduler == 'LambdaLR':
        lambda1 = lambda epoch: epoch // 30
        lambda2 = lambda epoch: 0.95 ** epoch
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer, lr_lambda=[lambda1, lambda2])
    elif opt.lr_scheduler == 'MultiplicativeLR':
        scheduler = torch.optim.lr_scheduler.MultiplicativeLR(
            optimizer, lr_lambda=lambda epoch: 0.95)
    elif opt.lr_scheduler == 'StepLR':
        # (removed an unused `lmbda` lambda that was defined here)
        scheduler = torch.optim.lr_scheduler.StepLR(
            optimizer, step_size=30, gamma=0.1)
    elif opt.lr_scheduler == 'MultiStepLR':
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=[30, 80, 120, 200], gamma=0.1)
    elif opt.lr_scheduler == 'ExponentialLR':
        scheduler = torch.optim.lr_scheduler.ExponentialLR(
            optimizer, gamma=0.99)
    elif opt.lr_scheduler == 'CosineAnnealingLR':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=50, eta_min=0)
    elif opt.lr_scheduler == 'ReduceLROnPlateau':
        # NOTE: deliberately replaces the optimizer chosen above with a
        # fixed SGD setup, as in the original code.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    elif opt.lr_scheduler == 'CyclicLR':
        # FIX: this branch was guarded by a duplicated 'ReduceLROnPlateau'
        # test, which made the CyclicLR option unreachable.
        optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        scheduler = torch.optim.lr_scheduler.CyclicLR(
            optimizer, base_lr=0.01, max_lr=0.1)

    # ---- loss function ----
    if opt.lossfunction == 'BCE':
        opt.criterion = torch.nn.BCELoss()
    elif opt.lossfunction == 'structure_loss':
        opt.criterion = structure_loss
    elif opt.lossfunction == 'symmetric_lovasz':
        opt.criterion = symmetric_lovasz

    print('##########################################\n\n\ntrain start')
    for epoch in range(0, opt.epoch):
        if opt.model_type == 'PraNet':
            parnet_train(epoch, model, optimizer, train_loader, val_loader, device)
        else:
            train(epoch, model, optimizer, train_loader, val_loader, device)
        if scheduler is not None:
            scheduler.step()
    print('all train done')
def main(args):
    """Full training entry point: sets up logging/checkpoint folders, data
    loaders, model, optimizer and the epoch loop with early stopping.

    NOTE(review): this chunk was flattened in the source; indentation has
    been reconstructed — confirm block boundaries against the upstream file.
    """
    # Create model directory & other aux folders for logging.
    where_to_save = os.path.join(args.save_dir, args.project_name, args.model_name)
    checkpoints_dir = os.path.join(where_to_save, 'checkpoints')
    logs_dir = os.path.join(where_to_save, 'logs')
    tb_logs = os.path.join(args.save_dir, args.project_name, 'tb_logs', args.model_name)
    make_dir(where_to_save)
    make_dir(logs_dir)
    make_dir(checkpoints_dir)
    make_dir(tb_logs)
    if args.tensorboard:
        logger = Visualizer(tb_logs, name='visual_results')
    # Check if we want to resume from last checkpoint of current model:
    # the pickled args of the previous run replace the current ones.
    if args.resume:
        args = pickle.load(
            open(os.path.join(checkpoints_dir, 'args.pkl'), 'rb'))
        args.resume = True
    # Redirect stdout/stderr to log files on disk unless logging to terminal.
    if not args.log_term:
        print("Training logs will be saved to:",
              os.path.join(logs_dir, 'train.log'))
        sys.stdout = open(os.path.join(logs_dir, 'train.log'), 'w')
        sys.stderr = open(os.path.join(logs_dir, 'train.err'), 'w')
    print(args)
    pickle.dump(args, open(os.path.join(checkpoints_dir, 'args.pkl'), 'wb'))
    # Early-stopping patience counter.
    curr_pat = 0
    # Build one data loader per split with split-specific augmentation.
    data_loaders = {}
    datasets = {}
    data_dir = args.recipe1m_dir
    for split in ['train', 'val']:
        transforms_list = [transforms.Resize((args.image_size))]
        if split == 'train':
            # Image preprocessing, normalization for the pretrained resnet.
            transforms_list.append(transforms.RandomHorizontalFlip())
            transforms_list.append(
                transforms.RandomAffine(degrees=10, translate=(0.1, 0.1)))
            transforms_list.append(transforms.RandomCrop(args.crop_size))
        else:
            transforms_list.append(transforms.CenterCrop(args.crop_size))
        transforms_list.append(transforms.ToTensor())
        transforms_list.append(
            transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)))
        transform = transforms.Compose(transforms_list)
        # NOTE(review): max_num_samples is computed but never used in the
        # visible code — possibly dead, or consumed by code outside this view.
        max_num_samples = max(args.max_eval, args.batch_size) if split == 'val' else -1
        data_loaders[split], datasets[split] = get_loader(
            transform,
            data_dir,
            split,
            args.batch_size,
            shuffle=split == 'train',
            num_workers=args.num_workers,
            drop_last=True,
        )
    # After the loop, `split` is 'val'; both splits presumably share the
    # same ingredient vocabulary — TODO confirm.
    ingr_vocab_size = datasets[split].get_ingrs_vocab_size()
    print('Length of ingredients:', ingr_vocab_size)
    # Build the model.
    model = get_model(args, ingr_vocab_size)
    keep_cnn_gradients = False
    decay_factor = 1.0
    # Add model parameters.
    params = list(model.recipe_decoder.parameters())
    # Only train the linear layer in the encoder if we are not
    # transfering from another model.
    if args.transfer_from == '':
        params += list(model.image_encoder.linear.parameters())
    params_cnn = list(model.image_encoder.resnet.parameters())
    print("CNN params:", sum(p.numel() for p in params_cnn if p.requires_grad))
    print("decoder params:", sum(p.numel() for p in params if p.requires_grad))
    # Start optimizing the CNN from the beginning (finetune_after == 0);
    # the CNN parameter group gets a scaled learning rate.
    if params_cnn is not None and args.finetune_after == 0:
        optimizer = torch.optim.Adam(
            [{
                'params': params
            }, {
                'params': params_cnn,
                'lr': args.learning_rate * args.scale_learning_rate_cnn
            }],
            lr=args.learning_rate,
            weight_decay=args.weight_decay)
        keep_cnn_gradients = True
        print("Fine tuning resnet")
    else:
        optimizer = torch.optim.Adam(params, lr=args.learning_rate)
    # Resume: restore optimizer and model weights, moving optimizer state
    # tensors onto the target device.
    if args.resume:
        model_path = os.path.join(args.save_dir, args.project_name,
                                  args.model_name, 'checkpoints', 'model.ckpt')
        optim_path = os.path.join(args.save_dir, args.project_name,
                                  args.model_name, 'checkpoints', 'optim.ckpt')
        optimizer.load_state_dict(torch.load(optim_path, map_location=map_loc))
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)
        model.load_state_dict(torch.load(model_path, map_location=map_loc))
    if args.transfer_from != '':
        # Loads CNN encoder from the transfer_from model (only keys
        # containing 'encoder', non-strict load).
        model_path = os.path.join(args.save_dir, args.project_name,
                                  args.transfer_from, 'checkpoints',
                                  'modelbest.ckpt')
        pretrained_dict = torch.load(model_path, map_location=map_loc)
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items() if 'encoder' in k
        }
        model.load_state_dict(pretrained_dict, strict=False)
        # NOTE(review): instrs_vocab_size is not defined anywhere in the
        # visible code — presumably a module-level name; verify.
        args, model = merge_models(args, model, ingr_vocab_size,
                                   instrs_vocab_size)
    if device != 'cpu' and torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model = model.to(device)
    cudnn.benchmark = True
    if not hasattr(args, 'current_epoch'):
        args.current_epoch = 0
    # Early-stopping best value: minimize loss, maximize other metrics.
    es_best = 10000 if args.es_metric == 'loss' else 0
    # Train the model.
    start = args.current_epoch
    for epoch in range(start, args.num_epochs):
        # Save current epoch for resuming.
        if args.tensorboard:
            logger.reset()
        args.current_epoch = epoch
        # Increase / decrease values for moving params.
        if args.decay_lr:
            frac = epoch // args.lr_decay_every
            decay_factor = args.lr_decay_rate**frac
            new_lr = args.learning_rate * decay_factor
            print('Epoch %d. lr: %.5f' % (epoch, new_lr))
            set_lr(optimizer, decay_factor)
        # Switch on CNN fine-tuning once finetune_after epochs have passed,
        # rebuilding the optimizer with the (possibly decayed) current LR.
        if args.finetune_after != -1 and args.finetune_after < epoch \
                and not keep_cnn_gradients and params_cnn is not None:
            print("Starting to fine tune CNN")
            # Start with learning rates as they were (if decayed during training).
            optimizer = torch.optim.Adam(
                [{
                    'params': params
                }, {
                    'params': params_cnn,
                    'lr': decay_factor * args.learning_rate *
                          args.scale_learning_rate_cnn
                }],
                lr=decay_factor * args.learning_rate)
            keep_cnn_gradients = True
        for split in ['train', 'val']:
            if split == 'train':
                model.train()
            else:
                # model.eval()  # deliberately kept in train mode:
                # "val still train" — validation also backpropagates below.
                model.train()
            total_step = len(data_loaders[split])
            loader = iter(data_loaders[split])
            total_loss_dict = {
                'recipe_loss': [],
                'ingr_loss': [],
                'eos_loss': [],
                'loss': [],
                'iou': [],
                'perplexity': [],
                'iou_sample': [],
                'f1': [],
                'card_penalty': []
            }
            error_types = {
                'tp_i': 0,
                'fp_i': 0,
                'fn_i': 0,
                'tn_i': 0,
                'tp_all': 0,
                'fp_all': 0,
                'fn_all': 0
            }
            torch.cuda.synchronize()
            # NOTE(review): `start` is reused here as a timer, shadowing the
            # starting-epoch value read above.
            start = time.time()
            for i in range(total_step):
                # NOTE(review): Python-2-style iterator call; newer PyTorch
                # requires next(loader) — confirm the pinned torch version.
                img_inputs, captions = loader.next()
                img_inputs = img_inputs.to(device)
                captions = captions.to(device)
                # Targets are the captions shifted by one token.
                true_caps_batch = captions.clone()[:, 1:].contiguous()
                loss_dict = {}
                if split == 'val':
                    #with torch.no_grad():
                    #    losses = model(img_inputs, captions)
                    # "val still train": gradients are kept even on val.
                    losses = model(img_inputs,
                                   captions,
                                   keep_cnn_gradients=keep_cnn_gradients)
                else:
                    losses = model(img_inputs,
                                   captions,
                                   keep_cnn_gradients=keep_cnn_gradients)
                if not args.ingrs_only:
                    # Mean recipe loss over non-padding tokens; padding id is
                    # ingr_vocab_size - 1 per the .ne() mask below.
                    recipe_loss = losses['recipe_loss']
                    recipe_loss = recipe_loss.view(true_caps_batch.size())
                    non_pad_mask = true_caps_batch.ne(ingr_vocab_size - 1).float()
                    recipe_loss = torch.sum(recipe_loss * non_pad_mask,
                                            dim=-1) / torch.sum(non_pad_mask, dim=-1)
                    perplexity = torch.exp(recipe_loss)
                    recipe_loss = recipe_loss.mean()
                    perplexity = perplexity.mean()
                    loss_dict['recipe_loss'] = recipe_loss.item()
                    loss_dict['perplexity'] = perplexity.item()
                # NOTE(review): if args.ingrs_only is true, recipe_loss is
                # never set and the weighted sum below raises NameError —
                # presumably ingrs_only is never used with this script.
                ingr_loss, eos_loss, card_penalty = 0, 0, 0
                loss = args.loss_weight[0] * recipe_loss + args.loss_weight[1] * ingr_loss \
                    + args.loss_weight[2]*eos_loss + args.loss_weight[3]*card_penalty
                loss_dict['loss'] = loss.item()
                for key in loss_dict.keys():
                    total_loss_dict[key].append(loss_dict[key])
                #if split == 'train':
                # "val still train": optimizer steps on BOTH splits.
                if split == 'train' or split == 'val':
                    model.zero_grad()
                    loss.backward()
                    optimizer.step()
                # Print log info every log_step iterations.
                if args.log_step != -1 and i % args.log_step == 0:
                    elapsed_time = time.time() - start
                    lossesstr = ""
                    for k in total_loss_dict.keys():
                        if len(total_loss_dict[k]) == 0:
                            continue
                        this_one = "%s: %.4f" % (
                            k, np.mean(total_loss_dict[k][-args.log_step:]))
                        lossesstr += this_one + ', '
                    # This only displays nll loss on captions; the rest of the
                    # losses will be in tensorboard logs.
                    strtoprint = 'Split: %s, Epoch [%d/%d], Step [%d/%d], Losses: %sTime: %.4f' % (
                        split, epoch, args.num_epochs, i, total_step,
                        lossesstr, elapsed_time)
                    print(strtoprint)
                    if args.tensorboard:
                        # logger.histo_summary(model=model, step=total_step * epoch + i)
                        logger.scalar_summary(
                            mode=split + '_iter',
                            epoch=total_step * epoch + i,
                            **{
                                k: np.mean(v[-args.log_step:])
                                for k, v in total_loss_dict.items() if v
                            })
                    torch.cuda.synchronize()
                    start = time.time()
                # Free references before the next batch.
                del loss, losses, captions, img_inputs
            # Ingredient metrics are only computed on the val split.
            if split == 'val' and not args.recipe_only:
                ret_metrics = {
                    'accuracy': [],
                    'f1': [],
                    'jaccard': [],
                    'f1_ingredients': [],
                    'dice': []
                }
                compute_metrics(
                    ret_metrics,
                    error_types,
                    ['accuracy', 'f1', 'jaccard', 'f1_ingredients', 'dice'],
                    eps=1e-10,
                    weights=None)
                total_loss_dict['f1'] = ret_metrics['f1']
            if args.tensorboard:
                # 1. Log scalar values (scalar summary).
                logger.scalar_summary(
                    mode=split,
                    epoch=epoch,
                    **{k: np.mean(v)
                       for k, v in total_loss_dict.items() if v})
        # Save the model's best checkpoint if performance was improved;
        # total_loss_dict here holds the stats from the last split ('val').
        es_value = np.mean(total_loss_dict[args.es_metric])
        # Save current model as well.
        save_model(model, optimizer, checkpoints_dir, suff='')
        if (args.es_metric == 'loss' and es_value < es_best) or (
                args.es_metric == 'iou_sample' and es_value > es_best):
            es_best = es_value
            save_model(model, optimizer, checkpoints_dir, suff='best')
            pickle.dump(args,
                        open(os.path.join(checkpoints_dir, 'args.pkl'), 'wb'))
            curr_pat = 0
            print('Saved checkpoint.')
        else:
            curr_pat += 1
        # Early stopping once patience is exhausted.
        if curr_pat > args.patience:
            break
    if args.tensorboard:
        logger.close()