def train(train_loader, model, optimizer, epochs, batch_size, train_size, clip, test_path):
    # NOTE: `lr`, `total_step`, and `train_logger` are expected to be module-level globals here.
    best_dice_score = 0
    for epoch in range(1, epochs):  # 99 epochs
        adjust_lr(optimizer, lr, epoch, 0.1, 200)
        model.train()
        size_rates = [0.75, 1, 1.25]
        loss_record = AvgMeter()
        criterion = WIoUBCELoss()
        for i, pack in enumerate(train_loader, start=1):
            for rate in size_rates:
                optimizer.zero_grad()
                images, gts = pack
                images = Variable(images).cuda()
                gts = Variable(gts).cuda()
                trainsize = int(round(train_size * rate / 32) * 32)
                if rate != 1:
                    images = F.upsample(images, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                    gts = F.upsample(gts, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                # predict
                predict_maps = model(images)
                loss = criterion(predict_maps, gts)
                loss.backward()
                clip_gradient(optimizer, clip)
                optimizer.step()
                if rate == 1:
                    loss_record.update(loss.data, batch_size)
            if i % 20 == 0 or i == total_step:
                print(f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], Loss: {loss_record.show()}')
                train_logger.info(f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], Loss: {loss_record.show()}')

        save_path = 'checkpoints/'
        os.makedirs(save_path, exist_ok=True)
        if (epoch + 1) % 1 == 0:
            meandice = validation(model, test_path)
            print(f'meandice: {meandice}')
            train_logger.info(f'meandice: {meandice}')
            if meandice > best_dice_score:
                best_dice_score = meandice
                torch.save(model.state_dict(), save_path + 'effnetv2pd.pth')
                print('[Saving Snapshots:]', save_path + 'effnetv2pd.pth', meandice)
            if epoch in [50, 60, 70]:
                file_ = 'effnetv2pd_' + str(epoch) + '.pth'
                torch.save(model.state_dict(), save_path + file_)
                print('[Saving Snapshots:]', save_path + file_, meandice)
def train(train_loader, model, optimizer, epochs, batch_size, train_size, clip, test_path):
    best_dice_score = 0
    for epoch in range(1, epochs):
        adjust_lr(optimizer, lr, epoch, 0.1, 200)
        for param in optimizer.param_groups:
            print(param['lr'])
        model.train()
        size_rates = [0.75, 1, 1.25]
        loss_record2, loss_record3, loss_record4, loss_record5 = AvgMeter(), AvgMeter(), AvgMeter(), AvgMeter()
        criterion = WIoUBCELoss()
        for i, pack in enumerate(train_loader, start=1):
            for rate in size_rates:
                optimizer.zero_grad()
                images, gts = pack
                images = Variable(images).cuda()
                gts = Variable(gts).cuda()
                trainsize = int(round(train_size * rate / 32) * 32)
                if rate != 1:
                    images = F.upsample(images, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                    gts = F.upsample(gts, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                # predict
                lateral_map_5, lateral_map_4, lateral_map_3, lateral_map_2 = model(images)
                loss5 = criterion(lateral_map_5, gts)
                loss4 = criterion(lateral_map_4, gts)
                loss3 = criterion(lateral_map_3, gts)
                loss2 = criterion(lateral_map_2, gts)
                loss = loss2 + loss3 + loss4 + loss5
                loss.backward()
                clip_gradient(optimizer, clip)
                optimizer.step()
                if rate == 1:
                    loss_record2.update(loss2.data, batch_size)
                    loss_record3.update(loss3.data, batch_size)
                    loss_record4.update(loss4.data, batch_size)
                    loss_record5.update(loss5.data, batch_size)
            if i % 20 == 0 or i == total_step:
                print(f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], [lateral-2: {loss_record2.show()}, lateral-3: {loss_record3.show()}, lateral-4: {loss_record4.show()}, lateral-5: {loss_record5.show()},]')
                train_logger.info(f'{datetime.now()} Epoch [{epoch}/{epochs}], Step [{i}/{total_step}], [lateral-2: {loss_record2.show()}, lateral-3: {loss_record3.show()}, lateral-4: {loss_record4.show()}, lateral-5: {loss_record5.show()},]')

        save_path = 'checkpoints/'
        os.makedirs(save_path, exist_ok=True)
        if (epoch + 1) % 1 == 0:
            meandice = validation(model, test_path)
            print(f'meandice: {meandice}')
            train_logger.info(f'meandice: {meandice}')
            if meandice > best_dice_score:
                best_dice_score = meandice
                torch.save(model.state_dict(), save_path + 'PraHarDNet.pth')
                print('[Saving Snapshots:]', save_path + 'PraHarDNet.pth', meandice)
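# NOTE: the two training loops above call `clip_gradient` and `AvgMeter`, which are
# imported from elsewhere in the repo and not defined in this file. The following is
# only a minimal sketch of what they are assumed to do, inferred from how they are
# called above (hypothetical, not the repo's own implementation):
def clip_gradient(optimizer, grad_clip):
    """Clamp every parameter gradient to [-grad_clip, grad_clip] before optimizer.step()."""
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)


class AvgMeter(object):
    """Running average of a scalar; `show()` returns the current mean."""
    def __init__(self):
        self.sum, self.count = 0.0, 0

    def update(self, val, n=1):
        self.sum += float(val) * n
        self.count += n

    def show(self):
        return self.sum / max(self.count, 1)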
def train_validate(self):
    for epoch in range(self.args.start_epoch, self.args.n_epochs):
        _ = adjust_lr(self.args.lr, self.model.optimizer_G, epoch, [40, 80, 160, 240])
        new_lr = adjust_lr(self.args.lr, self.model.optimizer_D, epoch, [40, 80, 160, 240])
        self.new_lr = min(new_lr, self.new_lr)
        self.epoch = epoch
        self.train()

        if epoch % self.args.validate_freq == 0 and epoch > self.args.save_freq:
            self.validate()
            # self.validate_loader(self.normal_test_loader)
            # self.validate_loader(self.amd_fundus_loader)
            # self.validate_loader(self.myopia_fundus_loader)

        print('\n', '*' * 10, 'Program Information', '*' * 10)
        print('Node: {}'.format(self.args.node))
        print('GPU: {}'.format(self.args.gpu))
        print('Version: {}\n'.format(self.args.version))
def train_val(self):
    # general metrics
    self.best_auc = 0
    self.is_best = False
    # self.total_auc_top10 = AverageMeter()
    self.total_auc_last10 = LastAvgMeter(length=10)
    self.acc_last10 = LastAvgMeter(length=10)

    # metrics for iSee
    self.myopia_auc_last10 = LastAvgMeter(length=10)
    self.amd_auc_last10 = LastAvgMeter(length=10)
    self.glaucoma_auc_last10 = LastAvgMeter(length=10)
    self.dr_auc_last10 = LastAvgMeter(length=10)

    for epoch in range(self.args.start_epoch, self.args.n_epochs):
        if self.args.data_modality == 'fundus':
            # total: 1000
            adjust_lr_epoch_list = [40, 80, 160, 240]
        else:
            # total: 180
            adjust_lr_epoch_list = [20, 40, 80, 120]
        _ = adjust_lr(self.args.lr, self.model.optimizer_G, epoch, adjust_lr_epoch_list)
        new_lr = adjust_lr(self.args.lr, self.model.optimizer_D, epoch, adjust_lr_epoch_list)
        self.new_lr = min(new_lr, self.new_lr)
        self.epoch = epoch
        self.train()

        # last epochs: validate with freq
        if epoch > self.args.validate_start_epoch \
                and (epoch % self.args.validate_freq == 0
                     or epoch > (self.args.n_epochs - self.args.validate_each_epoch)):
            self.validate_cls()

        print('\n', '*' * 10, 'Program Information', '*' * 10)
        print('Node: {}'.format(self.args.node))
        print('GPU: {}'.format(self.args.gpu))
        print('Version: {}\n'.format(self.args.version))
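# NOTE: the two methods above use an `adjust_lr(base_lr, optimizer, epoch, epoch_list)`
# variant that takes a list of milestone epochs and returns the new learning rate. The
# helper itself is not shown here; a plausible sketch under that assumption (decay by a
# fixed factor once per milestone already passed) would be:
def adjust_lr(base_lr, optimizer, epoch, epoch_list, decay_factor=0.1):
    # count how many milestones have been reached and decay accordingly
    new_lr = base_lr * (decay_factor ** sum(epoch >= e for e in epoch_list))
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr
    return new_lr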
def main(optin):
    if not os.path.exists('checkpoint/' + optin.exp):
        os.makedirs('checkpoint/' + optin.exp)

    model = poseNet(101).cuda()
    model.train()
    # model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=optin.lr)
    criterion = torch.nn.MSELoss().cuda()

    # print(os.path.join('./annotations/person_keypoints_train2017.json'))
    coco_train = COCO(os.path.join('./annotations/person_keypoints_train2017.json'))
    trainloader = DataLoader(dataset=COCOkeypointloader(coco_train),
                             batch_size=optin.batch_size,
                             num_workers=optin.num_workers,
                             shuffle=True)

    bar = Bar('-->', fill='>', max=len(trainloader))

    for epoch in range(optin.number_of_epoch):
        print('-------------Training Epoch {}-------------'.format(epoch))
        print('Total Step:', len(trainloader), '| Total Epoch:', optin.number_of_epoch)

        lr = adjust_lr(optimizer, epoch, optin.lr_gamma)
        print('\nEpoch: %d | LR: %.8f' % (epoch + 1, lr))

        for idx, (input, label) in tqdm(enumerate(trainloader)):
            input = input.cuda().float()
            label = label.cuda().float()

            outputs = model(input)

            optimizer.zero_grad()
            loss = criterion(outputs, label)
            loss.backward()
            optimizer.step()

            print('Epoch {} : loss {}'.format(epoch, loss.data))

            # if idx % 200 == 0:
            #     bar.suffix = 'Epoch: {epoch} Total: {ttl} | ETA: {eta:} | loss:{loss}' \
            #         .format(ttl=bar.elapsed_td, eta=bar.eta_td, loss=loss.data, epoch=epoch)
            #     bar.next()

        if epoch % 5 == 0:
            torch.save(model, os.path.join('checkpoint/' + optin.exp, 'model_{}.pth'.format(epoch)))
# We support two kinds of backbone
assert opt.hardnet in [68, 85], "We support two kind of backbone [HarDNet68, HarDNet85]"
model = HarDMSEG(arch=opt.hardnet).cuda()

# ---- flops and params ----
# from utils.utils import CalParams
# x = torch.randn(1, 3, 352, 352).cuda()
# CalParams(lib, x)

params = model.parameters()

if opt.optimizer == 'Adam':
    optimizer = torch.optim.Adam(params, opt.lr)
else:
    optimizer = torch.optim.SGD(params, opt.lr, weight_decay=1e-4, momentum=0.9)

print(optimizer)

image_root = '{}/images/'.format(opt.train_path)
gt_root = '{}/masks/'.format(opt.train_path)
print("Dataset root: " + image_root)

train_loader = get_loader(image_root, gt_root, batchsize=opt.batchsize, trainsize=opt.trainsize,
                          num_workers=opt.num_workers, augmentation=opt.augmentation)
total_step = len(train_loader)

print("#" * 20, "Start Training", "#" * 20)

for epoch in range(1, opt.epoch):
    adjust_lr(optimizer, opt.lr, epoch, 0.1, 200)
    train(train_loader, model, optimizer, epoch, opt.test_path)
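# NOTE: `adjust_lr(optimizer, init_lr, epoch, decay_rate, decay_epoch)` used throughout this
# section is a repo helper that is not defined in this file. Judging from its arguments it is
# assumed to be a simple step-decay schedule, roughly (sketch only, not the original helper):
def adjust_lr(optimizer, init_lr, epoch, decay_rate=0.1, decay_epoch=30):
    # multiply the base lr by decay_rate once every `decay_epoch` epochs
    decay = decay_rate ** (epoch // decay_epoch)
    for param_group in optimizer.param_groups:
        param_group['lr'] = init_lr * decay
    return init_lr * decay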
# logging
logging.basicConfig(filename=save_path + 'log.log',
                    format='[%(asctime)s-%(filename)s-%(levelname)s:%(message)s]',
                    level=logging.INFO,
                    filemode='a',
                    datefmt='%Y-%m-%d %I:%M:%S %p')
logging.info("Network-Train")
logging.info("Config")
logging.info('epoch: {}; lr: {}; batchsize: {}; trainsize: {}; clip: {}; decay_rate: {}; load: {}; '
             'save_path: {}; decay_epoch: {}'.format(opt.epoch, opt.lr, opt.batchsize, opt.trainsize,
                                                     opt.clip, opt.decay_rate, opt.load, save_path,
                                                     opt.decay_epoch))

step = 0
writer = SummaryWriter(save_path + 'summary')
best_mae = 1
best_epoch = 0

print("Start train...")
for epoch in range(1, opt.epoch):
    cur_lr = adjust_lr(optimizer, opt.lr, epoch, opt.decay_rate, opt.decay_epoch)
    writer.add_scalar('learning_rate', cur_lr, global_step=epoch)
    train(train_loader, model, optimizer, epoch, save_path, writer, cur_loss)
    val(val_loader, model, epoch, save_path, writer)
def training(config, data, **kwargs):
    """Training pipeline for embedding.

    Args:
        data: iterator within dataset
        epochs: how many training epochs to perform
        n_subact: number of subactions in current complex activity
        mnist: if training with mnist dataset (just to test how well everything works)
    Returns:
        trained pytorch model
    """
    logger.debug('create model')
    torch.manual_seed(config["seed"])

    model = kwargs['model']
    loss = kwargs['loss']
    optimizer = kwargs['optimizer']

    create_dataloader = lambda x: \
        torch.utils.data.DataLoader(x, batch_size=config["batch_size"], shuffle=True,
                                    num_workers=config["num_workers"])
    if config["sparse"]:
        dataset = data
        data = create_dataloader(dataset)

    cudnn.benchmark = True

    batch_time = Averaging()
    data_time = Averaging()
    losses = Averaging()

    adjustable_lr = config["lr"]

    logger.debug('epochs: %s', config["epochs"])
    for epoch in range(config["epochs"]):
        model.cuda()
        model.train()

        logger.debug('Epoch # %d' % epoch)
        if config["lr_adj"]:
            if epoch % 50 == 0 and epoch > 0:
                adjustable_lr = adjust_lr(optimizer, adjustable_lr)
                logger.debug('lr: %f' % adjustable_lr)
        end_time = time.time()
        # start_time = time.time(); print(len(data))
        train_acc_epoch = torch.zeros((1, 1))
        time_epoch = time.time()
        for i, (features, labels) in enumerate(data):
            data_time.update(time.time() - end_time)
            features = features.float().cuda(non_blocking=True)
            labels = labels.long().cuda()
            # labels_one_hot = _to_one_hot(labels, config["n_classes"])

            output = model(features)
            max_index = output.max(dim=1)[1]
            train_acc = (max_index == labels).sum()
            train_acc_epoch = train_acc_epoch + train_acc

            loss_values = loss(output, labels)
            losses.update(loss_values.item(), features.size(0))

            optimizer.zero_grad()
            loss_values.backward()
            optimizer.step()

            batch_time.update(time.time() - end_time)
            end_time = time.time()

            '''
            if i % 5000 == 0 and i:
                logger.debug('Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                                 epoch, i, len(data), batch_time=batch_time,
                                 data_time=data_time, loss=losses))
                # print(time.time() - start_time)
                # start_time = time.time()
            '''

        logger.debug('duration: %f' % (time.time() - time_epoch))
        logger.debug('train_err: %f' % (1 - ((train_acc_epoch.cpu()).numpy() /
                                             (len(data) * config["batch_size"]))))
        logger.debug('loss: %f' % losses.avg)
        losses.reset()

        if epoch % 1 == 0 and config["save_model"]:
            save_dict = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            dir_check(config["model_folder"])
            logger.debug('Saving model to: %s' %
                         ops.join(config["model_folder"], '%s%d.pth.tar' % (config["log_str"], epoch)))
            torch.save(save_dict,
                       ops.join(config["model_folder"], '%s%d.pth.tar' % (config["log_str"], epoch)))
            logger.debug('Saving probs to: %s' %
                         ops.join(config["model_folder"], '%s%d.probs' % (config["log_str"], epoch)))
            data.dataset.save_probs(
                ops.join(config["model_folder"], '%s%d.probs' % (config["log_str"], epoch)))

        if config["sparse"]:
            dataset.next_epoch()
            data = create_dataloader(dataset)

    if config["save_model"]:
        save_dict = {
            'epoch': config["epochs"],
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        dir_check(config["model_folder"])
        logger.debug('Saving model to: %s' %
                     ops.join(config["model_folder"], '%s%d.pth.tar' % (config["log_str"], epoch)))
        torch.save(save_dict,
                   ops.join(config["model_folder"], '%s%d.pth.tar' % (config["log_str"], epoch)))
        logger.debug('Saving probs to: %s' %
                     ops.join(config["model_folder"], '%s%d.probs' % (config["log_str"], epoch)))
        data.dataset.save_probs(
            ops.join(config["model_folder"], '%s%d.probs' % (config["log_str"], epoch)))
    return model
def train_vae(args, dtype=torch.float32):
    torch.set_default_dtype(dtype)
    state_dim = args.state_dim
    output_path = args.output_path

    # generate state pairs
    expert_traj_raw = list(pickle.load(open(args.expert_traj_path, "rb")))
    state_pairs = generate_pairs(expert_traj_raw, state_dim, args.size_per_traj,
                                 max_step=10, min_step=5)  # tune the step size if needed

    # shuffle and split
    idx = np.arange(state_pairs.shape[0])
    np.random.shuffle(idx)
    state_pairs = state_pairs[idx, :]
    split = (state_pairs.shape[0] * 19) // 20
    state_tuples = state_pairs[:split, :]
    test_state_tuples = state_pairs[split:, :]
    print(state_tuples.shape)
    print(test_state_tuples.shape)

    goal_model = VAE(state_dim, latent_dim=128)
    optimizer_vae = torch.optim.Adam(goal_model.parameters(), lr=args.model_lr)

    save_path = '{}_softbc_{}_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
                                         args.env_name, args.beta)
    writer = SummaryWriter(log_dir=os.path.join(output_path, 'runs/' + save_path))

    if args.weight:
        state_dim = state_dim + 1
        state_tuples = torch.from_numpy(state_pairs).to(dtype)
        s, t = state_tuples[:, :state_dim - 1], state_tuples[:, state_dim:2 * state_dim]
        state_tuples_test = torch.from_numpy(test_state_tuples).to(dtype)
        s_test, t_test = state_tuples_test[:, :state_dim - 1], state_tuples_test[:, state_dim:2 * state_dim]
    else:
        state_tuples = torch.from_numpy(state_pairs).to(dtype)
        s, t = state_tuples[:, :state_dim], state_tuples[:, state_dim:2 * state_dim]
        state_tuples_test = torch.from_numpy(test_state_tuples).to(dtype)
        s_test, t_test = state_tuples_test[:, :state_dim], state_tuples_test[:, state_dim:2 * state_dim]

    for i in range(1, args.iter + 1):
        loss = goal_model.train(s, t, epoch=args.epoch, optimizer=optimizer_vae,
                                batch_size=args.optim_batch_size, beta=args.beta,
                                use_weight=args.weight)
        next_states = goal_model.get_next_states(s_test)
        if args.weight:
            val_error = (t_test[:, -1].unsqueeze(1) * (t_test[:, :-1] - next_states) ** 2).mean()
        else:
            val_error = ((t_test[:, :-1] - next_states) ** 2).mean()
        writer.add_scalar('loss/vae', loss, i)
        writer.add_scalar('valid/vae', val_error, i)
        if i % args.lr_decay_rate == 0:
            adjust_lr(optimizer_vae, 2.)

    torch.save(goal_model.state_dict(),
               os.path.join(output_path, '{}_{}_vae.pt'.format(args.env_name, str(args.beta))))
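# NOTE: in `train_vae` above, `adjust_lr(optimizer_vae, 2.)` is called with just an optimizer
# and a factor, unlike the epoch-based variant used elsewhere in this section. It is assumed
# to divide the current learning rate by that factor (hypothetical sketch):
def adjust_lr(optimizer, factor):
    for param_group in optimizer.param_groups:
        param_group['lr'] /= factor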
def train(args, model, optimizer, dataloader_train, total):
    # Dicedict = {'CVC-300': [], 'CVC-ClinicDB': [], 'Kvasir': [], 'CVC-ColonDB': [],
    #             'ETIS-LaribPolypDB': [], 'test': []}
    Dicedict = {"CVC-ClinicDB-612-Test": [], "CVC-ClinicDB-612-Valid": [],
                "CVC-ColonDB-300": [], 'test': []}
    best_dice = 0
    best_epo = 0
    BCE = torch.nn.BCEWithLogitsLoss()
    criterion = u.BceDiceLoss()

    for epoch in range(1, args.num_epochs + 1):
        u.adjust_lr(optimizer, args.lr, epoch, args.decay_rate, args.decay_epoch)
        size_rates = [0.75, 1, 1.25]  # replace with your desired scales; try larger scales for better accuracy on small objects
        model.train()
        loss_record = []
        loss_record1, loss_record2, loss_record3, loss_record4, loss_record5 = \
            u.AvgMeter(), u.AvgMeter(), u.AvgMeter(), u.AvgMeter(), u.AvgMeter()

        for i, (data, label) in enumerate(dataloader_train, start=1):
            for rate in size_rates:
                # data preparation
                if torch.cuda.is_available() and args.use_gpu:
                    data = Variable(data).cuda()
                    label = Variable(label).cuda()
                    # edgs = Variable(edgs).cuda()

                # rescale
                trainsize = int(round(args.trainsize * rate / 32) * 32)
                if rate != 1:
                    data = F.upsample(data, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                    label = F.upsample(label, size=(trainsize, trainsize), mode='bilinear', align_corners=True)
                    # edgs = F.upsample(edgs, size=(trainsize, trainsize), mode='bilinear', align_corners=True)

                # standard three training steps
                optimizer.zero_grad()
                prediction1, prediction2 = model(data)
                # compute the loss
                loss = u.bce_dice(prediction1, label) + u.bce_dice(prediction2, label)
                loss.backward()
                u.clip_gradient(optimizer, args.clip)
                optimizer.step()
                loss_record.append(loss.item())

            # ---- train visualization ----
            if i % 20 == 0 or i == total:
                loss_train_mean = np.mean(loss_record)
                print('{} Epoch [{:03d}/{:03d}], Step [{:04d}/{:04d}], '
                      '[loss for train : {:.4f}]'.format(datetime.now(), epoch, args.num_epochs,
                                                         i, len(dataloader_train), loss_train_mean))

        if (epoch + 1) % 1 == 0:
            for dataset in args.testdataset:
                # for dataset in ['CVC-300', 'CVC-ClinicDB', 'Kvasir', 'CVC-ColonDB', 'ETIS-LaribPolypDB']:
                dataset_dice = valid(model, dataset, args)
                print("dataset:{},Dice:{:.4f}".format(dataset, dataset_dice))
                Dicedict[dataset].append(dataset_dice)
            meandice = valid(model, 'test', args)
            print("dataset:{},Dice:{:.4f}".format("test", meandice))
            Dicedict['test'].append(meandice)
            if meandice > best_dice:
                best_dice = meandice
                best_epo = epoch
                checkpoint_dir = "./checkpoint"
                filename = 'model_{}_{:03d}_{:.4f}.pth.tar'.format(args.net_work, epoch, best_dice)
                checkpointpath = os.path.join(checkpoint_dir, filename)
                torch.save(model.state_dict(), checkpointpath)
                print('############# Saving best ##########################################BestAvgDice:{}'.format(best_dice))
            print('bestepo:{:03d} ,bestdice :{:.4f}'.format(best_epo, best_dice))
def main():
    cmd_ls = sys.argv[1:]
    cmd = generate_cmd(cmd_ls)
    if "--freeze_bn False" in cmd:
        opt.freeze_bn = False
    if "--addDPG False" in cmd:
        opt.addDPG = False

    print("----------------------------------------------------------------------------------------------------")
    print("This is the model with id {}".format(save_ID))
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    dataset_str = ""
    for k, v in config.train_info.items():
        dataset_str += k
        dataset_str += ","
    print("Training data is: {}".format(dataset_str[:-1]))
    print("Warm up end at {}".format(warm_up_epoch))
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print("----------------------------------------------------------------------------------------------------")

    exp_dir = os.path.join("exp/{}/{}".format(folder, save_ID))
    log_dir = os.path.join(exp_dir, "{}".format(save_ID))
    os.makedirs(log_dir, exist_ok=True)
    log_name = os.path.join(log_dir, "{}.txt".format(save_ID))
    train_log_name = os.path.join(log_dir, "{}_train.xlsx".format(save_ID))
    bn_file = os.path.join(log_dir, "{}_bn.txt".format(save_ID))

    # Prepare Dataset
    # Model Initialize
    if device != "cpu":
        m = createModel(cfg=model_cfg).cuda()
    else:
        m = createModel(cfg=model_cfg).cpu()
    print(m, file=open("model.txt", "w"))

    begin_epoch = 0
    pre_train_model = opt.loadModel
    flops = print_model_param_flops(m)
    print("FLOPs of current model is {}".format(flops))
    params = print_model_param_nums(m)
    print("Parameters of current model is {}".format(params))
    inf_time = get_inference_time(m, height=opt.outputResH, width=opt.outputResW)
    print("Inference time is {}".format(inf_time))
    print("----------------------------------------------------------------------------------------------------")

    if opt.freeze > 0 or opt.freeze_bn:
        if opt.backbone == "mobilenet":
            feature_layer_num = 155
            feature_layer_name = "features"
        elif opt.backbone == "seresnet101":
            feature_layer_num = 327
            feature_layer_name = "preact"
        elif opt.backbone == "seresnet18":
            feature_layer_num = 75
            feature_layer_name = "seresnet18"
        elif opt.backbone == "shufflenet":
            feature_layer_num = 167
            feature_layer_name = "shuffle"
        else:
            raise ValueError("Not a correct name")

        feature_num = int(opt.freeze * feature_layer_num)
        for idx, (n, p) in enumerate(m.named_parameters()):
            if len(p.shape) == 1 and opt.freeze_bn:
                p.requires_grad = False
            elif feature_layer_name in n and idx < feature_num:
                p.requires_grad = False
            else:
                p.requires_grad = True

    writer = SummaryWriter('exp/{}/{}'.format(folder, save_ID), comment=cmd)

    if device != "cpu":
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True).cuda()
        rnd_inps = torch.rand(3, 3, 224, 224).cuda()
    else:
        rnd_inps = torch.rand(3, 3, 224, 224)
        # rnd_inps = Variable(torch.rand(3, 3, 224, 224), requires_grad=True)
    try:
        writer.add_graph(m, (rnd_inps,))
    except:
        pass

    shuffle_dataset = False
    for k, v in config.train_info.items():
        if k not in open_source_dataset:
            shuffle_dataset = True

    train_dataset = MyDataset(config.train_info, train=True)
    val_dataset = MyDataset(config.train_info, train=False)
    if shuffle_dataset:
        val_dataset.img_val, val_dataset.bbox_val, val_dataset.part_val = \
            train_dataset.img_val, train_dataset.bbox_val, train_dataset.part_val

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=opt.trainBatch, shuffle=True,
                                               num_workers=opt.train_worker, pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=opt.validBatch, shuffle=True,
                                             num_workers=opt.val_worker, pin_memory=True)

    # for k, v in config.train_info.items():
    #     train_dataset = Mscoco([v[0], v[1]], train=True, val_img_num=v[2])
    #     val_dataset = Mscoco([v[0], v[1]], train=False, val_img_num=v[2])
    #
    #     train_loaders[k] = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True,
    #         num_workers=config.train_mum_worker, pin_memory=True)
    #
    #     val_loaders[k] = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False,
    #         num_workers=config.val_num_worker, pin_memory=True)
    #
    #     train_loader = torch.utils.data.DataLoader(
    #         train_dataset, batch_size=config.train_batch, shuffle=True,
    #         num_workers=config.train_mum_worker, pin_memory=True)
    #     val_loader = torch.utils.data.DataLoader(
    #         val_dataset, batch_size=config.val_batch, shuffle=False,
    #         num_workers=config.val_num_worker, pin_memory=True)
    #
    # assert train_loaders != {}, "Your training data has not been specific! "

    os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)

    if pre_train_model:
        if "duc_se.pth" not in pre_train_model:
            if "pretrain" not in pre_train_model:
                try:
                    info_path = os.path.join("exp", folder, save_ID, "option.pkl")
                    info = torch.load(info_path)
                    opt.trainIters = info.trainIters
                    opt.valIters = info.valIters
                    begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                except:
                    # begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                    with open(log_name, "a+") as f:
                        f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
        else:
            with open(log_name, "a+") as f:
                f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
            m.conv_out = nn.Conv2d(m.DIM, opt.kps, kernel_size=3, stride=1, padding=1)
            if device != "cpu":
                m.conv_out.cuda()
        os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    else:
        print('Create new model')
        with open(log_name, "a+") as f:
            f.write(cmd)
            print(opt, file=f)
            f.write("FLOPs of current model is {}\n".format(flops))
            f.write("Parameters of current model is {}\n".format(params))

    with open(os.path.join(log_dir, "tb.py"), "w") as pyfile:
        pyfile.write("import os\n")
        pyfile.write("os.system('conda init bash')\n")
        pyfile.write("os.system('conda activate py36')\n")
        pyfile.write("os.system('tensorboard --logdir=../../../../exp/{}/{}')".format(folder, save_ID))

    params_to_update, layers = [], 0
    for name, param in m.named_parameters():
        layers += 1
        if param.requires_grad:
            params_to_update.append(param)
    print("Training {} layers out of {}".format(len(params_to_update), layers))

    if optimize == 'rmsprop':
        optimizer = torch.optim.RMSprop(params_to_update, lr=opt.LR, momentum=opt.momentum,
                                        weight_decay=opt.weightDecay)
    elif optimize == 'adam':
        optimizer = torch.optim.Adam(params_to_update, lr=opt.LR, weight_decay=opt.weightDecay)
    elif optimize == 'sgd':
        optimizer = torch.optim.SGD(params_to_update, lr=opt.LR, momentum=opt.momentum,
                                    weight_decay=opt.weightDecay)
    else:
        raise Exception

    if mix_precision:
        m, optimizer = amp.initialize(m, optimizer, opt_level="O1")

    # Model Transfer
    if device != "cpu":
        m = torch.nn.DataParallel(m).cuda()
        criterion = torch.nn.MSELoss().cuda()
    else:
        m = torch.nn.DataParallel(m)
        criterion = torch.nn.MSELoss()

    # loss, acc = valid(val_loader, m, criterion, optimizer, writer)
    # print('Valid:-{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f}'.format(
    #     idx=-1, loss=loss, acc=acc))

    # early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    train_acc, val_acc, train_loss, val_loss, best_epoch, train_dist, val_dist, train_auc, val_auc, train_PR, val_PR = \
        0, 0, float("inf"), float("inf"), 0, float("inf"), float("inf"), 0, 0, 0, 0
    train_acc_ls, val_acc_ls, train_loss_ls, val_loss_ls, train_dist_ls, val_dist_ls, train_auc_ls, val_auc_ls, \
        train_pr_ls, val_pr_ls, epoch_ls, lr_ls = [], [], [], [], [], [], [], [], [], [], [], []
    decay, decay_epoch, lr, i = 0, [], opt.LR, begin_epoch
    stop = False
    m_best = m

    train_log = open(train_log_name, "w", newline="")
    bn_log = open(bn_file, "w")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())
    begin_time = time.time()

    os.makedirs("result", exist_ok=True)
    result = os.path.join("result", "{}_result_{}.csv".format(opt.expFolder, config.computer))
    exist = os.path.exists(result)

    # Start Training
    try:
        for i in range(opt.nEpochs)[begin_epoch:]:
            opt.epoch = i
            epoch_ls.append(i)
            train_log_tmp = [save_ID, i, lr]

            log = open(log_name, "a+")
            print('############# Starting Epoch {} #############'.format(i))
            log.write('############# Starting Epoch {} #############\n'.format(i))

            if i < warm_up_epoch:
                optimizer, lr = warm_up_lr(optimizer, i)
            else:
                optimizer, lr = adjust_lr(optimizer, i, lr_dict, opt.nEpochs)
            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = \
                train(train_loader, m, criterion, optimizer, writer)
            train_log_tmp.append(" ")
            train_log_tmp.append(loss)
            train_log_tmp.append(acc.tolist())
            train_log_tmp.append(dist.tolist())
            train_log_tmp.append(auc)
            train_log_tmp.append(pr)
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            train_acc_ls.append(acc)
            train_loss_ls.append(loss)
            train_dist_ls.append(dist)
            train_auc_ls.append(auc)
            train_pr_ls.append(pr)
            train_acc = acc if acc > train_acc else train_acc
            train_loss = loss if loss < train_loss else train_loss
            train_dist = dist if dist < train_dist else train_dist
            train_auc = auc if auc > train_auc else train_auc
            train_PR = pr if pr > train_PR else train_PR

            log.write('Train:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'.format(
                idx=i, loss=loss, acc=acc, dist=dist, AUC=auc, PR=pr))

            opt.acc = acc
            opt.loss = loss
            m_dev = m.module

            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = valid(val_loader, m, criterion, writer)
            train_log_tmp.insert(9, loss)
            train_log_tmp.insert(10, acc.tolist())
            train_log_tmp.insert(11, dist.tolist())
            train_log_tmp.insert(12, auc)
            train_log_tmp.insert(13, pr)
            train_log_tmp.insert(14, " ")
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            val_acc_ls.append(acc)
            val_loss_ls.append(loss)
            val_dist_ls.append(dist)
            val_auc_ls.append(auc)
            val_pr_ls.append(pr)
            if acc > val_acc:
                best_epoch = i
                val_acc = acc
                torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_best_acc.pkl'.format(folder, save_ID))
                m_best = copy.deepcopy(m)
            val_loss = loss if loss < val_loss else val_loss
            if dist < val_dist:
                val_dist = dist
                torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_best_dist.pkl'.format(folder, save_ID))
            if auc > val_auc:
                val_auc = auc
                torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_best_auc.pkl'.format(folder, save_ID))
            if pr > val_PR:
                val_PR = pr
                torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_best_pr.pkl'.format(folder, save_ID))

            log.write('Valid:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'.format(
                idx=i, loss=loss, acc=acc, dist=dist, AUC=auc, PR=pr))

            bn_sum, bn_num = 0, 0
            for mod in m.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight", mod.weight.data.cpu().numpy(), i)
            bn_ave = bn_sum / bn_num
            bn_log.write("{} --> {}".format(i, bn_ave))
            print("Current bn : {} --> {}".format(i, bn_ave))
            bn_log.write("\n")

            log.close()
            csv_writer.writerow(train_log_tmp)

            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))
            lr_ls.append(lr)

            torch.save(opt, 'exp/{}/{}/option.pkl'.format(folder, save_ID, i))
            if i % opt.save_interval == 0 and i != 0:
                torch.save(m_dev.state_dict(), 'exp/{0}/{1}/{1}_{2}.pkl'.format(folder, save_ID, i))
                # torch.save(optimizer, 'exp/{}/{}/optimizer.pkl'.format(dataset, save_folder))

            for epo, ac in config.bad_epochs.items():
                if i == epo and val_acc < ac:
                    stop = True
            if stop:
                print("Training finished at epoch {}".format(i))
                break

        training_time = time.time() - begin_time
        writer.close()
        train_log.close()

        draw_graph(epoch_ls, train_loss_ls, val_loss_ls, "loss", log_dir)
        draw_graph(epoch_ls, train_acc_ls, val_acc_ls, "acc", log_dir)
        draw_graph(epoch_ls, train_auc_ls, val_auc_ls, "AUC", log_dir)
        draw_graph(epoch_ls, train_dist_ls, val_dist_ls, "dist", log_dir)
        draw_graph(epoch_ls, train_pr_ls, val_pr_ls, "PR", log_dir)

        with open(result, "a+") as f:
            if not exist:
                title_str = "id,backbone,structure,DUC,params,flops,time,loss_param,addDPG,kps,batch_size,optimizer," \
                            "freeze_bn,freeze,sparse,sparse_decay,epoch_num,LR,Gaussian,thresh,weightDecay,loadModel," \
                            "model_location, ,folder_name,training_time,train_acc,train_loss,train_dist,train_AUC," \
                            "train_PR,val_acc,val_loss,val_dist,val_AUC,val_PR,best_epoch,final_epoch"
                title_str = write_decay_title(len(decay_epoch), title_str)
                f.write(title_str)
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".format(
                save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                os.path.join(folder, save_ID), training_time, train_acc, train_loss, train_dist, train_auc, train_PR,
                val_acc, val_loss, val_dist, val_auc, val_PR, best_epoch, i)
            info_str = write_decay_info(decay_epoch, info_str)
            f.write(info_str)
    except IOError:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n".format(
                save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                os.path.join(folder, save_ID), training_time, "Some file is closed")
            f.write(info_str)
    except ZeroDivisionError:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n".format(
                save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                os.path.join(folder, save_ID), training_time, "Gradient flow")
            f.write(info_str)
    except KeyboardInterrupt:
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n".format(
                save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                os.path.join(folder, save_ID), training_time, "Be killed by someone")
            f.write(info_str)

    print("Model {} training finished".format(save_ID))
    print("----------------------------------------------------------------------------------------------------")
def training(train_loader, epochs, n_subact=0, save=True, **kwargs):
    """Training pipeline for embedding.

    Args:
        train_loader: iterator within dataset
        epochs: how many training epochs to perform
        n_subact: number of subactions in current complex activity
        mnist: if training with mnist dataset (just to test how well everything works)
    Returns:
        trained pytorch model
    """
    logger.debug('create model')
    torch.manual_seed(opt.seed)
    np.random.seed(opt.seed)
    try:
        model = kwargs['model']
        loss = kwargs['loss']
        optimizer = kwargs['optimizer']
    except KeyError:
        model = Embedding(embed_dim=opt.embed_dim, feature_dim=opt.feature_dim, n_subact=n_subact).cuda()
        loss = RankLoss(margin=0.2).cuda()
        optimizer = torch.optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum,
                                    weight_decay=opt.weight_decay)
    cudnn.benchmark = True

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    vis = Visual()
    best_acc = -1

    _lr = opt.lr
    logger.debug('epochs: %s', epochs)
    loss_previous = np.inf
    for epoch in range(epochs):
        model.cuda()
        model.train()

        logger.debug('Epoch # %d' % epoch)
        if opt.lr_adj:
            # if epoch in [int(epochs * 0.3), int(epochs * 0.7)]:
            # if epoch in [int(epochs * 0.5)]:
            if epoch % 30 == 0 and epoch > 0:
                _lr = adjust_lr(optimizer, _lr)
                logger.debug('lr: %f' % _lr)
        end = time.time()
        for i, (input, k, _) in enumerate(train_loader):
            # TODO: not sure that it's necessary
            data_time.update(time.time() - end)
            input = input.float().cuda(non_blocking=True)
            k = k.float().cuda()

            output = model(input)
            loss_values = loss(output, k)
            losses.update(loss_values.item(), input.size(0))

            optimizer.zero_grad()
            loss_values.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0 and i:
                logger.debug('Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                                 epoch, i, len(train_loader), batch_time=batch_time,
                                 data_time=data_time, loss=losses))
        logger.debug('loss: %f' % losses.avg)
        losses.reset()

    if save:
        save_dict = {
            'epoch': epoch,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        dir_check(join(opt.dataset_root, 'models'))
        dir_check(join(opt.dataset_root, 'models', kwargs['name']))
        torch.save(save_dict, join(opt.dataset_root, 'models', kwargs['name'], '%s.pth.tar' % opt.log_str))
    return model
def training(dataloader, **kwargs):
    """Training pipeline for embedding.

    Args:
        dataloader: iterator within dataset
        epochs: how many training epochs to perform
        n_subact: number of subactions in current complex activity
        mnist: if training with mnist dataset (just to test how well everything works)
    Returns:
        trained pytorch model
    """
    logger.debug('create model')
    torch.manual_seed(opt.seed)

    model = kwargs['model']
    loss = kwargs['loss']
    optimizer = kwargs['optimizer']

    cudnn.benchmark = True

    batch_time = Averaging()
    data_time = Averaging()
    losses = Averaging()

    adjustable_lr = opt.lr

    logger.debug('epochs: %s', opt.epochs)
    for epoch in range(opt.epochs):
        model.train()

        logger.debug('Epoch # %d' % epoch)
        if opt.lr_adj:
            if epoch % 5 == 0 and epoch > 0:
                adjustable_lr = adjust_lr(optimizer, adjustable_lr)
                logger.debug('lr: %f' % adjustable_lr)
        end = time.time()
        for i, (features, labels) in enumerate(dataloader):
            data_time.update(time.time() - end)
            features = features.cuda(non_blocking=True)
            # features = features.float().cuda(non_blocking=True)
            labels = labels.long().cuda()

            output = model(features)
            loss_values = loss(output, labels)
            losses.update(loss_values.item(), labels.size(0))

            optimizer.zero_grad()
            loss_values.backward()
            optimizer.step()

            batch_time.update(time.time() - end)
            end = time.time()

            if i % 100 == 0 and i:
                logger.debug('Epoch: [{0}][{1}/{2}]\t'
                             'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                             'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                             'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                                 epoch, i, len(dataloader), batch_time=batch_time,
                                 data_time=data_time, loss=losses))
        logger.debug('loss: %f' % losses.avg)
        losses.reset()

        if epoch % 1 == 0:
            save_dict = {
                'epoch': epoch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            dir_check(ops.join(opt.dataset_root, 'models'))
            torch.save(save_dict, ops.join(opt.dataset_root, 'models', '%s%d.pth.tar' % (opt.log_str, epoch)))

    if opt.save_model:
        save_dict = {
            'epoch': opt.epochs,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        dir_check(ops.join(opt.dataset_root, 'models'))
        torch.save(save_dict, ops.join(opt.dataset_root, 'models', '%s%d.pth.tar' % (opt.log_str, opt.epochs)))
    return model
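# NOTE: in the embedding training loops above, `adjust_lr(optimizer, adjustable_lr)` is called
# with the current learning rate and its return value becomes the new one. The helper is not
# shown here; it is assumed to scale the rate by a fixed factor, e.g. 0.1 (sketch only):
def adjust_lr(optimizer, lr, decay_factor=0.1):
    lr = lr * decay_factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return lr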
f = open("pranet_logs.txt", "w") for epoch in range(100): print('\nEpoch: {}'.format(epoch)) f.write('\nEpoch: {} \n'.format(epoch)) # Training metric_meter_activated.reset() metric_meter_activated2.reset() for i in range(len(loss_meters)): loss_meters[i].reset() progress = tqdm(train_loader, total=len(train_loader)) models = [model.train() for model in models] [adjust_lr(optimizers[i], 1e-4, epoch, 0.1, 50) for i in range(3)] for step, (x, y) in enumerate(progress): outputs = [] for i in range(3): y_pred, loss_value = model_update(models[i], optimizers[i], losses[i], x, y[:, i, :, :]) loss_meters[i].update(loss_value) outputs.append(y_pred) y_pred = torch.stack(outputs, dim=1).squeeze() if len(y_pred.shape) != 4: y_pred = y_pred.unsqueeze(0)