def train(loader, model, epochs=5, batch_size=2, show_loss=False, augmenter=None, lr=None, init_lr=2e-4,
          saver=None, variables_to_optimize=None, evaluation=True,
          name_best_model='/root/Ev-SegNet-old/weights/model/best', preprocess_mode=None):
    """Train a segmentation model in TF eager mode.

    Each epoch decays the learning rate, runs one pass over the training
    set with an auxiliary-loss-weighted cross entropy, optionally evaluates
    mIoU on the test split, and checkpoints via `saver` (the best model when
    evaluating, every epoch otherwise).
    """
    n_train = len(loader.image_train_list)
    batches_per_epoch = int((n_train / batch_size) + 1)
    best_miou = 0

    for epoch in range(epochs):
        # Anneal the learning rate for this epoch before any batches run.
        lr_decay(lr, init_lr, 1e-9, epoch, epochs - 1)
        print('epoch: ' + str(epoch) + '. Learning rate: ' + str(lr.numpy()))

        for _ in range(batches_per_epoch):
            with tf.GradientTape() as tape:
                # Fetch and preprocess one augmented training batch.
                x, y, mask = loader.get_batch(size=batch_size, train=True, augmenter=augmenter)
                x = preprocess(x, mode=preprocess_mode)
                x, y, mask = convert_to_tensors([x, y, mask])

                # Forward pass; the model also returns an auxiliary head.
                y_, aux_y_ = model(x, training=True, aux_loss=True)
                main_loss = tf.losses.softmax_cross_entropy(y, y_, weights=mask)
                aux_loss = tf.losses.softmax_cross_entropy(y, aux_y_, weights=mask)
                loss = 1 * main_loss + 0.8 * aux_loss  # auxiliary head weighted 0.8
                if show_loss:
                    print('Training loss: ' + str(loss.numpy()))

            # Backward pass and parameter update.
            grads = tape.gradient(loss, variables_to_optimize)
            optimizer.apply_gradients(zip(grads, variables_to_optimize))

        if evaluation:
            # Evaluate on the test split (single scale, no flip inference).
            test_acc, test_miou = get_metrics(loader, model, loader.n_classes, train=False,
                                              flip_inference=False, scales=[1],
                                              preprocess_mode=preprocess_mode)
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou))
            print('Best miou: ' + str(best_miou))
            print('')

            # Keep only the best-scoring checkpoint.
            if test_miou > best_miou:
                best_miou = test_miou
                saver.save(name_best_model)
        else:
            # No evaluation requested: checkpoint unconditionally.
            saver.save(name_best_model)

        loader.suffle_segmentation()  # shuffle the training set every epoch (loader API spells it this way)
def train(data_loader, model_pos, criterion, optimizer, device, lr_init, lr_now, step, decay, gamma, max_norm=True):
    """Run one training epoch of the 3D pose model.

    Root-centers the 3D targets (subtracts joint 0), steps the optimizer per
    batch with optional gradient clipping, and decays the learning rate every
    `decay` global steps. Returns (average 3D loss, current lr, global step).
    """
    batch_timer = AverageMeter()
    load_timer = AverageMeter()
    loss_meter = AverageMeter()

    # Put the model in training mode with gradients enabled.
    torch.set_grad_enabled(True)
    model_pos.train()

    tick = time.time()
    progress = Bar('Train', max=len(data_loader))
    for batch_idx, (targets_3d, inputs_2d, _, _) in enumerate(data_loader):
        load_timer.update(time.time() - tick)  # time spent waiting on data
        poses_in_batch = targets_3d.size(0)

        # Step-based learning-rate schedule.
        step += 1
        if step % decay == 0 or step == 1:
            lr_now = lr_decay(optimizer, step, lr_init, decay, gamma)

        targets_3d = targets_3d.to(device)
        inputs_2d = inputs_2d.to(device)
        # Express targets relative to the root (joint 0).
        targets_3d = targets_3d - targets_3d[:, :1, :]

        outputs_3d = model_pos(inputs_2d)

        optimizer.zero_grad()
        loss_3d_pos = criterion(outputs_3d, targets_3d)
        loss_3d_pos.backward()
        if max_norm:
            # Clip gradients to unit norm for stability.
            nn.utils.clip_grad_norm_(model_pos.parameters(), max_norm=1)
        optimizer.step()

        loss_meter.update(loss_3d_pos.item(), poses_in_batch)

        batch_timer.update(time.time() - tick)
        tick = time.time()

        progress.suffix = '({batch}/{size}) Data: {data:.6f}s | Batch: {bt:.3f}s | Total: {ttl:} | ETA: {eta:} ' \
                          '| Loss: {loss: .4f}' \
            .format(batch=batch_idx + 1, size=len(data_loader), data=load_timer.avg,
                    bt=batch_timer.avg, ttl=progress.elapsed_td, eta=progress.eta_td,
                    loss=loss_meter.avg)
        progress.next()
    progress.finish()

    return loss_meter.avg, lr_now, step
def main(opt):
    """Train and evaluate a GCN motion-prediction model on the 3DPW dataset.

    Builds the model (optionally resuming from a fixed checkpoint path),
    loads train/test/val splits, then per epoch: trains, validates, tests,
    appends one row to a CSV experiment log, and saves best/last checkpoints.

    Args:
        opt: parsed options; must expose lr, input_n, output_n, dct_n,
            linear_size, dropout, num_stage, is_load, data_dir_3dpw,
            train_batch, test_batch, job, epochs, lr_decay, lr_gamma,
            max_norm and ckpt.
    """
    start_epoch = 0
    err_best = 10000  # best validation error so far; lower is better
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()
    # Experiment name encodes the input/output horizon and DCT size.
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + '_in{:d}_out{:d}_dctn_{:d}'.format(
        opt.input_n, opt.output_n, opt.dct_n)

    # create model
    print(">>> creating model")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    # node_n=69: graph-node count for 3DPW — presumably 23 joints x 3 coords; TODO confirm
    model = nnmodel.GCN(input_feature=dct_n, hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout, num_stage=opt.num_stage, node_n=69)
    if is_cuda:
        model.cuda()
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # Optionally resume from a hard-coded "last" checkpoint path.
    if opt.is_load:
        model_path_len = 'checkpoint/test/ckpt_main_last.pth.tar'
        print(">>> loading ckpt len from '{}'".format(model_path_len))
        if is_cuda:
            ckpt = torch.load(model_path_len)
        else:
            ckpt = torch.load(model_path_len, map_location='cpu')
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print(">>> ckpt len loaded (epoch: {} | err: {})".format(
            start_epoch, err_best))

    # data loading: splits 0/1/2 are train/test/val respectively
    print(">>> loading data")
    train_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw, input_n=input_n,
                             output_n=output_n, dct_n=dct_n, split=0)
    dim_used = train_dataset.dim_used
    test_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw, input_n=input_n,
                            output_n=output_n, dct_n=dct_n, split=1)
    val_dataset = Pose3dPW(path_to_data=opt.data_dir_3dpw, input_n=input_n,
                           output_n=output_n, dct_n=dct_n, split=2)
    # load datasets for training
    train_loader = DataLoader(dataset=train_dataset, batch_size=opt.train_batch,
                              shuffle=True, num_workers=opt.job, pin_memory=True)
    test_loader = DataLoader(dataset=test_dataset, batch_size=opt.test_batch,
                             shuffle=False, num_workers=opt.job, pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=opt.test_batch,
                            shuffle=False, num_workers=opt.job, pin_memory=True)
    print(">>> data loaded !")
    print(">>> train data {}".format(train_dataset.__len__()))
    print(">>> test data {}".format(test_dataset.__len__()))
    print(">>> validation data {}".format(val_dataset.__len__()))

    for epoch in range(start_epoch, opt.epochs):
        # Multiplicative lr decay every opt.lr_decay epochs.
        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        # ret_log / head accumulate one CSV row (values) and its column names.
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l, t_err = train(train_loader, model, optimizer, input_n=input_n,
                                   dct_n=dct_n, dim_used=dim_used, lr_now=lr_now,
                                   max_norm=opt.max_norm, is_cuda=is_cuda)
        ret_log = np.append(ret_log, [lr_now, t_l, t_err])
        head = np.append(head, ['lr', 't_l', 't_err'])
        v_err = val(val_loader, model, input_n=input_n, dct_n=dct_n,
                    dim_used=dim_used, is_cuda=is_cuda)
        ret_log = np.append(ret_log, v_err)
        head = np.append(head, ['v_err'])
        test_3d = test(test_loader, model, input_n=input_n, output_n=output_n,
                       dct_n=dct_n, dim_used=dim_used, is_cuda=is_cuda)
        # ret_log = np.append(ret_log, test_l)
        ret_log = np.append(ret_log, test_3d)
        # Column names for the per-horizon test errors (ms-style labels).
        if output_n == 15:
            head = np.append(head, ['1003d', '2003d', '3003d', '4003d', '5003d'])
        elif output_n == 30:
            head = np.append(head, [
                '1003d', '2003d', '3003d', '4003d', '5003d', '6003d', '7003d',
                '8003d', '9003d', '10003d'
            ])
        # update log file: write header only on the first row of this run
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv', header=head, index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)
        # save ckpt; "best" is judged on validation error
        is_best = v_err < err_best
        err_best = min(v_err, err_best)
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        # NOTE(review): 'err' stores the first TEST metric while is_best uses
        # v_err, so err_best restored on resume is a test error — confirm intended.
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_3d[0],
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
def main(opt):
    """Train and evaluate a GCN motion-prediction model on H3.6M 3D data.

    Builds the model (optionally resuming from the last checkpoint), loads
    the train/val splits plus one test loader per action, then per epoch:
    trains, validates, tests every action, appends one row to a CSV
    experiment log, and saves best/last checkpoints.

    Args:
        opt: parsed options; must expose lr, input_n, output_n, dct_n,
            sample_rate, linear_size, dropout, num_stage, is_load, data_dir,
            train_batch, test_batch, job, epochs, lr_decay, lr_gamma,
            max_norm and ckpt.

    Fix vs. original: removed the dead `test_3d_temp`/`test_3d_head`
    accumulators — they were initialized but never filled (per-action test
    results are appended to `ret_log`/`head` directly), so the trailing
    appends were no-ops.
    """
    start_epoch = 0
    err_best = 10000  # best validation error so far; lower is better
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()

    # save option in log: experiment name encodes horizons and DCT size
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + '_3D_in{:d}_out{:d}_dct_n_{:d}'.format(
        opt.input_n, opt.output_n, opt.dct_n)

    # create model
    print(">>> creating model")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    sample_rate = opt.sample_rate
    # node_n=66: presumably 22 joints x 3 coords for H3.6M — TODO confirm
    model = nnmodel.GCN(input_feature=dct_n, hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout, num_stage=opt.num_stage, node_n=66)
    if is_cuda:
        model.cuda()
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    # Optionally resume training from this experiment's "last" checkpoint.
    if opt.is_load:
        model_path_len = 'checkpoint/test/' + 'ckpt_' + script_name + '_last.pth.tar'
        print(">>> loading ckpt len from '{}'".format(model_path_len))
        if is_cuda:
            ckpt = torch.load(model_path_len)
        else:
            ckpt = torch.load(model_path_len, map_location='cpu')
        start_epoch = ckpt['epoch']
        err_best = ckpt['err']
        lr_now = ckpt['lr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print(">>> ckpt len loaded (epoch: {} | err: {})".format(
            start_epoch, err_best))

    # data loading: train/val over all actions, one test loader per action
    print(">>> loading data")
    train_dataset = H36motion3D(path_to_data=opt.data_dir, actions='all',
                                input_n=input_n, output_n=output_n, split=0,
                                dct_used=dct_n, sample_rate=sample_rate)
    acts = data_utils.define_actions('all')
    test_data = dict()
    for act in acts:
        test_dataset = H36motion3D(path_to_data=opt.data_dir, actions=act,
                                   input_n=input_n, output_n=output_n, split=1,
                                   sample_rate=sample_rate, dct_used=dct_n)
        test_data[act] = DataLoader(dataset=test_dataset,
                                    batch_size=opt.test_batch, shuffle=False,
                                    num_workers=opt.job, pin_memory=True)
    val_dataset = H36motion3D(path_to_data=opt.data_dir, actions='all',
                              input_n=input_n, output_n=output_n, split=2,
                              dct_used=dct_n, sample_rate=sample_rate)
    # load datasets for training
    train_loader = DataLoader(dataset=train_dataset, batch_size=opt.train_batch,
                              shuffle=True, num_workers=opt.job, pin_memory=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=opt.test_batch,
                            shuffle=False, num_workers=opt.job, pin_memory=True)
    print(">>> data loaded !")
    print(">>> train data {}".format(len(train_dataset)))
    # Note: this is the size of the LAST action's test split only.
    print(">>> test data {}".format(len(test_dataset)))
    print(">>> validation data {}".format(len(val_dataset)))

    for epoch in range(start_epoch, opt.epochs):
        # Multiplicative lr decay every opt.lr_decay epochs.
        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        # ret_log / head accumulate one CSV row (values) and its column names.
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l = train(train_loader, model, optimizer, lr_now=lr_now,
                            max_norm=opt.max_norm, is_cuda=is_cuda,
                            dim_used=train_dataset.dim_used, dct_n=dct_n)
        ret_log = np.append(ret_log, [lr_now, t_l])
        head = np.append(head, ['lr', 't_l'])
        v_3d = val(val_loader, model, is_cuda=is_cuda,
                   dim_used=train_dataset.dim_used, dct_n=dct_n)
        ret_log = np.append(ret_log, [v_3d])
        head = np.append(head, ['v_3d'])

        # Per-action test errors go straight into the CSV row.
        for act in acts:
            test_l, test_3d = test(test_data[act], model, input_n=input_n,
                                   output_n=output_n, is_cuda=is_cuda,
                                   dim_used=train_dataset.dim_used, dct_n=dct_n)
            ret_log = np.append(ret_log, test_3d)
            head = np.append(
                head,
                [act + '3d80', act + '3d160', act + '3d320', act + '3d400'])
            if output_n > 10:
                head = np.append(head, [act + '3d560', act + '3d1000'])

        # update log file and save checkpoint (header only on the first row)
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv', header=head, index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)

        # "Best" is judged on validation error; a NaN epoch is never best.
        if not np.isnan(v_3d):
            is_best = v_3d < err_best
            err_best = min(v_3d, err_best)
        else:
            is_best = False
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        # NOTE(review): 'err' stores the last action's first TEST metric while
        # is_best uses v_3d, so err_best restored on resume is a test error —
        # confirm intended before changing.
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_3d[0],
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
def main(opt):
    """Train and evaluate a (optionally variational) GCN motion predictor.

    Uses the DATA wrapper to build loaders for the chosen dataset, trains
    per epoch via MODEL_METHODS, tracks the best model on the validation
    error (h3.6m) or the training error (other datasets, e.g. CMU), tests
    on every action, logs one CSV row per epoch and saves best/last
    checkpoints.

    Args:
        opt: parsed options; must expose lr, input_n, output_n, dct_n,
            sample_rate, dataset, data_dir, out_of_distribution, train_batch,
            test_batch, job, dropout, linear_size, num_stage, variational,
            n_z, num_decoder_stage, lambda_, is_load, load_path, epochs,
            lr_decay, lr_gamma, max_norm and ckpt.
    """
    start_epoch = 0
    err_best = 10000  # best tracked error so far; lower is better
    lr_now = opt.lr
    is_cuda = torch.cuda.is_available()
    print(">>> loading data")
    input_n = opt.input_n
    output_n = opt.output_n
    dct_n = opt.dct_n
    sample_rate = opt.sample_rate

    #####################################################
    # Load data
    #####################################################
    data = DATA(opt.dataset, opt.data_dir)
    out_of_distribution = data.get_dct_and_sequences(input_n, output_n,
                                                     sample_rate, dct_n,
                                                     opt.out_of_distribution)
    train_loader, val_loader, OoD_val_loader, test_loaders = data.get_dataloaders(
        opt.train_batch, opt.test_batch, opt.job)
    print(">>> data loaded !")
    print(">>> train data {}".format(data.train_dataset.__len__()))
    if opt.dataset == 'h3.6m':
        # Only h3.6m provides a separate validation split.
        print(">>> validation data {}".format(data.val_dataset.__len__()))

    #####################################################
    # Define script name (encodes dataset, horizons, dropout, OoD and
    # variational hyper-parameters so each run logs to its own files)
    #####################################################
    script_name = os.path.basename(__file__).split('.')[0]
    script_name = script_name + "_{}_in{:d}_out{:d}_dctn{:d}_dropout_{}".format(
        str(opt.dataset), opt.input_n, opt.output_n, opt.dct_n, str(
            opt.dropout))
    if out_of_distribution:
        script_name = script_name + "_OoD_{}_".format(
            str(opt.out_of_distribution))
    if opt.variational:
        script_name = script_name + "_var_lambda_{}_nz_{}_lr_{}_n_layers_{}".format(
            str(opt.lambda_), str(opt.n_z), str(opt.lr),
            str(opt.num_decoder_stage))

    ##################################################################
    # Instantiate model, and methods used for training and validation
    ##################################################################
    print(">>> creating model")
    model = nnmodel.GCN(input_feature=dct_n, hidden_feature=opt.linear_size,
                        p_dropout=opt.dropout, num_stage=opt.num_stage,
                        node_n=data.node_n, variational=opt.variational,
                        n_z=opt.n_z, num_decoder_stage=opt.num_decoder_stage)
    methods = MODEL_METHODS(model, is_cuda)
    if opt.is_load:
        # Resume: restores epoch counter, best error and lr from the ckpt.
        start_epoch, err_best, lr_now = methods.load_weights(opt.load_path)
    print(">>> total params: {:.2f}M".format(
        sum(p.numel() for p in model.parameters()) / 1000000.0))
    methods.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr)

    for epoch in range(start_epoch, opt.epochs):
        #####################################################
        # Training step
        #####################################################
        if (epoch + 1) % opt.lr_decay == 0:
            lr_now = utils.lr_decay(methods.optimizer, lr_now, opt.lr_gamma)
        print('==========================')
        print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now))
        # ret_log / head accumulate one CSV row (values) and its column names.
        ret_log = np.array([epoch + 1])
        head = np.array(['epoch'])
        # per epoch
        lr_now, t_l, t_l_joint, t_l_vlb, t_l_latent, t_e, t_3d = methods.train(
            train_loader, dataset=opt.dataset, input_n=input_n, lr_now=lr_now,
            cartesian=data.cartesian, lambda_=opt.lambda_,
            max_norm=opt.max_norm, dim_used=data.train_dataset.dim_used,
            dct_n=dct_n)
        ret_log = np.append(
            ret_log, [lr_now, t_l, t_l_joint, t_l_vlb, t_l_latent, t_e, t_3d])
        head = np.append(
            head,
            ['lr', 't_l', 't_l_joint', 't_l_vlb', 't_l_latent', 't_e', 't_3d'])

        #####################################################
        # Evaluate on validation set; keep track of best, either via val set,
        # OoD val set (in the case of OoD), or train set for other datasets
        # (e.g. CMU, which has no validation split)
        #####################################################
        if opt.dataset == 'h3.6m':
            v_e, v_3d = methods.val(val_loader, input_n=input_n,
                                    dim_used=data.train_dataset.dim_used,
                                    dct_n=dct_n)
            ret_log = np.append(ret_log, [v_e, v_3d])
            head = np.append(head, ['v_e', 'v_3d'])
            is_best, err_best = utils.check_is_best(v_e, err_best)
            if out_of_distribution:
                # OoD validation is logged but (as written) does not drive
                # is_best — NOTE(review): confirm this matches the intent
                # stated in the banner comment above.
                OoD_v_e, OoD_v_3d = methods.val(
                    OoD_val_loader, input_n=input_n,
                    dim_used=data.train_dataset.dim_used, dct_n=dct_n)
                ret_log = np.append(ret_log, [OoD_v_e, OoD_v_3d])
                head = np.append(head, ['OoD_v_e', 'OoD_v_3d'])
        else:
            # No validation split: judge "best" on the training error.
            is_best, err_best = utils.check_is_best(t_e, err_best)

        #####################################################
        # Evaluate on test set (per action); 3D metrics are buffered so all
        # 3D columns land after all per-action error columns
        #####################################################
        test_3d_temp = np.array([])
        test_3d_head = np.array([])
        for act in data.acts_test:
            test_e, test_3d = methods.test(test_loaders[act],
                                           dataset=opt.dataset,
                                           input_n=input_n, output_n=output_n,
                                           cartesian=data.cartesian,
                                           dim_used=data.train_dataset.dim_used,
                                           dct_n=dct_n)
            ret_log = np.append(ret_log, test_e)
            test_3d_temp = np.append(test_3d_temp, test_3d)
            test_3d_head = np.append(
                test_3d_head,
                [act + '3d80', act + '3d160', act + '3d320', act + '3d400'])
            head = np.append(
                head, [act + '80', act + '160', act + '320', act + '400'])
            if output_n > 10:
                # Long horizon adds the 560ms and 1000ms columns.
                head = np.append(head, [act + '560', act + '1000'])
                test_3d_head = np.append(test_3d_head,
                                         [act + '3d560', act + '3d1000'])
        ret_log = np.append(ret_log, test_3d_temp)
        head = np.append(head, test_3d_head)

        #####################################################
        # Update log file and save checkpoint (header only on first row)
        #####################################################
        df = pd.DataFrame(np.expand_dims(ret_log, axis=0))
        if epoch == start_epoch:
            df.to_csv(opt.ckpt + '/' + script_name + '.csv', header=head,
                      index=False)
        else:
            with open(opt.ckpt + '/' + script_name + '.csv', 'a') as f:
                df.to_csv(f, header=False, index=False)
        file_name = [
            'ckpt_' + script_name + '_best.pth.tar',
            'ckpt_' + script_name + '_last.pth.tar'
        ]
        # NOTE(review): 'err' stores the last action's first TEST metric while
        # is_best tracks val/train error — confirm intended.
        utils.save_ckpt(
            {
                'epoch': epoch + 1,
                'lr': lr_now,
                'err': test_e[0],
                'state_dict': model.state_dict(),
                'optimizer': methods.optimizer.state_dict()
            },
            ckpt_path=opt.ckpt,
            is_best=is_best,
            file_name=file_name)
def main():
    """End-to-end training driver for a keypoint/pose model.

    Reads CLI overrides, builds and optionally freezes parts of the model,
    prepares datasets and TensorBoard logging, then runs the epoch loop with
    warm-up, early-stopping-driven lr decay, per-metric best-checkpointing,
    and finally writes a summary row to a shared results CSV. Relies on many
    module-level globals (opt, config, folder, save_ID, device, optimize,
    mix_precision, warm_up_epoch, patience_decay, open_source_dataset, ...).
    """
    # Apply boolean CLI overrides that argparse (presumably) parsed as strings.
    cmd_ls = sys.argv[1:]
    cmd = generate_cmd(cmd_ls)
    if "--freeze_bn False" in cmd:
        opt.freeze_bn = False
    if "--addDPG False" in cmd:
        opt.addDPG = False

    # Banner: summarize the run configuration.
    print(
        "----------------------------------------------------------------------------------------------------"
    )
    print("This is the model with id {}".format(save_ID))
    print(opt)
    print("Training backbone is: {}".format(opt.backbone))
    dataset_str = ""
    for k, v in config.train_info.items():
        dataset_str += k
        dataset_str += ","
    print("Training data is: {}".format(dataset_str[:-1]))
    print("Warm up end at {}".format(warm_up_epoch))
    # bad_epochs maps epoch -> minimum val accuracy; values must be <= 1.
    for k, v in config.bad_epochs.items():
        if v > 1:
            raise ValueError("Wrong stopping accuracy!")
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    # Log/output locations for this run.
    exp_dir = os.path.join("exp/{}/{}".format(folder, save_ID))
    log_dir = os.path.join(exp_dir, "{}".format(save_ID))
    os.makedirs(log_dir, exist_ok=True)
    log_name = os.path.join(log_dir, "{}.txt".format(save_ID))
    train_log_name = os.path.join(log_dir, "{}_train.xlsx".format(save_ID))
    bn_file = os.path.join(log_dir, "{}_bn.txt".format(save_ID))

    # Prepare Dataset
    # Model Initialize
    if device != "cpu":
        m = createModel(cfg=model_cfg).cuda()
    else:
        m = createModel(cfg=model_cfg).cpu()
    # NOTE(review): this file handle is never closed (leaks one descriptor).
    print(m, file=open("model.txt", "w"))
    begin_epoch = 0
    pre_train_model = opt.loadModel
    flops = print_model_param_flops(m)
    print("FLOPs of current model is {}".format(flops))
    params = print_model_param_nums(m)
    print("Parameters of current model is {}".format(params))
    inf_time = get_inference_time(m, height=opt.outputResH, width=opt.outputResW)
    print("Inference time is {}".format(inf_time))
    print(
        "----------------------------------------------------------------------------------------------------"
    )

    # Optionally freeze BN layers and/or a leading fraction of the backbone.
    if opt.freeze > 0 or opt.freeze_bn:
        # Per-backbone parameter count and name prefix of the feature trunk.
        if opt.backbone == "mobilenet":
            feature_layer_num = 155
            feature_layer_name = "features"
        elif opt.backbone == "seresnet101":
            feature_layer_num = 327
            feature_layer_name = "preact"
        elif opt.backbone == "seresnet18":
            feature_layer_num = 75
            feature_layer_name = "seresnet18"
        elif opt.backbone == "shufflenet":
            feature_layer_num = 167
            feature_layer_name = "shuffle"
        else:
            raise ValueError("Not a correct name")
        feature_num = int(opt.freeze * feature_layer_num)
        for idx, (n, p) in enumerate(m.named_parameters()):
            # 1-D params are (presumably) BN weights/biases — TODO confirm.
            if len(p.shape) == 1 and opt.freeze_bn:
                p.requires_grad = False
            elif feature_layer_name in n and idx < feature_num:
                p.requires_grad = False
            else:
                p.requires_grad = True

    # TensorBoard writer; best-effort graph export with a dummy input.
    writer = SummaryWriter('exp/{}/{}'.format(folder, save_ID), comment=cmd)
    if device != "cpu":
        rnd_inps = torch.rand(3, 3, 224, 224).cuda()
    else:
        rnd_inps = torch.rand(3, 3, 224, 224)
    try:
        writer.add_graph(m, (rnd_inps, ))
    except:
        # NOTE(review): bare except silently hides add_graph failures.
        pass

    # Non-open-source datasets share the train split's val samples.
    shuffle_dataset = False
    for k, v in config.train_info.items():
        if k not in open_source_dataset:
            shuffle_dataset = True
    train_dataset = MyDataset(config.train_info, train=True)
    val_dataset = MyDataset(config.train_info, train=False)
    if shuffle_dataset:
        val_dataset.img_val, val_dataset.bbox_val, val_dataset.part_val = \
            train_dataset.img_val, train_dataset.bbox_val, train_dataset.part_val
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=opt.trainBatch,
                                               shuffle=True,
                                               num_workers=opt.trainNW,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=opt.validBatch,
                                             shuffle=True,
                                             num_workers=opt.valNW,
                                             pin_memory=True)

    os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)

    # Load pretrained weights / resume. Three cases:
    #  1) resuming our own checkpoint (restore iter counters, derive epoch),
    #  2) a "pretrain"/generic checkpoint (just load weights),
    #  3) the duc_se.pth baseline (load, then replace the output conv to
    #     match opt.kps keypoints).
    if pre_train_model:
        if "duc_se.pth" not in pre_train_model:
            if "pretrain" not in pre_train_model:
                try:
                    info_path = os.path.join("exp", folder, save_ID, "option.pkl")
                    info = torch.load(info_path)
                    opt.trainIters = info.trainIters
                    opt.valIters = info.valIters
                    # Epoch index is encoded in the checkpoint file name.
                    begin_epoch = int(pre_train_model.split("_")[-1][:-4]) + 1
                except:
                    # Fresh run bookkeeping if no option.pkl is available.
                    with open(log_name, "a+") as f:
                        f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
        else:
            with open(log_name, "a+") as f:
                f.write(cmd)
            print('Loading Model from {}'.format(pre_train_model))
            m.load_state_dict(torch.load(pre_train_model))
            # Re-head the network for this task's keypoint count.
            m.conv_out = nn.Conv2d(m.DIM, opt.kps, kernel_size=3, stride=1,
                                   padding=1)
            if device != "cpu":
                m.conv_out.cuda()
            os.makedirs("exp/{}/{}".format(folder, save_ID), exist_ok=True)
    else:
        print('Create new model')
        with open(log_name, "a+") as f:
            f.write(cmd)
            print(opt, file=f)
            f.write("FLOPs of current model is {}\n".format(flops))
            f.write("Parameters of current model is {}\n".format(params))

    # Emit a helper script that launches TensorBoard for this run.
    with open(os.path.join(log_dir, "tb.py"), "w") as pyfile:
        pyfile.write("import os\n")
        pyfile.write("os.system('conda init bash')\n")
        pyfile.write("os.system('conda activate py36')\n")
        pyfile.write(
            "os.system('tensorboard --logdir=../../../../exp/{}/{}')".format(
                folder, save_ID))

    # Collect only trainable parameters for the optimizer.
    params_to_update, layers = [], 0
    for name, param in m.named_parameters():
        layers += 1
        if param.requires_grad:
            params_to_update.append(param)
    print("Training {} layers out of {}".format(len(params_to_update), layers))

    if optimize == 'rmsprop':
        optimizer = torch.optim.RMSprop(params_to_update, lr=opt.LR,
                                        momentum=opt.momentum,
                                        weight_decay=opt.weightDecay)
    elif optimize == 'adam':
        optimizer = torch.optim.Adam(params_to_update, lr=opt.LR,
                                     weight_decay=opt.weightDecay)
    elif optimize == 'sgd':
        optimizer = torch.optim.SGD(params_to_update, lr=opt.LR,
                                    momentum=opt.momentum,
                                    weight_decay=opt.weightDecay)
    else:
        raise Exception

    if mix_precision:
        # NVIDIA apex automatic mixed precision.
        m, optimizer = amp.initialize(m, optimizer, opt_level="O1")

    # Model Transfer: wrap in DataParallel (m.module is the raw model).
    if device != "cpu":
        m = torch.nn.DataParallel(m).cuda()
        criterion = torch.nn.MSELoss().cuda()
    else:
        m = torch.nn.DataParallel(m)
        criterion = torch.nn.MSELoss()

    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    # Best-so-far trackers (acc/AUC/PR maximize, loss/dist minimize).
    train_acc, val_acc, train_loss, val_loss, best_epoch, train_dist, val_dist, train_auc, val_auc, train_PR, val_PR = \
        0, 0, float("inf"), float("inf"), 0, float("inf"), float("inf"), 0, 0, 0, 0
    # Per-epoch history lists used for the summary graphs.
    train_acc_ls, val_acc_ls, train_loss_ls, val_loss_ls, train_dist_ls, val_dist_ls, train_auc_ls, val_auc_ls, \
        train_pr_ls, val_pr_ls, epoch_ls, lr_ls = [], [], [], [], [], [], [], [], [], [], [], []
    decay, decay_epoch, lr, i = 0, [], opt.LR, begin_epoch
    stop = False
    m_best = m

    train_log = open(train_log_name, "w", newline="")
    bn_log = open(bn_file, "w")
    csv_writer = csv.writer(train_log)
    csv_writer.writerow(write_csv_title())
    begin_time = time.time()
    os.makedirs("result", exist_ok=True)
    result = os.path.join(
        "result", "{}_result_{}.csv".format(opt.expFolder, config.computer))
    exist = os.path.exists(result)

    # Start Training
    try:
        for i in range(opt.nEpochs)[begin_epoch:]:
            opt.epoch = i
            epoch_ls.append(i)
            # One row of the per-epoch spreadsheet; " " cells are separators.
            train_log_tmp = [save_ID, i, lr]
            log = open(log_name, "a+")
            print('############# Starting Epoch {} #############'.format(i))
            log.write(
                '############# Starting Epoch {} #############\n'.format(i))

            # ---- training pass ----
            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = \
                train(train_loader, m, criterion, optimizer, writer)
            train_log_tmp.append(" ")
            train_log_tmp.append(loss)
            train_log_tmp.append(acc.tolist())
            train_log_tmp.append(dist.tolist())
            train_log_tmp.append(auc)
            train_log_tmp.append(pr)
            # Per-keypoint metrics follow the aggregate ones.
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            train_acc_ls.append(acc)
            train_loss_ls.append(loss)
            train_dist_ls.append(dist)
            train_auc_ls.append(auc)
            train_pr_ls.append(pr)
            train_acc = acc if acc > train_acc else train_acc
            train_loss = loss if loss < train_loss else train_loss
            train_dist = dist if dist < train_dist else train_dist
            train_auc = auc if auc > train_auc else train_auc
            train_PR = pr if pr > train_PR else train_PR
            log.write(
                'Train:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))
            opt.acc = acc
            opt.loss = loss
            m_dev = m.module  # unwrap DataParallel for state_dict saves

            # ---- validation pass (re-uses the same metric variables) ----
            loss, acc, dist, auc, pr, pt_acc, pt_dist, pt_auc, pt_pr = valid(
                val_loader, m, criterion, writer)
            # Validation aggregates are inserted right after the train block.
            train_log_tmp.insert(9, loss)
            train_log_tmp.insert(10, acc.tolist())
            train_log_tmp.insert(11, dist.tolist())
            train_log_tmp.insert(12, auc)
            train_log_tmp.insert(13, pr)
            train_log_tmp.insert(14, " ")
            for a in pt_acc:
                train_log_tmp.append(a.tolist())
            train_log_tmp.append(" ")
            for d in pt_dist:
                train_log_tmp.append(d.tolist())
            train_log_tmp.append(" ")
            for ac in pt_auc:
                train_log_tmp.append(ac)
            train_log_tmp.append(" ")
            for p in pt_pr:
                train_log_tmp.append(p)
            train_log_tmp.append(" ")

            val_acc_ls.append(acc)
            val_loss_ls.append(loss)
            val_dist_ls.append(dist)
            val_auc_ls.append(auc)
            val_pr_ls.append(pr)
            # Checkpoint separately per metric; acc also tracks the best model
            # object used when early stopping rolls back.
            if acc > val_acc:
                best_epoch = i
                val_acc = acc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_acc.pkl'.format(folder, save_ID))
                m_best = copy.deepcopy(m)
            val_loss = loss if loss < val_loss else val_loss
            if dist < val_dist:
                val_dist = dist
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_dist.pkl'.format(folder, save_ID))
            if auc > val_auc:
                val_auc = auc
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_auc.pkl'.format(folder, save_ID))
            if pr > val_PR:
                val_PR = pr
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_best_pr.pkl'.format(folder, save_ID))
            log.write(
                'Valid:{idx:d} epoch | loss:{loss:.8f} | acc:{acc:.4f} | dist:{dist:.4f} | AUC: {AUC:.4f} | PR: {PR:.4f}\n'
                .format(
                    idx=i,
                    loss=loss,
                    acc=acc,
                    dist=dist,
                    AUC=auc,
                    PR=pr,
                ))

            # ---- BatchNorm weight magnitude logging (sparsity monitoring) ----
            bn_sum, bn_num = 0, 0
            for mod in m.modules():
                if isinstance(mod, nn.BatchNorm2d):
                    bn_num += mod.num_features
                    bn_sum += torch.sum(abs(mod.weight))
                    writer.add_histogram("bn_weight",
                                         mod.weight.data.cpu().numpy(), i)
            bn_ave = bn_sum / bn_num
            bn_log.write("{} --> {}".format(i, bn_ave))
            print("Current bn : {} --> {}".format(i, bn_ave))
            bn_log.write("\n")
            log.close()
            csv_writer.writerow(train_log_tmp)
            writer.add_scalar("lr", lr, i)
            print("epoch {}: lr {}".format(i, lr))
            lr_ls.append(lr)

            # Persist options每个 epoch and periodic weight snapshots.
            torch.save(opt, 'exp/{}/{}/option.pkl'.format(folder, save_ID, i))
            if i % opt.save_interval == 0 and i != 0:
                torch.save(
                    m_dev.state_dict(),
                    'exp/{0}/{1}/{1}_{2}.pkl'.format(folder, save_ID, i))

            # ---- lr schedule: warm-up, then early-stopping-driven decay ----
            if i < warm_up_epoch:
                optimizer, lr = warm_up_lr(optimizer, i)
            elif i == warm_up_epoch:
                lr = opt.LR
                early_stopping(acc)
            else:
                early_stopping(acc)
                if early_stopping.early_stop:
                    optimizer, lr = lr_decay(optimizer, lr)
                    decay += 1
                    if decay > opt.lr_decay_time:
                        stop = True
                    else:
                        # Roll back to the best model and keep training with
                        # a smaller lr and a shorter patience window.
                        decay_epoch.append(i)
                        early_stopping.reset(
                            int(opt.patience * patience_decay[decay]))
                        m = m_best
            # Hard stop if accuracy is below the configured bar at key epochs.
            for epo, ac in config.bad_epochs.items():
                if i == epo and val_acc < ac:
                    stop = True
            if stop:
                print("Training finished at epoch {}".format(i))
                break

        training_time = time.time() - begin_time
        writer.close()
        train_log.close()
        draw_graph(epoch_ls, train_loss_ls, val_loss_ls, "loss", log_dir)
        draw_graph(epoch_ls, train_acc_ls, val_acc_ls, "acc", log_dir)
        draw_graph(epoch_ls, train_auc_ls, val_auc_ls, "AUC", log_dir)
        draw_graph(epoch_ls, train_dist_ls, val_dist_ls, "dist", log_dir)
        draw_graph(epoch_ls, train_pr_ls, val_pr_ls, "PR", log_dir)

        # Append this run's summary to the shared results CSV (header once).
        with open(result, "a+") as f:
            if not exist:
                title_str = "id,backbone,structure,DUC,params,flops,time,loss_param,addDPG,kps,batch_size,optimizer," \
                            "freeze_bn,freeze,sparse,sparse_decay,epoch_num,LR,Gaussian,thresh,weightDecay,loadModel," \
                            "model_location, ,folder_name,training_time,train_acc,train_loss,train_dist,train_AUC," \
                            "train_PR,val_acc,val_loss,val_dist,val_AUC,val_PR,best_epoch,final_epoch"
                title_str = write_decay_title(len(decay_epoch), title_str)
                f.write(title_str)
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{},{},{},{},{},{},{},{},{},{},{},{}\n".\
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s,
                       opt.sparse_decay, opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay,
                       opt.loadModel, config.computer, os.path.join(folder, save_ID), training_time, train_acc,
                       train_loss, train_dist, train_auc, train_PR, val_acc, val_loss, val_dist, val_auc, val_PR,
                       best_epoch, i)
            info_str = write_decay_info(decay_epoch, info_str)
            f.write(info_str)
    except ZeroDivisionError:
        # Division by zero mid-training (e.g. empty metric denominator):
        # record an aborted-run row tagged "Gradient flow".
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Gradient flow")
            f.write(info_str)
    except KeyboardInterrupt:
        # Manual interruption: record an aborted-run row.
        with open(result, "a+") as f:
            training_time = time.time() - begin_time
            writer.close()
            train_log.close()
            info_str = "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}, ,{},{},{}\n". \
                format(save_ID, opt.backbone, opt.struct, opt.DUC, params, flops, inf_time, opt.loss_allocate, opt.addDPG,
                       opt.kps, opt.trainBatch, opt.optMethod, opt.freeze_bn, opt.freeze, opt.sparse_s, opt.sparse_decay,
                       opt.nEpochs, opt.LR, opt.hmGauss, opt.ratio, opt.weightDecay, opt.loadModel, config.computer,
                       os.path.join(folder, save_ID), training_time, "Be killed by someone")
            f.write(info_str)

    print("Model {} training finished".format(save_ID))
    print(
        "----------------------------------------------------------------------------------------------------"
    )
def _train(loader, optimizer, loss_function, model, config=None, lr=None,
           evaluation=True, name_best_model='weights/best', preprocess_mode=None):
    """Train `model` on `loader`'s training split, logging loss to TensorBoard
    and checkpointing the weights whenever the test mIoU improves.

    Args:
        loader: dataset loader exposing `image_train_list`, `get_batch(...)`,
            `n_classes` and `suffle_segmentation()` (sic — project API name).
        optimizer: optimizer passed through to `train_step`; its `iterations`
            counter drives the TensorBoard logging cadence.
        loss_function: loss callable forwarded to `train_step`.
        model: model to train; checkpointed via `model.save_weights`.
        config: dict read for 'batch_size', 'epochs', 'init_lr',
            'height_train', 'width_train' and 'zoom_augmentation'.
        lr: variable holding the learning rate; mutated in place by `lr_decay`.
        evaluation: if True, evaluate on the test split each epoch and save
            only improving weights; otherwise save unconditionally each epoch.
        name_best_model: path prefix for `model.save_weights`.
        preprocess_mode: forwarded to `preprocess` for input normalization.
    """
    # Parameters for training
    training_samples = len(loader.image_train_list)
    steps_per_epoch = int(training_samples / config['batch_size']) + 1
    best_miou = 0
    # Clamp to at least 1: with fewer than 5 steps per epoch the old
    # min(50, steps//5) evaluated to 0 and `iterations % log_freq` below
    # raised a modulo-by-zero error.
    log_freq = max(1, min(50, steps_per_epoch // 5))
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)

    train_summary_writer = tf.summary.create_file_writer('/tmp/summaries/train')  # tensorboard
    test_summary_writer = tf.summary.create_file_writer('/tmp/summaries/test')  # tensorboard
    print('Please enter in terminal: tensorboard --logdir /tmp/summaries')

    for epoch in range(config['epochs']):  # for each epoch
        start_time_epoch = time.time()
        lr_decay(lr, config['init_lr'], 1e-9, epoch, config['epochs'] - 1)  # compute the new lr
        print('epoch: ' + str(epoch + 1) + '. Learning rate: ' + str(lr.numpy()))

        for step in range(steps_per_epoch):  # for every batch
            # get batch
            x, y, mask = loader.get_batch(size=config['batch_size'], train=True)
            x = preprocess(x, mode=preprocess_mode)

            with train_summary_writer.as_default():
                loss = train_step(model, x, y, mask, loss_function, optimizer,
                                  (config['height_train'], config['width_train']),
                                  config['zoom_augmentation'])
                # tensorboard: log the running mean loss every `log_freq` steps
                avg_loss.update_state(loss)
                if tf.equal(optimizer.iterations % log_freq, 0):
                    tf.summary.scalar('loss', avg_loss.result(), step=optimizer.iterations)
                    avg_loss.reset_states()

        if evaluation:
            # get metrics
            # with train_summary_writer.as_default():
            #     train_acc, train_miou = get_metrics(loader, model, loader.n_classes, train=True, flip_inference=False,
            #                                         preprocess_mode=preprocess_mode, optimizer=optimizer)
            with test_summary_writer.as_default():
                test_acc, test_miou = get_metrics(
                    loader, model, loader.n_classes, train=False,
                    flip_inference=False, preprocess_mode=preprocess_mode,
                    optimizer=optimizer, scales=[1])

            # print('Train accuracy: ' + str(train_acc.numpy()))
            # print('Train miou: ' + str(train_miou.numpy()))
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou.numpy()))

            # save model if best model
            if test_miou.numpy() > best_miou:
                best_miou = test_miou.numpy()
                model.save_weights(name_best_model)
            print('Current Best model miou: ' + str(best_miou))
            print('')
        else:
            # No evaluation requested: checkpoint unconditionally each epoch.
            model.save_weights(name_best_model)

        loader.suffle_segmentation()  # shuffle training set every epoch
        print('Epoch time seconds: ' + str(time.time() - start_time_epoch))
def train(loader, optimizer, loss_function, model, size_input, epochs=5, batch_size=2,
          lr=None, init_lr=2e-4, evaluation=True, name_best_model='weights/best',
          preprocess_mode=None, labels_resize_factor=1):
    """Train `model` on `loader`'s training split with TensorBoard logging,
    keeping the weights that achieve the best test mIoU.

    Args:
        loader: dataset loader exposing `image_train_list`, `get_batch(...)`,
            `n_classes` and `suffle_segmentation()` (sic — project API name).
        optimizer: optimizer passed through to `train_step`; its `iterations`
            counter drives the TensorBoard logging cadence.
        loss_function: loss callable forwarded to `train_step`.
        model: model to train; checkpointed via `model.save_weights`.
        size_input: input size forwarded to `train_step`.
        epochs: number of training epochs.
        batch_size: samples per training batch.
        lr: variable holding the learning rate; mutated in place by `lr_decay`.
        init_lr: initial learning rate for the decay schedule.
        evaluation: if True, evaluate each epoch and save only improving
            weights; otherwise save unconditionally each epoch.
        name_best_model: path prefix for `model.save_weights`.
        preprocess_mode: forwarded to `preprocess` for input normalization.
        labels_resize_factor: forwarded to `train_step` / `get_metrics`.
    """
    # Parameters for training
    training_samples = len(loader.image_train_list)
    steps_per_epoch = int(training_samples / batch_size) + 1
    best_miou = 0
    # Clamp to at least 1: with fewer than 5 steps per epoch the old
    # min(50, steps//5) evaluated to 0 and `iterations % log_freq` below
    # raised a modulo-by-zero error.
    log_freq = max(1, min(50, steps_per_epoch // 5))
    avg_loss = tf.keras.metrics.Mean(name='loss', dtype=tf.float32)

    train_summary_writer = tf.summary.create_file_writer('/tmp/summaries/train')  # tensorboard
    test_summary_writer = tf.summary.create_file_writer('/tmp/summaries/test')  # tensorboard
    # Fixed hint: the writers above log to '/tmp/summaries/...', so the
    # suggested --logdir must use the same forward-slash path.
    print('Please enter in terminal: tensorboard --logdir /tmp/summaries')

    for epoch in range(epochs):  # for each epoch
        lr_decay(lr, init_lr, 1e-9, epoch, epochs - 1)  # compute the new lr
        print('epoch: ' + str(epoch + 1) + '. Learning rate: ' + str(lr.numpy()))

        for step in range(steps_per_epoch):  # for every batch
            # get batch
            x, y, mask = loader.get_batch(size=batch_size, train=True)
            x = preprocess(x, mode=preprocess_mode)

            with train_summary_writer.as_default():
                loss = train_step(model, x, y, mask, loss_function, optimizer,
                                  labels_resize_factor, size_input)
                # tensorboard: log the running mean loss every `log_freq` steps
                avg_loss.update_state(loss)
                if tf.equal(optimizer.iterations % log_freq, 0):
                    tf.summary.scalar('loss', avg_loss.result(), step=optimizer.iterations)
                    avg_loss.reset_states()

        if evaluation:
            # get metrics on both splits
            with train_summary_writer.as_default():
                train_acc, train_miou = get_metrics(
                    loader, model, loader.n_classes, train=True,
                    preprocess_mode=preprocess_mode,
                    labels_resize_factor=labels_resize_factor, optimizer=optimizer)
            with test_summary_writer.as_default():
                test_acc, test_miou = get_metrics(
                    loader, model, loader.n_classes, train=False,
                    flip_inference=False, scales=[1],
                    preprocess_mode=preprocess_mode,
                    labels_resize_factor=labels_resize_factor, optimizer=optimizer)

            print('Train accuracy: ' + str(train_acc.numpy()))
            print('Train miou: ' + str(train_miou.numpy()))
            print('Test accuracy: ' + str(test_acc.numpy()))
            print('Test miou: ' + str(test_miou.numpy()))
            print('')

            # save model if best
            if test_miou > best_miou:
                best_miou = test_miou
                model.save_weights(name_best_model)
        else:
            # No evaluation requested: checkpoint unconditionally each epoch.
            model.save_weights(name_best_model)

        loader.suffle_segmentation()  # shuffle training set every epoch
def main(opt): start_epoch = 0 err_best = 10000 lr_now = opt.lr is_cuda = torch.cuda.is_available() # define log csv file script_name = os.path.basename(__file__).split('.')[0] script_name = script_name + "_in{:d}_out{:d}_dctn{:d}".format( opt.input_n, opt.output_n, opt.dct_n) # create model print(">>> creating model") input_n = opt.input_n output_n = opt.output_n dct_n = opt.dct_n sample_rate = opt.sample_rate # 48 nodes for angle prediction model = nnmodel.GCN(input_feature=dct_n, hidden_feature=opt.linear_size, p_dropout=opt.dropout, num_stage=opt.num_stage, node_n=48) if is_cuda: model.cuda() print(">>> total params: {:.2f}M".format( sum(p.numel() for p in model.parameters()) / 1000000.0)) optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr) # continue from checkpoint if opt.is_load: model_path_len = 'checkpoint/test/ckpt_main_gcn_muti_att_best.pth.tar' print(">>> loading ckpt len from '{}'".format(model_path_len)) if is_cuda: ckpt = torch.load(model_path_len) else: ckpt = torch.load(model_path_len, map_location='cpu') start_epoch = ckpt['epoch'] err_best = ckpt['err'] lr_now = ckpt['lr'] model.load_state_dict(ckpt['state_dict']) optimizer.load_state_dict(ckpt['optimizer']) print(">>> ckpt len loaded (epoch: {} | err: {})".format( start_epoch, err_best)) # data loading print(">>> loading data") train_dataset = H36motion(path_to_data=opt.data_dir, actions='all', input_n=input_n, output_n=output_n, split=0, sample_rate=sample_rate, dct_n=dct_n) data_std = train_dataset.data_std data_mean = train_dataset.data_mean val_dataset = H36motion(path_to_data=opt.data_dir, actions='all', input_n=input_n, output_n=output_n, split=2, sample_rate=sample_rate, data_mean=data_mean, data_std=data_std, dct_n=dct_n) # load datasets for training train_loader = DataLoader(dataset=train_dataset, batch_size=opt.train_batch, shuffle=True, num_workers=opt.job, pin_memory=True) val_loader = DataLoader(dataset=val_dataset, batch_size=opt.test_batch, shuffle=False, 
num_workers=opt.job, pin_memory=True) acts = data_utils.define_actions('all') test_data = dict() for act in acts: test_dataset = H36motion(path_to_data=opt.data_dir, actions=act, input_n=input_n, output_n=output_n, split=1, sample_rate=sample_rate, data_mean=data_mean, data_std=data_std, dct_n=dct_n) test_data[act] = DataLoader(dataset=test_dataset, batch_size=opt.test_batch, shuffle=False, num_workers=opt.job, pin_memory=True) print(">>> data loaded !") print(">>> train data {}".format(train_dataset.__len__())) print(">>> validation data {}".format(val_dataset.__len__())) for epoch in range(start_epoch, opt.epochs): if (epoch + 1) % opt.lr_decay == 0: lr_now = utils.lr_decay(optimizer, lr_now, opt.lr_gamma) print('==========================') print('>>> epoch: {} | lr: {:.5f}'.format(epoch + 1, lr_now)) ret_log = np.array([epoch + 1]) head = np.array(['epoch']) # per epoch a = train_dataset.dim_used