def main(args):
    """Run training."""
    val_perf = []  # summary of validation performance, and the training loss

    train_data = utils.read_data(args, "train")
    val_data = utils.read_data(args, "val")

    args.train_num_examples = train_data.num_examples

    # construct model under gpu0
    model = models.get_model(args, gpuid=args.gpuid)

    trainer = models.Trainer(model, args)
    tester = models.Tester(model, args)
    saver = tf.train.Saver(max_to_keep=5)
    bestsaver = tf.train.Saver(max_to_keep=5)

    save_period = args.save_period  # also the eval period

    # start training!
    tfconfig = tf.ConfigProto(allow_soft_placement=True)
    tfconfig.gpu_options.allow_growth = True
    tfconfig.gpu_options.visible_device_list = "%s" % (
        ",".join(["%s" % i for i in [args.gpuid]]))

    with tf.Session(config=tfconfig) as sess:
        utils.initialize(load=args.load, load_best=args.load_best,
                         args=args, sess=sess)

        # the total number of steps (iterations) the model will run:
        # ceil(total / batch_size) * num_epochs
        num_steps = int(math.ceil(train_data.num_examples /
                                  float(args.batch_size))) * args.num_epochs

        # get_batches is a generator, run on the fly
        print(" batch_size:%s, epoch:%s, %s step every epoch, total step:%s,"
              " eval/save every %s steps" % (
                  args.batch_size, args.num_epochs,
                  math.ceil(train_data.num_examples / float(args.batch_size)),
                  num_steps, args.save_period))

        metric = "ade"  # average displacement error; smaller is better
        # remember the best eval metric seen during training
        best = {metric: 999999, "step": -1}

        finalperf = None
        is_start = True
        loss = -1
        grid_loss = -1
        xyloss = -1
        act_loss = -1
        traj_class_loss = -1

        for batch in tqdm(train_data.get_batches(args.batch_size,
                                                 num_steps=num_steps),
                          total=num_steps, ascii=True):

            global_step = sess.run(model.global_step) + 1  # start from 0

            # if loading from an existing model, also save/eval on the first step
            if (global_step % save_period == 0) or \
               (args.load_best and is_start) or \
               (args.load and is_start and (args.ignore_vars is None)):

                tqdm.write("\tsaving model %s..." % global_step)
                saver.save(sess, args.save_dir_model, global_step=global_step)
                tqdm.write("\tdone")

                evalperf = utils.evaluate(val_data, args, sess, tester)

                tqdm.write(("\tlast loss:%.5f, xyloss:%.5f, traj_class_loss:%.5f,"
                            " grid_loss:%s, act_loss:%.5f, eval on validation:%s,"
                            " (best %s:%s at step %s) ") % (
                                loss, xyloss, traj_class_loss, grid_loss, act_loss,
                                ["%s: %s" % (k, evalperf[k])
                                 for k in sorted(evalperf.keys())],
                                metric, best[metric], best["step"]))

                # remember the best metric
                if evalperf[metric] < best[metric]:
                    best[metric] = evalperf[metric]
                    best["step"] = global_step
                    # save the best model
                    tqdm.write("\t saving best model...")
                    bestsaver.save(sess, args.save_dir_best_model,
                                   global_step=global_step)
                    tqdm.write("\t done.")

                finalperf = evalperf
                val_perf.append((loss, evalperf))
                is_start = False

            loss, _, xyloss, act_loss, traj_class_loss, grid_loss = \
                trainer.step(sess, batch)
            if math.isnan(loss):
                print("nan loss.")
                print(grid_loss)
                sys.exit()

        # save the last model if the loop did not end on a save step
        if global_step % save_period != 0:
            saver.save(sess, args.save_dir_model, global_step=global_step)

        print("best eval on val %s: %s at %s step, final step %s %s is %s" % (
            metric, best[metric], best["step"], global_step, metric,
            finalperf[metric]))

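# --- Hedged sketch: the "ade" metric tracked above is the average displacement
# error, i.e. the mean L2 distance between predicted and ground-truth trajectory
# points. The arrays and helper below are hypothetical illustrations only; the
# actual utils.evaluate() implementation may compute it differently.
import numpy as np

def average_displacement_error(pred_traj, gt_traj):
    """pred_traj, gt_traj: arrays of shape (num_timesteps, 2) holding (x, y)."""
    return float(np.mean(np.linalg.norm(pred_traj - gt_traj, axis=-1)))

# Example: a prediction off by 1.0 on x at every timestep has ADE 1.0:
#   pred = np.array([[1.0, 0.0], [2.0, 1.0]])
#   gt   = np.array([[0.0, 0.0], [1.0, 1.0]])
#   average_displacement_error(pred, gt)  # -> 1.0
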
def train():
    # Set up dataloaders: the training and validation sets determine, e.g., the set of classes
    train_dataset = data.UAVDataClassSeg(
        txt_path='/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/data/train.txt')
    trainloader = DataLoader(train_dataset,
                             batch_size=12,
                             shuffle=True,
                             drop_last=True,
                             num_workers=24,
                             pin_memory=True)

    val_dataset = data.UAVDataClassSeg(
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/data/valid/valid.txt',
        train=False)
    valloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set up model
    model = models.segnet(n_classes=len(val_dataset.class_names))
    # initialize the parameters from a pretrained VGG16 network
    model.init_vgg16_params(torchvision.models.vgg16(pretrained=True))

    # Set up optimizer, lr scheduler and loss function
    def cross_entropy2d(input, target, weight=None, size_average=True):
        # input: (n, c, h, w), target: (n, h, w)
        n, c, h, w = input.size()
        # log_p: (n, c, h, w)
        if LooseVersion(torch.__version__) < LooseVersion('0.3'):
            # torch ==0.2.X: log_softmax takes no dim argument
            log_p = F.log_softmax(input)
        else:
            # torch >=0.3
            log_p = F.log_softmax(input, dim=1)
        # log_p holds the per-pixel log-probabilities of each class;
        # move the channel dimension last, then flatten to (n*h*w, c)
        log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous()
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
        log_p = log_p.view(-1, c)
        # target: (n*h*w,); drop pixels with negative (ignore) labels
        mask = target >= 0
        target = target[mask]
        # sum over valid pixels here; average below over the mask
        loss = F.nll_loss(log_p, target, weight=weight, size_average=False)
        if size_average:
            loss /= mask.data.sum()
        return loss

    lossFun = cross_entropy2d

    optim = torch.optim.Adam(params=model.parameters(),
                             lr=1.0e-5,
                             weight_decay=0.0005)

    # learning-rate schedule: mode='min' means the monitored metric should decrease;
    # patience is the number of steps without improvement to tolerate
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optim, mode='min', patience=1, min_lr=10e-10, eps=10e-9)

    # utils.ModelLoad('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output_segnet/bestModel/1.4000*3000_trainModel.tar',
    #                 model)

    now = datetime.datetime.now()
    logFile = utils.Log(
        osp.join(
            '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output_segnet/visualization_viz/',
            now.strftime('%Y%m%d_%H%M%S.%f') + 'log.csv'),
        [
            'iteration', 'train/loss', 'train/mean_iu',
            'valid/loss', 'valid/mean_iu', 'lr'
        ])

    trainer = models.Trainer(cuda=True,
                             model=model,
                             optimizer=optim,
                             loss_fcn=lossFun,
                             train_loader=trainloader,
                             val_loader=valloader,
                             out='./output_segnet/',
                             max_iter=100000,
                             scheduler=scheduler,
                             interval_validate=800,
                             logFile=logFile)
    trainer.train()  # run training

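# --- Hedged sketch: expected shapes for the cross_entropy2d loss defined above.
# Logits are (n, c, h, w); targets are (n, h, w) integer class maps, where
# negative labels are treated as "ignore". All tensor values here are made up,
# and since cross_entropy2d is defined inside train(), the call is only shown
# commented out.
import torch

n, c, h, w = 2, 5, 8, 8
logits = torch.randn(n, c, h, w, requires_grad=True)    # raw network output
target = torch.randint(low=0, high=c, size=(n, h, w))   # per-pixel class labels
target[0, 0, 0] = -1                                     # an ignored pixel

# loss = cross_entropy2d(logits, target)  # scalar tensor; loss.backward() works
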
def train():
    # Set up device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Set up model
    model = models.UNet(n_channels=3, n_classes=5)
    # initialize the parameters from a pretrained VGG16 network
    vgg16 = models.VGG16(pretrained=True)
    model.copy_params_from_vgg16(vgg16)

    # Set up dataloaders: training and validation sets
    """data.picFulPath('/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/train.txt',
                    '/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/img/',
                    '/home/mlxuan/project/DeepLearning/data/benchmark/benchmark_RELEASE/dataset/cls/')
    train_dataset = data.SBDClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ImagAndLal.txt')
    trainloader = DataLoader(train_dataset, batch_size=4, shuffle=False, drop_last=True)

    data.picFulPath('/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/ImageSets/Segmentation/val.txt',
                    '/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/JPEGImages/',
                    '/home/mlxuan/project/DeepLearning/data/VOCtrainval_11-May-2012/VOCdevkit/VOC2012/SegmentationClass/',
                    destPath='/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ValImagAndLal.txt',
                    ImgFix='.jpg', lblFix='.png')
    val_dataset = data.VOCClassSeg('/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/data/ValImagAndLal.txt', train=False)
    valloader = DataLoader(val_dataset, batch_size=1, shuffle=False)"""

    train_dataset = data.RSDataClassSeg(
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/Data/trainFullPath.txt')
    trainloader = DataLoader(train_dataset, batch_size=4, shuffle=False, drop_last=True)

    val_dataset = data.RSDataClassSeg(
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/Data/validFullPath.txt', train=False)
    valloader = DataLoader(val_dataset, batch_size=1, shuffle=False)

    # Set up optimizer, lr scheduler and loss function
    def cross_entropy2d(input, target, weight=None, size_average=True):
        # input: (n, c, h, w), target: (n, h, w)
        n, c, h, w = input.size()
        # log_p: (n, c, h, w)
        if LooseVersion(torch.__version__) < LooseVersion('0.3'):
            # torch ==0.2.X: log_softmax takes no dim argument
            log_p = F.log_softmax(input)
        else:
            # torch >=0.3
            log_p = F.log_softmax(input, dim=1)
        # log_p holds the per-pixel log-probabilities of each class;
        # move the channel dimension last, then flatten to (n*h*w, c)
        log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous()
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
        log_p = log_p.view(-1, c)
        # target: (n*h*w,); drop pixels with negative (ignore) labels
        mask = target >= 0
        target = target[mask]
        # sum over valid pixels here; average below over the mask
        loss = F.nll_loss(log_p, target, weight=weight, size_average=False)
        if size_average:
            loss /= mask.data.sum()
        return loss

    lossFun = cross_entropy2d

    def get_parameters(model, bias=False):
        import torch.nn as nn
        modules_skipped = (
            nn.ReLU,
            nn.MaxPool2d,
            nn.Dropout2d,
            nn.Sequential,
            models.FCN32s,
        )
        for m in model.modules():
            if isinstance(m, nn.Conv2d):
                if bias:
                    yield m.bias
                else:
                    yield m.weight
            elif isinstance(m, nn.ConvTranspose2d):
                # weight is frozen because it is just bilinear upsampling
                if bias:
                    assert m.bias is None
            elif isinstance(m, modules_skipped):
                continue
            else:
                raise ValueError('Unexpected module: %s' % str(m))

    optim = torch.optim.SGD(
        [
            {'params': get_parameters(model, bias=False)},
            {'params': get_parameters(model, bias=True),
             'lr': 1.0e-5 * 2, 'weight_decay': 0},
        ],
        lr=1.0e-5,
        momentum=0.99,
        weight_decay=0.0005)

    # learning-rate schedule: mode='min' means the monitored metric should decrease;
    # patience is the number of steps without improvement to tolerate
    scheduler = lr_scheduler.ReduceLROnPlateau(
        optim, mode='min', patience=0, min_lr=10e-10, eps=10e-8)
    utils.ModelLoad(
        '/home/mlxuan/project/DeepLearning/FCN/fcn_mlx/output/Model.path/20181227_220035.852449model_best.pth.tar',
        model, optim)

    trainer = models.Trainer(cuda=True,
                             model=model,
                             optimizer=optim,
                             loss_fcn=lossFun,
                             train_loader=trainloader,
                             val_loader=valloader,
                             out='./output/',
                             max_iter=40000,
                             scheduler=scheduler,
                             interval_validate=2000)
    trainer.train()  # run training

epochs = args.epochs
quiet = args.quiet
verbose = not quiet

latent_dim = 256
mixed_probability = 0.9
discriminator_filters = 8
generator_filters = 8
pl_beta = 0.99

if __name__ == '__main__':
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    print("Using device: ", device)

    # model = StyleGan2Model()
    dataLoader = utils.getDataLoader(batch_size, image_size)
    # print(len(dataLoader) / 10)
    Trainer = models.Trainer(batch_size, image_size, latent_dim, epochs,
                             discriminator_filters, generator_filters, device,
                             mixed_probability, pl_beta)
    # print(Trainer.StyleGan)
    # print(Trainer.StyleGan.generator.state_dict())
    # print(sum(p.numel() for p in Trainer.StyleGan.parameters()))
    # print(Trainer.StyleGan.discriminator.state_dict())
    print(Trainer.StyleGan.generator.state_dict()
          ['generatorBlocks.2.style_to_input_channels.weight'][0][0].item())
    print("Apex available: ", Trainer.apex_available)
    # Trainer.resetSaves()

    x, y = next(enumerate(dataLoader))
    # print(y[0])
    # utils.showImage(y[0][0].expand(3, -1, -1))
    # print(y[0].size())
    # for x in range(10):

def train(args):
    '''\
    Training function.

    Args:
        args: namespace of arguments. Run 'artRecycle train --help' for info.
    '''

    # Model name and paths
    model_name = '{}|{}'.format(*args.datasets)
    model_path, log_path, logs_path = _prepare_directories(
        model_name, resume=args.cont)

    model_json = os.path.join(model_path, 'keras.json')
    model_checkpoint = os.path.join(model_path, 'model')

    # Summary writers
    train_summary_writer = tf.summary.create_file_writer(
        os.path.join(log_path, 'train'))

    # Define datasets
    image_shape = (300, 300, 3)
    train_dataset, train_size = data.load_pair(
        *args.datasets, 'all', shape=image_shape, batch=args.batch)
    train_dataset_it = iter(train_dataset)
    few_samples = [data.load_few(name, 'all', image_shape, 1)
                   for name in args.datasets]

    # Define keras model
    keras_model, model_layer = models.define_model(image_shape)

    # Save keras model
    keras_json = keras_model.to_json()
    keras_json = json.dumps(json.loads(keras_json), indent=2)
    with open(model_json, 'w') as f:
        f.write(keras_json)

    # Save TensorBoard graph
    @tf.function
    def tracing_model_ops(inputs):
        return model_layer(inputs)

    tf.summary.trace_on()
    tracing_model_ops(next(train_dataset_it))
    with train_summary_writer.as_default():
        tf.summary.trace_export('Model', step=0)

    # Resuming
    if args.cont:
        keras_model.load_weights(model_checkpoint)
        print('> Weights loaded')

    # Training steps
    step_saver = CountersSaver(log_dir=logs_path, log_every=args.logs)

    steps_per_epoch = int(train_size / args.batch) \
        if not args.epoch_steps else args.epoch_steps
    epochs = range(step_saver.epoch, args.epochs)

    # Training tools
    make_optimizer = lambda: tf.optimizers.Adam(args.rate)
    trainer = models.Trainer(keras_model, make_optimizer, train_dataset_it)
    tester = models.Tester(keras_model, train_dataset_it)
    saver = CheckpointSaver(keras_model, model_checkpoint)

    # Print job
    print('> Training. Epochs:', epochs)

    # Training loop
    for epoch in epochs:
        print('> Epoch', step_saver.epoch)

        for epoch_step in range(steps_per_epoch):
            print('> Step', step_saver.step, end='\r')

            # Train step
            output = trainer.step()

            # Validation and log
            if step_saver.step % args.logs == 0 or epoch_step == steps_per_epoch - 1:
                print('\n> Validation')

                # Evaluation
                for i in range(args.val_steps):
                    tester.step()
                train_metrics = tester.result()

                # Log in console
                print('  Train metrics:', train_metrics)

                # Log in TensorBoard
                with train_summary_writer.as_default():
                    for metric in train_metrics:
                        tf.summary.scalar(metric, train_metrics[metric],
                                          step=step_saver.step)

                # Save weights
                loss = 0
                for m in train_metrics.values():
                    loss += m
                saved = saver.save(score=-loss)
                if saved:
                    print('Weights saved')

                # Transform images for visualization
                if args.images:
                    fake_A, fake_B, *_ = keras_model(few_samples)
                    fake_A_viz = image_unnormalize(fake_A)
                    fake_B_viz = image_unnormalize(fake_B)

                    # Log images
                    with train_summary_writer.as_default():
                        tf.summary.image('fake_A', fake_A_viz,
                                         step=step_saver.step)
                        tf.summary.image('fake_B', fake_B_viz,
                                         step=step_saver.step)

            # End step
            step_saver.new_step()

        # End epoch
        step_saver.new_epoch()

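# --- Hedged sketch: CheckpointSaver above is the project's own class; it is
# called with score=-loss and returns whether weights were written, which
# suggests a "save only when the score improves" policy. The class below is a
# hypothetical minimal equivalent of that pattern, not the actual implementation.
class BestScoreSaver:
    """Save Keras model weights only when `score` beats the best seen so far."""

    def __init__(self, keras_model, checkpoint_path):
        self.model = keras_model
        self.path = checkpoint_path
        self.best_score = float('-inf')

    def save(self, score):
        if score <= self.best_score:
            return False            # no improvement, nothing written
        self.best_score = score
        self.model.save_weights(self.path)
        return True
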
    default=dt.now().strftime("%Y-%m-%d_%H-%M"),
    type=str,
    help='Path to output directory')
ap.add_argument(
    '-l', '--load', type=str,
    help='Path to directory from which best_model.ckpt should be loaded')
args = vars(ap.parse_args())

loss_list, mape_list = [], []

# Initialize model
for i in range(args['num_runs']):
    print(f"\n[INFO] NOW STARTING RUN {i+1}\n")
    trainer = models.Trainer(args)
    best_loss, best_mape = trainer.train(run_id=i)
    loss_list.append(best_loss)
    mape_list.append(best_mape)

with open(os.path.join('outputs', args['task'], 'mse_run_stats.txt'), 'w') as f:
    f.write(" ".join([str(l) for l in loss_list]))
    f.write(f"\nMean: {np.mean(loss_list)}")
    f.write(f"\nStdev: {np.std(loss_list)}")

with open(os.path.join('outputs', args['task'], 'mape_run_stats.txt'), 'w') as f:
    f.write(" ".join([str(l) for l in mape_list]))
    f.write(f"\nMean: {np.mean(mape_list)}")
    f.write(f"\nStdev: {np.std(mape_list)}")