def configure_optimizers(self):
    """Build the optimizer and LR scheduler from self.cfg.

    Returns:
        A ([optimizer], [scheduler]) pair in the form expected by PyTorch Lightning.
    """
    optimizer = create_optimizer(self.cfg, self.model.parameters())
    if self.cfg.lr_scheduler == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=self.cfg.step_size,
                                              gamma=self.cfg.lr_factor)
    elif self.cfg.lr_scheduler == 'cosin':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200, eta_min=1e-6)
    elif self.cfg.lr_scheduler == 'cosin_epoch':
        scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=self.cfg.tmax,
                                                         eta_min=self.cfg.eta_min)
    elif self.cfg.lr_scheduler == 'onecycle':
        max_lr = [g["lr"] for g in optimizer.param_groups]
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer, max_lr=max_lr, epochs=self.hparams.epochs,
            steps_per_epoch=len(self.train_dataloader()))
        # OneCycleLR must be stepped every batch, not every epoch.
        scheduler = {"scheduler": scheduler, "interval": "step"}
    else:
        raise ValueError('Unsupported learning rate scheduler: {}'.format(self.cfg.lr_scheduler))
    return [optimizer], [scheduler]
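
# Illustrative sketch (not part of the original module): the config fields that
# configure_optimizers() above actually reads. The SimpleNamespace is a stand-in;
# the real cfg object presumably comes from the project's YAML/argparse setup.
from types import SimpleNamespace

example_cfg = SimpleNamespace(
    lr_scheduler='cosin_epoch',  # one of: 'step', 'cosin', 'cosin_epoch', 'onecycle'
    step_size=30,                # StepLR: epochs between LR drops
    lr_factor=0.1,               # StepLR: gamma
    tmax=50,                     # CosineAnnealingLR ('cosin_epoch'): T_max
    eta_min=1e-6,                # CosineAnnealingLR ('cosin_epoch'): minimum LR
)
# The 'onecycle' branch additionally reads self.hparams.epochs and the length of
# self.train_dataloader() to size the schedule.
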
def run(args):
    setup_default_logging()
    # args = parser.parse_args()
    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
    if args.distributed and args.num_gpu > 1:
        logging.warning(
            'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
        args.num_gpu = 1

    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logging.info('Training in distributed mode with multiple processes, 1 GPU per process. '
                     'Process %d, total %d.' % (args.rank, args.world_size))
    else:
        logging.info('Training with a single process on %d GPUs.' % args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        logging.info('Model %s created, param count: %d' %
                     (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args), model=model, verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    optimizer_state = None
    resume_epoch = None
    if args.resume:
        optimizer_state, resume_epoch = resume_checkpoint(model, args.resume)

    if args.num_gpu > 1:
        if args.amp:
            logging.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logging.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed', 'on' if use_amp else 'off'))

    model_ema = None
    if args.model_ema:
        # create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
                if args.local_rank == 0:
                    logging.info('Converted model to use Synchronized BatchNorm.')
            except Exception as e:
                logging.error('Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1')
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logging.info('Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP.')
            model = DDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logging.info('Scheduled epochs: {}'.format(num_epochs))

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes)

    # Load dataset
    data_dir = os.path.join(args.data, 'img')
    if not os.path.exists(data_dir):
        logging.error('Training folder does not exist at: {}'.format(data_dir))
        exit(1)

    dataset_train = MultiViewDataSet(train_file, class_file, data_dir, transform=transform_train)
    dataset_eval = MultiViewDataSet(test_file, class_file, data_dir, transform=transform_eval)

    loader_train = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=1)

    if 0:
        loader_train = create_loader(
            dataset_train,
            input_size=data_config['input_size'],
            batch_size=args.batch_size,
            is_training=True,
            use_prefetcher=args.prefetcher,
            rand_erase_prob=args.reprob,
            rand_erase_mode=args.remode,
            color_jitter=args.color_jitter,
            interpolation='random',
            mean=data_config['mean'],
            std=data_config['std'],
            num_workers=args.workers,
            distributed=args.distributed,
            collate_fn=collate_fn,
        )

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''
    metrics_history = OrderedDict()
    if args.local_rank == 0:
        output_base = args.output if args.output else './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
            str(data_config['input_size'][-1])
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)

    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch, model, loader_train, optimizer, train_loss_fn, args,
                                        lr_scheduler=lr_scheduler, saver=saver, output_dir=output_dir,
                                        use_amp=use_amp, model_ema=model_ema)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                ema_eval_metrics = validate(model_ema.ema, loader_eval, validate_loss_fn, args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                metrics_history[epoch] = eval_metrics
                make_plots(metrics_history, output_dir)
                best_metric, best_epoch = saver.save_checkpoint(model, optimizer, args,
                                                                epoch=epoch, model_ema=model_ema,
                                                                metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logging.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
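
# Sketch of how run() is typically invoked (assumptions: a module-level argparse
# `parser` defining the flags read above, and an entry file name of train.py; both
# are illustrative, not confirmed by this file). Under
#   python -m torch.distributed.launch --nproc_per_node=4 train.py --model resnet50 ...
# the launcher sets WORLD_SIZE and passes --local_rank, which is what flips
# args.distributed in run().
if __name__ == '__main__':
    run(parser.parse_args())
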
def main(args, config):
    if args.horovod:
        verbose = hvd.rank() == 0
        global_size = hvd.size()
        # global_rank = hvd.rank()
        local_rank = hvd.local_rank()
    else:
        verbose = True
        global_size = 1
        # global_rank = 0
        local_rank = 0

    timestamp = time.strftime("%Y-%m-%d_%H:%M:%S", time.gmtime())
    logdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'runs', args.architecture, timestamp)

    if verbose:
        writer = tf.summary.FileWriter(logdir=logdir)
        print("Arguments passed:")
        print(args)
        print(f"Saving files to {logdir}")
    else:
        writer = None

    final_shape = parse_tuple(args.final_shape)
    image_channels = final_shape[0]
    final_resolution = final_shape[-1]
    num_phases = int(np.log2(final_resolution) - 1)
    base_dim = num_filters(1, num_phases, size=args.network_size)

    var_list = list()
    global_step = 0

    for phase in range(1, num_phases + 1):

        tf.reset_default_graph()

        # ------------------------------------------------------------------------------------------ #
        # DATASET

        size = 2 * 2 ** phase

        if args.dataset == 'imagenet':
            dataset = imagenet_dataset(
                args.dataset_path,
                args.scratch_path,
                size,
                copy_files=local_rank == 0,
                is_correct_phase=phase >= args.starting_phase,
                gpu=args.gpu,
                num_labels=1 if args.num_labels is None else args.num_labels)
        else:
            raise ValueError(f"Unknown dataset {args.dataset}")

        # Get DataLoader
        batch_size = max(1, args.base_batch_size // (2 ** (phase - 1)))

        if phase >= args.starting_phase:
            assert batch_size * global_size <= args.max_global_batch_size
            if verbose:
                print(f"Using local batch size of {batch_size} and global batch size of {batch_size * global_size}")

        if args.horovod:
            dataset.shard(hvd.size(), hvd.rank())

        dataset = dataset.batch(batch_size, drop_remainder=True)
        dataset = dataset.repeat()
        dataset = dataset.prefetch(AUTOTUNE)
        dataset = dataset.make_one_shot_iterator()
        data = dataset.get_next()
        if len(data) == 1:
            real_image_input = data
            real_label = None
        elif len(data) == 2:
            real_image_input, real_label = data
        else:
            raise NotImplementedError()

        real_image_input = tf.ensure_shape(real_image_input, [batch_size, image_channels, size, size])
        real_image_input = real_image_input + tf.random.normal(tf.shape(real_image_input)) * .01

        if real_label is not None:
            real_label = tf.one_hot(real_label, depth=args.num_labels)

        # ------------------------------------------------------------------------------------------ #
        # OPTIMIZERS

        g_lr = args.g_lr
        d_lr = args.d_lr

        if args.horovod:
            if args.g_scaling == 'sqrt':
                g_lr = g_lr * np.sqrt(hvd.size())
            elif args.g_scaling == 'linear':
                g_lr = g_lr * hvd.size()
            elif args.g_scaling == 'none':
                pass
            else:
                raise ValueError(args.g_scaling)

            if args.d_scaling == 'sqrt':
                d_lr = d_lr * np.sqrt(hvd.size())
            elif args.d_scaling == 'linear':
                d_lr = d_lr * hvd.size()
            elif args.d_scaling == 'none':
                pass
            else:
                raise ValueError(args.d_scaling)

        # d_lr = tf.Variable(d_lr, name='d_lr', dtype=tf.float32)
        # g_lr = tf.Variable(g_lr, name='g_lr', dtype=tf.float32)
        # # optimizer_gen = tf.train.AdamOptimizer(learning_rate=g_lr, beta1=args.beta1, beta2=args.beta2)
        # # optimizer_disc = tf.train.AdamOptimizer(learning_rate=d_lr, beta1=args.beta1, beta2=args.beta2)
        # # optimizer_gen = LAMB(learning_rate=g_lr, beta1=args.beta1, beta2=args.beta2)
        # # optimizer_disc = LAMB(learning_rate=d_lr, beta1=args.beta1, beta2=args.beta2)
        # # optimizer_gen = LARSOptimizer(learning_rate=g_lr, momentum=0, weight_decay=0)
        # # optimizer_disc = LARSOptimizer(learning_rate=d_lr, momentum=0, weight_decay=0)
        # # optimizer_gen = tf.train.RMSPropOptimizer(learning_rate=1e-3)
        # # optimizer_disc = tf.train.RMSPropOptimizer(learning_rate=1e-3)
        # # optimizer_gen = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
        # # optimizer_disc = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
        # # optimizer_gen = RAdamOptimizer(learning_rate=g_lr, beta1=args.beta1, beta2=args.beta2)
        # # optimizer_disc = RAdamOptimizer(learning_rate=d_lr, beta1=args.beta1, beta2=args.beta2)
        # lr_step = tf.Variable(0, name='step', dtype=tf.float32)
        # update_step = lr_step.assign_add(1.0)
        # with tf.control_dependencies([update_step]):
        #     update_g_lr = g_lr.assign(g_lr * args.g_annealing)
        #     update_d_lr = d_lr.assign(d_lr * args.d_annealing)
        # if args.horovod:
        #     if args.use_adasum:
        #         # optimizer_gen = hvd.DistributedOptimizer(optimizer_gen, op=hvd.Adasum)
        #         optimizer_gen = hvd.DistributedOptimizer(optimizer_gen)
        #         optimizer_disc = hvd.DistributedOptimizer(optimizer_disc, op=hvd.Adasum)
        #     else:
        #         optimizer_gen = hvd.DistributedOptimizer(optimizer_gen)
        #         optimizer_disc = hvd.DistributedOptimizer(optimizer_disc)

        # ------------------------------------------------------------------------------------------ #
        # NETWORKS

        with tf.variable_scope('alpha'):
            alpha = tf.Variable(1, name='alpha', dtype=tf.float32)
            # Alpha init
            init_alpha = alpha.assign(1)

            # Specify alpha update op for mixing phase.
            num_steps = args.mixing_nimg // (batch_size * global_size)
            alpha_update = 1 / num_steps
            # noinspection PyTypeChecker
            update_alpha = alpha.assign(tf.maximum(alpha - alpha_update, 0))

        base_shape = [image_channels, 4, 4]

        if args.optim_strategy == 'simultaneous':
            gen_loss, disc_loss, gp_loss, gen_sample = forward_simultaneous(
                generator,
                discriminator,
                real_image_input,
                args.latent_dim,
                alpha,
                phase,
                num_phases,
                base_dim,
                base_shape,
                args.activation,
                args.leakiness,
                args.network_size,
                args.loss_fn,
                args.gp_weight,
                conditioning=real_label,
            )

            gen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator')
            disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator')

            with tf.variable_scope('optimizer_gen'):
                # disc_loss = tf.Print(gen_loss, [gen_loss], 'g_loss')
                optimizer_gen = create_optimizer(
                    gen_loss, gen_vars, 1e-8,
                    (args.mixing_nimg + args.stabilizing_nimg) / (batch_size * global_size),
                    8, hvd=hvd, optimizer_type='adam')

            with tf.variable_scope('optimizer_disc'):
                # disc_loss = tf.Print(disc_loss, [disc_loss], 'd_loss')
                optimizer_disc = create_optimizer(
                    disc_loss, disc_vars, 1e-8,
                    (args.mixing_nimg + args.stabilizing_nimg) / (batch_size * global_size),
                    8, hvd=hvd, optimizer_type='lamb')

            # if args.horovod:
            #     if args.use_adasum:
            #         # optimizer_gen = hvd.DistributedOptimizer(optimizer_gen, op=hvd.Adasum)
            #         optimizer_gen = hvd.DistributedOptimizer(optimizer_gen, sparse_as_dense=True)
            #         optimizer_disc = hvd.DistributedOptimizer(optimizer_disc, op=hvd.Adasum, sparse_as_dense=True)
            #     else:
            #         optimizer_gen = hvd.DistributedOptimizer(optimizer_gen, sparse_as_dense=True)
            #         optimizer_disc = hvd.DistributedOptimizer(optimizer_disc, sparse_as_dense=True)

            # g_gradients = optimizer_gen.compute_gradients(gen_loss, var_list=gen_vars)
            # d_gradients = optimizer_disc.compute_gradients(disc_loss, var_list=disc_vars)
            # g_norms = tf.stack([tf.norm(grad) for grad, var in g_gradients if grad is not None])
            # max_g_norm = tf.reduce_max(g_norms)
            # d_norms = tf.stack([tf.norm(grad) for grad, var in d_gradients if grad is not None])
            # max_d_norm = tf.reduce_max(d_norms)
            # # g_clipped_grads = [(tf.clip_by_norm(grad, clip_norm=128), var) for grad, var in g_gradients]
            # # train_gen = optimizer_gen.apply_gradients(g_clipped_grads)
            # gs = t
            # train_gen = optimizer_gen.apply_gradients(g_gradients)
            # train_disc = optimizer_disc.apply_gradients(d_gradients)

        # elif args.optim_strategy == 'alternate':
        #     disc_loss, gp_loss = forward_discriminator(
        #         generator,
        #         discriminator,
        #         real_image_input,
        #         args.latent_dim,
        #         alpha,
        #         phase,
        #         num_phases,
        #         base_dim,
        #         base_shape,
        #         args.activation,
        #         args.leakiness,
        #         args.network_size,
        #         args.loss_fn,
        #         args.gp_weight,
        #         conditioning=real_label
        #     )
        #
        #     disc_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='discriminator')
        #
        #     d_gradients = optimizer_disc.compute_gradients(disc_loss, var_list=disc_vars)
        #
        #     d_norms = tf.stack([tf.norm(grad) for grad, var in d_gradients if grad is not None])
        #     max_d_norm = tf.reduce_max(d_norms)
        #
        #     train_disc = optimizer_disc.apply_gradients(d_gradients)
        #
        #     with tf.control_dependencies([train_disc]):
        #         gen_sample, gen_loss = forward_generator(
        #             generator,
        #             discriminator,
        #             real_image_input,
        #             args.latent_dim,
        #             alpha,
        #             phase,
        #             num_phases,
        #             base_dim,
        #             base_shape,
        #             args.activation,
        #             args.leakiness,
        #             args.network_size,
        #             args.loss_fn,
        #             is_reuse=True
        #         )
        #
        #         gen_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='generator')
        #         g_gradients = optimizer_gen.compute_gradients(gen_loss, var_list=gen_vars)
        #         g_norms = tf.stack([tf.norm(grad) for grad, var in g_gradients if grad is not None])
        #         max_g_norm = tf.reduce_max(g_norms)
        #         train_gen = optimizer_gen.apply_gradients(g_gradients)

        else:
            raise ValueError(f"Unknown optim strategy {args.optim_strategy}")

        if verbose:
            print(f"Generator parameters: {count_parameters('generator')}")
            print(f"Discriminator parameters: {count_parameters('discriminator')}")

        # train_gen = optimizer_gen.minimize(gen_loss, var_list=gen_vars)
        # train_disc = optimizer_disc.minimize(disc_loss, var_list=disc_vars)

        ema = tf.train.ExponentialMovingAverage(decay=args.ema_beta)
        ema_op = ema.apply(gen_vars)
        # Transfer EMA values to original variables
        ema_update_weights = tf.group([tf.assign(var, ema.average(var)) for var in gen_vars])

        with tf.name_scope('summaries'):
            # Summaries
            tf.summary.scalar('d_loss', disc_loss)
            tf.summary.scalar('g_loss', gen_loss)
            tf.summary.scalar('gp', tf.reduce_mean(gp_loss))

            # for g in g_gradients:
            #     tf.summary.histogram(f'grad_{g[1].name}', g[0])
            # for g in d_gradients:
            #     tf.summary.histogram(f'grad_{g[1].name}', g[0])
            # tf.summary.scalar('convergence', tf.reduce_mean(disc_real) - tf.reduce_mean(tf.reduce_mean(disc_fake_d)))
            # tf.summary.scalar('max_g_grad_norm', max_g_norm)
            # tf.summary.scalar('max_d_grad_norm', max_d_norm)

            real_image_grid = tf.transpose(real_image_input, (0, 2, 3, 1))  # B C H W -> B H W C
            shape = real_image_grid.get_shape().as_list()
            grid_cols = int(2 ** np.floor(np.log(np.sqrt(shape[0])) / np.log(2)))
            grid_rows = shape[0] // grid_cols
            grid_shape = [grid_rows, grid_cols]
            real_image_grid = image_grid(real_image_grid, grid_shape,
                                         image_shape=shape[1:3], num_channels=shape[-1])

            fake_image_grid = tf.transpose(gen_sample, (0, 2, 3, 1))
            fake_image_grid = image_grid(fake_image_grid, grid_shape,
                                         image_shape=shape[1:3], num_channels=shape[-1])

            fake_image_grid = tf.clip_by_value(fake_image_grid, -1, 1)

            tf.summary.image('real_image', real_image_grid)
            tf.summary.image('fake_image', fake_image_grid)

            tf.summary.scalar('fake_image_min', tf.math.reduce_min(gen_sample))
            tf.summary.scalar('fake_image_max', tf.math.reduce_max(gen_sample))

            tf.summary.scalar('real_image_min', tf.math.reduce_min(real_image_input[0]))
            tf.summary.scalar('real_image_max', tf.math.reduce_max(real_image_input[0]))
            tf.summary.scalar('alpha', alpha)

            tf.summary.scalar('g_lr', g_lr)
            tf.summary.scalar('d_lr', d_lr)

            merged_summaries = tf.summary.merge_all()

        # Other ops
        init_op = tf.global_variables_initializer()
        assign_starting_alpha = alpha.assign(args.starting_alpha)
        assign_zero = alpha.assign(0)
        broadcast = hvd.broadcast_global_variables(0)

        with tf.Session(config=config) as sess:
            sess.run(init_op)

            trainable_variable_names = [v.name for v in tf.trainable_variables()]

            if var_list is not None and phase > args.starting_phase:
                print("Restoring variables from:", os.path.join(logdir, f'model_{phase - 1}'))
                var_names = [v.name for v in var_list]
                load_vars = [sess.graph.get_tensor_by_name(n) for n in var_names
                             if n in trainable_variable_names]
                saver = tf.train.Saver(load_vars)
                saver.restore(sess, os.path.join(logdir, f'model_{phase - 1}'))
            elif var_list is not None and args.continue_path and phase == args.starting_phase:
                print("Restoring variables from:", args.continue_path)
                var_names = [v.name for v in var_list]
                load_vars = [sess.graph.get_tensor_by_name(n) for n in var_names
                             if n in trainable_variable_names]
                saver = tf.train.Saver(load_vars)
                saver.restore(sess, os.path.join(args.continue_path))
            else:
                if verbose:
                    print("Not restoring variables.")
                    print("Variable List Length:", len(var_list))

            var_list = gen_vars + disc_vars

            if phase < args.starting_phase:
                continue

            if phase == args.starting_phase:
                sess.run(assign_starting_alpha)
            else:
                sess.run(init_alpha)

            if verbose:
                print(f"Begin mixing epochs in phase {phase}")
            if args.horovod:
                sess.run(broadcast)

            local_step = 0
            # take_first_snapshot = True

            while True:
                start = time.time()

                if local_step % 128 == 0 and local_step > 1:
                    if args.horovod:
                        sess.run(broadcast)
                    saver = tf.train.Saver(var_list)
                    if verbose:
                        saver.save(sess, os.path.join(logdir, f'model_{phase}_ckpt_{global_step}'))

                # _, _, summary, d_loss, g_loss = sess.run(
                #     [train_gen, train_disc, merged_summaries, disc_loss, gen_loss])
                _, _, summary, d_loss, g_loss = sess.run([
                    optimizer_gen, optimizer_disc, merged_summaries, disc_loss, gen_loss])

                global_step += batch_size * global_size
                local_step += 1

                end = time.time()
                img_s = global_size * batch_size / (end - start)
                if verbose:
                    writer.add_summary(summary, global_step)
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='img_s', simple_value=img_s)]),
                        global_step)
                    memory_percentage = psutil.Process(os.getpid()).memory_percent()
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='memory_percentage',
                                                           simple_value=memory_percentage)]),
                        global_step)

                    print(f"Step {global_step:09} \t"
                          f"img/s {img_s:.2f} \t "
                          f"d_loss {d_loss:.4f} \t "
                          f"g_loss {g_loss:.4f} \t "
                          f"memory {memory_percentage:.4f} % \t"
                          f"alpha {alpha.eval():.2f}")

                    # if take_first_snapshot:
                    #     import tracemalloc
                    #     tracemalloc.start()
                    #     snapshot_first = tracemalloc.take_snapshot()
                    #     take_first_snapshot = False
                    # snapshot = tracemalloc.take_snapshot()
                    # top_stats = snapshot.compare_to(snapshot_first, 'lineno')
                    # print("[ Top 10 differences ]")
                    # for stat in top_stats[:10]:
                    #     print(stat)
                    # snapshot_prev = snapshot

                if global_step >= ((phase - args.starting_phase) *
                                   (args.mixing_nimg + args.stabilizing_nimg) + args.mixing_nimg):
                    break

                sess.run(update_alpha)
                sess.run(ema_op)
                # sess.run(update_d_lr)
                # sess.run(update_g_lr)

                assert alpha.eval() >= 0

                if verbose:
                    writer.flush()

            if verbose:
                print(f"Begin stabilizing epochs in phase {phase}")

            sess.run(assign_zero)

            while True:
                start = time.time()
                assert alpha.eval() == 0
                if local_step % 128 == 0 and local_step > 0:
                    if args.horovod:
                        sess.run(broadcast)
                    saver = tf.train.Saver(var_list)
                    if verbose:
                        saver.save(sess, os.path.join(logdir, f'model_{phase}_ckpt_{global_step}'))

                # _, _, summary, d_loss, g_loss = sess.run(
                #     [train_gen, train_disc, merged_summaries, disc_loss, gen_loss])
                _, _, summary, d_loss, g_loss = sess.run([
                    optimizer_gen, optimizer_disc, merged_summaries, disc_loss, gen_loss])

                global_step += batch_size * global_size
                local_step += 1

                end = time.time()
                img_s = global_size * batch_size / (end - start)
                if verbose:
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='img_s', simple_value=img_s)]),
                        global_step)
                    writer.add_summary(summary, global_step)
                    memory_percentage = psutil.Process(os.getpid()).memory_percent()
                    writer.add_summary(
                        tf.Summary(value=[tf.Summary.Value(tag='memory_percentage',
                                                           simple_value=memory_percentage)]),
                        global_step)

                    print(f"Step {global_step:09} \t"
                          f"img/s {img_s:.2f} \t "
                          f"d_loss {d_loss:.4f} \t "
                          f"g_loss {g_loss:.4f} \t "
                          f"memory {memory_percentage:.4f} % \t"
                          f"alpha {alpha.eval():.2f}")

                sess.run(ema_op)

                if verbose:
                    writer.flush()

                if global_step >= (phase - args.starting_phase + 1) * (args.stabilizing_nimg + args.mixing_nimg):
                    # if verbose:
                    #     run_metadata = tf.RunMetadata()
                    #     opts = tf.profiler.ProfileOptionBuilder.float_operation()
                    #     g = tf.get_default_graph()
                    #     flops = tf.profiler.profile(g, run_meta=run_metadata, cmd='op', options=opts)
                    #     writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='graph_flops',
                    #                                                           simple_value=flops.total_float_ops)]),
                    #                        global_step)
                    #
                    #     # Print memory info.
                    #     try:
                    #         print(nvgpu.gpu_info())
                    #     except subprocess.CalledProcessError:
                    #         pid = os.getpid()
                    #         py = psutil.Process(pid)
                    #         print(f"CPU Percent: {py.cpu_percent()}")
                    #         print(f"Memory info: {py.memory_info()}")
                    break

            # # Calculate metrics.
            # calc_swds: bool = size >= 16
            # calc_ssims: bool = min(npy_data.shape[1:]) >= 16
            #
            # if args.calc_metrics:
            #     fids_local = []
            #     swds_local = []
            #     psnrs_local = []
            #     mses_local = []
            #     nrmses_local = []
            #     ssims_local = []
            #
            #     counter = 0
            #     while True:
            #         if args.horovod:
            #             start_loc = counter + hvd.rank() * batch_size
            #         else:
            #             start_loc = 0
            #         real_batch = np.stack([npy_data[i] for i in range(start_loc, start_loc + batch_size)])
            #         real_batch = real_batch.astype(np.int16) - 1024
            #         fake_batch = sess.run(gen_sample).astype(np.float32)
            #
            #         # Turn fake batch into HUs and clip to training range.
            #         fake_batch = (np.clip(fake_batch, -1, 2) * 1024).astype(np.int16)
            #
            #         if verbose:
            #             print('real min, max', real_batch.min(), real_batch.max())
            #             print('fake min, max', fake_batch.min(), fake_batch.max())
            #
            #         fids_local.append(calculate_fid_given_batch_volumes(real_batch, fake_batch, sess))
            #
            #         if calc_swds:
            #             swds = get_swd_for_volumes(real_batch, fake_batch)
            #             swds_local.append(swds)
            #
            #         psnr = get_psnr(real_batch, fake_batch)
            #         if calc_ssims:
            #             ssim = get_ssim(real_batch, fake_batch)
            #             ssims_local.append(ssim)
            #         mse = get_mean_squared_error(real_batch, fake_batch)
            #         nrmse = get_normalized_root_mse(real_batch, fake_batch)
            #
            #         psnrs_local.append(psnr)
            #         mses_local.append(mse)
            #         nrmses_local.append(nrmse)
            #
            #         if args.horovod:
            #             counter = counter + global_size * batch_size
            #         else:
            #             counter += batch_size
            #
            #         if counter >= args.num_metric_samples:
            #             break
            #
            #     fid_local = np.mean(fids_local)
            #     psnr_local = np.mean(psnrs_local)
            #     ssim_local = np.mean(ssims_local)
            #     mse_local = np.mean(mses_local)
            #     nrmse_local = np.mean(nrmses_local)
            #
            #     if args.horovod:
            #         fid = MPI.COMM_WORLD.allreduce(fid_local, op=MPI.SUM) / hvd.size()
            #         psnr = MPI.COMM_WORLD.allreduce(psnr_local, op=MPI.SUM) / hvd.size()
            #         mse = MPI.COMM_WORLD.allreduce(mse_local, op=MPI.SUM) / hvd.size()
            #         nrmse = MPI.COMM_WORLD.allreduce(nrmse_local, op=MPI.SUM) / hvd.size()
            #         if calc_ssims:
            #             ssim = MPI.COMM_WORLD.allreduce(ssim_local, op=MPI.SUM) / hvd.size()
            #     else:
            #         fid = fid_local
            #         psnr = psnr_local
            #         ssim = ssim_local
            #         mse = mse_local
            #         nrmse = nrmse_local
            #
            #     if calc_swds:
            #         swds_local = np.array(swds_local)
            #         # Average over batches
            #         swds_local = swds_local.mean(axis=0)
            #         if args.horovod:
            #             swds = MPI.COMM_WORLD.allreduce(swds_local, op=MPI.SUM) / hvd.size()
            #         else:
            #             swds = swds_local
            #
            #     if verbose:
            #         print(f"FID: {fid:.4f}")
            #         writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='fid', simple_value=fid)]),
            #                            global_step)
            #
            #         print(f"PSNR: {psnr:.4f}")
            #         writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='psnr', simple_value=psnr)]),
            #                            global_step)
            #
            #         print(f"MSE: {mse:.4f}")
            #         writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='mse', simple_value=mse)]),
            #                            global_step)
            #
            #         print(f"Normalized Root MSE: {nrmse:.4f}")
            #         writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag='nrmse', simple_value=nrmse)]),
            #                            global_step)
            #
            #         if calc_swds:
            #             print(f"SWDS: {swds}")
            #             for i in range(len(swds))[:-1]:
            #                 lod = 16 * 2 ** i
            #                 writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag=f'swd_{lod}',
            #                                                                       simple_value=swds[i])]),
            #                                    global_step)
            #             writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag=f'swd_mean',
            #                                                                   simple_value=swds[-1])]),
            #                                global_step)
            #         if calc_ssims:
            #             print(f"SSIM: {ssim}")
            #             writer.add_summary(tf.Summary(value=[tf.Summary.Value(tag=f'ssim', simple_value=ssim)]),
            #                                global_step)

            if verbose:
                print("\n\n\n End of phase.")

                # Save Session.
                sess.run(ema_update_weights)
                saver = tf.train.Saver(var_list)
                saver.save(sess, os.path.join(logdir, f'model_{phase}'))

            if args.ending_phase:
                if phase == args.ending_phase:
                    print("Reached final phase, breaking.")
                    break
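
# Worked example (illustrative values, not from the original config) of the mixing
# schedule used above: alpha starts at 1 and is decremented by 1 / num_steps every
# global step until it reaches 0, after which the stabilizing loop runs with alpha
# pinned to 0.
mixing_nimg = 2 ** 18            # hypothetical number of images shown during mixing
batch_size, global_size = 32, 4  # hypothetical local batch size and worker count
num_steps = mixing_nimg // (batch_size * global_size)  # 262144 // 128 = 2048 steps
alpha_update = 1 / num_steps                           # ~0.000488 subtracted per step
# After k steps: alpha = max(1 - k * alpha_update, 0), reaching 0 at k = 2048.
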
def main():
    args = parser.parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            print('Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.')
            args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    r = -1
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        r = torch.distributed.get_rank()

    if args.distributed:
        print('Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
              % (r, args.world_size))
    else:
        print('Training with a single process on %d GPUs.' % args.num_gpu)

    # FIXME seed handling for multi-process distributed?
    torch.manual_seed(args.seed)

    output_dir = ''
    if args.local_rank == 0:
        if args.output:
            output_base = args.output
        else:
            output_base = './output'
        exp_name = '-'.join([
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            args.model,
            str(args.img_size)
        ])
        output_dir = get_outdir(output_base, 'train', exp_name)

    model = create_model(args.model,
                         pretrained=args.pretrained,
                         num_classes=args.num_classes,
                         drop_rate=args.drop,
                         global_pool=args.gp,
                         bn_tf=args.bn_tf,
                         bn_momentum=args.bn_momentum,
                         bn_eps=args.bn_eps,
                         checkpoint_path=args.initial_checkpoint)

    print('Model %s created, param count: %d' %
          (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(model, args, verbose=args.local_rank == 0)

    # optionally resume from a checkpoint
    start_epoch = 0
    optimizer_state = None
    if args.resume:
        optimizer_state, start_epoch = resume_checkpoint(model, args.resume, args.start_epoch)

    if args.num_gpu > 1:
        if args.amp:
            print('Warning: AMP does not work well with nn.DataParallel, disabling. '
                  'Use distributed mode for multi-GPU AMP.')
            args.amp = False
        model = nn.DataParallel(model, device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state)

    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
        print('AMP enabled')
    else:
        use_amp = False
        print('AMP disabled')

    if args.distributed:
        model = DDP(model, delay_allreduce=True)

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)
    if args.local_rank == 0:
        print('Scheduled epochs: ', num_epochs)

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        print('Error: training folder does not exist at: %s' % train_dir)
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        collate_fn = FastCollateMixup(args.mixup, args.smoothing, args.num_classes)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        rand_erase_prob=args.reprob,
        rand_erase_mode=args.remode,
        interpolation='random',  # FIXME cleanly resolve this? data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
    )

    eval_dir = os.path.join(args.data, 'validation')
    if not os.path.isdir(eval_dir):
        print('Error: validation folder does not exist at: %s' % eval_dir)
        exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=4 * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
    )

    if args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    saver = None
    if output_dir:
        decreasing = True if eval_metric == 'loss' else False
        saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)
    best_metric = None
    best_epoch = None
    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch, model, loader_train, optimizer, train_loss_fn, args,
                                        lr_scheduler=lr_scheduler, saver=saver,
                                        output_dir=output_dir, use_amp=use_amp)

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if lr_scheduler is not None:
                lr_scheduler.step(epoch, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                best_metric, best_epoch = saver.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.model,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'args': args,
                    },
                    epoch=epoch + 1,
                    metric=eval_metrics[eval_metric])

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        print('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
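
# Sketch of the dataset layout main() above expects. Only the 'train' and
# 'validation' directory names are checked by the code; the class-per-subfolder
# structure is an assumption based on the timm-style folder Dataset being used.
import os


def check_data_layout(data_root):
    """Return the (train, validation) dirs main() expects, raising if either is missing."""
    train_dir = os.path.join(data_root, 'train')
    eval_dir = os.path.join(data_root, 'validation')
    for d in (train_dir, eval_dir):
        if not os.path.isdir(d):
            raise FileNotFoundError(f'expected folder missing: {d}')
    return train_dir, eval_dir
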
def main():
    cfg, args = _parse_args()
    torch.manual_seed(args.seed)

    output_base = cfg.OUTPUT_DIR if len(cfg.OUTPUT_DIR) > 0 else './output'
    exp_name = '-'.join([
        datetime.now().strftime("%Y%m%d-%H%M%S"),
        cfg.MODEL.ARCHITECTURE,
        str(cfg.INPUT.IMG_SIZE)
    ])
    output_dir = get_outdir(output_base, exp_name)
    with open(os.path.join(output_dir, 'config.yaml'), 'w', encoding='utf-8') as file_writer:
        # cfg.dump(stream=file_writer, default_flow_style=False, indent=2, allow_unicode=True)
        file_writer.write(pyaml.dump(cfg))
    logger = setup_logger(file_name=os.path.join(output_dir, 'train.log'),
                          control_log=False, log_level='INFO')

    # create model
    model = create_model(cfg.MODEL.ARCHITECTURE,
                         num_classes=cfg.MODEL.NUM_CLASSES,
                         pretrained=True,
                         in_chans=cfg.INPUT.IN_CHANNELS,
                         drop_rate=cfg.MODEL.DROP_RATE,
                         drop_connect_rate=cfg.MODEL.DROP_CONNECT,
                         global_pool=cfg.MODEL.GLOBAL_POOL)

    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    gpu_list = list(map(int, args.gpu.split(',')))
    device = 'cuda'
    if len(gpu_list) == 1:
        model.cuda()
        torch.backends.cudnn.benchmark = True
    elif len(gpu_list) > 1:
        model = nn.DataParallel(model, device_ids=gpu_list)
        model = convert_model(model).cuda()
        torch.backends.cudnn.benchmark = True
    else:
        device = 'cpu'
    logger.info('device: {}, gpu_list: {}'.format(device, gpu_list))

    optimizer = create_optimizer(cfg, model)

    # optionally initialize from a checkpoint
    if args.initial_checkpoint and os.path.isfile(args.initial_checkpoint):
        load_checkpoint(model, args.initial_checkpoint)

    # optionally resume from a checkpoint
    resume_state = None
    resume_epoch = None
    if args.resume and os.path.isfile(args.resume):
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            optimizer.load_state_dict(resume_state['optimizer'])
            logger.info('Restoring optimizer state from [{}]'.format(args.resume))

    start_epoch = 0
    if args.start_epoch is not None:
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch

    model_ema = None
    if cfg.SOLVER.EMA:
        # Important to create EMA model after cuda()
        model_ema = ModelEma(model,
                             decay=cfg.SOLVER.EMA_DECAY,
                             device=device,
                             resume=args.resume)

    lr_scheduler, num_epochs = create_scheduler(cfg, optimizer)
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    # summary
    print('=' * 60)
    print(cfg)
    print('=' * 60)
    print(model)
    print('=' * 60)
    summary(model, (3, cfg.INPUT.IMG_SIZE, cfg.INPUT.IMG_SIZE))

    # dataset
    dataset_train = Dataset(cfg.DATASETS.TRAIN)
    dataset_valid = Dataset(cfg.DATASETS.TEST)
    train_loader = create_loader(dataset_train, cfg, is_training=True)
    valid_loader = create_loader(dataset_valid, cfg, is_training=False)

    # loss function
    if cfg.SOLVER.LABEL_SMOOTHING > 0:
        train_loss_fn = LabelSmoothingCrossEntropy(smoothing=cfg.SOLVER.LABEL_SMOOTHING).to(device)
        validate_loss_fn = nn.CrossEntropyLoss().to(device)
    else:
        train_loss_fn = nn.CrossEntropyLoss().to(device)
        validate_loss_fn = train_loss_fn

    eval_metric = cfg.SOLVER.EVAL_METRIC
    best_metric = None
    best_epoch = None
    saver = CheckpointSaver(checkpoint_dir=output_dir,
                            recovery_dir=output_dir,
                            decreasing=True if eval_metric == 'loss' else False)
    try:
        for epoch in range(start_epoch, num_epochs):
            train_metrics = train_epoch(epoch, model, train_loader, optimizer, train_loss_fn, cfg,
                                        logger, lr_scheduler=lr_scheduler, saver=saver,
                                        device=device, model_ema=model_ema)

            eval_metrics = validate(epoch, model, valid_loader, validate_loss_fn, cfg, logger)

            if model_ema is not None:
                ema_eval_metrics = validate(epoch, model_ema.ema, valid_loader,
                                            validate_loss_fn, cfg, logger)
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch, train_metrics, eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(model, optimizer, cfg,
                                                                epoch=epoch, model_ema=model_ema,
                                                                metric=save_metric)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logger.info('*** Best metric: {0} (epoch {1})'.format(best_metric, best_epoch))
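
# Hedged sketch of the config keys this main() actually reads (the key names come
# from the accesses above; every value is a placeholder, and the real cfg is the
# node returned by _parse_args() and dumped to config.yaml).
EXAMPLE_CFG_KEYS = {
    'OUTPUT_DIR': './output',
    'MODEL': {'ARCHITECTURE': 'efficientnet_b0', 'NUM_CLASSES': 1000,
              'DROP_RATE': 0.2, 'DROP_CONNECT': 0.2, 'GLOBAL_POOL': 'avg'},
    'INPUT': {'IMG_SIZE': 224, 'IN_CHANNELS': 3},
    'DATASETS': {'TRAIN': 'path/to/train', 'TEST': 'path/to/test'},
    'SOLVER': {'LABEL_SMOOTHING': 0.1, 'EMA': True, 'EMA_DECAY': 0.9998,
               'EVAL_METRIC': 'loss'},
}
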