def subprocess_fn(rank, args, temp_dir):
    """Per-process entry point: set up logging and distributed state, then train."""
    dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'), file_mode='a', should_flush=True)

    # Initialize torch.distributed via a shared init file when using >1 GPU.
    if args.num_gpus > 1:
        init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init'))
        if os.name == 'nt':
            # Windows: no NCCL; use gloo and a forward-slash file:// URL.
            backend = 'gloo'
            init_method = 'file:///' + init_file.replace('\\', '/')
        else:
            backend = 'nccl'
            init_method = f'file://{init_file}'
        torch.distributed.init_process_group(backend=backend, init_method=init_method, rank=rank, world_size=args.num_gpus)

    # Initialize torch_utils: cross-process stat syncing; quiet op builds off rank 0.
    sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None
    training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)
    if rank != 0:
        custom_ops.verbosity = 'none'

    # Hand off to the training loop.
    training_loop.training_loop(rank=rank, **args)
def subprocess_fn(rank, args, temp_dir):
    """Worker entry point for one training process."""
    dnnlib.util.Logger(file_name=os.path.join(args.run_dir, "log.txt"), file_mode="a", should_flush=True)

    multi_gpu = args.num_gpus > 1

    # Bring up torch.distributed through a shared init file when multi-GPU.
    if multi_gpu:
        shared_file = os.path.abspath(os.path.join(temp_dir, ".torch_distributed_init"))
        if os.name == "nt":
            # Windows lacks NCCL: fall back to gloo with a forward-slash file URL.
            torch.distributed.init_process_group(
                backend="gloo",
                init_method="file:///" + shared_file.replace("\\", "/"),
                rank=rank,
                world_size=args.num_gpus,
            )
        else:
            torch.distributed.init_process_group(
                backend="nccl",
                init_method=f"file://{shared_file}",
                rank=rank,
                world_size=args.num_gpus,
            )

    # Configure torch_utils: stat syncing across processes, quiet builds off rank 0.
    training_stats.init_multiprocessing(
        rank=rank,
        sync_device=torch.device("cuda", rank) if multi_gpu else None,
    )
    if rank != 0:
        custom_ops.verbosity = "none"

    # Run the training loop in this process.
    training_loop.training_loop(rank=rank, **args)
def run_training(outdir, seed, dry_run, **hyperparam_options):
    """Resolve training options, allocate a fresh run directory, and launch training."""
    # Build training options.
    tflib.init_tf({'rnd.np_random_seed': seed})
    run_desc, training_options = setup_training_options(**hyperparam_options)

    # Allocate the next sequential run id; run dirs are named NNNNN-<desc>.
    existing = []
    if os.path.isdir(outdir):
        existing = [d for d in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, d))]
    matches = (re.match(r'^\d+', d) for d in existing)
    taken_ids = [int(m.group()) for m in matches if m is not None]
    run_id = max(taken_ids, default=-1) + 1
    training_options.run_dir = os.path.join(outdir, f'{run_id:05d}-{run_desc}')
    assert not os.path.exists(training_options.run_dir)

    # Summarize the resolved configuration.
    print()
    print('Training options:')
    print(json.dumps(training_options, indent=2))
    print()
    print(f'Output directory: {training_options.run_dir}')
    print(f'Training data: {training_options.train_dataset_args.path}')
    print(f'Training length: {training_options.total_kimg} kimg')
    # Output resolution is min size scaled by 2**res_log2 per axis.
    res_h = training_options.train_dataset_args.min_h * 2**training_options.train_dataset_args.res_log2
    res_w = training_options.train_dataset_args.min_w * 2**training_options.train_dataset_args.res_log2
    print(f'Height res: {res_h}')
    print(f'Width res: {res_w}')
    print(f'Number of GPUs: {training_options.num_gpus}')
    print()

    # Bail out before touching the filesystem if requested.
    if dry_run:
        print('Dry run; exiting.')
        return

    # Create the run dir, persist the options, and train under a logger.
    print('Creating output directory...')
    os.makedirs(training_options.run_dir)
    with open(os.path.join(training_options.run_dir, 'training_options.json'), 'wt') as f:
        json.dump(training_options, f, indent=2)
    with dnnlib.util.Logger(os.path.join(training_options.run_dir, 'log.txt')):
        training_loop.training_loop(**training_options)
def train(datasets, options, ctx):
    """Fine-tune a checkpointed GAN on a folder of images, streaming progress to ctx.

    Args:
        datasets: mapping with key 'images' -> path to the training images.
        options: mapping with 'checkpoint', 'crop_method', 'max_steps' (plus any
            extra keys consumed by create_training_config).
        ctx: runway-style context object; receives step, metrics, samples,
            and checkpoints as training progresses.
    """
    images_path = datasets['images']

    print('Reading expected image size from starting checkpoint')
    dnnlib.tflib.init_tf()
    # Fix: close the checkpoint file instead of leaking the handle
    # (was `pickle.load(open(...))`).
    # NOTE(review): pickle.load executes arbitrary code from the file --
    # only load checkpoints from trusted sources.
    with open(options['checkpoint'], 'rb') as f:
        _, _, Gs = pickle.load(f)
    # Last two entries of the reversed output shape; presumably (width, height)
    # of an NCHW output shape -- TODO confirm against Gs.output_shape layout.
    width, height = Gs.output_shape[::-1][:2]

    # Context managers remove the scratch dirs deterministically when training
    # finishes (previously cleanup relied on GC finalizing TemporaryDirectory).
    with tempfile.TemporaryDirectory() as resized_dir, tempfile.TemporaryDirectory() as dataset_dir:
        print('Resizing images')
        preprocess_images(images_path, resized_dir, width, height, options['crop_method'])

        print('Converting dataset to TFRecord')
        create_from_images(dataset_dir, resized_dir, 1)

        print('Creating training config')
        result_dir = runway.utils.generate_uuid()
        os.makedirs(result_dir)
        kwargs = create_training_config(dataset_dir, options['checkpoint'], result_dir, **options)
        kwargs.update(max_steps=options['max_steps'])

        # Stream training progress into the context.
        for step, metrics, samples, checkpoints in training_loop(**kwargs):
            ctx.step = step
            for k, v in metrics.items():
                ctx.metrics[k] = v
            for k, v in samples.items():
                ctx.samples.add(k, v)
            for k, v in checkpoints.items():
                ctx.checkpoints.add(k, v)
def run_training(data_path, out_dir, resume=None, mirror=True):
    """Set up training options, allocate a fresh run directory, and start training."""
    run_desc, training_options, outdir = setup_training_options(data_path, out_dir, resume=resume, mirror=mirror)

    # Allocate the next run id; run directories are named NNNNN-<desc>.
    subdirs = []
    if os.path.isdir(outdir):
        subdirs = [name for name in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, name))]
    id_matches = (re.match(r'^\d+', name) for name in subdirs)
    taken_ids = [int(m.group()) for m in id_matches if m is not None]
    next_id = max(taken_ids, default=-1) + 1
    training_options.run_dir = os.path.join(outdir, f'{next_id:05d}-{run_desc}')
    assert not os.path.exists(training_options.run_dir)

    # Create the run dir, save the resolved options, and train under a logger.
    print('Creating output directory...')
    os.makedirs(training_options.run_dir)
    with open(os.path.join(training_options.run_dir, 'training_options.json'), 'wt') as f:
        json.dump(training_options, f, indent=2)
    with dnnlib.util.Logger(os.path.join(training_options.run_dir, 'log.txt')):
        training_loop.training_loop(**training_options)
def main(ctx, outdir, dry_run, encoder_mode, **config_kwargs):
    """Train a GAN using the techniques described in the paper
    "Training Generative Adversarial Networks with Limited Data".

    Examples:

    \b
    # Train with custom dataset using 1 GPU.
    python train.py --outdir=~/training-runs --data=~/mydataset.zip --gpus=1

    \b
    # Train class-conditional CIFAR-10 using 2 GPUs.
    python train.py --outdir=~/training-runs --data=~/datasets/cifar10.zip \\
        --gpus=2 --cfg=cifar --cond=1

    \b
    # Transfer learn MetFaces from FFHQ using 4 GPUs.
    python train.py --outdir=~/training-runs --data=~/datasets/metfaces.zip \\
        --gpus=4 --cfg=paper1024 --mirror=1 --resume=ffhq1024 --snap=10

    \b
    # Reproduce original StyleGAN2 config F.
    python train.py --outdir=~/training-runs --data=~/datasets/ffhq.zip \\
        --gpus=8 --cfg=stylegan2 --mirror=1 --aug=noaug

    \b
    Base configs (--cfg):
      auto       Automatically select reasonable defaults based on resolution
                 and GPU count. Good starting point for new datasets.
      stylegan2  Reproduce results for StyleGAN2 config F at 1024x1024.
      paper256   Reproduce results for FFHQ and LSUN Cat at 256x256.
      paper512   Reproduce results for BreCaHAD and AFHQ at 512x512.
      paper1024  Reproduce results for MetFaces at 1024x1024.
      cifar      Reproduce results for CIFAR-10 at 32x32.

    \b
    Transfer learning source networks (--resume):
      ffhq256        FFHQ trained at 256x256 resolution.
      ffhq512        FFHQ trained at 512x512 resolution.
      ffhq1024       FFHQ trained at 1024x1024 resolution.
      celebahq256    CelebA-HQ trained at 256x256 resolution.
      lsundog256     LSUN Dog trained at 256x256 resolution.
      <PATH or URL>  Custom network pickle.
    """
    dnnlib.util.Logger(should_flush=True)
    print(config_kwargs)

    # Setup training options. encoder_mode selects an alternate kwargs builder;
    # user-facing configuration errors are routed through the click context.
    try:
        if not encoder_mode:
            run_desc, args = setup_training_loop_kwargs(**config_kwargs)
        else:
            run_desc, args = setup_training_loop_encoder_kwargs(**config_kwargs)
    except UserError as err:
        ctx.fail(err)

    # Pick output directory: scan existing run dirs for a leading numeric id
    # and take the next one; dirs are named NNNNN-<desc>.
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [x for x in os.listdir(outdir) if os.path.isdir(os.path.join(outdir, x))]
    prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
    prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
    cur_run_id = max(prev_run_ids, default=-1) + 1
    args.run_dir = os.path.join(outdir, f'{cur_run_id:05d}-{run_desc}')
    assert not os.path.exists(args.run_dir)

    # Print options.
    print()
    print('Training options:')
    print(json.dumps(args, indent=2))
    print()
    print(f'Output directory: {args.run_dir}')
    print(f'Training data: {args.training_set_kwargs.path}')
    print(f'Training duration: {args.total_kimg} kimg')
    print(f'Number of GPUs: {args.num_gpus}')
    print(f'Number of images: {args.training_set_kwargs.max_size}')
    print(f'Image resolution: {args.training_set_kwargs.resolution}')
    print(f'Conditional model: {args.training_set_kwargs.use_labels}')
    print(f'Dataset x-flips: {args.training_set_kwargs.xflip}')
    print()

    # Dry run? Exit before creating any files.
    if dry_run:
        print('Dry run; exiting.')
        return

    # Create output directory and persist the resolved options.
    print('Creating output directory...')
    os.makedirs(args.run_dir)
    with open(os.path.join(args.run_dir, 'training_options.json'), 'wt') as f:
        json.dump(args, f, indent=2)

    # Launch processes.
    print('Launching processes...')
    training_loop.training_loop(**args)
from training.training_loop import training_loop
from utils.training_utils import *
from argparse import ArgumentParser

# Command-line interface: only the sampling batch size is configurable.
cli = ArgumentParser(description='Parser for all sample.')
cli.add_argument('--batch_size', type=int, default=128, help='batch size for sample')
cli_args = cli.parse_args()

# Assemble the run configuration: task directory, log, hyper-parameters, gin file.
config = Config()
config.make_task_dir()
config.make_task_log()
config.set(gpu_nums=8, batch_size=cli_args.batch_size)
config.write_config_and_gin()

# Run training, then tear the configuration down.
training_loop(config=config)
config.terminate()