Example #1
def subprocess_fn(rank, args, temp_dir):
    dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'),
                       file_mode='a',
                       should_flush=True)

    # Init torch.distributed.
    if args.num_gpus > 1:
        init_file = os.path.abspath(
            os.path.join(temp_dir, '.torch_distributed_init'))
        if os.name == 'nt':
            init_method = 'file:///' + init_file.replace('\\', '/')
            torch.distributed.init_process_group(backend='gloo',
                                                 init_method=init_method,
                                                 rank=rank,
                                                 world_size=args.num_gpus)
        else:
            init_method = f'file://{init_file}'
            torch.distributed.init_process_group(backend='nccl',
                                                 init_method=init_method,
                                                 rank=rank,
                                                 world_size=args.num_gpus)

    # Init torch_utils.
    sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None
    training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)
    if rank != 0:
        custom_ops.verbosity = 'none'

    # Execute training loop.
    training_loop.training_loop(rank=rank, **args)
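
subprocess_fn is the per-GPU worker from StyleGAN2-ADA (PyTorch). In the upstream train.py it is launched once per GPU via torch.multiprocessing.spawn, sharing a temporary directory for the file:// rendezvous used above. A minimal sketch of that launch, assuming args has been prepared as in Example #6:

import tempfile
import torch

# One process per GPU; spawn() supplies rank as the first argument to subprocess_fn.
with tempfile.TemporaryDirectory() as temp_dir:
    if args.num_gpus == 1:
        subprocess_fn(rank=0, args=args, temp_dir=temp_dir)
    else:
        torch.multiprocessing.spawn(fn=subprocess_fn, args=(args, temp_dir), nprocs=args.num_gpus)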
Example #2
def subprocess_fn(rank, args, temp_dir):
    dnnlib.util.Logger(file_name=os.path.join(args.run_dir, "log.txt"), file_mode="a", should_flush=True)

    # Init torch.distributed.
    if args.num_gpus > 1:
        init_file = os.path.abspath(os.path.join(temp_dir, ".torch_distributed_init"))
        if os.name == "nt":
            init_method = "file:///" + init_file.replace("\\", "/")
            torch.distributed.init_process_group(
                backend="gloo", init_method=init_method, rank=rank, world_size=args.num_gpus
            )
        else:
            init_method = f"file://{init_file}"
            torch.distributed.init_process_group(
                backend="nccl", init_method=init_method, rank=rank, world_size=args.num_gpus
            )

    # Init torch_utils.
    sync_device = torch.device("cuda", rank) if args.num_gpus > 1 else None
    training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)
    if rank != 0:
        custom_ops.verbosity = "none"

    # Execute training loop.
    training_loop.training_loop(rank=rank, **args)
Example #3
def run_training(outdir, seed, dry_run, **hyperparam_options):
    # Setup training options.
    tflib.init_tf({'rnd.np_random_seed': seed})
    run_desc, training_options = setup_training_options(**hyperparam_options)

    # Pick output directory.
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [
            x for x in os.listdir(outdir)
            if os.path.isdir(os.path.join(outdir, x))
        ]
    prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
    prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
    cur_run_id = max(prev_run_ids, default=-1) + 1
    training_options.run_dir = os.path.join(outdir,
                                            f'{cur_run_id:05d}-{run_desc}')
    assert not os.path.exists(training_options.run_dir)

    # Print options.
    print()
    print('Training options:')
    print(json.dumps(training_options, indent=2))
    print()
    print(f'Output directory:  {training_options.run_dir}')
    print(f'Training data:     {training_options.train_dataset_args.path}')
    print(f'Training length:   {training_options.total_kimg} kimg')
    dataset_args = training_options.train_dataset_args
    res_h = dataset_args.min_h * 2 ** dataset_args.res_log2
    res_w = dataset_args.min_w * 2 ** dataset_args.res_log2
    print(f'Height res: {res_h}')
    print(f'Width res: {res_w}')
    #print(f'Resolution Height:        {training_options.train_dataset_args.resolution}')
    print(f'Number of GPUs:    {training_options.num_gpus}')
    print()

    # Dry run?
    if dry_run:
        print('Dry run; exiting.')
        return

    # Kick off training.
    print('Creating output directory...')
    os.makedirs(training_options.run_dir)
    with open(os.path.join(training_options.run_dir, 'training_options.json'),
              'wt') as f:
        json.dump(training_options, f, indent=2)
    with dnnlib.util.Logger(os.path.join(training_options.run_dir, 'log.txt')):
        training_loop.training_loop(**training_options)
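
The output-directory block above (scan outdir, take the highest numeric prefix, add one) recurs verbatim in Examples #5 and #6. Factored out, the pattern is a small helper; pick_run_dir is a hypothetical name, not part of any of these repositories:

import os
import re

def pick_run_dir(outdir, run_desc):
    # List existing run directories and pull out their leading numeric IDs.
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [x for x in os.listdir(outdir)
                         if os.path.isdir(os.path.join(outdir, x))]
    matches = (re.match(r'^\d+', x) for x in prev_run_dirs)
    prev_run_ids = [int(m.group()) for m in matches if m is not None]
    # Next run is one past the highest existing ID; 00000 for an empty outdir.
    cur_run_id = max(prev_run_ids, default=-1) + 1
    return os.path.join(outdir, f'{cur_run_id:05d}-{run_desc}')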
Example #4
def train(datasets, options, ctx):
    images_path = datasets['images']

    print('Reading expected image size from starting checkpoint')
    dnnlib.tflib.init_tf()
    with open(options['checkpoint'], 'rb') as f:
        _, _, Gs = pickle.load(f)
    # Gs.output_shape is NCHW, so reversing it yields (width, height, channels, batch).
    width, height = Gs.output_shape[::-1][:2]

    print('Resizing images')
    tmp_resized = tempfile.TemporaryDirectory()
    preprocess_images(images_path, tmp_resized.name, width, height, options['crop_method'])

    print('Converting dataset to TFRecord')
    tmp_dataset = tempfile.TemporaryDirectory()
    create_from_images(tmp_dataset.name, tmp_resized.name, 1)  # third argument: shuffle

    print('Creating training config')
    result_dir = runway.utils.generate_uuid()
    os.makedirs(result_dir)
    kwargs = create_training_config(tmp_dataset.name, options['checkpoint'], result_dir, **options)
    kwargs.update(max_steps=options['max_steps'])
    gen = training_loop(**kwargs)

    for (step, metrics, samples, checkpoints) in gen:
        ctx.step = step
        for k, v in metrics.items():
            ctx.metrics[k] = v
        for k, v in samples.items():
            ctx.samples.add(k, v)
        for k, v in checkpoints.items():
            ctx.checkpoints.add(k, v)
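
The ctx object here is supplied by the Runway SDK; the loop only touches ctx.step, ctx.metrics, and the .add(...) method on ctx.samples and ctx.checkpoints. For exercising the handler outside Runway, a stand-in needs no more than that interface (StubContext and _Collector are hypothetical, not part of the runway package):

class _Collector:
    # Records key/value pairs the way ctx.samples.add and ctx.checkpoints.add are used above.
    def __init__(self):
        self.items = []

    def add(self, key, value):
        self.items.append((key, value))

class StubContext:
    # Hypothetical stand-in exposing only what the training loop above touches.
    def __init__(self):
        self.step = 0
        self.metrics = {}
        self.samples = _Collector()
        self.checkpoints = _Collector()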
Example #5
def run_training(data_path, out_dir, resume=None, mirror=True):

    run_desc, training_options, outdir = setup_training_options(data_path,
                                                                out_dir,
                                                                resume=resume,
                                                                mirror=mirror)

    # Pick output directory.
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [
            x for x in os.listdir(outdir)
            if os.path.isdir(os.path.join(outdir, x))
        ]
    prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
    prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
    cur_run_id = max(prev_run_ids, default=-1) + 1
    training_options.run_dir = os.path.join(outdir,
                                            f'{cur_run_id:05d}-{run_desc}')
    assert not os.path.exists(training_options.run_dir)

    # Print options.
    # print()
    # print('Training options:')
    # print(json.dumps(training_options, indent=2))
    # print()
    # print(f'Output directory:  {training_options.run_dir}')
    # print(f'Training data:     {training_options.train_dataset_args.path}')
    # print(f'Training length:   {training_options.total_kimg} kimg')
    # print(f'Resolution:        {training_options.train_dataset_args.resolution}')
    # print(f'Number of GPUs:    {training_options.num_gpus}')
    # print()

    # Kick off training.
    print('Creating output directory...')
    os.makedirs(training_options.run_dir)
    with open(os.path.join(training_options.run_dir, 'training_options.json'),
              'wt') as f:
        json.dump(training_options, f, indent=2)
    with dnnlib.util.Logger(os.path.join(training_options.run_dir, 'log.txt')):
        training_loop.training_loop(**training_options)
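
Called directly, this wrapper only needs a dataset path and an output root; resume is forwarded to setup_training_options. A sketch of an invocation, where the paths are placeholders and the 'ffhq256' value is an assumption borrowed from the --resume options in Example #6's docstring:

run_training(
    data_path='~/datasets/mydataset.zip',  # placeholder path
    out_dir='~/training-runs',             # placeholder path
    resume='ffhq256',                      # assumed transfer-learning source
    mirror=True,                           # enable dataset x-flips
)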
Example #6
def main(ctx, outdir, dry_run, encoder_mode, **config_kwargs):
    """Train a GAN using the techniques described in the paper
    "Training Generative Adversarial Networks with Limited Data".

    Examples:

    \b
    # Train with custom dataset using 1 GPU.
    python train.py --outdir=~/training-runs --data=~/mydataset.zip --gpus=1

    \b
    # Train class-conditional CIFAR-10 using 2 GPUs.
    python train.py --outdir=~/training-runs --data=~/datasets/cifar10.zip \\
        --gpus=2 --cfg=cifar --cond=1

    \b
    # Transfer learn MetFaces from FFHQ using 4 GPUs.
    python train.py --outdir=~/training-runs --data=~/datasets/metfaces.zip \\
        --gpus=4 --cfg=paper1024 --mirror=1 --resume=ffhq1024 --snap=10

    \b
    # Reproduce original StyleGAN2 config F.
    python train.py --outdir=~/training-runs --data=~/datasets/ffhq.zip \\
        --gpus=8 --cfg=stylegan2 --mirror=1 --aug=noaug

    \b
    Base configs (--cfg):
      auto       Automatically select reasonable defaults based on resolution
                 and GPU count. Good starting point for new datasets.
      stylegan2  Reproduce results for StyleGAN2 config F at 1024x1024.
      paper256   Reproduce results for FFHQ and LSUN Cat at 256x256.
      paper512   Reproduce results for BreCaHAD and AFHQ at 512x512.
      paper1024  Reproduce results for MetFaces at 1024x1024.
      cifar      Reproduce results for CIFAR-10 at 32x32.

    \b
    Transfer learning source networks (--resume):
      ffhq256        FFHQ trained at 256x256 resolution.
      ffhq512        FFHQ trained at 512x512 resolution.
      ffhq1024       FFHQ trained at 1024x1024 resolution.
      celebahq256    CelebA-HQ trained at 256x256 resolution.
      lsundog256     LSUN Dog trained at 256x256 resolution.
      <PATH or URL>  Custom network pickle.
    """

    dnnlib.util.Logger(should_flush=True)
    print(config_kwargs)
    # Setup training options.
    try:
        if not encoder_mode:
            run_desc, args = setup_training_loop_kwargs(**config_kwargs)
        else:
            run_desc, args = setup_training_loop_encoder_kwargs(
                **config_kwargs)
    except UserError as err:
        ctx.fail(err)

    # Pick output directory.
    prev_run_dirs = []
    if os.path.isdir(outdir):
        prev_run_dirs = [
            x for x in os.listdir(outdir)
            if os.path.isdir(os.path.join(outdir, x))
        ]
    prev_run_ids = [re.match(r'^\d+', x) for x in prev_run_dirs]
    prev_run_ids = [int(x.group()) for x in prev_run_ids if x is not None]
    cur_run_id = max(prev_run_ids, default=-1) + 1
    args.run_dir = os.path.join(outdir, f'{cur_run_id:05d}-{run_desc}')
    assert not os.path.exists(args.run_dir)

    # Print options.
    print()
    print('Training options:')
    print(json.dumps(args, indent=2))
    print()
    print(f'Output directory:   {args.run_dir}')
    print(f'Training data:      {args.training_set_kwargs.path}')
    print(f'Training duration:  {args.total_kimg} kimg')
    print(f'Number of GPUs:     {args.num_gpus}')
    print(f'Number of images:   {args.training_set_kwargs.max_size}')
    print(f'Image resolution:   {args.training_set_kwargs.resolution}')
    print(f'Conditional model:  {args.training_set_kwargs.use_labels}')
    print(f'Dataset x-flips:    {args.training_set_kwargs.xflip}')
    print()

    # Dry run?
    if dry_run:
        print('Dry run; exiting.')
        return

    # Create output directory.
    print('Creating output directory...')
    os.makedirs(args.run_dir)
    with open(os.path.join(args.run_dir, 'training_options.json'), 'wt') as f:
        json.dump(args, f, indent=2)

    # Launch processes.
    print('Launching processes...')
    training_loop.training_loop(**args)
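
Note that args survives both json.dump(args, f, indent=2) and training_loop.training_loop(**args): in these repositories args is a dnnlib.EasyDict, a dict subclass whose entries double as attributes. Roughly:

class EasyDict(dict):
    # Sketch of dnnlib.EasyDict: dict entries are readable and writable as attributes.
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]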
Example #7
from training.training_loop import training_loop
from utils.training_utils import *
from argparse import ArgumentParser

usage = 'Parser for all sample.'
parser = ArgumentParser(description=usage)
parser.add_argument('--batch_size',
                    type=int,
                    default=128,
                    help='batch size for sample')

args = parser.parse_args()
config = Config()
config.make_task_dir()
config.make_task_log()
config.set(gpu_nums=8, batch_size=args.batch_size)
config.write_config_and_gin()
training_loop(config=config)
config.terminate()
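
This last example is a standalone launcher rather than a function; with its single --batch_size flag, a typical invocation (the script name is hypothetical) is:

python sample.py --batch_size 256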