Example #1
def launch(main_func,
           num_gpus_per_machine,
           num_machines=1,
           machine_rank=0,
           ips=None,
           args=()):
    """
    Launch multi-gpu or distributed training.
    This function must be called on all machines involved in the training.
    It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.

    Args:
        main_func: a function that will be called as `main_func(*args)`
        num_gpus_per_machine (int): number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine
        ips (str): IP address(es) to connect to for distributed jobs,
                       e.g. "127.0.0.1".
        args (tuple): arguments passed to main_func
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size > 1:
        options = {}
        if ips is not None:
            options.update({'ips': ips})
        dist.spawn(main_func, nprocs=num_gpus_per_machine, args=args, **options)
    else:
        main_func(*args)
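
A minimal sketch of how this launcher might be called for a two-machine job. The training function, IP list, and config dict below are illustrative assumptions, not part of the original example.

# Hypothetical usage of launch() above (sketch only; names are assumptions).
def train_loop(cfg):
    # placeholder training body; replace with real training code
    print('training with config:', cfg)

if __name__ == '__main__':
    launch(train_loop,
           num_gpus_per_machine=4,
           num_machines=2,
           machine_rank=0,  # set to 1 when running on the second machine
           ips='192.168.1.1,192.168.1.2',
           args=({'lr': 0.1},))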
Example #2
 def test_spawn(self):
     context = dist.spawn(train, backend='cncl', nprocs=4)
     rank_list = []
     for i in range(4):
         rank_list.append(context.return_queues[i].get())
     rank_list.sort()
     self.assertEqual(rank_list, list(range(4)))
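
A hedged sketch of the worker this test might spawn: paddle's spawn collects each process's return value into context.return_queues, so a worker that returns dist.get_rank() produces the rank list the test asserts on. The body below is an assumption, not the original train function.

# Hypothetical worker for test_spawn above (assumption, not from the source).
import paddle.distributed as dist

def train():
    dist.init_parallel_env()
    # the return value is what test_spawn reads from context.return_queues
    return dist.get_rank()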
Example #3
def main():
    args = parser.parse_args()
    os.makedirs(args.save, exist_ok=True)

    # save the configurations
    t = time.localtime()
    timestamp = time.strftime('%b-%d-%Y_%H%M', t)
    with open(os.path.join(args.save, 'args-{}.txt'.format(timestamp)),
              'w') as fh:
        json.dump(args.__dict__, fh, indent=2)

    print('Start at: {}'.format(timestamp))

    # show non-default args
    default_args = parser.parse_args([args.data, args.save])
    for key in args.__dict__:
        if args.__dict__[key] != default_args.__dict__[key]:
            print('{}: {} | default ({})'.format(key, args.__dict__[key],
                                                 default_args.__dict__[key]))

    if args.seed is not None:
        random.seed(args.seed)
        paddle.seed(args.seed)
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    if args.gpu is not None:
        warnings.warn('You have chosen a specific GPU. This will completely '
                      'disable data parallelism.')

    # paddle.get_cuda_rng_state() returns one RNG state per visible GPU,
    # so its length gives the number of GPUs on this node
    ngpus_per_node = len(paddle.get_cuda_rng_state())
    print('ngpus per node is {}'.format(ngpus_per_node))
    if args.distributed:
        dist.spawn(main_worker,
                   nprocs=ngpus_per_node,
                   args=(args.gpu, ngpus_per_node, args),
                   started_port=6671)
    else:
        # Simply call main_worker function
        main_worker(args.gpu, ngpus_per_node, args)
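
dist.spawn calls the target function as main_worker(*args), so the worker's signature must match the args tuple passed above; the per-process rank is read inside the worker rather than passed in. A minimal compatible sketch (the body is an assumption):

# Hypothetical main_worker matching the spawn call above (sketch only).
def main_worker(gpu, ngpus_per_node, args):
    if args.distributed:
        # each spawned process joins the parallel environment
        dist.init_parallel_env()
    rank = dist.get_rank() if args.distributed else 0
    print('worker {} of {} started (gpu arg: {})'.format(rank, ngpus_per_node, gpu))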
Example #4
def main(config, args):
    if args.nprocs > 1 and args.device == "gpu":
        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
    else:
        main_sp(config, args)
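
The dispatcher above expects an args object with nprocs and device fields plus a separate config; a hedged sketch of how they might be supplied (all names below are assumptions):

# Hypothetical entry point for the dispatcher above (assumption).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', type=str, default='gpu', choices=['cpu', 'gpu'])
    parser.add_argument('--nprocs', type=int, default=1)
    args = parser.parse_args()
    config = {}  # the original code passes a parsed config object here
    main(config, args)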
Example #5

def train():
    # 1. initialize parallel environment (cpu & gpu)
    dist.init_parallel_env()

    # 2. set cpu place
    paddle.set_device('cpu')

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    loss.backward()

    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train, nprocs=2)
Example #6
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.CrossEntropyLoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # create data loader
    dataset = RandomDataset(BATCH_NUM * BATCH_SIZE)
    loader = paddle.io.DataLoader(dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=1)

    # train
    for epoch_id in range(EPOCH_NUM):
        for batch_id, (image, label) in enumerate(loader()):
            out = layer(image)
            loss = loss_fn(out, label)

            loss.backward()

            adam.step()
            adam.clear_grad()

            if dist.get_rank() == 0:
                print("Epoch {} batch {}: loss = {}".format(
                    epoch_id, batch_id, np.mean(loss.numpy())))

if __name__ == '__main__':
    dist.spawn(train, nprocs=2, selected_gpus='2,3')
Example #7
def train():
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    loss = dp_layer.scale_loss(loss)
    loss.backward()
    dp_layer.apply_collective_grads()

    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train)
Example #8
                train_step += 1
            # Also save the model every fixed number of steps
            if batch_id % 2000 == 0 and batch_id != 0 and dist.get_rank() == 0:
                # Save the model
                save_model(args=args,
                           epoch=epoch,
                           model=model,
                           optimizer=optimizer)
        # In multi-GPU training, only one process runs evaluation and saves the model
        if dist.get_rank() == 0:
            # Run evaluation
            model.eval()
            cer = evaluate(model, test_loader, greedy_decoder)
            print('[%s] Test epoch %d, cer: %f' % (datetime.now(), epoch, cer))
            writer.add_scalar('Test cer', cer, test_step)
            test_step += 1
            model.train()
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            # Save the model
            save_model(args=args,
                       epoch=epoch,
                       model=model,
                       optimizer=optimizer)
        scheduler.step()


if __name__ == '__main__':
    print_arguments(args)
    dist.spawn(train, args=(args, ))
Example #9
    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(
        learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    loss.backward()

    adam.step()
    adam.clear_grad()

    if dist.get_rank() == 0:
        print("loss:", loss.numpy())
        # paddle.jit.save(dp_layer, "spawn_model/linear", 
        #     input_spec=[InputSpec(shape=[None, 10], dtype='float32')])

if __name__ == '__main__':
    dist.spawn(train, nprocs=4)
Example #10
            # In multi-GPU training, only one process prints
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader), sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', los, train_step)
                train_step += 1
                loss_sum = []
        # In multi-GPU training, only one process runs evaluation and saves the model
        if dist.get_rank() == 0:
            acc = test(model, metric_fc, test_loader)
            print('='*70)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('='*70)
            writer.add_scalar('Test acc', acc, test_step)
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()


if __name__ == '__main__':
    print_arguments(args)
    if len(args.gpus.split(',')) > 1:
        dist.spawn(train, args=(args,), gpus=args.gpus, nprocs=len(args.gpus.split(',')))
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        train(args)
Example #11
            # In multi-GPU training, only one process prints
            if batch_id % 100 == 0 and dist.get_rank() == 0:
                eta_sec = ((time.time() - start) * 1000) * (sum_batch - (epoch - last_epoch) * len(train_loader) - batch_id)
                eta_str = str(timedelta(seconds=int(eta_sec / 1000)))
                print('[%s] Train epoch %d, batch: %d/%d, loss: %f, accuracy: %f, eta: %s' % (
                    datetime.now(), epoch, batch_id, len(train_loader), sum(loss_sum) / len(loss_sum), sum(accuracies) / len(accuracies), eta_str))
                writer.add_scalar('Train loss', los, train_step)
                train_step += 1
                loss_sum = []
        # In multi-GPU training, only one process runs evaluation and saves the model
        if dist.get_rank() == 0:
            print('='*70)
            acc = test(model)
            print('[%s] Test %d, accuracy: %f' % (datetime.now(), epoch, acc))
            print('='*70)
            writer.add_scalar('Test acc', acc, test_step)
            # Log the learning rate
            writer.add_scalar('Learning rate', scheduler.last_lr, epoch)
            test_step += 1
            save_model(args, epoch, model, metric_fc, optimizer)
        scheduler.step()
    save_model(args, args.num_epoch, model, metric_fc, optimizer)


if __name__ == '__main__':
    print_arguments(args)
    if len(args.gpus.split(',')) > 1:
        dist.spawn(train, args=(args,), gpus=args.gpus)
    else:
        train(args)
Example #12
def train_from_folder(
        data='./data',
        results_dir='./results',
        models_dir='./models',
        name='default',
        new=False,
        load_from=-1,
        image_size=128,
        network_capacity=16,
        transparent=False,
        batch_size=5,
        gradient_accumulate_every=6,
        num_train_steps=150000,
        learning_rate=2e-4,
        lr_mlp=0.1,
        ttur_mult=1.5,
        rel_disc_loss=False,
        num_workers=None,
        save_every=1000,
        generate=False,
        generate_interpolation=False,
        interpolation_num_steps=100,
        save_frames=False,
        num_image_tiles=8,
        trunc_psi=0.75,
        mixed_prob=0.9,
        fp16=False,
        cl_reg=False,
        fq_layers=[],
        fq_dict_size=256,
        attn_layers=[],
        no_const=False,
        aug_prob=0.,
        aug_types=['translation', 'cutout'],
        top_k_training=False,
        generator_top_k_gamma=0.99,
        generator_top_k_frac=0.5,
        dataset_aug_prob=0.,
        multi_gpus=False,
        calculate_fid_every=None,
        seed=42,
        use_shared_memory=True,  # set to False if /dev/shm is limited
):
    model_args = dict(name=name,
                      results_dir=results_dir,
                      models_dir=models_dir,
                      batch_size=batch_size,
                      gradient_accumulate_every=gradient_accumulate_every,
                      image_size=image_size,
                      network_capacity=network_capacity,
                      transparent=transparent,
                      lr=learning_rate,
                      lr_mlp=lr_mlp,
                      ttur_mult=ttur_mult,
                      rel_disc_loss=rel_disc_loss,
                      num_workers=num_workers,
                      save_every=save_every,
                      trunc_psi=trunc_psi,
                      fp16=fp16,
                      cl_reg=cl_reg,
                      fq_layers=fq_layers,
                      fq_dict_size=fq_dict_size,
                      attn_layers=attn_layers,
                      no_const=no_const,
                      aug_prob=aug_prob,
                      aug_types=cast_list(aug_types),
                      top_k_training=top_k_training,
                      generator_top_k_gamma=generator_top_k_gamma,
                      generator_top_k_frac=generator_top_k_frac,
                      dataset_aug_prob=dataset_aug_prob,
                      calculate_fid_every=calculate_fid_every,
                      mixed_prob=mixed_prob)

    if generate:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.evaluate(samples_name, num_image_tiles)
        print(
            f'sample images generated at {results_dir}/{name}/{samples_name}')
        return

    if generate_interpolation:
        model = Trainer(**model_args)
        model.load(load_from)
        samples_name = timestamped_filename()
        model.generate_interpolation(samples_name,
                                     num_image_tiles,
                                     num_steps=interpolation_num_steps,
                                     save_frames=save_frames)
        print(
            f'interpolation generated at {results_dir}/{name}/{samples_name}')
        return

    world_size = dist.get_world_size()

    if world_size == 1 or not multi_gpus:
        run_training(0, 1, model_args, data, load_from, new, num_train_steps,
                     name, seed, use_shared_memory)
        return

    dist.spawn(run_training,
               args=(world_size, model_args, data, load_from, new,
                     num_train_steps, name, seed, use_shared_memory),
               nprocs=world_size,
               join=True)
Example #13
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)
    assert len(loss) == 1
    if print_result is True:
        train_data_list1.append(loss.numpy())
    assert len(train_data_list1)

    loss.backward()

    adam.step()
    adam.clear_grad()


if __name__ == '__main__':
    dist.spawn(train, args=(True, ))
    dist.spawn(train)
    dist.spawn(train, args=(True, ), nprocs=2, gpus='0,1')
    dist.spawn(train, args=(True, ), nprocs=2)
Example #14
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)
    
    if print_result is True:
        print("loss:", loss.numpy())
    
    loss.backward()

    adam.step()
    adam.clear_grad()

# Usage 1: only pass the function.
# If your training method needs no arguments and
# uses all visible devices for parallel training.
if __name__ == '__main__':
    dist.spawn(train)

# Usage 2: pass the function and its arguments.
# If your training method needs some arguments and
# uses all visible devices for parallel training.
if __name__ == '__main__':
    dist.spawn(train, args=(True,))

# Usage 3: pass the function, its arguments, and nprocs.
# If your training method needs some arguments and
# only uses part of the visible devices for parallel training.
# If your machine holds 8 cards {0,1,2,3,4,5,6,7},
# this case will use cards {0,1}; if you set
# CUDA_VISIBLE_DEVICES=4,5,6,7, this case will use
# cards {4,5}.
if __name__ == '__main__':
    dist.spawn(train, args=(True,), nprocs=2)
Example #15
def _set_proc_name(config, tag_base):
    """
    Set the process name on the local machine.
    """
    if config.general.is_cloud:
        return
    if tag_base.startswith('train'):
        tag_base = 'train'
    import setproctitle
    setproctitle.setproctitle(tag_base + '_' + config.data.output.rstrip('/')
                              .split('/')[-1])


if __name__ == "__main__":
    config = global_config.gen_config()
    init_env(config)

    run_mode = config.general.mode
    if run_mode == 'preproc':
        preprocess(config)
        sys.exit(0)

    _set_proc_name(config, run_mode)
    if run_mode == 'test':
        evaluate(config)
    elif run_mode == 'infer':
        inference(config)
    elif run_mode.startswith('train'):
        if config.train.use_data_parallel:
            dist.spawn(train, args=(config, ))
        else:
            train(config)
Example #16
            if args.do_train:
                # If do_eval=True, use best model to evaluate the test data.
                # Otherwise, use final model to evaluate the test data.
                if args.do_eval:
                    args.init_from_ckpt = os.path.join(args.output_dir, 'best')
                    load_ckpt(args, model)
            else:
                if not args.init_from_ckpt:
                    raise ValueError('"init_from_ckpt" should be set.')
                load_ckpt(args, model)
            print('\nTest begin...')
            evaluation(args, model, test_data_loader, metric)


def print_args(args):
    print('-----------  Configuration Arguments -----------')
    for arg, value in sorted(vars(args).items()):
        print('%s: %s' % (arg, value))
    print('------------------------------------------------')


if __name__ == '__main__':
    args = parse_args()
    set_default_args(args)
    print_args(args)

    if args.n_gpu > 1:
        dist.spawn(main, args=(args, ), nprocs=args.n_gpu)
    else:
        main(args)
Example #17
def train(world_size=2):
    if world_size > 1:
        dist.spawn(do_train, nprocs=world_size, args=())
    else:
        do_train()
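
do_train is not shown in this example; below is a minimal hedged sketch of a compatible worker, modeled on the earlier examples in this collection. LinearNet, the layer sizes, and the hyperparameters are assumptions for illustration only.

# Hypothetical do_train worker for the wrapper above (sketch only).
import paddle
import paddle.nn as nn
import paddle.optimizer as opt
import paddle.distributed as dist

class LinearNet(nn.Layer):
    # assumed single-layer network, mirroring the toy models used above
    def __init__(self):
        super().__init__()
        self._linear = nn.Linear(10, 1)

    def forward(self, x):
        return self._linear(x)

def do_train():
    # join the parallel environment only when launched via dist.spawn
    if dist.get_world_size() > 1:
        dist.init_parallel_env()

    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)
    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # one toy step: forward, backward, update
    inputs = paddle.randn([10, 10], 'float32')
    loss = loss_fn(dp_layer(inputs), paddle.randn([10, 1], 'float32'))
    loss.backward()
    adam.step()
    adam.clear_grad()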