Example #1
def adjust_learning_rate(epoch, batch_idx):
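    # Linear warmup: ramp the LR factor from 1/bps.size() up to 1 over
    # args.warmup_epochs, then apply a step decay at epochs 30/60/80.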
    if epoch < args.warmup_epochs:
        epoch += float(batch_idx + 1) / len(train_loader)
        lr_adj = 1. / bps.size() * (epoch * (bps.size() - 1) / args.warmup_epochs + 1)
    elif epoch < 30:
        lr_adj = 1.
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.base_lr * bps.size() * args.batches_per_pushpull * lr_adj
Example #2
def benchmark(tensor, average, name):
    if not args.no_wait and hvd.rank() == 0:
        # let other workers submit allreduce request first
        time.sleep(0.01)
    start = time.time()
    # do not use allreduce_() as it polls every 1ms
    handle = push_pull_async_inplace(tensor, average, name)
    while True:
        if poll(handle):
            synchronize(handle)
            break
    end = time.time()
    return (end - start) * 1000


log('Number of GPUs: %d' % (hvd.size()))

# Benchmark
log('Running benchmark...')

log('size (Byte)    avg. time (ms)    std.dev (ms)')
for i in range(10):
    size = 10**i
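    # size counts float32 elements, so the actual payload is 4 * size bytes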
    data = torch.rand(size, dtype=torch.float32)
    if args.cuda:
        data = data.cuda()
    # warm up
    for j in range(args.num_warmup):
        benchmark(tensor=data, average=True, name=str(i))
    # timeit
    durations = []
Example #3
    args.log_dir) if bps.rank() == 0 else None

kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# BytePS: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=bps.size()` and `rank=bps.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
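# Passing a sampler replaces shuffle=True: each worker reads a distinct shard of the dataset per epoch.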
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=allreduce_batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

val_dataset = \
    datasets.ImageFolder(args.val_dir,
                         transform=transforms.Compose([
                             transforms.Resize(256),
                             transforms.CenterCrop(224),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
val_sampler = torch.utils.data.distributed.DistributedSampler(
Example #4
    loss = F.cross_entropy(output, target)
    loss.backward()
    optimizer.step()


def log(s, nl=True):
    if bps.local_rank() != 0:
        return
    print(s, end='\n' if nl else '')
    sys.stdout.flush()


log('Model: %s' % args.model)
log('Batch size: %d' % args.batch_size)
device = 'GPU' if args.cuda else 'CPU'
log('Number of %ss: %d' % (device, bps.size()))

# Warm-up
log('Running warmup...')
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark
log('Running benchmark...')
img_secs = []
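# Collect autograd profiler traces on rank 0 only, and only when args.profiler is enabled.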
enable_profiling = args.profiler and (bps.rank() == 0)

with torch.autograd.profiler.profile(enable_profiling, True) as prof:
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
Example #5
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = DDP(model,
                        device_ids=[args.gpu],
                        broadcast_buffers=args.broadcast_buffers)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True
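    # cudnn.benchmark autotunes convolution algorithms, which helps when input sizes are fixed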

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=bps.size(), rank=bps.rank())
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best)
Example #6
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = DistributedSampler(train_dataset,
                                       num_replicas=bps.size(),
                                       rank=bps.rank())
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = SGD(optimizer_grouped_parameters,
                    lr=args.learning_rate,
                    momentum=0.9)

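    # BytePS: wrap the local optimizer so gradients are synchronized via push-pull,
    # then broadcast rank 0's parameters so every worker starts from identical weights.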
    optimizer = bps.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    bps.broadcast_parameters(model.state_dict(), root_rank=0)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (bps.size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    # Take care of distributed/parallel training
    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (mask_tokens(batch, tokenizer, args)
                              if args.mlm else (batch, batch))
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = (model(inputs, masked_lm_labels=labels)
                       if args.mlm else model(inputs, labels=labels))
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                # mean() to average on multi-gpu parallel training
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #7
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
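# Each rank downloads its own MNIST copy into 'data-<rank>' so concurrent downloads do not clash.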
train_dataset = \
    datasets.MNIST('data-%d' % bps.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % bps.rank(), train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
# BytePS: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=bps.size(), rank=bps.rank())
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=args.test_batch_size,
Example #8

def benchmark(tensor, average, name):
    if not args.no_wait and bps.rank() == 0:
        time.sleep(0.01)
    start = time.time()
    handle = push_pull_async_inplace(tensor, average, name)
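    # busy-poll until the asynchronous push-pull completes, then synchronize the handle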
    while True:
        if poll(handle):
            synchronize(handle)
            break
    end = time.time()
    return (end - start) * 1000


log('Number of GPUs: %d' % (bps.size()))

# Benchmark
log('Running benchmark...')

log('size (Byte) \t avg. time (ms) \t std.dev (ms)')
for i in range(8):
    size = 10**i
    data = torch.rand(size, dtype=torch.float32)
    if args.cuda:
        data = data.cuda()
    # warm up
    for j in range(args.num_warmup):
        benchmark(tensor=data, average=True, name=str(i))
    # timeit
    durations = []
Example #9
def env_world_size():
    return bps.size()
Example #10
        if args.cuda:
            args.model_device = torch.device('cuda')
        else:
            args.model_device = torch.device('cpu')

    # Initialize Horovod/BytePS and CUDA
    myrank = 0
    mysize = 1
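    # Default to single-process values; Horovod or BytePS overrides them below.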
    if args.par == "hvd":
        hvd.init()
        myrank = hvd.rank()
        mysize = hvd.size()
    elif args.par == "bps":
        bps.init()
        myrank = bps.rank()
        mysize = bps.size()
    torch.manual_seed(args.seed)
    if args.cuda:
        # Horovod & BytePS: pin GPU to local rank.
        if args.par == "hvd":
            torch.cuda.set_device(hvd.local_rank())
            torch.cuda.manual_seed(args.seed)
        if args.par == "bps":
            torch.cuda.set_device(bps.local_rank())
            torch.cuda.manual_seed(args.seed)

    # Model definition
    model = MortgageNetwork(
        args.num_features,
        args.embedding_size,
        args.hidden_dims,
Example #11
if args.cuda:
    # BytePS: pin GPU to local rank.
    torch.cuda.set_device(bps.local_rank())
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % bps.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# BytePS: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % bps.rank(), train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
# BytePS: use DistributedSampler to partition the test data.
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=bps.size(), rank=bps.rank())
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,
                                          sampler=test_sampler, **kwargs)

Example #12
    def build_model(self):
        """ DataLoader """

        if self.fix_aug:
            print("FIX AUG ON")
            train_transform = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.Resize((self.img_size, self.img_size)),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ])
        else:
            train_transform = transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.Resize((self.img_size + 30, self.img_size + 30)),
                transforms.RandomCrop(self.img_size),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ])

        test_transform = transforms.Compose([
            transforms.Resize((self.img_size, self.img_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

        self.trainA = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                               'trainA'),
                                  train_transform,
                                  list_mode=self.list_mode)
        self.trainB = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                               'trainB'),
                                  train_transform,
                                  list_mode=self.list_mode)
        self.testA = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                              'testA'),
                                 test_transform,
                                 list_mode=self.list_mode)
        self.testB = ImageFolder(os.path.join(self.dataset_dir, self.dataset,
                                              'testB'),
                                 test_transform,
                                 list_mode=self.list_mode)

        trainA_sampler = torch.utils.data.distributed.DistributedSampler(
            self.trainA, num_replicas=bps.size(), rank=bps.rank())
        trainB_sampler = torch.utils.data.distributed.DistributedSampler(
            self.trainB, num_replicas=bps.size(), rank=bps.rank())
        testA_sampler = torch.utils.data.distributed.DistributedSampler(
            self.testA, num_replicas=bps.size(), rank=bps.rank())
        testB_sampler = torch.utils.data.distributed.DistributedSampler(
            self.testB, num_replicas=bps.size(), rank=bps.rank())

        self.trainA_loader = DataLoader(self.trainA,
                                        batch_size=self.batch_size,
                                        sampler=trainA_sampler,
                                        num_workers=1)
        self.trainB_loader = DataLoader(self.trainB,
                                        batch_size=self.batch_size,
                                        sampler=trainB_sampler,
                                        num_workers=1)
        self.testA_loader = DataLoader(self.testA,
                                       batch_size=1,
                                       sampler=testA_sampler)
        self.testB_loader = DataLoader(self.testB,
                                       batch_size=1,
                                       sampler=testB_sampler)
        """ Define Generator, Discriminator """
        self.genA2B = ResnetGenerator(input_nc=3,
                                      output_nc=3,
                                      ngf=self.ch,
                                      n_blocks=self.n_res,
                                      img_size=self.img_size,
                                      light=self.light).to(self.device)
        self.genB2A = ResnetGenerator(input_nc=3,
                                      output_nc=3,
                                      ngf=self.ch,
                                      n_blocks=self.n_res,
                                      img_size=self.img_size,
                                      light=self.light).to(self.device)
        self.disGA = Discriminator(input_nc=3, ndf=self.ch,
                                   n_layers=7).to(self.device)
        self.disGB = Discriminator(input_nc=3, ndf=self.ch,
                                   n_layers=7).to(self.device)
        self.disLA = Discriminator(input_nc=3, ndf=self.ch,
                                   n_layers=5).to(self.device)
        self.disLB = Discriminator(input_nc=3, ndf=self.ch,
                                   n_layers=5).to(self.device)
        """ Define Loss """
        self.L1_loss = nn.L1Loss().to(self.device)
        self.MSE_loss = nn.MSELoss().to(self.device)
        self.BCE_loss = nn.BCEWithLogitsLoss().to(self.device)

        gen_named_parameters = []
        dis_named_parameters = []
        for n, p in (list(self.genA2B.named_parameters(prefix='genA2B')) +
                     list(self.genB2A.named_parameters(prefix='genB2A'))):
            gen_named_parameters.append((n, p))
        for n, p in (list(self.disGA.named_parameters(prefix='disGA')) +
                     list(self.disGB.named_parameters(prefix='disGB')) +
                     list(self.disLA.named_parameters(prefix='disLA')) +
                     list(self.disLB.named_parameters(prefix='disLB'))):
            dis_named_parameters.append((n, p))

        gen_state_dict = OrderedDict(
            [("genA2B." + k, v) for k, v in self.genA2B.state_dict().items()] +
            [("genB2A." + k, v) for k, v in self.genB2A.state_dict().items()])
        dis_state_dict = OrderedDict(
            [("disGA." + k, v) for k, v in self.disGA.state_dict().items()] +
            [("disGB." + k, v) for k, v in self.disGB.state_dict().items()] +
            [("disLA." + k, v) for k, v in self.disLA.state_dict().items()] +
            [("disLB." + k, v) for k, v in self.disLB.state_dict().items()])

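        # BytePS: broadcast generator and discriminator weights from rank 0 so all workers start in sync.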
        bps.broadcast_parameters(gen_state_dict, root_rank=0)
        bps.broadcast_parameters(dis_state_dict, root_rank=0)
        """ Trainer """
        self.G_optim = torch.optim.Adam(itertools.chain(
            self.genA2B.parameters(), self.genB2A.parameters()),
                                        lr=self.lr,
                                        betas=(0.5, 0.999),
                                        weight_decay=self.weight_decay)
        self.D_optim = torch.optim.Adam(itertools.chain(
            self.disGA.parameters(), self.disGB.parameters(),
            self.disLA.parameters(), self.disLB.parameters()),
                                        lr=self.lr,
                                        betas=(0.5, 0.999),
                                        weight_decay=self.weight_decay)

        named_parameters = []
        for n, p in list(self.genA2B.named_parameters()):
            named_parameters.append(("genA2B." + n, p))
        for n, p in list(self.genB2A.named_parameters()):
            named_parameters.append(("genB2A." + n, p))

        self.G_optim = bps.DistributedOptimizer(
            self.G_optim,
            named_parameters=gen_named_parameters,
            compression=bps.Compression.none)

        self.D_optim = bps.DistributedOptimizer(
            self.D_optim,
            named_parameters=dis_named_parameters,
            compression=bps.Compression.none)

        self.G_optim._handles.clear()
        self.D_optim._handles.clear()
        """ Define Rho clipper to constraint the value of rho in AdaILN and ILN"""
        self.Rho_clipper = RhoClipper(0, 1)
Example #13
    def __init__(self, args):
        self.light = args.light

        if self.light:
            self.model_name = 'UGATIT_light'
        else:
            self.model_name = 'UGATIT'

        self.result_dir = args.result_dir
        self.dataset_dir = args.dataset_dir
        self.dataset = args.dataset

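        # Split the requested iterations across workers; each rank runs only its share.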
        self.iteration = args.iteration // bps.size()

        self.decay_flag = args.decay_flag

        self.batch_size = args.batch_size
        self.print_freq = args.print_freq
        self.save_freq = args.save_freq

        self.lr = args.lr
        self.weight_decay = args.weight_decay
        self.ch = args.ch
        """ Weight """
        self.adv_weight = args.adv_weight
        self.cycle_weight = args.cycle_weight
        self.identity_weight = args.identity_weight
        self.cam_weight = args.cam_weight
        """ Generator """
        self.n_res = args.n_res
        """ Discriminator """
        self.n_dis = args.n_dis

        self.img_size = args.img_size
        self.img_ch = args.img_ch

        self.device = args.device
        self.benchmark_flag = args.benchmark_flag
        self.resume = args.resume
        self.fix_aug = args.fix_aug
        self.list_mode = args.list_mode

        if torch.backends.cudnn.enabled and self.benchmark_flag:
            print('set benchmark !')
            torch.backends.cudnn.benchmark = True

        print()

        print("##### Information #####")
        print("# light : ", self.light)
        print("# dataset : ", self.dataset)
        print("# batch_size : ", self.batch_size)
        print("# iteration per epoch : ", self.iteration)

        print()

        print("##### Generator #####")
        print("# residual blocks : ", self.n_res)

        print()

        print("##### Discriminator #####")
        print("# discriminator layer : ", self.n_dis)

        print()

        print("##### Weight #####")
        print("# adv_weight : ", self.adv_weight)
        print("# cycle_weight : ", self.cycle_weight)
        print("# identity_weight : ", self.identity_weight)
        print("# cam_weight : ", self.cam_weight)
Example #14
def train(trn_loader, model, criterion, optimizer, scheduler, epoch):
    net_meter = NetworkMeter()
    timer = TimeMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    for i, (input, target) in enumerate(trn_loader):
        if args.short_epoch and (i > 10): break
        batch_num = i + 1
        timer.batch_start()
        scheduler.update_lr(epoch, i + 1, len(trn_loader))

        # compute output
        output = model(input)
        loss = criterion(output, target)

        should_print = (batch_num % args.print_freq
                        == 0) or (batch_num == len(trn_loader))

        # compute gradient and do SGD step
        if args.fp16:
            loss = loss * args.loss_scale
            # zero_grad() and converting fp16/fp32 is handled in optimizer
            loss.backward()
            optimizer.step(wait_for_finish=should_print)
            loss = loss / args.loss_scale
        else:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Train batch done. Logging results
        timer.batch_end()

        if args.local_rank == 0 and should_print:
            corr1, corr5 = correct(output.data, target, topk=(1, 5))
            reduced_loss = to_python_float(loss.data)
            batch_total = to_python_float(input.size(0))
            if args.distributed:  # Must keep track of global batch size, since not all machines are guaranteed equal batches at the end of an epoch
                validate_tensor[0] = batch_total
                validate_tensor[1] = reduced_loss
                validate_tensor[2] = corr1
                validate_tensor[3] = corr5
                batch_total, reduced_loss, corr1, corr5 = bps.push_pull(
                    validate_tensor, average=False, name="validation_tensor")
                batch_total = batch_total.cpu().numpy()
                reduced_loss = reduced_loss.cpu().numpy()
                corr1 = corr1.cpu().numpy()
                corr5 = corr5.cpu().numpy()
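                # push_pull with average=False sums across workers, so divide by bps.size() for the mean loss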
                reduced_loss = reduced_loss / bps.size()

            top1acc = to_python_float(corr1) * (100.0 / batch_total)
            top5acc = to_python_float(corr5) * (100.0 / batch_total)

            losses.update(reduced_loss, batch_total)
            top1.update(top1acc, batch_total)
            top5.update(top5acc, batch_total)
            tb.log_memory()
            tb.log_trn_times(timer.batch_time.val, timer.data_time.val,
                             input.size(0))
            tb.log_trn_loss(losses.val, top1.val, top5.val)

            recv_gbit, transmit_gbit = net_meter.update_bandwidth()
            tb.log("sizes/batch_total", batch_total)
            tb.log('net/recv_gbit', recv_gbit)
            tb.log('net/transmit_gbit', transmit_gbit)

            output = (
                f'Epoch: [{epoch}][{batch_num}/{len(trn_loader)}]\t'
                f'Time {timer.batch_time.val:.3f} ({timer.batch_time.avg:.3f})\t'
                f'Loss {losses.val:.4f} ({losses.avg:.4f})\t'
                f'Acc@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                f'Acc@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                f'Data {timer.data_time.val:.3f} ({timer.data_time.avg:.3f})\t'
                f'BW {recv_gbit:.3f} {transmit_gbit:.3f}')
            log.verbose(output)

            tb.update_step_count(batch_total)
Example #15
def main():
    # os.system('shutdown -c')  # cancel previous shutdown command
    log.console(args)
    tb.log('sizes/world', bps.size())

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data + '/validation')

    # if args.distributed:
    # log.console('Distributed initializing process group')
    torch.cuda.set_device(bps.local_rank())
    print(f'cuda device set to {bps.local_rank()}')
    log.console("cuda initialized (rank=%d)" % (bps.local_rank()))
    # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=bps.size())
    log.console("Distributed: success (%d/%d)" % (bps.rank(), bps.size()))

    log.console("Loading model (rank=%d)" % (bps.rank()))
    model = resnet.resnet50(bn0=args.init_bn0).cuda()

    # reuse the validate tensor
    global validate_tensor, dist_validate_tensor
    validate_tensor = torch.tensor([0, 0, 0, 0]).float().cuda()
    dist_validate_tensor = torch.tensor([0, 0, 0, 0, 0]).float().cuda()

    if args.fp16: model = network_to_half(model)
    best_top5 = 93  # only save models above 93% top-5; otherwise training would stop to save a checkpoint every epoch

    global model_params, master_params
    if args.fp16: model_params, master_params = prep_param_lists(model)
    else: model_params = master_params = model.parameters()

    optim_params, name_list = experimental_utils.bnwd_optim_params(
        model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(
        optim_params,
        0,
        momentum=args.momentum,
        weight_decay=args.weight_decay
    )  # start with 0 lr. Scheduler will change this later

    named_param = []
    for p in optim_params:
        tensors = p['params']
        for tensor in tensors:
            named_param.append(tensor)

    # create bps_param (tuple)
    bps_param = []
    for i, tensor in enumerate(named_param):
        name = name_list[i]
        bps_param.append((name, tensor))

    # wrap with byteps optimizer
    optimizer = DistributedOptimizer(
        optimizer,
        named_parameters=bps_param,
        backward_passes_per_step=args.batches_per_pushpull,
        half=True,
        model=model,
        fp16_params=model_params,
        fp32_params=master_params,
        loss_scale=args.loss_scale)

    if args.resume:
        checkpoint = torch.load(
            args.resume,
            map_location=lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_top5 = checkpoint['best_top5']
        optimizer.load_state_dict(checkpoint['optimizer'])

    log.console(
        "Creating data loaders (this could take up to 10 minutes if volume needs to be warmed up)"
    )
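    # Assumes 8 GPUs per machine: ceil-divide the world size to pick the training schedule.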
    num_machines = (bps.size() - 1) // 8 + 1
    assert (num_machines in schedules)
    phases = schedules[num_machines]
    dm = DataManager([copy.deepcopy(p) for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer,
                          [copy.deepcopy(p) for p in phases if 'lr' in p])

    # BytePS: broadcast parameters & optimizer state.
    broadcast_parameters([(name, p.detach()) for name, p in bps_param],
                         root_rank=0)
    broadcast_optimizer_state(optimizer, root_rank=0)

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate:
        return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        log.console('Global Barrier: Syncing machines before training')
        tensor = torch.tensor([1.0]).float().cuda()
        barrier_handler = push_pull_async_inplace(tensor,
                                                  average=True,
                                                  name="init.barrier")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        # do broadcast for validate tensor
        log.console('Broadcasting validate tensor')
        barrier_handler = push_pull_async_inplace(validate_tensor,
                                                  average=True,
                                                  name="validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break
        barrier_handler = push_pull_async_inplace(
            dist_validate_tensor,
            average=True,
            name="distributed_validation_tensor")
        while True:
            if poll(barrier_handler):
                synchronize(barrier_handler)
                break

    log.event("~~epoch\thours\ttop1\ttop5\n")
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        top1, top5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        time_diff = (datetime.now() - start_time).total_seconds() / 3600.0
        log.event(f'~~{epoch}\t{time_diff:.5f}\t\t{top1:.3f}\t\t{top5:.3f}\n')

        is_best = top5 > best_top5
        best_top5 = max(top5, best_top5)
        if args.local_rank == 0:
            if is_best:
                save_checkpoint(epoch,
                                model,
                                best_top5,
                                optimizer,
                                is_best=True,
                                filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase:
                save_checkpoint(
                    epoch,
                    model,
                    best_top5,
                    optimizer,
                    filename=f'sz{phase["bs"]}_checkpoint.path.tar')
Example #16

kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.ImageFolder(args.train_dir,
                         transform=transforms.Compose([
                             transforms.RandomResizedCrop(224),
                             transforms.RandomHorizontalFlip(),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
# BytePS: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=bps.size()` and `rank=bps.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=bps.size(), rank=bps.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=pushpull_batch_size,
    sampler=train_sampler, **kwargs)

val_dataset = \
    datasets.ImageFolder(args.val_dir,
                         transform=transforms.Compose([
                             transforms.Resize(256),
                             transforms.CenterCrop(224),
                             transforms.ToTensor(),
                             transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                  std=[0.229, 0.224, 0.225])
                         ]))
val_sampler = torch.utils.data.distributed.DistributedSampler(
    val_dataset, num_replicas=bps.size(), rank=bps.rank())