Example #1
    def all_gather_stats_list(stat_list, max_size=4096):
        """
        Gather a `Statistics` list across all processes/nodes

        Args:
            stat_list(list([`Statistics`])): list of statistics objects to
                gather across all processes/nodes
            max_size(int): max buffer size to use

        Returns:
            our_stats(list([`Statistics`])): list of updated stats
        """
        from torch.distributed import get_rank
        from onmt.utils.distributed import all_gather_list

        # Get a list of world_size lists with len(stat_list) Statistics objects
        all_stats = all_gather_list(stat_list, max_size=max_size)

        our_rank = get_rank()
        our_stats = all_stats[our_rank]
        for other_rank, stats in enumerate(all_stats):
            if other_rank == our_rank:
                continue
            for i, stat in enumerate(stats):
                our_stats[i].update(stat, update_n_src_words=True)
        return our_stats
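
The helper above wraps OpenNMT's `all_gather_list`. On recent PyTorch (1.8+) the same generic "gather one picklable object from every rank" step can be written directly with `torch.distributed.all_gather_object`; a minimal sketch, assuming an already-initialized process group (the function name is illustrative):

    import torch.distributed as dist

    def all_gather_py_objects(obj):
        # returns the objects contributed by every rank, in rank order
        gathered = [None] * dist.get_world_size()
        dist.all_gather_object(gathered, obj)
        return gathered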
Example #2
    def _init_group_test(self):
        group = [1, 2]
        group_id = dist.new_group(group)
        rank = dist.get_rank()
        if rank not in group:
            return ([], None, rank)

        return (group, group_id, rank)
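
A hypothetical test built on this helper might pass the returned group handle to a group-scoped collective; a hedged sketch (the test name and tensor values are illustrative):

    def test_group_all_reduce(self):
        group, group_id, rank = self._init_group_test()
        if group_id is not None:
            # only the ranks listed in `group` take part in this collective
            tensor = torch.ones(1)
            dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group_id)
            self.assertEqual(tensor.item(), len(group))
        self._barrier()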
Example #3
 def __init__(self, num_replicas=None, rank=None):
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.num_replicas = num_replicas
     self.rank = rank
     self.epoch = 0
     self.extra = 0
Example #4
def test_mpi():
    dist.init_process_group('mpi')
    world_size = dist.get_world_size()
    rank = dist.get_rank()

    vector = [0] * world_size
    vector[rank] = 1
    vector = torch.DoubleTensor(vector)

    dist.all_reduce(vector, op=dist.ReduceOp.SUM)
    print("Host {} : Rank {} : {}".format(get_hostname(), rank, vector))
Example #5
 def __init__(self, dataset, num_replicas=None, rank=None):
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.dataset = dataset
     self.num_replicas = num_replicas
     self.rank = rank
     self.epoch = 0
     self.num_samples = int(math.ceil(len(self.dataset) / self.num_replicas))
     self.total_size = self.num_samples * self.num_replicas
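
The matching `__iter__` is not shown here; a minimal sketch in the style of `torch.utils.data.distributed.DistributedSampler`, reusing the attributes set above (shuffle deterministically per epoch, pad, then take this rank's strided slice):

 def __iter__(self):
     # deterministic shuffle per epoch so all ranks agree on the order
     g = torch.Generator()
     g.manual_seed(self.epoch)
     indices = torch.randperm(len(self.dataset), generator=g).tolist()
     # pad so the index list divides evenly across replicas
     indices += indices[:(self.total_size - len(indices))]
     # each rank takes every num_replicas-th index, offset by its rank
     indices = indices[self.rank:self.total_size:self.num_replicas]
     return iter(indices)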
Example #6
    def test_get_rank(self):
        test_dir = os.path.join(TEMP_DIR, 'test_dir')
        pid = str(os.getpid())
        num_processes = dist.get_world_size()
        with open(os.path.join(test_dir, pid), 'w') as f:
            f.write(str(dist.get_rank()))

        self._barrier()

        all_ranks = set()
        for f_name in os.listdir(test_dir):
            with open(os.path.join(test_dir, f_name), 'r') as f:
                all_ranks.add(int(f.read()))
        self.assertEqual(len(all_ranks), num_processes)

        self._barrier()

        if dist.get_rank() == 0:
            for f_name in os.listdir(test_dir):
                os.unlink(os.path.join(test_dir, f_name))

        self._barrier()
Example #7
 def __init__(self, data_source, batch_size=1, num_replicas=None, rank=None):
     """
     Samples batches assuming they are in order of size to batch similarly sized samples together.
     """
     super(DistributedBucketingSampler, self).__init__(data_source)
     if num_replicas is None:
         num_replicas = get_world_size()
     if rank is None:
         rank = get_rank()
     self.data_source = data_source
     self.ids = list(range(0, len(data_source)))
     self.batch_size = batch_size
     self.bins = [self.ids[i:i + batch_size] for i in range(0, len(self.ids), batch_size)]
     self.num_replicas = num_replicas
     self.rank = rank
     self.num_samples = int(math.ceil(len(self.bins) * 1.0 / self.num_replicas))
     self.total_size = self.num_samples * self.num_replicas
Example #8
    def test_send_recv(self):
        rank = dist.get_rank()
        tensor = _build_tensor(rank + 1)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(src + 1, value=-1)
            expected_tensor = _build_tensor(src + 1)
            dist.recv(tensor, src)
            self.assertEqual(tensor, expected_tensor)

        self._barrier()
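
`_build_tensor` is assumed by these tests but not shown; PyTorch's own distributed test suite defines it roughly as below, so here is a sketch under that assumption:

def _build_tensor(size, value=None):
    # a (size x size x size) float tensor filled with `value` (default: size)
    if value is None:
        value = size
    return torch.FloatTensor(size, size, size).fill_(value)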
Example #9
    def test_isend(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            requests = [
                dist.isend(_build_tensor(dest, 10), dest) for dest in range(1, world_size)
            ]
            for request in requests:
                request.wait()
                self.assertTrue(request.is_completed())
        else:
            tensor = _build_tensor(rank, -1)
            dist.recv(tensor, 0)
            self.assertEqual(tensor, _build_tensor(rank, 10))

        self._barrier()
Example #10
def config_pytorch(options):
    """Config pytorch packages.

    Fix random number for packages and initialize distributed environment for pytorch.
    Setup cuda environment for pytorch.

    :param options: A global object containing specified options.
    :type options: argparse.Namespace
    """

    # Setting `cudnn.deterministic = True` turns on the CUDNN deterministic
    # mode, which can slow down training considerably. Unexpected behavior
    # may also be observed when restarting from a checkpoint.
    # See: https://github.com/pytorch/examples/blob/master/imagenet/main.py
    if options.cudnn_deterministic:
        cudnn.deterministic = True
        log.warning('You have chosen to seed training. '
                    'This will turn on the CUDNN deterministic setting, '
                    'which can slow down your training considerably! '
                    'You may see unexpected behavior when restarting '
                    'from checkpoints.', 0)

    if options.seed is not None:
        random.seed(options.seed)
        torch.manual_seed(options.seed)

    # define the graph for the computation.
    if options.use_cuda:
        assert torch.cuda.is_available()

    options.rank = dist.get_rank()
    options.world_size = dist.get_world_size()
    options.graph = FCGraph(options)

    # enable cudnn accelerator if we are using cuda.
    if options.use_cuda:
        options.graph.assigned_gpu_id()
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True

        if torch.backends.cudnn.version() is None:
            log.warning("CUDNN not found on device.")

        log.info("World size={}, Rank={}, hostname={}, cuda_available={}, cuda_device={}".format(
            options.world_size, options.rank, socket.gethostname(), torch.cuda.is_available(),
            torch.cuda.current_device()))
Example #11
    def test_send_recv_any_source(self):
        rank = dist.get_rank()
        tensor = _build_tensor(10, rank)
        for dest in range(0, dist.get_world_size()):
            if dest == rank:
                continue
            dist.send(tensor, dest)

        recv_ranks = set()
        for src in range(0, dist.get_world_size()):
            if src == rank:
                continue
            tensor = _build_tensor(10, value=-1)
            dist.recv(tensor)
            recv_ranks.add(tensor.resize_(1)[0])

        self.assertEqual(len(recv_ranks), dist.get_world_size() - 1)
        self._barrier()
Example #12
    def test_irecv(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()

        if rank == 0:
            expected_tensors = [_build_tensor(src, -1) for src in range(1, world_size)]
            requests = [
                dist.irecv(expected_tensors[src - 1], src) for src in range(1, world_size)
            ]

            for src in range(1, world_size):
                requests[src - 1].wait()
                self.assertTrue(requests[src - 1].is_completed())
                self.assertEqual(expected_tensors[src - 1], _build_tensor(src, 10))
        else:
            tensor = _build_tensor(rank, 10)
            dist.send(tensor, 0)

        self._barrier()
Example #13
 def filter(self, record):
     record.rank = dist.get_rank()
     return True
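
A hedged usage sketch showing how such a filter typically plugs into the standard logging module so `%(rank)d` is available to the formatter (handler and logger names are illustrative; the process group must already be initialized when records are emitted):

import logging
import torch.distributed as dist

class RankFilter(logging.Filter):
    def filter(self, record):
        # stamp every record with the caller's distributed rank
        record.rank = dist.get_rank()
        return True

handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('[rank %(rank)d] %(levelname)s %(message)s'))
handler.addFilter(RankFilter())
logging.getLogger('train').addHandler(handler)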
Example #14
def warning(content, who='all'):
    if who == 'all' or who == dist.get_rank():
        logger.warning("{}".format(content))
Example #15
def info(content, who='all'):
    if who == 'all' or who == dist.get_rank():
        logger.info(content)
Example #16
def train(args):
    world_size = len(args.hosts)
    is_distributed = world_size > 1
    logger.debug('Number of hosts {}. Distributed training - {}'.format(world_size, is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug('Number of gpus available - {}'.format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device('cuda' if use_cuda else 'cpu')

    if is_distributed:
        # Initialize the distributed environment.
        backend = 'gloo'
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            backend, dist.get_world_size()) + 'Current host rank is {}. Is cuda available: {}. Number of gpus: {}'.format(
            dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    # set the seed for generating random numbers
    seed = 1
    torch.manual_seed(seed)
    if use_cuda:
        torch.cuda.manual_seed(seed)

    train_sampler, train_loader = _get_train_data_loader(args.data_dir, is_distributed, args.batch_size, **kwargs)
    test_loader = _get_test_data_loader(args.data_dir, **kwargs)

    logger.debug('Processes {}/{} ({:.0f}%) of train data'.format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))

    logger.debug('Processes {}/{} ({:.0f}%) of test data'.format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        logger.debug('Multi-machine multi-gpu: using DistributedDataParallel.')
        model = torch.nn.parallel.DistributedDataParallel(model)
    elif use_cuda:
        # single-machine multi-gpu case
        logger.debug('Single-machine multi-gpu: using DataParallel().cuda().')
        model = torch.nn.DataParallel(model)
    else:
        # single-machine or multi-machine cpu case
        logger.debug('Single-machine/multi-machine cpu: using DataParallel.')
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.5)

    log_interval = 100
    for epoch in range(1, args.epochs + 1):
        if is_distributed:
            train_sampler.set_epoch(epoch)
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % log_interval == 0:
                logger.debug('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        accuracy = test(model, test_loader, device)
    save_model(model, args.model_dir)

    logger.debug('Overall test accuracy: {}'.format(accuracy))
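
`_average_gradients(model)` is called in the multi-machine CPU branch above but not shown; a minimal sketch of such a helper, assuming it simply averages each parameter's gradient across all processes after `backward()`:

import torch.distributed as dist

def _average_gradients(model):
    # sum every gradient across ranks, then divide by the world size
    world_size = float(dist.get_world_size())
    for param in model.parameters():
        if param.grad is not None:
            dist.all_reduce(param.grad.data, op=dist.ReduceOp.SUM)
            param.grad.data /= world_size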
Example #17
def train(train_loader, r, optimizer, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    n = r.num_iterations(loader_size=len(train_loader))
    if args.num_minibatches is not None:
        n = min(n, args.num_minibatches)
    r.train(n)
    if not is_first_stage():
        train_loader = None
    r.set_loader(train_loader)

    end = time.time()
    epoch_start_time = time.time()

    if args.no_input_pipelining:
        num_warmup_minibatches = 0
    else:
        num_warmup_minibatches = r.num_warmup_minibatches

    if args.verbose_frequency > 0:
        logging.info("Letting in %d warm-up minibatches" %
                     num_warmup_minibatches)
        logging.info("Running training for %d minibatches" % n)

    # start num_warmup_minibatches forward passes
    for i in range(num_warmup_minibatches):
        r.run_forward()

    for i in range(n - num_warmup_minibatches):
        # perform forward pass
        r.run_forward()

        # Adjust learning rate
        adjust_learning_rate(optimizer, epoch, args.epochs, r, args.lr_policy,
                             i, n)

        if is_last_stage():
            # measure accuracy and record loss
            output, target, loss = r.output, r.target, r.loss
            prec1, prec5 = accuracy(output, target, topk=(1, 5))
            losses.update(loss.item(), output.size(0))
            top1.update(prec1[0], output.size(0))
            top5.update(prec5[0], output.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            epoch_time = (end - epoch_start_time) / 3600.0
            full_epoch_time = (epoch_time / float(i + 1)) * float(n)

            if i % args.print_freq == 0:
                logging.info(
                    'Epoch: [{0}][{1}/{2}]\t'
                    'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Epoch time [hr]: {epoch_time:.3f} ({full_epoch_time:.3f})\t'
                    'Memory: {memory:.3f}G ({cached_memory:.3f}G)\t'
                    'Loss: {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1: {top1.val:.2f}% ({top1.avg:.2f}%)\t'
                    'Prec@5: {top5.val:.2f}% ({top5.avg:.2f}%)'.format(
                        epoch,
                        i,
                        n,
                        batch_time=batch_time,
                        epoch_time=epoch_time,
                        full_epoch_time=full_epoch_time,
                        loss=losses,
                        top1=top1,
                        top5=top5,
                        memory=(float(torch.cuda.memory_allocated()) / 10**9),
                        cached_memory=(float(torch.cuda.memory_cached()) /
                                       10**9)))
                import sys
                sys.stdout.flush()
            #print(losses.avg, i)
            #print(top1.avg, i)
        else:
            if i % args.print_freq == 0:
                logging.info(
                    'Epoch: [{0}][{1}/{2}]\tMemory: {memory:.3f}G ({cached_memory:.3f}G)'
                    .format(epoch,
                            i,
                            n,
                            memory=(float(torch.cuda.memory_allocated()) /
                                    10**9),
                            cached_memory=(float(torch.cuda.memory_cached()) /
                                           10**9)))
                import sys
                sys.stdout.flush()

        # perform backward pass
        if args.fp16:
            r.zero_grad()
        else:
            optimizer.zero_grad()
        # consistent
        # optimizer.load_old_params()
        r.run_backward()
        # optimizer.load_new_params()
        # s = optimizer.get_s()
        if args.square:
            s = (dist.get_world_size() - dist.get_rank())**2
        else:
            s = None
        #logging.warning(f'outside: {dist.get_rank()}: {args.find_median}, s = {s}')
        if args.spectrain:
            optimizer.step(s=s, find_median=args.find_median)
        else:
            optimizer.step()

        # inconsistent
        # optimizer.load_old_params()
        # r.run_backward()
        # optimizer.load_new_params()
        # optimizer.step()

    global writer
    if dist.get_rank() == dist.get_world_size() - 1:
        writer.add_scalar('Train/Loss', losses.avg, epoch)
        writer.add_scalar('Train/Accuracy', top1.avg, epoch)
    # finish remaining backward passes
    for i in range(num_warmup_minibatches):
        optimizer.zero_grad()
        # optimizer.load_old_params()
        r.run_backward()
        # optimizer.load_new_params()
        if args.spectrain:
            optimizer.step(s=s, find_median=args.find_median)
        else:
            optimizer.step()

    # wait for all helper threads to complete
    r.wait()

    logging.info("Epoch %d: %.3f seconds" %
                 (epoch, time.time() - epoch_start_time))
    logging.info("Epoch start time: %.3f, epoch end time: %.3f" %
                 (epoch_start_time, time.time()))
Example #18
hidden_dim = 128

input_steps = segment_size
output_steps = segment_size
input_size = 1
output_size = 1

train_idx = list(range(training_size))
valid_idx = list(range(training_size, train_valid_size))
test_idx = list(range(train_valid_size, train_valid_size + test_size))

encoder = Encoder(input_size, hidden_dim, num_layers, dropout_rate)
decoder = Decoder(output_size, hidden_dim, num_layers, dropout_rate)

# to enable multi GPU training; the process group must be initialized
# before the rank and world size can be queried (rank/world size are
# taken from the RANK/WORLD_SIZE environment variables, e.g. via torchrun)
dist.init_process_group("gloo")
rank = dist.get_rank()
world_size = dist.get_world_size()
print('rank {} and world size {}'.format(rank, world_size))
model = Seq2Seq(encoder, decoder, rank).to(rank)
model = DDP(model, device_ids=[rank])

model, loss, preds, min_valid_loss, test_rmse = train_model(
    model,
    X,
    Y,
    learning_rate,
    output_steps=output_steps,
    batch_size=64,
Example #19
def train(train_loader, model, criterion, optimizer, epoch, log_writer):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    global iter_ptr


    train_record.set()

    # switch to train mode
    model.train()
    end = time.time()
    torch.cuda.synchronize()
    i = -1
    #while input is not None:
    for input, target in train_loader:
        assert input.size(0) == target.size(0)
        i += 1
        iter_ptr += 1

        if args.prof and (i > 200): break
        # measure data loading time
        data_time.update(time.time() - end)

        input = input.cuda(non_blocking=True)
        target = target.cuda(non_blocking=True)

        input_var = Variable(input)
        target_var = Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            #reduced_loss = loss.data
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data
        
        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()
        #input, target = prefetcher.next()
        if dist.get_rank() == 0 and i % args.print_freq == 0 and i > 1:
            train_record.record()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                  'Total Training Time {train_time:.3f}'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5, train_time=train_record.get_time()))
            train_record.set()

    if log_writer:
        log_writer.add_scalar('train_iter/top1', top1.get_avg(), iter_ptr)
        log_writer.add_scalar('train_iter/top5', top5.get_avg(), iter_ptr)
        log_writer.add_scalar('train_iter/loss', losses.get_avg(), iter_ptr)
        log_writer.add_scalar('train_iter/batch_time', batch_time.get_avg(), iter_ptr)
        log_writer.add_scalar('train_iter/data_time', data_time.get_avg(), iter_ptr)
        log_writer.add_scalar('train_iter/learning_rate_schedule', args.lr_present, iter_ptr)

        log_writer.add_scalar('train_epoch/top1', top1.get_avg(), epoch)
        log_writer.add_scalar('train_epoch/top5', top5.get_avg(), epoch)
        log_writer.add_scalar('train_epoch/loss', losses.get_avg(), epoch)
        log_writer.add_scalar('train_epoch/learning_rate_schedule', args.lr_present, epoch)

        log_writer.add_scalar('train_time/top1', top1.get_avg(), train_record.get_time())
        log_writer.add_scalar('train_time/top5', top5.get_avg(), train_record.get_time())
        log_writer.add_scalar('train_time/loss', losses.get_avg(), train_record.get_time())  

        if args.larc_enable:
            #add larc_adaptive_lr saving
            laryer_saving_name = ['layer0.conv1.weight', 'layer0.bn1.weight', 'layer1.1.conv1.weight', \
                                'layer2.1.conv1.weight', 'layer3.1.conv1.weight', 'layer4.1.conv1.weight']  # corresponds to the laryer_saving list in Signum_SGD.py
            for index, layer_lr in enumerate(optimizer.layer_adaptive_lr):
                log_writer.add_scalar('larc_layer_adaptive_lr/' + laryer_saving_name[index], layer_lr, epoch)
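
`reduce_tensor` above is used to average the loss and accuracies across ranks but is not shown; a minimal sketch under that assumption:

import torch.distributed as dist

def reduce_tensor(tensor):
    # average a metric tensor across all ranks
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= dist.get_world_size()
    return rt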
Example #20
    def forward(self,
                input,
                input_mask=None,
                attention_mask=None,
                head_mask=None,
                layer_past=None,
                get_key_value=False,
                get_present=False,
                encoder_output=None,
                enc_dec_attn_mask=None,
                encoder_hidden_states=None,
                encoder_attention_mask=None,
                use_cache=False,
                output_attentions=False):
        get_present = (get_present or get_key_value or use_cache)
        input_mask = input_mask if attention_mask is None else attention_mask
        input_type = input.dtype

        if (self.config.fp16 or self.config.q_int8) \
            and input.dtype == torch.float:
            input = input.half()

        with torch.no_grad():
            attention_output = self.attention(input, input_mask, head_mask,
                                              layer_past, get_present,
                                              encoder_hidden_states,
                                              encoder_attention_mask,
                                              output_attentions, self.norm_w,
                                              self.norm_b)

            if get_present:
                attention_output, p_key, p_value = attention_output[0:3]
                presents = (p_key, p_value)
            elif output_attentions:
                attention_output, _, _, context_output = attention_output[0:4]
            else:
                attention_output = attention_output[0]

            residual_add = attention_output + self.attention.attn_ob
            attention_output = self.ds_layernorm(residual_add, self.attn_nw,
                                                 self.attn_nb,
                                                 self.config.epsilon)

            if self.config.mlp_type == 'residual':
                res_mlp_out = self.res_mlp(attention_output, async_op=True)
                res_coef_out = self.res_coef_func(attention_output,
                                                  async_op=True)

            if self.expert_mp_group is not None:
                tensor_list = [
                    torch.empty_like(attention_output) for _ in range(
                        dist.get_world_size(group=self.expert_mp_group))
                ]
                tensor_list[dist.get_rank(
                    group=self.expert_mp_group)] = attention_output
                dist.all_gather(tensor_list,
                                attention_output,
                                group=self.expert_mp_group)
                attention_output = torch.cat(tensor_list).contiguous()

            ############## MoE Gating + Experts ###############
            dispatched_attention, combined_weights = self.moe_gate_einsum(
                attention_output)
            dispatched_input = self._alltoall(dispatched_attention)
            expert_outputs = self.expert_exec(dispatched_input)
            expert_output = self._alltoall(expert_outputs)
            output = self.scale_expert_output(attention_output, expert_output,
                                              combined_weights)
            ################################################

            if self.expert_mp_group is not None:
                output = output.split(
                    output.shape[0] //
                    dist.get_world_size(group=self.expert_mp_group),
                    dim=0)[dist.get_rank(group=self.expert_mp_group)]

            if self.config.mlp_type == 'residual':
                inference_cuda_module.moe_res_matmul(res_mlp_out, res_coef_out,
                                                     output)

            output = self.bias_residual_func(output, residual_add,
                                             torch.empty(1))

            if not self.config.pre_layer_norm:
                output = self.ds_layernorm(output, self.norm_w, self.norm_b,
                                           self.config.epsilon)

            if input_type != output.dtype:
                output = output.to(input_type)

        if get_present:
            output = (output, presents)

        if self.config.return_tuple:
            return output if type(output) is tuple else (output, )
        else:
            return output
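
The `expert_mp_group` branches above use a common gather-across-a-model-parallel-group idiom; a minimal standalone sketch of that pattern (the function name is illustrative, not DeepSpeed API):

import torch
import torch.distributed as dist

def gather_from_group(tensor, group):
    # collect each rank's shard within `group` and concatenate along dim 0
    world = dist.get_world_size(group=group)
    shards = [torch.empty_like(tensor) for _ in range(world)]
    dist.all_gather(shards, tensor, group=group)
    return torch.cat(shards).contiguous()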
Example #21
def main(run_id, validation_only=False):
    r"""Main logic."""
    num_parallel_workers = 2
    dataset_root = '/datasets/torch/cifar10'
    ckpt_run_dir = '/checkpoints/decentralized/cifar_resnet20'
    use_cuda = True
    train_epochs = 164

    initialize_backends(
        comm_backend='mpi',
        logging_level='INFO',
        logging_file='/mlbench.log',
        use_cuda=use_cuda,
        seed=42,
        cudnn_deterministic=False,
        ckpt_run_dir=ckpt_run_dir,
        delete_existing_ckpts=not validation_only)

    rank = dist.get_rank()
    world_size = dist.get_world_size()

    batch_size = 256 // world_size

    model = ResNetCIFAR(
        resnet_size=20,
        bottleneck=False,
        num_classes=10,
        version=1)

    optimizer = optim.SGD(
        model.parameters(),
        lr=0.1,
        momentum=0.9,
        weight_decay=1e-4,
        nesterov=True)

    # Create a learning rate scheduler for an optimizer
    scheduler = MultiStepLR(
        optimizer,
        milestones=[82, 109],
        gamma=0.1)

    # A loss_function for computing the loss
    loss_function = CrossEntropyLoss()

    if use_cuda:
        model = model.cuda()
        loss_function = loss_function.cuda()

    # Metrics like Top 1/5 Accuracy
    metrics = [
        TopKAccuracy(topk=1),
        TopKAccuracy(topk=5)
    ]

    train_set = CIFAR10V1(dataset_root, train=True, download=True)
    val_set = CIFAR10V1(dataset_root, train=False, download=True)

    train_set = partition_dataset_by_rank(train_set, rank, world_size)
    val_set = partition_dataset_by_rank(val_set, rank, world_size)

    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    val_loader = DataLoader(
        val_set,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_parallel_workers,
        pin_memory=use_cuda,
        drop_last=False)

    checkpointer = Checkpointer(
        ckpt_run_dir=ckpt_run_dir,
        rank=rank,
        checkpoint_all=True)

    if not validation_only:
        # Aggregation
        ring_neighbors = [(rank + 1) % world_size, (rank - 1) % world_size]

        agg_fn = DecentralizedAggregation(
            rank=rank, neighbors=ring_neighbors).agg_model

        controlflow = TrainValidation(
            model=model,
            optimizer=optimizer,
            loss_function=loss_function,
            metrics=metrics,
            scheduler=scheduler,
            batch_size=batch_size,
            train_epochs=train_epochs,
            rank=rank,
            world_size=world_size,
            run_id=run_id,
            dtype='fp32',
            validate=True,
            schedule_per='epoch',
            checkpoint=checkpointer,
            transform_target_type=None,
            average_models=True,
            use_cuda=use_cuda,
            max_batch_per_epoch=None,
            agg_fn=agg_fn)

        controlflow.run(
            dataloader_train=train_loader,
            dataloader_val=val_loader,
            dataloader_train_fn=None,
            dataloader_val_fn=None,
            resume=False,
            repartition_per_epoch=False)
    else:
        cecf = CheckpointsEvaluationControlFlow(
            ckpt_dir=ckpt_run_dir,
            rank=rank,
            world_size=world_size,
            checkpointer=checkpointer,
            model=model,
            epochs=train_epochs,
            loss_function=loss_function,
            metrics=metrics,
            use_cuda=use_cuda,
            dtype='fp32',
            max_batch_per_epoch=None)

        train_stats = cecf.evaluate_by_epochs(train_loader)
        with open(os.path.join(ckpt_run_dir, "train_stats.json"), 'w') as f:
            json.dump(train_stats, f)

        val_stats = cecf.evaluate_by_epochs(val_loader)
        with open(os.path.join(ckpt_run_dir, "val_stats.json"), 'w') as f:
            json.dump(val_stats, f)
Example #22
def train(verbose=True, **kwargs):
    args = kwargs['args']
    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl',
                            init_method='tcp://127.0.0.1:{}'.format(cfg.port),
                            world_size=torch.cuda.device_count(),
                            rank=args.local_rank)
    setup_logger(cfg.respth)
    logger = logging.getLogger()

    ## dataset
    ds = CityScapes(cfg, mode='train_val')
    sampler = torch.utils.data.distributed.DistributedSampler(ds)
    dl = DataLoader(ds,
                    batch_size=cfg.ims_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=cfg.n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    net = EaNet(cfg)
    net.cuda()
    it_start = 0
    n_epoch = 0

    ## optimizer
    optim = Optimizer(
        net,
        cfg.lr_start,
        cfg.momentum,
        cfg.weight_decay,
        cfg.warmup_steps,
        cfg.warmup_start_lr,
        cfg.max_iter,
        cfg.lr_power,
        # start_iter = it_start
    )

    ## resume
    if cfg.resume:
        print("=> loading checkpoint '{}'".format(cfg.resume))
        checkpoint = torch.load(cfg.resume)
        if '.tar' in cfg.resume:
            net.load_state_dict(checkpoint['model'])
            optim.optim.load_state_dict(checkpoint['optimizer'])
            # it_start = checkpoint['it']
            n_epoch = checkpoint['epoch']
            bestMIOU = checkpoint['mIOU']
            # optim.it = it_start

            print('Pth.Tar Load model from {}'.format(cfg.resume))
        else:
            net.load_state_dict(checkpoint)
            print('Pth Load model from {}'.format(cfg.resume))
        print('pretrained model loaded')
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        print('mIOU start from %f' % mIOU)
        del checkpoint

    net.train()

    net = nn.parallel.DistributedDataParallel(net,
                                              device_ids=[
                                                  args.local_rank,
                                              ],
                                              output_device=args.local_rank)
    n_min = cfg.ims_per_gpu * cfg.crop_size[0] * cfg.crop_size[1] // 16
    #criteria = OhemCELoss(thresh=cfg.ohem_thresh, n_min=n_min).cuda()
    criteria = ECELoss(thresh=cfg.ohem_thresh,
                       n_min=n_min,
                       n_classes=cfg.n_classes,
                       alpha=cfg.alpha,
                       radius=cfg.radius,
                       beta=cfg.beta,
                       ignore_lb=cfg.ignore_label,
                       mode=cfg.mode).cuda()

    ## train loop
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    # n_epoch = 0
    counter = 0
    # number of epochs that have already finished
    epochF = 0
    bestMIOU = 0

    for it in range(it_start, cfg.max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == cfg.ims_per_gpu: continue
        except StopIteration:
            n_epoch += 1
            sampler.set_epoch(n_epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()

        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        try:
            optim.zero_grad()
            logits = net(im)
            loss = criteria(logits, lb)

            loss.backward()
            optim.step()
        except RuntimeError as e:
            if 'out of memory' in str(e):
                print('| WARNING: ran out of memory')
                if hasattr(torch.cuda, 'empty_cache'):
                    torch.cuda.empty_cache()
            else:
                raise e
        '''
        logits = net(im)
        loss = criteria(logits, lb)
        loss = loss / (cfg.ims_per_gpu)
        counter += 1
        loss.backward()
        
        if counter == cfg.ims_per_gpu:
            optim.step()
            optim.zero_grad()
            counter = 0
        '''
        loss_avg.append(loss.item())
        ## print training log message
        if it % cfg.msg_iter == 0 and not it == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((cfg.max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'iter: {it}/{max_it}',
                'lr: {lr:4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it,
                      max_it=cfg.max_iter,
                      lr=lr,
                      loss=loss_avg,
                      time=t_intv,
                      eta=eta)

            logger.info(msg)
            loss_avg = []
            st = ed
        # evaluate the model periodically
        if n_epoch > epochF and n_epoch > 20:
            # mark this epoch as evaluated
            epochF = n_epoch
            #if (n_epoch > 35) and it%(5*cfg.msg_iter) == 0 and not it==0:
            # net.cpu()
            # save_pth = osp.join(cfg.respth, 'model_final_best.pth')
            # state = net.module.state_dict() if hasattr(net, 'module') else net.state_dict()
            # if dist.get_rank()==0: torch.save(state, save_pth)
            # logger.info('training done, model saved to: {}'.format(save_pth))
            # logger.info('evaluating the final model')
            # net.cuda()
            net.eval()
            evaluator = MscEval(cfg)
            mIOU = evaluator(net)
            logger.info('mIOU is: {}'.format(mIOU))

            # save a checkpoint
            save_pth = osp.join(cfg.respth, 'checkpoint.pth.tar')
            state = net.module.state_dict() if hasattr(
                net, 'module') else net.state_dict()
            if dist.get_rank() == 0:
                stateF = {
                    'model': state,
                    'lr': optim.lr,
                    'mIOU': mIOU,
                    'it': it,
                    'epoch': n_epoch,
                    'optimizer': optim.optim.state_dict(),
                }
                torch.save(stateF, save_pth)

            if mIOU > bestMIOU:
                logger.info('Got a new best mIOU: {} at epoch: {}'.format(
                    mIOU, n_epoch))
                #print('Got a new best mIOU: {}'.format(mIOU))
                bestMIOU = mIOU
                #net.cpu()
                save_pth = osp.join(cfg.respth,
                                    'model_final_{}.pth'.format(n_epoch))
                state = net.module.state_dict() if hasattr(
                    net, 'module') else net.state_dict()
                if dist.get_rank() == 0: torch.save(state, save_pth)
                #move the model back to cuda
                #net.cuda()

            net.train()
    if verbose:
        net.cpu()
        save_pth = osp.join(cfg.respth, 'model_final.pth.tar')
        state = net.module.state_dict() if hasattr(
            net, 'module') else net.state_dict()
        stateF = {
            'model': state,
            'lr': optim.lr,
            'mIOU': mIOU,
            'it': it,
            'epoch': n_epoch,
            'optimizer': optim.optim.state_dict(),
        }
        torch.save(stateF, save_pth)
        #if dist.get_rank()==0: torch.save(state, save_pth)
        logger.info('training done, model saved to: {}'.format(save_pth))
        logger.info('evaluating the final model')
        net.cuda()
        net.eval()
        evaluator = MscEval(cfg)
        mIOU = evaluator(net)
        logger.info('mIOU is: {}'.format(mIOU))
Example #23
def get_rank():
    try:
        return dist.get_rank()
    except:
        return None
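
A variant sometimes preferred over the bare `except` above checks explicitly that the default process group exists:

import torch.distributed as dist

def get_rank_or_none():
    # only query the rank when a process group has actually been initialized
    if dist.is_available() and dist.is_initialized():
        return dist.get_rank()
    return None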
Example #24
    def train_epoch(self, data_loader):
        self.model.train()
        num_ckpt = int(np.ceil(len(data_loader) / 10))
        meter_loss = tnt.meter.MovingAverageValueMeter(
            len(data_loader) // 100 + 1)
        #meter_accuracy = tnt.meter.ClassErrorMeter(accuracy=True)
        #meter_confusion = tnt.meter.ConfusionMeter(p.NUM_CTC_LABELS, normalized=True)
        if self.lr_scheduler is not None:
            self.lr_scheduler.step()
            logger.debug(f"current lr = {self.lr_scheduler.get_lr()}")
        if is_distributed() and data_loader.sampler is not None:
            data_loader.sampler.set_epoch(self.epoch)

        # count the number of supervised batches seen in this epoch
        t = tqdm(enumerate(data_loader),
                 total=len(data_loader),
                 desc="training")
        for i, (data) in t:
            loss_value = self.unit_train(data)
            meter_loss.add(loss_value)
            t.set_description(f"training (loss: {meter_loss.value()[0]:.3f})")
            t.refresh()
            #self.meter_accuracy.add(ys_int, ys)
            #self.meter_confusion.add(ys_int, ys)

            if 0 < i < len(data_loader) and i % num_ckpt == 0:
                if not is_distributed() or (is_distributed()
                                            and dist.get_rank() == 0):
                    title = "train"
                    x = self.epoch + i / len(data_loader)
                    if logger.visdom is not None:
                        logger.visdom.add_point(title=title,
                                                x=x,
                                                y=meter_loss.value()[0])
                    if logger.tensorboard is not None:
                        logger.tensorboard.add_graph(self.model, xs)
                        xs_img = tvu.make_grid(xs[0, 0],
                                               normalize=True,
                                               scale_each=True)
                        logger.tensorboard.add_image('xs', x, xs_img)
                        ys_hat_img = tvu.make_grid(ys_hat[0].transpose(0, 1),
                                                   normalize=True,
                                                   scale_each=True)
                        logger.tensorboard.add_image('ys_hat', x, ys_hat_img)
                        logger.tensorboard.add_scalars(
                            title, x, {
                                'loss': meter_loss.value()[0],
                            })
                if self.checkpoint:
                    logger.info(
                        f"training loss at epoch_{self.epoch:03d}_ckpt_{i:07d}: "
                        f"{meter_loss.value()[0]:5.3f}")
                    if not is_distributed() or (is_distributed()
                                                and dist.get_rank() == 0):
                        self.save(
                            self.__get_model_name(
                                f"epoch_{self.epoch:03d}_ckpt_{i:07d}"))
            #input("press key to continue")

        self.epoch += 1
        logger.info(f"epoch {self.epoch:03d}: "
                    f"training loss {meter_loss.value()[0]:5.3f} ")
        #f"training accuracy {meter_accuracy.value()[0]:6.3f}")
        if not is_distributed() or (is_distributed() and dist.get_rank() == 0):
            self.save(self.__get_model_name(f"epoch_{self.epoch:03d}"))
            self.__remove_ckpt_files(self.epoch - 1)
Example #25
def main():
    args = parse_args()

    # Devices
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True
    logger.info(
        f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}"
    )

    # Load config
    config = BertConfig.from_json_file(args.config_file)

    # Load task config
    with open(args.tasks_config_file, "r") as f:
        task_cfg = edict(yaml.safe_load(f))
    task_id = args.task.strip()
    task = "TASK" + task_id
    task_name = task_cfg[task]["name"]
    if task_cfg[task].get("fusion_method", None):
        # VL-BERT pooling for VQA
        config.fusion_method = task_cfg[task]["fusion_method"]

    # Output dirs
    timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name
    savePath = os.path.join(args.output_dir, timeStamp)
    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    # Seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Dataset
    batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(
        args, config, task_cfg, args.task)

    # Logging
    tb_logger = tbLogger(timeStamp,
                         savePath, [task_name], [task],
                         task2num_iters,
                         1,
                         save_logger=False,
                         txt_name="eval.txt")

    # Model
    model = BertForVLTasks.from_pretrained(args.from_pretrained,
                                           config=config,
                                           task_cfg=task_cfg,
                                           task_ids=[task])

    # Optimization details
    criterion = LoadLoss(task_cfg, args.task)

    # Move to GPU(s)
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = nn.DataParallel(model)

    # Print summary
    if default_gpu:
        print("***** Running evaluation *****")
        print("  Num Iters: ", task2num_iters[task])
        print("  Batch size: ", batch_size)

    # Evaluate
    model.eval()
    results = []
    others = []
    for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
        loss, score, batch_size, results, others = EvaluatingModel(
            config, task_cfg, device, task, batch, model, dl_val, criterion,
            results, others)

        tb_logger.step_val(0, float(loss), float(score), task, batch_size,
                           "val")
        sys.stdout.write("%d/%d\r" % (i, len(dl_val)))
        sys.stdout.flush()
    # save the result or evaluate the result.
    ave_score = tb_logger.showLossVal(task)

    if args.split:
        json_path = os.path.join(savePath, args.split)
    else:
        json_path = os.path.join(savePath, task_cfg[task]["val_split"])
    json.dump(results, open(json_path + "_result.json", "w"))
    json.dump(others, open(json_path + "_others.json", "w"))
Example #26
def load_model(args):
    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    args.output_mode = "classification"
    label_list = ["0", "1"]
    num_labels = len(label_list)

    # store args
    if args.local_rank != -1:
        args.world_size = torch.distributed.get_world_size()
        args.rank = dist.get_rank()

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    args.train_model_type = args.train_model_type.lower()
    configObj = MSMarcoConfigDict[args.train_model_type]
    if 'fairseq' not in args.train_model_type:
        config = configObj.config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        tokenizer = configObj.tokenizer_class.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        model = configObj.model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
    elif 'fast' in args.train_model_type:
        config = configObj.config_class.from_pretrained(
            'roberta-base',
            num_labels=num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        #print('???',args.model_name_or_path)
        model = configObj.model_class(config)
        #print('???',model.state_dict()['encoder.layers.1.fc2.weight'])
        #print('???',model.state_dict().keys())
        if os.path.isdir(args.model_name_or_path):
            model.from_pretrained(
                os.path.join(args.model_name_or_path, args.model_file))
        else:
            model.from_pretrained(os.path.join(args.model_name_or_path))
        #print('???',model.state_dict()['encoder.layers.1.fc2.weight'])
        tokenizer = BertWordPieceTokenizer(args.bpe_vocab_file,
                                           clean_text=False,
                                           strip_accents=False,
                                           lowercase=False)
    else:
        config = configObj.config_class.from_pretrained(
            'roberta-base',
            num_labels=args.num_labels,
            finetuning_task=args.task_name,
            cache_dir=args.cache_dir if args.cache_dir else None,
        )
        model = configObj.model_class(config)
        model.from_pretrained(
            os.path.join(args.model_name_or_path, args.model_file))
        tokenizer = torch.hub.load('pytorch/fairseq', 'roberta.base')

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    return tokenizer, model
Example #27
 def _init_global_test(self):
     group = [i for i in range(0, dist.get_world_size())]
     group_id = dist.group.WORLD
     rank = dist.get_rank()
     return (group, group_id, rank)
Example #28
def check_write_log():
    return (dist.get_rank() == 0
            or not use_multigpu_with_single_device_per_process)
Example #29
def _run_train(device: torch.device, is_distributed: bool):

    serialize_dir = os.path.join(ROOT_PATH, "data/easytext/tests/trainer/save_and_load")
            
    if is_distributed:
        if TorchDist.get_rank() == 0:
            if os.path.isdir(serialize_dir):
                shutil.rmtree(serialize_dir)

            os.makedirs(serialize_dir)
    
        TorchDist.barrier()
    else:
        if os.path.isdir(serialize_dir):
            shutil.rmtree(serialize_dir)

        os.makedirs(serialize_dir)

    model = ModelDemo()

    optimizer_factory = _DemoOptimizerFactory()

    loss = _DemoLoss()
    metric = _DemoMetric()

    tensorboard_log_dir = "data/tensorboard"

    tensorboard_log_dir = os.path.join(ROOT_PATH, tensorboard_log_dir)

    # shutil.rmtree(tensorboard_log_dir)

    trainer = Trainer(num_epoch=100,
                      model=model,
                      loss=loss,
                      metrics=metric,
                      optimizer_factory=optimizer_factory,
                      serialize_dir=serialize_dir,
                      patient=20,
                      num_check_point_keep=25,
                      device=device,
                      trainer_callback=None,
                      is_distributed=is_distributed
                      )
    logging.info(f"test is_distributed: {is_distributed}")
    # trainer_callback = BasicTrainerCallbackComposite(tensorboard_log_dir=tensorboard_log_dir)
    train_dataset = _DemoDataset()

    if is_distributed:
        sampler = DistributedSampler(dataset=train_dataset)
    else:
        sampler = None

    train_data_loader = DataLoader(dataset=train_dataset,
                                   collate_fn=_DemoCollate(),
                                   batch_size=200,
                                   num_workers=0,
                                   sampler=sampler)

    if is_distributed:
        sampler = DistributedSampler(dataset=train_dataset)
    else:
        sampler = None

    validation_data_loader = DataLoader(dataset=train_dataset,
                                        collate_fn=_DemoCollate(),
                                        batch_size=200,
                                        num_workers=0,
                                        sampler=sampler)

    trainer.train(train_data_loader=train_data_loader,
                  validation_data_loader=validation_data_loader)

    expect_model_state_dict = json.loads(json2str(trainer.model.state_dict()))
    expect_optimizer_state_dict = json.loads(json2str(trainer.optimizer.state_dict()))
    expect_current_epoch = trainer.current_epoch
    expect_num_epoch = trainer.num_epoch
    expect_metric = trainer.metrics.metric[0]
    expect_metric_tracker = json.loads(json2str(trainer.metric_tracker))

    trainer.load_checkpoint(serialize_dir=serialize_dir)

    loaded_model_state_dict = json.loads(json2str(trainer.model.state_dict()))
    loaded_optimizer_state_dict = json.loads(json2str(trainer.optimizer.state_dict()))
    current_epoch = trainer.current_epoch
    num_epoch = trainer.num_epoch
    metric = trainer.metrics.metric[0]
    metric_tracker = json.loads(json2str(trainer.metric_tracker))

    ASSERT.assertDictEqual(expect_model_state_dict, loaded_model_state_dict)
    ASSERT.assertDictEqual(expect_optimizer_state_dict, loaded_optimizer_state_dict)
    ASSERT.assertEqual(expect_current_epoch, current_epoch)
    ASSERT.assertEqual(expect_num_epoch, num_epoch)
    ASSERT.assertDictEqual(expect_metric, metric)
    ASSERT.assertDictEqual(expect_metric_tracker, metric_tracker)
Example #30
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="results",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_interbert.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument("--num_workers",
                        type=int,
                        default=16,
                        help="Number of workers in the dataloader.")
    parser.add_argument(
        "--save_name",
        default='',
        type=str,
        help="save name for training.",
    )
    parser.add_argument("--tasks",
                        default='',
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument("--in_memory",
                        default=False,
                        type=bool,
                        help="whether use chunck for parallel training.")
    parser.add_argument("--zero_shot",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--split",
                        default="",
                        type=str,
                        help="which split to use.")
    parser.add_argument("--batch_size",
                        default=1,
                        type=int,
                        help="which split to use.")
    args = parser.parse_args()
    with open('interbert_tasks.yml', 'r') as f:
        task_cfg = edict(yaml.safe_load(f))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    from bertmodel.modules import BertConfig

    task_names = []
    for i, task_id in enumerate(args.tasks.split('-')):
        task = 'TASK' + task_id
        name = task_cfg[task]['name']
        task_names.append(name)

    # timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0]
    if '/' in args.from_pretrained:
        timeStamp = args.from_pretrained.split('/')[1]
    else:
        timeStamp = args.from_pretrained

    savePath = os.path.join(args.output_dir, timeStamp)

    config = BertConfig.from_json_file(args.config_file)
    bert_weight_name = json.load(
        open("config/" + "bert-base-uncased_weight_name.json", "r"))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    default_gpu = False
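    # only rank 0 (or a non-distributed run) is treated as the default process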
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu and not os.path.exists(savePath):
        os.makedirs(savePath)

    task_batch_size, task_num_iters, task_ids, task_datasets_val, task_dataloader_val \
                        = LoadDatasetEval(args, task_cfg, args.tasks.split('-'))

    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_val.values()])

    config.fast_mode = True
    if args.zero_shot:
        model = InterBertForMultiModalPreTraining.from_pretrained(
            args.from_pretrained, config)
    else:
        model = InterBertForVLTasks.from_pretrained(args.from_pretrained,
                                                    config,
                                                    num_labels=num_labels,
                                                    default_gpu=default_gpu)

    task_losses = LoadLosses(args, task_cfg, args.tasks.split('-'))
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)

    elif n_gpu > 1:
        model = nn.DataParallel(model)

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    print("  Num Iters: ", task_num_iters)
    print("  Batch size: ", task_batch_size)

    model.eval()
    # When running evaluation, we run each task sequentially.
    for task_id in task_ids:
        results = []
        others = []

        score_matrix = np.zeros((5000, 1000))
        target_matrix = np.zeros((5000, 1000))
        rank_matrix = np.ones((5000)) * 1000
        count = 0

        for i, batch in enumerate(task_dataloader_val[task_id]):
            batch = tuple(
                t.cuda(device=device, non_blocking=True) for t in batch)
            features, spatials, image_mask, question, target, input_mask, segment_ids, caption_idx, image_idx = batch

            if task_id in ['TASK3']:
                batch_size = features.size(0)
                features = features.squeeze(0)
                spatials = spatials.squeeze(0)
                image_mask = image_mask.squeeze(0)
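                # build a joint attention mask covering both image regions and text tokens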
                multimodal_mask = torch.cat(
                    (image_mask, input_mask.expand(image_mask.size(0), -1)),
                    dim=-1)
                question = question.expand(features.size(0), -1)

            with torch.no_grad():
                if args.zero_shot:
                    _, _, vil_logit, _ = model(question,
                                               features,
                                               spatials,
                                               segment_ids,
                                               input_mask,
                                               image_mask,
                                               multimodal_mask=multimodal_mask)
                else:
                    _, _, vil_logit, _, _, _, _ = model(
                        question,
                        features,
                        spatials,
                        segment_ids,
                        input_mask,
                        image_mask,
                        multimodal_mask=multimodal_mask)

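                # each forward pass scores the current caption against a 500-image block selected by image_idx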
                score_matrix[caption_idx, image_idx * 500:(image_idx + 1) *
                             500] = torch.softmax(
                                 vil_logit, dim=1)[:,
                                                   0].view(-1).cpu().numpy()
                target_matrix[caption_idx, image_idx * 500:(image_idx + 1) *
                              500] = target.view(-1).float().cpu().numpy()

                if image_idx.item() == 1:
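                    # both 500-image blocks scored: rank of the ground-truth image in the descending-score ordering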
                    rank = np.where(
                        (np.argsort(-score_matrix[caption_idx]) == np.where(
                            target_matrix[caption_idx] == 1)[0][0]) == 1)[0][0]
                    rank_matrix[caption_idx] = rank

                    rank_matrix_tmp = rank_matrix[:caption_idx + 1]
                    r1 = 100.0 * np.sum(
                        rank_matrix_tmp < 1) / len(rank_matrix_tmp)
                    r5 = 100.0 * np.sum(
                        rank_matrix_tmp < 5) / len(rank_matrix_tmp)
                    r10 = 100.0 * np.sum(
                        rank_matrix_tmp < 10) / len(rank_matrix_tmp)

                    medr = np.floor(np.median(rank_matrix_tmp) + 1)
                    meanr = np.mean(rank_matrix_tmp) + 1
                    print(
                        "%d Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f"
                        % (count, r1, r5, r10, medr, meanr))

                    results.append(
                        np.argsort(-score_matrix[caption_idx]).tolist()[:20])
            count += 1

        r1 = 100.0 * np.sum(rank_matrix < 1) / len(rank_matrix)
        r5 = 100.0 * np.sum(rank_matrix < 5) / len(rank_matrix)
        r10 = 100.0 * np.sum(rank_matrix < 10) / len(rank_matrix)

        medr = np.floor(np.median(rank_matrix) + 1)
        meanr = np.mean(rank_matrix) + 1

        print("************************************************")
        print("Final r1:%.3f, r5:%.3f, r10:%.3f, mder:%.3f, meanr:%.3f" %
              (r1, r5, r10, medr, meanr))
        print("************************************************")

        if args.split:
            json_path = os.path.join(savePath, args.split)
        else:
            json_path = os.path.join(savePath, task_cfg[task_id]['val_split'])
        json.dump(results, open(json_path + '_result.json', 'w'))
        json.dump(others, open(json_path + '_others.json', 'w'))
Example #31
0
def main():
    global args, best_prec1
    args = parser.parse_args()
    if int(args.rank) == int(args.world_size) - 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(
        level=log_level,
        format="[%(asctime)s] %(name)s:%(levelname)s: %(message)s")

    logging.info(f'Find median: {args.find_median}')
    logging.warning(
        f'master addr: {args.master_addr}, rank:{args.rank}, local_rank:{args.local_rank}'
    )
    torch.cuda.set_device(args.local_rank)

    # define loss function (criterion)
    criterion = nn.CrossEntropyLoss()

    # create stages of the model
    module = importlib.import_module(args.module)
    args.arch = module.arch()
    model = module.model(criterion)

    # determine shapes of all tensors in passed-in model
    if args.arch == 'inception_v3':
        input_size = [args.batch_size, 3, 299, 299]
    else:
        #input_size = [args.batch_size, 3, 224, 224]
        input_size = [args.batch_size, 3, 32, 32]
    training_tensor_shapes = {
        "input0": input_size,
        "target": [args.batch_size]
    }
    dtypes = {"input0": torch.int64, "target": torch.int64}
    inputs_module_destinations = {"input": 0}
    target_tensor_names = {"target"}
    for (stage, inputs, outputs) in model[:-1]:  # Skip last layer (loss).
        input_tensors = []
        for input in inputs:
            input_tensor = torch.zeros(tuple(training_tensor_shapes[input]),
                                       dtype=torch.float32)
            input_tensors.append(input_tensor)
        with torch.no_grad():
            output_tensors = stage(*tuple(input_tensors))
        if not type(output_tensors) is tuple:
            output_tensors = [output_tensors]
        for output, output_tensor in zip(outputs, list(output_tensors)):
            training_tensor_shapes[output] = list(output_tensor.size())
            dtypes[output] = output_tensor.dtype

    eval_tensor_shapes = {}
    for key in training_tensor_shapes:
        eval_tensor_shapes[key] = tuple([args.eval_batch_size] +
                                        training_tensor_shapes[key][1:])
        training_tensor_shapes[key] = tuple(training_tensor_shapes[key])

    configuration_maps = {
        'module_to_stage_map': None,
        'stage_to_rank_map': None,
        'stage_to_depth_map': None
    }
    if args.config_path is not None:
        json_config_file = json.load(open(args.config_path, 'r'))
        configuration_maps['module_to_stage_map'] = json_config_file.get(
            "module_to_stage_map", None)
        configuration_maps['stage_to_rank_map'] = json_config_file.get(
            "stage_to_rank_map", None)
        configuration_maps['stage_to_rank_map'] = {
            int(k): v
            for (k, v) in configuration_maps['stage_to_rank_map'].items()
        }
        configuration_maps['stage_to_depth_map'] = json_config_file.get(
            "stage_to_depth_map", None)

    r = runtime.StageRuntime(
        model=model,
        distributed_backend=args.distributed_backend,
        fp16=args.fp16,
        loss_scale=args.loss_scale,
        training_tensor_shapes=training_tensor_shapes,
        eval_tensor_shapes=eval_tensor_shapes,
        training_tensor_dtypes=dtypes,
        inputs_module_destinations=inputs_module_destinations,
        target_tensor_names=target_tensor_names,
        configuration_maps=configuration_maps,
        master_addr=args.master_addr,
        rank=args.rank,
        local_rank=args.local_rank,
        num_ranks_in_server=args.num_ranks_in_server,
        verbose_freq=args.verbose_frequency,
        model_type=runtime.IMAGE_CLASSIFICATION,
        port=args.port,
        enable_recompute=args.recompute)

    # stage needed to determine if current stage is the first stage
    # num_stages needed to determine if current stage is the last stage
    # num_ranks needed to determine number of warmup_minibatches in case of pipelining
    args.stage = r.stage
    args.num_stages = r.num_stages
    args.num_ranks = r.num_ranks
    if not is_first_stage():
        args.synthetic_data = True

    # define optimizer
    if args.no_input_pipelining:
        num_versions = 1
    else:
        # number of versions is the total number of machines following the current
        # stage, shared amongst all replicas in this stage
        num_versions = r.num_warmup_minibatches + 1

    # if specified, resume from checkpoint
    if args.resume:
        checkpoint_file_path = "%s.%d.pth.tar" % (args.resume, r.stage)
        assert os.path.isfile(checkpoint_file_path)
        logging.info("=> loading checkpoint '{}'".format(checkpoint_file_path))
        checkpoint = torch.load(checkpoint_file_path)
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        r.load_state_dict(checkpoint['state_dict'])
        logging.info("=> loaded checkpoint '{}' (epoch {})".format(
            checkpoint_file_path, checkpoint['epoch']))

    #optimizer = sgd.SGDWithWeightStashing(r.modules(), r.master_parameters,
    if args.spectrain:
        if args.log_dir != None:
            args.log_dir += '_spectrain'
        logging.info('Using spectrain')
        if args.square:
            if args.log_dir != None:
                args.log_dir += '_square'
            logging.info('s = version difference ^ 2')
        else:
            logging.info('s = version difference')
        optimizer = sgd.SGDWithSpectrain(r.modules(),
                                         r.master_parameters,
                                         r.model_parameters,
                                         args.loss_scale,
                                         num_versions=num_versions,
                                         lr=args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay,
                                         verbose_freq=args.verbose_frequency,
                                         macrobatch=args.macrobatch)
    else:
        logging.info('Not using spectrain')
        optimizer = sgd.SGDWithWeightStashing(
            r.modules(),
            r.master_parameters,
            r.model_parameters,
            args.loss_scale,
            num_versions=num_versions,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            verbose_freq=args.verbose_frequency,
            macrobatch=args.macrobatch)

    logging.info(f'log_dir: {args.log_dir}')
    if args.resume:
        optimizer.load_state_dict(checkpoint['optimizer'])

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    if args.arch == 'inception_v3':
        if args.synthetic_data:
            train_dataset = SyntheticDataset((3, 299, 299), 10000)
        else:
            traindir = os.path.join(args.data_dir, 'train')
            train_dataset = datasets.ImageFolder(
                traindir,
                transforms.Compose([
                    transforms.RandomResizedCrop(299),
                    transforms.ToTensor(),
                    normalize,
                ]))
    else:
        if args.synthetic_data:
            train_dataset = SyntheticDataset((3, 224, 224), 50000)
        else:
            traindir = os.path.join(args.data_dir, 'train')
            transform_train = transforms.Compose([
                #transforms.RandomResizedCrop(224),
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010)),
            ])
            #train_dataset = datasets.ImageFolder(
            train_dataset = datasets.CIFAR10(
                traindir,
                True,
                transform_train,
                #transforms.Compose([
                #    transforms.RandomResizedCrop(224),
                #    transforms.RandomHorizontalFlip(),
                #    transforms.ToTensor(),
                #    normalize,
                #]),
                download=True)

    if args.synthetic_data:
        val_dataset = SyntheticDataset((3, 224, 224), 10000)
    else:
        valdir = os.path.join(args.data_dir, 'val')
        transform_test = transforms.Compose([
            #transforms.RandomResizedCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ])
        #val_dataset = datasets.ImageFolder(valdir, transforms.Compose([
        val_dataset = datasets.CIFAR10(
            valdir,
            False,
            transform_test,
            #transforms.Compose([
            #    transforms.Resize(256),
            #    transforms.CenterCrop(224),
            #    transforms.ToTensor(),
            #    normalize,
            #]),
            download=True,
        )

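    # only the highest-numbered rank creates the TensorBoard writer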
    global writer
    if dist.get_rank() == dist.get_world_size() - 1:
        writer = SummaryWriter(args.log_dir)

    distributed_sampler = False
    train_sampler = None
    val_sampler = None
    if configuration_maps['stage_to_rank_map'] is not None:
        num_ranks_in_first_stage = len(
            configuration_maps['stage_to_rank_map'][0])
        if num_ranks_in_first_stage > 1:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset,
                num_replicas=num_ranks_in_first_stage,
                rank=args.rank)
            val_sampler = torch.utils.data.distributed.DistributedSampler(
                val_dataset,
                num_replicas=num_ranks_in_first_stage,
                rank=args.rank)
            distributed_sampler = True

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.eval_batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True,
                                             sampler=val_sampler,
                                             drop_last=True)

    # if checkpoint is loaded, start by running validation
    if args.resume:
        assert args.start_epoch > 0
        validate(val_loader, r, args.start_epoch - 1)

    for epoch in range(args.start_epoch, args.epochs):
        if distributed_sampler:
            train_sampler.set_epoch(epoch)

        # train or run forward pass only for one epoch
        if args.forward_only:
            validate(val_loader, r, epoch)
        else:
            train(train_loader, r, optimizer, epoch)

            # evaluate on validation set
            prec1 = validate(val_loader, r, epoch)
            if r.stage != r.num_stages:
                prec1 = 0

            # remember best prec@1 and save checkpoint
            best_prec1 = max(prec1, best_prec1)

            should_save_checkpoint = args.checkpoint_dir_not_nfs or r.rank_in_stage == 0
            if args.checkpoint_dir and should_save_checkpoint:
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': args.arch,
                        'state_dict': r.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, args.checkpoint_dir, r.stage)
Example #32
0
def is_logging_process():
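    # log only from rank 0, or from any process when torch.distributed has not been initialized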
    return not dist.is_initialized() or dist.get_rank() == 0
Example #33
0
def validate(val_loader, r, epoch):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to evaluate mode
    n = r.num_iterations(loader_size=len(val_loader))
    if args.num_minibatches is not None:
        n = min(n, args.num_minibatches)
    r.eval(n)
    if not is_first_stage():
        val_loader = None
    r.set_loader(val_loader)

    end = time.time()
    epoch_start_time = time.time()

    if args.no_input_pipelining:
        num_warmup_minibatches = 0
    else:
        num_warmup_minibatches = r.num_warmup_minibatches

    if args.verbose_frequency > 0:
        logging.info("Letting in %d warm-up minibatches" %
                     num_warmup_minibatches)
        logging.info("Running validation for %d minibatches" % n)

    with torch.no_grad():
        for i in range(num_warmup_minibatches):
            r.run_forward()

        for i in range(n - num_warmup_minibatches):
            # perform forward pass
            r.run_forward()
            r.run_ack()

            if is_last_stage():
                output, target, loss = r.output, r.target, r.loss

                # measure accuracy and record loss
                prec1, prec5 = accuracy(output, target, topk=(1, 5))
                losses.update(loss.item(), output.size(0))
                top1.update(prec1[0], output.size(0))
                top5.update(prec5[0], output.size(0))

                # measure elapsed time
                batch_time.update(time.time() - end)
                end = time.time()

                if i % args.print_freq == 0:
                    logging.info(
                        'Test: [{0}][{1}/{2}]\t'
                        'Time: {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Memory: {memory:.3f}G ({cached_memory:.3f}G)\t'
                        'Loss: {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1: {top1.val:.2f}% ({top1.avg:.2f}%)\t'
                        'Prec@5: {top5.val:.2f}% ({top5.avg:.2f}%)'.format(
                            epoch,
                            i,
                            n,
                            batch_time=batch_time,
                            loss=losses,
                            top1=top1,
                            top5=top5,
                            memory=(float(torch.cuda.memory_allocated()) /
                                    10**9),
                            cached_memory=(float(torch.cuda.memory_cached()) /
                                           10**9)))
                    import sys
                    sys.stdout.flush()

        if is_last_stage():
            logging.info(
                ' * Prec@1 {top1.avg:.2f}% Prec@5 {top5.avg:.2f}%'.format(
                    top1=top1, top5=top5))

        for i in range(num_warmup_minibatches):
            r.run_ack()

        # wait for all helper threads to complete
        r.wait()

        logging.info('Epoch %d: %.3f seconds' %
                     (epoch, time.time() - epoch_start_time))
        logging.info("Epoch start time: %.3f, epoch end time: %.3f" %
                     (epoch_start_time, time.time()))
    global writer
    if dist.get_rank() == dist.get_world_size() - 1:
        writer.add_scalar('Test/Loss', losses.avg, epoch)
        writer.add_scalar('Test/Accuracy', top1.avg, epoch)

    return top1.avg
Example #34
0
def get_rank():
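    # fall back to rank 0 when the distributed backend is unavailable or not initialized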
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()
Example #35
0
parser.add_argument('--min-num-tensors', dest='min_num_tensors', action='store',
                    default=2, type=int,
                    help='set the inclusive lower limit for the number of ' +
                    'tensors to be sent during one test run; ' +
                    'default: 2 (10**2 = 100)')

args = parser.parse_args()

MIN_NUM_TENSORS = args.min_num_tensors
MIN_BYTES = args.min_bytes
MAX_NUM_TENSORS = args.max_num_tensors + 1
MAX_BYTES = args.max_bytes + 1

dist.init_process_group(backend=os.environ['BACKEND'])

rank = dist.get_rank()
dist.barrier()

if rank == 0:
    print_header("broadcast")
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
        tensor = torch.ByteTensor(bytes).fill_(42)
        for num_tensors in [10**n for n in range(MIN_NUM_TENSORS, MAX_NUM_TENSORS)]:
            start = timer()
            for i in range(0, num_tensors):
                dist.broadcast(tensor, 0)
            end = timer()
            print_stats(bytes, num_tensors, end - start)
    print()
else:
    for bytes in [2**n for n in range(MIN_BYTES, MAX_BYTES)]:
Example #36
0
    def __init__(self,
                 train_data,
                 model,
                 optimizer=None,
                 loss=None,
                 callbacks_all=None,
                 callbacks_master=None,
                 batch_size_per_gpu=8,
                 n_epochs=1,
                 num_workers=1,
                 drop_last=False,
                 dev_data=None,
                 metrics=None,
                 metric_key=None,
                 update_every=1,
                 print_every=10,
                 validate_every=-1,
                 save_path=None,
                 device='auto',
                 fp16='',
                 use_tqdm=True):
        r"""

        :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型。
        :param nn.modules model: 待训练的模型
        :param optimizer: `torch.optim.Optimizer` 优化器。如果为None,则Trainer使用默认的Adam(model.parameters(), lr=4e-3)这个优化器
        :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward`
        :param list callbacks_all: 用于在train过程中起调节作用的回调函数,作用于所有训练进程中。
            可使用的callback参见 :mod:`callback模块 <fastNLP.core.callback>`
        :param list callbacks_master: 用于在train过程中起调节作用的回调函数,只作用于其中一个进程( Master 进程)。
            可使用的callback参见 :mod:`callback模块 <fastNLP.core.callback>`
        :param int batch_size_per_gpu: 训练时,每个进程的 batch 大小。
        :param int n_epochs: 需要优化迭代多少次。
        :param num_workers: int, 有多少个线程来进行数据pad处理。
        :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch
        :param dev_data: 用于做验证的DataSet, :class:`~fastNLP.DataSet` 类型。
        :param metrics: 验证的评估函数。可以只使用一个 :class:`Metric<fastNLP.core.metrics.MetricBase>` ,
            也可以使用多个 :class:`Metric<fastNLP.core.metrics.MetricBase>` ,通过列表传入。
            如验证时取得了更好的验证结果(如果有多个Metric,以列表中第一个Metric为准),且save_path不为None,
            则保存当前模型。Metric种类详见 :mod:`metrics模块 <fastNLP.core.metrics>` 。仅在传入dev_data时有效。
        :param str,None metric_key:  :class:`Metric<fastNLP.core.metrics.MetricBase>` 有时会有多个指标,
            比如 :class:`~fastNLP.core.metrics.SpanFPreRecMetric` 中包含了'f', 'pre', 'rec'。此时需
            要指定以哪个指标为准。另外有些指标是越小效果越好,比如语言模型的困惑度,这种情况下,在key前面增加一个'-'来表
            明验证时,值越小越好(比如: "-ppl")。仅在传入dev_data时有效。
        :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128
            会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。
        :param int print_every: 多少次反向传播更新tqdm显示的loss; 如果use_tqdm=False, 则多少次反向传播打印loss。
        :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。
        :param str,None save_path: 将模型保存路径,如果路径不存在,将自动创建文件夹。如果为None,则不保存模型。如果dev_data为None,则保存
            最后一次迭代的模型。保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。
        :param str device: 指定 device,可以是 gpu,cpu 或 auto
        :param str fp16: 指定半精度训练的优化等级,可为 O1,O2 或 O3,若为空字符串则不使用半精度。
        :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。
        """
        assert device in [
            'auto', 'cuda', 'cpu'
        ], "Please set correct device in [auto', 'cuda', 'cpu']"
        if device == 'auto':
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # init distributed
        if device == 'cuda':
            torch.cuda.set_device(get_local_rank())
            self.device = torch.device("cuda", get_local_rank())
        else:
            self.device = torch.device(device)

        init_logger_dist()

        self.world_size = dist.get_world_size()
        self.rank = dist.get_rank()  # unique id for each process

        self.train_data = train_data
        self.batch_size_per_gpu = int(batch_size_per_gpu)
        self.n_epochs = int(n_epochs)
        self.num_data_workers = int(num_workers)
        self.drop_last = drop_last
        self.update_every = int(update_every)
        self.print_every = int(print_every)
        self.validate_every = int(validate_every)
        self.save_path = save_path
        self.losser = _prepare_losser(loss)
        self.fp16 = fp16
        self.local_rank = get_local_rank()
        self._forward_func = model.forward
        self.callback_manager = DistCallbackManager(
            env={"trainer": self},
            callbacks_all=callbacks_all,
            callbacks_master=callbacks_master)
        self.test_manager = DistCallbackManager(env={'trainer': self})
        self.metric_key = metric_key
        self.use_tqdm = use_tqdm

        model.to(self.device)
        optimizer = self._get_optimizer(optimizer)

        # init fp16, must before DataParallel init
        if len(self.fp16):
            assert isinstance(
                self.fp16, str
            ), "Please set Apex AMP optimization level selected in ['O0', 'O1', 'O2', 'O3']"
            _check_fp16()
            assert device == 'cuda', "Amp requires cuda device"
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=self.fp16)

        # init DataParallel
        if parse_version(torch.__version__) >= parse_version('1.1'):
            self.ddp_model = DDP(model,
                                 device_ids=[self.local_rank],
                                 output_device=self.local_rank,
                                 find_unused_parameters=True)
        else:
            self.ddp_model = DDP(model,
                                 device_ids=[self.local_rank],
                                 output_device=self.local_rank)
        self.model = self.ddp_model.module

        self.optimizer = optimizer
        self.sampler = DistributedSampler(self.train_data)
        self.data_iterator = self._get_data_iter(self.train_data)
        self.batch_size = self.world_size * self.batch_size_per_gpu
        self.n_steps = self._get_n_steps()

        # for evaluation, only run eval on master proc
        if dev_data and metrics:
            cb = _TesterCallback(dev_data,
                                 model,
                                 metrics,
                                 batch_size=batch_size_per_gpu,
                                 num_workers=num_workers)
            self.test_manager.add_callback([cb], master=True)

        # Setup logging
        dist.barrier()
        self.start_time = datetime.now().strftime('%m_%d_%Y-%H_%M')
        if self.save_path:
            self.cp_save_path = self.save_path
        else:
            self.cp_save_path = None
        # use INFO in the master, WARN for others
        self.logger = logger
        self.logger.info("Setup Distributed Trainer")
        self.logger.warning(
            "Process pid: {}, rank: {}, local rank: {}, device: {}, fp16: {}".
            format(os.getpid(), self.rank, self.local_rank, self.device,
                   self.fp16 if self.fp16 else False))
        self.logger.info("Num of processes: {}".format(self.world_size))
        self.logger.info("Use device: {}".format(device))
        self.logger.info(
            "Training with fp16: {}, optimization level: {}".format(
                len(self.fp16) > 0, self.fp16 if self.fp16 else None))
Example #37
0
def debug(content, who='all'):
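    # emit the message from every process ('all') or only from the process whose rank equals who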
    if who == 'all' or who == dist.get_rank():
        logger.debug(content)
Example #38
0
def train(model_config, model, benchmark_config, model_specs, args):
    lm_dataloader, _, _ = model_config["data"]
    criterion = benchmark_config["criterion"]
    vocab_size = model_specs["vocab_size"]
    optimizer = model_config["optimizer"]

    model.train()
    log_number_of_parameters(model)

    total_loss = 0.0
    word_counter = 0

    optimizer = optimizer(model.parameters())

    pipe_group = model.group if hasattr(model, "group") else None

    if args.ddp_zero:
        model = DDP(
            model,
            device_ids=[torch.cuda.current_device()],
            process_group=get_data_parallel_group(),
            find_unused_parameters=False,
        )

    # TODO(anj-s): Avoid sending fake data to all replicas except the first and last one.
    if pipe_group and pipe_group.rank() != 0 and pipe_group.rank() != (pipe_group.size() - 1):
        lm_dataloader, _, _ = get_synthetic_dataloaders(args, benchmark_config, model_specs)

    total_tokens = 0
    total_tokens_per_log_interval = 0
    bptt = 2
    start_time = time.time()
    epoch_start_time = 0.0

    def get_batch(source):
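        # shift the token stream by one position to form (input, target) pairs for language modeling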
        seq_len = len(source) - 1
        data = source[0:seq_len]
        target = source[1 : 1 + seq_len]
        return data, target

    for i, batch in enumerate(lm_dataloader):
        if i == 1:
            epoch_start_time = time.time()

        source, target = get_batch(batch)
        if args.max_batch and i > args.max_batch:
            break

        if i > 0:
            total_tokens += source.numel()

        optimizer.zero_grad()
        try:
            if (pipe_group is None or pipe_group.rank() == 0) and not args.ddp_zero:
                tmp = source.to(get_device(model, 0))
                output = model(tmp)
            else:
                output = model(source)
        except Exception as e:
            raise RuntimeError(f"training failed on {torch.distributed.get_rank()}") from e

        if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1:
            target = target.to(get_device(model, -1))
            output = output.to(target.device)

            loss = criterion(output.view(-1, vocab_size), target.view(-1))
            if args.ddp_zero:
                ddp_group = get_data_parallel_group()
                torch.distributed.all_reduce(loss, op=torch.distributed.ReduceOp.SUM, group=ddp_group)
                loss /= ddp_group.size()
            loss.backward()
            del target
        else:
            if args.ddp_zero:
                model.module.back_helper(output)
            else:
                model.back_helper(output)

        del output

        torch.nn.utils.clip_grad_value_(model.parameters(), model_specs["clip_value"])
        optimizer.step()

        if pipe_group is None or pipe_group.rank() == pipe_group.size() - 1:
            total_loss += loss.item()
            log_interval = 1
            total_tokens_per_log_interval += source.numel()
            if i % log_interval == 0 and i > 0:
                cur_loss = total_loss / log_interval
                elapsed = time.time() - start_time
                if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1:
                    print(
                        "| batch {:5d} | wps {:5.2f} | loss {:5.2f} | ppl {:8.2f}".format(
                            i, total_tokens_per_log_interval / elapsed, cur_loss, math.exp(cur_loss)
                        )
                    )
                total_tokens_per_log_interval = 0
                total_loss = 0
                start_time = time.time()

    if epoch_start_time != 0:
        wps = total_tokens / (time.time() - epoch_start_time)
    else:
        raise RuntimeError(
            "Unable to benchmark on a single batch. Increase the size " " of the dataset and rerun the benchmark."
        )
    if not args.multiprocess or dist.get_rank() == dist.get_world_size() - 1:
        return wps, loss.item()
    else:
        return 0.0, 0.0
Example #39
0
def run(rank, size):
    torch.manual_seed(1234)
    test_set, bsz = partition_dataset()

    model = load_model(nn.parallel.DistributedDataParallel(Net()),
                       "sgd_150_0.1_state_Dict_150.pth").float()

    num_batches = np.ceil(len(test_set.dataset) / float(bsz))
    best_loss = float("inf")

    preds, labels = get_all_preds(model, test_set)
    #print("Preds Size")
    #print(preds.size())  ([7551,15])
    #print("Labels Size")  ([7551])
    #print(labels.size())

    pred_lbl_fl = preds.argmax(1).float()
    lbl_fl = labels.float()

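    # pre-allocate one receive buffer per rank; rank 0 gathers predictions and labels from all ranks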
    prediction_list = [torch.zeros_like(pred_lbl_fl) for _ in range(size)]
    labels_list = [torch.zeros_like(pred_lbl_fl) for _ in range(size)]

    #print(labels)
    if dist.get_rank() == 0:
        gather(pred_lbl_fl, prediction_list)
        gather(lbl_fl, labels_list)
    else:
        gather(pred_lbl_fl)
        gather(lbl_fl)

    if dist.get_rank() == 0:

        new_preds = torch.tensor([], dtype=torch.float32)
        new_labels = torch.tensor([], dtype=torch.float32)
        for t1 in prediction_list:
            new_preds = torch.cat((new_preds, t1), dim=0)

        for t2 in labels_list:
            new_labels = torch.cat((new_labels, t2), dim=0)

        print("Preds:")
        k = new_preds.tolist()
        print(k[0:20])
        print("Actual:")
        j = new_labels.tolist()
        print(j[0:20])

        accuracy = calculate_accuracy(new_labels, new_preds)
        print("Accuracy : ", accuracy)
        print("Classification Report")
        print(
            classification_report(new_labels,
                                  new_preds,
                                  target_names=class_names))

        #roc_auc = roc_auc_compute_fn(new_preds, new_labels)
        #print("ROC-AUC score :", roc_auc)

        cm = get_confusion_matrix(new_labels, new_preds)
        print("Confusion Matrix :")
        print(cm)
Example #40
0
def train():
    logger = logging.getLogger()
    is_dist = dist.is_initialized()

    ## dataset
    dl = get_data_loader(cfg, mode='train', distributed=is_dist)

    ## model
    net, criteria_pre, criteria_aux = set_model()

    ## optimizer
    optim = set_optimizer(net)

    ## mixed precision training
    scaler = amp.GradScaler()

    ## ddp training
    net = set_model_dist(net)

    ## meters
    time_meter, loss_meter, loss_pre_meter, loss_aux_meters = set_meters()

    ## lr scheduler
    lr_schdr = WarmupPolyLrScheduler(optim, power=0.9,
        max_iter=cfg.max_iter, warmup_iter=cfg.warmup_iters,
        warmup_ratio=0.1, warmup='exp', last_epoch=-1,)

    ## train loop
    for it, (im, lb) in enumerate(dl):
        im = im.cuda()
        lb = lb.cuda()

        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        with amp.autocast(enabled=cfg.use_fp16):
            logits, *logits_aux = net(im)
            loss_pre = criteria_pre(logits, lb)
            loss_aux = [crit(lgt, lb) for crit, lgt in zip(criteria_aux, logits_aux)]
            loss = loss_pre + sum(loss_aux)
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        torch.cuda.synchronize()

        time_meter.update()
        loss_meter.update(loss.item())
        loss_pre_meter.update(loss_pre.item())
        _ = [mter.update(lss.item()) for mter, lss in zip(loss_aux_meters, loss_aux)]

        ## print training log message
        if (it + 1) % 100 == 0:
            lr = lr_schdr.get_lr()
            lr = sum(lr) / len(lr)
            print_log_msg(
                it, cfg.max_iter, lr, time_meter, loss_meter,
                loss_pre_meter, loss_aux_meters)
        lr_schdr.step()

    ## dump the final model and evaluate the result
    save_pth = osp.join(cfg.respth, 'model_final.pth')
    logger.info('\nsave models to {}'.format(save_pth))
    state = net.module.state_dict()
    if dist.get_rank() == 0: torch.save(state, save_pth)

    logger.info('\nevaluating the final model')
    torch.cuda.empty_cache()
    heads, mious = eval_model(cfg, net)
    logger.info(tabulate([mious, ], headers=heads, tablefmt='orgtbl'))

    return
Example #41
0
def save_layers_on_all_rank_zero_workers(ctx, model):
    gpus_per_model = ctx["gpus_per_model"]
    rank = torch_distrib.get_rank()
    if rank in range(gpus_per_model):
        seq = list(model.children())[0]
        torch.save(seq, f"seq_{rank}.pt")
Example #42
0
    def __init__(self,
                 in_channels,
                 bottleneck_channels,
                 out_channels,
                 stride=1,
                 groups=1,
                 dilation=1,
                 norm_func=None,
                 use_cudnn=False,
                 explicit_nhwc=False,
                 spatial_group_size=1,
                 communicator=None):
        super(SpatialBottleneck, self).__init__()
        if groups != 1:
            raise RuntimeError('Only support groups == 1')
        if dilation != 1:
            raise RuntimeError('Only support dilation == 1')
        if norm_func == None:
            norm_func = FrozenBatchNorm2d
        else:
            raise RuntimeError('Only support frozen BN now.')

        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                conv1x1(in_channels, out_channels, stride),
                norm_func(out_channels),
            )
        else:
            self.downsample = None

        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(in_channels, bottleneck_channels, stride)
        self.conv2 = conv3x3(bottleneck_channels, bottleneck_channels)
        self.conv3 = conv1x1(bottleneck_channels, out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.stride = stride

        self.bn1 = norm_func(bottleneck_channels)
        self.bn2 = norm_func(bottleneck_channels)
        self.bn3 = norm_func(out_channels)

        self.use_cudnn = use_cudnn

        # setup conv weights
        self.w_conv = [self.conv1.weight, self.conv2.weight, self.conv3.weight]
        if self.downsample is not None:
            self.w_conv.append(self.downsample[0].weight)

        # init weight in nchw format before possible transpose
        for w in self.w_conv:
            kaiming_uniform_(w, a=1)

        # TODO: prevent unsupported case usage
        # support cases
        #                 native      cudnn
        # normal             yes         no
        # channel_last       yes        yes
        # explicit_nhwc       no        yes
        self.explicit_nhwc = explicit_nhwc
        if self.explicit_nhwc:
            for p in self.parameters():
                with torch.no_grad():
                    p.data = p.data.permute(0, 2, 3, 1).contiguous()

        # spatial communicator
        self.spatial_group_size = spatial_group_size
        if spatial_group_size > 1:
            world_size = dist.get_world_size()
            num_groups = world_size // spatial_group_size
            assert (
                num_groups * spatial_group_size == world_size
            ), "torch.distributed.get_world_size() must be multiple of group_size"
            rank = dist.get_rank()
            self.local_rank = rank % spatial_group_size
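            # if no communicator is given, partition the ranks into consecutive blocks of
            # spatial_group_size and keep the process group that contains this rank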
            if communicator is None:
                for group in range(num_groups):
                    ranks = list(
                        range(group * spatial_group_size,
                              (group + 1) * spatial_group_size))
                    comm = torch.distributed.new_group(ranks=ranks)
                    if rank in ranks:
                        self.communicator = comm
            else:
                self.communicator = communicator
            self.stream1 = torch.cuda.Stream()
            self.spatial_args = self.spatial_group_size, self.local_rank, self.communicator, self.stream1
        else:
            self.spatial_args = 1, 0, None, None

        return
Example #43
0
def get_rank():
    return dist.get_rank()
Example #44
0
def set_main_rpc_process(self):
    self.main_rpc_process = torch_distrib.get_rank(
        group=mpu.get_pipeline_parallel_group()) == 0
Example #45
0
def train(args):
    is_distributed = len(args.hosts) > 1 and args.backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))
    use_cuda = args.num_gpus > 0
    logger.debug("Number of gpus available - {}".format(args.num_gpus))
    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    device = torch.device("cuda" if use_cuda else "cpu")

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.backend, rank=host_rank, world_size=world_size)
        logger.info('Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
            args.backend, dist.get_world_size()) + 'Current host rank is {}. Number of gpus: {}'.format(
            dist.get_rank(), args.num_gpus))

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = _get_train_data_loader(args.batch_size, args.data_dir, is_distributed, **kwargs)
    test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs)

    logger.debug("Processes {}/{} ({:.0f}%) of train data".format(
        len(train_loader.sampler), len(train_loader.dataset),
        100. * len(train_loader.sampler) / len(train_loader.dataset)
    ))

    logger.debug("Processes {}/{} ({:.0f}%) of test data".format(
        len(test_loader.sampler), len(test_loader.dataset),
        100. * len(test_loader.sampler) / len(test_loader.dataset)
    ))

    model = Net().to(device)
    if is_distributed and use_cuda:
        # multi-machine multi-gpu case
        model = torch.nn.parallel.DistributedDataParallel(model)
    else:
        # single-machine multi-gpu case or single-machine or multi-machine cpu case
        model = torch.nn.DataParallel(model)

    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader, 1):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            if is_distributed and not use_cuda:
                # average gradients manually for multi-machine cpu case only
                _average_gradients(model)
            optimizer.step()
            if batch_idx % args.log_interval == 0:
                logger.info('Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
                    epoch, batch_idx * len(data), len(train_loader.sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
        test(model, test_loader, device)
    save_model(model, args.model_dir)
Example #46
0
def get_rank():
    if not dist.is_available():
        return 0
    if not dist.is_initialized():
        return 0
    return dist.get_rank()
Example #47
0
def validate(val_loader, model, criterion, epoch, start_time, log_writer):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    global iter_ptr

    model.eval()
    end = time.time()

    prefetcher = data_prefetcher(val_loader)
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1

        target = target.cuda(non_blocking=True)
        input_var = Variable(input)
        target_var = Variable(target)

        # compute output
        with torch.no_grad():
            output = model(input_var)
            loss = criterion(output, target_var)

        reduced_loss = reduce_tensor(loss.data)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
        

        reduced_prec1 = reduce_tensor(prec1)
        reduced_prec5 = reduce_tensor(prec5)

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(reduced_prec1), input.size(0))
        top5.update(to_python_float(reduced_prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if dist.get_rank() == 0 and i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   i, len(val_loader), batch_time=batch_time, loss=losses,
                   top1=top1, top5=top5))

        input, target = prefetcher.next()

    time_diff = datetime.now()-start_time
    if dist.get_rank() == 0:
        print(f'~~{epoch}\t{float(time_diff.total_seconds() / 3600.0)}\t{top5.avg:.3f}\n')
        print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)) 
        if log_writer:
            log_writer.add_scalar('test_iter/top1', top1.get_avg(), iter_ptr)
            log_writer.add_scalar('test_iter/top5', top5.get_avg(), iter_ptr)
            log_writer.add_scalar('test_iter/loss', losses.get_avg(), iter_ptr)
            log_writer.add_scalar('test_iter/batch_time', batch_time.get_avg(), iter_ptr)

            log_writer.add_scalar('test_epoch/top1', top1.get_avg(), epoch)
            log_writer.add_scalar('test_epoch/top5', top5.get_avg(), epoch)
            log_writer.add_scalar('test_epoch/loss', losses.get_avg(), epoch)

            log_writer.add_scalar('test_time/top1', top1.get_avg(), train_record.get_time())
            log_writer.add_scalar('test_time/top5', top5.get_avg(), train_record.get_time())
            log_writer.add_scalar('test_time/loss', losses.get_avg(), train_record.get_time())  

    return top1.avg
            logger.info(f'E{epoch} V{v}  * msIoU {IoUs}')
            if overall_acc:
                logger.info(f'E{epoch} V{v}  * OA {overall_acc:.4%}')

    return mIoU


if __name__ == "__main__":
    args, config = parse_config()

    torch.cuda.set_device(config.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    os.makedirs(args.log_dir, exist_ok=True)
    os.environ["JOB_LOAD_DIR"] = os.path.dirname(config.load_path)

    logger = setup_logger(output=config.log_dir, distributed_rank=dist.get_rank(), name="s3dis_eval")
    if dist.get_rank() == 0:
        path = os.path.join(config.log_dir, "config.json")
        with open(path, 'w') as f:
            json.dump(vars(args), f, indent=2)
            json.dump(vars(config), f, indent=2)
            os.system('cp %s %s' % (args.cfg, config.log_dir))
        logger.info("Full config saved to {}".format(path))

    # main function
    main(config)
def _train(args):
    is_distributed = len(args.hosts) > 1 and args.dist_backend is not None
    logger.debug("Distributed training - {}".format(is_distributed))

    if is_distributed:
        # Initialize the distributed environment.
        world_size = len(args.hosts)
        os.environ['WORLD_SIZE'] = str(world_size)
        host_rank = args.hosts.index(args.current_host)
        dist.init_process_group(backend=args.dist_backend, rank=host_rank, world_size=world_size)
        logger.info(
            'Initialized the distributed environment: \'{}\' backend on {} nodes. '.format(
                args.dist_backend,
                dist.get_world_size()) + 'Current host rank is {}. Using cuda: {}. Number of gpus: {}'.format(
                dist.get_rank(), torch.cuda.is_available(), args.num_gpus))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    logger.info("Device Type: {}".format(device))

    logger.info("Loading Cifar10 dataset")
    transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

    trainset = torchvision.datasets.CIFAR10(root=args.data_dir, train=True,
                                            download=False, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                               shuffle=True, num_workers=args.workers)

    testset = torchvision.datasets.CIFAR10(root=args.data_dir, train=False,
                                           download=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size,
                                              shuffle=False, num_workers=args.workers)

    logger.info("Model loaded")
    model = Net()

    if torch.cuda.device_count() > 1:
        logger.info("Gpu count: {}".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)

    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(0, args.epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            # get the inputs
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')
    return _save_model(model, args.model_dir)
Example #50
0
def test_synchronize_sgd():
    torch.manual_seed(42)
    dist.init_process_group('mpi')
    rank = dist.get_rank()
    world_size = dist.get_world_size()

    device = torch.device('cpu')
    # device = torch.device('cuda') # Uncomment this to run on GPU

    # N is batch size; D_in is input dimension;
    # H is hidden dimension; D_out is output dimension.
    N, D_in, H, D_out = 64, 1000, 100, 10

    # Create random Tensors to hold input and outputs
    x = torch.randn(N, D_in, device=device)
    y = torch.randn(N, D_out, device=device)

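    # each rank trains on a disjoint strided shard of the synthetic data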
    x = x[rank::world_size]
    y = y[rank::world_size]

    # Create random Tensors for weights; setting requires_grad=True means that we
    # want to compute gradients for these Tensors during the backward pass.
    w1 = torch.randn(D_in, H, device=device, requires_grad=True)
    w2 = torch.randn(H, D_out, device=device, requires_grad=True)

    learning_rate = 1e-6
    for t in range(500):
        # Forward pass: compute predicted y using operations on Tensors. Since w1 and
        # w2 have requires_grad=True, operations involving these Tensors will cause
        # PyTorch to build a computational graph, allowing automatic computation of
        # gradients. Since we are no longer implementing the backward pass by hand we
        # don't need to keep references to intermediate values.
        y_pred = x.mm(w1).clamp(min=0).mm(w2)

        # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
        # is a Python number giving its value.
        loss = (y_pred - y).pow(2).sum()

        if rank == 0:
            print("Iter {} : {:10.3e}".format(t, loss.item()))

        # Use autograd to compute the backward pass. This call will compute the
        # gradient of loss with respect to all Tensors with requires_grad=True.
        # After this call w1.grad and w2.grad will be Tensors holding the gradient
        # of the loss with respect to w1 and w2 respectively.
        loss.backward()

        # Update weights using gradient descent. For this step we just want to mutate
        # the values of w1 and w2 in-place; we don't want to build up a computational
        # graph for the update steps, so we use the torch.no_grad() context manager
        # to prevent PyTorch from building a computational graph for the updates
        with torch.no_grad():
            w1 -= learning_rate * w1.grad
            w2 -= learning_rate * w2.grad

            # Manually zero the gradients after running the backward pass
            w1.grad.zero_()
            w2.grad.zero_()

            # Synchronize weights
            dist.all_reduce(w1, op=dist.reduce_op.SUM)
            dist.all_reduce(w2, op=dist.reduce_op.SUM)
            w1 /= world_size
            w2 /= world_size
Example #51
0
    def test_basic_math_ops(self):
        ops = [
            "torch.add", "torch.sub", "torch.mul", "torch.div", "+", "-", "*",
            "/"
        ]

        spec = ChunkShardingSpec(
            dim=0,
            placements=[
                "rank:0/cuda:0",
                "rank:1/cuda:1",
                "rank:2/cuda:2",
                "rank:3/cuda:3",
            ],
        )

        sharded_lhs = sharded_tensor.rand(spec, (12, 3))
        sharded_rhs = sharded_tensor.rand(spec, (12, 3))
        current_rank = dist.get_rank()
        global_lhs = torch.empty(
            (12, 3), device=current_rank) if current_rank == 0 else None
        global_rhs = torch.empty(
            (12, 3), device=current_rank) if current_rank == 0 else None
        sharded_lhs.gather(dst=0, out=global_lhs)
        sharded_rhs.gather(dst=0, out=global_rhs)

        for op in ops:
            binary_op = gen_binary_op_func(op)
            binary_op_ = gen_binary_op_func(op, inplace=True)
            # test basic math ops between ShardedTensors
            sharded_output = binary_op(sharded_lhs, sharded_rhs)
            output = torch.empty(
                (12, 3), device=current_rank) if current_rank == 0 else None
            sharded_output.gather(dst=0, out=output)

            if current_rank == 0:
                global_output = binary_op(global_lhs, global_rhs)

                self.assertEqual(output, global_output)

            # test basic math ops between ShardedTensor and scalar
            scalars = [3, 1.8]
            for scalar in scalars:
                sharded_output_lhs = binary_op(sharded_lhs, scalar)

                sharded_output_lhs_ = binary_op_(sharded_lhs, scalar)
                self.assertTrue(
                    torch.allclose(sharded_output_lhs, sharded_output_lhs_))
                output_lhs = torch.empty(
                    (12, 3), device=current_rank) if current_rank == 0 else None
                sharded_output_lhs.gather(dst=0, out=output_lhs)

                sharded_output_rhs = binary_op(scalar, sharded_lhs)
                output_rhs = torch.empty(
                    (12, 3), device=current_rank) if current_rank == 0 else None
                sharded_output_rhs.gather(dst=0, out=output_rhs)

                if current_rank == 0:
                    global_output_lhs = binary_op(global_lhs, scalar)
                    global_output_rhs = binary_op(scalar, global_lhs)

                    self.assertEqual(output_lhs, global_output_lhs)
                    self.assertEqual(output_rhs, global_output_rhs)
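gen_binary_op_func is defined elsewhere in the test suite; as a rough, hypothetical reconstruction (not the actual PyTorch test utility), it can be thought of as mapping each op string used above onto a callable taking (lhs, rhs), with an in-place variant that mutates lhs:

import operator
import torch

_OPS = {
    "torch.add": torch.add, "+": operator.add,
    "torch.sub": torch.sub, "-": operator.sub,
    "torch.mul": torch.mul, "*": operator.mul,
    "torch.div": torch.div, "/": operator.truediv,
}

_INPLACE_OPS = {
    "torch.add": torch.Tensor.add_, "+": operator.iadd,
    "torch.sub": torch.Tensor.sub_, "-": operator.isub,
    "torch.mul": torch.Tensor.mul_, "*": operator.imul,
    "torch.div": torch.Tensor.div_, "/": operator.itruediv,
}

def gen_binary_op_func(op, inplace=False):
    # Return a callable f(lhs, rhs); the in-place table mutates lhs,
    # matching how binary_op_ is used in the test above.
    return (_INPLACE_OPS if inplace else _OPS)[op]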
Example #52
0
def main():
    args = create_argparser().parse_args()

    dist_util.setup_dist()
    logger.configure()

    logger.log("creating model and diffusion...")
    model, diffusion = create_model_and_diffusion(
        **args_to_dict(args,
                       model_and_diffusion_defaults().keys()))
    model.load_state_dict(
        dist_util.load_state_dict(args.model_path, map_location="cpu"))
    model.to(dist_util.dev())
    if args.use_fp16:
        model.convert_to_fp16()
    model.eval()

    logger.log("loading classifier...")
    classifier = create_classifier(
        **args_to_dict(args,
                       classifier_defaults().keys()))
    classifier.load_state_dict(
        dist_util.load_state_dict(args.classifier_path, map_location="cpu"))
    classifier.to(dist_util.dev())
    if args.classifier_use_fp16:
        classifier.convert_to_fp16()
    classifier.eval()

    def cond_fn(x, t, y=None):
        # Classifier guidance: compute the gradient of the selected class
        # log-probability with respect to the noisy input x, scaled by
        # args.classifier_scale.
        assert y is not None
        with th.enable_grad():
            x_in = x.detach().requires_grad_(True)
            logits = classifier(x_in, t)
            log_probs = F.log_softmax(logits, dim=-1)
            selected = log_probs[range(len(logits)), y.view(-1)]
            return th.autograd.grad(selected.sum(),
                                    x_in)[0] * args.classifier_scale

    def model_fn(x, t, y=None):
        assert y is not None
        return model(x, t, y if args.class_cond else None)

    logger.log("sampling...")
    all_images = []
    all_labels = []
    while len(all_images) * args.batch_size < args.num_samples:
        model_kwargs = {}
        classes = th.randint(low=0,
                             high=NUM_CLASSES,
                             size=(args.batch_size, ),
                             device=dist_util.dev())
        model_kwargs["y"] = classes
        sample_fn = (diffusion.p_sample_loop
                     if not args.use_ddim else diffusion.ddim_sample_loop)
        sample = sample_fn(
            model_fn,
            (args.batch_size, 3, args.image_size, args.image_size),
            clip_denoised=args.clip_denoised,
            model_kwargs=model_kwargs,
            cond_fn=cond_fn,
            device=dist_util.dev(),
        )
        sample = ((sample + 1) * 127.5).clamp(0, 255).to(th.uint8)
        sample = sample.permute(0, 2, 3, 1)
        sample = sample.contiguous()

        gathered_samples = [
            th.zeros_like(sample) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(gathered_samples,
                        sample)  # gather not supported with NCCL
        all_images.extend(
            [sample.cpu().numpy() for sample in gathered_samples])
        gathered_labels = [
            th.zeros_like(classes) for _ in range(dist.get_world_size())
        ]
        dist.all_gather(gathered_labels, classes)
        all_labels.extend([labels.cpu().numpy() for labels in gathered_labels])
        logger.log(f"created {len(all_images) * args.batch_size} samples")

    arr = np.concatenate(all_images, axis=0)
    arr = arr[:args.num_samples]
    label_arr = np.concatenate(all_labels, axis=0)
    label_arr = label_arr[:args.num_samples]
    if dist.get_rank() == 0:
        shape_str = "x".join([str(x) for x in arr.shape])
        out_path = os.path.join(logger.get_dir(), f"samples_{shape_str}.npz")
        logger.log(f"saving to {out_path}")
        np.savez(out_path, arr, label_arr)

    dist.barrier()
    logger.log("sampling complete")