Example 1
    def test_sync_params_no_buffers(self):
        # Set up process group.
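        # TCPStore(host, port, is_server): the rank constructed with True hosts
        # the key/value store that the other ranks connect to for rendezvous.
        # The Gloo device below is bound to the loopback interface so the test
        # stays entirely on the local machine.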
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

        # Use all available devices on every process here (data is small, so should be fine).
        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64, device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [torch.zeros(10, device=torch.device('cuda', d)).chunk(5) for d in devices[1:]]
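        # Only the chunks on the first device carry the target values; the
        # zero-filled copies on the remaining local devices should be
        # overwritten by _sync_params below.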
        buffer_data = [[]] * len(parameter_data)

        c10d._sync_params(
            process_group,
            parameter_data=parameter_data,
            buffer_data=buffer_data,
            devices=devices,
            broadcast_bucket_size=10,
            broadcast_buffers=False)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])
Example 2
    def test_fp16(self):
        store = c10d.TCPStore('localhost', self.port, self.rank == 0)
        process_group = c10d.ProcessGroupNCCL(store, self.rank,
                                              self.world_size)

        gpus = gpus_for_rank(self.world_size)[self.rank]
        model = nn.Linear(1, 1, bias=False).cuda(gpus[0]).half()
        nn.init.constant_(model.weight, 1)
        ddp_model = distributed_c10d._DistributedDataParallelC10d(
            model,
            device_ids=[gpus[0]],
            process_group=process_group,
            bucket_cap_mb=1,
        )
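        # bucket_cap_mb=1 caps the gradient-allreduce bucket size at 1 MB,
        # which is more than enough for this single-weight model.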

        # Input 2**15, so that the gradients will overflow with a
        # world_size of 2, unless we normalize the gradient by the
        # world_size before the reduction
        input = torch.Tensor([[2**15]]).cuda(gpus[0]).half()

        # Step model
        ddp_model.train()
        output = ddp_model(input)
        loss = output.sum()
        loss.backward()

        self.assertFalse(
            any(torch.isinf(p.grad).any() for p in ddp_model.parameters()))
Example 3
    def test_dist_broadcast_coalesced(self):
        # Set up process group.
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [c10d.ProcessGroupGloo.create_tcp_device(interface="lo")]
        process_group = c10d.ProcessGroupGloo(store, self.rank, self.world_size, options)

        device = torch.device('cuda')

        target = torch.arange(10, dtype=torch.float64, device=device).chunk(5)
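        # chunk(5) yields five 2-element views of the same storage, so the
        # broadcast below has to coalesce several small tensors into a single
        # flat buffer.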

        if self.is_master:
            # All processes should have these tensors in the end.
            tensors = target
        else:
            # Non-master processes start with empty tensors and should be
            # filled with the tensors from the master.
            tensors = torch.zeros(10, device=device).chunk(5)

        c10d._dist_broadcast_coalesced(
            tensors,
            buffer_size=10,
            process_group=process_group)

        if not self.is_master:
            self.assertEqual(tensors, target)
Example 4
    def test_gloo_backend(self):
        store = c10d.TCPStore('localhost', self.port, self.rank == 0)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)
        self._test_ddp_with_process_group(process_group)
Example 5
    def test_sync_params_with_buffers(self):
        # Set up process group.
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        options = c10d.ProcessGroupGloo.Options()
        options.devices = [
            c10d.ProcessGroupGloo.create_tcp_device(interface="lo")
        ]
        process_group = c10d.ProcessGroupGloo(store, self.rank,
                                              self.world_size, options)

        devices = gpus_for_rank(self.world_size)[self.rank]
        target = torch.arange(10, dtype=torch.float64,
                              device='cuda:0').chunk(5)
        parameter_data = [target]
        parameter_data += [
            torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
            for d in devices[1:]
        ]

        # sync_params should do a dist_broadcast for buffers, so we only populate the master buffers and
        # then check that other processes' tensors end up matching.

        if self.is_master:
            buffer_data = [target]
            buffer_data += [
                torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                for d in devices[1:]
            ]
        else:
            buffer_data = [
                torch.zeros(10, device=torch.device('cuda', d)).chunk(5)
                for d in devices
            ]
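        # Note the asymmetry with the parameters: every rank seeds its first
        # device with the target, but only the master seeds its buffers,
        # because buffers are broadcast across processes as well.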

        c10d._sync_params(process_group,
                          parameter_data=parameter_data,
                          buffer_data=buffer_data,
                          devices=devices,
                          broadcast_bucket_size=10,
                          broadcast_buffers=True)

        for device_data in parameter_data:
            for i, parameter in enumerate(device_data):
                self.assertEqual(parameter, target[i])

        for device_data in buffer_data:
            for i, buffer in enumerate(device_data):
                self.assertEqual(buffer, target[i])
Example 6
def main():
    # is_chief indicates that this process handles shared tasks for the cluster,
    # such as logging and checkpointing.
    # is_chief must be true for at most one process in the training cluster.
    # $RANK is set by pytorch.distributed.launch
    # https://github.com/pytorch/pytorch/blob/db6e4576dab097abf01d032c3326e4b285eb8499/torch/distributed/launch.py#L193
    global is_chief, event_writer, global_example_count, last_recv_bytes, last_transmit_bytes, last_log_time

    is_chief = (not args.distributed) or (int(os.environ['RANK'])==0)

    global_example_count = 0
    if is_chief:
      print(f"Logging to {args.logdir}")
      event_writer = SummaryWriter(args.logdir)
      log_tb("first", time.time())
    else:
      event_writer = NoOp()

    # baseline number for network bytes
    last_recv_bytes, last_transmit_bytes = network_bytes()
    last_log_time = time.time()
    
    print(args)
    print("~~epoch\thours\ttop1Accuracy\n")

    # need to index validation directory before we start counting the time
    dataloader.sort_ar(args.data+'/validation')
    
    global reduce_function
    if args.c10d:
        print('Distributed: loading c10d process group')
        # https://github.com/pytorch/pytorch/blob/master/torch/lib/c10d/TCPStore.hpp
        torch.cuda.set_device(args.local_rank)
        rank = int(os.environ['RANK'])
        store = c10d.TCPStore(os.environ['MASTER_ADDR'], int(os.environ['MASTER_PORT']), rank==0) # (masterAddr, masterPort, isServer) 
        process_group = c10d.ProcessGroupNCCL(store, rank, args.world_size) # (store, rank, size)
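        # AllreduceOptions().reduceOp defaults to ReduceOp.SUM, mirroring the
        # dist.all_reduce(..., op=SUM) call in the branch below.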
        reduce_function = lambda t: process_group.allreduce(t, c10d.AllreduceOptions().reduceOp)
    elif args.distributed:
        print('Distributed: initializing process group')
        torch.cuda.set_device(args.local_rank)
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size)
        assert(args.world_size == dist.get_world_size())
        reduce_function = lambda t: dist.all_reduce(t, op=dist.reduce_op.SUM)
        print("Distributed: success (%d/%d)"%(args.local_rank, args.world_size))

    if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."

    print("Loading model")
    if args.factorized_resnet: model = resnet.resnet50factorized(pretrained=args.pretrained)
    else: model = resnet.resnet50(pretrained=args.pretrained)

    model = model.cuda()
    if args.init_bn0: resnet.init_dist_weights(model) # Sets batchnorm std to 0
    if args.fp16: model = network_to_half(model)
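    # network_to_half converts the model to fp16 (mixed-precision helpers of
    # this kind usually keep BatchNorm layers in fp32 for numerical stability).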
    best_prec5 = 93  # only save models that beat this accuracy; otherwise a checkpoint would be written after every epoch

    # Load model from checkpoint. This must happen before the distributed wrapping below, as the model is saved without the DDP wrapper.
    if args.resume:
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        model.load_state_dict(checkpoint['state_dict'])
        args.start_epoch = checkpoint['epoch']
        best_prec5 = checkpoint['best_prec5']

    if args.c10d:
        model = distributed_c10d._DistributedDataParallelC10d(model, process_group, device_ids=[args.local_rank], output_device=args.local_rank)
        c10d_sanity_check()
    elif args.distributed: model = nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    global model_params, master_params
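    # In fp16 mode, prep_param_lists keeps a separate fp32 master copy of the
    # weights: the optimizer updates the fp32 copy, and the fp16 model
    # parameters are synced back from it.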
    if args.fp16: model_params, master_params = prep_param_lists(model)
    else: model_params = master_params = model.parameters()

    optim_params = experimental_utils.bnwd_optim_params(model, model_params, master_params) if args.no_bn_wd else master_params

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(optim_params, 0, momentum=args.momentum, weight_decay=args.weight_decay) # start with 0 lr. Scheduler will change this later
    if args.resume: # we must resume optimizer params separately
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        optimizer.load_state_dict(checkpoint['optimizer'])

    # Load the data manager and lr scheduler from the phase definitions
    phases = eval(args.phases)
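    # Each phase is a dict describing part of the schedule: entries with 'bs'
    # drive the DataManager, entries with 'lr' drive the Scheduler below.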
    print("Creating data loaders (this could take 6-12 minutes)")
    dm = DataManager([p for p in phases if 'bs' in p])
    scheduler = Scheduler(optimizer, [p for p in phases if 'lr' in p], args.scale_lr)

    start_time = datetime.now()  # start the clock only after everything is loaded
    if args.evaluate: return validate(dm.val_dl, model, criterion, 0, start_time)

    if args.distributed:
        print('Syncing machines before training')
        sum_tensor(torch.tensor([1.0]).float().cuda())
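        # The dummy all-reduce doubles as a barrier, so every worker enters the
        # timed training loop together.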

    print("Begin training")
    estart = time.time()
    for epoch in range(args.start_epoch, scheduler.tot_epochs):
        estart = time.time()
        dm.set_epoch(epoch)

        train(dm.trn_dl, model, criterion, optimizer, scheduler, epoch)
        if args.prof: break
        prec5 = validate(dm.val_dl, model, criterion, epoch, start_time)

        is_best = prec5 > best_prec5
        best_prec5 = max(prec5, best_prec5)
        if args.local_rank == 0:
            if is_best: save_checkpoint(epoch, model, best_prec5, optimizer, is_best=True, filename='model_best.pth.tar')
            phase = dm.get_phase(epoch)
            if phase: save_checkpoint(epoch, model, best_prec5, optimizer, filename=f'sz{phase["bs"]}_checkpoint.path.tar')

    event_writer.export_scalars_to_json(args.logdir+'/scalars.json')
    event_writer.close()
Example 7
    def _create_store(self):
        return c10d.TCPStore(TCP_ADDR, TCP_PORT, True)
Example 8
    def test_nccl_backend(self):
        store = c10d.TCPStore('localhost', self.port, self.is_master)
        process_group = c10d.ProcessGroupNCCL(store, self.rank, self.world_size)
        self._test_ddp_with_process_group(process_group)
Example 9
    def setUp(self):
        addr = 'localhost'
        port = common.find_free_port()
        self.tcpstore = c10d.TCPStore(addr, port, True)
        self.prefix = "test_prefix"
        self.tcpstore.set_timeout(timedelta(seconds=300))
Example 10
    def _create_store(self):
        addr = 'localhost'
        port = common.find_free_port()
        store = c10d.TCPStore(addr, port, True)
        store.set_timeout(timedelta(seconds=300))
        return store
    def _create_store(self):
        addr = 'localhost'
        port = find_free_port()
        return c10d.TCPStore(addr, port, True)
    def setUp(self):
        addr = 'localhost'
        port = find_free_port()
        self.tcpstore = c10d.TCPStore(addr, port, True)
        self.prefix = "test_prefix"