Example no. 1
def get_bn(config):
    if config.use_sync_bn:
        group_size = config.kwargs.group_size
        var_mode = config.kwargs.var_mode
        if group_size == 1:
            bn_group = None
        else:
            world_size, rank = link.get_world_size(), link.get_rank()
            assert world_size % group_size == 0
            # split all ranks into world_size // group_size sync-BN groups
            bn_group = simple_group_split(world_size, rank,
                                          world_size // group_size)

        del config.kwargs['group_size']
        config.kwargs.group = bn_group
        config.kwargs.var_mode = (link.syncbnVarMode_t.L1 if var_mode == 'L1'
                                  else link.syncbnVarMode_t.L2)

        def BNFunc(*args, **kwargs):
            return link.nn.SyncBatchNorm2d(*args, **kwargs, **config.kwargs)

        return BNFunc
    else:

        def BNFunc(*args, **kwargs):
            return torch.nn.BatchNorm2d(*args, **kwargs, **config.kwargs)

        return BNFunc
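A minimal usage sketch for get_bn, assuming an EasyDict-style config; the keys and values below are illustrative, not taken from the repository:

from easydict import EasyDict

# use_sync_bn=False takes the plain torch.nn.BatchNorm2d branch, so no
# linklink process group is needed for this example
bn_config = EasyDict({'use_sync_bn': False,
                      'kwargs': {'eps': 1e-5, 'momentum': 0.1}})
BN = get_bn(bn_config)
bn_layer = BN(64)  # equivalent to torch.nn.BatchNorm2d(64, eps=1e-5, momentum=0.1)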
Example no. 2
def optim_entry(config):
    rank = link.get_rank()
    if config['type'] == 'FusedFP16SGD' and FusedFP16SGD is None:
        raise RuntimeError(
            'FusedFP16SGD is disabled due to linklink version, try using other optimizers'
        )
    if config['type'] == 'FusedFP16SGD' and rank > 0:
        config['kwargs']['verbose'] = False
    return globals()[config['type']](**config['kwargs'])
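A usage sketch for optim_entry; the SGD import and the model variable are illustrative assumptions, and the chosen optimizer class must be visible in this module's globals() for the name lookup to succeed:

from torch.optim import SGD  # noqa: F401  (exposes 'SGD' to globals())

# `model` is any nn.Module built earlier
opt_config = {
    'type': 'SGD',
    'kwargs': {'params': model.parameters(), 'lr': 0.1,
               'momentum': 0.9, 'weight_decay': 1e-4},
}
optimizer = optim_entry(opt_config)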
Example no. 3
def get_logger(name, level=logging.INFO):
    global _logger_names
    logger = logging.getLogger(name)
    if name in _logger_names:
        return logger

    _logger_names.append(name)
    if link.get_rank() > 0:
        logger.addFilter(RankFilter())

    return logger
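get_logger attaches a RankFilter on every non-zero rank, but the filter itself is not shown in this excerpt. A minimal sketch of what such a filter could look like, assuming its only job is to silence non-zero ranks (this is an assumption, not the repository's implementation):

class RankFilter(logging.Filter):
    """Assumed behaviour: drop every record so only rank 0 writes the log."""

    def filter(self, record):
        # returning False tells the logging module to discard the record
        return False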
Example no. 4
def initialize():
    # fall back to process 0 when the job is not launched under SLURM
    process_id = int(os.environ.get('SLURM_PROCID', 0))
    num_gpu = torch.cuda.device_count()
    if num_gpu > 0:
        torch.cuda.set_device(process_id % num_gpu)

    link.initialize()
    rank = link.get_rank()
    world_size = link.get_world_size()

    return rank, world_size
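A sketch of how these helpers would typically be chained at program start-up; the '__main__' guard and the logger name are illustrative:

if __name__ == '__main__':
    rank, world_size = initialize()
    logger = get_logger('global_logger')
    if rank == 0:
        logger.info('initialized: rank {} / world_size {}'.format(rank, world_size))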
Example no. 5
def dist_init():
    proc_id = int(os.environ['SLURM_PROCID'])
    ntasks = int(os.environ['SLURM_NTASKS'])
    node_list = os.environ['SLURM_NODELIST']
    num_gpus = torch.cuda.device_count()
    torch.cuda.set_device(proc_id % num_gpus)

    link.initialize()
    world_size = link.get_world_size()
    rank = link.get_rank()
    
    return rank, world_size
Example no. 6
    def _batch_unshuffle_ddp(self, x, idx_unshuffle):
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x, self.group_size, self.group_idx)
        batch_size_all = x_gather.shape[0]

        num_gpus = batch_size_all // batch_size_this

        # restored index for this gpu
        gpu_idx = link.get_rank() % self.group_size
        idx_this = idx_unshuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this]
Example no. 7
    def _get_group(bn_group_size):
        if bn_group_size in GroupSyncBatchNorm.group_by_size:
            return GroupSyncBatchNorm.group_by_size[bn_group_size]

        rank = get_rank()
        world_size = get_world_size()
        if bn_group_size is None:
            bn_group_size = world_size
        assert world_size % bn_group_size == 0
        bn_group_comm = simple_group_split(world_size, rank,
                                           world_size // bn_group_size)
        GroupSyncBatchNorm.group_by_size[bn_group_size] = bn_group_comm
        return bn_group_comm
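Because the communicator is cached in GroupSyncBatchNorm.group_by_size, every layer that asks for the same group size shares a single group. A small illustrative check, assuming an 8-rank job and that _get_group is exposed as a staticmethod:

# with world_size == 8 the first call splits ranks into two groups of four
# and caches the communicator; the second call returns the cached object
comm_a = GroupSyncBatchNorm._get_group(4)
comm_b = GroupSyncBatchNorm._get_group(4)
assert comm_a is comm_b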
Example no. 8
    def __init__(self,
                 encoder_q,
                 encoder_k,
                 K=65536,
                 m=0.999,
                 T=0.07,
                 mlp=False,
                 group_size=8):
        """
        K: queue size; number of negative keys (default: 65536)
        m: moco momentum of updating key encoder (default: 0.999)
        T: softmax temperature (default: 0.07)
        group_size: size of the process group used for ShuffleBN (default: 8); if None, shuffle across all gpus
        """
        super(MoCo, self).__init__()

        self.K = K
        self.m = m
        self.T = T
        dim = encoder_q.num_classes

        self.encoder_q = encoder_q
        self.encoder_k = encoder_k

        if mlp:  # hack: brute-force replacement
            dim_mlp = self.encoder_q.fc.weight.shape[1]
            self.encoder_q.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp),
                                              nn.ReLU(), self.encoder_q.fc)
            self.encoder_k.fc = nn.Sequential(nn.Linear(dim_mlp, dim_mlp),
                                              nn.ReLU(), self.encoder_k.fc)

        for param_q, param_k in zip(self.encoder_q.parameters(),
                                    self.encoder_k.parameters()):
            param_k.data.copy_(param_q.data)  # initialize
            param_k.requires_grad = False  # not update by gradient

        # create the queue
        self.register_buffer("queue", torch.randn(dim, K))
        self.queue = nn.functional.normalize(self.queue, dim=0)

        self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

        rank = link.get_rank()
        world_size = link.get_world_size()
        self.group_size = world_size if group_size is None else min(
            world_size, group_size)

        assert world_size % self.group_size == 0
        self.group_idx = simple_group_split(world_size, rank,
                                            world_size // self.group_size)
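A construction sketch for this MoCo wrapper. The resnet50 factory and the embedding dimension are illustrative assumptions; the constructor only requires that each encoder exposes a .num_classes attribute and (for mlp=True) an .fc head:

# hypothetical backbones; any nn.Module exposing .num_classes and .fc works
encoder_q = resnet50(num_classes=128)
encoder_k = resnet50(num_classes=128)
model = MoCo(encoder_q, encoder_k, K=65536, m=0.999, T=0.07, mlp=True, group_size=8)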
Example no. 9
def dist_init(method='slurm', device_id=0):
    if method == 'slurm':
        proc_id = int(os.environ['SLURM_PROCID'])
        # ntasks = int(os.environ['SLURM_NTASKS'])
        # node_list = os.environ['SLURM_NODELIST']
        num_gpus = torch.cuda.device_count()
        torch.cuda.set_device(proc_id % num_gpus)
    elif method == 'single_node':
        torch.cuda.set_device(device_id)

    link.initialize()
    world_size = link.get_world_size()
    rank = link.get_rank()

    return rank, world_size
Example no. 10
    def __init__(self, dataset, world_size=None, rank=None, round_up=True):
        if world_size is None:
            world_size = link.get_world_size()
        if rank is None:
            rank = link.get_rank()
        self.dataset = dataset
        self.world_size = world_size
        self.rank = rank
        self.round_up = round_up
        self.epoch = 0

        self.num_samples = int(
            math.ceil(len(self.dataset) * 1.0 / self.world_size))
        if self.round_up:
            self.total_size = self.num_samples * self.world_size
        else:
            self.total_size = len(self.dataset)
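Only the constructor is shown. A minimal sketch of the __iter__/__len__ pair such a class usually provides, modelled on torch's DistributedSampler rather than this repository's code:

    def __iter__(self):
        # shuffle deterministically from the epoch so every rank sees the same order
        g = torch.Generator()
        g.manual_seed(self.epoch)
        indices = torch.randperm(len(self.dataset), generator=g).tolist()
        if self.round_up:
            # pad so each rank receives exactly num_samples indices
            indices += indices[:self.total_size - len(indices)]
        offset = self.num_samples * self.rank
        return iter(indices[offset:offset + self.num_samples])

    def __len__(self):
        return self.num_samples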
Example no. 11
def _serialize_to_tensor(data, group=None):
    # backend = link.get_backend(group)
    # assert backend in ["gloo", "nccl"]
    # device = torch.device("cpu" if backend == "gloo" else "cuda")
    device = torch.cuda.current_device()

    buffer = pickle.dumps(data)
    if len(buffer) > 1024**3:
        import logging
        logger = logging.getLogger('global')
        logger.warning(
            "Rank {} trying to all-gather {:.2f} GB of data on device {}".format(
                link.get_rank(), len(buffer) / (1024**3), device))
    storage = torch.ByteStorage.from_buffer(buffer)
    tensor = torch.ByteTensor(storage).to(device=device)
    return tensor
Example no. 12
    def _batch_shuffle_ddp(self, x):
        # gather from all gpus
        batch_size_this = x.shape[0]
        x_gather = concat_all_gather(x, self.group_size, self.group_idx)
        batch_size_all = x_gather.shape[0]

        num_gpus = batch_size_all // batch_size_this

        # random shuffle index
        idx_shuffle = torch.randperm(batch_size_all).cuda()

        # broadcast to all gpus
        link.broadcast(idx_shuffle, 0, group_idx=self.group_idx)

        # index for restoring
        idx_unshuffle = torch.argsort(idx_shuffle)

        # shuffled index for this gpu
        gpu_idx = link.get_rank() % self.group_size
        idx_this = idx_shuffle.view(num_gpus, -1)[gpu_idx]

        return x_gather[idx_this], idx_unshuffle
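Inside the key-encoder forward pass these two helpers are used as a pair. A hedged sketch of that round trip, following the standard MoCo recipe; the method name _forward_key is hypothetical:

    @torch.no_grad()
    def _forward_key(self, im_k):
        # shuffle the batch across the ShuffleBN group, encode, then restore order
        im_k, idx_unshuffle = self._batch_shuffle_ddp(im_k)
        k = nn.functional.normalize(self.encoder_k(im_k), dim=1)
        return self._batch_unshuffle_ddp(k, idx_unshuffle)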
Example no. 13
 def setup_env(self):
     # dist
     self.dist = EasyDict()
     self.dist.rank = link.get_rank()
     self.dist.world_size = link.get_world_size()
     self.prototype_info.world_size = self.dist.world_size
     # directories
     self.path = EasyDict()
     self.path.root_path = os.path.dirname(self.config_file)
     self.path.save_path = os.path.join(self.path.root_path, 'checkpoints')
     self.path.event_path = os.path.join(self.path.root_path, 'events')
     self.path.result_path = os.path.join(self.path.root_path, 'results')
     makedir(self.path.save_path)
     makedir(self.path.event_path)
     makedir(self.path.result_path)
     # tb_logger
     if self.dist.rank == 0:
         self.tb_logger = SummaryWriter(self.path.event_path)
     # logger
     create_logger(os.path.join(self.path.root_path, 'log.txt'))
     self.logger = get_logger(__name__)
     self.logger.info(f'config: {pprint.pformat(self.config)}')
     if 'SLURM_NODELIST' in os.environ:
         self.logger.info(f"hostnames: {os.environ['SLURM_NODELIST']}")
     # load pretrain checkpoint
     if hasattr(self.config.saver, 'pretrain'):
         self.state = torch.load(self.config.saver.pretrain.path, 'cpu')
         self.logger.info(
             f"Recovering from {self.config.saver.pretrain.path}, keys={list(self.state.keys())}"
         )
         if hasattr(self.config.saver.pretrain, 'ignore'):
             self.state = modify_state(self.state,
                                       self.config.saver.pretrain.ignore)
     else:
         self.state = {}
         self.state['last_iter'] = 0
     # others
     torch.backends.cudnn.benchmark = True
Example no. 14
    def __init__(self,
                 dataset,
                 total_iter,
                 batch_size,
                 world_size=None,
                 rank=None,
                 last_iter=0):
        if world_size is None:
            world_size = link.get_world_size()
        if rank is None:
            rank = link.get_rank()
        assert rank < world_size
        self.dataset = dataset
        self.total_iter = total_iter
        self.batch_size = batch_size
        self.world_size = world_size
        self.rank = rank
        self.last_iter = last_iter

        self.total_size = self.total_iter * self.batch_size

        self.indices = self.gen_new_list()
        self.call = 0
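gen_new_list is referenced but not included in the excerpt. A minimal sketch of how an iteration-based sampler commonly builds it, assuming numpy is imported as np; this is an assumption, not the repository's implementation:

    def gen_new_list(self):
        # fix the seed so every rank derives the identical global ordering,
        # tile the dataset until world_size * total_size indices exist,
        # then keep this rank's contiguous slice
        np.random.seed(0)
        all_size = self.total_size * self.world_size
        indices = np.arange(len(self.dataset))
        num_repeat = (all_size - 1) // len(indices) + 1
        indices = np.tile(indices, num_repeat)[:all_size]
        np.random.shuffle(indices)
        beg = self.total_size * self.rank
        return indices[beg:beg + self.total_size]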
Example no. 15
 def _setup_env(self):
     # distribution information
     self.dist = EasyDict()
     self.dist.rank = link.get_rank()
     self.dist.world_size = link.get_world_size()
     # directories
     self.path = EasyDict()
     self.path.root_path = self.work_dir
     self.path.save_path = os.path.join(self.path.root_path, 'checkpoints')
     self.path.event_path = os.path.join(self.path.root_path, 'events')
     self.path.result_path = os.path.join(self.path.root_path, 'results')
     makedir(self.path.save_path)
     makedir(self.path.event_path)
     makedir(self.path.result_path)
     # create tensorboard logger
     if self.dist.rank == 0:
         self.tb_logger = SummaryWriter(self.path.event_path)
     # create logger
     create_logger(os.path.join(self.path.root_path, 'log.txt'))
     self.logger = get_logger(__name__)
     self.logger.info(f'config: {pprint.pformat(self.config)}')
     self.logger.info(f"hostnames: {os.environ['SLURM_NODELIST']}")
     # others
     torch.backends.cudnn.benchmark = True
Example no. 16
def send_info(info):
    PrototypeINFO = {"name": "Prototype", "version": __version__}
    PrototypeINFO.update(info)
    if link.get_rank() == 0:
        t = threading.Thread(target=send, args=(PrototypeINFO, ))
        t.start()
Example no. 17
def validate(val_loader, model, fusion_list=None, fuse_prob=False):
    batch_time = AverageMeter(0)
    losses = AverageMeter(0)
    top1 = AverageMeter(0)
    top5 = AverageMeter(0)

    # switch to evaluate mode
    if fusion_list is not None:
        model_list = []
        for i in range(len(fusion_list)):
            model_list.append(model_entry(config.model))
            model_list[i].cuda()
            model_list[i] = DistModule(model_list[i], args.sync)
            load_state(fusion_list[i], model_list[i])
            model_list[i].eval()
        if fuse_prob:
            softmax = nn.Softmax(dim=1)
    else:
        model.eval()

    rank = link.get_rank()
    world_size = link.get_world_size()

    logger = logging.getLogger('global_logger')

    criterion = nn.CrossEntropyLoss()

    end = time.time()
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            input = input.cuda() if not args.fp16 else input.half().cuda()
            target = target.cuda()
            # compute output
            if fusion_list is not None:
                output_list = []
                for model_idx in range(len(fusion_list)):
                    output = model_list[model_idx](input)
                    if fuse_prob:
                        output = softmax(output)
                    output_list.append(output)
                output = torch.stack(output_list, 0)
                output = torch.mean(output, 0)
            else:
                output = model(input)

            # measure accuracy and record loss
            # the loss is intentionally not divided by world_size here; it is reduced later
            loss = criterion(output, target)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

            num = input.size(0)
            losses.update(loss.item(), num)
            top1.update(prec1.item(), num)
            top5.update(prec5.item(), num)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % config.print_freq == 0 and rank == 0:
                logger.info(
                    'Test: [{0}/{1}]\tTime {batch_time.val:.3f} ({batch_time.avg:.3f})'
                    .format(i, len(val_loader), batch_time=batch_time))

    # gather final results
    total_num = torch.Tensor([losses.count])
    loss_sum = torch.Tensor([losses.avg * losses.count])
    top1_sum = torch.Tensor([top1.avg * top1.count])
    top5_sum = torch.Tensor([top5.avg * top5.count])
    link.allreduce(total_num)
    link.allreduce(loss_sum)
    link.allreduce(top1_sum)
    link.allreduce(top5_sum)
    final_loss = loss_sum.item() / total_num.item()
    final_top1 = top1_sum.item() / total_num.item()
    final_top5 = top5_sum.item() / total_num.item()

    if rank == 0:
        logger.info(
            ' * Prec@1 {:.3f}\tPrec@5 {:.3f}\tLoss {:.3f}\ttotal_num={}'.
            format(final_top1, final_top5, final_loss, total_num.item()))

    model.train()

    return final_loss, final_top1, final_top5
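validate (and the train loops below) rely on an AverageMeter(length) helper that is not shown. A minimal sketch, assuming length > 0 keeps a sliding window while length == 0 accumulates everything seen so far; this is an assumption about the helper, not its actual source:

class AverageMeter(object):
    """Assumed behaviour: track the latest value and a (windowed) running average."""

    def __init__(self, length=0):
        self.length = length
        self.history = []
        self.val = self.avg = self.sum = 0.0
        self.count = 0

    def update(self, val, num=1):
        self.val = val
        self.count += num
        if self.length > 0:
            # sliding window over the most recent `length` values
            self.history.append(val)
            if len(self.history) > self.length:
                self.history.pop(0)
            self.avg = sum(self.history) / len(self.history)
        else:
            # plain running average weighted by sample count
            self.sum += val * num
            self.avg = self.sum / self.count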
Example no. 18
def train(train_loader, val_loader, model, optimizer, lr_scheduler, start_iter,
          tb_logger):

    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()

    logger = logging.getLogger('global_logger')

    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    for i, (input, target) in enumerate(train_loader):
        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()

        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):

            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss

            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                        'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                        'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                        'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                        'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                        'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                        'LR {lr:.4f}'.format(curr_step,
                                             len(train_loader),
                                             batch_time=batch_time,
                                             data_time=data_time,
                                             loss=losses,
                                             top1=top1,
                                             top5=top5,
                                             lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)

            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)

            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                save_checkpoint(
                    {
                        'step': curr_step,
                        'arch': config.model.arch,
                        'state_dict': model.state_dict(),
                        'best_prec1': best_prec1,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, config.save_path + '/ckpt')

        end = time.time()
Example no. 19
def train(train_loader, val_loader, model, optimizer, lr_scheduler, start_iter, tb_logger):

    global best_prec1

    batch_time = AverageMeter(config.print_freq)
    fw_time = AverageMeter(config.print_freq)
    bp_time = AverageMeter(config.print_freq)
    sy_time = AverageMeter(config.print_freq)
    step_time = AverageMeter(config.print_freq)
    data_time = AverageMeter(config.print_freq)
    losses = AverageMeter(config.print_freq)
    top1 = AverageMeter(config.print_freq)
    top5 = AverageMeter(config.print_freq)

    # switch to train mode
    model.train()

    world_size = link.get_world_size()
    rank = link.get_rank()

    logger = logging.getLogger('global_logger')

    end = time.time()

    label_smooth = config.get('label_smooth', 0.0)
    if label_smooth > 0:
        logger.info('using label_smooth: {}'.format(label_smooth))
        criterion = LabelSmoothCELoss(label_smooth, 1000)
    else:
        criterion = nn.CrossEntropyLoss()

    T_min, T_max = args.Tmin, args.Tmax
    # print (T_min, T_max)

    # log-linearly interpolate between K_min and K_max as ITEMS goes from 0 to ALL_ITEMS
    def Log_UP(K_min, K_max, ITEMS, ALL_ITEMS):
        Kmin, Kmax = math.log(K_min) / math.log(10), math.log(K_max) / math.log(10)
        return torch.tensor([math.pow(10, Kmin + (Kmax - Kmin) / ALL_ITEMS * ITEMS)]).float().cuda()

    # print (model)
    TIME = time.time()

    for i, (input, target) in enumerate(train_loader):

        curr_step = start_iter + i
        lr_scheduler.step(curr_step)
        current_lr = lr_scheduler.get_lr()[0]

        if (curr_step % config.print_freq == 0):
            t = Log_UP(T_min, T_max, curr_step, len(train_loader))
            if (t < 1):
                k = 1 / t
            else:
                k = torch.tensor([1]).float().cuda()

            # propagate the current (k, t) pair to every conv pair in the four
            # residual stages (3, 4, 6 and 3 blocks respectively)
            stages = [model.module.layer1, model.module.layer2,
                      model.module.layer3, model.module.layer4]
            for stage, num_blocks in zip(stages, [3, 4, 6, 3]):
                for j in range(num_blocks):
                    stage[j].conv1.k = k
                    stage[j].conv2.k = k
                    stage[j].conv1.t = t
                    stage[j].conv2.t = t

            # print ('current k {:.5e} current t {:.5e}'.format(k[0], t[0]))

        # measure data loading time
        data_time.update(time.time() - end)

        # transfer input to gpu
        target = target.cuda()
        input = input.cuda() if not args.fp16 else input.cuda().half()

        # forward
        output = model(input)
        loss = criterion(output, target) / world_size

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output, target, topk=(1, 5))

        reduced_loss = loss.clone()
        reduced_prec1 = prec1.clone() / world_size
        reduced_prec5 = prec5.clone() / world_size

        link.allreduce(reduced_loss)
        link.allreduce(reduced_prec1)
        link.allreduce(reduced_prec5)

        losses.update(reduced_loss.item())
        top1.update(reduced_prec1.item())
        top5.update(reduced_prec5.item())

        # backward
        optimizer.zero_grad()

        if isinstance(optimizer, FusedFP16SGD):
            optimizer.backward(loss)
            reduce_gradients(model, args.sync)
            optimizer.step()
        elif isinstance(optimizer, FP16SGD):
            def closure():
                # backward
                optimizer.backward(loss, False)
                # sync gradients
                reduce_gradients(model, args.sync)
                # check overflow, convert to fp32 grads, downscale
                optimizer.update_master_grads()
                return loss
            optimizer.step(closure)
        else:
            loss.backward()
            reduce_gradients(model, args.sync)
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        if curr_step % config.print_freq == 0 and rank == 0:
            tb_logger.add_scalar('loss_train', losses.avg, curr_step)
            tb_logger.add_scalar('acc1_train', top1.avg, curr_step)
            tb_logger.add_scalar('acc5_train', top5.avg, curr_step)
            tb_logger.add_scalar('lr', current_lr, curr_step)
            logger.info('Iter: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})\t'
                  'LR {lr:.4f}'.format(
                   curr_step, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5, lr=current_lr))

        if curr_step > 0 and curr_step % config.val_freq == 0:
            val_loss, prec1, prec5 = validate(val_loader, model)

            if tb_logger is not None:
                tb_logger.add_scalar('loss_val', val_loss, curr_step)
                tb_logger.add_scalar('acc1_val', prec1, curr_step)
                tb_logger.add_scalar('acc5_val', prec5, curr_step)


            if rank == 0:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                # save_checkpoint({
                    # 'step': curr_step,
                    # 'arch': config.model.arch,
                    # 'state_dict': model.state_dict(),
                    # 'best_prec1': best_prec1,
                    # 'optimizer' : optimizer.state_dict(),
                # }, is_best, config.save_path+'/ckpt'+str(TIME % 100000))

        end = time.time()
Example no. 20
def makedir(path):
    # only rank 0 creates the directory; the barrier keeps the other ranks
    # from touching it before it exists
    if link.get_rank() == 0 and not os.path.exists(path):
        os.makedirs(path)
    link.barrier()