def _benchmark_communication(self):
        logger.info('Benchmarking communication performance...')
        comm_profiler = CommunicationProfiler(allreduce_async_, synchronize)
        sizes, times = comm_profiler.benchmark(num_iters=10)

        def _fit_linear_function(x, y):
            X = np.array(x).reshape((-1, 1)) * 4
            Y = np.array(y)
            model = LinearRegression()
            model.fit(X, Y)
            alpha = model.intercept_
            beta = model.coef_[0]
            return alpha, beta

        alpha, beta = _fit_linear_function(sizes, times)
        self.alpha = alpha
        self.beta = beta
        alpha_tensor = torch.ones(1) * alpha
        beta_tensor = torch.ones(1) * beta
        alpha_tensor = broadcast(alpha_tensor, root_rank=0)
        beta_tensor = broadcast(beta_tensor, root_rank=0)
        if rank() != 0:
            self.alpha = float(alpha_tensor[0])
            self.beta = float(beta_tensor[0])
        logger.info(
            '[rank:{}] Communication performance fitted with f(p)=a+b*p, where a={} and b={}'
            .format(rank(), self.alpha, self.beta))
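
The regression above fits a latency/bandwidth model t(p) = alpha + beta * p, where p is the message size in bytes (element counts are multiplied by 4 for fp32). Later examples feed these coefficients to utils.predict_allreduce_time_with_size; a minimal sketch of such a predictor, assuming it simply evaluates the linear model (the real utils implementation may differ), is:

def predict_allreduce_time_with_size(alpha, beta, size_bytes, num_workers):
    """Sketch: estimate allreduce time from the fitted alpha-beta model."""
    if num_workers <= 1:
        return 0.0  # no communication on a single worker
    return alpha + beta * size_bytes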
Example #2
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=density)

        if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
            grads = tensor.cpu().numpy()
            layer_idx = self._sequential_keys.index(name)
            np.save(
                '%s/r%d_gradients_iter_%d::%s::%d' %
                (self._gradient_path, rank(), self.train_iter, name,
                 layer_idx), grads)
        indexes = ctx
        if indexes is None:
            handle = allgather_async(tensor_compressed, name)
            handle_idx = None  # quantization uses all indices
        else:
            handle = allgather_async(selected_values, name)
            handle_idx = allgather_async(indexes.int(), name + '_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)
        return (handle, handle_idx), ctx
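
Here compress(tensor, name, ratio=density) is assumed to return the flat tensor, a context holding the selected indexes (None for quantization-style compressors, which gather every position), and the selected values that are actually transmitted. A rough top-k sparsifier consistent with that contract, purely as an illustration (the project's real compressor classes may differ), could look like:

import torch

class TopKCompressorSketch(object):
    """Illustrative top-k sparsifier matching compress(tensor, name, ratio)."""

    def compress(self, tensor, name, ratio=0.01):
        numel = tensor.numel()
        k = max(int(numel * ratio), 1)
        # Pick the k largest-magnitude entries of the flattened gradient.
        _, indexes = torch.topk(tensor.abs(), k)
        selected_values = tensor[indexes]
        # Return (tensor, ctx, selected_values); ctx carries the selected indexes.
        return tensor, indexes, selected_values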
 def _print_profiling(self):
     if self._profiling and rank() == 0 and len(self._allreduce_timers.keys()) > 0 \
             and self.train_iter % settings.DISPLAY == 0:
         # and len(self._allreduce_timers.get(list(self._allreduce_timers.keys())[0], [])) == settings.DISPLAY:
         cps = self._compression_timers  # compression
         ars = self._allreduce_timers  # allreduce times
         ups = self._update_times  # update times
         r = rank()
         tcp = 0.0
         tar = 0.0
         tup = 0.0
         total = 0.0
         for k in ars:
             if len(cps) > 0:
                 acp = np.mean(cps[k])
                 tcp += acp
             aar = np.mean(ars[k])
             tar += aar
             aup = np.mean(ups[k])
             tup += aup
         total = tcp + tar + tup
         logger.info(
             '[%d]: Total compress: %f, allreduce: %f, update: %f, total: %f',
             r, tcp, tar, tup, total)
         # Ahmed - log rank-0 micro-measurements to wandb
         if r == 0:
             self._tb.log('micro/compress_ms', tcp * 1000)
             self._tb.log('micro/comm_ms', tar * 1000)
             self._tb.log('micro/gradagg_ms', tup * 1000)
             self._tb.log('micro/total_ms', total * 1000)
         cps.clear()
         ars.clear()
         ups.clear()
Example #4
 def _print_profiling(self):
     if self._profiling and rank() == 0 and len(
             self._allreduce_timers.keys()) > 0 and len(
                 self._allreduce_timers.get(
                     list(self._allreduce_timers.keys())[0], [])) == 40:
         cps = self._compression_timers  # compression
         ars = self._allreduce_timers  # allreduce times
         ups = self._update_times  # update times
         r = rank()
         tcp = 0.0
         tar = 0.0
         tup = 0.0
         total = 0.0
         for k in cps:
             acp = np.mean(cps[k])
             tcp += acp
             aar = np.mean(ars[k])
             tar += aar
             aup = np.mean(ups[k])
             tup += aup
             #logger.info('[%d][%s]: %f, %f, %f', r, k, acp, aar, aup)
         total = tcp + tar + tup
         cps.clear()
         ars.clear()
         ups.clear()
Example #5
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     tensor_compressed, ctx = tensor, None #self._compression.compress(tensor, name)
     if settings.LOGGING_GRADIENTS and rank() == 0:
         grads = tensor.cpu().numpy()
         np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
     handle = allreduce_async_(tensor_compressed, average=True, name=name)
     return handle, ctx
Example #6
    def __init__(self, model, hvd_opt, num_steps=10**6):
        """Construct a new ScheduledOptimizer, which uses horovod optimizer under the hood for averaging gradients
         across all the Horovod ranks.

        Args:
            model: The training model. ByteScheduler uses the model object to register hooks.
            hvd_opt: Optimizer to use for averaging gradients and applying updates.
            num_steps: The maximum number of training steps. ByteScheduler needs to know when to stop cross-iteration
            scheduling.

        Usage example:
        ```
        import bytescheduler.pytorch.horovod as bsc
        bsc.init()
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters, compression)
        optimizer = bsc.ScheduledOptimizer(model, optimizer, num_steps)
        ```
        """
        self._model = model
        self._opt = hvd_opt
        self._logger = logging.getLogger("ByteScheduler")
        self._logger.debug("hvd size {}, rank {}".format(size(), rank()))
        self._desc = "rank {}".format(rank())

        # Track training steps
        self._step = 0
        self._final_step = num_steps

        # Use lock to block the forward propagation of each parameter.
        self._locks = {}
        for param_group in self.param_groups:
            for p in param_group['params']:
                self._locks[p] = threading.Lock()

        # The closer a parameter is to the input layer, the higher its priority.
        self._priority_indexes = {}
        priority = 0
        for p in model.parameters():
            self._priority_indexes[p] = priority
            priority += 1

        assert len(self._grad_accs) == 0
        if size() > 1:
            self._register_forward_hooks()
            self._register_hooks()

        # Poll whether the tensor is ready for allreduce or whether the allreduce is finished.
        self.event_queue = queue.Queue()
        self._poller = threading.Thread(target=self._poll, args=())
        self._poller.start()

        # Let rank 0 decide the communication order.
        self._immediate = False
        self._rank = rank()
        if self._rank != 0:
            self._immediate = True

        core.start(rank=self._rank, arch="allreduce")
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     stime = time.time()
     #print("Rank: %s Original Values: %s" %(rank(), tensor))
     tensor, ctx, selected_tensors = self._compression.compress(tensor, name) #tensor, None
     #logger.info("Compression Time: %s" %(time.time()-stime))
     if settings.LOGGING_GRADIENTS and rank() == 0:
         grads = tensor.cpu().numpy()
         np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
     #print("Rank: %s Selected Values: %s" %(rank(), selected_tensors))
     handle = allreduce_async_(selected_tensors, average=True, name=name) #(tensor_compressed, average=True, name=name) 
     return handle, None
Example #8
 def _allreduce_grad_async(self, p, name):
     tensor = p.data.view(-1)
     if False and rank() == 0 and self.train_iter % 200 == 0 and self.train_iter < 3000:
         grads = tensor.cpu().numpy()
         layer_idx = self._sequential_keys.index(name)
         np.save(
             '%s/r%d_gradients_iter_%d::%s::%d' %
             (self._gradient_path, rank(), self.train_iter, name,
              layer_idx), grads)
     allreduce_name = name
     if len(name) > 200:
         allreduce_name = name[0:100] + '...' + name[-100:]
     handle = allreduce_async_(tensor, average=True, name=allreduce_name)
     return handle, None
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(tensor, name, ratio=density)
        self._selected_num_gradients.append(int(ctx.numel()))

        if settings.LOGGING_GRADIENTS and rank() == 0:
            grads = tensor.cpu().numpy()
            np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name+'_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name, time.time()-stime)
        return (handle, handle_idx), ctx 
Example #10
    def _benchmark_communication(self):
        #logger.info('Benchmarking communication performance...')
        comm_profiler = CommunicationProfiler(allreduce_async_, synchronize)
        sizes, times = comm_profiler.benchmark(num_iters=10)

        def _fit_linear_function(x, y):
            X = np.array(x).reshape((-1, 1)) * 4
            Y = np.array(y)
            model = LinearRegression()
            model.fit(X, Y)
            alpha = model.intercept_
            beta = model.coef_[0]
            #A = np.vstack([X, np.ones(len(X))]).T
            #beta, alpha = np.linalg.lstsq(A, Y, rcond=None)[0]
            return alpha, beta

        alpha, beta = _fit_linear_function(sizes, times)
        self.alpha = alpha
        self.beta = beta
        alpha_tensor = torch.ones(1) * alpha
        beta_tensor = torch.ones(1) * beta
        alpha_tensor = broadcast(alpha_tensor, root_rank=0)
        beta_tensor = broadcast(beta_tensor, root_rank=0)
        if rank() != 0:
            self.alpha = float(alpha_tensor[0])
            self.beta = float(beta_tensor[0])
Example #11
def broadcast_object(obj,
                     root_rank=0,
                     name=None,
                     process_set=global_process_set):
    """
    Serializes and broadcasts an object from root rank to all other processes.
    Typical usage is to broadcast the `optimizer.state_dict()`, for example:

    .. code-block:: python

        state_dict = broadcast_object(optimizer.state_dict(), 0)
        if hvd.rank() > 0:
            optimizer.load_state_dict(state_dict)

    Arguments:
        obj: An object capable of being serialized without losing any context.
        root_rank: The rank of the process from which parameters will be
                   broadcasted to all other processes.
        name: Optional name to use during broadcast, will default to the class
              type.
        process_set: Process set object to limit this operation to a subset of
                     Horovod processes. Default is the global process set.
    Returns:
        The object that was broadcast from the `root_rank`.
    """
    if name is None:
        name = type(obj).__name__

    if rank() == root_rank:
        b = io.BytesIO()
        cloudpickle.dump(obj, b)
        t = torch.ByteTensor(bytearray(b.getvalue()))
        sz = torch.IntTensor([t.shape[0]])
        broadcast_(sz, root_rank, name + '.sz', process_set)
    else:
        sz = torch.IntTensor([0])
        broadcast_(sz, root_rank, name + '.sz', process_set)
        t = torch.ByteTensor(sz.tolist()[0])

    broadcast_(t, root_rank, name + '.t', process_set)

    if rank() != root_rank:
        buf = io.BytesIO(t.numpy().tobytes())
        obj = cloudpickle.load(buf)

    return obj
 def increase_one_epoch(self):
     self.train_epoch += 1
     if rank() == 0:
         density = self.get_current_density()
         size = np.sum(self._sizes)
         k = max(int(size * density), 1)
         logger.info('Average number of selected gradients: %f, exact k: %d', np.mean(self._selected_num_gradients), k)
         logger.info('The number of selected gradients: %s', self._selected_num_gradients)
     self._selected_num_gradients = []
Example #13
    def reset(self):
        self.num_replicas = size()
        self.rank = rank()

        # Exclude any samples we have already processed this epoch
        self.remaining_indices = [idx for idx in range(len(self.dataset))
                                  if idx not in self.processed_indices]

        self.num_samples = int(math.ceil(len(self.remaining_indices) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
 def _print_profiling(self):
     if self._profiling and rank() == 0 and len(self._allreduce_timers.keys()) > 0 and len(
             self._allreduce_timers.get(
                 list(self._allreduce_timers.keys())[0], [])) == 40:
         cps = self._compression_timers # compression
         ars = self._allreduce_timers # allreduce times
         ups = self._update_times # update times
         r = rank()
         tcp, tar, tup, total = 0.0, 0.0, 0.0, 0.0
         for k in cps:
             acp = np.mean(cps[k])
             tcp += acp
             aar = np.mean(ars[k])
             tar += aar
             aup = np.mean(ups[k])
             tup += aup
         total = tcp+tar+tup
         logger.info('[%d]: Total compress: %f, allreduce: %f, update: %f, total: %f', r, tcp, tar, tup, total)
         cps.clear()
         ars.clear()
         ups.clear()
Example #15
    def reset(self):
        self.num_replicas = size()
        self.rank = rank()

        # Exclude any samples we have already processed this epoch
        all_indices = [idx for idx in range(len(self.dataset))]
        if self.shuffle:
            # Shuffle indices across workers deterministically in place
            seed = self.seed + self.epoch
            random.Random(seed).shuffle(all_indices)
        self.remaining_indices = all_indices[self.processed_num:]

        self.num_samples = int(
            math.ceil(len(self.remaining_indices) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas
def _init_logging():
    class MyLogger:
        def __init__(self, logpath):
            self.log_file = open(logpath, 'w+')

        def debug(self, msg):
            self.log_file.write(msg + '\n')
            # self.log_file.write("\n")
        def __del__(self):
            self.log_file.close()

    logdir = "~/horovod_logs/hooks"
    logdir = os.path.expanduser(logdir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)
    dt = datetime.fromtimestamp(time.time())
    timestamp = dt.strftime("%Y%m%d-%H%M%S")
    logging_file = os.path.join(logdir,
                                "hook-{}-rank{}.log".format(timestamp, rank()))
    print(logging_file)
    logger = MyLogger(logging_file)
    return logger
Example #17
    def _generate_groups_mgwfbp(self):
        num_of_workers = size()
        p_alpha_beta_56Gbps = {
            64: (0.00080632079996292579, 1.8 * 3.2713239529771973e-10),
            32: (0.00040632079996292579, 1.5 * 3.2713239529771973e-10),
            16: (0.00023583677659915685 * 3, 4.0594787739537565e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        p_alpha_beta_10Gbps = {
            64: (0.0023476410788581382 * 3, 9.643300782166769e-10),
            32: (0.0013476410788581382 * 3, 8.643300782166769e-10),
            16: (0.0009080981007148093, 7.395651186836712e-10),
            8: (0.0005230272768511732, 8.570746975492128e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        if self.alpha is not None:
            alpha, beta = self.alpha, self.beta
        else:
            if self._rdma:
                alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
            else:
                alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
        nbytes = 2 if self._fp16 else 4

        def __calculate_comm_start(tc, tb, taob, L):
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            if self.size_commtime_dict is not None:
                tc[l - 1] = self.size_commtime_dict[l - 1]
            else:
                tc[l - 1] = utils.predict_allreduce_time_with_size(
                    alpha, beta, p[l - 1] * nbytes, num_of_workers)

        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        if not utils.check_unique(seq_layernames):
            raise ValueError
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        if self.size_commtime_dict is not None:
            tc = [self.size_commtime_dict[s] for s in sizes]
        else:
            tc = [
                utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes,
                                                       num_of_workers)
                for s in sizes
            ]
        tb = list(self._layerwise_times)
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            #logger.debug('seq_layernames: %s', seq_layernames)
            #logger.debug('tb: %s', tb)
            #logger.debug('taob: %s', taob)
            #logger.debug('sizes: %s', p)
            #logger.warn('tc sum: %f', np.sum(tc))
            #logger.warn('tc: %s', tc)
            #logger.warn('taoc: %s', taoc)
            pass
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        for l in range(1, L)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            current_taob = taob[l - 1] + tb[l - 1]
            merged = False
            if current_taob < taoc[l] + tc[l]:
                if taoc[l] > current_taob:
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
                else:
                    t_wait = current_taob - taoc[l]
                    t_saved = alpha
                    if t_wait < t_saved:
                        __merge(taob, tc, p, l)
                        taoc = __calculate_comm_start(tc, tb, taob, L)
                        merged = True
            #if not merged and (key.find('bn') >= 0 or key.find('bias') >= 0):
            if not merged and p[l] < 8192:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
                merged = True
            if not merged:
                idx += 1
                groups.append(group)
                group = []
            #elif current_taob > taoc[l+1]+tc[l+1] and current_taob < taoc[l]+tc[l] and taoc[l]+alpha > current_taob:
            #    __merge(taob, tc, p, l)
            #    taoc = __calculate_comm_start(tc, tb, taob, L)
            #else:
            #    idx += 1
            #    groups.append(group)
            #    group = []
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)

        if rank() == 0:
            #logger.debug('seq_layernames: %s', seq_layernames)
            #pass
            #logger.info('Merged tc sum: %f', np.sum(tc))
            print('Merged sizes: ', p[::-1])
            print('# of parameters: ', np.sum(p[::-1]))
            #logger.info('Merged tb: %s', tb[::-1])
            #logger.info('Merged taob: %s', taob[::-1])
            #logger.info('Merged tc: %s', tc[::-1])
            #logger.info('Merged taoc: %s', taoc[::-1])

        return groups, key_groupidx_maps
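
The grouping logic above relies on two quantities per layer l: taob[l] + tb[l], the time at which layer l's gradient becomes ready during backpropagation, and taoc[l], the earliest time its allreduce can start, computed by the recursion taoc[l] = max(taoc[l+1] + tc[l+1], taob[l] + tb[l]). A layer is merged into its neighbor when its communication cannot start before the neighbor's gradient is ready, when the wait is shorter than the startup latency alpha that a separate call would add, or (in this variant) when the layer holds fewer than 8192 elements. A toy run of the recursion with made-up timings:

def calculate_comm_start(tc, tb, taob, L):
    # Same recursion as __calculate_comm_start above.
    taoc = [0] * L
    taoc[L - 1] = taob[L - 1] + tb[L - 1]
    for l in range(L - 1)[::-1]:
        taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
    return taoc

tb = [4.0, 3.0, 2.0, 1.0]   # toy per-layer backward-pass times
tc = [1.5, 1.0, 1.0, 0.5]   # toy predicted per-layer allreduce times
L = len(tb)
taob = [0] * L              # backward start times; layer L-1 (output) starts first
for l in range(0, L - 1)[::-1]:
    taob[l] = taob[l + 1] + tb[l + 1]
taoc = calculate_comm_start(tc, tb, taob, L)
print(taob)  # [6.0, 3.0, 1.0, 0]
print(taoc)  # [10.0, 6.0, 3.0, 1.0]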
    def _generate_groups_mgwfbp(self):
        num_of_workers = size()
        p_alpha_beta_56Gbps = {
            16: (0.00023583677659915685, 4.0594787739537565e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        p_alpha_beta_10Gbps = {
            16: (0.0009080981007148093, 7.395651186836712e-10),
            8: (0.0005230272768511732, 8.570746975492128e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        if self.alpha is not None:
            alpha, beta = self.alpha, self.beta
        else:
            if settings.CONNECTION == '10GbE':
                alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
            else:
                alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
        nbytes = 2 if settings.FP16 else 4

        def __calculate_comm_start(tc, tb, taob, L):
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            if self.size_commtime_dict is not None:
                tc[l - 1] = self.size_commtime_dict[l - 1]
            else:
                tc[l - 1] = utils.predict_allreduce_time_with_size(
                    alpha, beta, p[l - 1] * nbytes, num_of_workers)

        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        if not utils.check_unique(seq_layernames):
            raise ValueError
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        if self.size_commtime_dict is not None:
            tc = [self.size_commtime_dict[s] for s in sizes]
        else:
            tc = [
                utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes,
                                                       num_of_workers)
                for s in sizes
            ]
        tb = list(self._layerwise_times)
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            logger.info('tc sum: %f', np.sum(tc))
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        for l in range(1, L)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            current_taob = taob[l - 1] + tb[l - 1]
            merged = False
            if current_taob < taoc[l] + tc[l]:
                if taoc[l] > current_taob:
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
                else:
                    t_wait = current_taob - taoc[l]
                    t_saved = alpha
                    if t_wait < t_saved:
                        __merge(taob, tc, p, l)
                        taoc = __calculate_comm_start(tc, tb, taob, L)
                        merged = True
            if not merged:
                idx += 1
                groups.append(group)
                group = []
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)

        if rank() == 0:
            logger.info('Predicted non-overlapped time: %f',
                        taoc[0] + tc[0] - (taob[0] + tb[0]))
            logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
            logger.info('Merged tc sum: %f', np.sum(tc))

        return groups, key_groupidx_maps
    def synchronize(self):
        global SPEED
        num_of_workers = size()
        ratio = 0
        i = 0
        for p, value in self._handles.items():
            name = self._merged_parameter_names.get(p)
            handle, ctx, density = value

            if self._sparse and density < 1:
                stime = time.time()
                handle_idx = None
                all_indexes = None
                if type(handle) is tuple:
                    handle, handle_idx = handle[0], handle[1]
                output = synchronize(handle)
                if handle_idx is not None:
                    all_indexes = synchronize(handle_idx)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                dectx = output, all_indexes, num_of_workers
                new_grad = self._compression.decompress(new_grad, dectx)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
            elif density == 1:
                stime = time.time()
                output = synchronize(handle)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = output.norm(norm_type)
                    total_norm = param_norm.item()
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        output.mul_(clip_coef)

                p.set_(output)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
            elif density > 1:
                #allgather instead of allreduce of sparse tensor
                stime = time.time()
                output = synchronize(handle)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                new_grad.fill_(0.0)
                numel = output.size(0)
                real_num_values = numel // num_of_workers
                for i in range(num_of_workers):
                    values = output.data[i * real_num_values:(i + 1) *
                                         real_num_values]
                    new_grad += values
                new_grad /= num_of_workers

                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = new_grad.norm(norm_type)
                    total_norm = param_norm.item()
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        new_grad.mul_(clip_coef)

                p.set_(new_grad)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)

            # Ahmed - track the number of elements
            if ctx is not None:
                ratio += ctx.numel() / p.data.numel()
            else:
                ratio += 1
            self._avg_ratio += ratio
            self._num_avg_sample += 1

            if density < 1:
                # Volume for allgather with compression (values + indexes).
                # TODO: scale by (1 - 1/num_of_workers) to exclude the local worker's share.
                self._sum_volume += (output.numel() * output.element_size() +
                                     all_indexes.numel() * all_indexes.element_size())
            elif density == 1:
                # Volume for allreduce without compression.
                self._sum_volume += 2 * output.numel() * output.element_size()
            elif density == 2:
                # Volume for allgather without compression (values only).
                # TODO: scale by (1 - 1/num_of_workers) to exclude the local worker's share.
                self._sum_volume += output.numel() * output.element_size()
            self._num_vol_sample += 1

        if rank() == 0 and self.train_iter % settings.DISPLAY == 0:
            self._tb.log('datavol/cum_vol_bytes', self._sum_volume)
            self._tb.log('datavol/avg_vol_bytes',
                         self._sum_volume / self._num_vol_sample)

            if self._compression is not compressors['none']:  #and ratio > 0:
                #target_k = (self.model_elemnum * density)
                self._tb.log('compress/comp_ratio', ratio)
                self._tb.log('compress/est_compratio', ratio / density)
                self._tb.log('compress/avg_est_compratio',
                             (1.0 * self._avg_ratio / self._num_avg_sample) /
                             density)
                if self.stages < 0:
                    self._tb.log('compress/num_stages',
                                 self._compression.cur_stages)
                else:
                    self._tb.log('compress/num_stages', self.stages)
                if self.stages == 0:
                    self._tb.log('compress/first_ratio', self.iratio)
                else:
                    self._tb.log('compress/first_ratio',
                                 self._compression.first_ratio)
                self._num_sample = 0
                self._sum_elems = 0

        if len(self._groups) != len(self._sequential_keys):
            for merged_p, value in self._handles.items():
                new_name = self._merged_parameter_names.get(merged_p)
                tensors = self._pull_from_buffer(new_name, merged_p)
                for n in tensors:
                    p = self._named_parameters.get(n)
                    p.grad.set_(tensors[n].data.type(p.grad.type()))
        self.train_iter += 1
        self._handles.clear()
        self._print_profiling()
Example #20
    def _generate_groups_mgwfbp(self):
        num_of_workers = size()
        alpha = 9.618801111215886e-08
        beta = 3.89407453e-13

        def __calculate_comm_start(tc, tb, taob, L):
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * 4, num_of_workers)

        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ][::-1]
        seq_layernames = self._seq_layernames[::-1]
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        tc = [
            utils.predict_allreduce_time_with_size(alpha, beta, s * 4,
                                                   num_of_workers)
            for s in sizes
        ]
        tb = list(self._layerwise_times[::-1])
        taob = [0]
        for t in tb[:-1]:
            taob.append(t + taob[-1])
        taob = taob[::-1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0 and DEBUG:
            logger.debug('seq_layernames: %s', seq_layernames)
            logger.debug('tb: %s', tb)
            logger.debug('taob: %s', taob)
            logger.debug('sizes: %s', p)
            logger.debug('tc: %s', tc)
            logger.debug('taoc: %s', taoc)
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        for l in range(1, L - 1)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            current_taob = taob[l - 2] if l >= 2 else taob[0]
            if current_taob - taoc[l] < alpha:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            else:
                idx += 1
                groups.append(group)
                group = []
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)
        return groups, key_groupidx_maps
Example #21
    def _generate_groups_mgwfbp(self):
        num_of_workers = size()
        p_alpha_beta = {
            16: (0.00010632079996292579, 1.5 * 3.2713239529771973e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        alpha, beta = p_alpha_beta[num_of_workers]

        def __calculate_comm_start(tc, tb, taob, L):
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * 4, num_of_workers)

        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        tc = [
            utils.predict_allreduce_time_with_size(alpha, beta, s * 4,
                                                   num_of_workers)
            for s in sizes
        ]
        tb = list(self._layerwise_times)
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            logger.warn('tc sum: %f', np.sum(tc))
            logger.warn('tc: %s', tc)
            logger.warn('taoc: %s', taoc)
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        for l in range(1, L - 1)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            current_taob = taob[l - 1] + tb[l - 1]
            if current_taob < taoc[l + 1] + tc[l + 1]:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            elif current_taob > taoc[l + 1] + tc[l + 1] and current_taob < taoc[
                    l] + tc[l] and taoc[l] + alpha > current_taob:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            else:
                idx += 1
                groups.append(group)
                group = []
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        logger.info('Predicted non-overlapped time: %f',
                    taoc[0] + tc[0] - (taob[0] + tb[0]))
        logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
        if len(group) > 0:
            groups.append(group)
        return groups, key_groupidx_maps
    def synchronize(self):

        num_of_workers = size()
        for p, value in self._handles.items():
            name = self._merged_parameter_names.get(p)
            handle, ctx, density = value
            if self._sparse and density < 1:
                stime = time.time()
                handle_idx = None
                all_indexes = None
                if type(handle) is tuple:
                    handle, handle_idx = handle[0], handle[1]
                output = synchronize(handle)
                if handle_idx is not None:
                    all_indexes = synchronize(handle_idx)

                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name, time.time()-stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                new_grad.fill_(0.0)
                numel = output.size(0)
                real_num_values = numel//num_of_workers
                for i in range(num_of_workers):
                    values_and_indexes = output.data[i*real_num_values:(i+1)*real_num_values]
                    if all_indexes is None:
                        values = values_and_indexes[0:real_num_values//2]
                        indexes = values_and_indexes[real_num_values//2:].long()
                    else:
                        values = values_and_indexes
                        indexes = all_indexes.data[i*real_num_values:(i+1)*real_num_values].long()
                    new_grad[indexes[0:indexes.numel()//2]] += values[0:indexes.numel()//2]
                    new_grad[indexes[indexes.numel()//2:]] += values[indexes.numel()//2:]
                new_grad /= num_of_workers

                if self._profiling:
                    utils.force_insert_item(self._update_times, name, time.time()-stime)
            else:
                stime = time.time()
                output = synchronize(handle)
                print("Rank: %s Mean after allreduce: %s" %(rank(),output))
                stime = time.time()
                output = self._compression.decompress(output)
                #logger.info("Decompression Time : %s" %(time.time()-stime))
                #print("Rank: %s Tensor Decompressed: %s" %(rank(),output))
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name, time.time()-stime)
                stime = time.time()

                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0/size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = output.norm(norm_type)
                    total_norm = param_norm.item() 
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        output.mul_(clip_coef)

                p.set_(output)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name, time.time()-stime)
        if len(self._groups) != len(self._sequential_keys):
            for merged_p, value in self._handles.items():
                new_name = self._merged_parameter_names.get(merged_p)
                tensors = self._pull_from_buffer(new_name, merged_p)
                for n in tensors:
                    p = self._named_parameters.get(n)
                    p.grad.set_(tensors[n].data.type(p.grad.type()))
        self.train_iter += 1
        self._handles.clear()
        self._print_profiling()
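
The density < 1 branch above rebuilds the dense gradient by hand from the allgathered values and indexes: each worker contributed a fixed-size chunk, so the output is sliced per worker, scattered back by index, and averaged. A minimal standalone illustration of the same decode pattern with two workers and made-up data:

import torch

num_workers = 2
numel = 8

# Made-up allgathered tensors: worker 0's chunk first, then worker 1's.
output = torch.tensor([0.5, -1.0, 2.0, 0.25])   # selected values, 2 per worker
all_indexes = torch.tensor([1, 6, 1, 3])        # matching indexes, 2 per worker

new_grad = torch.zeros(numel)
real_num_values = output.numel() // num_workers
for i in range(num_workers):
    values = output[i * real_num_values:(i + 1) * real_num_values]
    indexes = all_indexes[i * real_num_values:(i + 1) * real_num_values].long()
    new_grad[indexes] += values
new_grad /= num_workers
print(new_grad)  # index 1 accumulates 0.5 + 2.0 before averaging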