Example #1
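Launches a sparsified allreduce as two asynchronous allgathers: one for the selected gradient values and one for their indexes. When the compressor returns no index context (e.g. a quantization compressor), only the compressed tensor is gathered.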
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=density)

        # Disabled debug hook (note the leading False): when enabled, this
        # dumps the layer's raw gradient to disk every 200 iterations during
        # the first 3000.
        if (False and rank() == 0 and self.train_iter % 200 == 0
                and self.train_iter < 3000):
            grads = tensor.cpu().numpy()
            layer_idx = self._sequential_keys.index(name)
            np.save(
                '%s/r%d_gradients_iter_%d::%s::%d' %
                (self._gradient_path, rank(), self.train_iter, name,
                 layer_idx), grads)
        indexes = ctx
        if indexes is None:
            handle = allgather_async(tensor_compressed, name)
            handle_idx = None  # quantization uses all indices
        else:
            handle = allgather_async(selected_values, name)
            handle_idx = allgather_async(indexes.int(), name + '_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)
        return (handle, handle_idx), ctx
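All of these examples assume a compressor exposing compress(tensor, name, ratio=...) that returns the (possibly modified) input tensor, a context holding the selected indexes, and the selected values. Below is a minimal top-k sketch of that contract; the class name is hypothetical, and the real compressors in this codebase add error feedback and multi-stage selection on top of it.

    import torch

    class TopKCompressorSketch(object):
        def compress(self, tensor, name, ratio=0.01):
            k = max(1, int(tensor.numel() * ratio))
            # Keep the k largest-magnitude entries of the flattened gradient.
            _, indexes = torch.topk(tensor.abs(), k)
            values = tensor[indexes]
            return tensor, indexes, values

        def decompress(self, values, size):
            # The callers above scatter the values themselves, so a
            # pass-through is enough for this sketch.
            return values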
Example #2
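A minimal variant of the same routine: the density comes from self._density, the index gather is unconditional, and there is no gradient logging.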
    def _sparse_allreduce_async(self, p, name):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(
            tensor, name, ratio=self._density)
        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)
        return (handle, handle_idx), ctx
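Example #3
This variant additionally records how many elements the compressor selected (ctx.numel()) and, when settings.LOGGING_GRADIENTS is enabled, dumps the raw gradients from rank 0 to disk.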
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        tensor_compressed, ctx, selected_values = self._compression.compress(tensor, name, ratio=density)
        self._selected_num_gradients.append(int(ctx.numel()))

        if settings.LOGGING_GRADIENTS and rank() == 0:
            grads = tensor.cpu().numpy()
            np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name+'_indexes')
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name, time.time()-stime)
        return (handle, handle_idx), ctx 
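utils.force_insert_item appears in every profiling branch above. A plausible reading of it (an assumption, not the repository's actual helper) is that it accumulates timing samples per key rather than overwriting earlier ones:

    def force_insert_item(d, key, val):
        # Append val to the list stored under key, creating it on first use.
        if key not in d:
            d[key] = []
        d[key].append(val)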
Example #4
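A dense synchronize(): each pending handle is waited on, the gathered output is optionally norm-clipped with the threshold scaled by sqrt(1/size()), and merged buffers are scattered back into the per-parameter gradients.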
    def synchronize(self):

        for p, value in self._handles.items():
            name = self._merged_parameter_names.get(p)
            handle, ctx, density = value
            stime = time.time()
            output = synchronize(handle)
            if self._profiling:
                utils.force_insert_item(self._allreduce_timers, name,
                                        time.time() - stime)
            stime = time.time()

            if self._norm_clip is not None:
                norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                norm_type = 2.0
                param_norm = output.norm(norm_type)
                total_norm = param_norm.item()
                clip_coef = norm_clip / (total_norm + 1e-6)
                if clip_coef < 1:
                    output.mul_(clip_coef)

            p.set_(output)
            if self._profiling:
                utils.force_insert_item(self._update_times, name,
                                        time.time() - stime)
        if len(self._groups) != len(self._sequential_keys):
            for merged_p, value in self._handles.items():
                new_name = self._merged_parameter_names.get(merged_p)
                tensors = self._pull_from_buffer(new_name, merged_p)
                for n in tensors:
                    p = self._named_parameters.get(n)
                    if settings.FP16:
                        p.grad.set_(tensors[n].data.type(p.grad.type()))
                    else:
                        p.grad.set_(tensors[n].data)
        self.train_iter += 1
        self._handles.clear()
        self._print_profiling()
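The clipping above scales the global threshold by sqrt(1/size()) so that each worker's share is clipped proportionally before averaging. A standalone sketch of the same arithmetic, with a made-up worker count and threshold:

    import numpy as np
    import torch

    num_workers = 4         # assumed for illustration
    max_global_norm = 1.0   # assumed clipping threshold

    output = torch.randn(1000)
    norm_clip = np.sqrt(1.0 / num_workers) * max_global_norm
    clip_coef = norm_clip / (output.norm(2.0).item() + 1e-6)
    if clip_coef < 1:
        output.mul_(clip_coef)  # rescale so the 2-norm stays within norm_clip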
Example #5
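Adds an optional CPU compression path (settings.CPU_COMPRESS): the flattened gradient is copied to host memory, compressed there, and the selected values and indexes are moved back to the GPU, presumably trading transfer time for keeping the GPU free during selection. The compressor also receives TensorBoard and error-compensation parameters.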
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)

        if settings.CPU_COMPRESS:
            density = abs(density)
            tensor_cpy = tensor.to('cpu')
            tensor_compressed, ctx, selected_values = self._compression.compress(
                tensor_cpy,
                name,
                ratio=density,
                tb=self._tb,
                i_ratio=self.iratio,
                stages=self.stages,
                ec_grad_w=self.ec_gradw,
                ec_mem_w=self.ec_memw)
            selected_values = selected_values.to('cuda')
            ctx = ctx.to('cuda')
        else:
            tensor_compressed, ctx, selected_values = self._compression.compress(
                tensor,
                name,
                ratio=density,
                tb=self._tb,
                i_ratio=self.iratio,
                stages=self.stages,
                ec_grad_w=self.ec_gradw,
                ec_mem_w=self.ec_memw)
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name,
                                    time.time() - stime)

        self._selected_num_gradients.append(int(ctx.numel()))

        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name + '_indexes')
        return (handle, handle_idx), ctx
Example #6
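Functionally the same as Example #5 before formatting; it retains the disabled gradient-memory and gradient-logging blocks as comments.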
    def _sparse_allreduce_async(self, p, name, density):
        stime = time.time()
        tensor = p.data.view(-1)
        #if self.train_iter < 100:
        #    tensor_compressed, ctx, selected_values = compressors['topkec'].compress(tensor, name, ratio=density, tb=self._tb)
        #else:
        if settings.CPU_COMPRESS:
            density = abs(density)
            tensor_cpy = tensor.to('cpu')
            tensor_compressed, ctx, selected_values = self._compression.compress(tensor_cpy, name, ratio=density, tb=self._tb, i_ratio=self.iratio, stages=self.stages, ec_grad_w=self.ec_gradw, ec_mem_w=self.ec_memw)
            selected_values = selected_values.to('cuda')
            ctx = ctx.to('cuda')
        else:
            tensor_compressed, ctx, selected_values = self._compression.compress(tensor, name, ratio=density, tb=self._tb, i_ratio=self.iratio, stages=self.stages, ec_grad_w=self.ec_gradw, ec_mem_w=self.ec_memw)
        if self._profiling:
            utils.force_insert_item(self._compression_timers, name, time.time() - stime)

        # #Ahmed - Add the grads to memory
        # if settings.UPDATE_ITER:
        #     self._grad_memory[name] += (tensor - tensor_compressed)
        #     self._grad_count += 1

        self._selected_num_gradients.append(int(ctx.numel()))

        # if settings.LOGGING_GRADIENTS and ctx.numel() == 0 and rank() == 0:
        #     grads = tensor.cpu().numpy()
        #     np.save('%s/r%d_zero_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        # elif settings.LOGGING_GRADIENTS and settings.UPDATE_ITER and self.train_iter % settings.UPDATE_ITER == 0 and rank() == 0:
        #     grads = tensor.cpu().numpy()
        #     np.save('%s/r%d_gradients_iter_%d' % (self._gradient_path, rank(), self.train_iter), grads)
        #handle = allreduce_async_(tensor_compressed, average=True, name=name)

        indexes = ctx
        handle = allgather_async(selected_values, name)
        handle_idx = allgather_async(indexes.int(), name+'_indexes')
        return (handle, handle_idx), ctx 
Example #7
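A synchronize() that covers both paths. For sparse tensors (density < 1) it waits on the value and index gathers, slices the gathered output into one equal-sized chunk per worker, and scatter-adds each chunk at its indexes before averaging; the dense path applies optional norm clipping and decompression.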
    def synchronize(self):

        num_of_workers = size()
        for p, value in self._handles.items():
            name = self._merged_parameter_names.get(p)
            handle, ctx, density = value
            if self._sparse and density < 1:
                stime = time.time()
                handle_idx = None
                all_indexes = None
                if type(handle) is tuple:
                    handle, handle_idx = handle[0], handle[1]
                output = synchronize(handle)
                if handle_idx is not None:
                    all_indexes = synchronize(handle_idx)

                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                new_grad.fill_(0.0)
                numel = output.size(0)
                real_num_values = numel // num_of_workers
                for i in range(num_of_workers):
                    values_and_indexes = output.data[
                        i * real_num_values:(i + 1) * real_num_values]
                    if all_indexes is None:
                        values = values_and_indexes
                        indexes = None
                        per_values = values
                        per_values = self._compression.decompress(
                            per_values, p.size())
                        new_grad += per_values.view(-1)
                    else:
                        values = values_and_indexes
                        indexes = all_indexes.data[
                            i * real_num_values:(i + 1) *
                            real_num_values].long()
                        per_values = values[0:indexes.numel()]
                        per_values = self._compression.decompress(
                            per_values, p.size())
                        new_grad[indexes[0:indexes.numel()]] += per_values
                new_grad /= num_of_workers

                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
            else:
                stime = time.time()
                output = synchronize(handle)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()

                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = output.norm(norm_type)
                    total_norm = param_norm.item()
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        output.mul_(clip_coef)
                if self._compression:
                    output = self._compression.decompress(output, p.size())
                p.set_(output)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
        if len(self._groups) != len(self._sequential_keys):
            for merged_p, value in self._handles.items():
                new_name = self._merged_parameter_names.get(merged_p)
                tensors = self._pull_from_buffer(new_name, merged_p)
                for n in tensors:
                    p = self._named_parameters.get(n)
                    if self._fp16:
                        p.grad.set_(tensors[n].data.type(p.grad.type()))
                    else:
                        p.grad.set_(tensors[n].data)
        self.train_iter += 1
        self._handles.clear()
        self._print_profiling()
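allgather concatenates equally sized payloads from every worker, which is why the sparse branch above slices the output into num_of_workers chunks of real_num_values each. A self-contained sketch of that aggregation, with fabricated data for two workers:

    import torch

    num_of_workers = 2
    n = 8                                   # flattened parameter size (made up)
    # Pretend each worker gathered 3 top-k values plus matching indexes.
    all_values = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
    all_indexes = torch.tensor([0, 2, 4, 1, 2, 7])

    new_grad = torch.zeros(n)
    real_num_values = all_values.numel() // num_of_workers
    for i in range(num_of_workers):
        values = all_values[i * real_num_values:(i + 1) * real_num_values]
        indexes = all_indexes[i * real_num_values:(i + 1) * real_num_values].long()
        new_grad[indexes] += values   # values landing on the same index accumulate
    new_grad /= num_of_workers        # average across workers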
Example #8
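The most heavily instrumented variant: sparse reconstruction is delegated to the compressor's decompress(), a density > 1 path averages uncompressed allgather chunks, and compression ratios plus communication volume are tracked and logged to TensorBoard.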
    def synchronize(self):
        global SPEED
        num_of_workers = size()
        ratio = 0
        i = 0
        for p, value in self._handles.items():
            name = self._merged_parameter_names.get(p)
            handle, ctx, density = value

            if self._sparse and density < 1:
                stime = time.time()
                handle_idx = None
                all_indexes = None
                if type(handle) is tuple:
                    handle, handle_idx = handle[0], handle[1]
                output = synchronize(handle)
                if handle_idx is not None:
                    all_indexes = synchronize(handle_idx)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                dectx = output, all_indexes, num_of_workers
                new_grad = self._compression.decompress(new_grad, dectx)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
            elif density == 1:
                stime = time.time()
                output = synchronize(handle)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = output.norm(norm_type)
                    total_norm = param_norm.item()
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        output.mul_(clip_coef)

                p.set_(output)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)
            elif density > 1:
                # allgather instead of allreduce of the sparse tensor
                stime = time.time()
                output = synchronize(handle)
                if self._profiling:
                    utils.force_insert_item(self._allreduce_timers, name,
                                            time.time() - stime)
                stime = time.time()
                new_grad = p.data.view(-1)
                new_grad.fill_(0.0)
                numel = output.size(0)
                real_num_values = numel // num_of_workers
                for i in range(num_of_workers):
                    values = output.data[i * real_num_values:(i + 1) *
                                         real_num_values]
                    new_grad += values
                new_grad /= num_of_workers

                if self._norm_clip is not None:
                    norm_clip = np.sqrt(1.0 / size()) * self._norm_clip
                    norm_type = 2.0
                    param_norm = new_grad.norm(norm_type)
                    total_norm = param_norm.item()
                    clip_coef = norm_clip / (total_norm + 1e-6)
                    if clip_coef < 1:
                        new_grad.mul_(clip_coef)

                p.set_(new_grad)
                if self._profiling:
                    utils.force_insert_item(self._update_times, name,
                                            time.time() - stime)

            # Ahmed - track number of elements
            if ctx is not None:
                ratio += ctx.numel() / p.data.numel()
            else:
                ratio += 1
            self._avg_ratio += ratio
            self._num_avg_sample += 1

            if density < 1:
                # Volume for all-gather with compression (data + indexes).
                # TODO: multiply by (1 - 1/num_of_workers) to exclude the
                # local worker's own portion.
                self._sum_volume += (
                    output.numel() * output.element_size() +
                    all_indexes.numel() * all_indexes.element_size())
            elif density == 1:
                # Volume for all-reduce without compression.
                self._sum_volume += 2 * output.numel() * output.element_size()
            elif density == 2:
                # Volume for all-gather without compression (data only).
                # TODO: same (1 - 1/num_of_workers) correction as above.
                self._sum_volume += output.numel() * output.element_size()
            self._num_vol_sample += 1

        if rank() == 0 and self.train_iter % settings.DISPLAY == 0:
            self._tb.log('datavol/cum_vol_bytes', self._sum_volume)
            self._tb.log('datavol/avg_vol_bytes',
                         self._sum_volume / self._num_vol_sample)

            if self._compression is not compressors['none']:  #and ratio > 0:
                #target_k = (self.model_elemnum * density)
                self._tb.log('compress/comp_ratio', ratio)
                self._tb.log('compress/est_compratio', ratio / density)
                self._tb.log('compress/avg_est_compratio',
                             (1.0 * self._avg_ratio / self._num_avg_sample) /
                             density)
                if self.stages < 0:
                    self._tb.log('compress/num_stages',
                                 self._compression.cur_stages)
                else:
                    self._tb.log('compress/num_stages', self.stages)
                if self.stages == 0:
                    self._tb.log('compress/first_ratio', self.iratio)
                else:
                    self._tb.log('compress/first_ratio',
                                 self._compression.first_ratio)
                self._num_sample = 0
                self._sum_elems = 0

        if len(self._groups) != len(self._sequential_keys):
            for merged_p, value in self._handles.items():
                new_name = self._merged_parameter_names.get(merged_p)
                tensors = self._pull_from_buffer(new_name, merged_p)
                for n in tensors:
                    p = self._named_parameters.get(n)
                    p.grad.set_(tensors[n].data.type(p.grad.type()))
        self.train_iter += 1
        self._handles.clear()
        self._print_profiling()
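The volume accounting in Example #8 distinguishes three regimes by density. A small helper that mirrors those branches (the function itself is an assumption; note the source tags the uncompressed-allgather case as density == 2):

    def estimate_volume_bytes(output, all_indexes, density):
        if density < 1:
            # Compressed allgather: values plus their indexes travel.
            return (output.numel() * output.element_size() +
                    all_indexes.numel() * all_indexes.element_size())
        if density == 1:
            # Ring allreduce moves roughly twice the tensor size.
            return 2 * output.numel() * output.element_size()
        # Uncompressed allgather: raw values travel once.
        return output.numel() * output.element_size()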