Example #1
0
 def __merge(taob, tc, p, l):
     """Fold layer ``l`` into its predecessor ``l - 1``.

     Zeroes layer ``l``'s communication time and size, adds its size to
     layer ``l - 1``, and re-estimates the predecessor's allreduce time.
     Free variables (``self``, ``alpha``, ``beta``, ``nbytes``,
     ``num_of_workers``, ``utils``) come from the enclosing method scope
     — this snippet is a closure extracted from a larger method.
     """
     tc[l] = 0
     p[l - 1] = p[l - 1] + p[l]
     p[l] = 0
     if self.size_commtime_dict is not None:
         # NOTE(review): elsewhere in this file size_commtime_dict is
         # keyed by tensor size, but here it is indexed by the layer
         # index l - 1 — confirm which keying is intended.
         tc[l - 1] = self.size_commtime_dict[l - 1]
     else:
         tc[l - 1] = utils.predict_allreduce_time_with_size(
             alpha, beta, p[l - 1] * nbytes, num_of_workers)
Example #2
0
    def _generate_groups_mgwfbp(self):
        """Partition layers into merged allreduce groups (MG-WFBP style).

        Predicts each layer's allreduce time with an alpha-beta model
        (startup latency ``alpha``, per-byte cost ``beta``) chosen from
        measured tables by worker count and interconnect, then walks the
        layers from the last one backward, merging a layer into its
        predecessor whenever the predicted wait is smaller than another
        startup latency; tensors smaller than 8192 elements are always
        merged.

        Returns:
            (groups, key_groupidx_maps): list of layer-name groups and a
            dict mapping each layer name to its group index.

        Raises:
            ValueError: if ``self._seq_layernames`` contains duplicates.
        """
        num_of_workers = size()
        # Measured (alpha, beta) model parameters keyed by worker count,
        # for a 56Gbps (RDMA) and a 10Gbps interconnect respectively.
        p_alpha_beta_56Gbps = {
            64: (0.00080632079996292579, 1.8 * 3.2713239529771973e-10),
            32: (0.00040632079996292579, 1.5 * 3.2713239529771973e-10),
            16: (0.00023583677659915685 * 3, 4.0594787739537565e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        p_alpha_beta_10Gbps = {
            64: (0.0023476410788581382 * 3, 9.643300782166769e-10),
            32: (0.0013476410788581382 * 3, 8.643300782166769e-10),
            16: (0.0009080981007148093, 7.395651186836712e-10),
            8: (0.0005230272768511732, 8.570746975492128e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        # Explicit user-supplied (alpha, beta) override the tables.
        if self.alpha is not None:
            alpha, beta = self.alpha, self.beta
        else:
            if self._rdma:
                alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
            else:
                alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
        # Bytes per gradient element: 2 for fp16, 4 for fp32.
        nbytes = 2 if self._fp16 else 4

        def __calculate_comm_start(tc, tb, taob, L):
            """Return taoc[l]: earliest allreduce start time of layer l.

            A layer's communication can start once its own gradient is
            ready (taob[l] + tb[l]) and the next layer's communication
            has finished (taoc[l + 1] + tc[l + 1]); computed from the
            last layer backward.
            """
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            """Fold layer l into layer l - 1 and re-predict its comm time."""
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            if self.size_commtime_dict is not None:
                # NOTE(review): size_commtime_dict is keyed by tensor size
                # where it is built below, but indexed by l - 1 here —
                # confirm which keying is intended.
                tc[l - 1] = self.size_commtime_dict[l - 1]
            else:
                tc[l - 1] = utils.predict_allreduce_time_with_size(
                    alpha, beta, p[l - 1] * nbytes, num_of_workers)

        # Per-layer tensor sizes, in the stored layer sequence order.
        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        if not utils.check_unique(seq_layernames):
            raise ValueError
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        # tc[l]: measured (from size_commtime_dict) or predicted
        # allreduce time of layer l.
        if self.size_commtime_dict is not None:
            tc = [self.size_commtime_dict[s] for s in sizes]
        else:
            tc = [
                utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes,
                                                       num_of_workers)
                for s in sizes
            ]
        tb = list(self._layerwise_times)
        # taob[l]: backprop start time of layer l, accumulated from the
        # last layer backward.
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            #logger.debug('seq_layernames: %s', seq_layernames)
            #logger.debug('tb: %s', tb)
            #logger.debug('taob: %s', taob)
            #logger.debug('sizes: %s', p)
            #logger.warn('tc sum: %f', np.sum(tc))
            pass
            #logger.warn('tc: %s', tc)
            #logger.warn('taoc: %s', taoc)
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        # Seed with the last layer: its gradient is produced first.
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        for l in range(1, L)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            # Time at which the previous layer's gradient becomes ready.
            current_taob = taob[l - 1] + tb[l - 1]
            merged = False
            if current_taob < taoc[l] + tc[l]:
                if taoc[l] > current_taob:
                    # Layer l's communication has not even started yet:
                    # merging hides it entirely.
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
                else:
                    # Merge only if the wait is cheaper than paying
                    # another startup latency alpha.
                    t_wait = current_taob - taoc[l]
                    t_saved = alpha
                    if t_wait < t_saved:
                        __merge(taob, tc, p, l)
                        taoc = __calculate_comm_start(tc, tb, taob, L)
                        merged = True
            #if not merged and (key.find('bn') >= 0 or key.find('bias') >= 0):
            # Always merge very small tensors (< 8192 elements).
            if not merged and p[l] < 8192:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
                merged = True
            if not merged:
                idx += 1
                groups.append(group)
                group = []
            #elif current_taob > taoc[l+1]+tc[l+1] and current_taob < taoc[l]+tc[l] and taoc[l]+alpha > current_taob:
            #    __merge(taob, tc, p, l)
            #    taoc = __calculate_comm_start(tc, tb, taob, L)
            #else:
            #    idx += 1
            #    groups.append(group)
            #    group = []
        # The first layer (last gradient) always closes the final group.
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)

        if rank() == 0:
            #logger.debug('seq_layernames: %s', seq_layernames)
            #pass
            #logger.info('Merged tc sum: %f', np.sum(tc))
            print('Merged sizes: ', p[::-1])
            print('# of parameters: ', np.sum(p[::-1]))
            #logger.info('Merged tb: %s', tb[::-1])
            #logger.info('Merged taob: %s', taob[::-1])
            #logger.info('Merged tc: %s', tc[::-1])
            #logger.info('Merged taoc: %s', taoc[::-1])

        return groups, key_groupidx_maps
    def _generate_groups_mgwfbp(self):
        """Partition layers into merged allreduce groups (MG-WFBP style).

        Variant of the merge scheduler that selects the alpha-beta table
        via ``settings.CONNECTION`` / ``settings.FP16`` rather than
        instance flags, and has no small-tensor (8192) merge rule.

        NOTE(review): this method has the same name as the preceding one
        — if both definitions live in one class, this later definition
        shadows the earlier one; confirm that is intended.

        Returns:
            (groups, key_groupidx_maps): list of layer-name groups and a
            dict mapping each layer name to its group index.

        Raises:
            ValueError: if ``self._seq_layernames`` contains duplicates.
        """
        num_of_workers = size()
        # Measured (alpha, beta) model parameters keyed by worker count.
        p_alpha_beta_56Gbps = {
            16: (0.00023583677659915685, 4.0594787739537565e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        p_alpha_beta_10Gbps = {
            16: (0.0009080981007148093, 7.395651186836712e-10),
            8: (0.0005230272768511732, 8.570746975492128e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        # Explicit user-supplied (alpha, beta) override the tables.
        if self.alpha is not None:
            alpha, beta = self.alpha, self.beta
        else:
            if settings.CONNECTION == '10GbE':
                alpha, beta = p_alpha_beta_10Gbps[num_of_workers]
            else:
                alpha, beta = p_alpha_beta_56Gbps[num_of_workers]
        # Bytes per gradient element: 2 for fp16, 4 for fp32.
        nbytes = 2 if settings.FP16 else 4

        def __calculate_comm_start(tc, tb, taob, L):
            """Return taoc[l]: earliest allreduce start time of layer l,
            computed from the last layer backward."""
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            """Fold layer l into layer l - 1 and re-predict its comm time."""
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            if self.size_commtime_dict is not None:
                # NOTE(review): size_commtime_dict is keyed by tensor size
                # where it is built below, but indexed by l - 1 here —
                # confirm which keying is intended.
                tc[l - 1] = self.size_commtime_dict[l - 1]
            else:
                tc[l - 1] = utils.predict_allreduce_time_with_size(
                    alpha, beta, p[l - 1] * nbytes, num_of_workers)

        # Per-layer tensor sizes, in the stored layer sequence order.
        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        if not utils.check_unique(seq_layernames):
            raise ValueError
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        # tc[l]: measured or predicted allreduce time of layer l.
        if self.size_commtime_dict is not None:
            tc = [self.size_commtime_dict[s] for s in sizes]
        else:
            tc = [
                utils.predict_allreduce_time_with_size(alpha, beta, s * nbytes,
                                                       num_of_workers)
                for s in sizes
            ]
        tb = list(self._layerwise_times)
        # taob[l]: backprop start time of layer l, accumulated backward.
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            logger.info('tc sum: %f', np.sum(tc))
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        # Seed with the last layer: its gradient is produced first.
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        for l in range(1, L)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            # Time at which the previous layer's gradient becomes ready.
            current_taob = taob[l - 1] + tb[l - 1]
            merged = False
            if current_taob < taoc[l] + tc[l]:
                if taoc[l] > current_taob:
                    # Layer l's communication has not started yet:
                    # merging hides it entirely.
                    __merge(taob, tc, p, l)
                    taoc = __calculate_comm_start(tc, tb, taob, L)
                    merged = True
                else:
                    # Merge only if waiting costs less than another
                    # startup latency alpha.
                    t_wait = current_taob - taoc[l]
                    t_saved = alpha
                    if t_wait < t_saved:
                        __merge(taob, tc, p, l)
                        taoc = __calculate_comm_start(tc, tb, taob, L)
                        merged = True
            if not merged:
                idx += 1
                groups.append(group)
                group = []
        # The first layer (last gradient) always closes the final group.
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)

        if rank() == 0:
            logger.info('Predicted non-overlapped time: %f',
                        taoc[0] + tc[0] - (taob[0] + tb[0]))
            logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
            logger.info('Merged tc sum: %f', np.sum(tc))

        return groups, key_groupidx_maps
Example #4
0
 def __merge(taob, tc, p, l):
     """Collapse layer ``l`` into layer ``l - 1``.

     The predecessor absorbs layer ``l``'s size; layer ``l``'s size and
     communication time are zeroed, and the predecessor's allreduce time
     is re-predicted at 4 bytes per element.  ``utils``, ``alpha``,
     ``beta`` and ``num_of_workers`` are free variables from the
     enclosing scope.
     """
     p[l - 1] += p[l]
     p[l] = 0
     tc[l] = 0
     tc[l - 1] = utils.predict_allreduce_time_with_size(
         alpha, beta, 4 * p[l - 1], num_of_workers)
Example #5
0
    def _generate_groups_mgwfbp(self):
        """Partition layers into merged allreduce groups (MG-WFBP style).

        Predicts per-layer allreduce times with a fixed alpha-beta model
        (startup latency ``alpha``, per-byte cost ``beta``) and greedily
        merges a layer into its predecessor when waiting for the merged
        transfer costs less than another startup latency.

        Returns:
            (groups, key_groupidx_maps): list of layer-name groups and a
            dict mapping each layer name to its group index.
        """
        num_of_workers = size()
        # Fixed alpha-beta model: startup latency and per-byte cost.
        alpha = 9.618801111215886e-08
        beta = 3.89407453e-13

        def __calculate_comm_start(tc, tb, taob, L):
            """Return taoc[l]: earliest allreduce start time of layer l.

            A layer's communication starts once its own gradient is
            ready (taob[l] + tb[l]) and the next layer's communication
            has finished (taoc[l + 1] + tc[l + 1]).
            """
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                # BUGFIX: was `tc[l + 1] + tc[l + 1]` — the finish time of
                # the next layer's communication is its START (taoc) plus
                # its duration (tc), as in the sibling implementations of
                # this helper.
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            """Fold layer l into layer l - 1 and re-predict its comm time."""
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * 4, num_of_workers)

        # Work in backward-pass order (reversed layer sequence).
        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ][::-1]
        seq_layernames = self._seq_layernames[::-1]
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        # tc[l]: predicted allreduce time (4 bytes per fp32 element).
        tc = [
            utils.predict_allreduce_time_with_size(alpha, beta, s * 4,
                                                   num_of_workers)
            for s in sizes
        ]
        tb = list(self._layerwise_times[::-1])
        # taob[l]: backprop start time of layer l, prefix sums of tb
        # accumulated from the last layer.
        taob = [0]
        for t in tb[:-1]:
            taob.append(t + taob[-1])
        taob = taob[::-1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0 and DEBUG:
            logger.debug('seq_layernames: %s', seq_layernames)
            logger.debug('tb: %s', tb)
            logger.debug('taob: %s', taob)
            logger.debug('sizes: %s', p)
            logger.debug('tc: %s', tc)
            logger.debug('taoc: %s', taoc)
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        # Seed with the last layer: its gradient is produced first.
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        for l in range(1, L - 1)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            # NOTE(review): indexes taob[l - 2] rather than the
            # taob[l - 1] + tb[l - 1] used by the sibling variants —
            # confirm the intended readiness time.
            current_taob = taob[l - 2] if l >= 2 else taob[0]
            if current_taob - taoc[l] < alpha:
                # Waiting is cheaper than paying another startup latency.
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            else:
                idx += 1
                groups.append(group)
                group = []
        # The first layer (last gradient) always closes the final group.
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        if len(group) > 0:
            groups.append(group)
        return groups, key_groupidx_maps
Example #6
0
    def _generate_groups_mgwfbp(self):
        """Partition layers into merged allreduce groups (MG-WFBP style).

        Variant using a single (alpha, beta) table keyed by worker count
        and a three-way merge condition based on the next layer's
        communication finish time.

        Returns:
            (groups, key_groupidx_maps): list of layer-name groups and a
            dict mapping each layer name to its group index.
        """
        num_of_workers = size()
        # Measured (alpha, beta) model parameters keyed by worker count.
        p_alpha_beta = {
            16: (0.00010632079996292579, 1.5 * 3.2713239529771973e-10),
            8: (9.75367204301171e-05, 3.0568230536676206e-10),
            4: (4.204298980348825e-05, 2.0589360830118177e-10),
            2: (2.554691138304671e-06, 9.837548167872609e-11)
        }
        alpha, beta = p_alpha_beta[num_of_workers]

        def __calculate_comm_start(tc, tb, taob, L):
            """Return taoc[l]: earliest allreduce start time of layer l,
            computed from the last layer backward."""
            taoc = [0] * L
            taoc[L - 1] = taob[L - 1] + tb[L - 1]
            for l in range(L - 1)[::-1]:
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taob[l] + tb[l])
            return taoc

        def __merge(taob, tc, p, l):
            """Fold layer l into layer l - 1 and re-predict its comm time."""
            tc[l] = 0
            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0
            tc[l - 1] = utils.predict_allreduce_time_with_size(
                alpha, beta, p[l - 1] * 4, num_of_workers)

        # Per-layer tensor sizes, in the stored layer sequence order.
        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        self._sizes = sizes
        p = sizes[:]
        L = len(sizes)
        # tc[l]: predicted allreduce time (4 bytes per fp32 element).
        tc = [
            utils.predict_allreduce_time_with_size(alpha, beta, s * 4,
                                                   num_of_workers)
            for s in sizes
        ]
        tb = list(self._layerwise_times)
        # taob[l]: backprop start time of layer l, accumulated backward.
        taob = [0] * L
        for l in range(0, L - 1)[::-1]:
            taob[l] = taob[l + 1] + tb[l + 1]
        taoc = __calculate_comm_start(tc, tb, taob, L)
        if rank() == 0:
            logger.warn('tc sum: %f', np.sum(tc))
            logger.warn('tc: %s', tc)
            logger.warn('taoc: %s', taoc)
        groups = []
        group = []
        idx = 0
        key_groupidx_maps = {}
        # Seed with the last layer: its gradient is produced first.
        l = L - 1
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        for l in range(1, L - 1)[::-1]:
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx
            # Time at which the previous layer's gradient becomes ready.
            current_taob = taob[l - 1] + tb[l - 1]
            # Merge if the gradient is ready before the next layer's
            # communication finishes, or if it becomes ready mid-window
            # and waiting costs less than another startup latency alpha.
            if current_taob < taoc[l + 1] + tc[l + 1]:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            elif current_taob > taoc[l + 1] + tc[l + 1] and current_taob < taoc[
                    l] + tc[l] and taoc[l] + alpha > current_taob:
                __merge(taob, tc, p, l)
                taoc = __calculate_comm_start(tc, tb, taob, L)
            else:
                idx += 1
                groups.append(group)
                group = []
        # The first layer (last gradient) always closes the final group.
        l = 0
        key = seq_layernames[l]
        key_groupidx_maps[key] = idx
        group.append(key)
        # NOTE(review): unlike the other variants these logs are not
        # guarded by rank() == 0, so every worker logs — confirm intent.
        logger.info('Predicted non-overlapped time: %f',
                    taoc[0] + tc[0] - (taob[0] + tb[0]))
        logger.info('Predicted tb+tc= %f', taoc[0] + tc[0])
        if len(group) > 0:
            groups.append(group)
        return groups, key_groupidx_maps