コード例 #1
0
 def __calculate_comm_start(ts, taos, sizes, L):
     """Compute the per-layer ``taoc`` schedule and all-gather cost estimates.

     ``utils``, ``P`` and ``self`` are taken from the enclosing scope.

     Args:
         ts: per-layer values added to ``taos`` (produced elsewhere with
             ``utils.topk_perf_model``).
         taos: per-layer start values that ``ts`` is added to.
         sizes: per-layer sizes fed to ``utils.allgather_perf_model``.
         L: number of layers; indices ``0 .. L-1`` are filled in.

     Returns:
         ``(taoc, tc)`` where ``tc[i]`` is
         ``utils.allgather_perf_model(sizes[i], P, self._density)`` and
         ``taoc`` is filled from the last index down to the first.
     """
     tc = [utils.allgather_perf_model(n, P, self._density) for n in sizes]
     taoc = [0] * L

     # The last layer has nothing queued ahead of it.
     last = L - 1
     taoc[last] = taos[last] + ts[last]

     # Every earlier layer waits for both its own readiness and the end of
     # the transfer of the layer right after it.
     for i in range(L - 2, -1, -1):
         after_next = taoc[i + 1] + tc[i + 1]
         own_ready = taos[i] + ts[i]
         taoc[i] = max(after_next, own_ready)
     return taoc, tc
コード例 #2
0
        def __merge(tb, ts, tc, p, l):
            """Fold the entry at index ``l`` into index ``l - 1``, in place.

            ``tb`` and ``p`` are summed into slot ``l - 1``; ``tc`` and ``ts``
            for that slot are recomputed from the merged size with
            ``utils.allgather_perf_model`` and ``utils.topk_perf_model``.
            Slot ``l`` of all four lists is reset to 0. ``utils``, ``P`` and
            ``self`` come from the enclosing scope. Returns ``None``.
            """
            prev = l - 1

            # Accumulate into the previous slot, then clear the merged one.
            tb[prev] += tb[l]
            tb[l] = 0

            p[prev] = p[prev] + p[l]
            p[l] = 0

            # Re-estimate the costs of the merged slot from its new size.
            merged_size = p[prev]
            tc[prev] = utils.allgather_perf_model(merged_size, P, self._density)
            tc[l] = 0

            merged_size = p[prev]
            ts[prev] = utils.topk_perf_model(merged_size)
            ts[l] = 0
コード例 #3
0
    def _generate_groups_mgs(self):
        """Partition the layers of ``self._seq_layernames`` into merge groups.

        Layers are visited from the last index towards the first. For each
        visited layer ``l`` the method compares

        * ``tw``: ``tb[l-1]`` plus the change in ``utils.topk_perf_model``
          cost when the sizes of ``l`` and ``l-1`` are combined, minus the
          slack ``taoc[l] - (taos[l] + ts[l])``; and
        * ``tsave``: the ``utils.allgather_perf_model`` cost of the two
          layers separately minus the cost of the combined size.

        When ``tw < tsave`` layer ``l`` is merged into ``l-1`` and the
        schedule is recomputed; otherwise the current group is closed.

        Side effect: stores the per-layer element counts in ``self._sizes``.

        Returns:
            tuple: ``(groups, key_groupidx_maps)``. ``groups`` is a list of
            lists of layer names in visiting order (last layer first);
            ``key_groupidx_maps`` maps each layer name to the index of its
            group in ``groups``. With no layers the result is ``([], {})``.
        """
        P = size()  # number of workers

        def _sparse_and_backward_start(tb, sizes, L, start=0):
            # taob[l] / taos[l] are chained from the last layer backwards:
            # each taob follows the previous layer's taos plus its ts, and
            # each taos is the layer's taob plus its tb.
            ts = [utils.topk_perf_model(s) for s in sizes]
            taob = [start] * L
            taos = [start] * L
            taos[L - 1] = taob[L - 1] + tb[L - 1]
            for l in reversed(range(L - 1)):
                taob[l] = taos[l + 1] + ts[l + 1]
                taos[l] = taob[l] + tb[l]
            return taob, taos, ts

        def _comm_start(ts, taos, sizes, L):
            # taoc[l] is the later of "next layer's transfer finished" and
            # "this layer is ready" (taos[l] + ts[l]).
            taoc = [0] * L
            tc = [
                utils.allgather_perf_model(s, P, self._density) for s in sizes
            ]
            taoc[L - 1] = taos[L - 1] + ts[L - 1]
            for l in reversed(range(L - 1)):
                taoc[l] = max(taoc[l + 1] + tc[l + 1], taos[l] + ts[l])
            return taoc, tc

        def _merge(tb, ts, tc, p, l):
            # Fold slot l into slot l - 1 in place and zero slot l.
            tb[l - 1] += tb[l]
            tb[l] = 0

            p[l - 1] = p[l - 1] + p[l]
            p[l] = 0

            tc[l - 1] = utils.allgather_perf_model(p[l - 1], P, self._density)
            tc[l] = 0

            ts[l - 1] = utils.topk_perf_model(p[l - 1])
            ts[l] = 0

        sizes = [
            self._named_parameters[k].data.numel()
            for k in self._seq_layernames
        ]
        seq_layernames = self._seq_layernames
        self._sizes = sizes
        L = len(sizes)
        if L == 0:
            # Nothing to group; the schedule helpers index L - 1 and would
            # raise IndexError on empty lists.
            return [], {}

        p = sizes[:]  # working copy: merged sizes are accumulated in place
        tb = list(self._layerwise_times)
        taob, taos, ts = _sparse_and_backward_start(tb, p, L)
        taoc, tc = _comm_start(ts, taos, p, L)

        groups = []
        idx = 0
        last_key = seq_layernames[L - 1]
        group = [last_key]
        key_groupidx_maps = {last_key: idx}

        # NOTE(review): the loop starts at L - 2 and appends before any
        # decision is taken, so the last two layers always share a group.
        # Confirm this is intended.
        for l in reversed(range(1, L - 1)):
            key = seq_layernames[l]
            group.append(key)
            key_groupidx_maps[key] = idx

            tw = (
                tb[l - 1]
                + utils.topk_perf_model(p[l] + p[l - 1])
                - utils.topk_perf_model(p[l])
                - utils.topk_perf_model(p[l - 1])
                - (taoc[l] - (taos[l] + ts[l]))
            )
            tsave = (
                utils.allgather_perf_model(p[l], P, self._density)
                + utils.allgather_perf_model(p[l - 1], P, self._density)
                - utils.allgather_perf_model(p[l] + p[l - 1], P, self._density)
            )
            if tw < tsave:
                _merge(tb, ts, tc, p, l)
                # Only the layers before l are affected by the merge, so
                # just that prefix of the schedule is recomputed.
                taob2, taos2, _ = _sparse_and_backward_start(
                    tb[:l], p[:l], l, start=taob[l] + tb[l])
                taob[:l] = taob2
                taos[:l] = taos2
                taoc, tc = _comm_start(ts, taos, p, L)
            else:
                idx += 1
                groups.append(group)
                group = []

        # Layer 0 always joins whatever group is currently open. With a
        # single layer it was already added above as index L - 1, so adding
        # it again would duplicate the name inside the group.
        if L > 1:
            key = seq_layernames[0]
            key_groupidx_maps[key] = idx
            group.append(key)
        groups.append(group)
        return groups, key_groupidx_maps