def __calculate_comm_start(ts, taos, sizes, L):
    """Compute each layer's communication start time and communication cost.

    Layers are visited from index ``L - 1`` down to ``0``. The last layer
    starts communicating as soon as its own sparsification is finished; every
    other layer has to wait for both its own sparsification and the end of
    the communication of layer ``l + 1``.

    Args:
        ts: per-layer cost added to ``taos`` to get the time at which the
            layer is ready to be communicated.
        taos: per-layer start times paired with ``ts``.
        sizes: per-layer sizes handed to ``utils.allgather_perf_model``.
        L: number of layers; ``ts`` and ``taos`` are indexed ``0 .. L - 1``.

    Returns:
        A ``(taoc, tc)`` tuple: the communication start time of every layer
        and the cost returned by ``utils.allgather_perf_model`` for it.

    NOTE(review): ``P`` and ``self`` are not parameters of this function;
    they have to be resolvable from an enclosing/global scope at call time.
    """
    comm_start = [0] * L
    comm_cost = [
        utils.allgather_perf_model(layer_size, P, self._density)
        for layer_size in sizes
    ]

    last = L - 1
    comm_start[last] = taos[last] + ts[last]

    for layer in reversed(range(last)):
        following = layer + 1
        previous_comm_done = comm_start[following] + comm_cost[following]
        layer_ready = taos[layer] + ts[layer]
        comm_start[layer] = max(previous_comm_done, layer_ready)

    return comm_start, comm_cost
def __merge(tb, ts, tc, p, l):
    """Merge layer ``l`` into layer ``l - 1``, updating all lists in place.

    The entry at ``l - 1`` takes over the accumulated ``tb`` value and size of
    both layers, and its ``tc`` / ``ts`` costs are re-evaluated for the merged
    size. Every entry at index ``l`` is reset to ``0``.

    Args:
        tb: per-layer times; ``tb[l]`` is added onto ``tb[l - 1]``.
        ts: per-layer costs from ``utils.topk_perf_model``.
        tc: per-layer costs from ``utils.allgather_perf_model``.
        p: per-layer sizes.
        l: index of the layer being merged into its predecessor.

    Returns:
        None. All four lists are mutated.

    NOTE(review): ``P`` and ``self`` are not parameters of this function;
    they have to be resolvable from an enclosing/global scope at call time.
    """
    target = l - 1

    # Move the time and the size of layer l onto the target layer.
    tb[target] += tb[l]
    tb[l] = 0
    p[target] = p[target] + p[l]
    p[l] = 0

    # Re-model both costs for the merged size; layer l no longer costs anything.
    merged_size = p[target]
    tc[target] = utils.allgather_perf_model(merged_size, P, self._density)
    tc[l] = 0
    ts[target] = utils.topk_perf_model(merged_size)
    ts[l] = 0
def _generate_groups_mgs(self):
    """Partition the layers into groups whose gradients are merged.

    Layers are walked from the last one (index ``L - 1``) towards the first.
    For each adjacent pair ``(l, l - 1)`` the modelled extra time of merging
    (``tw``) is compared with the modelled all-gather time the merge saves
    (``tsave``); the pair is merged when ``tw < tsave``, otherwise the current
    group is closed and a new one is started.

    Reads ``self._seq_layernames``, ``self._named_parameters``,
    ``self._layerwise_times`` and ``self._density``; stores the per-layer
    element counts in ``self._sizes``.

    Returns:
        A ``(groups, key_groupidx_maps)`` tuple. ``groups`` is a list of
        lists of layer names, each list ordered from the later layer to the
        earlier one. ``key_groupidx_maps`` maps every layer name to the index
        of its group inside ``groups``. Both are empty when there are no
        layers.
    """
    P = size()  # number of workers

    def _sparse_and_backward_start(tb, sizes, L, start=0):
        """Return ``(taob, taos, ts)`` for ``L`` layers, beginning at ``start``.

        ``taob[l]`` is when the backward pass of layer ``l`` starts,
        ``taos[l]`` is when its sparsification starts (backward finished) and
        ``ts[l]`` is its modelled top-k cost.
        """
        ts = [utils.topk_perf_model(s) for s in sizes]
        taob = [start] * L
        taos = [start] * L
        taos[L - 1] = taob[L - 1] + tb[L - 1]
        for l in reversed(range(L - 1)):
            # Layer l runs backward once layer l + 1 has been sparsified.
            taob[l] = taos[l + 1] + ts[l + 1]
            taos[l] = taob[l] + tb[l]
        return taob, taos, ts

    def _comm_start(ts, taos, sizes, L):
        """Return ``(taoc, tc)``: communication start time and cost per layer."""
        taoc = [0] * L
        tc = [utils.allgather_perf_model(s, P, self._density) for s in sizes]
        taoc[L - 1] = taos[L - 1] + ts[L - 1]
        for l in reversed(range(L - 1)):
            # Wait for the previous communication and for our own top-k.
            taoc[l] = max(taoc[l + 1] + tc[l + 1], taos[l] + ts[l])
        return taoc, tc

    def _merge(tb, ts, tc, p, l):
        """Fold layer ``l`` into layer ``l - 1`` in place."""
        tb[l - 1] += tb[l]
        tb[l] = 0
        p[l - 1] = p[l - 1] + p[l]
        p[l] = 0
        tc[l - 1] = utils.allgather_perf_model(p[l - 1], P, self._density)
        tc[l] = 0
        ts[l - 1] = utils.topk_perf_model(p[l - 1])
        ts[l] = 0

    seq_layernames = self._seq_layernames
    sizes = [self._named_parameters[k].data.numel() for k in seq_layernames]
    self._sizes = sizes
    L = len(sizes)
    if L == 0:
        # Nothing to group; the timing helpers below index element L - 1.
        return [], {}

    p = sizes[:]  # working copy: merged sizes are accumulated here
    tb = list(self._layerwise_times)
    taob, taos, ts = _sparse_and_backward_start(tb, p, L)
    taoc, tc = _comm_start(ts, taos, p, L)

    groups = []
    idx = 0  # index of the group currently being filled
    last_key = seq_layernames[L - 1]
    key_groupidx_maps = {last_key: idx}
    group = [last_key]

    # NOTE(review): the loop starts at L - 2, so no merge test is evaluated
    # between layers L - 1 and L - 2; they always share a group. Kept as in
    # the original implementation.
    for l in reversed(range(1, L - 1)):
        key = seq_layernames[l]
        group.append(key)
        key_groupidx_maps[key] = idx

        # Presumably the extra waiting time caused by merging l - 1 into l:
        # backward time of l - 1 plus the change in top-k cost, minus the
        # gap between layer l being ready and its communication starting.
        tw = tb[l - 1] + utils.topk_perf_model(p[l] + p[l - 1]) \
            - utils.topk_perf_model(p[l]) - utils.topk_perf_model(p[l - 1]) \
            - (taoc[l] - (taos[l] + ts[l]))
        # All-gather time of two separate calls minus one merged call.
        tsave = utils.allgather_perf_model(p[l], P, self._density) \
            + utils.allgather_perf_model(p[l - 1], P, self._density) \
            - utils.allgather_perf_model(p[l] + p[l - 1], P, self._density)

        if tw < tsave:
            _merge(tb, ts, tc, p, l)
            # Re-time the layers below l; ts was already updated by _merge.
            taob2, taos2, _ = _sparse_and_backward_start(
                tb[:l], p[:l], l, start=taob[l] + tb[l])
            taob[:l] = taob2
            taos[:l] = taos2
            taoc, tc = _comm_start(ts, taos, p, L)
        else:
            idx += 1
            groups.append(group)
            group = []

    if L > 1:
        # With a single layer, index 0 is also index L - 1 and has already
        # been added above; adding it again would duplicate the name.
        key = seq_layernames[0]
        key_groupidx_maps[key] = idx
        group.append(key)

    # The open group always holds at least one layer at this point.
    groups.append(group)
    return groups, key_groupidx_maps