def main():
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=args.device)

    # elastic schedule: step -> new cluster size
    schedule = {
        3: 2,
        6: 3,
        9: 4,
        12: 1,
    }

    kfops.init(args.device)
    all_reduce = kfops.KungFuAllReduce()
    all_reduce_max = kfops.KungFuAllReduce(op=ReduceOp.MAX)
    resize = kfops.KungFuResize()

    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]

    step = 0
    need_sync = True
    while True:
        if need_sync:
            step = sync_step(step, all_reduce_max)
            print('step: %d' % (step))
            need_sync = False

        # time one all-reduce round over all gradient-sized tensors
        t0 = time.time()
        ys = [all_reduce(x) for x in xs]
        t1 = time.time()
        d = t1 - t0

        if step in schedule:
            new_size = ms.Tensor(schedule[step], dtype=ms.uint32)
            print('step=%d, will resize to %d' % (step, schedule[step]))
            changed, detached = resize(new_size)
            print('changed: %s, detached: %s' % (changed, detached))
            if changed:
                need_sync = True
            if detached:
                break

        step += 1
        if step > args.steps:
            break

    print('train loop finished')
    kfops.finalize(args.device)
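# `sync_step` is not defined in this file. A minimal sketch of what it could
# look like, assuming its job is to make every worker (including peers that
# just joined after a resize) adopt the largest step counter in the cluster
# via the MAX all-reduce built above; the body and dtype are assumptions,
# only the name and arguments come from the call site.
def sync_step(step, all_reduce_max):
    x = ms.Tensor(np.array([step]).astype(np.int32))
    y = all_reduce_max(x)
    return int(y.asnumpy()[0])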
def train(args):
    with kfops.KungFuContext(device=args.device):
        all_reduce = kfops.KungFuAllReduce()
        x = ms.Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
        print(x)
        y = all_reduce(x)
        print(y)
def __init__(self, *args, **kwargs):
    super(KungFuMomentum, self).__init__(*args, **kwargs)
    # maps an op over a tuple of tensors (e.g. a per-gradient all-reduce)
    self.map_ = ms.ops.composite.Map()
    self.all_reduce = kfops.KungFuAllReduce()
    self.dbg_log_tensor = False
    self.log_tensor = kfops.KungFuLogTensor()
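# The constructor above only sets up operators. A hypothetical sketch of the
# corresponding `construct`, assuming the intent is synchronous data-parallel
# SGD: all-reduce every gradient, then delegate to the built-in Momentum
# update. (To average rather than sum, one would also divide by the cluster
# size; that step is omitted here.)
def construct(self, gradients):
    gradients = self.map_(self.all_reduce, gradients)
    return super(KungFuMomentum, self).construct(gradients)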
def main():
    args = parse_args()
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=args.device)
    with kfops.KungFuContext(device=args.device):
        all_reduce = kfops.KungFuAllReduce()
        x = ms.Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
        print(x)
        y = all_reduce(x)
        print(y)
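# These hello-world scripts are started once per worker, e.g. with KungFu's
# `kungfu-run` launcher: `kungfu-run -np 4 python3 <script>.py`. The script
# name and whatever flags `parse_args` accepts are not shown here.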
def main():
    args = parse_args()
    grad_sizes = model_grad_sizes[args.model]
    ms.context.set_context(mode=ms.context.GRAPH_MODE, device_target=args.device)

    if args.collective == 'mindspore':
        init()
        cluster_size = get_group_size()
        rank = get_rank()
    else:
        print('using kungfu collective')
        kfops.init(args.device)
        cluster_size = parse_kungfu_size()
        rank = parse_kungfu_port() - 10000
    print('rank: %d, size: %d' % (rank, cluster_size))

    if args.collective == 'mindspore':
        all_reduce = ms.ops.operations.AllReduce()
    elif args.collective == 'kungfu':
        all_reduce = kfops.KungFuAllReduce()
    else:
        raise RuntimeError('invalid collective')

    xs = [
        ms.Tensor(np.array([1.0] * size).astype(np.float32))
        for size in grad_sizes
    ]
    data_size = sum(grad_sizes) * 4  # 1 float is 4 bytes
    multiplier = 4 * (cluster_size - 1)
    Gi = 1024 * 1024 * 1024

    def run_stage(name, steps):
        for i in range(steps):
            t0 = time.time()
            ys = [all_reduce(x) for x in xs]
            t1 = time.time()
            d = t1 - t0
            rate = float(data_size) * multiplier / Gi / d
            if rank == 0:
                print('%s %d took %.3fms, data rate: %.3fGiB/s' %
                      (name, i + 1, d * 1e3, rate))

    run_stage('warmup', args.warmup_steps)
    run_stage('step', args.steps)

    if args.collective == 'kungfu':
        kfops.finalize(args.device)
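# `parse_kungfu_size` and `parse_kungfu_port` are not shown. A hypothetical
# sketch, assuming the launcher exposes the initial peer list and the
# worker's own address through environment variables; the variable names
# KUNGFU_INIT_PEERS and KUNGFU_SELF_SPEC and the 'host:port' format are
# assumptions. The `rank = parse_kungfu_port() - 10000` above then relies
# on worker ports being assigned consecutively from 10000.
import os

def parse_kungfu_size():
    # e.g. '127.0.0.1:10000,127.0.0.1:10001' -> 2 peers (assumed format)
    return len(os.environ['KUNGFU_INIT_PEERS'].split(','))

def parse_kungfu_port():
    # e.g. '127.0.0.1:10001' -> 10001 (assumed format)
    return int(os.environ['KUNGFU_SELF_SPEC'].split(':')[-1])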
def main():
    args = parse_args()
    log_args(args)
    ms.context.set_context(mode=ms.context.GRAPH_MODE,
                           device_target=args.device,
                           save_graphs=False)
    kfops.init(args.device)
    all_reduce = kfops.KungFuAllReduce()
    x = ms.Tensor(np.array([1.0, 2.0, 3.0]).astype(np.float32))
    print(x)
    y = all_reduce(x)
    print(y)
    kfops.finalize(args.device)
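# Unlike the KungFuContext variant above, this script manages the peer
# lifecycle explicitly with kfops.init / kfops.finalize; presumably the
# context manager wraps the same pair of calls.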
def __init__(self,
             num_features,
             eps=1e-5,
             momentum=0.9,
             affine=True,
             gamma_init="ones",
             beta_init="zeros",
             moving_mean_init="zeros",
             moving_var_init="ones",
             input_dims="2d",
             data_format="NCHW"):
    super().__init__()
    validator.check_value_type('num_features', num_features, [int], self.cls_name)
    if num_features < 1:
        raise ValueError("num_features must be at least 1")
    self.num_features = num_features

    if momentum < 0 or momentum > 1:
        error_msg = "momentum should be a number in range [0, 1], but got {}".format(momentum)
        raise ValueError(error_msg)
    self.momentum = 1.0 - momentum

    self.input_dims = input_dims
    self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
    if ms.context.get_context("device_target") != "GPU" and self.format == "NHWC":
        raise ValueError("The NHWC format is only supported on the GPU target.")
    self.eps = eps

    self.moving_mean = ms.Parameter(
        initializer(moving_mean_init, num_features),
        name="mean", requires_grad=False)
    self.moving_variance = ms.Parameter(
        initializer(moving_var_init, num_features),
        name="variance", requires_grad=False)
    self.gamma = ms.Parameter(
        initializer(gamma_init, num_features),
        name="gamma", requires_grad=affine)
    self.beta = ms.Parameter(
        initializer(beta_init, num_features),
        name="beta", requires_grad=affine)

    # self._cluster_size_op = kfops.KungFuClusterSize()
    self._all_reduce_op = kfops.KungFuAllReduce()
    self._square_op = ms.ops.Square()
    self._sqrt_op = ms.ops.Sqrt()

    # HACK: feed the cluster size in as an input tensor instead of
    # querying it with the KungFuClusterSize operator above.
    self._cluster_size_op = kfops.KungFuClusterSizeInput()
    self._cluster_size_input = ms.Tensor(np.ones((1,), dtype=np.int32))
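# The cell's `construct` is not shown. A hypothetical sketch of how the
# operators above could be combined into synchronized batch-norm statistics,
# assuming NCHW input, equal local batch sizes on every peer, and that
# `_cluster_size_input` has been set to the current cluster size before the
# call; everything beyond the operator names above is an assumption.
def _sync_moments(self, x):
    # cluster size, fed in as a tensor (see the HACK above)
    n = ms.ops.functional.cast(
        self._cluster_size_op(self._cluster_size_input), ms.float32)
    # local per-channel first and second moments
    mean = x.mean(axis=(0, 2, 3))
    sq_mean = self._square_op(x).mean(axis=(0, 2, 3))
    # the all-reduce sums across peers; divide by the cluster size to average
    mean = self._all_reduce_op(mean) / n
    sq_mean = self._all_reduce_op(sq_mean) / n
    var = sq_mean - self._square_op(mean)
    return mean, var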