def test_auto_parallel_arithmetic_broadcast_both():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.floordiv = P.FloorDiv()

        def construct(self, x, y, b):
            out = self.matmul(x, y)
            out = self.floordiv(out, b)
            return out

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    reset_op_id()

    x = Tensor(np.ones([64, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 1]), dtype=ms.float32)
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('FloorDiv-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
        elif re.search('MatMul-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
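# The tests in this file lean on a couple of suite-level helpers (NetWithLoss,
# compile_net) defined elsewhere. A minimal sketch of what they are assumed to
# look like, for reference only -- the real suite ends the graph with a
# dedicated VirtualLoss cell, for which ReduceMean is just a stand-in here:
class NetWithLoss(nn.Cell):
    """Wrap a network so the compiled graph ends in a scalar 'loss'."""

    def __init__(self, network):
        super().__init__()
        self.loss = P.ReduceMean()  # stand-in for the suite's virtual loss
        self.network = network

    def construct(self, *inputs):
        predict = self.network(*inputs)
        return self.loss(predict)


def compile_net(net, *inputs, phase='train'):
    """Assumed convenience wrapper: mark the cell for auto-parallel, then compile."""
    net.set_auto_parallel()
    net.set_train()
    _executor.compile(net, *inputs, phase=phase)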
def test_train_4k_8p_gpu(batch_size=32, num_classes=4096):
    dev_num = 8
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=dev_num)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    reset_op_id()
    np.random.seed(6)
    input_np = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    label_np = np.zeros([batch_size]).astype(np.int32)
    for i in range(0, batch_size):
        label_np[i] = i % num_classes
    dataset = DatasetLenet(Tensor(input_np), Tensor(label_np), 1)
    net = resnet50(num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(5, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    for (k, v) in strategies.items():
        if re.search('Conv2D-op', k) is not None:
            assert v[0][0] == dev_num
        elif re.search('MatMul-op', k) is not None:
            assert v == [[dev_num, 1], [1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[dev_num, 1]]
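# DatasetLenet is another suite-level helper. A sketch under the assumption
# that Model.train(..., dataset_sink_mode=False) only needs a repeatable
# iterator plus a few size queries; the upstream class subclasses the suite's
# MindData stub, so the method set below is illustrative, not authoritative:
class DatasetLenet:
    def __init__(self, predict, label, length=3):
        self.predict = predict
        self.label = label
        self.index = 0
        self.length = length

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        if self.index >= self.length:
            raise StopIteration
        self.index += 1
        return self.predict, self.label  # replays the same batch every step

    def reset(self):
        self.index = 0

    def get_dataset_size(self):
        return self.length

    def get_repeat_count(self):
        return 1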
def test_auto_parallel_arithmetic_broadcast_both():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.floordiv = P.FloorDiv()

        def construct(self, x, y, b):
            out = self.matmul(x, y)
            out = self.floordiv(out, b)
            return out

    context.set_auto_parallel_context(device_num=8, global_rank=0)
    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    reset_op_id()

    x = Tensor(np.ones([64, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 1]), dtype=ms.float32)
    b = Tensor(np.ones([1, 64]), dtype=ms.float32)
    compile_net(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-Net/FloorDiv-op1': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op0': [[8, 1], [1, 1]]}
    assert strategies == expected_strategies
def test_two_matmul_transpose():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.transpose1 = P.Transpose()
            self.transpose2 = P.Transpose()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            out = self.transpose1(out, (1, 0))
            out = self.transpose2(out, (1, 0))
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-Net/Transpose-op4': [[1, 16]],
                           'Default/network-Net/Transpose-op5': [[16, 1]],
                           'Default/network-Net/MatMul-op7': [[16, 1], [1, 1]],
                           'Default/network-Net/MatMul-op6': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
def test_matmul_prelu():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.mul1 = P.Mul()
            self.prelu = P.PReLU()

        def construct(self, x, y, b):
            out = self.mul1(x, y)
            out = self.prelu(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([16, 3, 128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([16, 3, 128, 32]), dtype=ms.float32)
    b = Tensor(np.array([0.01, 0.02, 0.03]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[16, 1, 1, 1], [1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[16, 1, 1, 1], [16, 1, 1, 1]]
def test_auto_parallel_assign_sub_with_ref_key():
    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.random.rand(4, 4, 32, 64), dtype=ms.float32)

    net = NetWithLoss(nn.PReLU(4))
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    _executor.compile(net, x, phase="train")
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('PReLU-op', k) is not None:
            assert v == [[1, 1, 1, 8], [1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[1]]
def test_double_subgraphs_train():
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=1, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    batch_ids = np.ones([8, 8, 8, 8]).astype(np.int32)
    ds_train = DatasetLenet(Tensor(batch_ids), None)
    model = Model(net)
    model.train(1, ds_train, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-NetWithLoss/ReduceMean-op3': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/ReLU-op4': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op5': [[1, 1, 1, 1], [1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op6': [[1, 1, 1, 1], [1, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Cast-op1': [[1, 1, 1, 1]],
                           'Default/network-NetWithLoss/ReduceSum-op7': [[1, 1, 1, 1]]}
    assert strategies == expected_strategies
def test_double_subgraphs():
    _set_multi_subgraphs()
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    net.set_auto_parallel()
    x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    reset_op_id()
    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-NetWithLoss/ReduceMean-op0': [[8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/ReLU-op1': [[8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op2': [[8, 1, 1, 1], [8, 1, 1, 1]],
                           'Default/network-NetWithLoss/net-Net/Mul-op3': [[8, 1, 1, 1], [8, 1, 1, 1]],
                           'Default/network-NetWithLoss/ReduceSum-op4': [[8, 1, 1, 1]]}
    assert strategies == expected_strategies
def all_to_all_common():
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=1, global_rank=0)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net()

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    return strategies
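# `Dataset` above is assumed to be the same kind of duck-typed feed as the
# DatasetLenet sketch earlier in this file: it replays the fixed
# (predict, label) pair for `length` steps per epoch.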
def test_common_parameter():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.matmul3 = P.MatMul()
            self.weight1 = Parameter(Tensor(np.ones([64, 64]).astype(np.float16) * 0.01), "w",
                                     requires_grad=True)
            self.cast1 = P.Cast()
            self.cast2 = P.Cast()

        def construct(self, x, y):
            m1_result = self.matmul1(x, self.cast1(self.weight1, mstype.float32))
            m2_result = self.matmul2(y, self.cast2(self.weight1, mstype.float32))
            m3_result = self.matmul3(m2_result, m1_result)
            return m3_result

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
    y = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, y, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('MatMul-op', k) is not None:
            assert v == [[8, 1], [1, 1]]
        elif re.search('Cast-op', k) is not None:
            assert v == [[1, 1]]
def test_double_star_graph():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()
            self.matmul3 = P.MatMul()
            self.cast1 = P.Cast()
            self.cast2 = P.Cast()

        def construct(self, x, y, z, w):
            m1_result = self.matmul1(x, y)
            m2_result = self.matmul2(z, w)
            m3_result = self.matmul3(self.cast1(m2_result, mstype.float16),
                                     self.cast2(m1_result, mstype.float16))
            return m3_result

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([32, 8]), dtype=ms.float32)
    y = Tensor(np.ones([8, 16]), dtype=ms.float32)
    z = Tensor(np.ones([8, 16]), dtype=ms.float32)
    w = Tensor(np.ones([16, 32]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, y, z, w, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-Net/Cast-op2': [[8, 1]],
                           'Default/network-Net/Cast-op4': [[1, 8]],
                           'Default/network-Net/MatMul-op3': [[8, 1], [1, 1]],
                           'Default/network-Net/MatMul-op5': [[1, 1], [1, 8]],
                           'Default/network-Net/MatMul-op1': [[1, 8], [8, 1]]}
    assert strategies == expected_strategies
def test_double_subgraphs_train():
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=1, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    batch_ids = np.ones([8, 8, 8, 8]).astype(np.int32)
    ds_train = DatasetLenet(Tensor(batch_ids), None)
    model = Model(net)
    model.train(1, ds_train, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('ReduceMean-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[1, 1, 1, 1], [1, 1, 1, 1]]
        elif re.search('Cast-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[1, 1, 1, 1]]
def test_double_subgraphs():
    _set_multi_subgraphs()
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = TrainStepWarp(NetWithLoss(Net()))
    net.set_auto_parallel()
    x = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('ReduceMean-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
        elif re.search('Mul-op', k) is not None:
            assert v == [[8, 1, 1, 1], [8, 1, 1, 1]]
        elif re.search('ReduceSum-op', k) is not None:
            assert v == [[8, 1, 1, 1]]
def all_to_all_common(strategy1):
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
                                      device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    loss.softmax_cross_entropy.shard(((8, 1), (8, 1)))
    loss.one_hot.shard(((8, 1), (), ()))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    strategies = _executor._get_shard_strategy(model._train_network)
    return strategies
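# For reference, the shard() tuples above follow MindSpore's strategy layout:
# one tuple per operator input, one integer per dimension of that input, giving
# the number of slices along that dimension. A minimal sketch on 8 devices:
matmul = P.MatMul()
matmul.shard(((8, 1), (1, 1)))  # row-split the left operand 8 ways, replicate the right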
def test_two_bn():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.block1 = get_block()
            self.block2 = get_block()
            self.relu = P.ReLU()
            self.add = P.Add()
            self.bias = Tensor(np.ones([64, 64]), dtype=ms.float32)

        def construct(self, x):
            out = self.block1(x)
            out = self.relu(out)
            out = self.add(out, self.bias)
            out = self.block2(out)
            return out

    context.set_context(save_graphs=False)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = NetWithLoss(Net())
    x = Tensor(np.ones([64, 64]), dtype=ms.float32)
    net.set_auto_parallel()
    net.set_train()
    set_algo_parameters(elementwise_op_strategy_follow=True)
    reset_op_id()
    _executor.compile(net, x, phase='train')
    strategies = _executor._get_shard_strategy(net)
    assert len(strategies) == 4

    for (k, v) in strategies.items():
        if re.search('BatchNorm-op', k) is not None:
            assert v == [[8, 1], [1], [1], [1], [1]]
        elif re.search('Add-op', k) is not None:
            assert v == [[8, 1], [8, 1]]
        elif re.search('ReLU-op', k) is not None:
            assert v == [[8, 1]]
else:  ## fp32 training
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   config.momentum, config.weight_decay)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

# define callbacks
time_cb = TimeMonitor(data_size=step_size)
loss_cb = LossMonitor()
cb = [time_cb, loss_cb]
if config.save_checkpoint:
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
    cb += [ckpt_cb]

# train model
if args_opt.net == "se-resnet50":
    config.epoch_size = config.train_epoch_size
model.train(config.epoch_size - config.pretrain_epoch_size, dataset, callbacks=cb,
            sink_size=dataset.get_dataset_size(),
            dataset_sink_mode=(not args_opt.parameter_server))

# dump the shard strategies chosen for the compiled training graph
with open("resnet50_partition.txt", "w") as f:
    f.write(str(_executor._get_shard_strategy(model._train_network)))
def test_two_matmul():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
                                              costmodel_alpha=1.0,
                                              costmodel_beta=60.0,
                                              costmodel_gamma=0.1,
                                              costmodel_communi_threshold=1024.0,
                                              costmodel_communi_const=2222.0,
                                              costmodel_communi_bias=1111.0)
    dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity")
    assert dev_mem_cap == 32.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta")
    assert costmodel_beta == 60.0
    costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma")
    assert costmodel_gamma == 0.1
    costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold")
    assert costmodel_communi_threshold == 1024.0
    costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const")
    assert costmodel_communi_const == 2222.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias")
    assert costmodel_communi_bias == 1111.0

    cost_model_context.reset_cost_model_context()
    dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity")
    assert dev_mem_cap == 16.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta")
    assert costmodel_beta == 400.0
    costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma")
    assert costmodel_gamma == 0.001
    costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold")
    assert costmodel_communi_threshold == 2048.0
    costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const")
    assert costmodel_communi_const == 3072.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias")
    assert costmodel_communi_bias == 1024.0

    set_algo_parameters(tensor_slice_align_enable=False, tensor_slice_align_size=32,
                        fully_use_devices=False, elementwise_op_strategy_follow=False,
                        enable_algo_approxi=True, algo_approxi_epsilon=0.001)
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 32
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert not fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow
    enable_approxi = get_algo_parameters("enable_algo_approxi")
    assert enable_approxi
    algo_epsilon = get_algo_parameters("algo_approxi_epsilon")
    assert algo_epsilon == 0.001

    expected_single_loop = True
    single_loop = _get_algo_single_loop()
    assert expected_single_loop == single_loop
    expected_single_loop = False
    _set_algo_single_loop(expected_single_loop)
    single_loop = _get_algo_single_loop()
    assert expected_single_loop == single_loop

    reset_algo_parameters()
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 16
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow
    enable_approxi = get_algo_parameters("enable_algo_approxi")
    assert not enable_approxi
    algo_epsilon = get_algo_parameters("algo_approxi_epsilon")
    assert algo_epsilon == 0.1

    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    net.set_train()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    for (k, v) in strategies.items():
        if re.search('MatMul-op', k) is not None:
            assert v == [[16, 1], [1, 1]]
def test_two_matmul():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            out = self.matmul1(x, y)
            out = self.matmul2(out, b)
            return out

    size = 16
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    cost_model_context.set_cost_model_context(device_memory_capacity=32.0 * 1024.0 * 1024.0 * 1024.0,
                                              costmodel_alpha=1.0,
                                              costmodel_beta=60.0,
                                              costmodel_gamma=0.1,
                                              costmodel_communi_threshold=1024.0,
                                              costmodel_communi_const=2222.0,
                                              costmodel_communi_bias=1111.0)
    dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity")
    assert dev_mem_cap == 32.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta")
    assert costmodel_beta == 60.0
    costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma")
    assert costmodel_gamma == 0.1
    costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold")
    assert costmodel_communi_threshold == 1024.0
    costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const")
    assert costmodel_communi_const == 2222.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias")
    assert costmodel_communi_bias == 1111.0

    cost_model_context.reset_cost_model_context()
    dev_mem_cap = cost_model_context.get_cost_model_context("device_memory_capacity")
    assert dev_mem_cap == 16.0 * 1024.0 * 1024.0 * 1024.0
    costmodel_alpha = cost_model_context.get_cost_model_context("costmodel_alpha")
    assert costmodel_alpha == 1.0
    costmodel_beta = cost_model_context.get_cost_model_context("costmodel_beta")
    assert costmodel_beta == 400.0
    costmodel_gamma = cost_model_context.get_cost_model_context("costmodel_gamma")
    assert costmodel_gamma == 0.001
    costmodel_communi_threshold = cost_model_context.get_cost_model_context("costmodel_communi_threshold")
    assert costmodel_communi_threshold == 2048.0
    costmodel_communi_const = cost_model_context.get_cost_model_context("costmodel_communi_const")
    assert costmodel_communi_const == 3072.0
    costmodel_communi_bias = cost_model_context.get_cost_model_context("costmodel_communi_bias")
    assert costmodel_communi_bias == 1024.0

    set_algo_parameters(tensor_slice_align_enable=False, tensor_slice_align_size=32,
                        fully_use_devices=False, elementwise_op_strategy_follow=False)
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 32
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert not fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow

    reset_algo_parameters()
    para_slice_align_enable = get_algo_parameters("tensor_slice_align_enable")
    assert not para_slice_align_enable
    para_slice_align_size = get_algo_parameters("tensor_slice_align_size")
    assert para_slice_align_size == 16
    fully_use_devices = get_algo_parameters("fully_use_devices")
    assert fully_use_devices
    elementwise_op_strategy_follow = get_algo_parameters("elementwise_op_strategy_follow")
    assert not elementwise_op_strategy_follow

    x = Tensor(np.ones([128, 32]), dtype=ms.float32)
    y = Tensor(np.ones([32, 64]), dtype=ms.float32)
    b = Tensor(np.ones([64, 64]), dtype=ms.float32)

    net = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net.set_auto_parallel()
    reset_op_id()
    _executor.compile(net, x, y, b, phase='train')
    strategies = _executor._get_shard_strategy(net)
    expected_strategies = {'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
                           'Default/network-Net/MatMul-op1': [[16, 1], [1, 1]]}
    assert strategies == expected_strategies
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument('--device_target', type=str, default='Ascend', choices=['Ascend', 'GPU'],
                        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute", type=str, default="false", choices=["true", "false"],
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default=1, help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", choices=["true", "false"],
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true", choices=["true", "false"],
                        help="Use lossscale or not, default is true.")
    parser.add_argument("--do_shuffle", type=str, default="true", choices=["true", "false"],
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", choices=["true", "false"],
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--accumulation_steps", type=int, default=1,
                        help="Accumulating gradients N times before weight update, default is 1.")
    parser.add_argument("--save_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000,
                        help="Save checkpoint steps, default is 1000.")
    parser.add_argument("--train_steps", type=int, default=-1,
                        help="Training Steps, default is -1, meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(get_rank()) + '/'
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                          gradients_mean=True, device_num=device_num,
                                          auto_parallel_search_mode="recursive_programming")
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('Gpu only support fp32 temporarily, run with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle, args_opt.data_dir,
                             args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = args_opt.epoch_size * ds.get_dataset_size() // args_opt.data_sink_steps
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count, train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = args_opt.epoch_size * ds.get_dataset_size() // args_opt.accumulation_steps
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [TimeMonitor(args_opt.data_sink_steps), LossCallBack(ds.get_dataset_size())]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(8, device_num) == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=None if ckpt_save_dir == "" else ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        if args_opt.accumulation_steps <= 1:
            net_with_grads = BertTrainOneStepWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                               scale_update_cell=update_cell)
        else:
            accumulation_steps = args_opt.accumulation_steps
            net_with_grads = BertTrainAccumulateStepsWithLossScaleCell(net_with_loss, optimizer=optimizer,
                                                                       scale_update_cell=update_cell,
                                                                       accumulation_steps=accumulation_steps,
                                                                       enable_global_norm=cfg.enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)

    model = Model(net_with_grads)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)

    # dump the shard strategies chosen for the compiled training graph
    with open("bert_4gpu.txt", "w") as f:
        f.write(str(_executor._get_shard_strategy(model._train_network)))
def compile_graph_two_input(x, y, net):
    net.set_auto_parallel()
    net.set_train(False)
    _executor.compile(net, x, y, auto_parallel_mode=True)
    strategies = _executor._get_shard_strategy(net)
    return strategies
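# Hypothetical usage of the helper above (Net and the shapes are placeholders,
# not from the surrounding suite):
#     x = Tensor(np.ones([128, 32]), dtype=ms.float32)
#     y = Tensor(np.ones([32, 64]), dtype=ms.float32)
#     strategies = compile_graph_two_input(x, y, NetWithLoss(Net()))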
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        # parallel_mode = ParallelMode.DATA_PARALLEL
        parallel_mode = ParallelMode.AUTO_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          gradients_mean=True)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size,
                                        1, args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, num_classes=args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)

    # dump the shard strategies chosen for the compiled training graph
    with open("resnext50_partition_4gpu.txt", "w") as f:
        f.write(str(_executor._get_shard_strategy(model._train_network)))
    print("successfully exported partition!")