def test_train_64k_8p(batch_size=32, num_classes=65536):  # 1048576 #131072 #32768 #8192
    """Train resnet50 in auto-parallel mode on 8 devices and check shard strategies.

    Args:
        batch_size: Number of all-ones images (3x224x224) in the single batch.
        num_classes: Size of the classifier; labels are ``index % num_classes``.

    After training, every searched strategy is inspected: Conv2D ops must split
    their first input's leading dimension across all devices, MatMul ops must
    use ``[[1, 1], [dev_num, 1]]`` and ReduceSum ops ``[[1, dev_num]]``.
    """
    device_count = 8
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=device_count)
    cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=400.0)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)

    # Synthetic data: constant images, labels cycling through the class ids.
    images = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    labels = np.zeros([batch_size]).astype(np.int32)
    for sample in range(batch_size):
        labels[sample] = sample % num_classes
    dataset = DatasetLenet(Tensor(images), Tensor(labels), 1)

    network = resnet50(num_classes)
    loss_fn = SoftmaxCrossEntropyExpand(sparse=True)
    trainable = filter(lambda param: param.requires_grad, network.get_parameters())
    optimizer = Momentum(trainable, 0.01, 0.9)
    model = Model(network, loss_fn=loss_fn, optimizer=optimizer)
    model.train(5, dataset, dataset_sink_mode=False)

    searched = _executor._get_shard_strategy(model._train_network)
    for op_name, strategy in searched.items():
        if re.search('Conv2D-op', op_name):
            assert strategy[0][0] == device_count
        elif re.search('MatMul-op', op_name):
            assert strategy == [[1, 1], [device_count, 1]]
        elif re.search('ReduceSum-op', op_name):
            assert strategy == [[1, device_count]]
def test_allreduce_fusion5():
    """Check allreduce fusion ids of two DenseNet2 backbones under algorithm 2.

    The cost model is configured with fusion algorithm 2 and its time/bandwidth
    parameters, the network is run through ``train_common`` in semi-auto-parallel
    mode, and the returned parameter-name -> fusion-id mapping is compared with
    the expected one. The cost-model context is reset at the end.
    """
    fusion_settings = {
        'costmodel_allreduce_fusion_algorithm': 2,
        'costmodel_allreduce_fusion_tail_time': 0.1,
        'costmodel_allreduce_fusion_allreduce_inherent_time': 0.05,
        'costmodel_allreduce_fusion_allreduce_bandwidth': 0.000001,
        'costmodel_allreduce_fusion_computation_time_parameter': 0.0000015,
    }
    # Applied one key per call, in insertion order.
    for option, value in fusion_settings.items():
        cost_model_context.set_cost_model_context(**{option: value})

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)

    first_backbone = DenseNet2(has_bias=False, activation=None)
    second_backbone = DenseNet2(has_bias=False, activation=None)
    fusion_ids = train_common(SimpleDMLNet(first_backbone, second_backbone))

    expected_ids = {
        'backbone2.fc8.weight': 3,
        'backbone2.fc7.weight': 3,
        'backbone2.fc6.weight': 3,
        'backbone2.fc5.weight': 3,
        'backbone2.fc4.weight': 2,
        'backbone2.fc3.weight': 2,
        'backbone2.fc2.weight': 1,
        'backbone2.fc1.weight': 1,
        'backbone1.fc8.weight': 3,
        'backbone1.fc7.weight': 3,
        'backbone1.fc6.weight': 3,
        'backbone1.fc5.weight': 3,
        'backbone1.fc4.weight': 2,
        'backbone1.fc3.weight': 2,
        'backbone1.fc2.weight': 1,
        'backbone1.fc1.weight': 1,
    }
    assert fusion_ids == expected_ids
    cost_model_context.reset_cost_model_context()
def test_allreduce_fusion2():
    """Check that no allreduce fusion is reported after the cost model is reset.

    Fusion options are set and then immediately reset before the network is
    built, so ``train_common`` is expected to return an empty mapping.
    """
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
    # The reset below is part of the scenario: it discards the two settings above.
    cost_model_context.reset_cost_model_context()

    first_backbone = DenseNet1(has_bias=False, activation=None)
    second_backbone = DenseNet2(has_bias=False, activation=None)
    fusion_ids = train_common(SimpleDMLNet(first_backbone, second_backbone))

    assert fusion_ids == {}
    cost_model_context.reset_cost_model_context()
def test_inference_phase():
    """Run one training step in auto-parallel mode with ``run_phase=1``.

    The test has no assertions; it passes when building and calling the
    one-step training cell completes without raising.
    """
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    set_cost_model_context(run_phase=1)

    network = Net(512, 128)
    inputs = Tensor(np.ones([64, 512]).astype(np.float32) * 0.001)
    targets = Tensor(np.ones([64, 128]).astype(np.float32))

    criterion = nn.SoftmaxCrossEntropyWithLogits()
    momentum = Momentum(params=network.trainable_params(), learning_rate=0.1, momentum=0.9)
    step_cell = TrainOneStepCell(WithLossCell(network, criterion), momentum)
    step_cell.set_train()

    # The step result is not inspected; only successful execution matters.
    step_cell(inputs, targets)
def test_double_subgraphs():
    """Compile a wrapped train-step network with ``multi_subgraphs`` enabled.

    The network is compiled for 8 devices in auto-parallel mode and the
    strategies reported by the executor are compared with the expected
    per-operator strategies.
    """
    cost_model_context.set_cost_model_context(multi_subgraphs=True)
    context.set_context(save_graphs=True)
    context.set_auto_parallel_context(device_num=8, global_rank=0)
    wrapped_net = TrainStepWarp(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="auto_parallel")

    sample = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    reset_op_id()
    _executor.compile(wrapped_net, sample, phase='train')

    # Reused for every operand that is split along its first dimension only.
    batch_split = [8, 1, 1, 1]
    wanted = {
        'Default/network-NetWithLoss/ReduceMean-op0': [list(batch_split)],
        'Default/network-NetWithLoss/net-Net/ReLU-op1': [list(batch_split)],
        'Default/network-NetWithLoss/net-Net/Mul-op2': [list(batch_split), list(batch_split)],
        'Default/network-NetWithLoss/net-Net/Mul-op3': [list(batch_split), list(batch_split)],
        'Default/network-NetWithLoss/ReduceSum-op4': [list(batch_split)],
    }
    assert _executor._get_strategy(wrapped_net) == wanted
def test_allreduce_fusion4():
    """Check allreduce fusion ids of two DenseNet2 backbones under algorithm 1.

    Uses fusion algorithm 1 with ``fusion_times=2`` and ``tail_percent=0.5`` in
    semi-auto-parallel mode and compares the mapping returned by
    ``train_common`` with the expected fusion ids. The cost-model context is
    reset at the end.
    """
    fusion_settings = {
        'costmodel_allreduce_fusion_algorithm': 1,
        'costmodel_allreduce_fusion_times': 2,
        'costmodel_allreduce_fusion_tail_percent': 0.5,
    }
    # Applied one key per call, in insertion order.
    for option, value in fusion_settings.items():
        cost_model_context.set_cost_model_context(**{option: value})

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL)

    first_backbone = DenseNet2(has_bias=False, activation=None)
    second_backbone = DenseNet2(has_bias=False, activation=None)
    fusion_ids = train_common(SimpleDMLNet(first_backbone, second_backbone))

    expected_ids = {
        'backbone2.fc8.weight': 2,
        'backbone2.fc7.weight': 2,
        'backbone2.fc6.weight': 2,
        'backbone1.fc8.weight': 2,
        'backbone1.fc7.weight': 2,
        'backbone1.fc6.weight': 2,
        'backbone2.fc5.weight': 1,
        'backbone2.fc4.weight': 1,
        'backbone2.fc3.weight': 1,
        'backbone2.fc2.weight': 1,
        'backbone2.fc1.weight': 1,
        'backbone1.fc5.weight': 1,
        'backbone1.fc4.weight': 1,
        'backbone1.fc3.weight': 1,
        'backbone1.fc2.weight': 1,
        'backbone1.fc1.weight': 1,
    }
    assert fusion_ids == expected_ids
    cost_model_context.reset_cost_model_context()
def test_allreduce_fusion3():
    """Check allreduce fusion ids for a DenseNet1 + DenseNet2 pair under algorithm 1.

    Uses fusion algorithm 1 with ``fusion_times=3`` and
    ``tail_percent=0.3333333``; the first backbone has biases, the second does
    not. The mapping returned by ``train_common`` is compared with the expected
    fusion ids and the cost-model context is reset at the end.
    """
    fusion_settings = {
        'costmodel_allreduce_fusion_algorithm': 1,
        'costmodel_allreduce_fusion_times': 3,
        'costmodel_allreduce_fusion_tail_percent': 0.3333333,
    }
    # Applied one key per call, in insertion order.
    for option, value in fusion_settings.items():
        cost_model_context.set_cost_model_context(**{option: value})

    first_backbone = DenseNet1(has_bias=True, activation='relu')
    second_backbone = DenseNet2(has_bias=False, activation='relu')
    fusion_ids = train_common(SimpleDMLNet(first_backbone, second_backbone))

    expected_ids = {
        'backbone2.fc8.weight': 3,
        'backbone2.fc7.weight': 3,
        'backbone2.fc6.weight': 2,
        'backbone2.fc5.weight': 2,
        'backbone2.fc4.weight': 2,
        'backbone2.fc3.weight': 1,
        'backbone2.fc2.weight': 1,
        'backbone2.fc1.weight': 1,
        'backbone1.fc4.bias': 3,
        'backbone1.fc4.weight': 3,
        'backbone1.fc3.bias': 3,
        'backbone1.fc3.weight': 2,
        'backbone1.fc2.bias': 2,
        'backbone1.fc2.weight': 2,
        'backbone1.fc1.bias': 2,
        'backbone1.fc1.weight': 2,
    }
    assert fusion_ids == expected_ids
    cost_model_context.reset_cost_model_context()
def train_32k_8p_fusion2(batch_size=32, num_classes=32768):  # 1048576 #131072 #32768 #8192
    """Check resnet50 allreduce fusion ids under fusion algorithm 2.

    Args:
        batch_size: Forwarded to ``test_train_32k_8p``.
        num_classes: Forwarded to ``test_train_32k_8p``.

    Configures fusion algorithm 2 with its time/bandwidth parameters, runs
    ``test_train_32k_8p`` and compares the mapping it returns with the expected
    parameter-name -> fusion-id mapping. The cost-model context is reset at the
    end.

    NOTE(review): this relies on ``test_train_32k_8p`` returning the fusion
    mapping -- confirm against the helper actually in use.
    """
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=2)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_time=0.1)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_inherent_time=0.05)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_allreduce_bandwidth=0.000001)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_computation_time_parameter=0.0000015)
    # Passed by keyword so the values cannot be bound to a different leading
    # parameter (such as ``epoch_size``) of ``test_train_32k_8p``.
    allreduce_fusion_dict = test_train_32k_8p(batch_size=batch_size, num_classes=num_classes)
    expect_dict = {
        'end_point.bias': 2, 'end_point.weight': 2,
        'layer4.2.bn3.beta': 2, 'layer4.2.bn3.gamma': 2, 'layer4.2.conv3.weight': 2,
        'layer4.2.bn2.beta': 2, 'layer4.2.bn2.gamma': 2, 'layer4.2.conv2.weight': 2,
        'layer4.2.bn1.beta': 2, 'layer4.2.bn1.gamma': 2, 'layer4.2.conv1.weight': 2,
        'layer4.1.bn3.beta': 2, 'layer4.1.bn3.gamma': 2, 'layer4.1.conv3.weight': 2,
        'layer4.1.bn2.beta': 2, 'layer4.1.bn2.gamma': 2, 'layer4.1.conv2.weight': 2,
        'layer4.1.bn1.beta': 2, 'layer4.1.bn1.gamma': 2, 'layer4.1.conv1.weight': 2,
        'layer4.0.bn_down_sample.beta': 2, 'layer4.0.bn_down_sample.gamma': 2,
        'layer4.0.conv_down_sample.weight': 2,
        'layer4.0.bn3.beta': 2, 'layer4.0.bn3.gamma': 2, 'layer4.0.conv3.weight': 2,
        'layer4.0.bn2.beta': 2, 'layer4.0.bn2.gamma': 2, 'layer4.0.conv2.weight': 2,
        'layer4.0.bn1.beta': 2, 'layer4.0.bn1.gamma': 2, 'layer4.0.conv1.weight': 2,
        'layer3.5.bn3.beta': 2, 'layer3.5.bn3.gamma': 2, 'layer3.5.conv3.weight': 2,
        'layer3.5.bn2.beta': 2, 'layer3.5.bn2.gamma': 2, 'layer3.5.conv2.weight': 2,
        'layer3.5.bn1.beta': 2, 'layer3.5.bn1.gamma': 2, 'layer3.5.conv1.weight': 2,
        'layer3.4.bn3.beta': 2, 'layer3.4.bn3.gamma': 2, 'layer3.4.conv3.weight': 2,
        'layer3.4.bn2.beta': 2, 'layer3.4.bn2.gamma': 2, 'layer3.4.conv2.weight': 2,
        'layer3.4.bn1.beta': 2, 'layer3.4.bn1.gamma': 2, 'layer3.4.conv1.weight': 2,
        'layer3.3.bn3.beta': 2, 'layer3.3.bn3.gamma': 2, 'layer3.3.conv3.weight': 2,
        'layer3.3.bn2.beta': 2, 'layer3.3.bn2.gamma': 2, 'layer3.3.conv2.weight': 2,
        'layer3.3.bn1.beta': 2, 'layer3.3.bn1.gamma': 2, 'layer3.3.conv1.weight': 2,
        'layer3.2.bn3.beta': 2, 'layer3.2.bn3.gamma': 2, 'layer3.2.conv3.weight': 2,
        'layer3.2.bn2.beta': 2, 'layer3.2.bn2.gamma': 2, 'layer3.2.conv2.weight': 2,
        'layer3.2.bn1.beta': 2, 'layer3.2.bn1.gamma': 2, 'layer3.2.conv1.weight': 2,
        'layer3.1.bn3.beta': 2, 'layer3.1.bn3.gamma': 2, 'layer3.1.conv3.weight': 2,
        'layer3.1.bn2.beta': 2, 'layer3.1.bn2.gamma': 2, 'layer3.1.conv2.weight': 2,
        'layer3.1.bn1.beta': 2, 'layer3.1.bn1.gamma': 2, 'layer3.1.conv1.weight': 2,
        'layer3.0.bn_down_sample.beta': 2, 'layer3.0.bn_down_sample.gamma': 2,
        'layer3.0.conv_down_sample.weight': 2,
        'layer3.0.bn3.beta': 2, 'layer3.0.bn3.gamma': 2, 'layer3.0.conv3.weight': 2,
        'layer3.0.bn2.beta': 2, 'layer3.0.bn2.gamma': 2, 'layer3.0.conv2.weight': 2,
        'layer3.0.bn1.beta': 2, 'layer3.0.bn1.gamma': 2, 'layer3.0.conv1.weight': 2,
        'layer2.3.bn3.beta': 2, 'layer2.3.bn3.gamma': 2, 'layer2.3.conv3.weight': 2,
        'layer2.3.bn2.beta': 2, 'layer2.3.bn2.gamma': 2, 'layer2.3.conv2.weight': 2,
        'layer2.3.bn1.beta': 2, 'layer2.3.bn1.gamma': 2, 'layer2.3.conv1.weight': 2,
        'layer2.2.bn3.beta': 2, 'layer2.2.bn3.gamma': 2, 'layer2.2.conv3.weight': 2,
        'layer2.2.bn2.beta': 2, 'layer2.2.bn2.gamma': 2, 'layer2.2.conv2.weight': 2,
        'layer2.2.bn1.beta': 2, 'layer2.2.bn1.gamma': 2, 'layer2.2.conv1.weight': 2,
        'layer2.1.bn3.beta': 2, 'layer2.1.bn3.gamma': 2, 'layer2.1.conv3.weight': 2,
        'layer2.1.bn2.beta': 2, 'layer2.1.bn2.gamma': 2, 'layer2.1.conv2.weight': 2,
        'layer2.1.bn1.beta': 2, 'layer2.1.bn1.gamma': 2, 'layer2.1.conv1.weight': 2,
        'layer2.0.bn_down_sample.beta': 2, 'layer2.0.bn_down_sample.gamma': 2,
        'layer2.0.conv_down_sample.weight': 2,
        'layer2.0.bn3.beta': 2, 'layer2.0.bn3.gamma': 2, 'layer2.0.conv3.weight': 2,
        'layer2.0.bn2.beta': 2, 'layer2.0.bn2.gamma': 2, 'layer2.0.conv2.weight': 2,
        'layer2.0.bn1.beta': 2, 'layer2.0.bn1.gamma': 2, 'layer2.0.conv1.weight': 2,
        'layer1.2.bn3.beta': 2, 'layer1.2.bn3.gamma': 2, 'layer1.2.conv3.weight': 2,
        'layer1.2.bn2.beta': 2, 'layer1.2.bn2.gamma': 2, 'layer1.2.conv2.weight': 2,
        'layer1.2.bn1.beta': 2, 'layer1.2.bn1.gamma': 2, 'layer1.2.conv1.weight': 2,
        'layer1.1.bn3.beta': 2, 'layer1.1.bn3.gamma': 2, 'layer1.1.conv3.weight': 2,
        'layer1.1.bn2.beta': 2, 'layer1.1.bn2.gamma': 2, 'layer1.1.conv2.weight': 2,
        'layer1.1.bn1.beta': 2, 'layer1.1.bn1.gamma': 2, 'layer1.1.conv1.weight': 2,
        'layer1.0.bn_down_sample.beta': 2, 'layer1.0.bn_down_sample.gamma': 2,
        'layer1.0.conv_down_sample.weight': 2,
        'layer1.0.bn3.beta': 2, 'layer1.0.bn3.gamma': 2, 'layer1.0.conv3.weight': 2,
        'layer1.0.bn2.beta': 2, 'layer1.0.bn2.gamma': 2, 'layer1.0.conv2.weight': 1,
        'layer1.0.bn1.beta': 1, 'layer1.0.bn1.gamma': 1, 'layer1.0.conv1.weight': 1,
        'bn1.beta': 1, 'bn1.gamma': 1, 'conv1.weight': 1,
    }
    assert allreduce_fusion_dict == expect_dict
    cost_model_context.reset_cost_model_context()
def train_32k_8p_fusion1(batch_size=32, num_classes=32768):  # 1048576 #131072 #32768 #8192
    """Check resnet50 allreduce fusion ids under fusion algorithm 1.

    Args:
        batch_size: Forwarded to ``test_train_32k_8p``.
        num_classes: Forwarded to ``test_train_32k_8p``.

    Configures the cost model (gamma/beta, fusion algorithm 1, two fusion
    groups, tail percent 0.5), runs ``test_train_32k_8p`` and compares the
    mapping it returns with the expected parameter-name -> fusion-id mapping.
    The cost-model context is reset at the end.

    NOTE(review): this relies on ``test_train_32k_8p`` returning the fusion
    mapping -- confirm against the helper actually in use.
    """
    cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=400.0)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
    # Passed by keyword so the values cannot be bound to a different leading
    # parameter (such as ``epoch_size``) of ``test_train_32k_8p``.
    allreduce_fusion_dict = test_train_32k_8p(batch_size=batch_size, num_classes=num_classes)
    expect_dict = {
        'end_point.bias': 2, 'end_point.weight': 2,
        'layer4.2.bn3.beta': 2, 'layer4.2.bn3.gamma': 2, 'layer4.2.conv3.weight': 2,
        'layer4.2.bn2.beta': 2, 'layer4.2.bn2.gamma': 2, 'layer4.2.conv2.weight': 2,
        'layer4.2.bn1.beta': 2, 'layer4.2.bn1.gamma': 2, 'layer4.2.conv1.weight': 2,
        'layer4.1.bn3.beta': 2, 'layer4.1.bn3.gamma': 2, 'layer4.1.conv3.weight': 2,
        'layer4.1.bn2.beta': 2, 'layer4.1.bn2.gamma': 2, 'layer4.1.conv2.weight': 2,
        'layer4.1.bn1.beta': 2, 'layer4.1.bn1.gamma': 2, 'layer4.1.conv1.weight': 2,
        'layer4.0.bn_down_sample.beta': 2, 'layer4.0.bn_down_sample.gamma': 2,
        'layer4.0.conv_down_sample.weight': 2,
        'layer4.0.bn3.beta': 2, 'layer4.0.bn3.gamma': 2, 'layer4.0.conv3.weight': 2,
        'layer4.0.bn2.beta': 2, 'layer4.0.bn2.gamma': 2, 'layer4.0.conv2.weight': 2,
        'layer4.0.bn1.beta': 2, 'layer4.0.bn1.gamma': 2, 'layer4.0.conv1.weight': 2,
        'layer3.5.bn3.beta': 2, 'layer3.5.bn3.gamma': 2, 'layer3.5.conv3.weight': 2,
        'layer3.5.bn2.beta': 2, 'layer3.5.bn2.gamma': 2, 'layer3.5.conv2.weight': 2,
        'layer3.5.bn1.beta': 2, 'layer3.5.bn1.gamma': 2, 'layer3.5.conv1.weight': 2,
        'layer3.4.bn3.beta': 2, 'layer3.4.bn3.gamma': 2, 'layer3.4.conv3.weight': 2,
        'layer3.4.bn2.beta': 2, 'layer3.4.bn2.gamma': 2, 'layer3.4.conv2.weight': 2,
        'layer3.4.bn1.beta': 2, 'layer3.4.bn1.gamma': 2, 'layer3.4.conv1.weight': 2,
        'layer3.3.bn3.beta': 2, 'layer3.3.bn3.gamma': 2, 'layer3.3.conv3.weight': 2,
        'layer3.3.bn2.beta': 2, 'layer3.3.bn2.gamma': 2, 'layer3.3.conv2.weight': 2,
        'layer3.3.bn1.beta': 2, 'layer3.3.bn1.gamma': 2, 'layer3.3.conv1.weight': 2,
        'layer3.2.bn3.beta': 2, 'layer3.2.bn3.gamma': 2, 'layer3.2.conv3.weight': 2,
        'layer3.2.bn2.beta': 2, 'layer3.2.bn2.gamma': 2, 'layer3.2.conv2.weight': 2,
        'layer3.2.bn1.beta': 2, 'layer3.2.bn1.gamma': 2, 'layer3.2.conv1.weight': 2,
        'layer3.1.bn3.beta': 2, 'layer3.1.bn3.gamma': 2, 'layer3.1.conv3.weight': 2,
        'layer3.1.bn2.beta': 2, 'layer3.1.bn2.gamma': 2, 'layer3.1.conv2.weight': 2,
        'layer3.1.bn1.beta': 2, 'layer3.1.bn1.gamma': 2, 'layer3.1.conv1.weight': 2,
        'layer3.0.bn_down_sample.beta': 2, 'layer3.0.bn_down_sample.gamma': 2,
        'layer3.0.conv_down_sample.weight': 2,
        'layer3.0.bn3.beta': 2, 'layer3.0.bn3.gamma': 2, 'layer3.0.conv3.weight': 2,
        'layer3.0.bn2.beta': 2, 'layer3.0.bn2.gamma': 2, 'layer3.0.conv2.weight': 2,
        'layer3.0.bn1.beta': 2, 'layer3.0.bn1.gamma': 2, 'layer3.0.conv1.weight': 2,
        'layer2.3.bn3.beta': 2, 'layer2.3.bn3.gamma': 2, 'layer2.3.conv3.weight': 2,
        'layer2.3.bn2.beta': 2, 'layer2.3.bn2.gamma': 2, 'layer2.3.conv2.weight': 2,
        'layer2.3.bn1.beta': 2, 'layer2.3.bn1.gamma': 2, 'layer2.3.conv1.weight': 2,
        'layer2.2.bn3.beta': 2, 'layer2.2.bn3.gamma': 2, 'layer2.2.conv3.weight': 2,
        'layer2.2.bn2.beta': 2, 'layer2.2.bn2.gamma': 2, 'layer2.2.conv2.weight': 2,
        'layer2.2.bn1.beta': 2, 'layer2.2.bn1.gamma': 2, 'layer2.2.conv1.weight': 2,
        'layer2.1.bn3.beta': 2, 'layer2.1.bn3.gamma': 2, 'layer2.1.conv3.weight': 2,
        'layer2.1.bn2.beta': 2, 'layer2.1.bn2.gamma': 2, 'layer2.1.conv2.weight': 2,
        'layer2.1.bn1.beta': 2, 'layer2.1.bn1.gamma': 2, 'layer2.1.conv1.weight': 2,
        'layer2.0.bn_down_sample.beta': 2, 'layer2.0.bn_down_sample.gamma': 2,
        'layer2.0.conv_down_sample.weight': 2,
        'layer2.0.bn3.beta': 2, 'layer2.0.bn3.gamma': 2, 'layer2.0.conv3.weight': 2,
        'layer2.0.bn2.beta': 2, 'layer2.0.bn2.gamma': 2, 'layer2.0.conv2.weight': 2,
        'layer2.0.bn1.beta': 2, 'layer2.0.bn1.gamma': 2, 'layer2.0.conv1.weight': 2,
        'layer1.2.bn3.beta': 2, 'layer1.2.bn3.gamma': 2, 'layer1.2.conv3.weight': 2,
        'layer1.2.bn2.beta': 2, 'layer1.2.bn2.gamma': 2, 'layer1.2.conv2.weight': 2,
        'layer1.2.bn1.beta': 2, 'layer1.2.bn1.gamma': 2, 'layer1.2.conv1.weight': 2,
        'layer1.1.bn3.beta': 2, 'layer1.1.bn3.gamma': 2, 'layer1.1.conv3.weight': 2,
        'layer1.1.bn2.beta': 2, 'layer1.1.bn2.gamma': 2, 'layer1.1.conv2.weight': 2,
        'layer1.1.bn1.beta': 2, 'layer1.1.bn1.gamma': 2, 'layer1.1.conv1.weight': 2,
        'layer1.0.bn_down_sample.beta': 2, 'layer1.0.bn_down_sample.gamma': 2,
        'layer1.0.conv_down_sample.weight': 2,
        'layer1.0.bn3.beta': 2, 'layer1.0.bn3.gamma': 2, 'layer1.0.conv3.weight': 2,
        'layer1.0.bn2.beta': 2, 'layer1.0.bn2.gamma': 2, 'layer1.0.conv2.weight': 2,
        'layer1.0.bn1.beta': 2, 'layer1.0.bn1.gamma': 2, 'layer1.0.conv1.weight': 2,
        'bn1.beta': 1, 'bn1.gamma': 1, 'conv1.weight': 1,
    }
    assert allreduce_fusion_dict == expect_dict
    cost_model_context.reset_cost_model_context()
def test_two_matmul():
    """Exercise cost-model and algorithm parameters, then compile two MatMuls.

    Steps, in order:
      1. Set seven cost-model options and read each one back.
      2. Reset the cost model and check the values read back afterwards.
      3. Set four algorithm parameters, read them back, reset, and check the
         values read back afterwards.
      4. Compile a two-MatMul network for 16 devices in auto-parallel mode and
         compare the searched strategies with the expected ones.
    """

    class Net(nn.Cell):
        """Two chained MatMul primitives: ``(x @ y) @ b``."""

        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            product = self.matmul1(x, y)
            product = self.matmul2(product, b)
            return product

    device_count = 16
    context.set_auto_parallel_context(device_num=device_count, global_rank=0)

    # --- cost-model options: custom values round-trip ------------------------
    custom_cost_model = {
        "device_memory_capacity": 32.0 * 1024.0 * 1024.0 * 1024.0,
        "costmodel_alpha": 1.0,
        "costmodel_beta": 60.0,
        "costmodel_gamma": 0.1,
        "costmodel_communi_threshold": 1024.0,
        "costmodel_communi_const": 2222.0,
        "costmodel_communi_bias": 1111.0,
    }
    cost_model_context.set_cost_model_context(**custom_cost_model)
    for option, wanted in custom_cost_model.items():
        assert cost_model_context.get_cost_model_context(option) == wanted

    # --- cost-model options: values expected after a reset -------------------
    cost_model_context.reset_cost_model_context()
    reset_cost_model = {
        "device_memory_capacity": 16.0 * 1024.0 * 1024.0 * 1024.0,
        "costmodel_alpha": 1.0,
        "costmodel_beta": 400.0,
        "costmodel_gamma": 0.001,
        "costmodel_communi_threshold": 2048.0,
        "costmodel_communi_const": 3072.0,
        "costmodel_communi_bias": 1024.0,
    }
    for option, wanted in reset_cost_model.items():
        assert cost_model_context.get_cost_model_context(option) == wanted

    # --- algorithm parameters: custom values, then reset ---------------------
    set_algo_parameters(tensor_slice_align_enable=False,
                        tensor_slice_align_size=32,
                        fully_use_devices=False,
                        elementwise_op_strategy_follow=False)
    assert not get_algo_parameters("tensor_slice_align_enable")
    assert get_algo_parameters("tensor_slice_align_size") == 32
    assert not get_algo_parameters("fully_use_devices")
    assert not get_algo_parameters("elementwise_op_strategy_follow")

    reset_algo_parameters()
    assert not get_algo_parameters("tensor_slice_align_enable")
    assert get_algo_parameters("tensor_slice_align_size") == 16
    assert get_algo_parameters("fully_use_devices")
    assert not get_algo_parameters("elementwise_op_strategy_follow")

    # --- strategy search ------------------------------------------------------
    lhs = Tensor(np.ones([128, 32]), dtype=ms.float32)
    mid = Tensor(np.ones([32, 64]), dtype=ms.float32)
    rhs = Tensor(np.ones([64, 64]), dtype=ms.float32)

    network = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    network.set_auto_parallel()
    reset_op_id()

    _executor.compile(network, lhs, mid, rhs, phase='train')
    searched = _executor._get_shard_strategy(network)
    wanted_strategies = {
        'Default/network-Net/MatMul-op0': [[16, 1], [1, 1]],
        'Default/network-Net/MatMul-op1': [[16, 1], [1, 1]],
    }
    assert searched == wanted_strategies
def test_allreduce_fusion_parameters():
    """Round-trip every allreduce-fusion cost-model option.

    Each option is set, read back, and (except ``fusion_times``) read back
    again after a reset to check the value reported afterwards.
    """

    def read(option):
        """Return the current value of one cost-model option."""
        return cost_model_context.get_cost_model_context(option)

    def write(option, value):
        """Set exactly one cost-model option."""
        cost_model_context.set_cost_model_context(**{option: value})

    cost_model_context.reset_cost_model_context()

    # Fusion algorithm: two successive values, then the post-reset value.
    algorithm_option = 'costmodel_allreduce_fusion_algorithm'
    for algorithm in (2, 1):
        write(algorithm_option, algorithm)
        assert read(algorithm_option) == algorithm
    cost_model_context.reset_cost_model_context()
    assert read(algorithm_option) == 0

    # Fusion times is only checked for the set value, not after a reset.
    write('costmodel_allreduce_fusion_times', 2)
    assert read('costmodel_allreduce_fusion_times') == 2

    # The remaining options all follow the same pattern: 0.2 when set, 0.1
    # after a reset.
    float_options = (
        'costmodel_allreduce_fusion_tail_percent',
        'costmodel_allreduce_fusion_tail_time',
        'costmodel_allreduce_fusion_allreduce_inherent_time',
        'costmodel_allreduce_fusion_allreduce_bandwidth',
        'costmodel_allreduce_fusion_computation_time_parameter',
    )
    for option in float_options:
        write(option, 0.2)
        assert read(option) == 0.2
        cost_model_context.reset_cost_model_context()
        assert read(option) == 0.1
from mindspore.parallel import _cost_model_context as cost_model_context
from mindspore.nn.wrap.cell_wrapper import VirtualDatasetCellTriple
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
from src.callbacks import LossCallBack, EvalCallBack
from src.datasets import create_dataset
from src.metrics import AUCMetric
from src.config import WideDeepConfig

# Add the parent of this file's directory to the module search path.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Module-level setup, executed at import time: graph mode on Ascend with graph
# dumps enabled, auto-parallel mode with mirror_mean, multi-subgraph cost
# model, and finally init().
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, mirror_mean=True)
cost_model_context.set_cost_model_context(multi_subgraphs=True)
init()


def get_WideDeep_net(config):
    """
    Get network of wide&deep model.

    Builds one ``WideDeepModel`` from ``config`` and wraps it twice:
    the training network is ``TrainStepWrap`` around
    ``VirtualDatasetCellTriple`` around ``NetWithLossClass``; the evaluation
    network is ``VirtualDatasetCellTriple`` around ``PredictWithSigmoid``.
    Both wrappers share the same underlying ``WideDeepModel`` instance.

    Args:
        config: Configuration object passed to ``WideDeepModel`` and
            ``NetWithLossClass``.

    Returns:
        Tuple ``(train_net, eval_net)``.
    """
    WideDeep_net = WideDeepModel(config)
    # Training path: loss cell -> virtual dataset wrapper -> train-step wrapper.
    loss_net = NetWithLossClass(WideDeep_net, config)
    loss_net = VirtualDatasetCellTriple(loss_net)
    train_net = TrainStepWrap(loss_net)
    # Evaluation path reuses the same model instance.
    eval_net = PredictWithSigmoid(WideDeep_net)
    eval_net = VirtualDatasetCellTriple(eval_net)
    return train_net, eval_net
def test_two_matmul():
    """Exercise cost-model and algorithm parameters, then compile two MatMuls.

    Steps, in order:
      1. Set seven cost-model options and read each one back.
      2. Reset the cost model and check the values read back afterwards.
      3. Set six algorithm parameters and read them back, toggle the
         single-loop flag, reset, and check the values read back afterwards.
      4. Compile a two-MatMul network for 16 devices in auto-parallel mode and
         check that every MatMul strategy is ``[[16, 1], [1, 1]]``.
    """

    class Net(nn.Cell):
        """Two chained MatMul primitives: ``(x @ y) @ b``."""

        def __init__(self):
            super().__init__()
            self.matmul1 = P.MatMul()
            self.matmul2 = P.MatMul()

        def construct(self, x, y, b):
            product = self.matmul1(x, y)
            product = self.matmul2(product, b)
            return product

    device_count = 16
    context.set_auto_parallel_context(device_num=device_count, global_rank=0)

    # --- cost-model options: custom values round-trip ------------------------
    custom_cost_model = {
        "device_memory_capacity": 32.0 * 1024.0 * 1024.0 * 1024.0,
        "costmodel_alpha": 1.0,
        "costmodel_beta": 60.0,
        "costmodel_gamma": 0.1,
        "costmodel_communi_threshold": 1024.0,
        "costmodel_communi_const": 2222.0,
        "costmodel_communi_bias": 1111.0,
    }
    cost_model_context.set_cost_model_context(**custom_cost_model)
    for option, wanted in custom_cost_model.items():
        assert cost_model_context.get_cost_model_context(option) == wanted

    # --- cost-model options: values expected after a reset -------------------
    cost_model_context.reset_cost_model_context()
    reset_cost_model = {
        "device_memory_capacity": 16.0 * 1024.0 * 1024.0 * 1024.0,
        "costmodel_alpha": 1.0,
        "costmodel_beta": 400.0,
        "costmodel_gamma": 0.001,
        "costmodel_communi_threshold": 2048.0,
        "costmodel_communi_const": 3072.0,
        "costmodel_communi_bias": 1024.0,
    }
    for option, wanted in reset_cost_model.items():
        assert cost_model_context.get_cost_model_context(option) == wanted

    # --- algorithm parameters: custom values ----------------------------------
    set_algo_parameters(tensor_slice_align_enable=False,
                        tensor_slice_align_size=32,
                        fully_use_devices=False,
                        elementwise_op_strategy_follow=False,
                        enable_algo_approxi=True,
                        algo_approxi_epsilon=0.001)
    assert not get_algo_parameters("tensor_slice_align_enable")
    assert get_algo_parameters("tensor_slice_align_size") == 32
    assert not get_algo_parameters("fully_use_devices")
    assert not get_algo_parameters("elementwise_op_strategy_follow")
    assert get_algo_parameters("enable_algo_approxi")
    assert get_algo_parameters("algo_approxi_epsilon") == 0.001

    # --- single-loop flag: initial value, then after switching it off ---------
    want_single_loop = True
    assert want_single_loop == _get_algo_single_loop()
    want_single_loop = False
    _set_algo_single_loop(want_single_loop)
    assert want_single_loop == _get_algo_single_loop()

    # --- algorithm parameters: values expected after a reset ------------------
    reset_algo_parameters()
    assert not get_algo_parameters("tensor_slice_align_enable")
    assert get_algo_parameters("tensor_slice_align_size") == 16
    assert get_algo_parameters("fully_use_devices")
    assert not get_algo_parameters("elementwise_op_strategy_follow")
    assert not get_algo_parameters("enable_algo_approxi")
    assert get_algo_parameters("algo_approxi_epsilon") == 0.1

    # --- strategy search ------------------------------------------------------
    lhs = Tensor(np.ones([128, 32]), dtype=ms.float32)
    mid = Tensor(np.ones([32, 64]), dtype=ms.float32)
    rhs = Tensor(np.ones([64, 64]), dtype=ms.float32)

    network = NetWithLoss(Net())
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    network.set_auto_parallel()
    reset_op_id()
    network.set_train()

    _executor.compile(network, lhs, mid, rhs, phase='train')
    searched = _executor._get_shard_strategy(network)
    for op_name, strategy in searched.items():
        if re.search('MatMul-op', op_name):
            assert strategy == [[16, 1], [1, 1]]
def test_train_32k_8p(epoch_size=3, batch_size=32, num_classes=32768):  # 1048576 #131072 #32768 #8192
    """Train resnet50 on 8 devices; check shard strategies and allreduce fusion ids.

    Args:
        epoch_size: NOTE(review): never read in the body -- ``model.train`` is
            called with a literal 5. Confirm whether that is intended.
        batch_size: Number of all-ones images (3x224x224) in the single batch.
        num_classes: Size of the classifier; labels are ``index % num_classes``.

    After training, Conv2D strategies must split the leading dimension of the
    first input across all devices, MatMul strategies must equal
    ``[[dev_num, 1], [1, 1]]`` and ReduceSum strategies ``[[dev_num, 1]]``.
    The allreduce fusion mapping is printed and compared with the expected
    one, then the cost-model context is reset.
    """
    device_count = 8
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=device_count)
    cost_model_context.set_cost_model_context(costmodel_gamma=0.001, costmodel_beta=260.0)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_algorithm=1)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_times=2)
    cost_model_context.set_cost_model_context(costmodel_allreduce_fusion_tail_percent=0.5)
    set_algo_parameters(elementwise_op_strategy_follow=True)
    resset_op_id()
    np.random.seed(6)

    # Synthetic data: constant images, labels cycling through the class ids.
    images = np.ones([batch_size, 3, 224, 224]).astype(np.float32)
    labels = np.zeros([batch_size]).astype(np.int32)
    for sample in range(batch_size):
        labels[sample] = sample % num_classes
    dataset = DatasetLenet(Tensor(images), Tensor(labels), 1)

    network = resnet50(num_classes)
    loss_fn = SoftmaxCrossEntropyExpand(sparse=True)
    trainable = filter(lambda param: param.requires_grad, network.get_parameters())
    optimizer = Momentum(trainable, 0.01, 0.9)
    model = Model(network, loss_fn=loss_fn, optimizer=optimizer)
    model.train(5, dataset, dataset_sink_mode=False)

    searched = _executor._get_strategy(model._train_network)
    for op_name, strategy in searched.items():
        if re.search('Conv2D-op', op_name):
            assert strategy[0][0] == device_count
        elif re.search('MatMul-op', op_name):
            assert strategy == [[device_count, 1], [1, 1]]
        elif re.search('ReduceSum-op', op_name):
            assert strategy == [[device_count, 1]]

    fusion_ids = _executor._get_allreduce_fusion(model._train_network)
    print(fusion_ids)
    expected_ids = {
        'end_point.bias': 2, 'end_point.weight': 2,
        'layer4.2.bn3.beta': 2, 'layer4.2.bn3.gamma': 2, 'layer4.2.conv3.weight': 2,
        'layer4.2.bn2.beta': 2, 'layer4.2.bn2.gamma': 2, 'layer4.2.conv2.weight': 2,
        'layer4.2.bn1.beta': 2, 'layer4.2.bn1.gamma': 2, 'layer4.2.conv1.weight': 2,
        'layer4.1.bn3.beta': 2, 'layer4.1.bn3.gamma': 2, 'layer4.1.conv3.weight': 2,
        'layer4.1.bn2.beta': 2, 'layer4.1.bn2.gamma': 2, 'layer4.1.conv2.weight': 2,
        'layer4.1.bn1.beta': 2, 'layer4.1.bn1.gamma': 2, 'layer4.1.conv1.weight': 2,
        'layer4.0.bn_down_sample.beta': 2, 'layer4.0.bn_down_sample.gamma': 2,
        'layer4.0.conv_down_sample.weight': 2,
        'layer4.0.bn3.beta': 2, 'layer4.0.bn3.gamma': 2, 'layer4.0.conv3.weight': 2,
        'layer4.0.bn2.beta': 2, 'layer4.0.bn2.gamma': 2, 'layer4.0.conv2.weight': 2,
        'layer4.0.bn1.beta': 2, 'layer4.0.bn1.gamma': 2, 'layer4.0.conv1.weight': 2,
        'layer3.5.bn3.beta': 2, 'layer3.5.bn3.gamma': 2, 'layer3.5.conv3.weight': 2,
        'layer3.5.bn2.beta': 2, 'layer3.5.bn2.gamma': 2, 'layer3.5.conv2.weight': 2,
        'layer3.5.bn1.beta': 2, 'layer3.5.bn1.gamma': 2, 'layer3.5.conv1.weight': 2,
        'layer3.4.bn3.beta': 2, 'layer3.4.bn3.gamma': 2, 'layer3.4.conv3.weight': 2,
        'layer3.4.bn2.beta': 2, 'layer3.4.bn2.gamma': 2, 'layer3.4.conv2.weight': 2,
        'layer3.4.bn1.beta': 2, 'layer3.4.bn1.gamma': 2, 'layer3.4.conv1.weight': 2,
        'layer3.3.bn3.beta': 2, 'layer3.3.bn3.gamma': 2, 'layer3.3.conv3.weight': 2,
        'layer3.3.bn2.beta': 2, 'layer3.3.bn2.gamma': 2, 'layer3.3.conv2.weight': 2,
        'layer3.3.bn1.beta': 2, 'layer3.3.bn1.gamma': 2, 'layer3.3.conv1.weight': 2,
        'layer3.2.bn3.beta': 2, 'layer3.2.bn3.gamma': 2, 'layer3.2.conv3.weight': 2,
        'layer3.2.bn2.beta': 2, 'layer3.2.bn2.gamma': 2, 'layer3.2.conv2.weight': 2,
        'layer3.2.bn1.beta': 2, 'layer3.2.bn1.gamma': 2, 'layer3.2.conv1.weight': 2,
        'layer3.1.bn3.beta': 2, 'layer3.1.bn3.gamma': 2, 'layer3.1.conv3.weight': 2,
        'layer3.1.bn2.beta': 2, 'layer3.1.bn2.gamma': 2, 'layer3.1.conv2.weight': 2,
        'layer3.1.bn1.beta': 2, 'layer3.1.bn1.gamma': 2, 'layer3.1.conv1.weight': 2,
        'layer3.0.bn_down_sample.beta': 1, 'layer3.0.bn_down_sample.gamma': 1,
        'layer3.0.conv_down_sample.weight': 2,
        'layer3.0.bn3.beta': 1, 'layer3.0.bn3.gamma': 1, 'layer3.0.conv3.weight': 2,
        'layer3.0.bn2.beta': 2, 'layer3.0.bn2.gamma': 2, 'layer3.0.conv2.weight': 2,
        'layer3.0.bn1.beta': 2, 'layer3.0.bn1.gamma': 2, 'layer3.0.conv1.weight': 2,
        'layer2.3.bn3.beta': 2, 'layer2.3.bn3.gamma': 2, 'layer2.3.conv3.weight': 2,
        'layer2.3.bn2.beta': 2, 'layer2.3.bn2.gamma': 2, 'layer2.3.conv2.weight': 2,
        'layer2.3.bn1.beta': 2, 'layer2.3.bn1.gamma': 2, 'layer2.3.conv1.weight': 2,
        'layer2.2.bn3.beta': 2, 'layer2.2.bn3.gamma': 2, 'layer2.2.conv3.weight': 2,
        'layer2.2.bn2.beta': 2, 'layer2.2.bn2.gamma': 2, 'layer2.2.conv2.weight': 2,
        'layer2.2.bn1.beta': 2, 'layer2.2.bn1.gamma': 2, 'layer2.2.conv1.weight': 2,
        'layer2.1.bn3.beta': 1, 'layer2.1.bn3.gamma': 1, 'layer2.1.conv3.weight': 2,
        'layer2.1.bn2.beta': 2, 'layer2.1.bn2.gamma': 2, 'layer2.1.conv2.weight': 2,
        'layer2.1.bn1.beta': 2, 'layer2.1.bn1.gamma': 2, 'layer2.1.conv1.weight': 2,
        'layer2.0.bn_down_sample.beta': 1, 'layer2.0.bn_down_sample.gamma': 1,
        'layer2.0.conv_down_sample.weight': 2,
        'layer2.0.bn3.beta': 1, 'layer2.0.bn3.gamma': 1, 'layer2.0.conv3.weight': 2,
        'layer2.0.bn2.beta': 2, 'layer2.0.bn2.gamma': 2, 'layer2.0.conv2.weight': 2,
        'layer2.0.bn1.beta': 2, 'layer2.0.bn1.gamma': 2, 'layer2.0.conv1.weight': 2,
        'layer1.2.bn3.beta': 2, 'layer1.2.bn3.gamma': 2, 'layer1.2.conv3.weight': 2,
        'layer1.2.bn2.beta': 2, 'layer1.2.bn2.gamma': 2, 'layer1.2.conv2.weight': 2,
        'layer1.2.bn1.beta': 2, 'layer1.2.bn1.gamma': 2, 'layer1.2.conv1.weight': 2,
        'layer1.1.bn3.beta': 1, 'layer1.1.bn3.gamma': 1, 'layer1.1.conv3.weight': 2,
        'layer1.1.bn2.beta': 2, 'layer1.1.bn2.gamma': 2, 'layer1.1.conv2.weight': 2,
        'layer1.1.bn1.beta': 2, 'layer1.1.bn1.gamma': 2, 'layer1.1.conv1.weight': 2,
        'layer1.0.bn_down_sample.beta': 1, 'layer1.0.bn_down_sample.gamma': 1,
        'layer1.0.conv_down_sample.weight': 2,
        'layer1.0.bn3.beta': 1, 'layer1.0.bn3.gamma': 1, 'layer1.0.conv3.weight': 2,
        'layer1.0.bn2.beta': 2, 'layer1.0.bn2.gamma': 2, 'layer1.0.conv2.weight': 2,
        'layer1.0.bn1.beta': 2, 'layer1.0.bn1.gamma': 2, 'layer1.0.conv1.weight': 2,
        'bn1.beta': 1, 'bn1.gamma': 1, 'conv1.weight': 2,
    }
    assert fusion_ids == expected_ids
    cost_model_context.reset_cost_model_context()