Code Example #1
        avg_loss = fluid.layers.mean(loss)
        return avg_loss


class TestMnist(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = MNIST()
        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
        opt = paddle.optimizer.Adam(
            learning_rate=1e-3, parameters=model.parameters())
        return model, train_reader, opt

    def run_one_loop(self, model, opt, data):
        batch_size = len(data)
        dy_x_data = np.array([x[0].reshape(1, 28, 28)
                              for x in data]).astype('float32')
        y_data = np.array(
            [x[1] for x in data]).astype('int64').reshape(batch_size, 1)
        img = to_variable(dy_x_data)
        label = to_variable(y_data)
        label.stop_gradient = True

        avg_loss = model(img, label)

        return avg_loss


if __name__ == "__main__":
    runtime_main(TestMnist)
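
A note on the preprocessing in run_one_loop: it turns a list of (image, label) tuples from paddle.dataset.mnist into dense arrays before wrapping them with to_variable. A minimal NumPy-only sketch of that step, using synthetic data in place of the real reader (the fake batch is an assumption for illustration; shapes and dtypes mirror the snippet):

import numpy as np

# stand-in for one mini-batch from paddle.dataset.mnist.train():
# each sample is a (flattened 784-float image, integer label) pair
batch = [(np.random.rand(784).astype('float32'), i % 10) for i in range(2)]

batch_size = len(batch)
# images -> (batch_size, 1, 28, 28), labels -> (batch_size, 1), as in run_one_loop
dy_x_data = np.array([x[0].reshape(1, 28, 28) for x in batch]).astype('float32')
y_data = np.array([x[1] for x in batch]).astype('int64').reshape(batch_size, 1)

print(dy_x_data.shape, y_data.shape)  # (2, 1, 28, 28) (2, 1)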
Code Example #2
# global configs
# use a small `vocab_size` to test the case where the rows count exceeds the table height
batch_size = 4
batch_num = 200
hidden_size = 10
vocab_size = 10
num_steps = 3
init_scale = 0.1


class TestSparseEmbeddingOverHeight(TestSparseEmbedding):
    def get_model(self):
        model = SimpleNet(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          num_steps=num_steps,
                          init_scale=init_scale,
                          is_sparse=True)

        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)

        optimizer = fluid.optimizer.SGD(learning_rate=0.001,
                                        parameter_list=model.parameters())

        return model, train_reader, optimizer


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingOverHeight)
Code Example #3
def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


class TestSimpleNet(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = SimpleNet()
        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)
        optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                         parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x for x in batch])
        x_data = x_data.reshape((-1, 10))
        x = paddle.to_tensor(x_data)
        out = model(x)
        loss = out.sum() / len(batch)
        return loss


if __name__ == "__main__":
    runtime_main(TestSimpleNet)
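
For context, paddle.batch turns the sample-level fake_sample_reader into a batch-level reader that yields lists of samples, and drop_last=True discards a trailing incomplete batch. A small self-contained sketch of what reaches run_one_loop (batch_size and batch_num are redefined here as illustrative assumptions):

import numpy as np
import paddle

batch_size = 4
batch_num = 200

def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            yield np.random.random_sample((10, )).astype('float32')
    return __reader__

batched = paddle.batch(fake_sample_reader(), batch_size=batch_size, drop_last=True)
first = next(batched())                 # a list of batch_size samples
x_data = np.array(first).reshape((-1, 10))
print(len(first), x_data.shape)         # 4 (4, 10)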
Code Example #4
                          vocab_size=vocab_size,
                          num_steps=num_steps,
                          init_scale=init_scale,
                          is_sparse=True)

        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)

        optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                         parameters=model.parameters())

        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = paddle.to_tensor(x_data)
        y = paddle.to_tensor(y_data)

        dy_loss = model(x, y)

        return dy_loss


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingFP64)
Code Example #5
                              "train")
    train_reader = get_batch_reader([train_file], batch_size)
    train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
    return train_reader, train_feed


class TestDistSimnetBow2x2(TestDistRunnerBase):
    def get_model(self, batch_size=2):
        # Train program
        avg_cost, acc, predict = \
            train_network(batch_size,
                          bool(int(os.environ["IS_DISTRIBUTED"])),
                          bool(int(os.environ["IS_SPARSE"])),
                          bool(int(os.environ["IS_SELF_CONTAINED_LR"])))

        inference_program = fluid.default_main_program().clone()

        # Optimization
        opt = os.getenv('OPTIMIZER', 'sgd')
        opt = get_optimizer(opt)
        opt.minimize(avg_cost)

        # Reader
        train_reader, _ = get_train_reader(batch_size)
        return inference_program, avg_cost, train_reader, train_reader, acc, predict


if __name__ == "__main__":
    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
    runtime_main(TestDistSimnetBow2x2)
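
get_model reads its switches from environment variables that the test harness is expected to export before the process starts. A hedged sketch of how a caller might set them (the specific values are illustrative assumptions):

import os

# the snippet parses these with bool(int(...)), so pass "0"/"1" strings
os.environ["IS_DISTRIBUTED"] = "0"
os.environ["IS_SPARSE"] = "1"
os.environ["IS_SELF_CONTAINED_LR"] = "0"
os.environ["OPTIMIZER"] = "sgd"  # read back via os.getenv('OPTIMIZER', 'sgd')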
Code Example #6
                                                        momentum=0.9)
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

    # execution
    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    dirname = "./ut_sharding_save_model"
    sharding.utils.save_persistables(exe,
                                     dirname,
                                     main_program=train_prog,
                                     filename=None)

    out_losses = []
    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))


if __name__ == "__main__":
    # NOTE(liangjianzhong): this dist unittest should be implemented using runtime_main in
    # test_dist_base.py, but the runtime_main there uses fleet and DistributedStrategy from
    # paddle.fluid.incubate.fleet.collective, which are not supported by sharding
    # (paddle.distributed.fleet). This should be updated in the future.
    # runtime_main(TestDistMnist2x2)
    runtime_main()
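
The worker serializes its losses with pickle and writes the bytes to stdout so a driver process can collect them. A minimal round-trip sketch of that hand-off (the loss values are illustrative):

import pickle

out_losses = [0.5, 0.25]             # illustrative values; empty in this test
payload = pickle.dumps(out_losses)   # what the worker writes to sys.stdout.buffer
recovered = pickle.loads(payload)    # what a driver reading the worker's stdout gets
print(recovered)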
Code Example #7
File: dist_word2vec.py  Project: pyqt1/MyPaddle
        dict_size = len(word_dict)

        first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
        second_word = fluid.layers.data(name='secondw',
                                        shape=[1],
                                        dtype='int64')
        third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
        forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
        next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])

        inference_program = paddle.fluid.default_main_program().clone()

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
        sgd_optimizer.minimize(avg_cost)

        train_reader = paddle.batch(
            paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
        test_reader = paddle.batch(paddle.dataset.imikolov.test(word_dict, N),
                                   BATCH_SIZE)

        return inference_program, avg_cost, train_reader, test_reader, None, predict_word


if __name__ == "__main__":
    import os
    os.environ['CPU_NUM'] = '1'
    os.environ['USE_CUDA'] = "FALSE"
    runtime_main(TestDistWord2vec2x2)
Code Example #8
File: dist_se_resnext.py  Project: pyqt1/MyPaddle
        bd = [step * e for e in epochs]
        base_lr = 0.1
        lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]

        if not use_dgc:
            optimizer = fluid.optimizer.Momentum(
                learning_rate=fluid.layers.piecewise_decay(
                    boundaries=bd, values=lr),
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
        else:
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=fluid.layers.piecewise_decay(
                    boundaries=bd, values=lr),
                momentum=0.9,
                rampup_begin_step=0,
                regularization=fluid.regularizer.L2Decay(1e-4))
        optimizer.minimize(avg_cost)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size)

        return test_program, avg_cost, train_reader, test_reader, acc_top1, out


if __name__ == "__main__":
    runtime_main(DistSeResneXt2x2)
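
The bd/lr lists above feed fluid.layers.piecewise_decay: the learning rate stays at base_lr until the first boundary step and then drops by 10x at each subsequent boundary. A quick worked example with illustrative numbers (step and epochs come from the omitted part of the file, so the values here are assumptions):

step = 100             # assumed steps per epoch
epochs = [30, 60, 80]  # assumed decay epochs
base_lr = 0.1

bd = [step * e for e in epochs]                          # [3000, 6000, 8000]
lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]  # [0.1, 0.01, 0.001, 0.0001]
print(bd, lr)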
Code Example #9
            input=predict, label=label, total=batch_size_tensor)

        test_program = fluid.default_main_program().clone(for_test=True)

        # Reader
        train_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)
        test_reader = paddle.batch(
            paddle.dataset.mnist.test(), batch_size=batch_size)

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        if single_device:
            optimizer.minimize(avg_cost)
        else:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.without_graph_optimization = True
            strategy.fuse_all_reduce_ops = True
            strategy._calc_comm_same_stream = False
            strategy.fuse_grad_size_in_num = 8
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

        return test_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestFleetMetaOptimizerFuseAllReducePrecision)
Code Example #10
class TestSeResNeXt(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = SeResNeXt()
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False),
            batch_size=train_parameters["batch_size"],
            drop_last=True)
        optimizer = optimizer_setting(train_parameters,
                                      parameter_list=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, opt, data):
        bs = len(data)
        dy_x_data = np.array([x[0].reshape(3, 224, 224)
                              for x in data]).astype('float32')
        dy_x_data = dy_x_data / 255.0
        y_data = np.array([x[1] for x in data]).astype('int64').reshape(bs, 1)
        img = to_variable(dy_x_data)
        label = to_variable(y_data)
        label.stop_gradient = True

        out = model(img)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
        loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
        avg_loss = fluid.layers.mean(x=loss)
        return avg_loss


if __name__ == "__main__":
    runtime_main(TestSeResNeXt)
Code Example #11
class TestNoSyncUnusedParam(TestNoSync):
    def get_model(self):
        model = SimpleNetUnusedParam()
        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)
        optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                         parameters=model.parameters())
        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x for x in batch])
        x_data = x_data.reshape((-1, 10))
        x = paddle.to_tensor(x_data)
        out = model(x)
        loss = out.sum() / len(batch)
        return loss


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSyncUnusedParam)
Code Example #12
        return y


class TestSyncBatchNorm(TestParallelDyGraphRunnerBase):
    def get_model(self):
        model = TestLayer(3, 64, 7)
        train_reader = paddle.batch(
            paddle.dataset.flowers.test(use_xmap=False),
            batch_size=32,
            drop_last=True)
        opt = fluid.optimizer.Adam(learning_rate=1e-3,
                                   parameter_list=model.parameters())
        return model, train_reader, opt

    def run_one_loop(self, model, opt, data):
        batch_size = len(data)
        dy_x_data = np.array([x[0].reshape(3, 224, 224)
                              for x in data]).astype('float32')
        img = to_variable(dy_x_data)
        img.stop_gradient = False

        out = model(img)

        out = fluid.layers.mean(out)

        return out


if __name__ == "__main__":
    runtime_main(TestSyncBatchNorm)
Code Example #13
    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Training reader creator
    :rtype: callable
    """
    return reader_creator(re.compile(r"train/pos/.*\.txt$"),
                          re.compile(r"train/neg/.*\.txt$"), word_idx)


def test(word_idx):
    """
    IMDB test set creator.

    It returns a reader creator; each sample in the reader is a zero-based ID
    sequence with a label in [0, 1].

    :param word_idx: word dictionary
    :type word_idx: dict
    :return: Test reader creator
    :rtype: callable
    """
    return reader_creator(re.compile(r"test/pos/.*\.txt$"),
                          re.compile(r"test/neg/.*\.txt$"), word_idx)


if __name__ == "__main__":
    paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5)
    paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5)
    runtime_main(TestDistTextClassification2x2)
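
The two regexes select positive and negative reviews by their path inside the IMDB archive. A small check of what they match (the file paths are illustrative):

import re

pos_pattern = re.compile(r"train/pos/.*\.txt$")
neg_pattern = re.compile(r"train/neg/.*\.txt$")

print(bool(pos_pattern.match("train/pos/123_7.txt")))  # True
print(bool(pos_pattern.match("train/neg/456_2.txt")))  # False
print(bool(neg_pattern.match("train/neg/456_2.txt")))  # True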
Code Example #14
                          vocab_size=vocab_size,
                          num_steps=num_steps,
                          init_scale=init_scale,
                          is_sparse=True)

        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)

        optimizer = fluid.optimizer.SGD(learning_rate=0.001,
                                        parameter_list=model.parameters())

        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = to_variable(x_data)
        y = to_variable(y_data)

        dy_loss = model(x, y)

        return dy_loss


if __name__ == "__main__":
    runtime_main(TestSparseEmbedding)
Code Example #15
        # Evaluator
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(input=predict,
                                          label=label,
                                          total=batch_size_tensor)

        inference_program = fluid.default_main_program().clone()
        # Optimization
        # TODO(typhoonzero): fix distributed adam optimizer
        # opt = fluid.optimizer.AdamOptimizer(
        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
        if not use_dgc:
            opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
        else:
            opt = fluid.optimizer.DGCMomentumOptimizer(learning_rate=self.lr,
                                                       momentum=0.9,
                                                       rampup_begin_step=0)

        # Reader
        train_reader = paddle.batch(paddle.dataset.mnist.test(),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)
        opt.minimize(avg_cost)
        return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestDistMnist2x2)
Code Example #16
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                print_to_err(
                    type(self).__name__,
                    "loss at step %d: %f" % (step_id, loss.numpy()))
                out_losses.append(loss.numpy())
                model.clear_gradients()
        print_to_out(out_losses)
        return out_losses


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSyncControlFlow)
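
The loop above accumulates gradients (under model.no_sync() when the update method is nccl2) on steps where step_id % 3 != 0, and only synchronizes, applies the optimizer, and clears gradients when step_id % 3 == 0. A tiny sketch of which steps actually update parameters under that schedule (RUN_STEP is an assumed value here):

RUN_STEP = 10  # illustrative; the real constant comes from the test harness

for step_id in range(RUN_STEP):
    if step_id % 3 != 0:
        action = "backward only (no_sync, gradients accumulate locally)"
    else:
        action = "backward + sync + opt.minimize + clear_gradients"
    print(step_id, action)
# parameters are updated on steps 0, 3, 6 and 9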
Code Example #17
            if step_id == RUN_STEP:
                break
            if step_id % 3 != 0:
                if args.update_method == "nccl2":
                    with model.no_sync():
                        loss = self.run_one_loop(model, opt, data)
                        loss.backward()
                else:
                    loss = self.run_one_loop(model, opt, data)
                    loss.backward()
            else:
                loss = self.run_one_loop(model, opt, data)
                loss.backward()
                opt.minimize(loss)
                out_losses.append(loss.numpy())
                model.clear_gradients()
        return out_losses


def fake_sample_reader():
    def __reader__():
        for i in range(batch_num):
            x_data = np.random.random_sample((10, )).astype('float32')
            yield x_data

    return __reader__


if __name__ == "__main__":
    runtime_main(TestNoSync)
Code Example #18
            param_attr=fluid.ParamAttr(
                name="wide_embedding",
                initializer=fluid.initializer.Constant(value=0.01)),
            is_sparse=IS_SPARSE)
        lr_pool = fluid.layers.sequence_pool(input=lr_embbding,
                                             pool_type="sum")

        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)

        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
        acc = fluid.layers.accuracy(input=predict, label=label)
        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
                                                              label=label)
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)

        inference_program = paddle.fluid.default_main_program().clone()

        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
        sgd_optimizer.minimize(avg_cost)

        dataset = dist_ctr_reader.Dataset()
        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)

        return inference_program, avg_cost, train_reader, test_reader, None, predict


if __name__ == "__main__":
    runtime_main(TestDistCTR2x2)
Code Example #19
                          vocab_size=vocab_size,
                          num_steps=num_steps,
                          init_scale=init_scale,
                          is_sparse=False)

        train_reader = paddle.batch(fake_sample_reader(),
                                    batch_size=batch_size,
                                    drop_last=True)

        optimizer = paddle.optimizer.SGD(learning_rate=0.001,
                                         parameters=model.parameters())

        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        x_data = np.array([x[0].reshape(3) for x in batch]).astype('int64')
        y_data = np.array([x[1].reshape(3) for x in batch]).astype('int64')
        x_data = x_data.reshape((-1, num_steps, 1))
        y_data = y_data.reshape((-1, 1))

        x = paddle.to_tensor(x_data)
        y = paddle.to_tensor(y_data)

        dy_loss = model(x, y)

        return dy_loss["loss"]


if __name__ == "__main__":
    runtime_main(TestSparseEmbeddingUnusedVars)
Code Example #20
        batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(input=predict,
                                          label=label,
                                          total=batch_size_tensor)

        test_program = fluid.default_main_program().clone(for_test=True)

        # Reader
        train_reader = paddle.batch(paddle.dataset.mnist.test(),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)

        optimizer = paddle.fluid.optimizer.Adam(0.01)
        if single_device:
            optimizer.minimize(avg_cost)
        else:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.without_graph_optimization = True
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=strategy)
            optimizer.minimize(avg_cost)

        return test_program, avg_cost, train_reader, test_reader, batch_acc, predict


if __name__ == "__main__":
    runtime_main(TestFleetMetaOptimizerPrecision)
Code Example #21
            strategy.tensor_parallel = True
            strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}

        rank = fleet.worker_index() if dist_strategy else None
        avg_cost = create_model(data_in, rank)
        opt = fluid.optimizer.SGD(0.1)

        if dist_strategy:
            dist_opt = fleet.distributed_optimizer(optimizer=opt,
                                                   strategy=strategy)
            dist_opt.minimize(avg_cost)
        else:
            opt.minimize(avg_cost)

        def gen_data():
            np.random.seed(2021)
            while True:
                data = [np.random.random([IN_SIZE]).astype(DTYPE)]
                yield data

        train_reader = paddle.batch(gen_data, batch_size=batch_size)

        if dist_strategy:
            return None, avg_cost, train_reader, None, None, None, data_loader
        else:
            return None, avg_cost, train_reader, None, None, None


if __name__ == "__main__":
    runtime_main(TestModelParallel)
Code Example #22
                            ModelHyperParams.weight_sharing,
                            TrainTaskConfig.label_smooth_eps,
                            is_sparse=True)
        train_reader = paddle.batch(fake_data_reader(),
                                    TrainTaskConfig.batch_size)
        if naive_optimize:
            optimizer = fluid.optimizer.SGD(learning_rate=0.001,
                                            parameter_list=model.parameters())
        else:
            optimizer = fluid.optimizer.Adam(learning_rate=NoamDecay(
                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps,
                TrainTaskConfig.learning_rate),
                                             beta1=TrainTaskConfig.beta1,
                                             beta2=TrainTaskConfig.beta2,
                                             epsilon=TrainTaskConfig.eps,
                                             parameter_list=model.parameters())

        return model, train_reader, optimizer

    def run_one_loop(self, model, optimizer, batch):
        enc_inputs, dec_inputs, label, weights = np_to_variable(batch)

        dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = model(
            enc_inputs, dec_inputs, label, weights)

        return dy_avg_cost


if __name__ == "__main__":
    runtime_main(TestTransformer)
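
NoamDecay warms the learning rate up linearly and then decays it with the inverse square root of the step count. A hedged worked example of the schedule as defined in the Transformer paper (treating Paddle's NoamDecay as matching this formula exactly is an assumption; the constants are illustrative):

# lr(step) = base_lr * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)
d_model, warmup_steps, base_lr = 512, 4000, 2.0

def noam_lr(step):
    return base_lr * d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (1, 1000, 4000, 16000):
    print(step, noam_lr(step))
# rises linearly during warmup, peaks near step 4000, then decays as 1/sqrt(step)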
Code Example #23
                sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))

        elif save_mode == "DIST":
            skip_steps = int(os.getenv("SKIP_STEPS"))
            loss = None
            if need_save:
                for idx in six.moves.xrange(8):
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(get_data()))
                    if need_save and model_dir and idx == skip_steps and args.trainer_id == 0:
                        io.save_persistables(startup_exe, model_dir,
                                             trainer_prog)
            else:
                for idx in six.moves.xrange(8):
                    data = get_data()
                    if idx <= skip_steps:
                        continue
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(data))
            if six.PY2:
                print(pickle.dumps(loss.tolist()))
            else:
                sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
        else:
            raise Exception("save_mode must be LOCAL or DIST")


if __name__ == "__main__":
    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
    runtime_main(TestDistSaveLoad2x2)
Code Example #24
                k_steps=strategy.gradient_merge_configs["k_steps"],
                avg=strategy.gradient_merge_configs["avg"])
            world_size = 1
        else:
            optimizer = fleet.distributed_optimizer(optimizer)
            world_size = fleet.world_size()
        optimizer.minimize(cost)
        if world_size > 1:
            assert paddle.static.default_main_program().num_blocks == 2
            gm_block = paddle.static.default_main_program().block(1)
            start_allreduce_idx = None
            for i, op in enumerate(gm_block.ops):
                if op.type == "c_allreduce_sum":
                    start_allreduce_idx = i
                    break
            # the magic number 1 below means skip the c_sync_calc_stream op
            if avg:
                assert start_allreduce_idx > 1
            else:
                assert start_allreduce_idx == 1

        train_reader = paddle.batch(paddle.dataset.mnist.test(),
                                    batch_size=batch_size)
        test_reader = paddle.batch(paddle.dataset.mnist.test(),
                                   batch_size=batch_size)
        return test_program, cost, train_reader, test_reader, acc, predict


if __name__ == "__main__":
    runtime_main(TestDistMnistGradientMergeRawOptimizer)
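
Gradient merge accumulates gradients over k_steps micro-batches before a single allreduce and parameter update, and divides the merged gradient by k_steps when avg is enabled. A pure-Python sketch of that accumulation pattern (the gradient values and variable names are illustrative, not Paddle API):

k_steps, avg = 4, True  # mirrors gradient_merge_configs in the snippet

grad_buffer = 0.0
micro_grads = [0.2, 0.4, 0.1, 0.3, 0.5, 0.1, 0.2, 0.2]
for micro_step, grad in enumerate(micro_grads, start=1):
    grad_buffer += grad          # accumulate instead of updating immediately
    if micro_step % k_steps == 0:
        merged = grad_buffer / k_steps if avg else grad_buffer
        print("apply update with merged grad:", merged)  # allreduce + step here
        grad_buffer = 0.0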