def test_batchnorm_batch_parallel():
    num_classes = 1001
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    rank_size = 0

    predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)

    dataset = DatasetLenet(predict, label, 2)
    net = batchnorm_net(num_classes)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
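    # Shard both loss inputs along the batch dimension across dev_num devices
    # (dev_num is assumed to be defined at module scope in the original test file).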
    loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
Example No. 2
    def test_gpu_profiler(self):
        context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
        profiler = Profiler(output_path='data')
        profiler_name = os.listdir(os.path.join(os.getcwd(), 'data'))[0]
        self.profiler_path = os.path.join(os.getcwd(),
                                          f'data/{profiler_name}/')
        ds_train = create_dataset(os.path.join(self.mnist_path, "train"))
        if ds_train.get_dataset_size() == 0:
            raise ValueError(
                "Please check dataset size > 0 and batch_size <= dataset size")

        lenet = LeNet5()
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        optim = Momentum(lenet.trainable_params(),
                         learning_rate=0.1,
                         momentum=0.9)
        model = Model(lenet,
                      loss_fn=loss,
                      optimizer=optim,
                      metrics={'acc': Accuracy()})

        model.train(1, ds_train, dataset_sink_mode=True)
        profiler.analyse()

        self._check_gpu_profiling_file()
Example No. 3
def test_dp_monitor_gpu():
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    batch_size = 16
    batches = 128
    epochs = 1
    rdp = PrivacyMonitorFactory.create(policy='rdp',
                                       num_samples=60000,
                                       batch_size=batch_size,
                                       initial_noise_multiplier=0.4,
                                       noise_decay_rate=6e-5)
    suggest_epoch = rdp.max_epoch_suggest()
    LOGGER.info(TAG, 'The recommended maximum number of training epochs is: %s',
                suggest_epoch)
    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)

    model = Model(network, net_loss, net_opt)

    LOGGER.info(TAG, "============== Starting Training ==============")
    ds1 = ds.GeneratorDataset(dataset_generator(batch_size, batches),
                              ["data", "label"])
    ds1.set_dataset_size(batch_size * batches)
    model.train(epochs, ds1, callbacks=[rdp], dataset_sink_mode=False)
Example No. 4
def mnist_train(epoch_size, batch_size, lr, momentum):
    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size,
                                repeat_size=1)

    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                                 keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory="./trained_ckpt_file/",
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size,
                ds,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    ckpt_file_name = "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt"
    param_dict = load_checkpoint(ckpt_file_name)
    load_param_into_net(network, param_dict)
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"),
                                     batch_size=batch_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
Example No. 5
def train_common(net):
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4
    context.reset_auto_parallel_context()
    auto_parallel_context().set_enable_all_reduce_fusion(
        enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=device_num,
        parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)

    predict = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, opt)

    model.train(epoch_size, dataset, dataset_sink_mode=False)
    allreduce_fusion_dict = _executor._get_allreduce_fusion(
        model._train_network)

    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
Example No. 6
    def test_summary_ops(self):
        """Test summary operators."""
        ds_train = create_mnist_dataset('train', num_samples=1, batch_size=1)
        ds_train_iter = ds_train.create_dict_iterator()
        expected_data = next(ds_train_iter)['image'].asnumpy()

        net = LeNet5()
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        optim = Momentum(net.trainable_params(),
                         learning_rate=0.1,
                         momentum=0.9)
        model = Model(net,
                      loss_fn=loss,
                      optimizer=optim,
                      metrics={'loss': Loss()})
        model.train(1, ds_train, dataset_sink_mode=False)

        summary_data = _get_summary_tensor_data()
        image_data = summary_data['x[:Image]'].asnumpy()
        tensor_data = summary_data['x[:Tensor]'].asnumpy()
        x_fc3 = summary_data['x_fc3[:Scalar]'].asnumpy()

        assert np.allclose(expected_data, image_data)
        assert np.allclose(expected_data, tensor_data)
        assert not np.allclose(0, x_fc3)
Example No. 7
def train_lenet_quant():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    ds_train = create_dataset(os.path.join(data_path, "train"), cfg.batch_size, 1)
    step_size = ds_train.get_dataset_size()

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)

    # load parameters from the non-quantized checkpoint into the fusion network
    param_dict = load_checkpoint(ckpt_path)
    load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=900, bn_fold=False, per_channel=[True, False],
                                          symmetric=[False, False])

    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # callback and monitor
    config_ckpt = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size,
                                   keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant", config=config_ckpt)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(cfg['epoch_size'], ds_train, callbacks=[ckpt_callback, LossMonitor()],
                dataset_sink_mode=True)
    print("============== End Training ==============")
Example No. 8
def test_trains():
    init()
    lr = 0.1
    momentum = 0.9
    max_epoch = 20
    device_number = 32
    batch_size_per_device = 128
    input_channels = 256
    out_channels = 512

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_number)
    predict = Tensor(np.ones([batch_size_per_device, input_channels]), dtype=ms.float32)
    dataset = Dataset(predict, 4)

    network = fc_with_initialize(input_channels, out_channels)
    network.set_train()

    criterion = get_loss(batch_size_per_device * device_number)

    train_network = BuildTrainNetwork(network, criterion)
    train_network.set_train()
    opt = Momentum(train_network.trainable_params(), lr, momentum)
    train_net = TrainOneStepCell(train_network, opt).set_train()

    model = Model(train_net)
    model.train(max_epoch, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
Example No. 9
def test_callbacks_non_sink_mismatch_size():
    logger.info("test_callbacks_non_sink_mismatch_size")
    default_timeout = ds.config.get_callback_timeout()
    ds.config.set_callback_timeout(1)

    events = []
    my_cb1 = MyWaitedCallback(events, 2)
    my_cb2 = MyMSCallback(events)
    arr = [1, 2, 3, 4]
    data = ds.NumpySlicesDataset((arr, arr),
                                 column_names=["c1", "c2"],
                                 shuffle=False)
    data = data.map(operations=(lambda x: x), callbacks=my_cb1)
    data = data.batch(3)
    net = Net()
    model = Model(net)
    with pytest.raises(Exception) as err:
        model.train(2,
                    data,
                    dataset_sink_mode=False,
                    callbacks=[my_cb2, my_cb1])
    assert "RuntimeError: ds_step_begin timed out after 1 second(s)" in str(
        err.value)

    ds.config.set_callback_timeout(default_timeout)
Example No. 10
def main(data_path,
         device_target='Ascend',
         summary_dir='./summary_dir',
         learning_rate=0.01):
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)

    momentum = 0.9
    epoch_size = 1
    batch_size = 32

    network = LeNet5()
    network.set_train()
    net_loss = CrossEntropyLoss()
    net_opt = nn.Momentum(network.trainable_params(), learning_rate, momentum)
    model = Model(network, net_loss, net_opt)

    # Init SummaryCollector callback to record summary data in model.train or model.eval
    summary_collector = SummaryCollector(summary_dir=summary_dir,
                                         collect_freq=10)

    ds = create_dataset(os.path.join(data_path, "train"),
                        batch_size=batch_size)

    print("============== Starting Training ==============")
    model.train(epoch_size,
                ds,
                callbacks=[summary_collector],
                dataset_sink_mode=False)
    print("============== Train End =====================")
Example No. 11
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    dataset_sink = context.get_context('device_target') == 'Ascend'
    repeat = num_epochs if dataset_sink else 1
    ds_train = create_dataset(data_dir, repeat=repeat)
    ds_eval = create_dataset(data_dir, training=False)
    steps_per_epoch = ds_train.get_dataset_size()

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                 sparse=True,
                                                 reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix=ckpt_name,
                              directory='ckpt',
                              config=ckpt_cfg)
    loss_cb = LossMonitor(steps_per_epoch)

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    model.train(num_epochs,
                ds_train,
                callbacks=[ckpt_cb, loss_cb],
                dataset_sink_mode=dataset_sink)
    metrics = model.eval(ds_eval, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)
Example No. 12
def loss_scale_manager_common(strategy1):
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
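    # Dynamic loss scaling: initial scale 32, scale factor 2, scale window of 2000 steps.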
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # If no GE backend exists, outputs = self._train_network(*next_element) returns the input
    # tensors, so the train call below is expected to raise a TypeError.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
Example No. 13
def calibration():
    """Do calibration to generate the scale/offset record file."""
    dataset = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target)
    dataset = dataset.take(1)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')

    network = resnet(10)
    network.set_train(False)
    param_dict = load_checkpoint(ARGS_OPT.pre_trained)
    load_param_into_net(network, param_dict)
    input_data = np.random.uniform(0.0, 1.0, size=[32, 3, 224,
                                                   224]).astype(np.float32)
    config_file = os.path.join(CUR_DIR, './config.json')
    amct.create_quant_config(config_file, network, input_data)
    calibration_network = amct.quantize_model(config_file, network, input_data)

    model = Model(calibration_network,
                  loss_fn=loss,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})
    _ = model.eval(dataset)
    amct.save_model('./resnet50_quant_calibration', calibration_network,
                    input_data)
Example No. 14
    def mix_parallel_matmul_trains(self):
        parallel_callback = ModelCallback()
        matmul_stra = ((device_num, 1), (1, 1))
        reduce_max_stra = ((1, device_num),)
        sub_stra = ((device_num, 1), (device_num, 1))
        exp_stra = ((1, device_num),)
        reduce_sum_stra = ((1, device_num),)
        div_stra = ((1, device_num), (1, 1))
        log_stra = ((1, device_num),)
        mul_stra = ((1, device_num), (1, device_num))
        sum_cross_entropy_stra = ((1, device_num),)
        mul2_stra = ((), (device_num,))
        reduce_mean_stra = ((device_num,),)
        onehot_stra = ((1, device_num), (), ())
        loss_stra_list = [
            exp_stra, reduce_sum_stra, onehot_stra, div_stra, log_stra,
            sum_cross_entropy_stra, mul_stra, mul2_stra, reduce_mean_stra,
            reduce_max_stra, sub_stra
        ]
        context.set_auto_parallel_context(parallel_mode="auto_parallel")
        net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
        optimizer = Momentum(net.trainable_params(),
                             learning_rate=0.1,
                             momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_part, self.label_part)
        model.train(epoch_size,
                    dataset,
                    callbacks=parallel_callback,
                    dataset_sink_mode=False)
        loss_value = np.array(parallel_callback.loss_list)
        return loss_value
Example No. 15
def test_row_tensor_model_train():
    class Net(nn.Cell):
        def __init__(self, in_features, out_features):
            super(Net, self).__init__()
            self.weight = Parameter(Tensor(
                np.ones([out_features, in_features]).astype(np.float32)),
                                    name="weight")
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.flag = True

        def construct(self, inputs, label):
            x = self.add(inputs, self.weight)
            if self.flag:
                x = self.cast(x, mstype.float32)
            return x

    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))
    dataset = MindDataSet(dataset_types, dataset_shapes)
    net = Net(16, 16)
    net.set_train()

    optimizer = Momentum(net.trainable_params(),
                         learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    model.train(2, dataset, dataset_sink_mode=False)
Example No. 16
def net_trains(criterion, rank):
    init()
    lr = 0.1
    momentum = 0.9
    max_epoch = 20
    input_channels = 256
    out_channels = 512
    context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_number,
                                      global_rank=rank)
    predict = Tensor(np.ones([batch_size_per_device, input_channels]), dtype=ms.float32)
    dataset = Dataset(predict, 4)

    network = fc_with_initialize(input_channels, out_channels)
    network.set_train()

    train_network = BuildTrainNetwork(network, criterion)
    train_network.set_train()
    opt = Momentum(train_network.trainable_params(), lr, momentum)
    train_net = TrainOneStepCell(train_network, opt).set_train()

    model = Model(train_net)
    model.train(max_epoch, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
Example No. 17
def eval_quant():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ds_eval = create_dataset(os.path.join(data_path, "test"), cfg.batch_size,
                             1)
    ckpt_path = './ckpt_lenet_quant-10_937.ckpt'
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
    quantizer = QuantizationAwareTraining(quant_delay=0,
                                          bn_fold=False,
                                          freeze_bn=10000,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)

    # define loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(ckpt_path)
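    # load_param_into_net returns the list of parameters that were not loaded;
    # a non-empty result means the checkpoint does not match the network.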
    not_load_param = load_param_into_net(network, param_dict)
    if not_load_param:
        raise ValueError("Failed to load parameters into the net!")

    print("============== Starting Testing ==============")
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
    assert acc['Accuracy'] > 0.98
Example No. 18
def test_resnet_model_parallel():
    num_classes = 1024
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    predict = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)

    dataset = DatasetLenet(predict, label, 2)
    net = resnet_model_parallel_net(num_classes)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss.softmax_cross_entropy.shard(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)

    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
Example No. 19
def test_callbacks_non_sink_batch_size2():
    logger.info("test_callbacks_non_sink_batch_size2")

    events = []
    my_cb1 = MyWaitedCallback(events, 2)
    my_cb2 = MyMSCallback(events)
    arr = [1, 2, 3, 4]
    data = ds.NumpySlicesDataset((arr, arr),
                                 column_names=["c1", "c2"],
                                 shuffle=False)
    data = data.map(operations=(lambda x: x), callbacks=my_cb1)
    data = data.batch(2)
    net = Net()
    model = Model(net)

    model.train(2, data, dataset_sink_mode=False, callbacks=[my_cb2, my_cb1])

    expected_synced_events = [
        'ms_step_end_1_1', 'ds_step_begin_1_3', 'ms_step_end_1_2',
        'ms_epoch_end_1_2', 'ds_epoch_begin_2_4', 'ds_step_begin_2_5',
        'ms_step_end_2_3', 'ds_step_begin_2_7', 'ms_step_end_2_4',
        'ms_epoch_end_2_4'
    ]

    assert events == expected_synced_events
Example No. 20
def test_auto_parallel_arithmetic_model():
    class NetOneHot(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.one_hot = P.OneHot().shard(((1, 8), (), ()))
            self.on_value = Tensor(1.0, ms.float32)
            self.off_value = Tensor(0.0, ms.float32)
            self.matmul2 = P.MatMul()
            self.w = Parameter(Tensor(np.zeros([32, 64]).astype(np.float32)),
                               "weight",
                               requires_grad=True)

        def construct(self, x, b):
            out = self.matmul(x, self.w)
            out1 = self.one_hot(b, 64, self.on_value, self.off_value)
            out2 = self.matmul2(out, out1)
            return out2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8,
                                      global_rank=0,
                                      parallel_mode=ParallelMode.AUTO_PARALLEL)
    net = NetOneHot()

    x = Tensor(np.ones([8, 32]), dtype=ms.float32)
    b = Tensor(np.ones([8]), dtype=ms.int32)
    dataset = Dataset(x, b, 2)

    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=opt)

    model.train(2, dataset, dataset_sink_mode=False)
Example No. 21
def train():
    context.set_context(
        mode=context.GRAPH_MODE,
        device_target="Ascend",
        #save_graphs=True,
        #save_graphs_path="/home/work/user-job-dir/EAST/",
        #enable_reduce_precision=False,
        #device_id=5
    )

    epoch = 600

    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    #my_dataset.data_to_mindrecord_byte_image(train_img_path, train_gt_path, mindrecord_dir='/cache', prefix='icdar_train.mindrecord',file_num=1)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file=['icdar_train.mindrecord0','icdar_train.mindrecord1','icdar_train.mindrecord2','icdar_train.mindrecord3'], batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=8, length=512, scale=0.25)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file='/cache/icdar_train.mindrecord', batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=24, length=512, scale=0.25)
    #dataset = my_dataset.create_demo_dataset(batch_size=21, repeat_num=2)
    #train_img_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/img')
    #train_gt_path  = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path,
                                                   train_gt_path,
                                                   batch_size=14,
                                                   repeat_num=1,
                                                   is_training=True,
                                                   num_parallel_workers=24)
    #dataset = datasetV3.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14, repeat_num=1, is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()

    print("Create dataset done! dataset_size:", dataset_size)

    #east = EAST.EAST()
    net = EAST_VGG.EAST()

    #ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * 20)
    #ckpoint_cb = ModelCheckpoint(prefix='EAST', directory='/cache', config=ckpt_config)

    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
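    # piecewise_constant_lr builds a per-step list: 1e-3 for steps [0, 100), then 1e-4 for steps [100, 300)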
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    callback = [TimeMonitor(data_size=dataset_size),
                LossMonitor()]  #, ckpoint_cb]

    model = Model(net)
    dataset_sink_mode = False
    print("start training")
    model.train(epoch,
                dataset,
                callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
Example No. 22
    def single_matmul_trains(self):
        single_callback = ModelCallback()
        net = MatmulNet()
        optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_full, self.label_full)
        model.train(epoch_size, dataset, callbacks=single_callback, dataset_sink_mode=False)
        loss_value = np.array(single_callback.loss_list)
        return loss_value
Example No. 23
def compile_net(net):
    context.set_context(save_graphs=False)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    dataset = Dataset(_x, _b)
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, optimizer=opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
Example No. 24
def compile_net(net):
    context.set_context(save_graphs=True)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    dataset = Dataset(_x, _b)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, loss, optimizer=opt, amp_level="O2")
    model.train(epoch_size, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
Example No. 25
    def data_parallel_matmul_trains(self):
        parallel_callback = ModelCallback()
        context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
        net = MatmulNet()
        optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
        model = Model(net, optimizer=optimizer)
        epoch_size = 6
        dataset = Dataset(self.input_part, self.label_part)
        model.train(epoch_size, dataset, callbacks=parallel_callback, dataset_sink_mode=False)
        loss_value = np.array(parallel_callback.loss_list)
        return loss_value
Example No. 26
def test_compile_f16_model_train():
    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))

    dataset = MindDataSet(dataset_types, dataset_shapes)
    net = NetFP16(16, 16)
    net.set_train()

    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics=None)
    model.train(2, dataset, dataset_sink_mode=False)
Example No. 27
def main():
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Define model parameters
    z_dim = 40
    x_dim = 32 * 32

    # create the network
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # define loss
    # learning rate setting
    lr = 0.001
    net_loss = ReduceMeanLoss()

    # define the optimizer
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)

    ds_train = create_dataset(os.path.join(mnist_path, "train"), batch_size,
                              repeat_size)
    model.train(epoch_size,
                ds_train,
                callbacks=[LossMonitor()],
                dataset_sink_mode=False)

    print(network.trainable_params()[0])

    iterator = ds_train.create_tuple_iterator()
    for item in iterator:
        batch_x = item[0].reshape(32, 32 * 32)
        break
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    sample = sample.asnumpy()
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample, 'result/reconstruct_x.png')

    for i in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        sample = sample.asnumpy()
        samples = sample if i == 0 else np.concatenate([samples, sample],
                                                       axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
Example No. 28
def test_compile_f16_model_train_fixed():
    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))

    dataset = MindDataSet(dataset_types, dataset_shapes)
    net = NetFP16(16, 16)
    net.set_train()
    scale_manager = FixedLossScaleManager()
    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics=None, loss_scale_manager=scale_manager)
    model.train(2, dataset)
Example No. 29
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the loss over the batch
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
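    # fixed loss scale of loss_scale_num; drop_overflow_update=False so parameter updates are not skipped on overflow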
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" enables O2 mixed precision: the network is trained in float16 with loss scaling,
    # and keep_batchnorm_fp32=False keeps the batchnorm layers in float16 as well
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start training, total epochs: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
Example No. 30
    def train_mindspore_impl(self, indices, epoch, batch_size, use_parallel=True):
        ds = FakeData(size=8, batch_size=batch_size, num_class=8, image_size=(), use_parallel=use_parallel)
        ds.set_image_data_type(np.int32)
        net = self
        net.set_train()
        loss = nn.SoftmaxCrossEntropyWithLogits()
        optimizer = nn.Adam(net.trainable_params())
        optimizer.target = "CPU"
        model = Model(net, loss, optimizer)
        for _ in range(epoch):
            model.train(1, ds, dataset_sink_mode=False)
        output = net(indices)
        return output