def test_batchnorm_batch_parallel():
    """Smoke-test training a batchnorm network under semi-auto-parallel.

    NOTE(review): relies on module-level names (`dev_num`, `DatasetLenet`,
    `batchnorm_net`, ...) defined elsewhere in the file.
    """
    num_classes = 1001
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    # (removed unused local `rank_size = 0` — it was never read)
    predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label, 2)
    net = batchnorm_net(num_classes)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    # Shard the softmax-cross-entropy inputs along the batch dimension.
    loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def test_gpu_profiler(self):
    """Run one LeNet training epoch under the GPU profiler and check its output."""
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    prof = Profiler(output_path='data')
    profiler_name = os.listdir(os.path.join(os.getcwd(), 'data'))[0]
    self.profiler_path = os.path.join(os.getcwd(), f'data/{profiler_name}/')
    train_ds = create_dataset(os.path.join(self.mnist_path, "train"))
    if train_ds.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")
    network = LeNet5()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=loss_fn, optimizer=opt,
                  metrics={'acc': Accuracy()})
    model.train(1, train_ds, dataset_sink_mode=True)
    prof.analyse()
    self._check_gpu_profiling_file()
def test_dp_monitor_gpu():
    """Exercise the RDP privacy-monitor callback during LeNet training on GPU."""
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    batch_size = 16
    batches = 128
    epochs = 1
    rdp = PrivacyMonitorFactory.create(policy='rdp', num_samples=60000,
                                       batch_size=batch_size,
                                       initial_noise_multiplier=0.4,
                                       noise_decay_rate=6e-5)
    suggest_epoch = rdp.max_epoch_suggest()
    LOGGER.info(TAG, 'The recommended maximum training epochs is: %s',
                suggest_epoch)
    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt)
    LOGGER.info(TAG, "============== Starting Training ==============")
    monitor_ds = ds.GeneratorDataset(dataset_generator(batch_size, batches),
                                     ["data", "label"])
    monitor_ds.set_dataset_size(batch_size * batches)
    model.train(epochs, monitor_ds, callbacks=[rdp], dataset_sink_mode=False)
def mnist_train(epoch_size, batch_size, lr, momentum):
    """Train LeNet5 on MNIST, checkpoint it, then evaluate test accuracy."""
    mnist_path = "./MNIST_unzip/"
    train_ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                      batch_size=batch_size, repeat_size=1)
    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=1875,
                                keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                              directory="./trained_ckpt_file/",
                              config=ckpt_cfg)
    model = Model(network, net_loss, net_opt,
                  metrics={"Accuracy": Accuracy()})
    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size, train_ds, callbacks=[ckpt_cb, LossMonitor()],
                dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Starting Testing ==============")
    # Reload the final checkpoint before evaluation.
    param_dict = load_checkpoint(
        "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt")
    load_param_into_net(network, param_dict)
    eval_ds = generate_mnist_dataset(os.path.join(mnist_path, "test"),
                                     batch_size=batch_size)
    acc = model.eval(eval_ds, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
def train_common(net):
    """Train `net` briefly under semi-auto-parallel with all-reduce fusion
    enabled and return the resulting fusion dict."""
    epoch_size = 2
    device_num = 4
    context.reset_auto_parallel_context()
    auto_parallel_context().set_enable_all_reduce_fusion(
        enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=device_num,
        parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)
    features = Tensor(np.ones([32, 128]), dtype=ms.float32)
    labels = Tensor(np.ones([32]), dtype=ms.int32)
    train_set = Dataset(features, labels, 2)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_set, dataset_sink_mode=False)
    allreduce_fusion_dict = _executor._get_allreduce_fusion(
        model._train_network)
    print(allreduce_fusion_dict)
    return allreduce_fusion_dict
def test_summary_ops(self):
    """Test summary operators."""
    train_ds = create_mnist_dataset('train', num_samples=1, batch_size=1)
    expected_data = next(train_ds.create_dict_iterator())['image'].asnumpy()
    network = LeNet5()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=loss_fn, optimizer=opt,
                  metrics={'loss': Loss()})
    model.train(1, train_ds, dataset_sink_mode=False)
    summary_data = _get_summary_tensor_data()
    # The input image should appear both as an Image and a Tensor summary,
    # and the fc3 scalar summary should be non-zero after training.
    assert np.allclose(expected_data, summary_data['x[:Image]'].asnumpy())
    assert np.allclose(expected_data, summary_data['x[:Tensor]'].asnumpy())
    assert not np.allclose(0, summary_data['x_fc3[:Scalar]'].asnumpy())
def train_lenet_quant():
    """Quantization-aware training of LeNet5 from a non-quantized checkpoint."""
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    ds_train = create_dataset(os.path.join(data_path, "train"),
                              cfg.batch_size, 1)
    step_size = ds_train.get_dataset_size()
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # load the non-quantized checkpoint into the fusion network
    param_dict = load_checkpoint(ckpt_path)
    load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=900,
                                          bn_fold=False,
                                          per_channel=[True, False],
                                          symmetric=[False, False])
    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    # call back and monitor
    config_ckpt = CheckpointConfig(
        save_checkpoint_steps=cfg.epoch_size * step_size,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant",
                                    config=config_ckpt)
    # define model
    model = Model(network, net_loss, net_opt,
                  metrics={"Accuracy": Accuracy()})
    print("============== Starting Training ==============")
    # Use attribute access (cfg.epoch_size) consistently with the rest of
    # this function — the original mixed in dict-style cfg['epoch_size'].
    model.train(cfg.epoch_size, ds_train,
                callbacks=[ckpt_callback, LossMonitor()],
                dataset_sink_mode=True)
    print("============== End Training ==============")
def test_trains():
    """Semi-auto-parallel FC-network training smoke test over 32 devices."""
    init()
    device_number = 32
    batch_size_per_device = 128
    input_channels = 256
    out_channels = 512
    max_epoch = 20
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=device_number)
    features = Tensor(np.ones([batch_size_per_device, input_channels]),
                      dtype=ms.float32)
    dataset = Dataset(features, 4)
    network = fc_with_initialize(input_channels, out_channels)
    network.set_train()
    criterion = get_loss(batch_size_per_device * device_number)
    train_network = BuildTrainNetwork(network, criterion)
    train_network.set_train()
    opt = Momentum(train_network.trainable_params(), 0.1, 0.9)
    train_net = TrainOneStepCell(train_network, opt).set_train()
    model = Model(train_net)
    model.train(max_epoch, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
def test_callbacks_non_sink_mismatch_size():
    """Dataset callback must time out when step counts mismatch (non-sink mode)."""
    logger.info("test_callbacks_non_sink_mismatch_size")
    default_timeout = ds.config.get_callback_timeout()
    ds.config.set_callback_timeout(1)
    events = []
    waited_cb = MyWaitedCallback(events, 2)
    ms_cb = MyMSCallback(events)
    arr = [1, 2, 3, 4]
    data = ds.NumpySlicesDataset((arr, arr), column_names=["c1", "c2"],
                                 shuffle=False)
    data = data.map(operations=(lambda x: x), callbacks=waited_cb)
    data = data.batch(3)
    model = Model(Net())
    with pytest.raises(Exception) as err:
        model.train(2, data, dataset_sink_mode=False,
                    callbacks=[ms_cb, waited_cb])
    assert "RuntimeError: ds_step_begin timed out after 1 second(s)" in str(
        err.value)
    # Restore the global timeout so later tests are unaffected.
    ds.config.set_callback_timeout(default_timeout)
def main(data_path, device_target='Ascend', summary_dir='./summary_dir',
         learning_rate=0.01):
    """Train LeNet5 for one epoch while recording summaries via SummaryCollector."""
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    network = LeNet5()
    network.set_train()
    net_loss = CrossEntropyLoss()
    net_opt = nn.Momentum(network.trainable_params(), learning_rate, 0.9)
    model = Model(network, net_loss, net_opt)
    # Init SummaryCollector callback to record summary data in model.train or model.eval
    summary_collector = SummaryCollector(summary_dir=summary_dir,
                                         collect_freq=10)
    train_ds = create_dataset(os.path.join(data_path, "train"), batch_size=32)
    print("============== Starting Training ==============")
    model.train(1, train_ds, callbacks=[summary_collector],
                dataset_sink_mode=False)
    print("============== Train End =====================")
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    """Train LeNet5 on `data_dir`, checkpoint each epoch, then evaluate."""
    # Sink mode (and dataset repeat) only applies on Ascend devices.
    dataset_sink = context.get_context('device_target') == 'Ascend'
    repeat = num_epochs if dataset_sink else 1
    train_ds = create_dataset(data_dir, repeat=repeat)
    eval_ds = create_dataset(data_dir, training=False)
    steps_per_epoch = train_ds.get_dataset_size()
    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                 reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)
    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix=ckpt_name, directory='ckpt',
                              config=ckpt_cfg)
    loss_cb = LossMonitor(steps_per_epoch)
    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    model.train(num_epochs, train_ds, callbacks=[ckpt_cb, loss_cb],
                dataset_sink_mode=dataset_sink)
    metrics = model.eval(eval_ds, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)
def loss_scale_manager_common(strategy1):
    """Train under auto-parallel with DynamicLossScaleManager; a TypeError
    from model.train is the expected outcome."""
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)
    features = Tensor(np.ones([32, 128]), dtype=ms.float32)
    labels = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(features, labels, 2)
    net = all_to_all_net(strategy1)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # Without GE, the train network's output makes model.train raise
    # TypeError; anything else means the test setup is broken.
    try:
        model.train(2, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
def calibration():
    """ do the calibration to get the scale offset record file"""
    eval_ds = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target)
    eval_ds = eval_ds.take(1)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    network = resnet(10)
    network.set_train(False)
    load_param_into_net(network, load_checkpoint(ARGS_OPT.pre_trained))
    input_data = np.random.uniform(
        0.0, 1.0, size=[32, 3, 224, 224]).astype(np.float32)
    config_file = os.path.join(CUR_DIR, './config.json')
    amct.create_quant_config(config_file, network, input_data)
    calibration_network = amct.quantize_model(config_file, network, input_data)
    model = Model(calibration_network, loss_fn=loss,
                  metrics={'top_1_accuracy', 'top_5_accuracy'})
    # A single eval pass over one batch drives the calibration.
    _ = model.eval(eval_ds)
    amct.save_model('./resnet50_quant_calibration', calibration_network,
                    input_data)
def mix_parallel_matmul_trains(self):
    """Train MatmulNet under auto-parallel with explicit mixed sharding
    strategies and return the recorded per-step loss values."""
    cb = ModelCallback()
    matmul_stra = ((device_num, 1), (1, 1))
    reduce_max_stra = ((1, device_num),)
    sub_stra = ((device_num, 1), (device_num, 1))
    exp_stra = ((1, device_num),)
    reduce_sum_stra = ((1, device_num),)
    div_stra = ((1, device_num), (1, 1))
    log_stra = ((1, device_num),)
    mul_stra = ((1, device_num), (1, device_num))
    sum_cross_entropy_stra = ((1, device_num),)
    mul2_stra = ((), (device_num,))
    reduce_mean_stra = ((device_num,),)
    onehot_stra = ((1, device_num), (), ())
    # Order matters: MatmulNet consumes these strategies positionally.
    loss_stra_list = [exp_stra, reduce_sum_stra, onehot_stra, div_stra,
                      log_stra, sum_cross_entropy_stra, mul_stra, mul2_stra,
                      reduce_mean_stra, reduce_max_stra, sub_stra]
    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    dataset = Dataset(self.input_part, self.label_part)
    model.train(6, dataset, callbacks=cb, dataset_sink_mode=False)
    return np.array(cb.loss_list)
def test_row_tensor_model_train():
    """Model.train smoke test on a tiny add-then-cast network."""
    class Net(nn.Cell):
        def __init__(self, in_features, out_features):
            super(Net, self).__init__()
            self.weight = Parameter(
                Tensor(np.ones([out_features, in_features]).astype(np.float32)),
                name="weight")
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.flag = True

        def construct(self, inputs, label):
            out = self.add(inputs, self.weight)
            if self.flag:
                out = self.cast(out, mstype.float32)
            return out

    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))
    net = Net(16, 16)
    net.set_train()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    model.train(2, dataset, dataset_sink_mode=False)
def net_trains(criterion, rank):
    """Train an FC network with `criterion` at global rank `rank` under
    semi-auto-parallel.

    NOTE(review): `device_number` and `batch_size_per_device` come from
    module scope — confirm they are defined in this file.
    """
    init()
    input_channels = 256
    out_channels = 512
    max_epoch = 20
    context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=device_number, global_rank=rank)
    features = Tensor(np.ones([batch_size_per_device, input_channels]),
                      dtype=ms.float32)
    dataset = Dataset(features, 4)
    network = fc_with_initialize(input_channels, out_channels)
    network.set_train()
    train_network = BuildTrainNetwork(network, criterion)
    train_network.set_train()
    opt = Momentum(train_network.trainable_params(), 0.1, 0.9)
    train_net = TrainOneStepCell(train_network, opt).set_train()
    model = Model(train_net)
    model.train(max_epoch, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
def eval_quant():
    """Evaluate a quantization-aware LeNet5 checkpoint; accuracy must beat 0.98."""
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    eval_ds = create_dataset(os.path.join(data_path, "test"), cfg.batch_size, 1)
    ckpt_path = './ckpt_lenet_quant-10_937.ckpt'
    # Build the fusion network and convert it to a quantization-aware one.
    network = LeNet5Fusion(cfg.num_classes)
    quantizer = QuantizationAwareTraining(quant_delay=0, bn_fold=False,
                                          freeze_bn=10000,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    model = Model(network, net_loss, net_opt,
                  metrics={"Accuracy": Accuracy()})
    # Restore the quantization-aware checkpoint.
    param_dict = load_checkpoint(ckpt_path)
    not_load_param = load_param_into_net(network, param_dict)
    if not_load_param:
        raise ValueError("Load param into net fail!")
    print("============== Starting Testing ==============")
    acc = model.eval(eval_ds, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
    assert acc['Accuracy'] > 0.98
def test_resnet_model_parallel():
    """Semi-auto-parallel training smoke test for the model-parallel resnet."""
    num_classes = 1024
    batch_size = 32
    epoch_size = 2
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)
    features = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32)
    labels = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(features, labels, 2)
    net = resnet_model_parallel_net(num_classes)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # Shard the loss inputs along the batch dimension across dev_num devices.
    loss.softmax_cross_entropy.shard(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   0.1, 0.9)
    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def test_callbacks_non_sink_batch_size2():
    """Check dataset/MS callback event interleaving with batch size 2."""
    logger.info("test_callbacks_non_sink_batch_size2")
    events = []
    waited_cb = MyWaitedCallback(events, 2)
    ms_cb = MyMSCallback(events)
    arr = [1, 2, 3, 4]
    data = ds.NumpySlicesDataset((arr, arr), column_names=["c1", "c2"],
                                 shuffle=False)
    data = data.map(operations=(lambda x: x), callbacks=waited_cb)
    data = data.batch(2)
    model = Model(Net())
    model.train(2, data, dataset_sink_mode=False,
                callbacks=[ms_cb, waited_cb])
    expected_synced_events = [
        'ms_step_end_1_1', 'ds_step_begin_1_3', 'ms_step_end_1_2',
        'ms_epoch_end_1_2', 'ds_epoch_begin_2_4', 'ds_step_begin_2_5',
        'ms_step_end_2_3', 'ds_step_begin_2_7', 'ms_step_end_2_4',
        'ms_epoch_end_2_4'
    ]
    assert events == expected_synced_events
def test_auto_parallel_arithmetic_model():
    """Auto-parallel training with a sharded OneHot inside a matmul network."""
    class NetOneHot(nn.Cell):
        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.one_hot = P.OneHot().shard(((1, 8), (), ()))
            self.on_value = Tensor(1.0, ms.float32)
            self.off_value = Tensor(0.0, ms.float32)
            self.matmul2 = P.MatMul()
            self.w = Parameter(Tensor(np.zeros([32, 64]).astype(np.float32)),
                               "weight", requires_grad=True)

        def construct(self, x, b):
            hidden = self.matmul(x, self.w)
            one_hot_b = self.one_hot(b, 64, self.on_value, self.off_value)
            return self.matmul2(hidden, one_hot_b)

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8, global_rank=0,
                                      parallel_mode=ParallelMode.AUTO_PARALLEL)
    net = NetOneHot()
    inputs = Tensor(np.ones([8, 32]), dtype=ms.float32)
    labels = Tensor(np.ones([8]), dtype=ms.int32)
    dataset = Dataset(inputs, labels, 2)
    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=opt)
    model.train(2, dataset, dataset_sink_mode=False)
def train():
    """Train the EAST (VGG backbone) text detector on ICDAR data on Ascend.

    Downloads the dataset, builds the training pipeline, wraps the network
    with its loss and training wrapper, and runs 600 epochs without sink mode.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    epoch = 600
    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    # (removed a large block of commented-out alternative dataset pipelines)
    dataset = datasetV2.create_icdar_train_dataset(
        train_img_path, train_gt_path, batch_size=14, repeat_num=1,
        is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()
    print("Create dataset done!, dataset_size: ", dataset_size)
    net = EAST_VGG.EAST()
    # Piecewise-constant schedule: 1e-3 until step 100, then 1e-4 until 300.
    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)
    callback = [TimeMonitor(data_size=dataset_size), LossMonitor()]
    model = Model(net)
    print("start training")  # fixed typo: was "start trainig"
    model.train(epoch, dataset, callbacks=callback, dataset_sink_mode=False)
def single_matmul_trains(self):
    """Train MatmulNet on the full (non-partitioned) data and return losses."""
    cb = ModelCallback()
    net = MatmulNet()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    dataset = Dataset(self.input_full, self.label_full)
    model.train(6, dataset, callbacks=cb, dataset_sink_mode=False)
    return np.array(cb.loss_list)
def compile_net(net):
    """Compile `net` by running a short training session, then reset the
    parallel context."""
    context.set_context(save_graphs=False)
    dataset = Dataset(_x, _b)
    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=opt)
    model.train(2, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
def compile_net(net):
    """Compile `net` with an O2 mixed-precision Model (graph saving on),
    then reset the parallel context."""
    context.set_context(save_graphs=True)
    dataset = Dataset(_x, _b)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, loss, optimizer=opt, amp_level="O2")
    model.train(2, dataset, dataset_sink_mode=False)
    context.reset_auto_parallel_context()
def data_parallel_matmul_trains(self):
    """Train MatmulNet under semi-auto-parallel on partitioned data and
    return the per-step losses."""
    cb = ModelCallback()
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net = MatmulNet()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, optimizer=optimizer)
    dataset = Dataset(self.input_part, self.label_part)
    model.train(6, dataset, callbacks=cb, dataset_sink_mode=False)
    return np.array(cb.loss_list)
def test_compile_f16_model_train():
    """FP16 network trains for two epochs without a loss-scale manager."""
    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))
    net = NetFP16(16, 16)
    net.set_train()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, loss_fn=MSELoss(), optimizer=optimizer, metrics=None)
    model.train(2, dataset, dataset_sink_mode=False)
def main():
    """Train a zhusuan ELBO VAE on MNIST in PyNative mode and save
    reconstructions plus fresh samples as images."""
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    # Model parameters.
    z_dim = 40
    x_dim = 32 * 32
    # Build generator/variational nets and wrap them in the ELBO objective.
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)
    net_loss = ReduceMeanLoss()
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), 0.001)
    model = Model(network, net_loss, net_opt)
    train_ds = create_dataset(os.path.join(mnist_path, "train"), batch_size, 1)
    model.train(1, train_ds, callbacks=[LossMonitor()],
                dataset_sink_mode=False)
    print(network.trainable_params()[0])
    # Reconstruct the first training batch through the VAE.
    for item in train_ds.create_tuple_iterator():
        batch_x = item[0].reshape(32, 32 * 32)
        break
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample.asnumpy(), 'result/reconstruct_x.png')
    # Draw four batches of fresh samples from the prior.
    samples = None
    for _ in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        batch = sample.asnumpy()
        samples = batch if samples is None else np.concatenate(
            [samples, batch], axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
def test_compile_f16_model_train_fixed():
    """FP16 network trains with a FixedLossScaleManager (default sink mode)."""
    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))
    net = NetFP16(16, 16)
    net.set_train()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1,
                         momentum=0.9)
    model = Model(net, loss_fn=MSELoss(), optimizer=optimizer, metrics=None,
                  loss_scale_manager=FixedLossScaleManager())
    # Intentionally no dataset_sink_mode argument — uses Model.train's default.
    model.train(2, dataset)
def resnet50_train(args_opt):
    """Train ResNet-50 on Ascend with O2 mixed precision, checkpoint the
    result, and upload the checkpoints via MoXing."""
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'
    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        save_graphs=False)
    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)
    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size,
                                   batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')
    # create model
    net = resnet50(class_num=cfg.class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size,
                       steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9,
                   weight_decay=1e-4, loss_scale=cfg.loss_scale)
    loss_scale = FixedLossScaleManager(cfg.loss_scale, False)
    # amp_level="O2" means that the hybrid precision of O2 mode is used for
    # training; the whole network except batchnorm is cast into float16 and
    # dynamic loss scale is used.
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False,
                  loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale, metrics={'acc'})
    # define performance callback to show ips and loss callback to show loss
    # for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path,
                              config=config_ck)
    cb += [ckpt_cb]
    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path,
                           dst_url=args_opt.train_url)
def train_mindspore_impl(self, indices, epoch, batch_size, use_parallel=True):
    """Train this network on fake int32 data for `epoch` epochs and return
    its output on `indices`."""
    fake_ds = FakeData(size=8, batch_size=batch_size, num_class=8,
                       image_size=(), use_parallel=use_parallel)
    fake_ds.set_image_data_type(np.int32)
    net = self
    net.set_train()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = nn.Adam(net.trainable_params())
    optimizer.target = "CPU"
    model = Model(net, loss, optimizer)
    for _ in range(epoch):
        model.train(1, fake_ds, dataset_sink_mode=False)
    return net(indices)