def _get_aleatoric_uncertainty_model(self):
    """
    Get the model which can obtain the aleatoric uncertainty.
    """
    if self.ale_uncer_model is None:
        self.ale_uncer_model = AleatoricUncertaintyModel(self.ale_model, self.num_classes, self.task_type)
        net_loss = AleatoricLoss(self.task_type)
        net_opt = Adam(self.ale_uncer_model.trainable_params())
        if self.task_type == 'classification':
            model = Model(self.ale_uncer_model, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
        else:
            model = Model(self.ale_uncer_model, net_loss, net_opt, metrics={"MSE": MSE()})
        if self.save_model:
            config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
            ckpoint_cb = ModelCheckpoint(prefix='checkpoint_ale_uncer_model',
                                         directory=self.ale_uncer_model_path,
                                         config=config_ck)
            model.train(self.epochs, self.ale_train_dataset, callbacks=[ckpoint_cb, LossMonitor()])
        elif self.ale_uncer_model_path is None:
            model.train(self.epochs, self.ale_train_dataset, callbacks=[LossMonitor()])
        else:
            uncer_param_dict = load_checkpoint(self.ale_uncer_model_path)
            load_param_into_net(self.ale_uncer_model, uncer_param_dict)
def _get_epistemic_uncertainty_model(self):
    """
    Get the model which can obtain the epistemic uncertainty.
    """
    if self.epi_uncer_model is None:
        self.epi_uncer_model = EpistemicUncertaintyModel(self.epi_model)
    if self.epi_uncer_model.drop_count == 0 and self.epi_train_dataset is not None:
        if self.task_type == 'classification':
            net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
            net_opt = Adam(self.epi_uncer_model.trainable_params())
            model = Model(self.epi_uncer_model, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
        else:
            net_loss = MSELoss()
            net_opt = Adam(self.epi_uncer_model.trainable_params())
            model = Model(self.epi_uncer_model, net_loss, net_opt, metrics={"MSE": MSE()})
        if self.save_model:
            config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
            ckpoint_cb = ModelCheckpoint(prefix='checkpoint_epi_uncer_model',
                                         directory=self.epi_uncer_model_path,
                                         config=config_ck)
            model.train(self.epochs, self.epi_train_dataset, dataset_sink_mode=False,
                        callbacks=[ckpoint_cb, LossMonitor()])
        elif self.epi_uncer_model_path is None:
            model.train(self.epochs, self.epi_train_dataset, dataset_sink_mode=False,
                        callbacks=[LossMonitor()])
        else:
            uncer_param_dict = load_checkpoint(self.epi_uncer_model_path)
            load_param_into_net(self.epi_uncer_model, uncer_param_dict)
def train_bert():
    """train bert"""
    context.set_context(mode=context.GRAPH_MODE)
    context.set_context(device_target="Ascend")
    context.set_context(enable_task_sink=True)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = create_train_dataset(bert_net_cfg.batch_size)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=bert_train_cfg.decay_steps,
                     start_learning_rate=bert_train_cfg.start_learning_rate,
                     end_learning_rate=bert_train_cfg.end_learning_rate,
                     power=bert_train_cfg.power,
                     warmup_steps=bert_train_cfg.num_warmup_steps,
                     decay_filter=lambda x: False)
    netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    config_ck = CheckpointConfig(save_checkpoint_steps=bert_train_cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=bert_train_cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=bert_train_cfg.checkpoint_prefix, config=config_ck)
    model.train(ds.get_repeat_count(), ds, callbacks=[LossMonitor(), ckpoint_cb], dataset_sink_mode=False)
def train_lenet_quant():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    ds_train = create_dataset(os.path.join(data_path, "train"), cfg.batch_size, 1)
    step_size = ds_train.get_dataset_size()

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # load quantization aware network checkpoint
    param_dict = load_checkpoint(ckpt_path)
    load_nonquant_param_into_quant_net(network, param_dict)
    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network, quant_delay=900, bn_fold=False,
                                          per_channel=[True, False], symmetric=[False, False])

    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # call back and monitor
    config_ckpt = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size,
                                   keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant", config=config_ckpt)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(cfg['epoch_size'], ds_train, callbacks=[ckpt_callback, LossMonitor()],
                dataset_sink_mode=True)
    print("============== End Training ==============")
def train_net(network, model, args, ckpoint_cb, sink_mode):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = create_dataset(os.path.join(args.data_dir, "train"), args.batch_size, args.repeat_size)

    callbacks = [
        # ckpoint_cb,
        LossMonitor(per_print_times=20),
    ]
    if args.use_kungfu:
        if args.use_kungfu_elastic:
            from kungfu_mindspore_callbacks import KungFuElasticCallback
            schedule = {
                10: 2,
                20: 3,
                30: 4,
                40: 1,
                50: 2,
                60: 3,
                70: 4,
                80: 1,
            }
            kungfu_elastic_callback = KungFuElasticCallback(schedule)
            callbacks.append(kungfu_elastic_callback)

    log_callbacks(callbacks)
    print('sink_mode: %s' % (sink_mode))
    model.train(args.epoch_size, ds_train, callbacks=callbacks, dataset_sink_mode=sink_mode)
def _train_epoch(self):
    if zeus.is_torch_backend():
        self.model.train()
        for batch_index, batch in enumerate(self.train_loader):
            batch = self.make_batch(batch)
            batch_logs = {'train_batch': batch}
            self.callbacks.before_train_step(batch_index, batch_logs)
            train_batch_output = self.train_step(batch)
            batch_logs.update(train_batch_output)
            if self.config.is_detection_trainer:
                batch_logs.update({'is_detection_trainer': True})
            self.callbacks.after_train_step(batch_index, batch_logs)
    elif zeus.is_tf_backend():
        self.estimator.train(input_fn=self.train_input_fn,
                             steps=len(self.train_loader),
                             hooks=self._init_logging_hook())
    elif zeus.is_ms_backend():
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics={self.metric_name: self.valid_metrics()})
        config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
        # save the network model and parameters for subsequent fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
        loss_cb = LossMonitor(per_print_times=self.config.report_freq)
        eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
        self.ms_model.train(epoch=self.epochs,
                            train_dataset=self.train_loader,
                            callbacks=[ckpoint_cb, loss_cb, eval_cb],
                            dataset_sink_mode=self.dataset_sink_mode)
def train_net(model, epoch_size, data_path, ckpoint_cb, sink_mode):
    """train_net"""
    ds_train = create_dataset(os.path.join(data_path, "train"), 32)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(125)], dataset_sink_mode=sink_mode)
def mnist_train(epoch_size, batch_size, lr, momentum):
    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size, repeat_size=1)

    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory="./trained_ckpt_file/", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size, ds, callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    ckpt_file_name = "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt"
    param_dict = load_checkpoint(ckpt_file_name)
    load_param_into_net(network, param_dict)
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"), batch_size=batch_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    dataset_sink = context.get_context('device_target') == 'Ascend'
    repeat = num_epochs if dataset_sink else 1
    ds_train = create_dataset(data_dir, repeat=repeat)
    ds_eval = create_dataset(data_dir, training=False)
    steps_per_epoch = ds_train.get_dataset_size()

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix=ckpt_name, directory='ckpt', config=ckpt_cfg)
    loss_cb = LossMonitor(steps_per_epoch)

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    model.train(num_epochs, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=dataset_sink)
    metrics = model.eval(ds_eval, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)
def train(model, dataset_direct, filename, columns_list, num_consumer=4, batch=16,
          epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
          prefix="model", directory='./'):
    """ train network """
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=prefix, directory=directory, config=config_ck)
    data_train = create_dataset(dataset_direct, filename, batch, columns_list, num_consumer)
    model.train(epoch, data_train,
                callbacks=[ckpoint_cb, LossMonitor(per_print_times=181), TimeMonitor()],
                dataset_sink_mode=True)
def train():
    context.set_context(
        mode=context.GRAPH_MODE,
        device_target="Ascend",
        #save_graphs=True,
        #save_graphs_path="/home/work/user-job-dir/EAST/",
        #enable_reduce_precision=False,
        #device_id=5
    )

    epoch = 600
    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    #my_dataset.data_to_mindrecord_byte_image(train_img_path, train_gt_path, mindrecord_dir='/cache', prefix='icdar_train.mindrecord', file_num=1)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file=['icdar_train.mindrecord0', 'icdar_train.mindrecord1', 'icdar_train.mindrecord2', 'icdar_train.mindrecord3'],
    #                                                batch_size=32, repeat_num=epoch, is_training=True, num_parallel_workers=8, length=512, scale=0.25)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file='/cache/icdar_train.mindrecord', batch_size=32, repeat_num=epoch,
    #                                                is_training=True, num_parallel_workers=24, length=512, scale=0.25)
    #dataset = my_dataset.create_demo_dataset(batch_size=21, repeat_num=2)
    #train_img_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/img')
    #train_gt_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14, repeat_num=1,
                                                   is_training=True, num_parallel_workers=24)
    #dataset = datasetV3.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14, repeat_num=1,
    #                                               is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()
    print("Create dataset done!, dataset_size: ", dataset_size)

    #east = EAST.EAST()
    net = EAST_VGG.EAST()
    #ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * 20)
    #ckpoint_cb = ModelCheckpoint(prefix='EAST', directory='/cache', config=ckpt_config)
    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    callback = [TimeMonitor(data_size=dataset_size), LossMonitor()]  #, ckpoint_cb]
    model = Model(net)
    dataset_sink_mode = False
    print("start training")
    model.train(epoch, dataset, callbacks=callback, dataset_sink_mode=dataset_sink_mode)
def resnet50_train(args_opt):
    device_id = 0
    device_num = 1
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/home/share/dataset/cifar-10-batches-bin/'  # your cifar10 path

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the loss over the batch
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means the O2 mixed-precision mode is used for training:
    # the whole network except batchnorm is cast into float16 format and dynamic loss scale is used
    # keep_batchnorm_fp32=False means batchnorm also uses the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    if device_num == 1 or device_id == 0:
        print(f'=================================Start run evaluation.=================================')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
def train_net(args, model, epoch_size, mnist_path, repeat_size, ckpoint_cb):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = create_dataset(os.path.join(mnist_path, "train"), 32, repeat_size)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=False)
def main():
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')

    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Define model parameters
    z_dim = 40
    x_dim = 32 * 32

    # create the network
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # define loss
    # learning rate setting
    lr = 0.001
    net_loss = ReduceMeanLoss()

    # define the optimizer
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)

    ds_train = create_dataset(os.path.join(mnist_path, "train"), batch_size, repeat_size)
    model.train(epoch_size, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=False)
    print(network.trainable_params()[0])

    iterator = ds_train.create_tuple_iterator()
    for item in iterator:
        batch_x = item[0].reshape(32, 32 * 32)
        break
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    sample = sample.asnumpy()
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample, 'result/reconstruct_x.png')

    for i in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        sample = sample.asnumpy()
        samples = sample if i == 0 else np.concatenate([samples, sample], axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, \
    #                                rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    # for item in train_dataset:
    #     print(item)
    # exit(0)

    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()

    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset, callbacks=callbacks_list, dataset_sink_mode=False)
    print("============== End Training ==============")
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the loss over the batch
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means the O2 mixed-precision mode is used for training:
    # the whole network except batchnorm is cast into float16 format and dynamic loss scale is used
    # keep_batchnorm_fp32=False means batchnorm also uses the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def train_net(network_model, epoch_size, data_path, repeat_size, ckpoint_cb, sink_mode):
    """Define the training method."""
    print("============== Starting Training ==============")
    # load training dataset
    ds_train = dm.create_dataset(os.path.join(data_path, "./MindSpore_train_images_dataset/train"),
                                 do_train=True, repeat_num=1)
    network_model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=sink_mode)
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    if device_num == 1 or device_id == 0:
        print(f'Start run evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
def logistic_regression(ds_train, X_test, Y_test):
    net = nn.Dense(4, 1)
    loss = Loss()
    opt = nn.optim.SGD(net.trainable_params(), learning_rate=0.003)
    model = ms.train.Model(net, loss, opt)
    model.train(5, ds_train,
                callbacks=[LossMonitor(per_print_times=ds_train.get_dataset_size())],
                dataset_sink_mode=False)

    # compute accuracy on the test set
    x = model.predict(ms.Tensor(X_test)).asnumpy()
    pred = np.round(1 / (1 + np.exp(-x)))
    correct = np.equal(pred, Y_test)
    acc = np.mean(correct)
    print('Test accuracy is', acc)
def test_all_trains():
    ds_train = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32, 1)
    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    print("============== Starting Training ==============")
    model.train(1, ds_train, callbacks=[time_cb, LossMonitor()])
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=3):
    ds_train = create_dataset(data_dir)
    ds_eval = create_dataset(data_dir, training=False)

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)
    loss_cb = LossMonitor(per_print_times=ds_train.get_dataset_size())

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    # dataset_sink_mode can be True when using Ascend
    model.train(num_epochs, ds_train, callbacks=[loss_cb], dataset_sink_mode=False)
    metrics = model.eval(ds_eval, dataset_sink_mode=False)
    print('Metrics:', metrics)
def get_tensor_evolution_data(self, indices, ckpt_file, data_type="activation"):
    indices = [1]
    dataset = create_dataset(indices)
    load_checkpoint(ckpt_file, net=self.model._network)
    data_evolution_callback = DataEvolutionCallback(data_type=data_type)
    self.model.train(1, dataset, callbacks=[LossMonitor(), data_evolution_callback], dataset_sink_mode=False)
    return data_evolution_callback.result
def test_train_cifar(num_classes=10, epoch_size=10):
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, mirror_mean=True)
    loss_cb = LossMonitor()
    dataset = create_dataset(epoch_size)
    net = resnet50(32, num_classes)
    loss = SoftmaxCrossEntropyExpand(sparse=True)
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=loss, optimizer=opt)
    model.train(epoch_size, dataset, callbacks=[loss_cb], dataset_sink_mode=False)
def test_train_and_eval_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    ds_train = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32, 1)
    model.train(1, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=True)

    print("============== Starting Testing ==============")
    ds_eval = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
def train_net(args, epoch_size, data_path, eval_per_epoch, repeat_size, ckpoint_cb, sink_mode):
    """define the training method"""
    print("============== Starting Training ==============")
    # Create training dataset
    ds_train = create_dataset(args, True, training_path, 32, repeat_size)
    # Initialise model
    model = Model(resnet, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    # model = Model(resnet, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O3")  # this will not work for CPU
    epoch_per_eval = {"epoch": [], "acc": []}
    eval_cb = Evalcb(model, ds_train, eval_per_epoch, epoch_per_eval)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(), eval_cb], dataset_sink_mode=sink_mode)
def get_tensor_from_training(self, indices, ckpt_file="./logs/resnet/weights/-1_30.ckpt",
                             node_name="fc", data_type="activation"):
    dataset = create_dataset(indices)
    load_checkpoint(ckpt_file, net=self.model._network)
    data_inception_callback = DataInterceptionCallback(node_name=node_name, data_type=data_type)
    self.model.train(1, dataset, callbacks=[LossMonitor(), data_inception_callback], dataset_sink_mode=False)
    return data_inception_callback.result, data_inception_callback.labels
def train(self, trainset, *args):
    """The main training loop in a federated learning workload.

    Arguments:
        trainset: The training dataset.
    """
    self.start_training()
    self.mindspore_model.train(Config().trainer.epochs,
                               trainset,
                               callbacks=[LossMonitor(per_print_times=300)],
                               dataset_sink_mode=False)
    self.pause_training()
def train_net(args, model, epoch_size, data_home, repeat_size, ckpoint_cb, sink_mode):
    """define the training method"""
    print("============== Starting Training ==============")
    # init weight
    # load training dataset
    ds_train = create_dataset(os.path.join(data_home, "cifar-10-batches-bin"),
                              do_train=True, batch_size=32, repeat_num=1)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=sink_mode)  # cifar-10-batches-bin
def test_build_callbacks():
    """Test_build_callbacks."""
    ck_obj = ModelCheckpoint()
    loss_cb_1 = LossMonitor(1)

    callbacks = [None]
    with pytest.raises(TypeError):
        callbacks = _build_callbacks(callbacks)

    callbacks = ['Error']
    with pytest.raises(TypeError):
        callbacks = _build_callbacks(callbacks)

    callbacks = [ck_obj, loss_cb_1, 'Error', None]
    with pytest.raises(TypeError):
        callback_list = _build_callbacks(callbacks)
def test_CallbackManager():
    """TestCallbackManager."""
    ck_obj = ModelCheckpoint()
    loss_cb_1 = LossMonitor(1)

    callbacks = [None]
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)

    callbacks = ['Error']
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)

    callbacks = [ck_obj, loss_cb_1, 'Error', None]
    with pytest.raises(TypeError):
        _CallbackManager(callbacks)