def __eval(self):
    # import
    from mindspore import Model, load_param_into_net, load_checkpoint
    from mindspore.nn.metrics import Accuracy

    # load params
    if self.__ckpt_path:
        load_param_into_net(self.__network, load_checkpoint(self.__ckpt_path))
    else:
        print("Warning: `ckpt_path` is None; please call `set_ckpt_path($ckpt_path)` first.")
        return

    # loss_fn & optimizer & metrics
    model = Model(self.__network,
                  loss_fn=self.__loss_fn,
                  optimizer=self.__optimizer,
                  metrics={"Accuracy": Accuracy()} if self.__metrics is None else self.__metrics)

    # eval
    print(">>>>>>>>>>>>>>>>>>>>> eval start ... <<<<<<<<<<<<<<<<<<<<<<")
    result = model.eval(self.__dataset)
    print(">>>>>>>>>>>>>>>>>>>>> eval success ~ <<<<<<<<<<<<<<<<<<<<<<: result=", result)
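# A minimal, hedged sketch of the load-then-eval flow that `__eval` wraps:
# the network, checkpoint path, and dataset arguments here are illustrative
# placeholders, not part of the original class.
def eval_sketch(network, ckpt_path, eval_dataset):
    import mindspore.nn as nn
    from mindspore import Model, load_checkpoint, load_param_into_net
    from mindspore.nn.metrics import Accuracy

    # restore trained weights into the network
    load_param_into_net(network, load_checkpoint(ckpt_path))
    # a loss_fn (or eval_network) is required when metrics are set
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()})
    return model.eval(eval_dataset, dataset_sink_mode=False)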
def test_single_input():
    """ test_single_input """
    input_data = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]).astype(np.float32))
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(Net())
    out = model.predict(input_data)
    assert out is not None
def main(data_path, device_target='Ascend', summary_dir='./summary_dir', learning_rate=0.01):
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    momentum = 0.9
    epoch_size = 1
    batch_size = 32

    network = LeNet5()
    network.set_train()
    net_loss = CrossEntropyLoss()
    net_opt = nn.Momentum(network.trainable_params(), learning_rate, momentum)
    model = Model(network, net_loss, net_opt)

    # Init SummaryCollector callback to record summary data in model.train or model.eval
    summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=10)

    ds = create_dataset(os.path.join(data_path, "train"), batch_size=batch_size)

    print("============== Starting Training ==============")
    model.train(epoch_size, ds, callbacks=[summary_collector], dataset_sink_mode=False)
    print("============== Train End =====================")
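# Hedged usage sketch for `main` above; the "<data_path>/train" layout is an
# assumption inferred from the create_dataset call, and the summary files
# written under summary_dir can then be browsed with MindInsight.
def run_main_sketch():
    main(data_path="./MNIST_Data",
         device_target="Ascend",
         summary_dir="./summary_dir",
         learning_rate=0.01)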
def test_net(network, data_path, ckpt):
    """define the evaluation method"""
    print("============== Starting Testing ==============")
    # load the saved model for evaluation
    load_checkpoint(ckpt, net=network)
    # load testing dataset
    ds_eval = create_dataset(False, data_path)
    net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(network, net_loss, metrics={"Accuracy": Accuracy()})
    # model = Model(network, net_loss, metrics={"Accuracy": Accuracy()}, amp_level="O3")
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("============== Accuracy:{} ==============".format(acc))
def __train(self):
    # import
    from mindspore import Model, load_param_into_net, load_checkpoint

    # load params (os.path.isfile already implies the path exists)
    if self.__ckpt_path and os.path.isfile(self.__ckpt_path):
        load_param_into_net(self.__network, load_checkpoint(self.__ckpt_path))

    # loss_fn & optimizer & metrics
    model = Model(self.__network,
                  loss_fn=self.__loss_fn,
                  optimizer=self.__optimizer,
                  metrics=self.__metrics)

    # train
    print(">>>>>>>>>>>>>>>>>>>>> train start ... <<<<<<<<<<<<<<<<<<<<<<")
    self.__update_cbs()
    model.train(self.__epoch_size,
                self.__dataset,
                callbacks=self.__callbacks,
                dataset_sink_mode=self.__do_sink,
                sink_size=self.__sink_size)
    print(">>>>>>>>>>>>>>>>>>>>> train success ~ <<<<<<<<<<<<<<<<<<<<<<")
def train_and_eval(config):
    """ train_and_eval """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback, ckpoint_cb])
def test_train():
    """distributed training"""
    context.set_context(mode=context.GRAPH_MODE)
    parallel_dataset = FakeData()
    strategy = ((2, 1), (1, 4))
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=8,
        strategy_ckpt_save_file="./train_strategy.ckpt")
    network = Net(matmul_size=(96, 16), strategy=strategy)
    net_opt = Momentum(network.trainable_params(), 0.01, 0.9)
    net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean')
    model = Model(network=network, loss_fn=net_loss, optimizer=net_opt)
    ckpt_config = CheckpointConfig(keep_checkpoint_max=1, integrated_save=False)
    global_rank_id = int(os.getenv("RANK_ID"))
    ckpt_path = './rank_{}_ckpt'.format(global_rank_id)
    ckpt_callback = ModelCheckpoint(prefix='parallel',
                                    directory=ckpt_path,
                                    config=ckpt_config)
    model.train(epoch=2,
                train_dataset=parallel_dataset,
                callbacks=[ckpt_callback],
                dataset_sink_mode=False)
    context.reset_auto_parallel_context()
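# Hedged follow-up sketch: because integrated_save=False, each of the 8 ranks
# writes only its parameter slices, and ./train_strategy.ckpt records the
# sharding strategy. The filename pattern below follows ModelCheckpoint's
# "{prefix}-{epoch}_{step}.ckpt" convention but is an assumption here, since
# FakeData's step count is not shown.
def restore_sliced_ckpt_sketch(network, steps_per_epoch):
    from mindspore import load_distributed_checkpoint
    ckpt_files = ['./rank_{}_ckpt/parallel-2_{}.ckpt'.format(rank, steps_per_epoch)
                  for rank in range(8)]
    # merges the per-rank slices back into the full parameters of `network`
    load_distributed_checkpoint(network, ckpt_files, predict_strategy=None)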
def test_resnet_train_tensor():
    """test_resnet_train_tensor"""
    batch_size = 1
    size = 2
    context.set_context(mode=context.GRAPH_MODE)
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=size,
                                      parameter_broadcast=True)
    one_hot_len = 10
    dataset_types = (np.float32, np.float32)
    dataset_shapes = [[batch_size, 3, 224, 224], [batch_size, one_hot_len]]
    predict = Tensor(np.ones([batch_size, 3, 224, 224]).astype(np.float32) * 0.01)
    label = Tensor(np.zeros([batch_size, one_hot_len]).astype(np.float32))
    dataset = DatasetLenet(predict, label, 2,
                           size=2, batch_size=2,
                           np_types=dataset_types,
                           output_shapes=dataset_shapes,
                           input_indexs=(0, 1))
    dataset.reset()
    network = resnet9(one_hot_len)
    network.set_train()
    loss_fn = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = Momentum(filter(lambda x: x.requires_grad, network.get_parameters()),
                         learning_rate=0.1,
                         momentum=0.9)
    model = Model(network=network, loss_fn=loss_fn, optimizer=optimizer)
    model.train(epoch=2, train_dataset=dataset, dataset_sink_mode=False)
    context.set_context(mode=context.GRAPH_MODE)
    context.reset_auto_parallel_context()
def test_cell_list():
    input_np = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(input_np)
    net = Net3()
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net)
    model.predict(input_me)
def test_train_eval():
    """ test_train_eval """
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                              batch_size=batch_size, data_type=DataType.MINDRECORD,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1,
                             batch_size=batch_size, data_type=DataType.MINDRECORD,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback])
    eval_values = list(eval_callback.eval_values)
    assert eval_values[0] > 0.78
def test_train(configure):
    """ test_train """
    data_path = configure.data_path
    batch_size = configure.batch_size
    epochs = configure.epochs
    ds_train = create_dataset(data_path, train_mode=True,
                              epochs=epochs, batch_size=batch_size)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, _ = net_builder.get_net(configure)
    train_net.set_train()

    model = Model(train_net)
    callback = LossCallBack(config=configure)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=configure.ckpt_path,
                                 config=ckptconfig)
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           callback, ckpoint_cb])
def test_deeplabv3_1p():
    start_time = time.time()
    epoch_size = 100
    args_opt = argparse.Namespace(base_size=513, crop_size=513, batch_size=2)
    args_opt.base_size = config.crop_size
    args_opt.crop_size = config.crop_size
    args_opt.batch_size = config.batch_size
    train_dataset = create_dataset(args_opt, data_url, 1, config.batch_size, usage="eval")
    dataset_size = train_dataset.get_dataset_size()
    callback = LossCallBack(dataset_size)
    net = deeplabv3_resnet50(config.seg_num_classes,
                             [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
                             infer_scale_sizes=config.eval_scales,
                             atrous_rates=config.atrous_rates,
                             decoder_output_stride=config.decoder_output_stride,
                             output_stride=config.output_stride,
                             fine_tune_batch_norm=config.fine_tune_batch_norm,
                             image_pyramid=config.image_pyramid)
    net.set_train()
    model_fine_tune(net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name
                          and 'depth' not in x.name and 'bias' not in x.name,
                          net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_dataset, callbacks=callback)
    print(time.time() - start_time)
    print("expect loss: ", callback.loss)
    print("expect time: ", callback.time)
    expect_loss = 0.92
    expect_time = 40
    assert callback.loss.asnumpy() <= expect_loss
    assert callback.time <= expect_time
def test_eval(config):
    """ test evaluate """
    data_path = config.data_path
    batch_size = config.batch_size
    ds_eval = create_dataset(data_path, train_mode=False,
                             epochs=2, batch_size=batch_size)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    param_dict = load_checkpoint(config.ckpt_path)
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    model.eval(ds_eval, callbacks=eval_callback)
def test_multiple_argument():
    """ test_multiple_argument """
    input_data = Tensor(np.random.randint(0, 255, [1, 3, 224, 224]).astype(np.float32))
    input_label = Tensor(np.random.randint(0, 3, [1, 3]).astype(np.float32))
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(LossNet())
    out = model.predict(input_data, input_label)
    assert out is not None
def test_value_error():
    np.random.seed(6)
    adv = np.random.rand(4, 4, 4).astype(np.float32)
    encoder = Model(Net())
    model = Model(PredNet())
    detector = DivergenceBasedDetector(encoder, model, option='bad_op')
    with pytest.raises(NotImplementedError):
        assert detector.detect_diff(adv)
def test_train_eval():
    """ test_train_eval """
    np.random.seed(1000)
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback, ckpoint_cb])
    expect_out0 = [0.792634, 0.799862, 0.803324]
    expect_out6 = [0.796580, 0.803908, 0.807262]
    if get_rank() == 0:
        assert np.allclose(eval_callback.eval_values, expect_out0)
    if get_rank() == 6:
        assert np.allclose(eval_callback.eval_values, expect_out6)
def function_access_base(number):
    """ function_access_base """
    input_np = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(input_np)
    if number == 2:
        net = access2_net(number)
    else:
        # the original code left `net` undefined for other values
        raise ValueError("unsupported access number: {}".format(number))
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net)
    model.predict(input_me)
def test_train_eval(config):
    """ test_train_eval """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    sparse = config.sparse
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
                                  keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback, ckpoint_cb],
                dataset_sink_mode=(not sparse))
def train_and_eval(config):
    """ train_and_eval """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset,
                             rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    # Only save a checkpoint at the last epoch. To save one every epoch,
    # set save_checkpoint_steps=ds_train.get_dataset_size() instead.
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size() * config.epochs,
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
def resnet50_train(args):
    """Train ResNet-50."""
    epoch_size = args.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=1, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' applies mean reduction to the loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9,
                   weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" trains the network in O2 mixed precision; batchnorm is
    # also cast to float16 (keep_batchnorm_fp32=False) and a fixed loss scale
    # is applied via FixedLossScaleManager
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_scale_manager=loss_scale)

    # define performance callback to show ips and loss callback to show loss for every epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print('Start run evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
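# Hedged sketch isolating the O2 mixed-precision setup used above, reusable
# with any network/loss/optimizer triple (the arguments are placeholders):
# a fixed loss scale with no overflow-driven updates, plus float16 batchnorm.
def build_amp_model_sketch(net, loss, opt, loss_scale_num=1024):
    from mindspore import Model
    from mindspore.train.loss_scale_manager import FixedLossScaleManager
    loss_scale = FixedLossScaleManager(loss_scale_num, drop_overflow_update=False)
    return Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                 amp_level="O2", keep_batchnorm_fp32=False,
                 loss_scale_manager=loss_scale)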
def logical_operator_base(symbol):
    """ logical_operator_base """
    input_np = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(input_np)
    logical_operator = {"and": 1, "or": 2}
    x = logical_operator[symbol]
    net = logical_Net(x)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net)
    model.predict(input_me)
def test_distribute_predict_auto_parallel():
    context.set_context(mode=context.GRAPH_MODE, save_graphs=True)
    context.set_auto_parallel_context(parallel_mode="auto_parallel",
                                      device_num=8, full_batch=True)
    inputs = Tensor(np.ones([32, 64, 128]).astype(np.float32))
    net = Net()
    model = Model(net)
    predict_map = model.infer_predict_layout(inputs)
    output = model.predict(inputs)
    context.reset_auto_parallel_context()
    return predict_map, output
def run_test(netclass, count):
    context.set_context(mode=context.GRAPH_MODE)
    net = netclass()
    model = Model(net)
    for _ in range(count):
        input_np = np.random.randn(2, 3).astype(np.float32)
        input_ms = Tensor(input_np)
        output_np = net.construct(input_np)  # run python
        output_ms = model.predict(input_ms)  # run graph
        assert np.shape(output_np) == np.shape(output_ms.asnumpy())
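# Hedged usage sketch for run_test above: ScaleNet is a hypothetical
# single-op cell whose construct works on both numpy arrays (the python
# path) and Tensors (the graph path), so the shape comparison is meaningful.
class ScaleNet(nn.Cell):
    def construct(self, x):
        return x * 2


run_test(ScaleNet, count=3)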
def test_graph_summary_callback2():
    dataset = get_dataset()
    net = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optim = Momentum(net.trainable_params(), 0.1, 0.9)
    context.set_context(mode=context.GRAPH_MODE)
    model = Model(net, loss_fn=loss, optimizer=optim, metrics=None)
    with SummaryRecord(SUMMARY_DIR, file_suffix="_MS_GRAPH", network=net) as test_writer:
        summary_cb = SummaryStep(test_writer, 1)
        model.train(2, dataset, callbacks=summary_cb)
def train_net(data_dir, cross_valid_ind=1, epochs=400, batch_size=16,
              lr=0.0001, run_distribute=False, cfg=None):
    if run_distribute:
        init()
        group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)
    net = UNet(n_channels=cfg['num_channels'], n_classes=cfg['num_classes'])

    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        load_param_into_net(net, param_dict)

    criterion = CrossEntropyWithLogits()
    train_dataset, _ = create_dataset(data_dir, epochs, batch_size, True,
                                      cross_valid_ind, run_distribute)
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_unet_medical_adam',
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])

    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)

    model = Model(net, loss_fn=criterion,
                  loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer, amp_level="O3")

    print("============== Starting Training ==============")
    model.train(1, train_dataset,
                callbacks=[StepLossTimeMonitor(batch_size=batch_size), ckpoint_cb],
                dataset_sink_mode=False)
    print("============== End Training ==============")
def test_mag_net_divergence_transform():
    """
    Compute mindspore result.
    """
    np.random.seed(6)
    adv = np.random.rand(4, 4, 4).astype(np.float32)
    encoder = Model(Net())
    model = Model(PredNet())
    detector = DivergenceBasedDetector(encoder, model)
    adv_trans = detector.transform(adv)
    assert np.any(adv_trans != adv)
def run_test(netclass, count, dev):
    context.set_context(mode=context.GRAPH_MODE, device_target=dev)
    net = netclass()
    model = Model(net)
    for _ in range(count):
        input_np = np.random.randn(2, 3).astype(np.float32)
        input_ms = Tensor(input_np)
        output_np = net.construct(input_np)  # run python
        output_ms = model.predict(input_ms)  # run graph
        np.testing.assert_array_almost_equal(output_np, output_ms.asnumpy(), decimal=3)
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config,
    #                                rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    network.set_train()

    model = Model(network, loss_fn=loss, optimizer=optimizer,
                  loss_scale_manager=scale_manager, amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=train_data_size,
                                   keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size, train_dataset,
                callbacks=callbacks_list, dataset_sink_mode=False)
    print("============== End Training ==============")
def test_eval(config):
    """ test evaluate """
    data_path = config.data_path
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)

    ckpt_path = config.ckpt_path
    if ";" in ckpt_path:
        # multiple sliced checkpoints from a parallel run: merge the
        # per-rank slices of each parameter before loading
        ckpt_paths = ckpt_path.split(';')
        param_list_dict = {}
        strategy = build_searched_strategy(config.stra_ckpt)
        for slice_path in ckpt_paths:
            param_slice_dict = load_checkpoint(slice_path)
            for key, value in param_slice_dict.items():
                if 'optimizer' in key:
                    continue
                if key not in param_list_dict:
                    param_list_dict[key] = []
                param_list_dict[key].append(value)
        param_dict = {}
        for key, value in param_list_dict.items():
            if key in strategy:
                merged_parameter = merge_sliced_parameter(value, strategy)
            else:
                merged_parameter = merge_sliced_parameter(value)
            param_dict[key] = merged_parameter
    else:
        param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    model.eval(ds_eval, callbacks=eval_callback)
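# Hedged mini-example of the slice-merging branch above: two synthetic slices
# of one parameter are merged without a strategy entry, in which case
# merge_sliced_parameter concatenates them along axis 0. Names and values
# are illustrative only.
def merge_sketch():
    import numpy as np
    from mindspore import Parameter, Tensor
    from mindspore.train.serialization import merge_sliced_parameter

    slices = [Parameter(Tensor(np.array([0.1, 0.2], np.float32)), name="w"),
              Parameter(Tensor(np.array([0.3, 0.4], np.float32)), name="w")]
    merged = merge_sliced_parameter(slices)
    print(merged.data.asnumpy())  # -> [0.1 0.2 0.3 0.4]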
def run_eval():
    """eval method"""
    if not os.path.exists(config.output_path):
        os.makedirs(config.output_path)
    context.set_context(mode=context.GRAPH_MODE, device_target="Davinci",
                        save_graphs=False, device_id=get_device_id())

    layers = config.layers
    num_factors = config.num_factors
    topk = rconst.TOP_K
    num_eval_neg = rconst.NUM_EVAL_NEGATIVES

    ds_eval, num_eval_users, num_eval_items = create_dataset(
        test_train=False, data_dir=config.data_path, dataset=config.dataset,
        train_epochs=0, eval_batch_size=config.eval_batch_size)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    ncf_net = NCFModel(num_users=num_eval_users,
                       num_items=num_eval_items,
                       num_factors=num_factors,
                       model_layers=layers,
                       mf_regularization=0,
                       mlp_reg_layers=[0.0, 0.0, 0.0, 0.0],
                       mf_dim=16)
    param_dict = load_checkpoint(config.checkpoint_file_path)
    load_param_into_net(ncf_net, param_dict)

    loss_net = NetWithLossClass(ncf_net)
    train_net = TrainStepWrap(loss_net)
    eval_net = PredictWithSigmoid(ncf_net, topk, num_eval_neg)

    ncf_metric = NCFMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"ncf": ncf_metric})

    ncf_metric.clear()
    out = model.eval(ds_eval)

    eval_file_path = os.path.join(config.output_path, config.eval_file_name)
    with open(eval_file_path, "a+") as eval_file:
        eval_file.write("EvalCallBack: HR = {}, NDCG = {}\n".format(out['ncf'][0], out['ncf'][1]))
    print("EvalCallBack: HR = {}, NDCG = {}".format(out['ncf'][0], out['ncf'][1]))
    print("=" * 100 + "Eval Finish!" + "=" * 100)