def test_train_eval():
    """Train Wide&Deep on MindRecord data and check the first evaluation value.

    Uses a default ``WideDeepConfig``, builds train/eval datasets with the
    current rank id and group size, trains for ``config.epochs`` epochs with
    timing, evaluation and loss callbacks, and finally asserts that the first
    entry of ``eval_callback.eval_values`` is greater than 0.78.
    """
    config = WideDeepConfig()
    data_path, batch_size, epochs = (config.data_path, config.batch_size,
                                     config.epochs)
    print("epochs is {}".format(epochs))

    def build_dataset(is_training, num_epochs):
        # Both datasets differ only in the train flag and the epoch count.
        return create_dataset(data_path, train_mode=is_training,
                              epochs=num_epochs, batch_size=batch_size,
                              data_type=DataType.MINDRECORD,
                              rank_id=get_rank(), rank_size=get_group_size())

    train_ds = build_dataset(True, epochs)
    eval_ds = build_dataset(False, epochs + 1)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")
    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback]
    model.train(epochs, train_ds, callbacks=callbacks)

    # Only the first recorded evaluation value is checked.
    first_eval = list(eval_callback.eval_values)[0]
    assert first_eval > 0.78
def train_and_eval(config):
    """Train the Wide&Deep network with per-epoch-size checkpointing.

    Seeds NumPy, builds train/eval datasets using the current rank id and
    group size, runs one ``model.eval`` pass before training and prints its
    result, then trains for ``config.epochs`` epochs.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs`` and ``ckpt_path`` are read here; the object is also
            passed to the network builder and to the callbacks.
    """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    def build_dataset(is_training, num_epochs):
        return create_dataset(data_path, train_mode=is_training,
                              epochs=num_epochs, batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size())

    train_ds = build_dataset(True, epochs)
    eval_ds = build_dataset(False, epochs + 1)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    # The checkpoint interval equals the number of batches in the train set.
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)

    # Evaluate once before any training step has run.
    initial_result = model.eval(eval_ds)
    print("=====" * 5 + "model.eval() initialized: {}".format(initial_result))

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks)
def test_train_eval():
    """Train Wide&Deep and compare evaluation values with reference numbers.

    Seeds NumPy, trains with a default ``WideDeepConfig`` and then, on ranks
    0 and 6 only, asserts that ``eval_callback.eval_values`` is close (per
    ``np.allclose``) to the hard-coded reference list for that rank.
    """
    np.random.seed(1000)
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    def build_dataset(is_training):
        return create_dataset(data_path, train_mode=is_training, epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size())

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)

    initial_result = model.eval(eval_ds)
    print("=====" * 5 + "model.eval() initialized: {}".format(initial_result))

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks)

    # Reference evaluation values, checked only on the listed ranks.
    expected_by_rank = (
        (0, [0.792634, 0.799862, 0.803324]),
        (6, [0.796580, 0.803908, 0.807262]),
    )
    for rank, expected in expected_by_rank:
        if get_rank() == rank:
            assert np.allclose(eval_callback.eval_values, expected)
def test_train_eval(config):
    """Train and evaluate Wide&Deep on a single dataset (no rank arguments).

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``sparse``, ``dataset_type`` and ``ckpt_path`` are
            read here; the object is also passed to the network builder and
            to the callbacks.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    sparse = config.sparse

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    shared_kwargs = dict(epochs=1, batch_size=batch_size,
                         data_type=dataset_type)
    train_ds = create_dataset(data_path, train_mode=True, **shared_kwargs)
    eval_ds = create_dataset(data_path, train_mode=False, **shared_kwargs)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)

    initial_result = model.eval(eval_ds)
    print("=====" * 5 + "model.eval() initialized: {}".format(initial_result))

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    # Dataset sink mode is turned off when the sparse option is set.
    model.train(epochs, train_ds, callbacks=callbacks,
                dataset_sink_mode=(not sparse))
def train_and_eval(config):
    """Train Wide&Deep across ranks, checkpointing on rank 0 only.

    Args:
        config: Configuration object. ``data_path``, ``epochs``,
            ``batch_size``, ``is_tf_dataset`` and ``ckpt_path`` are read
            here; the object is also passed to the network builder and to
            the callbacks.
    """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    def build_dataset(is_training):
        return create_dataset(data_path, train_mode=is_training, epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(), rank_size=get_group_size())

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config)

    # The interval is (batches per epoch * epochs), i.e. a single checkpoint
    # at the very end of training. To checkpoint after every epoch instead,
    # use save_checkpoint_steps=ds_train.get_dataset_size().
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size() * config.epochs,
        keep_checkpoint_max=10)
    checkpoint_dir = config.ckpt_path + '/ckpt_' + str(get_rank()) + '/'
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=checkpoint_dir,
                                          config=checkpoint_config)

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback]
    # Only rank 0 writes checkpoints.
    if int(get_rank()) == 0:
        callbacks.append(checkpoint_callback)

    model.train(epochs, train_ds, callbacks=callbacks,
                sink_size=train_ds.get_dataset_size())
def train_and_eval(config):
    """Train Wide&Deep with optional full-batch and host/device-mix modes.

    With ``config.full_batch`` set, every process builds the dataset with a
    batch size of ``batch_size * get_group_size()`` and no rank arguments;
    otherwise the dataset is built with the current rank id and group size.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``dataset_type``, ``host_device_mix``,
            ``full_batch`` and ``ckpt_path`` are read here; the object is
            also passed to the network builder and to the callbacks.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))

    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)

        def dataset_kwargs():
            return dict(epochs=1, batch_size=batch_size * get_group_size(),
                        data_type=dataset_type)
    else:
        def dataset_kwargs():
            return dict(epochs=1, batch_size=batch_size,
                        rank_id=get_rank(), rank_size=get_group_size(),
                        data_type=dataset_type)

    train_ds = create_dataset(data_path, train_mode=True, **dataset_kwargs())
    eval_ds = create_dataset(data_path, train_mode=False, **dataset_kwargs())
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config,
                                 host_device_mix=host_device_mix)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback]
    # Checkpointing and dataset sink mode are both skipped in
    # host/device-mix mode.
    if not host_device_mix:
        callbacks.append(checkpoint_callback)
    model.train(epochs, train_ds, callbacks=callbacks,
                dataset_sink_mode=(not host_device_mix))
def train_and_eval(config):
    """Evaluate a saved Wide&Deep checkpoint on the evaluation dataset.

    Despite its name, this function performs no training: it loads the
    checkpoint at ``config.ckpt_path`` into the evaluation network and runs
    ``model.eval`` once.

    Args:
        config: Configuration object. ``data_path``, ``epochs``,
            ``batch_size``, ``is_tf_dataset`` and ``ckpt_path`` are read
            here; the object is also passed to the network builder and to
            the evaluation callback.
    """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    eval_ds = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset)
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    # Restore the saved weights into the evaluation network.
    checkpoint_params = load_checkpoint(config.ckpt_path)
    load_param_into_net(eval_net, checkpoint_params)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    model.eval(eval_ds, callbacks=eval_callback)
def test_eval(config):
    """Evaluate a saved Wide&Deep checkpoint on the evaluation dataset.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``dataset_type`` and ``ckpt_path`` are read here; the object is
            also passed to the network builder and to the evaluation
            callback.
    """
    data_path = config.data_path
    batch_size = config.batch_size

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    eval_ds = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    # Restore the saved weights into the evaluation network.
    checkpoint_params = load_checkpoint(config.ckpt_path)
    load_param_into_net(eval_net, checkpoint_params)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    model.eval(eval_ds, callbacks=eval_callback)
def test_train(configure):
    """Train the Wide&Deep training network without any evaluation.

    Args:
        configure: Configuration object. ``data_path``, ``batch_size``,
            ``epochs`` and ``ckpt_path`` are read here; the object is also
            passed to the network builder and to the loss callback.
    """
    data_path = configure.data_path
    batch_size = configure.batch_size
    epochs = configure.epochs

    train_ds = create_dataset(data_path, train_mode=True, epochs=epochs,
                              batch_size=batch_size)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))

    # Only the training network is needed; the evaluation one is discarded.
    train_net, _ = ModelBuilder().get_net(configure)
    train_net.set_train()
    model = Model(train_net)

    loss_callback = LossCallBack(config=configure)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=configure.ckpt_path,
                                          config=checkpoint_config)

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks)
def train_and_eval(config):
    """Train Wide&Deep across ranks with an initial evaluation pass.

    Checkpoints go to a per-rank sub-directory of ``config.ckpt_path``, and
    the checkpoint callback is registered on rank 0 only.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``dataset_type`` and ``ckpt_path`` are read here;
            the object is also passed to the network builder and to the
            callbacks.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)
    print("epochs is {}".format(epochs))

    def build_dataset(is_training):
        return create_dataset(data_path, train_mode=is_training, epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(), rank_size=get_group_size(),
                              data_type=dataset_type)

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_dir = config.ckpt_path + '/ckpt_' + str(get_rank()) + '/'
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=checkpoint_dir,
                                          config=checkpoint_config)

    # Evaluate once before any training step has run.
    initial_result = model.eval(eval_ds)
    print("=====" * 5 + "model.eval() initialized: {}".format(initial_result))

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback]
    # Only rank 0 writes checkpoints.
    if get_rank() == 0:
        callbacks.append(checkpoint_callback)
    model.train(epochs, train_ds, callbacks=callbacks,
                sink_size=train_ds.get_dataset_size())
def train_and_eval(config):
    """Train Wide&Deep on a single dataset with evaluation callbacks.

    Args:
        config: Configuration object. ``data_path``, ``epochs``,
            ``batch_size``, ``is_tf_dataset`` and ``ckpt_path`` are read
            here; the object is also passed to the network builder and to
            the callbacks.
    """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    def build_dataset(is_training):
        return create_dataset(data_path, train_mode=is_training, epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset)

    train_ds = build_dataset(True)
    eval_ds = build_dataset(False)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=10)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks)
def train_and_eval(config):
    """Train Wide&Deep with parameter-server / vocab-cache options.

    Dataset sink mode is enabled only when both ``config.parameter_server``
    is truthy and ``config.vocab_cache_size`` is positive.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``dataset_type``, ``parameter_server``,
            ``vocab_cache_size`` and ``ckpt_path`` are read here; the object
            is also passed to the network builder and to the callbacks.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))

    shared_kwargs = dict(epochs=1, batch_size=batch_size,
                         data_type=dataset_type)
    train_ds = create_dataset(data_path, train_mode=True, **shared_kwargs)
    eval_ds = create_dataset(data_path, train_mode=False, **shared_kwargs)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks,
                dataset_sink_mode=(parameter_server and cache_enable))
def test_eval(config):
    """Evaluate Wide&Deep from one checkpoint or from several sliced ones.

    When ``config.ckpt_path`` contains ``';'`` it is treated as a list of
    checkpoint slice paths. Values sharing a parameter name are collected
    across the slices (names containing ``'optimizer'`` are skipped) and
    merged with ``merge_sliced_parameter``. Otherwise the path is loaded as
    a single checkpoint.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``dataset_type``, ``ckpt_path`` and, for sliced checkpoints,
            ``stra_ckpt`` are read here; the object is also passed to the
            network builder and to the evaluation callback.
    """
    data_path = config.data_path
    batch_size = config.batch_size

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    eval_ds = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)

    ckpt_path = config.ckpt_path
    if ";" not in ckpt_path:
        param_dict = load_checkpoint(ckpt_path)
    else:
        strategy = build_searched_strategy(config.stra_ckpt)

        # Group the per-slice values by parameter name, in slice order.
        slices_by_name = {}
        for slice_path in ckpt_path.split(';'):
            for name, value in load_checkpoint(slice_path).items():
                if 'optimizer' in name:
                    continue
                slices_by_name.setdefault(name, []).append(value)

        # The strategy is passed only for names it actually contains.
        param_dict = {}
        for name, slices in slices_by_name.items():
            if name in strategy:
                param_dict[name] = merge_sliced_parameter(slices, strategy)
            else:
                param_dict[name] = merge_sliced_parameter(slices)

    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    model.eval(eval_ds, callbacks=eval_callback)
def test_eval(config):
    """Copy data and checkpoint in via ``mox``, then evaluate Wide&Deep.

    The dataset archive ``train_demo.tar.gz`` is expected directly under
    ``config.data_path`` after the copy; it is unpacked there and the
    dataset is read from ``data_path + config.dataset_type``. Paths are
    joined by plain string concatenation, so ``config.data_path``
    presumably ends with a path separator (verify against the caller).

    Args:
        config: Configuration object. ``data_path``, ``ckpt_path``,
            ``batch_size``, ``dataset_type``, ``data_url`` and ``ckpt_url``
            are read here; the object is also passed to the network builder
            and to the evaluation callback.
    """
    data_path = config.data_path
    ckpt_path = config.ckpt_path
    batch_size = config.batch_size

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    # Copy the dataset and the checkpoint to the local paths.
    print('Upload data from obs to modelarts server.')
    mox.file.copy_parallel(src_url=config.data_url, dst_url=data_path)
    mox.file.copy_parallel(src_url=config.ckpt_url, dst_url=ckpt_path)

    untar(data_path + "train_demo.tar.gz", data_path)
    dataset_dir = data_path + config.dataset_type

    eval_ds = create_dataset(dataset_dir, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    checkpoint_params = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, checkpoint_params)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    model.eval(eval_ds, callbacks=eval_callback)
"""Mixed Precision Tutorial

The sample can be run on GPU and Ascend 910 AI processor.
"""
import mindspore.nn as nn
from mindspore import context, Model
from mindspore.train.callback import LossMonitor
from mindspore.nn.metrics import Accuracy
from src.lenet import LeNet5
from src.datasets import create_dataset

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    # Load the MNIST train/test splits with the same second argument (32).
    ds_train = create_dataset("./datasets/MNIST_Data/train", 32)
    ds_eval = create_dataset("./datasets/MNIST_Data/test", 32)

    # Initialize network
    network = LeNet5(10)

    # Define loss and optimizer
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(),
                          learning_rate=0.01, momentum=0.9)

    # amp_level="O2" on GPU, amp_level="O3" on Ascend; "O0" disables mixed
    # precision.
    eval_metrics = {"Accuracy": Accuracy()}
    model = Model(network, net_loss, net_opt, metrics=eval_metrics,
                  amp_level="O2")

    # Run training
    train_callbacks = [LossMonitor()]
    model.train(epoch=1, callbacks=train_callbacks, train_dataset=ds_train)

    # Run evaluation
    acc = model.eval(ds_eval, dataset_sink_mode=False)
# Dataset locations and the directory that receives checkpoints.
train_data_path = "./datasets/MNIST_Data/train"
eval_data_path = "./datasets/MNIST_Data/test"
model_path = "./models/ckpt/custom_debugging_info/"

net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = 1
network = LeNet5()

# Metrics collected by model.eval(), keyed by the name shown in the result.
metrics = dict(
    accuracy=nn.Accuracy(),
    loss=nn.Loss(),
    precision=nn.Precision(),
    recall=nn.Recall(),
    f1_score=nn.F1(),
)

# `lr`, `momentum` and `epoch_size` are defined outside this section.
net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                             keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory=model_path,
                             config=config_ck)
model = Model(network, net_loss, net_opt, metrics=metrics)

print("============== Starting Training ==============")
ds_train = create_dataset(train_data_path, repeat_size=repeat_size)
stop_cb = StopAtTime(run_time=0.6)
train_callbacks = [ckpoint_cb, LossMonitor(375), stop_cb]
model.train(epoch_size, ds_train, callbacks=train_callbacks,
            dataset_sink_mode=False)

print("============== Starting Testing ==============")
ds_eval = create_dataset(eval_data_path, repeat_size=repeat_size)
acc = model.eval(ds_eval, dataset_sink_mode=False)
print("============== Accuracy:{} ==============".format(acc))
def train_and_eval(config):
    """Train Wide&Deep with full-batch, field-slice and host/device-mix options.

    Dataset construction has three variants:

    * ``full_batch`` with ``field_slice``: ``compute_manual_shape`` runs
      first, and ``config.manual_shape`` / ``config.field_size`` are passed
      on as ``manual_shape`` / ``target_column``;
    * ``full_batch`` only: batch size is ``batch_size * get_group_size()``;
    * otherwise: the current rank id and group size are passed instead.

    Side effect: ``config.stra_ckpt`` is overwritten with a per-rank
    ``strategy.ckpt`` path.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``dataset_type``, ``host_device_mix``, ``sparse``,
            ``full_batch``, ``field_slice``, ``stra_ckpt`` and ``ckpt_path``
            are read here; the object is also passed to the network builder
            and to the callbacks.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    host_device_mix = bool(config.host_device_mix)
    sparse = config.sparse
    print("epochs is {}".format(epochs))

    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        use_field_slice = config.field_slice
        if use_field_slice:
            compute_manual_shape(config, get_group_size())

        def dataset_kwargs():
            kwargs = dict(epochs=1,
                          batch_size=batch_size * get_group_size(),
                          data_type=dataset_type)
            if use_field_slice:
                kwargs.update(manual_shape=config.manual_shape,
                              target_column=config.field_size)
            return kwargs
    else:
        def dataset_kwargs():
            return dict(epochs=1, batch_size=batch_size,
                        rank_id=get_rank(), rank_size=get_group_size(),
                        data_type=dataset_type)

    train_ds = create_dataset(data_path, train_mode=True, **dataset_kwargs())
    eval_ds = create_dataset(data_path, train_mode=False, **dataset_kwargs())
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    # The strategy checkpoint path is made rank-specific here; this has to
    # happen before the callbacks (which receive `config`) are created.
    rank_strategy_dir = config.stra_ckpt + "-{}".format(get_rank())
    config.stra_ckpt = os.path.join(rank_strategy_dir, "strategy.ckpt")

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config, per_print_times=20)
    # Interval of (batches per epoch * epochs): one checkpoint at the end.
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size() * epochs,
        keep_checkpoint_max=5,
        integrated_save=False)
    checkpoint_dir = os.path.join(config.ckpt_path,
                                  'ckpt_' + str(get_rank()))
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=checkpoint_dir,
                                          config=checkpoint_config)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file=config.stra_ckpt)

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback]
    # No checkpoint callback in host/device-mix mode.
    if not host_device_mix:
        callbacks.append(checkpoint_callback)
    model.train(epochs, train_ds, callbacks=callbacks,
                dataset_sink_mode=(not sparse))
def train_eval(config):
    """Train Wide&Deep, copy the checkpoints out, then evaluate from disk.

    Training uses a model built from the training network alone. After
    training, ``config.ckpt_path`` is copied to ``config.train_url`` via
    ``mox``, the checkpoint located by ``find_ckpt`` is loaded into the
    evaluation network, and a second model runs ``eval`` once.

    Args:
        config: Configuration object. ``data_path``, ``dataset_type``,
            ``ckpt_path``, ``epochs``, ``batch_size`` and ``train_url`` are
            read here; the object is also passed to the network builder and
            to the callbacks.
    """
    # The dataset directory is the data path with the type name appended.
    data_path = config.data_path + config.dataset_type
    ckpt_path = config.ckpt_path
    epochs = config.epochs
    batch_size = config.batch_size

    # Any value other than "tfrecord"/"mindrecord" falls back to H5.
    type_name = config.dataset_type
    if type_name == "tfrecord":
        dataset_type = DataType.TFRECORD
    else:
        dataset_type = (DataType.MINDRECORD if type_name == "mindrecord"
                        else DataType.H5)

    shared_kwargs = dict(epochs=1, batch_size=batch_size,
                         data_type=dataset_type)
    train_ds = create_dataset(data_path, train_mode=True, **shared_kwargs)
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    eval_ds = create_dataset(data_path, train_mode=False, **shared_kwargs)
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()

    # --- training phase ---
    train_model = Model(train_net)
    train_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=1)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)
    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 train_callback,
                 checkpoint_callback]
    train_model.train(epochs, train_ds, callbacks=callbacks)

    # Copy the checkpoint directory to the configured output location.
    print('Download data from modelarts server to obs.')
    mox.file.copy_parallel(src_url=config.ckpt_path,
                           dst_url=config.train_url)

    # --- evaluation phase ---
    checkpoint_params = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, checkpoint_params)
    auc_metric = AUCMetric()
    eval_model = Model(train_net, eval_network=eval_net,
                       metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(eval_model, eval_ds, auc_metric, config)
    eval_model.eval(eval_ds, callbacks=eval_callback)
def train_and_eval(config):
    """Train Wide&Deep in parameter-server mode, optionally with vocab cache.

    When the vocab cache is enabled (``config.vocab_cache_size > 0``),
    full-batch mode is forced, a per-rank strategy checkpoint path is
    configured, and worker processes save a single non-integrated
    checkpoint at the end of training.

    Side effects: ``config.full_batch`` is set to ``True`` and
    ``config.stra_ckpt`` is rewritten when the vocab cache is enabled.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``dataset_type``, ``parameter_server``,
            ``vocab_cache_size``, ``full_batch``, ``stra_ckpt`` and
            ``ckpt_path`` are read here; the object is also passed to the
            network builder and to the callbacks.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        # Any other value falls back to H5.
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    # Derived from the config passed in rather than read as an undefined
    # free name; the sibling parameter-server train_and_eval in this file
    # computes it the same way.
    cache_enable = config.vocab_cache_size > 0
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))

    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        # Full batch: no rank arguments, batch size scaled by group size.
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size * get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size * get_group_size(),
                                 data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    if cache_enable:
        # Per-rank strategy checkpoint; set before the callbacks, which
        # receive `config`, are created.
        config.stra_ckpt = os.path.join(
            config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")
        context.set_auto_parallel_context(
            strategy_ckpt_save_file=config.stra_ckpt)

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)

    if _is_role_worker():
        if cache_enable:
            # One non-integrated checkpoint at the end of training.
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1,
                integrated_save=False)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(
        prefix='widedeep_train',
        directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
        config=ckptconfig)

    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()),
        eval_callback,
        callback,
    ]
    # Only rank 0 writes checkpoints.
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))
def train_and_eval(config):
    """Train Wide&Deep in auto-parallel mode with an optional full batch.

    With ``config.full_batch`` set, the datasets are built without rank
    arguments and with a batch size of ``batch_size * get_group_size()``;
    otherwise the current rank id and group size are passed.

    Args:
        config: Configuration object. ``data_path``, ``batch_size``,
            ``epochs``, ``full_batch`` and ``ckpt_path`` are read here; the
            object is also passed to the network builder and to the
            callbacks.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)

        def dataset_kwargs():
            return dict(batch_size=batch_size * get_group_size())
    else:
        def dataset_kwargs():
            return dict(batch_size=batch_size,
                        rank_id=get_rank(), rank_size=get_group_size())

    # The evaluation dataset is created with one more epoch than training.
    train_ds = create_dataset(data_path, train_mode=True, epochs=epochs,
                              **dataset_kwargs())
    eval_ds = create_dataset(data_path, train_mode=False, epochs=epochs + 1,
                             **dataset_kwargs())
    print("ds_train.size: {}".format(train_ds.get_dataset_size()))
    print("ds_eval.size: {}".format(eval_ds.get_dataset_size()))

    train_net, eval_net = ModelBuilder().get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, eval_ds, auc_metric, config)
    loss_callback = LossCallBack(config=config)
    checkpoint_config = CheckpointConfig(
        save_checkpoint_steps=train_ds.get_dataset_size(),
        keep_checkpoint_max=5)
    checkpoint_callback = ModelCheckpoint(prefix='widedeep_train',
                                          directory=config.ckpt_path,
                                          config=checkpoint_config)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")

    callbacks = [TimeMonitor(train_ds.get_dataset_size()),
                 eval_callback,
                 loss_callback,
                 checkpoint_callback]
    model.train(epochs, train_ds, callbacks=callbacks)
# Checkpoint file that the export step below reads.
# `model_path`, `lr`, `momentum` and `epoch_size` are defined outside this
# section.
model_file = model_path + "checkpoint_lenet-1_1875.ckpt"

# Remove checkpoint/meta/pb files left in the model directory by earlier runs.
cleanup_command = 'rm -f {0}*.ckpt {0}*.meta {0}*.pb'.format(model_path)
os.system(cleanup_command)

network = LeNet5()
net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
eval_metrics = {"Accuracy": Accuracy()}
model = Model(network, net_loss, net_opt, metrics=eval_metrics)

config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                             keep_checkpoint_max=16)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory=model_path,
                             config=config_ck)

train_dataset = create_dataset("./datasets/MNIST_Data/train")
eval_dataset = create_dataset("./datasets/MNIST_Data/test")

print("===============start training===============")
train_callbacks = [ckpoint_cb, LossMonitor()]
model.train(epoch_size, train_dataset, callbacks=train_callbacks,
            dataset_sink_mode=False)

print(
    "===============saving models in mindir and onnx formats==============="
)
# Export the same checkpoint in both formats, with the same input shape.
export_shape = [32, 1, 32, 32]
output_file_formats(model_file, network, export_shape,
                    "checkpoint_lenet", "MINDIR")
output_file_formats(model_file, network, [32, 1, 32, 32],
                    "checkpoint_lenet", "ONNX")