def test_train_eval():
    """
    test_train_eval
    """
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, batch_size=batch_size,
                              data_type=DataType.MINDRECORD, rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, batch_size=batch_size,
                             data_type=DataType.MINDRECORD, rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
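    # Save the auto-parallel sharding strategy to a checkpoint file for later reuse.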
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback])
    eval_values = list(eval_callback.eval_values)
    assert eval_values[0] > 0.78
Example #2
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep in data-parallel mode.
    """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                              batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1,
                             batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
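    # Evaluate once before training starts to record the initial (untrained) AUC.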
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
Example #3
def test_train_eval():
    """
    test_train_eval
    """
    np.random.seed(1000)
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             rank_id=get_rank(),
                             rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
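    # Regression check: expected per-epoch AUC baselines for ranks 0 and 6.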
    expect_out0 = [0.792634, 0.799862, 0.803324]
    expect_out6 = [0.796580, 0.803908, 0.807262]
    if get_rank() == 0:
        assert np.allclose(eval_callback.eval_values, expect_out0)
    if get_rank() == 6:
        assert np.allclose(eval_callback.eval_values, expect_out6)
Example #4
def test_train_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    sparse = config.sparse
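    # Map the dataset-format string onto the DataType enum; H5 is the fallback.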
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)

    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ],
                dataset_sink_mode=(not sparse))
Example #5
def train_and_eval(config):
    """
    train_and_eval
    """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset,
                             rank_id=get_rank(),
                             rank_size=get_group_size())

    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    # Only save a checkpoint once, at the end of the last epoch. To save a
    # checkpoint every epoch, use save_checkpoint_steps=ds_train.get_dataset_size().
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size() * config.epochs,
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
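    # Only rank 0 appends the checkpoint callback, so only one device writes
    # checkpoint files.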
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
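Example #6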
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep, optionally with host-device mixed execution.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size*get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size*get_group_size(), data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config, host_device_mix=host_device_mix)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=(not host_device_mix))
Example #7
def train_and_eval(config):
    """
    train_and_eval.
    """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=config.batch_size, is_tf_dataset=config.is_tf_dataset)

    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    param_dict = load_checkpoint(config.ckpt_path)
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    model.eval(ds_eval, callbacks=eval_callback)
Example #8
def test_eval(config):
    """
    test evaluate
    """
    data_path = config.data_path
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)

    param_dict = load_checkpoint(config.ckpt_path)
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    model.eval(ds_eval, callbacks=eval_callback)
Example #9
def test_train(configure):
    """
    test_train
    """
    data_path = configure.data_path
    batch_size = configure.batch_size
    epochs = configure.epochs
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=epochs,
                              batch_size=batch_size)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, _ = net_builder.get_net(configure)
    train_net.set_train()

    model = Model(train_net)
    callback = LossCallBack(config=configure)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=configure.ckpt_path,
                                 config=ckptconfig)
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), callback,
                    ckpoint_cb
                ])
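Example #10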
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep in data-parallel mode.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
Example #11
def train_and_eval(config):
    """
    train_and_eval.
    """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset)

    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)

    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
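Example #12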
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep in parameter-server mode.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]
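    # Sink mode is used only when both the parameter server and the embedding
    # cache are enabled.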

    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
Example #13
def test_eval(config):
    """
    test evaluate
    """
    data_path = config.data_path
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    ckpt_path = config.ckpt_path
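    # A ';'-separated ckpt_path holds checkpoint slices from model-parallel
    # training: load each slice, drop optimizer state, and merge the slices of
    # every parameter (using the saved strategy where one exists).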
    if ";" in ckpt_path:
        ckpt_paths = ckpt_path.split(';')
        param_list_dict = {}
        strategy = build_searched_strategy(config.stra_ckpt)
        for slice_path in ckpt_paths:
            param_slice_dict = load_checkpoint(slice_path)
            for key, value in param_slice_dict.items():
                if 'optimizer' in key:
                    continue
                if key not in param_list_dict:
                    param_list_dict[key] = []
                param_list_dict[key].append(value)
        param_dict = {}
        for key, value in param_list_dict.items():
            if key in strategy:
                merged_parameter = merge_sliced_parameter(value, strategy)
            else:
                merged_parameter = merge_sliced_parameter(value)
            param_dict[key] = merged_parameter
    else:
        param_dict = load_checkpoint(ckpt_path)
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    model.eval(ds_eval, callbacks=eval_callback)
Example #14
def test_eval(config):
    """
    test evaluate
    """
    data_path = config.data_path
    ckpt_path = config.ckpt_path
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    # data upload
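    # mox (MoXing), untar() and find_ckpt() are ModelArts helpers assumed to be
    # imported elsewhere in this project.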
    print('Upload data from obs to modelarts server.')
    mox.file.copy_parallel(src_url=config.data_url, dst_url=data_path)
    mox.file.copy_parallel(src_url=config.ckpt_url, dst_url=ckpt_path)

    tar_file = data_path + "train_demo.tar.gz"
    untar(tar_file, data_path)
    data_path = data_path + config.dataset_type

    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    param_dict = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    model.eval(ds_eval, callbacks=eval_callback)
Example #15
"""Mixed Precision Tutorial
This sample can run on GPU or on the Ascend 910 AI processor.
"""
import mindspore.nn as nn
from mindspore import context, Model
from mindspore.train.callback import LossMonitor, ModelCheckpoint, CheckpointConfig
from mindspore.nn.metrics import Accuracy
from src.lenet import LeNet5
from src.datasets import create_dataset


if __name__ == "__main__":

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

    ds_train = create_dataset("./datasets/MNIST_Data/train", 32)
    ds_eval = create_dataset("./datasets/MNIST_Data/test", 32)
    # Initialize network
    network = LeNet5(10)

    # Define Loss and Optimizer
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), learning_rate=0.01, momentum=0.9)
    # amp_level="O2" on GPU, "O3" on Ascend; "O0" runs without mixed precision
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2")

    # Run training
    model.train(epoch=1, callbacks=[LossMonitor()], train_dataset=ds_train)

    # Run evaluation
    acc = model.eval(ds_eval, dataset_sink_mode=False)
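
    # ---- The remainder of this example is a separate LeNet custom-debugging
    # snippet. StopAtTime and the create_dataset(path, repeat_size=...) variant
    # used below are helpers from that walkthrough, assumed defined elsewhere.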
    train_data_path = "./datasets/MNIST_Data/train"
    eval_data_path = "./datasets/MNIST_Data/test"
    model_path = "./models/ckpt/custom_debugging_info/"

    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    repeat_size = 1
    # Assumed values: lr, momentum and epoch_size are never defined in this
    # snippet; 0.01 and 0.9 match the optimizer settings used above.
    lr, momentum = 0.01, 0.9
    epoch_size = 1
    network = LeNet5()

    metrics = {
        'accuracy': nn.Accuracy(),
        'loss': nn.Loss(),
        'precision': nn.Precision(),
        'recall': nn.Recall(),
        'f1_score': nn.F1()
        }
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory=model_path, config=config_ck)

    model = Model(network, net_loss, net_opt, metrics=metrics)

    print("============== Starting Training ==============")
    ds_train = create_dataset(train_data_path, repeat_size=repeat_size)
    stop_cb = StopAtTime(run_time=0.6)
    model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(375), stop_cb], dataset_sink_mode=False)

    print("============== Starting Testing ==============")
    ds_eval = create_dataset(eval_data_path, repeat_size=repeat_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("============== Accuracy:{} ==============".format(acc))
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep in auto-parallel mode, optionally with
    full-batch and field-sliced datasets.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    sparse = config.sparse
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
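        # field_slice (presumably) shards feature columns manually across
        # devices, so the per-device shapes are computed first.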
        if config.field_slice:
            compute_manual_shape(config, get_group_size())
            ds_train = create_dataset(data_path,
                                      train_mode=True,
                                      epochs=1,
                                      batch_size=batch_size * get_group_size(),
                                      data_type=dataset_type,
                                      manual_shape=config.manual_shape,
                                      target_column=config.field_size)
            ds_eval = create_dataset(data_path,
                                     train_mode=False,
                                     epochs=1,
                                     batch_size=batch_size * get_group_size(),
                                     data_type=dataset_type,
                                     manual_shape=config.manual_shape,
                                     target_column=config.field_size)
        else:
            ds_train = create_dataset(data_path,
                                      train_mode=True,
                                      epochs=1,
                                      batch_size=batch_size * get_group_size(),
                                      data_type=dataset_type)
            ds_eval = create_dataset(data_path,
                                     train_mode=False,
                                     epochs=1,
                                     batch_size=batch_size * get_group_size(),
                                     data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    # Save strategy checkpoints according to the rank id. This must be done
    # before the callbacks are initialized.
    config.stra_ckpt = os.path.join(
        config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config, per_print_times=20)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
        keep_checkpoint_max=5,
        integrated_save=False)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=os.path.join(
                                     config.ckpt_path,
                                     'ckpt_' + str(get_rank())),
                                 config=ckptconfig)

    context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(not sparse))
Example #18
def train_eval(config):
    """
    Train Wide&Deep, then evaluate the trained network from its checkpoint.
    """
    data_path = config.data_path + config.dataset_type
    ckpt_path = config.ckpt_path
    epochs = config.epochs
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    train_model = Model(train_net)
    train_callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    train_model.train(epochs,
                      ds_train,
                      callbacks=[
                          TimeMonitor(ds_train.get_dataset_size()),
                          train_callback, ckpoint_cb
                      ])

    # data download
    print('Download data from modelarts server to obs.')
    mox.file.copy_parallel(src_url=config.ckpt_path, dst_url=config.train_url)

    param_dict = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    eval_model = Model(train_net,
                       eval_network=eval_net,
                       metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(eval_model, ds_eval, auc_metric, config)

    eval_model.eval(ds_eval, callbacks=eval_callback)
Example #19
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep in parameter-server mode with an optional
    embedding cache.
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    # Assumption: the embedding cache is enabled when a vocab cache size is
    # configured, mirroring the parameter-server example above.
    cache_enable = config.vocab_cache_size > 0
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size * get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size * get_group_size(),
                                 data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=1,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    if cache_enable:
        config.stra_ckpt = os.path.join(
            config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")
        context.set_auto_parallel_context(
            strategy_ckpt_save_file=config.stra_ckpt)

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
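    # In parameter-server mode only worker roles use a real checkpoint schedule;
    # server and scheduler roles fall back to a minimal one.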
    if _is_role_worker():
        if cache_enable:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1,
                integrated_save=False)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))
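Example #20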
def train_and_eval(config):
    """
    Train and evaluate Wide&Deep under auto-parallel, optionally with full batch.
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=epochs,
                                  batch_size=batch_size * get_group_size())
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size * get_group_size())
    else:
        ds_train = create_dataset(data_path,
                                  train_mode=True,
                                  epochs=epochs,
                                  batch_size=batch_size,
                                  rank_id=get_rank(),
                                  rank_size=get_group_size())
        ds_eval = create_dataset(data_path,
                                 train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size,
                                 rank_id=get_rank(),
                                 rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
Example #21
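    # Fragment of a larger LeNet export script: model_path, lr, momentum,
    # epoch_size and output_file_formats() are assumed to be defined earlier.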
    model_file = model_path + "checkpoint_lenet-1_1875.ckpt"

    os.system('rm -f {0}*.ckpt {0}*.meta {0}*.pb'.format(model_path))

    network = LeNet5()
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                                 keep_checkpoint_max=16)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=model_path,
                                 config=config_ck)

    train_dataset = create_dataset("./datasets/MNIST_Data/train")
    eval_dataset = create_dataset("./datasets/MNIST_Data/test")
    print("===============start training===============")
    model.train(epoch_size,
                train_dataset,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)
    print(
        "===============saving models in mindir and onnx formats==============="
    )

    output_file_formats(model_file, network, [32, 1, 32, 32],
                        "checkpoint_lenet", "MINDIR")
    output_file_formats(model_file, network, [32, 1, 32, 32],
                        "checkpoint_lenet", "ONNX")