Ejemplo n.º 1
0
def do_train(dataset=None,
             network=None,
             load_checkpoint_path="",
             save_checkpoint_path="",
             epoch_num=1):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    # optimizer
    optimizer = Adam(network.trainable_params(),
                     learning_rate=optimizer_cfg.learning_rate)
    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(
        prefix="classifier",
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = BertFinetuneCell(network,
                                    optimizer=optimizer,
                                    scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [
        TimeMonitor(dataset.get_dataset_size()),
        LossCallBack(dataset.get_dataset_size()), ckpoint_cb
    ]
    model.train(epoch_num, dataset, callbacks=callbacks)
Ejemplo n.º 2
0
def train_and_eval(config):
    """
    test_train_eval
    """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                              batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1,
                             batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
Ejemplo n.º 3
0
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         enable_data_sink=args.enable_data_sink,
                                         dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                           keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path, config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=(args.enable_data_sink == "true"))
Ejemplo n.º 4
0
def test_train(configure):
    """
    test_train
    """
    data_path = configure.data_path
    batch_size = configure.batch_size
    epochs = configure.epochs
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=epochs,
                              batch_size=batch_size)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, _ = net_builder.get_net(configure)
    train_net.set_train()

    model = Model(train_net)
    callback = LossCallBack(config=configure)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=configure.ckpt_path,
                                 config=ckptconfig)
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), callback,
                    ckpoint_cb
                ])
Ejemplo n.º 5
0
def test_train_eval():
    """
    test_train_eval
    """
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=epochs, batch_size=batch_size,
                              data_type=DataType.MINDRECORD, rank_id=get_rank(), rank_size=get_group_size())
    ds_eval = create_dataset(data_path, train_mode=False, epochs=epochs + 1, batch_size=batch_size,
                             data_type=DataType.MINDRECORD, rank_id=get_rank(), rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback])
    eval_values = list(eval_callback.eval_values)
    assert eval_values[0] > 0.78
Ejemplo n.º 6
0
def train(model,
          dataset_direct,
          filename,
          columns_list,
          num_consumer=4,
          batch=16,
          epoch=50,
          save_checkpoint_steps=2172,
          keep_checkpoint_max=50,
          prefix="model",
          directory='./'):
    """
    train network
    """
    config_ck = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                 keep_checkpoint_max=keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=prefix,
                                 directory=directory,
                                 config=config_ck)
    data_train = create_dataset(dataset_direct, filename, batch, columns_list,
                                num_consumer)

    model.train(epoch,
                data_train,
                callbacks=[
                    ckpoint_cb,
                    LossMonitor(per_print_times=181),
                    TimeMonitor()
                ],
                dataset_sink_mode=True)
Ejemplo n.º 7
0
def train():
    context.set_context(
        mode=context.GRAPH_MODE,
        device_target="Ascend",
        #save_graphs=True,
        #save_graphs_path="/home/work/user-job-dir/EAST/",
        #enable_reduce_precision=False,
        #device_id=5
    )

    epoch = 600

    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')
    #my_dataset.data_to_mindrecord_byte_image(train_img_path, train_gt_path, mindrecord_dir='/cache', prefix='icdar_train.mindrecord',file_num=1)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file=['icdar_train.mindrecord0','icdar_train.mindrecord1','icdar_train.mindrecord2','icdar_train.mindrecord3'], batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=8, length=512, scale=0.25)
    #dataset = my_dataset.create_icdar_train_dataset(mindrecord_file='/cache/icdar_train.mindrecord', batch_size=32, repeat_num=epoch,
    #                            is_training=True, num_parallel_workers=24, length=512, scale=0.25)
    #dataset = my_dataset.create_demo_dataset(batch_size=21, repeat_num=2)
    #train_img_path = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/img')
    #train_gt_path  = os.path.abspath('/home/licheng/gpzlx1/ICDAR_2015/train/gt')
    dataset = datasetV2.create_icdar_train_dataset(train_img_path,
                                                   train_gt_path,
                                                   batch_size=14,
                                                   repeat_num=1,
                                                   is_training=True,
                                                   num_parallel_workers=24)
    #dataset = datasetV3.create_icdar_train_dataset(train_img_path, train_gt_path, batch_size=14, repeat_num=1, is_training=True, num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()

    print("Create dataset done!, dataset_size: ", dataset_size)

    #east = EAST.EAST()
    net = EAST_VGG.EAST()

    #ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * 20)
    #ckpoint_cb = ModelCheckpoint(prefix='EAST', directory='/cache', config=ckpt_config)

    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=lr)
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    callback = [TimeMonitor(data_size=dataset_size),
                LossMonitor()]  #, ckpoint_cb]

    model = Model(net)
    dataset_sink_mode = False
    print("start trainig")
    model.train(epoch,
                dataset,
                callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
Ejemplo n.º 8
0
def test_train_eval():
    """
    test_train_eval
    """
    np.random.seed(1000)
    config = WideDeepConfig()
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             rank_id=get_rank(),
                             rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
    expect_out0 = [0.792634, 0.799862, 0.803324]
    expect_out6 = [0.796580, 0.803908, 0.807262]
    if get_rank() == 0:
        assert np.allclose(eval_callback.eval_values, expect_out0)
    if get_rank() == 6:
        assert np.allclose(eval_callback.eval_values, expect_out6)
Ejemplo n.º 9
0
def train():
    rank_id = 0
    if args.run_distribute:
        context.set_auto_parallel_context(
            device_num=args.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            parameter_broadcast=True)
        init()
        rank_id = get_rank()

    # dataset/network/criterion/optim
    ds = train_dataset_creator(args.device_id, args.device_num)
    step_size = ds.get_dataset_size()
    print('Create dataset done!')

    config.INFERENCE = False
    net = ETSNet(config)
    net = net.set_train()
    param_dict = load_checkpoint(args.pre_trained)
    load_param_into_net(net, param_dict)
    print('Load Pretrained parameters done!')

    criterion = DiceLoss(batch_size=config.TRAIN_BATCH_SIZE)

    lrs = lr_generator(start_lr=1e-3,
                       lr_scale=0.1,
                       total_iters=config.TRAIN_TOTAL_ITER)
    opt = nn.SGD(params=net.trainable_params(),
                 learning_rate=lrs,
                 momentum=0.99,
                 weight_decay=5e-4)

    # warp model
    net = WithLossCell(net, criterion)
    if args.run_distribute:
        net = TrainOneStepCell(net,
                               opt,
                               reduce_flag=True,
                               mean=True,
                               degree=args.device_num)
    else:
        net = TrainOneStepCell(net, opt)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossCallBack(per_print_times=10)
    # set and apply parameters of check point config.TRAIN_MODEL_SAVE_PATH
    ckpoint_cf = CheckpointConfig(save_checkpoint_steps=1875,
                                  keep_checkpoint_max=2)
    ckpoint_cb = ModelCheckpoint(prefix="ETSNet",
                                 config=ckpoint_cf,
                                 directory="./ckpt_{}".format(rank_id))

    model = Model(net)
    model.train(config.TRAIN_REPEAT_NUM,
                ds,
                dataset_sink_mode=True,
                callbacks=[time_cb, loss_cb, ckpoint_cb])
Ejemplo n.º 10
0
def test_train_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    sparse = config.sparse
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)

    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ],
                dataset_sink_mode=(not sparse))
Ejemplo n.º 11
0
def train_and_eval(config):
    """
    train_and_eval
    """
    set_seed(1000)
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset,
                              rank_id=get_rank(),
                              rank_size=get_group_size())
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset,
                             rank_id=get_rank(),
                             rank_size=get_group_size())

    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    # Only save the last checkpoint at the last epoch. For saving epochs at each epoch, please
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size() * config.epochs,
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' +
                                 str(get_rank()) + '/',
                                 config=ckptconfig)
    callback_list = [
        TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback
    ]
    if int(get_rank()) == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs,
                ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
Ejemplo n.º 12
0
def run_predistill():
    """
    run predistill
    """
    cfg = phase1_cfg
    context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target, device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    load_teacher_checkpoint_path = args_opt.load_teacher_ckpt_path
    load_student_checkpoint_path = args_opt.load_gd_ckpt_path
    netwithloss = BertNetworkWithLoss_td(teacher_config=td_teacher_net_cfg, teacher_ckpt=load_teacher_checkpoint_path,
                                         student_config=td_student_net_cfg, student_ckpt=load_student_checkpoint_path,
                                         is_training=True, task_type='classification',
                                         num_labels=args_opt.num_labels, is_predistill=True)

    rank = 0
    device_num = 1
    dataset = create_tinybert_dataset('td', td_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.train_data_dir, args_opt.schema_dir)

    dataset_size = dataset.get_dataset_size()
    print('td1 dataset size: ', dataset_size)
    if args_opt.enable_data_sink == 'true':
        repeat_count = args_opt.td_phase1_epoch_size * dataset.get_dataset_size() // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.td_phase1_epoch_size
        time_monitor_steps = dataset_size

    optimizer_cfg = cfg.optimizer_cfg

    lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=int(dataset_size / 10),
                                   decay_steps=int(dataset_size * args_opt.td_phase1_epoch_size),
                                   power=optimizer_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
    other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
                    {'params': other_params, 'weight_decay': 0.0},
                    {'order_params': params}]

    optimizer = AdamWeightDecay(group_params, learning_rate=lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps)
    callback = [TimeMonitor(time_monitor_steps), LossCallBack(), ModelSaveCkpt(netwithloss.bert,
                                                                               args_opt.save_ckpt_step,
                                                                               args_opt.max_ckpt_num,
                                                                               td_phase1_save_ckpt_dir)]
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                             scale_factor=cfg.scale_factor,
                                             scale_window=cfg.scale_window)
    netwithgrads = BertEvaluationCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count, dataset, callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == 'true'),
                sink_size=args_opt.data_sink_steps)
Ejemplo n.º 13
0
def train_net(data_dir, seg_dir, run_distribute, config=None):
    if run_distribute:
        init()
        rank_id = get_rank()
        rank_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=rank_size,
                                          gradients_mean=True)
    else:
        rank_id = 0
        rank_size = 1
    # train_dataset = create_dataset(data_path=data_dir, seg_path=seg_dir, config=config, \
    #                                 rank_size=rank_size, rank_id=rank_id, is_training=True)
    train_dataset = create_dataset_diy()
    # for item in train_dataset:
    #     print(item)
    # exit(0)

    train_data_size = train_dataset.get_dataset_size()
    print("train dataset length is:", train_data_size)

    network = UNet3d(config=config)

    loss = SoftmaxCrossEntropyWithLogits()
    # loss = nn.DiceLoss()
    lr = Tensor(dynamic_lr(config, train_data_size), mstype.float32)
    optimizer = nn.Adam(params=network.trainable_params(), learning_rate=lr)
    scale_manager = FixedLossScaleManager(config.loss_scale,
                                          drop_overflow_update=False)
    network.set_train()

    model = Model(network,
                  loss_fn=loss,
                  optimizer=optimizer,
                  loss_scale_manager=scale_manager,
                  amp_level='O3')

    time_cb = TimeMonitor(data_size=train_data_size)
    loss_cb = LossMonitor(per_print_times=2)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(config.model),
                                 directory='./ckpt_{}/'.format(rank_size),
                                 config=ckpt_config)
    callbacks_list = [loss_cb, time_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                train_dataset,
                callbacks=callbacks_list,
                dataset_sink_mode=False)
    print("============== End Training ==============")
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' means that apply reduction of mean to loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net, amp_level="O2", keep_batchnorm_fp32=False, loss_fn=loss,
                  optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # define performance callback to show ips and loss callback to show loss for every epoch
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet", directory=local_ckpt_path, config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
def train_and_eval(config):
    """
    test_train_eval
    """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    host_device_mix = bool(config.host_device_mix)
    print("epochs is {}".format(epochs))
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size*get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size*get_group_size(), data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(), data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config, host_device_mix=host_device_mix)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path, config=ckptconfig)
    context.set_auto_parallel_context(strategy_ckpt_save_file="./strategy_train.ckpt")
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if not host_device_mix:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=(not host_device_mix))
Ejemplo n.º 16
0
def test_all_trains():
    ds_train = create_dataset(
        os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32,
        1)

    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())

    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(1, ds_train, callbacks=[time_cb, LossMonitor()])
Ejemplo n.º 17
0
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=optimizer_cfg.AdamWeightDecay.power)
        params = network.trainable_params()
        decay_params = list(filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: not optimizer_cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': optimizer_cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=optimizer_cfg.AdamWeightDecay.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(learning_rate=optimizer_cfg.Lamb.learning_rate,
                                       end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=optimizer_cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="ner",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(dataset.get_dataset_size()), ckpoint_cb]
    train_begin = time.time()
    model.train(epoch_num, dataset, callbacks=callbacks)
    train_end = time.time()
    print("latency: {:.6f} s".format(train_end - train_begin))
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(), data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(), data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)

    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
                                 config=ckptconfig)
    out = model.eval(ds_eval)
    print("=====" * 5 + "model.eval() initialized: {}".format(out))
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback]
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train,
                callbacks=callback_list,
                sink_size=ds_train.get_dataset_size())
Ejemplo n.º 19
0
def train_and_eval(config):
    """
    train_and_eval.
    """
    data_path = config.data_path
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=config.batch_size,
                              is_tf_dataset=config.is_tf_dataset)
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=config.batch_size,
                             is_tf_dataset=config.is_tf_dataset)

    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net,
                  eval_network=eval_net,
                  metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)

    model.train(epochs,
                ds_train,
                callbacks=[
                    TimeMonitor(ds_train.get_dataset_size()), eval_callback,
                    callback, ckpoint_cb
                ])
Ejemplo n.º 20
0
def train_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = nonquant_cfg
    ds_train = create_dataset(os.path.join(data_path, "train"),
                              cfg.batch_size)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="ckpt_lenet_noquant", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training Lenet==============")
    model.train(cfg['epoch_size'], ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=True)
Ejemplo n.º 21
0
def train_lenet5():
    """
    Train lenet5
    """
    config.ckpt_path = config.output_path
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=config.device_target)
    ds_train = create_lenet_dataset(os.path.join(config.data_path, "train"),
                                    config.batch_size,
                                    num_parallel_workers=1)
    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr,
                          config.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_steps,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(
        prefix="checkpoint_lenet",
        directory=None if config.ckpt_path == "" else config.ckpt_path,
        config=config_ck)

    if config.device_target != "Ascend":
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"Accuracy": Accuracy()})
    else:
        model = Model(network,
                      net_loss,
                      net_opt,
                      metrics={"Accuracy": Accuracy()},
                      amp_level="O2")

    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()])
def train_and_eval(config):
    """
    test_train_eval
    """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    cache_enable = config.vocab_cache_size > 0
    print("epochs is {}".format(epochs))
    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()

    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig)
    callback_list = [TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]

    model.train(epochs, ds_train,
                callbacks=callback_list,
                dataset_sink_mode=(parameter_server and cache_enable))
Ejemplo n.º 23
0
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path=""):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError("Pretrain model missed, finetune task must load pretrain model!")
    steps_per_epoch = dataset.get_dataset_size()
    epoch_num = dataset.get_repeat_count()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(network.trainable_params(),
                                             decay_steps=steps_per_epoch * epoch_num,
                                             learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.learning_rate,
                                             end_learning_rate=optimizer_cfg.AdamWeightDecayDynamicLR.end_learning_rate,
                                             power=optimizer_cfg.AdamWeightDecayDynamicLR.power,
                                             warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                             weight_decay=optimizer_cfg.AdamWeightDecayDynamicLR.weight_decay,
                                             eps=optimizer_cfg.AdamWeightDecayDynamicLR.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        optimizer = Lamb(network.trainable_params(), decay_steps=steps_per_epoch * epoch_num,
                         start_learning_rate=optimizer_cfg.Lamb.start_learning_rate,
                         end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
                         power=optimizer_cfg.Lamb.power, weight_decay=optimizer_cfg.Lamb.weight_decay,
                         warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                         decay_filter=optimizer_cfg.Lamb.decay_filter)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), learning_rate=optimizer_cfg.Momentum.learning_rate,
                             momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecayDynamicLR, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier", directory=save_checkpoint_path, config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertFinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(), ckpoint_cb]
    model.train(epoch_num, dataset, callbacks=callbacks)
Ejemplo n.º 24
0
            get_lr(global_step=0,
                   lr_init=config.lr_init,
                   lr_end=config.lr_end,
                   lr_max=config.lr_max,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size,
                   lr_decay_mode='poly'))
        opt = Momentum(filter(lambda x: x.requires_grad,
                              net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)

        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale,
                      metrics={'acc'})

        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_steps,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="resnet",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)
Ejemplo n.º 25
0
                      keep_batchnorm_fp32=False,
                      loss_scale_manager=None)
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                            num_classes=args.num_classes)

        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O2")

    # define callbacks
    time_cb = TimeMonitor(data_size=batch_num)
    loss_cb = LossMonitor(per_print_times=batch_num)
    callbacks = [time_cb, loss_cb]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, dataset, callbacks=callbacks)
Ejemplo n.º 26
0
    args = parser.parse_args()

    if args.device_target == "CPU":
        args.dataset_sink_mode = False

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    ds_train = create_dataset(os.path.join(args.data_path, "train"),
                              cfg.batch_size)

    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory=args.ckpt_path,
                                 config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(cfg['epoch_size'],
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()],
                dataset_sink_mode=args.dataset_sink_mode)
Ejemplo n.º 27
0
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target=args.platform)
    if args.platform == "Ascend":
        context.set_context(device_id=args.device_id)
        context.set_context(enable_graph_kernel=False)

    rank = 0
    if device_num > 1:
        if args.platform == "Ascend":
            init(backend_name='hccl')
        elif args.platform == "GPU":
            init()
        else:
            raise ValueError("Unsupported device target.")

        rank = get_rank()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True,
                                          all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path, do_train=True,
                                   repeat_num=1, batch_size=config.batch_size, shard_id=rank)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size, total_epochs=config.epoch_size))

    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape, param.data.dtype))
    group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay, epsilon=config.epsilon, weight_decay=config.weight_decay,
                  momentum=config.momentum, loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)


    if args.platform == "Ascend":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level=config.amp_level)
    elif args.platform == "GPU":
        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                      loss_scale_manager=loss_scale_manager, amp_level='O0')
    else:
        raise ValueError("Unsupported device target.")

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(save_checkpoint_steps=ckp_save_step, keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank), config=config_ck)
    callbacks = [performance_cb, loss_cb]
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks, dataset_sink_mode=True)
Ejemplo n.º 28
0
def train():
    """Train function."""

    args.outputs_dir = params['save_model_path']

    if args.group_size > 1:
        init()
        context.set_auto_parallel_context(
            device_num=get_group_size(),
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        args.outputs_dir = os.path.join(args.outputs_dir,
                                        "ckpt_{}/".format(str(get_rank())))
        args.rank = get_rank()
    else:
        args.outputs_dir = os.path.join(args.outputs_dir, "ckpt_0/")
        args.rank = 0

    if args.group_size > 1:
        args.max_epoch = params["max_epoch_train_NP"]
        args.loss_scale = params['loss_scale'] / 2
        args.lr_steps = list(map(int, params["lr_steps_NP"].split(',')))
        params['train_type'] = params['train_type_NP']
        params['optimizer'] = params['optimizer_NP']
        params['group_params'] = params['group_params_NP']
    else:
        args.max_epoch = params["max_epoch_train"]
        args.loss_scale = params['loss_scale']
        args.lr_steps = list(map(int, params["lr_steps"].split(',')))

    # create network
    print('start create network')
    criterion = openpose_loss()
    criterion.add_flags_recursive(fp32=True)
    network = OpenPoseNet(vggpath=params['vgg_path'],
                          vgg_with_bn=params['vgg_with_bn'])
    if params["load_pretrain"]:
        print("load pretrain model:", params["pretrained_model_path"])
        load_model(network, params["pretrained_model_path"])
    train_net = BuildTrainNetwork(network, criterion)

    # create dataset
    if os.path.exists(args.jsonpath_train) and os.path.exists(args.imgpath_train) \
            and os.path.exists(args.maskpath_train):
        print('start create dataset')
    else:
        print('Error: wrong data path')
        return 0

    num_worker = 20 if args.group_size > 1 else 48
    de_dataset_train = create_dataset(args.jsonpath_train,
                                      args.imgpath_train,
                                      args.maskpath_train,
                                      batch_size=params['batch_size'],
                                      rank=args.rank,
                                      group_size=args.group_size,
                                      num_worker=num_worker,
                                      multiprocessing=True,
                                      shuffle=True,
                                      repeat_num=1)
    steps_per_epoch = de_dataset_train.get_dataset_size()
    print("steps_per_epoch: ", steps_per_epoch)

    # lr scheduler
    lr_stage, lr_base, lr_vgg = get_lr(params['lr'] * args.group_size,
                                       params['lr_gamma'],
                                       steps_per_epoch,
                                       args.max_epoch,
                                       args.lr_steps,
                                       args.group_size,
                                       lr_type=params['lr_type'],
                                       warmup_epoch=params['warmup_epoch'])

    # optimizer
    if params['group_params']:
        vgg19_base_params = list(
            filter(lambda x: 'base.vgg_base' in x.name,
                   train_net.trainable_params()))
        base_params = list(
            filter(lambda x: 'base.conv' in x.name,
                   train_net.trainable_params()))
        stages_params = list(
            filter(lambda x: 'base' not in x.name,
                   train_net.trainable_params()))

        group_params = [{
            'params': vgg19_base_params,
            'lr': lr_vgg
        }, {
            'params': base_params,
            'lr': lr_base
        }, {
            'params': stages_params,
            'lr': lr_stage
        }]

        if params['optimizer'] == "Momentum":
            opt = Momentum(group_params, learning_rate=lr_stage, momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(group_params)
        else:
            raise ValueError("optimizer not support.")
    else:
        if params['optimizer'] == "Momentum":
            opt = Momentum(train_net.trainable_params(),
                           learning_rate=lr_stage,
                           momentum=0.9)
        elif params['optimizer'] == "Adam":
            opt = Adam(train_net.trainable_params(), learning_rate=lr_stage)
        else:
            raise ValueError("optimizer not support.")

    # callback
    config_ck = CheckpointConfig(
        save_checkpoint_steps=params['ckpt_interval'],
        keep_checkpoint_max=params["keep_checkpoint_max"])
    ckpoint_cb = ModelCheckpoint(prefix='{}'.format(args.rank),
                                 directory=args.outputs_dir,
                                 config=config_ck)
    time_cb = TimeMonitor(data_size=de_dataset_train.get_dataset_size())
    if args.rank == 0:
        callback_list = [MyLossMonitor(), time_cb, ckpoint_cb]
    else:
        callback_list = [MyLossMonitor(), time_cb]

    # train
    if params['train_type'] == 'clip_grad':
        train_net = TrainOneStepWithClipGradientCell(train_net,
                                                     opt,
                                                     sens=args.loss_scale)
        train_net.set_train()
        model = Model(train_net)
    elif params['train_type'] == 'fix_loss_scale':
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        train_net.set_train()
        model = Model(train_net,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Type {} is not support.".format(
            params['train_type']))

    print("============== Starting Training ==============")
    model.train(args.max_epoch,
                de_dataset_train,
                callbacks=callback_list,
                dataset_sink_mode=False)
    return 0
Ejemplo n.º 29
0
    loss = LossNet()
    lr = Tensor(dynamic_lr(training_cfg, dataset_size), mstype.float32)
    opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,\
        weight_decay=config.weight_decay, loss_scale=config.loss_scale)
    net_with_loss = WithLossCell(net, loss)
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss,
                               opt,
                               sens=config.loss_scale,
                               reduce_flag=True,
                               mean=True,
                               degree=device_num)
    else:
        net = TrainOneStepCell(net_with_loss, opt, sens=config.loss_scale)

    time_cb = TimeMonitor(data_size=dataset_size)
    loss_cb = LossCallBack(rank_id=rank)
    cb = [time_cb, loss_cb]
    if config.save_checkpoint:
        ckptconfig = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
            keep_checkpoint_max=config.keep_checkpoint_max)
        save_checkpoint_path = os.path.join(config.save_checkpoint_path,
                                            "ckpt_" + str(rank) + "/")
        ckpoint_cb = ModelCheckpoint(prefix='ctpn',
                                     directory=save_checkpoint_path,
                                     config=ckptconfig)
        cb += [ckpoint_cb]

    model = Model(net)
    model.train(training_cfg.total_epoch,
Ejemplo n.º 30
0
def run_general_distill():
    """
    run general distill
    """
    parser = argparse.ArgumentParser(description='tinybert general distill')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="3",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--save_ckpt_step",
                        type=int,
                        default=100,
                        help="Enable data sink, default is true.")
    parser.add_argument("--max_ckpt_num",
                        type=int,
                        default=1,
                        help="Enable data sink, default is true.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default=1,
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_ckpt_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_teacher_ckpt_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")

    save_ckpt_dir = os.path.join(
        args_opt.save_ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))

    if not os.path.exists(save_ckpt_dir):
        os.makedirs(save_ckpt_dir)

    if args_opt.distribute == "true":
        D.init('hccl')
        device_num = args_opt.device_num
        rank = args_opt.device_id % device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
    else:
        rank = 0
        device_num = 1

    netwithloss = BertNetworkWithLoss_gd(
        teacher_config=bert_teacher_net_cfg,
        teacher_ckpt=args_opt.load_teacher_ckpt_path,
        student_config=bert_student_net_cfg,
        is_training=True,
        use_one_hot_embeddings=False)

    dataset = create_tinybert_dataset('gd', bert_teacher_net_cfg.batch_size,
                                      device_num, rank, args_opt.do_shuffle,
                                      args_opt.data_dir, args_opt.schema_dir)

    dataset_size = dataset.get_dataset_size()
    print('dataset size: ', dataset_size)
    if args_opt.enable_data_sink == "true":
        repeat_count = args_opt.epoch_size * dataset.get_dataset_size(
        ) // args_opt.data_sink_steps
        time_monitor_steps = args_opt.data_sink_steps
    else:
        repeat_count = args_opt.epoch_size
        time_monitor_steps = dataset_size

    lr_schedule = BertLearningRate(
        learning_rate=common_cfg.AdamWeightDecay.learning_rate,
        end_learning_rate=common_cfg.AdamWeightDecay.end_learning_rate,
        warmup_steps=int(dataset_size * args_opt.epoch_size / 10),
        decay_steps=int(dataset_size * args_opt.epoch_size),
        power=common_cfg.AdamWeightDecay.power)
    params = netwithloss.trainable_params()
    decay_params = list(filter(common_cfg.AdamWeightDecay.decay_filter,
                               params))
    other_params = list(
        filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
    group_params = [{
        'params': decay_params,
        'weight_decay': common_cfg.AdamWeightDecay.weight_decay
    }, {
        'params': other_params,
        'weight_decay': 0.0
    }, {
        'order_params': params
    }]

    optimizer = AdamWeightDecay(group_params,
                                learning_rate=lr_schedule,
                                eps=common_cfg.AdamWeightDecay.eps)

    callback = [
        TimeMonitor(time_monitor_steps),
        LossCallBack(),
        ModelSaveCkpt(netwithloss.bert, args_opt.save_ckpt_step,
                      args_opt.max_ckpt_num, save_ckpt_dir)
    ]

    update_cell = DynamicLossScaleUpdateCell(
        loss_scale_value=common_cfg.loss_scale_value,
        scale_factor=common_cfg.scale_factor,
        scale_window=common_cfg.scale_window)

    netwithgrads = BertTrainWithLossScaleCell(netwithloss,
                                              optimizer=optimizer,
                                              scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(repeat_count,
                dataset,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)