Example #1
    # define lr
    lr_init = cf.learning_rate if not args_opt.run_distribute else cf.learning_rate * args_opt.device_num
    lr = get_lr(cf.epoch_size, step_size, lr_init)
    # define loss
    loss = CTCLoss(max_sequence_length=cf.captcha_width,
                   max_label_length=max_captcha_digits,
                   batch_size=cf.batch_size)
    # define net
    net = StackedRNN(input_size=input_size,
                     batch_size=cf.batch_size,
                     hidden_size=cf.hidden_size)
    # define opt
    opt = nn.SGD(params=net.trainable_params(),
                 learning_rate=lr,
                 momentum=cf.momentum)
    net = WithLossCell(net, loss)
    net = TrainOneStepCellWithGradClip(net, opt).set_train()
    # define model
    model = Model(net)
    # define callbacks
    callbacks = [LossMonitor(), TimeMonitor(data_size=step_size)]
    if cf.save_checkpoint:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=cf.save_checkpoint_steps,
            keep_checkpoint_max=cf.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="waptctc",
                                  directory=cf.save_checkpoint_path,
                                  config=config_ck)
        callbacks.append(ckpt_cb)
    model.train(cf.epoch_size, dataset, callbacks=callbacks)
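Note: get_lr is defined elsewhere in this training script and is not shown in the excerpt. A minimal sketch of such a per-step schedule builder, assuming an exponential per-epoch decay (the schedule shape and the decay_rate default are assumptions, not the original implementation):

import numpy as np

def get_lr(epoch_size, step_size, lr_init, decay_rate=0.9):
    """Hypothetical helper: build one learning-rate value per training step."""
    lr_each_step = []
    for epoch in range(epoch_size):
        lr = lr_init * (decay_rate ** epoch)  # decay once per epoch
        lr_each_step.extend([lr] * step_size)
    return np.array(lr_each_step, dtype=np.float32)

MindSpore optimizers accept such a per-step array (or a Tensor built from it) as a dynamic learning rate.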
Example #2
def train_eval(config):
    """
    test evaluate
    """
    data_path = config.data_path + config.dataset_type
    ckpt_path = config.ckpt_path
    epochs = config.epochs
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    ds_train = create_dataset(data_path,
                              train_mode=True,
                              epochs=1,
                              batch_size=batch_size,
                              data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    ds_eval = create_dataset(data_path,
                             train_mode=False,
                             epochs=1,
                             batch_size=batch_size,
                             data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    train_model = Model(train_net)
    train_callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    train_model.train(epochs,
                      ds_train,
                      callbacks=[
                          TimeMonitor(ds_train.get_dataset_size()),
                          train_callback, ckpoint_cb
                      ])

    # upload checkpoints to OBS
    print('Upload checkpoints from the ModelArts server to OBS.')
    mox.file.copy_parallel(src_url=config.ckpt_path, dst_url=config.train_url)

    param_dict = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    eval_model = Model(train_net,
                       eval_network=eval_net,
                       metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(eval_model, ds_eval, auc_metric, config)

    eval_model.eval(ds_eval, callbacks=eval_callback)
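find_ckpt is not defined in this excerpt; a plausible sketch that returns the newest .ckpt file under ckpt_path (the name and behavior are assumptions):

import glob
import os

def find_ckpt(ckpt_path):
    """Hypothetical helper: return the most recently written .ckpt file."""
    ckpt_files = glob.glob(os.path.join(ckpt_path, '*.ckpt'))
    if not ckpt_files:
        raise FileNotFoundError('no .ckpt file found in {}'.format(ckpt_path))
    return max(ckpt_files, key=os.path.getmtime)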
Example #3
                step_size_per_epoch)
    lr = Tensor(lr)

    if args.pre_trained_model_path != '':
        param_dict = load_checkpoint(args.pre_trained_model_path)
        load_param_into_net(model, param_dict)
        print('Successfully loaded the pre-trained model')

    weights = model.trainable_params()
    optimizer = Adam(weights, learning_rate=lr, loss_scale=1024.)
    train_net = TrainOneStepCell(loss_net, optimizer)

    model = Model(train_net)
    lr_cb = Monitor(lr)
    callback_list = [lr_cb]
    if args.is_distributed:
        ckpt_path = os.path.join(args.checkpoint_dir,
                                 'ckpt_' + str(get_rank()) + '/')
    else:
        ckpt_path = args.checkpoint_dir
    config_ck = CheckpointConfig(save_checkpoint_steps=step_size_per_epoch,
                                 keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='wavenet',
                              directory=ckpt_path,
                              config=config_ck)
    callback_list.append(ckpt_cb)
    model.train(hparams.nepochs,
                data_loaders,
                callbacks=callback_list,
                dataset_sink_mode=False)
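The Monitor callback used here is project-specific. A minimal sketch built on MindSpore's Callback API (the constructor argument and the print format are assumptions):

from mindspore.train.callback import Callback

class Monitor(Callback):
    """Hypothetical callback: print the loss after every step."""

    def __init__(self, lr):
        super(Monitor, self).__init__()
        self.lr = lr

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        print('epoch: {}, step: {}, loss: {}'.format(
            cb_params.cur_epoch_num, cb_params.cur_step_num,
            cb_params.net_outputs))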
Example #4
                       learning_rate=Tensor(lr),
                       momentum=args.momentum,
                       weight_decay=args.weight_decay,
                       loss_scale=args.loss_scale)

    network = TrainingWrapper(network, opt, sens=args.loss_scale)
    network.set_train()

    if args.rank_save_ckpt_flag:
        # checkpoint save
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval,
            keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        cb_params = _InternalCallbackParam()
        cb_params.train_network = network
        cb_params.epoch_num = ckpt_max_num
        cb_params.cur_epoch_num = 1
        run_context = RunContext(cb_params)
        ckpt_cb.begin(run_context)

        args.logger.info(
            'args.steps_per_epoch = {} args.ckpt_interval ={}'.format(
                args.steps_per_epoch, args.ckpt_interval))

    t_end = time.time()

    for i_all, batch_load in enumerate(data_loader):
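        # (Sketch) The loop body is truncated in this excerpt. Because ModelCheckpoint
        # is driven manually here rather than through Model.train, each iteration has
        # to update the callback parameters and call step_end itself. The lines below
        # are an assumed reconstruction, not the original code.
        loss = network(*batch_load)              # one forward/backward step
        if args.rank_save_ckpt_flag:
            cb_params.cur_step_num = i_all + 1   # keep callback state in sync
            ckpt_cb.step_end(run_context)        # saves every args.ckpt_interval steps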
Example #5
    max_text_length = config.max_text_length
    
    # create dataset
    dataset = create_dataset(img_dir=os.path.join(dataset_path, 'images'),
                             label_dir=os.path.join(dataset_path, 'labels'),
                             batch_size=config.batch_size,
                             num_shards=device_num,
                             shard_id=rank,
                             config=config)
    step_size = dataset.get_dataset_size()
    lr_init = config.learning_rate
    lr = nn.dynamic_lr.cosine_decay_lr(0.0, lr_init,
                                       config.epoch_size * step_size,
                                       step_size, config.epoch_size)
    loss = CTCLoss(max_sequence_length=config.num_step,
                   max_label_length=max_text_length,
                   batch_size=config.batch_size)
    net = crnn(config)
    opt = nn.SGD(params=net.trainable_params(),
                 learning_rate=lr,
                 momentum=config.momentum,
                 nesterov=config.nesterov)

    net = WithLossCell(net, loss)
    net = TrainOneStepCellWithGradClip(net, opt).set_train()
    # define model
    model = Model(net)
    # define callbacks
    callbacks = [LossMonitor(), TimeMonitor(data_size=step_size)]
    if config.save_checkpoint and rank == 0:
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
        ckpt_cb = ModelCheckpoint(prefix="crnn", directory=save_ckpt_path, config=config_ck)
        callbacks.append(ckpt_cb)
    model.train(config.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=False)
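nn.dynamic_lr.cosine_decay_lr(min_lr, max_lr, total_step, step_per_epoch, decay_epoch) returns a plain Python list with one learning rate per step; a quick check with assumed numbers:

from mindspore import nn

# 10 epochs of 100 steps each, decaying from 0.02 toward 0.0
lr = nn.dynamic_lr.cosine_decay_lr(0.0, 0.02, 10 * 100, 100, 10)
print(len(lr), lr[0], lr[-1])  # 1000 values, starting at 0.02 and ending near 0.0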


Example #6
                     learning_rate=config.OptimConfig.learning_rate,
                     eps=config.OptimConfig.epsilon,
                     loss_scale=config.OptimConfig.loss_scale)
    train_net = TrainOneStepCell(loss_net, optimizer)
    train_net.set_train(True)
    if args.pre_trained_model_path != '':
        param_dict = load_checkpoint(args.pre_trained_model_path)
        load_param_into_net(train_net, param_dict)
        print('Successfully loaded the pre-trained model')

    model = Model(train_net)
    lr_cb = Monitor(lr)
    callback_list = [lr_cb]

    if args.is_distributed:
        config.CheckpointConfig.ckpt_file_name_prefix = config.CheckpointConfig.ckpt_file_name_prefix + str(
            get_rank())
        config.CheckpointConfig.ckpt_path = os.path.join(
            config.CheckpointConfig.ckpt_path, 'ckpt_' + str(get_rank()) + '/')
    config_ck = CheckpointConfig(
        save_checkpoint_steps=1,
        keep_checkpoint_max=config.CheckpointConfig.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(
        prefix=config.CheckpointConfig.ckpt_file_name_prefix,
        directory=config.CheckpointConfig.ckpt_path,
        config=config_ck)
    callback_list.append(ckpt_cb)
    model.train(config.TrainingConfig.epochs,
                ds_train,
                callbacks=callback_list)
Example #7
                   lr_end=config.lr_end,
                   lr_max=config.lr_max,
                   warmup_epochs=config.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size,
                   lr_decay_mode='poly'))
        opt = Momentum(filter(lambda x: x.requires_grad,
                              net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)

        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale,
                      metrics={'acc'},
                      amp_level="O2",
                      keep_batchnorm_fp32=False)

        time_cb = TimeMonitor(data_size=step_size)
        loss_cb = LossMonitor()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_steps,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="resnet",
                                      directory=config.save_checkpoint_path,
                                      config=config_ck)
            cb += [ckpt_cb]
        model.train(epoch_size, dataset, callbacks=cb)
Example #8
                      loss_scale_manager=None)
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                            num_classes=args.num_classes)

        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O2")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, dataset, callbacks=callbacks)
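CrossEntropy with a smooth factor is the model zoo's label-smoothing loss; a sketch of how such a loss is typically assembled (the exact reduction details are assumptions):

import mindspore.nn as nn
import mindspore.ops as ops
import mindspore.common.dtype as mstype
from mindspore import Tensor

class CrossEntropy(nn.Cell):
    """Hypothetical label-smoothing cross-entropy loss."""

    def __init__(self, smooth_factor=0.0, num_classes=1000):
        super(CrossEntropy, self).__init__()
        self.onehot = ops.OneHot()
        self.shape = ops.Shape()
        self.on_value = Tensor(1.0 - smooth_factor, mstype.float32)
        self.off_value = Tensor(smooth_factor / (num_classes - 1), mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = ops.ReduceMean(False)

    def construct(self, logits, label):
        one_hot_label = self.onehot(label, self.shape(logits)[1],
                                    self.on_value, self.off_value)
        loss = self.ce(logits, one_hot_label)
        return self.mean(loss, 0)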
Example #9
    else:
        if not args.label_smooth:
            args.label_smooth_factor = 0.0
        loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                            num_classes=args.num_classes)

        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      amp_level="O2")

    # define callbacks
    time_cb = TimeMonitor(data_size=batch_num)
    loss_cb = LossMonitor(per_print_times=batch_num)
    callbacks = [time_cb, loss_cb]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, dataset, callbacks=callbacks)
Example #10
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None,
                             platform="Ascend"):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of mass model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()

    if config.existed_ckpt:
        if config.existed_ckpt.endswith(".npz"):
            weights = np.load(config.existed_ckpt)
        else:
            weights = load_checkpoint(config.existed_ckpt)
        for param in net_with_loss.trainable_params():
            weights_name = param.name
            if weights_name not in weights:
                raise ValueError(
                    f"Param {weights_name} is not found in ckpt file.")

            if isinstance(weights[weights_name], Parameter):
                param.default_input = weights[weights_name].default_input
            elif isinstance(weights[weights_name], Tensor):
                param.default_input = Tensor(weights[weights_name].asnumpy(),
                                             config.dtype)
            elif isinstance(weights[weights_name], np.ndarray):
                param.default_input = Tensor(weights[weights_name],
                                             config.dtype)
            else:
                param.default_input = weights[weights_name]
    else:
        for param in net_with_loss.trainable_params():
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    param.default_input = one_weight(value.asnumpy().shape)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    param.default_input = zero_weight(value.asnumpy().shape)
                else:
                    param.default_input = weight_variable(
                        value.asnumpy().shape)

    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset

    if dataset is None:
        raise ValueError(
            "Either a pre-training dataset or a fine-tuning dataset must be provided.")

    update_steps = dataset.get_repeat_count() * dataset.get_dataset_size()
    if config.lr_scheduler == "isr":
        lr = Tensor(square_root_schedule(
            lr=config.lr,
            update_num=update_steps,
            decay_start_step=config.decay_start_step,
            warmup_steps=config.warmup_steps,
            min_lr=config.min_lr),
                    dtype=mstype.float32)
    elif config.lr_scheduler == "poly":
        lr = Tensor(polynomial_decay_scheduler(
            lr=config.lr,
            min_lr=config.min_lr,
            decay_steps=config.decay_steps,
            total_update_num=update_steps,
            warmup_steps=config.warmup_steps,
            power=config.poly_lr_scheduler_power),
                    dtype=mstype.float32)
    else:
        lr = config.lr

    if config.optimizer.lower() == "adam":
        optimizer = Adam(net_with_loss.trainable_params(),
                         lr,
                         beta1=0.9,
                         beta2=0.98)
    elif config.optimizer.lower() == "lamb":
        lr = BertLearningRate(decay_steps=12000,
                              learning_rate=config.lr,
                              end_learning_rate=config.min_lr,
                              power=10.0,
                              warmup_steps=config.warmup_steps)
        decay_params = list(
            filter(
                lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x
                .name.lower(), net_with_loss.trainable_params()))
        other_params = list(
            filter(
                lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.
                lower(), net_with_loss.trainable_params()))
        group_params = [{
            'params': decay_params,
            'weight_decay': 0.01
        }, {
            'params': other_params
        }]

        optimizer = Lamb(group_params, lr, eps=1e-6)
    elif config.optimizer.lower() == "momentum":
        optimizer = Momentum(net_with_loss.trainable_params(),
                             lr,
                             momentum=0.9)
    else:
        raise ValueError(f"optimizer only support `adam` and `momentum` now.")

    # loss scale.
    if platform == "Ascend":
        scale_manager = DynamicLossScaleManager(
            init_loss_scale=config.init_loss_scale,
            scale_factor=config.loss_scale_factor,
            scale_window=config.scale_window)
    else:
        scale_manager = FixedLossScaleManager(loss_scale=1.0,
                                              drop_overflow_update=True)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(
        network=net_with_loss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.save_ckpt_steps,
        keep_checkpoint_max=config.keep_ckpt_max)

    rank_size = os.getenv('RANK_SIZE')
    callbacks = [loss_monitor]
    if rank_size is not None and int(
            rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    print(f" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model,
           config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
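one_weight, zero_weight and weight_variable are initialization helpers from the same repository; plausible minimal versions (the normal-distribution parameters are assumptions):

import numpy as np
from mindspore import Tensor

def one_weight(shape):
    """Hypothetical helper: all-ones init, e.g. for LayerNorm gamma."""
    return Tensor(np.ones(shape).astype(np.float32))

def zero_weight(shape):
    """Hypothetical helper: all-zeros init for beta and bias."""
    return Tensor(np.zeros(shape).astype(np.float32))

def weight_variable(shape):
    """Hypothetical helper: small random-normal init for other weights."""
    return Tensor(np.random.normal(0.0, 0.02, shape).astype(np.float32))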
Example #11
            get_lr(global_step=0,
                   lr_init=0,
                   lr_end=0,
                   lr_max=config_gpu.lr,
                   warmup_epochs=config_gpu.warmup_epochs,
                   total_epochs=epoch_size,
                   steps_per_epoch=step_size))
        opt = Momentum(filter(lambda x: x.requires_grad,
                              net.get_parameters()), lr, config_gpu.momentum,
                       config_gpu.weight_decay, config_gpu.loss_scale)
        # define model
        model = Model(net,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale)

        cb = [Monitor(lr_init=lr.asnumpy())]
        ckpt_save_dir = config_gpu.save_checkpoint_path + "ckpt_" + str(
            get_rank()) + "/"
        if config_gpu.save_checkpoint:
            config_ck = CheckpointConfig(
                save_checkpoint_steps=config_gpu.save_checkpoint_epochs *
                step_size,
                keep_checkpoint_max=config_gpu.keep_checkpoint_max)
            ckpt_cb = ModelCheckpoint(prefix="mobilenetV3",
                                      directory=ckpt_save_dir,
                                      config=config_ck)
            cb += [ckpt_cb]
        # begin training
        model.train(epoch_size, dataset, callbacks=cb)
Example #12
    lr = Tensor(lr)

    # define optimization
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                   config.momentum, config.weight_decay, config.loss_scale)

    # define model
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    print("============== Starting Training ==============")
    time_callback = TimeMonitor(data_size=step_size)
    loss_callback = LossMonitor()
    callbacks = [time_callback, loss_callback]
    if rank_id == 0:
        if config.save_checkpoint:
            config_ckpt = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs *
                step_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpt_callback = ModelCheckpoint(
                prefix="ResNet50",
                directory=config.save_checkpoint_path,
                config=config_ckpt)
            callbacks += [ckpt_callback]
    model.train(epoch_size, dataset, callbacks=callbacks)
    print("============== End Training ==============")
Example #13
        net_with_loss = WithLossCell(net, loss)
        if args_opt.run_distribute:
            net = TrainOneStepCell(net_with_loss,
                                   net,
                                   opt,
                                   sens=config.loss_scale,
                                   reduce_flag=True,
                                   mean=True,
                                   degree=device_num)
        else:
            net = TrainOneStepCell(net_with_loss,
                                   net,
                                   opt,
                                   sens=config.loss_scale)

        time_cb = TimeMonitor(data_size=dataset_size)
        loss_cb = LossCallBack()
        cb = [time_cb, loss_cb]
        if config.save_checkpoint:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=config.save_checkpoint_epochs *
                dataset_size,
                keep_checkpoint_max=config.keep_checkpoint_max)
            ckpoint_cb = ModelCheckpoint(prefix='mask_rcnn',
                                         directory=config.save_checkpoint_path,
                                         config=ckptconfig)
            cb += [ckpoint_cb]

        model = Model(net)
        model.train(config.epoch_size, dataset, callbacks=cb)
Example #14
def train(cfg):

    context.set_context(mode=context.GRAPH_MODE, device_target='GPU', save_graphs=False)
    if cfg['ngpu'] > 1:
        init("nccl")
        context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        cfg['ckpt_path'] = cfg['ckpt_path'] + "ckpt_" + str(get_rank()) + "/"
    else:
        raise ValueError("cfg['ngpu'] must be greater than 1 for this distributed training script.")

    batch_size = cfg['batch_size']
    max_epoch = cfg['epoch']

    momentum = cfg['momentum']
    weight_decay = cfg['weight_decay']
    initial_lr = cfg['initial_lr']
    gamma = cfg['gamma']
    training_dataset = cfg['training_dataset']
    num_classes = 2
    negative_ratio = 7
    stepvalues = (cfg['decay1'], cfg['decay2'])

    ds_train = create_dataset(training_dataset, cfg, batch_size, multiprocessing=True, num_worker=cfg['num_workers'])
    print('dataset size is:', ds_train.get_dataset_size())

    steps_per_epoch = ds_train.get_dataset_size()  # get_dataset_size() already returns an integer step count

    multibox_loss = MultiBoxLoss(num_classes, cfg['num_anchor'], negative_ratio, cfg['batch_size'])
    backbone = resnet50(1001)
    backbone.set_train(True)

    if cfg['pretrain'] and cfg['resume_net'] is None:
        pretrained_res50 = cfg['pretrain_path']
        param_dict_res50 = load_checkpoint(pretrained_res50)
        load_param_into_net(backbone, param_dict_res50)
        print('Load resnet50 from [{}] done.'.format(pretrained_res50))

    net = RetinaFace(phase='train', backbone=backbone)
    net.set_train(True)

    if cfg['resume_net'] is not None:
        pretrain_model_path = cfg['resume_net']
        param_dict_retinaface = load_checkpoint(pretrain_model_path)
        load_param_into_net(net, param_dict_retinaface)
        print('Resume Model from [{}] Done.'.format(cfg['resume_net']))

    net = RetinaFaceWithLossCell(net, multibox_loss, cfg)

    lr = adjust_learning_rate(initial_lr, gamma, stepvalues, steps_per_epoch, max_epoch,
                              warmup_epoch=cfg['warmup_epoch'])

    if cfg['optim'] == 'momentum':
        opt = mindspore.nn.Momentum(net.trainable_params(), lr, momentum)
    elif cfg['optim'] == 'sgd':
        opt = mindspore.nn.SGD(params=net.trainable_params(), learning_rate=lr, momentum=momentum,
                               weight_decay=weight_decay, loss_scale=1)
    else:
        raise ValueError('optim is not defined.')

    net = TrainingWrapper(net, opt)

    model = Model(net)

    config_ck = CheckpointConfig(save_checkpoint_steps=cfg['save_checkpoint_steps'],
                                 keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix="RetinaFace", directory=cfg['ckpt_path'], config=config_ck)

    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    callback_list = [LossMonitor(), time_cb, ckpoint_cb]

    print("============== Starting Training ==============")
    model.train(max_epoch, ds_train, callbacks=callback_list,
                dataset_sink_mode=True)
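adjust_learning_rate produces the per-step schedule passed to the optimizer; a sketch with linear warm-up followed by a gamma decay at each entry of stepvalues (the warm-up shape is an assumption):

import numpy as np

def adjust_learning_rate(initial_lr, gamma, stepvalues, steps_per_epoch,
                         total_epochs, warmup_epoch=5):
    """Hypothetical schedule: linear warm-up, then step decay."""
    warmup_steps = warmup_epoch * steps_per_epoch
    lr_each_step = []
    for step in range(total_epochs * steps_per_epoch):
        epoch = step // steps_per_epoch + 1
        if step < warmup_steps:
            lr = initial_lr * (step + 1) / warmup_steps
        else:
            lr = initial_lr * gamma ** sum(epoch >= v for v in stepvalues)
        lr_each_step.append(lr)
    return np.array(lr_each_step, dtype=np.float32)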
Example #15
def test_checkpoint_save_ckpt_seconds():
    """Test checkpoint save ckpt seconds."""
    train_config = CheckpointConfig(save_checkpoint_steps=16,
                                    save_checkpoint_seconds=100,
                                    keep_checkpoint_max=0,
                                    keep_checkpoint_per_n_minutes=1)
    ckpt_cb = ModelCheckpoint(config=train_config)
    cb_params = _InternalCallbackParam()
    net = Net()
    loss = nn.SoftmaxCrossEntropyWithLogits()
    optim = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    network_ = WithLossCell(net, loss)
    _train_network = TrainOneStepCell(network_, optim)
    cb_params.train_network = _train_network
    cb_params.epoch_num = 10
    cb_params.cur_epoch_num = 4
    cb_params.cur_step_num = 128
    cb_params.batch_num = 32
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)
    ckpt_cb.step_end(run_context)
    ckpt_cb2 = ModelCheckpoint(config=train_config)
    cb_params.cur_epoch_num = 1
    cb_params.cur_step_num = 16
    ckpt_cb2.begin(run_context)
    ckpt_cb2.step_end(run_context)
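Net in this test is whatever small network the test suite defines; a minimal stand-in (an assumption) that is compatible with WithLossCell and SoftmaxCrossEntropyWithLogits:

import mindspore.nn as nn

class Net(nn.Cell):
    """Hypothetical test network: a single dense layer."""

    def __init__(self):
        super(Net, self).__init__()
        self.dense = nn.Dense(10, 5)

    def construct(self, x):
        return self.dense(x)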
Example #16
                        help='dataset_sink_mode is False or True')
    args = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)

    network = AlexNet(cfg.num_classes)
    loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                            sparse=True,
                                            reduction="mean")
    lr = Tensor(
        get_lr(0, cfg.learning_rate, cfg.epoch_size,
               cfg.save_checkpoint_steps))
    opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum)
    model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    ds_train = create_dataset(args.data_path, cfg.batch_size, cfg.epoch_size)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet",
                                 directory=args.ckpt_path,
                                 config=config_ck)
    model.train(cfg.epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()],
                dataset_sink_mode=args.dataset_sink_mode)
Example #17
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse_init()
    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    is_auto_enable_graph_kernel = _auto_enable_graph_kernel(
        args_opt.device_target, args_opt.enable_graph_kernel)
    _set_graph_kernel_context(args_opt.device_target,
                              args_opt.enable_graph_kernel,
                              is_auto_enable_graph_kernel)
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init()
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init()
            device_num = D.get_group_size()
            rank = D.get_rank()
        ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
            get_rank()) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            device_num=device_num)
        _set_bert_all_reduce_split()
    else:
        rank = 0
        device_num = 1

    _check_compute_type(args_opt, is_auto_enable_graph_kernel)

    if args_opt.accumulation_steps > 1:
        logger.info("accumulation steps: {}".format(
            args_opt.accumulation_steps))
        logger.info("global batch size: {}".format(
            cfg.batch_size * args_opt.accumulation_steps))
        if args_opt.enable_data_sink == "true":
            args_opt.data_sink_steps *= args_opt.accumulation_steps
            logger.info("data sink steps: {}".format(args_opt.data_sink_steps))
        if args_opt.enable_save_ckpt == "true":
            args_opt.save_checkpoint_steps *= args_opt.accumulation_steps
            logger.info("save checkpoint steps: {}".format(
                args_opt.save_checkpoint_steps))

    ds = create_bert_dataset(device_num, rank, args_opt.do_shuffle,
                             args_opt.data_dir, args_opt.schema_dir)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = (args_opt.epoch_size * ds.get_dataset_size()
                        // args_opt.data_sink_steps)
    if args_opt.train_steps > 0:
        train_steps = args_opt.train_steps * args_opt.accumulation_steps
        new_repeat_count = min(new_repeat_count,
                               train_steps // args_opt.data_sink_steps)
    else:
        args_opt.train_steps = (args_opt.epoch_size * ds.get_dataset_size()
                                // args_opt.accumulation_steps)
        logger.info("train steps: {}".format(args_opt.train_steps))

    optimizer = _get_optimizer(args_opt, net_with_loss)
    callback = [
        TimeMonitor(args_opt.data_sink_steps),
        LossCallBack(ds.get_dataset_size())
    ]
    if args_opt.enable_save_ckpt == "true" and args_opt.device_id % min(
            8, device_num) == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(
            prefix='checkpoint_bert',
            directory=None if ckpt_save_dir == "" else ckpt_save_dir,
            config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        accumulation_steps = args_opt.accumulation_steps
        enable_global_norm = cfg.enable_global_norm
        if accumulation_steps <= 1:
            if cfg.optimizer == 'AdamWeightDecay' and args_opt.device_target == 'GPU':
                net_with_grads = BertTrainOneStepWithLossScaleCellForAdam(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
            else:
                net_with_grads = BertTrainOneStepWithLossScaleCell(
                    net_with_loss,
                    optimizer=optimizer,
                    scale_update_cell=update_cell)
        else:
            allreduce_post = args_opt.distribute == "false" or args_opt.allreduce_post_accumulation == "true"
            net_with_accumulation = (
                BertTrainAccumulationAllReducePostWithLossScaleCell
                if allreduce_post else
                BertTrainAccumulationAllReduceEachWithLossScaleCell)
            net_with_grads = net_with_accumulation(
                net_with_loss,
                optimizer=optimizer,
                scale_update_cell=update_cell,
                accumulation_steps=accumulation_steps,
                enable_global_norm=enable_global_norm)
    else:
        net_with_grads = BertTrainOneStepCell(net_with_loss,
                                              optimizer=optimizer)

    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(
        model,
        network=net_with_grads,
        optimizer=optimizer,
        frequency=cfg.Thor.frequency)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"),
                sink_size=args_opt.data_sink_steps)
Example #18
def train():
    args = parse_args()

    # init multicards training
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          gradients_mean=True,
                                          device_num=args.group_size)

    # dataset
    dataset = data_generator.SegDataset(image_mean=args.image_mean,
                                        image_std=args.image_std,
                                        data_file=args.data_file,
                                        batch_size=args.batch_size,
                                        crop_size=args.crop_size,
                                        max_scale=args.max_scale,
                                        min_scale=args.min_scale,
                                        ignore_label=args.ignore_label,
                                        num_classes=args.num_classes,
                                        num_readers=2,
                                        num_parallel_calls=4,
                                        shard_id=args.rank,
                                        shard_num=args.group_size)
    dataset = dataset.get_dataset(repeat=1)

    # network
    if args.model == 'deeplab_v3_s16':
        network = net_factory.nets_map[args.model]('train', args.num_classes,
                                                   16, args.freeze_bn)
    elif args.model == 'deeplab_v3_s8':
        network = net_factory.nets_map[args.model]('train', args.num_classes,
                                                   8, args.freeze_bn)
    else:
        raise NotImplementedError('model [{:s}] not recognized'.format(
            args.model))

    # loss
    loss_ = loss.SoftmaxCrossEntropyLoss(args.num_classes, args.ignore_label)
    loss_.add_flags_recursive(fp32=True)
    train_net = BuildTrainNetwork(network, loss_)

    # load pretrained model
    if args.ckpt_pre_trained:
        param_dict = load_checkpoint(args.ckpt_pre_trained)
        load_param_into_net(train_net, param_dict)

    # optimizer
    iters_per_epoch = dataset.get_dataset_size()
    total_train_steps = iters_per_epoch * args.train_epochs
    if args.lr_type == 'cos':
        lr_iter = learning_rates.cosine_lr(args.base_lr, total_train_steps,
                                           total_train_steps)
    elif args.lr_type == 'poly':
        lr_iter = learning_rates.poly_lr(args.base_lr,
                                         total_train_steps,
                                         total_train_steps,
                                         end_lr=0.0,
                                         power=0.9)
    elif args.lr_type == 'exp':
        lr_iter = learning_rates.exponential_lr(args.base_lr,
                                                args.lr_decay_step,
                                                args.lr_decay_rate,
                                                total_train_steps,
                                                staircase=True)
    else:
        raise ValueError('unknown learning rate type')
    opt = nn.Momentum(params=train_net.trainable_params(),
                      learning_rate=lr_iter,
                      momentum=0.9,
                      weight_decay=0.0001,
                      loss_scale=args.loss_scale)

    # loss scale
    manager_loss_scale = FixedLossScaleManager(args.loss_scale,
                                               drop_overflow_update=False)
    model = Model(train_net,
                  optimizer=opt,
                  amp_level="O3",
                  loss_scale_manager=manager_loss_scale)

    # callback for saving ckpts
    time_cb = TimeMonitor(data_size=iters_per_epoch)
    loss_cb = LossMonitor()
    cbs = [time_cb, loss_cb]

    if args.rank == 0:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args.save_steps,
            keep_checkpoint_max=args.keep_checkpoint_max)
        ckpoint_cb = ModelCheckpoint(prefix=args.model,
                                     directory=args.train_dir,
                                     config=config_ck)
        cbs.append(ckpoint_cb)

    model.train(args.train_epochs, dataset, callbacks=cbs)
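The learning_rates module referenced above is a set of small schedule generators; a plausible poly_lr matching the call signature used here (the internals are assumed):

import numpy as np

def poly_lr(base_lr, decay_steps, total_steps, end_lr=0.0, power=0.9):
    """Hypothetical helper: polynomial decay from base_lr to end_lr."""
    lrs = []
    for step in range(total_steps):
        frac = min(step, decay_steps) / decay_steps
        lrs.append((base_lr - end_lr) * (1.0 - frac) ** power + end_lr)
    return np.array(lrs, dtype=np.float32)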
Example #19
            ## fp32 training
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                config.momentum, config.weight_decay)
            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    cb = [time_cb, loss_cb]
    if config.save_checkpoint:
        config_ck = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
            keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="resnet",
                                  directory=ckpt_save_dir,
                                  config=config_ck)
        cb += [ckpt_cb]

    if args_opt.elastic:
        from elastic_schedule import schedule
        from src.kungfu_mindspore_callbacks import KungFuElasticCallback
        kungfu_elastic_callback = KungFuElasticCallback(schedule)
        cb += [kungfu_elastic_callback]
        dataset_sink_mode = False
        print('enabled elastic')

        # from src.debug import DebugStopHook
        # cb += [DebugStopHook()]
    cb += [LogStepCallback()]
Example #20
def main():
    parser = argparse.ArgumentParser(description="YOLOv3 train")
    parser.add_argument("--only_create_dataset",
                        type=bool,
                        default=False,
                        help="If set it true, only create "
                        "Mindrecord, default is false.")
    parser.add_argument("--distribute",
                        type=bool,
                        default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.001,
                        help="Learning rate, default is 0.001.")
    parser.add_argument("--mode",
                        type=str,
                        default="sink",
                        help="Run sink mode or not, default is sink")
    parser.add_argument("--epoch_size",
                        type=int,
                        default=10,
                        help="Epoch size, default is 10")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--pre_trained",
                        type=str,
                        default=None,
                        help="Pretrained checkpoint file path")
    parser.add_argument("--pre_trained_epoch_size",
                        type=int,
                        default=0,
                        help="Pretrained epoch size")
    parser.add_argument("--save_checkpoint_epochs",
                        type=int,
                        default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale",
                        type=int,
                        default=1024,
                        help="Loss scale, default is 1024.")
    parser.add_argument(
        "--mindrecord_dir",
        type=str,
        default="./Mindrecord_train",
        help="Mindrecord directory. If mindrecord_dir is empty, mindrecord files will be "
        "generated from image_dir and anno_path. Note that if mindrecord_dir isn't empty, "
        "it will be used rather than image_dir and anno_path. Default is ./Mindrecord_train")
    parser.add_argument("--image_dir",
                        type=str,
                        default="",
                        help="Dataset directory, "
                        "the absolute image path is joined by the image_dir "
                        "and the relative path in anno_path")
    parser.add_argument("--anno_path",
                        type=str,
                        default="",
                        help="Annotation path.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args_opt.device_id)
    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # Mindrecord files are generated in args_opt.mindrecord_dir,
    # named yolo.mindrecord0, yolo.mindrecord1, ..., up to file_num files.
    if not os.path.isdir(args_opt.mindrecord_dir):
        os.makedirs(args_opt.mindrecord_dir)

    prefix = "yolo.mindrecord"
    mindrecord_file = os.path.join(args_opt.mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if os.path.isdir(args_opt.image_dir) and os.path.exists(
                args_opt.anno_path):
            print("Create Mindrecord.")
            data_to_mindrecord_byte_image(args_opt.image_dir,
                                          args_opt.anno_path,
                                          args_opt.mindrecord_dir,
                                          prefix=prefix,
                                          file_num=8)
            print("Create Mindrecord Done, at {}".format(
                args_opt.mindrecord_dir))
        else:
            print("image_dir or anno_path not exits.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating MindDataset, use the first mindrecord file, such as yolo.mindrecord0.
        dataset = create_yolo_dataset(mindrecord_file,
                                      repeat_num=args_opt.epoch_size,
                                      batch_size=args_opt.batch_size,
                                      device_num=device_num,
                                      rank=rank)
        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        net = yolov3_resnet18(ConfigYOLOV3ResNet18())
        net = YoloWithLossCell(net, ConfigYOLOV3ResNet18())
        init_net_param(net, "XavierUniform")

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="yolov3",
                                     directory=None,
                                     config=ckpt_config)

        if args_opt.pre_trained:
            if args_opt.pre_trained_epoch_size <= 0:
                raise KeyError(
                    "pre_trained_epoch_size must be greater than 0.")
            param_dict = load_checkpoint(args_opt.pre_trained)
            load_param_into_net(net, param_dict)
        total_epoch_size = 60
        if args_opt.distribute:
            total_epoch_size = 160
        lr = Tensor(
            get_lr(learning_rate=args_opt.lr,
                   start_step=args_opt.pre_trained_epoch_size * dataset_size,
                   global_step=total_epoch_size * dataset_size,
                   decay_step=1000,
                   decay_rate=0.95,
                   steps=True))
        opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                      lr,
                      loss_scale=loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        callback = [
            TimeMonitor(data_size=dataset_size),
            LossMonitor(), ckpoint_cb
        ]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print(
            "Start train YOLOv3, the first epoch will be slower because of the graph compilation."
        )
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
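init_net_param comes from the YOLOv3 sources; a sketch of such a re-initialization helper (the filtering rules are assumptions):

from mindspore.common.initializer import initializer

def init_net_param(network, initialize_mode='XavierUniform'):
    """Hypothetical helper: re-initialize all weight parameters."""
    for param in network.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name \
                and 'bias' not in param.name:
            param.set_data(initializer(initialize_mode,
                                       param.data.shape, param.data.dtype))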
Example #21
def do_train(dataset=None,
             network=None,
             load_checkpoint_path="",
             save_checkpoint_path="",
             epoch_num=1):
    """ do train """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()
    # optimizer
    if optimizer_cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = BertLearningRate(
            learning_rate=optimizer_cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=optimizer_cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=optimizer_cfg.AdamWeightDecay.power)
        params = network.trainable_params()  # the function argument is named network; net_with_loss is undefined in this scope
        decay_params = list(
            filter(optimizer_cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: x not in decay_params, params))
        group_params = [{
            'params':
            decay_params,
            'weight_decay':
            optimizer_cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]

        optimizer = AdamWeightDecay(group_params,
                                    lr_schedule,
                                    eps=optimizer_cfg.AdamWeightDecay.eps)
    elif optimizer_cfg.optimizer == 'Lamb':
        lr_schedule = BertLearningRate(
            learning_rate=optimizer_cfg.Lamb.learning_rate,
            end_learning_rate=optimizer_cfg.Lamb.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=optimizer_cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), learning_rate=lr_schedule)
    elif optimizer_cfg.optimizer == 'Momentum':
        optimizer = Momentum(
            network.trainable_params(),
            learning_rate=optimizer_cfg.Momentum.learning_rate,
            momentum=optimizer_cfg.Momentum.momentum)
    else:
        raise Exception(
            "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]"
        )

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="classifier",
                                 directory=save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    load_param_into_net(network, param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = BertFinetuneCell(network,
                                    optimizer=optimizer,
                                    scale_update_cell=update_cell)
    model = Model(netwithgrads)
    callbacks = [
        TimeMonitor(dataset.get_dataset_size()),
        LossCallBack(), ckpoint_cb
    ]
    model.train(epoch_num, dataset, callbacks=callbacks)
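optimizer_cfg.AdamWeightDecay.decay_filter is a predicate over parameters; a typical definition (an assumption here, mirroring the lambda used in the Transformer example above) excludes LayerNorm weights and biases from weight decay:

decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()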
Example #22
    if args_opt.run_distribute:
        net = TrainOneStepCell(net_with_loss,
                               net,
                               opt,
                               sens=config.loss_scale,
                               reduce_flag=True,
                               mean=True,
                               degree=device_num)
    else:
        net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)

    time_cb = TimeMonitor(data_size=dataset_size)
    loss_cb = LossCallBack(rank_id=rank)
    cb = [time_cb, loss_cb]
    if config.save_checkpoint:
        ckptconfig = CheckpointConfig(
            save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
            keep_checkpoint_max=config.keep_checkpoint_max)
        save_checkpoint_path = os.path.join(config.save_checkpoint_path,
                                            "ckpt_" + str(rank) + "/")
        ckpoint_cb = ModelCheckpoint(prefix='ctpn',
                                     directory=save_checkpoint_path,
                                     config=ckptconfig)
        cb += [ckpoint_cb]

    model = Model(net)
    model.train(training_cfg.total_epoch,
                dataset,
                callbacks=cb,
                dataset_sink_mode=True)
Example #23
def do_train(dataset=None,
             network=None,
             load_checkpoint_path="",
             save_checkpoint_path="",
             epoch_num=1):
    """
    Do train
    Args:
        dataset: the train dataset.
        network:  the network with loss
        load_checkpoint_path: the file path which saved pretrain model checkpoint.
        save_checkpoint_path:  the file path which will save finetune model checkpoint.
        epoch_num: the number of epoch
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrained model is missing; the finetune task must load a pretrained model!")

    steps_per_epoch = dataset.get_dataset_size()  # number of batches per epoch

    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.AdamWeightDecay.power)
        # trainable_params() returns a list of all trainable parameters of the network
        params = network.trainable_params()

        # Use parameter groups and set different values
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter,
                                   params))  # without layernorm and bias
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x),
                   params))  # with layernorm and bias
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]
        optimizer = AdamWeightDecay(group_params,
                                    lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)

    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception(
            "Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]"
        )

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(
        prefix="gpt2_language_model_wiki2",
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)

    final_param_dict = {}
    for k, v in param_dict.items():
        final_param_dict['gpt2_loss.gpt2.gpt2.' + k] = param_dict[k]
    # set the weights of final linear weights to weights of gpt2 token embedding
    final_param_dict['gpt2_loss.gpt2.dense1.weight'] = param_dict[
        'gpt2_embedding_lookup.embedding_table']

    load_param_into_net(network, final_param_dict)
    print("Load new parameter successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network,
                                    optimizer=optimizer,
                                    scale_update_cell=update_cell)
    netwithgrads.set_train(True)

    loss_cb = LossMonitor()

    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]

    print("============== Starting Training ==============")
    model.train(epoch_num, dataset, callbacks=callbacks)
    print("============== Training Success ==============")
Example #24
def do_train(dataset=None,
             network=None,
             load_checkpoint_path="",
             save_checkpoint_path="",
             epoch_num=1):
    """
    Do train
    Args:
        dataset: the train dataset.
        network:  the network with loss
        load_checkpoint_path: the file path which saved pretrain model checkpoint.
        save_checkpoint_path:  the file path which will save finetune model checkpoint.
        epoch_num: the number of epoch
    """
    if load_checkpoint_path == "":
        raise ValueError(
            "Pretrained model missing: the finetune task must load a pretrained checkpoint!")

    steps_per_epoch = dataset.get_dataset_size()

    # optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.AdamWeightDecay.learning_rate,
            end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()

        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(
            filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{
            'params': decay_params,
            'weight_decay': cfg.AdamWeightDecay.weight_decay
        }, {
            'params': other_params,
            'weight_decay': 0.0
        }]
        optimizer = AdamWeightDecay(group_params,
                                    lr_schedule,
                                    eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(
            learning_rate=cfg.Lamb.learning_rate,
            end_learning_rate=cfg.Lamb.end_learning_rate,
            warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
            decay_steps=steps_per_epoch * epoch_num,
            power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(),
                             cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception(
            "Optimizer not supported. Supported optimizers: [AdamWeightDecay, Lamb, Momentum]"
        )

    # configure checkpoint saving, then load the pretrained checkpoint into the network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    prefix_name = "gpt2_summarization_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \
                  + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size)
    ckpoint_cb = ModelCheckpoint(
        prefix=prefix_name,
        directory=None if save_checkpoint_path == "" else save_checkpoint_path,
        config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)

    final_param_dict = {}
    for name, value in param_dict.items():
        final_param_dict['gpt2.gpt2.' + name] = value
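    # tie the LM head weights to the GPT-2 token embedding table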
    final_param_dict['gpt2.lm_head.weight'] = param_dict[
        'gpt2_embedding_lookup.embedding_table']

    load_param_into_net(network, final_param_dict)
    print("Load pretrained parameter successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network,
                                    optimizer=optimizer,
                                    scale_update_cell=update_cell)
    netwithgrads.set_train(True)

    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]

    print("============== Starting Finetuning ==============")
    model.train(epoch_num,
                dataset,
                callbacks=callbacks,
                dataset_sink_mode=False)
    print("============== Finetuning Success ==============")
Example no. 25
0
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument("--distribute", type=str, default="false", help="Run distribute, default is false.")
    parser.add_argument("--epoch_size", type=int, default="1", help="Epoch size, default is 1.")
    parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
    parser.add_argument("--device_num", type=int, default=1, help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt", type=str, default="true", help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale", type=str, default="true", help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle", type=str, default="true", help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink", type=str, default="true", help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps", type=int, default="1", help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--checkpoint_path", type=str, default="", help="Checkpoint file path")
    parser.add_argument("--save_checkpoint_steps", type=int, default=1000, help="Save checkpoint steps, "
                                                                                "default is 1000.")
    parser.add_argument("--save_checkpoint_num", type=int, default=1, help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir", type=str, default="", help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir", type=str, default="", help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)

    if args_opt.distribute == "true":
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
        D.init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num, rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink, args_opt.data_sink_steps,
                                               args_opt.data_dir, args_opt.schema_dir)

    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                         start_learning_rate=cfg.Lamb.start_learning_rate, end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power, warmup_steps=cfg.Lamb.warmup_steps, weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(), learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(netwithloss.trainable_params(),
                                             decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                                             learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
                                             end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
                                             power=cfg.AdamWeightDecayDynamicLR.power,
                                             weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
                                             eps=cfg.AdamWeightDecayDynamicLR.eps,
                                             warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError("Don't support optimizer {}, only support [Lamb, Momentum, AdamWeightDecayDynamicLR]".
                         format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(save_checkpoint_steps=args_opt.save_checkpoint_steps,
                                     keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert', config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(loss_scale_value=cfg.loss_scale_value,
                                                 scale_factor=cfg.scale_factor,
                                                 scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                         scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=(args_opt.enable_data_sink == "true"))
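DynamicLossScaleUpdateCell shrinks the loss scale when a step overflows and grows it back after a window of clean steps. A plain-Python sketch of that policy as commonly documented (not MindSpore's source):

class DynamicLossScale:
    def __init__(self, init_scale=2.0 ** 32, scale_factor=2, scale_window=1000):
        self.scale = init_scale
        self.factor = scale_factor
        self.window = scale_window
        self.good_steps = 0

    def update(self, overflow):
        # Halve on overflow, double after scale_window consecutive clean steps.
        if overflow:
            self.scale = max(self.scale / self.factor, 1.0)
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.window:
                self.scale *= self.factor
                self.good_steps = 0
        return self.scale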
Example no. 26
0
                  num_classes=cfg.num_classes,
                  vec_length=cfg.vec_length)
    # Resume training if pre_trained is set to True
    if cfg.pre_trained:
        param_dict = load_checkpoint(cfg.checkpoint_path)
        load_param_into_net(net, param_dict)

    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
                  learning_rate=learning_rate,
                  weight_decay=cfg.weight_decay)
    loss = SoftmaxCrossEntropyExpand(sparse=True)

    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  metrics={'acc': Accuracy()})

    config_ck = CheckpointConfig(save_checkpoint_steps=int(cfg.epoch_size *
                                                           batch_num / 2),
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    time_cb = TimeMonitor(data_size=batch_num)
    ckpt_save_dir = "./ckpt_" + str(rank) + "/"
    ckpoint_cb = ModelCheckpoint(prefix="train_textcnn",
                                 directory=ckpt_save_dir,
                                 config=config_ck)
    loss_cb = LossMonitor()
    model.train(cfg.epoch_size,
                dataset,
                callbacks=[time_cb, ckpoint_cb, loss_cb])
    print("train success")
Example no. 27
0
def main():
    parser = argparse.ArgumentParser(description="SSD training")
    parser.add_argument("--only_create_dataset",
                        type=bool,
                        default=False,
                        help="If set it true, only create "
                        "Mindrecord, default is false.")
    parser.add_argument("--distribute",
                        type=bool,
                        default=False,
                        help="Run distribute, default is false.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--lr",
                        type=float,
                        default=0.25,
                        help="Learning rate, default is 0.25.")
    parser.add_argument("--mode",
                        type=str,
                        default="sink",
                        help="Run sink mode or not, default is sink.")
    parser.add_argument("--dataset",
                        type=str,
                        default="coco",
                        help="Dataset, defalut is coco.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default=70,
                        help="Epoch size, default is 70.")
    parser.add_argument("--batch_size",
                        type=int,
                        default=32,
                        help="Batch size, default is 32.")
    parser.add_argument("--checkpoint_path",
                        type=str,
                        default="",
                        help="Checkpoint file path.")
    parser.add_argument("--save_checkpoint_epochs",
                        type=int,
                        default=5,
                        help="Save checkpoint epochs, default is 5.")
    parser.add_argument("--loss_scale",
                        type=int,
                        default=1024,
                        help="Loss scale, default is 1024.")
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        device_id=args_opt.device_id)
    context.set_context(enable_task_sink=True,
                        enable_loop_sink=True,
                        enable_mem_reuse=True)

    if args_opt.distribute:
        device_num = args_opt.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        init()
        rank = args_opt.device_id % device_num
    else:
        rank = 0
        device_num = 1

    print("Start create dataset!")

    # MindRecord files are generated in config.MINDRECORD_DIR,
    # named ssd.mindrecord0, 1, ..., file_num.

    config = ConfigSSD()
    prefix = "ssd.mindrecord"
    mindrecord_dir = config.MINDRECORD_DIR
    mindrecord_file = os.path.join(mindrecord_dir, prefix + "0")
    if not os.path.exists(mindrecord_file):
        if not os.path.isdir(mindrecord_dir):
            os.makedirs(mindrecord_dir)
        if args_opt.dataset == "coco":
            if os.path.isdir(config.COCO_ROOT):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("coco", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("COCO_ROOT not exits.")
        else:
            if os.path.isdir(config.IMAGE_DIR) and os.path.exists(
                    config.ANNO_PATH):
                print("Create Mindrecord.")
                data_to_mindrecord_byte_image("other", True, prefix)
                print("Create Mindrecord Done, at {}".format(mindrecord_dir))
            else:
                print("IMAGE_DIR or ANNO_PATH not exits.")

    if not args_opt.only_create_dataset:
        loss_scale = float(args_opt.loss_scale)

        # When creating the MindDataset, use the first MindRecord file, e.g. ssd.mindrecord0.
        dataset = create_ssd_dataset(mindrecord_file,
                                     repeat_num=args_opt.epoch_size,
                                     batch_size=args_opt.batch_size,
                                     device_num=device_num,
                                     rank=rank)

        dataset_size = dataset.get_dataset_size()
        print("Create dataset done!")

        ssd = SSD300(backbone=ssd_mobilenet_v2(), config=config)
        net = SSDWithLossCell(ssd, config)
        init_net_param(net)

        # checkpoint
        ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size *
                                       args_opt.save_checkpoint_epochs)
        ckpoint_cb = ModelCheckpoint(prefix="ssd",
                                     directory=None,
                                     config=ckpt_config)

        lr = Tensor(
            get_lr(global_step=0,
                   lr_init=0,
                   lr_end=0,
                   lr_max=args_opt.lr,
                   warmup_epochs=max(args_opt.epoch_size // 20, 1),
                   total_epochs=args_opt.epoch_size,
                   steps_per_epoch=dataset_size))
        opt = nn.Momentum(
            filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9,
            0.0001, loss_scale)
        net = TrainingWrapper(net, opt, loss_scale)

        if args_opt.checkpoint_path != "":
            param_dict = load_checkpoint(args_opt.checkpoint_path)
            load_param_into_net(net, param_dict)

        callback = [
            TimeMonitor(data_size=dataset_size),
            LossMonitor(), ckpoint_cb
        ]

        model = Model(net)
        dataset_sink_mode = False
        if args_opt.mode == "sink":
            print("In sink mode, one epoch return a loss.")
            dataset_sink_mode = True
        print(
            "Start training SSD; the first epoch will be slower because of graph compilation."
        )
        model.train(args_opt.epoch_size,
                    dataset,
                    callbacks=callback,
                    dataset_sink_mode=dataset_sink_mode)
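One argparse caveat worth knowing here: type=bool calls bool() on the raw argument string, so any non-empty value, including "False", parses as True. That is why the boolean flags above are store_true actions, and why the BERT scripts pass booleans as the strings "true"/"false" and compare them explicitly. If a textual true/false interface with real validation is wanted instead, a hedged helper:

import argparse

def str2bool(value):
    # Map common textual booleans; reject anything ambiguous.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "1"):
        return True
    if value.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got %r" % value)

parser = argparse.ArgumentParser()
parser.add_argument("--distribute", type=str2bool, default=False,
                    help="Run distributed training, default is false.")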
Example no. 28
0
def run_pretrain():
    """pre-train bert_clue"""
    parser = argparse.ArgumentParser(description='bert pre_training')
    parser.add_argument(
        '--device_target',
        type=str,
        default='Ascend',
        choices=['Ascend', 'GPU'],
        help='device where the code will be implemented. (Default: Ascend)')
    parser.add_argument("--distribute",
                        type=str,
                        default="false",
                        help="Run distribute, default is false.")
    parser.add_argument("--epoch_size",
                        type=int,
                        default="1",
                        help="Epoch size, default is 1.")
    parser.add_argument("--device_id",
                        type=int,
                        default=0,
                        help="Device id, default is 0.")
    parser.add_argument("--device_num",
                        type=int,
                        default=1,
                        help="Use device nums, default is 1.")
    parser.add_argument("--enable_save_ckpt",
                        type=str,
                        default="true",
                        help="Enable save checkpoint, default is true.")
    parser.add_argument("--enable_lossscale",
                        type=str,
                        default="true",
                        help="Use lossscale or not, default is not.")
    parser.add_argument("--do_shuffle",
                        type=str,
                        default="true",
                        help="Enable shuffle for dataset, default is true.")
    parser.add_argument("--enable_data_sink",
                        type=str,
                        default="true",
                        help="Enable data sink, default is true.")
    parser.add_argument("--data_sink_steps",
                        type=int,
                        default="1",
                        help="Sink steps for each epoch, default is 1.")
    parser.add_argument("--save_checkpoint_path",
                        type=str,
                        default="",
                        help="Save checkpoint path")
    parser.add_argument("--load_checkpoint_path",
                        type=str,
                        default="",
                        help="Load checkpoint file path")
    parser.add_argument("--save_checkpoint_steps",
                        type=int,
                        default=1000,
                        help="Save checkpoint steps, "
                        "default is 1000.")
    parser.add_argument("--train_steps",
                        type=int,
                        default=-1,
                        help="Training Steps, default is -1, "
                        "meaning run all steps according to epoch number.")
    parser.add_argument("--save_checkpoint_num",
                        type=int,
                        default=1,
                        help="Save checkpoint numbers, default is 1.")
    parser.add_argument("--data_dir",
                        type=str,
                        default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--schema_dir",
                        type=str,
                        default="",
                        help="Schema path, it is better to use absolute path")

    args_opt = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args_opt.device_target,
                        device_id=args_opt.device_id)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(variable_memory_max_size="30GB")
    ckpt_save_dir = args_opt.save_checkpoint_path
    if args_opt.distribute == "true":
        if args_opt.device_target == 'Ascend':
            D.init('hccl')
            device_num = args_opt.device_num
            rank = args_opt.device_id % device_num
        else:
            D.init('nccl')
            device_num = D.get_group_size()
            rank = D.get_rank()
            ckpt_save_dir = args_opt.save_checkpoint_path + 'ckpt_' + str(
                rank) + '/'

        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            device_num=device_num)
        from mindspore.parallel._auto_parallel_context import auto_parallel_context
        if bert_net_cfg.num_hidden_layers == 12:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [29, 58, 87, 116, 145, 174, 203, 217])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [28, 55, 82, 109, 136, 163, 190, 205])
        elif bert_net_cfg.num_hidden_layers == 24:
            if bert_net_cfg.use_relative_positions:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [30, 90, 150, 210, 270, 330, 390, 421])
            else:
                auto_parallel_context().set_all_reduce_fusion_split_indices(
                    [38, 93, 148, 203, 258, 313, 368, 397])
    else:
        rank = 0
        device_num = 1

    if args_opt.device_target == 'GPU' and bert_net_cfg.compute_type != mstype.float32:
        logger.warning('GPU only supports fp32 for now; running with fp32.')
        bert_net_cfg.compute_type = mstype.float32

    ds, new_repeat_count = create_bert_dataset(args_opt.epoch_size, device_num,
                                               rank, args_opt.do_shuffle,
                                               args_opt.enable_data_sink,
                                               args_opt.data_sink_steps,
                                               args_opt.data_dir,
                                               args_opt.schema_dir)
    if args_opt.train_steps > 0:
        new_repeat_count = min(
            new_repeat_count, args_opt.train_steps // args_opt.data_sink_steps)
    netwithloss = BertNetworkWithLoss(bert_net_cfg, True)

    if cfg.optimizer == 'Lamb':
        optimizer = Lamb(netwithloss.trainable_params(),
                         decay_steps=ds.get_dataset_size() * new_repeat_count,
                         start_learning_rate=cfg.Lamb.start_learning_rate,
                         end_learning_rate=cfg.Lamb.end_learning_rate,
                         power=cfg.Lamb.power,
                         warmup_steps=cfg.Lamb.warmup_steps,
                         weight_decay=cfg.Lamb.weight_decay,
                         eps=cfg.Lamb.eps)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(netwithloss.trainable_params(),
                             learning_rate=cfg.Momentum.learning_rate,
                             momentum=cfg.Momentum.momentum)
    elif cfg.optimizer == 'AdamWeightDecayDynamicLR':
        optimizer = AdamWeightDecayDynamicLR(
            netwithloss.trainable_params(),
            decay_steps=ds.get_dataset_size() * new_repeat_count,
            learning_rate=cfg.AdamWeightDecayDynamicLR.learning_rate,
            end_learning_rate=cfg.AdamWeightDecayDynamicLR.end_learning_rate,
            power=cfg.AdamWeightDecayDynamicLR.power,
            weight_decay=cfg.AdamWeightDecayDynamicLR.weight_decay,
            eps=cfg.AdamWeightDecayDynamicLR.eps,
            warmup_steps=cfg.AdamWeightDecayDynamicLR.warmup_steps)
    else:
        raise ValueError(
            "Unsupported optimizer {}, only [Lamb, Momentum, AdamWeightDecayDynamicLR] are supported."
            .format(cfg.optimizer))
    callback = [TimeMonitor(ds.get_dataset_size()), LossCallBack()]
    if args_opt.enable_save_ckpt == "true":
        config_ck = CheckpointConfig(
            save_checkpoint_steps=args_opt.save_checkpoint_steps,
            keep_checkpoint_max=args_opt.save_checkpoint_num)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_bert',
                                     directory=ckpt_save_dir,
                                     config=config_ck)
        callback.append(ckpoint_cb)

    if args_opt.load_checkpoint_path:
        param_dict = load_checkpoint(args_opt.load_checkpoint_path)
        load_param_into_net(netwithloss, param_dict)

    if args_opt.enable_lossscale == "true":
        update_cell = DynamicLossScaleUpdateCell(
            loss_scale_value=cfg.loss_scale_value,
            scale_factor=cfg.scale_factor,
            scale_window=cfg.scale_window)
        netwithgrads = BertTrainOneStepWithLossScaleCell(
            netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    else:
        netwithgrads = BertTrainOneStepCell(netwithloss, optimizer=optimizer)

    model = Model(netwithgrads)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=(args_opt.enable_data_sink == "true"))
Example no. 29
0
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # load quantization-aware network checkpoint
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)
    # convert the fusion network to a quantization-aware network
    network = quant.convert_quant_network(network,
                                          quant_delay=0,
                                          bn_fold=False,
                                          freeze_bn=10000)

    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False,
                                                sparse=True,
                                                reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size *
                                 step_size,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    model.train(cfg['epoch_size'],
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()],
                dataset_sink_mode=args.dataset_sink_mode)
    print("============== End Training ==============")
Example no. 30
0
        init_loss_scale=args.dynamic_init_loss_scale,
        scale_factor=2,
        scale_window=2000)
    model = Model(train_net,
                  optimizer=opt,
                  metrics=None,
                  loss_scale_manager=scale_manager)
    save_checkpoint_steps = args.ckpt_steps
    args.logger.info('save_checkpoint_steps:{}'.format(save_checkpoint_steps))
    if args.max_ckpts == -1:
        keep_checkpoint_max = int(args.steps_per_epoch * args.max_epoch /
                                  save_checkpoint_steps) + 5  # keep 5 extra as a margin
    else:
        keep_checkpoint_max = args.max_ckpts
    args.logger.info('keep_checkpoint_max:{}'.format(keep_checkpoint_max))

    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                   keep_checkpoint_max=keep_checkpoint_max)
    max_epoch_train = args.max_epoch
    args.logger.info('max_epoch_train:{}'.format(max_epoch_train))
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=args.ckpt_path,
                              prefix='{}'.format(args.local_rank))
    args.epoch_cnt = 0
    progress_cb = ProgressMonitor(args)
    new_epoch_train = max_epoch_train * steps_per_epoch // args.log_interval
    model.train(new_epoch_train,
                de_dataset,
                callbacks=[progress_cb, ckpt_cb],
                sink_size=args.log_interval)
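With sink_size=args.log_interval, model.train counts one "epoch" per sink of log_interval steps, which is why the script rescales max_epoch into new_epoch_train. A hedged sketch of that conversion:

def to_sink_epochs(max_epoch, steps_per_epoch, sink_size):
    # Convert data epochs into sink-sized pseudo-epochs consumed by model.train.
    total_steps = max_epoch * steps_per_epoch
    return total_steps // sink_size

assert to_sink_epochs(10, 1000, 100) == 100  # 10 epochs x 1000 steps, sinks of 100 steps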