Example #1
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    # device_id and device_num are defined earlier in the script
    # (typically read from the DEVICE_ID and RANK_SIZE environment variables)
    context.set_context(device_id=device_id)
    if device_num > 1:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True)
        init()
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path,
                                   do_train=True,
                                   repeat_num=1,
                                   batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path,
                                  do_train=False,
                                  repeat_num=1,
                                  batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the per-sample losses into a single scalar
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(
        get_lr(global_step=0,
               total_epochs=epoch_size,
               steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(),
                   lr,
                   momentum=0.9,
                   weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, drop_overflow_update=False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnoram will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # performance callback reports throughput (images/sec); loss callback prints the loss each epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print('Start evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
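
For reference, here is a minimal standalone sketch of the loss configured above (assuming only that MindSpore and NumPy are installed; the shapes are illustrative):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

# sparse=True takes integer class ids instead of one-hot labels;
# reduction='mean' collapses the per-sample losses into one scalar
loss_fn = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
logits = Tensor(np.random.randn(32, 10).astype(np.float32))              # batch of 32, 10 classes
labels = Tensor(np.random.randint(0, 10, size=(32,)).astype(np.int32))   # integer class ids
print(loss_fn(logits, labels))  # a single scalar: batch-mean cross entropy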
Example #2
    'params': no_decayed_params
}, {
    'order_params': net.trainable_params()
}]
opt = Momentum(group_params,
               lr,
               config.momentum,
               loss_scale=config.loss_scale)
# define loss and model
if target == "Ascend":
    if args_opt.dataset == "imagenet2012":
        if not config.use_label_smooth:
            config.label_smooth_factor = 0.0
        loss = SoftmaxCrossEntropyWithLogits(
            sparse=True,
            reduction="mean",
            smooth_factor=config.label_smooth_factor,
            num_classes=config.class_num)
    else:
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    loss_scale = FixedLossScaleManager(config.loss_scale,
                                       drop_overflow_update=False)
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'},
                  amp_level="O2",
                  keep_batchnorm_fp32=False)
else:
    # GPU target
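
Example #2 starts mid-snippet, with group_params already partially built. For context, a sketch of how such groups are typically assembled in MindSpore ModelZoo scripts (the name filters and decayed_params here are assumptions, not part of the original):

# apply weight decay only to conv/dense weights, not to batchnorm (gamma/beta)
# or bias parameters; 'order_params' preserves the original parameter order
decayed_params = [p for p in net.trainable_params()
                  if 'beta' not in p.name and 'gamma' not in p.name and 'bias' not in p.name]
no_decayed_params = [p for p in net.trainable_params()
                     if 'beta' in p.name or 'gamma' in p.name or 'bias' in p.name]
group_params = [{'params': decayed_params, 'weight_decay': config.weight_decay},
                {'params': no_decayed_params},
                {'order_params': net.trainable_params()}]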
Example #3
    parser.add_argument('--checkpoint_path',
                        type=str,
                        default=None,
                        help='Pretrained checkpoint path')
    args = parser.parse_args()
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.device_target)
    dataset_sink_mode = args.device_target != "CPU"

    # define net
    net = resnet(class_num=10)  # pass a different class_num here to adapt the model to another dataset
    # ckpoint = args.checkpoint_path
    # define loss, model
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    # define the CIFAR-10 dataset path
    cifar_path = "./CIFAR-10"
    # define model
    model = Model(net, loss_fn=loss, metrics={"Accuracy": Accuracy()})
    # eval model
    test_net(net, model, cifar_path, args.checkpoint_path)

    # config for resnet50, cifar10
    # config1 = ed({
    #     "class_num": 10,
    #     "batch_size": 32,
    #     "loss_scale": 1024,
    #     "momentum": 0.9,
    #     "weight_decay": 1e-4,
    #     "epoch_size": 90,
Example #4
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = cfg.batch_size
    class_num = cfg.class_num
    loss_scale_num = cfg.loss_scale
    local_data_path = '/cache/data'
    local_ckpt_path = '/cache/ckpt_file'

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create dataset
    print('Create train and evaluate dataset.')
    train_dataset = create_dataset(dataset_path=local_data_path,
                                   do_train=True,
                                   repeat_num=epoch_size,
                                   batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Create dataset success.')

    # create model
    net = resnet50(class_num=class_num)
    # reduction='mean' averages the per-sample losses into a single scalar
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    lr = Tensor(
        get_lr(global_step=0,
               total_epochs=epoch_size,
               steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(),
                   lr,
                   momentum=0.9,
                   weight_decay=1e-4,
                   loss_scale=loss_scale_num)
    loss_scale = FixedLossScaleManager(loss_scale_num, drop_overflow_update=False)

    # amp_level="O2" means that the hybrid precision of O2 mode is used for training
    # the whole network except that batchnorm will be cast into float16 format and dynamic loss scale will be used
    # 'keep_batchnorm_fp32 = False' means that use the float16 format
    model = Model(net,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  metrics={'acc'})

    # callbacks: per-step timing, throughput (images/sec), and per-epoch loss
    time_cb = TimeMonitor(data_size=train_step_size)
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [time_cb, performance_cb, loss_cb]
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_epochs * train_step_size,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_cb = ModelCheckpoint(prefix="resnet",
                              directory=local_ckpt_path,
                              config=config_ck)
    cb += [ckpt_cb]

    print(f'Start run training, total epoch: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)

    # upload checkpoint files
    print('Upload checkpoint.')
    mox.file.copy_parallel(src_url=local_ckpt_path, dst_url=args_opt.train_url)
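
PerformanceCallback, used in Examples #1 and #4, is a custom callback from this sample rather than a MindSpore built-in; a sketch of a typical implementation that times each step and reports throughput (the original may differ):

import time
from mindspore.train.callback import Callback

class PerformanceCallback(Callback):
    """Hypothetical sketch: print throughput (images/sec) after every step."""
    def __init__(self, batch_size):
        super(PerformanceCallback, self).__init__()
        self.batch_size = batch_size
        self.step_start = None

    def step_begin(self, run_context):
        self.step_start = time.time()

    def step_end(self, run_context):
        elapsed = time.time() - self.step_start
        print('ips: %.1f images/sec' % (self.batch_size / elapsed))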
Example #5
    if run_distribute:
        context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          parameter_broadcast=True, mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = config.epoch_size
    net = mobilenet_v2(num_classes=config.num_classes)
    # manual mixed precision: cast the whole network to float16, then flag the
    # Dense (classifier) cells back to float32 for numerical stability
    net.to_float(mstype.float16)
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Dense):
            cell.add_flags_recursive(fp32=True)
    if config.label_smooth > 0:
        loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth, num_classes=config.num_classes)
    else:
        loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')

    print("train args: ", args_opt, "\ncfg: ", config,
          "\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                             repeat_num=epoch_size, batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()
    if args_opt.pre_trained:
        param_dict = load_checkpoint(args_opt.pre_trained)
        load_param_into_net(net, param_dict)

    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=config.lr,
                       warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                   config.weight_decay, config.loss_scale)
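
get_lr is a helper defined elsewhere in each repo; a minimal sketch consistent with the arguments used above, assuming a linear warmup followed by a linear decay (the original schedule may differ, e.g. Example #6 requests lr_decay_mode='poly'):

import numpy as np

def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch):
    """Hypothetical sketch: linear warmup to lr_max, then linear decay to lr_end."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / max(total_steps - warmup_steps, 1)
        lr_each_step.append(lr)
    return np.array(lr_each_step[global_step:], dtype=np.float32)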
Example #6
if __name__ == '__main__':
    if args_opt.do_eval:
        context.set_context(enable_hccl=False)
    else:
        if args_opt.run_distribute:
            context.set_context(enable_hccl=True)
            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                              mirror_mean=True)
            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
            init()
        else:
            context.set_context(enable_hccl=False)

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)


    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                                 repeat_num=epoch_size, batch_size=config.batch_size)
        step_size = dataset.get_dataset_size()

        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
        lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
                           warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
                           lr_decay_mode='poly'))
        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                       config.weight_decay, config.loss_scale)

        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
Example #7
def __init__(self, reduction='mean'):
    super(CrossEntropyLoss2, self).__init__()
    self.cross_entropy = SoftmaxCrossEntropyWithLogits(reduction=reduction)
Example #8
    def __init__(self, reduction='mean'):
        super(CrossEntropyLoss, self).__init__()

        self.reduce_mean = P.ReduceMean()
        self.cross_entropy = SoftmaxCrossEntropyWithLogits()
        self.reduction = reduction
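
Example #8's snippet ends before the construct method; under this pattern it would presumably look like the following (a sketch, assuming the class subclasses nn.Cell and receives one-hot labels, since sparse=False is the default):

    def construct(self, logits, labels):
        # with the default reduction='none', SoftmaxCrossEntropyWithLogits returns
        # one loss value per sample; reduce manually when 'mean' is requested
        loss = self.cross_entropy(logits, labels)
        if self.reduction == 'mean':
            loss = self.reduce_mean(loss)
        return loss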