Example #1
# imports and the loss/optimizer definitions are assumptions added to make the
# sketch self-contained; resnet50 comes from the model module under test
import mindspore as me
from mindspore import dtype, nn
from mindspore.train import Model

def test_resnet50_build():
    X = me.tensor()
    Y = me.tensor()
    X.set_dtype(dtype.float32)
    Y.set_dtype(dtype.float32)
    network = resnet50()
    loss_func = nn.SoftmaxCrossEntropyWithLogits()
    optimizer = nn.Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network=network, loss_fn=loss_func, optimizer=optimizer)
Example #2
# imports are assumptions added for completeness (the exact amp import path
# depends on the MindSpore version); resnet50 comes from the model under test
import numpy as np
from mindspore import Tensor, nn
from mindspore.train import amp

def test_amp_resnet50_loss():
    inputs = Tensor(np.ones([2, 3, 224, 224]).astype(np.float32))
    label = Tensor(np.zeros([2, 10]).astype(np.float32))
    net = resnet50()
    loss = nn.SoftmaxCrossEntropyWithLogits(reduction='mean')
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
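    # level="O2" casts the network to float16, keeps BatchNorm in float32,
    # and attaches a dynamic loss-scale manager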
    train_network = amp.build_train_network(net, optimizer, loss, level="O2")
    train_network(inputs, label)
Example #3
def resnet50_train(args_opt):
    epoch_size = args_opt.epoch_size
    batch_size = 32
    class_num = 10
    loss_scale_num = 1024
    local_data_path = '/cache/data'
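    # assumption: device info comes from the standard Ascend env vars; the
    # original snippet defined device_id / device_num outside this function
    device_id = int(os.getenv('DEVICE_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))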

    # set graph mode and parallel mode
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
    context.set_context(enable_task_sink=True, device_id=device_id)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    if device_num > 1:
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True)
        local_data_path = os.path.join(local_data_path, str(device_id))

    # data download
    print('Download data.')
    mox.file.copy_parallel(src_url=args_opt.data_url, dst_url=local_data_path)

    # create datasets
    print('Create train and eval datasets.')
    train_dataset = create_dataset(dataset_path=local_data_path, do_train=True,
                                   repeat_num=epoch_size, batch_size=batch_size)
    eval_dataset = create_dataset(dataset_path=local_data_path, do_train=False,
                                  repeat_num=1, batch_size=batch_size)
    train_step_size = train_dataset.get_dataset_size()
    print('Datasets created successfully.')

    # create model
    net = resnet50(class_num=class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)
    lr = Tensor(get_lr(global_step=0, total_epochs=epoch_size, steps_per_epoch=train_step_size))
    opt = Momentum(net.trainable_params(), lr, momentum=0.9, weight_decay=1e-4, loss_scale=loss_scale_num)
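    # drop_overflow_update=False: keep the fixed scale and never skip an update
    # on overflow; Momentum's loss_scale above divides the scaling back out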
    loss_scale = FixedLossScaleManager(loss_scale_num, False)

    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    # performance callback reports throughput (images/s); loss callback prints the loss each epoch
    performance_cb = PerformanceCallback(batch_size)
    loss_cb = LossMonitor()
    cb = [performance_cb, loss_cb]

    print(f'Start training, total epochs: {epoch_size}.')
    model.train(epoch_size, train_dataset, callbacks=cb)
    if device_num == 1 or device_id == 0:
        print('Start evaluation.')
        output = model.eval(eval_dataset)
        print(f'Evaluation result: {output}.')
Example #4
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                    save_graphs=False)
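# device_id is assumed to be read earlier in the script (e.g. DEVICE_ID env var)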
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(
            device_num=args_opt.device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)
    loss = SoftmaxCrossEntropyWithLogits(sparse=True)

    if args_opt.do_train:
        dataset = create_dataset(dataset_path=args_opt.dataset_path,
                                 do_train=True,
                                 repeat_num=epoch_size,
                                 batch_size=config.batch_size)
        step_size = dataset.get_dataset_size()

        loss_scale = FixedLossScaleManager(config.loss_scale,
                                           drop_overflow_update=False)
        lr = Tensor(
            get_lr(global_step=0,
                   lr_init=config.lr_init,
                   lr_end=config.lr_end,
Example #5
def train_process(q, device_id, epoch_size, device_num, enable_hccl):
    os.makedirs(str(device_id), exist_ok=True)
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        save_graphs=False)
    context.set_context(device_id=device_id)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
    if enable_hccl:
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            mirror_mean=True,
            parameter_broadcast=True)
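        # fuse gradient all-reduce into groups split at parameter indices
        # 107 and 160 so communication overlaps with back-propagation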
        auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
        init()

    # network
    net = resnet50(class_num=config.class_num)

    # evaluation network
    dist_eval_network = ClassifyCorrectCell(net)

    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0

    # loss
    loss = nn.SoftmaxCrossEntropyWithLogits(
        sparse=True,
        reduction="mean",
        smooth_factor=config.label_smooth_factor,
        num_classes=config.class_num)

    # train dataset
    dataset = create_dataset(dataset_path=dataset_path,
                             do_train=True,
                             repeat_num=epoch_size,
                             batch_size=config.batch_size)

    step_size = dataset.get_dataset_size()
    eval_interval = config.eval_interval
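    # sink eval_interval epochs per train call: each model.train(1, ...) below
    # then executes step_size * eval_interval steps on device before returning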
    dataset.__loop_size__ = step_size * eval_interval

    # evaluation dataset
    eval_dataset = create_dataset(dataset_path=eval_path,
                                  do_train=False,
                                  repeat_num=epoch_size,
                                  batch_size=config.eval_batch_size)

    # loss scale
    loss_scale = FixedLossScaleManager(config.loss_scale,
                                       drop_overflow_update=False)

    # learning rate
    lr = Tensor(
        get_learning_rate(lr_init=config.lr_init,
                          lr_end=0.0,
                          lr_max=config.lr_max,
                          warmup_epochs=config.warmup_epochs,
                          total_epochs=config.epoch_size,
                          steps_per_epoch=step_size,
                          lr_decay_mode=config.lr_decay_mode))

    # optimizer
    decayed_params = list(
        filter(
            lambda x: 'beta' not in x.name and 'gamma' not in x.name and 'bias'
            not in x.name, net.trainable_params()))
    no_decayed_params = [
        param for param in net.trainable_params()
        if param not in decayed_params
    ]
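    # apply weight decay only to conv/dense weights (skip beta/gamma/bias);
    # 'order_params' keeps updates in the original parameter order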
    group_params = [{
        'params': decayed_params,
        'weight_decay': config.weight_decay
    }, {
        'params': no_decayed_params,
        'weight_decay': 0.0
    }, {
        'order_params': net.trainable_params()
    }]

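    # LARS wraps the Momentum optimizer and rescales each layer's update by a
    # layer-wise trust ratio (~ ||w|| / ||grad||); BN params and biases opt out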
    if config.use_lars:
        momentum = nn.Momentum(group_params,
                               lr,
                               config.momentum,
                               loss_scale=config.loss_scale,
                               use_nesterov=config.use_nesterov)
        opt = nn.LARS(momentum,
                      epsilon=config.lars_epsilon,
                      coefficient=config.lars_coefficient,
                      lars_filter=lambda x: 'beta' not in x.name and 'gamma'
                      not in x.name and 'bias' not in x.name)

    else:
        opt = nn.Momentum(group_params,
                          lr,
                          config.momentum,
                          loss_scale=config.loss_scale,
                          use_nesterov=config.use_nesterov)

    # model
    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale,
                  amp_level="O2",
                  keep_batchnorm_fp32=False,
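                  # keep_batchnorm_fp32=False overrides O2's default of
                  # running BatchNorm in float32, so BN also runs in float16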
                  metrics={
                      'acc':
                      DistAccuracy(batch_size=config.eval_batch_size,
                                   device_num=device_num)
                  },
                  eval_network=dist_eval_network)

    # model init
    print("init_start", device_id)
    model.init(dataset, eval_dataset)
    print("init_stop", device_id)

    # callbacks
    loss_cb = LossGet(1, step_size)

    # train and eval
    print("run_start", device_id)
    acc = 0.0
    time_cost = 0.0
    for epoch_idx in range(epoch_size // eval_interval):
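        # one train call = eval_interval epochs (see the __loop_size__
        # override above), followed by a full evaluation pass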
        model.train(1, dataset, callbacks=loss_cb)
        eval_start = time.time()
        output = model.eval(eval_dataset)
        eval_cost = (time.time() - eval_start) * 1000
        acc = float(output["acc"])
        time_cost = loss_cb.get_per_step_time()
        loss = loss_cb.get_loss()
        print(
            "the {} epoch's resnet result:\n "
            "device{}, training loss {}, acc {}, "
            "training per step cost {:.2f} ms, eval cost {:.2f} ms, total_cost {:.2f} ms"
            .format(epoch_idx, device_id, loss, acc, time_cost, eval_cost,
                    time_cost * step_size + eval_cost))
    q.put({'acc': acc, 'cost': time_cost})
Example #6
def test_resnet_construct(x):
    # not the right model to import
    network = resnet50()
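    # NOTE: Cells are normally invoked as network(x); calling construct()
    # directly bypasses the Cell __call__ machinery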
    return network.construct(x)