Example 1
def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)

    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         enable_data_sink=args.enable_data_sink,
                                         dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                           keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path, config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(args.epoch_size, dataset, callbacks=callbacks, dataset_sink_mode=(args.enable_data_sink == "true"))
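Example 1 wires loss scaling in by hand: the loss cell is wrapped in a *TrainOneStepWithLossScaleCell that receives the update cell produced by DynamicLossScaleManager.get_update_cell(). The sketch below is a minimal, self-contained version of that pattern with a toy nn.Dense network; the network, shapes and hyper-parameters are illustrative only, and the import paths and the scale_update_cell keyword follow the examples in this listing (recent MindSpore releases expose the argument as scale_sense instead).

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.train.loss_scale_manager import DynamicLossScaleManager

# Toy network standing in for the transformer loss cell above.
net = nn.Dense(16, 16)
loss = nn.MSELoss()
optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)

net_with_loss = nn.WithLossCell(net, loss)
scale_manager = DynamicLossScaleManager(init_loss_scale=2**24, scale_factor=2, scale_window=2000)
update_cell = scale_manager.get_update_cell()

# The keyword name mirrors the examples in this listing; newer releases call it scale_sense.
train_network = nn.TrainOneStepWithLossScaleCell(net_with_loss, optimizer,
                                                 scale_update_cell=update_cell)
train_network.set_train()

inputs = Tensor(np.ones([8, 16]).astype(np.float32))
label = Tensor(np.zeros([8, 16]).astype(np.float32))
output = train_network(inputs, label)  # typically the loss, an overflow flag and the current loss scale
print("the result is ", output)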
Example 2
def test_bert_tdt():
    """test bert tdt"""
    np.random.seed(0)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    context.set_context(enable_graph_kernel=True)
    ds = me_de_train_dataset()
    config = get_config(version='large', batch_size=16)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(), decay_steps=ds.get_dataset_size()*ds.get_repeat_count(),
                     start_learning_rate=5e-5, end_learning_rate=1e-9,
                     power=10.0, warmup_steps=0, weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(262144, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(1, ds, callbacks=callback, dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [12.559319, 12.333815, 12.339806, 12.350235, 12.343947, 12.830965, 12.375336, 12.973715,
                         12.57929, 12.7766905]
    error = loss_value - expect_loss_value
    print("loss value: {}".format(loss_value))
    print("error value: {}".format(error))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [131072.0, 65536.0, 32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
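The loss-scale sequence asserted above follows from the manager's constructor arguments alone, assuming the usual dynamic update rule: divide the scale by scale_factor on an overflow step, and multiply it by scale_factor after scale_window consecutive overflow-free steps (this is a reading of DynamicLossScaleManager's behaviour, not its source). A rough pure-Python check against the expected values of this test:

def simulate_loss_scale(init_scale, scale_factor, scale_window, overflow_flags):
    """Replay the assumed dynamic loss-scale rule, recording the scale after each step."""
    scale, good_steps, history = float(init_scale), 0, []
    for overflow in overflow_flags:
        if overflow:
            scale = max(scale / scale_factor, 1.0)
            good_steps = 0
        else:
            good_steps += 1
            if good_steps >= scale_window:
                scale *= scale_factor
                good_steps = 0
        history.append(scale)
    return history

overflow_flags = [True, True, True, True, False, False, False, True, False, False]
print(simulate_loss_scale(262144, 2, 3, overflow_flags))
# [131072.0, 65536.0, 32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0]
# which matches the expect_loss_scale list asserted above.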
Example 3
def test_compile_fp16_lr_overflow_dynamic_graph():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    lr = Tensor(np.ones([1], np.float32) * 0.1)
    net = NetFP16(16, 16)
    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9)

    net_with_loss = WithLossCell(net, loss)
    scale_manager = DynamicLossScaleManager()
    update_cell = scale_manager.get_update_cell()
    train_network = TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_update_cell=update_cell)
    train_network.set_train()
    output = train_network(inputs, label)
    print("the result is ", output)
Example 4
def test_compile_grad_error():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    lr = Tensor(np.ones([1], np.float32) * 0.1)
    net = NetFP16(16, 16)
    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9)

    net_with_loss = WithLossCell(net, loss)
    scale_manager = DynamicLossScaleManager()
    update_cell = scale_manager.get_update_cell()
    train_network = TrainOneStepWithLossScaleCell(net_with_loss, optimizer, scale_update_cell=update_cell)
    train_network.set_train()
    with pytest.raises(TypeError) as e:
        train_network(inputs, label)
        print(e)
Example 5
def loss_scale_manager_common(strategy1):
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)

    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # if GE is not available, outputs = self._train_network(*next_element) returns the input tensors, so a TypeError is expected.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
Example 6
def __init__(self, network, optimizer=None):
    super(ModelBert, self).__init__()
    self.optimizer = optimizer
    manager = DynamicLossScaleManager()
    update_cell = LossScaleUpdateCell(manager)
    self.train_network = BertTrainOneStepWithLossScaleCell(
        network, self.optimizer, scale_update_cell=update_cell)
    self.train_network.set_train()
Example 7
def test_loss_scale_fp16_model_train_overflow():
    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))
    dataset = MindDataSet(dataset_types, dataset_shapes)

    net = NetFP16(16, 16)
    net.set_train()

    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    scale_manager = DynamicLossScaleManager(init_loss_scale=16, scale_factor=2, scale_window=2)
    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics=None, loss_scale_manager=scale_manager)
    model.train(2, dataset, dataset_sink_mode=False)
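Example 7 takes the higher-level route: instead of wrapping the network in a *TrainOneStepWithLossScaleCell, the manager is handed to Model through loss_scale_manager and the wrapping happens inside Model. Below is a minimal self-contained sketch of that wiring with an in-memory toy dataset; column names, shapes and import paths are illustrative and may differ across MindSpore versions (MindDataSet above is project-specific).

import numpy as np
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager

# Toy in-memory data standing in for the project-specific MindDataSet above.
features = np.random.randn(64, 16).astype(np.float32)
labels = np.random.randn(64, 16).astype(np.float32)
train_ds = ds.NumpySlicesDataset((features, labels), column_names=["data", "label"],
                                 shuffle=False).batch(8)

net = nn.Dense(16, 16)
loss = nn.MSELoss()
opt = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)

# Model builds the loss-scaled train network internally from the manager.
scale_manager = DynamicLossScaleManager(init_loss_scale=16, scale_factor=2, scale_window=2)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=scale_manager)
model.train(2, train_ds, dataset_sink_mode=False)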
Example 8
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of the Transformer model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()

    if config.existed_ckpt:
        if config.existed_ckpt.endswith(".npz"):
            weights = np.load(config.existed_ckpt)
        else:
            weights = load_checkpoint(config.existed_ckpt)
        for param in net_with_loss.trainable_params():
            weights_name = param.name
            if weights_name not in weights:
                raise ValueError(
                    f"Param {weights_name} is not found in ckpt file.")

            if isinstance(weights[weights_name], Parameter):
                param.default_input = weights[weights_name].default_input
            elif isinstance(weights[weights_name], Tensor):
                param.default_input = Tensor(weights[weights_name].asnumpy(),
                                             config.dtype)
            elif isinstance(weights[weights_name], np.ndarray):
                param.default_input = Tensor(weights[weights_name],
                                             config.dtype)
            else:
                param.default_input = weights[weights_name]
    else:
        for param in net_with_loss.trainable_params():
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    param.default_input = one_weight(value.asnumpy().shape)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    param.default_input = zero_weight(value.asnumpy().shape)
                else:
                    param.default_input = weight_variable(
                        value.asnumpy().shape)

    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset

    if dataset is None:
        raise ValueError(
            "pre-training dataset or fine-tuning dataset must be provided one."
        )

    update_steps = dataset.get_repeat_count() * dataset.get_dataset_size()
    if config.lr_scheduler == "isr":
        lr = Tensor(square_root_schedule(
            lr=config.lr,
            update_num=update_steps,
            decay_start_step=config.decay_start_step,
            warmup_steps=config.warmup_steps,
            min_lr=config.min_lr),
                    dtype=mstype.float32)
    elif config.lr_scheduler == "poly":
        lr = Tensor(polynomial_decay_scheduler(
            lr=config.lr,
            min_lr=config.min_lr,
            decay_steps=config.decay_steps,
            total_update_num=update_steps,
            warmup_steps=config.warmup_steps,
            power=config.poly_lr_scheduler_power),
                    dtype=mstype.float32)
    else:
        lr = config.lr

    if config.optimizer.lower() == "adam":
        optimizer = Adam(net_with_loss.trainable_params(),
                         lr,
                         beta1=0.9,
                         beta2=0.98)
    elif config.optimizer.lower() == "lamb":
        lr = BertLearningRate(decay_steps=12000,
                              learning_rate=config.lr,
                              end_learning_rate=config.min_lr,
                              power=10.0,
                              warmup_steps=config.warmup_steps)
        decay_params = list(filter(
            lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
            net_with_loss.trainable_params()))
        other_params = list(filter(
            lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower(),
            net_with_loss.trainable_params()))
        group_params = [{
            'params': decay_params,
            'weight_decay': 0.01
        }, {
            'params': other_params
        }]

        optimizer = Lamb(group_params, lr, eps=1e-6)
    elif config.optimizer.lower() == "momentum":
        optimizer = Momentum(net_with_loss.trainable_params(),
                             lr,
                             momentum=0.9)
    else:
        raise ValueError(f"optimizer only support `adam` and `momentum` now.")

    # Dynamic loss scale.
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.init_loss_scale,
        scale_factor=config.loss_scale_factor,
        scale_window=config.scale_window)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(
        network=net_with_loss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.save_ckpt_steps,
        keep_checkpoint_max=config.keep_ckpt_max)

    rank_size = os.getenv('RANK_SIZE')
    callbacks = [loss_monitor]
    if rank_size is not None and int(
            rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    print(f" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model,
           config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
Example 9
def test_bert_precision():
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter,
                               netwithloss.trainable_params()))
    group_params = [{
        'params': decay_params,
        'weight_decay': 0.01
    }, {
        'params': other_params
    }, {
        'order_params': netwithloss.trainable_params()
    }]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(new_repeat_count,
                ds,
                callbacks=callback,
                dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 12.206575, 0, 0.000001)

    expect_loss_value = [
        12.206575, 11.865044, 11.828129, 11.826707, 11.82108, 12.407423,
        12.005459, 12.621225, 12.222903, 12.427446
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        False, False, False, True, False, False, False, True, False, False
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0,
        65536.0, 65536.0, 65536.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
Example 10
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                     start_learning_rate=5e-5,
                     end_learning_rate=1e-9,
                     power=10.0,
                     warmup_steps=0,
                     weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(),
                ds,
                callbacks=callback,
                dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [
        12.207198, 11.980881, 11.984844, 11.879381, 11.832978, 12.411333,
        12.009284, 12.621277, 12.223178, 12.427385
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        True, True, False, False, False, True, False, False, False, True
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0,
        32768.0, 16384.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
Example 11
def train():
    """train"""
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    # context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False, device_id=device_id)
    context.set_context(mode=context.PYNATIVE_MODE,
                        device_target="GPU",
                        save_graphs=False,
                        device_id=device_id)

    if device_num > 1:
        init()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            device_num=device_num,
            global_rank=device_id,
            gradients_mean=True)
    if args.modelArts_mode:
        import moxing as mox
        local_data_url = '/cache/data'
        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url)

    train_dataset = DIV2K(args,
                          name=args.data_train,
                          train=True,
                          benchmark=False)
    train_dataset.set_scale(args.task_id)
    print(len(train_dataset))
    train_de_dataset = ds.GeneratorDataset(train_dataset, ["LR", "HR"],
                                           num_shards=device_num,
                                           shard_id=rank_id,
                                           shuffle=True)
    train_de_dataset = train_de_dataset.batch(args.batch_size,
                                              drop_remainder=True)

    eval_dataset = SRData(args,
                          name=args.data_test,
                          train=False,
                          benchmark=True)
    print(len(eval_dataset))
    eval_ds = ds.GeneratorDataset(eval_dataset, ['LR', 'HR'], shuffle=False)
    eval_ds = eval_ds.batch(1, drop_remainder=True)

    # net_m = RCAN(args)
    net_m = EDSR(args)
    print("Init net weights successfully")

    if args.ckpt_path:
        param_dict = load_checkpoint(args.pth_path)
        load_param_into_net(net_m, param_dict)
        print("Load net weight successfully")
    step_size = train_de_dataset.get_dataset_size()
    lr = []
    for i in range(0, args.epochs):
        cur_lr = args.lr / (2**((i + 1) // 200))
        lr.extend([cur_lr] * step_size)
    opt = nn.Adam(net_m.trainable_params(),
                  learning_rate=lr,
                  loss_scale=args.loss_scale)
    loss = nn.L1Loss()
    loss_scale_manager = DynamicLossScaleManager(init_loss_scale=args.init_loss_scale,
                                                 scale_factor=2, scale_window=1000)

    eval_net = net_m
    model = Model(net_m,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    metrics = {
        "psnr": PSNR(rgb_range=args.rgb_range, shave=True),
    }
    eval_cb = EvalCallBack(eval_net,
                           eval_ds,
                           args.test_every,
                           step_size / args.batch_size,
                           metrics=metrics,
                           rank_id=rank_id)
    cb = [time_cb, loss_cb, eval_cb]
    config_ck = CheckpointConfig(
        save_checkpoint_steps=args.ckpt_save_interval * step_size,
        keep_checkpoint_max=args.ckpt_save_max)
    ckpt_cb = ModelCheckpoint(prefix=args.filename,
                              directory=args.ckpt_save_path,
                              config=config_ck)
    if device_id == 0:
        cb += [ckpt_cb]
    model.train(args.epochs,
                train_de_dataset,
                callbacks=cb,
                dataset_sink_mode=True)
Example 12
        opt = Momentum(params=get_param_groups(net),
                       learning_rate=Tensor(lr),
                       momentum=cfg.momentum,
                       weight_decay=cfg.weight_decay,
                       loss_scale=cfg.loss_scale)
        if not cfg.use_label_smooth:
            cfg.label_smooth_factor = 0.0
        loss = CrossEntropySmooth(sparse=True,
                                  reduction="mean",
                                  smooth_factor=cfg.label_smooth_factor,
                                  num_classes=cfg.num_classes)

        if cfg.is_dynamic_loss_scale:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(
                cfg.loss_scale, drop_overflow_update=False)

    model = Model(net,
                  loss_fn=loss,
                  optimizer=opt,
                  metrics={'acc'},
                  amp_level="O3",
                  loss_scale_manager=loss_scale_manager)

    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    time_cb = TimeMonitor(data_size=batch_num)
Example 13
def train_alexnet():
    print(config)
    print('device id:', get_device_id())
    print('device num:', get_device_num())
    print('rank id:', get_rank_id())
    print('job id:', get_job_id())

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=config.device_target)
    context.set_context(save_graphs=False)

    device_num = get_device_num()
    if config.dataset_name == "cifar10":
        if device_num > 1:
            config.learning_rate = config.learning_rate * device_num
            config.epoch_size = config.epoch_size * 2
    elif config.dataset_name == "imagenet":
        pass
    else:
        raise ValueError("Unsupported dataset.")

    if device_num > 1:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num, \
            parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True)
        if device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif device_target == "GPU":
            init()
    else:
        context.set_context(device_id=get_device_id())

    if config.dataset_name == "cifar10":
        ds_train = create_dataset_cifar10(config.data_path,
                                          config.batch_size,
                                          target=config.device_target)
    elif config.dataset_name == "imagenet":
        ds_train = create_dataset_imagenet(config.data_path, config.batch_size)
    else:
        raise ValueError("Unsupported dataset.")

    if ds_train.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = AlexNet(config.num_classes, phase='train')

    loss_scale_manager = None
    metrics = None
    step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size
    if config.dataset_name == 'cifar10':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(
            get_lr_cifar10(0, config.learning_rate, config.epoch_size,
                           step_per_epoch))
        opt = nn.Momentum(network.trainable_params(), lr, config.momentum)
        metrics = {"Accuracy": Accuracy()}

    elif config.dataset_name == 'imagenet':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(
            get_lr_imagenet(config.learning_rate, config.epoch_size,
                            step_per_epoch))
        opt = nn.Momentum(params=get_param_groups(network),
                          learning_rate=lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay,
                          loss_scale=config.loss_scale)

        from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
        if config.is_dynamic_loss_scale == 1:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(
                config.loss_scale, drop_overflow_update=False)

    else:
        raise ValueError("Unsupported dataset.")

    if device_target == "Ascend":
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics=metrics,
                      amp_level="O2",
                      keep_batchnorm_fp32=False,
                      loss_scale_manager=loss_scale_manager)
    elif device_target == "GPU":
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      metrics=metrics,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Unsupported platform.")

    if device_num > 1:
        ckpt_save_dir = os.path.join(config.checkpoint_path + "_" +
                                     str(get_rank()))
    else:
        ckpt_save_dir = config.checkpoint_path

    time_cb = TimeMonitor(data_size=step_per_epoch)
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_steps,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet",
                                 directory=ckpt_save_dir,
                                 config=config_ck)

    print("============== Starting Training ==============")
    model.train(config.epoch_size,
                ds_train,
                callbacks=[time_cb, ckpoint_cb,
                           LossMonitor()],
                dataset_sink_mode=config.dataset_sink_mode,
                sink_size=config.sink_size)
Example 14
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)

    lr = BertLearningRate(decay_steps=sink_size * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter,
                               netwithloss.trainable_params()))
    group_params = [{
        'params': decay_params,
        'weight_decay': 0.01
    }, {
        'params': other_params
    }, {
        'order_params': netwithloss.trainable_params()
    }]
    optimizer = Lamb(group_params, lr)

    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor) and not value.has_init:
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    time_monitor_callback = TimeMonitor(sink_size)
    model.train(new_repeat_count,
                data_set,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True,
                sink_size=sink_size)

    # an assertion fails if the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [11.3660, 11.3265, 11.3264]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 14
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
Example 15
def _build_training_pipeline(config: GNMTConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (GNMTConfig): Config of the GNMT model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
    """
    net_with_loss = GNMTNetworkWithLoss(config,
                                        is_training=True,
                                        use_one_hot_embeddings=True)
    net_with_loss.init_parameters_data()
    _load_checkpoint_to_net(config, net_with_loss)

    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset

    if dataset is None:
        raise ValueError(
            "pre-training dataset or fine-tuning dataset must be provided one."
        )

    update_steps = config.epochs * dataset.get_dataset_size()

    lr = _get_lr(config, update_steps)
    optimizer = _get_optimizer(config, net_with_loss, lr)

    # Dynamic loss scale.
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.init_loss_scale,
        scale_factor=config.loss_scale_factor,
        scale_window=config.scale_window)
    net_with_grads = GNMTTrainOneStepWithLossScaleCell(
        network=net_with_loss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    dataset_size = dataset.get_dataset_size()
    time_cb = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.save_ckpt_steps,
        keep_checkpoint_max=config.keep_ckpt_max)

    rank_size = os.getenv('RANK_SIZE')
    callbacks = [time_cb, loss_monitor]
    if rank_size is not None and int(
            rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
        summary_callback = SummaryCollector(summary_dir="./summary",
                                            collect_freq=50)
        callbacks.append(summary_callback)

    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
        summary_callback = SummaryCollector(summary_dir="./summary",
                                            collect_freq=50)
        callbacks.append(summary_callback)

    print(f" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model,
           config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
Example 16
        print("dataset file {} not exists, please check!".format(
            mindrecord_file))
        raise ValueError(mindrecord_file)
    dataset = create_gru_dataset(epoch_count=config.num_epochs,
                                 batch_size=config.batch_size,
                                 dataset_path=mindrecord_file,
                                 rank_size=device_num,
                                 rank_id=rank)
    dataset_size = dataset.get_dataset_size()
    print("dataset size is {}".format(dataset_size))
    network = Seq2Seq(config)
    network = GRUWithLossCell(network)
    lr = dynamic_lr(config, dataset_size)
    opt = Adam(network.trainable_params(), learning_rate=lr)
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.init_loss_scale_value,
        scale_factor=config.scale_factor,
        scale_window=config.scale_window)
    update_cell = scale_manager.get_update_cell()
    netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)

    time_cb = TimeMonitor(data_size=dataset_size)
    loss_cb = LossCallBack(rank_id=rank)
    cb = [time_cb, loss_cb]
    #Save Checkpoint
    if config.save_checkpoint:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=config.ckpt_epoch * dataset_size,
            keep_checkpoint_max=config.keep_checkpoint_max)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank_id) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
Example 17
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)

    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the Momentum optimizer must not apply its own loss scale

    # choose whether only the master rank or every rank saves checkpoints; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    if args.net == "densenet100":
        from src.network.densenet import DenseNet100 as DenseNet
    else:
        from src.network.densenet import DenseNet121 as DenseNet

    if args.dataset == "cifar10":
        from src.datasets import classification_dataset_cifar10 as classification_dataset
    else:
        from src.datasets import classification_dataset_imagenet as classification_dataset

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet(args.num_classes)
    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    lr_scheduler = get_lr_scheduler(args)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum, weight_decay=args.weight_decay, loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      gradients_mean=True)

    if args.device_target == 'Ascend':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O0")
    elif args.device_target == 'CPU':
        model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager, amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
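A detail worth noting here and in the later examples that repeat this pattern: when is_dynamic_loss_scale is set, args.loss_scale is forced to 1 before the Momentum optimizer is built. The reading here (inferred from the code, not from MindSpore documentation) is that with a dynamic manager the loss-scaled train cell un-scales the gradients itself, so the optimizer must not divide by a loss scale as well, whereas the fixed-scale branch uses FixedLossScaleManager(..., drop_overflow_update=False) and leaves the un-scaling to the optimizer's own loss_scale argument.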
Example 18
            if not config.use_label_smooth:
                config.label_smooth_factor = 0.0
            loss = CrossEntropySmooth(sparse=True,
                                      reduction="mean",
                                      smooth_factor=config.label_smooth_factor,
                                      num_classes=config.class_num)
        else:
            loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

        if (args_opt.net == "resnet101" or args_opt.net == "resnet50") and \
            not args_opt.parameter_server and target != "CPU":
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                config.momentum, config.weight_decay, config.loss_scale)
            from mindspore.train.loss_scale_manager import DynamicLossScaleManager
            loss_scale = DynamicLossScaleManager()
            model = Model(net,
                          loss_fn=loss,
                          optimizer=opt,
                          loss_scale_manager=loss_scale,
                          metrics={'acc'},
                          amp_level=amp_level,
                          keep_batchnorm_fp32=False)
        else:
            ## fp32 training
            opt = Momentum(
                filter(lambda x: x.requires_grad, net.get_parameters()), lr,
                config.momentum, config.weight_decay)
            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

    # define callbacks
Example 19
        args.logger.info('load model {} success'.format(args.pretrained))
    else:
        init_net(args, network_1)

    train_net = BuildTrainNetwork(network_1, criterion_1, args)

    args.logger.info('args:{}'.format(args))
    # warmup_step must be called after args.steps_per_epoch has been set
    args.lrs = warmup_step_list(args, gamma=0.1)
    lrs_gen = list_to_gen(args.lrs)
    opt = Momentum(params=train_net.trainable_params(),
                   learning_rate=lrs_gen,
                   momentum=momentum,
                   weight_decay=weight_decay)
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=args.dynamic_init_loss_scale,
        scale_factor=2,
        scale_window=2000)
    model = Model(train_net,
                  optimizer=opt,
                  metrics=None,
                  loss_scale_manager=scale_manager)
    save_checkpoint_steps = args.ckpt_steps
    args.logger.info('save_checkpoint_steps:{}'.format(save_checkpoint_steps))
    if args.max_ckpts == -1:
        keep_checkpoint_max = int(args.steps_per_epoch * args.max_epoch /
                                  save_checkpoint_steps) + 5  # for more than 5
    else:
        keep_checkpoint_max = args.max_ckpts
    args.logger.info('keep_checkpoint_max:{}'.format(keep_checkpoint_max))

    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
Example 20
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(),
                         learning_rate=2e-5,
                         momentum=0.9)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(),
                ds,
                callbacks=callback,
                dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [
        12.1918125, 11.966035, 11.972114, 11.982188, 11.974092, 12.610916,
        12.17565, 12.840416, 12.40291, 12.621661
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0.00001, 0.00001)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        True, True, False, False, False, True, False, False, False, True
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0,
        32768.0, 16384.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0.00001, 0.00001)
Example 21
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the Momentum optimizer must not apply its own loss scale

    # choose whether only the master rank or every rank saves checkpoints; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))
    network.add_flags_recursive(fp16=True)
    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor,
                             num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    # The Model API changed since the TR5_branch update on 2020/03/09
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      parameter_broadcast=True, mirror_mean=True)
    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager)

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
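
Note: warmup_step_lr is called above but not shown in this snippet. The sketch below is a minimal, self-contained illustration of a warmup-then-step-decay schedule of that shape; the helper name and argument layout are assumptions, not the project's implementation.

import numpy as np

def sketch_warmup_step_lr(base_lr, milestone_epochs, steps_per_epoch,
                          warmup_epochs, max_epoch, gamma=0.1):
    """Linear warmup to base_lr, then multiply by gamma at every milestone epoch."""
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)
    milestones = [int(e * steps_per_epoch) for e in milestone_epochs]
    lr = np.zeros(total_steps, dtype=np.float32)
    for step in range(total_steps):
        if step < warmup_steps:
            lr[step] = base_lr * (step + 1) / warmup_steps
        else:
            decays = sum(step >= m for m in milestones)
            lr[step] = base_lr * (gamma ** decays)
    return lr

# e.g. sketch_warmup_step_lr(0.1, [30, 60, 90], 100, 5, 100) yields one lr value per training step
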
Ejemplo n.º 22
0
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform,
                        save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        if args.platform == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          parameter_broadcast=True,
                                          mirror_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # loss scale cannot also be set in the Momentum optimizer when dynamic loss scaling is used

    # choose whether only the master rank or every rank saves checkpoints; compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(
        args.ckpt_path,
        datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir,
                                        args.image_size,
                                        args.per_batch_size,
                                        1,
                                        args.rank,
                                        args.group_size,
                                        num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone,
                          args.num_classes,
                          platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(
            args.lr,
            args.lr_epochs,
            args.steps_per_epoch,
            args.warmup_epochs,
            args.max_epoch,
            gamma=args.lr_gamma,
        )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr, args.steps_per_epoch,
                                        args.warmup_epochs, args.max_epoch,
                                        args.T_max, args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                        num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      metrics={'acc'},
                      amp_level="O3")
    else:
        auto_mixed_precision(network)
        model = Model(network,
                      loss_fn=loss,
                      optimizer=opt,
                      loss_scale_manager=loss_scale_manager,
                      metrics={'acc'})

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch,
                de_dataset,
                callbacks=callbacks,
                dataset_sink_mode=True)
Ejemplo n.º 23
0
def test_transformer():
    """
    Transformer training.
    """
    np.random.seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(reserve_class_name_in_scope=False,
                        enable_auto_mixed_precision=False)
    version = os.getenv('VERSION', 'large')
    batch_size = 96
    epoch_size = 3
    config = get_config(version=version, batch_size=batch_size)
    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size,
                             data_file=DATA_DIR)

    netwithloss = TransformerNetworkWithLoss(config, True)

    lr = Tensor(
        create_dynamic_lr(
            schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
            training_steps=dataset.get_dataset_size() * epoch_size,
            learning_rate=cfg.lr_schedule.learning_rate,
            warmup_steps=cfg.lr_schedule.warmup_steps,
            hidden_size=config.hidden_size), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callback = ModelCallback()

    scale_manager = DynamicLossScaleManager(init_loss_scale=4194304,
                                            scale_factor=cfg.scale_factor,
                                            scale_window=3)
    update_cell = scale_manager.get_update_cell()
    netwithgrads = TransformerTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer, scale_update_cell=update_cell)

    netwithgrads.set_train(True)
    time_monitor_callback = TimeMonitor(dataset.get_dataset_size())
    model = Model(netwithgrads)
    model.train(epoch_size,
                dataset,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state, or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 11.241606, 0, 0.000005)

    expect_loss_value = [
        11.241606, 11.243232, 11.217459, 11.204157, 11.213804, 11.215373,
        11.190564, 11.150393, 11.191823, 11.160045
    ]

    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        False, False, False, True, False, False, False, True, False, False
    ]
    print("overflow: {}".format(overflow))
    assert (overflow[0:10] == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        4194304.0, 4194304.0, 8388608.0, 4194304.0, 4194304.0, 4194304.0,
        8388608.0, 4194304.0, 4194304.0, 4194304.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale[0:10], expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 2400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 20

    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 240
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 2
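
Note: ModelCallback is used by these tests but not defined in this snippet. One plausible definition is sketched below, assuming the loss-scale train cell exposes (loss, overflow flag, loss scale) as net_outputs; treat it as an illustration rather than the exact class the tests use.

from mindspore.train.callback import Callback

class ModelCallbackSketch(Callback):
    """Collect the per-step loss, overflow flag and loss scale for later assertions."""

    def __init__(self):
        super(ModelCallbackSketch, self).__init__()
        self.loss_list = []
        self.overflow_list = []
        self.lossscale_list = []

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss, overflow, loss_scale = cb_params.net_outputs
        self.loss_list.append(loss.asnumpy())
        self.overflow_list.append(bool(overflow.asnumpy()))
        self.lossscale_list.append(float(loss_scale.asnumpy()))
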
Ejemplo n.º 24
0
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds, new_repeat_count = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * new_repeat_count,
                     start_learning_rate=5e-5,
                     end_learning_rate=1e-9,
                     power=10.0,
                     warmup_steps=0,
                     weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    time_monitor_callback = TimeMonitor(ds.get_dataset_size())
    model.train(new_repeat_count,
                ds,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True)

    # an assertion fails if the loss value, overflow state, or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [10.235566, 10.207392, 10.206976]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [262144.0, 262144.0, 262144.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1600
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 16
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
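
Note: weight_variable is used to re-initialize the BERT parameters above but is not defined in this snippet. A simple stand-in is sketched below (a small random normal init returned as a Tensor); the real tests may use a different initializer, so treat this as an assumption.

import numpy as np
from mindspore import Tensor
from mindspore import dtype as mstype

def weight_variable_sketch(shape):
    """Return a freshly initialized float32 weight Tensor of the given shape."""
    values = np.random.normal(loc=0.0, scale=0.02, size=shape).astype(np.float32)
    return Tensor(values, mstype.float32)
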
Ejemplo n.º 25
0
def test_bert_precision(enable_graph_kernel=False):
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    if enable_graph_kernel:
        context.set_context(enable_graph_kernel=True)
    data_set, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9,
                          power=10.0, warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)

    # an assertion fails if the loss value, overflow state, or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)

    if enable_graph_kernel:
        expect_loss_value = [12.206627, 11.840489, 11.798470, 11.796345, 11.790964, 12.366766, 11.971539, 12.576565,
                             12.185522, 12.386192]
    else:
        assert np.allclose(loss_value[0], 12.2066, 0, 0.0005)
        expect_loss_value = [12.206587, 11.966410, 11.965916, 11.975922, 11.970262, 12.608881, 12.174048, 12.840656,
                             12.407923, 12.631133]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
Ejemplo n.º 26
0
def train(args):
    '''train'''
    print('=============yolov3 start training==================')

    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()

    args.batch_size = config.batch_size
    args.warmup_lr = config.warmup_lr
    args.lr_rates = config.lr_rates
    args.lr_steps = config.lr_steps
    args.gamma = config.gamma
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.max_epoch = config.max_epoch
    args.log_interval = config.log_interval
    args.ckpt_path = config.ckpt_path
    args.ckpt_interval = config.ckpt_interval

    args.outputs_dir = os.path.join(args.ckpt_path, datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    print('args.outputs_dir', args.outputs_dir)

    args.logger = get_logger(args.outputs_dir, args.local_rank)

    if args.world_size != 8:
        args.lr_steps = [i * 8 // args.world_size for i in args.lr_steps]

    if args.world_size == 1:
        args.weight_decay = 0.

    if args.world_size != 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE

    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.world_size, gradients_mean=True)
    mindrecord_path = args.mindrecord_path

    num_classes = config.num_classes
    anchors = config.anchors
    anchors_mask = config.anchors_mask
    num_anchors_list = [len(x) for x in anchors_mask]

    momentum = args.momentum
    args.logger.info('train opt momentum:{}'.format(momentum))

    weight_decay = args.weight_decay * float(args.batch_size)
    args.logger.info('real weight_decay:{}'.format(weight_decay))
    lr_scale = args.world_size / 8
    args.logger.info('lr_scale:{}'.format(lr_scale))

    # dataloader
    args.logger.info('start create dataloader')
    epoch = args.max_epoch
    ds = de.MindDataset(mindrecord_path + "0", columns_list=["image", "annotation"], num_shards=args.world_size,
                        shard_id=args.local_rank)

    ds = ds.map(input_columns=["image", "annotation"],
                output_columns=["image", "annotation", 'coord_mask_0', 'conf_pos_mask_0', 'conf_neg_mask_0',
                                'cls_mask_0', 't_coord_0', 't_conf_0', 't_cls_0', 'gt_list_0', 'coord_mask_1',
                                'conf_pos_mask_1', 'conf_neg_mask_1', 'cls_mask_1', 't_coord_1', 't_conf_1',
                                't_cls_1', 'gt_list_1', 'coord_mask_2', 'conf_pos_mask_2', 'conf_neg_mask_2',
                                'cls_mask_2', 't_coord_2', 't_conf_2', 't_cls_2', 'gt_list_2'],
                column_order=["image", "annotation", 'coord_mask_0', 'conf_pos_mask_0', 'conf_neg_mask_0',
                              'cls_mask_0', 't_coord_0', 't_conf_0', 't_cls_0', 'gt_list_0', 'coord_mask_1',
                              'conf_pos_mask_1', 'conf_neg_mask_1', 'cls_mask_1', 't_coord_1', 't_conf_1',
                              't_cls_1', 'gt_list_1', 'coord_mask_2', 'conf_pos_mask_2', 'conf_neg_mask_2',
                              'cls_mask_2', 't_coord_2', 't_conf_2', 't_cls_2', 'gt_list_2'],
                operations=compose_map_func, num_parallel_workers=16, python_multiprocessing=True)

    ds = ds.batch(args.batch_size, drop_remainder=True, num_parallel_workers=8)

    args.steps_per_epoch = ds.get_dataset_size()
    lr = warmup_step_new(args, lr_scale=lr_scale)

    ds = ds.repeat(epoch)
    args.logger.info('args.steps_per_epoch:{}'.format(args.steps_per_epoch))
    args.logger.info('args.world_size:{}'.format(args.world_size))
    args.logger.info('args.local_rank:{}'.format(args.local_rank))
    args.logger.info('end create dataloader')
    args.logger.save_args(args)
    args.logger.important_info('start create network')
    create_network_start = time.time()

    # backbone and loss
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)

    criterion0 = YoloLoss(num_classes, anchors, anchors_mask[0], 64, 0, head_idx=0.0)
    criterion1 = YoloLoss(num_classes, anchors, anchors_mask[1], 32, 0, head_idx=1.0)
    criterion2 = YoloLoss(num_classes, anchors, anchors_mask[2], 16, 0, head_idx=2.0)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    train_net = BuildTrainNetworkV2(network, criterion0, criterion1, criterion2, args)

    # optimizer
    opt = Momentum(params=train_net.trainable_params(), learning_rate=Tensor(lr), momentum=momentum,
                   weight_decay=weight_decay)

    # package training process
    train_net = TrainOneStepWithLossScaleCell(train_net, opt)
    train_net.set_broadcast_flag()

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval, keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir, prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    scale_manager = DynamicLossScaleManager(init_loss_scale=2 ** 10, scale_factor=2, scale_window=2000)

    for data in ds.create_tuple_iterator(output_numpy=True):

        batch_images = data[0]
        batch_labels = data[1]
        coord_mask_0 = data[2]
        conf_pos_mask_0 = data[3]
        conf_neg_mask_0 = data[4]
        cls_mask_0 = data[5]
        t_coord_0 = data[6]
        t_conf_0 = data[7]
        t_cls_0 = data[8]
        gt_list_0 = data[9]
        coord_mask_1 = data[10]
        conf_pos_mask_1 = data[11]
        conf_neg_mask_1 = data[12]
        cls_mask_1 = data[13]
        t_coord_1 = data[14]
        t_conf_1 = data[15]
        t_cls_1 = data[16]
        gt_list_1 = data[17]
        coord_mask_2 = data[18]
        conf_pos_mask_2 = data[19]
        conf_neg_mask_2 = data[20]
        cls_mask_2 = data[21]
        t_coord_2 = data[22]
        t_conf_2 = data[23]
        t_cls_2 = data[24]
        gt_list_2 = data[25]

        img_tensor = Tensor(batch_images, mstype.float32)
        coord_mask_tensor_0 = Tensor(coord_mask_0.astype(np.float32))
        conf_pos_mask_tensor_0 = Tensor(conf_pos_mask_0.astype(np.float32))
        conf_neg_mask_tensor_0 = Tensor(conf_neg_mask_0.astype(np.float32))
        cls_mask_tensor_0 = Tensor(cls_mask_0.astype(np.float32))
        t_coord_tensor_0 = Tensor(t_coord_0.astype(np.float32))
        t_conf_tensor_0 = Tensor(t_conf_0.astype(np.float32))
        t_cls_tensor_0 = Tensor(t_cls_0.astype(np.float32))
        gt_list_tensor_0 = Tensor(gt_list_0.astype(np.float32))

        coord_mask_tensor_1 = Tensor(coord_mask_1.astype(np.float32))
        conf_pos_mask_tensor_1 = Tensor(conf_pos_mask_1.astype(np.float32))
        conf_neg_mask_tensor_1 = Tensor(conf_neg_mask_1.astype(np.float32))
        cls_mask_tensor_1 = Tensor(cls_mask_1.astype(np.float32))
        t_coord_tensor_1 = Tensor(t_coord_1.astype(np.float32))
        t_conf_tensor_1 = Tensor(t_conf_1.astype(np.float32))
        t_cls_tensor_1 = Tensor(t_cls_1.astype(np.float32))
        gt_list_tensor_1 = Tensor(gt_list_1.astype(np.float32))

        coord_mask_tensor_2 = Tensor(coord_mask_2.astype(np.float32))
        conf_pos_mask_tensor_2 = Tensor(conf_pos_mask_2.astype(np.float32))
        conf_neg_mask_tensor_2 = Tensor(conf_neg_mask_2.astype(np.float32))
        cls_mask_tensor_2 = Tensor(cls_mask_2.astype(np.float32))
        t_coord_tensor_2 = Tensor(t_coord_2.astype(np.float32))
        t_conf_tensor_2 = Tensor(t_conf_2.astype(np.float32))
        t_cls_tensor_2 = Tensor(t_cls_2.astype(np.float32))
        gt_list_tensor_2 = Tensor(gt_list_2.astype(np.float32))

        scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)

        loss0, overflow, _ = train_net(img_tensor, coord_mask_tensor_0, conf_pos_mask_tensor_0,
                                       conf_neg_mask_tensor_0, cls_mask_tensor_0, t_coord_tensor_0,
                                       t_conf_tensor_0, t_cls_tensor_0, gt_list_tensor_0,
                                       coord_mask_tensor_1, conf_pos_mask_tensor_1, conf_neg_mask_tensor_1,
                                       cls_mask_tensor_1, t_coord_tensor_1, t_conf_tensor_1,
                                       t_cls_tensor_1, gt_list_tensor_1, coord_mask_tensor_2,
                                       conf_pos_mask_tensor_2, conf_neg_mask_tensor_2,
                                       cls_mask_tensor_2, t_coord_tensor_2, t_conf_tensor_2,
                                       t_cls_tensor_2, gt_list_tensor_2, scaling_sens)

        # halve the loss scale on overflow; otherwise count this step towards the next increase
        overflow = np.all(overflow.asnumpy())
        scale_manager.update_loss_scale(overflow)
        args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
                         'batch_labels:{}'.format(args.local_rank, i, loss0, overflow, scaling_sens, lr[i],
                                                  batch_images.shape, batch_labels.shape))

        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save Log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))

        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1

        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i

        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()

        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
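
Note: the loop above drives DynamicLossScaleManager by hand. Its update policy is roughly the one sketched below (halve the scale on overflow, double it after scale_window consecutive clean steps); this is a plain-Python illustration of the idea, not MindSpore's implementation.

class LossScalePolicySketch:
    """Minimal dynamic loss-scale bookkeeping: shrink on overflow, grow after a clean window."""

    def __init__(self, init_loss_scale=2 ** 10, scale_factor=2, scale_window=2000):
        self.loss_scale = float(init_loss_scale)
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.good_steps = 0

    def update(self, overflow):
        if overflow:
            self.loss_scale = max(self.loss_scale / self.scale_factor, 1.0)
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps >= self.scale_window:
                self.loss_scale *= self.scale_factor
                self.good_steps = 0
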
Ejemplo n.º 27
0
def run_train():
    '''run train function.'''
    config.local_rank = get_rank_id()
    config.world_size = get_device_num()
    log_path = os.path.join(config.ckpt_path, 'logs')
    config.logger = get_logger(log_path, config.local_rank)

    support_train_stage = ['base', 'beta']
    if config.train_stage.lower() not in support_train_stage:
        config.logger.info('the given train stage is not supported.')
        raise ValueError('train stage not supported.')

    if not os.path.exists(config.data_dir):
        config.logger.info(
            'ERROR: data_dir does not exist, please set data_dir in config.py')
        raise ValueError(
            'ERROR: data_dir does not exist, please set data_dir in config.py')

    parallel_mode = ParallelMode.HYBRID_PARALLEL if config.is_distributed else ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=config.world_size,
                                      gradients_mean=True)
    if config.is_distributed:
        init()

    if config.local_rank % 8 == 0:
        if not os.path.exists(config.ckpt_path):
            os.makedirs(config.ckpt_path)

    de_dataset, steps_per_epoch, num_classes = get_de_dataset(config)
    config.logger.info('de_dataset: %d', de_dataset.get_dataset_size())

    config.steps_per_epoch = steps_per_epoch
    config.num_classes = num_classes
    config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
    config.logger.info('config.num_classes: %d', config.num_classes)
    config.logger.info('config.world_size: %d', config.world_size)
    config.logger.info('config.local_rank: %d', config.local_rank)
    config.logger.info('config.lr: %f', config.lr)

    if config.nc_16 == 1:
        if config.model_parallel == 0:
            if config.num_classes % 16 == 0:
                config.logger.info('data parallel: num_classes is already a multiple of 16: %d',
                                   config.num_classes)
            else:
                config.num_classes = (config.num_classes // 16 + 1) * 16
        else:
            if config.num_classes % (config.world_size * 16) == 0:
                config.logger.info('model parallel: num_classes is already a multiple of 16: %d',
                                   config.num_classes)
            else:
                config.num_classes = (config.num_classes //
                                      (config.world_size * 16) +
                                      1) * config.world_size * 16

    config.logger.info('for D, loaded, class nums: %d', config.num_classes)
    config.logger.info('steps_per_epoch: %d', config.steps_per_epoch)
    config.logger.info('img_total_num: %d',
                       config.steps_per_epoch * config.per_batch_size)

    config.logger.info('get_backbone----in----')
    _backbone = get_backbone(config)
    config.logger.info('get_backbone----out----')
    config.logger.info('get_metric_fc----in----')
    margin_fc_1 = get_metric_fc(config)
    config.logger.info('get_metric_fc----out----')
    config.logger.info('DistributedHelper----in----')
    network_1 = DistributedHelper(_backbone, margin_fc_1)
    config.logger.info('DistributedHelper----out----')
    config.logger.info('network fp16----in----')
    if config.fp16 == 1:
        network_1.add_flags_recursive(fp16=True)
    config.logger.info('network fp16----out----')

    criterion_1 = get_loss(config)
    if config.fp16 == 1 and config.model_parallel == 0:
        criterion_1.add_flags_recursive(fp32=True)

    network_1 = load_pretrain(config, network_1)
    train_net = BuildTrainNetwork(network_1, criterion_1, config)

    # warmup_step_list must be called after config.steps_per_epoch has been set
    config.lrs = warmup_step_list(config, gamma=0.1)
    lrs_gen = list_to_gen(config.lrs)
    opt = Momentum(params=train_net.trainable_params(),
                   learning_rate=lrs_gen,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.dynamic_init_loss_scale,
        scale_factor=2,
        scale_window=2000)
    model = Model(train_net,
                  optimizer=opt,
                  metrics=None,
                  loss_scale_manager=scale_manager)

    save_checkpoint_steps = config.ckpt_steps
    config.logger.info('save_checkpoint_steps: %d', save_checkpoint_steps)
    if config.max_ckpts == -1:
        keep_checkpoint_max = int(config.steps_per_epoch * config.max_epoch /
                                  save_checkpoint_steps) + 5
    else:
        keep_checkpoint_max = config.max_ckpts
    config.logger.info('keep_checkpoint_max: %d', keep_checkpoint_max)

    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                   keep_checkpoint_max=keep_checkpoint_max)
    config.logger.info('max_epoch_train: %d', config.max_epoch)
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=config.ckpt_path,
                              prefix='{}'.format(config.local_rank))
    config.epoch_cnt = 0
    progress_cb = ProgressMonitor(config)
    new_epoch_train = config.max_epoch * steps_per_epoch // config.log_interval
    model.train(new_epoch_train,
                de_dataset,
                callbacks=[progress_cb, ckpt_cb],
                sink_size=config.log_interval)
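
Note: the nc_16 branch above pads num_classes up to a multiple of 16 (or of world_size * 16 under model parallel), presumably so the class dimension splits evenly across devices. A small stand-alone illustration of that rounding:

def pad_num_classes(num_classes, world_size=1, model_parallel=False):
    """Round num_classes up to the next multiple of 16 (or world_size * 16)."""
    multiple = world_size * 16 if model_parallel else 16
    if num_classes % multiple == 0:
        return num_classes
    return (num_classes // multiple + 1) * multiple

# e.g. pad_num_classes(1000) -> 1008; pad_num_classes(1000, world_size=8, model_parallel=True) -> 1024
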
Ejemplo n.º 28
0
def train(args):
    '''train'''
    print('=============yolov3 start training==================')
    devid = int(os.getenv('DEVICE_ID',
                          '0')) if args.run_platform != 'CPU' else 0
    context.set_context(mode=context.GRAPH_MODE,
                        device_target=args.run_platform,
                        save_graphs=False,
                        device_id=devid)
    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.DATA_PARALLEL,
            device_num=args.world_size,
            gradients_mean=True)
    args.logger = get_logger(args.outputs_dir, args.local_rank)

    # dataloader
    ds = create_dataset(args)

    args.logger.important_info('start create network')
    create_network_start = time.time()

    train_net = define_network(args)

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                    keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config,
                              directory=args.outputs_dir,
                              prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    if args.use_loss_scale:
        scale_manager = DynamicLossScaleManager(init_loss_scale=2**10,
                                                scale_factor=2,
                                                scale_window=2000)
    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        input_list = [Tensor(batch_images, mstype.float32)]
        for idx in range(2, 26):
            input_list.append(Tensor(data[idx], mstype.float32))
        if args.use_loss_scale:
            scaling_sens = Tensor(scale_manager.get_loss_scale(),
                                  dtype=mstype.float32)
            loss0, overflow, _ = train_net(*input_list, scaling_sens)
            # halve the loss scale on overflow; otherwise count this step towards the next increase
            overflow = np.all(overflow.asnumpy())
            scale_manager.update_loss_scale(overflow)
            args.logger.info(
                'rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, batch_images:{}, '
                'batch_labels:{}'.format(args.local_rank, i, loss0, overflow,
                                         scaling_sens, args.lr[i],
                                         batch_images.shape,
                                         batch_labels.shape))
        else:
            loss0 = train_net(*input_list)
            args.logger.info(
                'rank[{}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
                'batch_labels:{}'.format(args.local_rank, i, loss0, args.lr[i],
                                         batch_images.shape,
                                         batch_labels.shape))
        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save Log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info(
                'Yolov3, graph compile time={:.2f}s'.format(
                    time_for_graph_compile))

        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1

        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (
                i - old_progress) * args.world_size / time_used
            args.logger.info(
                'epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(
                    epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i

        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info(
                '=================================================')
            args.logger.info(
                'epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(
                    epoch, i, fps))
            args.logger.info(
                '=================================================')
            t_epoch = time.time()

        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
Ejemplo n.º 29
0
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE,
                        enable_auto_mixed_precision=True,
                        device_target=args.platform,
                        save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size,
                                          gradients_mean=True)
    # dataloader
    de_dataset = classification_dataset(args.data_dir,
                                        args.image_size,
                                        args.per_batch_size,
                                        1,
                                        args.rank,
                                        args.group_size,
                                        num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone,
                          num_classes=args.num_classes,
                          platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor,
                        num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale,
                                                   drop_overflow_update=False)

    model = Model(network,
                  loss_fn=loss,
                  optimizer=opt,
                  loss_scale_manager=loss_scale_manager,
                  metrics={'acc'},
                  amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [
        progress_cb,
    ]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(
            save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
            keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir,
                                      'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch,
                de_dataset,
                callbacks=callbacks,
                dataset_sink_mode=True)
Ejemplo n.º 30
0
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    context.set_context(enable_task_sink=True)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(),
                         learning_rate=2e-5,
                         momentum=0.9)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**32, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(),
                ds,
                callbacks=callback,
                dataset_sink_mode=False)

    # an assertion fails if the loss_scale value is wrong
    count = 0
    for i in range(len(callback.overflow_list)):
        if callback.overflow_list[i] == Tensor(True, mstype.bool_) and i > 0:
            count = 0
            assert callback.lossscale_list[i] == callback.lossscale_list[
                i - 1] * Tensor(0.5, mstype.float32)
        if callback.overflow_list[i] == Tensor(False, mstype.bool_):
            count = count + 1
            if count == scale_window:
                count = 0
                assert callback.lossscale_list[i] == callback.lossscale_list[
                    i - 1] * Tensor(2.0, mstype.float32)