Example #1
def train_process_bert_thor(q, device_id, epoch_size, device_num):
    os.system("mkdir " + str(device_id))
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id, save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(max_call_depth=3000)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)

    D.init()
    rank = device_id % device_num
    context.reset_auto_parallel_context()
    _set_bert_all_reduce_split()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=device_num)

    data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False, data_dir=DATASET_PATH,
                                   schema_dir=None)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps
    new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)

    lr = get_bert_thor_lr()
    damping = get_bert_thor_damping()
    split_indices = [38, 77]
    optimizer = THOR(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
                     decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
                     split_indices=split_indices)
    time_monitor_callback = TimeMonitor(data_sink_steps)
    loss_callback = LossCallback()
    callback = [time_monitor_callback, loss_callback]

    if load_checkpoint_path:
        param_dict = load_checkpoint(load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads, optimizer=optimizer,
                                                      frequency=frequency)
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=True, sink_size=data_sink_steps)

    loss_list = loss_callback.loss_list
    per_step_mseconds = time_monitor_callback.per_step_mseconds_list
    q.put({'loss': loss_list, 'cost': per_step_mseconds})
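
train_process_bert_thor is written to run in one worker process per Ascend device and reports its results back through the queue q. A minimal launch sketch, assuming Python's standard multiprocessing module and illustrative values for device_num and epoch_size (neither value comes from the source):

from multiprocessing import Process, Queue

if __name__ == "__main__":
    device_num = 8   # illustrative device count, not from the source
    epoch_size = 1   # illustrative epoch count, not from the source
    q = Queue()
    workers = [Process(target=train_process_bert_thor,
                       args=(q, device_id, epoch_size, device_num))
               for device_id in range(device_num)]
    for worker in workers:
        worker.start()
    # each worker puts one {'loss': [...], 'cost': [...]} dict on the queue
    results = [q.get() for _ in workers]
    for worker in workers:
        worker.join()
    print([result['loss'][-1] for result in results])  # final loss per device

Draining the queue before join() avoids blocking when the per-device result lists are large.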
Example #2
def test_bert_precision(enable_graph_kernel=False):
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    if enable_graph_kernel:
        context.set_context(enable_graph_kernel=True)
    data_set, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9,
                          power=10.0, warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)

    # the assertions below fail if the loss values, overflow states or loss_scale values are wrong
    loss_value = np.array(callback.loss_list)

    if enable_graph_kernel:
        expect_loss_value = [12.206627, 11.840489, 11.798470, 11.796345, 11.790964, 12.366766, 11.971539, 12.576565,
                             12.185522, 12.386192]
    else:
        assert np.allclose(loss_value[0], 12.2066, 0, 0.0005)
        expect_loss_value = [12.206587, 11.966410, 11.965916, 11.975922, 11.970262, 12.608881, 12.174048, 12.840656,
                             12.407923, 12.631133]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
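
The expected overflow and loss_scale sequences above follow from the dynamic loss-scale rule: with DynamicLossScaleManager(2 ** 16, 2, scale_window=3) the scale is halved on an overflow step and doubled after three consecutive overflow-free steps. A standalone sketch of that update rule (an approximation of the manager's behaviour, not its actual implementation) reproduces the expected values:

def simulate_dynamic_loss_scale(overflows, init_scale=2 ** 16, scale_factor=2, scale_window=3):
    # hedged approximation of the DynamicLossScaleManager update rule
    scale, good_steps, history = init_scale, 0, []
    for overflow in overflows:
        if overflow:
            scale = max(scale / scale_factor, 1)  # halve the scale on overflow
            good_steps = 0
        else:
            good_steps += 1
            if good_steps >= scale_window:        # double after scale_window clean steps
                scale *= scale_factor
                good_steps = 0
        history.append(scale)                     # scale after this step's update
    return history

assert simulate_dynamic_loss_scale(
    [False, False, False, True, False, False, False, True, False, False]
) == [65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
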
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)

    lr = BertLearningRate(decay_steps=sink_size * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)

    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor) and not value.has_init:
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    time_monitor_callback = TimeMonitor(sink_size)
    model.train(new_repeat_count,
                data_set,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True,
                sink_size=sink_size)

    # the assertions below fail if the loss values, overflow states or loss_scale values are wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [11.3660, 11.3265, 11.3264]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 14
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
def test_bert_percision():
    """test bert percision"""
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False)

    # the assertions below fail if the loss values, overflow states or loss_scale values are wrong
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 12.2065868, 0, 0.000001)

    expect_loss_value = [
        12.2065868, 11.8651543, 11.8282356, 11.8266964, 11.8210478, 12.4073524,
        12.0055466, 12.6212320, 12.2229223, 12.4272099
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        False, False, False, True, False, False, False, True, False, False
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0,
        65536.0, 65536.0, 65536.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
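
All three tests build the Lamb parameter groups the same way: parameters whose names contain 'layernorm' or 'bias' are excluded from weight decay, and 'order_params' pins the original parameter order. A minimal sketch of the same name-based split, using made-up parameter names rather than MindSpore Parameters:

# hypothetical parameter names, for illustration only
param_names = [
    "bert.encoder.layer.0.attention.dense.weight",
    "bert.encoder.layer.0.attention.dense.bias",
    "bert.encoder.layer.0.layernorm.gamma",
    "bert.cls2.dense.weight",
]

decay = [n for n in param_names if 'layernorm' not in n.lower() and 'bias' not in n.lower()]
no_decay = [n for n in param_names if 'layernorm' in n.lower() or 'bias' in n.lower()]

# mirrors group_params above: decayed weights get weight_decay=0.01, while
# LayerNorm/bias parameters fall back to the optimizer default (no decay)
assert sorted(decay + no_decay) == sorted(param_names)
print(decay)     # the two dense weights
print(no_decay)  # the bias and layernorm parameters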