def train_process_bert_thor(q, device_id, epoch_size, device_num):
    """Worker process: train BERT with the THOR optimizer on one Ascend device
    and push the collected loss/step-time lists into the shared queue."""
    # Give each device its own working directory.
    os.makedirs(str(device_id), exist_ok=True)
    os.chdir(str(device_id))
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=device_id,
                        save_graphs=False)
    context.set_context(reserve_class_name_in_scope=False)
    context.set_context(max_call_depth=3000)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
    os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)

    # Initialize HCCL and configure data-parallel training.
    D.init()
    rank = device_id % device_num
    context.reset_auto_parallel_context()
    _set_bert_all_reduce_split()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True,
                                      device_num=device_num)

    data_set = create_bert_dataset(device_num=device_num, rank=rank, do_shuffle=False,
                                   data_dir=DATASET_PATH, schema_dir=None)
    net_with_loss = BertNetworkWithLoss(bert_net_cfg, True)

    # Cap the repeat count so training never exceeds `train_steps` total steps.
    new_repeat_count = epoch_size * data_set.get_dataset_size() // data_sink_steps
    new_repeat_count = min(new_repeat_count, train_steps // data_sink_steps)

    lr = get_bert_thor_lr()
    damping = get_bert_thor_damping()
    split_indices = [38, 77]
    optimizer = THOR(net_with_loss, lr, damping, momentum, weight_decay, loss_scale, batch_size,
                     decay_filter=lambda x: 'layernorm' not in x.name.lower()
                     and 'bias' not in x.name.lower(),
                     split_indices=split_indices)
    time_monitor_callback = TimeMonitor(data_sink_steps)
    loss_callback = LossCallback()
    callback = [time_monitor_callback, loss_callback]

    if load_checkpoint_path:
        param_dict = load_checkpoint(load_checkpoint_path)
        load_param_into_net(net_with_loss, param_dict)

    net_with_grads = BertTrainOneStepCell(net_with_loss, optimizer=optimizer)
    model = Model(net_with_grads)
    model = ConvertModelUtils().convert_to_thor_model(model, network=net_with_grads,
                                                      optimizer=optimizer, frequency=frequency)
    model.train(new_repeat_count, data_set, callbacks=callback,
                dataset_sink_mode=True, sink_size=data_sink_steps)

    # Hand the per-step losses and timings back to the parent process.
    loss_list = loss_callback.loss_list
    per_step_mseconds = time_monitor_callback.per_step_mseconds_list
    q.put({'loss': loss_list, 'cost': per_step_mseconds})
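# The THOR worker above is designed to be forked once per device, with a
# shared queue carrying each worker's results back to the parent. A minimal
# launcher sketch, assuming an 8-device Ascend host; `launch_bert_thor_sketch`
# and its defaults are illustrative only, not part of the original suite.
def launch_bert_thor_sketch(epoch_size=2, device_num=8):
    """Hypothetical sketch: spawn one training process per device and gather
    the per-worker {'loss': ..., 'cost': ...} dicts from the shared queue."""
    from multiprocessing import Process, Queue
    q = Queue()
    processes = []
    for device_id in range(device_num):
        process = Process(target=train_process_bert_thor,
                          args=(q, device_id, epoch_size, device_num))
        process.start()
        processes.append(process)
    # Drain the queue before joining so no worker blocks on a full pipe.
    results = [q.get() for _ in range(device_num)]
    for process in processes:
        process.join()
    return results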
def test_bert_precision(enable_graph_kernel=False):
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    if enable_graph_kernel:
        context.set_context(enable_graph_kernel=True)
    data_set, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9, power=10.0, warmup_steps=0)
    # LayerNorm and bias parameters are excluded from weight decay.
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    # Re-initialize all trainable weights so the run is reproducible.
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)

    # The assertions below fail if the loss value, overflow state or loss scale is wrong.
    loss_value = np.array(callback.loss_list)
    if enable_graph_kernel:
        expect_loss_value = [12.206627, 11.840489, 11.798470, 11.796345, 11.790964, 12.366766,
                             11.971539, 12.576565, 12.185522, 12.386192]
    else:
        assert np.allclose(loss_value[0], 12.2066, 0, 0.0005)
        expect_loss_value = [12.206587, 11.966410, 11.965916, 11.975922, 11.970262, 12.608881,
                             12.174048, 12.840656, 12.407923, 12.631133]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0,
                         65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
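# `test_bert_precision` above (and the tests below) rely on a `ModelCallback`
# helper defined elsewhere in the suite. A minimal sketch of its assumed
# contract: `BertTrainOneStepWithLossScaleCell` returns a
# (loss, overflow, loss_scale) tuple as its network outputs, and the callback
# buffers all three after every step. The real helper may differ in detail.
from mindspore.train.callback import Callback


class ModelCallbackSketch(Callback):
    """Hypothetical stand-in for ModelCallback: records loss, overflow state
    and loss scale at the end of each training step."""

    def __init__(self):
        super(ModelCallbackSketch, self).__init__()
        self.loss_list = []
        self.overflow_list = []
        self.lossscale_list = []

    def step_end(self, run_context):
        cb_params = run_context.original_args()
        loss, overflow, loss_scale = cb_params.net_outputs
        self.loss_list.append(loss.asnumpy().item())
        self.overflow_list.append(bool(overflow.asnumpy().item()))
        self.lossscale_list.append(loss_scale.asnumpy().item())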
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=sink_size * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9, power=10.0, warmup_steps=0)
    # LayerNorm and bias parameters are excluded from weight decay.
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    # Re-initialize all trainable weights so the run is reproducible;
    # parameters that still carry a deferred init (`has_init`) are skipped.
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor) and not value.has_init:
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    time_monitor_callback = TimeMonitor(sink_size)
    model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True, sink_size=sink_size)

    # The assertions below fail if the loss value, overflow state or loss scale is wrong.
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [11.3660, 11.3265, 11.3264]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    # Timing checks: use the third epoch, after warm-up, with a small slack.
    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 14
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
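# All of these tests re-initialize the network through a `weight_variable`
# helper defined elsewhere. Because the expected loss lists are exact, the
# real helper must be deterministically seeded. A minimal sketch, assuming a
# normal initializer with BERT's usual 0.02 stddev; the distribution, stddev
# and seeding of the real helper may differ.
def weight_variable_sketch(shape):
    """Hypothetical sketch: a fresh float32 weight Tensor of `shape`
    (assumes np.random has been seeded once at module level)."""
    return Tensor(np.random.normal(0.0, 0.02, shape).astype(np.float32))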
def test_bert_precision_legacy():
    """test bert precision, exercising the legacy `Parameter.default_input` API
    (renamed from the misspelled `test_bert_percision` to avoid clashing with
    `test_bert_precision` above)"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    ds, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9, power=10.0, warmup_steps=0)
    # LayerNorm and bias parameters are excluded from weight decay.
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    # Re-initialize all trainable weights via the legacy
    # `Parameter.default_input` assignment API.
    for param in params:
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False)

    # The assertions below fail if the loss value, overflow state or loss scale is wrong.
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 12.2065868, 0, 0.000001)
    expect_loss_value = [12.2065868, 11.8651543, 11.8282356, 11.8266964, 11.8210478,
                         12.4073524, 12.0055466, 12.6212320, 12.2229223, 12.4272099]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0,
                         65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
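# `me_de_train_dataset` is also defined elsewhere; a minimal sketch of its
# contract, assuming BERT pretraining examples in TFRecord form. The paths,
# column list, repeat count and sink-size policy below are illustrative
# placeholders, not the suite's actual configuration.
import mindspore.dataset as de


def me_de_train_dataset_sketch(sink_mode=False, batch_size=16):
    """Hypothetical sketch: return (dataset, new_repeat_count, sink_size) in
    the shape the tests above consume them."""
    data_files = ["/path/to/examples.tfrecord"]   # placeholder path
    schema_file = "/path/to/datasetSchema.json"   # placeholder path
    columns = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
               "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]
    data_set = de.TFRecordDataset(data_files, schema_file, columns_list=columns)
    data_set = data_set.batch(batch_size, drop_remainder=True)
    new_repeat_count = 1
    # In sink mode the caller also needs the number of steps to sink per epoch.
    sink_size = data_set.get_dataset_size() if sink_mode else -1
    return data_set, new_repeat_count, sink_size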