def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1, resume=False):
    """
    Do train.

    Args:
        dataset: the train dataset.
        network: the network with loss.
        load_checkpoint_path: the file path of the pretrained model checkpoint.
        save_checkpoint_path: the file path where the finetuned model checkpoint will be saved.
        epoch_num: the number of epochs.
        resume: if True, load the checkpoint as-is to resume a finetune run; otherwise remap pretrained parameter names.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")

    steps_per_epoch = dataset.get_dataset_size()  # samples / batch_size

    # print info
    print("=" * 30, "TRAIN INFO", "=" * 30)
    print("optimizer: {}".format(cfg.optimizer))

    # select optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()  # a list of all trainable parameters of the network

        # use parameter groups and set different weight decay values
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))  # without layernorm and bias
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))  # layernorm and bias
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        # print info
        print("lr: {}".format(cfg.Lamb.learning_rate))
        print("end_learning_rate: {}".format(cfg.Lamb.end_learning_rate))
        print("power: {}".format(cfg.Lamb.power))
        lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="gpt2_summarization",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    reorganized_param_dict = dict()
    if not resume:
        print("Do not resume.\nRESUME STATE: {}".format(resume))
        # remap pretrained backbone names onto the finetune wrapper's namespace
        for net_name in param_dict:
            reorganized_param_dict['gpt2.gpt2.' + net_name] = param_dict[net_name]
        # tie the LM head weight to the pretrained token embedding table
        reorganized_param_dict['gpt2.lm_head.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
    else:
        print("Start to resume training.\nRESUME STATE: {}".format(resume))
        reorganized_param_dict = param_dict
    load_param_into_net(network, reorganized_param_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]
    print("============== Starting Training For Summarization Task ==============")
    model.train(epoch_num, dataset, callbacks=callbacks, dataset_sink_mode=False)
    print("============== Summarization Training Success ==============")
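# Note on the remapping above: the pretrained checkpoint stores parameters under
# backbone names (e.g. "gpt2_embedding_lookup.embedding_table"), while the finetune
# network nests the backbone one level deeper, so every key gains a "gpt2.gpt2."
# prefix and the LM head is tied to the embedding table. A minimal, framework-free
# sketch of that renaming (toy key names and plain floats stand in for real
# MindSpore Parameters):
pretrained = {
    "gpt2_embedding_lookup.embedding_table": 0.1,
    "gpt2_decoder.layer0.attention.dense.weight": 0.2,
}
remapped = {"gpt2.gpt2." + name: value for name, value in pretrained.items()}
remapped["gpt2.lm_head.weight"] = pretrained["gpt2_embedding_lookup.embedding_table"]  # weight tying
assert "gpt2.gpt2.gpt2_decoder.layer0.attention.dense.weight" in remapped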
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """
    Do train.

    Args:
        dataset: the train dataset.
        network: the network with loss.
        load_checkpoint_path: the file path of the pretrained model checkpoint.
        save_checkpoint_path: the file path where the finetuned model checkpoint will be saved.
        epoch_num: the number of epochs.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")

    steps_per_epoch = dataset.get_dataset_size()

    # optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    prefix_name = "gpt2_translation_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \
                  + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size)
    ckpoint_cb = ModelCheckpoint(prefix=prefix_name,
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    final_param_dict = {}
    for name, value in param_dict.items():
        final_param_dict['gpt2.gpt2.' + name] = value
    # tie the dense1 (LM head) weight to the pretrained token embedding table
    final_param_dict['gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
    load_param_into_net(network, final_param_dict)
    print("Load the pretrained parameters successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]
    print("=================== Starting Training For Translation Task ====================")
    model.train(epoch_num, dataset, callbacks=callbacks, dataset_sink_mode=False)
    print("=================== Translation Training Success ====================")
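# The AdamWeightDecay branch above relies on cfg.AdamWeightDecay.decay_filter to
# split parameters into a group that receives weight decay (dense weights) and a
# group that does not (LayerNorm scales and biases). The exact filter lives in the
# task config; below is a plausible definition (an assumption, not the repo's
# verbatim code) and a framework-free demonstration of the grouping, with
# FakeParam standing in for a MindSpore Parameter:
from collections import namedtuple

FakeParam = namedtuple("FakeParam", ["name"])

def decay_filter(param):
    # assumed filter: decay everything except layernorm parameters and biases
    name = param.name.lower()
    return "layernorm" not in name and "bias" not in name

params = [FakeParam("dense.weight"), FakeParam("dense.bias"), FakeParam("layernorm.gamma")]
decay_params = [p for p in params if decay_filter(p)]
other_params = [p for p in params if not decay_filter(p)]
assert [p.name for p in decay_params] == ["dense.weight"]
assert [p.name for p in other_params] == ["dense.bias", "layernorm.gamma"]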
def test_train():
    """Finetune function."""
    target = args_opt.device_target
    if target == "Ascend":
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=devid)

    poetry, tokenizer, keep_words = create_tokenizer()
    print(len(keep_words))

    dataset = create_poetry_dataset(bert_net_cfg.batch_size, poetry, tokenizer)

    num_tokens = 3191
    poetrymodel = BertPoetryModel(bert_net_cfg, True, num_tokens, dropout_prob=0.1)
    netwithloss = BertPoetry(poetrymodel, bert_net_cfg, True, dropout_prob=0.1)
    callback = LossCallBack(poetrymodel)

    # optimizer
    steps_per_epoch = dataset.get_dataset_size()
    print("============ steps_per_epoch is {}".format(steps_per_epoch))
    lr_schedule = BertLearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                   end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                   warmup_steps=1000,
                                   decay_steps=cfg.epoch_num * steps_per_epoch,
                                   power=cfg.AdamWeightDecay.power)
    optimizer = AdamWeightDecay(netwithloss.trainable_params(), lr_schedule)

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix, directory=cfg.ckpt_dir, config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)
    new_dict = {}
    # keep only the rows of the embedding lookup table that correspond to the retained vocabulary
    for key in param_dict:
        if "bert_embedding_lookup" not in key:
            new_dict[key] = param_dict[key]
        else:
            value = param_dict[key]
            np_value = value.data.asnumpy()
            np_value = np_value[keep_words]
            tensor_value = Tensor(np_value, mstype.float32)
            parameter_value = Parameter(tensor_value, name=key)
            new_dict[key] = parameter_value
    load_param_into_net(netwithloss, new_dict)

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = BertPoetryCell(netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset, callbacks=[callback, ckpoint_cb], dataset_sink_mode=True)
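# The embedding-trimming loop above uses NumPy fancy indexing: indexing a
# (vocab_size, hidden_size) table with the list of kept token ids returns a
# smaller (len(keep_words), hidden_size) table whose row order matches the new,
# compressed vocabulary. A toy demonstration:
import numpy as np

full_table = np.arange(12, dtype=np.float32).reshape(6, 2)  # 6 tokens, hidden size 2
keep_words_demo = [0, 3, 5]                                 # token ids kept in the new vocab
trimmed = full_table[keep_words_demo]
assert trimmed.shape == (3, 2)
assert (trimmed[1] == full_table[3]).all()  # row 1 now holds the old token id 3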
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """
    Do train.

    Args:
        dataset: the train dataset.
        network: the network with loss.
        load_checkpoint_path: the file path of the pretrained model checkpoint.
        save_checkpoint_path: the file path where the finetuned model checkpoint will be saved.
        epoch_num: the number of epochs.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")

    steps_per_epoch = dataset.get_dataset_size()  # samples / batch_size

    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()  # a list of all trainable parameters of the network

        # use parameter groups and set different weight decay values
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))  # without layernorm and bias
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))  # layernorm and bias
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="gpt2_language_model_wiki2",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)
    param_dict = load_checkpoint(load_checkpoint_path)
    final_param_dict = {}
    for name, value in param_dict.items():
        final_param_dict['gpt2_loss.gpt2.gpt2.' + name] = value
    # set the weight of the final linear layer to the weight of the gpt2 token embedding
    final_param_dict['gpt2_loss.gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
    load_param_into_net(network, final_param_dict)
    print("Load new parameters successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor()
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]
    print("============== Starting Training ==============")
    model.train(epoch_num, dataset, callbacks=callbacks)
    print("============== Training Success ==============")
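# Every task above builds its schedule with warmup_steps equal to 10% of the total
# training steps (steps_per_epoch * epoch_num). Assuming GPT2LearningRate composes
# a linear warmup with polynomial decay, as its argument names suggest (the real
# definition lives elsewhere in the repo), the per-step learning rate would look
# roughly like this self-contained sketch:
def lr_at_step(step, learning_rate, end_learning_rate, warmup_steps, decay_steps, power):
    if step < warmup_steps:
        # linear warmup from 0 up to the peak learning rate
        return learning_rate * step / warmup_steps
    # polynomial decay from the peak down to end_learning_rate
    progress = min(step, decay_steps) / decay_steps
    return (learning_rate - end_learning_rate) * (1 - progress) ** power + end_learning_rate

# example: 1000 total steps, 100 warmup steps, linear decay (power=1.0) to 0
print(lr_at_step(50, 1e-4, 0.0, 100, 1000, 1.0))   # 5e-05, halfway through warmup
print(lr_at_step(500, 1e-4, 0.0, 100, 1000, 1.0))  # 5e-05, halfway through decay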