def run():
    lu.setup_logging()
    # Show the config
    logger.info("Config:\n{}".format(cfg))
    # Seed everything for reproducibility
    common.seed_everything(cfg.RNG_SEED)
    # Configure the CUDNN backend
    torch.backends.cudnn.benchmark = cfg.CUDNN.BENCHMARK
    # Train the model
    train_model()
def main():
    args = get_argparse().parse_args()
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    time_ = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    log_file = os.path.join(args.output_dir, 'tener-{}.log'.format(time_))
    init_logger(log_file=log_file)

    if args.gpu and args.use_cuda:
        device = torch.device("cuda:%s" % args.gpu)
        args.n_gpu = 1
    elif args.local_rank == -1 or args.use_cuda:
        device = torch.device(
            "cuda" if torch.cuda.is_available() and args.use_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initialize the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        # The device object that torch.Tensor will be allocated on
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1))

    seed_everything(args.seed)
    ner_processor = CluenerProcessor(args.data_path)

    model = NER_model(ner_processor, args)
    model.to(args.device)
    if args.model_path is not None:
        model.load_state_dict(
            torch.load(os.path.join(args.model_path, 'pytorch_model.bin')))

    if args.do_train:
        train(args, ner_processor, model)
    if args.do_eval:
        evaluate(args, ner_processor, model, show_entity_info=True)
    if args.do_predict:
        predict(args, ner_processor, model)
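# NOTE: seed_everything(seed) is called throughout these scripts but its
# definition is not shown in this file. The sketch below is an assumption
# (not the repo's actual helper): a typical implementation that seeds Python,
# NumPy and PyTorch for reproducibility.
import os
import random

import numpy as np
import torch


def seed_everything(seed=42):
    """Assumed helper: seed all RNGs the training scripts depend on."""
    random.seed(seed)                         # Python's built-in RNG
    os.environ["PYTHONHASHSEED"] = str(seed)  # hash randomization
    np.random.seed(seed)                      # NumPy RNG
    torch.manual_seed(seed)                   # PyTorch CPU RNG
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)      # all visible GPUs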
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpt_config_file",
                        default="CDial-GPT2_LCCC-base/config.json",
                        type=str,
                        help="The config json file corresponding to the pre-trained GPT model.")
    parser.add_argument("--vocab_file",
                        default="CDial-GPT2_LCCC-base/vocab.txt",
                        type=str,
                        help="The vocabulary file that the GPT model was trained on.")
    parser.add_argument("--init_checkpoint",
                        default="CDial-GPT2_LCCC-base/pytorch_model.bin",
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained GPT model).")

    # Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the predictions will be written.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--cache_dir", default=None, type=str, required=True,
                        help="The output directory where the data cache will be written.")

    # Other parameters
    parser.add_argument("--name", type=str, default="GPT2Gen",
                        help="name of the model")
    parser.add_argument("--dataset", type=str, default="GPT2Gen",
                        help="Dataloader class.")
    parser.add_argument("--datapath", type=str, default="resources://OpenSubtitles",
                        help="Directory for data set.")
    parser.add_argument("--max_sent_length", default=192, type=int,
                        help="The max length of the sentence pair.")
    parser.add_argument("--num_turns", default=8, type=int,
                        help="The max turn length of the post field.")
    parser.add_argument("--is_relative", action="store_true",
                        help="If True, use relative turn embedding, else use absolute turn embedding.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--cache", action="store_true",
                        help="Whether to save the data result.")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int,
                        help="Total batch size for prediction.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for the optimizer.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")

    # ----------------- Inference-time parameters -------------------
    parser.add_argument("--no_sample", action="store_true",
                        help="Set to use greedy decoding instead of sampling.")
    parser.add_argument("--min_decoder_length", type=int, default=3,
                        help="The minimum length of the generated response.")
    parser.add_argument("--max_decoder_length", type=int, default=30,
                        help="The maximum length of the generated response.")
    parser.add_argument("--temperature", type=float, default=1,
                        help="Sampling softmax temperature.")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.0,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    # --------------------------------------------------------

    # Fraction of training during which learning-rate warm-up is applied
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action="store_true",
                        help="Whether not to use CUDA when available.")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization.")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True, action="store_true",
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    args = parser.parse_args()

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)

    data_class = GPTGen
    config_class = GPT2Config
    model_class = GPT2LMHeadModel
    tokenizer_class = BertTokenizer

    # Load the data
    def load_dataset(file_id, vocab_name, do_lower_case, max_sent_length, num_turns, is_relative):
        dm = data_class(file_id=file_id,
                        vocab_name=vocab_name,
                        do_lower_case=do_lower_case,
                        max_sent_length=max_sent_length,
                        num_turns=num_turns,
                        is_relative=is_relative)
        return dm

    logger.info("Loading data for training")
    if args.cache:
        dataManager = try_cache(load_dataset,
                                {"file_id": args.datapath,
                                 "vocab_name": args.vocab_file,
                                 "do_lower_case": args.do_lower_case,
                                 "max_sent_length": args.max_sent_length,
                                 "num_turns": args.num_turns,
                                 "is_relative": args.is_relative},
                                args.cache_dir, data_class.__name__)
    else:
        dataManager = load_dataset(file_id=args.datapath,
                                   vocab_name=args.vocab_file,
                                   do_lower_case=args.do_lower_case,
                                   max_sent_length=args.max_sent_length,
                                   num_turns=args.num_turns,
                                   is_relative=args.is_relative)

    if args.do_train:
        if not args.no_cuda:
            if "CUDA_VISIBLE_DEVICES" not in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        logger.info(f"device: {device} | n_gpu: {n_gpu}")

        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                f"Invalid gradient_accumulation_steps parameter: {args.gradient_accumulation_steps}, should be >= 1")
        args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

        seed_everything(args.seed)

        logger.info("train examples {}".format(len(dataManager.data["train"]["resp"])))
        num_train_steps = int(len(dataManager.data["train"]["resp"])
                              / args.train_batch_size
                              / args.gradient_accumulation_steps
                              * args.num_train_epochs)

        # Load the pre-trained model
        config = config_class.from_json_file(args.gpt_config_file)
        config.num_turns = args.num_turns
        model = model_class(config)
        if args.init_checkpoint is not None:
            logger.info("Loading pre-trained GPT weights")
            state_dict = torch.load(args.init_checkpoint, map_location="cpu")
            missing_keys = []
            unexpected_keys = []
            error_msgs = []
            # Copy state_dict so _load_from_state_dict below can modify it
            metadata = getattr(state_dict, "_metadata", None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=""):
                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                             missing_keys, unexpected_keys, error_msgs)
                for name, child in module._modules.items():
                    if child is not None:
                        load(child, prefix + name + ".")

            load(model, prefix="" if hasattr(model, "transformer") else "transformer.")
            logger.info("missing keys: {}".format(missing_keys))
            logger.info("unexpected keys: {}".format(unexpected_keys))
            logger.info("error msgs: {}".format(error_msgs))

        model.to(device)
        model = torch.nn.DataParallel(model)

        # Prepare the optimizer and parameter groups
        param_optimizer = list(model.named_parameters())
        # Drop the pooler: it is unused and produces None gradients,
        # which breaks apex
        param_optimizer = [n for n in param_optimizer if "pooler" not in n[0]]
        no_decay = ["bias", "ln_1.bias", "ln_1.weight", "ln_2.bias", "ln_2.weight"]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01},
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0},
        ]
        t_total = num_train_steps
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num training_examples = %d", len(dataManager.data['train']['resp']))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        losses = []
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            # Reset the training data iterator
            dataManager.restart(key="train", batch_size=args.train_batch_size, shuffle=True)
            # Fetch the next batch
            data = dataManager.get_next_batch(key="train")
            step = 0
            loss_value = 0
            while data is not None:
                if n_gpu == 1:
                    preprocess_batch(data, device)
                else:
                    preprocess_batch(data)
                outputs = model(input_ids=data["input_ids"],
                                attention_mask=data["input_mask"],
                                token_type_ids=data["token_type_ids"],
                                turn_ids=data["turn_ids"],
                                labels=data["lm_labels"])
                # loss: a scalar tensor
                # lm_logits: [batch, seq_length, vocab_size]
                loss, lm_logits = outputs[0], outputs[1]
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss_value += loss.cpu().item() * args.gradient_accumulation_steps

                loss.backward()

                # Once enough gradients have been accumulated, take an optimizer step
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # Modify the learning rate with the special warm-up GPT uses
                    lr_this_step = args.learning_rate * warmup_linear(global_step / t_total,
                                                                      args.warmup_proportion)
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    losses.append(loss.detach().cpu().item())

                # Log the running loss
                if (step + 1) % 1000 == 0:
                    logger.info(
                        f"step: {step + 1} | loss: {(loss_value / 1000):.4f} | ppl: {(np.exp(loss_value / 1000)):.4f}")
                    loss_value = 0

                step += 1
                data = dataManager.get_next_batch(key="train")

            logger.info(f"Saving model pytorch_model.{int(args.num_train_epochs)}.{epoch + 1}.bin")
            output_model_file = os.path.join(
                args.model_dir,
                f"pytorch_model.{int(args.num_train_epochs)}.{int(epoch + 1)}.bin")
            # Save the trained model
            model_to_save = model.module if hasattr(model, "module") else model
            torch.save(model_to_save.state_dict(), output_model_file)

        # Save the training losses
        logger.info("Saving the training losses")
        save_losses(args.model_dir, losses={"loss": losses})
        logger.info("Training finished")

    # Evaluation on the test set
    if args.do_predict:
        total_epoch = int(args.num_train_epochs)
        chosen_epoch = 10
        if not args.no_cuda:
            if "CUDA_VISIBLE_DEVICES" not in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()

        seed_everything(args.seed)

        output_model_file = os.path.join(args.model_dir,
                                         "pytorch_model.%d.%d.bin" % (total_epoch, chosen_epoch))
        model_state_dict = torch.load(output_model_file)
        tokenizer = tokenizer_class(vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
        config = config_class.from_json_file(args.gpt_config_file)
        config.num_turns = args.num_turns
        model = model_class(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

        logger.info(f"transform special tokens {SPECIAL_TOKENS} to ids")
        special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # MyMetrics computes BLEU and distinct;
        # both are computed over Chinese words rather than characters
        metric1 = MyMetrics()
        metric2 = MyPerplexity(unk_id=dataManager.bert_unk_id)

        logger.info("***** Running testing *****")
        logger.info("  Num post-response pairs = %d", len(dataManager.data['test']['resp']))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")
        dataManager.restart(key='test', batch_size=args.predict_batch_size, shuffle=False)
        data = dataManager.get_next_batch(key='test')

        # Collect the reference and generated responses
        gold_strings = []
        gen_strings = []
        while data is not None:
            cur_batch_size = int(len(data["input_ids"]))
            for i in range(cur_batch_size):
                input_ids = data["input_ids"][i]
                token_type_ids = data["token_type_ids"][i]
                turn_ids = data["turn_ids"][i]
                # posts_len = data["posts_len"][i]
                # The response as a list of tokens (not yet converted to ids)
                resp_list = data["resp"][i]
                resp_length = data["resp_lens"][i]

                # Generate all output ids
                with torch.no_grad():
                    # pred_logits has already been passed through log_softmax
                    # pred_ids: [seq_len], a Python list
                    # pred_logits: [seq_len, vocab_size], a torch.Tensor
                    pred_ids, pred_logits = sample_sequence(history=input_ids,
                                                            model=model,
                                                            args=args,
                                                            device=device,
                                                            special_tokens_ids=special_tokens_ids,
                                                            token_type_ids=token_type_ids,
                                                            turn_ids=turn_ids,
                                                            current_output=None)
                # Convert the output ids to tokens;
                # decode returns a single string with tokens joined by spaces
                pred_text = tokenizer.decode(pred_ids, skip_special_tokens=False)

                # Compute BLEU and distinct
                pred_text_string = "".join(pred_text.split())
                resp_text_string = "".join(resp_list)
                metric1.forward(ref=resp_text_string, hyp=pred_text_string)

                # Compute perplexity:
                # convert resp_list to a torch.Tensor of token ids
                resp_ids = torch.tensor(tokenizer.convert_tokens_to_ids(resp_list),
                                        dtype=torch.long,
                                        device=pred_logits.device)
                metric2.forward(resp_length=resp_length,
                                resp_ids=resp_ids,
                                gen_log_prob=pred_logits)

                gold_strings.append(resp_text_string)
                gen_strings.append(pred_text_string)
            data = dataManager.get_next_batch(key="test")

        hits = test_process_hits(dataManager, model, args)
        result = metric1.close()
        result.update(metric2.close())
        result.update(hits)

        # Save the prediction results
        output_prediction_file = args.output_dir + f"/{args.name}_test.{total_epoch}.{chosen_epoch}.txt"
        logger.info(f"Writing test metrics to {output_prediction_file}")
        with open(output_prediction_file, "w", encoding="utf-8") as f:
            print("Test Result: ")
            res_print = list(result.items())
            res_print.sort(key=lambda x: x[0])
            for key, value in res_print:
                if isinstance(value, float):
                    print(f"\t{key}:\t{value}")
                    f.write(f"{key}:\t{value}\n")
            f.write("\n")
            for gold, gen in zip(gold_strings, gen_strings):
                f.write(f"resp:\t{gold}\n")
                f.write(f"gen:\t{gen}\n\n")
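# NOTE: the GPT training loop above scales the learning rate by
# warmup_linear(global_step / t_total, args.warmup_proportion), but that helper
# is not defined in this file. A minimal sketch consistent with the call site
# (an assumption, modelled on the classic linear warm-up / linear decay schedule):
def warmup_linear(x, warmup=0.002):
    """Assumed schedule: ramp linearly from 0 to 1 over the first `warmup`
    fraction of training, then decay linearly back towards 0 afterwards."""
    if x < warmup:
        return x / warmup
    return max((1.0 - x) / (1.0 - warmup), 0.0)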
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bert_config_file",
                        default="chinese_wwm_pytorch/bert_config.json",
                        type=str,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file",
                        default="chinese_wwm_pytorch/vocab.txt",
                        type=str,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--init_checkpoint",
                        default="chinese_wwm_pytorch/pytorch_model.bin",
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Required parameters
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--model_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")
    parser.add_argument("--cache_dir", default=None, type=str, required=True,
                        help="The output directory where the data cache will be written.")

    ## Other parameters
    parser.add_argument('--name', type=str, default='BERTRetrieval',
                        help='name of model')
    parser.add_argument('--dataset', type=str, default='ChDialogMemBERTRetrieval',
                        help='Dataloader class. Default: ChDialogMemBERTRetrieval')
    parser.add_argument('--datapath', type=str, default='resources://OpenSubtitles',
                        help='Directory for data set. Default: resources://OpenSubtitles')
    parser.add_argument('--wv_class', type=str, default='TencentChinese',
                        help="Wordvector class, none for not using pretrained wordvec. Default: TencentChinese")
    parser.add_argument('--wv_path', type=str, default='/home/zhengchujie/wordvector/chinese',
                        help="Directory for pretrained wordvector.")
    parser.add_argument('--embedding_size', type=int, default=200,
                        help="The embed dim of the pretrained word vector.")
    parser.add_argument("--num_choices", default=10, type=int,
                        help="the number of retrieval options")
    parser.add_argument("--max_sent_length", default=192, type=int,
                        help="The max length of the sentence pair.")
    parser.add_argument("--max_know_length", default=100, type=int,
                        help="The max length of the knowledge triplets")
    parser.add_argument("--num_turns", default=8, type=int,
                        help="The max turn length of the post field.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--cache", action='store_true',
                        help="Whether to cache the preprocessed data.")
    parser.add_argument("--train_batch_size", default=8, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=16, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--lamb", default=0.6, type=float,
                        help="The factor of the attention loss.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    args = parser.parse_args()

    if not args.do_train and not args.do_predict:
        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    if not os.path.exists(args.model_dir):
        os.makedirs(args.model_dir, exist_ok=True)

    data_class = MyMemBERTRetrieval
    wordvec_class = TencentChinese

    # Load the data
    def load_dataset(file_id, bert_vocab_name, do_lower_case, num_choices,
                     max_sent_length, max_know_length, num_turns):
        dm = data_class(file_id=file_id,
                        bert_vocab_name=bert_vocab_name,
                        do_lower_case=do_lower_case,
                        num_choices=num_choices,
                        max_sent_length=max_sent_length,
                        max_know_length=max_know_length,
                        num_turns=num_turns)
        return dm

    logger.info("Loading data for training")
    if args.cache:
        if not os.path.isdir(args.cache_dir):
            os.mkdir(args.cache_dir)
        logger.info("Loading cached data")
        dataManager = try_cache(load_dataset,
                                {"file_id": args.datapath,
                                 "bert_vocab_name": args.vocab_file,
                                 "do_lower_case": args.do_lower_case,
                                 "num_choices": args.num_choices,
                                 "max_sent_length": args.max_sent_length,
                                 "max_know_length": args.max_know_length,
                                 "num_turns": args.num_turns},
                                args.cache_dir, data_class.__name__)
        vocab = dataManager.id2know_word
        logger.info("Loading the word vector file")
        embed = try_cache(lambda wv, ez, vl: wordvec_class(wv).load_matrix(ez, vl),
                          (args.wv_path, args.embedding_size, vocab),
                          args.cache_dir, wordvec_class.__name__)
    else:
        dataManager = load_dataset(file_id=args.datapath,
                                   bert_vocab_name=args.vocab_file,
                                   do_lower_case=args.do_lower_case,
                                   num_choices=args.num_choices,
                                   max_sent_length=args.max_sent_length,
                                   max_know_length=args.max_know_length,
                                   num_turns=args.num_turns)
        logger.info("Loading the word vector file")
        wv = wordvec_class(args.wv_path)
        vocab = dataManager.id2know_word
        embed = wv.load_matrix(args.embedding_size, vocab)
    # dataManager._max_know_length = 100

    if args.do_train:
        if not args.no_cuda:
            if "CUDA_VISIBLE_DEVICES" not in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        logger.info("device: {} n_gpu: {}".format(device, n_gpu))

        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                    args.gradient_accumulation_steps))
        args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

        seed_everything(args.seed)

        logger.info("train examples {}".format(len(dataManager.data['train']['resp'])))
        num_train_steps = int(len(dataManager.data['train']['resp'])
                              / args.train_batch_size
                              / args.gradient_accumulation_steps
                              * args.num_train_epochs)

        # Prepare model
        '''
        if os.path.exists(output_model_file):
            model_state_dict = torch.load(output_model_file)
            model = BERTRetrieval(num_choices=args.num_choices, bert_config_file=args.bert_config_file)
            model.load_state_dict(model_state_dict)
        '''
        model = BERTRetrieval(num_choices=args.num_choices,
                              bert_config_file=args.bert_config_file,
                              init_embeddings=embed)
        if args.init_checkpoint is not None:
            logger.info('load bert weight')
            state_dict = torch.load(args.init_checkpoint, map_location='cpu')
            missing_keys = []
            unexpected_keys = []
            error_msgs = []
            # copy state_dict so _load_from_state_dict can modify it
            metadata = getattr(state_dict, '_metadata', None)
            state_dict = state_dict.copy()
            if metadata is not None:
                state_dict._metadata = metadata

            def load(module, prefix=''):
                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                module._load_from_state_dict(state_dict, prefix, local_metadata, True,
                                             missing_keys, unexpected_keys, error_msgs)
                for name, child in module._modules.items():
                    # logger.info("name {} child {}".format(name, child))
                    if child is not None:
                        load(child, prefix + name + '.')

            load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
            logger.info("missing keys:{}".format(missing_keys))
            logger.info('unexpected keys:{}'.format(unexpected_keys))
            logger.info('error msgs:{}'.format(error_msgs))

        model.to(device)
        model = torch.nn.DataParallel(model)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # hack to remove pooler, which is not used
        # and thus produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        # Layer-wise learning rates: a fixed 2e-5 for BERT parameters,
        # the configured learning rate for everything else
        bert_param_optimizer = [(n, p) for n, p in param_optimizer if ("bert" in n)]
        other_param_optimizer = [(n, p) for n, p in param_optimizer if ("bert" not in n)]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
        optimizer_grouped_parameters = [
            {"params": [p for n, p in bert_param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01, "lr": 2e-5},
            {"params": [p for n, p in bert_param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": 2e-5},
            {"params": [p for n, p in other_param_optimizer if not any(nd in n for nd in no_decay)],
             "weight_decay": 0.01, "lr": args.learning_rate},
            {"params": [p for n, p in other_param_optimizer if any(nd in n for nd in no_decay)],
             "weight_decay": 0.0, "lr": args.learning_rate},
        ]

        t_total = num_train_steps
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(t_total * args.warmup_proportion),
                                                    num_training_steps=t_total)
        global_step = 0

        logger.info("***** Running training *****")
        logger.info("  Num post-response pairs = %d", len(dataManager.data['train']['resp']))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        losses = []
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            model.zero_grad()
            dataManager.restart(key='train', batch_size=args.train_batch_size)
            data = dataManager.get_next_batch(key='train')
            step = 0
            loss_value = 0
            kg_loss_value = 0
            kg_acc_value = 0
            while data is not None:
                if n_gpu == 1:
                    # multi-gpu does the scattering itself
                    preprocess_batch(data, device)
                else:
                    preprocess_batch(data)
                loss, kg_loss, kg_acc = model(data, data['labels'])
                loss = loss + args.lamb * kg_loss
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss_value += loss.cpu().item() * args.gradient_accumulation_steps
                kg_loss_value += kg_loss.cpu().item()
                kg_acc_value += kg_acc.cpu().item()

                loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    # The linear warm-up is handled by the scheduler
                    # lr_this_step = args.learning_rate * warmup_linear(global_step / t_total, args.warmup_proportion)
                    # for param_group in optimizer.param_groups:
                    #     param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1
                    # Record the current loss before the next backward pass
                    losses.append(loss.detach().cpu().item())

                if (step + 1) % 1000 == 0:
                    logger.info("step:{} | loss@{} | kg_loss@{} | kg_acc@{}".format(
                        step + 1, loss_value / 1000, kg_loss_value / 1000, kg_acc_value / 1000))
                    loss_value = 0
                    kg_loss_value = 0
                    kg_acc_value = 0

                step += 1
                data = dataManager.get_next_batch(key='train')

            logger.info(f"Saving model pytorch_model.{int(args.num_train_epochs)}.{epoch + 1}.bin")
            output_model_file = os.path.join(
                args.model_dir,
                "pytorch_model.%d.%d.bin" % (int(args.num_train_epochs), epoch + 1))
            # Save the trained model (only the model itself)
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)

        # Save all loss values
        logger.info("Saving the training losses")
        save_losses(args.model_dir, losses={"loss": losses})
        logger.info("Training finished")

    # Load a trained model that you have fine-tuned
    if args.do_predict:
        total_epoch = int(args.num_train_epochs)
        chosen_epoch = 10
        if not args.no_cuda:
            if "CUDA_VISIBLE_DEVICES" not in os.environ:
                os.environ["CUDA_VISIBLE_DEVICES"] = '0'
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()

        seed_everything(args.seed)

        output_model_file = os.path.join(args.model_dir,
                                         "pytorch_model.%d.%d.bin" % (total_epoch, chosen_epoch))
        model_state_dict = torch.load(output_model_file)
        model = BERTRetrieval(num_choices=args.num_choices,
                              bert_config_file=args.bert_config_file,
                              init_embeddings=embed)
        model.load_state_dict(model_state_dict)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        metric = MyMetrics()

        logger.info("***** Running testing *****")
        logger.info("  Num post-response pairs = %d", len(dataManager.data['test']['resp']))
        logger.info("  Batch size = %d", args.predict_batch_size)

        model.eval()
        logger.info("Start evaluating")
        dataManager.restart(key='test', batch_size=args.predict_batch_size, shuffle=False)
        data = dataManager.get_next_batch(key='test')

        gens = []
        gold = []
        choices = []
        hits = {1: [0, 0], 3: [0, 0], 5: [0, 0]}
        while data is not None:
            preprocess_batch(data, device)
            truth_response, can_responses = data['resp'], data['can_resps']
            with torch.no_grad():
                prob, pred = model(data)
            assert len(pred) == len(truth_response)
            assert len(pred) == len(can_responses)
            assert len(can_responses[0]) == args.num_choices

            for truth, pd, cans, prb in zip(truth_response, pred, can_responses, prob):
                metric.forward(truth, cans[pd])
                gold.append(truth)
                gens.append(cans[pd])
                choices.append(cans)

                idx = cans.index(truth)
                p_sort = np.argsort(prb)
                for key, count in hits.items():
                    if idx in p_sort[-key:]:
                        count[0] += 1
                    count[1] += 1
            data = dataManager.get_next_batch(key='test')

        result = metric.close()
        result.update({'hits@%d' % key: value[0] / value[1] for key, value in hits.items()})

        output_prediction_file = args.output_dir + f"/{args.name}_test.{total_epoch}.{chosen_epoch}.txt"
        with open(output_prediction_file, "w", encoding="utf-8") as f:
            print("Test Result:")
            res_print = list(result.items())
            res_print.sort(key=lambda x: x[0])
            for key, value in res_print:
                if isinstance(value, float):
                    print("\t%s:\t%f" % (key, value))
                    f.write("%s:\t%f\n" % (key, value))
            f.write('\n')
            for resp, gen, options in zip(gold, gens, choices):
                f.write("resp:\t%s\n" % resp)
                f.write("gen:\t%s\n\n" % gen)
                for i, option in enumerate(options):
                    f.write("candidate %d:\t%s\n" % (i, option))
                f.write("\n")
help="number of arenas") parser.add_argument("--batch_size", type=int, default=32) parser.add_argument("--print_interval", type=int, default=20) parser.add_argument("--gpu", action="store_true", default=False, help="Use GPU if available (default device)") parser.add_argument("--tag", action="append", nargs="+", default=[], help="add user tags to run") args = parser.parse_args() seed_everything(args.seed) token = os.environ.get('TELEGRAM_BOT_TOKEN', None) channel = os.environ.get('TELEGRAM_CHANNEL', None) if args.notg: token = None channel = None commit_hash = subprocess.check_output( ['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip('\n') experiment_tags = ["dqn_v1", f"commit_{str(commit_hash)}"] tags = list(np.asarray(args.tag).flatten()) experiment_tags.extend(tags) tgwriter = TelegramWriter(token, channel) with tgwriter.post() as f:
def train(args, processor, model):
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_dataset = NER_dataset(
        load_and_cache_examples(args, processor, data_type='train'),
        args.train_max_seq_len)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 \
        else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    t_total = len(train_dataloader) * args.epoch

    transformer_param_optimizer = list(model.transformer.parameters())
    crf_param_optimizer = list(model.crf.parameters())
    linear_param_optimizer = list(model.out_fc.parameters())
    optimizer_grouped_parameters = [
        {'params': transformer_param_optimizer, 'lr': args.learning_rate},
        {'params': crf_param_optimizer, 'lr': args.crf_learning_rate},
        {'params': linear_param_optimizer, 'lr': args.crf_learning_rate},
    ]
    args.warmup_steps = int(t_total * args.warmup_rate)
    if args.optim == 'sgd':
        optimizer = optim.SGD(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              momentum=args.momentum_rate)
    elif args.optim == 'adam':
        optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.epoch)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    steps_trained_in_current_epoch = 0
    best_f1 = 0.0
    tr_loss = 0.0
    model.zero_grad()
    # Added here for reproducibility (even between python 2 and 3)
    seed_everything(args.seed)
    for index in range(int(args.epoch)):
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "input_mask": batch[1],
                "labels": batch[2],
                "input_lens": batch[3]
            }
            outputs = model(**inputs)
            # model outputs are always tuples in pytorch-transformers (see doc)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            loss.backward()
            tr_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            if global_step % args.log_steps == 0:
                logger.info(
                    "training process —— epoch:%d —— global_step-%d —— loss-%.4f"
                    % (index + 1, global_step + 1, loss.item()))
            global_step += 1

        if args.local_rank in [-1, 0]:
            # Log metrics
            print(" ")
            if args.local_rank == -1:
                # Only evaluate on a single GPU, otherwise metrics may not average well
                eval_results = evaluate(args, processor, model)
                if eval_results['f1'] > best_f1:
                    logger.info(
                        f"\nEpoch {index + 1}: eval_f1 improved from {best_f1} to {eval_results['f1']}")
                    output_dir = os.path.join(args.output_dir, "best_model")
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    torch.save(model_to_save.state_dict(),
                               os.path.join(output_dir, "pytorch_model.bin"))
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    best_f1 = eval_results['f1']
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, tr_loss / global_step