def classify(
    model: BertForTokenClassification,
    tokenizer: BertTokenizerFast,
    # inputs
    sequence: str,
    labels: List[int] = None,
) -> List[Entity]:
    """Run token classification on a single sequence.

    Given a model, a tokenizer and a sequence:
    - if `labels` is provided (the labeled case below), the loss for the given
      sequence is returned;
    - if `labels` is None, the recognized entities are returned.

    A trained model can be used here directly.
    """
    # Ensure the model is configured to return a dict;
    # otherwise the attribute accesses below would break.
    if not model.config.return_dict:
        raise ValueError(
            'Model should be instantiated with `return_dict=True`')

    # Convert the input sequence (and optional labels) into an inputs bundle.
    inputs, mask = pack_sequence_as_inputs(
        tokenizer=tokenizer,
        sequence=sequence,
        labels=labels,
        max_token_length=model.config.max_position_embeddings,
    )

    # Put data on the GPU (if available).
    # if torch.cuda.is_available():
    #     model.cuda()
    #     inputs = {k: v.cuda() for k, v in inputs.items()}

    # If labels is not None, the caller is interested in the loss value of the
    # given input sequence, so this is done in a grad context.
    if labels is not None:
        return model(**inputs).loss

    # If labels is None, the caller is interested in the entities recognized by
    # the model. In this case, the outputs can be computed without a grad context.
    with torch.no_grad():
        logits = model(**inputs).logits.cpu()

    # Decode the model's output.
    entities = extract_entities(
        sequence=sequence,
        logits=logits[:, 1:-1][mask],
        encode=tokenizer.encode,
        decode=tokenizer.decode,
    )
    entities = realign_extracted_entities(
        sequence=sequence,
        tokens=tokenizer.tokenize(sequence),
        entities=entities,
        vocab=tokenizer.get_vocab(),
    )
    return list(entities)
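
# Usage sketch (illustration only, not part of the original module): shows how
# `classify` might be called for prediction and for loss computation. The
# checkpoint path, example sentence and label ids below are placeholders.
def _classify_usage_example():
    model = BertForTokenClassification.from_pretrained(
        'path/to/finetuned-token-classification-checkpoint', return_dict=True)
    tokenizer = BertTokenizerFast.from_pretrained(
        'path/to/finetuned-token-classification-checkpoint')
    model.eval()

    # Prediction mode: without labels, the recognized entities are returned.
    entities = classify(
        model=model,
        tokenizer=tokenizer,
        sequence='Example input sentence。',
    )
    print(entities)

    # Loss mode: with per-token labels (one id per token of the sequence),
    # the loss tensor is returned instead of the entities.
    # loss = classify(model=model, tokenizer=tokenizer,
    #                 sequence='...', labels=[...])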
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Override the architecture hyper-parameters for a smaller, long-input model.
    print("Config before overrides:", config)
    config.max_position_embeddings = 4096
    config.num_hidden_layers = 6
    config.num_attention_heads = 8
    config.hidden_size = 512
    config.intermediate_size = 2048
    print("Config after overrides:", config)

    # if model_args.tokenizer_name:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     raise ValueError(
    #         "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
    #         "and load it from here, using --tokenizer_name"
    #     )

    logging.info("Loading tokenizer")
    if model_args.tokenizer_name:
        tokenizer = BertTokenizerFast(model_args.tokenizer_name,
                                      clean_text=True,
                                      lowercase=False,
                                      strip_accents=True)
    else:
        raise ValueError("Specify tokenizer name")

    logging.info("Loading model")
    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    logging.info("Resizing embeddings")
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer.get_vocab()), len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")
    # Get datasets
    logging.info("Loading train dataset")
    train_dataset = get_dataset(data_args) if training_args.do_train else None
    logging.info("Loading eval dataset")
    eval_dataset = (get_dataset(
        data_args,
        evaluate=True,
    ) if training_args.do_eval else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    logging.info("Initializing trainer")
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        logging.info("Training")
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
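
# Entry-point sketch (illustration only; not shown in the excerpt above). The
# script name and flag values are placeholders, and the full flag set depends
# on the ModelArguments / DataTrainingArguments dataclasses:
#
#   python run_language_modeling.py \
#       --tokenizer_name ./vocab.txt \
#       --output_dir ./output \
#       --do_train --mlm \
#       --overwrite_output_dir
if __name__ == "__main__":
    main()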
def batchify_long_inputs(
    tokenizer: BertTokenizerFast,
    inputs: Dict[str, torch.Tensor],
    labels: List[int] = None,
    max_token_length=512,
):
    """
    When the input sequence is longer than `max_token_length` (512 by default)
    tokens, it is split into blocks and packed into a batch, mainly for
    efficiency purposes.
    """
    token_ids = inputs['input_ids'][0, 1:-1].tolist()

    # Get the special token IDs for later use.
    cls = tokenizer.cls_token_id
    sep = tokenizer.sep_token_id
    pad = tokenizer.pad_token_id

    # The token IDs of the split sequence are collected in arrays in
    # preparation for building the matrices below.
    token_blocks, label_blocks = [], []
    for start, end in split_into_blocks(
            token_ids=token_ids,
            separator_token_id=tokenizer.get_vocab().get('。'),
            block_size=max_token_length - 2,
    ):
        # Split the input tokens into blocks (separated by the period token).
        token_blocks.append([cls] + token_ids[start:end] + [sep])
        # Also split the labels into blocks (if provided).
        if labels is not None:
            label_blocks.append([0] + labels[start:end] + [0])

    # Create a matrix vertically stacking the token IDs.
    # The width of this matrix depends on the longest token block.
    # Also, each row of the matrix contains the [CLS] and [SEP] tokens.
    max_block_len = max([len(block) for block in token_blocks])
    input_ids = torch.tensor([
        block + [pad] * (max_block_len - len(block)) for block in token_blocks
    ])
    attention_mask = torch.tensor([[1] * len(block) + [0] *
                                   (max_block_len - len(block))
                                   for block in token_blocks])
    label_ids = torch.tensor([
        block + [pad] * (max_block_len - len(block)) for block in label_blocks
    ]) if labels is not None else None

    # Basically the same as `attention_mask`, except that it does not take the
    # [CLS] and [SEP] positions into account. This is created so that the
    # final logits can be indexed conveniently.
    mask = torch.tensor([[1] * (len(block) - 2) + [0] *
                         (max_block_len - len(block))
                         for block in token_blocks],
                        dtype=torch.bool)

    # Combine the inputs as one batch to be processed at once.
    inputs = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': torch.zeros_like(input_ids),
    }

    # Add the labels tensor (if applicable).
    if labels is not None:
        inputs['labels'] = label_ids

    return inputs, mask
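
# `split_into_blocks` is referenced above but not defined in this module.
# The sketch below is a hypothetical stand-in (note the different name) showing
# one way such a helper could yield (start, end) index pairs: blocks of at most
# `block_size` tokens, preferably cut right after a separator token ('。').
def _split_into_blocks_sketch(token_ids, separator_token_id, block_size):
    start, n = 0, len(token_ids)
    while start < n:
        end = min(start + block_size, n)
        if end < n:
            # Look backwards for the last separator inside the window so that
            # a sentence is not cut mid-way; fall back to a hard cut otherwise.
            for i in range(end - 1, start, -1):
                if token_ids[i] == separator_token_id:
                    end = i + 1
                    break
        yield start, end
        start = end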