def save_all(self, path: str, tokenizer: BertTokenizer, label_encoder):
    """Save all files needed for inference.

    :param path: directory to write the artifacts to
    :param tokenizer: BERT tokenizer
    :param label_encoder: label encoder
    :return:
    """
    torch.save(self.model.state_dict(), os.path.join(path, config.MODEL_NAME))
    tokenizer.save_pretrained(path)
    with open(os.path.join(path, 'label_encoder.pkl'), 'wb') as output:
        pickle.dump(label_encoder, output)
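# A minimal sketch of the matching load path, assuming the same `config.MODEL_NAME`
# constant used above; `MyModel` is hypothetical and stands in for whatever
# architecture produced the saved state dict.
def load_all(path: str):
    tokenizer = BertTokenizer.from_pretrained(path)
    with open(os.path.join(path, 'label_encoder.pkl'), 'rb') as f:
        label_encoder = pickle.load(f)
    model = MyModel()  # hypothetical: instantiate the same architecture as in training
    model.load_state_dict(torch.load(os.path.join(path, config.MODEL_NAME)))
    model.eval()  # inference mode
    return model, tokenizer, label_encoder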
def save_model(
    args,
    model,
    optimizer,
    src_tokenizer: BertTokenizer,
    tgt_tokenizer: GPT2Tokenizer,
    nstep,
    nepoch,
    bleu,
    loss,
):
    # Log the overall training metrics
    train_metric_log_file = os.path.join(args.output_dir, "training_metric.tsv")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if os.path.exists(train_metric_log_file):
        with open(train_metric_log_file, "a", encoding="utf-8") as fa:
            fa.write("{}\t{}\t{}\t{}\n".format(nepoch, nstep, loss, bleu))
    else:
        with open(train_metric_log_file, "w", encoding="utf-8") as fw:
            fw.write("epoch\tstep\tloss\tbleu\n")
            fw.write("{}\t{}\t{}\t{}\n".format(nepoch, nstep, loss, bleu))

    # Save the model
    model_save_path = os.path.join(
        args.output_dir, "epoch{}_step{}/".format(nepoch, nstep)
    )
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    if args.local_rank == 0 or args.local_rank == -1:
        print(
            "epoch:{} step:{} loss:{} bleu:{} model save complete.".format(
                nepoch, nstep, round(loss, 4), round(bleu, 4)
            )
        )
    if args.save_optimizer:
        torch.save(optimizer, os.path.join(model_save_path, "optimizer.pt"))

    # Save the tokenizers
    src_tokenizer.save_pretrained(os.path.join(model_save_path, "src_tokenizer"))
    tgt_tokenizer.save_pretrained(os.path.join(model_save_path, "tgt_tokenizer"))
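# A minimal sketch of restoring a checkpoint written by save_model above. It
# assumes the directory layout created there; `ModelClass` is hypothetical and
# stands in for whichever pretrained class the training script instantiates.
checkpoint_dir = "output/epoch1_step1000"  # hypothetical checkpoint directory
model = ModelClass.from_pretrained(checkpoint_dir)  # hypothetical class
src_tokenizer = BertTokenizer.from_pretrained(
    os.path.join(checkpoint_dir, "src_tokenizer"))
tgt_tokenizer = GPT2Tokenizer.from_pretrained(
    os.path.join(checkpoint_dir, "tgt_tokenizer"))
# Only present when training ran with args.save_optimizer:
# optimizer = torch.load(os.path.join(checkpoint_dir, "optimizer.pt"))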
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    with Path('foodbert/data/used_ingredients.json').open() as f:
        used_ingredients = json.load(f)  # Don't split these tokens
    tokenizer = BertTokenizer(
        vocab_file='foodbert/data/bert-base-cased-vocab.txt',
        do_lower_case=False,
        max_len=128,  # For one-sentence instructions, longer shouldn't be necessary
        never_split=used_ingredients)

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                local_rank=training_args.local_rank,
                                evaluate=True)
                    if training_args.do_eval else None)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Make sure checkpoint recovery and continuous training work on GPU;
    # we probably need to push all parameters to the GPU explicitly.
    # Solves a bug in Trainer: https://github.com/huggingface/transformers/issues/4240
    model.to(training_args.device)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
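# A small illustration of what the masked-LM collator above produces; this is a
# hypothetical snippet meant to run inside main() after `tokenizer` and
# `data_collator` are built. With mlm=True, roughly mlm_probability of the
# non-special tokens are masked and the labels keep the original ids at those
# positions (-100 elsewhere, so the loss ignores them).
example = tokenizer.encode("Chop the onions finely", return_tensors="pt")[0]
batch = data_collator([example])  # older transformers versions use data_collator.collate_batch([example])
print(batch.keys())  # input_ids plus the masked-LM labels (key name varies across versions)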
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: BertTokenizer) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_batch_size * max(1, args.n_gpu)

    # Pad every example in the batch to the same length
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    # Create dataloader for training
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    # Prepare gradient accumulation
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Resize the embeddings to the tokenizer's vocabulary
    model = model.module if hasattr(
        model, 'module') else model  # take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, 'optimizer.pt'))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, 'scheduler.pt'))):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))

    # Mixed-precision training
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                'Please install apex from https://www.github.com/nvidia/apex to use fp16 training.'
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # Multi-GPU training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Display log information before training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # Set global_step to the global step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split('-')[-1].split(
                '/')[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info(
                "  Continuing training from checkpoint, will skip to saved global step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc='Epoch',
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc='Iteration',
                              disable=args.local_rank not in [-1, 0])
        if args.local_rank != -1:
            train_sampler.set_epoch(epoch)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            # Mask the input tokens for masked language modeling
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(
                inputs, masked_lm_labels=labels) if args.mlm else model(
                    inputs, labels=labels)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate on single GPU, otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info('Saving model checkpoint to %s', output_dir)
                    _rotate_checkpoints(args, checkpoint_prefix)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, 'optimizer.pt'))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, 'scheduler.pt'))
                    logger.info('Saving optimizer and scheduler states to %s',
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
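# `mask_tokens` is called above but not defined in this snippet. A minimal sketch
# mirroring the standard masking from the Hugging Face run_language_modeling
# example: sample positions with probability args.mlm_probability; of those,
# 80% become [MASK], 10% a random token, 10% stay unchanged; all unselected
# positions get label -100 so the loss ignores them.
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    labels = inputs.clone()
    # Sample tokens with probability args.mlm_probability, excluding special tokens
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked tokens

    # 80% of the time, replace the masked input token with [MASK]
    indices_replaced = torch.bernoulli(torch.full(
        labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, replace the masked input token with a random word
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The remaining 10% keep the original token
    return inputs, labels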
class BuildCustomTransformersVocabulary(object):
    def __init__(self,
                 base_vocab_path='./vocab_small.txt',
                 additional_special_tokens={
                     'additional_special_tokens':
                     ['<num>', '<img>', '<url>', '#E-s', '|||']
                 }):
        self.tokenizer = BertTokenizer(vocab_file=base_vocab_path,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
        self.tokenizer.add_special_tokens(additional_special_tokens)
        self.no_vocab_tokens = set()

    def get_no_vocab_token(self, text, unk_token='[UNK]', other_split=False):
        """Compare tokenizations and collect tokens missing from the vocabulary.

        @param text: input text
        @param unk_token: the unknown-token symbol
        @param other_split: whether to also collect tokens that the basic
            tokenizer yields as a single token but the BERT tokenizer still
            splits into multiple pieces
        @return:
        """
        # text_tokens = self.tokenizer.tokenize(text)  # tokens the BERT tokenizer produces from its vocabulary (may contain [UNK])
        origin_tokens = self.tokenize(text)  # basic tokenization; out-of-vocabulary words are NOT converted to [UNK]
        # # The first approach cannot guarantee a one-to-one mapping: some split
        # # tokens get split again when they are re-tokenized.
        # assert len(text_tokens) == len(origin_tokens)
        for idx, token in enumerate(origin_tokens):
            # Re-tokenize against the base vocabulary with the transformers tokenizer
            bert_token = self.tokenizer.tokenize(token)
            # if token != origin_tokens[idx]:
            #     # Add the unknown token to the vocabulary
            #     self.no_vocab_tokens.append(origin_tokens[idx])
            if len(bert_token) == 1 and bert_token[0] == unk_token:
                self.no_vocab_tokens.add(token)  # the set handles deduplication
            if other_split and len(bert_token) > 1:
                # A single token was split into multiple pieces by the BERT
                # tokenizer but should actually stay whole
                self.no_vocab_tokens.add(token)

    def _tokenize(self, text):
        """Split text into a list of tokens."""
        tokens_list = self.tokenizer.basic_tokenizer.tokenize(
            text, never_split=self.tokenizer.all_special_tokens)
        return tokens_list

    def tokenize(self, text: str, **kwargs):
        """ Converts a string in a sequence of tokens (string), using the tokenizer.
            Split in words for word-based vocabulary or sub-words for sub-word-based
            vocabularies (BPE/SentencePieces/WordPieces). Take care of added tokens.

            Args:
                text (:obj:`string`): The sequence to be encoded.
                **kwargs (:obj: `dict`): Arguments passed to the model-specific
                    `prepare_for_tokenization` preprocessing method.
        """
        all_special_tokens = self.tokenizer.all_special_tokens
        text = self.tokenizer.prepare_for_tokenization(text, **kwargs)

        # TODO: should this be in the base class?
        def lowercase_text(t):
            # convert non-special tokens to lowercase
            escaped_special_toks = [
                re.escape(s_tok) for s_tok in all_special_tokens
            ]
            pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
            return re.sub(pattern,
                          lambda m: m.groups()[0] or m.groups()[1].lower(), t)

        if self.tokenizer.init_kwargs.get("do_lower_case", False):
            text = lowercase_text(text)

        def split_on_token(tok, text):
            result = []
            split_text = text.split(tok)
            for i, sub_text in enumerate(split_text):
                sub_text = sub_text.rstrip()
                if i == 0 and not sub_text:
                    result += [tok]
                elif i == len(split_text) - 1:
                    if sub_text:
                        result += [sub_text]
                    else:
                        pass
                else:
                    if sub_text:
                        result += [sub_text]
                    result += [tok]
            return result

        def split_on_tokens(tok_list, text):
            if not text.strip():
                return []
            if not tok_list:
                return self._tokenize(text)

            tokenized_text = []
            text_list = [text]
            for tok in tok_list:
                tokenized_text = []
                for sub_text in text_list:
                    if sub_text not in self.tokenizer.unique_added_tokens_encoder:
                        tokenized_text += split_on_token(tok, sub_text)
                    else:
                        tokenized_text += [sub_text]
                text_list = tokenized_text

            return list(
                itertools.chain.from_iterable(
                    (self._tokenize(token)
                     if token not in self.tokenizer.unique_added_tokens_encoder
                     else [token] for token in tokenized_text)))

        added_tokens = self.tokenizer.unique_added_tokens_encoder
        tokenized_text = split_on_tokens(added_tokens, text)
        return tokenized_text

    def update_vocab(self, new_vocab_tokens: list):
        """Extend the original base vocabulary.

        @param new_vocab_tokens: tokens to add
        @return: the number of tokens actually added
        """
        add_token_num = self.tokenizer.add_tokens(new_vocab_tokens)
        return add_token_num

    def custom_save_vocabulary(self, new_vocab_path):
        """Save the new vocabulary."""
        if os.path.exists(new_vocab_path):
            os.remove(new_vocab_path)
        index = 0
        with open(new_vocab_path, mode='w', encoding='utf-8') as writer:
            for token, token_index in sorted(self.tokenizer.vocab.items(),
                                             key=lambda kv: kv[1]):
                if index != token_index:
                    print(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".
                        format(new_vocab_path))
                    index = token_index
                writer.write(token + "\n")
                index += 1
            # Append the newly added tokens to the vocabulary file
            add_tokens_vocab = OrderedDict(self.tokenizer.added_tokens_encoder)
            for token, token_index in sorted(add_tokens_vocab.items(),
                                             key=lambda kv: kv[1]):
                if index != token_index:
                    print(
                        "Saving vocabulary to {}: vocabulary indices are not consecutive."
                        " Please check that the vocabulary is not corrupted!".
                        format(new_vocab_path))
                    index = token_index
                writer.write(token + "\n")
                index += 1
        return new_vocab_path

    def save_vocab_pretrained(self, vocab_pretrained_path):
        """Save the complete pretrained tokenizer artifacts."""
        if not os.path.exists(vocab_pretrained_path):
            # Create the directory if it does not exist
            os.makedirs(vocab_pretrained_path)
        all_file = self.tokenizer.save_pretrained(
            vocab_pretrained_path)  # save all tokenizer files
        # model.resize_token_embeddings(len(tokenizer)) -> resize the embedding
        # matrix, since the vocabulary size has changed.
        # Notice: resize_token_embeddings expects to receive the full size of the
        # new vocabulary, i.e. the length of the tokenizer.
        return all_file
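# A minimal usage sketch, assuming a base vocab file at ./vocab_small.txt; the
# one-line corpus is a hypothetical stand-in for your own text iterable. It
# collects out-of-vocabulary tokens, extends the vocabulary, and saves the
# result for reuse with from_pretrained.
builder = BuildCustomTransformersVocabulary(base_vocab_path='./vocab_small.txt')
for line in ["example text with <num> and <url>"]:  # replace with your corpus
    builder.get_no_vocab_token(line)
builder.update_vocab(sorted(builder.no_vocab_tokens))
builder.save_vocab_pretrained('./custom_vocab_pretrained')
# Remember to call model.resize_token_embeddings(len(builder.tokenizer)) afterwards.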
# Scratch exploration of BertTokenizer special-token handling.
tokenizer = BertTokenizer("data/atis/token.vocab",
                          bos_token="<BOS>",
                          eos_token="<EOS>",
                          model_max_len=50)
y = "<BOS> embedding what is the flight number <EOS>"
tokenizer.prepare_for_model(tokenizer.encode(y), return_tensors="pt")
tokenizer.SPECIAL_TOKENS_ATTRIBUTES
tokenizer.encode(y)
tokenizer.encode_plus(y)
ids = tokenizer.encode_plus(y)
tokenizer.decode(tokenizer.encode(y))
tokenizer.save_pretrained("data/atis/save")
tokenizer.save_vocabulary("data/atis/save/saved")

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                          bos_token="<BOS>",
                                          eos_token="<EOS>")
tokenizer.tokenize("i like tea")
special_tokens = {"bos_token": "<BOS>", "eos_token": "<EOS>"}
tokenizer.add_special_tokens(special_tokens)
tokenizer.bos_token_id
tokenizer.eos_token_id
tokenizer.all_special_ids
tokenizer.special_tokens_map
tokenizer.additional_special_tokens
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name"
        )

    if data_args.additional_tokens_file:
        with open(data_args.additional_tokens_file, "r") as infile:
            additional_tokens = [l.strip() for l in infile]
        # tokenizer.add_tokens(additional_tokens)
        tokenizer = BertTokenizer(data_args.additional_tokens_file, do_basic_tokenize=False)

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len  # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
vectorize = lambda x: vectorize_with_bert(" ".join(preprocess_string(x)),
                                          model, tokenizer, "sum", 1)
get_similarity = lambda s1, s2: 1 - cosine(vectorize(s1), vectorize(s2))
df[model_name] = df.apply(
    lambda x: get_similarity(x["concept_1"], x["concept_2"]), axis=1)

y_true = list(df["class"].values)
y_prob = list(df[model_name].values)
precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
f_beta = lambda pr, re, beta: [(1 + beta**2) * p * r / ((beta**2) * p + r)
                               for p, r in zip(pr, re)]
f_1_scores = f_beta(precision, recall, beta=1)
f_1_max = np.nanmax(f_1_scores)
rows.append((model_name, epoch_i, loss.item(), f_1_max))

# Write the validation results to these files.
df.to_csv(output_path_for_results, index=False)
header = ["model", "epoch", "training_loss", "f1_max"]
pd.DataFrame(rows, columns=header).to_csv(output_path_for_results_summary,
                                          index=False)

output_dir = "../models/bert_small/model_save_{}/".format(
    datetime.datetime.now().strftime('%m_%d_%Y_h%Hm%Ms%S'))
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
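# A small follow-up sketch: precision_recall_curve returns one more
# precision/recall point than thresholds, so to recover the decision threshold
# that attains f_1_max, take the argmax over the scores of the first
# len(thresholds) points. Variable names reuse the ones computed above.
best_idx = int(np.nanargmax(f_1_scores[:len(thresholds)]))
best_threshold = thresholds[best_idx]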
    acc = ((y_pred_label == label.view(-1)).sum()).item()
    epoch_loss += loss.item()
    epoch_acc += acc
    return epoch_loss / len(iterator), epoch_acc / len(
        iterator.dataset.dataset)


for i in range(epochs):
    train_loss, train_acc = train(model, sentiment_train_loader, optimizer,
                                  criterion, device)
    valid_loss, valid_acc = evaluate(model, sentiment_valid_loader, criterion,
                                     device)
    print("\n")
    print("train loss: ", train_loss, "\t", "train acc:", train_acc)
    print("valid loss: ", valid_loss, "\t", "valid acc:", valid_acc, end="\n\n")

# Save the model and the tokenizer
import os

saved_model = "./saved_model"
saved_tokenizer = "./saved_tokenizer"
os.makedirs(saved_model, exist_ok=True)
os.makedirs(saved_tokenizer, exist_ok=True)
model.save_pretrained(saved_model)
tokenizer.save_pretrained(saved_tokenizer)
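# A minimal reload sketch, assuming the directories written above. The exact
# model class is not shown in this snippet, so BertForSequenceClassification
# is an assumption based on the sentiment-classification context.
from transformers import BertForSequenceClassification, BertTokenizer

model = BertForSequenceClassification.from_pretrained("./saved_model")
tokenizer = BertTokenizer.from_pretrained("./saved_tokenizer")
model.eval()  # inference mode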
def train(args, train_dataset, model: BertForMlmWithClassification,
          tokenizer: BertTokenizer) -> Tuple[int, float]:
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(data: List[torch.Tensor]):
        sentences, labels = list(zip(*data))
        if tokenizer._pad_token is None:
            return pad_sequence(sentences, batch_first=True)
        return (
            pad_sequence(sentences,
                         batch_first=True,
                         padding_value=tokenizer.pad_token_id),
            torch.tensor(labels),
        )

    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = (
            args.max_steps //
            (len(train_dataloader) // args.gradient_accumulation_steps) + 1)
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    # Take care of distributed/parallel training
    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch, class_labels = batch

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, mask_labels = (mask_tokens(batch, tokenizer, args)
                                   if args.mlm else (batch, batch))
            inputs = inputs.to(args.device)
            mask_labels = mask_labels.to(args.device) if args.mlm else None
            class_labels = class_labels.to(args.device)
            model.train()
            outputs = model(input_ids=inputs,
                            masked_lm_labels=mask_labels,
                            class_labels=class_labels)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if (args.local_rank == -1
                            and args.evaluate_during_training
                        ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def main():
    # GPU profile
    n_gpu, device = gpu_profile(args)
    # Set random seed
    seed_set(args.seed, n_gpu)

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir, exist_ok=True)
    feature_dir = '../data/rank/' + args.model_type  # + '_large'
    if 'addBad' in args.model_info:
        feature_dir = feature_dir + '_large'
    if not os.path.exists(feature_dir):
        os.makedirs(feature_dir, exist_ok=True)

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(args.model_name_or_path)
    vocab_file = os.path.join(args.model_name_or_path, 'vocab.txt')
    logger.info("loading the vocab file from {}".format(vocab_file))
    tokenizer = BertTokenizer(vocab_file=vocab_file,
                              do_lower_case=args.do_lower_case)
    # tokenizer = tokenizer_class.from_pretrained(
    #     vocab_file,
    #     do_lower_case=args.do_lower_case
    # )
    model = model_class.from_pretrained(args.model_name_or_path, config=config)
    if args.our_pretrain_model != '':
        model = load_state_dict(model, args.our_pretrain_model)

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # bert_config = BertConfig.from_json_file(args.bert_config_file)
    # Needed below for both training and prediction
    data_processor = Data_processor(tokenizer, args.policies_file,
                                    args.max_seq_length, args.max_query_length)

    if args.do_train:
        train_writer = prepare_summary_writer(args, mode='train')
        # Get train features
        train_examples_file = "../data/rank/train_examples.pkl"
        train_features_file = os.path.join(
            feature_dir,
            'train_features_{0}_{1}.pkl'.format(str(args.max_seq_length),
                                                str(args.doc_stride)))
        train_examples = data_processor.get_train_examples(
            args.train_file, train_examples_file)
        valid_examples = None
        valid_features = None
        valid_dataloader = None
        if args.do_valid:
            valid_writer = prepare_summary_writer(args, mode='valid')
            valid_examples = train_examples[4000:]
            train_examples = train_examples[:4000]
        else:
            valid_writer = None
        logger.info("train examples {}".format(len(train_examples)))
        if valid_examples is not None:
            logger.info("valid examples {}".format(len(valid_examples)))
        train_features = data_processor.get_train_features(
            train_examples, train_features_file, args.doc_stride)
        train_dataloader = data_processor.prepare_train_dataloader(
            train_features, args.train_batch_size, args.local_rank)
        num_train_steps = int(
            len(train_features) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        logger.info("***** Running training *****")
        logger.info("  Num train_features = %d", len(train_features))
        if args.do_valid:
            valid_features_file = os.path.join(
                feature_dir,
                'valid_features_{0}_{1}.pkl'.format(str(args.max_seq_length),
                                                    str(args.doc_stride)))
            valid_features = data_processor.get_valid_features(
                valid_examples, valid_features_file, args.doc_stride)
            logger.info("  Num valid_features = %d", len(valid_features))
            valid_dataloader = data_processor.prepare_train_dataloader(
                valid_features, args.train_batch_size, args.local_rank)
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        del train_examples
        del train_features

        # Prepare model
        # trained_model_file = os.path.join(args.output_dir,
        #                                   "pytorch_model_bert_{}.bin".format(args.max_seq_length))
        # model = prepare_model(args, bert_config, device, n_gpu, trained_model_file)

        # Train the model
        model = train(args,
                      model,
                      train_dataloader,
                      device,
                      num_train_steps,
                      valid_examples=valid_examples,
                      valid_features=valid_features,
                      valid_dataloader=valid_dataloader,
                      n_gpu=n_gpu,
                      train_writer=train_writer,
                      valid_writer=valid_writer,
                      tokenizer=tokenizer)
        model_to_save = model.module if hasattr(model, "module") else model
        # model_save_path is assumed to be defined elsewhere (e.g. under args.output_dir)
        model_to_save.save_pretrained(model_save_path)
        tokenizer.save_pretrained(model_save_path)
        save_config_file()

    if args.do_predict:
        del model
        # trained_model_file = os.path.join(args.output_dir, args.model_type + '_' + args.model_info, "2020-03-22@11_57_57")
        # trained_model_file = os.path.join(args.output_dir + '/' + args.model_type)
        trained_model_file = args.predict_model_path
        model = model_class.from_pretrained(trained_model_file, config=config)

        # Prepare predict dataloader
        # pred_features_file = os.path.join(feature_dir,
        #                                   'pred_features_{0}_{1}.pkl'.format(str(args.max_seq_length), str(args.doc_stride)))
        all_results = {}
        predict_file_name = args.predict_file.split('/')[-1].split('.')[0]
        cache_dir = os.path.join('/'.join(args.output_dir.split('/')[:-1]),
                                 'predict_cache_{}'.format(predict_file_name))
        if args.do_bm25:
            cache_dir = cache_dir + '_do_bm25' + '_{}'.format(args.topRate)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        # Build the cache (xc test)
        # for pred_dataloader, pred_features, pred_examples in data_processor.prepare_pred_dataloader(
        #         args.predict_file, pred_features_file, args.predict_batch_size, args.doc_stride, cache_dir=cache_dir):
        #     pass
        # print('build cache successfully')
        if args.do_bm25:
            data_processor = Data_processor_bm25(tokenizer,
                                                 args.policies_file,
                                                 args.max_seq_length,
                                                 args.max_query_length)
        for pred_dataloader, pred_features, pred_examples in data_processor.prepare_pred_dataloader(
                args.predict_file,
                args.predict_batch_size,
                args.doc_stride,
                cache_dir=cache_dir,
                topRate=args.topRate):
            # Predict
            results = predict(args, model, pred_dataloader, device, n_gpu)
            output_prediction_file = os.path.join(
                '/'.join(args.output_dir.split('/')[:-1]), "predictions.json")
            # output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
            # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
            results = write_predictions(
                args,
                pred_examples,
                pred_features,
                results,
                n_best_size=10,
                output_prediction_file=output_prediction_file,
                tokenizer=tokenizer)
            all_results.update(results)

        submit_file_name = '_'.join([
            args.model_type, args.model_info, predict_file_name,
            trained_model_file.split('/')[-1]
        ])
        if args.do_bm25:
            submit_file_name = submit_file_name + '_bm25'
        # Generate the submission file
        submit_file = os.path.join('/'.join(args.output_dir.split('/')[:-1]),
                                   submit_file_name + '.csv')
        gen_submit_csv(all_results, submit_file)