def collate_fn(self) -> Callable:
    if self.cfg.wwm:
        return DataCollatorForWholeWordMask(
            self.tokenizer, mlm_probability=self.cfg.mlm_probability)
    else:
        return DataCollatorForLanguageModeling(
            self.tokenizer, mlm_probability=self.cfg.mlm_probability)
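# Hedged usage sketch (not from the original module): one way the collator returned by
# collate_fn() could be wired into a PyTorch DataLoader. `datamodule` stands in for an
# instance of the class above and is hypothetical; the tiny dataset is built inline so
# the example is self-contained.
from torch.utils.data import DataLoader

texts = ["whole word masking", "masks every wordpiece of a word"]
encodings = datamodule.tokenizer(texts, truncation=True)
dataset = [{"input_ids": ids} for ids in encodings["input_ids"]]

loader = DataLoader(dataset, batch_size=2, collate_fn=datamodule.collate_fn())
batch = next(iter(loader))
# batch["input_ids"]: padded, randomly masked ids
# batch["labels"]:    -100 everywhere except at masked positions, which keep the original ids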
def __init__(
    self,
    data: pd.DataFrame,
    batch_size: int,
    tokenizer: BertTokenizer,
    binarizer: MultiLabelBinarizer,
):
    """
    Trainset generator for the genre data.

    Args:
        data: Dataset to be used for training
        batch_size: Training batch size
        tokenizer: BertWordPieceTokenizer that should be used to tokenize the
            concatenated genre list.
        binarizer: MultiLabelBinarizer to binarize the actual genre labels
    """
    self._iterator_i = 0
    self.batch_size = batch_size
    self.data = data.reset_index(drop=True)
    self.n_samples = len(self.data)
    self.tokenizer = tokenizer
    self.label_binarizer = binarizer
    self.collator = DataCollatorForWholeWordMask(self.tokenizer, mlm_probability=0.5)
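# Hedged sketch (not part of the original class): one way such a generator could emit a
# masked training batch. The "genres" column name and the `get_batch` helper are
# hypothetical illustrations of how the stored tokenizer, collator, and binarizer fit together.
def get_batch(self):
    rows = self.data.iloc[self._iterator_i:self._iterator_i + self.batch_size]
    self._iterator_i += self.batch_size
    # Concatenate each sample's genre list into a single string and tokenize it.
    encodings = self.tokenizer(
        [" ".join(genres) for genres in rows["genres"]],
        truncation=True,
    )
    features = [{"input_ids": ids} for ids in encodings["input_ids"]]
    # The whole-word-mask collator pads the batch and selects ~50% of the words
    # (per the mlm_probability above) for MLM-style masking.
    masked = self.collator(features)
    labels = self.label_binarizer.transform(rows["genres"])
    return masked, labels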
def test_data_collator_for_whole_word_mask(self):
    features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]

    tokenizer = BertTokenizer(self.vocab_file)
    data_collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="np")
    batch = data_collator(features)

    self.assertEqual(batch["input_ids"].shape, (2, 10))
    self.assertEqual(batch["labels"].shape, (2, 10))
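# Hedged illustration (not part of the test suite): what the collator's output looks like
# with a real pretrained tokenizer. "bert-base-uncased" is used purely as an example checkpoint.
from transformers import BertTokenizerFast, DataCollatorForWholeWordMask

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
collator = DataCollatorForWholeWordMask(tok, mlm_probability=0.15)
enc = tok(["whole word masking masks every wordpiece of a chosen word"])
batch = collator([{"input_ids": ids} for ids in enc["input_ids"]])
# batch["input_ids"]: token ids with some whole words masked MLM-style
# batch["labels"]:    -100 everywhere except the masked positions,
#                     which hold the original token ids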
def __init__(self,
             pretrained_model_name=None,
             tokenizer_cls=AutoTokenizer,
             config=None,
             tokenizer=None,
             mlm=True,
             masking_func=None,
             whole_word_masking=False,
             mlm_probability=0.15):
    if tokenizer is None:
        tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name, config=config)
    if masking_func is None:
        masking_func = (DataCollatorForWholeWordMask(tokenizer, mlm, mlm_probability)
                        if whole_word_masking else
                        DataCollatorForLanguageModeling(tokenizer, mlm, mlm_probability))
    self.masking_func = masking_func
    self.batch_processor = compose(untuple, masking_func, to_tuple)
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    data_files = {}
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
    extension = data_args.train_file.split(".")[-1]
    if extension == "txt":
        extension = "text"
    datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if data_args.pad_to_max_length else False

    def tokenize_function(examples):
        # Remove empty lines
        examples["text"] = [
            line for line in examples["text"]
            if len(line) > 0 and not line.isspace()
        ]
        return tokenizer(examples["text"],
                         padding=padding,
                         truncation=True,
                         max_length=data_args.max_seq_length)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=[text_column_name],
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Add the Chinese references if provided
    if data_args.train_ref_file is not None:
        tokenized_datasets["train"] = add_chinese_references(
            tokenized_datasets["train"], data_args.train_ref_file)
    if data_args.validation_ref_file is not None:
        tokenized_datasets["validation"] = add_chinese_references(
            tokenized_datasets["validation"], data_args.validation_ref_file)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm_wwm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = (get_dataset(data_args,
                                 tokenizer=tokenizer,
                                 cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)

    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    else:
        if data_args.mlm and data_args.whole_word_mask:
            data_collator = DataCollatorForWholeWordMask(
                tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
        else:
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=data_args.mlm,
                mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    special_tokens_dict = {
        'bos_token': '<BOS>',
        'eos_token': '<EOS>',
        'pad_token': '<PAD>'
    }
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
def __len__(self):
    return len(self.sentences)


train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
dev_dataset = TokenizedSentencesDataset(
    dev_sentences, tokenizer, max_length,
    cache_tokenization=True) if len(dev_sentences) > 0 else None

##### Training arguments

if do_whole_word_mask:
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer,
                                                 mlm=True,
                                                 mlm_probability=mlm_prob)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=mlm_prob)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps" if dev_dataset is not None else "no",
    per_device_train_batch_size=per_device_train_batch_size,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name.")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    total_param_num = 0
    for name, param in model.named_parameters():
        if param.requires_grad:
            param_num = param.numel()
            total_param_num += param_num
            # print(name, "\t", param_num)
    print("total parameters num", total_param_num)
    print("--" * 10)

    freeze_layers = "0, 1, 2, 3, 4"
    if freeze_layers != "":
        layer_indexes = [int(x) for x in freeze_layers.split(", ")]
        total_freeze_param_num = 0
        for param in model.distilbert.embeddings.parameters():
            param.requires_grad = False
            param_num = param.numel()
            total_freeze_param_num += param_num
        for layer_idx in layer_indexes:
            for name, param in list(model.distilbert.transformer.layer[layer_idx].named_parameters()):
                param.requires_grad = False
                param_num = param.numel()
                total_freeze_param_num += param_num
                # print(name, "\t", param_num)
            print("froze layer", layer_idx)
        print("total freeze parameters num", total_freeze_param_num)
        print("left parameters num for training", total_param_num - total_freeze_param_num)
    # exit()

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if data_args.pad_to_max_length else False

    def tokenize_function(examples):
        # Remove empty lines
        examples["text"] = [
            line for line in examples["text"]
            if len(line) > 0 and not line.isspace()
        ]
        return tokenizer(examples["text"],
                         padding=padding,
                         truncation=True,
                         max_length=data_args.max_seq_length)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=[text_column_name],
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Add the Chinese references if provided
    if data_args.train_ref_file is not None:
        tokenized_datasets["train"] = add_chinese_references(
            tokenized_datasets["train"], data_args.train_ref_file)
    if data_args.validation_ref_file is not None:
        tokenized_datasets["validation"] = add_chinese_references(
            tokenized_datasets["validation"], data_args.validation_ref_file)
    # If we have ref files, we need to keep them from being removed by the Trainer
    has_ref = data_args.train_ref_file or data_args.validation_ref_file
    if has_ref:
        training_args.remove_unused_columns = False

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm_wwm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in sorted(results.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, EmbeddingsArguments, AdapterArguments))
    model_args, data_args, training_args, embedding_args, adapter_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, do_lower_case=model_args.do_lower_case
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, do_lower_case=model_args.do_lower_case
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name"
        )

    if model_args.model_name_or_path:
        model = AutoModelWithLMHead.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelWithLMHead.from_config(config)

    if embedding_args.new_embeddings:
        logger.info(
            "Initializing new embeddings, copying special tokens, position embeddings, and token type embeddings"
        )
        tokenizer_old = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, do_lower_case=model_args.do_lower_case
        )
        model.config.vocab_size = tokenizer.vocab_size
        model.base_model.set_embeddings_type("full")
        model.base_model.overwrite_embeddings(args=embedding_args, a_tokenizer=tokenizer_old, b_tokenizer=tokenizer)

    model.resize_token_embeddings(len(tokenizer))

    # Setup adapters
    if adapter_args.train_adapter:
        language = adapter_args.language
        if not language:
            raise ValueError("--language flag must be set when training an adapter")
        # check if language adapter already exists, otherwise add it
        if language not in model.config.adapters.adapter_list(AdapterType.text_lang):
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config, non_linearity="gelu", reduction_factor=2, invertible_adapter=None
            )
            model.add_adapter(language, AdapterType.text_lang, config=adapter_config)
        # Freeze all model weights except of those of this adapter & use this adapter in every forward pass
        adapter_names = [[language]]
        model.train_adapter([language])
    elif embedding_args.freeze_base_model:
        logger.info("Freezing base model parameters")
        for parameters in model.base_model.parameters():
            parameters.requires_grad = False
        adapter_names = None
    else:
        adapter_names = None

    if embedding_args.new_embeddings:
        logger.info("Unfreezing embedding parameters")
        for parameters in model.base_model.embeddings.parameters():
            parameters.requires_grad = True

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets
    train_dataset = get_dataset(data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

    if data_args.mlm and data_args.whole_word_mask:
        data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability,
        )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
        do_save_full_model=True,
        do_save_adapters=adapter_args.train_adapter,
        adapter_names=adapter_names,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
idx = PretrainedTransformerMismatchedIndexer("./bert_out", namespace="tokens")

def prepare_instance(s):
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})

instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"),
             prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for i in instances:
    i["tokens"].index(vocab)
tensors = [i.as_tensor_dict() for i in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)
ids.shape

wwm = collator._whole_word_mask([[vocab.get_token_from_index(i.item()) for i in wp_ids] for wp_ids in ids])

# Build one whole-word mask per row so the mask tensor matches the shape of `ids`
wwms = []
for i in range(ids.shape[0]):
    tokens = [vocab.get_token_from_index(i.item()) for i in ids[i]]
    wwm = torch.tensor(collator._whole_word_mask(tokens)).unsqueeze(0)
    wwms.append(wwm)
wwms = torch.cat(wwms, dim=0)

wwm = torch.tensor(wwm).unsqueeze(0)
wwm

# Use the stacked per-row masks so the shapes of inputs and mask labels agree
masked_ids, labels = collator.mask_tokens(ids, wwms)
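# Hedged illustration (separate from the snippet above): what _whole_word_mask returns for a
# plain BERT wordpiece tokenizer. "bert-base-uncased" is only an example checkpoint, and the
# exact wordpiece split depends on its vocabulary.
from transformers import BertTokenizer, DataCollatorForWholeWordMask

tok = BertTokenizer.from_pretrained("bert-base-uncased")
c = DataCollatorForWholeWordMask(tok, mlm_probability=0.15)
pieces = tok.tokenize("tokenization matters")   # wordpieces, e.g. ['token', '##ization', 'matters']
mask = c._whole_word_mask(pieces)
# `mask` is a 0/1 list aligned with `pieces`: whenever a word is selected for prediction,
# its leading piece and all of its '##' continuation pieces are flagged together.
print(list(zip(pieces, mask)))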
class BertBackbone(Backbone):
    def __init__(
        self,
        vocab: Vocabulary,
        embedding_dim: int,
        feedforward_dim: int,
        num_layers: int,
        num_attention_heads: int,
        position_embedding_dim: int,
        tokenizer_path: str,
        position_embedding_type: str = "absolute",
        activation: str = "gelu",
        hidden_dropout: float = 0.1,
    ) -> None:
        super().__init__()
        # TODO:
        # - Need to apply corrections in pretrained_transformer_mismatched_embedder
        tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
        vocab.add_transformer_vocab(tokenizer, "tokens")
        # "tokens" is padded by default--undo that
        del vocab._token_to_index["tokens"]["@@PADDING@@"]
        del vocab._token_to_index["tokens"]["@@UNKNOWN@@"]
        assert len(vocab._token_to_index["tokens"]) == len(vocab._index_to_token["tokens"])

        cfg = BertConfig(
            vocab_size=vocab.get_vocab_size("tokens"),
            hidden_size=embedding_dim,
            num_hidden_layers=num_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=feedforward_dim,
            hidden_act=activation,
            hidden_dropout_prob=hidden_dropout,
            max_position_embeddings=position_embedding_dim,
            position_embedding_type=position_embedding_type,
            use_cache=True,
        )
        self.cfg = cfg
        self._vocab = vocab
        self._namespace = "tokens"
        self.bert = BertModel(cfg)
        self.masking_collator = DataCollatorForWholeWordMask(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15
        )

    def _embed(self, text: TextFieldTensors) -> Dict[str, torch.Tensor]:
        """
        This implementation is borrowed from `PretrainedTransformerMismatchedEmbedder` and uses
        average pooling to yield a de-wordpieced embedding for each original token.

        Returns both wordpiece embeddings+mask as well as original token embeddings+mask
        """
        output = self.bert(
            input_ids=text['tokens']['token_ids'],
            attention_mask=text["tokens"]["wordpiece_mask"],
            token_type_ids=text['tokens']['type_ids'],
        )
        wordpiece_embeddings = output.last_hidden_state
        offsets = text['tokens']['offsets']

        # Assemble wordpiece embeddings into embeddings for each word using average pooling
        span_embeddings, span_mask = util.batched_span_select(wordpiece_embeddings.contiguous(), offsets)  # type: ignore
        span_mask = span_mask.unsqueeze(-1)
        # Shape: (batch_size, num_orig_tokens, max_span_length, embedding_size)
        span_embeddings *= span_mask  # zero out paddings

        # return the average of embeddings of all sub-tokens of a word
        # Sum over embeddings of all sub-tokens of a word
        # Shape: (batch_size, num_orig_tokens, embedding_size)
        span_embeddings_sum = span_embeddings.sum(2)
        # Shape (batch_size, num_orig_tokens)
        span_embeddings_len = span_mask.sum(2)
        # Find the average of sub-tokens embeddings by dividing `span_embedding_sum` by `span_embedding_len`
        # Shape: (batch_size, num_orig_tokens, embedding_size)
        orig_embeddings = span_embeddings_sum / torch.clamp_min(span_embeddings_len, 1)

        # All the places where the span length is zero, write in zeros.
        orig_embeddings[(span_embeddings_len == 0).expand(orig_embeddings.shape)] = 0

        return {
            "wordpiece_mask": text['tokens']['wordpiece_mask'],
            "wordpiece_embeddings": wordpiece_embeddings,
            "orig_mask": text['tokens']['mask'],
            "orig_embeddings": orig_embeddings
        }

    def forward(self, text: TextFieldTensors) -> Dict[str, torch.Tensor]:  # type: ignore
        bert_output = self._embed(text)
        outputs = {
            "encoded_text": bert_output['orig_embeddings'],
            "encoded_text_mask": bert_output['orig_mask'],
            "wordpiece_encoded_text": bert_output['wordpiece_embeddings'],
            "wordpiece_encoded_text_mask": bert_output['wordpiece_mask'],
            "token_ids": util.get_token_ids_from_text_field_tensors(text),
        }
        self._extend_with_masked_text(outputs, text)
        return outputs

    def _extend_with_masked_text(self, outputs: Dict[str, Any], text: TextFieldTensors) -> None:
        input_ids = text['tokens']['token_ids']
        # get the binary mask that'll tell us which parts to mask--this is random and dynamically done
        wwms = []
        for i in range(input_ids.shape[0]):
            tokens = [self._vocab.get_token_from_index(i.item()) for i in input_ids[i]]
            wwm = torch.tensor(self.masking_collator._whole_word_mask(tokens)).unsqueeze(0)
            wwms.append(wwm)
        wwms = torch.cat(wwms, dim=0)

        masked_ids, labels = self.masking_collator.mask_tokens(input_ids.to('cpu'), wwms.to('cpu'))
        masked_ids = masked_ids.to(input_ids.device)
        labels = labels.to(input_ids.device)

        bert_output = self.bert(
            input_ids=masked_ids,
            attention_mask=text["tokens"]["wordpiece_mask"],
            token_type_ids=text['tokens']['type_ids'],
        )
        outputs["encoded_masked_text"] = bert_output.last_hidden_state
        outputs["masked_text_labels"] = labels

    @overrides
    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        tokens = []
        for instance_tokens in output_dict["token_ids"]:
            tokens.append(
                [
                    self._vocab.get_token_from_index(token_id.item(), namespace=self._namespace)
                    for token_id in instance_tokens
                ]
            )
        output_dict["tokens"] = tokens

        del output_dict["token_ids"]
        del output_dict["encoded_text"]
        del output_dict["encoded_text_mask"]
        del output_dict["encoded_masked_text"]

        return output_dict
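# Hedged sketch (not part of the original backbone): one way a downstream head could turn the
# masked outputs into an MLM loss. The linear `mlm_head`, `backbone`, and `text` names are
# hypothetical; only positions whose label is not -100 contribute to the loss.
import torch.nn as nn
import torch.nn.functional as F

mlm_head = nn.Linear(backbone.cfg.hidden_size, backbone.cfg.vocab_size)

outputs = backbone(text)                              # `text`: a TextFieldTensors batch
logits = mlm_head(outputs["encoded_masked_text"])     # (batch, num_wordpieces, vocab)
loss = F.cross_entropy(
    logits.view(-1, logits.size(-1)),
    outputs["masked_text_labels"].view(-1),
    ignore_index=-100,                                 # skip unmasked positions
)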
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the logger verbosity to info (on the main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing the model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text', or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    data_files = {}
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
    extension = data_args.train_file.split(".")[-1]
    if extension == "txt":
        extension = "text"
    # datasets = load_dataset(path=extension, data_files=data_files)
    datasets = load_dataset(path="data/text.py", data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download the model and vocab.

    # Load the model config
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # Load the tokenizer
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    # Load the model, or initialize it from scratch
    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training a new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Resize the embedding layer to match the vocabulary; if the vocab size is unchanged the
    # embeddings stay the same: [old_vocab_size, 768] ---> [new_vocab_size, 768]
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        # e.g. column_names: ['text']
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    # e.g. text_column_name: text
    text_column_name = "text" if "text" in column_names else column_names[0]

    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    def tokenize_function(examples):
        """
        Tokenize a batch of examples (here, two).

        Args:
            examples: {'text': ['古龙洗发水,洗完头发不干燥、也不容易油、不痒,味道持久,非常柔顺,而且泡泡很容易冲洗干净泡沫非常细腻,洗后头发很滑很顺,洗了之后就头发很蓬松,很香,而且我洗了是没有头皮屑的', '老用户了,一直在用满婷,感觉对控痘控油效果挺好的']}

        Returns:
            A dict for the batch with dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
        """
        # Remove empty lines
        examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
        newexample = tokenizer(
            examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length
        )
        return newexample

    # Tokenize the datasets
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=[text_column_name],
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Add the Chinese whole-word references if provided: training-set references
    if data_args.train_ref_file is not None:
        tokenized_datasets["train"] = add_chinese_references(tokenized_datasets["train"], data_args.train_ref_file)
    # Validation-set whole-word references
    if data_args.validation_ref_file is not None:
        tokenized_datasets["validation"] = add_chinese_references(
            tokenized_datasets["validation"], data_args.validation_ref_file
        )

    # Data collator, applied to batches before they reach the model.
    # It randomly masks whole-word tokens; see the collator implementation for the exact whole-word masking scheme.
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        logger.info("*** Start training ***")
        # model_path points at the directory holding the optimizer/scheduler state (optimizer.pt and
        # scheduler.pt); if those files are missing, the optimizer and scheduler are initialized from scratch.
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        # Save the trained model to the output directory
        trainer.save_model(output_dir=training_args.output_dir)  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Start evaluation ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm_wwm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results