def main():
    """Train and/or evaluate a language model with the ``Trainer`` API.

    Command-line arguments are parsed into ``ModelArguments``,
    ``DataTrainingArguments`` and ``TrainingArguments``. The config, tokenizer
    and model are then loaded (or the model is created from a fresh config),
    the datasets and a data collator are built, and training / evaluation run
    depending on ``--do_train`` / ``--do_eval``.

    Returns:
        dict: ``{"perplexity": <float>}`` when ``--do_eval`` is set, otherwise
        an empty dict.

    Raises:
        ValueError: if ``--do_eval`` is given without ``--eval_data_file``; if
            the output directory is non-empty while training without
            ``--overwrite_output_dir``; if neither a tokenizer name nor a model
            path is supplied; or if a BERT-like model type is used without
            ``--mlm``.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging: only ranks -1 (non-distributed) and 0 log at INFO level.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        # The trailing space keeps the two concatenated literals from running
        # together ("save it,and load it").
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it, "
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Make the embedding matrix match the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        # The trailing space keeps the message from reading "using the--mlm flag".
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
            "--mlm flag (masked language modeling).")

    if data_args.block_size <= 0:
        # Our input block size will be the max possible for the model
        data_args.block_size = tokenizer.model_max_length
    else:
        data_args.block_size = min(data_args.block_size,
                                   tokenizer.model_max_length)

    # Get datasets
    train_dataset = (get_dataset(
        data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir)
                     if training_args.do_train else None)
    eval_dataset = (get_dataset(data_args,
                                tokenizer=tokenizer,
                                evaluate=True,
                                cache_dir=model_args.cache_dir)
                    if training_args.do_eval else None)

    # Pick the collator: permutation LM for XLNet, whole-word masking when
    # requested together with --mlm, plain (masked) LM otherwise.
    if config.model_type == "xlnet":
        data_collator = DataCollatorForPermutationLanguageModeling(
            tokenizer=tokenizer,
            plm_probability=data_args.plm_probability,
            max_span_length=data_args.max_span_length,
        )
    elif data_args.mlm and data_args.whole_word_mask:
        data_collator = DataCollatorForWholeWordMask(
            tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
    else:
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer,
            mlm=data_args.mlm,
            mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset)

    # Training
    if training_args.do_train:
        # Only a local directory is forwarded as model_path; any other value
        # (hub identifier, None) results in model_path=None.
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info(" %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
# Evaluation pass: sum the model loss over every chunk of both sequences of
# every batch, then append the total to the log file.
# NOTE(review): `net`, `data_loader`, `device`, `split_seq` and `tqdm` are not
# defined in the visible code -- presumably this part belongs to an evaluation
# function whose header is outside this view; confirm against the full file.
ds = data_loader.dataset
total_loss = 0  # renamed from `sum` so the builtin is no longer shadowed
for seq1, seq2, mask1, mask2 in tqdm(data_loader):
    input_ids1, attention_mask1 = split_seq(seq1.to(device)), split_seq(mask1.to(device))
    input_ids2, attention_mask2 = split_seq(seq2.to(device)), split_seq(mask2.to(device))
    with torch.no_grad():
        # Distinct loop names: the original reused `i` for the batch index and
        # for both chunk loops, and rebound the builtin `input`.
        for j, chunk_ids in enumerate(input_ids1):
            masked_ids, label = ds.DataCollatorForLanguageModeling(chunk_ids)
            output = net(input_ids=masked_ids, attention_mask=attention_mask1[j], labels=label)
            total_loss += output.loss.cpu().numpy()
        for j, chunk_ids in enumerate(input_ids2):
            masked_ids, label = ds.DataCollatorForLanguageModeling(chunk_ids)
            output = net(input_ids=masked_ids, attention_mask=attention_mask2[j], labels=label)
            total_loss += output.loss.cpu().numpy()

# The context manager closes the log file even if printing fails.
with open("loge5.txt", "a") as fi:
    print("eval loss ", total_loss, file=fi)

# Build a BERT-base-cased shaped masked-LM from scratch, with the vocabulary
# size taken from the dataset's MAXTOKEN plus 10.
config = AutoConfig.from_pretrained('bert-base-cased')
config.vocab_size = total_dataset.MAXTOKEN + 10
model = AutoModelForMaskedLM.from_config(config).to(device)
optim = AdamW(model.parameters(), lr=1e-5)

# One evaluation before any training, then 50 rounds of train / eval / save.
test(model, test_loader)
for i in range(50):
    print("start our training")
    train(model, train_loader, optim, i)
    print("start eval")
    test(model, test_loader)
    savemodel(model, i)
def test_trainer_iterable_dataset(self):
    # Simulate Language Modeling with an IterableDataset, with no __len__ method
    # Pick-up a tiny model, so it works on CPU
    # See Issue #5990: https://github.com/huggingface/transformers/issues/5990
    #
    # Checks, in order: training with max_steps works and uses an infinite
    # constant sampler; then four configurations that must raise ValueError.
    MODEL_ID = "sshleifer/tiny-distilbert-base-cased"
    model = AutoModelForMaskedLM.from_pretrained(MODEL_ID)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    train_dataset = SampleIterableDataset(file_path=PATH_SAMPLE_TEXT,
                                          tokenizer=tokenizer)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    # Built once; the original constructed the identical arguments twice.
    training_args = TrainingArguments(output_dir="./examples",
                                      no_cuda=True,
                                      max_steps=2)
    trainer = Trainer(model=model,
                      args=training_args,
                      train_dataset=train_dataset,
                      data_collator=data_collator)
    trainer.train()

    loader = trainer.get_train_dataloader()
    self.assertIsInstance(loader, torch.utils.data.DataLoader)
    self.assertIsInstance(
        loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler)

    # Exception if giving iterable dataset and no max_steps
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        _ = Trainer(model=model,
                    args=training_args,
                    train_dataset=train_dataset,
                    data_collator=data_collator)

    # Exception if eval_dataset is iterable in __init__
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True,
                                          max_steps=2)
        _ = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=train_dataset,
            data_collator=data_collator,
        )

    # Exception if predicting with iterable dataset
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator)
        trainer.predict(train_dataset)

    # Exception if evaluating with iterable dataset
    with self.assertRaises(ValueError):
        training_args = TrainingArguments(output_dir="./examples",
                                          no_cuda=True)
        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator)
        trainer.evaluate(train_dataset)
def main():
    """Train and/or evaluate a masked LM with whole-word masking via ``Trainer``.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument or from regular command-line flags. The datasets are
    loaded from the hub or from local files, tokenized, optionally augmented
    with Chinese reference files, and passed to a ``Trainer`` that uses
    ``DataCollatorForWholeWordMask``.

    Returns:
        dict: ``{"perplexity": <float>}`` when ``--do_eval`` is set, otherwise
        an empty dict.

    Raises:
        ValueError: if the output directory is non-empty while training without
            ``--overwrite_output_dir``; if no dataset name and no data file is
            given; or if neither a tokenizer name nor a model path is supplied.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        # The trailing space separates the two sentences of the message.
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary (the ", " joins the two halves,
    # which previously ran together as "n_gpu: 1distributed training: ...").
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            # No validation split: carve the first N% of train out as
            # validation and keep the remainder as train.
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Derive the loader type from whichever file was supplied; the original
        # always read train_file and crashed when only a validation file was given.
        reference_file = (data_args.train_file
                          if data_args.train_file is not None else
                          data_args.validation_file)
        if reference_file is None:
            raise ValueError(
                "Need either a dataset name or a training/validation file.")
        extension = reference_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Make the embedding matrix match the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    padding = "max_length" if data_args.pad_to_max_length else False

    def tokenize_function(examples):
        # Remove empty lines. Uses the resolved text column rather than a
        # hard-coded "text" key so datasets whose text lives in another
        # (first) column are handled too.
        examples[text_column_name] = [
            line for line in examples[text_column_name]
            if len(line) > 0 and not line.isspace()
        ]
        return tokenizer(examples[text_column_name],
                         padding=padding,
                         truncation=True,
                         max_length=data_args.max_seq_length)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=[text_column_name],
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Add the chinese references if provided
    if data_args.train_ref_file is not None:
        tokenized_datasets["train"] = add_chinese_references(
            tokenized_datasets["train"], data_args.train_ref_file)
    if data_args.validation_ref_file is not None:
        tokenized_datasets["validation"] = add_chinese_references(
            tokenized_datasets["validation"], data_args.validation_ref_file)

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForWholeWordMask(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Only a local directory is forwarded as model_path; any other value
        # (hub identifier, None) results in model_path=None.
        model_path = (model_args.model_name_or_path if
                      (model_args.model_name_or_path is not None
                       and os.path.isdir(model_args.model_name_or_path)) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_mlm_wwm.txt")
        # Only the world-zero process writes the results file.
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main():
    """Train a masked language model with a hand-written loop on ``Accelerator``.

    Parses arguments with ``parse_args()``, loads and tokenizes the datasets
    (either line by line or concatenated and split into ``max_seq_length``
    chunks), builds dataloaders, an AdamW optimizer with two weight-decay
    groups and a learning-rate scheduler, then trains for
    ``args.num_train_epochs`` epochs, logging evaluation perplexity after each
    epoch. The model is saved to ``args.output_dir`` when that is set.

    Raises:
        ValueError: if neither ``--tokenizer_name`` nor ``--model_name_or_path``
            is supplied.
    """
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            # No validation split: carve the first N% of train out as
            # validation and keep the remainder as train.
            raw_datasets["validation"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[:{args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                args.dataset_name,
                args.dataset_config_name,
                split=f"train[{args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # Make the embedding matrix match the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    if args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines. Uses the resolved text column rather than a
            # hard-coded "text" key so datasets whose text lives in another
            # (first) column are handled too.
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Imported here so the helper stays self-contained.
            from itertools import chain

            # Concatenate all texts. chain.from_iterable flattens in linear
            # time, whereas sum(list_of_lists, []) copies the growing list on
            # every addition.
            concatenated_examples = {
                k: list(chain.from_iterable(examples[k]))
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            load_from_cache_file=not args.overwrite_cache,
        )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # Log a few random samples from the training set (at most three, so a
    # training set with fewer than three rows does not make random.sample raise):
    for index in random.sample(range(len(train_dataset)),
                               min(3, len(train_dataset))):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=args.mlm_probability)

    # DataLoaders creation:
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            # Scale so the accumulated gradient averages over the micro-batches.
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # NOTE(review): `step % gradient_accumulation_steps == 0` is true at
            # step 0, so the first optimizer update happens after a single
            # micro-batch -- confirm this is intended.
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Repeat the batch loss once per example so the gathered tensor
            # can be truncated to the dataset length below.
            losses.append(
                accelerator.gather(
                    loss.repeat(args.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            # A very large loss overflows exp(); report infinity instead of
            # aborting the run after the epoch has been trained.
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
def analyze():
    """Score today's saved tweets with a sentiment pipeline and persist the result.

    Reads the raw tweets CSV, keeps the rows whose ``date`` is later than
    today's midnight and whose ``user_name`` is not ``'BiciMAD'``, cleans the
    tweet text, runs the default ``transformers`` sentiment-analysis pipeline
    on it, derives ``score``, ``label``, ``label_coded`` and ``score_coded``
    columns, merges the result with the previously saved sentiment CSV
    (keeping the last row per ``id``) and overwrites that CSV.

    Returns:
        None. Progress messages are printed to stdout.
    """
    data = pd.read_csv(
        '/Users/Blanca/ironhack/gitrepo/ih_datamadpt0420_final_project/data/raw/rawdata.csv'
    )
    print('...lets read the tweets saved...')

    # change date type from 'object' to 'date'
    data['date'] = pd.to_datetime(data['date'])

    # getting today's Timestamp
    today = pd.Timestamp.today().floor('D')  # .normalize() does the same thing
    data = data[(data['date'] > today)]

    # select required columns
    data = data.drop(columns=['Unnamed: 0'])

    # data analysis => sorting
    data = data.sort_values('user_name', ascending=False)
    data = data[data.user_name != 'BiciMAD']
    data = data.reset_index()
    data = data.drop(columns=['index'])

    # Raw string: the original non-raw literal relied on invalid escape
    # sequences (\w, \/, \S). Matches @mentions, any character that is not
    # alphanumeric/space/tab, and URLs.
    noise_pattern = re.compile(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    def clean_tweet(tweet):
        # Replace every match with a space, then collapse whitespace runs.
        return ' '.join(noise_pattern.sub(" ", tweet).split())

    print('...and clean them a bit...')
    # Updated the tweets_clean
    data['tweets_clean'] = data['text'].apply(clean_tweet)
    print('tweets cleaned!...')

    from transformers import pipeline
    classifier = pipeline('sentiment-analysis')

    from transformers import AutoTokenizer, AutoModelForMaskedLM
    # NOTE(review): this tokenizer and model are loaded but never used below;
    # the classifier above is the pipeline's default model. Kept as-is --
    # confirm whether they were meant to be passed to `pipeline`.
    tokenizer = AutoTokenizer.from_pretrained(
        "dccuchile/bert-base-spanish-wwm-cased")
    model = AutoModelForMaskedLM.from_pretrained(
        "dccuchile/bert-base-spanish-wwm-cased")
    print('...sentiment analysis model from transformers there...')

    # Apply the classifier to all tweets
    data['sentiment'] = data['tweets_clean'].apply(classifier)
    print('TODAYs tweets with sentiment analysis done!...')

    # Each classifier result is indexed as result[0]['score'] / result[0]['label'].
    data["score"] = [result[0]['score'] for result in data["sentiment"]]
    data["label"] = [result[0]['label'] for result in data["sentiment"]]

    # Signed score: positive tweets keep their score, all others are negated.
    data['label_coded'] = data['label'].apply(lambda x: 1
                                              if x == 'POSITIVE' else -1)
    data['score_coded'] = data['label_coded'] * data['score']

    df_old = pd.read_csv(
        '/Users/Blanca/ironhack/gitrepo/ih_datamadpt0420_final_project/data/results/data_sentiment.csv'
    )
    # Both frames are cast to str so the outer merge compares like with like.
    df_old = df_old.astype(str)
    df_str = data.astype(str)
    df = pd.merge(df_old, df_str, how='outer')
    # Drop rows whose date cell is the literal text 'date'.
    df = df[df.date != 'date']
    # Non-inplace form; the original mutated a filtered frame in place. The
    # original also called reset_index() and sort_values(...).head(10) and
    # discarded the results, which had no effect and are omitted.
    df = df.drop_duplicates(subset=['id'], keep='last')

    # save to csv - overwrite the existing results file with the merged frame
    df.to_csv(
        '/Users/Blanca/ironhack/gitrepo/ih_datamadpt0420_final_project/data/results/data_sentiment.csv',
        header=True)
    print('TODAYs tweets with sentiment label and score saved!...')
import torch from transformers import LineByLineTextDataset from transformers import Trainer, TrainingArguments from transformers import DataCollatorForLanguageModeling from transformers import AutoTokenizer from transformers import AutoModelForMaskedLM, AutoModelForPreTraining from pathlib import Path import os import torch print(torch.cuda.is_available()) tokenizer = AutoTokenizer.from_pretrained( "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext") model = AutoModelForMaskedLM.from_pretrained( 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext') model.train() from transformers import AutoModelForMaskedLM, LineByLineTextDataset dataset = LineByLineTextDataset( tokenizer=tokenizer, file_path="../results_file_clean.txt", block_size=128, ) from transformers import DataCollatorForLanguageModeling data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True,
# Optional perplexity computation for two data sets.
# NOTE(review): GET_PERPLEXITY, data_path, demo, input_file_1/2,
# pretrained_model, debiasing_head, args, start and get_perplexity_list are
# defined outside the visible code; the nesting below is reconstructed from
# whitespace-collapsed text -- confirm against the full file.
if GET_PERPLEXITY == 'yes':
    logging.info('Calculating perplexity')
    # Both CSVs are read from <data_path><demo>/<file name>.
    race_df = pd.read_csv(data_path + demo + '/' + input_file_1)
    race_df_2 = pd.read_csv(data_path + demo + '/' + input_file_2)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
    if debiasing_head:
        logging.info('Loading debiased model..')
        model = AutoModelWithLMAndDebiasHead.from_pretrained(
            pretrained_model, debiasing_head=debiasing_head)
    else:
        # Choose the model class from the model name.
        # NOTE(review): the 'bert' check inspects args.model_path while the
        # 'gpt' check inspects pretrained_model -- confirm both refer to the
        # same model.
        if 'bert' in args.model_path.__repr__().lower():
            logging.info('in bert')
            model = AutoModelForMaskedLM.from_pretrained(pretrained_model)
        elif 'gpt' in pretrained_model.__repr__().lower():
            logging.info('in gpt')
            model = AutoModelForCausalLM.from_pretrained(pretrained_model)
        else:
            # Same model class as the 'gpt' branch; only the log line differs.
            logging.info('in CLM model by default')
            model = AutoModelForCausalLM.from_pretrained(pretrained_model)
    race_1_perplexity = get_perplexity_list(race_df, model, tokenizer)
    # Elapsed time since `start`, in minutes.
    logging.info('Done with demo1 perplexity in {} on set'.format(
        (time.time() - start) / 60))
    race_2_perplexity = get_perplexity_list(race_df_2, model, tokenizer)
    logging.info('Done with demo2 perplexity in {} on set'.format(
        (time.time() - start) / 60))
    race_df['perplexity'] = race_1_perplexity
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model,
        clustering_loss_weight, embedding_extractor, annealing_alphas,
        dataset, val_dataset, result_dir, early_stopping,
        early_stopping_tol, device, random_state):
    """Train a clustering model on top of a masked LM and save the results.

    Reads the train/validation CSV files (columns ``texts`` and ``labels``),
    initialises the clustering model from ``base_model``, trains it, evaluates
    it on the validation split and writes the metrics, the training history
    and the model into ``result_dir``.

    Args:
        n_epochs: Number of training epochs.
        lr: Learning rate for the RMSprop optimizer.
        train_batch_size: Batch size of the training data loader.
        val_batch_size: Batch size of the validation data loader.
        base_model: Name or path passed to ``from_pretrained``.
        clustering_loss_weight: Forwarded to ``train``.
        embedding_extractor: Forwarded to ``init_model``.
        annealing_alphas: Forwarded to ``train``.
        dataset: Path of the training CSV file.
        val_dataset: Path of the validation CSV file.
        result_dir: Directory receiving the CSV, history and model files.
        early_stopping: Forwarded to ``train``.
        early_stopping_tol: Forwarded to ``train``.
        device: Device the language model is moved to.
        random_state: Seed for numpy and torch.
    """
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    train_df = pd.read_csv(dataset)
    train_texts = train_df['texts'].to_numpy()
    train_labels = train_df['labels'].to_numpy()
    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=train_batch_size,
                                   shuffle=False)

    val_df = pd.read_csv(val_dataset)
    val_texts = val_df['texts'].to_numpy()
    val_labels = val_df['labels'].to_numpy()
    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    # return_dict / output_hidden_states are model options; they have no
    # meaning for a tokenizer and are therefore not passed here.
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    lm_model.to(device)

    # init clustering model (one cluster per distinct training label)
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=train_data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(train_labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # 2e-5, 5e-7,
        eps=1e-8)

    total_steps = len(train_data_loader) * n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        # warm up over the first half epoch
        num_warmup_steps=int(len(train_data_loader) * 0.5),
        num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=train_data_loader,
                 eval_data_loader=val_data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # do eval
    predicted_labels, true_labels = evaluate(model=model,
                                             eval_data_loader=val_data_loader,
                                             verbose=True)
    best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
    run_results = {
        'best_matching': best_matching,
        'accuracy': accuracy,
        'ari': adjusted_rand_score(true_labels, predicted_labels),
        'nmi': normalized_mutual_info_score(true_labels, predicted_labels),
        # use purity to compare with microsoft paper
        'purity': purity_score(y_true=true_labels, y_pred=predicted_labels),
    }

    os.makedirs(result_dir, exist_ok=True)

    # save evaluation results
    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir, '20_newsgroups-distilbert.csv'),
                     index=False)

    # save train hist & model
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)
    torch.save(model, os.path.join(result_dir, 'model.bin'))
import torch
import sys
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
import numpy as np
from tqdm import tqdm

# Extend the sentence given on the command line by 2-4 tokens, each one
# predicted by a fine-tuned multilingual DistilBERT masked language model.
model = AutoModelForMaskedLM.from_pretrained("distilbert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
# Overwrite the pretrained weights with the fine-tuned checkpoint.
model.load_state_dict(torch.load('distilbert_chat', map_location=device))

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    # Tell the pipeline where the model lives; otherwise it may keep the
    # inputs on CPU while the model sits on the GPU.
    device=0 if device.type == 'cuda' else -1,
)

sentence = ' '.join(sys.argv[1:])
r = np.random.randint(low=2, high=5)  # number of tokens to append: 2, 3 or 4
for _ in range(r):
    # Append the tokenizer's own mask token and keep the top prediction.
    t = fill_mask(sentence + tokenizer.mask_token)
    sentence += ' ' + t[0]['token_str']

print(sentence)
def main():
    """Train and/or evaluate a masked language model with the HF Trainer.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments``
    and ``TrainingArguments`` either from the command line or, when the only
    argument is a path ending in ``.json``, from that JSON file.

    Returns:
        dict: Evaluation results. Contains ``"perplexity"`` when
        ``--do_eval`` is set, otherwise it is empty.

    Raises:
        ValueError: If the output directory exists, is not empty and
            ``--overwrite_output_dir`` was not given, or if neither a
            tokenizer name nor a model name/path is supplied.
        Exception: If ``dataset_name`` is given but ``data/<name>.py`` does
            not exist locally.
    """
    # See all possible arguments in src/transformers/training_args.py, or
    # pass the --help flag to this script. The three argument sets are kept
    # apart for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # A single argument that is a path to a json file: parse it to get
        # the arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"输出目录({training_args.output_dir}) 以及存在,并且不为空"
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on every process.
    logger.warning(
        f"使用的 rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu} "
        + f"是否分布式训练: {bool(training_args.local_rank != -1)}, 16-bits 半精度训练: {training_args.fp16}"
    )
    # Raise the Transformers logger verbosity to info on the main process only.
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("训练/评估参数 %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: either a local loading script named after
    # --dataset_name, or your own CSV/JSON/TXT training and evaluation files.
    #
    # For CSV/JSON files, this script will use the column called 'text' or
    # the first column.
    #
    # In distributed training, load_dataset guarantees that only one local
    # process can concurrently download the dataset.
    if data_args.dataset_name is not None:
        # The loading script must already be cached locally under data/.
        cache_script = os.path.join("data", data_args.dataset_name + ".py")
        if not os.path.exists(cache_script):
            raise Exception("请检查本地是否存在相关脚本文件")
        datasets = load_dataset(path=cache_script, name=data_args.dataset_config_name, data_dir=data_args.data_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # Derive the loader from the train file, falling back to the
        # validation file so that evaluation-only runs do not crash.
        reference_file = (
            data_args.train_file if data_args.train_file is not None else data_args.validation_file
        )
        extension = reference_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("你正从头开始初始化一个新的config.")

    # Tokenizer: the local "myroberta" export is loaded with BertTokenizer.
    if model_args.tokenizer_name:
        if model_args.tokenizer_name == "myroberta":
            tokenizer = BertTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
        else:
            tokenizer = AutoTokenizer.from_pretrained(
                model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
            )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "您正在从头实例化一个新的tokenizer。 此脚本不支持此功能。 "
            "您可以用其它形式训练好之后,在这里使用,使用方法: using --tokenizer_name."
        )

    # Model: the local "myroberta" export is loaded with RobertaForMaskedLM.
    if model_args.model_name_or_path:
        if model_args.model_name_or_path == 'myroberta':
            model = RobertaForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
        else:
            model = AutoModelForMaskedLM.from_pretrained(
                model_args.model_name_or_path,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
    else:
        logger.info("从头开始训练一个模型")
        model = AutoModelForMaskedLM.from_config(config)

    # Resize the embeddings to the tokenizer size; required when training a
    # new model from scratch.
    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # Line-by-line mode: tokenize each nonempty line on its own.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            """Drop empty lines and tokenize the remaining ones."""
            print(f"收到的数据长度: {[len(t) for t in examples[text_column_name]]}")
            # Remove empty / whitespace-only lines.
            examples[text_column_name] = [
                line for line in examples[text_column_name] if len(line) > 0 and not line.isspace()
            ]
            tokenizer_res = tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling
                # (see below) is more efficient when it receives the
                # `special_tokens_mask`.
                return_special_tokens_mask=True,
            )
            print(f"tokenizer之后的数据长度: {[len(t) for t in tokenizer_res['input_ids']]}")
            return tokenizer_res

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together
        # before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because
        # DataCollatorForLanguageModeling (see below) is more efficient when
        # it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            """Tokenize a batch of raw texts."""
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        # batched=True processes 1,000 rows at a time by default.
        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        if data_args.max_seq_length is None:
            max_seq_length = tokenizer.model_max_length
        else:
            if data_args.max_seq_length > tokenizer.model_max_length:
                logger.warning(
                    f"参数给定的 max_seq_length ({data_args.max_seq_length}) 比模型的 ({tokenizer.model_max_length}) 最大长度长. 使用模型的最大长度 max_seq_length={tokenizer.model_max_length}."
                )
            max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

        # Main data processing function that will concatenate all texts from
        # our dataset and generate chunks of max_seq_length.
        def group_texts(examples):
            """Concatenate a batch and re-split it into fixed-size chunks."""
            # Concatenate all texts.
            concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder; padding could be added instead if
            # the model supported it. Customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts
        # together, so group_texts throws away a remainder for each of those
        # groups of 1,000 texts. You can adjust that batch_size here but a
        # higher value might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the
        # documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    # This one takes care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Pass the model directory to the trainer only when it is a local
        # directory.
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** 开始评估 ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
def main(train_function):
    """Build a from-scratch BERT masked-LM Trainer and launch it.

    All settings (datasets, training arguments, tokenizer) are defined inline
    below; only ``--local_rank`` is read from the command line.

    Args:
        train_function: Which launcher to use: ``"huggingface"``,
            ``"ray_single_node"`` or ``"ray_multiple_nodes"``. Any other
            value builds the trainer but launches nothing.
    """
    # ----- Parse local_rank for torch.distributed.launch -----------
    cli_parser = argparse.ArgumentParser()
    cli_parser.add_argument("--local_rank", type=int)
    parsed_rank = cli_parser.parse_args().local_rank
    local_rank = 0 if parsed_rank is None else parsed_rank

    # ----- Setup logging -----------
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    if is_main_process(local_rank):
        logger.setLevel(logging.INFO)
    else:
        logger.setLevel(logging.WARN)

    # Transformers' own logger is verbose on the main process only.
    if is_main_process(local_rank):
        hf_logging = transformers.utils.logging
        hf_logging.set_verbosity_info()
        hf_logging.enable_default_handler()
        hf_logging.enable_explicit_format()

    # ----- Configurable Params -----------

    # One dict of load_dataset() arguments per dataset to load; see the Hub
    # at https://huggingface.co/datasets. Sizes are those of the generated
    # dataset and can grow by an order of magnitude after tokenization.
    # Datasets can only be concatenated when their features align.
    datasets_args = [
        dict(path="wikitext", name="wikitext-2-raw-v1"),  # 12.91 MB
        # dict(path="wikitext", name="wikitext-103-raw-v1"),  # 524 MB
        # dict(path="ptb_text_only"),  # 5.7 MB
        # dict(path="bookcorpus"),  # 4.63 GB
        # dict(path="wikipedia"),  # 35.38 GB
    ]

    # Training params
    # note: in V100 bs=8 uses 11/16 of available gpu mem, bs=12 uses 15/16
    output_dir = os.path.expanduser("~/nta/results/bert")
    training_args = TrainingArguments(
        # Logging
        output_dir=output_dir,
        logging_first_step=True,
        logging_steps=10,  # also define eval_steps
        eval_steps=10,
        max_steps=30,  # num_train_epochs replaced by steps
        disable_tqdm=True,
        run_name="debug_run",  # used for wandb, not for Ray
        # hyperparams
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=1e-4,
        lr_scheduler_type="linear",
        warmup_steps=500,
        weight_decay=1e-6,
    )
    # "Evaluate" here means perplexity of the trained model on the
    # validation set, not finetuning on downstream tasks such as GLUE.

    seed = random.randint(0, 1000000)

    # Changing the tokenizer will result in re-tokenizing the dataset.
    # As a reference, BERT tokenization will take ~ 3 hours for a 5GB dataset
    config_class = BertConfig
    tokenizer_name = "bert-base-cased"

    # ----- Seed -----------
    set_seed(seed)
    print(f"Seed to reproduce: {seed}")

    # ----- Dataset -----------

    # Load every configured dataset and concatenate per split. Only 'train'
    # and 'validation' are used here; 'test' could be added the same way.
    # Without a split, load_dataset returns a DatasetDict with all splits.
    train_datasets = [
        load_dataset(**ds_args, split="train") for ds_args in datasets_args
    ]
    val_datasets = [
        load_dataset(**ds_args, split="validation") for ds_args in datasets_args
    ]
    dataset = DatasetDict()
    dataset["train"] = concatenate_datasets(train_datasets)
    dataset["validation"] = concatenate_datasets(val_datasets)

    def load_and_split_dataset(dataset_args, split_percentage=5):
        """Alternative: carve a validation set out of the train split."""
        split_dataset = DatasetDict()
        split_dataset["train"] = load_dataset(
            **dataset_args, split=f"train[{split_percentage}%:]"
        )
        split_dataset["validation"] = load_dataset(
            **dataset_args, split=f"train[:{split_percentage}%]"
        )
        return split_dataset

    # ----- Load Model -----------

    # Fresh model from a default config (no pretrained weights).
    config = config_class()
    model = AutoModelForMaskedLM.from_config(config)

    # use_fast falls back to tokenizer lib implementation under the hood
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True)
    model.resize_token_embeddings(len(tokenizer))

    # ----- Preprocess dataset -----------

    # Language modeling only needs the text column. Its name may differ
    # between datasets, in which case columns must be renamed before
    # concatenating.
    column_names = dataset["train"].column_names
    if "text" in column_names:
        text_column_name = "text"
    else:
        text_column_name = column_names[0]

    # overwrite_cache=True ignores the cached results and re-tokenizes the
    # dataset. Leave it False when using a shared cache repository.
    overwrite_cache = False
    preprocessing_num_workers = None

    # Tokenize every text; texts are concatenated and split into smaller
    # parts afterwards. `return_special_tokens_mask=True` is requested since
    # DataCollatorForLanguageModeling is more efficient when it receives the
    # `special_tokens_mask`.
    def tokenize_function(examples):
        return tokenizer(
            examples[text_column_name], return_special_tokens_mask=True
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=column_names,
        num_proc=preprocessing_num_workers,
        load_from_cache_file=not overwrite_cache,
    )

    # Chunk length used when regrouping the tokenized texts.
    max_seq_length = tokenizer.model_max_length

    def group_texts(examples):
        """Concatenate a batch and cut it into max_seq_length chunks."""
        merged = {key: sum(examples[key], []) for key in examples.keys()}
        first_key = list(examples.keys())[0]
        usable_length = len(merged[first_key])
        # The small remainder is dropped; padding could be added instead if
        # the model supported it.
        usable_length = (usable_length // max_seq_length) * max_seq_length
        chunk_starts = range(0, usable_length, max_seq_length)
        return {
            key: [
                tokens[start : start + max_seq_length]
                for start in chunk_starts
            ]
            for key, tokens in merged.items()
        }

    # With `batched=True` this map processes 1,000 texts together, so
    # group_texts throws away a remainder for each of those groups. The
    # batch_size can be adjusted, but a higher value is slower to preprocess.
    tokenized_dataset = tokenized_dataset.map(
        group_texts,
        batched=True,
        num_proc=preprocessing_num_workers,
        load_from_cache_file=not overwrite_cache,
    )

    # Data collator: takes care of randomly masking the tokens.
    # Q: what about dynamic masking, used in Roberta?
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=0.15
    )

    # ----- Setup Trainer -----------

    # Initialize Trainer. Similar to Vernon's Experiment class.
    # dataloader and training loop are contained in Trainer abstraction
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # ----- Functions to train and evaluate -----------
    if train_function == "huggingface":
        # Tested
        run_hf(trainer, logger, output_dir, save_model=True, evaluate=True)

    elif train_function == "ray_single_node":
        # Tested
        run_ray_single_instance(
            trainer,
            logger,
            name="bert_test",
            config=None,
            num_samples=1,
            local_dir=os.path.expanduser("~/nta/results/experiments/transformers"),
            keep_checkpoints_num=1,
            resources_per_trial={"cpu": 8},
            # note: checkpoint arguments cannot be used with a checkpointable function
        )

    elif train_function == "ray_multiple_nodes":
        # Untested
        run_ray_distributed(
            trainer,
            logger,
            name="bert_test",
            config=None,
            num_samples=1,
            local_dir=os.path.expanduser("~/nta/results/experiments/transformers"),
            keep_checkpoints_num=1,
            queue_trials=True,
            verbose=2,
            resources_per_trial={"gpu": 4},
        )
def main():
    """
    Collect XLM-R representations from corpus.

    Reads the target word forms, counts their occurrences in the corpus,
    runs the masked language model over every context returned by
    ``ContextsDataset`` and stores, per lemma, one usage vector per
    occurrence (hidden states averaged over layers and over the sub-tokens
    of the occurrence) in a compressed ``.npz`` file at ``--output_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name_or_path',
        type=str,
        required=True,
        help='path to model directory or model name (e.g., xlm-roberta-base)')
    parser.add_argument(
        '--targets_path',
        type=str,
        required=True,
        help=
        'Path to file with target words (one word per line — possibly with tab-separated change score — '
        'or a list of comma-separated word forms.')
    parser.add_argument('--output_path',
                        type=str,
                        required=True,
                        help='Output path for extracted embeddings.')
    parser.add_argument(
        '--corpus_path',
        type=str,
        required=True,
        help='Path to corpus or corpus directory (iterates through files).')
    parser.add_argument('--context_window',
                        type=int,
                        default=512,
                        help="The length of a token's entire context window")
    parser.add_argument(
        '--batch_size',
        type=int,
        default=64,
        help='The number of sentences processed at once by the LM.')
    # parser.add_argument(
    #     '--n_layers', type=int, default=12,
    #     help='The number of layers of the Transformer model.'
    # )
    parser.add_argument(
        '--n_dims',
        type=int,
        default=768,
        help=
        'The dimensionality of a Transformer layer (hence the dimensionality of the output embeddings).'
    )
    parser.add_argument('--local_rank',
                        type=int,
                        default=-1,
                        help='For distributed training (default: -1).')
    args = parser.parse_args()

    # Setup logging. basicConfig only takes effect the first time it is
    # called, so it is called exactly once, with the intended level/format.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logging.info(__file__.upper())
    start_time = time.time()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        n_gpu = 1

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, n_gpu, bool(args.local_rank != -1))

    # Set seeds across modules
    set_seed(42, n_gpu)

    # Load targets: lemma -> list of word forms. A comma-separated line
    # lists the forms of one lemma (the first form is the lemma); otherwise
    # the first tab-separated field is both lemma and form.
    targets = defaultdict(list)
    with open(args.targets_path, 'r', encoding='utf-8') as f_in:
        for line in f_in:
            line = line.strip()
            if not line:
                # skip blank lines instead of registering an empty lemma
                continue
            forms = line.split(',')
            if len(forms) > 1:
                for form in forms:
                    if form not in targets[forms[0]]:
                        targets[forms[0]].append(form)
            else:
                line = line.split('\t')
                targets[line[0]].append(line[0])

    n_target_forms = sum(len(vals) for vals in targets.values())
    logger.warning(f"Target lemmas: {len(targets)}.")
    logger.warning(f"Target word forms: {n_target_forms}.")

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab
        torch.distributed.barrier()

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path)  #, never_split=targets)
    model = AutoModelForMaskedLM.from_pretrained(args.model_name_or_path,
                                                 output_hidden_states=True)

    logger.warning(f"Tokenizer's added tokens:\n{tokenizer.get_added_vocab()}")

    if args.local_rank == 0:
        # Release the other processes waiting at the barrier above
        torch.distributed.barrier()

    model.to(device)

    # Store vocabulary indices of target words
    targets_ids = defaultdict(dict)
    for lemma, forms in targets.items():
        for form in forms:
            targets_ids[lemma][form] = tokenizer.encode(
                form, add_special_tokens=False)

    assert n_target_forms == sum(len(vals) for vals in targets_ids.values())

    ids2lemma = {}  # maps all forms' token ids to their corresponding lemma
    # maps every lemma to a list of token ids corresponding to all word forms
    lemma2ids = defaultdict(list)
    len_longest_tokenized = 0

    for lemma, forms2ids in targets_ids.items():
        for form, form_id in forms2ids.items():

            # remove '▁' from the beginning of subtoken sequences
            # NOTE(review): 6 is presumably the id of '▁' in the XLM-R
            # vocabulary — confirm before using another tokenizer.
            if len(form_id) > 1 and form_id[0] == 6:
                form_id = form_id[1:]

            if len(form_id) == 0:
                logger.warning(
                    'Empty string? Lemma: {}\tForm:"{}"\tTokenized: "{}"'.
                    format(lemma, form, tokenizer.tokenize(form)))
                continue

            if len(form_id) == 1 and form_id[0] == tokenizer.unk_token_id:
                logger.warning('Tokenizer returns UNK for this word form. '
                               'Lemma: {}\tForm: {}\tTokenized: {}'.format(
                                   lemma, form, tokenizer.tokenize(form)))
                continue

            if len(form_id) > 1:
                logger.warning('Word form split into subtokens. '
                               'Lemma: {}\tForm: {}\tTokenized: {}'.format(
                                   lemma, form, tokenizer.tokenize(form)))

            form_key = tuple(form_id)
            ids2lemma[form_key] = lemma
            lemma2ids[lemma].append(form_key)
            len_longest_tokenized = max(len_longest_tokenized, len(form_key))

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Get sentence iterator
    sentences = PathLineSentences(args.corpus_path)

    # First pass: count the usages of every lemma so that the usage matrices
    # can be allocated up front. Each sentence is scanned from its end,
    # trying the longest candidate token span first.
    nSentences = 0
    target_counter = {target: 0 for target in lemma2ids}
    for sentence in sentences:
        nSentences += 1
        sentence_token_ids = tokenizer.encode(' '.join(sentence),
                                              add_special_tokens=False)

        while sentence_token_ids:
            candidate_ids_found = False
            for length in range(len_longest_tokenized, 0, -1):
                candidate_ids = tuple(sentence_token_ids[-length:])
                if candidate_ids in ids2lemma:
                    target_counter[ids2lemma[candidate_ids]] += 1
                    sentence_token_ids = sentence_token_ids[:-length]
                    candidate_ids_found = True
                    break
            if not candidate_ids_found:
                sentence_token_ids = sentence_token_ids[:-1]

    logger.warning('Total usages: %d' % (sum(target_counter.values())))

    for lemma in target_counter:
        logger.warning(f'{lemma}: {target_counter[lemma]}')

    # Container for usages: one (n_usages, n_dims) matrix per lemma
    usages = {
        target: np.empty((target_count, args.n_dims))  # usage matrix
        for (target, target_count) in target_counter.items()
    }

    # Iterate over sentences and collect representations
    nUsages = 0
    curr_idx = {target: 0 for target in target_counter}

    def collate(batch):
        """Stack the model inputs; keep lemmas and positions as lists."""
        return [{
            'input_ids':
            torch.cat([item[0]['input_ids'] for item in batch], dim=0),
            'attention_mask':
            torch.cat([item[0]['attention_mask'] for item in batch], dim=0)
        }, [item[1] for item in batch], [item[2] for item in batch]]

    dataset = ContextsDataset(ids2lemma, sentences, args.context_window,
                              tokenizer, len_longest_tokenized, nSentences)
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
                            sampler=sampler,
                            batch_size=args.batch_size,
                            collate_fn=collate)
    iterator = tqdm(dataloader,
                    desc="Iteration",
                    disable=args.local_rank not in [-1, 0])

    model.eval()
    for batch in iterator:
        batch_inputs, batch_lemmas, batch_spos = batch

        # Move the model inputs to the device the model was placed on.
        batch_inputs = {
            name: tensor.to(device)
            for name, tensor in batch_inputs.items()
        }

        with torch.no_grad():
            outputs = model(**batch_inputs)

        hidden_states = [
            layer.detach().cpu().numpy() for layer in outputs.hidden_states
        ]

        # store one usage vector per occurrence in its lemma's matrix
        for b_id, lemma in enumerate(batch_lemmas):
            span_start = batch_spos[b_id][0]
            span_end = batch_spos[b_id][1]
            layers = [
                layer[b_id, span_start:span_end, :]
                for layer in hidden_states
            ]
            # average over layers, then over sub-tokens if there are several
            usage_vector = np.mean(layers, axis=0)
            if usage_vector.shape[0] > 1:
                usage_vector = np.mean(usage_vector, axis=0)
            usages[lemma][curr_idx[lemma], :] = usage_vector

            curr_idx[lemma] += 1
            nUsages += 1

    iterator.close()
    np.savez_compressed(args.output_path, **usages)

    logger.warning('Total embeddings: %d' % (nUsages))
    logger.warning("--- %s seconds ---" % (time.time() - start_time))
def __init__(
    self,
    vocab_path="",
    model_name="bert-base-cased",
    max_edit_dist=10,
    debug=False,
    performance=False,
):
    """Build the spell checker: vocabulary, masked LM and spaCy extensions.

    Args:
        vocab_path (str, optional): Vocabulary file path to be used by the
            model. When empty, or when the file cannot be processed, the
            bundled ``data/vocab.txt`` is used. Defaults to "".
        model_name (str, optional): Pretrained BERT model name.
            Defaults to "bert-base-cased".
        max_edit_dist (int, optional): Maximum edit distance between two
            words. Defaults to 10.
        debug (bool, optional): Print logs as the data flows through the
            class. With a custom vocab, the final vocabulary is also
            written to ``tests/debugFile.txt``. Defaults to False.
        performance (bool, optional): Print the time taken by individual
            steps in spell check. Defaults to False.

    Raises:
        TypeError: If ``vocab_path`` is not a str, or ``debug`` /
            ``performance`` is not a bool.
    """
    if not (isinstance(vocab_path, str) and isinstance(debug, bool)
            and isinstance(performance, bool)):
        raise TypeError(
            "Please check datatype provided. vocab_path should be str,"
            " debug and performance should be bool")

    if vocab_path != "":
        try:
            # First open() for user specified word addition to vocab
            with open(vocab_path, encoding="utf8") as f:
                # if want to remove '[unusedXX]' from vocab
                # words = [
                #     line.rstrip()
                #     for line in f
                #     if not line.startswith("[unused")
                # ]
                words = [line.strip() for line in f]

            # The below code adds the necessary words like numbers
            # /punctuations/tokenizer specific words like [PAD]/[
            # unused0]/##M
            current_path = os.path.dirname(__file__)
            vocab_path = os.path.join(current_path, "data", "vocab.txt")
            extra_token = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            words.extend(extra_token)

            with open(vocab_path, encoding="utf8") as f:
                for line in f:
                    extra_token = line.strip()
                    # keep [unusedXX] entries, ##word-pieces and single
                    # characters from the bundled vocab
                    if (extra_token.startswith(("[unused", "##"))
                            or len(extra_token) == 1):
                        words.append(extra_token)
            if debug:
                debug_file_path = os.path.join(current_path, "tests",
                                               "debugFile.txt")
                # The vocab is read as UTF-8, so write it back as UTF-8
                # instead of relying on the platform default encoding.
                with open(debug_file_path, "w+",
                          encoding="utf8") as new_file:
                    new_file.write("\n".join(words))
                print("Final vocab at " + debug_file_path)

        except Exception as e:
            # Deliberately best-effort: any problem with the custom vocab
            # falls back to the bundled one.
            print(e)
            warnings.warn("Using default vocab")
            vocab_path = ""
            words = []

    if vocab_path == "":
        current_path = os.path.dirname(__file__)
        vocab_path = os.path.join(current_path, "data/vocab.txt")
        with open(vocab_path, encoding="utf8") as f:
            # if want to remove '[unusedXX]' from vocab
            # words = [
            #     line.rstrip()
            #     for line in f
            #     if not line.startswith("[unused")
            # ]
            words = [line.strip() for line in f]

    self.max_edit_dist = max_edit_dist
    self.model_name = model_name
    self.vocab = Vocab(strings=words)
    # Silence transformers' info/warning output while loading the model.
    logging.getLogger("transformers").setLevel(logging.ERROR)
    self.BertTokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.BertModel = AutoModelForMaskedLM.from_pretrained(self.model_name)
    self.BertModel.eval()
    self.BertModel = self.BertModel.to(device)
    self.mask = self.BertTokenizer.mask_token
    self.debug = debug
    self.performance = performance
    # Register the custom extensions only once.
    if not Doc.has_extension("contextual_spellCheck"):
        Doc.set_extension("contextual_spellCheck", default=True)
        Doc.set_extension("performed_spellCheck", default=False)

        Doc.set_extension("suggestions_spellCheck", default={})
        Doc.set_extension("outcome_spellCheck", default="")
        Doc.set_extension("score_spellCheck", default=None)

        Span.set_extension("get_has_spellCheck",
                           getter=self.span_require_spell_check)
        Span.set_extension("score_spellCheck",
                           getter=self.span_score_spell_check)

        Token.set_extension("get_require_spellCheck",
                            getter=self.token_require_spell_check)
        Token.set_extension(
            "get_suggestion_spellCheck",
            getter=self.token_suggestion_spell_check,
        )
        Token.set_extension("score_spellCheck",
                            getter=self.token_score_spell_check)
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizer
import os

# Combine the "bert-base-chinese" tokenizer with the
# "clue/roberta_chinese_base" masked-LM weights in one local directory.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
model = AutoModelForMaskedLM.from_pretrained("clue/roberta_chinese_base")

# Model first, tokenizer second, both into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained('myroberta')

# Remove two of the tokenizer files written by save_pretrained above.
for stale_file in ("myroberta/special_tokens_map.json",
                   "myroberta/tokenizer_config.json"):
    os.remove(stale_file)

# Alternative export kept for reference (disabled):
# os.system("mv deberta-base ../")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
# model.save_pretrained('bert_model_uncased')
# tokenizer.save_pretrained('bert_model_uncased')
# obtain model and tokenizer # model_name = "bert-large-uncased-whole-word-masking" model_name = "bert-base-uncased" tokenizer = BertTokenizerFast.from_pretrained(model_name) phrase_tokenizer = PhraseTokenizer() #cwd/"saved_model"/"imdb_bert_base_uncased_finetuned_normal" if ds_name == "imdb": target_model_name = "imdb_bert_base_uncased_finetuned_training" target_model_path = cwd / "data" / "imdb" / "saved_model" / target_model_name elif ds_name == "yelp_polarity": target_model_name = "bert-base-uncased-yelp-polarity" target_model_path = f"textattack/{target_model_name}" target_model = BertForSequenceClassification.from_pretrained( str(target_model_path)).to(device) mlm_model = AutoModelForMaskedLM.from_pretrained(model_name).to(device) # turn models to eval model since only inference is needed target_model.eval() mlm_model.eval() # tokenize the dataset to include words and phrases test_ds = test_ds.map(phrase_tokenizer.tokenize) # create the attacker params = { 'k': 15, 'beam_width': 8, 'conf_thres': 3.0, 'sent_semantic_thres': 0.7, 'change_threshold': 0.4
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Fetch the tokenizer and the masked-LM weights, then write only the model
# to the local export directory (the tokenizer is loaded but not saved).
tokenizer, model = (
    AutoTokenizer.from_pretrained("bert-base-uncased"),
    AutoModelForMaskedLM.from_pretrained("bert-base-uncased"),
)
model.save_pretrained('model/pytorch_bert_base_uncased/')
from transformers import LongformerForMaskedLM,RobertaForMaskedLM,AutoModelForMaskedLM,AutoTokenizer
import copy
import torch

# Enlarge the learned position-embedding table of a pretrained masked-LM so
# that it accepts sequences of up to `max_pos` tokens. The new table is
# filled by repeating the pretrained rows.
max_pos = 4096
attention_window = 512

roberta = AutoModelForMaskedLM.from_pretrained("hfl/chinese-roberta-wwm-ext")
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext", model_max_length=max_pos)

# extend position embedding
config = roberta.config
tokenizer.model_max_length = max_pos
tokenizer.init_kwargs['model_max_length'] = max_pos
current_max_pos, embed_size = roberta.bert.embeddings.position_embeddings.weight.shape
max_pos += 2  # the table gets two rows more than the tokenizer limit
config.max_position_embeddings = max_pos
assert max_pos > current_max_pos

# new_empty() allocates WITHOUT initialising: every row must be written below.
new_pos_embed = roberta.bert.embeddings.position_embeddings.weight.new_empty(max_pos, embed_size)

# The tiling loop starts at row 2, so rows 0 and 1 were previously left as
# uninitialised memory. Carry them over from the pretrained table.
new_pos_embed[:2] = roberta.bert.embeddings.position_embeddings.weight[:2]

# copy position embeddings over and over to initialize the new position embeddings
k = 2
step = current_max_pos - 2
while k < max_pos - 1:
    if k + step >= max_pos:
        # Last, partial tile: copy only as many rows as still fit.
        new_pos_embed[k:] = roberta.bert.embeddings.position_embeddings.weight[2:(max_pos + 2 - k)]
    else:
        new_pos_embed[k:(k + step)] = roberta.bert.embeddings.position_embeddings.weight[2:]
    k += step

roberta.bert.embeddings.position_embeddings.weight.data = new_pos_embed
# Position ids 0..max_pos-1 with shape (1, max_pos); int64 like the list-built
# tensor this replaces.
roberta.bert.embeddings.position_ids.data = torch.arange(max_pos).reshape(1, max_pos)
def load_finetune(epoch):
    """Load the masked-LM checkpoint stored under ``./save_ft_<epoch>/``.

    Args:
        epoch: Epoch number, interpolated into the directory name with ``%d``.

    Returns:
        The result of ``AutoModelForMaskedLM.from_pretrained`` for that
        directory.
    """
    checkpoint_dir = './save_ft_%d/' % epoch
    return AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
def main():
    """Command-line entry point: train and/or evaluate a masked language model.

    Steps, in order: parse the command line, validate argument combinations,
    optionally resume from the latest checkpoint, set up the device and
    (for ``--local_rank != -1`` with CUDA) the NCCL process group, configure
    logging, seed, load config / tokenizer / model, then train, save and
    evaluate as requested by ``--do_train`` / ``--do_eval``.

    Side effects: binds the module-level globals ``args`` and ``tokenizer``.

    Returns:
        dict: Evaluation results; every metric key is suffixed with
        ``_<global_step>`` (the suffix is empty when only one checkpoint is
        evaluated). Empty when evaluation is not run.

    Raises:
        ValueError: For a BERT-like ``--model_type`` without ``--mlm``, for
            ``--do_eval`` without ``--eval_data_file``, for
            ``--should_continue`` without any checkpoint, for a non-empty
            output directory without ``--overwrite_output_dir``, or when no
            config / tokenizer source is given.
    """
    global args, tokenizer

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--train_data_file",
                        default=None,
                        type=str,
                        required=True,
                        help="The input training data file (a text file).")
    parser.add_argument(
        "--logging_dir",
        type=str,
        required=True,
        help="The logs directory.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--model_type",
        type=str,
        required=True,
        help="The model architecture to be trained or fine-tuned.",
    )

    # Other parameters
    parser.add_argument(
        "--eval_data_file",
        default=None,
        type=str,
        help=
        "An optional input evaluation data file to evaluate the perplexity on (a text file).",
    )
    parser.add_argument(
        "--line_by_line",
        action="store_true",
        help=
        "Whether distinct lines of text in the dataset are to be handled as distinct sequences.",
    )
    parser.add_argument(
        "--lazy_loading",
        action="store_true",
        help=
        "Whether to use lazy data loading. Is necessarily line-by-line as well.",
    )
    parser.add_argument(
        "--force_pad_token",
        action="store_true",
        help=
        "Whether to force the addition of a padding token to tokenizer to prevent errors in encoding (e.g. with GPT)",
    )
    parser.add_argument(
        "--should_continue",
        action="store_true",
        help="Whether to continue from latest checkpoint in output_dir")
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        help=
        "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )
    parser.add_argument(
        "--mlm",
        action="store_true",
        help=
        "Train with masked-language modeling loss instead of language modeling."
    )
    parser.add_argument(
        "--mlm_probability",
        type=float,
        default=0.15,
        help="Ratio of tokens to mask for masked language modeling loss")
    parser.add_argument(
        "--config_name",
        default=None,
        type=str,
        help=
        "Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
    )
    parser.add_argument(
        "--tokenizer_name",
        default=None,
        type=str,
        help=
        "Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
    )
    parser.add_argument(
        "--cache_dir",
        default=None,
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--data_cache_dir",
        default=None,
        type=str,
        help=
        "Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
    )
    parser.add_argument(
        "--block_size",
        default=-1,
        type=int,
        help="Optional input sequence length after tokenization."
        "The training dataset will be truncated in block of this size for training."
        "Default to the model max input length for single sentence inputs (take into account special tokens).",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=4,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=1e-4,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-6,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=1.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--num_workers",
        default=0,
        type=int,
        help=
        "multi-process data loading with the specified number of loader worker processes.."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_portion",
                        default=0.1,
                        type=float,
                        help="Linear warmup over total * warmup_portion.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--logging_steps",
                        type=int,
                        default=1000,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--save_total_limit",
        type=int,
        default=None,
        help=
        "Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")

    args = parser.parse_args()
    args.warmup_portion = float(args.warmup_portion)
    # Attribute name (including its spelling) is kept as is: it is part of
    # the `args` namespace handed to the rest of the script.
    args.inital_epoch = 0

    if args.model_type in ["bert", "roberta", "distilbert", "camembert"
                           ] and not args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")
    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError(
                "Used --should_continue but no checkpoint was found in --output_dir."
            )
        # Resume from the last checkpoint; the number after the final "-"
        # in its name becomes the starting value of `inital_epoch`.
        args.model_name_or_path = sorted_checkpoints[-1]
        args.inital_epoch = int(args.model_name_or_path.split("-")[-1])

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir
            and not args.should_continue):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        # print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # get_world_size() raises when no process group was initialised (the
    # single-process branch above), so only query it when one exists.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        world_size = torch.distributed.get_world_size()
    else:
        world_size = 1
    logger.info("Rank %d. World size %d", args.local_rank, world_size)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Barrier to make sure only the first process in distributed training download model & vocab
        torch.distributed.barrier()

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name,
                                            cache_dir=args.cache_dir,
                                            padding="max_length")
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path,
                                            cache_dir=args.cache_dir,
                                            padding="max_length")
    else:
        # When we release a pip version exposing CONFIG_MAPPING,
        # we can do `config = CONFIG_MAPPING[args.model_type]()`.
        raise ValueError(
            "You are instantiating a new config instance from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --config_name")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                                  cache_dir=args.cache_dir)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                  cache_dir=args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if tokenizer.pad_token_id is None:
        if args.force_pad_token:
            # See PR 3388. Some tokenizers don't have pad tokens which causes errors at the encoding step in the collate_fn.
            # We give here the option to force the addition of a pad token. The attention mask is used to ignore this token
            # when feeding to the model.
            tokenizer.add_special_tokens({"pad_token": "<pad>"})
        else:
            # Logger.warn is a deprecated alias of Logger.warning.
            logger.warning(
                "Attempting to train a model whose tokenizer has no padding token. This may result in errors in the encoding step. Set the --force_pad_token flag to fix this."
            )

    if args.block_size <= 0:
        # Our input block size will be the max possible for the model
        args.block_size = tokenizer.model_max_length
    else:
        args.block_size = min(args.block_size, tokenizer.model_max_length)

    if args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            cache_dir=args.cache_dir)
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)
    logger.info("Load model at Rank %d", args.local_rank)
    model.to(args.device)

    if args.local_rank == 0:
        # End of barrier to make sure only the first process in distributed training download model & vocab
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        if args.local_rank not in [-1, 0]:
            # Barrier to make sure only the first process in distributed training process the dataset, and the others will use the cache
            torch.distributed.barrier()

        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False)
        logger.info("Load dataset at Rank %d.", args.local_rank)
        if args.local_rank == 0:
            torch.distributed.barrier()

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        # Take care of distributed/parallel training: unwrap `.module`.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForMaskedLM.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            weight_files = sorted(
                glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                          recursive=True))
            checkpoints = [os.path.dirname(c) for c in weight_files]
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                "/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForMaskedLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            # Suffix every metric with the step so results of several
            # checkpoints can share one dict.
            result = {
                k + "_{}".format(global_step): v
                for k, v in result.items()
            }
            results.update(result)

    return results
def main(cfg):
    """Train the masked-LM model described by ``cfg`` and checkpoint it.

    Prepares label words and k-shot data when their files are missing,
    builds the dataset, model and ``BertLitModel`` wrapper, optionally
    restores model / optimizer state and ``best_f1`` from
    ``cfg.train_from_saved_model``, then runs ``cfg.num_train_epochs`` epochs
    of training followed by validation. After an epoch the state is written
    to ``cfg.save_path`` when that path is non-empty and
    ``validation_epoch_end`` returned ``best != -1``.

    Args:
        cfg: Run configuration. Attributes read here: ``model_name_or_path``,
            ``data_dir``, ``train_from_saved_model``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``log_dir`` and ``save_path``.
    """
    cwd = get_original_cwd()
    os.chdir(cwd)
    if not os.path.exists(f"data/{cfg.model_name_or_path}.pt"):
        get_label_word(cfg)
    if not os.path.exists(cfg.data_dir):
        generate_k_shot(cfg.data_dir)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    data = REDataset(cfg)
    data_config = data.get_data_config()

    config = AutoConfig.from_pretrained(cfg.model_name_or_path)
    config.num_labels = data_config["num_labels"]

    model = AutoModelForMaskedLM.from_pretrained(cfg.model_name_or_path,
                                                 config=config)

    # if torch.cuda.device_count() > 1:
    #     print("Let's use", torch.cuda.device_count(), "GPUs!")
    #     model = torch.nn.DataParallel(model, device_ids = list(range(torch.cuda.device_count())))

    model.to(device)

    lit_model = BertLitModel(args=cfg, model=model, tokenizer=data.tokenizer)
    data.setup()

    # Deserialise the resume checkpoint once and reuse it for the model,
    # best_f1 and the optimizer (it used to be loaded three times).
    resume_path = cfg.train_from_saved_model
    saved_state = torch.load(resume_path) if resume_path != '' else None

    if saved_state is not None:
        model.load_state_dict(saved_state["checkpoint"])
        print("load saved model from {}.".format(resume_path))
        lit_model.best_f1 = saved_state["best_f1"]

    #data.tokenizer.save_pretrained('test')

    optimizer = lit_model.configure_optimizers()
    if saved_state is not None:
        optimizer.load_state_dict(saved_state["optimizer"])
        print("load saved optimizer from {}.".format(resume_path))

    # NOTE(review): the step count is divided by gradient_accumulation_steps,
    # but the loop below steps optimizer and scheduler on every batch, so
    # with accumulation > 1 the schedule ends before training does. Confirm
    # which of the two is intended.
    num_training_steps = len(data.train_dataloader(
    )) // cfg.gradient_accumulation_steps * cfg.num_train_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_training_steps * 0.1,
        num_training_steps=num_training_steps)
    log_step = 100

    # `logging` here is called as a function taking the log directory first.
    logging(cfg.log_dir, '-' * 89, print_=False)
    logging(cfg.log_dir,
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) +
            ' INFO : START TO TRAIN ',
            print_=False)
    logging(cfg.log_dir, '-' * 89, print_=False)

    for epoch in range(cfg.num_train_epochs):
        model.train()
        num_batch = len(data.train_dataloader())
        total_loss = 0  # summed over the whole epoch
        log_loss = 0  # summed since the last periodic log line
        for index, train_batch in enumerate(tqdm(data.train_dataloader())):
            loss = lit_model.training_step(train_batch, index)
            batch_loss = loss.item()
            total_loss += batch_loss
            log_loss += batch_loss
            loss.backward()

            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if log_step > 0 and (index + 1) % log_step == 0:
                cur_loss = log_loss / log_step
                # Losses are logged multiplied by 1000.
                logging(
                    cfg.log_dir,
                    '| epoch {:2d} | step {:4d} | lr {} | train loss {:5.3f}'.
                    format(epoch, (index + 1), scheduler.get_last_lr(),
                           cur_loss * 1000),
                    print_=False)
                log_loss = 0
        avrg_loss = total_loss / num_batch
        logging(
            cfg.log_dir, '| epoch {:2d} | train loss {:5.3f}'.format(
                epoch, avrg_loss * 1000))

        # Validation pass without gradient tracking.
        model.eval()
        with torch.no_grad():
            val_loss = []
            for val_index, val_batch in enumerate(tqdm(data.val_dataloader())):
                loss = lit_model.validation_step(val_batch, val_index)
                val_loss.append(loss)
            f1, best, best_f1 = lit_model.validation_epoch_end(val_loss)
        logging(cfg.log_dir, '-' * 89)
        logging(cfg.log_dir,
                '| epoch {:2d} | dev_result: {}'.format(epoch, f1))
        logging(cfg.log_dir, '-' * 89)
        logging(cfg.log_dir, '| best_f1: {}'.format(best_f1))
        logging(cfg.log_dir, '-' * 89)

        # `best == -1` is the value for which no checkpoint is written.
        if cfg.save_path != "" and best != -1:
            save_path = cfg.save_path
            torch.save(
                {
                    'epoch': epoch,
                    'checkpoint': model.state_dict(),
                    'best_f1': best_f1,
                    'optimizer': optimizer.state_dict()
                },
                save_path,
                _use_new_zipfile_serialization=False)
            logging(cfg.log_dir,
                    '| successfully save model at: {}'.format(save_path))
            logging(cfg.log_dir, '-' * 89)
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Fill-mask demo: print the sentence once with the mask token, then once for
# each of the five highest-scoring replacements predicted by the model.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
model = AutoModelForMaskedLM.from_pretrained('distilbert-base-cased')

sequence = (
    'Distilled models are smaller than the models they mimic. Using them instead of the'
    f' large versions would help {tokenizer.mask_token} our carbon footprint.'
)
print(sequence)

inputs = tokenizer.encode(sequence, return_tensors='pt')
# Second element of torch.where: index of the mask along the token axis.
mask_token_index = torch.where(inputs == tokenizer.mask_token_id)[1]

token_logits = model(inputs).logits
# Logits of the first (only) sequence at the masked position(s).
mask_token_logits = token_logits[0, mask_token_index, :]
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    replacement = tokenizer.decode([token])
    print(sequence.replace(tokenizer.mask_token, replacement))
def main(): args = parse_args() # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment accelerator = Accelerator( log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator() # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state) # Setup logging, we only want one process per machine to log things on the screen. # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel( logging.INFO if accelerator.is_local_main_process else logging.ERROR) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. 
if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. 
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[:{args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( args.dataset_name, args.dataset_config_name, split=f"train[{args.validation_split_percentage}%:]", ) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{args.validation_split_percentage}%:]", ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. 
if args.config_name: config = AutoConfig.from_pretrained(args.config_name) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path) else: config = CONFIG_MAPPING[args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name, use_fast=not args.use_slow_tokenizer) elif args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if args.model_name_or_path: model = AutoModelForMaskedLM.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ) else: logger.info("Training new model from scratch") model = AutoModelForMaskedLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Preprocessing the datasets. # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." 
) max_seq_length = min(args.max_seq_length, tokenizer.model_max_length) if args.line_by_line: # When using line_by_line, we just tokenize each nonempty line. padding = "max_length" if args.pad_to_max_length else False def tokenize_function(examples): # Remove empty lines examples[text_column_name] = [ line for line in examples[text_column_name] if len(line) > 0 and not line.isspace() ] return tokenizer( examples[text_column_name], padding=padding, truncation=True, max_length=max_seq_length, # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it # receives the `special_tokens_mask`. return_special_tokens_mask=True, ) with accelerator.main_process_first(): tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=[text_column_name], load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on dataset line_by_line", ) else: # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts. # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more # efficient when it receives the `special_tokens_mask`. def tokenize_function(examples): return tokenizer(examples[text_column_name], return_special_tokens_mask=True) with accelerator.main_process_first(): tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not args.overwrite_cache, desc="Running tokenizer on every text in dataset", ) # Main data processing function that will concatenate all texts from our dataset and generate chunks of # max_seq_length. def group_texts(examples): # Concatenate all texts. 
concatenated_examples = { k: list(chain(*examples[k])) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= max_seq_length: total_length = (total_length // max_seq_length) * max_seq_length # Split by chunks of max_len. result = { k: [ t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length) ] for k, t in concatenated_examples.items() } return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value # might be slower to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map with accelerator.main_process_first(): tokenized_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=args.preprocessing_num_workers, load_from_cache_file=not args.overwrite_cache, desc=f"Grouping texts in chunks of {max_seq_length}", ) train_dataset = tokenized_datasets["train"] eval_dataset = tokenized_datasets["validation"] # Conditional for small test subsets if len(train_dataset) > 3: # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # Data collator # This one will take care of randomly masking the tokens. 
data_collator = DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm_probability=args.mlm_probability) # DataLoaders creation: train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties. if accelerator.distributed_type == DistributedType.TPU: model.tie_weights() # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be # shorter in multiprocess) # Scheduler and math around the number of training steps. num_update_steps_per_epoch = math.ceil( len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch else: args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. 
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration if args.with_tracking: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config[ "lr_scheduler_type"].value accelerator.init_trackers("mlm_no_trainer", experiment_config) # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {args.per_device_train_batch_size}" ) logger.info( f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}" ) logger.info( f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print( f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[ -1] # Sorts folders by date modified, most recent checkpoint is the last # Extract `epoch_{i}` or `step_{i}` training_difference = os.path.splitext(path)[0] if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step if args.resume_from_checkpoint and epoch == starting_epoch: if resume_step is not None and step < resume_step: completed_steps += 1 continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch if args.with_tracking: total_loss += loss.detach().float() loss = loss / args.gradient_accumulation_steps accelerator.backward(loss) if step % args.gradient_accumulation_steps == 0 or step == len( train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar.update(1) completed_steps += 1 if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: output_dir = f"step_{completed_steps }" if 
args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if completed_steps >= args.max_train_steps: break model.eval() losses = [] for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(**batch) loss = outputs.loss losses.append( accelerator.gather(loss.repeat( args.per_device_eval_batch_size))) losses = torch.cat(losses) losses = losses[:len(eval_dataset)] try: perplexity = math.exp(torch.mean(losses)) except OverflowError: perplexity = float("inf") logger.info(f"epoch {epoch}: perplexity: {perplexity}") if args.with_tracking: accelerator.log( { "perplexity": perplexity, "train_loss": total_loss, "epoch": epoch, "step": completed_steps }, ) if args.push_to_hub and epoch < args.num_train_epochs - 1: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) repo.push_to_hub( commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True) if args.checkpointing_steps == "epoch": output_dir = f"epoch_{epoch}" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained( args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save) if accelerator.is_main_process: tokenizer.save_pretrained(args.output_dir) if args.push_to_hub: repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True) with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: json.dump({"perplexity": perplexity}, f)
def run(n_epochs, lr, batch_size, base_model, clustering_loss_weight,
        embedding_extractor, annealing_alphas, dataset, train_idx_file,
        result_dir, early_stopping, early_stopping_tol, device, random_state):
    """Train the clustering model on a subset of a CSV dataset and save the results.

    Args:
        n_epochs: number of training epochs (also used to size the LR schedule).
        lr: learning rate for the RMSprop optimizer.
        batch_size: batch size of the (unshuffled) data loader.
        base_model: name or path given to ``from_pretrained`` for both the
            masked-LM model and its tokenizer.
        clustering_loss_weight: forwarded unchanged to ``train``.
        embedding_extractor: forwarded unchanged to ``init_model``.
        annealing_alphas: forwarded unchanged to ``train``.
        dataset: path of a CSV file with ``texts`` and ``labels`` columns.
        train_idx_file: text file with one integer row position per line;
            blank lines are ignored.
        result_dir: directory to create for ``train_hist.h`` and ``model.bin``.
            It must not exist yet.
        early_stopping: forwarded unchanged to ``train``.
        early_stopping_tol: forwarded unchanged to ``train``.
        device: device the language model is moved to.
        random_state: seed for NumPy and PyTorch (CPU and all CUDA devices).

    Raises:
        FileExistsError: if ``result_dir`` already exists.
    """
    # Fail before training rather than after it: os.makedirs() below refuses
    # an existing path, and discovering that only once the model is trained
    # would throw the whole run away.
    if os.path.exists(result_dir):
        raise FileExistsError(f"Result directory already exists: {result_dir}")

    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    df = pd.read_csv(dataset)
    with open(train_idx_file, 'r') as f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in int().
        train_idx = np.array([int(line) for line in f if line.strip()],
                             dtype=int)
    df = df.iloc[train_idx].copy()

    texts = df['texts'].to_numpy()
    labels = df['labels'].to_numpy()

    data = TextDataset(texts, labels)
    data_loader = DataLoader(dataset=data,
                             batch_size=batch_size,
                             shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    # NOTE(review): return_dict / output_hidden_states look like model options;
    # passing them to the tokenizer is kept as in the original -- confirm intent.
    tokenizer = AutoTokenizer.from_pretrained(base_model,
                                              return_dict=True,
                                              output_hidden_states=True)
    lm_model.to(device)

    # init clustering model; one cluster per distinct label value.
    # The initial centroids/embeddings returned alongside are not used here.
    model, _initial_centroids, _initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # 2e-5, 5e-7,
        eps=1e-8)

    total_steps = len(data_loader) * n_epochs

    # Linear warmup over the first half epoch worth of batches.
    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        num_warmup_steps=int(len(data_loader) * 0.5),
        num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # save results & model
    os.makedirs(result_dir)
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
"Usage: python train_mlm.py model_name data/train_sentences.txt [data/dev_sentences.txt]" ) exit() model_name = sys.argv[1] per_device_train_batch_size = 64 save_steps = 1000 #Save model every 1k steps num_train_epochs = 3 #Number of epochs use_fp16 = False #Set to True, if your GPU supports FP16 operations max_length = 100 #Max length for a text input do_whole_word_mask = True #If set to true, whole words are masked mlm_prob = 0.15 #Probability that a word is replaced by a [MASK] token # Load the model model = AutoModelForMaskedLM.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name) output_dir = "output/{}-{}".format( model_name.replace("/", "_"), datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) print("Save checkpoints to:", output_dir) ##### Load our training datasets train_sentences = [] train_path = sys.argv[2] with gzip.open(train_path, 'rt', encoding='utf8') if train_path.endswith('.gz') else open( train_path, 'r', encoding='utf8') as fIn: for line in fIn:
def main():
    """Train and/or evaluate a masked language model with the ``Trainer`` API.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments`` and
    ``TrainingArguments``, either from a single ``.json`` file passed as the
    only command-line argument or from regular command-line flags.  The
    tokenizer is always ``roberta-base`` and the train/eval data are read from
    the hard-coded ``../bin_data`` paths.

    Raises:
        ValueError: if the output directory exists, is not empty, contains no
            checkpoint and ``--overwrite_output_dir`` was not given.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary.  The two halves are joined with
    # ", " so that "n_gpu" and "distributed training" do not run together.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }

    # Alternative kept for reference: build the tokenizer from local vocab files.
    # tokenizer = GPT2TokenizerFast(
    #     os.path.join('../tokenizer', 'vocab.json'),
    #     os.path.join('../tokenizer', 'merges.txt'),
    #     bos_token='<s>',
    #     eos_token='</s>',
    #     sep_token='</s>',
    #     cls_token='<s>',
    #     unk_token='<unk>',
    #     pad_token='<pad>',
    #     mask_token='<mask>',
    # )
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
    tokenizer.add_special_tokens({
        'bos_token': '<s>',
        'eos_token': '</s>',
        'sep_token': '</s>',
        'cls_token': '<s>',
        'unk_token': '<unk>',
        'pad_token': '<pad>',
        'mask_token': '<mask>',
    })

    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Without a checkpoint there is nothing to load (and `".ckpt" in None`
        # would raise); build the model from the config selected above.
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # The embedding matrix must match the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    train_dataset = Dataset(paths=['../bin_data/{}_text_document'.format(i) for i in range(4)], tokenizer=tokenizer)
    eval_dataset = Dataset(paths='../bin_data/4_text_document', tokenizer=tokenizer)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Prefer a checkpoint found in output_dir, then a local model directory.
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # A very large loss overflows exp(); report it as infinite perplexity.
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
def loadmodel(epoch):
    """Load an ``AutoModelForMaskedLM`` from the ``./save<epoch>/`` directory.

    Args:
        epoch: number formatted with ``%d`` into the directory name.

    Returns:
        The model returned by ``AutoModelForMaskedLM.from_pretrained``.
    """
    checkpoint_dir = './save%d/' % epoch
    restored = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
    return restored
def stage1pretrain():
    """Run masked-LM pretraining with Accelerate, driven by ``PretrainConfig``.

    The function validates the configured data files, loads and tokenizes
    them, groups the tokens into blocks of ``max_seq_length``, trains with
    AdamW and gradient accumulation, logs the evaluation perplexity after
    every epoch and finally saves the model to ``config.output_dir`` (when
    that is set).

    Raises:
        AssertionError: if a configured data file is not csv, json or txt.
        ValueError: if neither ``tokenizer_name`` nor ``model_name_or_path``
            is configured.
    """
    # Local import so the block does not rely on a file-level `chain` import.
    from itertools import chain

    logger.info("stage1pretrain starts")
    config = PretrainConfig()

    if config.train_file is not None:
        extension = config.train_file.split(".")[-1]
        assert extension in [
            "csv", "json", "txt"
        ], "`train_file` should be a csv, json or txt file."
    if config.validation_file is not None:
        extension = config.validation_file.split(".")[-1]
        assert extension in [
            "csv", "json", "txt"
        ], "`validation_file` should be a csv, json or txt file."
    if config.output_dir is not None:
        os.makedirs(config.output_dir, exist_ok=True)

    saveDataWithTextsOnly("../../data/commonlitreadability/train.csv",
                          "../../data/commonlitreadability/test.csv")

    accelerator = Accelerator()
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Only the local main process logs at INFO; the others stay quiet.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    if config.seed is not None:
        set_seed(config.seed)

    data_files = {}
    if config.train_file is not None:
        data_files["train"] = config.train_file
    if config.validation_file is not None:
        data_files["validation"] = config.validation_file
    extension = config.train_file.split(".")[-1]
    if extension == "txt":
        # The loader for plain text files is registered under the name "text".
        extension = "text"
    raw_datasets = load_dataset(extension, data_files=data_files)

    if config.config_name:
        modelconfig = AutoConfig.from_pretrained(config.config_name)
    elif config.model_name_or_path:
        modelconfig = AutoConfig.from_pretrained(config.model_name_or_path)
    else:
        modelconfig = CONFIG_MAPPING[config.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if config.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            config.tokenizer_name, use_fast=not config.use_slow_tokenizer)
    elif config.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            config.model_name_or_path, use_fast=not config.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            config.model_name_or_path,
            from_tf=bool(".ckpt" in config.model_name_or_path),
            config=modelconfig,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(modelconfig)

    model.resize_token_embeddings(len(tokenizer))

    column_names = raw_datasets["train"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if config.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if config.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({config.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(config.max_seq_length,
                             tokenizer.model_max_length)

    def tokenize_function(examples):
        """Tokenize the text column, keeping the special-tokens mask."""
        return tokenizer(examples[text_column_name],
                         return_special_tokens_mask=True)

    def group_texts(examples):
        """Concatenate a batch of tokenized texts and cut it into blocks."""
        # chain.from_iterable flattens in linear time; sum(lists, []) copies
        # the growing list on every addition.
        concatenated_examples = {
            k: list(chain.from_iterable(examples[k]))
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the remainder that does not fill a whole block.
        total_length = (total_length // max_seq_length) * max_seq_length
        result = {
            k: [
                t[i:i + max_seq_length]
                for i in range(0, total_length, max_seq_length)
            ]
            for k, t in concatenated_examples.items()
        }
        return result

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=config.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not config.overwrite_cache,
    )
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=config.preprocessing_num_workers,
        load_from_cache_file=not config.overwrite_cache,
    )

    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation"]

    # The collator takes care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=config.mlm_probability)

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=data_collator,
        batch_size=config.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=config.per_device_eval_batch_size)

    # Two parameter groups: biases and LayerNorm weights get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)

    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Computed after prepare() so the prepared dataloader's length is used.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / config.gradient_accumulation_steps)
    if config.max_train_steps is None:
        config.max_train_steps = config.num_train_epochs * num_update_steps_per_epoch
    else:
        config.num_train_epochs = math.ceil(config.max_train_steps /
                                            num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=config.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=config.num_warmup_steps,
        num_training_steps=config.max_train_steps,
    )

    total_batch_size = config.per_device_train_batch_size * accelerator.num_processes * config.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {config.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {config.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {config.gradient_accumulation_steps}"
    )
    logger.info(f" Total optimization steps = {config.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(config.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(config.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / config.gradient_accumulation_steps
            accelerator.backward(loss)
            # Update after every N-th micro-batch (and on the last batch).
            # Counting from step + 1 keeps the number of updates per epoch
            # equal to num_update_steps_per_epoch; testing `step % N == 0`
            # would also fire on step 0 and exhaust max_train_steps before
            # the final batches are seen.
            is_last_batch = step == len(train_dataloader) - 1
            if (step + 1) % config.gradient_accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= config.max_train_steps:
                break

        model.eval()
        losses = []
        for batch in eval_dataloader:
            with torch.no_grad():
                outputs = model(**batch)

            loss = outputs.loss
            # Repeat the batch loss once per example so it can be gathered
            # across processes and truncated to the dataset size below.
            losses.append(
                accelerator.gather(
                    loss.repeat(config.per_device_eval_batch_size)))

        losses = torch.cat(losses)
        losses = losses[:len(eval_dataset)]
        try:
            perplexity = math.exp(torch.mean(losses))
        except OverflowError:
            # A very large loss overflows exp(); report infinite perplexity.
            perplexity = float("inf")

        logger.info(f"epoch {epoch}: perplexity: {perplexity}")

    if config.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(config.output_dir,
                                        save_function=accelerator.save)
def main():
    """Train and/or evaluate a masked language model with the ``Trainer`` API.

    Arguments are parsed into ``ModelArguments``, ``DataTrainingArguments`` and
    ``TrainingArguments``, either from a single ``.json`` file passed as the
    only command-line argument or from regular command-line flags.  Data come
    from a hub dataset (``--dataset_name``) or from local csv/json/txt files;
    they are tokenized either line by line or as one stream cut into blocks of
    ``max_seq_length`` tokens.

    Raises:
        ValueError: if the output directory exists, is not empty, contains no
            checkpoint and ``--overwrite_output_dir`` was not given; if no
            tokenizer can be loaded; or if ``--do_train`` / ``--do_eval`` is
            requested without the matching dataset split.
    """
    # Local import so the block does not rely on a file-level `chain` import.
    from itertools import chain

    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary.  The two halves are joined with
    # ", " so that "n_gpu" and "distributed training" do not run together.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    #
    # The local is named `raw_datasets` so it does not shadow a `datasets` module.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name)
        if "validation" not in raw_datasets.keys():
            # No validation split: carve the first N% of train out for it.
            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        # The train file decides the loader; fall back to the validation file
        # so an evaluation-only run without --train_file still works.
        format_source = (data_args.train_file
                         if data_args.train_file is not None else
                         data_args.validation_file)
        extension = format_source.split(".")[-1]
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            """Tokenize every non-blank line of the text column."""
            # Remove empty lines.  The detected text column is used (not a
            # hard-coded "text") so files whose first column has another name
            # work, consistently with `remove_columns` below.
            examples[text_column_name] = [
                line for line in examples[text_column_name]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(
                examples[text_column_name],
                padding=padding,
                truncation=True,
                max_length=max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            """Tokenize the text column, keeping the special-tokens mask."""
            return tokenizer(examples[text_column_name],
                             return_special_tokens_mask=True)

        tokenized_datasets = raw_datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            """Concatenate a batch of tokenized texts and cut it into blocks."""
            # Concatenate all texts.  chain.from_iterable flattens in linear
            # time; sum(lists, []) copies the growing list on every addition.
            concatenated_examples = {
                k: list(chain.from_iterable(examples[k]))
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Prefer a checkpoint found in output_dir, then a local model directory.
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif model_args.model_name_or_path is not None and os.path.isdir(
                model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        try:
            perplexity = math.exp(metrics["eval_loss"])
        except OverflowError:
            # A very large loss overflows exp(); report infinite perplexity.
            perplexity = float("inf")
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
def fine_tune(cfg: DictConfig) -> float:
    """Fine-tune a masked-language model and return its evaluation loss.

    The tokenizer, config and model are all loaded from ``cfg.model.arch``.
    Training artefacts are written to ``<cfg.train.output_dir>/<run suffix>``,
    where the run suffix is the part of the current wandb run name after its
    last ``-`` (or the whole name when it contains no ``-``).

    Args:
        cfg: Experiment configuration. Reads ``cfg.model.arch``,
            ``cfg.model.num_labels``, ``cfg.seed`` and, under ``cfg.train``,
            the keys ``output_dir``, ``epoch``, ``train_batch_size``,
            ``eval_batch_size``, ``acc_batch``, ``label_smooth``, ``lr``,
            ``warmup`` and ``wd``. It is also passed as-is to ``init_wandb``
            and ``getDataset``.

    Returns:
        The ``eval_loss`` reported by the final ``trainer.evaluate()`` call.
    """
    init_wandb(cfg)

    # The tokenizer has to match the checkpoint the model is loaded from, so
    # read the architecture name once and use it for all three loaders.
    arch = cfg.model.arch
    tokenizer = AutoTokenizer.from_pretrained(arch)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=0.15,
    )
    config = AutoConfig.from_pretrained(arch, num_labels=cfg.model.num_labels)
    model = AutoModelForMaskedLM.from_pretrained(arch, config=config)
    model.resize_token_embeddings(len(tokenizer))

    train_ds, test_ds = getDataset(cfg, tokenizer)

    # [-1] instead of [1]: identical for names containing "-", but does not
    # raise IndexError when the run name has no "-" at all.
    # NOTE(review): assumes a wandb run is active after init_wandb - confirm.
    run_id = wandb.run.name.rsplit("-", 1)[-1]
    train_cfg = cfg.train
    output_dir = os.path.join(train_cfg["output_dir"], run_id)
    print("module output dir = ", output_dir)

    train_args = TrainingArguments(
        # predictions / checkpoints
        output_dir=output_dir,
        # tensorboard logs
        logging_dir="./logs",
        num_train_epochs=train_cfg["epoch"],
        per_device_train_batch_size=train_cfg["train_batch_size"],
        per_device_eval_batch_size=train_cfg["eval_batch_size"],
        gradient_accumulation_steps=train_cfg["acc_batch"],
        evaluation_strategy=IntervalStrategy.EPOCH,
        label_smoothing_factor=train_cfg["label_smooth"],
        # optimizer settings
        learning_rate=train_cfg["lr"],
        warmup_steps=train_cfg["warmup"],
        weight_decay=train_cfg["wd"],
        # NOTE(review): some transformers releases require save_strategy to
        # match evaluation_strategy when this flag is set - confirm against
        # the pinned version.
        load_best_model_at_end=True,
        report_to=["wandb"],
        seed=cfg.seed,
    )

    trainer = Trainer(
        model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    print("logs in dir", os.getcwd())
    print("gpu count = ", trainer.args.n_gpu, "is_fp16 =", trainer.args.fp16)

    train_result = trainer.train()
    trainer.save_model()
    train_metrics = train_result.metrics
    trainer.log_metrics("train", train_metrics)
    trainer.save_metrics("train", train_metrics)
    trainer.save_state()

    eval_metrics = trainer.evaluate()
    eval_loss = eval_metrics["eval_loss"]
    try:
        perplexity = math.exp(eval_loss)
    except OverflowError:
        # exp() overflows for very large losses; report infinity instead.
        perplexity = float("inf")
    eval_metrics["perplexity"] = perplexity
    trainer.log_metrics("eval", eval_metrics)
    trainer.save_metrics("eval", eval_metrics)

    # Keep a separate copy of the model the trainer holds after training
    # (presumably the best checkpoint, given load_best_model_at_end=True).
    trainer.model.save_pretrained(os.path.join(output_dir, "best"))

    return eval_loss