def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):
    start_time = perf_counter()
    logging.info(f'Creating {data_split} dataset')
    input_file = cfg.data_path
    tagger_data_augmentation = cfg.get('tagger_data_augmentation', False)
    dataset = TextNormalizationTaggerDataset(
        input_file=input_file,
        tokenizer=self._tokenizer,
        tokenizer_name=self.transformer_name,
        mode=self.mode,
        do_basic_tokenize=cfg.do_basic_tokenize,
        tagger_data_augmentation=tagger_data_augmentation,
        lang=self.lang,
        max_seq_length=self.max_sequence_len,
        use_cache=cfg.get('use_cache', False),
        max_insts=cfg.get('max_insts', -1),
    )
    data_collator = DataCollatorForTokenClassification(self._tokenizer)
    dl = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=cfg.batch_size, shuffle=cfg.shuffle, collate_fn=data_collator)
    running_time = perf_counter() - start_time
    logging.info(f'Took {running_time} seconds')
    return dl
def __call__(self):
    if self.lang == "de":
        # dataset = load_dataset('../config/conll2003-de/', 'conll2003-de', cache_dir=self.spec.cache_dir)
        dataset = load_dataset(
            'D:/IntellijProjects/aikido2/aikido/kata/config/conll2003-de/',
            'conll2003-de',
            cache_dir=self.spec.cache_dir)
    elif self.lang == "en":
        dataset = load_dataset("conll2003", cache_dir=self.spec.cache_dir)
    else:
        raise ValueError(f"cannot handle language {self.lang} for conll2003")
    encoded = dataset.map(function=self.tokenize_and_align_labels,
                          batched=self.spec.batch_preprocess,
                          batch_size=self.spec.batch_preprocess_size,
                          remove_columns=["id", "tokens", "pos_tags", "chunk_tags", "ner_tags"])
    data_collator = DataCollatorForTokenClassification(self.tokenizer)
    return (DatasetKata(self.spec, encoded["train"], data_collator),
            DatasetKata(self.spec, encoded["validation"], data_collator),
            DatasetKata(self.spec, encoded["test"], data_collator))
def __init__(self, pretrained, prepared_dir, classifier_dir):
    """``pretrained=None`` disables the classifier."""
    self.pretrained = pretrained
    self.classifier_dir = classifier_dir
    self.prepared_dir = prepared_dir
    self.datasets = DatasetDict({
        'train': read_dataset_from_csv(prepared_dir + '/train.csv'),
        'test': read_dataset_from_csv(prepared_dir + '/test.csv'),
        'validation': read_dataset_from_csv(prepared_dir + '/validation.csv')
    })
    self.metric = load_metric("seqeval")
    self.label_list = self.datasets["train"].features["tag"].feature.names
    check_folder(self.classifier_dir)
    if pretrained:
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.pretrained, num_labels=len(self.label_list))
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
        self.data_collator = DataCollatorForTokenClassification(self.tokenizer)
def get_trainer(train_data, eval_data, model=None, tokenizer=None, tr_args=None):
    """model = transformer model instance
    tokenizer = transformer tokenizer instance
    tr_args = training arguments (dict of overrides)
    """
    device = 'cuda' if config.use_gpu else 'cpu'
    tokenizer = tokenizer or get_tokenizer(config.model_name)
    model = model or get_model(config.model_name, num_labels=len(config.ner_tags), device=device)
    training_args = config.training_args
    training_args.update(tr_args or {})
    args = TrainingArguments('train-ner', **training_args)
    data_collator = DataCollatorForTokenClassification(tokenizer)
    return Trainer(
        model,
        args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
def test_data_collator_for_token_classification(self):
    tokenizer = BertTokenizer(self.vocab_file)
    features = [
        {"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
        {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
    ]

    data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
    self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
    self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-100] * 3)

    data_collator = DataCollatorForTokenClassification(
        tokenizer, padding="max_length", max_length=10, return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 10])

    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=8, return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
    self.assertEqual(batch["labels"].shape.as_list(), [2, 8])

    data_collator = DataCollatorForTokenClassification(
        tokenizer, label_pad_token_id=-1, return_tensors="tf")
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
    self.assertEqual(batch["input_ids"][0].numpy().tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
    self.assertEqual(batch["labels"][0].numpy().tolist(), [0, 1, 2] + [-1] * 3)
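For contrast with the TensorFlow assertions above, here is a minimal standalone sketch of the same collator with its default PyTorch tensors; the "bert-base-uncased" checkpoint is an assumed placeholder, not part of the original test.

# Sketch: DataCollatorForTokenClassification with the default return_tensors="pt".
import torch
from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # assumed placeholder checkpoint
features = [
    {"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
    {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
]
collator = DataCollatorForTokenClassification(tokenizer)
batch = collator(features)
# input_ids are padded with the tokenizer's pad token; labels with label_pad_token_id (-100).
assert batch["input_ids"].shape == torch.Size([2, 6])
assert batch["labels"][0].tolist() == [0, 1, 2, -100, -100, -100]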
def loaders(self):
    if self._loaders is None:
        ps, mgr = self.params, self.mgr
        if ps.pad_to_max_length:
            c = default_data_collator
        else:
            c = DataCollatorForTokenClassification(
                self.tokenizer, pad_to_multiple_of=(8 if mgr.use_fp16 else None))
        t = DataLoader(self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size)
        e = DataLoader(self.eval_ds, collate_fn=c, batch_size=ps.eval_batch_size)
        self._loaders = {TRAIN: t, EVAL: e}
    return self._loaders
def __call__(self): dataset = load_dataset("germeval_14", cache_dir=self.spec.cache_dir) encoded = dataset.map(function=self.tokenize_and_align_labels, batched=self.spec.batch_preprocess, batch_size=self.spec.batch_preprocess_size, remove_columns=[ 'id', 'ner_tags', 'nested_ner_tags', 'source', 'tokens' ]) data_collator = DataCollatorForTokenClassification(self.tokenizer) return (DatasetKata(self.spec, encoded["train"], data_collator), DatasetKata(self.spec, encoded["validation"], data_collator), DatasetKata(self.spec, encoded["test"], data_collator))
def __init__(self, samples, tok, max_length=128, use_token_type_ids=False):
    super().__init__()
    self.tok = tok
    self.data_collator = DataCollatorForTokenClassification(
        tok, max_length=max_length, padding='max_length')
    raw_samples = [s.tokenization_result for s in samples]
    for s in raw_samples:
        while len(s['confidences']) < max_length:
            s['confidences'].append(0)
    self.confidences = torch.FloatTensor([s['confidences'] for s in raw_samples])
    print('CONFIDENCES INFO',
          self.confidences.sum(),
          (self.confidences != 0).float().sum() / self.confidences.nelement(),
          self.confidences.shape)
    self.samples = deepcopy(raw_samples)
    for s in self.samples:
        del s['confidences']
    self.collated = self.data_collator(self.samples)
    self.use_token_type_ids = use_token_type_ids
def main(train_json_path, val_json_path, model_name_or_dir, output_dir, logging_dir,
         logging_steps, batch_size, gradient_accumulation_steps, learning_rate,
         num_train_epochs, warmup_ratio, num_classes):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForTokenClassification.from_pretrained(
        model_name_or_dir, num_labels=num_classes)
    sh_ner_train_dataset, sh_ner_val_dataset = create_sh_ner_dataset(
        train_json_path, val_json_path, tokenizer)
    data_collator = DataCollatorForTokenClassification(tokenizer)
    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_ner_train_dataset,
        eval_dataset=sh_ner_val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    trainer.train()
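A hypothetical invocation of the main() above; every path and hyperparameter here is illustrative, not taken from the original code.

# Illustrative call only; adjust paths and hyperparameters for a real run.
main(
    train_json_path="data/train.json",
    val_json_path="data/val.json",
    model_name_or_dir="xlm-roberta-base",
    output_dir="outputs",
    logging_dir="logs",
    logging_steps=50,
    batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=2e-5,
    num_train_epochs=3,
    warmup_ratio=0.1,
    num_classes=9,
)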
def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str):
    start_time = perf_counter()
    logging.info(f'Creating {mode} dataset')
    input_file = cfg.data_path
    dataset = TextNormalizationTaggerDataset(
        input_file,
        self._tokenizer,
        cfg.mode,
        cfg.get('do_basic_tokenize', False),
        cfg.get('tagger_data_augmentation', False),
    )
    data_collator = DataCollatorForTokenClassification(self._tokenizer)
    dl = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=cfg.batch_size,
        shuffle=cfg.shuffle,
        collate_fn=data_collator,
    )
    running_time = perf_counter() - start_time
    logging.info(f'Took {running_time} seconds')
    return dl
def __init__(self, model_path):
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        from_tf=False,
        config=config,
    )
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=None)
    self.trainer = Trainer(
        model=model,
        args=None,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=None,
    )
    self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    self.spacy_object = spacy.load('xx_sent_ud_sm')
def main():
    args = parse_args()

    # Initialize the accelerator. We let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging: we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public token classification datasets available on the hub at
    # https://huggingface.co/datasets/ (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples for quick debugging.
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if args.text_column_name is not None:
        text_column_name = args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if args.label_column_name is not None:
        label_column_name = args.label_column_name
    elif f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True)

    if args.model_name_or_path:
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForTokenClassification.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the raw_datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if args.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert
        # everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the
        # maximum length of the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad
        # all tensors to a multiple of 8, which enables the use of Tensor Cores on NVIDIA hardware with compute
        # capability >= 7.5 (Volta).
        data_collator = DataCollatorForTokenClassification(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
    # be shorter in multiprocess).

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("seqeval")

    def get_labels(predictions, references):
        # Transform predictions and references tensors to numpy arrays.
        if device.type == "cpu":
            y_pred = predictions.detach().clone().numpy()
            y_true = references.detach().clone().numpy()
        else:
            y_pred = predictions.detach().cpu().clone().numpy()
            y_true = references.detach().cpu().clone().numpy()

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
            for pred, gold_label in zip(y_pred, y_true)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
            for pred, gold_label in zip(y_pred, y_true)
        ]
        return true_predictions, true_labels

    def compute_metrics():
        results = metric.compute()
        if args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1
            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            if not args.pad_to_max_length:
                # Necessary to pad predictions and labels so they can be gathered.
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)
            preds, refs = get_labels(predictions_gathered, labels_gathered)
            # Predictions and references are expected to be a nested list of labels, not label_ids.
            metric.add_batch(predictions=preds, references=refs)

        # eval_metric = metric.compute()
        eval_metric = compute_metrics()
        accelerator.print(f"epoch {epoch}:", eval_metric)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
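A short sketch of why -100 works as the label pad throughout the script above: the token classification heads compute their loss with PyTorch's CrossEntropyLoss, whose ignore_index defaults to -100, so padded label positions contribute nothing. The shapes below are illustrative.

# Sketch: -100 targets are skipped by CrossEntropyLoss (ignore_index defaults to -100).
import torch
import torch.nn as nn

logits = torch.randn(2, 6, 9)  # (batch, seq_len, num_labels)
labels = torch.tensor([[0, 1, 2, -100, -100, -100],
                       [0, 1, 2, 3, 4, 5]])
loss = nn.CrossEntropyLoss()(logits.view(-1, 9), labels.view(-1))  # padded positions are ignored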
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")
    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',  # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")
    if data_config_name in ["NER", "ROLES"]:
        # Use our fancy data collator that randomly masks some of the inputs to enforce context learning.
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels)
    else:
        # Normal token classification.
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, max_length=config.max_length)
    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} features:")
    print(", ".join(label_list))
    compute_metrics = MetricsComputer(label_list=label_list)
    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH, num_labels=num_labels, max_position_embeddings=config.max_length + 2)
    print("\nTraining arguments:")
    print(training_args)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      callbacks=[ShowExample(tokenizer)])
    print(f"CUDA available: {torch.cuda.is_available()}")
    trainer.train()
    trainer.save_model(training_args.output_dir)
    print(f"Testing on {len(test_dataset)} examples.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
def collate_fn(self) -> Optional[Callable]:
    return DataCollatorForTokenClassification(tokenizer=self.tokenizer)
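A hypothetical usage sketch for a collate_fn like the one above: the collator accepts a list of feature dicts, so it can be handed straight to a PyTorch DataLoader. The checkpoint name and the toy tokenized_dataset below are illustrative assumptions, not from the original code.

# Sketch: wiring the collator into a DataLoader as its collate_fn.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed placeholder checkpoint
collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer)
tokenized_dataset = [  # toy stand-in for a tokenized datasets split
    {"input_ids": [101, 7592, 102], "labels": [-100, 1, -100]},
    {"input_ids": [101, 7592, 2088, 999, 102], "labels": [-100, 1, 2, 0, -100]},
]
loader = DataLoader(tokenized_dataset, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))  # dynamically padded to the longest sequence in the batch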
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (
        f"{data_args.task_name}_tags"
        if f"{data_args.task_name}_tags" in column_names
        else column_names[1]
    )

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    config = PerceiverConfig(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        d_latents=model_args.d_latents,
        d_model=model_args.d_model,
        max_position_embeddings=model_args.max_position_embeddings,
    )
    tokenizer = PerceiverTokenizer(
        cache_dir=model_args.cache_dir, model_max_length=model_args.model_max_length)
    model = PerceiverForTokenClassification(config=config)
    model.main_input_name = "input_ids"

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            # return_offsets_mapping=True,
        )
        labels = []
        """
        There is no PerceiverTokenizerFast; the following code works for conll2003-style datasets:
        words:         ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        words_labels:  [3, 0, 7, 0, 0, 0, 7, 0, 0]
        tokens_labels: [-100, 3, 3, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, -100]
        """
        for i in range(len(tokenized_inputs["input_ids"])):
            # -2 for the special tokens [BOS] and [SEP].
            id_length = len(tokenized_inputs["input_ids"][i]) - 2
            char_length = len("".join(examples[text_column_name][i]))
            assert id_length == char_length
            label_ids = [-100]
            words = examples[text_column_name][i]
            words_labels = examples[label_column_name][i]
            for w, w_label in zip(words, words_labels):
                current_label = label_to_id[w_label]
                label_ids.extend([current_label] * len(w))
            label_ids.append(-100)
            assert len(label_ids) == id_length + 2
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_val_samples = (
            data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
    logger.setLevel(logging.INFO)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    # if training_args.should_log:
    #     transformers.utils.logging.set_verbosity_info()
    #     transformers.utils.logging.enable_default_handler()
    #     transformers.utils.logging.enable_explicit_format()
    transformers.utils.logging.set_verbosity_info()
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(
            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc.)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        label2id=label_to_id,
        id2label={i: l for l, i in label_to_id.items()},
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. "
            "Check out the big table of models "
            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that "
            "meet this requirement."
        )

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        predict_dataset = predict_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        # if training_args.resume_from_checkpoint is not None:
        #     checkpoint = training_args.resume_from_checkpoint
        # elif last_checkpoint is not None:
        #     checkpoint = last_checkpoint
        checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate()
        max_eval_samples = (
            data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        )
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        # Save predictions
        output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    if training_args.push_to_hub:
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "token-classification"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name
        trainer.push_to_hub(**kwargs)
    return tokenized_inputs

# tokenize_and_align_labels(datasets['train'][:5])
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# no_grad: optionally freeze everything except the classification head.
# for name, param in model.named_parameters():
#     if 'classifier' not in name:  # classifier layer
#         param.requires_grad = False

data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric('seqeval')
# labels = [label_list[i] for i in example[f'{task}_tags']]
# metric.compute(predictions=[labels], references=[labels])

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
def main(): datasets = load_dataset("conll2003") label_list = datasets["train"].features[f"{task}_tags"].feature.names example = datasets["train"][4] tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) labels = [] for i, label in enumerate(examples[f"{task}_tags"]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label[word_idx]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: label_ids.append( label[word_idx] if label_all_tokens else -100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs # pipeline # examples = datasets["train"][:5] # tokenized_inputs = tokenizer( # examples["tokens"], truncation=True, is_split_into_words=True # ) tokenize_and_align_labels(datasets["train"][:5]) tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True) model = AutoModelForTokenClassification.from_pretrained( model_checkpoint, num_labels=len(label_list)) model_name = model_checkpoint.split("/")[-1] args = TrainingArguments( f"{model_name}-finetuned-{task}", evaluation_strategy="epoch", learning_rate=2e-5, per_device_train_batch_size=batch_size, per_device_eval_batch_size=batch_size, num_train_epochs=3, weight_decay=0.01, push_to_hub=True, ) data_collator = DataCollatorForTokenClassification(tokenizer) metric = load_metric("seqeval") labels = [label_list[i] for i in example[f"{task}_tags"]] metric.compute(predictions=[labels], references=[labels]) def compute_metrics(p): predictions, labels = p predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] results = metric.compute(predictions=true_predictions, references=true_labels) return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } trainer = Trainer( model, args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["validation"], data_collator=data_collator, tokenizer=tokenizer, compute_metrics=compute_metrics, ) trainer.train() # trainer.evaluate() predictions, labels, _ = trainer.predict(tokenized_datasets["validation"]) predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] results = metric.compute(predictions=true_predictions, references=true_labels) results
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it is the path to a JSON file, parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Check the output directory.
    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )
    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}")
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Automatic dataset download.
    if data_args.dataset_name is None:
        print("A dataset name and train/test paths must be given to load the dataset.")

    # Load the dataset.
    data_files = {}
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
    if data_args.test_file is not None:
        data_files["test"] = data_args.test_file
    datasets = load_dataset(path=data_args.script_file, name=data_args.dataset_name, data_files=data_files)

    # Pick which splits to use.
    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    # Which column holds the text; column_names: ['id', 'ner_tags', 'tokens'].
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    # Which column holds the labels; here it is 'ner_tags'.
    label_column_name = (
        f"{data_args.task_name}_tags"
        if f"{data_args.task_name}_tags" in column_names
        else column_names[1]
    )

    # If the labels are not a `Sequence[ClassLabel]`, we need to go through the dataset to get the unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    # Check whether the label feature is a ready-made ClassLabel type.
    if isinstance(features[label_column_name].feature, ClassLabel):
        # label_list: ['O', 'B-COM', 'I-COM', 'B-EFF', 'I-EFF']
        label_list = features[label_column_name].feature.names
        # Dict mapping from names to id form.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load the pretrained model and tokenizer.
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # Load the model config.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,  # ner
        cache_dir=model_args.cache_dir,  # None
    )
    if model_args.model_name_or_path == "albert_model":
        # Our ALBERT model uses BERT's tokenizer.
        tokenizer = BertTokenizerFast.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=True,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=True,
        )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Tokenizer check: only fast tokenizers can be used here.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Check out the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement.")

    # Data preprocessing.
    # Maximum sequence length.
    max_length = data_args.max_length
    # Padding strategy.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        """Called by datasets.map during preprocessing.

        Args:
            examples: a batch of samples, e.g. for two samples:
                examples = {dict: 5}
                    'chunk_tags' = {list: 2} [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]]
                    'id' = {list: 2} ['0', '1']
                    'ner_tags' = {list: 2} [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]
                    'pos_tags' = {list: 2} [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]]
                    'tokens' = {list: 2} [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]

        Returns:
            {'input_ids': [[101, 4649, 2244, 102], [101, 6208, 4658, 2094, 3723, 102]],
             'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
             'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1]],
             'labels': [[-100, 3, 4, -100], [-100, 1, 2, 2, 2, -100]]}
        """
        # Tokenize the text field ('tokens') of the examples.
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            max_length=max_length,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            # Process each sample; label is a list such as [3, 0, 7, 0, 0, 0, 7, 0, 0].
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # word_ids: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id of None. Set the label to -100 so it is automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # Set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, set the label to the current label or -100 depending on
                # the label_all_tokens flag; by default non-first subword tokens get -100.
                else:
previous_word_idx = word_idx labels.append(label_ids) # The final labels are lists such as # {list: 11}[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100] # {list: 4}[-100, 1, 2, -100] tokenized_inputs["labels"] = labels # print(f"tokenized_inputs for one batch: {tokenized_inputs}") return tokenized_inputs # Process the data with the map function tokenized_datasets = datasets.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) # Data collator, called when iterating over the dataloader data_collator = DataCollatorForTokenClassification(tokenizer) # Compute metrics def compute_metrics(p): predictions, labels = p predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] return { "accuracy_score": accuracy_score(true_labels, true_predictions), "precision": precision_score(true_labels, true_predictions), "recall": recall_score(true_labels, true_predictions), "f1": f1_score(true_labels, true_predictions), } # Initialize our Trainer; msra has no validation set, so the test set is used here, though a split from train would be more proper trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Training if training_args.do_train: # model_path here decides whether to resume training from a checkpoint trainer.train(model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None) trainer.save_model() # Saves the tokenizer too for easy upload # Evaluate the model results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # if training_args.do_predict: logger.info("*** Predict ***") test_dataset = tokenized_datasets["test"] predictions, labels, metrics = trainer.predict(test_dataset) predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt") if trainer.is_world_process_zero(): with open(output_test_results_file, "w") as writer: for key, value in metrics.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Save the test predictions output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt") if trainer.is_world_process_zero(): with open(output_test_predictions_file, "w") as writer: for prediction in true_predictions: writer.write(" ".join(prediction) + "\n") return results
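# --- A minimal stand-alone sketch (not part of the script above) of what this collator
# does to a ragged batch: input_ids are padded with the tokenizer's pad id, labels with
# -100 so the loss ignores padding. Assumes "bert-base-cased" is downloadable; the token
# ids below are arbitrary.
from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
collator = DataCollatorForTokenClassification(tokenizer)
features = [
    {"input_ids": [101, 7592, 102], "labels": [-100, 1, -100]},
    {"input_ids": [101, 7592, 2088, 999, 102], "labels": [-100, 1, 2, 0, -100]},
]
batch = collator(features)
print(batch["input_ids"].shape)     # torch.Size([2, 5])
print(batch["labels"][0].tolist())  # [-100, 1, -100, -100, -100]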
def train_model(dict_args): # parse args dict parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_dict(dict_args) if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, ) # Log on each process the small summary: logger.warning( f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, " + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" ) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info("Training/evaluation parameters %s", training_args) logger.info("Data Arguments %s", data_args) logger.info("Model Arguments %s", model_args) # Set seed before initializing model. set_seed(training_args.seed) # load datasets assert os.path.exists(data_args.train_file) and os.path.exists( data_args.validation_file) and os.path.exists(data_args.test_file) datasets = load_dataset("csv", data_files={ "train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file }, delimiter="\t", cache_dir=model_args.cache_dir) logger.info("Datasets %s", datasets) logger.info("Column names %s", datasets["train"].column_names) logger.info("Sample example %s", datasets["train"][0]) # get label information text_column_name = "text" label_column_name = "labels" bbox_column_name = "bbox" num_labels, label_to_id, id_to_label = get_label_info( datasets["train"][label_column_name], data_args.task_name) logger.info("num_labels %s", num_labels) logger.info("label_to_id %s", label_to_id) logger.info("id_to_label %s", id_to_label) # Load config, tokenizer and pre-trained model # For Distributed training: The .from_pretrained methods guarantee that only # one local process can concurrently download model & vocab. 
if data_args.task_name == "regression": config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir # for longformer : May adapt the attention_window=512 (default) in config ) else: config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, id2label=id_to_label, label2id=label_to_id, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir # for longformer : May adapt the attention_window=512 (default) in config ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, add_prefix_space=True # for roberta tokenizer ) if data_args.task_name == "ner": model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) else: model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir) # Tokenizer check: this script requires a fast tokenizer. if not isinstance(tokenizer, PreTrainedTokenizerFast): raise ValueError( "This example script only works for models that have a fast tokenizer. Checkout the big table of models " "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this " "requirement") # Pre-process the datasets (tokenize words and align labels/bboxes if needed) padding = "max_length" if data_args.pad_to_max_length else False use_bbox = data_args.use_bbox tokenized_datasets = datasets.map( lambda x: preprocess_dataset(x, tokenizer, label_to_id, data_args.label_all_tokens, padding, use_bbox, data_args.task_name), remove_columns=[label_column_name], batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, ) logger.info("Tokenized datasets %s", tokenized_datasets) logger.info("Column names %s", tokenized_datasets["train"].column_names) logger.info("Sample example %s", tokenized_datasets["train"][0]) # Data collator. Used to pad the inputs of a single batch to the max size of this batch # Not needed if padding has already been done (if pad_to_max_length is true): default_data_collator # This does not work with bboxes. 
# Hence pad_to_max_length is always True when using bboxes. if data_args.pad_to_max_length: data_collator = default_data_collator else: if data_args.task_name == "ner": data_collator = DataCollatorForTokenClassification(tokenizer) else: data_collator = None # will default to DataCollatorWithPadding logger.info("Data Collator used %s", data_collator) # Initialize our Trainer if data_args.task_name != "multilabel-classif": trainer = Trainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args.task_name), ) else: trainer = MultilabelClassificationTrainer( model=model, args=training_args, train_dataset=tokenized_datasets["train"] if training_args.do_train else None, eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args.task_name), ) # Training if training_args.do_train: train_result = trainer.train( model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None) # we save the final model (last or best) to the sagemaker output folder trainer.save_model(output_dir=data_args.sagemaker_output_path) # It saves the tokenizer too for easy upload output_train_file = os.path.join(data_args.sagemaker_output_path, "train_results.txt") if trainer.is_world_process_zero(): with open(output_train_file, "w") as writer: logger.info("***** Train results *****") for key, value in sorted(train_result.metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json( os.path.join(data_args.sagemaker_output_path, "trainer_state.json")) # We also save the model_args and data_args for future use (training_args are already saved) torch.save( asdict(model_args), os.path.join(data_args.sagemaker_output_path, "model_args.bin")) torch.save( asdict(data_args), os.path.join(data_args.sagemaker_output_path, "data_args.bin")) # Evaluation (this will evaluate the final/best model on the dev set and write the results) results = {} if training_args.do_eval: logger.info("*** Evaluate best/final model on dev set ***") results = trainer.evaluate() output_eval_file = os.path.join(data_args.sagemaker_output_path, "eval_results.txt") if trainer.is_world_process_zero(): with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in results.items(): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") # Evaluate and Predict on test set if training_args.do_predict: logger.info("*** Predict on test set ***") test_dataset = tokenized_datasets["test"] predictions, labels, metrics = trainer.predict( test_dataset, metric_key_prefix="test") if data_args.task_name == "classif": true_predictions = [ id_to_label[p] for p in np.argmax(predictions, axis=1) ] elif data_args.task_name == "multilabel-classif": predictions = 1 / (1 + np.exp(-predictions)) # sigmoid predictions = (predictions > 0.5) # threshold true_predictions = [[id_to_label[i] for i in np.where(p == 1)[0]] for p in predictions] elif data_args.task_name == "regression": true_predictions = np.squeeze(predictions) elif data_args.task_name == "ner": predictions = np.argmax(predictions, axis=2)
true_predictions = [[ id_to_label[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] output_test_results_file = os.path.join( data_args.sagemaker_output_path, "test_results.txt") if trainer.is_world_process_zero(): with open(output_test_results_file, "w") as writer: for key, value in sorted(metrics.items()): logger.info(f" {key} = {value}") writer.write(f"{key} = {value}\n") output_test_predictions_file = os.path.join( data_args.sagemaker_output_path, "test_predictions.txt") if trainer.is_world_process_zero(): with open(output_test_predictions_file, "w") as writer: for prediction in true_predictions: if data_args.task_name == "ner": writer.write(" ".join(prediction) + "\n") else: writer.write(str(prediction) + "\n") return results
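# --- An illustrative sketch of the collator selection above: with pad_to_max_length the
# batch is already rectangular and the default collator just tensorizes; otherwise token
# tasks need label-aware padding while sequence tasks can use DataCollatorWithPadding.
# The helper name pick_collator is illustrative only.
from transformers import (AutoTokenizer, DataCollatorForTokenClassification,
                          DataCollatorWithPadding, default_data_collator)

def pick_collator(task_name, pad_to_max_length, tokenizer):
    if pad_to_max_length:
        return default_data_collator  # inputs already padded to a fixed length
    if task_name == "ner":
        return DataCollatorForTokenClassification(tokenizer)  # pads labels with -100
    return DataCollatorWithPadding(tokenizer)  # one label per sequence, nothing to pad

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
print(pick_collator("ner", False, tokenizer).__class__.__name__)  # DataCollatorForTokenClassification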
def train(args): dataset = load_dataset('ManyTypes4TypeScript.py', ignore_verifications=True) accelerator = Accelerator() tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, add_prefix_space=True, use_fast=True) def tokenize_and_align_labels(examples): def divide_chunks(l1, l2, n): for i in range(0, len(l1), n): yield {'input_ids': [0] + l1[i:i + n] + [2], 'labels': [-100] + l2[i:i + n] + [-100]} window_size = 510 tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=False, add_special_tokens=False) inputs_ = {'input_ids': [], 'labels': []} for encoding, label in zip(tokenized_inputs.encodings, examples['labels']): word_ids = encoding.word_ids # Map tokens to their respective word. previous_word_idx = None label_ids = [] for word_idx in word_ids: # Set the special tokens to -100. if word_idx is None: label_ids.append(-100) elif word_idx != previous_word_idx: # Only label the first token of a given word. l = label[word_idx] if label[word_idx] is not None else -100 label_ids.append(l) else: label_ids.append(-100) previous_word_idx = word_idx s_labels = set(label_ids) if len(s_labels) == 1 and list(s_labels)[0] == -100: continue for e in divide_chunks(encoding.ids, label_ids, window_size): for k, v in e.items(): inputs_[k].append(v) return inputs_ tokenized_hf = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['id', 'tokens', 'labels']) label_list = tokenized_hf["train"].features[f"labels"].feature.names model = AutoModelForTokenClassification.from_pretrained(args.model_name, num_labels=len(label_list)) train_dataset = tokenized_hf["train"] eval_dataset = tokenized_hf["test"] valid_dataset = tokenized_hf["validation"] logger = logging.getLogger(__name__) train_batch_size = args.train_batch_size eval_batch_size = args.eval_batch_size gradient_accumulation_steps = args.gradient_accumulation_steps data_collator = DataCollatorForTokenClassification( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None), padding='max_length', max_length=512 ) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=eval_batch_size) valid_dataloader = DataLoader(valid_dataset, collate_fn=data_collator, batch_size=eval_batch_size) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) # Use the device given by the `accelerator` object. device = accelerator.device print("Device: {0}".format(device)) model.to(device) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, valid_dataloader = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, valid_dataloader ) lr_scheduler = get_scheduler( name='constant', # constant because streaming dataset optimizer=optimizer, # num_warmup_steps=args.warmup_steps, # num_training_steps=None if args.max_steps < 0. 
# else args.max_steps, ) # Metrics - more detailed than overall accuracy in evaluator.py warnings.filterwarnings('ignore') metric = load_metric("seqeval") metric_unk = load_metric("seqeval") metric_top100 = load_metric("seqeval") train_total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps eval_total_batch_size = eval_batch_size * accelerator.num_processes logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {train_total_batch_size}") logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}") # Only show the progress bar once on each machine. progress_bar_train = tqdm(range(len(train_dataset) // train_total_batch_size), disable=not accelerator.is_local_main_process) progress_bar_eval = tqdm(range(len(eval_dataset) // eval_total_batch_size), disable=not accelerator.is_local_main_process) completed_steps = 0 for epoch in range(args.num_train_epochs): if args.do_train: model.train() for step, batch in enumerate(train_dataloader): outputs = model(**batch) loss = outputs.loss loss = loss / gradient_accumulation_steps accelerator.backward(loss) accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm) if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1: optimizer.step() lr_scheduler.step() optimizer.zero_grad() progress_bar_train.update(1) completed_steps += 1 if args.max_steps > 0 and step > args.max_steps: break if args.do_eval: export_predictions = [] model.eval() for step, batch in enumerate(eval_dataloader): with torch.no_grad(): outputs = model(input_ids=batch['input_ids'], labels=None) predictions = outputs.logits.argmax(dim=-1) labels = batch["labels"] predictions_gathered = accelerator.gather(predictions) labels_gathered = accelerator.gather(labels) preds, refs = get_labels(predictions_gathered, labels_gathered, label_list) export_predictions.extend(flatten(preds)) preds_unk, refs_unk = get_labels(predictions_gathered, labels_gathered, label_list, score_unk=True) preds_100, refs_100 = get_labels(predictions_gathered, labels_gathered, label_list, top100=True) progress_bar_eval.update(1) metric.add_batch( predictions=preds, references=refs, ) metric_unk.add_batch( predictions=preds_unk, references=refs_unk, ) metric_top100.add_batch( predictions=preds_100, references=refs_100, ) eval_metric = compute_metrics(metric, metric_unk, metric_top100) accelerator.print(f"epoch {epoch}:", eval_metric) enums = list(map(str, list(range(len(export_predictions))))) export_predictions = list(map(str, export_predictions)) export_predictions = ["{}\t{}".format(a_, b_) for a_, b_ in zip(enums, export_predictions)] with open(args.output_dir + "/predictions.txt", 'w') as f: f.write("\n".join(export_predictions)) if args.output_dir is not None: accelerator.wait_for_everyone() unwrapped_model = accelerator.unwrap_model(model) unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
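# --- Stand-alone sketch of the 510-token windowing used by tokenize_and_align_labels
# above: over-long sequences are cut into chunks and re-wrapped with the RoBERTa-style
# <s>/</s> ids (0 and 2). The dummy ids and labels below are purely illustrative.
def divide_chunks(input_ids, labels, n=510):
    for i in range(0, len(input_ids), n):
        yield {"input_ids": [0] + input_ids[i:i + n] + [2],
               "labels": [-100] + labels[i:i + n] + [-100]}

ids = list(range(1000, 2200))        # 1200 fake token ids
labs = [i % 5 for i in range(1200)]  # 1200 fake labels
chunks = list(divide_chunks(ids, labs))
print(len(chunks), [len(c["input_ids"]) for c in chunks])  # 3 [512, 512, 182]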
def main(args): # Set seed if args.run: seed = random.randrange(10**3) else: seed = args.seed set_seed(seed) # Run name model_name = args.model_name model_name = model_name[2:] if model_name.startswith("./") else model_name model_name = model_name[1:] if model_name.startswith("/") else model_name run_name = f"{model_name}_{args.task_name}" run_name = f"{run_name}_{args.dataset_config or args.dataset_name}" run_name = run_name.replace("/", "-") run_name = f"{run_name}_e{str(args.num_train_epochs)}" run_name = f"{run_name}_lr{str(args.learning_rate)}" run_name = f"{run_name}_ws{str(args.warmup_steps)}" run_name = f"{run_name}_wd{str(args.weight_decay)}" run_name = f"{run_name}_s{str(seed)}" if args.max_length != 512: run_name = f"{run_name}_seq{str(args.max_length)}" if args.label_all_tokens: run_name = f"{run_name}_labelall" do_supercase = args.supercase or "supercased" in run_name if do_supercase and "supercased" not in run_name: run_name = f"{run_name}_supercased" if args.run: run_name = f"{run_name}_r{str(args.run)}" output_dir = Path(args.output_dir) / run_name # Tokenizer settings padding = args.task_name not in ( "ner", "pos" ) # default: False @param ["False", "'max_length'"] {type: 'raw'} max_length = args.max_length #@param {type: "number"} # Training settings weight_decay = args.weight_decay #@param {type: "number"} adam_beta1 = 0.9 #@param {type: "number"} adam_beta2 = 0.999 #@param {type: "number"} adam_epsilon = 1e-08 #@param {type: "number"} max_grad_norm = 1.0 #@param {type: "number"} save_total_limit = 1 #@param {type: "integer"} load_best_model_at_end = False #@param {type: "boolean"} # wandb wandb.init(name=run_name, entity="nbailab") wandb.log({ "seed": int(seed), }) # Loading Dataset print("\n\n#####################################") print(args.model_name) print(args.task_name) print(args.dataset_config) train_split = args.dataset_split_train test_split = args.dataset_split_test validation_split = args.dataset_split_validation if ":" in args.dataset_name: dataset_name, dataset_config = args.dataset_name.split(":") else: dataset_name = args.dataset_name dataset_config = args.dataset_config if dataset_config is None or len(dataset_config) == 0: dataset = load_dataset(dataset_name) else: dataset = load_dataset(dataset_name, dataset_config) column_names = dataset[train_split].column_names features = dataset[train_split].features if "tokens" in column_names: text_column_name = "tokens" elif "text" in column_names: text_column_name = "text" else: text_column_name = column_names[0] if f"{args.task_name}_tags" in column_names: label_column_name = f"{args.task_name}_tags" elif "label" in column_names: label_column_name = "label" else: label_column_name = column_names[1] if isinstance(features[label_column_name], datasets.features.Sequence): label_list = features[label_column_name].feature.names else: label_list = features[label_column_name].names label_to_id = {i: i for i in range(len(label_list))} num_labels = len(label_list) print(f"Number of labels: {num_labels}") print({label.split("-")[-1] for label in label_list}) # Download Norwegian Models # Downloading the model directly from a GCP bucket should not take longer than 3 minutes. 
# if args.model_name.startswith("./nb_models") and not os.path.exists("./nb_models"): # !mkdir -p nb_models/eval/ # !gsutil -m cp -r gs://notram-public/nb_models/eval/* nb_models/eval/ # Training config = AutoConfig.from_pretrained( args.model_name, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir, force_download=args.force_download, ) tokenizer = AutoTokenizer.from_pretrained( args.model_name, cache_dir=args.cache_dir, use_fast=True, force_download=args.force_download, ) tokenizer_test_sentence = """ Denne gjengen håper at de sammen skal bidra til å gi kvinnefotballen i Kristiansand et lenge etterlengtet løft. """.strip() printm("""Tokenizer test""") printm(f"> {tokenizer_test_sentence}") if do_supercase: tokenizer_test_sentence = supercase(tokenizer_test_sentence, args.task_name in ("pos", "ner")) printm(tokenizer.tokenize(tokenizer_test_sentence)) printm(tokenizer(tokenizer_test_sentence).input_ids) # Token tasks if args.task_name in ("pos", "ner"): model = AutoModelForTokenClassification.from_pretrained( args.model_name, from_tf=bool(".ckpt" in args.model_name), config=config, cache_dir=args.cache_dir, force_download=args.force_download, ) # Preprocessing the dataset tokenized_datasets = dataset.map( lambda examples: tokenize_and_align_labels( tokenizer, examples, text_column_name, max_length, padding, label_column_name, label_to_id, args.label_all_tokens, do_supercase), batched=True, load_from_cache_file=not args.overwrite_cache, num_proc=os.cpu_count(), ) # Data collator data_collator = DataCollatorForTokenClassification(tokenizer) compute_metrics = token_compute_metrics # Sequence tasks else: model = AutoModelForSequenceClassification.from_pretrained( args.model_name, from_tf=bool(".ckpt" in args.model_name), config=config, cache_dir=args.cache_dir, force_download=args.force_download, ) # Preprocessing the dataset tokenized_datasets = dataset.map( lambda examples: tokenizer( supercase(examples[text_column_name], is_split_into_words=False) if do_supercase else examples[text_column_name], max_length=max_length, padding=padding, truncation=True, is_split_into_words=False, ), batched=True, load_from_cache_file=not args.overwrite_cache, num_proc=os.cpu_count(), ) # Data collator data_collator = DataCollatorWithPadding( tokenizer, max_length=max_length, padding=padding, ) compute_metrics = sequence_compute_metrics train_dataset = dataset_select(tokenized_datasets[train_split], args.max_train_size) test_dataset = dataset_select(tokenized_datasets[test_split], args.max_test_size) validation_dataset = dataset_select(tokenized_datasets[validation_split], args.max_validation_size) wandb.log({ "train_size": len(train_dataset), "test_size": len(test_dataset), "validation_size": len(validation_dataset), }) samples_per_batch = (train_dataset.shape[0] / args.train_batch_size) total_steps = args.num_train_epochs * samples_per_batch warmup_steps = int(args.warmup_steps * total_steps) wandb.log({ "total_steps": int(total_steps), "total_warmup_steps": warmup_steps }) do_eval = validation_split in tokenized_datasets do_test = test_split in tokenized_datasets training_args = TrainingArguments( output_dir=output_dir.as_posix(), overwrite_output_dir=args.overwrite_output_dir, do_train=True, do_eval=do_eval, do_predict=do_test, per_device_train_batch_size=int(args.train_batch_size), per_device_eval_batch_size=int(args.eval_batch_size or args.train_batch_size), learning_rate=float(args.learning_rate), weight_decay=weight_decay, adam_beta1=adam_beta1, adam_beta2=adam_beta2, 
adam_epsilon=adam_epsilon, max_grad_norm=max_grad_norm, num_train_epochs=args.num_train_epochs, warmup_steps=warmup_steps, load_best_model_at_end=load_best_model_at_end, seed=seed, save_total_limit=save_total_limit, run_name=run_name, disable_tqdm=True, eval_steps=500, ) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=validation_dataset if do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=lambda pairs: compute_metrics(pairs, label_list), ) train_result = trainer.train() trainer.save_model() # Saves the tokenizer too for easy upload write_file("train", train_result.metrics, output_dir, save_artifact=args.save_artifacts) # Need to save the state, since Trainer.save_model saves only the tokenizer with the model trainer.state.save_to_json(output_dir / "trainer_state.json") # Evaluation if do_eval: printm("**Evaluate**") results = trainer.evaluate() write_file("eval", results, output_dir, save_artifact=args.save_artifacts) # Testing if do_test: printm("**Test**") predictions, labels, metrics = trainer.predict(test_dataset) write_file("test", metrics, output_dir, save_artifact=args.save_artifacts) if args.task_name in ("ner", "pos"): predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] else: predictions = np.argmax(predictions, axis=1) true_predictions = [ label_list[p] for (p, l) in zip(predictions, labels) if l != -100 ] # Save predictions output_test_predictions_file = os.path.join(output_dir, "test_predictions.txt") output_test_predictions = "\n".join(" ".join(map(str, p)) for p in true_predictions) with open(output_test_predictions_file, "a+") as writer: writer.write(output_test_predictions) if args.save_artifacts: artifact = wandb.Artifact("predictions", type="result") artifact.add_file(output_test_predictions_file) wandb.log_artifact(artifact)
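# --- Worked example (with assumed numbers) of how the run above derives warmup_steps:
# args.warmup_steps holds a fraction, which is scaled by the estimated total number of
# optimizer steps.
num_train_epochs = 3
train_size = 10_000
train_batch_size = 32
warmup_fraction = 0.1  # what args.warmup_steps holds in this script

samples_per_batch = train_size / train_batch_size    # 312.5 steps per epoch
total_steps = num_train_epochs * samples_per_batch   # 937.5
warmup_steps = int(warmup_fraction * total_steps)
print(int(total_steps), warmup_steps)                # 937 93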
def main(): # Parser parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Set seed before initializing set_seed(training_args.seed) ########## Load dataset from script. ########## # 'ontonotes_v4.py' datasets = load_dataset(data_args.dataset_script) ### Access column names and features ### if training_args.do_train: column_names = datasets["train"].column_names features = datasets["train"].features else: column_names = datasets["validation"].column_names features = datasets["validation"].features # In the event the labels are not a `Sequence[ClassLabel]`, # we will need to go through the dataset to get the unique labels. if isinstance(features["pos_tags"].feature, ClassLabel): label_list = features["pos_tags"].feature.names # No need to convert the labels since they are already ints. label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(datasets["train"]["pos_tags"]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) ########## Load pre-trained or custom model, tokenizer and config ########## # BertConfig config = AutoConfig.from_pretrained(model_args.config_name, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=None, revision="main", use_auth_token=True if False else None, output_hidden_states=True) # BERTTokenizer tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=None, use_fast=True, revision="main", use_auth_token=None, ) # Create custom model or pretrained BERT if ".py" == model_args.model_name_or_path[-3:]: spec = importlib.util.spec_from_file_location( "module.name", model_args.model_name_or_path) module_name = importlib.util.module_from_spec(spec) spec.loader.exec_module(module_name) # Add `to_layer` for `LinearProbingBERT` if model_args.to_layer is not None: config.to_layer = model_args.to_layer # Creating custom model model = module_name.model()(config) print(f"Creating custom model in {model_args.model_name_or_path}") else: model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=None, revision="main", use_auth_token=True if False else None, ) # Freeze BERT. Train the `classifier` weights for param in model.base_model.parameters(): param.requires_grad = False def tokenize_fn(examples): """Tokenize the input sequence and align the label. `input_ids` and `label_ids` will be added in the feature example (dict). They are required for the forward and loss computation. Additionally, `-100` in `label_ids` is assigned to segmented tokens and to special tokens in BERT. The loss function will ignore them. Args: examples: dict of features: {"tokens": ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'], "pos_tags": [22, 6, 22, 22, 23, 11]} Returns: tokenized_inputs: dict of features including two additional features: `input_ids` and `label_ids`. Usages: >>> tokenized_dataset = datasets.map(tokenize_fn, >>> batched=True) # Check whether aligned. 
>>> for example in tokenized_dataset: tokens = example['tokens'] input_ids = example['input_ids'] tokenized_tokens = tokenizer.convert_ids_to_tokens(input_ids) label_ids = example['label_ids'] # aligned to max length print(tokens) print(tokenized_tokens) print(input_ids) [ 'SOCCER' ] # token [ [CLS], 'S', '##OC', '##CE', '##R', [SEP] ] # converted_tokens [ -100, 4, -100, -100, -100, -100 ] # label_ids """ token_col_name = 'tokens' label_col_name = 'pos_tags' # will be added in the dict tokenized_inputs = tokenizer(examples[token_col_name], padding="max_length", truncation=True, is_split_into_words=True, max_length=data_args.max_seq_length) # Create label sequence # The `word_ids` map tokens back to their actual word. # tokenized tokens: [CLS, Ha, ##LLO, PAD, PAD] # `word_ids`: [None, 0, 0, None, None] # `label_ids`: [-100, 4, 4, -100, -100] labels = list() for i, label in enumerate(examples["pos_tags"]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # None are for [CLS], [SEP] and [PAD] tokens. if word_idx is None: label_ids.append(-100) # Set label for the word once elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) #label_ids.append(label) else: label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids) # Add `labels` sequence for loss computation tokenized_inputs["labels"] = labels return tokenized_inputs ### Truncate number of examples ### if training_args.do_train: if "train" not in datasets: raise ValueError("--do_train requires a train dataset") train_dataset = datasets["train"] if data_args.max_train_samples is not None: train_dataset = train_dataset.select( range(data_args.max_train_samples)) train_dataset = train_dataset.map( tokenize_fn, batched=True, ) if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_dataset = datasets["validation"] if data_args.max_val_samples is not None: eval_dataset = eval_dataset.select(range( data_args.max_val_samples)) eval_dataset = eval_dataset.map( tokenize_fn, batched=True, ) if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") test_dataset = datasets["test"] if data_args.max_test_samples is not None: test_dataset = test_dataset.select( range(data_args.max_test_samples)) test_dataset = test_dataset.map( tokenize_fn, batched=True, ) data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8) # Metrics metric = load_metric("seqeval") # Define metrics def compute_metrics(p): """Compute the evaluation metrics for POS tagging. This function will be called every `logging_steps` steps to log metrics, or whenever `eval_steps` triggers an evaluation. 
""" # transformers.trainer_utils.EvalPrediction predictions, labels = p # Shape of predictions (batch, seq_max_len, num_labels) predictions = np.argmax(predictions, axis=2) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_labels = [[ label_list[l] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] true_preds = list() true_label = list() for l in true_labels: for ele in l: true_label.append(ele) for l in true_predictions: for ele in l: true_preds.append(ele) ### DON't USE ### data_args.return_tag_level_metrics = False if data_args.return_tag_level_metrics: result = classification_report(true_label, true_preds, output_dict=False, labels=np.unique(true_preds)) else: precision, recall, f1, _ = precision_recall_fscore_support( true_label, true_preds, average='macro', zero_division=0) acc = accuracy_score(true_label, true_preds) result = { 'precision': precision, 'recall': recall, 'f1': f1, 'accuracy': acc, } return result ########## Origin ########## #results = metric.compute(predictions=true_predictions, references=true_labels) if data_args.return_tag_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } ########## Origin ########## ########## Train, evaluate and test with Trainer ########## # Initialize our Trainer trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, ) # Training last_checkpoint = None if training_args.do_train: if last_checkpoint is not None: checkpoint = last_checkpoint else: checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model("./") # Saves the tokenizer too for easy upload max_train_samples = (data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)) metrics["train_samples"] = min(max_train_samples, len(train_dataset)) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") results = trainer.evaluate() max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len( eval_dataset) metrics["eval_samples"] = min(max_val_samples, len(eval_dataset)) # Predict if training_args.do_predict: logger.info("*** Predict ***") predictions, labels, metrics = trainer.predict(test_dataset) predictions = np.argmax(predictions, axis=2) print("Test metrics", metrics) # Remove ignored index (special tokens) true_predictions = [[ label_list[p] for (p, l) in zip(prediction, label) if l != -100 ] for prediction, label in zip(predictions, labels)] # Save predictions with open("test_predictions.txt", "w") as writer: for prediction in true_predictions: writer.write(" ".join(prediction) + "\n") return results
def do_predict(test_data): parser = HfArgumentParser((ModelArguments, DataTrainingArguments)) model_args, data_args = parser.parse_args_into_dataclasses() training_args = TrainingArguments(output_dir="cosmetic_ner", do_predict=True) test_dict = {'tokens': [[token for token in line] for line in test_data]} datasets = Dataset.from_dict(test_dict) text_column_name = "tokens" label_list = ['O', 'B-COM', 'I-COM', 'B-EFF', 'I-EFF'] num_labels = 5 # Start loading the pretrained model and tokenizer # Load the model config config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, # ner cache_dir=model_args.cache_dir, # None ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=True, ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, from_tf=bool(".ckpt" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Data preprocessing # Maximum sequence length max_length = data_args.max_length # Padding strategy padding = "max_length" if data_args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): """ Called by datasets.map during processing Args: examples: two samples here, for example: examples = {dict: 5} 'chunk_tags' = {list: 2} [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]] 'id' = {list: 2} ['0', '1'] 'ner_tags' = {list: 2} [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]] 'pos_tags' = {list: 2} [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]] 'tokens' = {list: 2} [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']] Returns: {'input_ids': [[101, 4649, 2244, 102], [101, 6208, 4658, 2094, 3723, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 4, -100], [-100, 1, 2, 2, 2, -100]]} """ # Tokenize the tokens field (i.e. the text field) of each example tokenized_inputs = tokenizer( examples[text_column_name], padding=padding, max_length=max_length, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word) is_split_into_words=True, ) print(f"\nSamples: {examples}") print(f"tokenized_inputs for one batch: {tokenized_inputs}") return tokenized_inputs # Process the data with the map function tokenized_datasets = datasets.map( tokenize_and_align_labels, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=False, ) # Data collator, called when iterating over the dataloader data_collator = DataCollatorForTokenClassification(tokenizer) # Initialize our Trainer trainer = Trainer( model=model, args=training_args, tokenizer=tokenizer, data_collator=data_collator, ) # Evaluate the model results = {} # if training_args.do_predict: logger.info("*** Predict ***") test_dataset = tokenized_datasets predictions = trainer.predict(test_dataset) predictions = np.argmax(predictions.predictions, axis=2) # A nested list whose sublists are label names; drop the first and last elements (CLS, SEP) of each prediction true_predictions = [ [label_list[p] for p in prediction[1:-1]] for prediction in predictions ] # Cut each prediction back to the unpadded token length text_tokens = test_dict['tokens'] predicts = [prediction[:len(token)] for token, prediction in zip(text_tokens, true_predictions)] results = extract_words(text=test_dict['tokens'], predicts=predicts) return results
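# --- Tiny worked example of the post-processing in do_predict above: strip the
# [CLS]/[SEP] positions, then cut each prediction back to its unpadded token length.
# The fake argmax output and tokens below are illustrative.
label_list = ['O', 'B-COM', 'I-COM', 'B-EFF', 'I-EFF']
pred_ids = [[0, 1, 2, 0, 0, 0], [0, 3, 4, 0, 0, 0]]  # padded model output
tokens = [['a', 'b'], ['c', 'd', 'e']]

no_special = [row[1:-1] for row in pred_ids]                    # drop [CLS]/[SEP]
trimmed = [row[:len(t)] for row, t in zip(no_special, tokens)]  # drop padding
print([[label_list[i] for i in row] for row in trimmed])
# [['B-COM', 'I-COM'], ['B-EFF', 'I-EFF', 'O']]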
def main(): args = parse_args() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_ner_no_trainer", args) # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers # in the environment accelerator = ( Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator() ) # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() else: datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() # If passed along, set the training seed now. if args.seed is not None: set_seed(args.seed) # Handle the repository creation if accelerator.is_main_process: if args.push_to_hub: if args.hub_model_id is None: repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id repo = Repository(args.output_dir, clone_from=repo_name) with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: if "step_*" not in gitignore: gitignore.write("step_*\n") if "epoch_*" not in gitignore: gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) accelerator.wait_for_everyone() # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called # 'tokens' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantees that only one local process can concurrently # download the dataset. if args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name) else: data_files = {} if args.train_file is not None: data_files["train"] = args.train_file if args.validation_file is not None: data_files["validation"] = args.validation_file extension = args.train_file.split(".")[-1] raw_datasets = load_dataset(extension, data_files=data_files) # Trim a number of training examples if args.debug: for split in raw_datasets.keys(): raw_datasets[split] = raw_datasets[split].select(range(100)) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. 
if raw_datasets["train"] is not None: column_names = raw_datasets["train"].column_names features = raw_datasets["train"].features else: column_names = raw_datasets["validation"].column_names features = raw_datasets["validation"].features if args.text_column_name is not None: text_column_name = args.text_column_name elif "tokens" in column_names: text_column_name = "tokens" else: text_column_name = column_names[0] if args.label_column_name is not None: label_column_name = args.label_column_name elif f"{args.task_name}_tags" in column_names: label_column_name = f"{args.task_name}_tags" else: label_column_name = column_names[1] # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the # unique labels. def get_label_list(labels): unique_labels = set() for label in labels: unique_labels = unique_labels | set(label) label_list = list(unique_labels) label_list.sort() return label_list # If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere. # Otherwise, we have to get the list of labels manually. labels_are_int = isinstance(features[label_column_name].feature, ClassLabel) if labels_are_int: label_list = features[label_column_name].feature.names label_to_id = {i: i for i in range(len(label_list))} else: label_list = get_label_list(raw_datasets["train"][label_column_name]) label_to_id = {l: i for i, l in enumerate(label_list)} num_labels = len(label_list) # Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if args.config_name: config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels) elif args.model_name_or_path: config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels) else: config = CONFIG_MAPPING[args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path if not tokenizer_name_or_path: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) if config.model_type in {"gpt2", "roberta"}: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True) else: tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True) if args.model_name_or_path: model = AutoModelForTokenClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, ignore_mismatched_sizes=args.ignore_mismatched_sizes, ) else: logger.info("Training new model from scratch") model = AutoModelForTokenClassification.from_config(config) model.resize_token_embeddings(len(tokenizer)) # Model has labels -> use them. if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id: if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)): # Reorganize `label_list` to match the ordering of the model. 
if labels_are_int: label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)} label_list = [model.config.id2label[i] for i in range(num_labels)] else: label_list = [model.config.id2label[i] for i in range(num_labels)] label_to_id = {l: i for i, l in enumerate(label_list)} else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: " f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels:" f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.", ) # Set the correspondences label/ID inside the model config model.config.label2id = {l: i for i, l in enumerate(label_list)} model.config.id2label = {i: l for i, l in enumerate(label_list)} # Map that sends B-Xxx label to its I-Xxx counterpart b_to_i_label = [] for idx, label in enumerate(label_list): if label.startswith("B-") and label.replace("B-", "I-") in label_list: b_to_i_label.append(label_list.index(label.replace("B-", "I-"))) else: b_to_i_label.append(idx) # Preprocessing the datasets. # First we tokenize all the texts. padding = "max_length" if args.pad_to_max_length else False # Tokenize all texts and align the labels with them. def tokenize_and_align_labels(examples): tokenized_inputs = tokenizer( examples[text_column_name], max_length=args.max_length, padding=padding, truncation=True, # We use this argument because the texts in our dataset are lists of words (with a label for each word). is_split_into_words=True, ) labels = [] for i, label in enumerate(examples[label_column_name]): word_ids = tokenized_inputs.word_ids(batch_index=i) previous_word_idx = None label_ids = [] for word_idx in word_ids: # Special tokens have a word id that is None. We set the label to -100 so they are automatically # ignored in the loss function. if word_idx is None: label_ids.append(-100) # We set the label for the first token of each word. elif word_idx != previous_word_idx: label_ids.append(label_to_id[label[word_idx]]) # For the other tokens in a word, we set the label to either the current label or -100, depending on # the label_all_tokens flag. else: if args.label_all_tokens: label_ids.append(b_to_i_label[label_to_id[label[word_idx]]]) else: label_ids.append(-100) previous_word_idx = word_idx labels.append(label_ids) tokenized_inputs["labels"] = labels return tokenized_inputs with accelerator.main_process_first(): processed_raw_datasets = raw_datasets.map( tokenize_and_align_labels, batched=True, remove_columns=raw_datasets["train"].column_names, desc="Running tokenizer on dataset", ) train_dataset = processed_raw_datasets["train"] eval_dataset = processed_raw_datasets["validation"] # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # DataLoaders creation: if args.pad_to_max_length: # If padding was already done to max length, we use the default data collator that will just convert everything # to tensors. data_collator = default_data_collator else: # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). 
data_collator = DataCollatorForTokenClassification( tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None) ) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size ) eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size) # Optimizer # Split weights in two groups, one with weight decay and the other not. no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay, }, { "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0, }, ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Use the device given by the `accelerator` object. device = accelerator.device model.to(device) # Scheduler and math around the number of training steps. overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps, num_training_steps=args.max_train_steps, ) # Prepare everything with our `accelerator`. model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare( model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch # Afterwards we recalculate our number of training epochs args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps if args.checkpointing_steps.isdigit(): checkpointing_steps = int(args.checkpointing_steps) else: checkpointing_steps = None # We need to initialize the trackers we use, and also store our configuration. # We initialize the trackers only on main process because `accelerator.log` # only logs on main process and we don't want empty logs/runs on other processes. 
if args.with_tracking: if accelerator.is_main_process: experiment_config = vars(args) # TensorBoard cannot log Enums, need the raw value experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value accelerator.init_trackers("ner_no_trainer", experiment_config) # Metrics metric = load_metric("seqeval") def get_labels(predictions, references): # Transform predictions and references tensors to numpy arrays if device.type == "cpu": y_pred = predictions.detach().clone().numpy() y_true = references.detach().clone().numpy() else: y_pred = predictions.detach().cpu().clone().numpy() y_true = references.detach().cpu().clone().numpy() # Remove ignored index (special tokens) true_predictions = [ [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100] for pred, gold_label in zip(y_pred, y_true) ] true_labels = [ [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100] for pred, gold_label in zip(y_pred, y_true) ] return true_predictions, true_labels def compute_metrics(): results = metric.compute() if args.return_entity_level_metrics: # Unpack nested dictionaries final_results = {} for key, value in results.items(): if isinstance(value, dict): for n, v in value.items(): final_results[f"{key}_{n}"] = v else: final_results[key] = value return final_results else: return { "precision": results["overall_precision"], "recall": results["overall_recall"], "f1": results["overall_f1"], "accuracy": results["overall_accuracy"], } # Train! total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {args.per_device_train_batch_size}") logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") # Only show the progress bar once on each machine. 
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0

# Potentially load in the weights and states from a previous save
if args.resume_from_checkpoint:
    # Note: the condition below originally used `or`, which made the else branch unreachable.
    if args.resume_from_checkpoint is not None and args.resume_from_checkpoint != "":
        accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
        accelerator.load_state(args.resume_from_checkpoint)
        path = os.path.basename(args.resume_from_checkpoint)
    else:
        # Get the most recent checkpoint
        dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
        dirs.sort(key=os.path.getctime)
        path = dirs[-1]  # Sorts folders by date modified, most recent checkpoint is the last
    # Extract `epoch_{i}` or `step_{i}`
    training_difference = os.path.splitext(path)[0]

    if "epoch" in training_difference:
        starting_epoch = int(training_difference.replace("epoch_", "")) + 1
        resume_step = None
    else:
        resume_step = int(training_difference.replace("step_", ""))
        starting_epoch = resume_step // len(train_dataloader)
        resume_step -= starting_epoch * len(train_dataloader)

for epoch in range(starting_epoch, args.num_train_epochs):
    model.train()
    if args.with_tracking:
        total_loss = 0
    for step, batch in enumerate(train_dataloader):
        # We need to skip steps until we reach the resumed step
        if args.resume_from_checkpoint and epoch == starting_epoch:
            if resume_step is not None and step < resume_step:
                completed_steps += 1
                continue
        outputs = model(**batch)
        loss = outputs.loss
        # We keep track of the loss at each epoch
        if args.with_tracking:
            total_loss += loss.detach().float()
        loss = loss / args.gradient_accumulation_steps
        accelerator.backward(loss)
        if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

        if isinstance(checkpointing_steps, int):
            if completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps}"
                if args.output_dir is not None:
                    output_dir = os.path.join(args.output_dir, output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= args.max_train_steps:
            break

    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]
        if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
            predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
            labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
        predictions_gathered, labels_gathered = accelerator.gather((predictions, labels))
        # If we are in a multiprocess environment, the last batch has duplicates
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                predictions_gathered = predictions_gathered[: len(eval_dataloader.dataset) - samples_seen]
                labels_gathered = labels_gathered[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += labels_gathered.shape[0]
        preds, refs = get_labels(predictions_gathered, labels_gathered)
        # predictions and references are expected to be a nested list of labels, not label_ids
        metric.add_batch(
            predictions=preds,
            references=refs,
        )

    eval_metric = compute_metrics()
    accelerator.print(f"epoch {epoch}:", eval_metric)
    if args.with_tracking:
        accelerator.log(
            {
                "seqeval": eval_metric,
                "train_loss": total_loss.item() / len(train_dataloader),
                "epoch": epoch,
                "step": completed_steps,
            },
            step=completed_steps,
        )

    if args.push_to_hub and epoch < args.num_train_epochs - 1:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
            )

    if args.checkpointing_steps == "epoch":
        output_dir = f"epoch_{epoch}"
        if args.output_dir is not None:
            output_dir = os.path.join(args.output_dir, output_dir)
        accelerator.save_state(output_dir)

if args.output_dir is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
    )
    if accelerator.is_main_process:
        tokenizer.save_pretrained(args.output_dir)
        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump(
                {"eval_accuracy": eval_metric["accuracy"], "train_loss": total_loss.item() / len(train_dataloader)},
                f,
            )
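# --- Illustration (added, not part of the original script) ---
# A minimal, self-contained sketch of the -100 masking convention that
# `get_labels` above relies on: positions whose gold label is -100 (special
# tokens and, optionally, non-first sub-tokens) are dropped from both the
# predictions and the references before seqeval sees them. The toy label
# list and index sequences below are assumptions for the demo.
label_list_demo = ["O", "B-PER", "I-PER"]
y_pred_demo = [[0, 1, 2, 0]]
y_true_demo = [[-100, 1, 2, -100]]  # -100 marks the [CLS]/[SEP] positions
true_predictions_demo = [
    [label_list_demo[p] for (p, l) in zip(pred, gold) if l != -100]
    for pred, gold in zip(y_pred_demo, y_true_demo)
]
true_labels_demo = [
    [label_list_demo[l] for (p, l) in zip(pred, gold) if l != -100]
    for pred, gold in zip(y_pred_demo, y_true_demo)
]
print(true_predictions_demo)  # [['B-PER', 'I-PER']]
print(true_labels_demo)       # [['B-PER', 'I-PER']]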
# preparing datasets
datasets["train"] = datasets["train"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=args.preprocessing_num_workers,
    load_from_cache_file=not args.overwrite_cache,
)
datasets["validation"] = datasets["validation"].map(
    tokenize_and_align_labels,
    batched=True,
    num_proc=args.preprocessing_num_workers,
    load_from_cache_file=not args.overwrite_cache,
)

data_collator = DataCollatorForTokenClassification(tokenizer, padding=padding)

# Metrics
ner_metric = load_metric("seqeval")

def compute_metrics_ner(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    results = ner_metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
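# --- Illustration (added, not part of the original script) ---
# A hedged sketch of what the seqeval metric returns for a toy pair of tag
# sequences; the overall_* keys are the ones compute_metrics_ner reads. This
# assumes a datasets version that still ships `load_metric`.
from datasets import load_metric

demo_metric = load_metric("seqeval")
demo_results = demo_metric.compute(
    predictions=[["O", "B-PER", "I-PER", "O"]],
    references=[["O", "B-PER", "I-PER", "O"]],
)
print(demo_results["overall_precision"], demo_results["overall_f1"])  # 1.0 1.0
# seqeval also returns one nested dict per entity type, e.g. demo_results["PER"]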
def main():
    # region Argument Parsing
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_ner", model_args, data_args, framework="tensorflow")
    # endregion

    # region Setup logging
    # We only want one process per machine to log things on the screen.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)
    # endregion

    # region Loading datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for the token classification task available on the hub
    # at https://huggingface.co/datasets/ (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)
    # endregion

    # region Load config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, num_labels=num_labels)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True)
    # endregion

    # region Preprocessing the raw datasets
    # First we tokenize all the texts.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=data_args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    if data_args.max_train_samples is not None:
        max_train_samples = min(len(train_dataset), data_args.max_train_samples)
        train_dataset = train_dataset.select(range(max_train_samples))

    if data_args.max_eval_samples is not None:
        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
        eval_dataset = eval_dataset.select(range(max_eval_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Initialize model
        if model_args.model_name_or_path:
            model = TFAutoModelForTokenClassification.from_pretrained(
                model_args.model_name_or_path,
                config=config,
            )
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForTokenClassification.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Create TF datasets
        # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
        # well as inputs.
        collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset().
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset
        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            collate_fn=collate_fn,
            batch_size=total_train_batch_size,
            shuffle=True,
        ).with_options(dataset_options)
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            collate_fn=collate_fn,
            batch_size=total_eval_batch_size,
            shuffle=False,
        ).with_options(dataset_options)
        # endregion

        # region Optimizer, loss and compilation
        num_train_steps = int(len(tf_train_dataset) * training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps * training_args.warmup_ratio)
        else:
            num_warmup_steps = 0

        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
            adam_global_clipnorm=training_args.max_grad_norm,
        )

        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
        # endregion

        # Metrics
        metric = evaluate.load("seqeval")

        def get_labels(y_pred, y_true):
            # Remove ignored index (special tokens)
            true_predictions = [
                [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
                for pred, gold_label in zip(y_pred, y_true)
            ]
            true_labels = [
                [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
                for pred, gold_label in zip(y_pred, y_true)
            ]
            return true_predictions, true_labels

        def compute_metrics():
            results = metric.compute()
            if data_args.return_entity_level_metrics:
                # Unpack nested dictionaries
                final_results = {}
                for key, value in results.items():
                    if isinstance(value, dict):
                        for n, v in value.items():
                            final_results[f"{key}_{n}"] = v
                    else:
                        final_results[key] = value
                return final_results
            else:
                return {
                    "precision": results["overall_precision"],
                    "recall": results["overall_recall"],
                    "f1": results["overall_f1"],
                    "accuracy": results["overall_accuracy"],
                }

        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            if data_args.dataset_name is not None:
                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
            else:
                push_to_hub_model_id = f"{model_name}-finetuned-token-classification"

        model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "token-classification"}
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs["dataset_args"] = data_args.dataset_config_name
                model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
        logger.info(f"  Total train batch size = {total_train_batch_size}")
        # Only show the progress bar once on each machine.

        model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            callbacks=callbacks,
        )
        # endregion

        # region Predictions
        # If you have variable batch sizes (i.e. not using pad_to_max_length), then
        # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
        # length from predict().
        try:
            predictions = model.predict(
                tf_eval_dataset, batch_size=training_args.per_device_eval_batch_size
            )["logits"]
        except tf.python.framework.errors_impl.InvalidArgumentError:
            raise ValueError(
                "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
                "then you will need to use --pad_to_max_length to generate predictions, as older "
                "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
            )
        if isinstance(predictions, tf.RaggedTensor):
            predictions = predictions.to_tensor(default_value=-100)
        predictions = tf.math.argmax(predictions, axis=-1).numpy()
        if "label" in eval_dataset:
            labels = eval_dataset.with_format("tf")["label"]
        else:
            labels = eval_dataset.with_format("tf")["labels"]
        if isinstance(labels, tf.RaggedTensor):
            labels = labels.to_tensor(default_value=-100)
        labels = labels.numpy()
        attention_mask = eval_dataset.with_format("tf")["attention_mask"]
        if isinstance(attention_mask, tf.RaggedTensor):
            attention_mask = attention_mask.to_tensor(default_value=-100)
        attention_mask = attention_mask.numpy()
        labels[attention_mask == 0] = -100
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )
        eval_metric = compute_metrics()
        logger.info("Evaluation metrics:")
        for key, val in eval_metric.items():
            logger.info(f"{key}: {val:.4f}")

        if training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir, "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metric))
        # endregion

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
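# --- Illustration (added, not part of the original script) ---
# A small numpy sketch of the padding-mask trick used in the prediction
# region above: positions where attention_mask is 0 are padding, so their
# labels are overwritten with -100 and later dropped by get_labels. The toy
# arrays are assumptions for the demo.
import numpy as np

labels_demo = np.array([[0, 1, 2, 0], [0, 2, 0, 0]])
attention_mask_demo = np.array([[1, 1, 1, 0], [1, 1, 0, 0]])
labels_demo[attention_mask_demo == 0] = -100
print(labels_demo)
# [[   0    1    2 -100]
#  [   0    2 -100 -100]]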
def main(args):
    dataset = prepare_dataset(args)
    tokenizer = prepare_tokenizer(args)

    # data_collator = YD_DataCollatorForTokenClassification(tokenizer)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    if 'test' in dataset:
        column_names = dataset["test"].column_names
        features = dataset["test"].features
    else:
        raise ValueError('Evaluation must specify test_file!')

    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = ('ner_tags' if 'ner_tags' in column_names else column_names[1])

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        if 'test' in dataset:
            label_list = get_label_list(dataset["test"][label_column_name])
        else:
            raise ValueError('Evaluation must specify test_file!')
        label_to_id = {l: i for i, l in enumerate(label_list)}

    args.num_labels = len(label_list)

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples, label_all_tokens=False):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=False,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name], offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special characters will have an offset of (0, 0)
                # so the test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(current_label if label_all_tokens else -100)
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = dataset.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=8,
        load_from_cache_file=False,
    )

    test_dataset = tokenized_datasets['test']
    # **YD** core code to keep only useful parameters for the model
    test_dataset.set_format(type=test_dataset.format["type"], columns=_NER_COLUMNS)

    # **YD** dataloader
    data_loader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        sampler=None,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )

    model = prepare_model(args)
    model.cuda()

    true_predictions = []
    true_labels = []
    for batch in tqdm(data_loader):
        # Pop the labels out of the batch; this model's forward does not take them.
        labels, batch['labels'] = batch['labels'].tolist(), None
        batch = utils.move_to_cuda(batch)
        predictions = model(**batch)
        predictions = torch.argmax(predictions, axis=2).tolist()

        true_predictions.extend(
            [[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
             for prediction, label in zip(predictions, labels)])
        true_labels.extend(
            [[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
             for prediction, label in zip(predictions, labels)])

    # Drop sequences that became empty after filtering out ignored positions.
    true_predictions = [
        true_prediction for true_prediction in true_predictions if true_prediction != []
    ]
    true_labels = [
        true_label for true_label in true_labels if true_label != []
    ]

    print({
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    })
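# --- Illustration (added, not part of the original script) ---
# A pure-Python sketch of the offset-mapping rule used in
# tokenize_and_align_labels above: with is_split_into_words=True, an offset
# (0, n) with n > 0 starts a new word, (0, 0) marks a special token, and any
# other offset is a continuation sub-token. The toy offsets and labels are
# assumptions for the demo.
offsets_demo = [(0, 0), (0, 4), (4, 8), (0, 2), (0, 0)]  # [CLS] "Mary" "land" "is" [SEP]
word_labels_demo = [1, 0]  # e.g. 1 = B-LOC for "Maryland", 0 = O for "is"
label_index, current_label, label_ids = 0, -100, []
for start, end in offsets_demo:
    if start == 0 and end != 0:    # first sub-token of a word
        current_label = word_labels_demo[label_index]
        label_index += 1
        label_ids.append(current_label)
    elif start == 0 and end == 0:  # special token
        label_ids.append(-100)
    else:                          # continuation sub-token (label_all_tokens=False)
        label_ids.append(-100)
print(label_ids)  # [-100, 1, -100, 0, -100]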
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "words" if "words" in column_names else column_names[0]
    label_column_name = data_args.task_name if data_args.task_name in column_names else column_names[1]

    # Labeling (this part will be easier when https://github.com/huggingface/datasets/issues/797 is solved)
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    label_list = get_label_list(datasets["train"][label_column_name])
    label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name], offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special characters will have an offset of (0, 0)
                # so the test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(current_label if data_args.label_all_tokens else -100)
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        # Predict on the tokenized split; the raw split lacks input_ids/labels.
        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
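# --- Illustration (added, not part of the original scripts) ---
# A tiny sketch of what get_label_list does when labels arrive as plain
# strings rather than ClassLabel ints: build a sorted vocabulary of unique
# tags and derive label_to_id from the sorted order. The toy tags are
# assumptions for the demo.
demo_labels = [["O", "B-PER"], ["O", "B-LOC", "I-LOC"]]
unique = set()
for seq in demo_labels:
    unique |= set(seq)
demo_label_list = sorted(unique)
demo_label_to_id = {l: i for i, l in enumerate(demo_label_list)}
print(demo_label_list)   # ['B-LOC', 'B-PER', 'I-LOC', 'O']
print(demo_label_to_id)  # {'B-LOC': 0, 'B-PER': 1, 'I-LOC': 2, 'O': 3}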
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir)
            and training_args.do_train and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        # Build a ClassLabel feature from the unique tag strings in the training CSV.
        table = csv.read_csv("./data/train.csv", parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_), names=class_label_.tolist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)

        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, delimiter="\t", quoting=csv_lib.QUOTE_NONE)
        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(num_classes=len(class_label_), names=class_label_.tolist()))

        # Swap the backing Arrow tables for the preprocessed ones and declare the
        # new `ner_tags` feature. Note: assigning to the private `_data` attribute
        # bypasses the datasets API and is fragile across library versions.
        train_dataset.features['ner_tags'] = class_label
        train_dataset._data = train
        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test
        val_dataset.features['ner_tags'] = class_label
        val_dataset._data = val
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc)
    # at https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100,
                    # depending on the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx
                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            # Without the expected columns we cannot align labels; fail loudly
            # instead of silently returning None (the original printed a debug string here).
            raise ValueError(f"Unexpected number of columns in batch: {len(examples)}")

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")
        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
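# --- Illustration (added, not part of the original scripts) ---
# A pure-Python sketch of the word_ids-based alignment used in
# tokenize_and_align_labels above: None marks a special token (-100), the
# first sub-token of a word gets the word's label, and later sub-tokens get
# either the same label or -100 depending on label_all_tokens. The word_ids
# list below is a stand-in for real fast-tokenizer output.
word_ids_demo = [None, 0, 0, 1, None]  # [CLS] "Hu" "##ang" "lives" [SEP]
word_labels_demo = [1, 0]              # e.g. 1 = B-PER for "Huang", 0 = O for "lives"
label_all_tokens = False
previous_word_idx, label_ids = None, []
for word_idx in word_ids_demo:
    if word_idx is None:
        label_ids.append(-100)
    elif word_idx != previous_word_idx:
        label_ids.append(word_labels_demo[word_idx])
    else:
        label_ids.append(word_labels_demo[word_idx] if label_all_tokens else -100)
    previous_word_idx = word_idx
print(label_ids)  # [-100, 1, -100, 0, -100]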