Example no. 1
 def _setup_dataloader_from_config(self, cfg: DictConfig, data_split: str):
     start_time = perf_counter()
     logging.info(f'Creating {data_split} dataset')
     input_file = cfg.data_path
     tagger_data_augmentation = cfg.get('tagger_data_augmentation', False)
     dataset = TextNormalizationTaggerDataset(
         input_file=input_file,
         tokenizer=self._tokenizer,
         tokenizer_name=self.transformer_name,
         mode=self.mode,
         do_basic_tokenize=cfg.do_basic_tokenize,
         tagger_data_augmentation=tagger_data_augmentation,
         lang=self.lang,
         max_seq_length=self.max_sequence_len,
         use_cache=cfg.get('use_cache', False),
         max_insts=cfg.get('max_insts', -1),
     )
     data_collator = DataCollatorForTokenClassification(self._tokenizer)
     dl = torch.utils.data.DataLoader(dataset=dataset,
                                      batch_size=cfg.batch_size,
                                      shuffle=cfg.shuffle,
                                      collate_fn=data_collator)
     running_time = perf_counter() - start_time
     logging.info(f'Took {running_time} seconds')
     return dl
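
For reference, the dynamic padding performed by the collator can be seen on a toy batch. This is a self-contained sketch (the token ids and labels are made-up values for illustration; only the collator behavior itself comes from the snippets in this collection):

from transformers import AutoTokenizer, DataCollatorForTokenClassification

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
collator = DataCollatorForTokenClassification(tokenizer)
features = [
    {'input_ids': [101, 7592, 102], 'labels': [0, 1, 0]},
    {'input_ids': [101, 7592, 2088, 999, 102], 'labels': [0, 1, 2, 1, 0]},
]
batch = collator(features)
# input_ids are padded with tokenizer.pad_token_id and labels with -100,
# so both tensors end up with the length of the longest sample (here, 5).
print(batch['input_ids'].shape, batch['labels'].shape)
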
Example no. 2
    def __call__(self):
        if self.lang == "de":
            # dataset = load_dataset('../config/conll2003-de/', 'conll2003-de', cache_dir=self.spec.cache_dir)
            dataset = load_dataset(
                'D:/IntellijProjects/aikido2/aikido/kata/config/conll2003-de/',
                'conll2003-de',
                cache_dir=self.spec.cache_dir)
        elif self.lang == "en":
            dataset = load_dataset("conll2003", cache_dir=self.spec.cache_dir)
        else:
            raise ValueError(
                f"cannot handle language {self.lang} for conll2003")

        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=[
                                  "id", "tokens", "pos_tags", "chunk_tags",
                                  "ner_tags"
                              ])

        data_collator = DataCollatorForTokenClassification(self.tokenizer)

        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))
Example no. 3
    def __init__(self, pretrained, prepared_dir, classifier_dir):
        """
        pretrained is None means disable classifier
        """
        self.pretrained = pretrained
        self.classifier_dir = classifier_dir
        self.prepared_dir = prepared_dir
        self.datasets = DatasetDict({
            'train':
            read_dataset_from_csv(prepared_dir + '/train.csv'),
            'test':
            read_dataset_from_csv(prepared_dir + '/test.csv'),
            'validation':
            read_dataset_from_csv(prepared_dir + '/validation.csv')
        })
        self.metric = load_metric("seqeval")
        self.label_list = self.datasets["train"].features["tag"].feature.names
        check_folder(self.classifier_dir)

        if pretrained:
            self.model = AutoModelForTokenClassification.from_pretrained(
                self.pretrained, num_labels=len(self.label_list))
            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
            self.data_collator = DataCollatorForTokenClassification(
                self.tokenizer)
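
A hypothetical continuation of this class could hand the attributes initialized above straight to a Trainer. The method below is a sketch (its name and the training_args parameter are assumptions; Trainer is the transformers class used throughout this collection):

    def train(self, training_args):
        # All attributes below are set in __init__ above.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.datasets["train"],
            eval_dataset=self.datasets["validation"],
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
        )
        trainer.train()
        trainer.save_model(self.classifier_dir)
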
Example no. 4
def get_trainer(
    train_data, eval_data, model=None, tokenizer=None, tr_args=None):
    '''model = transformer model instance
    tokenizer = transformer tokenizer instance
    tr_args = Training arguments
    '''
    device = 'cuda' if config.use_gpu else 'cpu'
    tokenizer = tokenizer or get_tokenizer(config.model_name)
    model = model or get_model(
        config.model_name, num_labels=len(config.ner_tags), device=device)
    training_args = config.training_args
    training_args.update(tr_args or {})

    args = TrainingArguments(
        'train-ner', **training_args
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)

    return Trainer(
        model,
        args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
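
A typical call site for get_trainer might look like the following sketch (train_ds and eval_ds are assumed to be already-tokenized datasets; the override dict is illustrative):

trainer = get_trainer(train_ds, eval_ds, tr_args={'num_train_epochs': 3})
trainer.train()
metrics = trainer.evaluate()
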
Example no. 5
    def test_data_collator_for_token_classification(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [
            {
                "input_ids": [0, 1, 2],
                "labels": [0, 1, 2]
            },
            {
                "input_ids": [0, 1, 2, 3, 4, 5],
                "labels": [0, 1, 2, 3, 4, 5]
            },
        ]

        data_collator = DataCollatorForTokenClassification(tokenizer,
                                                           return_tensors="tf")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
        self.assertEqual(batch["input_ids"][0].numpy().tolist(),
                         [0, 1, 2] + [tokenizer.pad_token_id] * 3)
        self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
        self.assertEqual(batch["labels"][0].numpy().tolist(),
                         [0, 1, 2] + [-100] * 3)

        data_collator = DataCollatorForTokenClassification(
            tokenizer,
            padding="max_length",
            max_length=10,
            return_tensors="tf")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 10])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 10])

        data_collator = DataCollatorForTokenClassification(
            tokenizer, pad_to_multiple_of=8, return_tensors="tf")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 8])

        data_collator = DataCollatorForTokenClassification(
            tokenizer, label_pad_token_id=-1, return_tensors="tf")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 6])
        self.assertEqual(batch["input_ids"][0].numpy().tolist(),
                         [0, 1, 2] + [tokenizer.pad_token_id] * 3)
        self.assertEqual(batch["labels"].shape.as_list(), [2, 6])
        self.assertEqual(batch["labels"][0].numpy().tolist(),
                         [0, 1, 2] + [-1] * 3)
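
The same padding behavior holds for the default PyTorch tensors; a minimal sketch reusing tokenizer and features from the test above (return_tensors="pt" is the library default, so it is simply omitted):

        data_collator = DataCollatorForTokenClassification(tokenizer)
        batch = data_collator(features)
        # batch["input_ids"] and batch["labels"] are torch tensors of
        # shape [2, 6], with labels padded by -100 as in the TF case.
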
Example no. 6
 def loaders(self):
     if self._loaders is None:
         ps, mgr = self.params, self.mgr
         if ps.pad_to_max_length:
             c = default_data_collator
         else:
             c = DataCollatorForTokenClassification(
                 self.tokenizer, pad_to_multiple_of=(8 if mgr.use_fp16 else None)
             )
         t = DataLoader(
             self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size
         )
         e = DataLoader(self.eval_ds, collate_fn=c, batch_size=ps.eval_batch_size)
         self._loaders = {TRAIN: t, EVAL: e}
     return self._loaders
Example no. 7
    def __call__(self):
        dataset = load_dataset("germeval_14", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.tokenize_and_align_labels,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=[
                                  'id', 'ner_tags', 'nested_ner_tags',
                                  'source', 'tokens'
                              ])

        data_collator = DataCollatorForTokenClassification(self.tokenizer)

        return (DatasetKata(self.spec, encoded["train"], data_collator),
                DatasetKata(self.spec, encoded["validation"], data_collator),
                DatasetKata(self.spec, encoded["test"], data_collator))
Example no. 8
 def __init__(self, samples, tok, max_length=128, use_token_type_ids=False):
     super().__init__()
     self.tok = tok
     self.data_collator = DataCollatorForTokenClassification(
         tok, max_length=max_length, padding='max_length')
     raw_samples = [s.tokenization_result for s in samples]
     for s in raw_samples:
         while len(s['confidences']) < max_length:
             s['confidences'].append(0)
     self.confidences = torch.FloatTensor(
         [s['confidences'] for s in raw_samples])
     print('CONFIDENCES INFO', self.confidences.sum(),
           (self.confidences != 0).float().sum() /
           self.confidences.nelement(), self.confidences.shape)
     self.samples = deepcopy(raw_samples)
     for s in self.samples:
         del s['confidences']
     self.collated = self.data_collator(self.samples)
     self.use_token_type_ids = use_token_type_ids
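
Since the constructor above collates the entire dataset up front, a __getitem__ for such a dataset might simply index into the pre-collated tensors. A sketch under that assumption (not part of the original class):

 def __getitem__(self, idx):
     # Every field in self.collated was padded to max_length in __init__.
     item = {k: v[idx] for k, v in self.collated.items()}
     if not self.use_token_type_ids:
         item.pop('token_type_ids', None)
     item['confidences'] = self.confidences[idx]
     return item
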
Example no. 9
def main(train_json_path, val_json_path, model_name_or_dir, output_dir,
         logging_dir, logging_steps, batch_size, gradient_accumulation_steps,
         learning_rate, num_train_epochs, warmup_ratio, num_classes):
    tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name_or_dir)
    model = XLMRobertaForTokenClassification.from_pretrained(
        model_name_or_dir, num_labels=num_classes)

    sh_ner_train_dataset, sh_ner_val_dataset = create_sh_ner_dataset(
        train_json_path, val_json_path, tokenizer)

    data_collator = DataCollatorForTokenClassification(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=False,
        evaluation_strategy='epoch',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=learning_rate,
        num_train_epochs=num_train_epochs,
        warmup_ratio=warmup_ratio,
        logging_dir=logging_dir,
        logging_strategy='steps',
        logging_steps=logging_steps,
        save_strategy='epoch',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=sh_ner_train_dataset,
        eval_dataset=sh_ner_val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
Example no. 10
 def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str):
     start_time = perf_counter()
     logging.info(f'Creating {mode} dataset')
     input_file = cfg.data_path
     dataset = TextNormalizationTaggerDataset(
         input_file,
         self._tokenizer,
         cfg.mode,
         cfg.get('do_basic_tokenize', False),
         cfg.get('tagger_data_augmentation', False),
     )
     data_collator = DataCollatorForTokenClassification(self._tokenizer)
     dl = torch.utils.data.DataLoader(
         dataset=dataset,
         batch_size=cfg.batch_size,
         shuffle=cfg.shuffle,
         collate_fn=data_collator,
     )
     running_time = perf_counter() - start_time
     logging.info(f'Took {running_time} seconds')
     return dl
Example no. 11
 def __init__(self, model_path):
     config = AutoConfig.from_pretrained(model_path)
     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
     model = AutoModelForTokenClassification.from_pretrained(
         model_path,
         from_tf=False,
         config=config,
     )
     data_collator = DataCollatorForTokenClassification(
         tokenizer,
         pad_to_multiple_of=None,
     )
     self.trainer = Trainer(
         model=model,
         args=None,
         train_dataset=None,
         eval_dataset=None,
         tokenizer=tokenizer,
         data_collator=data_collator,
         compute_metrics=None,
     )
     self.device = torch.device(
         'cuda:0' if torch.cuda.is_available() else 'cpu')
     self.spacy_object = spacy.load('xx_sent_ud_sm')
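
The Trainer here is constructed with no training arguments or datasets, which suggests it is used purely for inference. A hypothetical prediction helper (the method name and the use of datasets.Dataset.from_dict are assumptions) could look like:

 def predict(self, texts):
     from datasets import Dataset
     # Sentence-split with the spaCy model loaded above, tokenize, and
     # run the prediction loop of the wrapped Trainer.
     sents = [s.text for doc in self.spacy_object.pipe(texts) for s in doc.sents]
     ds = Dataset.from_dict(self.trainer.tokenizer(sents, truncation=True))
     return self.trainer.predict(ds)
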
Example no. 12
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if args.text_column_name is not None:
        text_column_name = args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if args.label_column_name is not None:
        label_column_name = args.label_column_name
    elif f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name,
                                            num_labels=num_labels)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True,
                                                  add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True)

    if args.model_name_or_path:
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForTokenClassification.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the raw_datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )

        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label_to_id[label[word_idx]] if args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
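
    # Illustration (values assumed, not computed by this script): with
    # label_all_tokens False, a word split into several subwords keeps its
    # tag only on the first piece:
    #   words:      ["EU", "rejects", "German"]   word labels: [3, 0, 7]
    #   subwords:   [CLS]  EU   rejects  Ger   ##man   [SEP]
    #   label_ids:  -100   3    0        7     -100    -100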

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to a multiple
        # of 8, which enables the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForTokenClassification(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
    # be shorter in a multi-process setup)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("seqeval")

    def get_labels(predictions, references):
        # Transform the prediction and reference tensors to numpy arrays
        if device.type == "cpu":
            y_pred = predictions.detach().clone().numpy()
            y_true = references.detach().clone().numpy()
        else:
            y_pred = predictions.detach().cpu().clone().numpy()
            y_true = references.detach().cpu().clone().numpy()

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(pred, gold_label) if l != -100
        ] for pred, gold_label in zip(y_pred, y_true)]
        true_labels = [[
            label_list[l] for (p, l) in zip(pred, gold_label) if l != -100
        ] for pred, gold_label in zip(y_pred, y_true)]
        return true_predictions, true_labels

    def compute_metrics():
        results = metric.compute()
        if args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                predictions = accelerator.pad_across_processes(predictions,
                                                               dim=1,
                                                               pad_index=-100)
                labels = accelerator.pad_across_processes(labels,
                                                          dim=1,
                                                          pad_index=-100)

            predictions_gathered = accelerator.gather(predictions)
            labels_gathered = accelerator.gather(labels)
            preds, refs = get_labels(predictions_gathered, labels_gathered)
            metric.add_batch(
                predictions=preds,
                references=refs,
            )  # predictions and references are expected to be a nested list of labels, not label_ids

        # eval_metric = metric.compute()
        eval_metric = compute_metrics()
        accelerator.print(f"epoch {epoch}:", eval_metric)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
Example no. 13
def train(no_cache: bool, dataset_path: str, data_config_name: str,
          training_args: TrainingArguments, tokenizer: RobertaTokenizerFast):
    print(f"tokenizer vocab size: {tokenizer.vocab_size}")

    print(f"\nLoading and tokenizing datasets found in {dataset_path}.")
    train_dataset, eval_dataset, test_dataset = load_dataset(
        'EMBO/sd-nlp',
        # './tokcl/loader.py',
        data_config_name,
        script_version="main",
        # data_dir=dataset_path,
        split=["train", "validation", "test"],
        # download_mode=GenerateMode.FORCE_REDOWNLOAD if no_cache else GenerateMode.REUSE_DATASET_IF_EXISTS,
        cache_dir=CACHE)
    print(f"\nTraining with {len(train_dataset)} examples.")
    print(f"Evaluating on {len(eval_dataset)} examples.")

    if data_config_name in ["NER", "ROLES"]:
        # use our fancy data collator that randomly masks some of the inputs to enforce context learning
        training_args.remove_unused_columns = False  # we need tag_mask
        data_collator = DataCollatorForMaskedTokenClassification(
            tokenizer=tokenizer,
            max_length=config.max_length,
            masking_probability=training_args.masking_probability,
            replacement_probability=training_args.replacement_probability,
            select_labels=training_args.select_labels)
    else:
        # normal token classification
        data_collator = DataCollatorForTokenClassification(
            tokenizer=tokenizer, max_length=config.max_length)

    num_labels = train_dataset.info.features['labels'].feature.num_classes
    label_list = train_dataset.info.features['labels'].feature.names
    print(f"\nTraining on {num_labels} features:")
    print(", ".join(label_list))

    compute_metrics = MetricsComputer(label_list=label_list)

    model = RobertaForTokenClassification.from_pretrained(
        LM_MODEL_PATH,
        num_labels=num_labels,
        max_position_embeddings=config.max_length + 2)

    print("\nTraining arguments:")
    print(training_args)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=eval_dataset,
                      compute_metrics=compute_metrics,
                      callbacks=[ShowExample(tokenizer)])

    print(f"CUDA available: {torch.cuda.is_available()}")

    trainer.train()
    trainer.save_model(training_args.output_dir)

    print(f"Testing on {len(test_dataset)}.")
    pred: NamedTuple = trainer.predict(test_dataset, metric_key_prefix='test')
    print(f"{pred.metrics}")
Example no. 14
 def collate_fn(self) -> Optional[Callable]:
     return DataCollatorForTokenClassification(tokenizer=self.tokenizer)
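
A collate_fn exposed this way is typically handed straight to a PyTorch DataLoader. A minimal sketch (module is assumed to be an instance of the class defining collate_fn above, and train_ds a tokenized dataset):

from torch.utils.data import DataLoader

loader = DataLoader(train_ds, batch_size=16, collate_fn=module.collate_fn())
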
Example no. 15
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(
        logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    config = PerceiverConfig(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        d_latents=model_args.d_latents,
        d_model=model_args.d_model,
        max_position_embeddings=model_args.max_position_embeddings,
    )

    tokenizer = PerceiverTokenizer(
        cache_dir=model_args.cache_dir,
        model_max_length=model_args.model_max_length)

    model = PerceiverForTokenClassification(config=config)
    model.main_input_name = "input_ids"

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            # return_offsets_mapping=True,
        )
        labels = []
        """
        There is no PerceiverTokenizerFast, follow code works for conll2003 datasets.
        words: ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
        words_labels: [3, 0, 7, 0, 0, 0, 7, 0, 0] 
        tokens_labels: [-100, 3, 3, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, -100]
        """
        for i in range(len(tokenized_inputs["input_ids"])):
            # subtract 2 for the special tokens [BOS] and [SEP]
            id_length = len(tokenized_inputs["input_ids"][i]) - 2
            char_length = len("".join(examples[text_column_name][i]))
            assert id_length == char_length

            label_ids = [-100]

            words = examples[text_column_name][i]
            words_labels = examples[label_column_name][i]
            for w, w_label in zip(words, words_labels):
                current_label = label_to_id[w_label]
                label_ids.extend([current_label] * len(w))

            label_ids.append(-100)
            assert len(label_ids) == id_length + 2

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(
                range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None else
                           len(eval_dataset))
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")
Example no. 16
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    # logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
    logger.setLevel(logging.INFO)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    # if training_args.should_log:
    #     transformers.utils.logging.set_verbosity_info()
    #     transformers.utils.logging.enable_default_handler()
    #     transformers.utils.logging.enable_explicit_format()
    transformers.utils.logging.set_verbosity_info()
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        label2id=label_to_id,
        id2label={i: l for l, i in label_to_id.items()},
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
            "requirement"
        )

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]] if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
        eval_dataset = eval_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
        predict_dataset = predict_dataset.map(
            tokenize_and_align_labels,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        # Resume from the last detected checkpoint, if any.
        checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(predict_dataset, metric_key_prefix="predict")
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        # Save predictions
        output_predictions_file = os.path.join(training_args.output_dir, "predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    if training_args.push_to_hub:
        kwargs = {"finetuned_from": model_args.model_name_or_path, "tags": "token-classification"}
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)


# tokenize_and_align_labels(datasets['train'][:5])
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint, num_labels=len(label_list))

# Optional: freeze everything except the classification head
# for name, param in model.named_parameters():
#   if 'classifier' not in name:  # keep the classifier layer trainable
#     param.requires_grad = False


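# The collator dynamically pads inputs (and labels, with -100) to the longest
# sequence in each batch.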
data_collator = DataCollatorForTokenClassification(tokenizer)

metric = load_metric('seqeval')
# labels = [label_list[i] for i in example[f'{task}_tags']]
# metric.compute(predictions=[labels], references=[labels])


def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=2)
  true_predictions = [[
      label_list[p] for (p, l) in zip(prediction, label) if l != -100
  ] for prediction, label in zip(predictions, labels)]
  true_labels = [[
      label_list[l] for (p, l) in zip(prediction, label) if l != -100
  ] for prediction, label in zip(predictions, labels)]
  results = metric.compute(predictions=true_predictions, references=true_labels)
  return {
      'precision': results['overall_precision'],
      'recall': results['overall_recall'],
      'f1': results['overall_f1'],
      'accuracy': results['overall_accuracy'],
  }
Esempio n. 18
0
def main():
    datasets = load_dataset("conll2003")
    label_list = datasets["train"].features[f"{task}_tags"].feature.names
    example = datasets["train"][4]

    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(examples["tokens"],
                                     truncation=True,
                                     is_split_into_words=True)

        labels = []
        for i, label in enumerate(examples[f"{task}_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        label[word_idx] if label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)

        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    # pipeline
    # examples = datasets["train"][:5]
    # tokenized_inputs = tokenizer(
    #     examples["tokens"], truncation=True, is_split_into_words=True
    # )

    tokenize_and_align_labels(datasets["train"][:5])
    tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True)

    model = AutoModelForTokenClassification.from_pretrained(
        model_checkpoint, num_labels=len(label_list))

    model_name = model_checkpoint.split("/")[-1]
    args = TrainingArguments(
        f"{model_name}-finetuned-{task}",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=3,
        weight_decay=0.01,
        push_to_hub=True,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer)
    metric = load_metric("seqeval")
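    # Sanity check: scoring the gold labels against themselves should yield perfect metrics.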
    labels = [label_list[i] for i in example[f"{task}_tags"]]
    metric.compute(predictions=[labels], references=[labels])

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }

    trainer = Trainer(
        model,
        args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    # trainer.evaluate()

    predictions, labels, _ = trainer.predict(tokenized_datasets["validation"])
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [[
        label_list[p] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(predictions, labels)]
    true_labels = [[
        label_list[l] for (p, l) in zip(prediction, label) if l != -100
    ] for prediction, label in zip(predictions, labels)]

    results = metric.compute(predictions=true_predictions,
                             references=true_labels)
    return results
Esempio n. 19
0
def main():

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If only one argument is passed to the script and it is the path to a json file, parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )
    # Check the output directory
    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Auto-download the dataset
    if data_args.dataset_name is None:
        print("The dataset needs a name, and train/test paths must be specified, before it can be loaded")
    # Load the dataset
    data_files = {}
    if data_args.train_file is not None:
        data_files["train"] = data_args.train_file
    if data_args.validation_file is not None:
        data_files["validation"] = data_args.validation_file
    if data_args.test_file is not None:
        data_files["test"] = data_args.test_file
    datasets = load_dataset(path=data_args.script_file,
                            name=data_args.dataset_name,
                            data_files=data_files)

    # Using the data
    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    # Which column to use as text; column_names: ['id', 'ner_tags', 'tokens']
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    # Which column to use as labels; here it is 'ner_tags'
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # If the labels are not a `Sequence[ClassLabel]`, we need to iterate over the dataset to get the unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    # Check whether the label feature is a ClassLabel type, i.e. already predefined
    if isinstance(features[label_column_name].feature, ClassLabel):
        # label_list: ['O', 'B-COM', 'I-COM', 'B-EFF', 'I-EFF']
        label_list = features[label_column_name].feature.names
        # Mapping to id form (the labels are already ints, so this is the identity)
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load the pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # Load the model config
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,  #ner
        cache_dir=model_args.cache_dir,  #None
    )
    if model_args.model_name_or_path == "albert_model":
        # Our albert model uses Bert's tokenizer
        tokenizer = BertTokenizerFast.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=True,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=True,
        )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Tokenizer check: this only works with a fast tokenizer
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Data preprocessing
    # Maximum sequence length
    max_length = data_args.max_length
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        """
        datasets.map函数处理时会调用
        Args:
            examples: 这里是2条样本,
        例如: examples = {dict: 5}
 'chunk_tags' = {list: 2} [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]]
 'id' = {list: 2} ['0', '1']
 'ner_tags' = {list: 2} [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]
 'pos_tags' = {list: 2} [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]]
 'tokens' = {list: 2} [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
        Returns:
        {'input_ids': [[101, 4649, 2244, 102], [101, 6208, 4658, 2094, 3723, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 4, -100], [-100, 1, 2, 2, 2, -100]]}
        """
        # 对单条样本的examples的tokens字段,即文本字段进行tokenizer
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            max_length=max_length,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word)
            is_split_into_words=True,
        )
        labels = []
        for i, label in enumerate(examples[label_column_name]):
            # Process one sample at a time; label is a list like [3, 0, 7, 0, 0, 0, 7, 0, 0]
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            # word_ids: [None, 0, 1, 2, 3, 4, 5, 6, 7, 8, None]
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id of None. We set the label to -100 so it is automatically ignored in the loss function
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to the current label or -100, depending on the label_all_tokens flag.
                # Non-initial sub-tokens get -100 by default
                else:
                    label_ids.append(label_to_id[label[word_idx]] if data_args.
                                     label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        # The final labels is one list per sample, e.g.:
        # {list: 11}[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]
        # {list: 4}[-100, 1, 2, -100]
        tokenized_inputs["labels"] = labels
        # print(f"tokenized_inputs for one batch: {tokenized_inputs}")
        return tokenized_inputs

    # Preprocess the data with the map function
    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator, invoked while iterating over the dataloader
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Compute metrics
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }

    # Initialize our Trainer. MSRA has no validation set, so the test set is used here; ideally a split should be carved out of train.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        # model_path here controls whether to resume/continue training from a checkpoint
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluate the model
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Prediction
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save the prediction results
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
Esempio n. 20
0
def train_model(dict_args):
    # parse args dict
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_dict(dict_args)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data Arguments %s", data_args)
    logger.info("Model Arguments %s", model_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # load datasets
    assert os.path.exists(data_args.train_file) and os.path.exists(
        data_args.validation_file) and os.path.exists(data_args.test_file)
    datasets = load_dataset("csv",
                            data_files={
                                "train": data_args.train_file,
                                "validation": data_args.validation_file,
                                "test": data_args.test_file
                            },
                            delimiter="\t",
                            cache_dir=model_args.cache_dir)
    logger.info("Datasets %s", datasets)
    logger.info("Column names %s", datasets["train"].column_names)
    logger.info("Sample example %s", datasets["train"][0])

    # get label information
    text_column_name = "text"
    label_column_name = "labels"
    bbox_column_name = "bbox"
    num_labels, label_to_id, id_to_label = get_label_info(
        datasets["train"][label_column_name], data_args.task_name)
    logger.info("num_labels %s", num_labels)
    logger.info("label_to_id %s", label_to_id)
    logger.info("id_to_label %s", id_to_label)

    # Load config, tokenizer and pre-trained model
    # For Distributed training: The .from_pretrained methods guarantee that only
    # one local process can concurrently download model & vocab.
    if data_args.task_name == "regression":
        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer : May adapt the attention_window=512 (default) in config
        )
    else:
        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            id2label=id_to_label,
            label2id=label_to_id,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir
            # for longformer : May adapt the attention_window=512 (default) in config
        )

    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        add_prefix_space=True  # for roberta tokenizer
    )

    if data_args.task_name == "ner":
        model = AutoModelForTokenClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir)

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Pre-process the datasets (tokenize words and align labels/bboxes if needed)
    padding = "max_length" if data_args.pad_to_max_length else False
    use_bbox = data_args.use_bbox
    tokenized_datasets = datasets.map(
        lambda x: preprocess_dataset(x, tokenizer, label_to_id, data_args.
                                     label_all_tokens, padding, use_bbox,
                                     data_args.task_name),
        remove_columns=[label_column_name],
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    logger.info("Tokenized datasets %s", tokenized_datasets)
    logger.info("Column names %s", tokenized_datasets["train"].column_names)
    logger.info("Sample example %s", tokenized_datasets["train"][0])

    # Data collator. Used to pad the inputs of a single batch to the max size of this batch
    # Not needed if padding has already been done (if pad_to_max_length is true): default_data_collator
    # This does not work with bboxes. Hence pad_to_max_length is always True when using bboxes
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        if data_args.task_name == "ner":
            data_collator = DataCollatorForTokenClassification(tokenizer)
        else:
            data_collator = None  # will default to DataCollatorWithPadding
    logger.info("Data Collator used %s", data_collator)

    # Initialize our Trainer
    if data_args.task_name != "multilabel-classif":
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"]
            if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args
                                                      .task_name),
        )
    else:
        trainer = MultilabelClassificationTrainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"]
            if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=lambda x: compute_metrics(x, id_to_label, data_args
                                                      .task_name),
        )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        # we save the final model (last or best) to the sagemaker output folder
        trainer.save_model(output_dir=data_args.sagemaker_output_path
                           )  # It saves the tokenizer too for easy upload
        output_train_file = os.path.join(data_args.sagemaker_output_path,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(data_args.sagemaker_output_path,
                             "trainer_state.json"))
            # We also save the model_args and data_args for future use (training_args are already saved)
            torch.save(
                asdict(model_args),
                os.path.join(data_args.sagemaker_output_path,
                             "model_args.bin"))
            torch.save(
                asdict(data_args),
                os.path.join(data_args.sagemaker_output_path, "data_args.bin"))

    # Evaluation (this evaluates the final/best model on the dev set and writes results)
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate best/final model on dev set ***")
        results = trainer.evaluate()
        output_eval_file = os.path.join(data_args.sagemaker_output_path,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Evaluate and Predict on test set
    if training_args.do_predict:
        logger.info("*** Predict on test set ***")
        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(
            test_dataset, metric_key_prefix="test")
        if data_args.task_name == "classif":
            true_predictions = [
                id_to_label[p] for p in np.argmax(predictions, axis=1)
            ]
        elif data_args.task_name == "multilabel-classif":
            predictions = 1 / (1 + np.exp(-predictions))  # sigmoid
            predictions = (predictions > 0.5)  # threshold
            true_predictions = [[id_to_label[i] for i in np.where(p == 1)[0]]
                                for p in predictions]
        elif data_args.task_name == "regression":
            true_predictions = np.squeeze(predictions)
        elif data_args.task_name == "ner":
            predictions = np.argmax(predictions, axis=2)
            true_predictions = [[
                id_to_label[p] for (p, l) in zip(prediction, label)
                if l != -100
            ] for prediction, label in zip(predictions, labels)]
        output_test_results_file = os.path.join(
            data_args.sagemaker_output_path, "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")
        output_test_predictions_file = os.path.join(
            data_args.sagemaker_output_path, "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    if data_args.task_name == "ner":
                        writer.write(" ".join(prediction) + "\n")
                    else:
                        writer.write(str(prediction) + "\n")
    return results
Esempio n. 21
0
def train(args):
    dataset = load_dataset('ManyTypes4TypeScript.py', ignore_verifications=True)
    accelerator = Accelerator()
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, add_prefix_space=True, use_fast=True)

    def tokenize_and_align_labels(examples):
        def divide_chunks(l1, l2, n):
            for i in range(0, len(l1), n):
                yield {'input_ids': [0] + l1[i:i + n] + [2], 'labels': [-100] + l2[i:i + n] + [-100]}

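        # 510 tokens per chunk leaves room for the two special ids added above
        # (0 and 2, i.e. <s> and </s> in a RoBERTa-style vocabulary).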
        window_size = 510
        tokenized_inputs = tokenizer(examples['tokens'], is_split_into_words=True, truncation=False,
                                     add_special_tokens=False)
        inputs_ = {'input_ids': [], 'labels': []}
        for encoding, label in zip(tokenized_inputs.encodings, examples['labels']):
            word_ids = encoding.word_ids  # Map tokens to their respective word.
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:  # Set the special tokens to -100.
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                    l = label[word_idx] if label[word_idx] is not None else -100
                    label_ids.append(l)
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx

            s_labels = set(label_ids)
            if len(s_labels) == 1 and list(s_labels)[0] == -100:
                continue
            for e in divide_chunks(encoding.ids, label_ids, window_size):
                for k, v in e.items():
                    inputs_[k].append(v)
        return inputs_

    tokenized_hf = dataset.map(tokenize_and_align_labels, batched=True, remove_columns=['id', 'tokens', 'labels'])
    label_list = tokenized_hf["train"].features["labels"].feature.names

    model = AutoModelForTokenClassification.from_pretrained(args.model_name, num_labels=len(label_list))

    train_dataset = tokenized_hf["train"]
    eval_dataset = tokenized_hf["test"]
    valid_dataset = tokenized_hf["validation"]
    logger = logging.getLogger(__name__)

    train_batch_size = args.train_batch_size
    eval_batch_size = args.eval_batch_size
    gradient_accumulation_steps = args.gradient_accumulation_steps
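    # Chunks were capped at 510 tokens + 2 special ids, so padding everything to a
    # fixed length of 512 yields uniform batches.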
    data_collator = DataCollatorForTokenClassification(
        tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None), padding='max_length', max_length=512
    )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=eval_batch_size)

    valid_dataloader = DataLoader(valid_dataset, collate_fn=data_collator, batch_size=eval_batch_size)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    print("Device: {0}".format(device))
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, valid_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, valid_dataloader
    )

    lr_scheduler = get_scheduler(
        name='constant',  # constant because streaming dataset
        optimizer=optimizer,
        # num_warmup_steps=args.warmup_steps,
        # num_training_steps=None if args.max_steps < 0. else args.max_steps,
    )

    # Metrics - more detailed than overall accuracy in evaluator.py
    warnings.filterwarnings('ignore')
    metric = load_metric("seqeval")
    metric_unk = load_metric("seqeval")
    metric_top100 = load_metric("seqeval")
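    # Three separate seqeval instances: overall scores, scores with unknown types,
    # and scores restricted to the 100 most frequent types (inferred from the
    # get_labels flags used below).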

    train_total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
    eval_total_batch_size = eval_batch_size * accelerator.num_processes
    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {train_total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {gradient_accumulation_steps}")

    # Only show the progress bar once on each machine.
    progress_bar_train = tqdm(range(len(train_dataset) // train_total_batch_size),
                              disable=not accelerator.is_local_main_process)
    progress_bar_eval = tqdm(range(len(eval_dataset) // eval_total_batch_size),
                             disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):

        if args.do_train:
            model.train()
            for step, batch in enumerate(train_dataloader):
                outputs = model(**batch)
                loss = outputs.loss
                loss = loss / gradient_accumulation_steps
                accelerator.backward(loss)
                accelerator.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                if step % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                    optimizer.step()
                    lr_scheduler.step()
                    optimizer.zero_grad()
                    progress_bar_train.update(1)
                    completed_steps += 1
                    if args.max_steps > 0 and step > args.max_steps:
                        break

        if args.do_eval:
            export_predictions = []
            model.eval()
            for step, batch in enumerate(eval_dataloader):
                with torch.no_grad():
                    outputs = model(input_ids=batch['input_ids'], labels=None)
                predictions = outputs.logits.argmax(dim=-1)
                labels = batch["labels"]
                predictions_gathered = accelerator.gather(predictions)
                labels_gathered = accelerator.gather(labels)
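                # get_labels (a helper defined elsewhere) converts the gathered id
                # tensors back to tag strings, dropping -100 positions (inferred).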
                preds, refs = get_labels(predictions_gathered, labels_gathered, label_list)
                export_predictions.extend(flatten(preds))
                preds_unk, refs_unk = get_labels(predictions_gathered, labels_gathered, label_list, score_unk=True)
                preds_100, refs_100 = get_labels(predictions_gathered, labels_gathered, label_list, top100=True)
                progress_bar_eval.update(1)
                metric.add_batch(
                    predictions=preds,
                    references=refs,
                )
                metric_unk.add_batch(
                    predictions=preds_unk,
                    references=refs_unk,
                )
                metric_top100.add_batch(
                    predictions=preds_100,
                    references=refs_100,
                )

            eval_metric = compute_metrics(metric, metric_unk, metric_top100)
            accelerator.print(f"epoch {epoch}:", eval_metric)

            enums = list(map(str, list(range(len(export_predictions)))))
            export_predictions = list(map(str, export_predictions))
            export_predictions = ["{}\t{}".format(a_, b_) for a_, b_ in zip(enums, export_predictions)]
            with open(args.output_dir + "/predictions.txt", 'w') as f:
                f.write("\n".join(export_predictions))

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
Esempio n. 22
0
def main(args):
    # Set seed
    if args.run:
        seed = random.randrange(10**3)
    else:
        seed = args.seed
    set_seed(seed)
    # Run name
    model_name = args.model_name
    model_name = model_name[2:] if model_name.startswith("./") else model_name
    model_name = model_name[1:] if model_name.startswith("/") else model_name
    run_name = f"{model_name}_{args.task_name}"
    run_name = f"{run_name}_{args.dataset_config or args.dataset_name}"
    run_name = run_name.replace("/", "-")
    run_name = f"{run_name}_e{str(args.num_train_epochs)}"
    run_name = f"{run_name}_lr{str(args.learning_rate)}"
    run_name = f"{run_name}_ws{str(args.warmup_steps)}"
    run_name = f"{run_name}_wd{str(args.weight_decay)}"
    run_name = f"{run_name}_s{str(seed)}"
    if args.max_length != 512:
        run_name = f"{run_name}_seq{str(args.max_length)}"
    if args.label_all_tokens:
        run_name = f"{run_name}_labelall"
    do_supercase = args.supercase or "supercased" in run_name
    if do_supercase and "supercased" not in run_name:
        run_name = f"{run_name}_supercased"
    if args.run:
        run_name = f"{run_name}_r{str(args.run)}"
    output_dir = Path(args.output_dir) / run_name
    # Tokenizer settings
    padding = args.task_name not in (
        "ner", "pos"
    )  # default: False @param ["False", "'max_length'"] {type: 'raw'}
    max_length = args.max_length  #@param {type: "number"}
    # Training settings
    weight_decay = args.weight_decay  #@param {type: "number"}
    adam_beta1 = 0.9  #@param {type: "number"}
    adam_beta2 = 0.999  #@param {type: "number"}
    adam_epsilon = 1e-08  #@param {type: "number"}
    max_grad_norm = 1.0  #@param {type: "number"}
    save_total_limit = 1  #@param {type: "integer"}
    load_best_model_at_end = False  #@param {type: "boolean"}
    # wandb
    wandb.init(name=run_name, entity="nbailab")
    wandb.log({
        "seed": int(seed),
    })
    # Loading Dataset
    print("\n\n#####################################")
    print(args.model_name)
    print(args.task_name)
    print(args.dataset_config)
    train_split = args.dataset_split_train
    test_split = args.dataset_split_test
    validation_split = args.dataset_split_validation
    if ":" in args.dataset_name:
        dataset_name, dataset_config = args.dataset_name.split(":")
    else:
        dataset_name = args.dataset_name
        dataset_config = args.dataset_config
    if dataset_config is None or len(dataset_config) == 0:
        dataset = load_dataset(dataset_name)
    else:
        dataset = load_dataset(dataset_name, dataset_config)
    column_names = dataset[train_split].column_names
    features = dataset[train_split].features
    if "tokens" in column_names:
        text_column_name = "tokens"
    elif "text" in column_names:
        text_column_name = "text"
    else:
        text_column_name = column_names[0]
    if f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    elif "label" in column_names:
        label_column_name = "label"
    else:
        label_column_name = column_names[1]
    if isinstance(features[label_column_name], datasets.features.Sequence):
        label_list = features[label_column_name].feature.names
    else:
        label_list = features[label_column_name].names
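    # The dataset labels are already integer ids, so the mapping is the identity.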
    label_to_id = {i: i for i in range(len(label_list))}
    num_labels = len(label_list)
    print(f"Number of labels: {num_labels}")
    print({label.split("-")[-1] for label in label_list})

    # Download Norwegian Models
    # Downloading the model directly from a GCP bucket should not take longer than 3 minutes.
    # if args.model_name.startswith("./nb_models") and not os.path.exists("./nb_models"):
    #     !mkdir -p nb_models/eval/
    #     !gsutil -m cp -r gs://notram-public/nb_models/eval/* nb_models/eval/

    # Training
    config = AutoConfig.from_pretrained(
        args.model_name,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir,
        force_download=args.force_download,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name,
        cache_dir=args.cache_dir,
        use_fast=True,
        force_download=args.force_download,
    )
    tokenizer_test_sentence = """
    Denne gjengen håper at de sammen skal bidra til å gi kvinnefotballen i Kristiansand et lenge etterlengtet løft.
    """.strip()
    printm("""Tokenizer test""")
    printm(f"> {tokenizer_test_sentence}")
    if do_supercase:
        tokenizer_test_sentence = supercase(tokenizer_test_sentence,
                                            args.task_name in ("pos", "ner"))
    printm(tokenizer.tokenize(tokenizer_test_sentence))
    printm(tokenizer(tokenizer_test_sentence).input_ids)
    # Token tasks
    if args.task_name in ("pos", "ner"):
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name,
            from_tf=bool(".ckpt" in args.model_name),
            config=config,
            cache_dir=args.cache_dir,
            force_download=args.force_download,
        )
        # Preprocessing the dataset
        tokenized_datasets = dataset.map(
            lambda examples: tokenize_and_align_labels(
                tokenizer, examples, text_column_name, max_length, padding,
                label_column_name, label_to_id, args.label_all_tokens,
                do_supercase),
            batched=True,
            load_from_cache_file=not args.overwrite_cache,
            num_proc=os.cpu_count(),
        )
        # Data collator
        data_collator = DataCollatorForTokenClassification(tokenizer)
        compute_metrics = token_compute_metrics
    # Sequence tasks
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            args.model_name,
            from_tf=bool(".ckpt" in args.model_name),
            config=config,
            cache_dir=args.cache_dir,
            force_download=args.force_download,
        )
        # Preprocessing the dataset
        tokenized_datasets = dataset.map(
            lambda examples: tokenizer(
                supercase(examples[text_column_name],
                          is_split_into_words=False)
                if do_supercase else examples[text_column_name],
                max_length=max_length,
                padding=padding,
                truncation=True,
                is_split_into_words=False,
            ),
            batched=True,
            load_from_cache_file=not args.overwrite_cache,
            num_proc=os.cpu_count(),
        )
        # Data collator
        data_collator = DataCollatorWithPadding(
            tokenizer,
            max_length=max_length,
            padding=padding,
        )
        compute_metrics = sequence_compute_metrics
    train_dataset = dataset_select(tokenized_datasets[train_split],
                                   args.max_train_size)
    test_dataset = dataset_select(tokenized_datasets[test_split],
                                  args.max_test_size)
    validation_dataset = dataset_select(tokenized_datasets[validation_split],
                                        args.max_validation_size)
    wandb.log({
        "train_size": len(train_dataset),
        "test_size": len(test_dataset),
        "validation_size": len(validation_dataset),
    })
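    # train_dataset.shape[0] / batch_size is the number of optimizer steps per
    # epoch; args.warmup_steps is a fraction of total steps, converted to an
    # absolute step count below.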
    samples_per_batch = (train_dataset.shape[0] / args.train_batch_size)
    total_steps = args.num_train_epochs * samples_per_batch
    warmup_steps = int(args.warmup_steps * total_steps)
    wandb.log({
        "total_steps": int(total_steps),
        "total_warmup_steps": warmup_steps
    })
    do_eval = validation_split in tokenized_datasets
    do_test = test_split in tokenized_datasets
    training_args = TrainingArguments(
        output_dir=output_dir.as_posix(),
        overwrite_output_dir=args.overwrite_output_dir,
        do_train=True,
        do_eval=do_eval,
        do_predict=do_test,
        per_device_train_batch_size=int(args.train_batch_size),
        per_device_eval_batch_size=int(args.eval_batch_size
                                       or args.train_batch_size),
        learning_rate=float(args.learning_rate),
        weight_decay=weight_decay,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        max_grad_norm=max_grad_norm,
        num_train_epochs=args.num_train_epochs,
        warmup_steps=warmup_steps,
        load_best_model_at_end=load_best_model_at_end,
        seed=seed,
        save_total_limit=save_total_limit,
        run_name=run_name,
        disable_tqdm=True,
        eval_steps=500,
    )
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset if do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=lambda pairs: compute_metrics(pairs, label_list),
    )
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too for easy upload
    write_file("train",
               train_result.metrics,
               output_dir,
               save_artifact=args.save_artifacts)
    # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
    trainer.state.save_to_json(output_dir / "trainer_state.json")
    # Evaluation
    if do_eval:
        printm(f"**Evaluate**")
        results = trainer.evaluate()
        write_file("eval",
                   results,
                   output_dir,
                   save_artifact=args.save_artifacts)
    # Testing
    if do_test:
        printm("**Test**")
        predictions, labels, metrics = trainer.predict(test_dataset)
        write_file("test",
                   metrics,
                   output_dir,
                   save_artifact=args.save_artifacts)
        if args.task_name in ("ner", "pos"):
            predictions = np.argmax(predictions, axis=2)
            # Remove ignored index (special tokens)
            true_predictions = [[
                label_list[p] for (p, l) in zip(prediction, label) if l != -100
            ] for prediction, label in zip(predictions, labels)]
        else:
            predictions = np.argmax(predictions, axis=1)
            true_predictions = [
                label_list[p] for (p, l) in zip(predictions, labels)
                if l != -100
            ]
        # Save predictions
        output_test_predictions_file = os.path.join(output_dir,
                                                    "test_predictions.txt")
        output_test_predictions = "\n".join(" ".join(map(str, p))
                                            for p in true_predictions)
        with open(output_test_predictions_file, "a+") as writer:
            writer.write(output_test_predictions)
        if args.save_artifacts:
            artifact = wandb.Artifact("predictions", type="result")
            artifact.add_file(output_test_predictions_file)
            wandb.log_artifact(artifact)
Esempio n. 23
0
def main():
    # Parser
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Set seed before initializing
    set_seed(training_args.seed)

    ########## Load dataset from script. ##########
    # 'ontonotes_v4.py'
    datasets = load_dataset(data_args.dataset_script)

    ### Access column names and features ###
    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features

    # In the event the labels are not a `Sequence[ClassLabel]`,
    # we will need to go through the dataset to get the unique labels.
    if isinstance(features["pos_tags"].feature, ClassLabel):
        label_list = features["pos_tags"].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"]["pos_tags"])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    ########## Load pre-trained or custom model, tokenizer and config ##########

    # BertConfig
    config = AutoConfig.from_pretrained(model_args.config_name,
                                        num_labels=num_labels,
                                        finetuning_task=data_args.task_name,
                                        cache_dir=None,
                                        revision="main",
                                        use_auth_token=None,
                                        output_hidden_states=True)

    # BERTTokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=None,
        use_fast=True,
        revision="main",
        use_auth_token=None,
    )

    # Create custom model or pretained BERT
    if ".py" == model_args.model_name_or_path[-3:]:
        spec = importlib.util.spec_from_file_location(
            "module.name", model_args.model_name_or_path)
        module_name = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module_name)

        # Add `to_layer` for `LinearProbingBERT`
        if model_args.to_layer is not None:
            config.to_layer = model_args.to_layer

        # Creating custom model
        model = module_name.model()(config)
        print(f"Creating custom model in {model_args.model_name_or_path}")
    else:
        model = AutoModelForTokenClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in "bert-base-cased"),
            config=config,
            cache_dir=None,
            revision="main",
            use_auth_token=None,
        )

        # Freeze BERT. Train the `classifier` weights
        for param in model.base_model.parameters():
            param.requires_grad = False

    def tokenize_fn(examples):
        """Tokenize the input sequence and align the label.

        `input_ids` and `label_ids` will be added in the feature example (dict).
        They are required for the forward and loss computation.

        Additionally, `-100` in `label_ids` is assigned to segmented tokens
        and to special tokens in BERT. The loss function will ignore them.

        Args:
          examples: dict of features:
                    {"tokens": ['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06'],
                     "pos_tags": [22, 6, 22, 22, 23, 11]}

        Return:
          tokenized_inputs: dict of features including two
                            additional features: `input_ids` and `label_ids`.
        
        Usages:
        >>> tokenized_dataset = datasets.map(tokenize_fn,
        >>>                                  batched=True)
            # Check whether aligned.
        >>> for example in tokenized_dataset:
                tokens = example['tokens']
                input_ids = example['input_ids']
                tokenized_tokens = tokenizer.convert_ids_to_tokens(input_ids)
                label_ids = example['label_ids'] # aligned to max length 
                print(tokens)
                print(tokenized_tokens)
                print(input_ids)

        [ 'SOCCER' ] # token
        [ [CLS], 'S', '##OC', '##CE', '##R', [SEP] ] # converted_tokens
        [ -100,  4, -100, -100, -100, -100 ] # label_ids
        """
        token_col_name = 'tokens'
        label_col_name = 'pos_tags'

        # `input_ids`, `token_type_ids` and `attention_mask` are returned here
        tokenized_inputs = tokenizer(examples[token_col_name],
                                     padding="max_length",
                                     truncation=True,
                                     is_split_into_words=True,
                                     max_length=data_args.max_seq_length)

        # Create the label sequence.
        # `word_ids` maps each sub-token back to the index of its source word.
        # tokenized_tokens: [CLS,   Ha, ##LLO,  PAD,  PAD]
        # `word_ids`:       [None,   0,     0, None, None]
        # `label_ids`:      [-100,   4,  -100, -100, -100]
        labels = list()
        for i, label in enumerate(examples["pos_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # None are for [CLS], [SEP] and [PAD] tokens.
                if word_idx is None:
                    label_ids.append(-100)
                # Set label for the word once
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)

        # Add `labels` sequence for loss computation
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    ### Truncate the number of examples ###
    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            tokenize_fn,
            batched=True,
        )

    if training_args.do_eval:
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            tokenize_fn,
            batched=True,
        )

    if training_args.do_predict:
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(
                range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            tokenize_fn,
            batched=True,
        )

    data_collator = DataCollatorForTokenClassification(tokenizer,
                                                       pad_to_multiple_of=8)
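
    # Minimal sketch (illustrative, with hypothetical feature dicts) of what
    # the collator does: `input_ids` are padded with the tokenizer's pad token
    # and `labels` with -100, the padded length rounded up to a multiple of 8
    # so Tensor Cores can be used under mixed precision.
    _demo_batch = data_collator([
        {"input_ids": [101, 2023, 102], "labels": [-100, 4, -100]},
        {"input_ids": [101, 2023, 2003, 102], "labels": [-100, 4, 4, -100]},
    ])
    print(_demo_batch["input_ids"].shape)  # torch.Size([2, 8])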

    # Metrics
    metric = load_metric("seqeval")

    # Define metrics
    def compute_metrics(p):
        """Compute the evaluation metric for POS tagging. 

        This function Will be called if set `logging_steps` number of 
        steps to log metrics or set `eval_steps`.
        """
        # transformers.trainer_utils.EvalPrediction
        predictions, labels = p

        #  Shape of predictions (batch, seq_max_len, num_labels)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        true_preds = [p for seq in true_predictions for p in seq]
        true_label = [l for seq in true_labels for l in seq]

        # Tag-level metrics are forced off here; the branch below is kept so a
        # per-tag classification report can be re-enabled if needed.
        data_args.return_tag_level_metrics = False
        if data_args.return_tag_level_metrics:
            result = classification_report(true_label,
                                           true_preds,
                                           output_dict=False,
                                           labels=np.unique(true_preds))
        else:
            precision, recall, f1, _ = precision_recall_fscore_support(
                true_label, true_preds, average='macro', zero_division=0)
            acc = accuracy_score(true_label, true_preds)

            result = {
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'accuracy': acc,
            }
        return result

        ########## Original seqeval implementation (unreachable; kept for reference) ##########
        # `compute_metrics` returns above, so this block never runs. To use
        # seqeval's entity-level metrics instead, remove the early return and
        # uncomment the following:
        # results = metric.compute(predictions=true_predictions,
        #                          references=true_labels)
        # if data_args.return_tag_level_metrics:
        #     # Unpack nested dictionaries
        #     final_results = {}
        #     for key, value in results.items():
        #         if isinstance(value, dict):
        #             for n, v in value.items():
        #                 final_results[f"{key}_{n}"] = v
        #         else:
        #             final_results[key] = value
        #     return final_results
        # else:
        #     return {
        #         "precision": results["overall_precision"],
        #         "recall": results["overall_recall"],
        #         "f1": results["overall_f1"],
        #         "accuracy": results["overall_accuracy"],
        #     }
        ########## Original seqeval implementation ##########

    ########## Train, evaluate and test with Trainer ##########
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    last_checkpoint = None
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model("./")  # Saves the tokenizer too for easy upload

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        max_val_samples = (data_args.max_val_samples
                           if data_args.max_val_samples is not None else
                           len(eval_dataset))
        results["eval_samples"] = min(max_val_samples, len(eval_dataset))

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        print("Test metrics", metrics)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        # Save predictions
        with open("test_predictions.txt", "w") as writer:
            for prediction in true_predictions:
                writer.write(" ".join(prediction) + "\n")

    return results
def do_predict(test_data):
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments))
    model_args, data_args = parser.parse_args_into_dataclasses()
    training_args = TrainingArguments(output_dir="cosmetic_ner", do_predict=True)

    test_dict = {'tokens': [[token for token in line] for line in test_data]}
    datasets = Dataset.from_dict(test_dict)

    text_column_name = "tokens"
    label_list = ['O', 'B-COM', 'I-COM', 'B-EFF', 'I-EFF']

    num_labels = 5

    # Load the pretrained model and tokenizer
    # Load the model config
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,  # ner
        cache_dir=model_args.cache_dir,  # None
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Data preprocessing
    # Maximum sequence length
    max_length = data_args.max_length
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        """
        datasets.map函数处理时会调用
        Args:
            examples: 这里是2条样本,
        例如: examples = {dict: 5}
 'chunk_tags' = {list: 2} [[11, 21, 11, 12, 21, 22, 11, 12, 0], [11, 12]]
 'id' = {list: 2} ['0', '1']
 'ner_tags' = {list: 2} [[3, 0, 7, 0, 0, 0, 7, 0, 0], [1, 2]]
 'pos_tags' = {list: 2} [[22, 42, 16, 21, 35, 37, 16, 21, 7], [22, 22]]
 'tokens' = {list: 2} [['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'], ['Peter', 'Blackburn']]
        Returns:
        {'input_ids': [[101, 4649, 2244, 102], [101, 6208, 4658, 2094, 3723, 102]], 'token_type_ids': [[0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'labels': [[-100, 3, 4, -100], [-100, 1, 2, 2, 2, -100]]}
        """
        # Run the tokenizer on the `tokens` (text) field of the examples
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            max_length=max_length,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word)
            is_split_into_words=True,
        )
        print(f"\n样本是{examples}")
        print(f"一个batch的tokenized_inputs,{tokenized_inputs}")
        return tokenized_inputs

    # Preprocess the data with the map function
    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=False,
    )

    # Data collator, invoked while iterating over the dataloader
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Initialize our Trainer. MSRA has no validation set, so the test set is used here; a split from the training set would be more appropriate.
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Evaluate the model
    results = {}
    if training_args.do_predict:
        logger.info("*** Predict ***")
        test_dataset = tokenized_datasets
        predictions = trainer.predict(test_dataset)
        predictions = np.argmax(predictions.predictions, axis=2)

        # A nested list whose sublists contain label names; drop the first and last elements of each prediction ([CLS] and [SEP])
        true_predictions = [
            [label_list[p] for p in prediction[1:-1]]
            for prediction in predictions
        ]
        # Trim to the original (unpadded) token length
        text_tokens = test_dict['tokens']
        predicts = [prediction[:len(token)] for token, prediction in zip(text_tokens, true_predictions)]
        results = extract_words(text=test_dict['tokens'], predicts=predicts)
    return results
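# Illustrative usage of `do_predict` (hypothetical input; `extract_words` and
# the CLI arguments come from elsewhere in this script):
#   sentences = ["这款面霜保湿效果很好"]  # each string is split into characters
#   print(do_predict(sentences))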
def main():
    args = parse_args()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_ner_no_trainer", args)

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
    # in the environment
    accelerator = (
        Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
    )
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if args.text_column_name is not None:
        text_column_name = args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if args.label_column_name is not None:
        label_column_name = args.label_column_name
    elif f"{args.task_name}_tags" in column_names:
        label_column_name = f"{args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list
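
    # Example (illustrative): get_label_list([["B-PER", "O"], ["O", "B-LOC"]])
    # returns ["B-LOC", "B-PER", "O"], the sorted set of labels seen.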

    # If the labels are of type ClassLabel, they are already integers and we have the map stored somewhere.
    # Otherwise, we have to get the list of labels manually.
    labels_are_int = isinstance(features[label_column_name].feature, ClassLabel)
    if labels_are_int:
        label_list = features[label_column_name].feature.names
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}

    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name, num_labels=num_labels)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path, num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = args.tokenizer_name if args.tokenizer_name else args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True, add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, use_fast=True)

    if args.model_name_or_path:
        model = AutoModelForTokenClassification.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
            ignore_mismatched_sizes=args.ignore_mismatched_sizes,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForTokenClassification.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Model has labels -> use them.
    if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
        if list(sorted(model.config.label2id.keys())) == list(sorted(label_list)):
            # Reorganize `label_list` to match the ordering of the model.
            if labels_are_int:
                label_to_id = {i: int(model.config.label2id[l]) for i, l in enumerate(label_list)}
                label_list = [model.config.id2label[i] for i in range(num_labels)]
            else:
                label_list = [model.config.id2label[i] for i in range(num_labels)]
                label_to_id = {l: i for i, l in enumerate(label_list)}
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(model.config.label2id.keys()))}, dataset labels:"
                f" {list(sorted(label_list))}.\nIgnoring the model labels as a result."
            )

    # Set the correspondences label/ID inside the model config
    model.config.label2id = {l: i for i, l in enumerate(label_list)}
    model.config.id2label = {i: l for i, l in enumerate(label_list)}

    # Map that sends B-Xxx label to its I-Xxx counterpart
    b_to_i_label = []
    for idx, label in enumerate(label_list):
        if label.startswith("B-") and label.replace("B-", "I-") in label_list:
            b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
        else:
            b_to_i_label.append(idx)
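
    # Example (illustrative): with label_list = ["O", "B-PER", "I-PER"],
    # b_to_i_label == [0, 2, 2]; "B-PER" maps to the index of "I-PER", while
    # "O" and "I-PER" map to themselves.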

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )

        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    if args.label_all_tokens:
                        label_ids.append(b_to_i_label[label_to_id[label[word_idx]]])
                    else:
                        label_ids.append(-100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
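
    # Illustrative alignment (hypothetical sub-tokenization): a word
    # "Blackburn" labelled B-PER may split into ["Black", "##burn"]; the first
    # piece keeps the B-PER id, and the second gets the I-PER id when
    # --label_all_tokens is set (via b_to_i_label) or -100 otherwise.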

    with accelerator.main_process_first():
        processed_raw_datasets = raw_datasets.map(
            tokenize_and_align_labels,
            batched=True,
            remove_columns=raw_datasets["train"].column_names,
            desc="Running tokenizer on dataset",
        )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForTokenClassification` will apply dynamic padding for us (by padding to the maximum
        # length of the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to
        # a multiple of 8, which enables the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForTokenClassification(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
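
    # Sanity check (illustrative, not part of the original script): every
    # parameter should land in exactly one of the two groups.
    _decay_ids = {id(p) for p in optimizer_grouped_parameters[0]["params"]}
    _no_decay_ids = {id(p) for p in optimizer_grouped_parameters[1]["params"]}
    assert _decay_ids.isdisjoint(_no_decay_ids)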

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    # Figure out how many steps we should save the Accelerator states
    if hasattr(args.checkpointing_steps, "isdigit"):
        checkpointing_steps = args.checkpointing_steps
        if args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
    else:
        checkpointing_steps = None

    # We need to initialize the trackers we use, and also store our configuration.
    # We initialize the trackers only on main process because `accelerator.log`
    # only logs on main process and we don't want empty logs/runs on other processes.
    if args.with_tracking:
        if accelerator.is_main_process:
            experiment_config = vars(args)
            # TensorBoard cannot log Enums, need the raw value
            experiment_config["lr_scheduler_type"] = experiment_config["lr_scheduler_type"].value
            accelerator.init_trackers("ner_no_trainer", experiment_config)

    # Metrics
    metric = load_metric("seqeval")

    def get_labels(predictions, references):
        # Transform predictions and references tensors to numpy arrays
        if device.type == "cpu":
            y_pred = predictions.detach().clone().numpy()
            y_true = references.detach().clone().numpy()
        else:
            y_pred = predictions.detach().cpu().clone().numpy()
            y_true = references.detach().cpu().clone().numpy()

        # Remove ignored index (special tokens)
        true_predictions = [
            [label_list[p] for (p, l) in zip(pred, gold_label) if l != -100]
            for pred, gold_label in zip(y_pred, y_true)
        ]
        true_labels = [
            [label_list[l] for (p, l) in zip(pred, gold_label) if l != -100]
            for pred, gold_label in zip(y_pred, y_true)
        ]
        return true_predictions, true_labels

    def compute_metrics():
        results = metric.compute()
        if args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0
    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint is not None and args.resume_from_checkpoint != "":
            accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
            accelerator.load_state(args.resume_from_checkpoint)
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
            dirs.sort(key=os.path.getctime)
            path = dirs[-1]  # Sorts folders by date modified, most recent checkpoint is the last
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    continue
            outputs = model(**batch)
            loss = outputs.loss
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps }"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        samples_seen = 0
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            labels = batch["labels"]
            if not args.pad_to_max_length:  # necessary to pad predictions and labels for being gathered
                predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
                labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)
            predictions_gathered, labels_gathered = accelerator.gather((predictions, labels))
            # If we are in a multiprocess environment, the last batch has duplicates
            if accelerator.num_processes > 1:
                if step == len(eval_dataloader) - 1:
                    predictions_gathered = predictions_gathered[: len(eval_dataloader.dataset) - samples_seen]
                    labels_gathered = labels_gathered[: len(eval_dataloader.dataset) - samples_seen]
                else:
                    samples_seen += labels_gathered.shape[0]
            preds, refs = get_labels(predictions_gathered, labels_gathered)
            metric.add_batch(
                predictions=preds,
                references=refs,
            )  # predictions and references are expected to be a nested list of labels, not label_ids

        eval_metric = compute_metrics()
        accelerator.print(f"epoch {epoch}:", eval_metric)
        if args.with_tracking:
            accelerator.log(
                {
                    "seqeval": eval_metric,
                    "train_loss": total_loss.item() / len(train_dataloader),
                    "epoch": epoch,
                    "step": completed_steps,
                },
                step=completed_steps,
            )

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
            )
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump(
                {"eval_accuracy": eval_metric["accuracy"], "train_loss": total_loss.item() / len(train_dataloader)}, f
            )
Esempio n. 26
0
    # preparing datasets
    datasets["train"] = datasets["train"].map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        load_from_cache_file=not args.overwrite_cache,
    )

    datasets["validation"] = datasets["validation"].map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        load_from_cache_file=not args.overwrite_cache,
    )

    data_collator = DataCollatorForTokenClassification(tokenizer,
                                                       padding=padding)

    # Metrics
    ner_metric = load_metric("seqeval")

    def compute_metrics_ner(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
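
        # The snippet is truncated here; a plausible completion, following the
        # pattern of the other examples in this collection:
        results = ner_metric.compute(predictions=true_predictions,
                                     references=true_labels)
        return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }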
Esempio n. 27
0
def main():
    # region Argument Parsing
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_ner",
                           model_args,
                           data_args,
                           framework="tensorflow")
    # endregion

    # region Setup logging
    # we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # If passed along, set the training seed now.
    if training_args.seed is not None:
        set_seed(training_args.seed)
    # endregion

    # region Loading datasets
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets for token classification task available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(
            extension,
            data_files=data_files,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
        features = raw_datasets["train"].features
    else:
        column_names = raw_datasets["validation"].column_names
        features = raw_datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
    elif "tokens" in column_names:
        text_column_name = "tokens"
    else:
        text_column_name = column_names[0]

    if data_args.label_column_name is not None:
        label_column_name = data_args.label_column_name
    elif f"{data_args.task_name}_tags" in column_names:
        label_column_name = f"{data_args.task_name}_tags"
    else:
        label_column_name = column_names[1]

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)
    # endregion

    # region Load config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            num_labels=num_labels)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            num_labels=num_labels)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
    if not tokenizer_name_or_path:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if config.model_type in {"gpt2", "roberta"}:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True,
                                                  add_prefix_space=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path,
                                                  use_fast=True)
    # endregion

    # region Preprocessing the raw datasets
    # First we tokenize all the texts.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.

    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            max_length=data_args.max_length,
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )

        labels = []
        for i, label in enumerate(examples[label_column_name]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                # ignored in the loss function.
                if word_idx is None:
                    label_ids.append(-100)
                # We set the label for the first token of each word.
                elif word_idx != previous_word_idx:
                    label_ids.append(label_to_id[label[word_idx]])
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(label_to_id[label[word_idx]]
                                     if data_args.label_all_tokens else -100)
                previous_word_idx = word_idx

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    processed_raw_datasets = raw_datasets.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_datasets["train"].column_names,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_raw_datasets["train"]
    eval_dataset = processed_raw_datasets["validation"]

    if data_args.max_train_samples is not None:
        max_train_samples = min(len(train_dataset),
                                data_args.max_train_samples)
        train_dataset = train_dataset.select(range(max_train_samples))

    if data_args.max_eval_samples is not None:
        max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
        eval_dataset = eval_dataset.select(range(max_eval_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")
    # endregion

    with training_args.strategy.scope():
        # region Initialize model
        if model_args.model_name_or_path:
            model = TFAutoModelForTokenClassification.from_pretrained(
                model_args.model_name_or_path,
                config=config,
            )
        else:
            logger.info("Training new model from scratch")
            model = TFAutoModelForTokenClassification.from_config(config)

        model.resize_token_embeddings(len(tokenizer))
        # endregion

        # region Create TF datasets

        # We need the DataCollatorForTokenClassification here, as we need to correctly pad labels as
        # well as inputs.
        collate_fn = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                        return_tensors="tf")
        num_replicas = training_args.strategy.num_replicas_in_sync
        total_train_batch_size = training_args.per_device_train_batch_size * num_replicas

        dataset_options = tf.data.Options()
        dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF

        # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in
        # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also
        # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names
        # yourself if you use this method, whereas they are automatically inferred from the model input names when
        # using model.prepare_tf_dataset()
        # For more info see the docs:
        # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset
        # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset

        tf_train_dataset = model.prepare_tf_dataset(
            train_dataset,
            collate_fn=collate_fn,
            batch_size=total_train_batch_size,
            shuffle=True,
        ).with_options(dataset_options)
        total_eval_batch_size = training_args.per_device_eval_batch_size * num_replicas
        tf_eval_dataset = model.prepare_tf_dataset(
            eval_dataset,
            collate_fn=collate_fn,
            batch_size=total_eval_batch_size,
            shuffle=False,
        ).with_options(dataset_options)

        # endregion

        # region Optimizer, loss and compilation
        num_train_steps = int(
            len(tf_train_dataset) * training_args.num_train_epochs)
        if training_args.warmup_steps > 0:
            num_warmup_steps = training_args.warmup_steps
        elif training_args.warmup_ratio > 0:
            num_warmup_steps = int(num_train_steps *
                                   training_args.warmup_ratio)
        else:
            num_warmup_steps = 0

        optimizer, lr_schedule = create_optimizer(
            init_lr=training_args.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            adam_beta1=training_args.adam_beta1,
            adam_beta2=training_args.adam_beta2,
            adam_epsilon=training_args.adam_epsilon,
            weight_decay_rate=training_args.weight_decay,
            adam_global_clipnorm=training_args.max_grad_norm,
        )

        model.compile(optimizer=optimizer, jit_compile=training_args.xla)
        # endregion

        # Metrics
        metric = evaluate.load("seqeval")

        def get_labels(y_pred, y_true):
            # Transform predictions and references tensos to numpy arrays

            # Remove ignored index (special tokens)
            true_predictions = [[
                label_list[p] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            true_labels = [[
                label_list[l] for (p, l) in zip(pred, gold_label) if l != -100
            ] for pred, gold_label in zip(y_pred, y_true)]
            return true_predictions, true_labels

        def compute_metrics():
            results = metric.compute()
            if data_args.return_entity_level_metrics:
                # Unpack nested dictionaries
                final_results = {}
                for key, value in results.items():
                    if isinstance(value, dict):
                        for n, v in value.items():
                            final_results[f"{key}_{n}"] = v
                    else:
                        final_results[key] = value
                return final_results
            else:
                return {
                    "precision": results["overall_precision"],
                    "recall": results["overall_recall"],
                    "f1": results["overall_f1"],
                    "accuracy": results["overall_accuracy"],
                }

        # endregion

        # region Preparing push_to_hub and model card
        push_to_hub_model_id = training_args.push_to_hub_model_id
        model_name = model_args.model_name_or_path.split("/")[-1]
        if not push_to_hub_model_id:
            if data_args.dataset_name is not None:
                push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}"
            else:
                push_to_hub_model_id = f"{model_name}-finetuned-token-classification"

        model_card_kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tasks": "token-classification"
        }
        if data_args.dataset_name is not None:
            model_card_kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                model_card_kwargs[
                    "dataset_args"] = data_args.dataset_config_name
                model_card_kwargs[
                    "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                model_card_kwargs["dataset"] = data_args.dataset_name

        if training_args.push_to_hub:
            callbacks = [
                PushToHubCallback(
                    output_dir=training_args.output_dir,
                    model_id=push_to_hub_model_id,
                    organization=training_args.push_to_hub_organization,
                    token=training_args.push_to_hub_token,
                    tokenizer=tokenizer,
                    **model_card_kwargs,
                )
            ]
        else:
            callbacks = []
        # endregion

        # region Training
        logger.info("***** Running training *****")
        logger.info(f"  Num examples = {len(train_dataset)}")
        logger.info(f"  Num Epochs = {training_args.num_train_epochs}")
        logger.info(
            f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}"
        )
        logger.info(f"  Total train batch size = {total_train_batch_size}")
        # Only show the progress bar once on each machine.

        model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=int(training_args.num_train_epochs),
            callbacks=callbacks,
        )
        # endregion

        # region Predictions
        # If you have variable batch sizes (i.e. not using pad_to_max_length), then
        # this bit might fail on TF < 2.8 because TF can't concatenate outputs of varying seq
        # length from predict().

        try:
            predictions = model.predict(
                tf_eval_dataset,
                batch_size=training_args.per_device_eval_batch_size)["logits"]
        except tf.python.framework.errors_impl.InvalidArgumentError:
            raise ValueError(
                "Concatenating predictions failed! If your version of TensorFlow is 2.8.0 or older "
                "then you will need to use --pad_to_max_length to generate predictions, as older "
                "versions of TensorFlow cannot concatenate variable-length predictions as RaggedTensor."
            )
        if isinstance(predictions, tf.RaggedTensor):
            predictions = predictions.to_tensor(default_value=-100)
        predictions = tf.math.argmax(predictions, axis=-1).numpy()
        if "label" in eval_dataset:
            labels = eval_dataset.with_format("tf")["label"]
        else:
            labels = eval_dataset.with_format("tf")["labels"]
        if isinstance(labels, tf.RaggedTensor):
            labels = labels.to_tensor(default_value=-100)
        labels = labels.numpy()
        attention_mask = eval_dataset.with_format("tf")["attention_mask"]
        if isinstance(attention_mask, tf.RaggedTensor):
            attention_mask = attention_mask.to_tensor(default_value=-100)
        attention_mask = attention_mask.numpy()
        labels[attention_mask == 0] = -100
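        # Illustrative effect of the line above: for a padded row with
        # attention_mask [1, 1, 1, 0, 0], labels [-100, 5, 3, 0, 0] become
        # [-100, 5, 3, -100, -100], so padding never reaches the metric.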
        preds, refs = get_labels(predictions, labels)
        metric.add_batch(
            predictions=preds,
            references=refs,
        )
        eval_metric = compute_metrics()
        logger.info("Evaluation metrics:")
        for key, val in eval_metric.items():
            logger.info(f"{key}: {val:.4f}")

        if training_args.output_dir is not None:
            output_eval_file = os.path.join(training_args.output_dir,
                                            "all_results.json")
            with open(output_eval_file, "w") as writer:
                writer.write(json.dumps(eval_metric))
        # endregion

    if training_args.output_dir is not None and not training_args.push_to_hub:
        # If we're not pushing to hub, at least save a local copy when we're done
        model.save_pretrained(training_args.output_dir)
def main(args):
    dataset = prepare_dataset(args)
    tokenizer = prepare_tokenizer(args)
    # data_collator = YD_DataCollatorForTokenClassification(tokenizer)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    if 'test' in dataset:
        column_names = dataset["test"].column_names
        features = dataset["test"].features
    else:
        raise ValueError('Evaluation must specify test_file!')

    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = ('ner_tags'
                         if 'ner_tags' in column_names else column_names[1])

    if isinstance(features[label_column_name].feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        if 'test' in dataset:
            label_list = get_label_list(dataset["test"][label_column_name])
        else:
            raise ValueError('Evaluation must specify test_file!')

        label_to_id = {l: i for i, l in enumerate(label_list)}
    args.num_labels = len(label_list)

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples, label_all_tokens=False):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=False,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        #print('tokenized_inputs', tokenized_inputs)

        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name],
                                         offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special characters will have an offset of (0, 0)
                # so the test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        current_label if label_all_tokens else -100)

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
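
    # A hedged illustration of the offset rule above (the tokenizer choice is
    # arbitrary and the exact sub-tokens depend on its vocabulary). The first
    # sub-token of a word has offset (0, n) with n > 0, special tokens have
    # (0, 0), and continuation sub-tokens start mid-word, so offset[0] != 0:
    #
    #     from transformers import AutoTokenizer
    #     tok = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
    #     enc = tok(["Hugging", "Face"], is_split_into_words=True,
    #               return_offsets_mapping=True)
    #     for token, offset in zip(enc.tokens(), enc["offset_mapping"]):
    #         print(token, offset)
    #     # [CLS]   (0, 0) -> special token, labelled -100
    #     # Hu      (0, 2) -> first sub-token, receives the word label
    #     # ##gging (2, 7) -> continuation, -100 unless label_all_tokens
    #     # Face    (0, 4) -> first sub-token of the next word
    #     # [SEP]   (0, 0) -> special token, labelled -100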

    tokenized_datasets = dataset.map(
        tokenize_and_align_labels,
        # batched=True, **YD** test non-batched results
        batched=True,
        num_proc=8,
        load_from_cache_file=False,
    )

    test_dataset = tokenized_datasets['test']
    # **YD** core code: keep only the columns the model actually consumes
    test_dataset.set_format(type=test_dataset.format["type"],
                            columns=_NER_COLUMNS)

    # **YD** dataloader
    data_loader = DataLoader(
        test_dataset,
        batch_size=args.batch_size,
        sampler=None,
        collate_fn=data_collator,
        drop_last=False,
        # num_workers=8,
        num_workers=0,
    )
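
    # Sketch of what the collator does to a batch (the token ids are
    # illustrative; assumes a transformers version where the collator returns
    # PyTorch tensors): input_ids are padded with the tokenizer's pad token,
    # labels are padded with the ignore index -100.
    #
    #     features = [
    #         {"input_ids": [101, 7592, 102], "labels": [-100, 1, -100]},
    #         {"input_ids": [101, 7592, 2088, 999, 102],
    #          "labels": [-100, 1, 2, 0, -100]},
    #     ]
    #     batch = data_collator(features)
    #     batch["input_ids"].shape  # torch.Size([2, 5])
    #     batch["labels"][0]        # tensor([-100, 1, -100, -100, -100])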

    model = prepare_model(args)
    model.cuda()
    model.eval()  # disable dropout layers for deterministic evaluation

    true_predictions = []
    true_labels = []

    for index, batch in tqdm(enumerate(data_loader)):
        labels, batch['labels'] = batch['labels'].tolist(), None

        batch = utils.move_to_cuda(batch)
        with torch.no_grad():  # no gradients are needed at evaluation time
            predictions = model(**batch)
        # assumes prepare_model returns raw logits of shape (batch, seq, labels)
        predictions = torch.argmax(predictions, axis=2).tolist()

        true_predictions.extend(
            [[label_list[p] for (p, l) in zip(prediction, label) if l != -100]
             for prediction, label in zip(predictions, labels)])

        true_labels.extend(
            [[label_list[l] for (p, l) in zip(prediction, label) if l != -100]
             for prediction, label in zip(predictions, labels)])

    true_predictions = [
        true_prediction for true_prediction in true_predictions
        if true_prediction != []
    ]
    true_labels = [
        true_label for true_label in true_labels if true_label != []
    ]

    print('true_predictions', true_predictions[0], true_predictions[-1])
    print('true_labels', true_labels[0], true_labels[-1])

    print({
        "accuracy_score": accuracy_score(true_labels, true_predictions),
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    })
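
For reference, accuracy_score, precision_score, recall_score and f1_score above are seqeval's entity-level metrics, which operate on lists of label sequences. A minimal usage sketch:

from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score

y_true = [["O", "B-PER", "I-PER", "O"], ["B-LOC", "O"]]
y_pred = [["O", "B-PER", "I-PER", "O"], ["O", "O"]]

print(precision_score(y_true, y_pred))  # 1.0, the single predicted entity is correct
print(recall_score(y_true, y_pred))     # 0.5, one of two gold entities was found
print(f1_score(y_true, y_pred))         # ~0.667
print(accuracy_score(y_true, y_pred))   # ~0.833, 5 of 6 tokens are correct
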
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'words' or the first column if no column called
    # 'words' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "words" if "words" in column_names else column_names[0]
    label_column_name = data_args.task_name if data_args.task_name in column_names else column_names[
        1]

    # Labeling (this part will be easier when https://github.com/huggingface/datasets/issues/797 is solved)
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    label_list = get_label_list(datasets["train"][label_column_name])
    label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
            return_offsets_mapping=True,
        )
        offset_mappings = tokenized_inputs.pop("offset_mapping")
        labels = []
        for label, offset_mapping in zip(examples[label_column_name],
                                         offset_mappings):
            label_index = 0
            current_label = -100
            label_ids = []
            for offset in offset_mapping:
                # We set the label for the first token of each word. Special characters will have an offset of (0, 0)
                # so the test ignores them.
                if offset[0] == 0 and offset[1] != 0:
                    current_label = label_to_id[label[label_index]]
                    label_index += 1
                    label_ids.append(current_label)
                # For special tokens, we set the label to -100 so it's automatically ignored in the loss function.
                elif offset[0] == 0 and offset[1] == 0:
                    label_ids.append(-100)
                # For the other tokens in a word, we set the label to either the current label or -100, depending on
                # the label_all_tokens flag.
                else:
                    label_ids.append(
                        current_label if data_args.label_all_tokens else -100)

            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        return {
            "accuracy_score": accuracy_score(true_labels, true_predictions),
            "precision": precision_score(true_labels, true_predictions),
            "recall": recall_score(true_labels, true_predictions),
            "f1": f1_score(true_labels, true_predictions),
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in metrics.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results
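
trainer.predict above returns a tuple of (predictions, label_ids, metrics); the np.argmax over the last axis turns per-token logits into label indices. A shape-level sketch with hypothetical sizes:

import numpy as np

batch_size, seq_len, num_labels = 2, 5, 3      # hypothetical sizes
logits = np.random.randn(batch_size, seq_len, num_labels)
pred_ids = np.argmax(logits, axis=2)           # shape (2, 5): one label id per token
print(pred_ids.shape)
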
Example no. 30
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bit training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'tokens' or the first column if no column called
    # 'tokens' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file

        table = csv.read_csv("./data/train.csv",
                             parse_options=ParseOptions(delimiter="\t"))
        class_label_ = table.column("label").unique()
        class_label = ClassLabel(num_classes=len(class_label_),
                                 names=class_label_.to_pylist())
        train = main_ner.process_data(data_args.train_file, class_label)
        test = main_ner.process_data(data_args.test_file, class_label)
        val = main_ner.process_data(data_args.validation_file, class_label)

        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension,
                                data_files=data_files,
                                delimiter="\t",
                                quoting=csv_lib.QUOTE_NONE)
        train_dataset = datasets["train"]
        test_dataset = datasets["test"]
        val_dataset = datasets["validation"]

        table = train_dataset.data
        label = table.column("label")
        class_label_ = label.unique()
        class_label = Sequence(feature=ClassLabel(
            num_classes=len(class_label_), names=class_label_.to_pylist()))

        # Note: assigning to `features` and to the private `_data` attribute
        # bypasses the public datasets API; `Dataset.cast` / `Dataset.map` is
        # the supported way to attach a ClassLabel feature.
        train_dataset.features['ner_tags'] = class_label
        train_dataset._data = train

        test_dataset.features['ner_tags'] = class_label
        test_dataset._data = test

        val_dataset.features['ner_tags'] = class_label
        val_dataset._data = val

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
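
    # A minimal sketch of the pyarrow pattern used above (labels hypothetical):
    # unique() yields a pyarrow Array, and to_pylist() converts it to the plain
    # Python strings that ClassLabel expects.
    #
    #     import pyarrow as pa
    #     from datasets import ClassLabel
    #     labels = pa.chunked_array([pa.array(["O", "B-GENE", "O", "I-GENE"])])
    #     unique = labels.unique()                  # pyarrow Array of 3 values
    #     class_label = ClassLabel(num_classes=len(unique),
    #                              names=unique.to_pylist())
    #     class_label.str2int("B-GENE")             # -> integer id of the label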

    if training_args.do_train:
        column_names = datasets["train"].column_names
        features = datasets["train"].features
    else:
        column_names = datasets["validation"].column_names
        features = datasets["validation"].features
    text_column_name = "tokens" if "tokens" in column_names else column_names[0]
    label_column_name = (f"{data_args.task_name}_tags"
                         if f"{data_args.task_name}_tags" in column_names else
                         column_names[1])

    # In the event the labels are not a `Sequence[ClassLabel]`, we will need to go through the dataset to get the
    # unique labels.
    def get_label_list(labels):
        unique_labels = set()
        for label in labels:
            unique_labels = unique_labels | set(label)
        label_list = list(unique_labels)
        label_list.sort()
        return label_list

    seq: Sequence = features[label_column_name]
    if isinstance(seq.feature, ClassLabel):
        label_list = features[label_column_name].feature.names
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
        label_list = get_label_list(datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#bigtable to find the model types that meet this "
            "requirement")

    # Preprocessing the dataset
    # Padding strategy
    padding = "max_length" if data_args.pad_to_max_length else False

    # Tokenize all texts and align the labels with them.
    def tokenize_and_align_labels(examples):
        tokenized_inputs = tokenizer(
            examples[text_column_name],
            padding=padding,
            truncation=True,
            # We use this argument because the texts in our dataset are lists of words (with a label for each word).
            is_split_into_words=True,
        )
        labels = []
        # The batched examples dict is expected to carry exactly three columns
        # (e.g. id, tokens and the label column); anything else is malformed.
        if len(examples) == 3:
            for i, label in enumerate(examples[label_column_name]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:
                    # Special tokens have a word id that is None. We set the label to -100 so they are automatically
                    # ignored in the loss function.
                    if word_idx is None:
                        label_ids.append(-100)
                    # We set the label for the first token of each word.
                    elif word_idx != previous_word_idx:
                        label_ids.append(label_to_id[label[word_idx]])
                    # For the other tokens in a word, we set the label to either the current label or -100, depending on
                    # the label_all_tokens flag.
                    else:
                        label_ids.append(label_to_id[label[word_idx]] if
                                         data_args.label_all_tokens else -100)
                    previous_word_idx = word_idx

                labels.append(label_ids)
            tokenized_inputs["labels"] = labels
            return tokenized_inputs
        else:
            raise ValueError(
                f"Expected 3 columns per batch, got {len(examples)}: {list(examples.keys())}")

    tokenized_datasets = datasets.map(
        tokenize_and_align_labels,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # Metrics
    metric = load_metric("seqeval")

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]
        true_labels = [[
            label_list[l] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        results = metric.compute(predictions=true_predictions,
                                 references=true_labels)
        if data_args.return_entity_level_metrics:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }
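
    # seqeval's compute() returns overall_* scalars plus one nested dict per
    # entity type; with hypothetical numbers, the unpacking above flattens
    #     {"PER": {"precision": 1.0, "recall": 0.5, "f1": 0.667, "number": 2},
    #      "overall_f1": 0.667, ...}
    # into
    #     {"PER_precision": 1.0, "PER_recall": 0.5, "PER_f1": 0.667,
    #      "PER_number": 2, "overall_f1": 0.667, ...}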

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train(
            model_path=model_args.model_name_or_path if os.path.
            isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(train_result.metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_ner.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    # Predict
    if training_args.do_predict:
        logger.info("*** Predict ***")

        test_dataset = tokenized_datasets["test"]
        predictions, labels, metrics = trainer.predict(test_dataset)
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions = [[
            label_list[p] for (p, l) in zip(prediction, label) if l != -100
        ] for prediction, label in zip(predictions, labels)]

        output_test_results_file = os.path.join(training_args.output_dir,
                                                "test_results.txt")
        if trainer.is_world_process_zero():
            with open(output_test_results_file, "w") as writer:
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

        # Save predictions
        output_test_predictions_file = os.path.join(training_args.output_dir,
                                                    "test_predictions.txt")
        if trainer.is_world_process_zero():
            with open(output_test_predictions_file, "w") as writer:
                for prediction in true_predictions:
                    writer.write(" ".join(prediction) + "\n")

    return results