def test_data_collator_with_padding(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": [0, 1, 2]
        }, {
            "input_ids": [0, 1, 2, 3, 4, 5]
        }]

        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="np")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape, (2, 6))
        self.assertEqual(batch["input_ids"][0].tolist(),
                         [0, 1, 2] + [tokenizer.pad_token_id] * 3)

        data_collator = DataCollatorWithPadding(tokenizer,
                                                padding="max_length",
                                                max_length=10,
                                                return_tensors="np")
        batch = data_collator(features)
        self.assertEqual(batch["input_ids"].shape, (2, 10))

        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8,
                                                return_tensors="np")
        batch = data_collator(features)
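        # longest sequence (6 tokens) is rounded up to the next multiple of 8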
        self.assertEqual(batch["input_ids"].shape, (2, 8))
Example No. 2
def main(train_args: TrainingArguments, args: Args):
    log.info(f"Parsed args: {args}")
    log.info(f"Parsed training args: {train_args}")

    # https://huggingface.co/docs/datasets/loading#json
    dataset = load_dataset("json",
                           data_files=join(args.input_dir, args.input_file))
    coalesced_dataset = dataset.map(coalesce)

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    def preprocess_function(examples):
        return tokenizer(examples["text"], truncation=True)

    tokenized_dataset = coalesced_dataset.map(preprocess_function,
                                              batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(args.model,
                                                               num_labels=2)

    # TODO: separate train and eval inputs.
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
Example No. 3
def init_dataset_for_squad(model_args,
                           data_args,
                           training_args,
                           last_checkpoint=None):

    datasets = init_datasets_squad(data_args, model_args)

    # Placeholder for now
    extra_config_kwargs = {}
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args,
                       config,
                       tokenizer,
                       finetuning=True,
                       squad=True)
    check_sparsity_callback(model, model_args)

    logging.info(f"Tokenizing datasets for squad ...")
    (train_dataset,
     eval_dataset,
     eval_examples,
     answer_column_name) = \
        preprocess_datasets_squad(datasets, tokenizer, training_args, data_args)

    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
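        # pad to a multiple of 8 under fp16 so GPU tensor cores can be used efficiently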
        pad_to_multiple_of = 8 if training_args.fp16 else None
        data_collator = \
            DataCollatorWithPadding(tokenizer,
                                    pad_to_multiple_of=pad_to_multiple_of)

    return (tokenizer, data_collator, train_dataset, eval_dataset,
            eval_examples, model, answer_column_name)
Example No. 4
 def collate(self, samples: Any) -> Tensor:
     """Override to convert a set of samples to a batch."""
     if self.padding != "max_length":
         data_collator = DataCollatorWithPadding(
             AutoTokenizer.from_pretrained(self.backbone, use_fast=True))
         return data_collator(samples)
     return default_data_collator(samples)
Example No. 5
    def katas(self):
        if not (os.path.exists(self.folder)):
            logger.info(f"Download GermEval18 dataset")
            _download_extract_downstream_data(self.folder + "/train.tsv",
                                              proxies=None)

        def preprocess_function(examples):
            labels = GermEval18.labels(True)
            examples["coarse_label"] = [
                labels.index(x) for x in examples["coarse_label"]
            ]

            labels = GermEval18.labels(False)
            examples["fine_label"] = [
                labels.index(x) for x in examples["fine_label"]
            ]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec, preprocess_function, collator,
                        "../data/germeval18/train.tsv"),
                TsvKata(spec, preprocess_function, collator,
                        "../data/germeval18/test.tsv"))
Example No. 6
def get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset):
    data_collator = DataCollatorWithPadding(tokenizer)
    train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator,
                                  batch_size=args.batch_size)
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator,
                                 batch_size=args.batch_size)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    return optimizer, train_dataloader, eval_dataloader, data_collator
Example No. 7
def init_dataset_for_finetuning(model_args,
                                data_args,
                                training_args,
                                last_checkpoint=None):
    datasets = init_datasets_task(data_args, training_args)
    is_regression, label_list, num_labels = get_labels(datasets, data_args)
    logging.info(f"Training {data_args.task_name} with {num_labels} labels")

    # For finetuning required to add labels and task name to config kwargs
    extra_config_kwargs = dict(
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    config = init_config(model_args, extra_config_kwargs=extra_config_kwargs)
    tokenizer = init_tokenizer(model_args)
    model = init_model(model_args, config, tokenizer, finetuning=True)
    check_sparsity_callback(model, model_args)

    # Tokenizing and preprocessing the datasets for downstream tasks
    # TODO: load from cached tokenized datasets for finetuning as well
    logging.info(f"Tokenizing datasets for finetuning ...")
    tokenized_datasets = preprocess_datasets_task(datasets, tokenizer,
                                                  data_args, model, num_labels,
                                                  label_list, is_regression)

    # Separate into train, eval and test
    train_dataset = tokenized_datasets["train"]
    eval_dataset = tokenized_datasets["validation_matched" if data_args.
                                      task_name == "mnli" else "validation"]

    test_dataset = None
    if (data_args.task_name is not None or data_args.test_file is not None):
        if training_args.do_predict:
            test_dataset = tokenized_datasets["test_matched" if data_args.
                                              task_name == "mnli" else "test"]

    # Log fingerprint used in HF smart caching
    logging.info(f"Dataset fingerprint: {train_dataset._fingerprint}")

    # Data collator will default to DataCollatorWithPadding,
    # so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    return (tokenizer, data_collator, train_dataset, eval_dataset,
            test_dataset, model, is_regression, tokenized_datasets, label_list,
            config)
Example No. 8
    def __init__(self, args, model, tokenizer, eval_answers):
        self.args = args
        self.metric = load_metric("squad")
        self.model, self.tokenizer = model, tokenizer

        self.data_collator = DataCollatorWithPadding(
            self.tokenizer,
            pad_to_multiple_of=8 if self.args.train.fp16 else None)

        self.train_dataset = None  # needed: preprocessed train set, raw eval set, retrieved eval set, retrieved-and-preprocessed eval set
        self.eval_dataset = None
        self.eval_examples = None
        self.eval_answers = eval_answers
Example No. 9
def explain(model, test_str):
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=None)
    # test_str = "The new design is awful!"
    ex_tokens = tokenizer([test_str])
    str_list = tokenizer.convert_ids_to_tokens(ex_tokens['input_ids'][0])
    # print(str_list)
    ex_tokens = data_collator(ex_tokens)
    result = model(output_hidden_states=True, **ex_tokens)
    logits = result.logits
    pred = torch.argmax(logits)
    layer_ids = [-2, -3, -4, -5, -6, -7]
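    # Walk backwards through the hidden layers: the first pass backpropagates the predicted
    # logit; later passes re-inject the previous layer's CLS/SEP gradients and accumulate
    # gradient * hidden-state products as per-token relevance scores.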
    expln_ = []
    for layer_id in layer_ids:
        # layer_id = -2
        if layer_id == -2:
            result.hidden_states[layer_id].retain_grad()
            model.zero_grad()
            logits[0][pred.item()].backward(retain_graph=True)

            hs_grad = result.hidden_states[layer_id].grad
            expln = (hs_grad * result.hidden_states[layer_id]).sum(dim=-1)
            cls_grad = hs_grad[:, 0, :]
            cls_hs = result.hidden_states[layer_id][:, 0, :]
            sep_grad = hs_grad[:, -1, :]
            sep_hs = result.hidden_states[layer_id][:, -1, :]

            expln_.append(expln)
        else:
            result.hidden_states[layer_id].retain_grad()

            model.zero_grad()
            cls_hs.backward(cls_grad, retain_graph=True)
            hs_grad = result.hidden_states[layer_id].grad
            expln = (hs_grad * result.hidden_states[layer_id]).sum(dim=-1)

            cls_grad = hs_grad[:, 0, :]
            cls_hs = result.hidden_states[layer_id][:, 0, :]

            model.zero_grad()
            sep_hs.backward(sep_grad, retain_graph=True)
            hs_grad = result.hidden_states[layer_id].grad
            expln += (hs_grad * result.hidden_states[layer_id]).sum(dim=-1)

            sep_grad = hs_grad[:, -1, :]
            sep_hs = result.hidden_states[layer_id][:, -1, :]

            expln_.append(expln)

    expln = sum(expln_)
    return pred, expln, str_list
Example No. 10
    def __call__(self):
        dataset = load_dataset(path="glue", name="cola", cache_dir=self.spec.cache_dir)
        encoded = dataset.map(function=self.preprocess_function,
                              batched=self.spec.batch_preprocess,
                              batch_size=self.spec.batch_preprocess_size,
                              remove_columns=["sentence"])

        data_collator = DataCollatorWithPadding(self.tokenizer,
                                                padding="max_length",
                                                max_length=self.spec.max_seq_length)

        return (
            DatasetKata(self.spec, encoded["train"], data_collator),
            DatasetKata(self.spec, encoded["validation"], data_collator),
            DatasetKata(self.spec, encoded["test"], data_collator)
        )
Example No. 11
def run_glue(task_key,
             cfg,
             model,
             model_args,
             training_args,
             tokenizer,
             mode="train",
             extract=False,
             **kwargs):
    r"""
        cfg: YACS cfg node
        ckpt_path: Unsupported
    """
    task_name = TASK_KEY_TO_NAME[task_key]

    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=cfg.DATA.DATAPATH)

    glue_dataset = load_features_dict(tokenizer, cfg)
    # print(glue_dataset.keys())
    train_dataset = glue_dataset[task_key]['train']
    split_key = cfg.EVAL.SPLIT
    if task_key == "mnli":
        split_key = f"{split_key}_matched"
    eval_dataset = glue_dataset[task_key][split_key]
    # eval_dataset = glue_dataset[task_key]['validation_mismached']
    # train_dataset = GlueDataset(data_args, tokenizer=tokenizer, limit_length=cfg.TRAIN.TASK_LIMIT)
    # eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode='dev')
    collator = DataCollatorWithPadding(tokenizer)
    trainer = FixedTrainer(model=model,
                           args=training_args,
                           train_dataset=train_dataset,
                           eval_dataset=eval_dataset,
                           compute_metrics=get_eval_metrics_func(task_key),
                           data_collator=collator,
                           config=cfg)

    if mode == "train":
        trainer.train()
    if mode != "train" or cfg.EVAL_ON_COMPLETION:
        extract_path = None
        if extract:
            extract_path = get_extract_path(cfg, model_args)
        metrics = trainer.evaluate(extract_path=extract_path,
                                   cache_path=osp.join(
                                       cfg.TASK.EXTRACT_TOKENS_MASK_CACHE,
                                       task_key))
        metrics_file = get_metrics_path(cfg, model_args)
        torch.save(metrics, metrics_file)
Example No. 12
    def of(self, *args):
        def preprocess_function(examples):
            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(examples["text"],
                                  truncation=True,
                                  max_length=self.max_length)

        values = args[0]

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.max_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.max_length)
        return DictKata(spec, preprocess_function, collator, {"text": values})
Example No. 13
 def loaders(self):
     if self._loaders is None:
         ps, mgr = self.params, self.mgr
         if ps.pad_to_max_length:
             c = default_data_collator
         else:
             c = DataCollatorWithPadding(
                 self.tokenizer, pad_to_multiple_of=(8 if mgr.use_fp16 else None)
             )
         t = DataLoader(
             self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size
         )
         e = DataLoader(self.eval_ds, collate_fn=c, batch_size=ps.eval_batch_size)
         self._loaders = {TRAIN: t, EVAL: e}
     return self._loaders
Example No. 14
 def __init__(
     self,
     # The abbreviation/name of your Hugging Face transformer architecture (e.g., bert, bart, etc.)
     hf_arch: str,
     # A specific configuration instance you want to use
     hf_config: PretrainedConfig,
     # A Hugging Face tokenizer
     hf_tokenizer: PreTrainedTokenizerBase,
     # A Hugging Face model
     hf_model: PreTrainedModel,
     # Defaults to Hugging Face's DataCollatorWithPadding(tokenizer=hf_tokenizer)
     data_collator: Type = None,
 ):
     store_attr()
     self.data_collator = data_collator if (
         data_collator) else DataCollatorWithPadding(tokenizer=hf_tokenizer)
Example No. 15
 def loaders(self):
     if self._loaders is None:
         ps = self.params
         if ps.pad_to_max_length:
             c = default_data_collator
         elif ps.fp16:
             c = DataCollatorWithPadding(self.tokenizer, pad_to_multiple_of=8)
         else:
             c = None
         t = DataLoader(
             self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size
         )
         e = DataLoader(self.eval_ds, collate_fn=c, batch_size=ps.eval_batch_size)
         self._loaders = {TRAIN: t, EVAL: e}
         if ps.do_test:
             p = DataLoader(self.test_ds, collate_fn=c, batch_size=ps.eval_batch_size)
             self._loaders[TEST] = p
     return self._loaders
Example No. 16
def get_data(
        model_args,
        training_args,
        tokenizer,
        text_data_path="../data/test_dataset"):  # change the path here: ../data/test_dataset
    """
    get data

    Args:
        model_args: model arguments
        training_args: training arguments
        tokenizer: tokenizer
        text_data_path: Defaults to "../data/test_dataset"

    Returns:
        text_data, val_iter, val_dataset, scores
    """
    text_data = load_from_disk(text_data_path)

    # run elasticsearch
    if "elastic" in model_args.retrieval_type:
        is_sentence_trainformer = False
        if "sentence_trainformer" in model_args.retrieval_type:
            is_sentence_trainformer = True
        # number of text to concat
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_elasticsearch(text_data, concat_num,
                                              model_args,
                                              is_sentence_trainformer)
    elif model_args.retrieval_type == "dense":
        concat_num = model_args.retrieval_elastic_num
        text_data, scores = run_concat_dense_retrival(text_data, concat_num)

    column_names = text_data["validation"].column_names

    data_collator = (DataCollatorWithPadding(
        tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None))
    # tokenize the data so it can be fed into the MRC model
    data_processor = DataProcessor(tokenizer)
    val_text = text_data["validation"]
    val_dataset = data_processor.val_tokenzier(val_text, column_names)
    val_iter = DataLoader(val_dataset, collate_fn=data_collator, batch_size=1)

    return text_data, val_iter, val_dataset, scores
Example No. 17
    def test_strategies_with_dataset(self, args=args):
        """
        (Constraint)
            - num_train_epoch 1
            - random seed 1
            - dataset fragment (rows : 100)
        (Caution)
            If an ERROR appears here, check the results of the higher-level unit tests first.
        """
        for seed, strategy in [(SEED, strategy) for strategy in strategies]:
            wandb.init(project="p-stage-3-test", reinit=True)
            args = update_args(args, strategy)
            args.strategy, args.seed = strategy, seed
            set_seed(seed)

            datasets = prepare_dataset(args, is_train=True)
            model, tokenizer = get_reader_model(args)
            train_dataset, post_processing_function = preprocess_dataset(
                args, datasets, tokenizer, is_train=True)

            train_dataset = train_dataset.select(range(100))  # select 100

            data_collator = DataCollatorWithPadding(
                tokenizer, pad_to_multiple_of=8 if args.train.fp16 else None)

            args.train.do_train = True
            args.train.run_name = "_".join(
                [strategy, args.alias, str(seed), "test"])
            wandb.run.name = args.train.run_name

            # TRAIN MRC
            args.train.num_train_epochs = 1.0  # fix epoch 1
            trainer = QuestionAnsweringTrainer(
                model=model,
                args=args.train,  # training_args
                custom_args=args,
                train_dataset=train_dataset,
                tokenizer=tokenizer,
                data_collator=data_collator,
                post_process_function=post_processing_function,
                compute_metrics=compute_metrics,
            )

            trainer.train()
Example No. 18
    def train(self, input_filepath, dataclass_args: TCTrainArgs):

        if not dataclass_args.load_preprocessed_data:
            self.logger.info("Preprocessing dataset...")
            contexts, labels = self._get_data(input_filepath)
            train_encodings = self.tokenizer(contexts, truncation=True, padding=True)
        else:
            self.logger.info("Loading dataset from %s...", dataclass_args.load_preprocessed_data_path)
            train_encodings, labels = self._get_preprocessed_data(dataclass_args.load_preprocessed_data_path)

        if dataclass_args.save_preprocessed_data:
            self.logger.info("Saving training dataset to %s...", dataclass_args.save_preprocessed_data_path)
            input_ids = train_encodings["input_ids"]
            attention_mask = train_encodings["attention_mask"]
            self._generate_json(dataclass_args.save_preprocessed_data_path, input_ids, attention_mask, labels, "train")

        train_dataset = TextClassificationDataset(train_encodings, labels)
        data_collator = DataCollatorWithPadding(self.tokenizer)
        self._run_train(train_dataset, dataclass_args, data_collator)
Example No. 19
    def __call__(self):
        if not (os.path.exists(self.folder)):
            logger.info(f"Download ToxicComments dataset")
            _download_extract_downstream_data(self.folder + "/train.tsv",
                                              proxies=None)

        def preprocess_function(examples):
            # TODO: move to a utility class
            def hot_encoding(labels: Optional[str]):
                label_ids = [0] * len(ToxicComments.labels())

                if labels is None:
                    return label_ids

                for l in labels.split(","):
                    if l != "":
                        label_ids[ToxicComments.labels().index(l)] = 1
                return label_ids

            examples["label"] = [hot_encoding(x) for x in examples["label"]]

            assert callable(self.tokenizer), "tokenizer is not callable"
            return self.tokenizer(
                examples["text"],
                truncation=True,
                return_token_type_ids=True,  # FIXME
                max_length=self.spec.max_seq_length)

        spec = KataSpec(remove_columns=["text"],
                        max_seq_length=self.spec.max_seq_length)
        collator = DataCollatorWithPadding(self.tokenizer,
                                           padding="max_length",
                                           max_length=self.spec.max_seq_length)
        return (TsvKata(spec,
                        preprocess_function,
                        collator,
                        "../data/toxic-comments/train.tsv",
                        quote_char='"'),
                TsvKata(spec,
                        preprocess_function,
                        collator,
                        "../data/toxic-comments/val.tsv",
                        quote_char='"'))
Example No. 20
 def loaders(self):
     if self._loaders is None:
         ps, mgr = self.params, self.mgr
         if ps.pad_to_max_length:
             c = default_data_collator
         else:
             c = DataCollatorWithPadding(
                 self.tokenizer, pad_to_multiple_of=(8 if mgr.use_fp16 else None)
             )
         t = DataLoader(
             self.train_ds, shuffle=True, collate_fn=c, batch_size=ps.train_batch_size
         )
         x = self.eval_ds.remove_columns(["example_id", "offset_mapping"])
         e = DataLoader(x, collate_fn=c, batch_size=ps.eval_batch_size)
         self._loaders = {TRAIN: t, EVAL: e}
         if ps.do_test:
             x = self.test_ds.remove_columns(["example_id", "offset_mapping"])
             p = DataLoader(x, collate_fn=c, batch_size=ps.eval_batch_size)
             self._loaders[TEST] = p
     return self._loaders
Example No. 21
    def eval(self, input_filepath, dataclass_args: TCEvalArgs):
        if not dataclass_args.load_preprocessed_data:
            self.logger.info("Preprocessing dataset...")
            contexts, labels = self._get_data(input_filepath)
            eval_encodings = self.tokenizer(contexts, truncation=True, padding=True)
        else:
            self.logger.info("Loading dataset from %s...", dataclass_args.load_preprocessed_data_path)
            eval_encodings, labels = self._get_preprocessed_data(dataclass_args.load_preprocessed_data_path)

        if dataclass_args.save_preprocessed_data:
            self.logger.info("Saving training dataset to %s...", dataclass_args.save_preprocessed_data_path)
            input_ids = eval_encodings["input_ids"]
            attention_mask = eval_encodings["attention_mask"]
            self._generate_json(dataclass_args.save_preprocessed_data_path, input_ids, attention_mask, labels, "train")


        eval_dataset = TextClassificationDataset(eval_encodings, labels)
        data_collator = DataCollatorWithPadding(self.tokenizer)

        result = self._run_eval(eval_dataset, data_collator, dataclass_args)
        return EvalResult(loss=result["eval_loss"])
Example No. 22
    def train(self, input_filepath, dataclass_args: QATrainArgs):
        """
        See docstring in HappyQuestionAnswering.train()
        """
        if dataclass_args.save_preprocessed_data:
            self.logger.info("Saving preprocessed data is currently "
                             "not available for question answering models. "
                             "It will be added soon. ")
        if dataclass_args.load_preprocessed_data:
            self.logger.info("Loading preprocessed data is currently "
                             "not available for question answering models. "
                             "It will be added soon. ")

        self.logger.info("Preprocessing dataset...")
        contexts, questions, answers = self._get_data(input_filepath)
        self.__add_end_idx(contexts, answers)
        encodings = self.tokenizer(contexts, questions, truncation=True, padding=True)
        self.__add_token_positions(encodings, answers)
        dataset = QuestionAnsweringDataset(encodings)
        data_collator = DataCollatorWithPadding(self.tokenizer)
        self._run_train(dataset, dataclass_args, data_collator)
Example No. 23
    def create_trainer(self):
        # Data collator
        # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
        # collator.
        training_args = self.training_args
        data_args = self.data_args

        data_collator = (default_data_collator if data_args.pad_to_max_length
                         else DataCollatorWithPadding(self.tokenizer))

        # TODO: Once the fix lands in a Datasets release, remove the _local here and the squad_v2_local folder.
        current_dir = os.path.sep.join(
            os.path.join(__file__).split(os.path.sep)[:-1])
        metric = load_metric(
            os.path.join(current_dir, "squad_v2_local") if data_args.
            version_2_with_negative else "squad")

        def compute_metrics(p: EvalPrediction):
            return metric.compute(predictions=p.predictions,
                                  references=p.label_ids)

        all_args = self.get_all_args(exclude_base=True)

        # Initialize our Trainer
        self.trainer = self.QA_TRAINER_CLASS(
            model=None,
            args=training_args,
            train_dataset=self.train_dataset
            if training_args.do_train else None,
            eval_dataset=self.validation_dataset
            if training_args.do_eval else None,
            eval_examples=self.datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            post_process_function=self._post_processing_function,
            compute_metrics=compute_metrics,
            model_init=self.model_init,
            **all_args,
        )
Example No. 24
def load_data():
    """Load IMDB data (training and eval)"""
    raw_datasets = load_dataset("imdb")
    raw_datasets = raw_datasets.shuffle(seed=42)

    # remove unnecessary data split
    del raw_datasets["unsupervised"]

    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True)

    # random 100 samples
    population = random.sample(range(len(raw_datasets["train"])), 100)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets["train"] = tokenized_datasets["train"].select(
        population)
    tokenized_datasets["test"] = tokenized_datasets["test"].select(population)

    tokenized_datasets = tokenized_datasets.remove_columns("text")
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainloader = DataLoader(
        tokenized_datasets["train"],
        shuffle=True,
        batch_size=32,
        collate_fn=data_collator,
    )

    testloader = DataLoader(tokenized_datasets["test"],
                            batch_size=32,
                            collate_fn=data_collator)

    return trainloader, testloader
Example No. 25
    def create_trainer(self):
        # Data collator
        # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
        # collator.
        training_args = self.training_args
        data_args = self.data_args

        data_collator = (default_data_collator if data_args.pad_to_max_length
                         else DataCollatorWithPadding(self.tokenizer))

        metric = load_metric(
            "squad_v2" if data_args.version_2_with_negative else "squad")

        def compute_metrics(p: EvalPrediction):
            return metric.compute(predictions=p.predictions,
                                  references=p.label_ids)

        all_args = self.get_all_args(exclude_base=True)

        # Initialize our Trainer
        self.trainer = self.QA_TRAINER_CLASS(
            model=None,
            args=training_args,
            train_dataset=self.train_dataset
            if training_args.do_train else None,
            eval_dataset=self.validation_dataset
            if training_args.do_eval else None,
            eval_examples=self.datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=self.tokenizer,
            data_collator=data_collator,
            post_process_function=self._post_processing_function,
            compute_metrics=compute_metrics,
            model_init=self.model_init,
            **all_args,
        )
Example No. 26
    def eval(self, input_filepath, dataclass_args: QAEvalArgs):
        """
        See docstring in HappyQuestionAnswering.eval()

        """
        if dataclass_args.save_preprocessed_data:
            self.logger.info("Saving preprocessed data is currently "
                             "not available for question answering models. "
                             "It will be added soon. ")
        if dataclass_args.load_preprocessed_data:
            self.logger.info("Loading preprocessed data is currently "
                             "not available for question answering models. "
                             "It will be added soon. ")

        contexts, questions, answers = self._get_data(input_filepath)

        self.__add_end_idx(contexts, answers)
        encodings = self.tokenizer(contexts, questions, truncation=True, padding=True)
        self.__add_token_positions(encodings, answers)
        eval_dataset = QuestionAnsweringDataset(encodings)
        data_collator = DataCollatorWithPadding(self.tokenizer)

        result = self._run_eval(eval_dataset, data_collator, dataclass_args)
        return EvalResult(loss=result["eval_loss"])
Example No. 27
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such column exists or the first two columns not named
    # label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: int(label_name_to_id[label_list[i]])
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result."
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1)
                               for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)
    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in datasets and "validation_matched" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation_matched" if data_args.task_name ==
                                "mnli" else "validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))

    if (training_args.do_predict or data_args.task_name is not None
            or data_args.test_file is not None):
        if "test" not in datasets and "test_matched" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test_matched" if data_args.task_name ==
                                "mnli" else "test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(
                range(data_args.max_test_samples))

    # Log a few random samples from the training set:
    if training_args.do_train:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(
                f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds,
                                                                  axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(
                    result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy":
                (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            # Check the config from that potential checkpoint has the right number of labels before using it as a
            # checkpoint.
            if AutoConfig.from_pretrained(
                    model_args.model_name_or_path).num_labels == num_labels:
                checkpoint = model_args.model_name_or_path

        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            metrics = trainer.evaluate(eval_dataset=eval_dataset)

            max_val_samples = (data_args.max_val_samples
                               if data_args.max_val_samples is not None else
                               len(eval_dataset))
            metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` column because it contains -1 and Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(
                test_dataset=test_dataset).predictions
            predictions = (np.squeeze(predictions) if is_regression else
                           np.argmax(predictions, axis=1))

            output_test_file = os.path.join(training_args.output_dir,
                                            f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")
Example No. 28
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        if args.test_file is not None:
            data_files["test"] = args.test_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    field="data")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name,
                                                  use_fast=True)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path,
                                                  use_fast=True)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForQuestionAnswering.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForQuestionAnswering.from_config(config)

    # Preprocessing the datasets.
    # Preprocessing is slightly different for training and evaluation.

    column_names = raw_datasets["train"].column_names

    question_column_name = "question" if "question" in column_names else column_names[
        0]
    context_column_name = "context" if "context" in column_names else column_names[
        1]
    answer_column_name = "answers" if "answers" in column_names else column_names[
        2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )

    max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right
                                                          else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else
                                                        0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[
                            token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(
                        token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(
                        token_end_index + 1)

        return tokenized_examples

    if "train" not in raw_datasets:
        raise ValueError("--do_train requires a train dataset")
    train_dataset = raw_datasets["train"]
    if args.max_train_samples is not None:
        # We will select samples from the whole dataset if the argument is specified.
        train_dataset = train_dataset.select(range(args.max_train_samples))
    # Create train feature from dataset
    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on train dataset",
    )
    if args.max_train_samples is not None:
        # The number of samples might increase during feature creation, so we select only the specified max samples.
        train_dataset = train_dataset.select(range(args.max_train_samples))

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace.
        examples[question_column_name] = [
            q.lstrip() for q in examples[question_column_name]
        ]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit with the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[
                question_column_name if pad_on_right else context_column_name],
            examples[
                context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans; this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(
                examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples

    if "validation" not in raw_datasets:
        raise ValueError("--do_eval requires a validation dataset")
    eval_examples = raw_datasets["validation"]
    if args.max_eval_samples is not None:
        # We will select samples from the whole dataset.
        eval_examples = eval_examples.select(range(args.max_eval_samples))
    # Validation Feature Creation
    eval_dataset = eval_examples.map(
        prepare_validation_features,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on validation dataset",
    )

    if args.max_eval_samples is not None:
        # During feature creation the number of samples might increase, so we select only the required samples again.
        eval_dataset = eval_dataset.select(range(args.max_eval_samples))

    if args.do_predict:
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_examples = raw_datasets["test"]
        if args.max_predict_samples is not None:
            # We will select samples from the whole dataset.
            predict_examples = predict_examples.select(
                range(args.max_predict_samples))
        # Predict Feature Creation
        predict_dataset = predict_examples.map(
            prepare_validation_features,
            batched=True,
            num_proc=args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not args.overwrite_cache,
            desc="Running tokenizer on prediction dataset",
        )
        if args.max_predict_samples is not None:
            # During feature creation the number of samples might increase, so we select only the required samples again.
            predict_dataset = predict_dataset.select(
                range(args.max_predict_samples))

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)

    eval_dataset_for_model = eval_dataset.remove_columns(
        ["example_id", "offset_mapping"])
    eval_dataloader = DataLoader(eval_dataset_for_model,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    if args.do_predict:
        predict_dataset_for_model = predict_dataset.remove_columns(
            ["example_id", "offset_mapping"])
        predict_dataloader = DataLoader(
            predict_dataset_for_model,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size)

    # Post-processing:
    def post_processing_function(examples,
                                 features,
                                 predictions,
                                 stage="eval"):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            version_2_with_negative=args.version_2_with_negative,
            n_best_size=args.n_best_size,
            max_answer_length=args.max_answer_length,
            null_score_diff_threshold=args.null_score_diff_threshold,
            output_dir=args.output_dir,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
        if args.version_2_with_negative:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v,
                "no_answer_probability": 0.0
            } for k, v in predictions.items()]
        else:
            formatted_predictions = [{
                "id": k,
                "prediction_text": v
            } for k, v in predictions.items()]

        references = [{
            "id": ex["id"],
            "answers": ex[answer_column_name]
        } for ex in examples]
        return EvalPrediction(predictions=formatted_predictions,
                              label_ids=references)

    metric = load_metric(
        "squad_v2" if args.version_2_with_negative else "squad")

    # Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor
    def create_and_fill_np_array(start_or_end_logits, dataset, max_len):
        """
        Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor

        Args:
            start_or_end_logits(:obj:`tensor`):
                The output predictions of the model. We can only pass either the start or the end logits.
            dataset: The dataset the logits were computed for.
            max_len(:obj:`int`):
                The maximum length of the output tensor (see the evaluation loop for more details).
        """

        step = 0
        # create a numpy array and fill it with -100.
        logits_concat = np.full((len(dataset), max_len),
                                -100,
                                dtype=np.float64)
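        # Hypothetical example: with 13 eval features and gathered logit batches of shape (8, 384) and (5, 300),
        # logits_concat has shape (13, max_len) and any position not overwritten below keeps the -100 fill value.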
        # Now that we have created the array, we populate it with the outputs gathered using accelerator.gather.
        for i, output_logit in enumerate(
                start_or_end_logits):  # populate columns
            # Copy the whole gathered batch of logits into the corresponding rows of the array,
            # then advance the step by the batch size after every iteration.

            batch_size = output_logit.shape[0]
            cols = output_logit.shape[1]

            if step + batch_size < len(dataset):
                logits_concat[step:step + batch_size, :cols] = output_logit
            else:
                logits_concat[step:, :cols] = output_logit[:len(dataset) -
                                                           step]

            step += batch_size

        return logits_concat

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will
    # be shorter when using multiple processes)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
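            # Only run an optimizer/scheduler step every `gradient_accumulation_steps` batches (and on the last
            # batch of the epoch); the loss above has already been scaled down by the same factor.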
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

    # Evaluation
    logger.info("***** Running Evaluation *****")
    logger.info(f"  Num examples = {len(eval_dataset)}")
    logger.info(f"  Batch size = {args.per_device_eval_batch_size}")

    all_start_logits = []
    all_end_logits = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits

            if not args.pad_to_max_length:  # necessary to pad predictions and labels so that they can be gathered
                start_logits = accelerator.pad_across_processes(start_logits,
                                                                dim=1,
                                                                pad_index=-100)
                end_logits = accelerator.pad_across_processes(end_logits,
                                                              dim=1,
                                                              pad_index=-100)

            all_start_logits.append(
                accelerator.gather(start_logits).cpu().numpy())
            all_end_logits.append(accelerator.gather(end_logits).cpu().numpy())

    max_len = max([x.shape[1] for x in all_start_logits
                   ])  # Get the max_length of the tensor

    # concatenate the numpy array
    start_logits_concat = create_and_fill_np_array(all_start_logits,
                                                   eval_dataset, max_len)
    end_logits_concat = create_and_fill_np_array(all_end_logits, eval_dataset,
                                                 max_len)

    # delete the list of numpy arrays
    del all_start_logits
    del all_end_logits

    outputs_numpy = (start_logits_concat, end_logits_concat)
    prediction = post_processing_function(eval_examples, eval_dataset,
                                          outputs_numpy)
    eval_metric = metric.compute(predictions=prediction.predictions,
                                 references=prediction.label_ids)
    logger.info(f"Evaluation metrics: {eval_metric}")

    # Prediction
    if args.do_predict:
        logger.info("***** Running Prediction *****")
        logger.info(f"  Num examples = {len(predict_dataset)}")
        logger.info(f"  Batch size = {args.per_device_eval_batch_size}")

        all_start_logits = []
        all_end_logits = []
        for step, batch in enumerate(predict_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

                if not args.pad_to_max_length:  # necessary to pad predictions and labels so that they can be gathered
                    start_logits = accelerator.pad_across_processes(
                        start_logits, dim=1, pad_index=-100)
                    end_logits = accelerator.pad_across_processes(
                        end_logits, dim=1, pad_index=-100)

                all_start_logits.append(
                    accelerator.gather(start_logits).cpu().numpy())
                all_end_logits.append(
                    accelerator.gather(end_logits).cpu().numpy())

        max_len = max([x.shape[1] for x in all_start_logits
                       ])  # Get the max_length of the tensor
        # concatenate the numpy array
        start_logits_concat = create_and_fill_np_array(all_start_logits,
                                                       predict_dataset,
                                                       max_len)
        end_logits_concat = create_and_fill_np_array(all_end_logits,
                                                     predict_dataset, max_len)

        # delete the list of numpy arrays
        del all_start_logits
        del all_end_logits

        outputs_numpy = (start_logits_concat, end_logits_concat)
        prediction = post_processing_function(predict_examples,
                                              predict_dataset, outputs_numpy)
        predict_metric = metric.compute(predictions=prediction.predictions,
                                        references=prediction.label_ids)
        logger.info(f"Predict metrics: {predict_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
Ejemplo n.º 29
0
def finetune(accelerator, model_name_or_path, train_file, output_dir,
             **kwargs):
    """Fine-tuning a pre-trained model on a downstream task.

  Args:
    accelerator: An instance of an accelerator for distributed training (on
      multi-GPU, TPU) or mixed precision training.
    model_name_or_path: Path to pretrained model or model identifier from
      huggingface.co/models.
    train_file: A csv or a json file containing the training data.
    output_dir: The output directory where the model predictions and checkpoints
      will be written.
    **kwargs: Dictionary of key/value pairs with which to update the
      configuration object after loading. The values in kwargs of any keys which
      are configuration attributes will be used to override the loaded values.
  """
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the
    # screen. accelerator.is_local_main_process is only True for one process per
    # machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)

    model_args = FTModelArguments(model_name_or_path=model_name_or_path)
    data_args = FTDataArguments(train_file=train_file)
    training_args = FTTrainingArguments(output_dir=output_dir)
    args = argparse.Namespace()

    for arg_class in (model_args, data_args, training_args):
        for key, value in vars(arg_class).items():
            setattr(args, key, value)

    for key, value in kwargs.items():
        if hasattr(args, key):
            setattr(args, key, value)

    # Sanity checks
    data_files = {}
    args.data_file_extension = None

    # You need to provide the training data as we always run training
    args.do_train = True
    assert args.train_file is not None
    data_files[Split.TRAIN.value] = args.train_file

    if args.do_eval or args.evaluation_strategy != IntervalStrategy.NO.value:
        assert args.eval_file is not None
        data_files[Split.EVAL.value] = args.eval_file

    if args.do_eval and args.test_file is not None:
        data_files[Split.TEST.value] = args.test_file

    if args.do_predict:
        assert args.infer_file is not None
        data_files[Split.INFER.value] = args.infer_file

    for key in data_files:
        extension = data_files[key].split('.')[-1]
        assert extension in ['csv', 'json'
                             ], f'`{key}_file` should be a csv or a json file.'
        if args.data_file_extension is None:
            args.data_file_extension = extension
        else:
            assert (
                extension == args.data_file_extension
            ), f'`{key}_file` should be a {args.data_file_extension} file`.'

    assert (
        args.eval_metric in datasets.list_metrics()
    ), f'{args.eval_metric} not in the list of supported metrics {datasets.list_metrics()}.'

    # Handle the output directory creation
    if accelerator.is_main_process:
        if args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # You need to provide your CSV/JSON data files.
    #
    # For CSV/JSON files, this script will use as labels the column called 'label'
    # and as pair of sentences the sentences in columns called 'sentence1' and
    # 'sentence2' if these columns exist or the first two columns not named
    # 'label' if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single
    # sentence classification on this single column.
    #
    # In distributed training, the load_dataset function guarantees that only one
    # local process can download the dataset.

    # Loading the dataset from local csv or json files.
    raw_datasets = load_dataset(args.data_file_extension,
                                data_files=data_files)

    # Labels
    is_regression = raw_datasets[
        Split.TRAIN.value].features['label'].dtype in ['float32', 'float64']
    args.is_regression = is_regression

    if args.is_regression:
        label_list = None
        num_labels = 1
    else:
        label_list = args.label_list
        assert label_list is not None
        label_list.sort()  # Let's sort it for determinism
        num_labels = len(label_list)
    args.num_labels = num_labels

    # Load pre-trained model
    config, tokenizer, model = load_from_pretrained(args,
                                                    args.model_name_or_path)

    # Preprocessing the datasets
    non_label_column_names = [
        name for name in raw_datasets[Split.TRAIN.value].column_names
        if name != 'label'
    ]
    if 'sentence1' in non_label_column_names and 'sentence2' in non_label_column_names:
        sentence1_key, sentence2_key = 'sentence1', 'sentence2'
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    label_to_id = {v: i for i, v in enumerate(label_list)}
    config.label2id = label_to_id
    config.id2label = {id: label for label, id in config.label2id.items()}
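    # Hypothetical example: with label_list == ['negative', 'positive'], label_to_id becomes
    # {'negative': 0, 'positive': 1} and config.id2label becomes {0: 'negative', 1: 'positive'}.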
    padding = 'max_length' if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Tokenize the texts
        texts = ((examples[sentence1_key], ) if sentence2_key is None else
                 (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*texts,
                           padding=padding,
                           max_length=args.max_length,
                           truncation=True)

        if 'label' in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks)
                result['labels'] = [label_to_id[l] for l in examples['label']]
            else:
                # In all cases, rename the column to labels because the model will
                # expect that.
                result['labels'] = examples['label']
        return result

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=raw_datasets[Split.TRAIN.value].column_names,
            desc='Running tokenizer on dataset',
        )

    num_examples = {}
    splits = [s.value for s in Split]
    for split in splits:
        if split in processed_datasets:
            num_examples[split] = len(processed_datasets[split])
    args.num_examples = num_examples

    train_dataset = processed_datasets[Split.TRAIN.value]
    eval_dataset = processed_datasets[
        Split.EVAL.value] if Split.EVAL.value in processed_datasets else None
    test_dataset = processed_datasets[
        Split.TEST.value] if Split.TEST.value in processed_datasets else None
    infer_dataset = processed_datasets[
        Split.INFER.value] if Split.INFER.value in processed_datasets else None

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info('Sample %d of the training set: %s.', index,
                    train_dataset[index])

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data
        # collator that will just convert everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by
        # padding to the maximum length of the samples passed). When using mixed
        # precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple of
        # 8s, which will enable the use of Tensor Cores on NVIDIA hardware with
        # compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=args.per_device_train_batch_size,
        shuffle=True,
        collate_fn=data_collator,
    )
    eval_dataloader, test_dataloader, infer_dataloader = None, None, None

    if eval_dataset is not None:
        eval_dataloader = DataLoader(
            eval_dataset,
            batch_size=args.per_device_eval_batch_size,
            collate_fn=data_collator)

    if test_dataset is not None:
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=args.per_device_eval_batch_size,
            collate_fn=data_collator)

    if infer_dataset is not None:
        infer_dataloader = DataLoader(
            infer_dataset,
            batch_size=args.per_device_eval_batch_size,
            collate_fn=data_collator)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay,
        },
        {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, test_dataloader, infer_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, test_dataloader,
        infer_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its
    # length below (because its length will be shorter when using multiple processes)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_steps == -1:
        args.max_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=args.max_steps,
    )

    # Train
    completed_steps, avg_train_loss = train(args, accelerator, model,
                                            tokenizer, train_dataloader,
                                            optimizer, lr_scheduler,
                                            eval_dataloader)
    accelerator.wait_for_everyone()
    logger.info(
        'Training job completed: completed_steps = %d, avg_train_loss = %f',
        completed_steps, avg_train_loss)

    args.model_name_or_path = os.path.join(args.output_dir, 'best-checkpoint')
    logger.info('Loading the best checkpoint: %s', args.model_name_or_path)
    config, tokenizer, model = load_from_pretrained(args,
                                                    args.model_name_or_path)
    model = accelerator.prepare(model)

    if args.do_eval:
        # Evaluate
        if eval_dataloader is not None:
            logger.info(
                '***** Running evaluation on the eval data using the best checkpoint *****'
            )
            eval_results = evaluate(args, accelerator, eval_dataloader,
                                    Split.EVAL.value, model, 'best-checkpoint')
            avg_eval_loss = eval_results['avg_eval_loss']
            eval_metric = eval_results[args.eval_metric]
            logger.info('Evaluation job completed: avg_eval_loss = %f',
                        avg_eval_loss)
            logger.info('Evaluation result for the best checkpoint: %s = %f',
                        args.eval_metric, eval_metric)

        if test_dataloader is not None:
            logger.info(
                '***** Running evaluation on the test data using the best checkpoint *****'
            )
            eval_results = evaluate(args, accelerator, test_dataloader,
                                    Split.TEST.value, model, 'best-checkpoint')
            avg_eval_loss = eval_results['avg_eval_loss']
            eval_metric = eval_results[args.eval_metric]
            logger.info('Test job completed: avg_test_loss = %f',
                        avg_eval_loss)
            logger.info('Test result for the best checkpoint: %s = %f',
                        args.eval_metric, eval_metric)

    if args.do_predict:
        # Predict
        if infer_dataloader is not None:
            logger.info(
                '***** Running inference using the best checkpoint *****')
            evaluate(args,
                     accelerator,
                     infer_dataloader,
                     Split.INFER.value,
                     model,
                     'best-checkpoint',
                     has_labels=False)
            logger.info('Inference job completed.')

    # Release all references to the internal objects stored and call the garbage
    # collector. You should call this method between two trainings with different
    # models/optimizers.
    accelerator.free_memory()
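# A minimal sketch of the weight-decay parameter grouping used when building the optimizer in these examples:
# parameters whose names contain "bias" or "LayerNorm.weight" get weight_decay 0.0, everything else gets the
# configured value. The tiny model and the decay/learning-rate values below are illustrative assumptions.
import torch
from torch.optim import AdamW

sketch_model = torch.nn.Linear(4, 2)  # stand-in for the transformer model
no_decay = ["bias", "LayerNorm.weight"]
sketch_grouped_parameters = [
    {
        "params": [p for n, p in sketch_model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        "weight_decay": 0.01,
    },
    {
        "params": [p for n, p in sketch_model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
sketch_optimizer = AdamW(sketch_grouped_parameters, lr=3e-5)
print([len(g["params"]) for g in sketch_optimizer.param_groups])  # [1, 1]: the weight vs. the bias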
Ejemplo n.º 30
0
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such columns exist or the first two columns not named
    # label if at least two columns are provided.

    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset("glue", args.task_name)
    else:
        # Loading the dataset from local csv or json file.
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = (args.train_file if args.train_file is not None else
                     args.validation_file).split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if args.task_name is not None:
        is_regression = args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = raw_datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(args.model_name_or_path,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
    )

    # Preprocessing the datasets
    if args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in raw_datasets["train"].column_names
            if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id !=
            PretrainedConfig(num_labels=num_labels).label2id
            and args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v
            for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            logger.info(
                f"The configuration of the model provided the following label correspondence: {label_name_to_id}. "
                "Using it!")
            label_to_id = {
                i: label_name_to_id[label_list[i]]
                for i in range(num_labels)
            }
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
    elif args.task_name is None:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        # Tokenize the texts
        texts = ((examples[sentence1_key], ) if sentence2_key is None else
                 (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*texts,
                           padding=padding,
                           max_length=args.max_length,
                           truncation=True)

        if "label" in examples:
            if label_to_id is not None:
                # Map labels to IDs (not necessary for GLUE tasks)
                result["labels"] = [label_to_id[l] for l in examples["label"]]
            else:
                # In all cases, rename the column to labels because the model will expect that.
                result["labels"] = examples["label"]
        return result

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name ==
                                      "mnli" else "validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorWithPadding(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter when using multiple processes)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Get the metric function
    if args.task_name is not None:
        metric = load_metric("glue", args.task_name)

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)

    if args.task_name == "mnli":
        # Final evaluation on mismatched validation set
        eval_dataset = processed_datasets["validation_mismatched"]
        eval_dataloader = DataLoader(
            eval_dataset,
            collate_fn=data_collator,
            batch_size=args.per_device_eval_batch_size)
        eval_dataloader = accelerator.prepare(eval_dataloader)

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        logger.info(f"mnli-mm: {eval_metric}")