Example no. 1
def load_model_config_tokenizer(args):
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )
    if args.model_name_or_path:
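        # A ".ckpt" path indicates a TensorFlow checkpoint, so from_tf=True tells from_pretrained to convert it on load.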
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)
    return model, config, tokenizer
Example no. 2
def load_reranker(model_name_or_path):
    logger.info(f'Loading model from: {model_name_or_path}')
    config = AutoConfig.from_pretrained(model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                              do_lower_case=True)
    model = AutoModelForMultipleChoice.from_pretrained(
        model_name_or_path,
        from_tf=bool(".ckpt" in model_name_or_path),
        config=config,
    )
    model = model.eval()
    return model, tokenizer
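Example no. 3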
    def load(self, fname=None):
        if fname is not None:
            self.load_path = fname

        if self.pretrained_bert:
            log.info(f"From pretrained {self.pretrained_bert}.")
            config = AutoConfig.from_pretrained(self.pretrained_bert, num_labels=self.n_classes, 
                                                output_attentions=False, output_hidden_states=False)

            self.model = AutoModelForMultipleChoice.from_pretrained(self.pretrained_bert, config=config)

        elif self.bert_config_file and Path(self.bert_config_file).is_file():
            # from_pretrained also accepts a path to a local config JSON file.
            self.bert_config = AutoConfig.from_pretrained(str(expand_path(self.bert_config_file)))
            if self.attention_probs_keep_prob is not None:
                self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob
            if self.hidden_keep_prob is not None:
                self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob
            self.model = AutoModelForMultipleChoice.from_config(config=self.bert_config)
        else:
            raise ConfigError("No pre-trained BERT model is given.")

        self.model.to(self.device)

        self.optimizer = getattr(torch.optim, self.optimizer_name)(
            self.model.parameters(), **self.optimizer_parameters)
        if self.lr_scheduler_name is not None:
            self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)(
                self.optimizer, **self.lr_scheduler_parameters)

        if self.load_path:
            log.info(f"Load path {self.load_path} is given.")
            if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir():
                raise ConfigError("Provided load path is incorrect!")

            weights_path = Path(self.load_path.resolve())
            weights_path = weights_path.with_suffix(".pth.tar")
            if weights_path.exists():
                log.info(f"Load path {weights_path} exists.")
                log.info(f"Initializing `{self.__class__.__name__}` from saved.")

                # now load the weights, optimizer from saved
                log.info(f"Loading weights from {weights_path}.")
                checkpoint = torch.load(weights_path, map_location=self.device)
                self.model.load_state_dict(checkpoint["model_state_dict"])
                self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
                self.epochs_done = checkpoint.get("epochs_done", 0)
            else:
                log.info(f"Init from scratch. Load path {weights_path} does not exist.")
Example no. 4
def download(model_name, cache_dir):
    model = AutoModelForMultipleChoice.from_pretrained(
        model_name,
        force_download=True,
        cache_dir=cache_dir,
    )
    config = AutoConfig.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        force_download=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        cache_dir=cache_dir,
        force_download=True,
    )
    return model, config, tokenizer
Example no. 5
    def get_this_model(task, model_config):
        from transformers import AutoModelForSequenceClassification
        from transformers import AutoModelForSeq2SeqLM
        from transformers import AutoModelForMultipleChoice
        from transformers import AutoModelForTokenClassification

        if task in (SEQCLASSIFICATION, SEQREGRESSION):
            return AutoModelForSequenceClassification.from_pretrained(
                checkpoint_path, config=model_config)
        elif task == TOKENCLASSIFICATION:
            return AutoModelForTokenClassification.from_pretrained(
                checkpoint_path, config=model_config)
        elif task in NLG_TASKS:
            return AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path,
                                                         config=model_config)
        elif task == MULTICHOICECLASSIFICATION:
            return AutoModelForMultipleChoice.from_pretrained(
                checkpoint_path, config=model_config)
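Example no. 6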
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, AllTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    checkpoint_dir = hyperparam_path_for_two_stage_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.train_extensive_evidence_selector or training_args.train_intensive_evidence_selector:
        postfix += "_train"
    else:
        postfix += "_eval"
    setup_root_logger(ckpt_dir,
                      training_args.local_rank,
                      debug=False,
                      postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Model parameters %s", model_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    data_files = {
        'train':
        data_args.train_file if data_args.train_file is not None else None,
        'validation':
        data_args.validation_file
        if data_args.validation_file is not None else None,
        'test':
        data_args.test_file if data_args.test_file is not None else None
    }

    # datasets = load_dataset(data_args.dataload_script, data_args.dataload_split,
    #                         data_files=data_files if data_files['train'] is not None else None,
    #                         data_dir=data_args.data_dir,
    #                         split={'train': ReadInstruction('train', from_=0, to=5, unit='abs'),
    #                                'validation': ReadInstruction('validation', from_=0, to=5, unit='abs'),
    #                                'test': ReadInstruction('test', from_=0, to=5, unit='abs')})
    datasets = load_dataset(
        data_args.dataload_script,
        data_args.dataload_split,
        data_files=data_files if data_files['train'] is not None else None,
        data_dir=data_args.data_dir)

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    extensive_evidence_selector_path = model_args.extensive_evidence_selector_path \
        if model_args.extensive_evidence_selector_path else model_args.model_name_or_path
    intensive_evidence_selector_path = model_args.intensive_evidence_selector_path \
        if model_args.intensive_evidence_selector_path else model_args.model_name_or_path
    evidence_reader_path = model_args.evidence_reader_path \
        if model_args.evidence_reader_path else model_args.model_name_or_path
    answer_verifier_path = model_args.answer_verifier_path \
        if model_args.answer_verifier_path else model_args.model_name_or_path

    extensive_selector_config = AutoConfig.from_pretrained(
        extensive_evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    intensive_selector_config = AutoConfig.from_pretrained(
        intensive_evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader_config = AutoConfig.from_pretrained(
        evidence_reader_path,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier_config = AutoConfig.from_pretrained(
        answer_verifier_path,
        cache_dir=model_args.cache_dir,
    )

    extensive_evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        extensive_evidence_selector_path,
        config=extensive_selector_config,
        cache_dir=model_args.cache_dir,
    )
    intensive_evidence_selector = AutoModelForMultipleChoice.from_pretrained(
        intensive_evidence_selector_path,
        config=intensive_selector_config,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        evidence_reader_path,
        config=evidence_reader_config,
        cache_dir=model_args.cache_dir,
    )
    if model_args.verifier_type == "classification":
        answer_verifier = AutoModelForSequenceClassification.from_pretrained(
            answer_verifier_path,
            config=answer_verifier_config,
            cache_dir=model_args.cache_dir,
        )
    elif model_args.verifier_type == "multi_choice":
        answer_verifier = AutoModelForMultipleChoice.from_pretrained(
            answer_verifier_path,
            config=answer_verifier_config,
            cache_dir=model_args.cache_dir,
        )

    if training_args.train_extensive_evidence_selector:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    pprepare_features_for_initializing_evidence_selector = partial(
        prepare_features_for_initializing_extensive_evidence_selector,
        evidence_sampling_num=data_args.evidence_sampling_num,
        tokenizer=tokenizer,
        data_args=data_args,
        pseudo_label_path=data_args.pseudo_label_path)

    pprepare_features_for_generating_optionwise_evidence = partial(
        prepare_features_for_generating_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_reading_optionwise_evidence = partial(
        prepare_features_for_reading_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_intensive_evidence_selector = partial(
        prepare_features_for_intensive_evidence_selector,
        evidence_len=data_args.intensive_evidence_len,
        train_intensive_selector_with_option=data_args.
        train_intensive_selector_with_option,
        train_intensive_selector_with_non_overlapping_evidence=data_args.
        train_intensive_selector_with_non_overlapping_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_multiple_choice = partial(prepare_features,
                                                    tokenizer=tokenizer,
                                                    data_args=data_args)

    if model_args.verifier_type == "classification":
        pprepare_features_for_training_answer_verifier = partial(
            prepare_features_for_training_answer_verifier,
            evidence_len=data_args.verifier_evidence_len,
            train_answer_verifier_with_option=data_args.
            train_answer_verifier_with_option,
            downsampling=data_args.train_verifier_with_downsampling,
            tokenizer=tokenizer,
            data_args=data_args)
    elif model_args.verifier_type == "multi_choice":
        pprepare_features_for_training_answer_verifier = partial(
            prepare_features_for_training_mc_style_answer_verifier,
            evidence_len=data_args.verifier_evidence_len,
            tokenizer=tokenizer,
            data_args=data_args)

    extensive_trainer = Trainer(
        model=extensive_evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    intensive_trainer = Trainer(
        model=intensive_evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    mc_trainer = Trainer(
        model=evidence_reader,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    verifier_trainer = Trainer(
        model=answer_verifier,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer) if model_args.verifier_type
        == "classification" else DataCollatorForMultipleChoice(
            tokenizer=tokenizer),
        compute_metrics=compute_classification_metrics if
        model_args.verifier_type == "classification" else compute_mc_metrics,
    )

    if training_args.train_answer_verifier or training_args.eval_intensive_evidence_selector or training_args.eval_answer_verifier:
        multiple_choice_datasets = {
            k: datasets[k].map(
                pprepare_features_for_multiple_choice,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
        }

    if training_args.train_extensive_evidence_selector or training_args.eval_extensive_evidence_selector:
        train_extensive_evidence_selector_datasets = {
            k: datasets[k].map(
                pprepare_features_for_initializing_evidence_selector,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_extensive_evidence_selector
        }

    if training_args.train_extensive_evidence_selector:
        extensive_trainer.train_dataset = train_extensive_evidence_selector_datasets[
            "train"]
        extensive_trainer.eval_dataset = train_extensive_evidence_selector_datasets[
            "validation"]
        train_result = extensive_trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Extensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

    # generate extensive evidence logits
    if training_args.train_intensive_evidence_selector or training_args.train_answer_verifier:
        extensive_evidence_logits = {
            k: extensive_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items()
        }
    elif training_args.eval_intensive_evidence_selector or training_args.eval_answer_verifier:
        extensive_evidence_logits = {
            k: extensive_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items() if k != "train"
        }

    # prepare features for intensive evidence selector
    if training_args.train_intensive_evidence_selector or training_args.eval_intensive_evidence_selector \
            or training_args.eval_answer_verifier:
        train_intensive_evidence_selector_datasets = {}
        extensive_evidence_sentences = {}
        for split in datasets.keys():
            if not training_args.train_intensive_evidence_selector and split == 'train':
                continue
            intensive_dataset = datasets[split].map(
                partial(pprepare_features_for_intensive_evidence_selector,
                        evidence_logits=extensive_evidence_logits[split]),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )

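            # For each example id, pair every evidence sentence with its logit, option by option,
            # so the selected evidence can be inspected or dumped later.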
            evidence_sentences = {
                eid: [[(logit, sent)
                       for sent, logit in zip(option_sents, option_logits)]
                      for option_sents, option_logits in zip(
                          evidence_sent, evidence_logit)]
                for eid, evidence_sent, evidence_logit in zip(
                    intensive_dataset['example_ids'],
                    intensive_dataset['evidence_sentence'],
                    intensive_dataset['evidence_logit'])
            }
            train_intensive_evidence_selector_datasets[
                split] = intensive_dataset.remove_columns(
                    ["evidence_sentence", "evidence_logit"])
            extensive_evidence_sentences[split] = evidence_sentences

    # prepare features for answer verifier
    if training_args.train_answer_verifier or training_args.eval_answer_verifier:
        mc_label_dict = {
            split: {
                example['example_ids']: example['label']
                for example in multiple_choice_datasets[split]
            }
            for split in datasets.keys()
            if split != "train" or training_args.train_answer_verifier
        }
        reader_output = {
            split: mc_trainer.evaluate(multiple_choice_datasets[split])
            for split in datasets.keys()
            if split != "train" or training_args.train_answer_verifier
        }
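        # The custom trainer's evaluate() is assumed to return (predictions, label_ids, example_ids, metrics);
        # drop the trailing metrics and key the reader logits by example id.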
        answer_logits = {
            split: {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(
                    *reader_output[split][:-1])
            }
            for split in datasets.keys() if split != "train"
        }
        if data_args.answer_logits_path:
            logger.info(
                f"loading answer logits from {data_args.answer_logits_path}")
            with open(data_args.answer_logits_path) as f:
                trainset_answer_logits = json.load(f)
            answer_logits['train'] = trainset_answer_logits

        train_answer_verifier_datasets = {
            k: datasets[k].map(
                partial(pprepare_features_for_training_answer_verifier,
                        answer_logits=answer_logits[k],
                        evidence_logits=extensive_evidence_logits[k],
                        is_training=(k == "train")),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_answer_verifier
        }
        if training_args.train_answer_verifier:
            logger.info(
                f"total {sum(train_answer_verifier_datasets['train']['label'])} positive example for training verifier"
            )

    if training_args.train_intensive_evidence_selector:
        intensive_trainer.train_dataset = train_intensive_evidence_selector_datasets[
            "train"]
        intensive_trainer.eval_dataset = train_intensive_evidence_selector_datasets[
            "validation"]

        train_result = intensive_trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "a+") as writer:
            logger.info("***** Intensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

    if training_args.train_answer_verifier:
        verifier_trainer.train_dataset = train_answer_verifier_datasets[
            "train"]
        verifier_trainer.eval_dataset = train_answer_verifier_datasets[
            "validation"]

        train_result = verifier_trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "a+") as writer:
            logger.info("***** Intensive Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

    # Evaluation
    # To use the best checkpoint model at the end, use the arguments
    # load_best_model_at_end, metric_for_best_model, evaluation_strategy steps
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \

    if training_args.eval_extensive_evidence_selector:

        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = extensive_trainer.evaluate(
                train_extensive_evidence_selector_datasets[split]).metrics
            fulleval_results, all_evidence_sentences = extensive_trainer.evaluate_extensive_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                eval_dataset=datasets[split],
                feature_func_for_evidence_reading=
                pprepare_features_for_reading_optionwise_evidence,
                feature_func_for_evidence_generating=
                pprepare_features_for_generating_optionwise_evidence)

            metrics = {**results, **fulleval_results}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            output_evidence_file = os.path.join(training_args.output_dir,
                                                f"{split}_evidence.json")
            with open(output_evidence_file, "w") as f:
                json.dump(all_evidence_sentences, f)

    if training_args.eval_intensive_evidence_selector:

        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            metrics, _ = intensive_trainer.evaluate_intensive_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                multiple_choice_dataset=multiple_choice_datasets[split],
                intensive_selector_dataset=
                train_intensive_evidence_selector_datasets[split])
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    if training_args.eval_answer_verifier:

        selector_output = {
            k: intensive_trainer.evaluate(
                train_intensive_evidence_selector_datasets[k])
            for k in datasets.keys() if k != "train"
        }
        selector_logits = {
            k: {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(
                    *selector_output[k][:-1])
            }
            for k in datasets.keys() if k != "train"
        }

        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = verifier_trainer.evaluate(
                train_answer_verifier_datasets[split])
            verifier_logits = {
                example_id: prediction.tolist()
                for prediction, label_id, example_id in zip(*results[:-1])
            }
            metrics = results.metrics

            if model_args.verifier_type == "classification":
                if split == 'validation':
                    fulleval_metrics = evaluate_verifier_with_reader_and_iselector(
                        reader_logits=answer_logits[split],
                        selector_logits=selector_logits[split],
                        verifier_logits=verifier_logits,
                        label_dict=mc_label_dict[split])
                    val_verify_thresholds = {
                        k: v
                        for k, v in fulleval_metrics.items() if "thresh" in k
                    }
                else:
                    fulleval_metrics = evaluate_verifier_with_reader_and_iselector(
                        reader_logits=answer_logits[split],
                        selector_logits=selector_logits[split],
                        verifier_logits=verifier_logits,
                        label_dict=mc_label_dict[split],
                        threshold=val_verify_thresholds)
            else:
                fulleval_metrics = evaluate_mc_style_verifier_with_reader_and_iselector(
                    reader_logits=answer_logits[split],
                    selector_logits=selector_logits[split],
                    verifier_logits=verifier_logits,
                    label_dict=mc_label_dict[split])

            metrics = {**metrics, **fulleval_metrics}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Verifier Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            verifier_prediction = {
                'verifier_logits': verifier_logits,
                'reader_logits': answer_logits[split],
                'selector_logits': selector_logits[split]
            }
            output_prediction_file = os.path.join(
                training_args.output_dir, f"{split}_verifier_prediction.json")
            with open(output_prediction_file, "w") as f:
                json.dump(verifier_prediction, f)
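Example no. 7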
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MetaTrainingArguments))
    model_args, data_args, training_args, metatraining_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    # BertForMultipleChoice
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    s1_train_dataset = (
        MetaMultipleChoiceDataset(
            data_dir=os.path.join(data_args.data_dir, 'swag'),
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
            num_task=20,
            k_support=5,
            k_query=1,
        )
        if training_args.do_train
        else None
    )

    # s2_train_dataset = (
    #     MetaMultipleChoiceDataset(
    #         data_dir=os.path.join(data_args.data_dir, 'ComVE_A'),
    #         tokenizer=tokenizer,
    #         task=data_args.task_name,
    #         max_seq_length=data_args.max_seq_length,
    #         overwrite_cache=data_args.overwrite_cache,
    #         mode=Split.train,
            # num_task=100,
            # k_support=5,
            # k_query=1,
    #     )
    #     if training_args.do_train
    #     else None
    # )

    # s3_train_dataset = (
    #     MetaMultipleChoiceDataset(
    #         data_dir=os.path.join(data_args.data_dir, 'ComVE_B'),
    #         tokenizer=tokenizer,
    #         task=data_args.task_name,
    #         max_seq_length=data_args.max_seq_length,
    #         overwrite_cache=data_args.overwrite_cache,
    #         mode=Split.train,
            # num_task=100,
            # k_support=5,
            # k_query=1,
    #     )
    #     if training_args.do_train
    #     else None
    # )
    # s1_train_dataset = (
    #     MultipleChoiceDataset(
    #         data_dir=os.path.join(data_args.data_dir, 'swag'),
    #         tokenizer=tokenizer,
    #         task=data_args.task_name,
    #         max_seq_length=data_args.max_seq_length,
    #         overwrite_cache=data_args.overwrite_cache,
    #         mode=Split.train,
    #     )
    #     if training_args.do_train
    #     else None
    # )
    # eval_dataset = (
    #     MultipleChoiceDataset(
    #         data_dir=data_args.data_dir,
    #         tokenizer=tokenizer,
    #         task=data_args.task_name,
    #         max_seq_length=data_args.max_seq_length,
    #         overwrite_cache=data_args.overwrite_cache,
    #         mode=Split.test,
    #     )
    #     if training_args.do_eval
    #     else None
    # )

    target_train_dataset = (
        MultipleChoiceDataset(
            data_dir=os.path.join(data_args.data_dir, 'cqa'), 
            tokenizer=tokenizer,
            task='cqa_clf',
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )

    # [TODO]:Modify this...
    # target_test_dataset = (
    #     MultipleChoiceDataset(
    #         data_dir=os.path.join(data_args.data_dir, 'cqa'), 
    #         tokenizer=tokenizer,
    #         task='cqa_clf',
    #         max_seq_length=data_args.max_seq_length,
    #         overwrite_cache=data_args.overwrite_cache,
    #         mode=Split.test,
    #     )
    #     if training_args.do_train
    #     else None
    # )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}


    # Initialize our Trainer


    # Create meta batch
    s1_db = create_batch_of_tasks(s1_train_dataset, is_shuffle=True, batch_size=metatraining_args.outer_batch_size)
    # s2_db = create_batch_of_tasks(s2_train_dataset, is_shuffle = True, batch_size = metatraining_args.outer_batch_size) 
    # s3_db = create_batch_of_tasks(s3_train_dataset, is_shuffle = True, batch_size = metatraining_args.outer_batch_size) 

    # Define Data Loader

    def _get_train_sampler(train_dataset) -> Optional[torch.utils.data.sampler.Sampler]:
        if isinstance(train_dataset, torch.utils.data.IterableDataset):
            return None
        else:
            return RandomSampler(train_dataset)

    # s1_train_sampler = _get_train_sampler(s1_train_dataset)

    # s1_train_dataloader = DataLoader(s1_train_dataset,
    #  batch_size=args.train_batch_size,
    #  sampler=s1_train_sampler,
    #  collate_fn=DataCollatorWithPadding(tokenizer),
    #  drop_last=args.dataloader_drop_last)
    
    target_train_sampler = _get_train_sampler(target_train_dataset)

    target_train_dataloader = DataLoader(
        target_train_dataset,
        batch_size=training_args.train_batch_size,
        sampler=target_train_sampler,
        collate_fn=default_data_collator,  # DataCollatorWithPadding(tokenizer)
        drop_last=training_args.dataloader_drop_last)

    
    metalearner = MetaLearner(metatraining_args, tokenizer)
    mtl_optimizer = Adam(metalearner.model.parameters(), lr=metatraining_args.mtl_update_lr)
   

    for source_idx, db in enumerate([s1_db]): # , s2_db, s3_db]):

        for step, task_batch in enumerate(db):
            # Meta-Training (FOMAML)
            # print("\n")
            # print(task_batch)
            # print("\n")
            acc, loss = metalearner(task_batch)
            print('Step:', step, '\tTraining Loss | Acc:', loss, " | ", acc)
            with open('log.txt', 'a') as f:
                f.write(str(acc) + '\n')

        # Fine-tuning on Target Set
        # target_batch = iter(target_train_dataloader).next()
        target_train_loss = []
        target_train_acc = []
        metalearner.model.cuda()
        metalearner.model.train()
        print(metalearner.model.parameters())

        for target_batch in tqdm.tqdm(target_train_dataloader):
            target_batch = metalearner.prepare_inputs(target_batch)
            outputs = metalearner.model(**target_batch)
            loss = outputs[0]
            loss.backward()
            metalearner.outer_optimizer.step()
            metalearner.outer_optimizer.zero_grad()
            target_train_loss.append(loss.item())

            # Compute Acc for target
            logits = F.softmax(outputs[1], dim=1)
            target_label_id = target_batch.get('labels')
            pre_label_id = torch.argmax(logits, dim=1)
            pre_label_id = pre_label_id.detach().cpu().numpy().tolist()
            target_label_id = target_label_id.detach().cpu().numpy().tolist()
            acc = accuracy_score(pre_label_id, target_label_id)
            target_train_acc.append(acc)



        print("Target Loss: ", np.mean(target_train_loss))
        print("Target Acc: ", np.mean(target_train_acc))
            
            # end fine tuning
        
    # end MML 
    
    # MTL : Normal fine tuning
    target_finetune_loss = []
    for target_batch in target_train_dataloader:
        metalearner.model.train()
        target_batch = metalearner.prepare_inputs(target_batch)
        outputs = metalearner.model(**target_batch)
        loss = outputs[0]              
        loss.backward()
        mtl_optimizer.step()
        mtl_optimizer.zero_grad()
        target_finetune_loss.append(loss.item())

    print("Target Loss: ", np.mean(target_finetune_loss))
Example no. 8
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Data collator
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

                results.update(result)

    return results
Example no. 9
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
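        # Repeat each context once per candidate ending so the tokenizer sees one (context, header + ending) pair per choice.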
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [
            [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
        ]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten
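        # Regroup every 4 encodings so each feature becomes a (num_choices, seq_len) block, as the multiple-choice head expects.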
        tokenized_inputs = {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
        )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer, pad_to_multiple_of=(8 if accelerator.use_fp16 else None)
        )

    train_dataloader = DataLoader(
        train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
    )
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )

    # Note -> the training dataloader needs to be prepared before we grab its length below (because its length will be
    # shorter in a multiprocess setting)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.per_device_train_batch_size}")
    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            metric.add_batch(
                predictions=accelerator.gather(predictions),
                references=accelerator.gather(batch["labels"]),
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
                )

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
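A small worked sketch (added for illustration, not part of the example above) of the scheduler arithmetic: with a hypothetical dataloader of 1,000 batches, 4 gradient-accumulation steps and 3 epochs, the loop performs 250 optimizer updates per epoch and 750 in total.

import math

# Hypothetical numbers, only to illustrate the step math used above.
batches_per_epoch = 1000
gradient_accumulation_steps = 4
num_train_epochs = 3

num_update_steps_per_epoch = math.ceil(batches_per_epoch / gradient_accumulation_steps)
max_train_steps = num_train_epochs * num_update_steps_per_epoch
print(num_update_steps_per_epoch, max_train_steps)  # 250 750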
Esempio n. 10
0
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    config.attention_type = attention_type
    config.k_value = k_value
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
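The example above attaches two custom attributes (attention_type, k_value) to the config before building the model; here is a hedged, self-contained sketch of that general pattern, using a stock BERT config and made-up values:

from transformers import AutoConfig

# Extra attributes set on a config live in config.__dict__, so they are
# picked up by config.to_dict()/save_pretrained() and can be read back by a
# custom model class. The attribute values below are hypothetical.
config = AutoConfig.from_pretrained("bert-base-uncased")
config.attention_type = "custom_sparse"
config.k_value = 128
print(config.attention_type, config.k_value)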
Esempio n. 11
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if data_args.reinit_pooler:
        if model_args.model_type in ["bert", "roberta"]:
            encoder_temp = getattr(model, model_args.model_type)
            encoder_temp.pooler.dense.weight.data.normal_(
                mean=0.0, std=encoder_temp.config.initializer_range)
            encoder_temp.pooler.dense.bias.data.zero_()
            for p in encoder_temp.pooler.parameters():
                p.requires_grad = True
        elif model_args.model_type in ["xlnet", "bart", "electra"]:
            raise ValueError(
                f"{model_args.model_type} does not have a pooler at the end")
        else:
            raise NotImplementedError

    if data_args.reinit_layers > 0:
        if model_args.model_type in ["bert", "roberta", "electra"]:
            assert data_args.reinit_pooler or model_args.model_type == "electra"
            from transformers.modeling_bert import BertLayerNorm

            encoder_temp = getattr(model, model_args.model_type)
            for layer in encoder_temp.encoder.layer[-data_args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        module.weight.data.normal_(
                            mean=0.0,
                            std=encoder_temp.config.initializer_range)
                    elif isinstance(module, BertLayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    if isinstance(module,
                                  nn.Linear) and module.bias is not None:
                        module.bias.data.zero_()
        elif model_args.model_type == "xlnet":
            from transformers.modeling_xlnet import XLNetLayerNorm, XLNetRelativeAttention

            for layer in model.transformer.layer[-data_args.reinit_layers:]:
                for module in layer.modules():
                    if isinstance(module, (nn.Linear, nn.Embedding)):
                        module.weight.data.normal_(
                            mean=0.0,
                            std=model.transformer.config.initializer_range)
                        if isinstance(module,
                                      nn.Linear) and module.bias is not None:
                            module.bias.data.zero_()
                    elif isinstance(module, XLNetLayerNorm):
                        module.bias.data.zero_()
                        module.weight.data.fill_(1.0)
                    elif isinstance(module, XLNetRelativeAttention):
                        for param in [
                                module.q,
                                module.k,
                                module.v,
                                module.o,
                                module.r,
                                module.r_r_bias,
                                module.r_s_bias,
                                module.r_w_bias,
                                module.seg_embed,
                        ]:
                            param.data.normal_(
                                mean=0.0,
                                std=model.transformer.config.initializer_range)
        elif model_args.model_type == "bart":
            for layer in model.model.decoder.layers[-data_args.reinit_layers:]:
                for module in layer.modules():
                    model.model._init_weights(module)

        else:
            raise NotImplementedError

    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_eval else None)

    test_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
    ) if training_args.do_predict else None)

    test_dataset_high = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
        group='high',
    ) if training_args.do_predict else None)

    test_dataset_middle = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        solve_coref=data_args.solve_coref,
        group='middle',
    ) if training_args.do_predict else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    if training_args.freelb:
        trainer = FreeLBTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )
    else:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()

        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

                results.update(result)

    if training_args.do_predict:
        predictions, label_ids, metrics = trainer.predict(test_dataset)
        predictions_high, label_ids_high, metrics_high = trainer.predict(
            test_dataset_high)
        predictions_middle, label_ids_middle, metrics_middle = trainer.predict(
            test_dataset_middle)

        predictions_file = os.path.join(training_args.output_dir,
                                        "test_predictions")
        labels_ids_file = os.path.join(training_args.output_dir,
                                       "test_labels_id")

        predictions_file_high = os.path.join(training_args.output_dir,
                                             "test_predictions_high")
        labels_ids_file_high = os.path.join(training_args.output_dir,
                                            "test_labels_id_high")

        predictions_file_middle = os.path.join(training_args.output_dir,
                                               "test_predictions_middle")
        labels_ids_file_middle = os.path.join(training_args.output_dir,
                                              "test_labels_id_middle")

        torch.save(predictions, predictions_file)
        torch.save(label_ids, labels_ids_file)

        torch.save(predictions_high, predictions_file_high)
        torch.save(label_ids_high, labels_ids_file_high)

        torch.save(predictions_middle, predictions_file_middle)
        torch.save(label_ids_middle, labels_ids_file_middle)

        examples_ids = []
        for input_feature in test_dataset.features:
            examples_ids.append(input_feature.example_id)

        examples_ids_file = os.path.join(training_args.output_dir,
                                         "examples_ids")
        torch.save(examples_ids, examples_ids_file)

        output_eval_file = os.path.join(training_args.output_dir,
                                        "test_results.txt")

        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Test results *****")
                for key, value in metrics.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                for key, value in metrics_high.items():
                    logger.info("  high %s = %s", key, value)
                    writer.write("high %s = %s\n" % (key, value))
                for key, value in metrics_middle.items():
                    logger.info("  middle %s = %s", key, value)
                    writer.write("middle %s = %s\n" % (key, value))

    return results
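The re-initialization logic above spells out each architecture case by hand. As a hedged sketch of the same idea for a BERT-style backbone (the `bert` attribute and the use of the model's own `_init_weights` are assumptions for illustration, not the example's exact code):

from transformers import AutoModelForMultipleChoice

def reinit_top_encoder_layers(model, n_layers):
    # Re-draw the weights of the last n encoder layers from the model's own
    # initialization scheme (covers Linear, Embedding and LayerNorm modules).
    for layer in model.bert.encoder.layer[-n_layers:]:
        for module in layer.modules():
            model._init_weights(module)

model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
reinit_top_encoder_layers(model, n_layers=2)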
Esempio n. 12
0
def setup(argc=None, **kwargs):
    if argc is None:
        argc = sys.argv[1:]
    parser = HfArgumentParser((
        ModelArguments, DataTrainingArguments,
        DirArguments, TrainingArguments, WindowArguments
    ))
    if (
        isinstance(argc, list) and
        len(argc) == 1 and
        argc[0].endswith('.json')
    ):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_json_file(argc[0])
        )
    elif isinstance(argc, dict):
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_dict(argc)
        )
    else:
        model_args, data_args, dir_args, training_args, window_args = (
            parser.parse_args_into_dataclasses()
        )

    if (
        os.path.exists(training_args.output_dir)
        and [f for f in os.listdir(training_args.output_dir) if f != '.gitignore']
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    all_args = {
        'model_args': model_args,
        'data_args': data_args,
        'dir_args': dir_args,
        'training_args': training_args,
        'window_args': window_args,
    }
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config_kwargs = kwargs.pop('config_kwargs', {})
    tokenizer_kwargs = kwargs.pop('tokenizer_kwargs', {})
    model_kwargs = kwargs.pop('model_kwargs', {})

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        **config_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        **tokenizer_kwargs,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        **model_kwargs,
    )

    return all_args, processor, config, tokenizer, model
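A hypothetical invocation of setup() through its parse_dict branch; the keys below follow the usual ModelArguments/TrainingArguments field names and assume the remaining dataclasses (DirArguments, WindowArguments) have defaults for everything not listed:

# Hypothetical usage sketch, not from the original repository.
all_args, processor, config, tokenizer, model = setup({
    "model_name_or_path": "bert-base-uncased",
    "task_name": "race",        # assumed DataTrainingArguments field
    "data_dir": "./data/RACE",  # assumed DataTrainingArguments field
    "output_dir": "./outputs",
    "do_train": True,
})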
Esempio n. 13
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    else:
        # Downloading and loading the swag dataset from the hub.
        datasets = load_dataset("swag", "regular")
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"

    # Preprocessing the datasets.
    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=data_args.max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten
        return {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }
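    # Shape note (comment added for clarity, not in the original): for a batch of
    # B examples the tokenizer above receives 4*B (context, ending) pairs, and the
    # dict comprehension regroups every 4 consecutive encodings, so each key ends
    # up shaped [B][4][seq_len], which is the layout AutoModelForMultipleChoice
    # expects after collation.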

    tokenized_datasets = datasets.map(
        preprocess_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(tokenizer=tokenizer))

    # Metric
    def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        results = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_swag.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Esempio n. 14
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    checkpoint_dir = hyperparam_path_for_initializing_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    elif training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir,
                      training_args.local_rank,
                      debug=False,
                      postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the [datasets]: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        if data_args.dataset == 'race':
            from mcmrc.data_utils.processors import prepare_features_for_reading_evidence
        if data_args.dataset == 'dream':
            pass

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    data_files = {
        'train': data_args.train_file,
        'validation': data_args.validation_file,
        'test': data_args.test_file,
    }

    datasets = load_dataset(
        data_args.dataload_script,
        data_args.dataload_split,
        data_files=data_files if data_files['train'] is not None else None,
        data_dir=data_args.data_dir)

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    all_pseudo_label = load_pseudo_label(data_args.pseudo_label_path)

    if data_args.run_pseudo_label_with_options:
        pseudo_logit = all_pseudo_label['options_prob_diff']
    else:
        pseudo_logit = all_pseudo_label['logit']
    acc = all_pseudo_label['acc']

    # Data collator
    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

    # Metric
    def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }

    eval_on_dev = data_args.eval_dataset in ("all", "dev") and training_args.do_eval
    eval_on_test = data_args.eval_dataset in ("all", "test") and training_args.do_eval

    train_results = {}
    eval_results = {}
    test_results = {}
    for evidence_num in range(1, data_args.max_evidence_len + 1):

        prepare_features_for_using_pseudo_label_as_evidence = partial(
            prepare_features_for_reading_evidence,
            run_pseudo_label_with_options=data_args.run_pseudo_label_with_options,
            evidence_logits=pseudo_logit,
            evidence_len=evidence_num,
            tokenizer=tokenizer,
            data_args=data_args)
        tokenized_datasets = datasets.map(
            prepare_features_for_using_pseudo_label_as_evidence,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_datasets["train"]
            if training_args.do_train else None,
            eval_dataset=tokenized_datasets["validation"]
            if training_args.do_eval else None,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        if training_args.do_train:
            train_result = trainer.train(
                model_path=model_args.model_name_or_path if os.path.
                isdir(model_args.model_name_or_path) else None)
            trainer.save_model()  # Saves the tokenizer too for easy upload

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir,
                             f"evidence_{evidence_num}_trainer_state.json"))
            for key, value in train_result.metrics.items():
                train_results[f'evidence{evidence_num}_{key}'] = value

        if eval_on_dev:
            logger.info("*** Evaluate ***")
            results = trainer.evaluate(
                eval_dataset=tokenized_datasets["validation"])
            for key in list(results.keys()):
                eval_results[f'evidence{evidence_num}_{key}'] = results[key]

        if eval_on_test:
            logger.info("*** Test ***")
            results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
            for key in list(results.keys()):
                test_results[f'evidence{evidence_num}_{key}'] = results[key]

    if eval_on_dev:
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in sorted(eval_results.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    if eval_on_test:
        output_test_file = os.path.join(training_args.output_dir,
                                        "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in sorted(test_results.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

    if training_args.do_train:
        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_results.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")
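The sweep above uses functools.partial to freeze the preprocessing keyword arguments so that datasets.map only sees a one-argument function; a tiny self-contained illustration of that pattern (the names here are made up):

from functools import partial

def prepare(examples, evidence_len, tokenizer):
    # Stand-in for prepare_features_for_reading_evidence; real logic omitted.
    return {"evidence_len": [evidence_len] * len(examples["text"])}

prepare_two = partial(prepare, evidence_len=2, tokenizer=None)
print(prepare_two({"text": ["a", "b"]}))  # {'evidence_len': [2, 2]}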
Esempio n. 15
0
def main():

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))
                results.update(result)

    return results
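compute_metrics above depends on a simple_accuracy helper defined elsewhere in the script; a minimal sketch of the usual implementation (an assumption about the original):

import numpy as np

def simple_accuracy(preds, labels):
    # Fraction of predictions that match the gold labels.
    return (np.asarray(preds) == np.asarray(labels)).mean()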
Esempio n. 16
0
    second_sentences = sum(second_sentences, [])

    # Tokenize
    tokenized_examples = tokenizer(first_sentences,
                                   second_sentences,
                                   truncation=True)
    # Un-flatten
    return {
        k: [v[i:i + 4] for i in range(0, len(v), 4)]
        for k, v in tokenized_examples.items()
    }


encoded_datasets = datasets.map(preprocess_function, batched=True, num_proc=2)

model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

args = TrainingArguments(
    "test-race",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
)

label_map = {"A": 0, "B": 1, "C": 2, "D": 3}


@dataclass
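The snippet is cut off right after the @dataclass decorator; what usually follows in the Hugging Face multiple-choice examples is a dynamic-padding collator along these lines (a hedged sketch, not this snippet's own code):

from dataclasses import dataclass
from typing import Optional, Union

import torch
from transformers import PreTrainedTokenizerBase
from transformers.file_utils import PaddingStrategy


@dataclass
class DataCollatorForMultipleChoice:
    """Dynamically pads a batch of multiple-choice features (sketch)."""

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0] else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # Flatten [batch, choices] into one list of encodings, pad, then regroup.
        flattened = [
            {k: v[i] for k, v in feature.items()}
            for feature in features
            for i in range(num_choices)
        ]
        batch = self.tokenizer.pad(
            flattened,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch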
Esempio n. 17
0
def main():
    # args
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # data
    processor = processors['race']()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # load model
    global_config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    global_model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=global_config,
        cache_dir=model_args.cache_dir,
    )

    # local_model = BertForMaskedLM.from_pretrained(

    # )

    # Get datasets
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=global_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

            results.update(result)

    return results
Esempio n. 18
0
def main() -> None:
    global best_loss
    step = 0

    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.start_index is not None or args.end_index is not None:
        start_index = args.start_index
        end_index = args.end_index
        if start_index is None:
            start_index = 0
        if end_index is None:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index)
        else:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index,
                            utterance_end_index=end_index)
    else:
        corpus = Corpus(filename=download(args.corpus))

    add_title_to_root(corpus)

    conversations = list(corpus.iter_conversations())

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    dataset = ConversationPathDataset(corpus,
                                      tokenizer,
                                      min_len=args.conversation_min,
                                      max_len=args.conversation_max,
                                      n_neighbors=args.num_neighbors,
                                      max_tokenization_len=args.utterance_max)
    sampler = ConversationPathBatchSampler(args.batch_size, dataset.min_len,
                                           dataset.get_indices_by_len())
    loader = DataLoader(dataset,
                        batch_sampler=sampler,
                        collate_fn=conversation_path_collate_fn,
                        pin_memory=device.type != 'cpu',
                        num_workers=4)

    # utterance_encoder = AutoModel.from_pretrained(args.model_name)
    # conversation_encoder = nn.LSTM(utterance_encoder.config.hidden_size, args.hidden, args.num_layers)
    # model = ConversationClassificationHRNN(utterance_encoder, conversation_encoder, 1)
    # mlm_head = AutoModelForMaskedLM.from_pretrained(args.model_name).predictions
    model = AutoModelForMultipleChoice.from_pretrained(args.model_name)
    model.to(device)
    # mlm_head.to(device)
    criterion = nn.CrossEntropyLoss()
    # optimizer = AdamW(list(model.parameters()) + list(mlm_head.parameters()), args.learning_rate)
    optimizer = AdamW(list(model.parameters()), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * args.training_steps,
        num_training_steps=args.training_steps)
    scaler = GradScaler()

    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path, map_location=device)
            step = checkpoint['step']
            best_loss = checkpoint['best_loss']
            model.bert.load_state_dict(checkpoint['state_dict'])
            # mlm_head.load_state_dict(checkpoint['head_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (step {})".format(
                args.resume_path, checkpoint['step']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_path))
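    # Note (comment added for clarity): only the encoder weights (model.bert) are
    # saved and restored by these checkpoints, so the multiple-choice head starts
    # from a fresh initialization after a resume.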

    while step < args.training_steps:
        loop_steps = args.loop_steps if args.training_steps - step > args.loop_steps else args.training_steps - step
        # loss = train(loader, model, mlm_head, criterion, optimizer, scheduler, scaler,
        #     device, loop_steps, step // args.loop_steps)
        loss = train(loader, model, criterion, optimizer, scheduler, scaler,
                     device, loop_steps, step // args.loop_steps)
        step += loop_steps

        # checkpoint model every k training loops
        k = 2
        if step % (k * args.loop_steps) == 0 or step == args.training_steps:

            is_best = loss < best_loss
            best_loss = min(loss, best_loss)

            run_name = '{}.{}.{}.{}.{}'.format(
                args.model_name.split('/')[-1], args.corpus,
                args.conversation_max, args.num_neighbors, args.utterance_max)

            # save_checkpoint({
            #     'step': step,
            #     'model': args.model_name,
            #     'state_dict': model.state_dict(),
            #     'head_state_dict': mlm_head.state_dict(),
            #     'best_loss': best_loss,
            #     'optimizer': optimizer.state_dict(),
            #     'scheduler': scheduler.state_dict()
            # }, is_best, run_name)
            save_checkpoint(
                {
                    'step': step,
                    'model': args.model_name,
                    'state_dict': model.bert.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, is_best, run_name)
Esempio n. 19
0
def main():
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
    # If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
    accelerator = Accelerator(
        log_with="all",
        logging_dir=args.output_dir) if args.with_tracking else Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state, main_process_only=False)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Handle the repository creation
    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
                repo_name = get_full_repo_name(Path(args.output_dir).name,
                                               token=args.hub_token)
            else:
                repo_name = args.hub_model_id
            repo = Repository(args.output_dir, clone_from=repo_name)

            with open(os.path.join(args.output_dir, ".gitignore"),
                      "w+") as gitignore:
                if "step_*" not in gitignore:
                    gitignore.write("step_*\n")
                if "epoch_*" not in gitignore:
                    gitignore.write("epoch_*\n")
        elif args.output_dir is not None:
            os.makedirs(args.output_dir, exist_ok=True)
    accelerator.wait_for_everyone()

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # Trim a number of training examples
    if args.debug:
        for split in raw_datasets.keys():
            raw_datasets[split] = raw_datasets[split].select(range(100))
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if raw_datasets["train"] is not None:
        column_names = raw_datasets["train"].column_names
    else:
        column_names = raw_datasets["validation"].column_names

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"
    label_column_name = "label" if "label" in column_names else "labels"

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if args.config_name:
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForMultipleChoice.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMultipleChoice.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]
        labels = examples[label_column_name]

        # Flatten out
        first_sentences = list(chain(*first_sentences))
        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            max_length=args.max_length,
            padding=padding,
            truncation=True,
        )
        # Un-flatten
        tokenized_inputs = {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    with accelerator.main_process_first():
        processed_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    if args.pad_to_max_length:
        # If padding was already done to max length, we use the default data collator that will just convert everything
        # to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding to the maximum length of
        # the samples passed). When using mixed precision, we add `pad_to_multiple_of=8` to pad all tensors to multiple
        # of 8s, which will enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
        data_collator = DataCollatorForMultipleChoice(
            tokenizer,
            pad_to_multiple_of=(8 if accelerator.use_fp16 else None))

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]
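    # (Comment added for clarity, not in the original) Parameters whose names
    # contain "bias" or "LayerNorm.weight" are excluded from weight decay, per
    # the standard BERT fine-tuning recipe; all other parameters decay with
    # args.weight_decay.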
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Use the device given by the `accelerator` object.
    device = accelerator.device
    model.to(device)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader, lr_scheduler)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

    # Figure out how many steps we should save the Accelerator states
    if hasattr(args.checkpointing_steps, "isdigit"):
        checkpointing_steps = args.checkpointing_steps
        if args.checkpointing_steps.isdigit():
            checkpointing_steps = int(args.checkpointing_steps)
    else:
        checkpointing_steps = None

    # We need to initialize the trackers we use, and also store our configuration
    if args.with_tracking:
        experiment_config = vars(args)
        # TensorBoard cannot log Enums, need the raw value
        experiment_config["lr_scheduler_type"] = experiment_config[
            "lr_scheduler_type"].value
        accelerator.init_trackers("swag_no_trainer", experiment_config)

    # Metrics
    metric = load_metric("accuracy")

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    starting_epoch = 0

    # Potentially load in the weights and states from a previous save
    if args.resume_from_checkpoint:
        if args.resume_from_checkpoint is not None and args.resume_from_checkpoint != "":
            accelerator.print(
                f"Resumed from checkpoint: {args.resume_from_checkpoint}")
            accelerator.load_state(args.resume_from_checkpoint)
            path = os.path.basename(args.resume_from_checkpoint)
        else:
            # Get the most recent checkpoint
            dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
            dirs.sort(key=os.path.getctime)
            path = dirs[
                -1]  # Sorts folders by date modified, most recent checkpoint is the last
        # Extract `epoch_{i}` or `step_{i}`
        training_difference = os.path.splitext(path)[0]

        if "epoch" in training_difference:
            starting_epoch = int(training_difference.replace("epoch_", "")) + 1
            resume_step = None
        else:
            resume_step = int(training_difference.replace("step_", ""))
            starting_epoch = resume_step // len(train_dataloader)
            resume_step -= starting_epoch * len(train_dataloader)

    for epoch in range(starting_epoch, args.num_train_epochs):
        model.train()
        if args.with_tracking:
            total_loss = 0
        for step, batch in enumerate(train_dataloader):
            # We need to skip steps until we reach the resumed step
            if args.resume_from_checkpoint and epoch == starting_epoch:
                if resume_step is not None and step < resume_step:
                    completed_steps += 1
                    continue
            outputs = model(**batch)
            loss = outputs.loss
            # We keep track of the loss at each epoch
            if args.with_tracking:
                total_loss += loss.detach().float()
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if isinstance(checkpointing_steps, int):
                if completed_steps % checkpointing_steps == 0:
                    output_dir = f"step_{completed_steps }"
                    if args.output_dir is not None:
                        output_dir = os.path.join(args.output_dir, output_dir)
                    accelerator.save_state(output_dir)

            if completed_steps >= args.max_train_steps:
                break

        model.eval()
        samples_seen = 0
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            predictions, references = accelerator.gather(
                (predictions, batch["labels"]))
            # If we are in a multiprocess environment, the last batch has duplicates
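            # (Under distributed evaluation the sampler pads the dataset so
            # every process sees the same number of batches, repeating some
            # samples; the slicing below drops that repeated tail so each
            # example is counted exactly once in the metric.)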
            if accelerator.num_processes > 1:
                if step == len(eval_dataloader) - 1:
                    predictions = predictions[:len(eval_dataloader.dataset) -
                                              samples_seen]
                    references = references[:len(eval_dataloader.dataset) -
                                            samples_seen]
                else:
                    samples_seen += references.shape[0]
            metric.add_batch(
                predictions=predictions,
                references=references,
            )

        eval_metric = metric.compute()
        accelerator.print(f"epoch {epoch}: {eval_metric}")

        if args.with_tracking:
            accelerator.log(
                {
                    "accuracy": eval_metric,
                    "train_loss": total_loss,
                    "epoch": epoch,
                    "step": completed_steps
                })

        if args.push_to_hub and epoch < args.num_train_epochs - 1:
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(
                args.output_dir,
                is_main_process=accelerator.is_main_process,
                save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(args.output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress epoch {epoch}",
                    blocking=False,
                    auto_lfs_prune=True)

        if args.checkpointing_steps == "epoch":
            output_dir = f"epoch_{epoch}"
            if args.output_dir is not None:
                output_dir = os.path.join(args.output_dir, output_dir)
            accelerator.save_state(output_dir)

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir,
            is_main_process=accelerator.is_main_process,
            save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            if args.push_to_hub:
                repo.push_to_hub(commit_message="End of training",
                                 auto_lfs_prune=True)
        with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
            json.dump({"eval_accuracy": eval_metric["accuracy"]}, f)
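

# For reference, a minimal sketch of a multiple-choice collator in the spirit of
# the `DataCollatorForMultipleChoice` used above. This is an illustrative
# re-implementation under assumed names, not the exact class imported by the
# scripts: it flattens the choice dimension, pads with the tokenizer, and
# reshapes everything back to (batch_size, num_choices, seq_len).
from dataclasses import dataclass
from typing import Optional

import torch
from transformers import PreTrainedTokenizerBase


@dataclass
class SimpleMultipleChoiceCollator:
    tokenizer: PreTrainedTokenizerBase
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # Each feature holds `num_choices` encoded sequences plus its label.
        labels = [feature.pop("labels") for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        # One row per (example, choice) pair so the tokenizer can pad them together.
        flattened = [{k: v[i] for k, v in feature.items()}
                     for feature in features
                     for i in range(num_choices)]
        batch = self.tokenizer.pad(
            flattened,
            padding=True,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Restore the (batch_size, num_choices, seq_len) layout expected by
        # AutoModelForMultipleChoice and re-attach the labels.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch
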
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               TrainingArguments, AdapterArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            f" Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Setup adapters
    from transformers.adapter_config import AdapterType

    # base_model = getattr(model, model.base_model_prefix, model)
    # base_model.set_adapter_config(AdapterType.text_task, adapter_args.adapter_config)

    from transformers.adapter_config import PfeifferConfig
    model.load_adapter("/home/theorist17/projects/adapter/adapters/MNLI/mnli",
                       "text_task",
                       config=PfeifferConfig(),
                       with_head=False)
    model.load_adapter(
        "/home/theorist17/projects/adapter/adapters/commonsenseqa/commonsenseqa",
        "text_task",
        config=PfeifferConfig(),
        with_head=False)
    model.load_adapter(
        "/home/theorist17/projects/adapter/adapters/conceptnet/conceptnet",
        "text_task",
        config=PfeifferConfig(),
        with_head=False)
    adapter_names = [["mnli", "commonsenseqa", "conceptnet"]]

    model.add_fusion(adapter_names[0], "dynamic")
    #model.base_model.set_active_adapters(adapter_names)
    #model.train_fusion(adapter_names)
    model.train_fusion(adapter_names)
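    # In adapter-transformers, train_fusion() is intended to freeze both the
    # base model and the individual adapters so that only the newly added
    # fusion layer receives gradients; the requires_grad printout below makes
    # that split visible.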
    # inspect parameters of the fusion layer
    for (n, p) in model.named_parameters():
        print(n, p.requires_grad)

    # Get datasets
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def simple_accuracy(preds, labels):
        return (preds == labels).mean()

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=False,
        do_save_adapter_fusion=True,
        adapter_names=adapter_names,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

                eval_results.update(result)

    return eval_results
Example n. 21
0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        required=True,
        help="SWAG csv for predictions. E.g., val.csv or test.csv",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pretrained model or model identifier from huggingface.co/models",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Save the trained model and the tokenizer
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.do_train:
            checkpoints = [args.output_dir]
        else:
            # if do_train is False and do_eval is true, load model directly from pretrained.
            checkpoints = [args.model_name_or_path]

        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            model.to(args.device)

            # Evaluate
            result = evaluate(args, model, tokenizer, prefix=global_step)

            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

    logger.info("Results: {}".format(results))

    return results
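
# Example invocation of the argparse-based script above (illustrative only; the
# script file name, model identifier and CSV paths are placeholders, not taken
# from the source):
#   python run_swag_legacy.py \
#     --model_name_or_path bert-base-uncased \
#     --train_file train.csv --predict_file val.csv \
#     --do_train --do_eval \
#     --output_dir ./swag_output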
Example n. 22
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.train_file is not None or data_args.validation_file is not None:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
        raw_datasets = load_dataset("swag",
                                    "regular",
                                    cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # When using your own dataset or a different dataset from swag, you will probably need to change this.
    ending_names = [f"ending{i}" for i in range(4)]
    context_name = "sent1"
    question_header_name = "sent2"

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
        if max_seq_length > 1024:
            logger.warning(
                f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx."
            )
            max_seq_length = 1024
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length,
                             tokenizer.model_max_length)

    # Preprocessing the datasets.
    def preprocess_function(examples):
        first_sentences = [[context] * 4 for context in examples[context_name]]
        question_headers = examples[question_header_name]
        second_sentences = [[
            f"{header} {examples[end][i]}" for end in ending_names
        ] for i, header in enumerate(question_headers)]

        # Flatten out
        first_sentences = sum(first_sentences, [])
        second_sentences = sum(second_sentences, [])

        # Tokenize
        tokenized_examples = tokenizer(
            first_sentences,
            second_sentences,
            truncation=True,
            max_length=max_seq_length,
            padding="max_length" if data_args.pad_to_max_length else False,
        )
        # Un-flatten
        return {
            k: [v[i:i + 4] for i in range(0, len(v), 4)]
            for k, v in tokenized_examples.items()
        }

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    if training_args.do_eval:
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                load_from_cache_file=not data_args.overwrite_cache,
            )

    # Data collator
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(
                         tokenizer=tokenizer,
                         pad_to_multiple_of=8 if training_args.fp16 else None))

    # Metric
    def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {
            "accuracy": (preds == label_ids).astype(np.float32).mean().item()
        }
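    # For the SWAG setup above, `predictions` has shape (num_examples, 4): one
    # logit per candidate ending, so the argmax picks the index of the
    # highest-scoring ending for each example.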

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    kwargs = dict(
        finetuned_from=model_args.model_name_or_path,
        tasks="multiple-choice",
        dataset_tags="swag",
        dataset_args="regular",
        dataset="SWAG",
        language="en",
    )

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
Example n. 23
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments))
    model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Setup adapters
    if adapter_args.train_adapter:
        task_name = data_args.task_name
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters.adapter_list(AdapterType.text_task):
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter, AdapterType.text_task, config=adapter_config, load_as=task_name,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, AdapterType.text_task, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                AdapterType.text_lang,
                config=lang_adapter_config,
                load_as=adapter_args.language,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass
        if lang_adapter_name:
            model.set_active_adapters([lang_adapter_name, task_name])
        else:
            model.set_active_adapters([task_name])
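        # After train_adapter([task_name]), only the task adapter's parameters
        # remain trainable; the pre-trained base model (and the language
        # adapter, if one was loaded) stays frozen and is merely activated for
        # the forward pass.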

    # Get datasets
    train_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
                    logger.info("  %s = %s", key, value)
                    writer.write("%s = %s\n" % (key, value))

                results.update(result)

    return results
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    try:
        processor = processors[data_args.task_name]()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
        perturbation_type=data_args.perturbation_type,
        perturbation_num=data_args.perturbation_num_train,
        augment=data_args.augment,
        name_gender_or_race=data_args.name_gender_or_race,
    ) if training_args.do_train else None)

    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_train else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path if os.path.
                      isdir(model_args.model_name_or_path) else None)
        trainer.save_model()

        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Test
    test_dataset = MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.test,
        perturbation_type=data_args.perturbation_type,
        perturbation_num=data_args.perturbation_num_test,
        augment=data_args.augment,
        name_gender_or_race=data_args.name_gender_or_race,
    )

    predictions, label_ids, metrics = trainer.predict(test_dataset)

    predictions_file = os.path.join(training_args.output_dir,
                                    "test_predictions")
    labels_ids_file = os.path.join(training_args.output_dir, "test_labels_id")
    torch.save(predictions, predictions_file)
    torch.save(label_ids, labels_ids_file)
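    # The tensors saved above can be reloaded later for analysis, e.g.
    # (illustrative snippet, using the file names written by this script):
    #   predictions = torch.load(os.path.join(training_args.output_dir, "test_predictions"))
    #   label_ids = torch.load(os.path.join(training_args.output_dir, "test_labels_id"))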

    examples_ids = []
    perturbated = []
    run = []

    for input_feature in test_dataset.features:
        examples_ids.append(input_feature.example_id)

    for examples in test_dataset.examples:
        perturbated.append(examples.perturbated)
        run.append(examples.run)

    examples_ids_file = os.path.join(training_args.output_dir, "examples_ids")
    torch.save(examples_ids, examples_ids_file)
    perturbated_file = os.path.join(training_args.output_dir, "perturbated")
    torch.save(perturbated, perturbated_file)
    run_file = os.path.join(training_args.output_dir, "run")
    torch.save(run, run_file)

    output_eval_file = os.path.join(training_args.output_dir,
                                    "test_results.txt")

    if trainer.is_world_master():
        with open(output_eval_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in metrics.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
    return metrics
Example n. 25
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, AllTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    checkpoint_dir = hyperparam_path_for_two_stage_evidence_selector(
        model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.train_evidence_selector or training_args.train_answer_verifier:
        postfix += "_train"
    else:
        postfix += "_eval"
    setup_root_logger(ckpt_dir,
                      training_args.local_rank,
                      debug=False,
                      postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)
    logger.info("Data parameters %s", data_args)
    logger.info("Model parameters %s", model_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the [datasets]: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")

    if training_args.eval_on_exp_race and data_args.exp_race_file is None and data_args.dataset == 'race':
        raise ValueError("exp_race_file must be specified")

    if data_args.dataset == 'dream':
        training_args.eval_on_exp_race = False

    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if data_args.debug_mode:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir,
                                split={
                                    'train':
                                    ReadInstruction('train',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'validation':
                                    ReadInstruction('validation',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'test':
                                    ReadInstruction('test',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs')
                                })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    if training_args.eval_on_exp_race:
        datasets['exp'] = Dataset.from_dict(
            load_exp_race_data(data_args.exp_race_file))

    if training_args.eval_on_adv_race:
        for subset in os.listdir(data_args.adv_race_path):
            datasets[subset] = Dataset.from_dict(
                load_adv_race_data(
                    os.path.join(data_args.adv_race_path, subset,
                                 "test_dis.json")))

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    evidence_selector_path = model_args.evidence_selector_path \
        if model_args.evidence_selector_path else model_args.model_name_or_path
    answer_verifier_path = model_args.answer_verifier_path \
        if model_args.answer_verifier_path else model_args.model_name_or_path
    evidence_reader_path = model_args.evidence_reader_path \
        if model_args.evidence_reader_path else model_args.model_name_or_path

    evidence_selector_config = AutoConfig.from_pretrained(
        evidence_selector_path,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier_config = AutoConfig.from_pretrained(
        answer_verifier_path,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader_config = AutoConfig.from_pretrained(
        evidence_reader_path,
        cache_dir=model_args.cache_dir,
    )

    evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        evidence_selector_path,
        config=evidence_selector_config,
        cache_dir=model_args.cache_dir,
    )
    answer_verifier = AutoModelForMultipleChoice.from_pretrained(
        answer_verifier_path,
        config=answer_verifier_config,
        cache_dir=model_args.cache_dir,
    )
    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        evidence_reader_path,
        config=evidence_reader_config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.train_evidence_selector:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    pprepare_features_for_initializing_evidence_selector = partial(
        prepare_features_for_initializing_evidence_selector,
        evidence_sampling_num=data_args.evidence_sampling_num,
        tokenizer=tokenizer,
        data_args=data_args,
        pseudo_label_path=data_args.pseudo_label_path)

    pprepare_features_for_generating_optionwise_evidence = partial(
        prepare_features_for_generating_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_reading_optionwise_evidence = partial(
        prepare_features_for_reading_optionwise_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_answer_verifier = partial(
        prepare_features_for_answer_verifier,
        evidence_len=data_args.verifier_evidence_len,
        train_verifier_with_option=data_args.train_verifier_with_option,
        train_verifier_with_non_overlapping_evidence=data_args.
        train_verifier_with_non_overlapping_evidence,
        tokenizer=tokenizer,
        data_args=data_args)

    pprepare_features_for_multiple_choice = partial(prepare_features,
                                                    tokenizer=tokenizer,
                                                    data_args=data_args)
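    # Editor's illustrative sketch (not part of the original script): partial()
    # pre-binds keyword arguments, so each bound feature function above only
    # needs the batch of examples that datasets.map() later passes in. The
    # names below are toy placeholders, not the project's real functions.
    def _toy_prepare(examples, prefix):
        # prepend a marker to every question in the batch
        return {"question": [prefix + q for q in examples["question"]]}

    _toy_prepare_bound = partial(_toy_prepare, prefix="Q: ")
    assert _toy_prepare_bound({"question": ["Who?"]}) == {"question": ["Q: Who?"]}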

    training_args.num_train_epochs = training_args.num_train_selector_epochs
    selector_trainer = Trainer(
        model=evidence_selector,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForSequenceClassification(
            tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    training_args.num_train_epochs = training_args.num_train_verifier_epochs
    verifier_trainer = Trainer(
        model=answer_verifier,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )
    if training_args.train_evidence_selector and training_args.train_answer_verifier:
        selector_trainer.checkpoint_dir = os.path.join(
            training_args.output_dir, "evidence_selector")
        verifier_trainer.checkpoint_dir = os.path.join(
            training_args.output_dir, "answer_verifier")

    mc_trainer = Trainer(
        model=evidence_reader,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        tokenizer=tokenizer,
        data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
        compute_metrics=compute_mc_metrics,
    )

    if training_args.eval_answer_verifier:
        multiple_choice_datasets = {
            k: datasets[k].map(
                pprepare_features_for_multiple_choice,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
        }
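        # Editor's toy example (illustration only, not the project's data):
        # datasets.map with batched=True receives dict-of-lists batches, and
        # remove_columns drops the raw columns afterwards, as in the maps above.
        _toy_ds = Dataset.from_dict({"text": ["ab", "cde"], "label": [0, 1]})
        _toy_ds = _toy_ds.map(
            lambda examples: {"length": [len(t) for t in examples["text"]]},
            batched=True,
            remove_columns=["text"])
        assert set(_toy_ds.column_names) == {"label", "length"}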

    if training_args.train_evidence_selector or training_args.eval_evidence_selector:
        train_evidence_selector_datasets = {
            k: datasets[k].map(
                pprepare_features_for_initializing_evidence_selector,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )
            for k in datasets.keys()
            if k != "train" or training_args.train_evidence_selector
        }

    if training_args.train_evidence_selector:
        logger.info("**** Train Evidence Selector ****")
        selector_trainer.train_dataset = train_evidence_selector_datasets[
            "train"]
        selector_trainer.eval_dataset = train_evidence_selector_datasets[
            "validation"]
        train_result = selector_trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Evidence selector train results *****")
            writer.write("***** Evidence selector train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    if training_args.eval_evidence_selector:
        logger.info("**** Evaluate Evidence Selector ****")
        for split in ["validation", "test"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = selector_trainer.evaluate(
                train_evidence_selector_datasets[split]).metrics
            fulleval_results, all_evidence_sentences = selector_trainer.evaluate_selector_with_explicit_reader(
                evidence_reader=evidence_reader,
                eval_dataset=datasets[split],
                feature_func_for_evidence_reading=
                pprepare_features_for_reading_optionwise_evidence,
                feature_func_for_evidence_generating=
                pprepare_features_for_generating_optionwise_evidence)

            metrics = {**results, **fulleval_results}
            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_selector_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Evidence Selector Eval results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")

            output_evidence_file = os.path.join(training_args.output_dir,
                                                f"{split}_evidence.json")
            with open(output_evidence_file, "w") as f:
                json.dump(all_evidence_sentences, f)

    # generate evidence logits
    if training_args.train_answer_verifier:
        evidence_logits = {
            k: selector_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items()
        }
    elif training_args.eval_answer_verifier:
        evidence_logits = {
            k: selector_trainer.evidence_generating(
                v, pprepare_features_for_generating_optionwise_evidence)
            for k, v in datasets.items() if k != "train"
        }

    if training_args.train_answer_verifier or training_args.eval_answer_verifier:
        output_evidence_logits_file = os.path.join(training_args.output_dir,
                                                   "evidence_logits.json")
        with open(output_evidence_logits_file, "w") as f:
            json.dump(evidence_logits, f)
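    # Editor's hedged note with a toy check (not from the original script):
    # the json.dump above assumes evidence_generating() returns plain Python
    # lists/floats; a tensor or numpy array would need .tolist() first.
    assert json.dumps(np.asarray([0.25, 0.75]).tolist()) == "[0.25, 0.75]"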

    # prepare features for answer verifier
    if training_args.train_answer_verifier or training_args.eval_answer_verifier:
        logger.info("**** preparing features for answer verifier ****")
        train_answer_verifier_datasets = {}
        evidence_sentences = {}

        for split in datasets.keys():
            if not training_args.train_answer_verifier and split == 'train':
                continue

            verifier_dataset = datasets[split].map(
                partial(pprepare_features_for_answer_verifier,
                        evidence_logits=evidence_logits[split]),
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
            )

            split_evidence_sentences = {
                eid: [[(logit, sent)
                       for sent, logit in zip(option_sents, option_logits)]
                      for option_sents, option_logits in zip(
                          evidence_sent, evidence_logit)]
                for eid, evidence_sent, evidence_logit in zip(
                    verifier_dataset['example_ids'],
                    verifier_dataset['evidence_sentence'],
                    verifier_dataset['evidence_logit'])
            }
            train_answer_verifier_datasets[
                split] = verifier_dataset.remove_columns(
                    ["evidence_sentence", "evidence_logit"])
            evidence_sentences[split] = split_evidence_sentences

        output_evidence_file = os.path.join(training_args.output_dir,
                                            "all_evidence.json")
        with open(output_evidence_file, "w") as f:
            json.dump(evidence_sentences, f)
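        # Editor's toy illustration (not part of the original): the comprehension
        # above pairs each option's evidence sentences with their selector logits
        # as (logit, sentence) tuples, per option, per example.
        _toy_sents = [["s1", "s2"], ["s3", "s4"]]      # 2 options, 2 sentences each
        _toy_logits = [[0.2, 0.8], [0.9, 0.1]]
        _toy_paired = [[(logit, sent) for sent, logit in zip(o_sents, o_logits)]
                       for o_sents, o_logits in zip(_toy_sents, _toy_logits)]
        assert _toy_paired[0][1] == (0.8, "s2") and _toy_paired[1][0] == (0.9, "s3")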

    if torch.cuda.is_available():
        logger.info("**** release evidence selector ****")
        del selector_trainer
        del evidence_selector
        torch.cuda.empty_cache()

    if training_args.train_answer_verifier:
        logger.info("**** Train  answer verifier ****")
        verifier_trainer.train_dataset = train_answer_verifier_datasets[
            "train"]
        verifier_trainer.eval_dataset = train_answer_verifier_datasets[
            "validation"]

        train_result = verifier_trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "verifier_train_results.txt")
        with open(output_train_file, "a+") as writer:
            logger.info("***** Verifier Train results *****")
            writer.write("***** Verifier Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    # Evaluation
    # To use the best checkpoint at the end of training, pass the arguments
    # load_best_model_at_end, metric_for_best_model and evaluation_strategy, e.g.:
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \

    if training_args.eval_answer_verifier:

        eval_sets = ["validation", "test"]
        if training_args.eval_on_exp_race and data_args.dataset == "race":
            eval_sets.append("exp")

        if training_args.eval_on_adv_race and data_args.dataset == "race":
            eval_sets += ['charSwap', 'AddSent', 'DE', 'DG', 'Orig']

        for split in eval_sets:
            logger.info(f"*** Evaluate Answer Verifier on {split} set ***")
            metrics, predictions = verifier_trainer.evaluate_answer_verifier_with_explicit_reader(
                evidence_reader=evidence_reader,
                multiple_choice_dataset=multiple_choice_datasets[split],
                answer_verifier_dataset=train_answer_verifier_datasets[split])

            output_prediction_file = os.path.join(
                training_args.output_dir, f"{split}_verifier_predictions.json")
            with open(output_prediction_file, "w") as f:
                json.dump(predictions, f)
            if training_args.eval_on_exp_race and split == "exp":
                with open(data_args.exp_race_file, 'rb') as gt_file:
                    ground_truth_file = json.load(gt_file)
                for ratio, merge_prediction in predictions.items():
                    prediction_file = {}
                    for eid, probs in merge_prediction.items():
                        pred_option = np.argmax(probs)
                        pred_evidence = sorted(
                            evidence_sentences['exp'][eid][pred_option],
                            key=lambda x: x[0],
                            reverse=True)[0][1]
                        prediction_file[eid] = {
                            "answer": chr(pred_option + ord("A")),
                            "evidence": pred_evidence
                        }
                    all_f1, ans_f1, evi_f1, total_count, skip_count = evaluate_multi_choice(
                        ground_truth_file, prediction_file)
                    metrics[f"merge_{ratio}_all_f1"] = all_f1
                    metrics[f"merge_{ratio}_ans_f1"] = ans_f1
                    metrics[f"merge_{ratio}_evi_f1"] = evi_f1
                    metrics[f"merge_{ratio}_total_count"] = total_count
                    metrics[f"merge_{ratio}_skip_count"] = skip_count

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_verifier_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info(f"***** Eval {split} results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
Esempio n. 26
        model, config, tokenizer, preprocess_datasets = objects[0], objects[1], objects[2], objects[3]
    model.to(device)
    # DataLoaders creation:
    # If padding was already done to max length, we use the default data collator that will just convert everything
    # to tensors.

    if args.train:
        total_steps = train(args, model, preprocess_datasets['train'], preprocess_datasets['dev'], device, tokenizer)
    if (not args.train) and args.eval:
        # only evaluation without train
        model = set_model_distributed(args, model)
        loss, eval_metric = evaluation(args, model, preprocess_datasets['dev'], device)
    if args.predict:
        if args.local_rank in [-1, 0]:
            logger.info("Predict on validation data...")
            if args.ensemble_models is not None:
                args.ensemble_models = args.ensemble_models.split(',')
                all_predictions = []
                for model_path in args.ensemble_models:
                    model = AutoModelForMultipleChoice.from_pretrained(model_path)
                    model.to(device)
                    current_predictions, current_logits = predict(args, model, preprocess_datasets['predict'], device, return_logit=True)
                    all_predictions.append(current_logits)  # model_number, example_num, 4
                predictions = np.mean(all_predictions, 0).argmax(1)
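                # Editor's toy illustration (not part of the original):
                # ensembling averages per-model logits over axis 0, then argmax
                # over the option axis picks the final answer per example.
                _toy_ens = np.array([[[0.1, 0.9]], [[0.4, 0.6]]])  # 2 models, 1 example, 2 options
                assert np.mean(_toy_ens, 0).argmax(1).tolist() == [1]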
            else:
                # get model on single gpu
                model = model.module if hasattr(model, "module") else model
                predictions = predict(args, model, preprocess_datasets['predict'], device)
            write_to_csv(preprocess_datasets['predict'], predictions, args.predict_out)
    
    def run_multiple_choice(self, model_name, task_name, fp16):
        model_args = ModelArguments(model_name_or_path=model_name,
                                    cache_dir=self.cache_dir)
        data_args = DataTrainingArguments(task_name=task_name,
                                          data_dir=self.data_dir,
                                          max_seq_length=self.max_seq_length)

        training_args = TrainingArguments(
            output_dir=os.path.join(self.output_dir, task_name),
            do_train=True,
            do_eval=True,
            per_gpu_train_batch_size=self.train_batch_size,
            per_gpu_eval_batch_size=self.eval_batch_size,
            learning_rate=self.learning_rate,
            num_train_epochs=self.num_train_epochs,
            local_rank=self.local_rank,
            overwrite_output_dir=self.overwrite_output_dir,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            fp16=fp16,
            logging_steps=self.logging_steps)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO
            if training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            bool(training_args.local_rank != -1),
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)

        set_seed(training_args.seed)
        onnxruntime.set_seed(training_args.seed)

        try:
            processor = SwagProcessor()
            label_list = processor.get_labels()
            num_labels = len(label_list)
        except KeyError:
            raise ValueError("Task not found: %s" % (data_args.task_name))

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        model = AutoModelForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

        # Get datasets
        train_dataset = (MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        ) if training_args.do_train else None)
        eval_dataset = (MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        ) if training_args.do_eval else None)

        def compute_metrics(p: EvalPrediction) -> Dict:
            preds = np.argmax(p.predictions, axis=1)
            return {"acc": simple_accuracy(preds, p.label_ids)}

        if model_name.startswith('bert'):
            model_desc = ModelDescription([
                IODescription('input_ids', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=2),
                IODescription('token_type_ids', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=2),
                IODescription('labels', [self.train_batch_size, num_labels],
                              torch.int64,
                              num_classes=num_labels)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('reshaped_logits',
                              [self.train_batch_size, num_labels],
                              torch.float32)
            ])
        else:
            model_desc = ModelDescription([
                IODescription('input_ids',
                              ['batch', num_labels, 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask',
                              ['batch', num_labels, 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=2),
                IODescription('labels', ['batch', num_labels],
                              torch.int64,
                              num_classes=num_labels)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('reshaped_logits', ['batch', num_labels],
                              torch.float32)
            ])
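        # Editor's toy shape check (illustration only, not part of the original
        # test): multiple-choice inputs are 3-D [batch, num_choices, seq_len],
        # which is what the IODescriptions above fix (bert branch) or name
        # symbolically as dynamic axes (other models).
        _toy_input_ids = torch.zeros(2, num_labels, 8, dtype=torch.int64)
        assert list(_toy_input_ids.shape) == [2, num_labels, 8]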

        # Initialize the ORTTrainer within ORTTransformerTrainer
        trainer = ORTTransformerTrainer(
            model=model,
            model_desc=model_desc,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Training
        if training_args.do_train:
            trainer.train()
            trainer.save_model()

        # Evaluation
        results = {}
        if training_args.do_eval and training_args.local_rank in [-1, 0]:
            logger.info("*** Evaluate ***")

            result = trainer.evaluate()

            logger.info("***** Eval results {} *****".format(
                data_args.task_name))
            for key, value in result.items():
                logger.info("  %s = %s", key, value)

            results.update(result)

        return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    checkpoint_dir = hyperparam_path_for_initializing_evidence_selector(model_args, data_args, training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    elif training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir, training_args.local_rank, debug=False, postfix=postfix)

    training_args.output_dir = checkpoint_dir


    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        if data_args.dataset == 'race':
            from mcmrc.data_utils.processors import prepare_features_for_initializing_simple_evidence_selector, \
                prepare_features_for_generating_evidence_using_selector, prepare_features_for_reading_evidence
        if data_args.dataset == 'dream':
            pass

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    data_files = {}
    data_files['train'] = data_args.train_file
    data_files['validation'] = data_args.validation_file
    data_files['test'] = data_args.test_file

    datasets = load_dataset(data_args.dataload_script, data_args.dataload_split, data_files=data_files if data_files['train'] is not None else None,
                            data_dir=data_args.data_dir)

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    if data_args.train_with_adversarial_examples:
        config.num_labels = 3
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    evidence_selector = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    evidence_reader = AutoModelForMultipleChoice.from_pretrained(
        model_args.evidence_reader_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )


    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    pprepare_features_for_initializing_evidence_selector = partial(
        prepare_features_for_initializing_simple_evidence_selector,
        evidence_len=data_args.evidence_len,
        tokenizer=tokenizer,
        data_args=data_args,
        pseudo_label_path=data_args.pseudo_label_path)
    initializing_evidence_selector_datasets = datasets.map(
        pprepare_features_for_initializing_evidence_selector,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )
    pprepare_features_for_generating_evidence_using_selector = partial(
        prepare_features_for_generating_evidence_using_selector,
        tokenizer=tokenizer,
        data_args=data_args)
    evidence_generating_datasets = {
        k: datasets[k].map(
            pprepare_features_for_generating_evidence_using_selector,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        for k in datasets.keys() if k != "train"
    }

    pprepare_features_for_reading_evidence = partial(
        prepare_features_for_reading_evidence,
        pseudo_label_or_not=False,
        tokenizer=tokenizer,
        data_args=data_args)

    # Data collator
    data_collator = DataCollatorForSequenceClassification(tokenizer=tokenizer)

    # Metric
    def compute_metrics(eval_predictions):
        predictions, label_ids = eval_predictions
        preds = np.argmax(predictions, axis=1)
        return {"accuracy": (preds == label_ids).astype(np.float32).mean().item()}

    # Initialize our Trainer
    trainer = Trainer(
        model=evidence_selector,
        args=training_args,
        train_dataset=initializing_evidence_selector_datasets["train"] if training_args.do_train else None,
        eval_dataset=initializing_evidence_selector_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    if training_args.do_train:
        train_result = trainer.train()

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")

    # Evaluation
    # To use the best checkpoint at the end of training, pass the arguments
    # load_best_model_at_end, metric_for_best_model and evaluation_strategy, e.g.:
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    eval_on_dev = (data_args.eval_dataset == "all" or data_args.eval_dataset == "dev") and training_args.do_eval
    eval_on_test = (data_args.eval_dataset == "all" or data_args.eval_dataset == "test") and training_args.do_eval

    if eval_on_dev:
        logger.info("*** Evaluate ***")
        results = trainer.evaluate(initializing_evidence_selector_datasets["validation"]).metrics
        fulleval_results = trainer.evaluate_with_explicit_reader(evidence_reader, datasets["validation"], pprepare_features_for_reading_evidence,
                                                                 evidence_generating_datasets["validation"])

        metrics = {**results, **fulleval_results}
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")
    if eval_on_test:
        logger.info("*** Test ***")

        results = trainer.evaluate(initializing_evidence_selector_datasets["test"]).metrics
        fulleval_results = trainer.evaluate_with_explicit_reader(evidence_reader, datasets["test"], pprepare_features_for_reading_evidence,
                                                                 evidence_generating_datasets["test"])

        metrics = {**results, **fulleval_results}
        output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key, value in sorted(metrics.items()):
                logger.info(f"  {key} = {value}")
                writer.write(f"{key} = {value}\n")
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (BasicModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    checkpoint_dir = hyperparam_path_for_baseline(model_args, data_args,
                                                  training_args)
    ckpt_dir = Path(checkpoint_dir)
    postfix = ""
    if training_args.do_train:
        postfix += "_train"
    if training_args.do_eval:
        postfix += "_eval"
    setup_root_logger(ckpt_dir,
                      training_args.local_rank,
                      debug=False,
                      postfix=postfix)

    training_args.output_dir = checkpoint_dir

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if not 0 <= data_args.holdout_set < data_args.n_fold:
        raise ValueError("Test fold must be in [0, n_fold)")

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        from mcmrc.data_utils.processors import prepare_features

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if data_args.debug_mode:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir,
                                split={
                                    'train':
                                    ReadInstruction('train',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'validation':
                                    ReadInstruction('validation',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'test':
                                    ReadInstruction('test',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs')
                                })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    if data_args.shuffle_train_dataset:
        datasets['train'] = datasets['train'].shuffle(seed=training_args.seed)

    if data_args.split_train_dataset:
        holdout_set_start = int(
            len(datasets['train']) / data_args.n_fold * data_args.holdout_set)
        holdout_set_end = int(
            len(datasets['train']) / data_args.n_fold *
            (data_args.holdout_set + 1))
        shuffled_train_set = datasets['train'].shuffle(seed=training_args.seed)
        if holdout_set_start == 0:
            new_train_set = Dataset.from_dict(
                shuffled_train_set[holdout_set_end:])
        elif holdout_set_end == len(datasets['train']):
            new_train_set = Dataset.from_dict(
                shuffled_train_set[:holdout_set_start])
        else:
            new_train_set = concatenate_datasets([
                Dataset.from_dict(shuffled_train_set[:holdout_set_start]),
                Dataset.from_dict(shuffled_train_set[holdout_set_end:])
            ])

        new_holdout_set = Dataset.from_dict(
            shuffled_train_set[holdout_set_start:holdout_set_end])
        assert new_train_set.num_rows + new_holdout_set.num_rows == shuffled_train_set.num_rows
        datasets['train'] = new_train_set
        datasets['holdout_set'] = new_holdout_set
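        # Editor's toy check of the fold arithmetic above (illustration only):
        # with 10 training examples, n_fold=5 and holdout_set=1, the held-out
        # slice is [2:4], i.e. the 3rd and 4th shuffled examples.
        assert (int(10 / 5 * 1), int(10 / 5 * (1 + 1))) == (2, 4)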

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names

    pprepare_features = partial(prepare_features,
                                tokenizer=tokenizer,
                                data_args=data_args)
    tokenized_datasets = datasets.map(
        pprepare_features,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForMultipleChoice(tokenizer=tokenizer))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"]
        if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"]
        if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_mc_metrics,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train()

        output_train_file = os.path.join(training_args.output_dir,
                                         "train_results.txt")
        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            for key, value in sorted(train_result.metrics.items()):
                logger.info(f"{key} = {value:.3f}")
                writer.write(f"{key} = {value:.3f}\n")

    # Evaluation
    # To use the best checkpoint at the end of training, pass the arguments
    # load_best_model_at_end, metric_for_best_model and evaluation_strategy, e.g.:
    # --load_best_model_at_end \
    # --metric_for_best_model accuracy \
    # --evaluation_strategy steps \
    if training_args.do_eval:

        if training_args.load_best_model_at_end:
            best_model = AutoModelForMultipleChoice.from_pretrained(
                training_args.output_dir,
                from_tf=bool(".ckpt" in model_args.model_name_or_path),
                config=config,
                cache_dir=model_args.cache_dir,
            )
            best_model = best_model.to(training_args.device)

        for split in [k for k in datasets.keys() if k != "train"]:
            logger.info(f"*** Evaluate {split} set ***")
            results = trainer.evaluate(tokenized_datasets[split])
            if training_args.load_best_model_at_end:
                final_model = trainer.model
                trainer.model = best_model
                best_model_results = trainer.evaluate(
                    tokenized_datasets[split])
                trainer.model = final_model

            output_eval_file = os.path.join(training_args.output_dir,
                                            f"{split}_results.txt")
            with open(output_eval_file, "a+") as writer:
                logger.info("***** Extensive Eval results *****")
                if not training_args.do_train:
                    writer.write(
                        f"eval checkpoint {model_args.model_name_or_path}\n")
                for key, value in sorted(results.metrics.items()):
                    logger.info(f"{key} = {value:.3f}")
                    writer.write(f"{key} = {value:.3f}\n")
                if training_args.load_best_model_at_end:
                    writer.write(f"best model on dev set\n")
                    for key, value in sorted(
                            best_model_results.metrics.items()):
                        logger.info(f"{key} = {value:.3f}")
                        writer.write(f"{key} = {value:.3f}\n")
            if data_args.output_prediction_file or data_args.split_train_dataset:
                prediction = {
                    example_id: pred.tolist()
                    for pred, label_id, example_id in zip(*results[:-1])
                }
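                # Editor's toy illustration (not part of the original): zipping
                # the parallel prediction / label / example-id sequences into a
                # per-example mapping, as done above.
                _toy_map = {
                    eid: list(pred)
                    for pred, _lab, eid in zip([[0.75, 0.25]], [0], ["ex1"])
                }
                assert _toy_map == {"ex1": [0.75, 0.25]}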
                if split == "holdout_set":
                    output_prediction_file = os.path.join(
                        training_args.output_dir,
                        f"holdout_{data_args.n_fold}_{data_args.holdout_set}_prediction.json"
                    )
                else:
                    output_prediction_file = os.path.join(
                        training_args.output_dir, f"{split}_prediction.json")
                with open(output_prediction_file, "w") as f:
                    json.dump(prediction, f)
Esempio n. 30
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (BasicModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).

    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).

    if data_args.dataset not in ['race', 'dream']:
        raise ValueError("Dataset should be race or dream.")
    else:
        from mcmrc.data_utils.processors import prepare_features_for_generate_pseudo_label

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.

    if data_args.debug_mode:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir,
                                split={
                                    'train':
                                    ReadInstruction('train',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'validation':
                                    ReadInstruction('validation',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs'),
                                    'test':
                                    ReadInstruction('test',
                                                    from_=0,
                                                    to=5,
                                                    unit='abs')
                                })
    else:
        datasets = load_dataset(data_args.dataload_script,
                                data_args.dataload_split,
                                data_dir=data_args.data_dir)

    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    column_names = datasets["train"].column_names

    pprepare_features_for_generate_pseudo_label = partial(
        prepare_features_for_generate_pseudo_label,
        tokenizer=tokenizer,
        data_args=data_args)
    tokenized_datasets = datasets.map(
        pprepare_features_for_generate_pseudo_label,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    data_collator = (default_data_collator if data_args.pad_to_max_length else
                     DataCollatorForGeneratingEvidenceLabel(
                         tokenizer=tokenizer))

    device = training_args.device
    model.to(device)
    model.eval()
    if training_args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    pseudo_label = {}
    options_prob_diff = {}
    acc = {}
    for train_test_or_eval, dataset in tokenized_datasets.items():
        dataloader = DataLoader(dataset,
                                batch_size=1,
                                sampler=SequentialSampler(dataset),
                                collate_fn=data_collator,
                                num_workers=0)

        pseudo_label_split = {}
        options_prob_diff_split = {}
        acc_split = {}
        print(train_test_or_eval, len(dataloader))
        for step, batch in enumerate(tqdm.tqdm(dataloader)):
            with torch.no_grad():
                origin_inputs = {
                    "input_ids": batch['input_ids'].to(device),
                    "attention_mask": batch['attention_mask'].to(device),
                    "token_type_ids": batch['token_type_ids'].to(device),
                }
                origin_logits = model(**origin_inputs).logits.detach().cpu()

            example_ids = batch['example_ids']
            sent_bounds = batch['sent_bound_token']

            for i, one_example_sent_bounds in enumerate(sent_bounds):

                if example_ids[i] not in pseudo_label_split.keys():
                    kl_div_per_example = {}
                    prob_diff_per_example = {}
                    pseudo_label_split[example_ids[i]] = kl_div_per_example
                    options_prob_diff_split[
                        example_ids[i]] = prob_diff_per_example
                else:
                    kl_div_per_example = pseudo_label_split[example_ids[i]]
                    prob_diff_per_example = options_prob_diff_split[
                        example_ids[i]]

                one_example_logit = origin_logits[i]
                one_example_sent_bounds = torch.tensor(one_example_sent_bounds,
                                                       device=device)
                one_example_attention_mask = batch['attention_mask'][i]
                one_example_input_ids = batch['input_ids'][i]
                one_example_token_type_ids = batch['token_type_ids'][i]
                one_example_label = batch['labels'][i]
                sent_num = one_example_sent_bounds.size()[0]

                for j in range(0, sent_num, training_args.eval_batch_size):
                    batch_start = j
                    batch_end = min(j + training_args.eval_batch_size, sent_num)
                    batched_sent_bound = torch.stack(
                        (one_example_sent_bounds[batch_start:batch_end, 1],
                         one_example_sent_bounds[batch_start:batch_end,
                                                 2])).unsqueeze(1).permute(
                                                     2, 1, 0)

                    batched_attention_mask = one_example_attention_mask.unsqueeze(
                        0).expand(batch_end - batch_start, -1,
                                  -1).clone().to(device)

                    pos_matrix = torch.arange(
                        batched_attention_mask.size()[-1],
                        device=device).view(1, 1, -1)
                    if_in_sent = torch.logical_and(
                        batched_sent_bound[:, :, 0].unsqueeze(-1) <=
                        pos_matrix, pos_matrix <=
                        batched_sent_bound[:, :, 1].unsqueeze(-1))

                    batched_attention_mask = torch.where(
                        if_in_sent, torch.tensor(0, device=device),
                        batched_attention_mask)
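                    # Editor's toy illustration (added for clarity, not in the
                    # original): token positions whose index falls inside a
                    # sentence's [start, end] bounds get their attention mask
                    # zeroed, which is how one sentence at a time is masked above.
                    _toy_pos = torch.arange(6).view(1, 1, -1)
                    _toy_in_sent = torch.logical_and(torch.tensor(2) <= _toy_pos,
                                                     _toy_pos <= torch.tensor(4))
                    _toy_masked = torch.where(
                        _toy_in_sent, torch.tensor(0),
                        torch.ones(1, 1, 6, dtype=torch.long))
                    assert _toy_masked.tolist() == [[[1, 1, 0, 0, 0, 1]]]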
                    batched_input_ids = one_example_input_ids.expand(
                        batch_end - batch_start, -1, -1).contiguous()
                    batched_token_type_ids = one_example_token_type_ids.expand(
                        batch_end - batch_start, -1, -1).contiguous()

                    with torch.no_grad():
                        masked_inputs = {
                            "input_ids": batched_input_ids.to(device),
                            "attention_mask":
                            batched_attention_mask.to(device),
                            "token_type_ids":
                            batched_token_type_ids.to(device),
                        }
                        masked_logits = model(
                            **masked_inputs).logits.detach().cpu()
                        kl_divs = torch.sum(F.kl_div(
                            F.log_softmax(masked_logits, dim=-1),
                            F.softmax(one_example_logit, dim=-1),
                            reduction='none'),
                                            dim=-1)
                        prob_diff = F.softmax(masked_logits,
                                              dim=-1) - F.softmax(
                                                  one_example_logit, dim=-1)
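                        # Editor's toy sanity check (illustration only, not from
                        # the original): with identical distributions the summed
                        # KL divergence computed above is numerically zero, so
                        # larger values mean masking the sentence mattered more.
                        _toy_logp = F.log_softmax(torch.zeros(1, 4), dim=-1)
                        _toy_kl = torch.sum(F.kl_div(_toy_logp, _toy_logp.exp(),
                                                     reduction='none'),
                                            dim=-1)
                        assert _toy_kl.abs().item() < 1e-6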

                    for k, kl_div in enumerate(
                            kl_divs.detach().cpu().tolist()):
                        sent_idx = one_example_sent_bounds[batch_start + k,
                                                           0].item()
                        evidence_or_noise = 1 if F.softmax(masked_logits[k], dim=-1)[one_example_label].item() \
                                                < F.softmax(one_example_logit, dim=-1)[one_example_label].item() else -1
                        if sent_idx in kl_div_per_example.keys():
                            if kl_div > abs(kl_div_per_example[sent_idx]):
                                kl_div_per_example[
                                    sent_idx] = evidence_or_noise * kl_div
                                prob_diff_per_example[sent_idx] = prob_diff[
                                    k].detach().cpu().tolist()
                        else:
                            kl_div_per_example[
                                sent_idx] = evidence_or_noise * kl_div
                            prob_diff_per_example[sent_idx] = prob_diff[
                                k].detach().cpu().tolist()

                acc_split[example_ids[i]] = 1 if torch.argmax(
                    one_example_logit).item() == one_example_label.item(
                    ) else 0

        pseudo_label[train_test_or_eval] = pseudo_label_split
        options_prob_diff[train_test_or_eval] = options_prob_diff_split
        acc[train_test_or_eval] = acc_split

    label = {
        'pseudo_label': pseudo_label,
        'acc': acc,
        'options_prob_diff': options_prob_diff
    }
    torch.save(
        label, data_args.dataset +
        f"_pseudo_label_with_options_{config.model_type}_{config.hidden_size}.pt"
    )