Esempio n. 1
0
 def test_sequence_classification_forward(self):
     """Smoke-test BartForSequenceClassification: logits shape and scalar loss."""
     config, input_ids, batch_size = self._get_config_and_data()
     # One class-2 target per batch row, moved to the active test device.
     labels = _long_tensor([2] * batch_size).to(torch_device)
     model = BartForSequenceClassification(config)
     model.to(torch_device)
     outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=labels)
     # Classification head emits one logit per label for every batch row.
     expected_shape = torch.Size((batch_size, config.num_labels))
     self.assertEqual(outputs["logits"].shape, expected_shape)
     # Passing labels makes the model also return a scalar loss tensor.
     self.assertIsInstance(outputs["loss"].item(), float)
    def test_lm_forward(self):
        """Forward-pass smoke test for the classification and masked-LM heads.

        Builds a tiny BartConfig, runs a hand-written batch of token ids
        through BartForSequenceClassification and BartForMaskedLM, and checks
        the output shapes plus that the LM loss is a Python float.

        Fix over the original: models are invoked by calling the module
        (``model(...)``) instead of ``model.forward(...)`` so nn.Module hooks
        are honored, per the PyTorch convention.
        """
        input_ids = torch.tensor(
            [
                [71, 82, 18, 33, 46, 91, 2],
                [68, 34, 26, 58, 30, 82, 2],
                [5, 97, 17, 39, 94, 40, 2],
                [76, 83, 94, 25, 70, 78, 2],
                [87, 59, 41, 35, 48, 66, 2],
                [55, 13, 16, 58, 5, 2, 1],  # note padding
                [64, 27, 31, 51, 12, 75, 2],
                [52, 64, 86, 17, 83, 39, 2],
                [48, 61, 9, 24, 71, 82, 2],
                [26, 1, 60, 48, 22, 13, 2],
                [21, 5, 62, 28, 14, 76, 2],
                [45, 98, 37, 86, 59, 48, 2],
                [70, 70, 50, 9, 28, 0, 2],
            ],
            dtype=torch.long,
            device=torch_device,
        )
        batch_size = input_ids.shape[0]
        # Random LM targets with the same (batch, seq_len) shape as the inputs.
        decoder_lm_labels = ids_tensor([batch_size, input_ids.shape[1]],
                                       self.vocab_size)

        config = BartConfig(
            vocab_size=self.vocab_size,
            d_model=24,
            encoder_layers=2,
            decoder_layers=2,
            encoder_attention_heads=2,
            decoder_attention_heads=2,
            encoder_ffn_dim=32,
            decoder_ffn_dim=32,
            max_position_embeddings=48,
        )
        model = BartForSequenceClassification(config)
        model.to(torch_device)
        # Call the module, not .forward, so registered hooks run.
        outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
        logits = outputs[0]
        expected_shape = torch.Size((batch_size, config.num_labels))
        self.assertEqual(logits.shape, expected_shape)

        lm_model = BartForMaskedLM(config)
        lm_model.to(torch_device)
        # With lm_labels supplied the model returns (loss, logits, enc_features).
        loss, logits, enc_features = lm_model(
            input_ids=input_ids,
            lm_labels=decoder_lm_labels,
            decoder_input_ids=input_ids)
        expected_shape = (batch_size, input_ids.shape[1], config.vocab_size)
        self.assertEqual(logits.shape, expected_shape)
        self.assertIsInstance(loss.item(), float)
Esempio n. 3
0
def load_bart_fever_rte_model(model_name, data_dir):
    """Load a fine-tuned BART RTE classifier and its tokenizer.

    Args:
        model_name: name of the fine-tuning run; used to build the
            checkpoint directory ``<data_dir>/FineTuneOn<model_name>``.
        data_dir: root directory holding the fine-tuned checkpoints.

    Returns:
        ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: if the (hard-wired) task name is not registered.
    """
    processors = {
        "rte": RteProcessor
    }

    task_name = 'rte'
    if task_name not in processors:
        raise ValueError(f"Task not found: {task_name}")

    processor = processors[task_name]()

    label_list = processor.get_labels()  # e.g. [0, 1]
    num_labels = len(label_list)

    # Checkpoints are stored under <data_dir>/FineTuneOn<model_name>.
    pretrain_model_dir = f'{data_dir}/FineTuneOn{model_name}'
    model = BartForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BartTokenizer.from_pretrained(pretrain_model_dir)
    return model, tokenizer
Esempio n. 4
0
    def __init__(self):
        """Load the MNLI-tuned BART model (moved to DEVICE) and its tokenizer."""
        checkpoint = 'facebook/bart-large-mnli'
        nli = BartForSequenceClassification.from_pretrained(checkpoint)
        self.nli_model = nli.to(DEVICE)
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
Esempio n. 5
0
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Convert a fairseq BART checkpoint to the Hugging Face format.

    Loads `checkpoint_path` from torch.hub, remaps its state dict into the
    matching HF model class, verifies both models produce identical outputs
    on SAMPLE_TEXT, and saves the converted weights to
    `pytorch_dump_folder_path`.
    """
    bart = torch.hub.load("pytorch/fairseq", checkpoint_path)
    bart.eval()  # disable dropout
    bart.model.upgrade_state_dict(bart.model.state_dict())
    # fairseq names use dots ("bart.large"); HF hub names use dashes.
    hf_model_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_model_name)
    # Sanity check: fairseq and HF tokenizers must agree on the sample text.
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_model_name).encode(
        SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path in ["bart.large", "bart.large.cnn"]:
        # Plain (or CNN-summarization) checkpoints map onto a bare BartModel.
        state_dict = bart.model.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        # HF shares one embedding matrix between encoder and decoder.
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = bart.extract_features(tokens)
    else:  # MNLI Case
        state_dict = bart.state_dict()
        for k in IGNORE_KEYS:
            state_dict.pop(k, None)
        state_dict["model.shared.weight"] = state_dict[
            "model.decoder.embed_tokens.weight"]
        # Rename classification-head keys to their HF equivalents.
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config)
        their_output = bart.eval("mnli", tokens, return_logits=True)

    # Load state dict
    model.load_state_dict(state_dict)
    model.eval()
    # Check results

    if checkpoint_path == "bart.large.cnn":  # generate doesnt work yet
        model = BartForMaskedLM(config, base_model=model)
        assert "lm_head.weight" in model.state_dict()
        # NOTE(review): comparing lm_head.out_features to max_position_embeddings
        # looks suspicious — an LM head normally projects to vocab_size; confirm.
        assert model.lm_head.out_features == config.max_position_embeddings
        model.eval()
        our_outputs = model.model.forward(tokens)[0]
    else:
        our_outputs = model.forward(tokens)[0]
    # The converted model must reproduce fairseq's outputs exactly.
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
Esempio n. 6
0
    def __init__(self, hparams, get_dataset):
        """Pick a generation or classification BART model based on hparams.task."""
        super().__init__()
        self.hparams = hparams
        self.get_dataset = get_dataset

        if self.hparams.task == "generation":
            chosen = BartForConditionalGeneration.from_pretrained(
                hparams.model_name_or_path)
        else:
            # Classification needs the label count patched into the config.
            cfg = BartConfig.from_pretrained(hparams.model_name_or_path)
            cfg.num_labels = hparams.num_labels
            chosen = BartForSequenceClassification.from_pretrained(
                hparams.model_name_or_path, config=cfg)
        self.model = chosen

        self.tokenizer = BartTokenizer.from_pretrained(
            hparams.tokenizer_name_or_path)
Esempio n. 7
0
def create_student(teacher, student_encoder_layers, student_decoder_layers):
    """Build a smaller student initialized from a pretrained teacher.

    `teacher` is a checkpoint name/path; the student reuses the teacher's
    config except for the requested encoder/decoder layer counts, then copies
    weights from the selected teacher layers.
    """
    teacher_model = BartForSequenceClassification.from_pretrained(teacher).eval()

    layer_overrides = {
        "encoder_layers": student_encoder_layers,
        "decoder_layers": student_decoder_layers,
    }
    # Decide which teacher layers the shrunken student inherits.
    e_layers_to_copy: List = get_layers_to_copy(
        layer_overrides["encoder_layers"], teacher_model.config.encoder_layers)
    d_layers_to_copy: List = get_layers_to_copy(
        layer_overrides["decoder_layers"], teacher_model.config.decoder_layers)

    cfg_kwargs = teacher_model.config.to_diff_dict()
    cfg_kwargs.update(layer_overrides)
    # Same architecture class as the teacher, smaller config.
    student_cfg = teacher_model.config_class(**cfg_kwargs)
    student = type(teacher_model)(student_cfg)
    student, _ = init_student(student, teacher_model)
    copy_to_student(d_layers_to_copy, e_layers_to_copy, student_encoder_layers,
                    student_decoder_layers, student, teacher_model)
    return student
async def upload_pretrained_model_to_classifier(category: str):  # , pretrained_model: str):
    """Load a locally fine-tuned BART classifier, wrap it in a zero-shot
    pipeline, and register it in the in-memory db under a fresh id.

    Fixes user-facing message typos from the original ("resourse", "apended").
    """
    print("\n@ PUT upload_pretrained_2_classifier resource.")
    print("loading local pretrained model...")

    # NOTE(review): checkpoint path is machine-specific; consider making it configurable.
    pretrained_model = BartForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning/",
        local_files_only=True)
    tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/', vocab_file="results/fine-tuning/vocab.json",
                                              merges_file="results/fine-tuning/merges.txt")

    classifier = pipeline('zero-shot-classification', model=pretrained_model, tokenizer=tokenizer)  # , device=0)

    # Register the classifier in the in-memory db with a monotonically increasing id.
    new_id = 1 if len(db) == 0 else db[-1]['cl_id'] + 1
    db.append(
        {'cl_id': new_id, 'model': classifier, 'category': category, 'data': [], 'description': 'fine-tuned model'})

    return {'data': None,
            'message': f"Fine-tuned Model appended to the database under the id: {new_id}. Try the new resource here "
                       f"--> http://127.0.0.1:8000/api/zero-shot/{new_id}"}
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path):
    """
    Convert a fairseq BART checkpoint to the Hugging Face format.

    Loads `checkpoint_path` from torch.hub, remaps its state dict into a
    BartModel ("bart.large") or BartForSequenceClassification (MNLI),
    verifies both models agree on SAMPLE_TEXT, and saves the result to
    `pytorch_dump_folder_path`.
    """
    b2 = torch.hub.load("pytorch/fairseq", checkpoint_path)
    b2.eval()  # disable dropout
    b2.model.upgrade_state_dict(b2.model.state_dict())
    config = BartConfig()
    # Sanity check: fairseq and HF tokenizers must agree on the sample text.
    tokens = b2.encode(SAMPLE_TEXT).unsqueeze(0)
    # NOTE(review): in modern transformers, encode() returns a list, which has
    # no .unsqueeze — this presumes an older tensor-returning API; verify version.
    tokens2 = BartTokenizer.from_pretrained("bart-large").encode(
        SAMPLE_TEXT).unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    # assert their_output.size() == (1, 11, 1024)

    if checkpoint_path == "bart.large":
        state_dict = b2.model.state_dict()
        # HF shares one embedding matrix between encoder and decoder.
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        model = BartModel(config)
        their_output = b2.extract_features(tokens)

    else:  # MNLI Case
        state_dict = b2.state_dict()
        state_dict["model.shared.weight"] = state_dict[
            "model.decoder.embed_tokens.weight"]
        # Rename classification-head keys to their HF equivalents.
        for src, dest in rename_keys:
            rename_key(state_dict, src, dest)
        state_dict.pop("_float_tensor", None)
        model = BartForSequenceClassification(config)
        their_output = b2.predict("mnli", tokens, return_logits=True)
    # Drop fairseq-only keys the HF model has no slot for.
    for k in IGNORE_KEYS:
        state_dict.pop(k, None)
    model.load_state_dict(state_dict)
    model.eval()
    our_outputs = model.forward(tokens)[0]

    # The converted model must reproduce fairseq's outputs exactly.
    assert their_output.shape == our_outputs.shape
    assert (their_output == our_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
Esempio n. 10
0
    def __init__(self, model_name='facebook/bart-large-mnli', device=None):
        """
        interface to BART-based zero-shot classification using transformers library

        Args:
          model_name(str): name of a BART MNLI model
          device(str): device to use (e.g., 'cuda', 'cpu')

        Raises:
          ValueError: if model_name is not an MNLI checkpoint.
          Exception: if PyTorch is not installed.
        """
        # Zero-shot classification is framed as NLI, so an MNLI head is required.
        if 'mnli' not in model_name:
            raise ValueError('ZeroShotClassifier requires an MNLI model')
        try:
            import torch
        except ImportError as err:
            raise Exception(
                'ZeroShotClassifier requires PyTorch to be installed.') from err
        # Default to GPU when available, CPU otherwise.
        self.torch_device = device
        if self.torch_device is None:
            self.torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
        from transformers import BartForSequenceClassification, BartTokenizer
        self.tokenizer = BartTokenizer.from_pretrained(model_name)
        self.model = BartForSequenceClassification.from_pretrained(
            model_name).to(self.torch_device)
Esempio n. 11
0
def loading_bart_model():
    """Return the MNLI-tuned BART classifier and tokenizer.

    Fixes the original ``output_attention`` kwarg: the config flag is spelled
    ``output_attentions``, so the misspelling silently did nothing. The
    tokenizer takes no such flag, so it is no longer passed one.
    """
    bart_model = BartForSequenceClassification.from_pretrained('facebook/bart-large-mnli', output_attentions=True)
    bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
    return bart_model, bart_tokenizer
Esempio n. 12
0
def setupBartSentimentAnalysis(modelName):
    """Build a sentiment-analysis pipeline around the given BART checkpoint."""
    bart_tokenizer = BartTokenizer.from_pretrained(modelName)
    bart_model = BartForSequenceClassification.from_pretrained(modelName)
    sentiment_pipeline = pipeline(task="sentiment-analysis",
                                  model=bart_model,
                                  tokenizer=bart_tokenizer)
    return sentiment_pipeline
Esempio n. 13
0
# Training setup: dataloaders, KoBART classifier, and weight-decay-aware AdamW.
batch_size = 16
train_dataset = KGBDDataset(train_dev['train'])
valid_dataset = KGBDDataset(train_dev['dev'])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              num_workers=4, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,
                              num_workers=4, shuffle=False)

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import BartForSequenceClassification

model = BartForSequenceClassification.from_pretrained(
    get_pytorch_kobart_model()).cuda()

# Biases and LayerNorm parameters train without weight decay.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']


def _wants_decay(param_name):
    # True when none of the no-decay markers appear in the parameter name.
    return not any(marker in param_name for marker in no_decay)


optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if _wants_decay(n)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if not _wants_decay(n)],
     'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, correct_bias=False)
Esempio n. 14
0
def main():
    """Fine-tune / evaluate BART (TAPEX) for table-fact verification.

    Parses model/data/training arguments, loads a TabFact-style dataset
    (hub or local CSV/JSON), tokenizes tables+statements with TapexTokenizer,
    and drives a HF Trainer through optional train / eval / predict phases.

    Fixes over the original: two broken log messages (a missing separator in
    the process-rank warning, and a missing space in the max_seq_length
    warning that printed "themodel").
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For JSON files, this script will use the `question` column for the input question and `table` column for the corresponding table.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            raw_datasets = load_dataset("csv",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            raw_datasets = load_dataset("json",
                                        data_files=data_files,
                                        cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    label_list = raw_datasets["train"].features["label"].names
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # load tapex tokenizer
    tokenizer = TapexTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        add_prefix_space=True,
    )
    model = BartForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    model.config.label2id = {"Refused": 0, "Entailed": 1}
    model.config.id2label = {0: "Refused", 1: "Entailed"}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_tabfact_function(examples):
        # Tokenize the texts
        def _convert_table_text_to_pandas(_table_text):
            """Runs the structured pandas table object for _table_text.
            An example _table_text can be: round#clubs remaining\nfirst round#156\n
            """
            _table_content = [
                _table_row.split("#")
                for _table_row in _table_text.strip("\n").split("\n")
            ]
            _table_pd = pd.DataFrame.from_records(_table_content[1:],
                                                  columns=_table_content[0])
            return _table_pd

        questions = examples["statement"]
        tables = list(
            map(_convert_table_text_to_pandas, examples["table_text"]))
        result = tokenizer(tables,
                           questions,
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        result["label"] = examples["label"]
        return result

    with training_args.main_process_first(desc="dataset map pre-processing"):
        raw_datasets = raw_datasets.map(
            preprocess_tabfact_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    if training_args.do_predict or data_args.test_file is not None:
        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(
                range(data_args.max_predict_samples))

    # Log a few random samples from the training set:
    if training_args.do_train:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(
                f"Sample {index} of the training set: {train_dataset[index]}.")

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions,
                                               tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        return {
            "accuracy":
            (preds == p.label_ids).astype(np.float32).mean().item()
        }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer,
                                                pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # Removing the `label` columns because it contains -1 and Trainer won't like that.
        predict_dataset = predict_dataset.remove_columns("label")
        predictions = trainer.predict(predict_dataset,
                                      metric_key_prefix="predict").predictions
        predictions = np.argmax(predictions, axis=1)

        output_predict_file = os.path.join(training_args.output_dir,
                                           "predict_results_tabfact.txt")
        if trainer.is_world_process_zero():
            with open(output_predict_file, "w") as writer:
                logger.info("***** Predict Results *****")
                writer.write("index\tprediction\n")
                for index, item in enumerate(predictions):
                    item = label_list[item]
                    writer.write(f"{index}\t{item}\n")

    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "text-classification"
    }

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)
Esempio n. 15
0
def main():
    """CLI entry point: train/evaluate a transformer classifier on the
    disaster-response-messages dataset (bert/bart/gpt2/roberta/xlnet)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        type=str,
                        default='data/disaster_response_messages_training.csv')
    parser.add_argument('--test',
                        type=str,
                        default='data/disaster_response_messages_test.csv')
    parser.add_argument(
        '--validation',
        type=str,
        default='data/disaster_response_messages_validation.csv')
    parser.add_argument('--epoch', type=str, default='10')
    parser.add_argument('--model',
                        type=str,
                        default='bert',
                        choices=['bert', 'bart', 'gpt2', 'roberta', 'xlnet'])
    args = parser.parse_args()

    EPOCH = int(args.epoch)
    model_name = args.model

    # One dataset class per supported architecture.
    dataset_classes = {
        'bert': BertDataset,
        'bart': BartDataset,
        'gpt2': GPT2Dataset,
        'roberta': RobertaDataset,
        'xlnet': XLNetDataset,
    }
    dataset_cls = dataset_classes[model_name]
    train_set = dataset_cls(args.train)
    val_set = dataset_cls(args.validation)
    test_set = dataset_cls(args.test)

    train_loader = DataLoader(train_set, batch_size=20, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=20, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=20, shuffle=False)

    print('Data Loaded.')

    # Instantiate the matching pretrained two-label classifier head.
    if model_name == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
    elif model_name == 'gpt2':
        model = GPT2ForSequenceClassification.from_pretrained('gpt2',
                                                              num_labels=2)
        # GPT-2 has no pad token by default; reuse EOS for padding.
        model.config.pad_token_id = model.config.eos_token_id
    elif model_name == 'bart':
        model = BartForSequenceClassification.from_pretrained(
            'facebook/bart-base', num_labels=2)
    elif model_name == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=2)
    elif model_name == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', num_labels=2)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_loader) * EPOCH
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    criterion = nn.CrossEntropyLoss()

    print('\nModel: ', model_name, '\tEpochs: ', EPOCH)

    epoch_loss = []
    epoch_val_acc = []

    # Train, tracking per-epoch loss and validation accuracy.
    for epoch in range(EPOCH):
        tqdm.write('Epoch: {}'.format(epoch + 1))
        epoch_loss.append(
            train(model, train_loader, criterion, optimizer, scheduler))
        epoch_val_acc.append(val(model, val_loader))

    torch.save(model, model_name + '/' + model_name + '_model.pt')

    tqdm.write('\nFinal test...')
    test_result = test(model, test_loader)

    # Persist per-epoch curves and the final test metrics next to the model.
    for suffix, payload in (('_loss.p', epoch_loss),
                            ('_val_accuracy.p', epoch_val_acc),
                            ('_test_result.p', test_result)):
        with open(model_name + '/' + model_name + suffix, 'wb') as f:
            pickle.dump(payload, f)
Esempio n. 16
0
    def load_class(self):
        """Instantiate self.model according to self.model_name (and, for BART,
        self.task), mirroring the supported sequence-classification backends."""
        if self.verbose == True:
            print('Loading {} class...'.format(self.model_name))

        # Backends that share the same from_pretrained signature.
        sequence_classifiers = {
            'bert': BertForSequenceClassification,
            'distilbert': DistilBertForSequenceClassification,
            'albert': AlbertForSequenceClassification,
            'xlnet': XLNetForSequenceClassification,
            'roberta': RobertaForSequenceClassification,
            'camenbert': CamembertForSequenceClassification,
            'flaubert': FlaubertForSequenceClassification,
        }
        if self.model_name in sequence_classifiers:
            self.model = sequence_classifiers[self.model_name].from_pretrained(
                self.model_type,
                num_labels=self.num_labels,  # increase for multi-class tasks
                output_attentions=False,  # do not return attention weights
                output_hidden_states=False,  # do not return all hidden states
            )
        if self.model_name == 'bart':
            # BART doubles as a classifier or a summarizer depending on task.
            if self.task == 'classification':
                self.model = BartForSequenceClassification.from_pretrained(
                    self.model_type,
                    num_labels=self.num_labels,
                    output_attentions=False,
                    output_hidden_states=False,
                )
            if self.task == 'summarize':
                self.model = BartForConditionalGeneration.from_pretrained(
                    self.model_type)
        if self.model_name == 'gpt2':
            self.model = GPT2LMHeadModel.from_pretrained(self.model_type)
Esempio n. 17
0
def load_bart_model_tokenizer(model_name):
    """Load a BART classification checkpoint and its matching tokenizer.

    Parameters
    ----------
    model_name : str
        Pretrained checkpoint name or path; used for both objects so the
        tokenizer vocabulary matches the model embeddings.

    Returns
    -------
    tuple
        ``(model, tokenizer)``.
    """
    bart_tokenizer = BartTokenizer.from_pretrained(model_name)
    bart_model = BartForSequenceClassification.from_pretrained(model_name)
    return bart_model, bart_tokenizer
# Esempio n. 18
# 0
 def __init__(self, hparams, **kwargs):
     """Initialise the classifier: pretrained KoBART backbone plus an accuracy metric."""
     super(KoBARTClassification, self).__init__(hparams, **kwargs)
     # get_pytorch_kobart_model() resolves the KoBART checkpoint location.
     self.model = BartForSequenceClassification.from_pretrained(get_pytorch_kobart_model())
     # Put the backbone in training mode up front.
     self.model.train()
     # Accuracy metric — presumably from PyTorch Lightning (`pl`); confirm version.
     self.metric_acc = pl.metrics.classification.Accuracy()
    def __getitem__(self, idx):
        """Return one sample as a ``(text, label)`` pair.

        Column 1 of ``self.df`` holds the raw text, column 2 the label.
        """
        sample_text = self.df.iloc[idx, 1]
        sample_label = self.df.iloc[idx, 2]
        return (sample_text, sample_label)


# Build the NSMC training dataset and sanity-check the first sample.
nsmc_train_dataset = NsmcDataset(train_df)
print(nsmc_train_dataset.__getitem__(0))
# batch_size=1: each raw (text, label) pair is tokenized individually later.
train_loader = DataLoader(nsmc_train_dataset,
                          batch_size=1,
                          shuffle=True,
                          num_workers=2)

# NOTE(review): 'facebook/bart-large' is an English checkpoint while NSMC is
# a Korean corpus — confirm this tokenizer/model pairing is intended.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForSequenceClassification.from_pretrained('facebook/bart-large')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
import torch.nn.functional as F

# Small learning rate for fine-tuning the large pretrained model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

itr = 1  # running iteration counter
p_itr = 500  # report interval (iterations)
epochs = 2
total_loss = 0
total_len = 0
total_correct = 0

model.train()
for epoch in range(epochs):
async def retrain_classifier(data_id: int):
    """Fine-tune the stored zero-shot classifier identified by ``data_id``.

    Looks up the pipeline and its accumulated feedback data in the in-memory
    ``db``, fine-tunes a BART sequence-classification model on that feedback
    with the HF ``Trainer``, persists the result under ``results/fine-tuning/``
    and appends a fresh zero-shot pipeline built from it to ``db``.

    Returns a dict with a human-readable message pointing at the new resource.
    """
    # commented code works for automatic custom size dataset generation...
    print(f"\n@ POST request_retrain; data_id: {data_id}")

    # this will get a stored pipeline under id and init model var with it
    # BUGFIX: initialise once *before* the loop — the original reset
    # stored_pipeline to None on every iteration, so a match followed by
    # non-matching db entries lost the pipeline again.
    stored_pipeline = None
    for i in db:
        if i['cl_id'] == data_id:
            df_retrain = i['data']
            # Feedback rows store labels/scores as one-element lists; unwrap.
            df_retrain.loc[:, 'labels'] = df_retrain['labels'].apply(lambda s: s[0])
            df_retrain.loc[:, 'scores'] = df_retrain['scores'].apply(lambda s: s[0])
            stored_pipeline = i['model']  # get the pipeline
            try:
                description = i['description']
                print(f"description found: {description}")
            except KeyError:
                pass

    model = stored_pipeline
    print(f"\ndf_retrain: \n{df_retrain}")
    labels_list = df_retrain['labels'].tolist()
    print(f"labels found: {labels_list}")
    label_nm = list(set(labels_list))[0]

    # Use the stored confidence scores as the training targets.
    df_retrain['labels'] = df_retrain['scores']

    X = df_retrain['Plot'].tolist()
    y = df_retrain['labels'].tolist()
    df_retrain.drop(columns=["Author", "scores"], inplace=True)

    # print(f"df_retrain (edit2): \n{df_retrain}")
    print(f"X: {X}\ny: {y}")

    # try except when number of samples is less than 10
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

    # TOKENIZATION — prefer the fine-tuned tokenizer when this classifier was
    # itself produced by a previous fine-tuning run.
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-mnli')
    try:
        if description == "fine-tuned model":
            print("using tuned tokenizer")
            tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/',
                                                      vocab_file="results/fine-tuning/vocab.json",
                                                      merges_file="results/fine-tuning/merges.txt")
    except UnboundLocalError:
        # `description` was never assigned in the lookup loop above.
        print("There no assign description for this classifier...")
    finally:
        print("BartTokenizer ready!")

    train_encodings = tokenizer(X_train, truncation=True, padding=True)
    val_encodings = tokenizer(X_val, truncation=True, padding=True)
    test_encodings = tokenizer(X_test, truncation=True, padding=True)

    # PYTORCH OBJECTS
    train_dataset = FeedbackDataset(train_encodings, y_train)
    val_dataset = FeedbackDataset(val_encodings, y_val)
    test_dataset = FeedbackDataset(test_encodings, y_test)  # currently unused; kept for parity

    # FINE-TUNING
    # Option 1: FINE-TUNING WITH TRAINER
    training_args = TrainingArguments(
        output_dir='./results',  # output directory
        num_train_epochs=2,  # 10... total number of training epochs
        per_device_train_batch_size=4,  # 16 ... batch size per device during training
        per_device_eval_batch_size=8,  # 64 ... batch size for evaluation
        warmup_steps=1,  # 500 ... number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=1,
    )

    # model here should be whatever classifier is in this id and not always bart-large-mnli, it will only work for demo
    try:
        if description == "fine-tuned model":
            print("using tuned model...")
            model = BartForSequenceClassification.from_pretrained(
                pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning"
                                              "/pytorch_model.bin", local_files_only=True)

    except UnboundLocalError:
        print("There no assign description for this classifier...")
        model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

    finally:
        print("BartForSequenceClassification ready!")

    # (A second, byte-identical tokenizer-loading block that appeared here was
    # removed: it re-ran the exact same logic as the one above, to no effect.)

    try:
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=val_dataset  # evaluation dataset
        )
    except RuntimeError:
        # Typically raised when CUDA is unavailable/misconfigured; retry on CPU.
        print("CUDA RuntimeError. Device changed to cpu")
        training_args = TrainingArguments(
            output_dir='./results',  # output directory
            num_train_epochs=3,  # total number of training epochs
            per_device_train_batch_size=16,  # batch size per device during training
            per_device_eval_batch_size=64,  # batch size for evaluation
            warmup_steps=500,  # number of warmup steps for learning rate scheduler
            weight_decay=0.01,  # strength of weight decay
            logging_dir='./logs',  # directory for storing logs
            logging_steps=10,
            no_cuda=True,
        )
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=train_dataset,  # training dataset
            eval_dataset=val_dataset  # evaluation dataset
        )

    print("\ntraining...")

    trainer.train()

    try:
        trainer.save_model('results/trainer/')

        # Unwrap DataParallel before saving so the checkpoint loads standalone.
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained('results/fine-tuning/')
        tokenizer.save_pretrained('results/fine-tuning/')

    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; saving remains deliberately best-effort.
        print("error saving with results/[trainer, fine-tuning]")

    print("fine-tuned and stored, output_dir = './results/fine-tuning/'")  #

    # LOAD MODEL TO DB
    pretrained_model = BartForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path="C:/Users/lavml/Documents/SoSe20/nlp/BERT/restapi/results/fine-tuning/", local_files_only=True)
    tokenizer = BartTokenizer.from_pretrained('results/fine-tuning/', vocab_file="results/fine-tuning/vocab.json",
                                              merges_file="results/fine-tuning/merges.txt")

    classifier = pipeline('zero-shot-classification', model=pretrained_model, tokenizer=tokenizer)  # , device=0)

    # add it to the db

    new_id = 1 if len(db) == 0 else db[-1]['cl_id'] + 1
    db.append(
        {'cl_id': new_id, 'model': classifier, 'category': label_nm, 'data': [], 'description': 'fine-tuned model'})

    return {'data': None,
            'message': f"Fine-tuned Model apended to the database under the id: {new_id}. Try the new resource here "
                       f"--> http://127.0.0.1:8000/api/zero-shot/{new_id}"}
# Esempio n. 21
# 0
def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None):
    """
    Copy/paste/tweak model's weights to our BERT structure.

    Loads a fairseq BART checkpoint (via torch.hub, or from a local xsum
    file), maps its state dict onto the matching Hugging Face BART class,
    asserts that both models produce identical outputs on SAMPLE_TEXT, and
    saves the converted model to ``pytorch_dump_folder_path``.
    """
    # A hub checkpoint name (e.g. "bart.large.mnli") will not exist on disk.
    if not os.path.exists(checkpoint_path):
        bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval()
    else:
        bart = load_xsum_checkpoint(checkpoint_path)

    bart.model.upgrade_state_dict(bart.model.state_dict())
    if hf_checkpoint_name is None:
        # "bart.large.mnli" -> "bart-large-mnli" naming convention.
        hf_checkpoint_name = checkpoint_path.replace(".", "-")
    config = BartConfig.from_pretrained(hf_checkpoint_name)
    # Sanity check: fairseq and HF tokenizers must agree on the sample text.
    tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0)
    tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0)
    assert torch.eq(tokens, tokens2).all()

    if checkpoint_path == "bart.large.mnli":
        # MNLI checkpoint: keep the classification head, rename its keys.
        state_dict = bart.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"]
        for src, dest in mnli_rename_keys:
            rename_key(state_dict, src, dest)
        model = BartForSequenceClassification(config).eval()
        model.load_state_dict(state_dict)
        fairseq_output = bart.predict("mnli", tokens, return_logits=True)
        new_model_outputs = model(tokens)[0]  # logits
    else:  # no classification heads to worry about
        state_dict = bart.model.state_dict()
        remove_ignore_keys_(state_dict)
        state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
        fairseq_output = bart.extract_features(tokens)
        if hf_checkpoint_name == "facebook/bart-large":
            model = BartModel(config).eval()
            model.load_state_dict(state_dict)
            # NOTE(review): the `.model` attribute on a BartModel forward
            # output looks suspicious — confirm it matches the installed
            # transformers version's return type.
            new_model_outputs = model(tokens).model[0]
        else:
            model = BartForConditionalGeneration(config).eval()  # an existing summarization ckpt
            model.model.load_state_dict(state_dict)
            if hasattr(model, "lm_head"):
                # Re-tie the LM head to the shared embeddings after loading.
                model.lm_head = _make_linear_from_emb(model.model.shared)
            new_model_outputs = model.model(tokens)[0]

    # Check results
    assert fairseq_output.shape == new_model_outputs.shape
    assert (fairseq_output == new_model_outputs).all().item()
    Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
    model.save_pretrained(pytorch_dump_folder_path)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_data_aug",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    # label_list = processor.get_labels() #["entailment", "neutral", "contradiction"]
    # label_list = ['How_do_I_create_a_profile_v4', 'Profile_Switch_v4', 'Deactivate_Active_Devices_v4', 'Ads_on_Hulu_v4', 'Watching_Hulu_with_Live_TV_v4', 'Hulu_Costs_and_Commitments_v4', 'offline_downloads_v4', 'womens_world_cup_v5', 'forgot_username_v4', 'confirm_account_cancellation_v4', 'Devices_to_Watch_HBO_on_v4', 'remove_add_on_v4', 'Internet_Speed_for_HD_and_4K_v4', 'roku_related_questions_v4', 'amazon_related_questions_v4', 'Clear_Browser_Cache_v4', 'ads_on_ad_free_plan_v4', 'inappropriate_ads_v4', 'itunes_related_questions_v4', 'Internet_Speed_Recommendations_v4', 'NBA_Basketball_v5', 'unexpected_charges_v4', 'change_billing_date_v4', 'NFL_on_Hulu_v5', 'How_to_delete_a_profile_v4', 'Devices_to_Watch_Hulu_on_v4', 'Manage_your_Hulu_subscription_v4', 'cancel_hulu_account_v4', 'disney_bundle_v4', 'payment_issues_v4', 'home_network_location_v4', 'Main_Menu_v4', 'Resetting_Hulu_Password_v4', 'Update_Payment_v4', 'I_need_general_troubleshooting_help_v4', 'What_is_Hulu_v4', 'sprint_related_questions_v4', 'Log_into_TV_with_activation_code_v4', 'Game_of_Thrones_v4', 'video_playback_issues_v4', 'How_to_edit_a_profile_v4', 'Watchlist_Remove_Video_v4', 'spotify_related_questions_v4', 'Deactivate_Login_Sessions_v4', 'Transfer_to_Agent_v4', 'Use_Hulu_Internationally_v4']

    train_examples, dev_examples, eval_examples, label_list = load_CLINC150_with_specific_domain(
        'banking', 1, augment=args.do_data_aug)
    num_labels = len(label_list)

    # train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        # train_examples = processor.get_RTE_as_train('/export/home/Dataset/glue_data/RTE/train.tsv') #train_pu_half_v1.txt
        # train_examples = get_data_hulu_fewshot('train', 5)

        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    # cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), 'distributed_{}'.format(args.local_rank))

    # pretrain_model_dir = 'roberta-large-mnli' #'roberta-large' , 'roberta-large-mnli'
    # pretrain_model_dir = '/export/home/Dataset/BERT_pretrained_mine/crossdataentail/trainMNLItestRTE/0.8772563176895307'

    model_config = BartConfig.from_pretrained(pretrain_model_dir)
    model_config.num_labels = num_labels
    model = BartForSequenceClassification.from_pretrained(pretrain_model_dir,
                                                          config=model_config)
    # print('after:', model.classification_head.out_proj.out_features)
    # exit(0)

    # tokenizer = RobertaTokenizer.from_pretrained(pretrain_model_dir, do_lower_case=args.do_lower_case)
    tokenizer = BartTokenizer.from_pretrained(pretrain_model_dir,
                                              do_lower_case=args.do_lower_case)

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)
        '''load dev set'''
        # dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv')
        # dev_examples = get_data_hulu('dev')
        dev_features = convert_examples_to_features(
            dev_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                         dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                          dtype=torch.long)
        dev_all_segment_ids = torch.tensor(
            [f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask,
                                 dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)
        '''load test set'''
        # eval_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt')
        # eval_examples = get_data_hulu('test')
        eval_features = convert_examples_to_features(
            eval_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        eval_all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                          dtype=torch.long)
        eval_all_input_mask = torch.tensor(
            [f.input_mask for f in eval_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor(
            [f.segment_ids for f in eval_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                          dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask,
                                  eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        iter_co = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids, input_mask, labels=label_ids)
                # loss_fct = CrossEntropyLoss()
                loss = outputs[
                    0]  #loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                # if iter_co %20==0:
                if iter_co % len(train_dataloader) == 0:
                    '''
                    start evaluate on dev set after this epoch
                    '''
                    model.eval()

                    for idd, dev_or_test_dataloader in enumerate(
                        [dev_dataloader, eval_dataloader]):

                        if idd == 0:
                            logger.info("***** Running dev *****")
                            logger.info("  Num examples = %d",
                                        len(dev_examples))
                        else:
                            logger.info("***** Running test *****")
                            logger.info("  Num examples = %d",
                                        len(eval_examples))
                        # logger.info("  Batch size = %d", args.eval_batch_size)

                        eval_loss = 0
                        nb_eval_steps = 0
                        preds = []
                        gold_label_ids = []
                        # print('Evaluating...')
                        for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader:
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)
                            gold_label_ids += list(
                                label_ids.detach().cpu().numpy())

                            with torch.no_grad():
                                logits = model(input_ids,
                                               input_mask,
                                               labels=None)
                            # print('logits:', logits)
                            logits = logits[0]

                            loss_fct = CrossEntropyLoss()
                            tmp_eval_loss = loss_fct(
                                logits.view(-1, num_labels),
                                label_ids.view(-1))

                            eval_loss += tmp_eval_loss.mean().item()
                            nb_eval_steps += 1
                            if len(preds) == 0:
                                preds.append(logits.detach().cpu().numpy())
                            else:
                                preds[0] = np.append(
                                    preds[0],
                                    logits.detach().cpu().numpy(),
                                    axis=0)

                        eval_loss = eval_loss / nb_eval_steps
                        preds = preds[0]
                        '''
                        preds: size*3 ["entailment", "neutral", "contradiction"]
                        wenpeng added a softxmax so that each row is a prob vec
                        '''
                        pred_probs = softmax(preds, axis=1)
                        pred_label_ids = list(np.argmax(pred_probs, axis=1))
                        # pred_indices = np.argmax(pred_probs, axis=1)

                        # pred_label_ids = []
                        # for p in pred_indices:
                        #     pred_label_ids.append(0 if p == 0 else 1)

                        gold_label_ids = gold_label_ids
                        assert len(pred_label_ids) == len(gold_label_ids)
                        hit_co = 0
                        for k in range(len(pred_label_ids)):
                            if pred_label_ids[k] == gold_label_ids[k]:
                                hit_co += 1
                        test_acc = hit_co / len(gold_label_ids)

                        if idd == 0:  # this is dev
                            if test_acc > max_dev_acc:
                                max_dev_acc = test_acc
                                print('\ndev acc:', test_acc, ' max_dev_acc:',
                                      max_dev_acc, '\n')

                            else:
                                print('\ndev acc:', test_acc, ' max_dev_acc:',
                                      max_dev_acc, '\n')
                                break
                        else:  # this is test
                            if test_acc > max_test_acc:
                                max_test_acc = test_acc
                            print('\ntest acc:', test_acc, ' max_test_acc:',
                                  max_test_acc, '\n')