Example #1
def test_best_model(analysis, model_name, task_name, data_dir):
    data_args = DataTrainingArguments(task_name=task_name, data_dir=data_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    best_config = analysis.get_best_config(metric="eval_acc", mode="max")
    print(best_config)
    best_checkpoint = recover_checkpoint(
        analysis.get_best_trial(metric="eval_acc",
                                mode="max").checkpoint.value)
    print(best_checkpoint)
    best_model = AutoModelForSequenceClassification.from_pretrained(
        best_checkpoint).to("cuda")

    test_args = TrainingArguments(output_dir="./best_model_results")
    test_dataset = GlueDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=data_dir)
    # Keep only the second half of the dev split
    test_dataset = test_dataset[len(test_dataset) // 2:]

    test_trainer = Trainer(best_model,
                           test_args,
                           compute_metrics=build_compute_metrics_fn(task_name))

    metrics = test_trainer.evaluate(test_dataset)
    print(metrics)
Example #2
    def setUpClass(self):
        self.MODEL_ID = "albert-base-v2"
        self.data_args = DataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.dataset = GlueDataset(self.data_args, self.tokenizer, mode="dev")
        self.config = AutoConfig.from_pretrained(
            self.MODEL_ID, num_labels=3, finetuning_task="mrpc")
        self.dataloader = DataLoader(
            self.dataset, batch_size=2, collate_fn=default_data_collator)
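The fixture above pairs GlueDataset with default_data_collator; as a minimal sketch (an assumption, not part of the original test), iterating the resulting DataLoader yields dicts of stacked tensors whose shapes follow batch_size=2 and the default max_seq_length of 128:

# Minimal sketch (assumed continuation inside a test method of the same class):
# default_data_collator stacks the InputFeatures into a dict of tensors.
batch = next(iter(self.dataloader))
assert batch["input_ids"].shape == torch.Size([2, 128])
assert batch["attention_mask"].shape == torch.Size([2, 128])
assert batch["labels"].shape == torch.Size([2])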
Example #3
def run_pos(task_key: str,
            cfg: CN,
            model,
            model_args,
            training_args,
            tokenizer,
            mode="train",
            extract=False,
            **kwargs):
    r"""
        cfg: YACS cfg node
        ckpt_path: Unsupported
    """
    task_name = "POS"
    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=cfg.DATA.DATAPATH)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # Get datasets
    pos_dataset = load_features_dict(tokenizer, cfg)
    train_dataset = pos_dataset["pos"]["train"]
    eval_dataset = pos_dataset["pos"]["validation"]

    # Initialize our Trainer
    trainer = FixedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=get_eval_metrics_func(task_key),
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer),
        config=cfg)

    if mode == "train":
        trainer.train()
    if mode != "train" or cfg.EVAL_ON_COMPLETION:
        extract_path = None
        if extract:
            extract_path = get_extract_path(cfg, model_args)
        metrics = trainer.evaluate(extract_path=extract_path,
                                   cache_path=osp.join(
                                       cfg.TASK.EXTRACT_TOKENS_MASK_CACHE,
                                       task_key))
        metrics_file = get_metrics_path(cfg, model_args)
        torch.save(metrics, metrics_file)
Example #4
def run_glue(task_key,
             cfg,
             model,
             model_args,
             training_args,
             tokenizer,
             mode="train",
             extract=False,
             **kwargs):
    r"""
        cfg: YACS cfg node
        ckpt_path: Unsupported
    """
    task_name = TASK_KEY_TO_NAME[task_key]

    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=cfg.DATA.DATAPATH)

    glue_dataset = load_features_dict(tokenizer, cfg)
    # print(glue_dataset.keys())
    train_dataset = glue_dataset[task_key]['train']
    split_key = cfg.EVAL.SPLIT
    if task_key == "mnli":
        split_key = f"{split_key}_matched"
    eval_dataset = glue_dataset[task_key][split_key]
    # eval_dataset = glue_dataset[task_key]['validation_mismached']
    # train_dataset = GlueDataset(data_args, tokenizer=tokenizer, limit_length=cfg.TRAIN.TASK_LIMIT)
    # eval_dataset = GlueDataset(data_args, tokenizer=tokenizer, mode='dev')
    collator = DataCollatorWithPadding(tokenizer)
    trainer = FixedTrainer(model=model,
                           args=training_args,
                           train_dataset=train_dataset,
                           eval_dataset=eval_dataset,
                           compute_metrics=get_eval_metrics_func(task_key),
                           data_collator=collator,
                           config=cfg)

    if mode == "train":
        trainer.train()
    if mode != "train" or cfg.EVAL_ON_COMPLETION:
        extract_path = None
        if extract:
            extract_path = get_extract_path(cfg, model_args)
        metrics = trainer.evaluate(extract_path=extract_path,
                                   cache_path=osp.join(
                                       cfg.TASK.EXTRACT_TOKENS_MASK_CACHE,
                                       task_key))
        metrics_file = get_metrics_path(cfg, model_args)
        torch.save(metrics, metrics_file)
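Both run_pos (Example #3) and run_glue (Example #4) read their settings from a YACS node; the sketch below reconstructs a minimal cfg from the attribute accesses in those functions, with placeholder values that are assumptions rather than the project's actual defaults:

# Minimal YACS config covering the keys the two functions above read.
# All values are placeholder assumptions.
from yacs.config import CfgNode as CN

cfg = CN()
cfg.EVAL_ON_COMPLETION = True
cfg.DATA = CN()
cfg.DATA.DATAPATH = "./data"                    # becomes DataTrainingArguments.data_dir
cfg.EVAL = CN()
cfg.EVAL.SPLIT = "validation"                   # eval split key used by run_glue
cfg.TASK = CN()
cfg.TASK.EXTRACT_TOKENS_MASK_CACHE = "./cache"  # cache_path passed to trainer.evaluate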
Example #5
def train_transformer(config, checkpoint_dir=None):
    data_args = DataTrainingArguments(task_name=config["task_name"],
                                      data_dir=config["data_dir"])
    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
    train_dataset = GlueDataset(data_args,
                                tokenizer=tokenizer,
                                mode="train",
                                cache_dir=config["data_dir"])
    eval_dataset = GlueDataset(data_args,
                               tokenizer=tokenizer,
                               mode="dev",
                               cache_dir=config["data_dir"])
    # Keep only the first half of the dev split
    eval_dataset = eval_dataset[:len(eval_dataset) // 2]
    training_args = TrainingArguments(
        output_dir=tune.get_trial_dir(),
        learning_rate=config["learning_rate"],
        do_train=True,
        do_eval=True,
        evaluate_during_training=True,
        eval_steps=(len(train_dataset) // config["per_gpu_train_batch_size"]) + 1,
        # We explicitly set save to 0, and do saving in evaluate instead
        save_steps=0,
        num_train_epochs=config["num_epochs"],
        max_steps=config["max_steps"],
        per_device_train_batch_size=config["per_gpu_train_batch_size"],
        per_device_eval_batch_size=config["per_gpu_val_batch_size"],
        warmup_steps=0,
        weight_decay=config["weight_decay"],
        logging_dir="./logs",
    )

    # Arguments for W&B.
    name = tune.get_trial_name()
    wandb_args = {
        "project_name": "transformers_pbt",
        "watch": "false",  # Either set to gradient, false, or all
        "run_name": name,
    }

    tune_trainer = get_trainer(recover_checkpoint(checkpoint_dir,
                                                  config["model_name"]),
                               train_dataset,
                               eval_dataset,
                               config["task_name"],
                               training_args,
                               wandb_args=wandb_args)
    tune_trainer.train(recover_checkpoint(checkpoint_dir,
                                          config["model_name"]))
Example #6
    def test_meta_dataset(self):
        data_args = DataTrainingArguments(
            task_name="mrpc",
            data_dir="./tests/fixtures/tests_samples/MRPC",
            overwrite_cache=True,
        )
        train_dataset = GlueDataset(data_args, tokenizer=self.tokenizer)
        meta_dataset = MetaDataset(train_dataset)
        self.assertEqual(len(meta_dataset[1000]), 2)
        self.assertEqual(meta_dataset[1000][0]["input_ids"].shape, torch.Size([128]))
        self.assertEqual(
            meta_dataset[1000][0]["attention_mask"].shape, torch.Size([128])
        )
        self.assertEqual(meta_dataset[1000][0]["labels"].item(), 0)
        self.assertEqual(meta_dataset[1000][1]["labels"].item(), 1)
Example #7
def main(config):
    os.environ["WANDB_WATCH"] = "False"  # To disable Huggingface logging

    auto_generated_dir = os.getcwd()
    log.info(f"Work dir: {auto_generated_dir}")
    os.chdir(hydra.utils.get_original_cwd())

    wandb_run = init_wandb(auto_generated_dir, config)

    args_train = TrainingArguments(output_dir=auto_generated_dir)
    args_train = update_config(args_train, config.training)

    args_data = DataTrainingArguments(task_name=config.data.task_name,
                                      data_dir=config.data.data_dir)
    args_data = update_config(args_data, config.data)

    train_eval_glue_model(config, args_train, args_data, auto_generated_dir)
Example #8
def train(X_train, y_train, y_column_name, model_name=None):
    eval_dataset = y_train[y_column_name]

    model_args = ModelArguments(model_name_or_path="distilbert-base-cased", )
    global data_args
    data_args = DataTrainingArguments(task_name="mnli",
                                      data_dir="../../datasets/Newswire")
    num_labels = glue_tasks_num_labels[data_args.task_name]
    training_args = TrainingArguments(
        output_dir=model_name,
        overwrite_output_dir=True,
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=32,
        per_gpu_eval_batch_size=128,
        num_train_epochs=1,
        logging_steps=500,
        logging_first_step=True,
        save_steps=1000,
        evaluate_during_training=True,
    )

    config = AutoConfig.from_pretrained(
        model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        config=config,
    )

    train_dataset = GlueDataset(data_args,
                                tokenizer=tokenizer,
                                limit_length=100_000)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    trainer.train()
Example #9
    def test_cluster_indices(self):
        clustering_args = Clustering_Arguments(
            batch_size=32,
            num_clusters_elements=32,
            embedding_path=self.embedding_path,
            num_clusters=8,
            cluster_output_path=self.cluster_output_path,
        )
        cluster_indices = self.clustering_proc.get_cluster_indices_by_num(
            clustering_args.num_clusters_elements
        )
        self.assertTrue(len(cluster_indices) > 10000)

        # Testing with Pytorch Dataset
        data_args = DataTrainingArguments(
            task_name="MRPC", data_dir=self.data_dir, overwrite_cache=True
        )
        tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
        train_dataset = GlueDataset(data_args, tokenizer)
        train_dataset = torch.utils.data.Subset(train_dataset, cluster_indices)
        self.assertEqual(len(train_dataset[0].input_ids), 128)
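Since the clustered Subset still yields plain InputFeatures, it can be used directly as a train_dataset; a short hypothetical continuation (the model choice, epochs, and output path are assumptions, not part of the test):

# Hypothetical continuation: fine-tune only on the clustered subset.
model = AutoModelForSequenceClassification.from_pretrained(
    "albert-base-v2", num_labels=2)
trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./mrpc_clustered", num_train_epochs=3),
    train_dataset=train_dataset,  # the torch.utils.data.Subset built above
)
trainer.train()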
Example #10
def twitter_bert(
        ROOTPATH=ROOTPATH,
        model_name_or_path="bert-base-uncased",
        task_name="TWIT",
        do_train=True,
        do_eval=True,
        data_dir=f'{ROOTPATH}/input',
        max_seq_length=128,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        num_train_epochs=3.0,
        cache_dir=None,
        output_dir=f'{ROOTPATH}/output',
        overwrite_cache=True,
        overwrite_output_dir=True,
        local_rank=-1,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        n_gpu=torch.cuda.device_count(),
        fp16=False,
        num_labels=2,
        evaluate_during_training=False,
        weight_decay=0,
        adam_epsilon=1e-8,
        max_grad_norm=1.0,
        train_dataset=None,
        dev_dataset=None,
        test_dataset=None,
        full_dataset=None,
        labels=None,
        temp_json=f'{ROOTPATH}/temp/run{datetime.datetime.now().strftime("%Y%m%d_%H%M%S")}.json',
        use_test=False,
        save_steps=1e200,
        random_state=1234):

    set_seed(random_state)
    if full_dataset is not None:
        train_dataset, dev_dataset = train_test_split(
            full_dataset, test_size=0.2, random_state=random_state)

    # Setup logging
    logger = logging.getLogger(__name__)

    logger.info(f"LENGTH OF TRAIN DATASET: {len(train_dataset.index)}")
    # exit(0)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        local_rank,
        device,
        n_gpu,
        bool(local_rank != -1),
        fp16,
    )

    logger.info(
        "Training/evaluation parameters local_rank: %s, device: %s, n_gpu: %s, fp16: %s",
        local_rank, device, n_gpu, fp16)
    logger.info(f"MAX SEQ LEN: {max_seq_length}")

    wordsegment.load()

    ## DEFINE FUNCTIONS
    @dataclass
    class ModelArguments:
        """
        Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
        """

        model_name_or_path: str = field(
            metadata={
                "help":
                "Path to pretrained model or model identifier from huggingface.co/models"
            })
        config_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained config name or path if not the same as model_name"
            })
        tokenizer_name: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Pretrained tokenizer name or path if not the same as model_name"
            })
        cache_dir: Optional[str] = field(
            default=None,
            metadata={
                "help":
                "Where do you want to store the pretrained models downloaded from s3"
            })

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        do_predict=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate,
        overwrite_output_dir=overwrite_output_dir,
        evaluate_during_training=evaluate_during_training,
        weight_decay=weight_decay,
        adam_epsilon=adam_epsilon,
        max_grad_norm=max_grad_norm,
        save_steps=save_steps)

    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=data_dir,
                                      max_seq_length=max_seq_length,
                                      overwrite_cache=overwrite_cache)

    model_args = ModelArguments(model_name_or_path=model_name_or_path)

    def simple_accuracy(preds, labels):
        return (preds == labels).mean()

    def acc_and_f1(preds, labels):
        acc = simple_accuracy(preds, labels)
        f1 = f1_score(y_true=labels, y_pred=preds)
        return {
            "acc": acc,
            "f1": f1,
            "acc_and_f1": (acc + f1) / 2,
        }

    def pearson_and_spearman(preds, labels):
        pearson_corr = pearsonr(preds, labels)[0]
        spearman_corr = spearmanr(preds, labels)[0]
        return {
            "pearson": pearson_corr,
            "spearmanr": spearman_corr,
            "corr": (pearson_corr + spearman_corr) / 2,
        }

    def compute_metrics(preds, labels):
        assert len(preds) == len(labels)
        return acc_and_f1(preds, labels)

    class TwitterProcessor(DataProcessor):
        def __init__(self):
            super(TwitterProcessor, self).__init__()
            '''
            You need to define four attributes here:
            - self.train_dataset -> train dataset
            - self.dev_dataset -> dev dataset
            - self.test_dataset -> test dataset
            - self.labels -> a list of the labels

            Each {train,dev,test}_dataset must have (at least) two columns:
            - "tweet" -> includes the text of the tweet
            - "label" -> includes the label of the tweet
            '''

            self.train_dataset = train_dataset
            self.dev_dataset = dev_dataset
            self.test_dataset = test_dataset
            self.labels = labels

        def get_train_examples(self):
            return self._create_examples(self.train_dataset, "train")

        def get_dev_examples(self):
            return self._create_examples(self.dev_dataset, "dev")

        def get_test_examples(self):
            return self._create_examples(self.test_dataset, "test")

        def get_labels(self):
            """See base class."""
            return self.labels

        def _preprocess_text(self, text):
            # 1. Convert emoji to their textual aliases
            text = emoji.demojize(text)

            # 2. Segment hashtags into their constituent words
            words = text.split()
            for word in words:
                if word[0] != '#':
                    continue
                hashtag = word[1:]
                replacement_words = wordsegment.segment(hashtag)
                text = text.replace(word, " ".join(replacement_words))

            # 3. Replace the "URL" placeholder token
            text = text.replace("URL", "http")

            # 4. Collapse four or more repeated @-mentions down to three
            text = re.sub(r'(@[A-Za-z]+)( \1\b){3,}', r'\1 \1 \1', text)
            return text

        def _create_examples(self, data, set_type):

            examples = []

            raw_texts = data.tweet.values.tolist()
            raw_labels = data.label.values.tolist()

            for i in range(0, len(raw_texts)):
                guid = "%s-%s" % (set_type, i)
                raw_text = raw_texts[i]
                raw_label = raw_labels[i]
                label = raw_label

                text = self._preprocess_text(raw_text)
                examples.append(
                    InputExample(guid=guid,
                                 text_a=text,
                                 text_b=None,
                                 label=label))

            return examples

    def convert_examples_to_features(
        examples: List[InputExample],
        tokenizer: PreTrainedTokenizer,
        max_length: Optional[int] = None,
        task=None,
        label_list=None,
        output_mode=None,
    ):
        if max_length is None:
            max_length = tokenizer.max_len

        processor = TwitterProcessor()
        label_list = processor.get_labels()

        label_map = {label: i for i, label in enumerate(label_list)}

        def label_from_example(
                example: InputExample) -> Union[int, float, None]:
            return label_map[example.label]

        labels = [label_from_example(example) for example in examples]

        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b) for example in examples],
            max_length=max_length,
            pad_to_max_length=True,
        )

        features = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}

            feature = InputFeatures(**inputs, label=labels[i])
            features.append(feature)

        for i, example in enumerate(examples[:5]):
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("features: %s" % features[i])

        return features

    class TwitterDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach
        soon.
        """
        def __init__(
            self,
            tokenizer,
            mode="train",
            cache_dir=cache_dir,
            args=data_args,
        ):
            self.args = args
            self.processor = TwitterProcessor()
            self.output_mode = 'Classification'

            label_list = self.processor.get_labels()
            self.label_list = label_list

            if mode == "dev":
                examples = self.processor.get_dev_examples()
            elif mode == "test":
                examples = self.processor.get_test_examples()
            else:
                examples = self.processor.get_train_examples()

            self.features = convert_examples_to_features(
                examples,
                tokenizer,
                max_length=max_seq_length,
                label_list=label_list,
                output_mode=self.output_mode,
            )

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]

        def get_labels(self):
            return self.label_list

    def build_compute_metrics_fn(
            task_name: str) -> Callable[[EvalPrediction], Dict]:
        def compute_metrics_fn(p: EvalPrediction):
            preds = np.argmax(p.predictions, axis=1)
            return compute_metrics(preds, p.label_ids)

        return compute_metrics_fn

    # Create model
    config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=num_labels,
        cache_dir=cache_dir,
    )

    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        cache_dir=cache_dir,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        config=config,
        cache_dir=cache_dir,
    )

    # Get datasets
    train_dataset = (TwitterDataset(tokenizer=tokenizer,
                                    mode="train",
                                    cache_dir=cache_dir))
    eval_dataset = (TwitterDataset(tokenizer=tokenizer,
                                   mode="dev",
                                   cache_dir=cache_dir))

    if use_test:
        test_dataset = (TwitterDataset(tokenizer=tokenizer,
                                       mode="test",
                                       cache_dir=cache_dir))

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=build_compute_metrics_fn(task_name),
    )

    # Train the model
    if training_args.do_train:
        trainer.train(model_path=model_args.model_name_or_path
                      if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model(f"{training_args.output_dir}/{task_name}")
        tokenizer.save_pretrained(f"{training_args.output_dir}/{task_name}")

    # Evaluation
    eval_results = []
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        if use_test:
            step_names = ["dev", "test"]
            eval_datasets = [eval_dataset, test_dataset]
        else:
            step_names = ["dev"]
            eval_datasets = [eval_dataset]

        ct = 0
        for eval_dataset in eval_datasets:

            step_name = step_names[ct]

            trainer.compute_metrics = build_compute_metrics_fn(
                eval_dataset.args.task_name)
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            logger.info("***** Eval results {} - {}*****".format(
                eval_dataset.args.task_name, step_name.upper()))
            for key, value in eval_result.items():
                logger.info("  %s = %s", key, value)

            # output_eval_file = os.path.join(
            #     training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}_{step_name}.txt"
            # )

            # if ct == 0:
            #     with open(output_eval_file, "w") as writer:
            #         logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper()))
            #         for key, value in eval_result.items():
            #             logger.info("  %s = %s", key, value)
            # else:
            #     with open(output_eval_file, "a") as writer:
            #         logger.info("***** Eval results {} - {}*****".format(eval_dataset.args.task_name, step_name.upper()))
            #         for key, value in eval_result.items():
            #             logger.info("  %s = %s", key, value)

            eval_results.append(eval_result)

            write_type = 'a' if os.path.exists(temp_json) else 'w'
            with open(temp_json, write_type) as f:
                f.write(json.dumps(eval_result))
                f.write("\n")

            ct += 1

    return eval_results[-1]
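The attribute accesses inside TwitterProcessor (data.tweet, data.label) and len(train_dataset.index) imply the datasets are pandas DataFrames with "tweet" and "label" columns; a hedged call-site sketch, where the CSV name and label set are assumptions:

# Hypothetical invocation; tweets.csv is a placeholder file with
# "tweet" and "label" columns, and labels=[0, 1] matches num_labels=2.
import pandas as pd

df = pd.read_csv("tweets.csv")
metrics = twitter_bert(
    full_dataset=df,   # split 80/20 into train/dev inside the function
    labels=[0, 1],
    num_train_epochs=3.0,
    use_test=False,
)
print(metrics)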
Example #11
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
    )

"""### Here are all the training parameters we are going to use:"""

model_args = ModelArguments(
    model_name_or_path="bert-base-cased",
)
data_args = DataTrainingArguments(task_name="mnli", data_dir="./glue_data/MNLI")
training_args = TrainingArguments(
    output_dir="./models/model_name",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_gpu_train_batch_size=32,
    per_gpu_eval_batch_size=128,
    num_train_epochs=0.1,
    logging_steps=500,
    logging_first_step=True,
    save_steps=1000,
    evaluate_during_training=True,
)

!ls glue_data