Example 1
    def test_early_stopping_callback(self):
        # early stopping stops training before num_train_epochs is reached
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                load_best_model_at_end=True,
                evaluation_strategy=EvaluationStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
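            # patience of 1 evaluation and an improvement threshold of 1e-4 on the monitored "accuracy" metric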
            trainer.add_callback(EarlyStoppingCallback(1, 0.0001))
            train_output = trainer.train()
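            # the regression dataset has 64 samples, so 20 epochs at batch size 16 is at most 80 steps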
            self.assertLess(train_output.global_step, 20 * 64 / 16)

        # Invalid inputs to trainer with early stopping callback result in assertion error
        with tempfile.TemporaryDirectory() as tmp_dir:
            trainer = get_regression_trainer(
                output_dir=tmp_dir,
                num_train_epochs=20,
                gradient_accumulation_steps=1,
                per_device_train_batch_size=16,
                evaluation_strategy=EvaluationStrategy.EPOCH,
                compute_metrics=AlmostAccuracy(),
                metric_for_best_model="accuracy",
            )
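            # load_best_model_at_end is not set here, so EarlyStoppingCallback fails its assertion before the first step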
            trainer.add_callback(EarlyStoppingCallback(1))
            self.assertEqual(trainer.state.global_step, 0)
            try:
                trainer.train()
            except AssertionError:
                self.assertEqual(trainer.state.global_step, 0)
Example 2
    def __init__(self,
                 model,
                 dataset,
                 train_range: float = 0.95,
                 output_dir: str = "results",
                 num_train_epochs: int = 100,
                 per_device_train_batch_size: int = 4,
                 per_device_eval_batch_size: int = 4,
                 warmup_steps: int = 500,
                 weight_decay: float = 0.01,
                 logging_dir: str = "logs",
                 early_stopping_patience: int = 20,
                 early_stopping_threshold: float = 1e-5):
        """
        Create DIETTrainer class

        :param model: model to train
        :param dataset: dataset (including train and eval)
        :param train_range: fraction of the dataset used for training
        :param output_dir: model output directory
        :param num_train_epochs: number of training epochs
        :param per_device_train_batch_size: batch_size of training stage
        :param per_device_eval_batch_size: batch_size of evaluating stage
        :param warmup_steps: warmup steps
        :param weight_decay: weight decay
        :param logging_dir: logging directory
        :param early_stopping_patience: number of evaluations with no improvement before training stops
        :param early_stopping_threshold: minimum improvement of the monitored metric to reset the patience counter
        """
        self.training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            warmup_steps=warmup_steps,
            weight_decay=weight_decay,
            logging_dir=logging_dir,
            load_best_model_at_end=True,
            metric_for_best_model="loss",
            greater_is_better=False,
            evaluation_strategy="epoch",
            label_names=["entities_labels", "intent_labels"],
            save_total_limit=1)

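        # deterministic split (seed 42): train_range of the samples for training, the rest for evaluation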
        train_dataset, eval_dataset = random_split(
            dataset, [
                int(len(dataset) * train_range),
                len(dataset) - int(len(dataset) * train_range)
            ],
            generator=torch.Generator().manual_seed(42))

        self.trainer = Trainer(
            model=model,
            args=self.training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=[
                EarlyStoppingCallback(
                    early_stopping_patience=early_stopping_patience,
                    early_stopping_threshold=early_stopping_threshold),
                TensorBoardCallback()
            ])
Example 3
def train_model(config_path: str):
    writer = SummaryWriter()
    config = read_training_pipeline_params(config_path)
    logger.info("pretrained_emb {b}", b=config.net_params.pretrained_emb)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Device is {device}", device=device)
    SRC, TRG, dataset = get_dataset(config.dataset_path, False)
    train_data, valid_data, test_data = split_data(
        dataset, **config.split_ration.__dict__)
    SRC.build_vocab(train_data, min_freq=3)
    TRG.build_vocab(train_data, min_freq=3)
    torch.save(SRC.vocab, config.src_vocab_name)
    torch.save(TRG.vocab, config.trg_vocab_name)
    logger.info("Vocab saved")
    print(f"Unique tokens in source (ru) vocabulary: {len(SRC.vocab)}")
    print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=config.BATCH_SIZE,
        device=device,
        sort_key=_len_sort_key,
    )
    INPUT_DIM = len(SRC.vocab)
    OUTPUT_DIM = len(TRG.vocab)

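    # Build a bert2bert encoder-decoder from scratch: instantiate once to obtain the default
    # sub-configs, enable decoder cross-attention, then rebuild the model with the updated config.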
    config_encoder = BertConfig(vocab_size=INPUT_DIM)
    config_decoder = BertConfig(vocab_size=OUTPUT_DIM)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    config_encoder = model.config.encoder
    config_decoder = model.config.decoder
    config_decoder.is_decoder = True
    config_decoder.add_cross_attention = True
    config = EncoderDecoderConfig.from_encoder_decoder_configs(
        config_encoder, config_decoder)
    model = EncoderDecoderModel(config=config)
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        num_train_epochs=10,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )
    # args.place_model_on_device = device
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_iterator,
        eval_dataset=valid_iterator,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    trainer.train()

    model.save_pretrained("bert2bert")
Example 4
    def _train_transformer(
            self,
            epochs=1,
            early_stopping=True):
        """Train on parsed dialogs with Transformer"""
        self._prepare_labels()
        output_dim = self.target_labels.shape[1]

        sentences = [" ".join(words) for words in self.utterances]

        with open(f"corpora/dataset_{self.train_on}.tsv", "w") as trainset_file:
            print(f"sentence\tlabel", file=trainset_file)
            for sentence, label in zip(sentences, self.emotion_labels):
                print(f"{sentence}\t{label}", file=trainset_file)

        train_texts, val_texts, train_labels, val_labels = train_test_split(sentences, self.target_labels, test_size=.2)

        self.tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

        train_encodings = self.tokenizer(train_texts, truncation=True, padding=True)
        val_encodings = self.tokenizer(val_texts, truncation=True, padding=True)

        train_dataset = EmotionsDataset(train_encodings, train_labels)
        val_dataset = EmotionsDataset(val_encodings, val_labels)

        callbacks = []
        if early_stopping:
            callbacks.append(EarlyStoppingCallback(early_stopping_patience=3))

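        # evaluate and checkpoint every 100 steps; load_best_model_at_end is required by
        # EarlyStoppingCallback and restores the best checkpoint when training stops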
        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=epochs,
            # per_device_train_batch_size=16,
            # per_device_eval_batch_size=64,
            # warmup_steps=500,
            # weight_decay=0.01,
            # logging_dir='./logs',
            logging_steps=100,
            save_steps=100,
            eval_steps=100,
            evaluation_strategy="steps",
            load_best_model_at_end=True,
        )

        self.model = AutoModelForSequenceClassification.from_pretrained("allegro/herbert-base-cased", num_labels=output_dim)

        if self.use_cuda:
            self.model.cuda()

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            callbacks=callbacks
        )

        trainer.train()
Example 5
def main(cfg):
    dataset = load_from_disk(TDATA_PATH)
    dataset.set_format(type="torch",
                       columns=['input_ids', 'attention_mask', 'label'])
    train_ds, test_ds = dataset["train"], dataset['test']

    model = AutoModelForSequenceClassification.from_pretrained(ckpt_path)
    trainConfig = cfg.train
    output_dir = trainConfig["output_dir"]
    train_args = TrainingArguments(
        # module pred/ckpt
        output_dir=output_dir,
        # tensorboard logs
        logging_dir="./logs",
        num_train_epochs=trainConfig["epoch"],
        per_device_train_batch_size=trainConfig["train_batch_size"],
        per_device_eval_batch_size=trainConfig["eval_batch_size"],
        # x (logging / eval /save) every acc * x_steps
        gradient_accumulation_steps=trainConfig["acc_batch"],
        evaluation_strategy=IntervalStrategy.EPOCH,
        label_smoothing_factor=trainConfig["label_smooth"],
        # AdamW
        learning_rate=trainConfig["lr"],
        warmup_steps=trainConfig["warmup"],
        # apply to all layers but bias / LayerNorm
        weight_decay=trainConfig["wd"],
        save_total_limit=2,
        # if True, ignore param save_strategy / save_steps / save_total_limit
        load_best_model_at_end=True,
        # report_to=["none"],
        report_to=["wandb"],
        seed=cfg.seed,
        logging_strategy=IntervalStrategy.STEPS,
        metric_for_best_model=trainConfig["metric"])
    trainer = Trainer(
        model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=trainConfig["early_stopping_patience"]
            ),
        ],
        compute_metrics=compute_metrics,
    )

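    # note: trainer.train() is never called in this function, so only prediction on the test split is run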
    y_pred_tuple = trainer.predict(test_ds)
    logits, y_true, metrics = y_pred_tuple
    y_pred = logits.argmax(-1)
    with open("LF.pl", "wb") as f:
        import pickle
        pickle.dump([y_pred, y_true], f)
    print(metrics)
    acc = accuracy_score(y_true, y_pred)
    print(acc)
Example 6
def run_hyperp(train_dataset,
               eval_dataset,
               config,
               model_args,
               labels,
               num_labels,
               label_map,
               tokenizer,
               xargs={}):
    wandb.log({"params": params})
    wandb.log({"xargs": xargs})
    training_args_dict = {
        'output_dir': params["OUTPUT_DIR"],
        'num_train_epochs': params["EPOCH_TOP"],
        'train_batch_size': params["BATCH_SIZE"],
        "save_strategy": "epoch",
        "evaluation_strategy": "steps",
        "eval_steps": max(10,
                          train_dataset.__len__() // params["BATCH_SIZE"]),
        "logging_steps": max(10,
                             train_dataset.__len__() // params["BATCH_SIZE"]),
        "do_train": True,
        "load_best_model_at_end": params["LOAD_BEST_MODEL"],
        "learning_rate": params["lr"],
        "weight_decay": params["weight_decay"],
        "save_total_limit": 2
    }
    print(training_args_dict)
    with open(params["TRAIN_ARGS_FILE"], 'w') as fp:
        json.dump(training_args_dict, fp)
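    # round-trip the dict through a JSON file so HfArgumentParser can build a TrainingArguments instance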
    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_json_file(
        json_file=params["TRAIN_ARGS_FILE"])[0]

    # Initialize the Trainer
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=params["patience"]),
            LogCallback(params["OUTPUT_DIR"] + "/train_log.json")
        ])
    best_t = trainer.hyperparameter_search(
        backend="ray",
        # Choose among many libraries:
        # https://docs.ray.io/en/latest/tune/api_docs/suggestion.html
        n_trials=10)
    print(best_t)
Example 7
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments,
         MultiLingAdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses(
        )

    if data_args.source_prefix is None and model_args.model_name_or_path in [
            "t5-small",
            "t5-base",
            "t5-large",
            "t5-3b",
            "t5-11b",
    ]:
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is the expected, e.g. with "
            "`--source_prefix 'summarize: ' `")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files this script will use the first column for the full texts and the second column for the
    # summaries (unless you specify column names for this with the `text_column` and `summary_column` arguments).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if model.config.decoder_start_token_id is None:
        raise ValueError(
            "Make sure that `config.decoder_start_token_id` is correctly defined"
        )

    # Setup adapters
    if adapter_args.train_adapter:
        task_name = data_args.dataset_name or "summarization"
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters:
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    config=adapter_config,
                    load_as=task_name,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                config=lang_adapter_config,
                load_as=adapter_args.language,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass
        if lang_adapter_name:
            model.set_active_adapters([lang_adapter_name, task_name])
        else:
            model.set_active_adapters([task_name])
    else:
        if adapter_args.load_adapter or adapter_args.load_lang_adapter:
            raise ValueError(
                "Adapters can only be loaded in adapters training mode."
                "Use --train_adapter to enable adapter training")

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
    elif training_args.do_predict:
        column_names = datasets["test"].column_names
    else:
        logger.info(
            "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`."
        )
        return

    # Get the column names for input/target.
    dataset_columns = summarization_name_mapping.get(data_args.dataset_name,
                                                     None)
    if data_args.text_column is None:
        text_column = dataset_columns[
            0] if dataset_columns is not None else column_names[0]
    else:
        text_column = data_args.text_column
        if text_column not in column_names:
            raise ValueError(
                f"--text_column' value '{data_args.text_column}' needs to be one of: {', '.join(column_names)}"
            )
    if data_args.summary_column is None:
        summary_column = dataset_columns[
            1] if dataset_columns is not None else column_names[1]
    else:
        summary_column = data_args.summary_column
        if summary_column not in column_names:
            raise ValueError(
                f"--summary_column' value '{data_args.summary_column}' needs to be one of: {', '.join(column_names)}"
            )

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    if training_args.label_smoothing_factor > 0 and not hasattr(
            model, "prepare_decoder_input_ids_from_labels"):
        logger.warning(
            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for "
            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
        )

    def preprocess_function(examples):
        inputs = examples[text_column]
        targets = examples[summary_column]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs,
                                 max_length=data_args.max_source_length,
                                 padding=padding,
                                 truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets,
                               max_length=max_target_length,
                               padding=padding,
                               truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [[
                (l if l != tokenizer.pad_token_id else -100) for l in label
            ] for label in labels["input_ids"]]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = datasets["validation"]
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(
                data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
        if "test" not in datasets:
            raise ValueError("--do_predict requires a test dataset")
        test_dataset = datasets["test"]
        if data_args.max_test_samples is not None:
            test_dataset = test_dataset.select(
                range(data_args.max_test_samples))
        test_dataset = test_dataset.map(
            preprocess_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=label_pad_token_id,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

    # Metric
    metric = load_metric("rouge")

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if data_args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(
            decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds,
                                references=decoded_labels,
                                use_stemmer=True)
        # Extract a few results from ROUGE
        result = {
            key: value.mid.fmeasure * 100
            for key, value in result.items()
        }

        prediction_lens = [
            np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
        ]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    # Early stopping
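    # EarlyStoppingCallback requires load_best_model_at_end=True (which also restores the best checkpoint when training stops early)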
    if data_args.patience and data_args.patience > 0:
        training_args.load_best_model_at_end = True

    # Initialize our Trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
        if training_args.predict_with_generate else None,
        do_save_full_model=not adapter_args.train_adapter,
        do_save_adapters=adapter_args.train_adapter,
    )
    if data_args.patience and data_args.patience > 0:
        callback = EarlyStoppingCallback(
            early_stopping_patience=data_args.patience)
        trainer.add_callback(callback)

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(max_length=data_args.val_max_target_length,
                                   num_beams=data_args.num_beams,
                                   metric_key_prefix="eval")
        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Test ***")

        test_results = trainer.predict(
            test_dataset,
            metric_key_prefix="test",
            max_length=data_args.val_max_target_length,
            num_beams=data_args.num_beams,
        )
        metrics = test_results.metrics
        max_test_samples = data_args.max_test_samples if data_args.max_test_samples is not None else len(
            test_dataset)
        metrics["test_samples"] = min(max_test_samples, len(test_dataset))

        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)

        if trainer.is_world_process_zero():
            if training_args.predict_with_generate:
                test_preds = tokenizer.batch_decode(
                    test_results.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                test_preds = [pred.strip() for pred in test_preds]
                output_test_preds_file = os.path.join(training_args.output_dir,
                                                      "test_generations.txt")
                with open(output_test_preds_file, "w") as writer:
                    writer.write("\n".join(test_preds))

    return results
Example 8
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments,
         MultiLingAdapterArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args, adapter_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses(
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    if data_args.source_prefix is None and model_args.model_name_or_path in [
            "t5-small",
            "t5-base",
            "t5-large",
            "t5-3b",
            "t5-11b",
    ]:
        logger.warning(
            "You're running a t5 model but didn't provide a source prefix, which is expected, e.g. with "
            "`--source_prefix 'translate English to German: ' `")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own JSON training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For translation, only JSON files are supported, with one field named "translation" containing two keys for the
    # source and target languages (unless you adapt what follows).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        raw_datasets = load_dataset(extension,
                                    data_files=data_files,
                                    cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
                data_args.target_lang]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
                data_args.target_lang)

    if model.config.decoder_start_token_id is None:
        raise ValueError(
            "Make sure that `config.decoder_start_token_id` is correctly defined"
        )

    # Setup adapters
    if adapter_args.train_adapter:
        task_name = data_args.source_lang.split(
            "_")[0] + "_" + data_args.target_lang.split("_")[0]
        # check if adapter already exists, otherwise add it
        if task_name not in model.config.adapters:
            # resolve the adapter config
            adapter_config = AdapterConfig.load(
                adapter_args.adapter_config,
                non_linearity=adapter_args.adapter_non_linearity,
                reduction_factor=adapter_args.adapter_reduction_factor,
            )
            # load a pre-trained from Hub if specified
            if adapter_args.load_adapter:
                model.load_adapter(
                    adapter_args.load_adapter,
                    config=adapter_config,
                    load_as=task_name,
                )
            # otherwise, add a fresh adapter
            else:
                model.add_adapter(task_name, config=adapter_config)
        # optionally load a pre-trained language adapter
        if adapter_args.load_lang_adapter:
            # resolve the language adapter config
            lang_adapter_config = AdapterConfig.load(
                adapter_args.lang_adapter_config,
                non_linearity=adapter_args.lang_adapter_non_linearity,
                reduction_factor=adapter_args.lang_adapter_reduction_factor,
            )
            # load the language adapter from Hub
            lang_adapter_name = model.load_adapter(
                adapter_args.load_lang_adapter,
                config=lang_adapter_config,
                load_as=adapter_args.language,
            )
        else:
            lang_adapter_name = None
        # Freeze all model weights except of those of this adapter
        model.train_adapter([task_name])
        # Set the adapters to be used in every forward pass
        if lang_adapter_name:
            model.set_active_adapters(ac.Stack(lang_adapter_name, task_name))
        else:
            model.set_active_adapters([task_name])
    else:
        if adapter_args.load_adapter or adapter_args.load_lang_adapter:
            raise ValueError(
                "Adapters can only be loaded in adapters training mode."
                "Use --train_adapter to enable adapter training")

    prefix = data_args.source_prefix if data_args.source_prefix is not None else ""

    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
        column_names = raw_datasets["validation"].column_names
    elif training_args.do_predict:
        column_names = raw_datasets["test"].column_names
    else:
        logger.info(
            "There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`."
        )
        return

    # For translation we set the codes of our source and target languages (only useful for mBART, the others will
    # ignore those attributes).
    if isinstance(tokenizer, tuple(MULTILINGUAL_TOKENIZERS)):
        assert data_args.target_lang is not None and data_args.source_lang is not None, (
            f"{tokenizer.__class__.__name__} is a multilingual tokenizer which requires --source_lang and "
            "--target_lang arguments.")

        tokenizer.src_lang = data_args.source_lang
        tokenizer.tgt_lang = data_args.target_lang

        # For multilingual translation models like mBART-50 and M2M100 we need to force the target language token
        # as the first generated token. We ask the user to explicitly provide this as --forced_bos_token argument.
        forced_bos_token_id = (
            tokenizer.lang_code_to_id[data_args.forced_bos_token]
            if data_args.forced_bos_token is not None else None)
        model.config.forced_bos_token_id = forced_bos_token_id

    # Get the language codes for input/target.
    source_lang = data_args.source_lang.split("_")[0]
    target_lang = data_args.target_lang.split("_")[0]

    # Temporarily set max_target_length for training.
    max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    if training_args.label_smoothing_factor > 0 and not hasattr(
            model, "prepare_decoder_input_ids_from_labels"):
        logger.warning(
            "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for"
            f"`{model.__class__.__name__}`. This will lead to loss being calculated twice and will take up more memory"
        )

    def preprocess_function(examples):
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs,
                                 max_length=data_args.max_source_length,
                                 padding=padding,
                                 truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets,
                               max_length=max_target_length,
                               padding=padding,
                               truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
        # padding in the loss.
        if padding == "max_length" and data_args.ignore_pad_token_for_loss:
            labels["input_ids"] = [[
                (l if l != tokenizer.pad_token_id else -100) for l in label
            ] for label in labels["input_ids"]]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))
        with training_args.main_process_first(
                desc="train dataset map pre-processing"):
            train_dataset = train_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on train dataset",
            )

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))
        with training_args.main_process_first(
                desc="validation dataset map pre-processing"):
            eval_dataset = eval_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on validation dataset",
            )

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(
                range(data_args.max_predict_samples))
        with training_args.main_process_first(
                desc="prediction dataset map pre-processing"):
            predict_dataset = predict_dataset.map(
                preprocess_function,
                batched=True,
                num_proc=data_args.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on prediction dataset",
            )

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    else:
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if training_args.fp16 else None,
        )

    # Metric
    metric = load_metric("sacrebleu")

    def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]

        return preds, labels

    def compute_metrics(eval_preds):
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        if data_args.ignore_pad_token_for_loss:
            # Replace -100 in the labels as we can't decode them.
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels,
                                                skip_special_tokens=True)

        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(
            decoded_preds, decoded_labels)

        result = metric.compute(predictions=decoded_preds,
                                references=decoded_labels)
        result = {"bleu": result["score"]}

        prediction_lens = [
            np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
        ]
        result["gen_len"] = np.mean(prediction_lens)
        result = {k: round(v, 4) for k, v in result.items()}
        return result

    # Early stopping
    if data_args.patience and data_args.patience > 0:
        training_args.load_best_model_at_end = True

    # Initialize our Trainer
    trainer_class = Seq2SeqAdapterTrainer if adapter_args.train_adapter else Seq2SeqTrainer
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
        if training_args.predict_with_generate else None,
    )
    if data_args.patience and data_args.patience > 0:
        callback = EarlyStoppingCallback(
            early_stopping_patience=data_args.patience)
        trainer.add_callback(callback)

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics = train_result.metrics
        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    results = {}
    max_length = (training_args.generation_max_length
                  if training_args.generation_max_length is not None else
                  data_args.val_max_target_length)
    num_beams = data_args.num_beams if data_args.num_beams is not None else training_args.generation_num_beams
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate(max_length=max_length,
                                   num_beams=num_beams,
                                   metric_key_prefix="eval")
        max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
            eval_dataset)
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.do_predict:
        logger.info("*** Predict ***")

        predict_results = trainer.predict(predict_dataset,
                                          metric_key_prefix="predict",
                                          max_length=max_length,
                                          num_beams=num_beams)
        metrics = predict_results.metrics
        max_predict_samples = (data_args.max_predict_samples
                               if data_args.max_predict_samples is not None
                               else len(predict_dataset))
        metrics["predict_samples"] = min(max_predict_samples,
                                         len(predict_dataset))

        trainer.log_metrics("predict", metrics)
        trainer.save_metrics("predict", metrics)

        if trainer.is_world_process_zero():
            if training_args.predict_with_generate:
                predictions = tokenizer.batch_decode(
                    predict_results.predictions,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=True)
                predictions = [pred.strip() for pred in predictions]
                output_prediction_file = os.path.join(
                    training_args.output_dir, "generated_predictions.txt")
                with open(output_prediction_file, "w",
                          encoding="utf-8") as writer:
                    writer.write("\n".join(predictions))

    kwargs = {
        "finetuned_from": model_args.model_name_or_path,
        "tasks": "translation"
    }
    if data_args.dataset_name is not None:
        kwargs["dataset_tags"] = data_args.dataset_name
        if data_args.dataset_config_name is not None:
            kwargs["dataset_args"] = data_args.dataset_config_name
            kwargs[
                "dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
        else:
            kwargs["dataset"] = data_args.dataset_name

    languages = [
        l for l in [data_args.source_lang, data_args.target_lang]
        if l is not None
    ]
    if len(languages) > 0:
        kwargs["language"] = languages

    if training_args.push_to_hub:
        trainer.push_to_hub(**kwargs)
    else:
        trainer.create_model_card(**kwargs)

    return results
Example 9
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_pos
        if args.task_name == "en_ewt" else compute_metrics_ner,
    )

    # Early stop
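    # an inoculation_patience_count of -1 disables early stopping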
    if args.inoculation_patience_count != -1:
        trainer.add_callback(
            EarlyStoppingCallback(args.inoculation_patience_count))

    # Training
    if training_args.do_train:
        checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload

        metrics["train_samples"] = len(train_dataset)

        # trainer.log_metrics("train", metrics)
        # trainer.save_metrics("train", metrics)
        # trainer.save_state()

    # Evaluation
Example 10
    test_dataset = test_dataset.map(tokenize_fn,
                                    batched=True,
                                    batch_size=len(test_dataset))
    test_dataset = test_dataset.rename_column("label", "labels")
    test_dataset.set_format('torch',
                            columns=['input_ids', 'attention_mask', 'labels'])

    #############################################################
    ## Callbacks + Collator #####################################
    #############################################################

    callbacks = []

    tmcb = None
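    # stop training after 10 consecutive evaluations without sufficient improvement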
    escb = EarlyStoppingCallback(early_stopping_patience=10)
    callbacks.append(escb)

    transform = None
    num_sampled_INV = 0
    num_sampled_SIB = 0
    label_type = "soft"
    keep_original = True

    if t == "ORIG":
        label_type = "hard"
    elif t == "INV":
        num_sampled_INV = 2
        label_type = "hard"
    elif t == "SIB":
        num_sampled_SIB = 2
Example 11
def train(args):  # the inference step has also been added
  assert sum([args.use_kfold,args.use_simple_fold,args.no_valid])==1
  assert (args.concat_exp_p==0 or args.concat_log_p==0)
  # assert args.eval_steps == args.logging_steps
  if args.use_kfold==True:
    assert (args.num_fold_k>=2)

  seed_everything(args.seed)
  USE_KFOLD = args.use_kfold
  # load model and tokenizer
  model_type_getattr = args.model_type  # ELECTRA # BERT
  model_name_from_pretrained = args.pretrained_model  # e.g. "monologg/koelectra-small-discriminator"
  tokenizer = AutoTokenizer.from_pretrained(model_name_from_pretrained)
  device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

  # load dataset
  # load_data_module = getattr(import_module("load_data"), f'load_data')
  # dataset = load_tr_val_data("../input/data/train/train.tsv",args)
  dataset = load_tr_val_data("../input/data/train/final_train_ner.tsv",args)

  # setting model hyperparameter
  # config_module = getattr(import_module("transformers"), f'{args.model_type}Config')
  # model_config = config_module.from_pretrained(model_name_from_pretrained)
  model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification')
  model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42)
  model.parameters
  model.to(device)

  # model_saved_dir = increment_output_dir(args.model_output_dir)
  model_saved_dir = increment_output_dir(model_name_from_pretrained.replace('/','_')) # f'./results/{output_path}'

  neptune.append_tag(f"{model_saved_dir.split('/')[-1]}")
  neptune.append_tag(f"{args.name}")


  with open('../input/data/label_type.pkl', 'rb') as f:
    label_type = pickle.load(f)

  # Simple Train Valid Split
  # Not KFOLD # => StratifiedShuffleSplit

  #################################################################################################
  #################################################################################################
  elif args.use_kfold==True: # KFOLD
    if not os.path.isdir('./kfold_results'):  # parent directory where the fold models are saved
      os.makedirs('./kfold_results')
    kfold = StratifiedKFold(n_splits=args.num_fold_k, random_state=args.seed, shuffle=True)
    label = dataset['label']

    # check whether k-fold has already been run and models saved for this pretrained model
    model_name_from_pretrained_used_for_save = model_name_from_pretrained.replace('/','_')
    check_upper_dir = f'./kfold_results/{model_name_from_pretrained_used_for_save}'
    if not os.path.isdir(check_upper_dir+'0'):  # if it does not exist yet, use suffix 0 as-is
      upper_dir=check_upper_dir+'0'
    else: # otherwise find the largest existing suffix number and use max + 1
      all_directories = glob.glob(f'./kfold_results/*')
      pattern = rf"{model_name_from_pretrained_used_for_save}[0-9]+"
      max_num = max(
          int(re.search(pattern, ad).group().replace(model_name_from_pretrained_used_for_save, ''))
          for ad in all_directories if re.search(pattern, ad))
      upper_dir = check_upper_dir+str(max_num+1)

    neptune.log_text('Model_Name_Number', f"{upper_dir.split('/')[-1]}")

    kfold_train_acc_score = []
    kfold_val_acc_score = []

    k=0
    for train_idx, val_idx in kfold.split(dataset, label):
      # model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification')
      # model = model_module.from_pretrained(model_name_from_pretrained, num_labels=42)
      config_module = getattr(import_module("transformers"), f'{model_type_getattr}Config')
      model_config = config_module.from_pretrained(model_name_from_pretrained)
      # model_config = ElectraConfig.from_pretrained(model_name_from_pretrained)
      model_config.num_labels = 42
      model_config.hidden_dropout_prob = args.hidden_dropout_prob
      model_module = getattr(import_module("transformers"), f'{model_type_getattr}ForSequenceClassification')
      model = model_module.from_pretrained(model_name_from_pretrained, config=model_config)

      model.parameters
      model.to(device)
      print('='*50)
      print('=' * 15 + f'{k}-th Fold Cross Validation Started ({k+1}/{args.num_fold_k})' + '=' * 15)

      train_dataset = dataset.iloc[train_idx]
      val_dataset = dataset.iloc[val_idx]

      # optionally add external data to the training split and train on it
      if args.concat_external_data==True:
        train_dataset = concat_external_data(train_dataset,label_type,args)


      train_label = train_dataset['label'].values
      val_label = val_dataset['label'].values
      
      # tokenizing dataset
      tokenized_train = tokenized_dataset(train_dataset, tokenizer, args)
      tokenized_val = tokenized_dataset(val_dataset, tokenizer, args)

      # make dataset for pytorch.
      RE_train_dataset = RE_Dataset(tokenized_train, train_label)
      RE_val_dataset = RE_Dataset(tokenized_val, val_label)
      print('='*50)
      print('Train & Valid Loaded Successfully!!')
      print(f'len(RE_train_dataset) : {len(RE_train_dataset)}, len(RE_val_dataset) : {len(RE_val_dataset)}')
      
      model_saved_dir = upper_dir+f'/{k}fold' # f'./kfold_results/{model_name_from_pretrained_used_for_save}'+f'/{k}fold'
      neptune.log_text(f'{k}-th model_saved_dir',model_saved_dir)
      neptune.log_text(f'Num_Data : {k}-th len(RE_train_dataset)',str(len(RE_train_dataset)))
      neptune.log_text(f'Num_Data : {k}-th len(RE_val_dataset)',str(len(RE_val_dataset)))

      # Besides the options used here, TrainingArguments supports many more; see
      # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
      # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainingarguments#trainingarguments

      # despite the name, this is the total number of optimizer steps (steps per epoch * epochs)
      total_num_epochs = (len(RE_train_dataset)//args.batch_size+1)*args.epochs
      if args.use_warmup_ratio:
        warmup_steps = total_num_epochs*args.warmup_ratio
      else:
        warmup_steps = 0
      wandb_run_name = model_saved_dir.replace('./kfold_results/',f'Total {args.num_fold_k}fold :')
      training_args = TrainingArguments(
          report_to = 'wandb', # 'all'
          run_name = f"{args.name+wandb_run_name.replace('/','_')}",
          output_dir=model_saved_dir,          # output directory
          # overwrite_output_dir=False, # whether to overwrite the output directory when saving
          save_total_limit=args.save_total_limit,              # number of total save model.
          save_steps=args.model_save_steps,                 # model saving step.
          num_train_epochs=args.epochs,              # total number of training epochs
          learning_rate=args.lr,               # learning_rate
          per_device_train_batch_size=args.batch_size,  # batch size per device during training
          per_device_eval_batch_size=args.val_batch_size,   # batch size for evaluation
          warmup_steps=warmup_steps,                # number of warmup steps for learning rate scheduler
          weight_decay=args.weight_decay,               # strength of weight decay
          logging_dir='./logs',            # directory for storing logs
          logging_steps=args.logging_steps,              # log saving step.
          evaluation_strategy='steps', # evaluation strategy to adopt during training
          eval_steps = args.eval_steps,            # evaluation step.
          # max_grad_norm=1,
          label_smoothing_factor = args.label_smoothing_factor,
          load_best_model_at_end = args.load_best_model_at_end,  # default => False
          # greater_is_better = True,
          metric_for_best_model = args.metric_for_best_model, # metric_for_best_model: Optional[str] = None
          # fp16 = True,  # Whether to use 16-bit (mixed) precision training instead of 32-bit training.
          # dataloader_num_workers = 2,
        )

      # EarlyStopping
      # Patience is counted in evaluation steps (total steps), not in the global epochs hyperparameter.
      # With patience=1 the window is eval_steps * 1 (e.g. eval_steps=25 means a patience of 25 steps).
      early_stopping = EarlyStoppingCallback(
                                            early_stopping_patience = args.early_stopping_patience, 
                                            early_stopping_threshold = 1e-4)

      ## Optimizer
      if args.optimizer_name == "Adam":
        optimizer = Adam(model.parameters(), lr=args.min_lr)
      elif args.optimizer_name == "AdamW":
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
      elif args.optimizer_name == "SGD":
        optimizer = SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay)
      
      # https://arxiv.org/pdf/1608.03983.pdf
      ## Scheduler
      T_0 = int(np.ceil(total_num_epochs*args.first_cycle_ratio))
      if args.scheduler_name == "Custom":
        scheduler = CustomizedCosineAnnealingWarmRestarts(optimizer,
                                                          T_0=T_0,
                                                          T_mult=2,
                                                          eta_max=args.lr,
                                                          T_up=int(T_0*args.first_warmup_ratio), 
                                                          gamma=args.scheduler_gamma,
                                                          last_epoch=-1)
      elif args.scheduler_name == "Original":
        scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=T_0, T_mult=2, eta_min=args.min_lr)

      # https://huggingface.co/transformers/main_classes/trainer.html?highlight=trainer#id1
      trainer = Trainer(
        model=model,                         # Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=RE_train_dataset,         # training dataset
        eval_dataset=RE_val_dataset,             # evaluation dataset
        compute_metrics=compute_metrics,         # define metrics function
        optimizers=  (optimizer,scheduler), # optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]=(None, None))
        callbacks=  [early_stopping], # callbacks: Optional[List[TrainerCallback]]=None
        # model_init= 
      )
      # train model
      trainer.train()

      print(f'Neptune Saving {k}-th Model Logs Plot')
      # Get Log from Model
      # Neptune Save Plot (train_eval_loss, learning_rate, eval_accuracy)
      train_eval_loss_plot, learning_rate_plot, eval_accuracy_plot = k_th_plot_from_logs(trainer.state.log_history)
      neptune.log_image(f'Logs : {k}-th train_eval_loss_plot',train_eval_loss_plot)
      neptune.log_image(f'Logs : {k}-th learning_rate_plot',learning_rate_plot)
      neptune.log_image(f'Logs : {k}-th eval_accuracy_plot',eval_accuracy_plot)

      print(f'{k}-th train finished!!')


      state_log_history = trainer.state.log_history
      eval_log_dict = [log_dict for log_dict in state_log_history if 'eval_loss' in log_dict.keys() ]
      k_th_val_logs_dict = defaultdict(list)
      for dict_per_step in eval_log_dict:
        for key,value in dict_per_step.items():
          k_th_val_logs_dict[key].append(value)

      best_val_acc_score = max(k_th_val_logs_dict['eval_accuracy'])

      # neptune.log_metric(f'{k}-th train_acc_score',best_train_acc_score)
      neptune.log_metric(f'{k}-th val_acc_score',best_val_acc_score)

      kfold_val_acc_score.append(best_val_acc_score)
      k=int(k)
      k+=1
      
    # neptune.log_text(f"{args.num_fold_k}-fold train best acc list", f"{kfold_train_acc_score}")
    neptune.log_text(f"{args.num_fold_k}-fold val best acc list", f"{kfold_val_acc_score}")
    # neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold train Total Average acc", np.mean(kfold_train_acc_score))
    neptune.log_metric(f"Result ACC : {args.num_fold_k}-fold val Total Average acc", np.mean(kfold_val_acc_score))
Esempio n. 12
0
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")

    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'),
                                   cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,  # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,  # number of total saved checkpoints
        save_steps=cfg.values.train_args.save_steps,  # model saving step
        num_train_epochs=cfg.values.train_args.num_epochs,  # total number of training epochs
        learning_rate=cfg.values.train_args.lr,  # learning rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,  # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,  # number of warmup steps for the learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,  # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,  # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,  # log saving step
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate at the end of every epoch.
        eval_steps=cfg.values.train_args.eval_steps,  # evaluation step
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.parameters
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(),
                                lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1
            # train model
            trainer.train()

    else:
        cpprint('=' * 20 + f'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(
                whole_df,
                test_size=cfg.values.val_args.test_size,
                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.parameters
            model.to(device)

            optimizer = transformers.AdamW(model.parameters(),
                                           lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()

        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(
                    whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.parameters
            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(),
                              lr=training_args.learning_rate)
            total_step = (len(RE_train_dataset) / training_args.per_device_train_batch_size
                          * training_args.num_train_epochs)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
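
# --- Hedged sketch of the optimizers=(optimizer, scheduler) pattern used above,
# --- with a toy linear model standing in for the Transformers model so the snippet
# --- is self-contained (PyTorch only, no pretrained weights).
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR

toy_model = torch.nn.Linear(4, 2)
optimizer = AdamW(toy_model.parameters(), lr=5e-5)

total_steps, warmup = 1000, 100
def linear_warmup_then_decay(step: int) -> float:
    if step < warmup:
        return step / warmup
    return max(0.0, (total_steps - step) / (total_steps - warmup))

scheduler = LambdaLR(optimizer, linear_warmup_then_decay)
optimizers = (optimizer, scheduler)  # would be passed as Trainer(..., optimizers=optimizers)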
Esempio n. 13
0
def run_train(train_dataset,
              eval_dataset,
              config,
              model_args,
              labels,
              num_labels,
              label_map,
              tokenizer,
              xargs={}):
    # First freeze bert weights and train
    #     log_params = copy.copy(params)
    #     log_params['model_type']= params['model_type'].name
    #     wandb.log({"params":log_params})
    #     wandb.log({"xargs":xargs})

    wb_run = wandb.init(project="NER",
                        name=params['exp_name'] + "_top_model",
                        reinit=True)
    xargs['tf'] = params.get('tf', False)
    model = get_model(model_path=model_args["model_name_or_path"],
                      cache_dir=model_args['cache_dir'],
                      config=config,
                      model_type=params['model_type'],
                      xargs=xargs)

    if not params['grad_e2e']:
        for param in model.base_model.parameters():
            param.requires_grad = False
    else:
        freeze_model(model)
    if 'add_vocab' in params.keys():
        model.resize_token_embeddings(len(tokenizer))
        for param in model.bert.embeddings.parameters():
            param.requires_grad = True

    # Change from default eval mode to train mode
    model.train()
    print(model)

    training_args_dict = {
        'output_dir': params["OUTPUT_DIR"],
        'num_train_epochs': params["EPOCH_TOP"],
        'train_batch_size': params["BATCH_SIZE"],
        "save_strategy": "epoch",
        "evaluation_strategy": "steps",
        "eval_steps": max(10,
                          train_dataset.__len__() // params["BATCH_SIZE"]),
        "logging_steps": max(10,
                             train_dataset.__len__() // params["BATCH_SIZE"]),
        "do_train": True,
        "load_best_model_at_end": params["LOAD_BEST_MODEL"],
        "learning_rate": params["lr"],
        "weight_decay": params["weight_decay"],
        "save_total_limit": 2,
        "report_to": "wandb",  # enable logging to W&B
        "run_name": params['exp_name'] + "_top_model"
    }
    print(training_args_dict)
    with open(params["TRAIN_ARGS_FILE"], 'w') as fp:
        json.dump(training_args_dict, fp)
    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_json_file(
        json_file=params["TRAIN_ARGS_FILE"])[0]

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=params["patience"]),
            LogCallback(params["OUTPUT_DIR"] + "/train_log.json")
        ])

    # Start training
    trainOutput = trainer.train()
    trainer.save_model(params["OUTPUT_DIR"])
    plot_loss_log(params["OUTPUT_DIR"] + "/train_log.json")
    best_model = trainer.state.best_model_checkpoint
    print("top_model_path is at ...", best_model)
    wb_run.finish()

    if params['grad_finetune']:

        # Now reload the model from best model we have found
        # Reading from file

        wb_run = wandb.init(project="NER",
                            name=params['exp_name'] + "_full_model",
                            reinit=True)
        print("The file is loaded from ---------------------------> ",
              params["OUTPUT_DIR"] + 'config.json')
        data = json.loads(
            open(params["OUTPUT_DIR"] + 'config.json', "r").read())
        top_model_path = best_model
        checkpoint = top_model_path.split("/")[-1]
        print("checkpoint is at ... ", checkpoint)
        print("top_model_path is at ...", top_model_path)

        # Config #
        config = BertConfig.from_pretrained(
            top_model_path,
            num_labels=num_labels,
            id2label=label_map,
            label2id={label: i
                      for i, label in enumerate(labels)},
            cache_dir=model_args['cache_dir'])

        # Model #
        xargs['tf'] = False
        reloaded_model = get_model(model_path=top_model_path + "/",
                                   cache_dir=model_args['cache_dir'],
                                   config=None,
                                   model_type=params['model_type'],
                                   xargs=xargs)
        print("Reloaded", reloaded_model.bert.embeddings)

        adam_beta1 = 0.9
        if params.get('xargs') and params.get('xargs').get('beta1_finetune'):
            adam_beta1 = params.get('xargs').get('beta1_finetune')
        # Training args #
        training_args_dict = {
            'output_dir': params["OUTPUT_DIR"],
            'num_train_epochs': params["EPOCH_TOP"] + params["EPOCH_END2END"],
            'train_batch_size': params["BATCH_SIZE"],
            "evaluation_strategy": "steps",
            "eval_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]),
            "logging_steps": max(10, train_dataset.__len__() // params["BATCH_SIZE"]),
            "do_train": True,
            "load_best_model_at_end": params["LOAD_BEST_MODEL"],
            "save_total_limit": 2,
            "learning_rate": params["lr_finetune"],
            "weight_decay": params["wd_finetune"] if "wd_finetune" in params.keys() else 0,
            "ignore_data_skip": True,
            "report_to": "wandb",  # enable logging to W&B
            "run_name": params['exp_name'] + "_full_model",
            "adam_beta1": adam_beta1
        }

        with open(params["TRAIN_ARGS_FILE"], 'w') as fp:
            json.dump(training_args_dict, fp)
        parser = HfArgumentParser(TrainingArguments)
        training_args = parser.parse_json_file(
            json_file=params["TRAIN_ARGS_FILE"])[0]

        # Then unfreeze the bert weights and fine tune end-to-end
        model = reloaded_model
        freeze_model(model)
        model.to('cuda')

        # Set to train mode.
        model.train()

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=[
                EarlyStoppingCallback(
                    early_stopping_patience=params["patience"],
                    early_stopping_threshold=params.get('esth', 0)),
                LogCallback(params["OUTPUT_DIR"] + "/train_finetune_log.json")
            ])

        # checkpoint is here.
        trainer.train()
        plot_loss_log(params["OUTPUT_DIR"] + "/train_finetune_log.json")
        wb_run.finish()
    return trainer, model
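
# --- Hedged, self-contained sketch of the freeze-then-finetune pattern above, with a
# --- toy module in place of the BERT model (no pretrained weights are downloaded).
import torch

class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.base_model = torch.nn.Linear(8, 8)   # stands in for the frozen encoder
        self.classifier = torch.nn.Linear(8, 2)   # task head trained in phase one

    def forward(self, x):
        return self.classifier(self.base_model(x))

toy = ToyModel()
for p in toy.base_model.parameters():   # phase 1: train only the head
    p.requires_grad = False
# ... train the head, keep the best checkpoint ...
for p in toy.base_model.parameters():   # phase 2: unfreeze for end-to-end fine-tuning
    p.requires_grad = True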
Esempio n. 14
0
def run_mrc(data_args, training_args, model_args, datasets, tokenizer, model):
    # when using wandb
    # wandb.login()

    column_names = datasets["train"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    # check if there is an error
    last_checkpoint, max_seq_length = check_no_error(training_args, data_args, tokenizer, datasets)

    # preprocessing for the train data
    def prepare_train_features(examples):
        # tokenize and build inputs that also carry the answer start/end token positions

        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True, 
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # record which original context each chunk created by truncation came from.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # for every token, keep its (start, end) character positions in the original text.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            # sequence_ids separates special tokens, question, and context -> [None, 0, 0, ..., 0, None, 1, 1, ..., 1, None]
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of the example containing this span of text.
            # i.e. find which original example the i-th tokenized span was built from.
            sample_index = sample_mapping[i]
            
            answers = examples[answer_column_name][sample_index]
            # print(answers)
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (
                    offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char
                ):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                    ):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    if "train" not in datasets:
        raise ValueError("--do_train requires a train dataset")

    column_names = datasets["train"].column_names
    train_dataset = datasets["train"]

    train_dataset = train_dataset.map(
        prepare_train_features,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    ## preprocessing for the validation data
    def prepare_validation_features(examples):
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
            # offset_mapping holds each token's (start, end) character positions; here the entries
            # for the question tokens are replaced with None. During validation the model returns
            # start/end logits, and those indices should be looked up in the passage, not the question.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]
        return tokenized_examples

    eval_dataset = datasets["validation"]
    column_names = datasets["validation"].column_names

    eval_dataset = eval_dataset.map(
        prepare_validation_features,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    # We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data collator.
    # pad_to_multiple_of: with mixed precision, tensors whose size is a multiple of 8 are handled
    # more efficiently (Funnel Transformer uses 32), so 8 is used here to round up the padded length.
    # Since the tokenizer already pads to max_length=384, this probably has no effect in practice.
    data_collator = (
        DataCollatorWithPadding(
            tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None
        )
    )

    # Post-processing : 
    def post_processing_function(examples, features, predictions, training_args):
        # Post-processing: we match the start logits and end logits to answers in the original context.
        predictions = postprocess_qa_predictions(
            examples=examples,
            features=features,
            predictions=predictions,
            max_answer_length=data_args.max_answer_length,
            output_dir=training_args.output_dir,
        )
       # Format the result to the format the metric expects.
        formatted_predictions = [
            {"id": k, "prediction_text": v} for k, v in predictions.items()
        ]

        if training_args.do_predict:
            return formatted_predictions        
        else:
            references = [
                {"id": ex["id"], "answers": ex[answer_column_name]}
                for ex in datasets["validation"]
            ]
            return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad")

    metric_key_prefix = 'eval'   
    def compute_metrics(p: EvalPrediction):
        before_prefix_metrics = metric.compute(predictions=p.predictions, references=p.label_ids)
        metrics ={f'{metric_key_prefix}_{k}':v for k,v in before_prefix_metrics.items()}        
        return metrics
    
    # early stopping condition
    early_stopping = EarlyStoppingCallback(early_stopping_patience = 50, early_stopping_threshold = 0.2)
    
    # QuestionAnsweringTrainer
    trainer = QuestionAnsweringTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        eval_examples=datasets["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        post_process_function=post_processing_function,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )
   
    # Training 
    if last_checkpoint is not None:
        checkpoint = last_checkpoint
    elif os.path.isdir(model_args.model_name_or_path):
        checkpoint = model_args.model_name_or_path
    else:
        checkpoint = None

    train_result = trainer.train(resume_from_checkpoint=checkpoint)

    metrics = train_result.metrics
    metrics["train_samples"] = len(train_dataset)

    trainer.log_metrics("train", metrics)
    
    output_train_file = os.path.join(training_args.output_dir, "train_results.txt")

    with open(output_train_file, "w") as writer:
        logger.info("***** Train results *****")
        for key, value in sorted(train_result.metrics.items()):
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")
Esempio n. 15
0
def run_train(train_dataset, eval_dataset, config, model_args, labels,
              num_labels, label_map):
    # First freeze bert weights and train
    model = get_model(model_path=model_args["model_name_or_path"],
                      cache_dir=model_args['cache_dir'],
                      config=config,
                      model_type=params['model_type'])

    if not params['grad_e2e']:
        for param in model.base_model.parameters():
            param.requires_grad = False

    # Change from default eval mode to train mode
    model.train()

    training_args_dict = {
        'output_dir': params['OUTPUT_DIR'],
        'num_train_epochs': params['EPOCH_TOP'],
        'train_batch_size': params['BATCH_SIZE'],
        "save_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": params['LOAD_BEST_MODEL'],
        "learning_rate": params["lr"],
        "weight_decay": params["weight_decay"]
    }

    with open(params['TRAIN_ARGS_FILE'], 'w') as fp:
        json.dump(training_args_dict, fp)
    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_json_file(
        json_file=params['TRAIN_ARGS_FILE'])[0]

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])

    # Start training
    trainOutput = trainer.train()
    trainer.save_model(params['OUTPUT_DIR'])

    if params['grad_finetune']:

        # Now reload the model from best model we have found
        # Reading from file
        print("The file is loaded from ---------------------------> ",
              params['OUTPUT_DIR'] + 'config.json')
        data = json.loads(
            open(params['OUTPUT_DIR'] + 'config.json', "r").read())
        top_model_path = data['_name_or_path']
        checkpoint = top_model_path.split("/")[-1]
        print("checkpoint is at ... ", checkpoint)
        print("top_model_path is at ...", params['LOAD_BEST_MODEL'])

        # Config #
        config = BertConfig.from_pretrained(
            top_model_path,
            num_labels=num_labels,
            id2label=label_map,
            label2id={label: i
                      for i, label in enumerate(labels)},
            cache_dir=model_args['cache_dir'])

        # Model #
        reloaded_model = get_model(model_path=top_model_path,
                                   cache_dir=model_args['cache_dir'],
                                   config=config,
                                   model_type=params['model_type'])

        # Training args #
        training_args_dict = {
            'output_dir': params['OUTPUT_DIR'],
            'num_train_epochs': params['EPOCH_END2END'],
            'train_batch_size': params['BATCH_SIZE'],
            "evaluation_strategy": "epoch",
            "load_best_model_at_end": params['LOAD_BEST_MODEL'],
            "learning_rate": params["lr"],
            "weight_decay": params["weight_decay"]
        }

        with open(params['TRAIN_ARGS_FILE'], 'w') as fp:
            json.dump(training_args_dict, fp)
        parser = HfArgumentParser(TrainingArguments)
        training_args = parser.parse_json_file(
            json_file=params['TRAIN_ARGS_FILE'])[0]

        # Then unfreeze the bert weights and fine tune end-to-end
        model = reloaded_model
        COUNT = 1
        for param in model.base_model.parameters():
            if COUNT >= params['grad_finetune_layers']:
                param.requires_grad = True
            COUNT += 1

        model.to('cuda')

        # Set to train mode.
        model.train()

        # Initialize our Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])

        # checkpoint is here.
        trainer.train(checkpoint)

    return trainer, model
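
# --- Hedged sketch of the dict -> JSON file -> TrainingArguments round-trip used above,
# --- with a temporary file standing in for params['TRAIN_ARGS_FILE'].
import json
import tempfile

from transformers import HfArgumentParser, TrainingArguments

args_dict = {
    "output_dir": "./hf_args_sketch",   # hypothetical output directory
    "num_train_epochs": 1,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as fp:
    json.dump(args_dict, fp)
    args_path = fp.name

training_args = HfArgumentParser(TrainingArguments).parse_json_file(json_file=args_path)[0]
print(training_args.num_train_epochs)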
Esempio n. 16
0
def fine_tune():

    print("=== Fine-tune ===")

    args = get_args()
    print(args)

    if args.task == "imdb":
        data_dir = "./../../asset/imdb/"
        train_labels, train_texts = read_imdb_train(data_dir)
    elif args.task == "twitter_semeval":
        data_dir = "./../../asset/twitter_semeval/"
        train_labels, train_texts = read_twitter_train(data_dir)
    elif args.task == "twitter_s140":
        data_dir = "./../../asset/twitter_s140/"
        train_labels, train_texts = read_twitter_train(data_dir)

    # check_data()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        train_texts, train_labels, test_size=args.test_size)

    ## If time allows, try increasing the test size, since the fine-tuning runs fast.

    train_texts = list(train_texts)
    val_texts = list(val_texts)
    train_labels = list(train_labels)
    val_labels = list(val_labels)

    model_name = args.model
    # model_name = "bert-base-cased"
    # model_name = "roberta-base"
    # model_name = "microsoft/deberta-large-mnli"
    # model_name = "bert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # check_data()

    train_encodings = tokenizer(train_texts,
                                truncation=True,
                                padding=True,
                                max_length=512)
    val_encodings = tokenizer(val_texts,
                              truncation=True,
                              padding=True,
                              max_length=512)

    train_dataset = CustomDataset(train_encodings, train_labels)
    val_dataset = CustomDataset(val_encodings, val_labels)

    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=f'./models/{args.task}/{model_name}/',  # output directory
        num_train_epochs=args.epochs,  # total number of training epochs
        per_device_train_batch_size=args.train_bs,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=args.warmup_steps,  # number of warmup steps for the learning rate scheduler
        weight_decay=args.weight_decay,  # strength of weight decay
        logging_dir=f'./logs/{args.task}/{model_name}/',  # directory for storing logs
        logging_steps=args.logging_steps,
        learning_rate=args.learning_rate,
        seed=0,
        evaluation_strategy="steps",
        eval_steps=args.eval_steps,
        save_total_limit=5,
        save_steps=args.save_steps,
        load_best_model_at_end=True)

    # trainer = Trainer(
    #     # the instantiated 🤗 Transformers model to be trained
    #     model=model,
    #     args=training_args,                  # training arguments, defined above
    #     train_dataset=train_dataset,         # training dataset
    #     eval_dataset=val_dataset,             # evaluation dataset
    #     compute_metrics=compute_metrics,
    # )

    trainer = Trainer(
        # the instantiated 🤗 Transformers model to be trained
        model=model,
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=7)],
    )

    trainer.train()
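
# --- The CustomDataset used above is not shown in this excerpt; a hedged sketch of
# --- what such a wrapper typically looks like (an assumption, not the author's class):
import torch

class EncodingsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings   # dict of lists produced by the tokenizer
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(vals[idx]) for key, vals in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item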
Esempio n. 17
0
def train(args):
    wandb.login()
    seed_everything(args.seed)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    MODEL_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset_dir = "/opt/ml/input/data/train/train.tsv"
    dataset = load_data(train_dataset_dir)
    label = dataset['label'].values

    # k-fold cross validation
    cv = StratifiedKFold(n_splits=5, random_state=args.seed, shuffle=True)

    for idx, (train_idx, val_idx) in enumerate(cv.split(dataset, label)):
        # prepare tokenized datasets and labels for each fold
        train_dataset = tokenized_dataset(dataset.iloc[train_idx], tokenizer)
        val_dataset = tokenized_dataset(dataset.iloc[val_idx], tokenizer)

        train_y = label[train_idx]
        val_y = label[val_idx]

        # make dataset for pytorch
        RE_train_dataset = RE_Dataset(train_dataset, train_y)
        RE_valid_dataset = RE_Dataset(val_dataset, val_y)

        # instantiate pretrained language model
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME, num_labels=42)
        model.to(device)

        # callbacks
        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=args.early_stopping_patience,
            early_stopping_threshold=0.00005)

        # set training arguments
        output_dir = './result' + str(idx)
        training_args = TrainingArguments(
            output_dir=output_dir,
            logging_dir='./logs',
            logging_steps=100,
            save_total_limit=1,
            evaluation_strategy='steps',
            eval_steps=100,
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            greater_is_better=True,
            dataloader_num_workers=args.num_workers,
            fp16=True,
            seed=args.seed,
            run_name=args.run_name,
            num_train_epochs=args.epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.eval_batch_size,
            label_smoothing_factor=args.label_smoothing_factor,
            learning_rate=args.lr,
            warmup_steps=args.warmup_steps,
            weight_decay=args.weight_decay,
        )

        # trainer
        trainer = Trainer(
            model=model,  # the instantiated 🤗 Transformers model to be trained
            args=training_args,  # training arguments, defined above
            train_dataset=RE_train_dataset,  # training dataset
            eval_dataset=RE_valid_dataset,  # evaluation dataset
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,  # define metrics function
            callbacks=[early_stopping])

        # train model
        trainer.train()

        # del model
        model.cpu()
        del model
        gc.collect()
        torch.cuda.empty_cache()

        # del cache
        path = glob(f"/opt/ml/code/result{idx}/*")[0]
        for filename in os.listdir(path):
            if filename not in [
                    'config.json', 'pytorch_model.bin', '.ipynb_checkpoints'
            ]:
                rm_filename = os.path.join(path, filename)
                os.remove(rm_filename)

    wandb.finish()
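
# --- Hedged, self-contained sketch of the StratifiedKFold loop above, run on toy
# --- labels: each fold yields index arrays for the per-fold train/validation rows.
import numpy as np
from sklearn.model_selection import StratifiedKFold

X = np.arange(20).reshape(-1, 1)
y = np.array([0, 1] * 10)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    print(fold, len(train_idx), len(val_idx))   # 16 train / 4 validation rows per fold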
Esempio n. 18
0
def bert(train_path, val_path, INPUT_EPOCH, EVAL_STEPS, test_count, \
         hyper_count, fold_count, predict):

    # define pretrained tokenizer and model
    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name,
                                                          num_labels=2)

    # read data
    train_data = pd.read_csv(train_path,
                             sep='\t',
                             encoding="utf-8",
                             names=["y", "X"])
    val_data = pd.read_csv(val_path,
                           sep='\t',
                           encoding="utf-8",
                           names=["y", "X"])

    # preprocess data
    X_train = list(train_data["X"])
    y_train = list(train_data["y"])

    X_val = list(val_data["X"])
    y_val = list(val_data["y"])

    X_train_tokenized = tokenizer(X_train,
                                  padding=True,
                                  truncation=True,
                                  max_length=128)
    X_val_tokenized = tokenizer(X_val,
                                padding=True,
                                truncation=True,
                                max_length=128)

    train_dataset = Dataset(X_train_tokenized, y_train)
    val_dataset = Dataset(X_val_tokenized, y_val)

    # define trainer
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="steps",
        eval_steps=EVAL_STEPS,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=INPUT_EPOCH,
        save_steps=3000,
        seed=0,
        load_best_model_at_end=True,
    )

    # train pre-trained BERT model
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()

    # find the final stored model
    result = os.listdir("./output")[-1]

    with open('./output/{}/trainer_state.json'.format(result)) as f:
        data = json.load(f)

        # not predict (test) mode
        if predict == False:
            with open(
                    f'./result/final_trainer_state_{test_count}_{hyper_count}_{fold_count}_.json',
                    'w') as output_file:
                json.dump(data, output_file)
        else:
            with open(f'./result/final_test_state_{test_count}.json',
                      'w') as output_file:
                json.dump(data, output_file)

    # retrieve best training loss, eval loss and accuracy
    best = data['best_model_checkpoint'].split("-")[-1]
    history = {}
    history['train_acc'] = 0
    history['train_loss'] = 0
    print(data)
    print(data['log_history'])
    for i in data['log_history']:
        print(i)
        if i['step'] == int(best):
            if 'loss' in i:
                print("training loss:\t", i['loss'])
                history['train_loss'] = i['loss']
            if 'eval_accuracy' in i:
                print("eval loss:\t", i['eval_loss'])
                print("eval accuracy:\t", i['eval_accuracy'])
                print("eval f1:\t", i['eval_f1'])
                print("eval precision:\t", i['eval_precision'])
                print("eval recall:\t", i['eval_recall'])

                history['val_loss'] = i['eval_loss']
                history['val_acc'] = i['eval_accuracy']
                history['val_f1'] = i['eval_f1']
                history['val_precision'] = i['eval_precision']
                history['val_recall'] = i['eval_recall']

    raw_pred_train, _, _ = trainer.predict(train_dataset)
    y_pred_train = np.argmax(raw_pred_train, axis=1)
    accuracy = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    history['train_acc'] = accuracy

    if predict == True:
        raw_pred, _, _ = trainer.predict(val_dataset)

        # preprocess raw predictions
        y_pred = np.argmax(raw_pred, axis=1)
        report = classification_report(y_val,
                                       y_pred,
                                       target_names=class_names,
                                       digits=4)
        report_path = "./result/report_{}.txt".format(test_count)
        text_file = open(report_path, "w")
        text_file.write(report)

        # copy the best trained model for current test fold
        copytree(f'./output/checkpoint-{best}',
                 f'./result/best_test_{test_count}')

    # clean the output directory
    shutil.rmtree('./output', ignore_errors=True)

    return history
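
# --- Hedged sketch of the trainer_state.json parsing above: pick out the metrics
# --- logged at the best step from a log_history-shaped list (toy data only).
log_history = [
    {"step": 100, "loss": 0.52},
    {"step": 100, "eval_loss": 0.48, "eval_accuracy": 0.81},
    {"step": 200, "loss": 0.40},
    {"step": 200, "eval_loss": 0.45, "eval_accuracy": 0.84},
]
best_step = 200   # would come from data['best_model_checkpoint'].split("-")[-1]
best_logs = {k: v for entry in log_history if entry["step"] == best_step
             for k, v in entry.items()}
print(best_logs)  # {'step': 200, 'loss': 0.4, 'eval_loss': 0.45, 'eval_accuracy': 0.84}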
Esempio n. 19
0
    def train(self, inoculation_train_df, eval_df, model_path, training_args, max_length=128,
              inoculation_patience_count=5, pd_format=True, 
              scramble_proportion=0.0, eval_with_scramble=False):

        if pd_format:
            datasets = {}
            datasets["train"] = Dataset.from_pandas(inoculation_train_df)
            datasets["validation"] = Dataset.from_pandas(eval_df)
        else:
            datasets = {}
            datasets["train"] = inoculation_train_df
            datasets["validation"] = eval_df
        logger.info(f"***** Train Sample Count (Verify): %s *****"%(len(datasets["train"])))
        logger.info(f"***** Valid Sample Count (Verify): %s *****"%(len(datasets["validation"])))
    
        label_list = datasets["validation"].unique("label")
        label_list.sort()  # Let's sort it for determinism

        sentence1_key, sentence2_key = self.task_config
        
        # we will scramble our input sentences here
        # TODO: we scramble both train and eval sets
        if self.task_name == "sst3" or self.task_name == "cola":
            def scramble_inputs(proportion, example):
                original_text = example[sentence1_key]
                original_sentence = basic_tokenizer.tokenize(original_text)
                max_length = len(original_sentence)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, len(original_sentence)-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_sentence[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text = original_sentence[:scramble_start] + scramble_sentence + original_sentence[scramble_end:]

                out_string = " ".join(scramble_text).replace(" ##", "").strip()
                example[sentence1_key] = out_string
                return example
        elif self.task_name == "snli" or self.task_name == "mrpc" or self.task_name == "qnli":
            def scramble_inputs(proportion, example):
                original_premise = example[sentence1_key]
                original_hypothesis = example[sentence2_key]
                if original_hypothesis is None:
                    original_hypothesis = ""
                try:
                    original_premise_tokens = basic_tokenizer.tokenize(original_premise)
                    original_hypothesis_tokens = basic_tokenizer.tokenize(original_hypothesis)
                except Exception:
                    print("Please debug these sequences...")
                    print(original_premise)
                    print(original_hypothesis)

                max_length = len(original_premise_tokens)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, max_length-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_premise_tokens[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text_premise = original_premise_tokens[:scramble_start] + scramble_sentence + original_premise_tokens[scramble_end:]

                max_length = len(original_hypothesis_tokens)
                scramble_length = int(max_length*proportion)
                scramble_start = random.randint(0, max_length-scramble_length)
                scramble_end = scramble_start + scramble_length
                scramble_sentence = original_hypothesis_tokens[scramble_start:scramble_end]
                random.shuffle(scramble_sentence)
                scramble_text_hypothesis = original_hypothesis_tokens[:scramble_start] + scramble_sentence + original_hypothesis_tokens[scramble_end:]

                out_string_premise = " ".join(scramble_text_premise).replace(" ##", "").strip()
                out_string_hypothesis = " ".join(scramble_text_hypothesis).replace(" ##", "").strip()
                example[sentence1_key] = out_string_premise
                example[sentence2_key] = out_string_hypothesis
                return example
        
        if scramble_proportion > 0.0:
            logger.info(f"You are scrambling the inputs to test syntactic feature importance!")
            datasets["train"] = datasets["train"].map(partial(scramble_inputs, scramble_proportion))
            if eval_with_scramble:
                logger.info(f"You are scrambling the evaluation data as well!")
                datasets["validation"] = datasets["validation"].map(partial(scramble_inputs, scramble_proportion))
        
        padding = "max_length"
        sentence1_key, sentence2_key = self.task_config
        label_to_id = None
        def preprocess_function(examples):
            # Tokenize the texts
            args = (
                (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
            )
            result = self.tokenizer(*args, padding=padding, max_length=max_length, truncation=True)
            # Map labels to IDs (not necessary for GLUE tasks)
            if label_to_id is not None and "label" in examples:
                result["label"] = [label_to_id[l] for l in examples["label"]]
            return result
        datasets["train"] = datasets["train"].map(preprocess_function, batched=True)
        datasets["validation"] = datasets["validation"].map(preprocess_function, batched=True)
        
        train_dataset = datasets["train"]
        eval_dataset = datasets["validation"]
        
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
            
        metric = load_metric("glue", "sst2") # any glue task will do the job, just for eval loss
        
        def asenti_compute_metrics(p: EvalPrediction):
            preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            preds = np.argmax(preds, axis=1)
            result_to_print = classification_report(p.label_ids, preds, digits=5, output_dict=True)
            print(classification_report(p.label_ids, preds, digits=5))
            mcc_scores = matthews_corrcoef(p.label_ids, preds)
            logger.info(f"MCC scores: {mcc_scores}.")
            result_to_return = metric.compute(predictions=preds, references=p.label_ids)
            result_to_return["Macro-F1"] = result_to_print["macro avg"]["f1-score"]
            result_to_return["MCC"] = mcc_scores
            return result_to_return

        # Initialize our Trainer. We are only interested in evaluation here.
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=asenti_compute_metrics,
            tokenizer=self.tokenizer,
            # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
            data_collator=default_data_collator
        )
        # Early stop
        if inoculation_patience_count != -1:
            trainer.add_callback(EarlyStoppingCallback(inoculation_patience_count))
        
        # Training
        if training_args.do_train:
            logger.info("*** Training our model ***")
            trainer.train(
                # we don't need this now.
                # model_path=model_path
            )
            trainer.save_model()  # Saves the tokenizer too for easy upload
        
        # Evaluation
        eval_results = {}
        if training_args.do_eval:
            logger.info("*** Evaluate ***")
            tasks = [self.task_name]
            eval_datasets = [eval_dataset]
            for eval_dataset, task in zip(eval_datasets, tasks):
                eval_result = trainer.evaluate(eval_dataset=eval_dataset)
                output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt")
                if trainer.is_world_process_zero():
                    with open(output_eval_file, "w") as writer:
                        logger.info(f"***** Eval results {task} *****")
                        for key, value in eval_result.items():
                            logger.info(f"  {key} = {value}")
                            writer.write(f"{key} = {value}\n")
                eval_results.update(eval_result)
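
The scrambling helpers above shuffle one contiguous window of the tokenized sentence and leave the rest untouched. A minimal, self-contained sketch of that idea (illustrative names only, not taken from the example):

# Pick a contiguous window covering `proportion` of the tokens, shuffle only
# that window, and stitch the sequence back together.
import random

def scramble_span(tokens, proportion):
    n = len(tokens)
    span = int(n * proportion)
    if span <= 1:
        return list(tokens)
    start = random.randint(0, n - span)
    window = tokens[start:start + span]
    random.shuffle(window)
    return tokens[:start] + window + tokens[start + span:]

print(scramble_span("the quick brown fox jumps over the lazy dog".split(), 0.5))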
Esempio n. 20
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # override default run name and log all args
    wandb.init(project="wav2vec4humans", config=parser.parse_args())

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch, train=True):
        batch["text"] = (re.sub(chars_to_ignore_regex, "",
                                unidecode(batch["sentence"])).lower().strip())
        if train:
            batch["text"] += " "
        return batch

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    resampler = dict()

    def get_resampler(sampling_rate):
        if sampling_rate in resampler:
            return resampler[sampling_rate]
        else:
            logger.info(f"Creating new resampler for {sampling_rate}")
            resampler[sampling_rate] = torchaudio.transforms.Resample(
                sampling_rate, 16_000)
            return resampler[sampling_rate]

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = get_resampler(sampling_rate)(
            speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        batch["duration"] = len(speech_array.squeeze()) / sampling_rate
        return batch

    def filter_by_duration(batch):
        return (batch["duration"] <= 10 and batch["duration"] >= 1
                and len(batch["target_text"]) > 5)  # about 98% of samples

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(
            batch["speech"],
            sampling_rate=batch["sampling_rate"][0]).input_values
        # Setup the processor for targets
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    def get_length(item):
        # speeds up grouping by length in pre-loaded dataset
        item["length"] = len(item["input_values"])
        return item

    # Pre-processed datasets
    dataset_path = Path(os.getenv("HF_HOME", ".")) / "datasets"
    dataset_train_path = f"{dataset_path}/{data_args.dataset_config_name}/train/{data_args.train_split_name}"
    dataset_eval_path = f"{dataset_path}/{data_args.dataset_config_name}/eval"
    dataset_test_path = f"{dataset_path}/{data_args.dataset_config_name}/test"
    vocab_path = f"{dataset_path}/{data_args.dataset_config_name}/vocab/vocab_test_{data_args.train_split_name}.json"

    train_dataset = None
    eval_dataset = None if training_args.do_eval else False

    log_timestamp()
    if Path(dataset_train_path).exists() and Path(vocab_path).exists():
        train_dataset = datasets.load_from_disk(dataset_train_path)
        log_timestamp("load pre-processed data")
    else:
        train_dataset = datasets.load_dataset(
            "common_voice",
            data_args.dataset_config_name,
            split=data_args.train_split_name,
        )
        log_timestamp("load data")
        train_dataset = train_dataset.map(remove_special_characters,
                                          remove_columns=["sentence"])
        log_timestamp("remove special characters")

    if training_args.do_eval:
        if Path(dataset_eval_path).exists():
            eval_dataset = datasets.load_from_disk(dataset_eval_path)
        else:
            eval_dataset = datasets.load_dataset("common_voice",
                                                 data_args.dataset_config_name,
                                                 split="test")
            eval_dataset = eval_dataset.map(remove_special_characters,
                                            remove_columns=["sentence"])
    log_timestamp()

    if Path(dataset_test_path).exists() and Path(vocab_path).exists():
        test_dataset = datasets.load_from_disk(dataset_test_path)
    else:
        test_dataset = datasets.load_dataset("common_voice",
                                             data_args.dataset_config_name,
                                             split="test")
        test_dataset = test_dataset.map(
            lambda x: remove_special_characters(x, train=False),
            remove_columns=["sentence"],
        )
    log_timestamp()

    if not Path(vocab_path).exists():
        # create vocab
        vocab_train = train_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=train_dataset.column_names,
        )
        vocab_test = test_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=test_dataset.column_names,
        )
        vocab_list = list(
            set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
        vocab_dict = {v: k for k, v in enumerate(vocab_list)}
        vocab_dict["|"] = vocab_dict[" "]
        del vocab_dict[" "]
        vocab_dict["[UNK]"] = len(vocab_dict)
        vocab_dict["[PAD]"] = len(vocab_dict)
        Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w") as vocab_file:
            json.dump(vocab_dict, vocab_file)
        log_timestamp("create vocab")

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_path,
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                  tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )
    log_timestamp("load model")

    if not Path(dataset_train_path).exists():
        train_dataset = train_dataset.map(
            speech_file_to_array_fn,
            remove_columns=train_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("load audio")
        train_dataset = train_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("filter data")
        train_dataset = train_dataset.map(
            prepare_dataset,
            remove_columns=train_dataset.column_names,
            batch_size=training_args.per_device_train_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("process data")
        train_dataset = train_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("add input length")
        train_dataset.save_to_disk(dataset_train_path)
        log_timestamp("save to disk")

    if not Path(dataset_eval_path).exists() and training_args.do_eval:
        eval_dataset = eval_dataset.map(
            speech_file_to_array_fn,
            remove_columns=eval_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            prepare_dataset,
            remove_columns=eval_dataset.column_names,
            batch_size=training_args.per_device_eval_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset.save_to_disk(dataset_eval_path)
    log_timestamp()

    if not Path(dataset_test_path).exists():
        test_dataset = test_dataset.map(
            speech_file_to_array_fn,
            num_proc=data_args.preprocessing_num_workers,
        )
        test_dataset = test_dataset.filter(filter_by_duration,
                                           remove_columns=["duration"])
        test_dataset.save_to_disk(dataset_test_path)
    log_timestamp()

    # Metric
    cer_metric = datasets.load_metric("cer")
    # we use a custom WER that considers punctuation
    wer_metric = datasets.load_metric("metrics/wer_punctuation.py")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)

        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

        cer = cer_metric.compute(predictions=pred_str, references=label_str)
        wer = wer_metric.compute(predictions=pred_str, references=label_str)

        return {"cer": cer, "wer": wer}

    log_timestamp()

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()
        log_timestamp("freeze feature extractor")

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor,
                                               padding=True)
    log_timestamp("create data collator")

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )
    loss_nan_stopping_callback = LossNaNStoppingCallback()
    early_stopping_callback = EarlyStoppingCallback()
    timing_callback = TimingCallback()
    trainer.add_callback(loss_nan_stopping_callback)
    trainer.add_callback(early_stopping_callback)
    trainer.add_callback(timing_callback)

    # Training
    log_timestamp("setup trainer")
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        log_timestamp()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        log_timestamp("train model")
        trainer.save_model()

        # save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Final test metrics
    logger.info("*** Test ***")
    log_timestamp()

    if loss_nan_stopping_callback.stopped:
        test_cer, test_wer = 1.0, 2.0
        logger.info(
            "Loss NaN detected, typically resulting in bad WER & CER so we won't calculate them."
        )
    else:

        def evaluate(batch):
            inputs = processor(batch["speech"],
                               sampling_rate=16_000,
                               return_tensors="pt",
                               padding=True)
            with torch.no_grad():
                logits = model(
                    inputs.input_values.to("cuda"),
                    attention_mask=inputs.attention_mask.to("cuda"),
                ).logits
            pred_ids = torch.argmax(logits, dim=-1)
            batch["pred_strings"] = processor.batch_decode(pred_ids)
            return batch

        model.to("cuda")
        # no need to cache mapped test_dataset
        datasets.set_caching_enabled(False)
        result = test_dataset.map(
            evaluate,
            batched=True,
            batch_size=training_args.per_device_eval_batch_size)
        log_timestamp("get test predictions")
        test_cer = cer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        test_wer = wer_metric.compute(predictions=result["pred_strings"],
                                      references=result["text"])
        log_timestamp("compute test metrics")

    metrics = {"cer": test_cer, "wer": test_wer}
    wandb.log({f"test/{k}": v for k, v in metrics.items()})
    trainer.save_metrics("test", metrics)
    logger.info(metrics)

    # save model files
    log_timestamp()
    if not loss_nan_stopping_callback.stopped:
        artifact = wandb.Artifact(name=f"model-{wandb.run.id}",
                                  type="model",
                                  metadata={"cer": test_cer})
        for f in Path(training_args.output_dir).iterdir():
            if f.is_file():
                artifact.add_file(str(f))
        wandb.run.log_artifact(artifact)
        log_timestamp("log artifacts")
Esempio n. 21
0
def train(args):
    wandb.login()
    seed_everything(args.seed)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    MODEL_NAME = args.model_name
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset_dir = "/opt/ml/code2/data/train_new_tag.tsv"
    train_data = load_dataset(train_dataset_dir)
    train_x = list(train_data.iloc[:, 0])
    train_y = list(train_data.iloc[:, -1])

    valid_dataset_dir = "/opt/ml/code2/data/valid_tag.tsv"
    valid_data = load_dataset(valid_dataset_dir)
    val_x = list(valid_data.iloc[:, 0])
    val_y = list(valid_data.iloc[:, -1])

    # tokenize datasets
    tokenized_train = tokenized_dataset(train_x, tokenizer)
    tokenized_val = tokenized_dataset(val_x, tokenizer)

    # make dataset for pytorch
    RE_train_dataset = RE_Dataset(tokenized_train, train_y)
    RE_valid_dataset = RE_Dataset(tokenized_val, val_y)

    # instantiate pretrained language model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                               num_labels=8)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # optimizer and scheduler
    # optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=300*args.epochs)

    # callbacks
    early_stopping = EarlyStoppingCallback(
        early_stopping_patience=args.early_stopping_patience,
        early_stopping_threshold=0.00005)

    training_args = TrainingArguments(
        output_dir='./results',
        logging_dir='./logs',
        logging_steps=100,
        save_total_limit=1,
        evaluation_strategy='steps',
        eval_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        greater_is_better=True,
        dataloader_num_workers=args.num_workers,
        fp16=True,
        seed=args.seed,
        run_name=args.run_name,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        label_smoothing_factor=args.label_smoothing_factor,
        learning_rate=args.lr,
        warmup_steps=args.warmup_steps,
        weight_decay=args.weight_decay,
    )

    trainer = Trainer(
        model=model,  # the instantiated 🤗 Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=RE_train_dataset,  # training dataset
        eval_dataset=RE_valid_dataset,  # evaluation dataset
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,  # define metrics function
        # optimizers=[optimizer, scheduler],
        callbacks=[early_stopping])

    # train model
    trainer.train()
    wandb.finish()
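
The custom optimizer and scheduler in the example above are left commented out, so Trainer falls back to its defaults. A hedged sketch of wiring AdamW plus a linear-warmup scheduler through Trainer's `optimizers` argument (assuming `model`, `args`, `training_args`, `RE_train_dataset` and `RE_valid_dataset` from the example; the step count is a rough estimate, and extra arguments such as tokenizer and callbacks are omitted for brevity):

from torch.optim import AdamW
from transformers import Trainer, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
# Rough estimate of total optimization steps for the warmup schedule.
num_training_steps = len(RE_train_dataset) // args.train_batch_size * args.epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=num_training_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=RE_train_dataset,
    eval_dataset=RE_valid_dataset,
    optimizers=(optimizer, scheduler),  # overrides the default optimizer/scheduler
)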
Esempio n. 22
0
    training_args = TrainingArguments(
        output_dir=args.checkpoint_dir,  # output directory
        num_train_epochs=args.epochs,  # total number of training epochs
        per_device_train_batch_size=args.bs,  # batch size per device during training
        per_device_eval_batch_size=args.bs,  # batch size for evaluation
        # warmup_steps=warmup_steps,                                                                            # number of warmup steps for learning rate scheduler
        weight_decay=args.wd,  # strength of weight decay
        evaluation_strategy="epoch",  # evaluation interval
        logging_dir=args.checkpoint_dir,  # directory for storing logs
        save_strategy="epoch",  # checkpoint save interval
        logging_steps=500,
        metric_for_best_model=args.criterion,
        load_best_model_at_end=True)

    collator = get_collator(tokenizer)
    es_callback = EarlyStoppingCallback(early_stopping_patience=5)

    print(f"- Training args: {training_args}")
    trainer = Trainer(model,
                      args=training_args,
                      train_dataset=train_ds,
                      eval_dataset=test_ds,
                      compute_metrics=compute_metrics,
                      optimizers=(optimizer, scheduler),
                      data_collator=collator)

    trainer.add_callback(es_callback)

    trainer.train()

    print(f"- Label encoder mapping:")
Esempio n. 23
0
        "recall": recall,
        "f1": f1
    }


# Define Trainer
args = TrainingArguments(
    output_dir=r"E:\Projects\Emotion_detection_gihan\finbert_experiments\models\emotion_lines_500_steps",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_steps=3000,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

# 500 is the best
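
Only the tail of the `compute_metrics` function survives in the example above (the `recall` and `f1` keys). A hedged reconstruction of what such a function could look like, using scikit-learn (an assumption, not the author's original code):

import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    # pred is an EvalPrediction with .predictions (logits) and .label_ids.
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted")
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1
    }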
Esempio n. 24
0
def fine_tune(cfg: DictConfig) -> float:
    """fine tune bert module"""
    init_wandb(cfg)
    train_ds, test_ds = getDataset(cfg)

    config = AutoConfig.from_pretrained(cfg.model.arch,
                                        num_labels=cfg.model.num_labels)

    model = AutoModelForSequenceClassification.from_pretrained(cfg.model.arch,
                                                               config=config)
    run_id = wandb.run.name.rsplit("-", 1)[1]
    trainConfig = cfg.train
    output_dir = os.path.join(trainConfig["output_dir"], run_id)
    print("module output dir = ", output_dir)
    train_args = TrainingArguments(
        # module pred/ckpt
        output_dir=output_dir,
        # tensorboard logs
        logging_dir="./logs",
        num_train_epochs=trainConfig["epoch"],
        per_device_train_batch_size=trainConfig["train_batch_size"],
        per_device_eval_batch_size=trainConfig["eval_batch_size"],
        # logging / eval / save effectively happen every acc_batch * x steps
        gradient_accumulation_steps=trainConfig["acc_batch"],
        evaluation_strategy=IntervalStrategy.EPOCH,
        label_smoothing_factor=trainConfig["label_smooth"],
        # AdamW
        learning_rate=trainConfig["lr"],
        warmup_steps=trainConfig["warmup"],
        # apply to all layers but bias / LayerNorm
        weight_decay=trainConfig["wd"],
        # save_total_limit=2,
        # if True, ignore param save_strategy / save_steps / save_total_limit
        load_best_model_at_end=True,
        # report_to=["none"],
        report_to=["wandb"],
        seed=cfg.seed,
        logging_strategy=IntervalStrategy.STEPS,
        metric_for_best_model=trainConfig["metric"])

    trainer = Trainer(
        model,
        args=train_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=trainConfig["early_stopping_patience"]
            ),
        ],
        compute_metrics=compute_metrics,
    )

    print("logs in dir", os.getcwd())
    print("gpu count = ", trainer.args.n_gpu, "is_fp16 =", trainer.args.fp16)

    trainer.train()
    trainer.evaluate()

    # best module
    trainer.model.save_pretrained(os.path.join(output_dir, "best"))
    y_pred_tuple = trainer.predict(test_ds)
    logits, y_true, metrics = y_pred_tuple
    y_pred = logits.argmax(-1)

    plot_heat_map(y_true, y_pred, cfg.model.num_labels)

    acc = accuracy_score(y_true, y_pred)
    print(acc)
    wandb.finish()
    return acc
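
`plot_heat_map(y_true, y_pred, cfg.model.num_labels)` is called above but not defined in the snippet. A minimal sketch of a compatible implementation, assuming a confusion-matrix heat map is intended:

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_heat_map(y_true, y_pred, num_labels):
    # Confusion matrix over all label ids, rendered as a heat map.
    cm = confusion_matrix(y_true, y_pred, labels=list(range(num_labels)))
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap="Blues")
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    fig.colorbar(im, ax=ax)
    plt.show()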