Example #1
def get_dataset_and_collater(vocab_path, merges_path, data_path, seq_len):
    tokenizer = get_seq_tokenizer(vocab_path, merges_path)

    dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                                 file_path=data_path,
                                                 block_size=seq_len)

    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    return dataset, data_collator
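
A minimal usage sketch for the helper above, assuming get_seq_tokenizer and the file paths come from the surrounding project; with mlm=False the collator prepares labels for causal (non-masked) language modeling, so the returned pair plugs straight into transformers.Trainer with a causal LM. Paths and the "gpt2" model choice are illustrative assumptions, not part of the original snippet.

import transformers

# Hypothetical paths; get_seq_tokenizer is assumed to be defined in the same module.
dataset, data_collator = get_dataset_and_collater("vocab.json", "merges.txt",
                                                  "train.txt", seq_len=128)

# Any causal LM works with mlm=False collation; "gpt2" is just an illustrative choice.
model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")

trainer = transformers.Trainer(
    model=model,
    args=transformers.TrainingArguments(output_dir="clm-out",
                                        num_train_epochs=1,
                                        per_device_train_batch_size=8),
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()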
Example #2
def train_bert(corpus_path, hebrew_model=False):
    """
    Bert model training
    :param corpus_path: Corpus to train Bert on
    :param hebrew_model: Model in Hebrew or not
    :return: The name of the new trained model
    """
    language = 'hebrew' if hebrew_model else 'english'
    df = pd.read_csv(corpus_path)
    corpus_name = get_corpus_name(corpus_path)
    print("Preprocess...")
    if hebrew_model:
        model_name, vocab, raw_text_file = preprocess_hebrew(df, corpus_name)
    else:
        model_name, vocab, raw_text_file = preprocess_english(df, corpus_name)

    print("Cuda availability :", torch.cuda.is_available())
    print("Getting tokenizer...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(conf.bert_model[language], use_fast=True)
    model = transformers.AutoModelForMaskedLM.from_pretrained(conf.bert_model[language]).to('cuda')

    tokenizer.add_tokens(vocab)
    model.resize_token_embeddings(len(tokenizer))

    model_dir = conf.MODELS_PATH + conf.TYPE_MODEL_PATH['bert'] + model_name
    if os.path.exists(model_dir):
        shutil.rmtree(model_dir)

    os.mkdir(model_dir)
    tokenizer.save_pretrained(model_dir)

    print("Tokenizing...")
    dataset = transformers.LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=raw_text_file,
        block_size=128,
    )

    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    training_args = transformers.TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=20,
        per_device_train_batch_size=16,
        save_steps=300,
        logging_steps=100,
        save_total_limit=3,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset
    )
    print("Begin training...")
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    trainer.train()
    trainer.save_model(model_dir)
    print('The model has been saved under:', model_dir)

    return model_dir
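
A short example of calling the training function above and reloading the result; conf, get_corpus_name and the preprocess_* helpers are project-specific and assumed to exist, and the corpus path below is hypothetical.

# train_bert returns the directory the fine-tuned model was saved to.
model_dir = train_bert("data/corpus.csv", hebrew_model=True)

# Reload the fine-tuned masked LM and its (extended) tokenizer for downstream use.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir)
model = transformers.AutoModelForMaskedLM.from_pretrained(model_dir)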
Example #3
    def build_datasets(self) -> Union[datasets.Dataset, datasets.DatasetDict]:
        column_names = self.raw_datasets["train"].column_names
        text_column_name = "text" if "text" in column_names else column_names[0]

        if self.data_config.max_seq_length is None:
            max_seq_length = self.tokenizer.model_max_length
            if max_seq_length > 1024:
                self.logger.warning(
                    "The tokenizer picked seems to have a very large `model_max_length` "
                    f"({self.tokenizer.model_max_length}). Using 1024 instead. You can change "
                    "that default value by setting max_seq_length in the experiment config."
                )
                max_seq_length = 1024
        else:
            if self.data_config.max_seq_length > self.tokenizer.model_max_length:
                self.logger.warning(
                    f"The max_seq_length passed ({self.data_config.max_seq_length}) is larger "
                    f"than the maximum length for the model ({self.tokenizer.model_max_length}). "
                    f"Using max_seq_length={self.tokenizer.model_max_length}.")
            max_seq_length = min(self.data_config.max_seq_length,
                                 self.tokenizer.model_max_length)

        # We cannot use self.tokenizer as a non-local variable in the tokenize_function if we want
        # map to be able to cache the output of the tokenizer.  Hence, the tokenize_function takes
        # a tokenizer explicitly as an input and we create a closure using functools.partial.
        if self.data_config.line_by_line:
            # When using line_by_line, we just tokenize each nonempty line.
            padding = "max_length" if self.data_config.pad_to_max_length else False

            def tokenize_function(tokenizer, padding, max_seq_length,
                                  examples):
                # Remove empty lines
                examples["text"] = [
                    line for line in examples["text"]
                    if len(line) > 0 and not line.isspace()
                ]
                return tokenizer(
                    examples["text"],
                    padding=padding,
                    truncation=True,
                    max_length=max_seq_length,
                    # We use this option because DataCollatorForLanguageModeling (see below) is
                    # more efficient when it receives the `special_tokens_mask`.
                    return_special_tokens_mask=True,
                )

            tokenized_datasets = self.raw_datasets.map(
                functools.partial(tokenize_function, self.tokenizer, padding,
                                  max_seq_length),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=[text_column_name],
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
        else:
            # Otherwise, we tokenize every text, then concatenate them together before splitting
            # them in smaller parts. We use `return_special_tokens_mask=True` because
            # DataCollatorForLanguageModeling (see below) is more efficient when it receives
            # the `special_tokens_mask`.
            def tokenize_function(tokenizer, examples):
                return tokenizer(examples[text_column_name],
                                 return_special_tokens_mask=True)

            tokenized_datasets = self.raw_datasets.map(
                functools.partial(tokenize_function, self.tokenizer),
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                remove_columns=column_names,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )

            # Main data processing function that will concatenate all texts from our dataset and
            # generate chunks of max_seq_length.
            def group_texts(examples):
                # Concatenate all texts.
                concatenated_examples = {
                    k: sum(examples[k], [])
                    for k in examples.keys()
                }
                total_length = len(concatenated_examples[list(
                    examples.keys())[0]])
                # We drop the small remainder; we could instead pad if the model supported it.
                # You can customize this part to your needs.
                total_length = (total_length //
                                max_seq_length) * max_seq_length
                # Split by chunks of max_len.
                result = {
                    k: [
                        t[i:i + max_seq_length]
                        for i in range(0, total_length, max_seq_length)
                    ]
                    for k, t in concatenated_examples.items()
                }
                return result

            # Note that with `batched=True`, this map processes 1,000 texts together, so
            # group_texts throws away a remainder for each of those groups of 1,000 texts.
            # You can adjust that batch_size here but a higher value might be slower to preprocess.
            #
            # To speed up this part, we use multiprocessing. See the documentation of the map
            # method for more information:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html
            tokenized_datasets = tokenized_datasets.map(
                group_texts,
                batched=True,
                num_proc=self.data_config.preprocessing_num_workers,
                load_from_cache_file=not self.data_config.overwrite_cache,
            )
        for _, data in tokenized_datasets.items():
            hf.remove_unused_columns(self.model, data)
        self.collator = transformers.DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm_probability=self.data_config.mlm_probability)
        return tokenized_datasets
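
To make the group_texts step above concrete, here is a standalone sketch of the same concatenate-and-chunk logic on toy token IDs; the remainder shorter than max_seq_length is dropped, exactly as in the method. The toy values are illustrative only.

# Toy batch: two tokenized "documents" with 5 and 6 token IDs.
examples = {"input_ids": [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]}
max_seq_length = 4

concatenated = {k: sum(v, []) for k, v in examples.items()}
total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length
chunks = {
    k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
    for k, t in concatenated.items()
}
print(chunks)  # {'input_ids': [[1, 2, 3, 4], [5, 6, 7, 8]]} -- the trailing [9, 10, 11] is dropped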
Example #4
import argparse
import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)

dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16, output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)

eval_output = trainer.evaluate()
print(eval_output)
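
Because the Trainer above is given only an eval_dataset and prediction_loss_only=True, trainer.evaluate() returns the averaged masked-LM loss (plus runtime metrics). A common follow-up, as Example #5 below does, is to convert that loss into perplexity:

import math

# eval_loss is the mean cross-entropy over masked positions; exp() turns it into perplexity.
print("perplexity:", math.exp(eval_output["eval_loss"]))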
Example #5
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = transformers.HfArgumentParser(
        (ModelArguments, DataTrainingArguments,
         transformers.TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overwrite it."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    transformers.set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.config_name:
        config = transformers.AutoConfig.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = transformers.AutoConfig.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = transformers.CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = transformers.AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from "
            "another script, save it, and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path:
        model = transformers.AutoModelForPreTraining.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = transformers.AutoModelWithLMHead.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"
                             ] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling).")

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = transformers.DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=data_args.mlm,
        mlm_probability=data_args.mlm_probability)

    # Initialize our Trainer
    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        if trainer.is_world_master():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    return results
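
The get_dataset helper used above is not shown in this snippet. It mirrors the upstream language-modeling example, which simply switches between a line-by-line and a contiguous-block dataset. A rough sketch of that shape, with the data_args field names (line_by_line, train_data_file, eval_data_file, block_size, overwrite_cache) taken as assumptions:

def get_dataset(args, tokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        # One example per non-empty line of the input file.
        return transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                                  file_path=file_path,
                                                  block_size=args.block_size)
    # Otherwise concatenate the whole file and cut it into block_size chunks.
    return transformers.TextDataset(tokenizer=tokenizer,
                                    file_path=file_path,
                                    block_size=args.block_size,
                                    overwrite_cache=args.overwrite_cache)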