def create_trainer(tokenizer, model):
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/processed/recipes_train.txt",
        block_size=256,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir="./artifacts",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_gpu_train_batch_size=128,
        save_steps=100_000_000,
        save_total_limit=2,
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    return trainer
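For orientation, here is a minimal, hypothetical driver for the helper above; the distilroberta-base checkpoint is an assumption chosen only because the collator is configured for masked-language modeling, and is not part of the original snippet.

# Hypothetical usage sketch (not from the original source).
from transformers import AutoModelForMaskedLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

trainer = create_trainer(tokenizer, model)
trainer.train()
trainer.save_model("./artifacts")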
Example #2
    def test_trainer_eval_lm(self):
        MODEL_ID = "distilroberta-base"
        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
        dataset = LineByLineTextDataset(
            tokenizer=tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=tokenizer.max_len_single_sentence,
        )
        self.assertEqual(len(dataset), 31)
Example #3
    def __init__(self, opts):
        # Command line arguments
        self.opts = opts

        # Load model and tokenizer
        config = AutoConfig.from_pretrained(opts.ckpt_file)
        self.tokenizer = AutoTokenizer.from_pretrained(opts.ckpt_file)
        self.model = AutoModelWithLMHead.from_pretrained(opts.ckpt_file,
                                                         config=config)
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Load training arguments (instantiate TrainingArguments; assigning the
        # bare class and then setting attributes would not work, since device
        # and batch-size fields are derived, read-only properties)
        if opts.mode in ('train', 'eval'):
            self.training_args = TrainingArguments(
                output_dir=opts.output_dir,
                logging_dir=opts.output_dir,
                num_train_epochs=opts.num_epochs,
                learning_rate=opts.learning_rate,
                per_device_train_batch_size=opts.batch_size,
                per_device_eval_batch_size=opts.batch_size,
                no_cuda=True,  # run on CPU, matching the original intent
            )

        # Load dataset
        if opts.mode in ('train', 'eval'):
            self.dataset = LineByLineTextDataset(  # TextDataset
                tokenizer=self.tokenizer,
                file_path=opts.text_file,
                block_size=self.tokenizer.max_len)
            self.data_collator = DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer, mlm=False)
Example #4
    def test_plm(self):
        tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
        data_collator = DataCollatorForPermutationLanguageModeling(tokenizer)
        # ^ permutation lm

        dataset = LineByLineTextDataset(tokenizer,
                                        file_path=PATH_SAMPLE_TEXT,
                                        block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 112)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((31, 112, 112)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((31, 112, 112)))
        self.assertEqual(batch["labels"].shape, torch.Size((31, 112)))

        dataset = TextDataset(tokenizer,
                              file_path=PATH_SAMPLE_TEXT,
                              block_size=512,
                              overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["perm_mask"].shape, torch.Size((2, 512, 512)))
        self.assertEqual(batch["target_mapping"].shape,
                         torch.Size((2, 512, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))

        example = [torch.randint(5, [5])]
        with self.assertRaises(ValueError):
            # Expect error due to odd sequence length
            data_collator(example)
Example #5
    def execute(self, environment_path: str) -> None:
        dataset = LineByLineTextDataset(tokenizer=self.tokenizer,
                                        file_path=self.file_path,
                                        block_size=self.block_size)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=self.mlm_probability)

        training_args = TrainingArguments(
            output_dir=os.path.join(environment_path, "temp"),
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_gpu_train_batch_size=self.batch_size_per_gpu,
            save_steps=self.save_steps,
            save_total_limit=self.save_total_limit,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset,
            prediction_loss_only=True,
        )

        trainer.train()

        trainer.save_model(os.path.join(environment_path, "model"))
        self.tokenizer.save_pretrained(
            os.path.join(environment_path, "tokenizer"))
Example #6
def get_dataset(filepath,
                tokenizer,
                block_size,
                line_by_line=False,
                overwrite_cache=False):
    '''
    Load a dataset from the specified filepath.

    :param filepath:
        The filepath of the dataset.
    :param tokenizer:
        The tokenizer to parse the dataset with.
    :param block_size:
        The length of a single input sequence (block).
    :param line_by_line:
        Indicates whether distinct lines of text in the dataset are to be handled as
        separate sequences (i.e. whether to add the BOS and EOS tokens to each line).
        Defaults to False.
    :param overwrite_cache:
        Overwrite the cached training and evaluation sets. Defaults to False.
    :returns:
        A :class:`torch.utils.data.Dataset` object.

    '''

    if line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=filepath,
                                     block_size=block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=filepath,
                           block_size=block_size,
                           overwrite_cache=overwrite_cache)
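A brief, hedged usage sketch of this helper; the checkpoint name and corpus path below are placeholders for illustration, not values from the original repository.

# Illustrative call only; the checkpoint and file path are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
train_dataset = get_dataset("data/corpus_train.txt",
                            tokenizer=tokenizer,
                            block_size=128,
                            line_by_line=True)
print(len(train_dataset))  # number of non-empty lines kept by LineByLineTextDataset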
Example #7
    def _dataset(file_path, ref_path=None):
        if args.line_by_line:
            if ref_path is not None:
                if not args.whole_word_mask or not args.mlm:
                    raise ValueError(
                        "You need to set whole word masking and mlm to True for Chinese Whole Word Mask"
                    )
                return LineByLineWithRefDataset(
                    tokenizer=tokenizer,
                    file_path=file_path,
                    block_size=args.block_size,
                    ref_path=ref_path,
                )

            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
                cache_dir=cache_dir,
            )
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                max_len,
                evaluate=False):

    file_path = args.eval_data_file if evaluate else args.train_data_file

    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size)
    elif args.text_dataset:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
    else:
        """
        For a plain tab-separated text dataset, use nlp.data.TSVDataset.
        To use another kind of dataset, refer to the other classes in nlp.data,
        or set DataTrainingArguments.line_by_line or DataTrainingArguments.text_dataset to True.
        """
        dataset = nlp.data.TSVDataset(file_path,
                                      field_indices=[1],
                                      num_discard_samples=1)

        return Get_dataset(dataset, 0, tokenizer, max_len, True, False)
Example #9
def get_train_data(epoch):
    p = Path('data/raw/oscar') / f'he_dedup-train-{(epoch % 2) + 1}.txt'
    logger.info(f'{transformer_type} training data: {p}')
    return LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=str(p),
        block_size=128,
    )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
def finetune_model(transformers_model_name: str, corpus_file_path: str):
    config = AutoConfig.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')

    tokenizer = AutoTokenizer.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    # tokenizer = RobertaTokenizerFast.from_pretrained(transformers_model_name,force_download=False,cache_dir='../data/download_transformer_models')

    model = AutoModelForMaskedLM.from_pretrained(
        transformers_model_name,
        config=config,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=corpus_file_path,
                                    block_size=512)
    train_set, valid_set = train_test_split(dataset,
                                            test_size=0.25,
                                            random_state=32)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="../data/finetune_transformer_models/",
        logging_dir='../saved/finetune_logging',
        logging_steps=500,
        overwrite_output_dir=True,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        learning_rate=2e-5,
        num_train_epochs=5,
        per_gpu_train_batch_size=4,
        per_gpu_eval_batch_size=32,
        max_grad_norm=5.0,
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=32,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        do_predict=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_set,
        eval_dataset=valid_set,
    )
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    trainer.train()
Example #12
    def _dataset(file_path):
        if args.line_by_line:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
        else:
            return TextDataset(
                tokenizer=tokenizer,
                file_path=file_path,
                block_size=args.block_size,
                overwrite_cache=args.overwrite_cache,
            )
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank
        )
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank,
        )
Example #14
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                model_args: ModelArguments,
                evaluate=False,
                cache_dir=None):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    return LineByLineTextDataset(tokenizer=tokenizer,
                                 file_path=file_path,
                                 block_size=args.block_size,
                                 cache_dir=cache_dir)
Example #15
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer,model_args:ModelArguments, evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if args.mlm_sample_times > 1:
            return FullyLineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, cache_dir=model_args.cache_dir)
    else:
        return TextDataset(
            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, overwrite_cache=args.overwrite_cache
        )
def train():
    dialogues = pd.read_csv('/content/TlkPersonaChatRus/dialogues.tsv',
                            sep='\t')
    for column in dialogues.columns:
        dialogues[column].replace(to_replace=r'<[a-zA-Z0-9_=\/ ]+>',
                                  value=' ',
                                  regex=True,
                                  inplace=True)
    dialogues['dialogue'].replace(
        to_replace=r'Пользователь [12]:|Привет|Здравствуйте|[!)?,]',
        value='',
        regex=True,
        inplace=True)
    dialogues['dialogue'].replace(to_replace=r'\s\s+',
                                  value=' ',
                                  regex=True,
                                  inplace=True)
    dialogues = dialogues['dialogue']
    dialogues.to_csv('./Datasets/dialogues')

    tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
    model = AutoModelWithLMHead.from_pretrained('distilgpt2')
    tokenizer.pad_token = tokenizer.eos_token

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=
        '/content/drive/MyDrive/semester-practice-3rd/Datasets/dialogues.txt',
        block_size=128,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)

    training_args = TrainingArguments(
        output_dir=
        '/content/drive/MyDrive/semester-practice-3rd/Models/distilgpt2',
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_device_train_batch_size=8,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)

    trainer.train()
    trainer.save_model('model/gpt2_chat')
def main():
    model_path, data_path, output_path = set_path()
    bort_tokenizer = BertTokenizer.from_pretrained(model_path)
    seed_everyone(20210409)
    dataset = LineByLineTextDataset(
        tokenizer=bort_tokenizer,
        file_path=data_path,
        block_size=42,
    )
    model = NeZhaForMaskedLM.from_pretrained(model_path)
    data_collator = DataCollatorForLanguageModeling(tokenizer=bort_tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    logging_path = os.path.join(output_path, 'log')
    model_save_path = os.path.join(output_path, 'best_model_ckpt')
    tokenizer_and_configs = os.path.join(output_path, 'tokenizer_and_configs')
    check_path(model_save_path)
    check_path(logging_path)
    check_path(tokenizer_and_configs)

    training_args = TrainingArguments(
        output_dir=output_path,
        overwrite_output_dir=True,
        num_train_epochs=60,  # 60
        learning_rate=6e-5,
        fp16_backend='auto',
        per_device_train_batch_size=128,  # 64
        save_steps=1000,  # 1000
        logging_steps=1000,
        save_total_limit=10,  # 10
        run_name='80',
        logging_dir=logging_path,
        logging_first_step=True,
        dataloader_num_workers=4,
        disable_tqdm=False,
        seed=20200409)

    nezha_bert_trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    nezha_bert_trainer.train()
    nezha_bert_trainer.save_model(model_save_path)
    bort_tokenizer.save_pretrained(tokenizer_and_configs)
Example #18
def get_dataset(
    args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    evaluate: bool = False,
    cache_dir: Optional[str] = None,
):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            overwrite_cache=args.overwrite_cache,
            cache_dir=cache_dir,
        )
Example #19
    def test_lm_tokenizer_without_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # ^ causal lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing on gpt2:
            data_collator.collate_batch(examples)

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
Example #20
def build(config):

    tokenizer = RobertaTokenizerFast.from_pretrained(
                                        os.path.join(config.save_directory),
                                        max_len=config.max_length
                                        )

    model_config = RobertaConfig(
        vocab_size=config.vocab_size,
        max_position_embeddings=config.max_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=config.num_hidden_layers,
        type_vocab_size=1
    )

    model = RobertaForMaskedLM(config=model_config)
    print("number of model parameters: ", model.num_parameters())

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=config.files,
        block_size=32
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )

    training_args = TrainingArguments(
        output_dir=os.path.join(config.save_directory),
        overwrite_output_dir=config.overwrite_output_dir,
        num_train_epochs=config.num_train_epochs,
        per_gpu_train_batch_size=config.per_gpu_train_batch_size,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=config.prediction_loss_only
    )

    return trainer
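Presumably the returned trainer is then driven along these lines; config here is whatever configuration object the surrounding project passes into build(), so this is only a sketch.

# Sketch only: `config` is assumed to expose the attributes used in build() above.
trainer = build(config)
trainer.train()
trainer.save_model(config.save_directory)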
Example #21
def main():
    args = parse_arguments()
    if args.input_model is None:
        model = GPT2LMHeadModel.from_pretrained("antoiloui/belgpt2")
    else:
        print('loading pre trained model')
        model = GPT2LMHeadModel.from_pretrained(args.input_model)

    tokenizer = GPT2Tokenizer.from_pretrained("antoiloui/belgpt2")

    training_args = TrainingArguments(
        output_dir=args.output_dir + '_checkpoint',  # output directory
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=64,  # batch size per device during training
        warmup_steps=100,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs_hyca',  # directory for storing logs
        logging_steps=100,
    )
    special_tokens_dict = {
        'bos_token': '<BOS>',
        'eos_token': '<EOS>',
        'pad_token': '<PAD>'
    }
    tokenizer.add_special_tokens(special_tokens_dict)

    model.resize_token_embeddings(len(tokenizer))
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=args.input_file,
                                    block_size=32)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    if args.input_model is not None:
        trainer.train(resume_from_checkpoint=args.input_model + '_checkpoint')
    else:
        trainer.train()
    model.save_pretrained(args.output_dir)
Example #22
    def test_lm_tokenizer_with_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        # ^ masked lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
Example #23
    def _get_dataset(
        self,
        file_path: str,
        line_by_line: bool,
        block_size: int,
        overwrite_cache: bool,
    ) -> Dataset:
        if line_by_line:
            return LineByLineTextDataset(
                tokenizer=self.tokenizer, file_path=file_path, block_size=block_size
            )
        else:
            return TextDataset(
                tokenizer=self.tokenizer,
                file_path=file_path,
                block_size=block_size,
                overwrite_cache=overwrite_cache,
            )
Example #24
def get_dataset(args: DataTrainingArguments,
                model_name_or_path,
                tokenizer: PreTrainedTokenizer,
                evaluate=False):
    file_path = args.eval_data_file if evaluate else args.train_data_file
    if args.line_by_line:
        if 'dialogpt' in model_name_or_path.lower():
            return LineByLinePersonaChatDataset(tokenizer=tokenizer,
                                                file_path=file_path,
                                                block_size=args.block_size)
        else:
            return LineByLineTextDataset(tokenizer=tokenizer,
                                         file_path=file_path,
                                         block_size=args.block_size)
    else:
        return TextDataset(tokenizer=tokenizer,
                           file_path=file_path,
                           block_size=args.block_size,
                           overwrite_cache=args.overwrite_cache)
def main():
    tokenizer = BertTokenizer.from_pretrained('vocab/bert-base-chinese-vocab.txt')

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/dialogue_lined/multi-sents-further-pretrain/train_test_dialogues.txt",
        block_size=512,
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    
    training_args = TrainingArguments(
        output_dir="model/multi-sents-test-further-pretrained-bert",
        do_train=True,
        warmup_steps=int(100 * (len(dataset) / 32) * 0.1),
        #warmup_steps=10000,
        overwrite_output_dir=True,
        num_train_epochs=100,
        #max_steps=100000,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        save_steps=1000,
        logging_steps=10,
        weight_decay=0.01
    )

    model = BertForMaskedLM.from_pretrained('bert-base-chinese')
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    
    trainer.train()
    
    trainer.save_model('model/multi-sents-test-further-pretrained-bert')
    
    return
Example #26
def main(args):

    # Import the custom trained tokenizer
    tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer)

    # Define the model
    config = RobertaConfig(vocab_size=32000)
    model = RobertaForMaskedLM(config=config)

    # Import the dataset
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path=args.data,
        block_size=128,
    )

    # Initialize the data collector
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

    # Set all of the training arguments
    training_args = TrainingArguments(
        output_dir=args.output,
        overwrite_output_dir=True,
        num_train_epochs=10,
        per_gpu_train_batch_size=24,
        save_steps=10_000,
        save_total_limit=10,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()

    # Save the model
    trainer.save_model("./roBERTaCODE_{}_{}".format(args.language, args.size))
Example #27
def train_mod(txt_dir, tokenizer, model_dir):
    config = RobertaConfig(
        vocab_size=3305,
        # RoBERTa position ids start after the padding index, so the position
        # embedding table must hold at least block_size + 2 entries; 1026 covers
        # the block_size=1024 used below, whereas 1024 would overflow on
        # full-length lines.
        max_position_embeddings=1026,
        num_attention_heads=12,
        num_hidden_layers=6,
        output_attentions=True,
        type_vocab_size=1,
    )

    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=txt_dir,
                                    block_size=1024)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=model_dir,
        overwrite_output_dir=True,
        num_train_epochs=1000,
        per_gpu_train_batch_size=16,
        save_steps=1000,
        save_total_limit=37,
        prediction_loss_only=True,
    )

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset)

    trainer.train()

    trainer.save_model(model_dir)
def get_dataset(args: DataTrainingArguments,
                tokenizer: PreTrainedTokenizer,
                inline_meta: str = None,
                local_rank=-1):
    file_path = args.eval_data_file
    if args.webtext:
        return WebTextPretokenizedDataset(tokenizer=tokenizer,
                                          file_path=file_path,
                                          block_size=args.block_size,
                                          inline_meta=inline_meta,
                                          local_rank=local_rank)
    elif args.line_by_line:
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=args.block_size,
                                     local_rank=local_rank)
    else:
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=args.block_size,
            local_rank=local_rank,
        )
Example #29
def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", default=None, type=str, required=True, help="Configuration file")
    args = parser.parse_args()

    config = TrainModelConfig.load(args.config_file)
    logger = config.logger()  # noqa: F841

    dataset = LineByLineTextDataset(
        tokenizer=config.tokenizer, file_path=config.file_path, block_size=config.block_size
    )

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=config.tokenizer, mlm=True, mlm_probability=config.mlm_probability
    )

    training_args = TrainingArguments(
        output_dir=config.saving_folder,
        overwrite_output_dir=True,
        num_train_epochs=config.epochs,
        per_gpu_train_batch_size=config.batch_size_per_gpu,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit,
    )

    trainer = Trainer(
        model=config.model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )

    trainer.train()

    trainer.save_model(config.saving_folder)
    config.tokenizer.save_pretrained(config.saving_folder)
Example #30
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False


print('===========================')
print('The model has: ', count_parameters(model))
print('===========================')

file_path = 'multi-label_train.csv.txt'


dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=128)
#dataset = load_dataset("./csv_for_ft_new.py", data_files=file_path)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)


dir = str(args.resultpath) + str(args.data) + '_' + str(args.LM) + '_e20' + '_b' + str(args.batch_size)

training_args = TrainingArguments(
    do_train=True,
    do_predict=True,
    output_dir=dir,
    overwrite_output_dir=True,
    num_train_epochs=args.num_train_epochs,
    per_device_train_batch_size=args.batch_size,
    save_steps=10000,
    save_total_limit=2,