def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
            "token_type_ids": tf.convert_to_tensor([0, 1, 2, 3, 4]),
            "sentence_order_label": i,
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        return_tensors="tf")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 5])
        self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 5])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 5])
        self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8,
                                                        return_tensors="tf")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape.as_list(), [2, 8])
        self.assertEqual(batch["token_type_ids"].shape.as_list(), [2, 8])
        self.assertEqual(batch["labels"].shape.as_list(), [2, 8])
        self.assertEqual(batch["sentence_order_label"].shape.as_list(), [2])
Example #2
    def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": torch.tensor([0, 1, 2, 3, 4]),
            "token_type_ids": torch.tensor([0, 1, 2, 3, 4]),
            "sentence_order_label": i,
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size(
            (2, )))

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8)
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size(
            (2, )))
    def test_nsp(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [{
            "input_ids": [0, 1, 2, 3, 4],
            "token_type_ids": [0, 1, 2, 3, 4],
            "next_sentence_label": i
        } for i in range(2)]
        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        return_tensors="np")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, (2, 5))
        self.assertEqual(batch["token_type_ids"].shape, (2, 5))
        self.assertEqual(batch["labels"].shape, (2, 5))
        self.assertEqual(batch["next_sentence_label"].shape, (2, ))

        data_collator = DataCollatorForLanguageModeling(tokenizer,
                                                        pad_to_multiple_of=8,
                                                        return_tensors="np")
        batch = data_collator(features)

        self.assertEqual(batch["input_ids"].shape, (2, 8))
        self.assertEqual(batch["token_type_ids"].shape, (2, 8))
        self.assertEqual(batch["labels"].shape, (2, 8))
        self.assertEqual(batch["next_sentence_label"].shape, (2, ))
Example #4
    def __init__(self,
                 file_path,
                 sets,
                 bucket_size,
                 max_timestep=0,
                 drop=False,
                 acoustic_config=None,
                 semantic_config=None,
                 tokenizer=None):
        super().__init__(file_path, sets, bucket_size, max_timestep, drop)
        self.acoustic_config = acoustic_config
        self.semantic_config = semantic_config
        self.tokenizer = tokenizer
        self.mlm_collater = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
        self.sample_step = 0

        X_a = self.table['file_path'].tolist()
        X_lens = self.table['length'].tolist()
        X_t = self.table['align_path'].tolist()
        # Use bucketing to allow different batch size at run time
        self.X_a, self.X_t = [], []
        batch_x_a, batch_len, batch_x_t = [], [], []

        for x_a, x_len, x_t in zip(X_a, X_lens, X_t):
            batch_x_a.append(x_a)
            batch_len.append(x_len)
            batch_x_t.append(x_t)

            # Fill in batch_x until batch is full
            if len(batch_x_a) == bucket_size:
                # Half the batch size if seq too long
                if (bucket_size >= 2) and (max(batch_len) > HALF_BATCHSIZE_TIME
                                           ) and self.sample_step == 0:
                    self.X_a.append(batch_x_a[:bucket_size // 2])
                    self.X_a.append(batch_x_a[bucket_size // 2:])
                    self.X_t.append(batch_x_t[:bucket_size // 2])
                    self.X_t.append(batch_x_t[bucket_size // 2:])
                else:
                    self.X_a.append(batch_x_a)
                    self.X_t.append(batch_x_t)
                batch_x_a, batch_len, batch_x_t = [], [], []

        # Gather the last batch
        if len(batch_x_a) > 1:
            self.X_a.append(batch_x_a)
            self.X_t.append(batch_x_t)

        assert len(self.X_a) == len(self.X_t)
Example #5
    def test_data_collator_for_language_modeling(self):
        tokenizer = BertTokenizer(self.vocab_file)
        no_pad_features = [{
            "input_ids": list(range(10))
        }, {
            "input_ids": list(range(10))
        }]
        pad_features = [{
            "input_ids": list(range(5))
        }, {
            "input_ids": list(range(10))
        }]

        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        batch = data_collator(pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        tokenizer._pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing
            data_collator(pad_features)

        set_seed(42)  # For reproducibility
        tokenizer = BertTokenizer(self.vocab_file)
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(
            all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

        batch = data_collator(pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(
            all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
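
The assertions above rely on the convention that every non-masked position is labeled -100. That value works because it is the default ignore_index of PyTorch's cross-entropy loss, so only the masked tokens contribute to the MLM loss. A minimal standalone check:

import torch
import torch.nn.functional as F

logits = torch.randn(1, 4, 10)               # (batch, seq_len, vocab_size)
labels = torch.tensor([[-100, 3, -100, 7]])  # only positions 1 and 3 were masked
loss = F.cross_entropy(logits.view(-1, 10), labels.view(-1), ignore_index=-100)
print(loss)  # averaged over the two labeled positions only
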
Example #6
    def __init__(self, config: Munch):
        self.config = config

        # ---- Neuron ----
        self.neuron = Neuron(self.config)

        # ---- Model ----
        self.model = BertMLMSynapse(self.config)

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(),
                                         lr=self.config.session.learning_rate,
                                         momentum=self.config.session.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(
            self.optimizer, 50, 300)

        # ---- Dataset ----
        # Dataset: 74 million sentences pulled from books.
        self.dataset = load_dataset('bookcorpus')['train']
        # The collator accepts a list of dicts (e.g. {'input_ids': ...}) produced by
        # the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=bittensor.__tokenizer__(),
            mlm=True,
            mlm_probability=0.15)

        # ---- Logging ----
        self.tensorboard = SummaryWriter(log_dir=self.config.session.full_path)
        if self.config.session.record_log:
            logger.add(
                self.config.session.full_path + "/{}_{}.log".format(
                    self.config.session.name, self.config.session.trial_uid),
                format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}")
def create_trainer(tokenizer, model):
    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="data/processed/recipes_train.txt",
        block_size=256,
    )
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )
    training_args = TrainingArguments(
        output_dir="./artifacts",
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_gpu_train_batch_size=128,
        save_steps=100_000_000,
        save_total_limit=2,
        fp16=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )
    return trainer
Example #8
    def __init__(self, config: Munch = None, **kwargs):
        if config is None:
            config = Miner.default_config()
        bittensor.config.Config.update_with_kwargs(config.miner, kwargs)
        Miner.check_config(config)
        self.config = config

        # ---- Model ----
        self.model = BertMLMSynapse( self.config )

        # ---- Optimizer ----
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.config.miner.learning_rate, momentum=self.config.miner.momentum)
        self.scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, 50, 300)

        # ---- Model Load/Save tools ----
        self.model_toolbox = ModelToolbox(BertMLMSynapse, torch.optim.SGD)

        # ---- Dataset ----
        # Dataset: AG News articles.
        self.dataset = load_dataset('ag_news')['train']
        # The collator accepts a list of dicts (e.g. {'input_ids': ...}) produced by
        # the tokenizer.
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=bittensor.__tokenizer__(), mlm=True, mlm_probability=0.15
        )
        super( Miner, self ).__init__( self.config, **kwargs )
Example #9
    def collate_fn(self) -> Callable:
        if self.cfg.wwm:
            return DataCollatorForWholeWordMask(
                self.tokenizer, mlm_probability=self.cfg.mlm_probability)
        else:
            return DataCollatorForLanguageModeling(
                self.tokenizer, mlm_probability=self.cfg.mlm_probability)
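
The cfg.wwm flag above only decides which collator class gets constructed; both classes are ordinary callables that plug into a torch DataLoader as collate_fn. A self-contained sketch of that wiring (the make_collator helper and the bert-base-uncased checkpoint are illustrative assumptions, not part of the original code):

from torch.utils.data import DataLoader
from transformers import (AutoTokenizer, DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def make_collator(wwm: bool, mlm_probability: float = 0.15):
    # Same choice the method above makes: whole-word masking vs. per-token masking.
    cls = DataCollatorForWholeWordMask if wwm else DataCollatorForLanguageModeling
    return cls(tokenizer, mlm_probability=mlm_probability)

features = [tokenizer("whole word masking example") for _ in range(4)]
loader = DataLoader(features, batch_size=2, collate_fn=make_collator(wwm=True))
batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"].shape)
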
Example #10
    def execute(self, environment_path: str) -> None:
        dataset = LineByLineTextDataset(tokenizer=self.tokenizer,
                                        file_path=self.file_path,
                                        block_size=self.block_size)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=self.mlm_probability)

        training_args = TrainingArguments(
            output_dir=os.path.join(environment_path, "temp"),
            overwrite_output_dir=True,
            num_train_epochs=self.epochs,
            per_gpu_train_batch_size=self.batch_size_per_gpu,
            save_steps=self.save_steps,
            save_total_limit=self.save_total_limit,
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=dataset,
            prediction_loss_only=True,
        )

        trainer.train()

        trainer.save_model(os.path.join(environment_path, "model"))
        self.tokenizer.save_pretrained(
            os.path.join(environment_path, "tokenizer"))
Example #11
    def __init__(self,
                 data_dir: Path,
                 tokenizer: PreTrainedTokenizer,
                 dataset: Dataset,
                 local_rank=-1):
        assert data_dir, "data_dir input needed"

        self.model_dir = f"{data_dir}/results"
        self.dataset = dataset

        self.config = RobertaConfig(
            vocab_size=52_000,
            max_position_embeddings=514,
            num_attention_heads=12,
            num_hidden_layers=6,
            type_vocab_size=1,
        )
        self.training_args = TrainingArguments(
            run_name=data_dir.name,
            local_rank=local_rank,
            learning_rate=0.00005,  # default 0.00005
            output_dir=f"{self.model_dir}",
            overwrite_output_dir=False,
            num_train_epochs=1,
            per_device_train_batch_size=48,  # Nvidia K80 99%
            seed=42,
            save_steps=10_000,
            save_total_limit=1,
        )

        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
Example #12
def get_dataloaders(model, tokenizer, batch_size, train_path, eval_path):
    block_size = 1024
    train_dataset = TextDataset(tokenizer=tokenizer,
                                file_path=train_path,
                                block_size=block_size)
    test_dataset = TextDataset(tokenizer=tokenizer,
                               file_path=eval_path,
                               block_size=block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False,
                                                    mlm_probability=0.15)
    trainloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    testloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=data_collator,
        drop_last=False,
        num_workers=0,
    )
    return trainloader, testloader
Example #13
    def train(self,
              num_epochs=500,
              batch_size=32,
              save_total_limit=2,
              save_steps=500,
              logging_steps=100):
        training_args = TrainingArguments(
            output_dir=f"./saved/{self.model_name}",
            overwrite_output_dir=True,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            save_steps=save_steps,
            save_total_limit=save_total_limit,
            prediction_loss_only=True,
            logging_steps=logging_steps)

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=True, mlm_probability=MASK_PROB)
        card_trainset = CardDataset(self.train_path,
                                    self.tokenizer,
                                    to_tensor=True)
        trainer = Trainer(model=self.model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=card_trainset)

        self.tokenizer.save_pretrained(f"./saved/{self.model_name}")
        trainer.train()
        trainer.save_model(f"./saved/{self.model_name}")
Example #14
def train(model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer,
          train_batch_size: int,
          eval_batch_size: int,
          mlm=False):
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=mlm,
                                                    mlm_probability=0.15)
    args = TrainingArguments(output_dir="./results")  # output_dir is required; placeholder path
    args.per_gpu_train_batch_size = train_batch_size
    args.per_gpu_eval_batch_size = eval_batch_size
    args.per_device_train_batch_size = train_batch_size
    train_dataset, eval_dataset = dataset.get_dataset(tokenizer)

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    trainer.train(model_path=None)
    trainer.save_model()
Example #15
    def __init__(self, opts):
        # Command line arguments
        self.opts = opts

        # Load model and tokenizer
        config = AutoConfig.from_pretrained(opts.ckpt_file)
        self.tokenizer = AutoTokenizer.from_pretrained(opts.ckpt_file)
        self.model = AutoModelWithLMHead.from_pretrained(opts.ckpt_file,
                                                         config=config)
        self.model.resize_token_embeddings(len(self.tokenizer))

        # Load training arguments
        if opts.mode == 'train' or opts.mode == 'eval':
            self.training_args = TrainingArguments(
                output_dir=opts.output_dir,
                logging_dir=opts.output_dir,
                no_cuda=True,  # the original set device='cpu' and n_gpu=0 by hand
                num_train_epochs=opts.num_epochs,
                learning_rate=opts.learning_rate,
                per_device_train_batch_size=opts.batch_size,
                per_device_eval_batch_size=opts.batch_size,
            )

        # Load dataset
        if opts.mode == 'train' or opts.mode == 'eval':
            self.dataset = LineByLineTextDataset(  # TextDataset
                tokenizer=self.tokenizer,
                file_path=opts.text_file,
                block_size=self.tokenizer.model_max_length)  # max_len is deprecated
            self.data_collator = DataCollatorForLanguageModeling(
                tokenizer=self.tokenizer, mlm=False)
Example #16
def get_trainer(model_file, train_file, valid_file, model_output_dir):

    model = AutoModelForMaskedLM.from_pretrained(model_file)
    tokenizer = AutoTokenizer.from_pretrained(model_file, do_lower_case=True, use_fast=False)
    lm_datasets = create_dataset(tokenizer, train_file, valid_file)

    print(tokenizer.decode(lm_datasets["train"][1]["input_ids"]))

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=model_output_dir,          # output directory
        overwrite_output_dir=True,
        num_train_epochs=3,              # total # of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir=path + 'logs',       # directory for storing logs; `path` is assumed to be a module-level setting
    )

    trainer = Trainer(
        model=model,                         # the instantiated 🤗 Transformers model to be trained
        args=training_args,                  # training arguments, defined above
        train_dataset=lm_datasets["train"],
        eval_dataset=lm_datasets["validation"],        # evaluation dataset
        data_collator=data_collator,
    )

    return trainer
Example #17
    def train(self, epochs, lr=5e-5, batch_size=1):
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm=False)
        training_args = TrainingArguments(
            output_dir="./output-lm",
            no_cuda=(self.device != torch.device("cuda")),
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            # save_steps=10,
            save_total_limit=1,
            learning_rate=lr,
            evaluation_strategy="epoch",
            logging_steps=float("inf"),
            prediction_loss_only=False,
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=self.train_dataset,
            eval_dataset=self.val_dataset,
        )
        self.trainer.train()
        return self.trainer
Example #18
    def __init__(self, evaluate: bool = False):
        self.model_args = ModelArguments(model_name_or_path=None, model_type='bert', tokenizer_name='models/danbert-small/vocab.json', config_name="models/danbert-small/config.json")
        self.data_args = DataTrainingArguments(train_data_file="handler/datadir/da-train.txt", eval_data_file="handler/datadir/da-eval.txt", labels="handler/datadir/labels.txt", mlm=True, line_by_line=True)
        self.training_args = TrainingArguments(output_dir="models/danbert-small", num_train_epochs=3, per_gpu_eval_batch_size=8, save_steps=750, seed=42, learning_rate=1e-4, save_total_limit=2)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO if self.training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            self.training_args.local_rank,
            self.training_args.device,
            self.training_args.n_gpu,
            bool(self.training_args.local_rank != -1),
            self.training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", self.training_args)

        set_seed(self.training_args.seed)

        config = CONFIG_MAPPING[self.model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

        tokenizer = AutoTokenizer.from_pretrained('models/danbert-small')

        model = AutoModelWithLMHead.from_config(config)

        # model.resize_token_embeddings(len(tokenizer))

        if self.data_args.block_size <= 0:
            self.data_args.block_size = 512
        # Our input block size will be the max possible for the model
        else:
            self.data_args.block_size = min(self.data_args.block_size, 512)

        train_dataset = (
            self.get_dataset(self.data_args, tokenizer=tokenizer, local_rank=self.training_args.local_rank, evaluate=True)
        )

        eval_dataset = (
            self.get_dataset(self.data_args, tokenizer=tokenizer, local_rank=self.training_args.local_rank, evaluate=True)
        )

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=self.data_args.mlm, mlm_probability=self.data_args.mlm_probability)

        trainer = Trainer(
            model=model,
            args=self.training_args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            # eval_dataset=eval_dataset,
            prediction_loss_only=True,
        )

        trainer.train(model_path=None)
        trainer.save_model()
Example #19
def main():
    print("PREPROCESSING DATA")
    preprocess()
    print("LOADING TOKENIZER")
    tokenizer = get_tokenizer()
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=False)
    print("LOADING MODEL", cfg('model'))
    model = get_model(tokenizer)

    print("LOADING DATA")
    if cfg('encoding') == 'LBL':
        train_dataset = LBLDataset(tokenizer=tokenizer,
                                   file_path=filename('train'))
    elif cfg('encoding') == 'blocked':
        train_dataset = BlockedDataset(tokenizer=tokenizer,
                                       file_path=filename('train'))
    elif cfg('encoding') == 'text':
        train_dataset = TextDataset(tokenizer=tokenizer,
                                    file_path=filename('train'),
                                    block_size=cfg('max_block'))
    elif cfg('encoding').startswith('inter'):
        if cfg('encoding').endswith('LBL'):
            loader = LBLDataset
        elif cfg('encoding').endswith('blocked'):
            loader = BlockedDataset

        d1 = loader(tokenizer=tokenizer, file_path=filename('train'))
        d2 = loader(tokenizer=tokenizer, file_path=filename('dirty'))
        train_dataset = CombinedDataset(d1, d2)
    else:
        raise ValueError("Unkown encoding")

    trainer = get_trainer(train_dataset, data_collator, model)

    def validator(x, y):
        global BEST_metric
        model.save_pretrained(session)
        metric, pred = validate(model, tokenizer, x, y)
        if np.mean(metric) > BEST_metric:
            print("NEW BEST (saving)")
            BEST_metric = np.mean(metric)

        # save predictions and model
        save(session + "metric.txt", str(metric) + "\n")
        save(session + "pred.txt", str(pred) + "\n\n")
        return metric, pred

    trainer.validator = validator
    trainer.val_dataset = get_validation_data()

    # saving configuration
    print("SAVING...")
    session = get_session_path()
    print(session)
    save(session + "conf.txt", repr(cfg()))

    print("STARTING TRAINING...")
    trainer.train()
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path=None):
    # Train from scratch if model_path is None.
    def _dataset(file_path):
        return LineByLineTextDataset(tokenizer=tokenizer,
                                     file_path=file_path,
                                     block_size=tokenizer.model_max_length)

    val_dataset = _dataset(args.val_datapath)
    if eval_only:
        print("Assign validation dataset")
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
        )

        train_dataset = ConcatDataset([
            _dataset(f)
            for f in glob.glob('./Preprocessed_Data/splited_train/*')
        ])

    print("Creating data collator with mlm")
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    print("Start Trainer")
    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    if not eval_only:
        trainer.train(model_path=model_path)  # None train from scratch
        trainer.save_model(args.output_dir)  # save model to the output_dir

    # Evaluation
    results = {}

    logger.info("*** Evaluate ***")

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']

    perplexity = math.exp(eval_loss)
    results["perplexity"] = perplexity
    results["bpc"] = eval_loss / math.log(2)

    output_eval_file = os.path.join(args.output_dir,
                                    "eval_results_mlm.txt")
    with open(output_eval_file, "a") as writer:
        writer.write("***** Eval results *****\n")
        logger.info("***** Eval results *****")
        for key, value in results.items():
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")
Example #21
    def run(self):
        result_folder = luigi.configuration.get_config().get(
            'GlobalConfig', 'result_folder')
        model = GPT2LMHeadModel.from_pretrained("distilgpt2")
        tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

        train_dataset = TextDataset(tokenizer,
                                    self.input()['train'].path,
                                    block_size=self.block_size)
        test_dataset = TextDataset(tokenizer,
                                   self.input()['test'].path,
                                   block_size=self.block_size)

        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                        mlm=False)

        training_args = TrainingArguments(
            do_eval=self.do_eval,
            do_train=self.do_train,
            eval_steps=self.eval_steps,
            evaluate_during_training=self.evaluate_during_training,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            logging_dir='./logs',
            logging_steps=self.logging_steps,
            learning_rate=self.learning_rate,
            max_grad_norm=self.max_grad_norm,
            num_train_epochs=self.num_train_epochs,
            output_dir=result_folder,
            overwrite_output_dir=True,
            per_device_train_batch_size=self.per_device_train_batch_size,
            per_device_eval_batch_size=self.per_device_eval_batch_size,
            save_steps=self.save_steps,
            seed=self.seed,
            warmup_steps=self.warmup_steps,
            weight_decay=self.weight_decay,
        )

        trainer = Trainer(model=model,
                          args=training_args,
                          data_collator=data_collator,
                          train_dataset=train_dataset,
                          eval_dataset=test_dataset)

        trainer.train()

        trainer.save_model()
        tokenizer.save_pretrained(result_folder)

        wandb_disabled = os.environ.get('WANDB_DISABLED', False)

        if wandb_disabled:
            run_name = time.strftime('%Y%m%d-%H%M%S')
        else:
            wandb.run.save()
            wandb.join()
            run_name = wandb.run.name

        with open(self.output()['run_name'].path, 'w') as f:
            f.write(run_name)
Example #22
    def test_lm_tokenizer_without_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        # ^ causal lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        with self.assertRaises(ValueError):
            # Expect error due to padding token missing on gpt2:
            data_collator.collate_batch(examples)

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 512)))
Example #23
    def __init__(self, config, mode, *args, **params):
        self.max_len = config.getint("train", "max_len")
        self.mode = mode
        self.tokenizer = AutoTokenizer.from_pretrained(
            "hfl/chinese-roberta-wwm-ext")
        self.mlm_prob = config.getfloat("train", "mlm_prob")
        self.data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer, mlm_probability=self.mlm_prob)
Example #24
def get_data_collator(
        tokenizer: BertTokenizer,
        mlm: bool = True,
        mlm_prob: float = 0.15) -> DataCollatorForLanguageModeling:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=mlm,
                                                    mlm_probability=mlm_prob)
    return data_collator
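
A short usage example for the factory above, assuming the public bert-base-uncased checkpoint in place of a project-specific vocabulary file:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
collator = get_data_collator(tokenizer, mlm=True, mlm_prob=0.15)

batch = collator([tokenizer("a short example sentence"), tokenizer("another one")])
print(batch["input_ids"].shape)  # (2, padded_length)
print(batch["labels"].shape)     # same shape; -100 everywhere except masked positions
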
Example #25
    def _collate(self, batch_examples: List) -> Dict:

        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.dataset_reader.encoder.tokenizer,
            mlm=True,
            mlm_probability=0.15)

        return data_collator(batch_examples)
Example #26
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):
    if tokenizer.model_max_length > 1e8:
        val_dataset = TextDataset(tokenizer=tokenizer,
                                  file_path=args.val_datapath,
                                  block_size=512)
        logger.info(
            f'[WARNING] tokenizer.model_max_length > 10^8: {tokenizer.model_max_length} setting the value as 512 instead.'
        )
    else:
        val_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=args.val_datapath,
            block_size=tokenizer.model_max_length
        )  #  The `max_len` attribute has been deprecated

    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
        )
        if tokenizer.model_max_length > 1e8:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=512)
            logger.info(
                f'[WARNING] tokenizer.model_max_length > 10^8: {tokenizer.model_max_length} setting the value as 512 instead.'
            )
        else:
            train_dataset = TextDataset(tokenizer=tokenizer,
                                        file_path=args.train_datapath,
                                        block_size=tokenizer.model_max_length)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    #pdb.set_trace()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss/math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss/math.log(2)}')
Example #27
def pretrain_and_evaluate(training_args, dataset_args, model, tokenizer, eval_only):
    """
    # adapted from https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=N8J-TLhBuaOf
    :param training_args: HF training args object
    :param dataset_args: object storing dataset config, requires train_datapath and val_datapath to be defined
    :param model: transformers.PreTrainedModel
    :param tokenizer: PreTrainedTokenizerBase
    :param eval_only: boolean, True only performs evaluation
    :return:
    """

    val_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=dataset_args.val_datapath,
        block_size=tokenizer.model_max_length,
    )
    if eval_only:
        train_dataset = val_dataset
    else:
        logging.info(
            f"Loading and tokenizing training data is usually slow: {dataset_args.train_datapath}"
        )
        train_dataset = TextDataset(
            tokenizer=tokenizer,
            file_path=dataset_args.train_datapath,
            block_size=tokenizer.model_max_length,
        )

    # https://github.com/huggingface/transformers/blob/master/src/transformers/data/data_collator.py
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=True, mlm_probability=0.15
    )

    # https://huggingface.co/transformers/_modules/transformers/trainer.html
    trainer = Trainer_(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    metrics = trainer.evaluate()
    # eval_loss = metrics["eval_loss"]
    # logging.info(f"Initial eval bpc: {eval_loss / math.log(2)}")
    logging.info(f"Initial metrics: {metrics}")

    if not eval_only:
        # to change if we want to continue training existing models
        # same path as from_checkpoint argument from the builder
        trainer.train(model_path=None)

        trainer.save_model()

        metrics = trainer.evaluate()
        eval_loss = metrics["eval_loss"]
        logging.info(f"Eval bpc after pretraining: {eval_loss / math.log(2)}")
Example #28
def pretrain_and_evaluate(args, model, tokenizer, eval_only, model_path):

    val_dataset, train_dataset = None, None
    for val_datapath in args.val_datapath:
        if not val_dataset:
            val_dataset = CustomIterableDataset(
                tokenizer=tokenizer,
                file_path=val_datapath,
                block_size=model.config.max_position_embeddings)
        else:
            val_dataset += CustomIterableDataset(
                tokenizer=tokenizer,
                file_path=val_datapath,
                block_size=model.config.max_position_embeddings)
    if eval_only:
        train_dataset = val_dataset
    else:
        logger.info(
            f'Loading and tokenizing training data is usually slow: {args.train_datapath}'
        )
        for train_datapath in args.train_datapath:
            if not train_dataset:
                train_dataset = CustomIterableDataset(
                    tokenizer=tokenizer,
                    file_path=train_datapath,
                    block_size=model.config.max_position_embeddings,
                )
            else:
                train_dataset += CustomIterableDataset(
                    tokenizer=tokenizer,
                    file_path=train_datapath,
                    block_size=model.config.max_position_embeddings,
                )
        # train_dataset = val_dataset

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    trainer = CustomTrainer(
        model=model,
        args=args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        prediction_loss_only=True,
    )

    eval_loss = trainer.evaluate()
    eval_loss = eval_loss['eval_loss']
    logger.info(f'Initial eval bpc: {eval_loss / math.log(2)}')

    if not eval_only:
        trainer.train(model_path=model_path)
        trainer.save_model()

        eval_loss = trainer.evaluate()
        eval_loss = eval_loss['eval_loss']
        logger.info(f'Eval bpc after pretraining: {eval_loss / math.log(2)}')
Example #29
    def test_lm_tokenizer_with_padding(self):
        tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
        data_collator = DataCollatorForLanguageModeling(tokenizer)
        # ^ masked lm

        dataset = LineByLineTextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((31, 107)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((31, 107)))

        dataset = TextDataset(tokenizer, file_path=PATH_SAMPLE_TEXT, block_size=512, overwrite_cache=True)
        examples = [dataset[i] for i in range(len(dataset))]
        batch = data_collator.collate_batch(examples)
        self.assertIsInstance(batch, dict)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 512)))
        self.assertEqual(batch["masked_lm_labels"].shape, torch.Size((2, 512)))
def finetune_model(transformers_model_name: str, corpus_file_path: str):
    config = AutoConfig.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')

    tokenizer = AutoTokenizer.from_pretrained(
        transformers_model_name,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    # tokenizer = RobertaTokenizerFast.from_pretrained(transformers_model_name,force_download=False,cache_dir='../data/download_transformer_models')

    model = AutoModelForMaskedLM.from_pretrained(
        transformers_model_name,
        config=config,
        force_download=False,
        cache_dir='../data/download_transformer_models')
    dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                    file_path=corpus_file_path,
                                    block_size=512)
    train_set, valid_set = train_test_split(dataset,
                                            test_size=0.25,
                                            random_state=32)

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir="../data/finetune_transformer_models/",
        logging_dir='../saved/finetune_logging',
        logging_steps=500,
        overwrite_output_dir=True,
        weight_decay=0.01,
        adam_epsilon=1e-6,
        learning_rate=2e-5,
        num_train_epochs=5,
        per_gpu_train_batch_size=4,
        per_gpu_eval_batch_size=32,
        max_grad_norm=5.0,
        save_steps=1000,
        save_total_limit=2,
        gradient_accumulation_steps=32,
        evaluate_during_training=True,
        do_train=True,
        do_eval=True,
        do_predict=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_set,
        eval_dataset=valid_set,
    )
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    trainer.train()