Example #1
    def create_tokenizer(self):
        # Reuse the saved tokenizer if one has already been trained
        # for this dataset.
        if self.isComputed():
            logger.info("Tokenizer for this dataset has already been created")
            self.tokenizer = RobertaTokenizerFast.from_pretrained(
                f"{self.data_dir}", max_len=512)
            return

        logger.info(f"Training tokenizer on data in {self.data_dir}")

        # Train a new tokenizer, then push its vocabulary files
        # to Azure storage.
        self.train()
        azure_storage.upload(self.data_dir / "vocab.json")
        azure_storage.upload(self.data_dir / "merges.txt")
Example #2
    def upload(self):
        # Recursively collect every path under the model directory
        # (glob("**/*") yields subdirectories as well as files).
        paths = [str(x) for x in Path(self.model_dir).glob("**/*")]

        for file in paths:
            azure_storage.upload(file)
Example #3
    def upload(self):
        # Upload the single data file to Azure storage.
        azure_storage.upload(self.data_dir / self.file_name)
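
None of the examples show the shared azure_storage module itself. As a rough sketch only: a minimal upload helper could be built on the azure-storage-blob SDK along these lines. The AZURE_STORAGE_CONNECTION_STRING environment variable and the "datasets" container name are assumptions for illustration, not part of the original code.

import os
from pathlib import Path

from azure.storage.blob import BlobServiceClient


def upload(path, container_name="datasets"):
    # Hypothetical helper: the connection-string variable and the
    # container name are assumptions, not taken from the examples above.
    path = Path(path)
    service = BlobServiceClient.from_connection_string(
        os.environ["AZURE_STORAGE_CONNECTION_STRING"])
    container = service.get_container_client(container_name)
    # Upload the file under its own name, overwriting any existing blob.
    with path.open("rb") as data:
        container.upload_blob(name=path.name, data=data, overwrite=True)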