Example #1
    def test_instantiate(self):
        processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
        assert processor is not None
        assert isinstance(processor, PostProcessor)
        assert isinstance(processor, BertProcessing)
        assert isinstance(
            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))),
            BertProcessing,
        )
Example #2
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)
    '''create tokenizers'''

    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])
    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
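The get_config() helper is not shown on this page; a minimal sketch of the dictionary it would need to return, based only on the keys that the initialize_model() examples here read from it, might look like the following (every value and file path below is an illustrative assumption, not taken from the original project):

def get_config():
    # Keys mirror the lookups in the initialize_model() examples;
    # the numbers and file names are placeholders.
    return {
        'embedding_size': 512,
        'src_pad_idx': 0,
        'num_heads': 8,
        'num_encoder_layers': 3,
        'num_decoder_layers': 3,
        'forward_expansion': 4,
        'dropout': 0.1,
        'max_len': 128,
        'pretrained_model': 'checkpoints/model.pt',              # placeholder checkpoint path
        'batch_size': 32,
        'num_of_workers': 2,
        'learning_rate': 3e-4,
        'train_data_percentage': 0.9,
        'val_data_percentage': 0.1,
        'complex_sentences_file': 'data/complex_sentences.txt',  # placeholder
        'simple_sentences_file': 'data/simple_sentences.txt',    # placeholder
    }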
Example #3
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./model/bbpe/vocab.json",
            "./model/bbpe/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*_eval.csv") if evaluate else Path(
            "./data/").glob("*_train.csv")  # training-split file pattern assumed
        for src_file in src_files:
            print("🔥", src_file)
            with open(src_file, 'r', encoding='utf-8') as f:
                for index, line in enumerate(f):
                    # encode_batch expects a list of inputs, so wrap the line
                    self.examples += [
                        x.ids for x in tokenizer.encode_batch([line])
                    ]
                    if index % 10000 == 0:
                        print(src_file, index // 10000)
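The comment "or use the RobertaTokenizer from transformers directly" in the example above can be followed literally; a minimal sketch of that alternative, reusing the ./model/bbpe/ files and the 512-token limit from the example (everything else is an assumption), would be:

from transformers import RobertaTokenizer

# RobertaTokenizer applies the RoBERTa-style <s> ... </s> special tokens itself,
# so no manual BertProcessing wiring is needed.
roberta_tokenizer = RobertaTokenizer(
    vocab_file="./model/bbpe/vocab.json",
    merges_file="./model/bbpe/merges.txt",
)
encoding = roberta_tokenizer("some example text", truncation=True, max_length=512)
print(encoding["input_ids"])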
Example #4
    def __init__(self, t: PreTrainedTokenizer, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        logger.info("Creating features from dataset file at %s", file_path)
        
        # -------------------------- CHANGES START
        bert_tokenizer = os.path.join(args.tokenizer_name, "vocab.txt")
        if os.path.exists(bert_tokenizer):
            logger.info("Loading BERT tokenizer")
            from tokenizers import BertWordPieceTokenizer
            tokenizer = BertWordPieceTokenizer(os.path.join(args.tokenizer_name, "vocab.txt"), handle_chinese_chars=False, lowercase=False)
            tokenizer.enable_truncation(512)
        else:
            from tokenizers import ByteLevelBPETokenizer
            from tokenizers.processors import BertProcessing
            logger.info("Loading RoBERTa tokenizer")
            
            tokenizer = ByteLevelBPETokenizer(
                os.path.join(args.tokenizer_name, "vocab.json"),
                os.path.join(args.tokenizer_name, "merges.txt")
            )
            tokenizer._tokenizer.post_processor = BertProcessing(
                ("</s>", tokenizer.token_to_id("</s>")),
                ("<s>", tokenizer.token_to_id("<s>")),
            )
            tokenizer.enable_truncation(max_length=512)

        logger.info("Reading file %s", file_path)
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

        logger.info("Running tokenization")
        self.examples = tokenizer.encode_batch(lines)
Example #5
def get_tokenizer(path):
    tokenizer = ByteLevelBPETokenizer(path + 'vocab.json', path + 'merges.txt')
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return tokenizer
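A quick sanity check of what the returned tokenizer does (assuming a hypothetical directory my_tokenizer/ whose vocabulary actually contains <s> and </s>):

tok = get_tokenizer("my_tokenizer/")          # hypothetical path
enc = tok.encode("hello world")
# BertProcessing puts the "cls" token first and the "sep" token last,
# so the sequence is wrapped as <s> ... </s>.
assert enc.tokens[0] == "<s>"
assert enc.tokens[-1] == "</s>"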
Example #6
    def __init__(self,
                 tokenizer: PreTrainedTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info(" Creating features from dataset file at %s", file_path)

        with open(file_path, encoding="utf-8") as f:
            lines = [
                line for line in f.read().splitlines()
                if (len(line) > 0 and not line.isspace())
            ]

        tokenizer = ByteLevelBPETokenizer(
            f"{args['tokenizer_name']}/vocab.json",
            f"{args['tokenizer_name']}/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=block_size)
        self.examples = [t.ids for t in tokenizer.encode_batch(lines)]
Example #7
def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
        Illustrates how the individual Tokenizer works

        Args:
            test_sentence (:obj:`str`):
            	Sentence for demonstration purposes
            vocab_path (:obj:`str`):
				Path where the vocabulary (most frequent tokens ranked by frequency) is saved
			merge_path (:obj:`str`):
				Path where the merges file is saved
    """

    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
Example #8
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)

    '''
    Create dataloaders
    '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'], config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(train_dataset, [round(config["train_data_percentage"] * len(train_dataset)), round(config["val_data_percentage"] * len(train_dataset))])

    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"], num_workers=config["num_of_workers"], pin_memory=True)

    '''
    create tokenizer
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )


    '''
    Create model
    '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'],
           vocab_size,
           vocab_size,
           config['src_pad_idx'],
           config['num_heads'],
           config['num_encoder_layers'],
           config['num_decoder_layers'],
           config['forward_expansion'],
           config['dropout'],
           config['max_len'],
           device)

    model.train()

    trainer = model.to(device)

    '''
    Create Optimizer
    '''
    loss_fun = nn.CrossEntropyLoss(ignore_index = config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr = config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)

    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
Example #9
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):

        try:
            tokenizer = WordLevel(vocab_file, unk_token=unk_token)
            tokenizer = Tokenizer(tokenizer)
        except Exception:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizer,"
                "please note they are not compatible.".format(vocab_file))

        # Create the correct normalization path
        normalizer = []

        # Include unicode normalization
        if normalization:
            normalizer += [unicode_normalizer_from_str(normalization)]

        # Include case normalization
        if lowercase:
            normalizer += [Lowercase()]

        # Strip normalizer at the end
        normalizer += [Strip(left=True, right=True)]

        if len(normalizer) > 0:
            tokenizer.normalizer = Sequence(
                normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(
            delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            tokenizer.post_processor = BertProcessing(
                (eos_token, tokenizer.token_to_id(eos_token)),
                (eos_token, tokenizer.token_to_id(eos_token)))

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Example #10
    def from_pretrained(cls, tokenizer_name, cache_dir=None):
        tokenizer = KariBERTaTokenizer(tokenizer_name)
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        tokenizer.enable_padding()
        return tokenizer
Example #11
    def __init__(self, cfg):
        super().__init__(cfg)
        self.scales = [str((cfg.load_size // (2**i))) for i in range(3)]
        self.scales.reverse()

        self.device_map = {
            'style': self.devices[0],
            'content': self.devices[0],
            'img': self.devices[0]
        }
        self.network_names = [
            'style_model', 'content_model', 'generator', 'discriminators'
        ]
        self.device_name_map = {
            'style_model': 'style',
            'content_model': 'content',
            'generators': 'img',
            'discriminators': 'img'
        }

        tokenizer = ByteLevelBPETokenizer(
            "vocab.json",
            "merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        self.cold = True

        self.language_model = LanguageModel(cfg, tokenizer,
                                            self.device_map['style']).to(
                                                self.device_map['style'])
        self.content_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                                 cfg).to(self.device_map['style'])
        self.style_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                               cfg).to(self.device_map['style'])

        self.generator = StyleGenerator(cfg).to(self.device_map['img'])
        self.discriminator = FeatureConvolutionalDiscriminator(cfg).to(
            self.device_map['img'])

        self.visual_names = ['visual_dict']
        self.visual_dict = {'real': None, 'fake': None}
        self.loss_names = ['loss']
        self.visualizer = Visualizer(cfg)

        self.generator_criterion = BinaryCrossEntropyLoss(cfg).to(
            self.device_map['img'])
        self.consistency_criterion = ColorConsistencyLoss(cfg).to(
            self.device_map['img'])
        self.distribution_criterion = KLDLoss().to(self.device_map['img'])

        self.latent_scale = int(cfg.load_size // (2**6))
        self.latent_channels = int(cfg.latent_dim) // (self.latent_scale**2)
        self.channels_z = 8 * self.cfg.ngf - self.latent_channels
Example #12
    def test_processing(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
        assert output.ids == [1, 2, 3, 0, 6, 0]
Example #13
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        add_special_tokens: bool = True,
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if add_special_tokens and vocab_file is not None:
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (sep_token, sep_token_id), (cls_token, cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
Example #14
def train_tokenizer(input_path, output_path, vocab_size=10000):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[input_path],
                    vocab_size=vocab_size,
                    special_tokens=["[PAD]", "<s>", "</s>", "<unk>"])
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.save_model(output_path)
    return tokenizer
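A minimal sketch of using this helper end to end, assuming a plain-text corpus at corpus.txt and an output directory that already exists (both names are hypothetical):

import os

os.makedirs("bpe_output", exist_ok=True)      # save_model needs an existing directory
tok = train_tokenizer("corpus.txt", "bpe_output", vocab_size=10000)
enc = tok.encode("a short test sentence")
print(enc.tokens)   # wrapped as <s> ... </s> by the BertProcessing post-processor
print(enc.ids)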
Example #15
    def test_bert_parity(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        original = tokenizer.encode("my name", "pair")

        tokenizer.post_processor = self.get_bert()
        template = tokenizer.encode("my name", "pair")
        assert original.ids == template.ids
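The self.get_bert() helper is not included in this listing; the parity test only makes sense if it returns a TemplateProcessing post-processor equivalent to BertProcessing(("[SEP]", 0), ("[CLS]", 1)). A sketch of such an equivalent, under that assumption:

from tokenizers.processors import TemplateProcessing

def get_bert(self):
    # Same layout and type ids as BertProcessing: [CLS] A [SEP] for single
    # sequences, [CLS] A [SEP] B [SEP] for pairs, with the second segment
    # (and its trailing [SEP]) typed as 1. Ids match the test: [CLS]=1, [SEP]=0.
    return TemplateProcessing(
        single="[CLS] $0 [SEP]",
        pair="[CLS] $A:0 [SEP]:0 $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
    )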
Example #16
    def __init__(self, max_tokens=512):

        ## RoBERTa uses BPE tokenizer similar to GPT
        t = ByteLevelBPETokenizer("tokenizer/vocab.json",
                                  "tokenizer/merges.txt")
        t._tokenizer.post_processor = BertProcessing(
            ("</s>", t.token_to_id("</s>")),
            ("<s>", t.token_to_id("<s>")),
        )
        t.enable_truncation(max_tokens)
        t.enable_padding(length=max_tokens, pad_id=t.token_to_id("<pad>"))
        self.tokenizer = t
Example #17
def load_sentence_piece_model():
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))

    tokenizer.enable_truncation(max_length=512)
    encoding = tokenizer.encode("배고파요")
    print(encoding.tokens)
    print(encoding.special_tokens_mask)
    print(encoding.ids)
    print(encoding.normalized_str)
Example #18
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
Example #19
    def __init__(self, tok_dir, max_seq_len, **kwargs):

        tokenizer = CharBPETokenizer(f"{tok_dir}/vocab.json",
                                     f"{tok_dir}/merges.txt")
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=max_seq_len)

        self._pretrained_tokenizer = tokenizer
        self.max_seq_len = max_seq_len
Example #20
    def __load_tokenizer(self):
        parent_path, _ = os.path.split(__file__)
        data_path = os.path.join(parent_path, "data")
        tokenizer_path = os.path.join(data_path, str(self.__vocab_size),
                                      str(self.__min_frequence),
                                      self.__tokenizer_name)

        self.__tokenizer = RobertaTokenizerFast.from_pretrained(
            tokenizer_path, max_len=self.max_length)

        self.__tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", self.__tokenizer.convert_tokens_to_ids("</s>")),
            ("<s>", self.__tokenizer.convert_tokens_to_ids("<s>")),
        )
Example #21
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./esperberto-vocab.json",
            "./esperberto-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        src_files = Path("./")
Example #22
    def load_custom_tokenizer(self, path):
        tokenizer = ByteLevelBPETokenizer(path + "-vocab.json",
                                          path + "-merges.txt")
        # Add RoBERTa-style post-processing special tokens (<s> ... </s>)
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        return PreTrainedTokenizerFast(tokenizer,
                                       pad_token="<pad>",
                                       mask_token="<mask>",
                                       unk_token="<unk>",
                                       bos_token="<s>",
                                       eos_token="</s>")
Example #23
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):

        tokenizer = WordLevel.from_files(vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(tokenizer)

        # Create the correct normalization path
        normalizer = []

        # Include unicode normalization
        if normalization:
            normalizer += [unicode_normalizer_from_str(normalization)]

        # Include case normalization
        if lowercase:
            normalizer += [Lowercase()]

        if len(normalizer) > 0:
            tokenizer.normalizer = Sequence(
                normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(
            delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            tokenizer.post_processor = BertProcessing(
                (eos_token, tokenizer.token_to_id(eos_token)),
                (eos_token, tokenizer.token_to_id(eos_token)))

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Example #24
    def load_tokenizer(path,
                       enable_truncation=True,
                       enable_padding=True,
                       max_length=512):
        tokenizer = SentencePieceBPETokenizer(os.path.join(path, "vocab.json"),
                                              os.path.join(path, "merges.txt"))
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        if enable_truncation:
            tokenizer.enable_truncation(max_length=max_length)
        if enable_padding:
            tokenizer.enable_padding(pad_token="<pad>",
                                     pad_id=tokenizer.token_to_id("<pad>"))
        return tokenizer
Example #25
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)

    model = RobertaForMaskedLM(config=config)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )

    trainer.train()
    trainer.save_model(args.output_dir)
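main() only reads three attributes from args; a minimal, assumed argparse entry point would be:

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True)        # .npy file loaded with np.load
    parser.add_argument("--tokenizer", required=True)   # tokenizer.json for PreTrainedTokenizerFast
    parser.add_argument("--output_dir", required=True)
    main(parser.parse_args())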
Example #26
    def __init__(self, args, file_path: str):
        tokenizer = CharBPETokenizer(
            f'{args.tokenizer_name}/vocab.json',
            f'{args.tokenizer_name}/merges.txt',
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=256)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #27
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./roberta-lm/vocab.json",
            "./roberta-lm/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/montecristo/").glob("**/*.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #28
    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",  # default value assumed; the body and parameters dict below use unk_token
        sep_token="<sep>",
        cls_token="<cls>",
        pad_token="<pad>",
        mask_token="<mask>",
        lowercase: bool = True,
    ):

        tokenizer = Tokenizer(WordLevel(vocab_file, unk_token=unk_token))
        tokenizer.normalizer = Strip()
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")

        tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        parameters = {
            "model": "WordLevel",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Example #29
def load_sentence_piece_model(path_vocab, path_model):
    tokenizer = ByteLevelBPETokenizer(path_vocab, path_model)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("<bos>", tokenizer.token_to_id("<bos>")),
        ("<eos>", tokenizer.token_to_id("<eos>"))
    )

    tokenizer.enable_truncation(max_length=512)

    # encoding = tokenizer.encode("배고파요")
    # print(encoding.tokens)
    # print(encoding.special_tokens_mask)
    # print(encoding.ids)
    # print(encoding.normalized_str)
    #
    # decoding = tokenizer.decode([2, 1177, 276, 692, 571, 1])
    # print(decoding)

    return tokenizer
Example #30
    def __init__(self, file_path: str = None, tokenizer_path: str = None):
        tokenizer = ByteLevelBPETokenizer(
            tokenizer_path + "/vocab.json",
            tokenizer_path + "/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            lines = f.readlines()
            lines = [
                line for line in lines
                if (len(line) > 0 and not line.isspace())
            ]
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]