Example 1
from tokenizers import ByteLevelBPETokenizer


def main():
    # Train a byte-level BPE tokenizer on the project's training file(s)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=get_file(),
                    vocab_size=config.VOCAB_SIZE,
                    min_frequency=config.MIN_FREQUENCY,
                    special_tokens=config.SPECIAL_TOKENS)

    # write vocab.json and merges.txt to the configured path
    tokenizer.save_model(config.TOKENIZER_PATH)
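The snippet relies on a project-specific `config` module and a `get_file()` helper that are not shown. A purely hypothetical version of both, with illustrative values only, could look like this:

# config.py (hypothetical module; values are illustrative, not from the original project)
VOCAB_SIZE = 52_000
MIN_FREQUENCY = 2
SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
TOKENIZER_PATH = "tokenizer/"

# hypothetical helper: returns the path(s) of the training text file(s)
def get_file():
    return ["corpus.txt"]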
Example 2
from tokenizers import ByteLevelBPETokenizer


def train_tok(txt_dir, tokenizer_dir):
    # Initialize a byte-level BPE tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training (`files` takes one or more text file paths)
    tokenizer.train(files=txt_dir,
                    vocab_size=52_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.save_model(tokenizer_dir)
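A hypothetical call to `train_tok`: `ByteLevelBPETokenizer.train` expects one or more text file paths rather than a directory, and `save_model` needs the target directory to exist, so a caller would typically glob the text files and create the output directory first (the paths below are illustrative):

import os
from pathlib import Path

# collect the training files (illustrative location)
txt_files = [str(p) for p in Path("data/").glob("**/*.txt")]
# save_model requires an existing directory
os.makedirs("tokenizer_out", exist_ok=True)
train_tok(txt_files, "tokenizer_out")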
Example 3
    # train tokenizer (`bpe_tokenizer` is assumed to be a ByteLevelBPETokenizer
    # created earlier in the original, unshown part of this function)
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train([input_path, input_path_val],
                        vocab_size=vocab_size,
                        min_frequency=min_freq,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])
    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
    bpe_tokenizer.save_model(tok_path)

    # reload the trained BPE files through the RoBERTa tokenizer wrapper
    bpe_tokenizer = RobertaTokenizerFast.from_pretrained(tok_path,
                                                         max_len=max_len)

    # create data objects
    dataset_gen = LineByLineTextDataset(tokenizer=bpe_tokenizer,
                                        file_path=input_path,
                                        block_size=block_size)
    dataset_gen_val = LineByLineTextDataset(tokenizer=bpe_tokenizer,
                                            file_path=input_path_val,
                                            block_size=block_size)

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=bpe_tokenizer, mlm=True, mlm_probability=mlm_probability)
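The fragment stops right after the data collator is built. Below is a minimal sketch of the masked-language-modelling training step that would typically follow inside the same function; the model configuration and hyperparameters are assumptions, not part of the original snippet:

    # sketch (assumed continuation): pre-train a RoBERTa MLM with the objects created above
    from transformers import RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments

    model_config = RobertaConfig(vocab_size=vocab_size,
                                 max_position_embeddings=max_len + 2)  # +2 for RoBERTa's padding offset
    model = RobertaForMaskedLM(config=model_config)

    training_args = TrainingArguments(output_dir=output_path,
                                      overwrite_output_dir=True,
                                      num_train_epochs=1,
                                      per_device_train_batch_size=16)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset_gen,
                      eval_dataset=dataset_gen_val)
    trainer.train()
    trainer.save_model(output_path)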
Example 4
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig

paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte Level Tokenizer
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Save files to disk
tokenizer.save_model("BR_BERTo")
# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens)

# Model type
# --------------------------------------------------
config = RobertaConfig(
Example 5
import os

from tokenizers import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt",
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)
tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)

tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
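The imports at the top of this example (BertForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments) are not used in the excerpt. A minimal sketch of how they would typically be wired together follows; wrapping the saved BPE files in RobertaTokenizerFast is an assumption made here only because LineByLineTextDataset and the collator need a transformers tokenizer, and BertTokenizer cannot load byte-level BPE vocab/merges files:

from transformers import RobertaTokenizerFast

# wrap the saved vocab.json / merges.txt in a transformers fast tokenizer
hf_tokenizer = RobertaTokenizerFast.from_pretrained(SAVE_MODEL, max_len=512)

model = BertForMaskedLM(config=config)
# keep the embedding matrix in sync with any special tokens the wrapper added
model.resize_token_embeddings(len(hf_tokenizer))

dataset = LineByLineTextDataset(tokenizer=hf_tokenizer,
                                file_path="kant.txt",
                                block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=hf_tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

training_args = TrainingArguments(output_dir=SAVE_MODEL,
                                  overwrite_output_dir=True,
                                  num_train_epochs=1,
                                  per_device_train_batch_size=16)
trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=dataset)
trainer.train()
trainer.save_model(SAVE_MODEL)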