Example #1
def initialize_model():

    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)
    # create tokenizer

    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])
    # create model and load pretrained checkpoint
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'],
                             config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
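A minimal usage sketch for the function above (the sample sentence is only an illustration): because padding and truncation are both enabled, every encoding comes back with exactly config['max_len'] ids.

config, model, tokenizer, device = initialize_model()

encoding = tokenizer.encode("hello world")
print(encoding.tokens[:4])    # e.g. ['<s>', 'hello', 'Ġworld', '</s>'] followed by padding tokens
print(len(encoding.ids))      # == config['max_len'], since padding and truncation are enabled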
Example #2
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
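Because enable_padding() is called here without a fixed length, encode_batch pads each batch to its longest sequence rather than to 512. A quick sketch with placeholder Norwegian sentences:

tokenizer = create_norwegian_tokenizer()
batch = tokenizer.encode_batch([
    "Jeg heter Kari.",
    "Dette er en litt lengre setning enn den forrige.",
])
print([len(enc.ids) for enc in batch])   # both padded to the length of the longer sentence
print(batch[0].attention_mask)           # trailing 0s mark the padded positions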
Example #3
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./esperberto-vocab.json",
            './esperberto-merges.txt',
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        src_files = Path("./")
Example #4
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./roberta-lm/vocab.json",
            "./roberta-lm/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/montecristo/").glob("**/*.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
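Examples #3–#6 are all the __init__ of a small PyTorch dataset. Below is a hedged sketch of how such a class is usually completed; the class name and the __len__/__getitem__ pair are assumptions modeled on the common masked-LM dataset pattern, not code taken from the examples above.

import torch
from torch.utils.data import Dataset

class LineByLineDataset(Dataset):      # hypothetical name
    def __init__(self, examples):
        # in the examples above, self.examples is filled directly inside __init__
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # one pre-tokenized line, returned as a LongTensor of token ids
        return torch.tensor(self.examples[i], dtype=torch.long)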
Example #5
    def __init__(self, file_path: str = None, tokenizer_path: str = None):
        tokenizer = ByteLevelBPETokenizer(
            tokenizer_path + "/vocab.json",
            tokenizer_path + "/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            lines = f.readlines()
            lines = [
                line for line in lines
                if (len(line) > 0 and not line.isspace())
            ]
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #6
    def __init__(self, evaluate=False):
        tokenizer = ByteLevelBPETokenizer(
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
            "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []
        if evaluate:
            src_files = ["/home/zheng/sde/data/valid.txt"]
        else:
            src_files = ["/home/zheng/sde/data/test.txt"]

        for src_file in src_files:
            print(src_file)
            # read the file with an explicit encoding and close it when done
            with open(src_file, 'r', encoding="utf-8") as f:
                lines = f.readlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #7
    "</s>",
    "<unk>",
    "<mask>",
])
# Save files to disk
tokenizer.save_model("BR_BERTo")
# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens)

# Model type
# --------------------------------------------------
config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=8,
    type_vocab_size=1,
)
model = RobertaForMaskedLM(config=config)
print("Params: ", model.num_parameters())
tokenizer = RobertaTokenizerFast.from_pretrained("./BR_BERTo", max_len=512)
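Example #7 stops after building the model and reloading the tokenizer. A hedged sketch of the masked-LM training step that typically follows, reusing model and tokenizer from the code above; the toy corpus and the hyperparameter values are assumptions, not part of the original.

from transformers import (DataCollatorForLanguageModeling, Trainer,
                          TrainingArguments)

# Toy corpus, purely illustrative; in practice this would be the tokenized training files.
texts = ["gostei muito dessa ideia", "uma segunda frase de exemplo"]
train_dataset = [
    {"input_ids": tokenizer(t, truncation=True, max_length=512)["input_ids"]}
    for t in texts
]

# Dynamic masking for the MLM objective (15% of tokens, as in RoBERTa).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./BR_BERTo",
    num_train_epochs=1,              # illustrative values
    per_device_train_batch_size=16,
    save_steps=10_000,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
trainer.train()
trainer.save_model("./BR_BERTo")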
Example #8
# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference
tokenizer.save_model(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")))
tokenizer.enable_truncation(max_length=args.token_max_len)

print(tokenizer.encode("Nay, but speak not."))
print(tokenizer.encode("Nay, but speak not.").tokens)

from transformers import RobertaConfig

config = RobertaConfig(vocab_size=args.vocab_size,
                       max_position_embeddings=args.max_position_embeddings,
                       num_attention_heads=args.num_attention_heads,
                       num_hidden_layers=args.num_hidden_layers,
                       type_vocab_size=args.type_vocab_size)

from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained(args.model_dir,