Example #1
0
 def load_mlm_data(path):
     """Load documents from ``path`` and tokenize them paragraph by paragraph.

     The tokenizer is built with ``BertTokenizer.from_pretrained`` using the
     ``bert_model`` and ``do_lower_case`` names, which are not defined in this
     function and must be available in the enclosing scope.

     Args:
         path: Forwarded unchanged to ``load_loose_json``; presumably the
             location of a file of JSON records (confirm against that helper).
             Every record it yields must expose a string under ``'text'``.

     Returns:
         A ``(docs, tokenizer)`` tuple. ``docs`` holds one list per input
         document; each of those holds one ``tokenizer.tokenize`` result per
         non-blank paragraph. ``tokenizer`` is the tokenizer instance that
         produced them.
     """
     from pytorch_pretrained_bert.tokenization import BertTokenizer

     tokenizer = BertTokenizer.from_pretrained(bert_model,
                                               do_lower_case=do_lower_case)
     docs = []
     for doc in load_loose_json(path):
         # Paragraphs are separated by a blank line; strip each one once and
         # skip those that contain only whitespace.
         stripped = (para.strip() for para in doc['text'].split('\n\n'))
         docs.append([tokenizer.tokenize(para) for para in stripped if para])
     return docs, tokenizer
Example #2
0
 def load_mlm_data(path):
     """Load documents from ``path`` and tokenize them paragraph by paragraph.

     The tokenizer is built with ``AutoTokenizer.from_pretrained`` using the
     ``bert_model`` name, which is not defined in this function and must be
     available in the enclosing scope. Downloaded tokenizer files are cached
     under the relative directory ``.cache``.

     Args:
         path: Forwarded unchanged to ``load_loose_json``; presumably the
             location of a file of JSON records (confirm against that helper).
             Every record it yields must expose a string under ``"text"``.

     Returns:
         A ``(docs, tokenizer)`` tuple. ``docs`` holds one list per input
         document; each of those holds one ``tokenizer.tokenize`` result per
         non-blank paragraph. ``tokenizer`` is the tokenizer instance that
         produced them.
     """
     from transformers import AutoTokenizer

     tokenizer = AutoTokenizer.from_pretrained(bert_model,
                                               cache_dir=".cache")
     docs = []
     for doc in load_loose_json(path):
         # Paragraphs are separated by a blank line; strip each one once and
         # skip those that contain only whitespace.
         stripped = (para.strip() for para in doc["text"].split("\n\n"))
         docs.append([tokenizer.tokenize(para) for para in stripped if para])
     return docs, tokenizer