def load_mlm_data(path):
    # Legacy variant built on the pytorch_pretrained_bert package (the predecessor of transformers).
    from pytorch_pretrained_bert.tokenization import BertTokenizer

    # `bert_model`, `do_lower_case`, and `load_loose_json` are expected to be defined at module level.
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)
    vocab_words = list(tokenizer.vocab.keys())  # full vocabulary; not used further in this function

    data = load_loose_json(path)
    docs = []
    for doc in data:
        # Split each document into non-empty paragraphs and tokenize paragraph by paragraph.
        paras = doc['text'].split('\n\n')
        paras = [para.strip() for para in paras if len(para.strip()) > 0]
        tokens = [tokenizer.tokenize(para) for para in paras]
        docs.append(tokens)
    return docs, tokenizer
def load_mlm_data(path):
    # Equivalent variant built on the current transformers library.
    from transformers import AutoTokenizer

    # `bert_model` and `load_loose_json` are expected to be defined at module level.
    tokenizer = AutoTokenizer.from_pretrained(bert_model, cache_dir=".cache")
    vocab_words = list(tokenizer.vocab.keys())  # full vocabulary; not used further in this function

    data = load_loose_json(path)
    docs = []
    for doc in data:
        # Split each document into non-empty paragraphs and tokenize paragraph by paragraph.
        paras = doc["text"].split("\n\n")
        paras = [para.strip() for para in paras if len(para.strip()) > 0]
        tokens = [tokenizer.tokenize(para) for para in paras]
        docs.append(tokens)
    return docs, tokenizer
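
# `load_loose_json` is referenced above but not defined in this snippet. Below is a
# minimal sketch, assuming "loose JSON" means newline-delimited JSON with one record
# per line; the exact file format used by the original code is an assumption.
import json

def load_loose_json(path):
    """Read one JSON object per non-empty line and return them as a list of dicts."""
    data = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                data.append(json.loads(line))
    return data

# Hypothetical usage, assuming `bert_model` names a BERT checkpoint and the input
# file (the name "corpus.jsonl" is illustrative) contains records with a "text" field:
#
#   bert_model = "bert-base-uncased"
#   docs, tokenizer = load_mlm_data("corpus.jsonl")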