def __init__(self, vocab_size, hidden_size, dropout, n_layers=1,
             vocab_file='./data/vocab.txt'):
    super(UntrainedEncoderBERT, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.dropout = dropout
    self.dropout_layer = nn.Dropout(dropout)
    self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=PAD_token)
    self.embedding.weight.data.normal_(0, 0.1)
    self.config = transformers.BertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=n_layers,
        hidden_dropout_prob=dropout,
        attention_probs_dropout_prob=dropout,
        num_attention_heads=16,
        output_hidden_states=True,
        max_position_embeddings=1024)
    self.tokenizer = transformers.BertTokenizer(
        vocab_file, pad_token='PAD', unk_token='UNK', sep_token='EOS')
    self.BERT = transformers.BertModel(self.config)
    self.training = True
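
# A minimal construction sketch for the encoder above (sizes are
# illustrative; it assumes PAD_token is defined and ./data/vocab.txt
# contains the custom PAD/UNK/EOS tokens). hidden_size must be divisible
# by num_attention_heads (16 here).
encoder = UntrainedEncoderBERT(vocab_size=30000, hidden_size=768, dropout=0.1,
                               n_layers=2)
print(encoder.config.num_attention_heads)  # 16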
def bert_tokenizer_returner(self):
    if self.config.bert_name == 'japanese-bert':
        vocab_file = './vocab_file/vocab.txt'
        return transformers.BertTokenizer(vocab_file=vocab_file,
                                          do_basic_tokenize=True)
    else:
        print('currently not supported:', self.config.bert_name)
        raise NotImplementedError
def get_tokenizer(file_config=None, use_vocab=False):
    """Load the tokenizer.

    use_vocab: whether to build it from an existing vocab file.
    """
    if use_vocab:
        tokenizer = transformers.BertTokenizer(vocab_file=args.tokenizer_vocab)
    else:
        tokenizer = transformers.BertTokenizer.from_pretrained(
            file_config.bert_tokenizer_dir)
    logging.info('vocab size: {}'.format(len(tokenizer)))
    return tokenizer
def two():
    tokenizer = transformers.BertTokenizer("../model/chinese_L-12_H-768_A-12/vocab.txt")
    tokenizer.add_special_tokens({"additional_special_tokens": ["[SPACE]", "“", "”"]})
    vocab_f = open("../model/chinese_L-12_H-768_A-12/vocab.txt", "r", encoding="utf8")
    list_vocab = vocab_f.readlines()
    list_vocab = [data.strip() for data in list_vocab]
    dict_vocab = {k: v for k, v in enumerate(list_vocab)}  # id -> token
    f_sentence = open(two_train_sentence_path, "w", encoding="utf8")
    f_label = open(two_train_label_path, "w", encoding="utf8")
    with open(train_sentence_path, "r", encoding="utf8") as f1, \
            open(train_label_path, "r", encoding="utf8") as f2:
        test_list_label = []
        for sentence, label in zip(f1.readlines(), f2.readlines()):
            sentence = sentence.strip()
            label = label.strip()  # target output
            # tokened_text = tokenizer.encode(sentence)[1:-1]  # all ids, with [CLS] removed
            tokened_text = tokenizer.encode(sentence)  # all ids
            tokened_text_str = [str(i) for i in tokened_text]
            print(" ".join(tokened_text_str), file=f_sentence)
            tokened_text = tokened_text[1:-1]
            tokened_list_text = [dict_vocab[data].lstrip("##") for data in tokened_text]
            list_label = label.split(" ")  # target output
            list_label_final = []
            for i in range(len(tokened_list_text)):
                num = len(list(tokened_list_text[i]))
                if tokened_list_text[i] in ["[SPACE]", "[UNK]"]:
                    num = 1
                # merge the character-level labels covered by this WordPiece token
                maybe = list_label[0:num]
                if num != 1:
                    maybe_a = [int(i) % 3 for i in maybe]
                    if maybe_a[0] == 1 and maybe_a[-1] == 0:
                        a = str(int(maybe[0]) // 3 + 13)
                    elif maybe_a[0] == 1 and maybe_a[-1] == 2:
                        a = maybe[0]
                    elif maybe_a[0] == 2 and maybe_a[-1] == 0:
                        a = maybe[-1]
                    else:
                        a = maybe[0]
                else:
                    a = maybe[0]
                list_label_final.append(a)
                list_label = list_label[num:]
            print(" ".join(["0"] + list_label_final), file=f_label)
def berttokenizer_returner(self):
    if self.args.bert_name == 'bert-base-uncased':
        vocab_file = './src/vocab_file/bert-base-uncased-vocab.txt'
        do_lower_case = True
    else:
        print('currently not supported:', self.args.bert_name)
        raise NotImplementedError
    return transformers.BertTokenizer(vocab_file=vocab_file,
                                      do_lower_case=do_lower_case,
                                      do_basic_tokenize=True,
                                      never_split=NEVER_SPLIT_TOKEN)
def doc_context_similarity(request):
    """Predict probability of two documents appearing in the same context."""
    print('Starting document context similarity prediction...')
    global model, tokenizer
    if not tokenizer:
        print('Loading tokenizer...')
        if os.getenv('ENV', '') == 'local':
            # TODO: Think about whether to keep the cased or uncased?
            tokenizer = transformers.BertTokenizer('./ext/model/vocab.txt',
                                                   do_lower_case=False)
        else:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                'bert-base-finnish-cased-v1')
        print('Tokenizer loaded!')
    if not model:
        print('Loading model...')
        model_path = ('./ext/model' if os.getenv('ENV', '') == 'local'
                      else 'bert-base-finnish-cased-v1')
        model = transformers.BertForNextSentencePrediction.from_pretrained(
            model_path)
        model.eval()
        print('Model loaded!')
    print('Predicting...')
    data = request.get_json()['data']

    # Parse data
    doc1 = data['doc1']
    doc2 = data['doc2']

    # Inference
    tokens1 = ['[CLS]'] + tokenizer.tokenize(doc1) + ['[SEP]']
    tokens2 = tokenizer.tokenize(doc2) + ['[SEP]']
    tokens = tokens1 + tokens2
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
    segments_ids = [0] * len(tokens1) + [1] * len(tokens2)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    pred = model(tokens_tensor, token_type_ids=segments_tensors)[0]
    probability = float(torch.nn.Softmax(dim=1)(pred).data.numpy()[0][0])
    print('Prediction done!')
    return {
        'status': 'success',
        'message': 'Prediction obtained successfully!',
        'data': {
            'probability': probability
        }
    }, 200
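
# A minimal client sketch for the handler above. The route URL is
# hypothetical (registration isn't shown in the snippet); only the payload
# shape {"data": {"doc1": ..., "doc2": ...}} comes from the handler itself.
import requests

resp = requests.post('http://localhost:5000/doc-context-similarity',
                     json={'data': {'doc1': 'First document text.',
                                    'doc2': 'Second document text.'}})
print(resp.json()['data']['probability'])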
def berttokenizer_returner(self):
    if self.args.bert_name == 'bert-base-uncased':
        vocab_file = './vocab_file/bert-base-uncased-vocab.txt'
        do_lower_case = True
    elif self.args.bert_name == 'biobert':
        vocab_file = './vocab_file/biobert_v1.1_pubmed_vocab.txt'
        do_lower_case = False
    else:
        print('currently not supported:', self.args.bert_name)
        raise NotImplementedError
    return transformers.BertTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case,
        do_basic_tokenize=True,
        never_split=['<target>', '</target>'])
def update_vocab(self):
    self.tokenizer.save_vocabulary(
        '/home/qianhoude/Neural-OpenIE/Transformers-version/')
    with open('/home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt',
              'a') as f:
        for i in ['<arg1>', '</arg1>', '<rel>', '</rel>', '<arg2>', '</arg2>']:
            f.write(i + '\n')
    self.tokenizer = transformers.BertTokenizer(
        vocab_file='/home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt')
    os.system('rm /home/qianhoude/Neural-OpenIE/Transformers-version/vocab.txt')
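
# The vocab.txt round trip above can usually be avoided: a sketch of the
# standard transformers route, which registers tokens in memory (the model
# name here is illustrative; the resize only matters if a model is attached):
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
tokenizer.add_tokens(['<arg1>', '</arg1>', '<rel>', '</rel>', '<arg2>', '</arg2>'])
model = transformers.BertModel.from_pretrained('bert-base-uncased')
model.resize_token_embeddings(len(tokenizer))  # make room for the new ids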
def load_bert(path):
    '''Load the Chinese BERT model in the specified folder.'''
    config_path = os.path.join(path, 'chinese_wwm_ext_pytorch/bert_config.json')
    model_path = os.path.join(path, 'chinese_wwm_ext_pytorch/pytorch_model.bin')
    vocab_path = os.path.join(path, 'chinese_wwm_ext_pytorch/vocab.txt')
    config = transformers.BertConfig.from_pretrained(config_path)
    config.output_hidden_states = True
    model = transformers.BertModel.from_pretrained(model_path, config=config)
    model.eval()
    tokenizer = transformers.BertTokenizer(vocab_path)
    return model, tokenizer
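
# A usage sketch for load_bert, assuming `path` holds the
# chinese_wwm_ext_pytorch files and an older, tuple-returning transformers
# version (the sentence is illustrative):
model, tokenizer = load_bert('./models')
input_ids = torch.tensor([tokenizer.encode('今天天气不错')])
with torch.no_grad():
    outputs = model(input_ids)
hidden_states = outputs[-1]  # per-layer states, since output_hidden_states=True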
def __init__(self, args):
    super(BertEmbedderModule, self).__init__(args)
    config = transformers.BertConfig.from_json_file(
        '/work/dcml0714/bert_data/en_pretrain/bert_config.json')
    config.output_hidden_states = True
    self.model = transformers.BertForPreTraining.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=torch.load(args.bert_model_path))
    self.max_pos = self.model.config.max_position_embeddings
    self.tokenizer = transformers.BertTokenizer(
        vocab_file='/work/dcml0714/bert_data/bi_pretrain/vocab.txt')
    self._sep_id = self.tokenizer.convert_tokens_to_ids("[SEP]")
    self._cls_id = self.tokenizer.convert_tokens_to_ids("[CLS]")
    self._pad_id = self.tokenizer.convert_tokens_to_ids("[PAD]")
    self._unk_id = self.tokenizer.convert_tokens_to_ids("[UNK]")
    self.parameter_setup(args)
def device_setup(self):
    """Configure the device and load the BERT model."""
    # Use the GPU when available; modules are moved with model.to(device).
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    model_save_path = self.config.get("result", "model_save_path")
    config_save_path = self.config.get("result", "config_save_path")
    vocab_save_path = self.config.get("result", "vocab_save_path")
    self.model_config = BertConfig.from_json_file(config_save_path)
    self.model = BertForSequenceClassification(self.model_config)
    self.state_dict = torch.load(model_save_path)
    self.model.load_state_dict(self.state_dict)
    self.tokenizer = transformers.BertTokenizer(vocab_save_path)
    self.model.to(self.device)
    self.model.eval()
def get_tokenizer() -> transformers.BertTokenizer:
    """Returns the BERT tokenizer."""
    do_lower_case = ('uncased' in common_flags.BERT_CONFIG.value
                     or 'cased' not in common_flags.BERT_CONFIG.value)
    with tempfile.TemporaryDirectory() as tdir:
        vocab_fn = os.path.join(tdir, 'vocab.txt')
        tf.io.gfile.copy(common_flags.BERT_VOCAB.value, vocab_fn)
        # special symbols
        grammar_symbols = state_tree.NQStateTree.tree_node_symbols
        operator_symbols = [
            operator.value for operator in state_tree.Operator
        ] + ['[stop]']
        # add field symbols
        operator_symbols += [field.value for field in state_tree.Field]
        special_symbols = grammar_symbols + operator_symbols
        assert len(special_symbols) < 99, 'Too many special symbols.'
        tokenizer = transformers.BertTokenizer(vocab_fn,
                                               do_lower_case=do_lower_case)
        tokenizer.add_tokens(special_symbols)
        return tokenizer
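
# A quick sanity check (a sketch; it assumes the flags are already parsed):
# tokens registered via add_tokens survive as single pieces instead of
# being split by WordPiece.
tokenizer = get_tokenizer()
assert tokenizer.tokenize('[stop]') == ['[stop]']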
def __init__(self, device, cfg):
    super().__init__()
    if cfg.tokens_pretrained:
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            'bert-base-uncased')
    else:
        # BertTokenizer is WordPiece-based and only takes a vocab file;
        # a merges file (cfg.merge_path) is for BPE tokenizers like GPT-2.
        self.tokenizer = transformers.BertTokenizer(cfg.vocab_path)
    if cfg.embeddings_pretrained:
        self.model = transformers.BertModel.from_pretrained(
            'bert-base-uncased')
    else:
        # randomly initialized BERT; BertModel takes a config, not a name
        self.model = transformers.BertModel(transformers.BertConfig())
    self.model = self.model.to(device)
    self.pad_token = 'pad_token'
    self.device = device
    self.max_len = cfg.max_seq_len
    self.trainable = cfg.embeddings_trainable
# @Time: 2020/7/8 10:37
# @Author: R.Jian
# @Note: preprocessing for the validation set

import json

import transformers

f = open("ccks_8_data_v2/validate_data.json", encoding="utf8")
dict_vali = json.load(f)
f.close()

# tokenizer
tokenizer = transformers.BertTokenizer("../model/chinese_L-12_H-768_A-12/vocab.txt")
tokenizer.add_special_tokens({"additional_special_tokens": ["[SPACE]", "“", "”"]})
vocab_f = open("../model/chinese_L-12_H-768_A-12/vocab.txt", "r", encoding="utf8")
list_vocab = vocab_f.readlines()
list_vocab = [data.strip() for data in list_vocab]
dict_vocab = {k: v for k, v in enumerate(list_vocab)}
f_sentence = open("val_token.txt", "w", encoding="utf8")
f_word = open("val_word.txt", "w", encoding="utf8")
for i in range(1, 101):
    sen = dict_vali["validate_V2_" + str(i) + ".json"]
    sen = sen.strip("\r\n").replace("\r\n", " ")
    sen = sen.replace(" ", " [SPACE] ")
    list_token = tokenizer.encode(sen)
    list_token_str = [str(i) for i in list_token]
    print(" ".join(list_token_str), file=f_sentence)
    list_token = list_token[1:-1]
    print(list_token)
"""
Settings related to input data, file exports and model configuration
will be available in this script.
"""

# data files
TRAIN_FILE = "./data/train.tsv"
TEST_FILE = "./data/test.tsv"  # use for evaluation

# model files
TRAINED_MODEL_FILE = './results/trained_model.pt'
NER_RESULTS_FILE = './results/ner_result.csv'

# torch setting
import torch
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BioBERT base
import transformers
BERT_CONFIG_FILE = './biobert_v1.1_pubmed'  # /config.json'
TOKENIZER = transformers.BertTokenizer(
    vocab_file='biobert_v1.1_pubmed/vocab.txt', do_lower_case=False)
WEIGHTS_BIN = torch.load('./biobert_v1.1_pubmed/pytorch_model.bin',
                         map_location=DEVICE)

# params config
MAX_LEN = 75
MAX_GRAD_NORM = 1.0
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 8
EPOCHS = 1
# code borrowed from official pytorch discussion forum
# (the enclosing def was cut off in this snippet; the name topk_accuracy
# below is a reconstruction, not from the source)
def topk_accuracy(rankings, topk=(1,)):
    with torch.no_grad():
        # a ranking of 0 marks the position of the true candidate
        correct = rankings.eq(0)
        res = []
        for k in topk:
            correct_k = correct[:, :k].float().sum()
            res.append(correct_k.mul_(1.0 / rankings.size(0)).item())
        return res


if __name__ == '__main__':
    args = parse_args()

    # create tokenizer
    tokenizer = trs.BertTokenizer(vocab_file='bert_vocab.txt',
                                  do_lower_case=True)

    assert args.split in ['valid', 'test'], f"{args.split} not allowed"

    # create dataloader
    if args.dataset.startswith('udc'):
        dataset = UDC(root=args.dataset_root, split=args.split,
                      tokenizer=tokenizer)
        dataloader = data.DataLoader(dataset, batch_size=args.batch_size,
                                     shuffle=False, num_workers=2)
    else:
        raise Exception(f"unknown dataset: {args.dataset}")
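
# A tiny worked example for the helper above (using the reconstructed
# name): each row ranks the candidates for one query, 0 marking the hit.
rankings = torch.tensor([[0, 1, 2],
                         [1, 0, 2]])
print(topk_accuracy(rankings, topk=(1, 2)))  # [0.5, 1.0]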
import argparse

import transformers

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', type=str)
parser.add_argument('--model', type=str)
parser.add_argument('--data', type=str)
args = parser.parse_args()

tokenizer = transformers.BertTokenizer(vocab_file=args.vocab,
                                       do_lower_case=False,
                                       do_basic_tokenize=True)
model = transformers.BertForMaskedLM.from_pretrained(args.model)
dataset = transformers.LineByLineTextDataset(tokenizer=tokenizer,
                                             file_path=args.data,
                                             block_size=128)
data_collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
train_args = transformers.TrainingArguments(
    per_device_eval_batch_size=16,
    output_dir=f"/tmp/echau18/{args.model}")
trainer = transformers.Trainer(model=model,
                               eval_dataset=dataset,
                               data_collator=data_collator,
                               prediction_loss_only=True,
                               args=train_args)
eval_output = trainer.evaluate()
print(eval_output)
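
# eval_output['eval_loss'] is the mean masked-LM loss; a common follow-up
# (not in the original script) is to report perplexity as its exponential:
import math
print('perplexity:', math.exp(eval_output['eval_loss']))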
import os

import transformers

MAX_LEN = 512
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
BERT_PATH = "../input/bert_base_uncased/"
MODEL_PATH = "../models/model.bin"
TRAINING_FILE = "../input/imdb.csv"
TOKENIZER = transformers.BertTokenizer(os.path.join(BERT_PATH, "vocab.txt"),
                                       do_lower_case=True)
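
# A quick smoke test for the config above (a sketch; assumes a transformers
# version with the padding/truncation API, and the review text is made up):
enc = TOKENIZER.encode_plus("This movie was great!",
                            max_length=MAX_LEN,
                            padding="max_length",
                            truncation=True)
print(len(enc["input_ids"]))  # 512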