def create_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids,
                                             input_mask, sequence_labels, token_labels,
                                             choice_labels):
    model = BertForNextSentencePrediction(config=config)
    model.eval()
    loss = model(input_ids, token_type_ids, input_mask, sequence_labels)
    seq_relationship_score = model(input_ids, token_type_ids, input_mask)
    outputs = {
        "loss": loss,
        "seq_relationship_score": seq_relationship_score,
    }
    return outputs
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification", base_model=None, base_tokenizer=None, device="cuda", chinese=False, num_labels=2): # Load pre-trained model (weights) if base_model is None: if chinese: base_model = "bert-base-chinese" else: base_model = "bert-base-uncased" if model_type == "BertForSequenceClassification": model = BertForSequenceClassification.from_pretrained( base_model, num_labels=num_labels) # Load pre-trained model tokenizer (vocabulary) elif model_type == "BertForNextSentencePrediction": model = BertForNextSentencePrediction.from_pretrained(base_model) elif model_type == "BertForTokenClassification": model = BertForTokenClassification.from_pretrained( base_model, num_labels=num_labels) elif model_type == "BertMSE": model = BertMSE() else: print("[Error]: unsupported model type") return None, None if base_tokenizer is None: # Download from huggingface tokenizer = BertTokenizer.from_pretrained(base_model) else: # Load local file tokenizer = BertTokenizer.from_pretrained(base_tokenizer) model.to(device) return model, tokenizer
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification", device="cuda", config=None): bert_model = config['bert_model'] # Load pre-trained model (weights) if model_type == "BertForSequenceClassification": model = BertForSequenceClassification.from_pretrained(bert_model, num_labels=2) # Load pre-trained model tokenizer (vocabulary) elif model_type == "BertForNextSentencePrediction": model = BertForNextSentencePrediction.from_pretrained(bert_model) elif model_type == "specific_shared": model = SpecificShared(config) elif model_type == "siamese_bert": model = SiameseBert(config) elif model_type == "n_bert": model = nBert(config) elif model_type == "bert_sts": model = BertSts(config) elif model_type == "bert_fine_tune": model = BertFineTune(config) else: print("[Error]: unsupported model type") return None, None tokenizer = BertTokenizer.from_pretrained(bert_model) model.to(device) print("Initialized model and tokenizer") return model, tokenizer
def load(path, new_args=None):
    params = torch.load(path, map_location=lambda storage, loc: storage)
    args = params['args']
    print('new args', new_args)
    args.rl_baseline = new_args.rl_baseline
    args.hrl = new_args.hrl
    args.ngram = new_args.ngram
    args.rl_weight = new_args.rl_weight
    args.reward_alpha = new_args.reward_alpha
    args.avoid_ngram = new_args.avoid_ngram
    args.optim = new_args.optim
    args.cuda = new_args.cuda
    args.rl_bleu = new_args.rl_bleu
    args.rl_f1 = new_args.rl_f1
    args.rl_relevance = new_args.rl_relevance
    args.rl_reward = new_args.rl_reward
    args.rl_relevance_weight = new_args.rl_relevance_weight
    args.rl_relevance_beta = new_args.rl_relevance_beta
    args.rl_expressiveness = new_args.rl_expressiveness
    args.rl_coherence = new_args.rl_coherence
    args.bert_weight_path = new_args.bert_weight_path
    args.bert_vocab_path = new_args.bert_vocab_path
    args.decode_len_constraint = new_args.decode_len_constraint
    if new_args.new_vocab is not None:
        print('loading new vocab from ' + new_args.new_vocab)
        vocab = torch.load(new_args.new_vocab)
    else:
        vocab = params['decoder_vocab']
    # model = VistModel.build_model(args, vocab)
    model = VistModel(args, vocab)
    try:
        model.encoder.load_state_dict(params['encoder_state_dict'])
    except KeyError:
        print('**** Warning: encoder state dict is missing parameters ****')
    try:
        print('decoder params', params['decoder_state_dict'].keys())
        model.decoder.load_state_dict(params['decoder_state_dict'], strict=False)
    except KeyError:
        print('**** Warning: decoder state dict is missing parameters ****')
    params_name = params['decoder_state_dict'].keys()
    for n, p in model.named_parameters():
        if n not in params_name:
            print('uniformly initializing new parameter %s in [-%f, %f]'
                  % (n, args.uniform_init, args.uniform_init))
            p.data.uniform_(-args.uniform_init, args.uniform_init)
    if 'coherence' in args.rl_reward:
        model.bert_tokenizer = BertTokenizer.from_pretrained(new_args.bert_vocab_path)
        model.bert_nsp = BertForNextSentencePrediction.from_pretrained(new_args.bert_weight_path)
        model.bert_nsp.eval()
    return model
def score_nsp(file, bert_weight, bert_vocab):
    pairs = []
    for l in open(file):
        sents = l.strip().split('\t')[1].strip().split('.')
        for i in range(len(sents)):
            pre = "" if i == 0 else sents[i - 1].strip() + ' . '
            cur = sents[i].strip() + ' .'
            pairs.append("[CLS] {}[SEP] {} [SEP]".format(pre, cur))
    bert_tokenizer = BertTokenizer.from_pretrained(bert_vocab)
    bert_nsp = BertForNextSentencePrediction.from_pretrained(bert_weight)
    bert_nsp.eval()
    scores = get_nsp(pairs, bert_tokenizer, bert_nsp)
    # print('scores', scores)
    for s, p in zip(pairs, scores):
        print('s={}, p={}'.format(s, p))
    return np.mean(scores)
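# NOTE: get_nsp() is not defined in this snippet. A minimal sketch of what
# score_nsp() appears to assume, under the old pytorch_pretrained_bert API:
# tokenize each "[CLS] a [SEP] b [SEP]" pair and return P(IsNext) from the NSP
# head. The helper's name and exact behavior here are assumptions.
def get_nsp(pairs, tokenizer, model):
    probs = []
    with torch.no_grad():
        for pair in pairs:
            tokens = tokenizer.tokenize(pair)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            # segment ids: 0 up to and including the first [SEP], 1 afterwards
            sep = tokens.index('[SEP]')
            segments = [0] * (sep + 1) + [1] * (len(tokens) - sep - 1)
            logits = model(torch.tensor([ids]), torch.tensor([segments]))
            # index 0 of the NSP head is the "IsNext" class
            probs.append(torch.softmax(logits[0], dim=-1)[0].item())
    return probs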
def load_pretrained_model_tokenizer(base_model=None, base_tokenizer=None, device='cuda'):
    if device == 'cuda':
        assert torch.cuda.is_available()
    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        base_model = 'bert-base-uncased'
    model = BertForNextSentencePrediction.from_pretrained(base_model)
    if base_tokenizer is None:
        # Download from huggingface
        tokenizer = BertTokenizer.from_pretrained(base_model)
    else:
        # Load local vocab file
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)
    model.to(device)
    return model, tokenizer
def model_init(self, model='bert-base-chinese'):
    """
    Initialize the model and tokenizer.

    >>> model_init(model='bert-base-chinese')
    """
    self.tokeniser = BertTokenizer.from_pretrained(model)
    self.model = BertForNextSentencePrediction.from_pretrained(model)
    self.model.eval()
    if torch.cuda.is_available():
        self.device = 'cuda'
    else:
        self.device = 'cpu'
    print('use ', self.device)
    self.model.to(self.device)
def load_pretrained_model_tokenizer(model_type="BertForSequenceClassification", device="cuda", chinese=False): # Load pre-trained model (weights) if chinese: base_model = "bert-base-chinese" else: base_model = "bert-base-uncased" if model_type == "BertForSequenceClassification": model = BertForSequenceClassification.from_pretrained(base_model) # Load pre-trained model tokenizer (vocabulary) elif model_type == "BertForNextSentencePrediction": model = BertForNextSentencePrediction.from_pretrained(base_model) else: print("[Error]: unsupported model type") return None, None tokenizer = BertTokenizer.from_pretrained(base_model) model.to(device) return model, tokenizer
def load_pretrained_model_tokenizer(base_model=None, base_tokenizer=None,
                                    device='cuda', chinese=False):
    # Load pre-trained model (weights)
    if base_model is None:
        # Download from huggingface
        if chinese:
            base_model = 'bert-base-chinese'
        else:
            base_model = 'bert-base-uncased'
    model = BertForNextSentencePrediction.from_pretrained(base_model)
    # Load pre-trained model tokenizer (vocabulary)
    if base_tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained(base_model)  # Download from huggingface
    else:
        tokenizer = BertTokenizer.from_pretrained(base_tokenizer)  # Load local file
    model.to(device)
    return model, tokenizer
def start_inference(batch_count, bert_model, data_type):
    torch.manual_seed(10)
    if data_type == 'testing':
        data_location = '../data/testing_db.csv'
    else:
        data_location = '../data/validation_db.csv'
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    model = BertForNextSentencePrediction.from_pretrained(bert_model)
    model.eval()
    model.cuda()
    if batch_count != -1:
        batch_size = 125000
        df = pd.read_csv(data_location,
                         skiprows=range(1, batch_count * batch_size + 1),
                         nrows=batch_size)
        print(f'About to process batch number {batch_count}, '
              f'which contains {df.shape[0]} samples.')
    else:
        df = pd.read_csv(data_location)
    normal_probs_single, db_probs_single = [], []
    for _, row in tqdm(df.iterrows(), total=df.shape[0]):
        context = row.context  # .split('#_new_utterance_#')[-1]
        normal_probs_single.append(
            bert_prediction(context, row.response, model, tokenizer))
        db_probs_single.append(
            bert_prediction(context, row.db_response_new, model, tokenizer))
    with open(f'{data_type}_full_normal.pkl', 'wb') as f:
        pickle.dump(normal_probs_single, f)
    with open(f'{data_type}_full_db.pkl', 'wb') as f:
        pickle.dump(db_probs_single, f)
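# NOTE: bert_prediction() is not defined in this snippet. A minimal sketch of
# the call signature start_inference() uses, under the old
# pytorch_pretrained_bert API; the truncation length and scoring details are
# assumptions.
def bert_prediction(context, response, model, tokenizer, max_len=512):
    tokens_a = tokenizer.tokenize(context)
    tokens_b = tokenizer.tokenize(response)
    tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
    segments = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
    # hard truncate to BERT's positional limit
    tokens, segments = tokens[:max_len], segments[:max_len]
    ids = tokenizer.convert_tokens_to_ids(tokens)
    tokens_tensor = torch.tensor([ids]).cuda()
    segments_tensor = torch.tensor([segments]).cuda()
    with torch.no_grad():
        logits = model(tokens_tensor, segments_tensor)
    # probability that `response` follows `context` ("IsNext" is index 0)
    return torch.softmax(logits[0], dim=-1)[0].item()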
#sentence2 = "It is possible that by making more research more available, online searching could conceivably broaden the work cited and lead researchers, as a collective, away from the “core” journals of their fields and to dispersed but individually relevant work." sentence2 = "I will show, however, that even as deeper journal back issues became available online, scientists and scholars cited more recent articles; even as more total journals became available online, fewer were cited." text = sentence1 + " " + sentence2 tokenized_s1 = tokenizer.tokenize(sentence1) #text = "Who was Jim Morrison ? Jim Morrison was a puppeteer" tokenized_text = tokenizer.tokenize(text) print(tokenized_text) # Convert token to vocabulary indices indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) print(indexed_tokens) # Define sentence A and B indices associated to 1st and 2nd sentences (see paper) segments_ids = [1] * len(tokenized_text) for x in range(len(tokenized_s1)): segments_ids[x] = 0 #segments_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] # Convert inputs to PyTorch tensors tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) # Load pre-trained model (weights) model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') model.eval() # Predict is Next Sentence ? predictions = model(tokens_tensor, segments_tensors) print(predictions[0].data.tolist())
import ingest
import torch
from torch.nn import CrossEntropyLoss
from keras.preprocessing.sequence import pad_sequences
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForNextSentencePrediction
from tqdm import tqdm, trange

MAX_LEN = 90
tokenizer = BertTokenizer.from_pretrained('data/mini/vocab.txt', do_lower_case=True)
model = BertForNextSentencePrediction.from_pretrained('data/mini/')


def tokenize(sentence):
    # tokenize the paragraph
    sentence = tokenizer.tokenize(sentence)
    # convert each token to its vocab id
    sentence = tokenizer.convert_tokens_to_ids(sentence)
    return sentence


def transform(data, returnAsPair=False):
    transformedData = []
    for dataPoint in data:
        # prestory = ["[CLS] " + query + " [SEP]" for query in dataPoint.inputSentences]
        prestory = " [SEP] ".join(dataPoint.inputSentences)
        prestory = "[CLS] " + prestory + " [SEP] "
        prestory = tokenize(prestory)
output_dir = "/home/terry/pan/github/bert/model/bert-base-chinese/"

# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
import os
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForNextSentencePrediction
# WEIGHTS_NAME and CONFIG_NAME are the predefined file names that
# from_pretrained() expects
from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
# model = BertModel.from_pretrained('bert-base-chinese')
model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')

# If we have a distributed model, save only the encapsulated model
# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
model_to_save = model.module if hasattr(model, 'module') else model

# If we save using the predefined names, we can load using `from_pretrained`
output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)
torch.save(model_to_save.state_dict(), output_model_file)
model_to_save.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)
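# Step 2 (sketch): because the files above use the predefined names, the saved
# model and vocabulary can be reloaded directly from output_dir; this follow-up
# is not part of the original snippet.
model = BertForNextSentencePrediction.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)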
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', default=1, type=int,
                        help='Batch size for inference')
    parser.add_argument(
        '--bert_model', default='bert-base-cased', type=str,
        help='Bert pre-trained model selected, e.g. bert-base-uncased, '
             'bert-large-uncased, bert-base-multilingual-cased, bert-base-chinese')
    parser.add_argument(
        '--max_seq_length', default=128, type=int,
        help='Maximum total input sequence length after tokenization')
    args = parser.parse_args()

    input_ids = torch.zeros([args.batch_size, args.max_seq_length], dtype=torch.long)
    token_type_ids = torch.zeros([args.batch_size, args.max_seq_length], dtype=torch.long)

    # Export various BERT models
    # Note: For argument definitions used here see modeling.py from the
    # pytorch-pretrained-bert repository

    # Fully trained models
    model = BertModel.from_pretrained(args.bert_model)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_batch' + str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForMaskedLM.from_pretrained(args.bert_model)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_maskedlm_batch' + str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForNextSentencePrediction.from_pretrained(args.bert_model)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_nextsentence_batch' + str(args.batch_size) + '_' + args.bert_model + '.onnx')

    model = BertForPreTraining.from_pretrained(args.bert_model)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_pretraining_batch' + str(args.batch_size) + '_' + args.bert_model + '.onnx')

    # Partially trained models
    model = BertForSequenceClassification.from_pretrained(args.bert_model, 2)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_classify_batch' + str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')

    model = BertForTokenClassification.from_pretrained(args.bert_model, 2)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_tokenclassify_batch' + str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')

    # Returns an error on ONNX export ("squeeze with negative axis -1 might
    # cause the onnx model to be incorrect"), so commented out.
    #
    # model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    # torch.onnx.export(model, (input_ids, token_type_ids),
    #                   'bert_question_batch' + str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')

    choices = 2
    input_ids = torch.zeros([args.batch_size, choices, args.max_seq_length], dtype=torch.long)
    token_type_ids = torch.zeros([args.batch_size, choices, args.max_seq_length], dtype=torch.long)
    model = BertForMultipleChoice.from_pretrained(args.bert_model, choices)
    torch.onnx.export(model, (input_ids, token_type_ids),
                      'bert_multiplechoice_batch' + str(args.batch_size) + '_' + args.bert_model + '.untrained.onnx')
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForNextSentencePrediction

CACHE_DIR = 'cache/'
BERT_MODEL = 'model.tar.gz'

# Use only for the pre-trained model
# -------------------------------------------------------------------------------------/
# tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
# model = BertForNextSentencePrediction.from_pretrained('bert-large-uncased')
# model.eval()
# -------------------------------------------------------------------------------------/

# Use only for the fine-tuned model
# -------------------------------------------------------------------------------------/
tokenizer = BertTokenizer.from_pretrained('vocab.txt')
model = BertForNextSentencePrediction.from_pretrained(CACHE_DIR + BERT_MODEL,
                                                      cache_dir=CACHE_DIR)
model.eval()
# -------------------------------------------------------------------------------------/

# ### AllenNLP - Loading

# In[3]:

# **************************************************************************************/
# Title: AllenNLP
# Author: AllenAI
# Date: 2019
# Code version: #3032
# Availability: https://github.com/allenai/allennlp/blob/master/allennlp/predictors/decomposable_attention.py
# **************************************************************************************/
def guess(input_text, use355M, iteration):
    nltk.download('punkt')
    next_text = ''
    if use355M:
        checkpoint_dir = 'tf_model/355M_diary'
    else:
        checkpoint_dir = 'tf_model/124M_diary'
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess, checkpoint_dir=checkpoint_dir)
    sents = []
    for i in range(iteration):
        text = gpt2.generate(sess, return_as_list=True,
                             checkpoint_dir=checkpoint_dir, length=200,
                             prefix=input_text, truncate="<|endoftext|>")
        text = text[0]
        input_len = len(nltk.sent_tokenize(input_text))
        temp = nltk.sent_tokenize(text)
        sents += temp[input_len:-1]
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Load pre-trained model (weights)
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    predicts_text = []
    for sent in sents:
        next_text = sent
        # Tokenized input
        text = "[CLS] " + input_text + " [SEP] " + next_text + " [SEP]"
        tokenized_text = tokenizer.tokenize(text)
        # Convert tokens to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        # Define sentence A and B indices associated with the 1st and 2nd sentences (see paper)
        len_1 = len(tokenizer.tokenize(input_text)) + 2  # [CLS] & [SEP]
        len_2 = len(tokenizer.tokenize(next_text)) + 1  # [SEP]
        segments_ids = len_1 * [0] + len_2 * [1]
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        # Predict: is next_text the next sentence?
        predictions = model(tokens_tensor, segments_tensors)
        predicts_text.append((predictions[0][0].item(), next_text))
    final_shuang = sorted(predicts_text, key=lambda x: x[0], reverse=True)
    if len(predicts_text) < 3:
        guess1 = " \n" + final_shuang[0][1]
        guess2 = "Then maybe: \n" + final_shuang[1][1]
        guess3 = ""  # fewer than three candidates, so the third guess is empty
        print("OH F**K")
    else:
        guess1 = final_shuang[0][1]
        guess2 = final_shuang[1][1]
        guess3 = final_shuang[2][1]
    return guess1, guess2, guess3
def run_bert_ns(data, year, predictions_dict):
    """
    Run the BERT LM_experiments for next sentence prediction.

    :param data: The actual data of the year, stored in a dictionary
    :param year: The corresponding year of the data. It is used when we save the predictions
    :param predictions_dict: A dict where we save the predictions from our experiments
    :return: The updated predictions_dict
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    vocab_size = len(tokenizer.vocab)
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    model.to('cuda')
    # Used to normalize the predicted probabilities of LM_experiments to [0, 1]
    soft = Softmax(dim=-1)
    for doc_id, doc in data.items():
        for peer_id, peer in doc['peer_summarizers'].items():
            summary = peer['system_summary']
            if not_valid(peer_id=peer_id, doc_id=doc_id):
                predictions_dict[year][doc_id][peer_id]['BERT_NS'] = vocab_size
                continue
            with torch.no_grad():
                if summary != '':
                    summary_sentences = sent_tokenize(summary)
                    tokenized_sentences = tokenize_sentences(
                        sentences=summary_sentences, tokenizer=tokenizer)
                    sentences_ids = convert_sentences(
                        sentences=tokenized_sentences, tokenizer=tokenizer)
                    log_probabilities = []
                    for i in range(len(sentences_ids) - 1):
                        indexed_tokens = sentences_ids[i] + sentences_ids[i + 1]
                        tokens_tensor = torch.tensor([indexed_tokens]).to('cuda')
                        segments_ids = [0] * len(sentences_ids[i]) + [1] * len(sentences_ids[i + 1])
                        segments_tensor = torch.tensor([segments_ids]).to('cuda')
                        # predict the next sentence and normalize the prediction
                        predictions = model(tokens_tensor, segments_tensor)
                        predictions = soft(predictions)
                        # [0][0] is the probability that the sentence actually follows
                        # [0][1] is the probability that it does not follow
                        p = predictions[0][0].item()
                        log_probabilities.append(math.log(p, 2))
                    if len(log_probabilities) != 0:
                        mean_of_probabilities = np.mean(np.array(log_probabilities))
                        perplexity = math.pow(2, -mean_of_probabilities)
                    else:
                        perplexity = math.pow(2, 0)  # The whole summary is one sentence
                    predictions_dict[year][doc_id][peer_id]['BERT_NS'] = perplexity
                else:
                    print('BLANK')
                    predictions_dict[year][doc_id][peer_id]['BERT_NS'] = vocab_size
    # Save the predictions in predictions_dict, which holds all the predictions of the experiments
    predictions_path = os.path.join(OUTPUT_DIR, 'predictions of models.json')
    with open(predictions_path, 'w') as of:
        json.dump(obj=predictions_dict, fp=of, sort_keys=True, indent=4)
    return predictions_dict
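# NOTE: tokenize_sentences() and convert_sentences() are not defined in this
# snippet. Minimal sketches of what run_bert_ns() appears to assume: wrap each
# sentence in [CLS]/[SEP] markers, then map tokens to vocabulary ids. The
# helper names and the exact marker placement are assumptions.
def tokenize_sentences(sentences, tokenizer):
    return [tokenizer.tokenize('[CLS] ' + s + ' [SEP]') for s in sentences]


def convert_sentences(sentences, tokenizer):
    return [tokenizer.convert_tokens_to_ids(tokens) for tokens in sentences]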