def predictor(sentence1, sentence2):
    text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]"
    # Segment ids: 0 for sentence1 plus [CLS]/[SEP], 1 for sentence2 plus the final [SEP]
    ids1 = [0] * (len(tokenizer.tokenize(sentence1)) + 2)
    ids2 = [1] * (len(tokenizer.tokenize(sentence2)) + 1)
    ids1.extend(ids2)
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segments_ids = ids1
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])
    model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
    model.eval()
    tokens_tensor = tokens_tensor.to('cuda')
    segments_tensors = segments_tensors.to('cuda')
    model.to('cuda')
    # Predict the next sentence classification logits
    with torch.no_grad():
        next_sent_classif_logits = model(tokens_tensor, segments_tensors)
    ret = torch.softmax(next_sent_classif_logits[0], dim=1).cpu()
    # ret[0][0] is the probability that sentence2 follows sentence1
    return ret[0][0].item()
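# A minimal usage sketch for predictor(); it assumes `torch`, `tokenizer`, and
# `BertForNextSentencePrediction` from the setup block below are already in
# scope, and the sentence pair is purely illustrative.
score = predictor(
    "He went to the store.",
    "He bought a gallon of milk.",
)
print(score)  # probability that the second sentence follows the first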
import torch
from pytorch_transformers import BertForNextSentencePrediction
from pytorch_transformers import BertTokenizer
from torch.nn.functional import cosine_similarity
from torch.nn.functional import softmax
from torch.nn.utils.rnn import pad_sequence

BERT_MODEL_VERSION = 'bert-base-uncased'
MAX_SENTENCE_LENGTH = 512

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_VERSION)
model = BertForNextSentencePrediction.from_pretrained(
    BERT_MODEL_VERSION,
    output_hidden_states=True,
)
model.eval()
if torch.cuda.is_available():
    model.cuda()


def calculate_similarities(
    query_embedding,
    document_embeddings,
):
    # Cosine similarity between the query embedding and each document embedding
    return cosine_similarity(
        query_embedding,
        document_embeddings,
        dim=1,
    )
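# A hedged usage sketch for calculate_similarities(): the embeddings below are
# random stand-ins for real sentence vectors (e.g., pooled BERT hidden states);
# the shapes are illustrative, not prescribed by the code above.
query_embedding = torch.randn(1, 768)        # one query vector
document_embeddings = torch.randn(10, 768)   # ten document vectors
similarities = calculate_similarities(query_embedding, document_embeddings)
print(similarities.shape)  # torch.Size([10]) -- one score per document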
print("文章2を入力してください") sentence2 = input() text = "[CLS] " + sentence1 + " [SEP] " + sentence2 + " [SEP]" ids1 = [0] *(len(tokenizer.tokenize(sentence1)) + 2) ids2 = [1] *(len(tokenizer.tokenize(sentence2)) + 1) ids1.extend(ids2) tokenized_text = tokenizer.tokenize(text) indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text) segments_ids = ids1 tokens_tensor = torch.tensor([indexed_tokens]) segments_tensors = torch.tensor([segments_ids]) model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') model.eval() tokens_tensor = tokens_tensor.to('cuda') segments_tensors = segments_tensors.to('cuda') model.to('cuda') # Predict the next sentence classification logits with torch.no_grad(): next_sent_classif_logits = model(tokens_tensor, segments_tensors) a = torch.softmax(next_sent_classif_logits[0], dim=1) print(torch.softmax(next_sent_classif_logits[0], dim=1)) print(torch.softmax(next_sent_classif_logits[0].cpu(), dim=1)) print(a.cpu()) #a.item()
import logging

import torch
import torch.nn.functional as F
from pytorch_transformers import BertForNextSentencePrediction, BertTokenizer

logger = logging.getLogger(__name__)


def test(texts_a, texts_b, tokenizer, model):
    # (The original fragment begins inside the function body; the signature and
    # the manual [CLS]/[SEP] batch construction below are reconstructed
    # assumptions, mirroring the encoding used in the other snippets.)
    tokens = (['[CLS]'] + tokenizer.tokenize(texts_a[0]) + ['[SEP]']
              + tokenizer.tokenize(texts_b[0]) + ['[SEP]'])
    seg_a_len = len(tokenizer.tokenize(texts_a[0])) + 2
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    token_type_ids = [0] * seg_a_len + [1] * (len(tokens) - seg_a_len)
    attention_mask = [1] * len(tokens)
    batch = [torch.tensor([input_ids]).to('cuda'),
             torch.tensor([attention_mask]).to('cuda'),
             torch.tensor([token_type_ids]).to('cuda')]
    inputs = {
        'input_ids': batch[0],
        'attention_mask': batch[1],
        'token_type_ids': batch[2]
    }
    outputs = model(**inputs)
    # logger.info(outputs)
    # logger.info(torch.argmax(outputs[0], dim=-1))
    # Binary classification: label 0 means texts_b is the next sentence
    if torch.argmax(F.softmax(outputs[0], dim=-1), dim=-1).item() == 0:
        logger.info([texts_a, texts_b])
        logger.info(F.softmax(outputs[0], dim=-1))
        return [texts_b, F.softmax(outputs[0], dim=-1)]


if __name__ == "__main__":
    import pickle
    import pandas as pd

    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    model = BertForNextSentencePrediction.from_pretrained('out/')
    model.to('cuda')
    break_tag = pd.read_excel(
        '/data/jh/notebooks/wanglei/1688/data/break_tag.xlsx', header=None)
    break_tag = list(break_tag[0])
    ret = []
    for tag in break_tag:
        # Sample text: "revealing a delicate collarbone and a graceful swan neck"
        text_a = ['露出精致的锁骨和优美的天鹅颈']
        text_b = [tag]
        ret.append(test(text_a, text_b, tokenizer, model))
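# A hedged follow-up sketch: test() returns None whenever the model predicts
# "not next sentence", so the collected results can be filtered like this.
matched = [r for r in ret if r is not None]
for texts_b, probs in matched:
    print(texts_b, probs.tolist())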