def start_inference(data, dialogue_type, dest, batchsize, bert_model, cuda):
    """Run BERT next-sentence prediction over a CSV and write one score per row.

    The CSV at ``data`` is streamed in chunks of ``batchsize`` rows. Each chunk
    is converted to samples with ``get_batch``, featurised, pushed through the
    model on the GPU, and the softmax probability of class index 1 is written
    to ``dest``, one value per line, in input order.

    Args:
        data: Path to the input CSV. It must contain an ``id`` column (used
            only to estimate the number of batches for the progress bar), a
            ``context`` column and the response column selected by
            ``dialogue_type``.
        dialogue_type: ``'DB'`` (reads column ``db_response_new``) or
            ``'normal'`` (reads column ``response``).
        dest: Path of the output text file. It is truncated before writing.
        batchsize: Number of CSV rows per model forward pass.
        bert_model: Model name or path understood by ``from_pretrained``.
        cuda: Currently unused; the model is always placed on the default
            CUDA device via ``model.cuda()``.

    Raises:
        AssertionError: If CUDA is unavailable, or if ``get_batch`` returns a
            different number of samples than there are rows in the chunk.
        KeyError: If ``dialogue_type`` is not ``'DB'`` or ``'normal'``.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if not torch.cuda.is_available():
        raise AssertionError('PyTorch not running on GPU! #sadpanda')

    # Make repeated runs reproducible.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(100)

    dialogue_type_dict = {'DB': 'db_response_new', 'normal': 'response'}
    response_col = dialogue_type_dict[dialogue_type]

    config = BertConfig.from_pretrained(bert_model)
    tokenizer = BertTokenizer.from_pretrained(bert_model)
    # Load the checkpoint weights. Constructing the model from the config
    # alone would leave every parameter randomly initialised, making the
    # predicted probabilities meaningless.
    model = BertForNextSentencePrediction.from_pretrained(bert_model, config=config)
    model.cuda()
    model.eval()

    # Count non-null ids only to give tqdm a total; the frame is not kept.
    row_count = pd.read_csv(data, usecols=['id']).dropna().shape[0]
    chunk_count = math.ceil(row_count / batchsize)

    cols = ['context', response_col]
    # Pass the path (not an open handle) so pandas owns and closes the file.
    reader = pd.read_csv(data, usecols=cols, chunksize=batchsize)

    # Opening with 'w' truncates any previous output; the handle is reused
    # for all batches instead of reopening the file on every iteration.
    with open(dest, 'w') as out:
        for chunk in tqdm(reader, desc='Batches', total=chunk_count):
            samples = get_batch(chunk, response_col)
            if len(samples) != chunk.shape[0]:
                raise AssertionError('Some samples went missing!')

            if batchsize == 1:
                results = convert_single_example_to_features(samples, tokenizer)
            else:
                results = convert_examples_to_features(samples, tokenizer)

            with torch.no_grad():
                input_ids = torch.tensor([x.input_ids for x in results]).cuda()
                token_type_ids = torch.tensor([x.input_type_ids for x in results]).cuda()
                attention_mask = torch.tensor([x.input_mask for x in results]).cuda()
                logits = model(input_ids,
                               token_type_ids=token_type_ids,
                               attention_mask=attention_mask)[0]
                db_probs = torch.softmax(logits, dim=1)[:, 1]

            out.write('\n'.join(str(p) for p in db_probs.tolist()) + '\n')
            # Keep partial results visible on disk while a long run progresses.
            out.flush()
def create_and_check_bert_for_next_sequence_prediction(
        self, config, input_ids, token_type_ids, input_mask,
        sequence_labels, token_labels, choice_labels):
    """Build a next-sentence-prediction model and verify its outputs.

    Instantiates ``BertForNextSentencePrediction`` from ``config``, runs it in
    eval mode on the given inputs with ``sequence_labels`` as the fourth
    positional argument, then checks that the relationship scores have size
    ``[self.batch_size, 2]`` and hands the results to
    ``self.check_loss_output``. ``token_labels`` and ``choice_labels`` are
    accepted but not used here.
    """
    nsp_model = BertForNextSentencePrediction(config=config)
    nsp_model.eval()

    # The model call is expected to yield exactly (loss, scores).
    loss, seq_relationship_score = nsp_model(
        input_ids, token_type_ids, input_mask, sequence_labels)

    result = dict(loss=loss, seq_relationship_score=seq_relationship_score)

    actual_size = list(result["seq_relationship_score"].size())
    expected_size = [self.batch_size, 2]
    self.parent.assertListEqual(actual_size, expected_size)

    self.check_loss_output(result)
# Scratch script: sets up a BERT next-sentence-prediction model on the GPU,
# loads a validation CSV, and defines helpers for turning rows into samples.
import torch
from pytorch_transformers import BertConfig, BertTokenizer, BertForNextSentencePrediction
import numpy as np
import pandas as pd

config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): the model is constructed from the config only; no
# ``from_pretrained`` call for the weights is visible here — confirm that
# running with freshly initialised weights is intended.
model = BertForNextSentencePrediction(config)
model.eval()
model.cuda()

df = pd.read_csv('breaker-of-dialogues/validation_db.csv')
# Not referenced anywhere in the visible part of this script.
max_word_count = 550


class SampleType:
    """Plain container for one sentence-pair sample."""

    # Class-level defaults; get_batch overwrites them per instance.
    text_a = ''
    text_b = None
    unique_id = 0


def get_batch(df):
    """Build one ``SampleType`` per row of ``df``.

    Each sample takes its ``unique_id`` from the row's ``id`` field, while
    ``text_a`` and ``text_b`` are set to fixed placeholder sentences rather
    than values read from the row.

    NOTE(review): no ``return`` statement is visible after the loop in this
    view — the definition may continue past this point; confirm.
    """
    samples = []
    for _, row in df.iterrows():
        temp_sample = SampleType()
        temp_sample.unique_id = row.id
        # Hard-coded text; the row's own columns are not consulted here.
        temp_sample.text_a = 'hello my name is lionel messi'
        temp_sample.text_b = 'and I play football'
        samples.append(temp_sample)