# Module-level imports required by the predict() variants below. InputExample
# comes from the accompanying NER utilities of the surrounding project;
# softmax is assumed to be scipy.special.softmax.
import numpy as np
import torch
from scipy.special import softmax
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm


# predict() variant that additionally returns per-word softmax distributions
# ("uncertainties") over the label set.
def predict(self, to_predict):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction.

    Returns:
        preds: A Python list of lists with dicts containing each word mapped to its NER tag.
        model_outputs: A python list of the raw model outputs for each text.
        uncertainties: A Python list of lists with one softmax distribution over the label set per word.
    """
    device = self.device
    model = self.model
    args = self.args
    pad_token_label_id = self.pad_token_label_id

    self._move_model_to_device()

    predict_examples = [
        InputExample(i, sentence.split(), ["O" for word in sentence.split()])
        for i, sentence in enumerate(to_predict)
    ]

    eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args["silent"]):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            # XLM and RoBERTa don't use segment_ids
            if args["model_type"] in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps

    # model_outputs shape: (number_of_sentences, max_seq_length, number_of_labels)
    model_outputs = preds
    # For each word in each sentence, find the index of the largest value, i.e. the most likely tag
    preds = np.argmax(preds, axis=2)

    # Dictionary mapping label id -> label string
    label_map = {i: label for i, label in enumerate(self.labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    # Indices in each row of out_label_ids that represent words (not padding/sub-tokens)
    relevant_indices = []
    # List of uncertainties for each word; each is the softmax likelihood of every tag
    uncertainties = []

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])
                # Remember this position as a word-level index
                relevant_indices.append(j)

    # Length (in words) of each predicted sentence
    sentence_lengths = [len(preds_list[i]) for i, sentence in enumerate(to_predict)]

    # Make a genuine copy of the relevant indices so the original list is not
    # mutated by the destructive slicing below
    all_idx = relevant_indices[:]

    # Split the flat index list into one list of word indices per sentence
    rel_idx_sentence_split = []
    for length in sentence_lengths:
        rel_idx_sentence_split.append(all_idx[:length])
        del all_idx[:length]

    # For each sentence, collect the softmax distribution over tags for each word
    for i in range(len(sentence_lengths)):
        sentence_uncertainty = []
        for j in range(sentence_lengths[i]):
            # Get the raw logits for the relevant word position
            model_output = model_outputs[i][rel_idx_sentence_split[i][j]]
            uncertainty = softmax(model_output)
            sentence_uncertainty.append(uncertainty)
        uncertainties.append(sentence_uncertainty)

    preds = [
        [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
        for i, sentence in enumerate(to_predict)
    ]

    return preds, model_outputs, uncertainties
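
# Usage sketch for the variant above (a sketch, not part of the original source:
# `model` is assumed to be an instance of the surrounding NER model class, whose
# construction is outside this snippet). It shows how the three return values
# line up: one {word: tag} dict per word in preds, raw token logits in
# model_outputs, and one softmax distribution over the label set per word in
# uncertainties.
def print_tags_with_confidence(model, sentences):
    preds, model_outputs, uncertainties = model.predict(sentences)
    for sentence_preds, sentence_dists in zip(preds, uncertainties):
        for word_tag, dist in zip(sentence_preds, sentence_dists):
            (word, tag), = word_tag.items()
            # The max of the softmax distribution is the probability the model
            # assigns to the predicted tag, usable as a crude confidence score.
            print(word, tag, float(max(dist)))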

# predict() variant returning (preds, model_outputs); the first label in
# self.labels is used as the dummy label for the prediction examples.
def predict(self, to_predict):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction.

    Returns:
        preds: A Python list of lists with dicts containing each word mapped to its NER tag.
        model_outputs: A python list of the raw model outputs for each text.
    """
    device = self.device
    model = self.model
    args = self.args
    pad_token_label_id = self.pad_token_label_id

    self._move_model_to_device()

    predict_examples = [
        InputExample(i, sentence.split(), [self.labels[0] for word in sentence.split()])
        for i, sentence in enumerate(to_predict)
    ]

    eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args["silent"]):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            # XLM and RoBERTa don't use segment_ids
            if args["model_type"] in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

    eval_loss = eval_loss / nb_eval_steps

    # Raw logits, shape (num_sentences, max_seq_length, num_labels)
    model_outputs = preds
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(self.labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    preds = [
        [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
        for i, sentence in enumerate(to_predict)
    ]

    return preds, model_outputs
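
# Sketch (an assumption drawn from the first variant, not part of the original
# source): this variant drops the inline uncertainty computation, but the same
# per-token distributions can be recovered from the returned raw logits.
# model_outputs is a numpy array of shape (num_sentences, max_seq_length,
# num_labels) indexed by sub-token position, so padding and sub-word rows must
# still be filtered by the caller (e.g. via the pad_token_label_id convention).
from scipy.special import softmax

def token_probabilities(model_outputs):
    # Softmax over the label axis turns each token's logits into a distribution.
    return softmax(model_outputs, axis=2)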

# predict() variant supporting pre-split input and returning per-word raw
# model outputs alongside the predicted tags.
def predict(self, to_predict, split_on_space=True):
    """
    Performs predictions on a list of text.

    Args:
        to_predict: A python list of text (str) to be sent to the model for prediction.
        split_on_space: If True, each sequence will be split by spaces for assigning labels.
                        If False, to_predict must be a list of lists, with the inner list being a
                        list of strings consisting of the split sequences. The outer list is the
                        list of sequences to predict on.

    Returns:
        preds: A Python list of lists with dicts containing each word mapped to its NER tag.
        model_outputs: A Python list of lists with dicts containing each word mapped to its list with raw model output.
    """  # noqa: ignore flake8

    device = self.device
    model = self.model
    args = self.args
    pad_token_label_id = self.pad_token_label_id

    self._move_model_to_device()

    if split_on_space:
        predict_examples = [
            InputExample(i, sentence.split(), [self.args.labels_list[0] for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]
    else:
        predict_examples = [
            InputExample(i, sentence, [self.args.labels_list[0] for word in sentence])
            for i, sentence in enumerate(to_predict)
        ]

    eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    for batch in tqdm(eval_dataloader, disable=args.silent, desc="Running Prediction"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            # XLM and RoBERTa don't use segment_ids
            if args.model_type in ["bert", "xlnet"]:
                inputs["token_type_ids"] = batch[2]
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

            eval_loss += tmp_eval_loss.mean().item()

        nb_eval_steps += 1

        if preds is None:
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
            out_input_ids = inputs["input_ids"].detach().cpu().numpy()
            out_attention_mask = inputs["attention_mask"].detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
            out_input_ids = np.append(out_input_ids, inputs["input_ids"].detach().cpu().numpy(), axis=0)
            out_attention_mask = np.append(
                out_attention_mask,
                inputs["attention_mask"].detach().cpu().numpy(),
                axis=0,
            )

    eval_loss = eval_loss / nb_eval_steps
    # Keep the raw logits for word-level regrouping below
    token_logits = preds
    preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(self.args.labels_list)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    if split_on_space:
        preds = [
            [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
            for i, sentence in enumerate(to_predict)
        ]
    else:
        preds = [
            [{word: preds_list[i][j]} for j, word in enumerate(sentence[: len(preds_list[i])])]
            for i, sentence in enumerate(to_predict)
        ]

    # Re-group the raw sub-token logits into per-word logits
    word_tokens = []
    for n, sentence in enumerate(to_predict):
        w_log = self._convert_tokens_to_word_logits(
            out_input_ids[n],
            out_label_ids[n],
            out_attention_mask[n],
            token_logits[n],
        )
        word_tokens.append(w_log)

    if split_on_space:
        model_outputs = [
            [{word: word_tokens[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
            for i, sentence in enumerate(to_predict)
        ]
    else:
        model_outputs = [
            [{word: word_tokens[i][j]} for j, word in enumerate(sentence[: len(preds_list[i])])]
            for i, sentence in enumerate(to_predict)
        ]

    return preds, model_outputs
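
# Usage sketch for the split_on_space variant (a sketch, not part of the
# original source; it assumes, per the docstring above, that each model_outputs
# entry maps a word to the list of raw logit rows of its sub-tokens). A per-word
# probability distribution can be derived by softmaxing the first sub-token's
# logits, the conventional representative for word-level tagging.
import numpy as np
from scipy.special import softmax

def word_probabilities(model_outputs):
    probs = []
    for sentence in model_outputs:
        sentence_probs = []
        for word_output in sentence:
            (word, logits), = word_output.items()
            # logits holds one row of label logits per sub-token of the word;
            # take the first sub-token's row as the word's representative.
            sentence_probs.append({word: softmax(np.asarray(logits[0]))})
        probs.append(sentence_probs)
    return probs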