Example 1
    def predict(self, to_predict):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A Python list of text (str) to be sent to the model for prediction.

        Returns:
            preds: A Python list of lists with dicts containing each word mapped to its NER tag.
            model_outputs: A Python list of the raw model outputs for each text.
            uncertainties: A Python list of lists giving, for each word, the softmax likelihood of each tag.
        """

        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id

        self._move_model_to_device()

        predict_examples = [
            InputExample(i, sentence.split(), ["O" for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]

        eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args["silent"]):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                # XLM and RoBERTa don't use segment_ids
                if args["model_type"] in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps

        # model_outputs shape: (number_of_sentences, sequence_length, number_of_labels)
        model_outputs = preds

        # For each word in each sentence, find the index of the largest value, i.e. the most likely tag
        preds = np.argmax(preds, axis=2)

        # Map from label id to label string
        label_map = {i: label for i, label in enumerate(self.labels)}
        
        # Per-sentence label lists: `out_label_list` collects the dummy input
        # labels, `preds_list` the predicted tags, skipping padded positions
        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        # Indices of the positions in `out_label_ids` that correspond to real
        # words (padding and sub-word positions are excluded)
        relevant_indices = []

        # Per-word uncertainties: the softmax likelihood of each tag
        uncertainties = []

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])
                    relevant_indices.append(j)

        # Length (in words) of each sentence
        sentence_lengths = [len(preds_list[i]) for i in range(len(to_predict))]

        # Split the flat index list into one list of word positions per
        # sentence, working on a copy so `relevant_indices` stays intact
        all_idx = relevant_indices.copy()
        relevant_idx_per_sentence = []
        for length in sentence_lengths:
            relevant_idx_per_sentence.append(all_idx[:length])
            del all_idx[:length]

        # For every word in every sentence, turn the raw logits into a
        # probability distribution over the tag set
        for i in range(len(sentence_lengths)):
            sentence_uncertainties = []
            for j in range(sentence_lengths[i]):
                word_logits = model_outputs[i][relevant_idx_per_sentence[i][j]]
                # softmax is assumed to be scipy.special.softmax (import not shown)
                sentence_uncertainties.append(softmax(word_logits))
            uncertainties.append(sentence_uncertainties)
        
    
        preds = [
            [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
            for i, sentence in enumerate(to_predict)
        ]
    
        return preds, model_outputs, uncertainties
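
The snippet never shows where `softmax` comes from; a reasonable assumption is `scipy.special.softmax`. The standalone sketch below reproduces the post-processing step on dummy logits under that assumption, with an illustrative label list standing in for `self.labels`:

    import numpy as np
    from scipy.special import softmax  # assumed source of the `softmax` used above

    labels = ["O", "B-PER", "I-PER"]  # illustrative stand-in for self.labels
    label_map = {i: label for i, label in enumerate(labels)}

    # Fake raw model output for one sentence with two word positions:
    # shape (sequence_length, number_of_labels)
    logits = np.array([[2.0, 0.1, -1.0],
                       [0.2, 3.0, 0.5]])

    tags = [label_map[i] for i in np.argmax(logits, axis=1)]  # most likely tag per word
    probs = softmax(logits, axis=1)  # per-word likelihood of each tag

    print(tags)      # ['O', 'B-PER']
    print(probs[0])  # uncertainty distribution for the first word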
Example 2
    def predict(self, to_predict):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A Python list of text (str) to be sent to the model for prediction.

        Returns:
            preds: A Python list of lists with dicts containing each word mapped to its NER tag.
            model_outputs: A Python list of the raw model outputs for each text.
        """

        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id

        self._move_model_to_device()

        predict_examples = [
            InputExample(i, sentence.split(),
                         [self.labels[0] for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]

        eval_dataset = self.load_and_cache_examples(
            None, to_predict=predict_examples)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset,
                                     sampler=eval_sampler,
                                     batch_size=args["eval_batch_size"])

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args["silent"]):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                # XLM and RoBERTa don't use segment_ids
                if args["model_type"] in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)

        eval_loss = eval_loss / nb_eval_steps
        model_outputs = preds
        preds = np.argmax(preds, axis=2)

        label_map = {i: label for i, label in enumerate(self.labels)}

        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        preds = [[{
            word: preds_list[i][j]
        } for j, word in enumerate(sentence.split()[:len(preds_list[i])])]
                 for i, sentence in enumerate(to_predict)]

        return preds, model_outputs
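
A standalone sketch of how this variant packages its return value: each word of the input sentence is paired with its predicted tag, exactly as in the final list comprehension above. The sentence and tags are illustrative:

    to_predict = ["John lives in Paris"]
    preds_list = [["B-PER", "O", "O", "B-LOC"]]  # illustrative per-word tags

    preds = [[{
        word: preds_list[i][j]
    } for j, word in enumerate(sentence.split()[:len(preds_list[i])])]
             for i, sentence in enumerate(to_predict)]

    print(preds)
    # [[{'John': 'B-PER'}, {'lives': 'O'}, {'in': 'O'}, {'Paris': 'B-LOC'}]]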
Example 3
    def predict(self, to_predict, split_on_space=True):
        """
        Performs predictions on a list of text.

        Args:
            to_predict: A python list of text (str) to be sent to the model for prediction.
            split_on_space: If True, each sequence will be split on spaces before labels are assigned.
                            If False, to_predict must be a list of lists, with each inner list being a
                            list of strings (the pre-split sequence). The outer list is the list of
                            sequences to predict on.

        Returns:
            preds: A Python list of lists with dicts containing each word mapped to its NER tag.
            model_outputs: A Python list of lists with dicts containing each word mapped to its list with raw model output.
        """  # noqa: ignore flake8"

        device = self.device
        model = self.model
        args = self.args
        pad_token_label_id = self.pad_token_label_id

        self._move_model_to_device()

        if split_on_space:
            predict_examples = [
                InputExample(i, sentence.split(), [self.args.labels_list[0] for word in sentence.split()])
                for i, sentence in enumerate(to_predict)
            ]
        else:
            predict_examples = [
                InputExample(i, sentence, [self.args.labels_list[0] for word in sentence])
                for i, sentence in enumerate(to_predict)
            ]

        eval_dataset = self.load_and_cache_examples(None, to_predict=predict_examples)

        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        eval_loss = 0.0
        nb_eval_steps = 0
        preds = None
        out_label_ids = None
        model.eval()

        for batch in tqdm(eval_dataloader, disable=args.silent, desc="Running Prediction"):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }
                # XLM and RoBERTa don't use segment_ids
                if args.model_type in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]

                eval_loss += tmp_eval_loss.mean().item()

            nb_eval_steps += 1

            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
                out_input_ids = inputs["input_ids"].detach().cpu().numpy()
                out_attention_mask = inputs["attention_mask"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
                out_input_ids = np.append(out_input_ids, inputs["input_ids"].detach().cpu().numpy(), axis=0)
                out_attention_mask = np.append(
                    out_attention_mask, inputs["attention_mask"].detach().cpu().numpy(), axis=0,
                )

        eval_loss = eval_loss / nb_eval_steps
        token_logits = preds
        preds = np.argmax(preds, axis=2)

        label_map = {i: label for i, label in enumerate(self.args.labels_list)}

        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
        preds_list = [[] for _ in range(out_label_ids.shape[0])]

        for i in range(out_label_ids.shape[0]):
            for j in range(out_label_ids.shape[1]):
                if out_label_ids[i, j] != pad_token_label_id:
                    out_label_list[i].append(label_map[out_label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        if split_on_space:
            preds = [
                [{word: preds_list[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
                for i, sentence in enumerate(to_predict)
            ]
        else:
            preds = [
                [{word: preds_list[i][j]} for j, word in enumerate(sentence[: len(preds_list[i])])]
                for i, sentence in enumerate(to_predict)
            ]

        word_tokens = []
        for n, sentence in enumerate(to_predict):
            w_log = self._convert_tokens_to_word_logits(
                out_input_ids[n], out_label_ids[n], out_attention_mask[n], token_logits[n],
            )
            word_tokens.append(w_log)

        if split_on_space:
            model_outputs = [
                [{word: word_tokens[i][j]} for j, word in enumerate(sentence.split()[: len(preds_list[i])])]
                for i, sentence in enumerate(to_predict)
            ]
        else:
            model_outputs = [
                [{word: word_tokens[i][j]} for j, word in enumerate(sentence[: len(preds_list[i])])]
                for i, sentence in enumerate(to_predict)
            ]

        return preds, model_outputs
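
A standalone sketch of what `split_on_space` changes: with the default, each sequence is a space-separated string; with `split_on_space=False`, it is already a list of words. Both forms below produce the same output; the tags are illustrative:

    preds_list = [["B-PER", "O", "O", "B-LOC"]]  # illustrative per-word tags

    def package(to_predict, split_on_space):
        # Mirrors the two branches that build `preds` in the method above
        return [
            [
                {word: preds_list[i][j]}
                for j, word in enumerate(
                    (sentence.split() if split_on_space else sentence)[: len(preds_list[i])]
                )
            ]
            for i, sentence in enumerate(to_predict)
        ]

    print(package(["John lives in Paris"], split_on_space=True))
    print(package([["John", "lives", "in", "Paris"]], split_on_space=False))
    # Both print: [[{'John': 'B-PER'}, {'lives': 'O'}, {'in': 'O'}, {'Paris': 'B-LOC'}]]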