Example #1
    def validation_epoch_end(self, outputs):
        if self.testing:
            prefix = 'test'
        else:
            prefix = 'val'

        avg_loss = torch.stack([x[f'{prefix}_loss'] for x in outputs]).mean()

        unique_ids = torch.cat([x[f'{prefix}_tensors']['unique_ids'] for x in outputs])
        start_logits = torch.cat([x[f'{prefix}_tensors']['start_logits'] for x in outputs])
        end_logits = torch.cat([x[f'{prefix}_tensors']['end_logits'] for x in outputs])

        all_unique_ids = []
        all_start_logits = []
        all_end_logits = []
        if torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            for ind in range(world_size):
                all_unique_ids.append(torch.empty_like(unique_ids))
                all_start_logits.append(torch.empty_like(start_logits))
                all_end_logits.append(torch.empty_like(end_logits))
            torch.distributed.all_gather(all_unique_ids, unique_ids)
            torch.distributed.all_gather(all_start_logits, start_logits)
            torch.distributed.all_gather(all_end_logits, end_logits)
        else:
            all_unique_ids.append(unique_ids)
            all_start_logits.append(start_logits)
            all_end_logits.append(end_logits)

        exact_match, f1, all_predictions, all_nbest = -1, -1, [], []
        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:

            unique_ids = []
            start_logits = []
            end_logits = []
            for u in all_unique_ids:
                unique_ids.extend(tensor2list(u))
            for u in all_start_logits:
                start_logits.extend(tensor2list(u))
            for u in all_end_logits:
                end_logits.extend(tensor2list(u))

            eval_dataset = self._test_dl.dataset if self.testing else self._validation_dl.dataset
            exact_match, f1, all_predictions, all_nbest = eval_dataset.evaluate(
                unique_ids=unique_ids,
                start_logits=start_logits,
                end_logits=end_logits,
                n_best_size=self._cfg.dataset.n_best_size,
                max_answer_length=self._cfg.dataset.max_answer_length,
                version_2_with_negative=self._cfg.dataset.version_2_with_negative,
                null_score_diff_threshold=self._cfg.dataset.null_score_diff_threshold,
                do_lower_case=self._cfg.dataset.do_lower_case,
            )

        logging.info(f"{prefix} exact match {exact_match}")
        logging.info(f"{prefix} f1 {f1}")

        self.log(f'{prefix}_loss', avg_loss)
        self.log(f'{prefix}_exact_match', exact_match)
        self.log(f'{prefix}_f1', f1)
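
Example #1 gathers per-rank evaluation tensors with torch.distributed.all_gather before computing metrics on rank 0. Below is a minimal, self-contained sketch of that gather pattern; the single-process "gloo" group is used purely for illustration.

import os
import torch
import torch.distributed as dist

def gather_across_ranks(t: torch.Tensor) -> list:
    # one buffer per rank, filled in-place by all_gather;
    # falls back to [t] when no process group is initialized
    if not dist.is_initialized():
        return [t]
    buckets = [torch.empty_like(t) for _ in range(dist.get_world_size())]
    dist.all_gather(buckets, t)
    return buckets

if __name__ == "__main__":
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)
    print(gather_across_ranks(torch.arange(4)))  # [tensor([0, 1, 2, 3])]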
Example #2
    def _infer(self, sents: List[str]) -> List[str]:
        """ Main function for Inference

        Args:
            sents: A list of input sentences (lowercase spoken-domain words separated by space).

        Returns:
            all_preds: A list of tab-separated text records, same size as the input list. Each record consists of 5 items:
                - final output text
                - input words
                - tags predicted for input words
                - tags after swap preprocessing
                - semiotic labels predicted for input words
        """

        # all input sentences go into one batch
        dataloader_cfg = {
            "batch_size": len(sents),
            "num_workers": 3,
            "pin_memory": False
        }
        infer_datalayer = self._setup_infer_dataloader(dataloader_cfg, sents)

        batch = next(iter(infer_datalayer))
        input_ids, input_mask, segment_ids = batch

        tag_logits, semiotic_logits = self.forward(
            input_ids=input_ids.to(self.device),
            input_mask=input_mask.to(self.device),
            segment_ids=segment_ids.to(self.device),
        )

        all_preds = []
        for i, sent in enumerate(sents):
            example = self.builder.build_bert_example(source=sent, infer=True)
            tag_preds = tensor2list(torch.argmax(tag_logits[i], dim=-1))
            semiotic_preds = tensor2list(
                torch.argmax(semiotic_logits[i], dim=-1))

            # this mask is required by get_token_labels
            example.features["labels_mask"] = [0] + [1] * (len(semiotic_preds) - 2) + [0]
            example.features["tag_labels"] = tag_preds
            example.features["semiotic_labels"] = semiotic_preds
            tags = [
                self.id_2_tag[label_id]
                for label_id in example.get_token_labels("tag_labels")
            ]
            semiotic_labels = [
                self.id_2_semiotic[label_id]
                for label_id in example.get_token_labels("semiotic_labels")
            ]

            prediction, inp_str, tag_str, tag_with_swap_str = example.editing_task.realize_output(
                tags, semiotic_labels)
            all_preds.append(prediction + "\t" + inp_str + "\t" + tag_str +
                             "\t" + tag_with_swap_str + "\t" +
                             " ".join(semiotic_labels))

        return all_preds
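
These examples all rely on a tensor2list helper that is not shown. A minimal stand-in, assuming it simply detaches the tensor, moves it to CPU, and converts it to nested Python lists:

import torch

def tensor2list(tensor: torch.Tensor) -> list:
    # detach from the graph, move to host memory, convert to plain lists
    return tensor.detach().cpu().tolist()

print(tensor2list(torch.tensor([[1, 2], [3, 4]])))  # [[1, 2], [3, 4]]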
Example #3
    def test_epoch_end(self, outputs):
        unique_ids = tensor2list(
            torch.cat([x['test_tensors']['unique_ids'] for x in outputs]))
        logits = torch.cat([x['test_tensors']['logits'] for x in outputs])
        # split the joint (start, end) logits along the last dim; squeeze(-1)
        # keeps the leading dims intact even when they happen to equal 1
        s, e = logits.split(dim=-1, split_size=1)
        start_logits = tensor2list(s.squeeze(-1))
        end_logits = tensor2list(e.squeeze(-1))
        all_predictions, all_nbest, scores_diff = self.test_dataset.get_predictions(
            unique_ids=unique_ids,
            start_logits=start_logits,
            end_logits=end_logits,
            n_best_size=self._cfg.test_ds.n_best_size,
            max_answer_length=self._cfg.test_ds.max_answer_length,
            version_2_with_negative=self._cfg.dataset.version_2_with_negative,
            null_score_diff_threshold=self._cfg.test_ds.null_score_diff_threshold,
            do_lower_case=self._cfg.dataset.do_lower_case,
        )

        if self._cfg.test_ds.output_nbest_file is not None:
            with open(self._cfg.test_ds.output_nbest_file, "w") as writer:
                writer.write(json.dumps(all_nbest, indent=4) + "\n")
        if self._cfg.test_ds.output_prediction_file is not None:
            with open(self._cfg.test_ds.output_prediction_file, "w") as writer:
                writer.write(json.dumps(all_predictions, indent=4) + "\n")
        return {}
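
The split/squeeze step above separates joint (start, end) logits. A tiny standalone illustration, assuming a logits shape of (batch, seq_len, 2):

import torch

logits = torch.randn(4, 16, 2)                 # (batch, seq_len, start/end)
s, e = logits.split(split_size=1, dim=-1)      # each (4, 16, 1)
start_logits, end_logits = s.squeeze(-1), e.squeeze(-1)
print(start_logits.shape, end_logits.shape)    # torch.Size([4, 16]) twice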
Example #4
    def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
        """
        Called at the end of validation to aggregate outputs.
        outputs: list of individual outputs of each validation step.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        preds = torch.cat([x['eval_tensors']['preds'] for x in outputs])
        labels = torch.cat([x['eval_tensors']['labels'] for x in outputs])

        all_preds = []
        all_labels = []
        if torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            for ind in range(world_size):
                all_preds.append(torch.empty_like(preds))
                all_labels.append(torch.empty_like(labels))
            torch.distributed.all_gather(all_preds, preds)
            torch.distributed.all_gather(all_labels, labels)
        else:
            all_preds.append(preds)
            all_labels.append(labels)

        tensorboard_logs = {}
        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
            preds = []
            labels = []
            for p in all_preds:
                preds.extend(tensor2list(p))
            for l in all_labels:
                labels.extend(tensor2list(l))

            tensorboard_logs = compute_metrics(self.task_name, np.array(preds),
                                               np.array(labels))
            val_name = self._validation_names[dataloader_idx].upper()
            logging.info(f'{val_name} evaluation: {tensorboard_logs}')

            # write labels and predictions to a file if output_dir is specified in the config
            output_dir = self._cfg.output_dir
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                filename = os.path.join(output_dir,
                                        f'{self.task_name}_{val_name}.txt')
                logging.info(f'Saving labels and predictions to {filename}')
                with open(filename, 'w') as f:
                    f.write('labels\t' + list2str(labels) + '\n')
                    f.write('preds\t' + list2str(preds) + '\n')

        tensorboard_logs['val_loss'] = avg_loss
        for key in tensorboard_logs:
            self.log(f'{key}', tensorboard_logs[key], prog_bar=True)

        return {'val_loss': avg_loss, 'log': tensorboard_logs}
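
list2str, used when writing the labels/preds file above, is another helper that is not shown; it presumably just space-joins the values. A hedged stand-in:

def list2str(values: list) -> str:
    # space-separated string, one token per label/prediction
    return ' '.join(str(v) for v in values)

print(list2str([1, 0, 2]))  # "1 0 2"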
Example #5
def calc_class_weights_from_dataloader(
        dataloader: 'torch.utils.data.DataLoader', num_classes: int,
        data_dir: str) -> List[float]:
    """
    Calculate the weights of each class to be used for weighted loss. This is similar to the function calc_class_weights
    in text_classification_dataset, but it gets the labels from a dataloader rather than from a file.
    Args:
        dataloader: the dataloader for the training set
        num_classes: number of classes in the dataset
        data_dir: directory where the label frequency stats file is written
    """
    labels = []
    for batch in dataloader:
        labels.extend(tensor2list(batch[-1]))
    logging.info(f'Calculating label frequency stats...')
    total_sents, sent_label_freq, max_id = get_label_stats(
        labels, os.path.join(data_dir, 'sentence_stats.tsv'), verbose=False)
    if max_id >= num_classes:
        raise ValueError(
            f'Found an invalid label {max_id}! Labels should be in [0, {num_classes - 1}].'
        )

    class_weights_dict = get_freq_weights(sent_label_freq)

    logging.info(f'Total Sentence Pairs: {total_sents}')
    logging.info(f'Class Frequencies: {sent_label_freq}')
    logging.info(f'Class Weights: {class_weights_dict}')
    class_weights = fill_class_weights(weights=class_weights_dict,
                                       max_id=num_classes - 1)
    return class_weights
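
get_freq_weights and fill_class_weights are not shown. Here is a self-contained sketch of the inverse-frequency weighting this helper plausibly computes; the exact NeMo formula may differ:

from collections import Counter
from typing import List

def inverse_freq_weights(labels: List[int], num_classes: int) -> List[float]:
    freq = Counter(labels)
    total = len(labels)
    # weight_c = total / (num_classes * count_c); unseen classes get 0.0
    return [total / (num_classes * freq[c]) if freq[c] else 0.0
            for c in range(num_classes)]

print(inverse_freq_weights([0, 0, 1, 2, 2, 2], num_classes=3))  # [1.0, 2.0, 0.666...]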
Example #6
    def _infer(self, queries: List[str], batch_size: int = None) -> List[int]:
        """
        Get prediction for the queries
        Args:
            queries: text sequences
            batch_size: batch size to use during inference.
        Returns:
            all_preds: model predictions
        """
        # store predictions for all queries in a single list
        all_preds = []
        mode = self.training
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            # Switch model to evaluation mode
            self.eval()
            self.to(device)
            infer_datalayer = self._setup_infer_dataloader(queries, batch_size)

            for batch in infer_datalayer:
                input_ids, input_type_ids, input_mask, subtokens_mask = batch

                logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                subtokens_mask = subtokens_mask > 0.5
                preds = tensor2list(torch.argmax(logits, axis=-1)[subtokens_mask])
                all_preds.extend(preds)
        finally:
            # set mode back to its original value
            self.train(mode=mode)
        return all_preds
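
The try/finally idiom above (save self.training, switch to eval, restore afterwards) recurs throughout these examples. As a tiny standalone sketch:

import torch

model = torch.nn.Linear(4, 2)
was_training = model.training
try:
    model.eval()
    with torch.no_grad():
        out = model(torch.randn(1, 4))
finally:
    model.train(mode=was_training)  # restore whatever mode the caller had
print(out.shape, model.training)    # torch.Size([1, 2]) True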
Example #7
    def predict_from_examples(self, queries: List[str],
                              test_ds) -> Tuple[List[str], List[str]]:
        """
        Get prediction for the queries (intent and slots)
        Args:
            queries: text sequences
            test_ds: Dataset configuration section.
        Returns:
            predicted_intents, predicted_slots: model intent and slot predictions
        """

        predicted_intents = []
        predicted_slots = []
        mode = self.training

        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Switch model to evaluation mode
        self.eval()
        self.to(device)

        # Dataset.
        infer_datalayer = self._setup_infer_dataloader(queries, test_ds)

        for batch in infer_datalayer:
            input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch

            intent_logits, slot_logits = self.forward(
                input_ids=input_ids.to(device),
                token_type_ids=input_type_ids.to(device),
                attention_mask=input_mask.to(device),
            )

            # predict intents
            intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1))
            predicted_intents += self.convert_intent_ids_to_intent_names(
                intent_preds)

            # predict slots
            slot_preds = torch.argmax(slot_logits, axis=-1)
            predicted_slots += self.mask_unused_subword_slots(
                slot_preds, subtokens_mask)

        # set mode back to its original value
        self.train(mode=mode)

        return predicted_intents, predicted_slots
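
subtokens_mask marks the first subtoken of each word, so indexing the argmax predictions with it yields exactly one prediction per word. A minimal illustration with made-up values:

import torch

logits = torch.randn(1, 6, 3)                               # (batch, subtokens, classes)
subtokens_mask = torch.tensor([[1, 1, 0, 1, 0, 0]]) > 0.5   # 1 = first subtoken of a word
preds = torch.argmax(logits, dim=-1)[subtokens_mask]
print(preds.shape)  # torch.Size([3]) -- one prediction per word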
Example #8
    def classifytext(self,
                     queries: List[str],
                     batch_size: int = 1,
                     max_seq_length: int = -1) -> List[int]:
        """
        Get prediction for the queries
        Args:
            queries: text sequences
            batch_size: batch size to use during inference
            max_seq_length: sequences longer than max_seq_length will get truncated. default -1 disables truncation.
        Returns:
            all_preds: model predictions
        """
        # store predictions for all queries in a single list
        all_preds = []
        mode = self.training
        device = next(self.parameters()).device
        # capture the logging level up front so the finally clause can always restore it
        logging_level = logging.get_verbosity()
        try:
            # Switch model to evaluation mode
            self.eval()
            logging.set_verbosity(logging.WARNING)
            dataloader_cfg = {
                "batch_size": batch_size,
                "num_workers": 3,
                "pin_memory": False
            }
            infer_datalayer = self._setup_infer_dataloader(
                dataloader_cfg, queries, max_seq_length)

            for i, batch in enumerate(infer_datalayer):
                input_ids, input_type_ids, input_mask, subtokens_mask = batch

                logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                preds = tensor2list(torch.argmax(logits, axis=-1))
                all_preds.extend(preds)
        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)
        return all_preds
Example #9
def main(args):
    config_path = args.re_config
    config = OmegaConf.load(config_path)

    config.trainer.gpus = args.gpus
    print(OmegaConf.to_yaml(config))

    trainer = pl.Trainer(plugins=[NLPDDPPlugin()], **config.trainer)
    exp_dir = exp_manager(trainer, config.get("exp_manager", None))

    model = nemo_nlp.models.TextClassificationModel(cfg=config.model,
                                                    trainer=trainer)

    trainer.fit(model)

    model.save_to(config.model.nemo_path)

    all_preds = []
    mode = model.training
    with torch.no_grad():
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model.eval()
        model.to(device)
        # infer_datalayer = model._setup_infer_dataloader(queries, batch_size)
        ts_dl = model._setup_dataloader_from_config(cfg=config.model.test_ds)
        for i, batch in enumerate(ts_dl):
            if i == 0:
                print(batch)
            input_ids, input_type_ids, input_mask, subtokens_mask = batch

            logits = model.forward(
                input_ids=input_ids.to(device),
                token_type_ids=input_type_ids.to(device),
                attention_mask=input_mask.to(device),
            )
            preds = tensor2list(torch.argmax(logits, axis=-1))
            all_preds.extend(preds)

    output_path = Path(args.pred_output)
    output_path.mkdir(parents=True, exist_ok=True)
    with open(output_path / "predict_labels.txt", "w") as f:
        f.write("\n".join([str(e) for e in all_preds]))
Example #10
        def get_str_example_id(split: str, ids_to_service_names_dict: dict, example_id_num: torch.Tensor) -> List[str]:
            """
            Constructs string representations of example IDs
            Args:
                split: evaluation data split
                ids_to_service_names_dict: id to service name mapping
                example_id_num: tensor example id
            """

            def format_turn_id(ex_id_num):
                dialog_id_1, dialog_id_2, turn_id, service_id, model_task_id, slot_intent_id, value_id = ex_id_num
                return "{}-{}_{:05d}-{:02d}-{}-{}-{}-{}".format(
                    split,
                    dialog_id_1,
                    dialog_id_2,
                    turn_id,
                    ids_to_service_names_dict[service_id],
                    model_task_id,
                    slot_intent_id,
                    value_id,
                )

            return list(map(format_turn_id, tensor2list(example_id_num)))
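
A self-contained run of the format string above, with a made-up service map and ID tuple:

ids_to_service = {0: "Restaurants_1"}  # hypothetical mapping
ex = [12, 34, 7, 0, 1, 2, 3]           # dialog ids, turn, service, task, slot/intent, value
print("{}-{}_{:05d}-{:02d}-{}-{}-{}-{}".format(
    "dev", ex[0], ex[1], ex[2], ids_to_service[ex[3]], ex[4], ex[5], ex[6]))
# dev-12_00034-07-Restaurants_1-1-2-3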
Example #11
    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()

        unique_ids = torch.cat(
            [x['eval_tensors']['unique_ids'] for x in outputs])
        start_logits = torch.cat(
            [x['eval_tensors']['start_logits'] for x in outputs])
        end_logits = torch.cat(
            [x['eval_tensors']['end_logits'] for x in outputs])

        all_unique_ids = []
        all_start_logits = []
        all_end_logits = []
        if torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            for ind in range(world_size):
                all_unique_ids.append(torch.empty_like(unique_ids))
                all_start_logits.append(torch.empty_like(start_logits))
                all_end_logits.append(torch.empty_like(end_logits))
            torch.distributed.all_gather(all_unique_ids, unique_ids)
            torch.distributed.all_gather(all_start_logits, start_logits)
            torch.distributed.all_gather(all_end_logits, end_logits)
        else:
            all_unique_ids.append(unique_ids)
            all_start_logits.append(start_logits)
            all_end_logits.append(end_logits)

        exact_match, f1, all_predictions, all_nbest = -1, -1, [], []
        if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:

            unique_ids = []
            start_logits = []
            end_logits = []
            for u in all_unique_ids:
                unique_ids.extend(tensor2list(u))
            for u in all_start_logits:
                start_logits.extend(tensor2list(u))
            for u in all_end_logits:
                end_logits.extend(tensor2list(u))

            exact_match, f1, all_predictions, all_nbest = self.validation_dataset.evaluate(
                unique_ids=unique_ids,
                start_logits=start_logits,
                end_logits=end_logits,
                n_best_size=self._cfg.validation_ds.n_best_size,
                max_answer_length=self._cfg.validation_ds.max_answer_length,
                version_2_with_negative=self._cfg.dataset.version_2_with_negative,
                null_score_diff_threshold=self._cfg.validation_ds.null_score_diff_threshold,
                do_lower_case=self._cfg.dataset.do_lower_case,
            )

            if self._cfg.validation_ds.output_nbest_file is not None:
                with open(self._cfg.validation_ds.output_nbest_file, "w") as writer:
                    writer.write(json.dumps(all_nbest, indent=4) + "\n")
            if self._cfg.validation_ds.output_prediction_file is not None:
                with open(self._cfg.validation_ds.output_prediction_file, "w") as writer:
                    writer.write(json.dumps(all_predictions, indent=4) + "\n")

        logging.info(f"exact match {exact_match}")
        logging.info(f"f1 {f1}")
        self.log('val_loss', avg_loss)
        self.log('exact_match', exact_match)
        self.log('f1', f1)
Example #12
    def inference(
        self,
        file: str,
        batch_size: int = 1,
        num_samples: int = -1,
        output_nbest_file: Optional[str] = None,
        output_prediction_file: Optional[str] = None,
    ):
        """
        Get prediction for unlabeled inference data

        Args:
            file: inference data
            batch_size: batch size to use during inference
            num_samples: number of samples of the inference data to use. Default: -1 uses all data.
            output_nbest_file: optional output file for writing out nbest list
            output_prediction_file: optional output file for writing out predictions
            
        Returns:
            model predictions, model nbest list
        """
        # store predictions for all queries in a single list
        all_predictions = []
        all_nbest = []
        mode = self.training
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # capture the logging level up front so the finally clause can always restore it
        logging_level = logging.get_verbosity()
        try:
            # Switch model to evaluation mode
            self.eval()
            self.to(device)
            logging.set_verbosity(logging.WARNING)
            dataloader_cfg = {
                "batch_size": batch_size,
                "file": file,
                "shuffle": False,
                "num_samples": num_samples,
                'num_workers': 2,
                'pin_memory': False,
                'drop_last': False,
            }
            dataloader_cfg = OmegaConf.create(dataloader_cfg)
            infer_datalayer = self._setup_dataloader_from_config(
                cfg=dataloader_cfg, mode=INFERENCE_MODE)

            all_logits = []
            all_unique_ids = []
            for i, batch in enumerate(infer_datalayer):
                input_ids, token_type_ids, attention_mask, unique_ids = batch
                logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=token_type_ids.to(device),
                    attention_mask=attention_mask.to(device),
                )
                all_logits.append(logits)
                all_unique_ids.append(unique_ids)
            logits = torch.cat(all_logits)
            unique_ids = tensor2list(torch.cat(all_unique_ids))
            s, e = logits.split(dim=-1, split_size=1)
            start_logits = tensor2list(s.squeeze(-1))
            end_logits = tensor2list(e.squeeze(-1))
            all_predictions, all_nbest, scores_diff = infer_datalayer.dataset.get_predictions(
                unique_ids=unique_ids,
                start_logits=start_logits,
                end_logits=end_logits,
                n_best_size=self._cfg.dataset.n_best_size,
                max_answer_length=self._cfg.dataset.max_answer_length,
                version_2_with_negative=self._cfg.dataset.version_2_with_negative,
                null_score_diff_threshold=self._cfg.dataset.null_score_diff_threshold,
                do_lower_case=self._cfg.dataset.do_lower_case,
            )

            with open(file, 'r') as test_file_fp:
                test_data = json.load(test_file_fp)["data"]
                id_to_question_mapping = {}
                for title in test_data:
                    for par in title["paragraphs"]:
                        for question in par["qas"]:
                            id_to_question_mapping[question["id"]] = question["question"]

            for question_id in all_predictions:
                all_predictions[question_id] = (
                    id_to_question_mapping[question_id],
                    all_predictions[question_id])

            if output_nbest_file is not None:
                with open(output_nbest_file, "w") as writer:
                    writer.write(json.dumps(all_nbest, indent=4) + "\n")
            if output_prediction_file is not None:
                with open(output_prediction_file, "w") as writer:
                    writer.write(json.dumps(all_predictions, indent=4) + "\n")

        finally:
            # set mode back to its original value
            self.train(mode=mode)
            logging.set_verbosity(logging_level)

        return all_predictions, all_nbest
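
The nested loop that builds id_to_question_mapping walks the standard SQuAD JSON layout (data -> paragraphs -> qas). The same traversal as a dict comprehension over a minimal assumed structure:

test_data = [{"paragraphs": [{"qas": [{"id": "q1", "question": "Who?"},
                                      {"id": "q2", "question": "When?"}]}]}]
id_to_question = {qa["id"]: qa["question"]
                  for entry in test_data
                  for par in entry["paragraphs"]
                  for qa in par["qas"]}
print(id_to_question)  # {'q1': 'Who?', 'q2': 'When?'}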
Example #13
    def add_punctuation_capitalization(self, queries: List[str], batch_size: int = None) -> List[str]:
        """
        Adds punctuation and capitalization to the queries. Use this method for debugging and prototyping.
        Args:
            queries: lower cased text without punctuation
            batch_size: batch size to use during inference
        Returns:
            result: text with added capitalization and punctuation
        """
        if queries is None or len(queries) == 0:
            return []
        if batch_size is None:
            batch_size = len(queries)
            logging.info(f'Using batch size {batch_size} for inference')

        # We will store the output here
        result = []

        # Model's mode and device
        mode = self.training
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        try:
            # Switch model to evaluation mode
            self.eval()
            self = self.to(device)
            infer_datalayer = self._setup_infer_dataloader(queries, batch_size)

            # store predictions for all queries in a single list
            all_punct_preds = []
            all_capit_preds = []

            for batch in infer_datalayer:
                input_ids, input_type_ids, input_mask, subtokens_mask = batch

                punct_logits, capit_logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                subtokens_mask = subtokens_mask > 0.5
                punct_preds = tensor2list(torch.argmax(punct_logits, axis=-1)[subtokens_mask])
                capit_preds = tensor2list(torch.argmax(capit_logits, axis=-1)[subtokens_mask])
                all_punct_preds.extend(punct_preds)
                all_capit_preds.extend(capit_preds)

            queries = [q.strip().split() for q in queries]
            queries_len = [len(q) for q in queries]

            if sum(queries_len) != len(all_punct_preds) or sum(queries_len) != len(all_capit_preds):
                raise ValueError('Pred and words must have the same length')

            punct_ids_to_labels = {v: k for k, v in self._cfg.punct_label_ids.items()}
            capit_ids_to_labels = {v: k for k, v in self._cfg.capit_label_ids.items()}

            start_idx = 0
            end_idx = 0
            for query in queries:
                end_idx += len(query)

                # extract predictions for the current query from the list of all predictions
                punct_preds = all_punct_preds[start_idx:end_idx]
                capit_preds = all_capit_preds[start_idx:end_idx]
                start_idx = end_idx

                query_with_punct_and_capit = ''
                for j, word in enumerate(query):
                    punct_label = punct_ids_to_labels[punct_preds[j]]
                    capit_label = capit_ids_to_labels[capit_preds[j]]

                    if capit_label != self._cfg.dataset.pad_label:
                        word = word.capitalize()
                    query_with_punct_and_capit += word
                    if punct_label != self._cfg.dataset.pad_label:
                        query_with_punct_and_capit += punct_label
                    query_with_punct_and_capit += ' '

                result.append(query_with_punct_and_capit.strip())
        finally:
            # set mode back to its original value
            self.train(mode=mode)
        return result
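
The reconstruction loop at the end maps per-word punctuation and capitalization labels back to text. A tiny standalone version, assuming 'O' is the pad label and the concrete label values are hypothetical:

words = ["hello", "how", "are", "you"]
punct = [",", "O", "O", "?"]
capit = ["U", "O", "O", "O"]  # 'U' assumed to mean "capitalize this word"
out = ""
for w, p, c in zip(words, punct, capit):
    if c != "O":
        w = w.capitalize()
    out += w + ("" if p == "O" else p) + " "
print(out.strip())  # Hello, how are you?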
Example #14
    def predict_from_examples(self,
                              queries: List[str],
                              batch_size: int = 32) -> Tuple[List[str], List[str]]:
        """
        Get prediction for the queries (intent and slots)
        Args:
            queries: text sequences
            batch_size: batch size to use during inference
        Returns:
            predicted_intents, predicted_slots: model intent and slot predictions
        """
        predicted_intents = []
        predicted_slots = []
        mode = self.training
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            # Switch model to evaluation mode
            self.eval()
            self.to(device)
            infer_datalayer = self._setup_infer_dataloader(queries, batch_size)

            # load intent and slot labels from the dictionary files (user should have them in a data directory)
            intent_labels, slot_labels = IntentSlotDataDesc.intent_slot_dicts(
                self.data_dir)

            for batch in infer_datalayer:
                input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch

                intent_logits, slot_logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                # predict intents and slots for these examples
                # intents
                intent_preds = tensor2list(torch.argmax(intent_logits,
                                                        axis=-1))

                # convert numerical outputs to Intent and Slot labels from the dictionaries
                for intent_num in intent_preds:
                    if intent_num < len(intent_labels):
                        predicted_intents.append(intent_labels[intent_num])
                    else:
                        # should not happen
                        predicted_intents.append("Unknown Intent")

                # slots
                slot_preds = torch.argmax(slot_logits, axis=-1)

                for slot_preds_query, mask_query in zip(
                        slot_preds, subtokens_mask):
                    query_slots = ''
                    for slot, mask in zip(slot_preds_query, mask_query):
                        if mask == 1:
                            if slot < len(slot_labels):
                                query_slots += slot_labels[slot] + ' '
                            else:
                                query_slots += 'Unknown_slot '
                    predicted_slots.append(query_slots.strip())

        finally:
            # set mode back to its original value
            self.train(mode=mode)

        return predicted_intents, predicted_slots
Example #15
    def predict_from_examples(self, queries: List[str], test_ds) -> Tuple[List[str], List[str]]:
        """
        Get prediction for the queries (intent and slots)
        Args:
            queries: text sequences
            test_ds: Dataset configuration section.
        Returns:
            predicted_intents, predicted_slots: model intent and slot predictions
        """
        predicted_intents = []
        predicted_slots = []
        mode = self.training
        try:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

            # Retrieve intent and slot vocabularies from configuration.
            intent_labels = self.cfg.data_desc.intent_labels
            slot_labels = self.cfg.data_desc.slot_labels

            # Initialize tokenizer.
            # if not hasattr(self, "tokenizer"):
            #    self._setup_tokenizer(self.cfg.tokenizer)
            # Initialize modules.
            # self._reconfigure_classifier()

            # Switch model to evaluation mode
            self.eval()
            self.to(device)

            # Dataset.
            infer_datalayer = self._setup_infer_dataloader(queries, test_ds)

            for batch in infer_datalayer:
                input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch

                intent_logits, slot_logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                # predict intents and slots for these examples
                # intents
                intent_preds = tensor2list(torch.argmax(intent_logits, axis=-1))

                # convert numerical outputs to Intent and Slot labels from the dictionaries
                for intent_num in intent_preds:
                    if intent_num < len(intent_labels):
                        predicted_intents.append(intent_labels[int(intent_num)])
                    else:
                        # should not happen
                        predicted_intents.append("Unknown Intent")

                # slots
                slot_preds = torch.argmax(slot_logits, axis=-1)

                for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask):
                    query_slots = ''
                    for slot, mask in zip(slot_preds_query, mask_query):
                        if mask == 1:
                            if slot < len(slot_labels):
                                query_slots += slot_labels[int(slot)] + ' '
                            else:
                                query_slots += 'Unknown_slot '
                    predicted_slots.append(query_slots.strip())

        finally:
            # set mode back to its original value
            self.train(mode=mode)

        return predicted_intents, predicted_slots
Example #16
    def predict_from_examples(
        self, queries: List[str], test_ds: DictConfig, threshold: float = None
    ) -> Tuple[List[List[Tuple[str, float]]], List[str], List[List[int]]]:
        """
        Get prediction for the queries (intent and slots)


        Args:
            queries: text sequences
            test_ds: Dataset configuration section.
            threshold: Threshold for rounding prediction logits
        
        Returns:
            predicted_intents: model intent predictions with their probabilities
                Example:  [[('flight', 0.84)], [('airfare', 0.54), 
                            ('flight', 0.73), ('meal', 0.24)]]
            predicted_slots: model slot predictions
                Example:  ['O B-depart_date.month_name B-depart_date.day_number',
                           'O O B-flight_stop O O O']

            predicted_vector: model intent predictions for each individual query. Binary values within each list 
                indicate whether a class is predicted for the given query (1 for True, 0 for False)
                Example: [[1,0,0,0,0,0], [0,0,1,0,0,0]]
        """
        predicted_intents = []

        if threshold is None:
            threshold = self.threshold
        logging.info(f'Using threshold = {threshold}')

        predicted_slots = []
        predicted_vector = []

        mode = self.training
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"

            # Retrieve intent and slot vocabularies from configuration.
            intent_labels = self.cfg.data_desc.intent_labels
            slot_labels = self.cfg.data_desc.slot_labels

            # Switch model to evaluation mode
            self.eval()
            self.to(device)

            # Dataset.
            infer_datalayer = self._setup_infer_dataloader(queries, test_ds)

            for batch in infer_datalayer:
                input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = batch

                intent_logits, slot_logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                # predict intents and slots for these examples
                # intents
                intent_preds = tensor2list(torch.sigmoid(intent_logits))
                # convert numerical outputs to Intent and Slot labels from the dictionaries
                for intents in intent_preds:
                    intent_lst = []
                    temp_list = []
                    for intent_num, probability in enumerate(intents):
                        if probability >= threshold:
                            intent_lst.append((intent_labels[int(intent_num)], round(probability, 2)))
                            temp_list.append(1)
                        else:
                            temp_list.append(0)

                    predicted_vector.append(temp_list)
                    predicted_intents.append(intent_lst)

                # slots
                slot_preds = torch.argmax(slot_logits, axis=-1)
                temp_slots_preds = []

                for slot_preds_query, mask_query in zip(slot_preds, subtokens_mask):
                    temp_slots = ""
                    query_slots = ""
                    for slot, mask in zip(slot_preds_query, mask_query):
                        if mask == 1:
                            if slot < len(slot_labels):
                                query_slots += slot_labels[int(slot)] + " "
                                temp_slots += f"{slot} "
                            else:
                                query_slots += "Unknown_slot "
                                temp_slots += "0 "
                    predicted_slots.append(query_slots.strip())
                    temp_slots_preds.append(temp_slots)

        finally:
            # set mode back to its original value
            self.train(mode=mode)

        return predicted_intents, predicted_slots, predicted_vector
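
Unlike the argmax-based examples, Example #16 treats intents as multi-label: every class whose sigmoid probability clears the threshold is kept. The core of that step, isolated with a made-up label set:

import torch

intent_labels = ["flight", "airfare", "meal"]  # hypothetical labels
logits = torch.tensor([[2.0, -1.0, 0.4]])
probs = torch.sigmoid(logits)[0]
picked = [(intent_labels[i], round(float(p), 2))
          for i, p in enumerate(probs) if p >= 0.5]
vector = [int(p >= 0.5) for p in probs]
print(picked, vector)  # [('flight', 0.88), ('meal', 0.6)] [1, 0, 1]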
Example #17
    def add_punctuation_capitalization(
        self, queries: List[str], batch_size: int = None, max_seq_length: int = 512
    ) -> List[str]:
        """
        Adds punctuation and capitalization to the queries. Use this method for debugging and prototyping.
        Args:
            queries: lower cased text without punctuation
            batch_size: batch size to use during inference
            max_seq_length: maximum sequence length after tokenization
        Returns:
            result: text with added capitalization and punctuation
        """
        if queries is None or len(queries) == 0:
            return []
        if batch_size is None:
            batch_size = len(queries)
            logging.info(f'Using batch size {batch_size} for inference')

        # We will store the output here
        result = []

        # Model's mode and device
        mode = self.training
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        try:
            # Switch model to evaluation mode
            self.eval()
            self = self.to(device)

            infer_datalayer = self._setup_infer_dataloader(queries, batch_size, max_seq_length)

            # store predictions for all queries in a single list
            all_punct_preds = []
            all_capit_preds = []

            for i, batch in tqdm(enumerate(infer_datalayer), total=len(infer_datalayer), desc="infer queries"):
                input_ids, input_type_ids, input_mask, subtokens_mask = batch

                punct_logits, capit_logits = self.forward(
                    input_ids=input_ids.to(device),
                    token_type_ids=input_type_ids.to(device),
                    attention_mask=input_mask.to(device),
                )

                subtokens_mask = subtokens_mask > 0.5

                punct_preds = [
                    tensor2list(p_l[subtokens_mask[i]]) for i, p_l in enumerate(torch.argmax(punct_logits, axis=-1))
                ]
                capit_preds = [
                    tensor2list(c_l[subtokens_mask[i]]) for i, c_l in enumerate(torch.argmax(capit_logits, axis=-1))
                ]

                all_punct_preds.extend(punct_preds)
                all_capit_preds.extend(capit_preds)

            punct_ids_to_labels = {v: k for k, v in self._cfg.punct_label_ids.items()}
            capit_ids_to_labels = {v: k for k, v in self._cfg.capit_label_ids.items()}

            queries = [q.strip().split() for q in queries]
            for i, query in enumerate(queries):
                punct_preds = all_punct_preds[i]
                capit_preds = all_capit_preds[i]
                if len(query) != len(punct_preds):
                    logging.warning(
                        f'Query "{query}" was truncated to the maximum sequence length of {max_seq_length}; '
                        'predictions cover only the truncated input.'
                    )

                    # removing the end of phrase punctuation of the truncated segment
                    punct_preds[-1] = 0
                    max_len = len(punct_preds)
                    query = query[:max_len]

                query_with_punct_and_capit = ''
                for j, word in enumerate(query):
                    punct_label = punct_ids_to_labels[punct_preds[j]]
                    capit_label = capit_ids_to_labels[capit_preds[j]]

                    if capit_label != self._cfg.dataset.pad_label:
                        word = word.capitalize()
                    query_with_punct_and_capit += word
                    if punct_label != self._cfg.dataset.pad_label:
                        query_with_punct_and_capit += punct_label
                    query_with_punct_and_capit += ' '

                result.append(query_with_punct_and_capit.strip())
        finally:
            # set mode back to its original value
            self.train(mode=mode)
        return result