Example #1
    def __init__(self, cfg: DictConfig, trainer=None):
        if 'tokenizer' not in cfg:
            raise ValueError(
                "`cfg` must have `tokenizer` config to create a tokenizer !")

        self.tokenizer_cfg = OmegaConf.to_container(cfg.tokenizer,
                                                    resolve=True)  # type: dict
        self.tokenizer_dir = self.tokenizer_cfg.pop(
            'dir')  # Remove tokenizer directory
        self.tokenizer_type = self.tokenizer_cfg.pop(
            'type').lower()  # Remove tokenizer_type

        # Setup the tokenizer
        self._setup_tokenizer()

        # Retrieve the vocabulary from the tokenizer
        vocabulary = self.tokenizer.tokenizer.get_vocab()

        # Set the new vocabulary
        cfg.decoder.params.vocabulary = ListConfig(list(vocabulary.values()))

        # Override number of classes if placeholder provided
        if cfg.decoder.params['num_classes'] < 1:
            logging.info(
                "\nReplacing placeholder number of classes ({}) with actual number of classes - {}"
                .format(cfg.decoder.params['num_classes'], len(vocabulary)))
            cfg.decoder.params['num_classes'] = len(vocabulary)

        super().__init__(cfg=cfg, trainer=trainer)

        # Setup metric objects
        self._wer = WERBPE(tokenizer=self.tokenizer,
                           batch_dim_index=0,
                           use_cer=False,
                           ctc_decode=True)
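
The constructor above expects a config with a `tokenizer` block providing `dir` and `type`, and a decoder block whose `num_classes` may be left as a placeholder (< 1) to be filled from the tokenizer vocabulary. A minimal sketch of that config shape, with hypothetical paths and only the fields this snippet touches:

from omegaconf import OmegaConf

# Hypothetical minimal config consumed by the constructor above; the tokenizer
# directory is a placeholder, and num_classes = -1 marks the placeholder that
# is replaced by the tokenizer's vocabulary size.
cfg = OmegaConf.create({
    'tokenizer': {'dir': '/path/to/tokenizer_dir', 'type': 'bpe'},
    'decoder': {'params': {'num_classes': -1, 'vocabulary': []}},
    # preprocessor, encoder, optim, etc. omitted
})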
Example #2
    def test_wer_metric_randomized(self, test_wer_bpe):
        """This test relies on correctness of word_error_rate function."""
        def __random_string(length):
            return ''.join(
                random.choice(''.join(self.vocabulary)) for _ in range(length))

        if test_wer_bpe:
            wer = WERBPE(deepcopy(self.char_tokenizer),
                         batch_dim_index=0,
                         use_cer=False,
                         ctc_decode=True)
        else:
            wer = WER(vocabulary=self.vocabulary,
                      batch_dim_index=0,
                      use_cer=False,
                      ctc_decode=True)

        for test_id in range(256):
            n1 = random.randint(1, 512)
            n2 = random.randint(1, 512)
            s1 = __random_string(n1)
            s2 = __random_string(n2)
            # skip empty strings as reference
            if s2.strip():
                assert (abs(
                    self.get_wer(wer,
                                 prediction=s1,
                                 reference=s2,
                                 use_tokenizer=test_wer_bpe) -
                    word_error_rate(hypotheses=[s1], references=[s2])) < 1e-6)
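
The randomized test above checks the metric against `word_error_rate`, i.e. the word-level edit distance normalized by the number of reference words. A minimal reference computation, shown only to make explicit what the test compares against (a sketch, not NeMo's implementation):

def word_error_rate_sketch(hypotheses, references):
    # Word-level Levenshtein distance summed over the batch, divided by the
    # total number of reference words.
    total_dist, total_words = 0, 0
    for hyp, ref in zip(hypotheses, references):
        h, r = hyp.split(), ref.split()
        prev = list(range(len(r) + 1))  # DP row for the empty hypothesis prefix
        for i, hw in enumerate(h, 1):
            cur = [i]
            for j, rw in enumerate(r, 1):
                cur.append(min(prev[j] + 1,                 # deletion
                               cur[j - 1] + 1,              # insertion
                               prev[j - 1] + (hw != rw)))   # substitution
            prev = cur
        total_dist += prev[-1]
        total_words += len(r)
    return total_dist / total_words if total_words else float('inf')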
Example #3
    def change_vocabulary(self, new_tokenizer_dir: str, new_tokenizer_type: str):
        """
        Changes vocabulary of the tokenizer used during CTC decoding process.
        Use this method when fine-tuning on from pre-trained model.
        This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would
        use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need
        model to learn capitalization, punctuation and/or special characters.

        Args:
            new_tokenizer_dir: Path to the new tokenizer directory.
            new_tokenizer_type: Either `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
                whereas `wpe` is used for `BertTokenizer`.

        Returns: None

        """
        if not os.path.isdir(new_tokenizer_dir):
            raise NotADirectoryError(
                f'New tokenizer dir must be a path to an existing directory, but got: {new_tokenizer_dir}'
            )

        if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
            raise ValueError('New tokenizer type must be either `bpe` or `wpe`')

        self.tokenizer_dir = new_tokenizer_dir  # Update tokenizer directory
        self.tokenizer_type = new_tokenizer_type.lower()  # Update tokenizer type

        # Setup the tokenizer
        self._setup_tokenizer()

        # Retrieve the vocabulary from the tokenizer
        vocabulary = self.tokenizer.tokenizer.get_vocab()

        # Set the new vocabulary
        decoder_config = copy.deepcopy(self.decoder.to_config_dict())
        decoder_config.params.vocabulary = ListConfig(list(vocabulary.values()))

        # Override number of classes with the new vocabulary size
        logging.info(
            "\nReplacing old number of classes ({}) with new number of classes - {}".format(
                decoder_config['params']['num_classes'], len(vocabulary)
            )
        )
        decoder_config['params']['num_classes'] = len(vocabulary)

        del self.decoder
        self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
        del self.loss
        self.loss = CTCLoss(num_classes=self.decoder.num_classes_with_blank - 1, zero_infinity=True)
        self._wer = WERBPE(tokenizer=self.tokenizer, batch_dim_index=0, use_cer=False, ctc_decode=True)

        # Update config
        OmegaConf.set_struct(self._cfg.decoder, False)
        self._cfg.decoder = decoder_config
        OmegaConf.set_struct(self._cfg.decoder, True)

        logging.info(f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")
Example #4
    def test_wer_metric_decode(self, test_wer_bpe):
        if test_wer_bpe:
            wer = WERBPE(self.char_tokenizer,
                         batch_dim_index=0,
                         use_cer=False,
                         ctc_decode=True)
        else:
            wer = WER(vocabulary=self.vocabulary.copy(),
                      batch_dim_index=0,
                      use_cer=False,
                      ctc_decode=True)

        tokens = self.__string_to_ctc_tensor(
            'cat', use_tokenizer=test_wer_bpe)[0].int().numpy().tolist()
        assert tokens == [3, 1, 20]

        tokens_decoded = wer.decode_ids_to_tokens(tokens)
        assert tokens_decoded == ['c', 'a', 't']

        str_decoded = wer.decode_tokens_to_str(tokens)
        assert str_decoded == 'cat'
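
The asserted ids [3, 1, 20] are consistent with a character vocabulary consisting of a space followed by the lowercase alphabet; under that assumption, the character-level decode path reduces to simple indexing and joining:

# Assumed test vocabulary: index 0 is a space, indices 1-26 are 'a'-'z'.
vocabulary = [' '] + [chr(ord('a') + i) for i in range(26)]

ids = [3, 1, 20]
tokens = [vocabulary[i] for i in ids]    # what decode_ids_to_tokens returns here
assert tokens == ['c', 'a', 't']
assert ''.join(tokens) == 'cat'          # what decode_tokens_to_str returns here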
Example #5
    def __init__(self, cfg: DictConfig, trainer=None):
        if 'tokenizer' not in cfg:
            raise ValueError("`cfg` must have `tokenizer` config to create a tokenizer !")

        # Setup the tokenizer
        self._setup_tokenizer(cfg.tokenizer)

        # Retrieve the vocabulary from the tokenizer
        vocabulary = self.tokenizer.tokenizer.get_vocab()

        # Set the new vocabulary
        with open_dict(cfg):
            if "params" in cfg.decoder:
                cfg.decoder.params.vocabulary = ListConfig(list(vocabulary.values()))
            else:
                cfg.decoder.vocabulary = ListConfig(list(vocabulary.values()))

        # Override number of classes if placeholder provided
        if "params" in cfg.decoder:
            num_classes = cfg.decoder["params"]["num_classes"]
        else:
            num_classes = cfg.decoder["num_classes"]

        if num_classes < 1:
            logging.info(
                "\nReplacing placeholder number of classes ({}) with actual number of classes - {}".format(
                    num_classes, len(vocabulary)
                )
            )
            if "params" in cfg.decoder:
                cfg.decoder["params"]["num_classes"] = len(vocabulary)
            else:
                cfg.decoder["num_classes"] = len(vocabulary)

        super().__init__(cfg=cfg, trainer=trainer)

        # Setup metric objects
        self._wer = WERBPE(
            tokenizer=self.tokenizer,
            batch_dim_index=0,
            use_cer=self._cfg.get('use_cer', False),
            ctc_decode=True,
            dist_sync_on_step=True,
            log_prediction=self._cfg.get("log_prediction", False),
        )
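
This variant accepts the decoder config either nested under `params` or flat. Two hypothetical config shapes that exercise the two branches above:

from omegaconf import OmegaConf

# Nested layout: decoder fields live under `params`.
cfg_nested = OmegaConf.create(
    {'tokenizer': {'dir': '/path/to/tokenizer_dir', 'type': 'bpe'},
     'decoder': {'params': {'num_classes': -1, 'vocabulary': []}}})

# Flat layout: decoder fields sit directly in the decoder block.
cfg_flat = OmegaConf.create(
    {'tokenizer': {'dir': '/path/to/tokenizer_dir', 'type': 'bpe'},
     'decoder': {'num_classes': -1, 'vocabulary': []}})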
Example #6
    def __init__(self, cfg: DictConfig, trainer=None):
        # Convert to Hydra 1.0 compatible DictConfig
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        cfg = model_utils.maybe_update_config_version(cfg)

        if 'tokenizer' not in cfg:
            raise ValueError(
                "`cfg` must have `tokenizer` config to create a tokenizer !")

        # Setup the tokenizer
        self._setup_tokenizer(cfg.tokenizer)

        # Retrieve the vocabulary from the tokenizer
        vocabulary = self.tokenizer.tokenizer.get_vocab()

        # Set the new vocabulary
        with open_dict(cfg):
            # sidestepping the potential overlapping tokens issue in aggregate tokenizers
            if self.tokenizer_type == "agg":
                cfg.decoder.vocabulary = ListConfig(vocabulary)
            else:
                cfg.decoder.vocabulary = ListConfig(list(vocabulary.keys()))

        # Override number of classes if placeholder provided
        num_classes = cfg.decoder["num_classes"]

        if num_classes < 1:
            logging.info(
                "\nReplacing placeholder number of classes ({}) with actual number of classes - {}"
                .format(num_classes, len(vocabulary)))
            cfg.decoder["num_classes"] = len(vocabulary)

        super().__init__(cfg=cfg, trainer=trainer)

        # Setup metric objects
        self._wer = WERBPE(
            tokenizer=self.tokenizer,
            batch_dim_index=0,
            use_cer=self._cfg.get('use_cer', False),
            ctc_decode=True,
            dist_sync_on_step=True,
            log_prediction=self._cfg.get("log_prediction", False),
        )
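
In the non-aggregate branch above, `get_vocab()` is treated as a token-to-id mapping and the token strings (its keys) become the decoder vocabulary; the `agg` branch wraps the returned object as-is to sidestep overlapping tokens across sub-tokenizers. A small illustration of the non-aggregate case with a hypothetical vocabulary:

from omegaconf import ListConfig

# Hypothetical token -> id mapping as returned by a BPE tokenizer's get_vocab().
vocab_mapping = {'<unk>': 0, '▁the': 1, '▁cat': 2}
decoder_vocabulary = ListConfig(list(vocab_mapping.keys()))
num_classes = len(vocab_mapping)   # replaces a num_classes placeholder (< 1)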
Example #7
    def test_wer_metric_simple(self, batch_dim_index, test_wer_bpe):
        if test_wer_bpe:
            wer = WERBPE(self.char_tokenizer,
                         batch_dim_index,
                         use_cer=False,
                         ctc_decode=True)
        else:
            wer = WER(vocabulary=self.vocabulary,
                      batch_dim_index=batch_dim_index,
                      use_cer=False,
                      ctc_decode=True)

        assert self.get_wer(wer, 'cat', 'cot', test_wer_bpe) == 1.0
        assert self.get_wer(wer, 'gpu', 'g p u', test_wer_bpe) == 1.0
        assert self.get_wer(wer, 'g p u', 'gpu', test_wer_bpe) == 3.0
        assert self.get_wer(wer, 'ducati motorcycle', 'motorcycle',
                            test_wer_bpe) == 1.0
        assert self.get_wer(wer, 'ducati motorcycle', 'ducuti motorcycle',
                            test_wer_bpe) == 0.5
        assert abs(
            self.get_wer(wer, 'a f c', 'a b c', test_wer_bpe) -
            1.0 / 3.0) < 1e-6
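
The expected values follow from WER = (word-level edit distance) / (number of reference words): 'gpu' against reference 'g p u' needs three edits over three reference words (WER 1.0), while the reverse direction divides the same three edits by a single reference word (WER 3.0). A quick arithmetic check of two of the asserted cases:

errors, ref_words = 1, 2   # 'ducati motorcycle' vs. reference 'ducuti motorcycle'
assert errors / ref_words == 0.5

errors, ref_words = 3, 1   # 'g p u' vs. reference 'gpu'
assert errors / ref_words == 3.0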
Example #8
    def change_vocabulary(self, new_tokenizer_dir: Union[str, DictConfig],
                          new_tokenizer_type: str):
        """
        Changes vocabulary of the tokenizer used during CTC decoding process.
        Use this method when fine-tuning on from pre-trained model.
        This method changes only decoder and leaves encoder and pre-processing modules unchanged. For example, you would
        use it if you want to use pretrained encoder when fine-tuning on a data in another language, or when you'd need
        model to learn capitalization, punctuation and/or special characters.

        Args:
            new_tokenizer_dir: Directory path to tokenizer or a config for a new tokenizer (if the tokenizer type is `agg`)
            new_tokenizer_type: Either `agg`, `bpe` or `wpe`. `bpe` is used for SentencePiece tokenizers,
                whereas `wpe` is used for `BertTokenizer`.
            new_tokenizer_cfg: A config for the new tokenizer. if provided, pre-empts the dir and type

        Returns: None

        """
        if isinstance(new_tokenizer_dir, DictConfig):
            if new_tokenizer_type == 'agg':
                new_tokenizer_cfg = new_tokenizer_dir
            else:
                raise ValueError(
                    f'New tokenizer dir should be a string unless the tokenizer is `agg`, but this tokenizer type is: {new_tokenizer_type}'
                )
        else:
            new_tokenizer_cfg = None

        if new_tokenizer_cfg is not None:
            tokenizer_cfg = new_tokenizer_cfg
        else:
            if not os.path.isdir(new_tokenizer_dir):
                raise NotADirectoryError(
                    f'New tokenizer dir must be a path to an existing directory, but got: {new_tokenizer_dir}'
                )

            if new_tokenizer_type.lower() not in ('bpe', 'wpe'):
                raise ValueError(
                    'New tokenizer type must be either `bpe` or `wpe`')

            tokenizer_cfg = OmegaConf.create({
                'dir': new_tokenizer_dir,
                'type': new_tokenizer_type
            })

        # Setup the tokenizer
        self._setup_tokenizer(tokenizer_cfg)

        # Retrieve the vocabulary from the tokenizer
        vocabulary = self.tokenizer.tokenizer.get_vocab()

        # Set the new vocabulary
        decoder_config = copy.deepcopy(self.decoder.to_config_dict())
        # sidestepping the potential overlapping tokens issue in aggregate tokenizers
        if self.tokenizer_type == "agg":
            decoder_config.vocabulary = ListConfig(vocabulary)
        else:
            decoder_config.vocabulary = ListConfig(list(vocabulary.keys()))

        decoder_num_classes = decoder_config['num_classes']

        # Override number of classes with the new vocabulary size
        logging.info(
            "\nReplacing old number of classes ({}) with new number of classes - {}"
            .format(decoder_num_classes, len(vocabulary)))

        decoder_config['num_classes'] = len(vocabulary)

        del self.decoder
        self.decoder = EncDecCTCModelBPE.from_config_dict(decoder_config)
        del self.loss
        self.loss = CTCLoss(
            num_classes=self.decoder.num_classes_with_blank - 1,
            zero_infinity=True,
            reduction=self._cfg.get("ctc_reduction", "mean_batch"),
        )
        self._wer = WERBPE(
            tokenizer=self.tokenizer,
            batch_dim_index=0,
            use_cer=self._cfg.get('use_cer', False),
            ctc_decode=True,
            log_prediction=self._cfg.get("log_prediction", False),
        )

        # Update config
        OmegaConf.set_struct(self._cfg.decoder, False)
        self._cfg.decoder = decoder_config
        OmegaConf.set_struct(self._cfg.decoder, True)

        logging.info(
            f"Changed tokenizer to {self.decoder.vocabulary} vocabulary.")