Example #1
import numpy as np
from ctc_segmentation import CtcSegmentationParameters, prepare_token_list


def test_prepare_token_list():
    """Test the prepare_token_list function for CTC segmentation.

    Results are checked and compared with test vectors.
    """
    text = [np.array([2, 1, 7]), np.array([3, 5, 4, 6])]
    char_list = ["•", "a", "c", "d", "▁g", "▁o", "▁s", "t"]
    config = CtcSegmentationParameters(char_list=char_list)
    ground_truth_mat, utt_begin_indices = prepare_token_list(config, text)
    correct_begin_indices = np.array([1, 5, 10])
    assert (utt_begin_indices == correct_begin_indices).all()
    gtm = list(ground_truth_mat.shape)
    assert gtm[0] == 11
    assert gtm[1] == 1
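
For context, the matrix and begin indices prepared here are what the alignment routine consumes together with a network's CTC log posteriors. A minimal sketch of that next step (the uniform lpz array below is dummy data invented for illustration, not part of the original test):

import numpy as np
from ctc_segmentation import (CtcSegmentationParameters, ctc_segmentation,
                              determine_utterance_segments,
                              prepare_token_list)

char_list = ["•", "a", "c", "d", "▁g", "▁o", "▁s", "t"]
config = CtcSegmentationParameters(char_list=char_list)
text = [np.array([2, 1, 7]), np.array([3, 5, 4, 6])]
ground_truth_mat, utt_begin_indices = prepare_token_list(config, text)

# Dummy log posteriors shaped (<time steps>, <classes>); a real CTC
# network would produce these.
lpz = np.full((100, len(char_list)), np.log(1.0 / len(char_list)),
              dtype=np.float32)
timings, char_probs, state_list = ctc_segmentation(
    config, lpz, ground_truth_mat)
# One (start, end, score) tuple per utterance.
segments = determine_utterance_segments(
    config, utt_begin_indices, char_probs, timings, ["cat", "dogs"])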
Example #2
    def prepare_segmentation_task(self, text, lpz, name=None, speech_len=None):
        """Preprocess text, and gather text and lpz into a task object.

        Text is pre-processed and tokenized depending on configuration.
        If ``speech_len`` is given, the timing configuration is updated.
        Text, lpz, and configuration are collected in a CTCSegmentationTask
        object. The resulting object can be serialized and passed to a
        multiprocessing computation.

        A minimal amount of text processing is done, i.e., the utterances
        in ``text`` are split into a list and ``text_cleaner`` is applied.
        It is recommended that you normalize the text beforehand, e.g.,
        change numbers into their spoken equivalents, remove special
        characters, and convert UTF-8 characters to those that match your
        ASR model's dictionary.

        The text is tokenized based on the ``text_converter`` setting:

        The "tokenize" method is more efficient and the easiest for models
        based on latin or cyrillic script that only contain the main chars,
        ["a", "b", ...] or for Japanese or Chinese ASR models with ~3000
        short Kanji / Hanzi tokens.

        The "classic" method improves the the accuracy of the alignments
        for models that contain longer tokens, but with a greater complexity
        for computation. The function scans for partial tokens which may
        improve time resolution.
        For example, the word "▁really" will be broken down into
        ``['▁', '▁r', '▁re', '▁real', '▁really']``. The alignment will be
        based on the most probable activation sequence given by the network.

        Args:
            text: List or multiline-string with utterance ground truths.
            lpz: Log CTC posterior probabilities obtained from the CTC-network;
                numpy array shaped as ( <time steps>, <classes> ).
            name: Audio file name. Choose a unique name, or the original audio
                file name, to distinguish multiple audio files. Default: None.
            speech_len: Number of sample points. If given, the timing
                configuration is automatically derived from ``fs``, the
                length of the speech, and the length of lpz. If None is
                given, make sure the timing parameters are correct; see
                ``time_stamps`` for reference. Default: None.

        Returns:
            task: CTCSegmentationTask object that can be passed to
                ``get_segments()`` in order to obtain alignments.
        """
        config = self.config
        # Update timing parameters, if needed
        if speech_len is not None:
            lpz_len = lpz.shape[0]
            timing_cfg = self.get_timing_config(speech_len, lpz_len)
            config.set(**timing_cfg)
        # `text` is needed in the form of a list.
        utt_ids, text = self._split_text(text)
        # Obtain utterance & label sequence from text
        if self.text_converter == "tokenize":
            # list of str --tokenize--> list of np.array
            token_list = [
                self.preprocess_fn("<dummy>", {"text": utt})["text"]
                for utt in text
            ]
            # filter out any instances of the <unk> token
            unk = config.char_list.index("<unk>")
            token_list = [utt[utt != unk] for utt in token_list]
            ground_truth_mat, utt_begin_indices = prepare_token_list(
                config, token_list)
        else:
            assert self.text_converter == "classic"
            text = [self.preprocess_fn.text_cleaner(utt) for utt in text]
            token_list = [
                "".join(self.preprocess_fn.tokenizer.text2tokens(utt))
                for utt in text
            ]
            token_list = [utt.replace("<unk>", "") for utt in token_list]
            ground_truth_mat, utt_begin_indices = prepare_text(
                config, token_list)
        task = CTCSegmentationTask(
            config=config,
            name=name,
            text=text,
            ground_truth_mat=ground_truth_mat,
            utt_begin_indices=utt_begin_indices,
            utt_ids=utt_ids,
            lpz=lpz,
        )
        return task
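
A hedged usage sketch of this method: the aligner instance, lpz, and speech arrays below are assumed to exist, and the call pattern mirrors ESPnet2's CTC segmentation interface, where ``get_segments()`` consumes the task object and the results are written back with ``task.set()``:

# Assumed setup: `aligner` is an instance of the class above, `lpz` the CTC
# log posteriors for the audio, and `speech` the raw waveform samples.
text = """utt_001 THE SAIL OF MY BOAT
utt_002 POINTED SOUTH"""

task = aligner.prepare_segmentation_task(
    text, lpz, name="audio_001", speech_len=speech.shape[0])
result = aligner.get_segments(task)
task.set(**result)
print(task)  # one "utt_id name start end score" line per utterance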
Example #3
        # Assumed imports for this excerpt: numpy as np, tensorflow as tf,
        # tgt, and CtcSegmentationParameters / prepare_token_list /
        # ctc_segmentation / determine_utterance_segments from the
        # ctc_segmentation package.
        # `temp` (from omitted surrounding code) holds the tokenized target
        # phonemes for utterance j.
        phonemes[j][:len(temp)] = temp

        # Predict on the mel spectrogram and turn the encoder output into
        # log probabilities for CTC segmentation.
        model_out = model.predict(mel[np.newaxis, :mel_len[j], ...])
        pred_phon = model_out['encoder_output'][0]
        pred_phon = tf.nn.log_softmax(pred_phon)
        # Decode the target ids back into a list of phoneme strings.
        iphon_tar = model.text_pipeline.tokenizer.decode(
            phonemes[j][:phon_len[j]])
        iphon_tar = iphon_tar.split()

        # Build the segmentation config; the empty string at index 0 serves
        # as the CTC blank token, and index_duration is the duration of one
        # encoder frame in seconds.
        char_list = [''] + list(
            model.text_pipeline.tokenizer.idx_to_token.values())
        config = CtcSegmentationParameters(char_list=char_list)
        config.index_duration = 0.0115545

        text = [phonemes[j][:phon_len[j]]]
        ground_truth_mat, utt_begin_indices = prepare_token_list(config, text)
        timings, char_probs, state_list = ctc_segmentation(
            config, pred_phon.numpy(), ground_truth_mat)
        # Override the utterance boundaries so that every token becomes its
        # own segment, yielding per-phoneme timings.
        utt_begin_indices = list(range(2, len(timings)))
        segments = determine_utterance_segments(config, utt_begin_indices,
                                                char_probs, timings, text[0])

        tg = tgt.core.TextGrid('haa')
        tier = tgt.core.IntervalTier(name='phonemes')

        # Extend the first segment's start to time zero when its score is
        # below -0.001 (log-probability scores are negative); otherwise,
        # insert a short pause ('sp') interval before it.
        if segments[0][-1] < -0.001:
            segments[0] = (0, segments[0][1], segments[0][2])
        else:
            itv = tgt.core.Interval(0, segments[0][0], text='sp')
            tier.add_interval(itv)
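
The excerpt ends before the tier is filled. A plausible continuation, sketched under the assumption that segments and decoded phoneme labels line up one-to-one (the loop and output filename are not from the original code; the tgt calls are documented TextGridTools API):

        # Hypothetical continuation: one interval per phoneme segment
        # (zip truncates if lengths happen to differ), then save the grid.
        for (start, end, _score), label in zip(segments, iphon_tar):
            tier.add_interval(tgt.core.Interval(start, end, text=label))
        tg.add_tier(tier)
        tgt.io.write_to_file(tg, 'alignment_{}.TextGrid'.format(j),
                             format='long')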