Code example #1
def test_ctcsegmentationparameters():
    # test repr and init
    config = CtcSegmentationParameters(fs=16000)
    config = eval(str(config))
    assert config.fs == 16000
    config.subsampling_factor = 512
    assert config.index_duration_in_seconds == 0.032
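
The assertions above imply the relation between sample rate, subsampling factor, and index duration. A minimal arithmetic sketch of that relation, derived only from the test vectors (not from the library internals):

fs = 16000                   # sample rate used in the test
subsampling_factor = 512     # samples covered by one CTC index
index_duration_in_seconds = subsampling_factor / fs
assert index_duration_in_seconds == 0.032   # matches the assertion in the test
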
Code example #2
def test_ctc_segmentation():
    """Test CTC segmentation.

    This is a minimal example for the function.
    Only executes CTC segmentation, does not check its result.
    """
    config = CtcSegmentationParameters()
    config.min_window_size = 20
    config.max_window_size = 50
    char_list = [config.blank, "a", "c", "d", "g", "o", "s", "t"]
    text = ["catzz#\n", "dogs!!\n"]
    lpz = torch.nn.functional.log_softmax(torch.rand(30, 8) * 10, dim=0).numpy()
    ground_truth_mat, utt_begin_indices = prepare_text(config, text, char_list)
    timings, char_probs, state_list = ctc_segmentation(config, lpz, ground_truth_mat)
Code example #3
def test_ctc_segmentation():
    """Test CTC segmentation.

    This is a minimal example for the function.
    Only executes CTC segmentation, does not check its result.
    """
    config = CtcSegmentationParameters()
    config.min_window_size = 20
    config.max_window_size = 50
    char_list = ["•", "a", "c", "d", "g", "o", "s", "t"]
    text = ["catzz#\n", "dogs!!\n"]
    # lpz was not defined in this snippet; assume a dummy log-posterior matrix as in the previous example
    lpz = torch.nn.functional.log_softmax(torch.rand(30, 8) * 10, dim=0).numpy()
    ground_truth_mat, utt_begin_indices = prepare_text(config, text, char_list)
    timings, char_probs, state_list = ctc_segmentation(config, lpz,
                                                       ground_truth_mat)
Code example #4
def test_determine_utterance_segments():
    """Test the generation of segments from aligned utterances.

    This is a function that is used after a completed CTC segmentation.
    Results are checked and compared with test vectors.
    """
    config = CtcSegmentationParameters()
    frame_duration_ms = 1000
    config.index_duration = frame_duration_ms / 1000.0
    config.score_min_mean_over_L = 2
    utt_begin_indices = [1, 4, 9]
    text = ["catzz#\n", "dogs!!\n"]
    char_probs = np.array([-0.5] * 10)
    timings = np.array(list(range(10))) + 0.5
    segments = determine_utterance_segments(config, utt_begin_indices,
                                            char_probs, timings, text)
    correct_segments = [(2.0, 4.0, -0.5), (5.0, 9.0, -0.5)]
    for i in [0, 1]:
        for j in [0, 1, 2]:
            assert segments[i][j] == correct_segments[i][j]
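
The expected test vectors are consistent with utterance boundaries placed halfway between adjacent character timings. A hedged sketch of that arithmetic, reconstructed from the test vectors only (not a statement about the library internals):

timings = [i + 0.5 for i in range(10)]       # as in the test, with index_duration = 1.0 s
utt_begin_indices = [1, 4, 9]
# first utterance: starts after index 1, ends before index 4
start_1 = (timings[1] + timings[2]) / 2      # (1.5 + 2.5) / 2 = 2.0
end_1 = (timings[3] + timings[4]) / 2        # (3.5 + 4.5) / 2 = 4.0
# second utterance: starts after index 4, ends before index 9
start_2 = (timings[4] + timings[5]) / 2      # (4.5 + 5.5) / 2 = 5.0
end_2 = (timings[8] + timings[9]) / 2        # (8.5 + 9.5) / 2 = 9.0
# the third value (-0.5) in correct_segments is the constant char_probs score used in the test
assert [(start_1, end_1), (start_2, end_2)] == [(2.0, 4.0), (5.0, 9.0)]
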
Code example #5
def test_prepare_text():
    """Test the prepare_text function for CTC segmentation.

    Results are checked and compared with test vectors.
    """
    config = CtcSegmentationParameters()
    text = ["catzz#\n", "dogs!!\n"]
    char_list = [config.blank, "a", "c", "d", "g", "o", "s", "t"]
    ground_truth_mat, utt_begin_indices = prepare_text(config, text, char_list)
    correct_begin_indices = np.array([1, 5, 10])
    assert (utt_begin_indices == correct_begin_indices).all()
    gtm = list(ground_truth_mat.shape)
    assert gtm[0] == 11
    assert gtm[1] == 1
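
A hedged reading of the shape assertions, reconstructed from the test vectors rather than from library internals: filtering out characters that are not in char_list reduces "catzz#\n" to "cat" and "dogs!!\n" to "dogs", giving 7 character rows; the remaining rows are the initial row plus the boundary rows implied by utt_begin_indices, and the single column follows because every kept token is one character long.

kept_chars = len("cat") + len("dogs")   # 7 character rows after filtering
other_rows = 4                          # initial row plus the rows at utt_begin_indices [1, 5, 10]
assert kept_chars + other_rows == 11    # matches gtm[0]
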
Code example #6
def test_prepare_tokenized_text():
    """Test the prepare_tokenized_text function for CTC segmentation.

    Results are checked and compared with test vectors.
    """
    text = ["c a t", "d ▁o ▁g ▁s"]
    char_list = ["•", "a", "c", "d", "▁g", "▁o", "▁s", "t"]
    config = CtcSegmentationParameters(char_list=char_list)
    ground_truth_mat, utt_begin_indices = prepare_tokenized_text(config, text)
    correct_begin_indices = np.array([1, 5, 10])
    assert (utt_begin_indices == correct_begin_indices).all()
    gtm = list(ground_truth_mat.shape)
    assert gtm[0] == 11
    assert gtm[1] == 1
Code example #7
def test_ctcsegmentationparameters():
    """Test the configuration object.
    Test repr and init.
    """
    config = CtcSegmentationParameters()
    config = eval(str(config))
    assert config.index_duration_in_seconds == 0.025
    config.index_duration = 0.025
    assert config.index_duration_in_seconds == 0.025
    # test excluded parameters and update procedure
    config.set(char_list=["a", "»"])
    config.update_excluded_characters()
    assert "»" not in config.excluded_characters
Code example #8
File: asr_align.py (project: yuekaizhang/espnet)
def ctc_align(args, device):
    """ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args.

    :param args: given configuration
    :param device: for inference; one of ['cuda', 'cpu']
    :return:  0 on success
    """
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )
    logging.info(f"Decoding device={device}")
    # Warn for nets with high memory consumption on long audio files
    if hasattr(model, "enc"):
        encoder_module = model.enc.__class__.__module__
    elif hasattr(model, "encoder"):
        encoder_module = model.encoder.__class__.__module__
    else:
        encoder_module = "Unknown"
    logging.info(f"Encoder module: {encoder_module}")
    logging.info(f"CTC module:     {model.ctc.__class__.__module__}")
    if "rnn" not in encoder_module:
        logging.warning("No BLSTM model detected; memory consumption may be high.")
    model.to(device=device).eval()
    # read audio and text json data
    with open(args.data_json, "rb") as f:
        js = json.load(f)["utts"]
    with open(args.utt_text, "r", encoding="utf-8") as f:
        lines = f.readlines()
        i = 0
        text = {}
        segment_names = {}
        for name in js.keys():
            text_per_audio = []
            segment_names_per_audio = []
            while i < len(lines) and lines[i].startswith(name):
                text_per_audio.append(lines[i][lines[i].find(" ") + 1 :])
                segment_names_per_audio.append(lines[i][: lines[i].find(" ")])
                i += 1
            text[name] = text_per_audio
            segment_names[name] = segment_names_per_audio
    # apply configuration
    config = CtcSegmentationParameters()
    subsampling_factor = 1
    frame_duration_ms = 10
    if args.subsampling_factor is not None:
        subsampling_factor = args.subsampling_factor
    if args.frame_duration is not None:
        frame_duration_ms = args.frame_duration
    # Backwards compatibility to ctc_segmentation <= 1.5.3
    if hasattr(config, "index_duration"):
        config.index_duration = frame_duration_ms * subsampling_factor / 1000
    else:
        config.subsampling_factor = subsampling_factor
        config.frame_duration_ms = frame_duration_ms
    if args.min_window_size is not None:
        config.min_window_size = args.min_window_size
    if args.max_window_size is not None:
        config.max_window_size = args.max_window_size
    config.char_list = train_args.char_list
    if args.use_dict_blank is not None:
        logging.warning(
            "The option --use-dict-blank is deprecated. If needed,"
            " use --set-blank instead."
        )
    if args.set_blank is not None:
        config.blank = args.set_blank
    if args.replace_spaces_with_blanks is not None:
        if args.replace_spaces_with_blanks:
            config.replace_spaces_with_blanks = True
        else:
            config.replace_spaces_with_blanks = False
    if args.gratis_blank:
        config.blank_transition_cost_zero = True
    if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
        logging.error(
            "Blanks are inserted between words, and also the transition cost of blank"
            " is zero. This configuration may lead to misalignments!"
        )
    if args.scoring_length is not None:
        config.score_min_mean_over_L = args.scoring_length
    logging.info(f"Frame timings: {frame_duration_ms}ms * {subsampling_factor}")
    # Iterate over audio files to decode and align
    for idx, name in enumerate(js.keys(), 1):
        logging.info("(%d/%d) Aligning " + name, idx, len(js.keys()))
        batch = [(name, js[name])]
        feat, label = load_inputs_and_targets(batch)
        feat = feat[0]
        with torch.no_grad():
            # Encode input frames
            enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
            # Apply ctc layer to obtain log character probabilities
            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
        # Prepare the text for aligning
        ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat
        )
        logging.debug(f"state_list = {state_list}")
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(
            config, utt_begin_indices, char_probs, timings, text[name]
        )
        # Write to "segments" file
        for i, boundary in enumerate(segments):
            utt_segment = (
                f"{segment_names[name][i]} {name} {boundary[0]:.2f}"
                f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
            )
            args.output.write(utt_segment)
    return 0
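
ctc_align reads its configuration from attributes of args. A minimal sketch of such an object, using argparse.Namespace purely for illustration; the attribute names are the ones accessed in the function above, and every value is a placeholder assumption:

from argparse import Namespace

args = Namespace(
    model="model.best",                  # trained ESPnet 1 ASR model (placeholder path)
    data_json="data.json",               # json with features per utterance name
    utt_text="utt_text",                 # "<segment-name> <transcript>" per line
    output=open("segments", "w"),        # file object that receives the segments
    preprocess_conf=None,
    subsampling_factor=None,             # None keeps the default of 1
    frame_duration=None,                 # None keeps the default of 10 ms
    min_window_size=None,
    max_window_size=None,
    use_dict_blank=None,                 # deprecated, only triggers a warning if set
    set_blank=None,
    replace_spaces_with_blanks=None,
    gratis_blank=False,
    scoring_length=None,
)
# ctc_align(args, device="cpu")
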
Code example #9
class CTCSegmentation:
    """Align text to audio using CTC segmentation.

    Usage:
        Initialize with given ASR model and parameters.
        If needed, parameters for CTC segmentation can be set with ``set_config(·)``.
        Then call the instance as function to align text within an audio file.

    Example:
        >>> # example file included in the ESPnet repository
        >>> import soundfile
        >>> speech, fs = soundfile.read("test_utils/ctc_align_test.wav")
        >>> # load an ASR model
        >>> from espnet_model_zoo.downloader import ModelDownloader
        >>> d = ModelDownloader()
        >>> wsjmodel = d.download_and_unpack( "kamo-naoyuki/wsj" )
        >>> # Apply CTC segmentation
        >>> aligner = CTCSegmentation( **wsjmodel )
        >>> text=["utt1 THE SALE OF THE HOTELS", "utt2 ON PROPERTY MANAGEMENT"]
        >>> aligner.set_config( gratis_blank=True )
        >>> segments = aligner( speech, text, fs=fs )
        >>> print( segments )
        utt1 utt 0.27 1.72 -0.1663 THE SALE OF THE HOTELS
        utt2 utt 4.54 6.10 -4.9646 ON PROPERTY MANAGEMENT

    On multiprocessing:
        To parallelize the computation with multiprocessing, these three steps
        can be separated:
        (1) ``get_lpz``: obtain the lpz,
        (2) ``prepare_segmentation_task``: prepare the task, and
        (3) ``get_segments``: perform CTC segmentation.
        Note that the function `get_segments` is a staticmethod and therefore
        independent of an already initialized CTCSegmentation object.

    References:
        CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition
        2020, Kürzinger, Winkelbauer, Li, Watzel, Rigoll
        https://arxiv.org/abs/2007.09127

    More parameters are described in https://github.com/lumaku/ctc-segmentation

    """

    fs = 16000
    samples_to_frames_ratio = None
    time_stamps = "auto"
    choices_time_stamps = ["auto", "fixed"]
    text_converter = "tokenize"
    choices_text_converter = ["tokenize", "classic"]
    warned_about_misconfiguration = False
    config = CtcSegmentationParameters()

    def __init__(
        self,
        asr_train_config: Union[Path, str],
        asr_model_file: Union[Path, str] = None,
        fs: int = 16000,
        ngpu: int = 0,
        batch_size: int = 1,
        dtype: str = "float32",
        kaldi_style_text: bool = True,
        text_converter: str = "tokenize",
        time_stamps: str = "auto",
        **ctc_segmentation_args,
    ):
        """Initialize the CTCSegmentation module.

        Args:
            asr_train_config: ASR model config file (yaml).
            asr_model_file: ASR model file (pth).
            fs: Sample rate of audio file.
            ngpu: Number of GPUs. Set 0 for processing on CPU, set to 1 for
                processing on GPU. Multi-GPU aligning is currently not
                implemented. Default: 0.
            batch_size: Currently, only batch size == 1 is implemented.
            dtype: Data type used for inference. Set dtype according to
                the ASR model.
            kaldi_style_text: A kaldi-style text file includes the name of the
                utterance at the start of the line. If True, the utterance name
                is expected as first word at each line. If False, utterance
                names are automatically generated. Set this option according to
                your input data. Default: True.
            text_converter: How CTC segmentation handles text.
                "tokenize": Use ESPnet 2 preprocessing to tokenize the text.
                "classic": The text is preprocessed as in ESPnet 1 which takes
                token length into account. If the ASR model has longer tokens,
                this option may yield better results. Default: "tokenize".
            time_stamps: Choose the method how the time stamps are
                calculated. While "fixed" and "auto" use both the sample rate,
                the ratio of samples to one frame is either automatically
                determined for each inference or fixed at a certain ratio that
                is initially determined by the module, but can be changed via
                the parameter ``samples_to_frames_ratio``. Recommended for
                longer audio files: "auto".
            **ctc_segmentation_args: Parameters for CTC segmentation.
        """
        assert check_argument_types()

        # Basic settings
        if batch_size > 1:
            raise NotImplementedError("Batch decoding is not implemented")
        device = "cpu"
        if ngpu == 1:
            device = "cuda"
        elif ngpu > 1:
            logging.error("Multi-GPU not yet implemented.")
            raise NotImplementedError("Only single GPU decoding is supported")

        # Prepare ASR model
        asr_model, asr_train_args = ASRTask.build_model_from_file(
            asr_train_config, asr_model_file, device)
        asr_model.to(dtype=getattr(torch, dtype)).eval()
        self.preprocess_fn = ASRTask.build_preprocess_fn(asr_train_args, False)

        # Warn for nets with high memory consumption on long audio files
        if hasattr(asr_model, "encoder"):
            encoder_module = asr_model.encoder.__class__.__module__
        else:
            encoder_module = "Unknown"
        logging.info(f"Encoder module: {encoder_module}")
        logging.info(f"CTC module:     {asr_model.ctc.__class__.__module__}")
        if "rnn" not in encoder_module.lower():
            logging.warning(
                "No RNN model detected; memory consumption may be high.")

        self.asr_model = asr_model
        self.asr_train_args = asr_train_args
        self.device = device
        self.dtype = dtype
        self.ctc = asr_model.ctc

        self.kaldi_style_text = kaldi_style_text
        self.token_list = asr_model.token_list
        # Apply configuration
        self.set_config(
            fs=fs,
            time_stamps=time_stamps,
            kaldi_style_text=kaldi_style_text,
            text_converter=text_converter,
            **ctc_segmentation_args,
        )
        # last token "<sos/eos>", not needed
        self.config.char_list = asr_model.token_list[:-1]

    def set_config(self, **kwargs):
        """Set CTC segmentation parameters.

        Parameters for timing:
            time_stamps: Select method how CTC index duration is estimated, and
                thus how the time stamps are calculated.
            fs: Sample rate.
            samples_to_frames_ratio: If you want to directly determine the
                ratio of samples to CTC frames, set this parameter, and
                set ``time_stamps`` to "fixed".
                Note: If you want to calculate the time stamps as in
                ESPnet 1, set this parameter to:
                ``subsampling_factor * frame_duration / 1000``.

        Parameters for text preparation:
            set_blank: Index of blank in token list. Default: 0.
            replace_spaces_with_blanks: Inserts blanks between words, which is
                useful for handling long pauses between words. Only used in
                ``text_converter="classic"`` preprocessing mode. Default: False.
            kaldi_style_text: Determines whether the utterance name is expected
                as first word of the utterance. Set at module initialization.
            text_converter: How CTC segmentation handles text.
                Set at module initialization.

        Parameters for alignment:
            min_window_size: Minimum number of frames considered for a single
                utterance. The current default value of 8000 corresponds to
                roughly 4 minutes (depending on ASR model) and should be OK in
                most cases. If your utterances are further apart, increase
                this value, or decrease it for smaller audio files.
            max_window_size: Maximum window size. It should not be necessary
                to change this value.
            gratis_blank: If True, the transition cost of blank is set to zero.
                Useful for long preambles or if there are large unrelated segments
                between utterances. Default: False.

        Parameters for calculation of confidence score:
            scoring_length: Block length to calculate confidence score. The
                default value of 30 should be OK in most cases.
        """
        # Parameters for timing
        if "time_stamps" in kwargs:
            if kwargs["time_stamps"] not in self.choices_time_stamps:
                raise NotImplementedError(
                    f"Parameter ´time_stamps´ has to be one of "
                    f"{list(self.choices_time_stamps)}", )
            self.time_stamps = kwargs["time_stamps"]
        if "fs" in kwargs:
            self.fs = float(kwargs["fs"])
        if "samples_to_frames_ratio" in kwargs:
            self.samples_to_frames_ratio = float(
                kwargs["samples_to_frames_ratio"])
        # Parameters for text preparation
        if "set_blank" in kwargs:
            assert isinstance(kwargs["set_blank"], int)
            self.config.blank = kwargs["set_blank"]
        if "replace_spaces_with_blanks" in kwargs:
            self.config.replace_spaces_with_blanks = bool(
                kwargs["replace_spaces_with_blanks"])
        if "kaldi_style_text" in kwargs:
            assert isinstance(kwargs["kaldi_style_text"], bool)
            self.kaldi_style_text = kwargs["kaldi_style_text"]
        if "text_converter" in kwargs:
            if kwargs["text_converter"] not in self.choices_text_converter:
                raise NotImplementedError(
                    f"Parameter ´text_converter´ has to be one of "
                    f"{list(self.choices_text_converter)}", )
            self.text_converter = kwargs["text_converter"]
        # Parameters for alignment
        if "min_window_size" in kwargs:
            assert isinstance(kwargs["min_window_size"], int)
            self.config.min_window_size = kwargs["min_window_size"]
        if "max_window_size" in kwargs:
            assert isinstance(kwargs["max_window_size"], int)
            self.config.max_window_size = kwargs["max_window_size"]
        if "gratis_blank" in kwargs:
            self.config.blank_transition_cost_zero = bool(
                kwargs["gratis_blank"])
        if (self.config.blank_transition_cost_zero
                and self.config.replace_spaces_with_blanks
                and not self.warned_about_misconfiguration):
            logging.error(
                "Blanks are inserted between words, and also the transition cost of"
                " blank is zero. This configuration may lead to misalignments!"
            )
            self.warned_about_misconfiguration = True
        # Parameter for calculation of confidence score
        if "scoring_length" in kwargs:
            assert isinstance(kwargs["scoring_length"], int)
            self.config.score_min_mean_over_L = kwargs["scoring_length"]

    def get_timing_config(self, speech_len=None, lpz_len=None):
        """Obtain parameters to determine time stamps."""
        timing_cfg = {
            "index_duration": self.config.index_duration,
        }
        # As the parameter ctc_index_duration vetoes the other
        if self.time_stamps == "fixed":
            # Initialize the value, if not yet available
            if self.samples_to_frames_ratio is None:
                ratio = self.estimate_samples_to_frames_ratio()
                self.samples_to_frames_ratio = ratio
            index_duration = self.samples_to_frames_ratio / self.fs
        else:
            assert self.time_stamps == "auto"
            samples_to_frames_ratio = speech_len / lpz_len
            index_duration = samples_to_frames_ratio / self.fs
        timing_cfg["index_duration"] = index_duration
        return timing_cfg

    def estimate_samples_to_frames_ratio(self, speech_len=215040):
        """Determine the ratio of encoded frames to sample points.

        This method helps to determine the time a single encoded frame occupies.
        As the sample rate already gave the number of samples, only the ratio
        of samples per encoded CTC frame is needed. This function estimates it by
        doing one inference, which is only needed once.

        Args:
            speech_len: Length of randomly generated speech vector for single
                inference. Default: 215040.

        Returns:
            samples_to_frames_ratio: Estimated ratio.
        """
        random_input = torch.rand(speech_len)
        lpz = self.get_lpz(random_input)
        lpz_len = lpz.shape[0]
        # Most frontends (DefaultFrontend, SlidingWindow) discard trailing data
        lpz_len = lpz_len + 1
        samples_to_frames_ratio = speech_len // lpz_len
        return samples_to_frames_ratio

    @torch.no_grad()
    def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
        """Obtain CTC posterior log probabilities for given speech data.

        Args:
            speech: Speech audio input.

        Returns:
            lpz: Numpy vector with CTC log posterior probabilities.
        """
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        # data: (Nsamples,) -> (1, Nsamples)
        speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
        # lengths: (1,)
        lengths = speech.new_full([1],
                                  dtype=torch.long,
                                  fill_value=speech.size(1))
        batch = {"speech": speech, "speech_lengths": lengths}
        batch = to_device(batch, device=self.device)
        # Encode input
        enc, _ = self.asr_model.encode(**batch)
        assert len(enc) == 1, len(enc)
        # Apply ctc layer to obtain log character probabilities
        lpz = self.ctc.log_softmax(enc).detach()
        #  Shape should be ( <time steps>, <classes> )
        lpz = lpz.squeeze(0).cpu().numpy()
        return lpz

    def _split_text(self, text):
        """Convert text to list and extract utterance IDs."""
        utt_ids = None
        # Handle multiline strings
        if isinstance(text, str):
            text = text.splitlines()
        # Remove empty lines
        text = list(filter(len, text))
        # Handle kaldi-style text format
        if self.kaldi_style_text:
            utt_ids_and_text = [utt.split(" ", 1) for utt in text]
            # remove utterances with empty text
            utt_ids_and_text = filter(lambda ui: len(ui) == 2,
                                      utt_ids_and_text)
            utt_ids_and_text = list(utt_ids_and_text)
            utt_ids = [utt[0] for utt in utt_ids_and_text]
            text = [utt[1] for utt in utt_ids_and_text]
        return utt_ids, text

    def prepare_segmentation_task(self, text, lpz, name=None, speech_len=None):
        """Preprocess text, and gather text and lpz into a task object.

        Text is pre-processed and tokenized depending on configuration.
        If ``speech_len`` is given, the timing configuration is updated.
        Text, lpz, and configuration is collected in a CTCSegmentationTask
        object. The resulting object can be serialized and passed in a
        multiprocessing computation.

        A minimal amount of text processing is done, i.e., splitting the
        utterances in ``text`` into a list and applying ``text_cleaner``.
        It is recommended that you normalize the text beforehand, e.g.,
        change numbers into their spoken equivalent word, remove special
        characters, and convert UTF-8 characters to chars corresponding to
        your ASR model dictionary.

        The text is tokenized based on the ``text_converter`` setting:

        The "tokenize" method is more efficient and the easiest for models
        based on latin or cyrillic script that only contain the main chars,
        ["a", "b", ...] or for Japanese or Chinese ASR models with ~3000
        short Kanji / Hanzi tokens.

        The "classic" method improves the the accuracy of the alignments
        for models that contain longer tokens, but with a greater complexity
        for computation. The function scans for partial tokens which may
        improve time resolution.
        For example, the word "▁really" will be broken down into
        ``['▁', '▁r', '▁re', '▁real', '▁really']``. The alignment will be
        based on the most probable activation sequence given by the network.

        Args:
            text: List or multiline-string with utterance ground truths.
            lpz: Log CTC posterior probabilities obtained from the CTC-network;
                numpy array shaped as ( <time steps>, <classes> ).
            name: Audio file name. Choose a unique name, or the original audio
                file name, to distinguish multiple audio files. Default: None.
            speech_len: Number of sample points. If given, the timing
                configuration is automatically derived from length of fs, length
                of speech and length of lpz. If None is given, make sure the
                timing parameters are correct, see time_stamps for reference!
                Default: None.

        Returns:
            task: CTCSegmentationTask object that can be passed to
                ``get_segments()`` in order to obtain alignments.
        """
        config = self.config
        # Update timing parameters, if needed
        if speech_len is not None:
            lpz_len = lpz.shape[0]
            timing_cfg = self.get_timing_config(speech_len, lpz_len)
            config.set(**timing_cfg)
        # `text` is needed in the form of a list.
        utt_ids, text = self._split_text(text)
        # Obtain utterance & label sequence from text
        if self.text_converter == "tokenize":
            # list of str --tokenize--> list of np.array
            token_list = [
                self.preprocess_fn("<dummy>", {"text": utt})["text"]
                for utt in text
            ]
            # filter out any instances of the <unk> token
            unk = config.char_list.index("<unk>")
            token_list = [utt[utt != unk] for utt in token_list]
            ground_truth_mat, utt_begin_indices = prepare_token_list(
                config, token_list)
        else:
            assert self.text_converter == "classic"
            text = [self.preprocess_fn.text_cleaner(utt) for utt in text]
            token_list = [
                "".join(self.preprocess_fn.tokenizer.text2tokens(utt))
                for utt in text
            ]
            token_list = [utt.replace("<unk>", "") for utt in token_list]
            ground_truth_mat, utt_begin_indices = prepare_text(
                config, token_list)
        task = CTCSegmentationTask(
            config=config,
            name=name,
            text=text,
            ground_truth_mat=ground_truth_mat,
            utt_begin_indices=utt_begin_indices,
            utt_ids=utt_ids,
            lpz=lpz,
        )
        return task

    @staticmethod
    def get_segments(task: CTCSegmentationTask):
        """Obtain segments for given utterance texts and CTC log posteriors.

        Args:
            task: CTCSegmentationTask object that contains ground truth and
                CTC posterior probabilities.

        Returns:
            result: Dictionary with alignments. Combine this with the task
                object to obtain a human-readable segments representation.
        """
        assert check_argument_types()
        assert task.config is not None
        config = task.config
        lpz = task.lpz
        ground_truth_mat = task.ground_truth_mat
        utt_begin_indices = task.utt_begin_indices
        text = task.text
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat)
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(config, utt_begin_indices,
                                                char_probs, timings, text)
        # Store results
        result = {
            "name": task.name,
            "timings": timings,
            "char_probs": char_probs,
            "state_list": state_list,
            "segments": segments,
            "done": True,
        }
        return result

    def __call__(
        self,
        speech: Union[torch.Tensor, np.ndarray],
        text: Union[List[str], str],
        fs: Optional[int] = None,
        name: Optional[str] = None,
    ) -> CTCSegmentationTask:
        """Align utterances.

        Args:
            speech: Audio file.
            text: List or multiline-string with utterance ground truths.
            fs: Sample rate in Hz. Optional, as this can be given when
                the module is initialized.
            name: Name of the file. Utterance names are derived from it.

        Returns:
            CTCSegmentationTask object with segments.
        """
        assert check_argument_types()
        if fs is not None:
            self.set_config(fs=fs)
        # Get log CTC posterior probabilities
        lpz = self.get_lpz(speech)
        # Conflate text & lpz & config as a segmentation task object
        task = self.prepare_segmentation_task(text, lpz, name, speech.shape[0])
        # Apply CTC segmentation
        segments = self.get_segments(task)
        task.set(**segments)
        assert check_return_type(task)
        return task
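
The "On multiprocessing" note in the docstring above splits alignment into three calls that can be distributed over worker processes. A minimal sketch of that workflow; the file names and model paths are assumptions, while the method names and signatures are the ones defined in the class:

import soundfile
from multiprocessing import Pool

aligner = CTCSegmentation(asr_train_config="config.yaml", asr_model_file="model.pth")
tasks = []
for wav, transcript in [("utt_a.wav", "utt1 FIRST TEXT"), ("utt_b.wav", "utt2 SECOND TEXT")]:
    speech, fs = soundfile.read(wav)
    aligner.set_config(fs=fs)
    lpz = aligner.get_lpz(speech)                       # (1) obtain CTC log posteriors
    task = aligner.prepare_segmentation_task(           # (2) build a serializable task object
        transcript, lpz, name=wav, speech_len=speech.shape[0]
    )
    tasks.append(task)

# (3) get_segments is a staticmethod, so it can run independently in worker processes
with Pool() as pool:
    results = pool.map(CTCSegmentation.get_segments, tasks)
for task, result in zip(tasks, results):
    task.set(**result)
    print(task)
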
Code example #10
File: forced_align.py (project: taylorlu/dulcet)
            if (phoneme != 358):
                temp.append(phoneme)
            else:
                phon_len[j] -= 1
        phonemes[j][:len(temp)] = temp

        model_out = model.predict(mel[np.newaxis, :mel_len[j], ...])
        pred_phon = model_out['encoder_output'][0]
        pred_phon = tf.nn.log_softmax(pred_phon)
        iphon_tar = model.text_pipeline.tokenizer.decode(
            phonemes[j][:phon_len[j]])
        iphon_tar = iphon_tar.split()

        char_list = [''] + list(
            model.text_pipeline.tokenizer.idx_to_token.values())
        config = CtcSegmentationParameters(char_list=char_list)
        config.index_duration = 0.0115545

        text = [phonemes[j][:phon_len[j]]]
        ground_truth_mat, utt_begin_indices = prepare_token_list(config, text)
        timings, char_probs, state_list = ctc_segmentation(
            config, pred_phon.numpy(), ground_truth_mat)
        utt_begin_indices = list(range(2, len(timings)))
        segments = determine_utterance_segments(config, utt_begin_indices,
                                                char_probs, timings, text[0])

        tg = tgt.core.TextGrid('haa')
        tier = tgt.core.IntervalTier(name='phonemes')

        if (segments[0][-1] < -0.001):
            segments[0] = (0, segments[0][1], segments[0][2])
Code example #11
class CTCSegmentation:
    """Align text to audio using CTC segmentation.

    Usage
    -----
    Initialize with given ASR model and parameters.
    If needed, parameters for CTC segmentation can be set with ``set_config(·)``.
    Then call the instance as function to align text within an audio file.

    Arguments
    ---------
    asr_model : EncoderDecoderASR
        Speechbrain ASR interface. This requires a model that has a
        trained CTC layer for inference. It is better to use a model with
        single-character tokens to get a better time resolution.
        Please note that the inference complexity with Transformer models
        usually increases quadratically with audio length.
        It is therefore recommended to use RNN-based models, if available.
    kaldi_style_text : bool
        A kaldi-style text file includes the name of the
        utterance at the start of the line. If True, the utterance name
        is expected as first word at each line. If False, utterance
        names are automatically generated. Set this option according to
        your input data. Default: True.
    text_converter : str
        How CTC segmentation handles text.
        "tokenize": Use the ASR model tokenizer to tokenize the text.
        "classic": The text is preprocessed as text pieces which takes
        token length into account. If the ASR model has longer tokens,
        this option may yield better results. Default: "tokenize".
    time_stamps : str
        Choose the method how the time stamps are
        calculated. While "fixed" and "auto" use both the sample rate,
        the ratio of samples to one frame is either automatically
        determined for each inference or fixed at a certain ratio that
        is initially determined by the module, but can be changed via
        the parameter ``samples_to_frames_ratio``. Recommended for
        longer audio files: "auto".
    **ctc_segmentation_args
        Parameters for CTC segmentation.
        The full list of parameters is found in ``set_config``.

    Example
    -------
        >>> # using example file included in the SpeechBrain repository
        >>> from speechbrain.pretrained import EncoderDecoderASR
        >>> from speechbrain.alignment.ctc_segmentation import CTCSegmentation
        >>> # load an ASR model
        >>> pre_trained = "speechbrain/asr-transformer-transformerlm-librispeech"
        >>> asr_model = EncoderDecoderASR.from_hparams(source=pre_trained)
        >>> aligner = CTCSegmentation(asr_model, kaldi_style_text=False)
        >>> # load data
        >>> audio_path = "./samples/audio_samples/example1.wav"
        >>> text = ["THE BIRCH CANOE", "SLID ON THE", "SMOOTH PLANKS"]
        >>> segments = aligner(audio_path, text, name="example1")

    On multiprocessing
    ------------------
    To parallelize the computation with multiprocessing, these three steps
    can be separated:
    (1) ``get_lpz``: obtain the lpz,
    (2) ``prepare_segmentation_task``: prepare the task, and
    (3) ``get_segments``: perform CTC segmentation.
    Note that the function `get_segments` is a staticmethod and therefore
    independent of an already initialized CTCSegmentation object.

    References
    ----------
    CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition
    2020, Kürzinger, Winkelbauer, Li, Watzel, Rigoll
    https://arxiv.org/abs/2007.09127

    More parameters are described in https://github.com/lumaku/ctc-segmentation

    """

    fs = 16000
    kaldi_style_text = True
    samples_to_frames_ratio = None
    time_stamps = "auto"
    choices_time_stamps = ["auto", "fixed"]
    text_converter = "tokenize"
    choices_text_converter = ["tokenize", "classic"]
    warned_about_misconfiguration = False
    config = CtcSegmentationParameters()

    def __init__(
        self,
        asr_model: Union[EncoderASR, EncoderDecoderASR],
        kaldi_style_text: bool = True,
        text_converter: str = "tokenize",
        time_stamps: str = "auto",
        **ctc_segmentation_args,
    ):
        """Initialize the CTCSegmentation module."""
        # Prepare ASR model
        if (isinstance(asr_model, EncoderDecoderASR)
                and not (hasattr(asr_model, "mods")
                         and hasattr(asr_model.mods, "decoder")
                         and hasattr(asr_model.mods.decoder, "ctc_weight"))
            ) or (isinstance(asr_model, EncoderASR)
                  and not (hasattr(asr_model, "mods")
                           and hasattr(asr_model.mods, "encoder")
                           and hasattr(asr_model.mods.encoder, "ctc_lin"))):
            raise AttributeError("The given asr_model has no CTC module!")
        if not hasattr(asr_model, "tokenizer"):
            raise AttributeError(
                "The given asr_model has no tokenizer in asr_model.tokenizer!")
        self.asr_model = asr_model
        self._encode = self.asr_model.encode_batch
        if isinstance(asr_model, EncoderDecoderASR):
            # Assumption: log-softmax is already included in ctc_forward_step
            self._ctc = self.asr_model.mods.decoder.ctc_forward_step
        else:
            # Apply log-softmax to encoder output
            self._ctc = self.asr_model.hparams.log_softmax
        self._tokenizer = self.asr_model.tokenizer

        # Apply configuration
        self.set_config(
            fs=self.asr_model.hparams.sample_rate,
            time_stamps=time_stamps,
            kaldi_style_text=kaldi_style_text,
            text_converter=text_converter,
            **ctc_segmentation_args,
        )

        # determine token or character list
        char_list = [
            asr_model.tokenizer.id_to_piece(i)
            for i in range(asr_model.tokenizer.vocab_size())
        ]
        self.config.char_list = char_list

        # Warn about possible misconfigurations
        max_char_len = max([len(c) for c in char_list])
        if len(char_list) > 500 and max_char_len >= 8:
            logger.warning(f"The dictionary has {len(char_list)} tokens with "
                           f"a max length of {max_char_len}. This may lead "
                           f"to low alignment performance and low accuracy.")

    def set_config(
        self,
        time_stamps: Optional[str] = None,
        fs: Optional[int] = None,
        samples_to_frames_ratio: Optional[float] = None,
        set_blank: Optional[int] = None,
        replace_spaces_with_blanks: Optional[bool] = None,
        kaldi_style_text: Optional[bool] = None,
        text_converter: Optional[str] = None,
        gratis_blank: Optional[bool] = None,
        min_window_size: Optional[int] = None,
        max_window_size: Optional[int] = None,
        scoring_length: Optional[int] = None,
    ):
        """Set CTC segmentation parameters.

        Parameters for timing
        ---------------------
        time_stamps : str
            Select method how CTC index duration is estimated, and
            thus how the time stamps are calculated.
        fs : int
            Sample rate. Usually derived from ASR model; use this parameter
            to overwrite the setting.
        samples_to_frames_ratio : float
            If you want to directly determine the
            ratio of samples to CTC frames, set this parameter, and
            set ``time_stamps`` to "fixed".
            Note: If you want to calculate the time stamps from a model
            with fixed subsampling, set this parameter to:
            ``subsampling_factor * frame_duration / 1000``.

        Parameters for text preparation
        -------------------------------
        set_blank : int
            Index of blank in token list. Default: 0.
        replace_spaces_with_blanks : bool
            Inserts blanks between words, which is
            useful for handling long pauses between words. Only used in
            ``text_converter="classic"`` preprocessing mode. Default: False.
        kaldi_style_text : bool
            Determines whether the utterance name is expected
            as first word of the utterance. Set at module initialization.
        text_converter : str
            How CTC segmentation handles text.
            Set at module initialization.

        Parameters for alignment
        ------------------------
        min_window_size : int
            Minimum number of frames considered for a single
            utterance. The current default value of 8000 corresponds to
            roughly 4 minutes (depending on ASR model) and should be OK in
            most cases. If your utterances are further apart, increase
            this value, or decrease it for smaller audio files.
        max_window_size : int
            Maximum window size. It should not be necessary
            to change this value.
        gratis_blank : bool
            If True, the transition cost of blank is set to zero.
            Useful for long preambles or if there are large unrelated segments
            between utterances. Default: False.

        Parameters for calculation of confidence score
        ----------------------------------------------
        scoring_length : int
            Block length to calculate confidence score. The
            default value of 30 should be OK in most cases.
            30 corresponds to roughly 1-2s of audio.
        """
        # Parameters for timing
        if time_stamps is not None:
            if time_stamps not in self.choices_time_stamps:
                raise NotImplementedError(
                    f"Parameter ´time_stamps´ has to be one of "
                    f"{list(self.choices_time_stamps)}", )
            self.time_stamps = time_stamps
        if fs is not None:
            self.fs = float(fs)
        if samples_to_frames_ratio is not None:
            self.samples_to_frames_ratio = float(samples_to_frames_ratio)
        # Parameters for text preparation
        if set_blank is not None:
            self.config.blank = int(set_blank)
        if replace_spaces_with_blanks is not None:
            self.config.replace_spaces_with_blanks = bool(
                replace_spaces_with_blanks)
        if kaldi_style_text is not None:
            self.kaldi_style_text = bool(kaldi_style_text)
        if text_converter is not None:
            if text_converter not in self.choices_text_converter:
                raise NotImplementedError(
                    f"Parameter ´text_converter´ has to be one of "
                    f"{list(self.choices_text_converter)}", )
            self.text_converter = text_converter
        # Parameters for alignment
        if min_window_size is not None:
            self.config.min_window_size = int(min_window_size)
        if max_window_size is not None:
            self.config.max_window_size = int(max_window_size)
        if gratis_blank is not None:
            self.config.blank_transition_cost_zero = bool(gratis_blank)
        if (self.config.blank_transition_cost_zero
                and self.config.replace_spaces_with_blanks
                and not self.warned_about_misconfiguration):
            logger.error(
                "Blanks are inserted between words, and also the transition cost of"
                " blank is zero. This configuration may lead to misalignments!"
            )
            self.warned_about_misconfiguration = True
        # Parameter for calculation of confidence score
        if scoring_length is not None:
            self.config.score_min_mean_over_L = int(scoring_length)

    def get_timing_config(self, speech_len=None, lpz_len=None):
        """Obtain parameters to determine time stamps."""
        timing_cfg = {
            "index_duration": self.config.index_duration,
        }
        # As the parameter ctc_index_duration vetoes the other
        if self.time_stamps == "fixed":
            # Initialize the value, if not yet available
            if self.samples_to_frames_ratio is None:
                ratio = self.estimate_samples_to_frames_ratio()
                self.samples_to_frames_ratio = ratio
            index_duration = self.samples_to_frames_ratio / self.fs
        else:
            assert self.time_stamps == "auto"
            samples_to_frames_ratio = speech_len / lpz_len
            index_duration = samples_to_frames_ratio / self.fs
        timing_cfg["index_duration"] = index_duration
        return timing_cfg

    def estimate_samples_to_frames_ratio(self, speech_len=215040):
        """Determine the ratio of encoded frames to sample points.

        This method helps to determine the time a single encoded frame occupies.
        As the sample rate already gave the number of samples, only the ratio
        of samples per encoded CTC frame is needed. This function estimates it by
        doing one inference, which is only needed once.

        Args
        ----
        speech_len : int
            Length of randomly generated speech vector for single
            inference. Default: 215040.

        Returns
        -------
        int
            Estimated ratio.
        """
        random_input = torch.rand(speech_len)
        lpz = self.get_lpz(random_input)
        lpz_len = lpz.shape[0]
        # CAVEAT assumption: Frontend does not discard trailing data!
        samples_to_frames_ratio = speech_len / lpz_len
        return samples_to_frames_ratio

    @torch.no_grad()
    def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
        """Obtain CTC posterior log probabilities for given speech data.

        Args
        ----
        speech : Union[torch.Tensor, np.ndarray]
            Speech audio input.

        Returns
        -------
        np.ndarray
            Numpy vector with CTC log posterior probabilities.
        """
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)
        # Batch data: (Nsamples,) -> (1, Nsamples)
        speech = speech.unsqueeze(0).to(self.asr_model.device)
        wav_lens = torch.tensor([1.0]).to(self.asr_model.device)
        enc = self._encode(speech, wav_lens)
        # Apply ctc layer to obtain log character probabilities
        lpz = self._ctc(enc).detach()
        #  Shape should be ( <time steps>, <classes> )
        lpz = lpz.squeeze(0).cpu().numpy()
        return lpz

    def _split_text(self, text):
        """Convert text to list and extract utterance IDs."""
        utt_ids = None
        # Handle multiline strings
        if isinstance(text, str):
            text = text.splitlines()
        # Remove empty lines
        text = list(filter(len, text))
        # Handle kaldi-style text format
        if self.kaldi_style_text:
            utt_ids_and_text = [utt.split(" ", 1) for utt in text]
            # remove utterances with empty text
            utt_ids_and_text = filter(lambda ui: len(ui) == 2,
                                      utt_ids_and_text)
            utt_ids_and_text = list(utt_ids_and_text)
            utt_ids = [utt[0] for utt in utt_ids_and_text]
            text = [utt[1] for utt in utt_ids_and_text]
        return utt_ids, text

    def prepare_segmentation_task(self, text, lpz, name=None, speech_len=None):
        """Preprocess text, and gather text and lpz into a task object.

        Text is pre-processed and tokenized depending on configuration.
        If ``speech_len`` is given, the timing configuration is updated.
        Text, lpz, and configuration is collected in a CTCSegmentationTask
        object. The resulting object can be serialized and passed in a
        multiprocessing computation.

        It is recommended that you normalize the text beforehand, e.g.,
        change numbers into their spoken equivalent word, remove special
        characters, and convert UTF-8 characters to chars corresponding to
        your ASR model dictionary.

        The text is tokenized based on the ``text_converter`` setting:

        The "tokenize" method is more efficient and the easiest for models
        based on latin or cyrillic script that only contain the main chars,
        ["a", "b", ...] or for Japanese or Chinese ASR models with ~3000
        short Kanji / Hanzi tokens.

        The "classic" method improves the the accuracy of the alignments
        for models that contain longer tokens, but with a greater complexity
        for computation. The function scans for partial tokens which may
        improve time resolution.
        For example, the word "▁really" will be broken down into
        ``['▁', '▁r', '▁re', '▁real', '▁really']``. The alignment will be
        based on the most probable activation sequence given by the network.

        Args
        ----
        text : list
            List or multiline-string with utterance ground truths.
        lpz : np.ndarray
            Log CTC posterior probabilities obtained from the CTC-network;
            numpy array shaped as ( <time steps>, <classes> ).
        name : str
            Audio file name that will be included in the segments output.
            Choose a unique name, or the original audio
            file name, to distinguish multiple audio files. Default: None.
        speech_len : int
            Number of sample points. If given, the timing
            configuration is automatically derived from length of fs, length
            of speech and length of lpz. If None is given, make sure the
            timing parameters are correct, see time_stamps for reference!
            Default: None.

        Returns
        -------
        CTCSegmentationTask
            Task object that can be passed to
            ``CTCSegmentation.get_segments()`` in order to obtain alignments.
        """
        config = self.config
        # Update timing parameters, if needed
        if speech_len is not None:
            lpz_len = lpz.shape[0]
            timing_cfg = self.get_timing_config(speech_len, lpz_len)
            config.set(**timing_cfg)
        # `text` is needed in the form of a list.
        utt_ids, text = self._split_text(text)
        # Obtain utterance & label sequence from text
        if self.text_converter == "tokenize":
            # list of str --tokenize--> list of np.array
            token_list = [
                np.array(self._tokenizer.encode_as_ids(utt)) for utt in text
            ]
            # filter out any instances of the <unk> token
            unk = config.char_list.index("<unk>")
            token_list = [utt[utt != unk] for utt in token_list]
            ground_truth_mat, utt_begin_indices = prepare_token_list(
                config, token_list)
        else:
            assert self.text_converter == "classic"
            text_pieces = [
                "".join(self._tokenizer.encode_as_pieces(utt)) for utt in text
            ]
            # filter out any instances of the <unk> token
            text_pieces = [utt.replace("<unk>", "") for utt in text_pieces]
            ground_truth_mat, utt_begin_indices = prepare_text(
                config, text_pieces)
        task = CTCSegmentationTask(
            config=config,
            name=name,
            text=text,
            ground_truth_mat=ground_truth_mat,
            utt_begin_indices=utt_begin_indices,
            utt_ids=utt_ids,
            lpz=lpz,
        )
        return task

    @staticmethod
    def get_segments(task: CTCSegmentationTask):
        """Obtain segments for given utterance texts and CTC log posteriors.

        Args
        ----
        task : CTCSegmentationTask
            Task object that contains ground truth and
            CTC posterior probabilities.

        Returns
        -------
        dict
            Dictionary with alignments. Combine this with the task
            object to obtain a human-readable segments representation.
        """
        assert type(task) == CTCSegmentationTask
        assert task.config is not None
        config = task.config
        lpz = task.lpz
        ground_truth_mat = task.ground_truth_mat
        utt_begin_indices = task.utt_begin_indices
        text = task.text
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat)
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(config, utt_begin_indices,
                                                char_probs, timings, text)
        # Store results
        result = {
            "name": task.name,
            "timings": timings,
            "char_probs": char_probs,
            "state_list": state_list,
            "segments": segments,
            "done": True,
        }
        return result

    def __call__(
        self,
        speech: Union[torch.Tensor, np.ndarray, str, Path],
        text: Union[List[str], str],
        name: Optional[str] = None,
    ) -> CTCSegmentationTask:
        """Align utterances.

        Args
        ----
        speech : Union[torch.Tensor, np.ndarray, str, Path]
            Audio file that can be given as path or as array.
        text : Union[List[str], str]
            List or multiline-string with utterance ground truths.
            The required formatting depends on the setting ``kaldi_style_text``.
        name : str
            Name of the file. Utterance names are derived from it.

        Returns
        -------
        CTCSegmentationTask
            Task object with segments. Apply str(·) or print(·) on it
            to obtain the segments list.
        """
        if isinstance(speech, str) or isinstance(speech, Path):
            speech = self.asr_model.load_audio(speech)
        # Get log CTC posterior probabilities
        lpz = self.get_lpz(speech)
        # Conflate text & lpz & config as a segmentation task object
        task = self.prepare_segmentation_task(text, lpz, name, speech.shape[0])
        # Apply CTC segmentation
        segments = self.get_segments(task)
        task.set(**segments)
        return task
Code example #12
def validate_asr_with_alignment(asr_model, val_ds, num_to_validate):
    
    val_set = []
    with open(val_ds) as F:
        for line in F:
            val = json.loads(line)
            val_set.append(val)
    val_files = [t["audio_filepath"] for t in val_set[0:num_to_validate]]
    val_text = [t["text"] for t in val_set[0:num_to_validate]]
    test_cfg = asr_model.cfg['validation_ds']
    test_cfg['manifest_filepath'] = val_ds
    asr_model.setup_test_data(test_cfg)  #TODO: what is this doing?
    calc_wer(asr_model)
    asr_model.preprocessor._sample_rate = test_cfg['sample_rate']
    print("batch size: ", test_cfg['batch_size'],
          "preprocessor sample_rate: ", asr_model.preprocessor._sample_rate)
    
    logprobs_list = asr_model.transcribe(val_files, batch_size=test_cfg['batch_size'], logprobs=True)
    nlogprobs = len(logprobs_list)
    alphabet  = [t for t in asr_model.cfg['labels']] + ['%'] # converting to list and adding blank character.

    # adapted example from here:
    # https://github.com/lumaku/ctc-segmentation
    config = CtcSegmentationParameters()
    config.frame_duration_ms = 20  #frame duration is the window of the predictions (i.e. logprobs prediction window) 
    config.blank = len(alphabet)-1 #index for character that is intended for 'blank' - in our case, we specify the last character in alphabet.

    for ii in range(nlogprobs):
        transcript = val_text[ii]

        ground_truth_mat, utt_begin_indices = prepare_text(config, transcript, alphabet)

        timings, char_probs, state_list = ctc_segmentation(config, logprobs_list[ii].cpu().numpy(), ground_truth_mat)

        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(config, utt_begin_indices, char_probs, timings, transcript)
        
        quartznet_transcript = asr_model.transcribe([val_files[ii]])

        print('Ground Truth Transcript:', transcript)
        print('Quartznet Transcript:', quartznet_transcript[0])
        print('CTC Segmentation Dense Sequence:\n', ''.join(state_list))

        #save onset per word.
        print('Saving timing prediction.')
        fname = open(val_files[ii][:-4]+'_align.csv','w') #jamendolyrics convention
        for word in transcript.split():
            # Locate the word in the transcript: match.start() is the index of its first
            # character and match.end() is one past its last character (hence the -1 below).
            char = re.search(r'\b({})\b'.format(word), transcript)
            # segments[character index] = (start time, end time, score)
            onset = segments[char.start()][0]
            term = segments[char.end() - 1][1]
            fname.write(str(onset) + ',' + str(term) + '\n')
        fname.close()
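The per-word timing lookup at the end of this example relies on `segments` having one (start, end, score) entry per transcript character, so the match positions returned by re.search index directly into it. A self-contained toy sketch of the same lookup, with all values invented:

# Toy sketch of the per-word timing lookup above; timing values are made up.
import re

transcript = "cat dog"
segments = [(0.0, 0.1, -0.5), (0.1, 0.2, -0.5), (0.2, 0.3, -0.5),   # "c" "a" "t"
            (0.3, 0.4, -0.5),                                       # " "
            (0.4, 0.5, -0.5), (0.5, 0.6, -0.5), (0.6, 0.7, -0.5)]   # "d" "o" "g"

for word in transcript.split():
    match = re.search(r'\b({})\b'.format(word), transcript)
    onset = segments[match.start()][0]     # start time of the word's first character
    term = segments[match.end() - 1][1]    # end time of the word's last character
    print(word, onset, term)
# cat 0.0 0.3
# dog 0.4 0.7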
Code example #13
0
    transcript = 'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'.lower()

    #build typical alphabet
    alphabet = [" ", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                   "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "'",'%']

    quartznet = nemo_asr.models.EncDecCTCModel.from_pretrained(model_name="QuartzNet15x5Base-En")
    
    logprobs = quartznet.transcribe([filename],logprobs=True)
    
    greedy_transcript = predict_labels_greedy(alphabet,logprobs[0].cpu().numpy())

    # adapted example from here:
    # https://github.com/lumaku/ctc-segmentation
    config                              = CtcSegmentationParameters()

    # duration of one logprobs frame (the model's prediction window), in ms
    config.frame_duration_ms = 20
    # index of the 'blank' character; here, the last entry in the alphabet
    config.blank = len(alphabet)-1
    ground_truth_mat, utt_begin_indices = prepare_text(config,transcript,alphabet)
    
    timings, char_probs, state_list     = ctc_segmentation(config,logprobs[0].cpu().numpy(),ground_truth_mat)
    
    # Obtain list of utterances with time intervals and confidence score
    segments                            = determine_utterance_segments(config, utt_begin_indices, char_probs, timings, transcript)
    
    quartznet_transcript = quartznet.transcribe([filename])

    print('Ground Truth Transcript:',transcript)
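Because the transcript is passed to prepare_text as a plain string, the resulting segments line up with the transcript character by character, as the indexing comments in the previous example rely on. A toy sketch of reading them out, with invented timing values:

# Toy sketch (invented values): one (start, end, score) tuple per transcript character.
transcript = "cat"
segments = [(0.00, 0.12, -0.4), (0.12, 0.25, -0.3), (0.25, 0.40, -0.5)]

for ch, (start, end, score) in zip(transcript, segments):
    print(f"{ch}: {start:.2f}s - {end:.2f}s (score {score:.2f})")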
Code example #14
0
File: extract_durations.py  Project: taylorlu/dulcet
                                     keep_checkpoint_every_n_hours=config_dict['keep_checkpoint_every_n_hours'])
manager_training = tf.train.CheckpointManager(checkpoint, str(config.weights_dir / 'latest'),
                                              max_to_keep=1, checkpoint_name='latest')

checkpoint.restore(manager_training.latest_checkpoint)
if manager_training.latest_checkpoint:
    print(f'\nresuming training from step {model.step} ({manager_training.latest_checkpoint})')
else:
    print(f'\nstarting training from scratch')

all_durations = np.array([])
iterator = tqdm(enumerate(dataset.all_batches()))
step = 0

char_list = [''] + list(model.text_pipeline.tokenizer.idx_to_token.values())
smt_config = CtcSegmentationParameters(char_list=char_list)
smt_config.index_duration = 0.0115545

labelFile = open(r'/root/mydata/Corpus/transformer_tts_data.corpus/phonemized_metadata.NoStress2.txt', 'w')

for c, (spk_name_batch, mel_batch, phoneme_batch, mel_len_batch, phon_len_batch, fname_batch) in iterator:
    iterator.set_description(f'Processing dataset')

    model_out = model.predict(mel_batch)
    pred_phon = model_out['encoder_output']
    pred_phon = tf.nn.log_softmax(pred_phon)

    for i, name in enumerate(fname_batch):
        os.makedirs(os.path.join(config.duration_dir, spk_name_batch[i].numpy().decode()), exist_ok=True)

        text = list(phoneme_batch[i][:phon_len_batch[i]].numpy())
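The snippet is cut off before the alignment step. As a rough, self-contained sketch of how tokenized text and a log-softmax output feed the segmentation functions, here is a toy example with an invented token list and random log-probabilities; it is not the project's actual continuation.

# Self-contained toy sketch: random log-probabilities and a toy token list, only
# to illustrate how prepare_tokenized_text and ctc_segmentation fit together.
import numpy as np
from ctc_segmentation import (CtcSegmentationParameters, ctc_segmentation,
                              determine_utterance_segments, prepare_tokenized_text)

toy_char_list = ["•", "a", "b", "c"]                 # index 0 acts as the blank token
toy_config = CtcSegmentationParameters(char_list=toy_char_list)
toy_config.index_duration = 0.0115545                # seconds per lpz frame, as above
toy_config.min_window_size = 20                      # small window for the short toy input
toy_text = ["a b c"]                                 # one utterance of space-separated tokens
toy_lpz = np.log(np.random.dirichlet(np.ones(len(toy_char_list)), size=60))

ground_truth_mat, utt_begin_indices = prepare_tokenized_text(toy_config, toy_text)
timings, char_probs, state_list = ctc_segmentation(toy_config, toy_lpz, ground_truth_mat)
segments = determine_utterance_segments(toy_config, utt_begin_indices,
                                        char_probs, timings, toy_text)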
Code example #15
0
File: asr_align.py  Project: sw005320/espnet-1
def ctc_align(args, device):
    """ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args.

    :param args: given configuration
    :param device: for inference; one of ['cuda', 'cpu']
    :return:  0 on success
    """
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None else args.preprocess_conf,
        preprocess_args={"train": False},
    )
    logging.info(f"Decoding device={device}")
    model.to(device=device).eval()
    # read audio and text json data
    with open(args.data_json, "rb") as f:
        js = json.load(f)["utts"]
    with open(args.utt_text, "r") as f:
        lines = f.readlines()
        i = 0
        text = {}
        segment_names = {}
        for name in js.keys():
            text_per_audio = []
            segment_names_per_audio = []
            while i < len(lines) and lines[i].startswith(name):
                text_per_audio.append(lines[i][lines[i].find(" ") + 1:])
                segment_names_per_audio.append(lines[i][:lines[i].find(" ")])
                i += 1
            text[name] = text_per_audio
            segment_names[name] = segment_names_per_audio
    # apply configuration
    config = CtcSegmentationParameters()
    if args.subsampling_factor is not None:
        config.subsampling_factor = args.subsampling_factor
    if args.frame_duration is not None:
        config.frame_duration_ms = args.frame_duration
    if args.min_window_size is not None:
        config.min_window_size = args.min_window_size
    if args.max_window_size is not None:
        config.max_window_size = args.max_window_size
    char_list = train_args.char_list
    if args.use_dict_blank:
        config.blank = char_list[0]
    logging.debug(
        f"Frame timings: {config.frame_duration_ms}ms * {config.subsampling_factor}"
    )
    # Iterate over audio files to decode and align
    for idx, name in enumerate(js.keys(), 1):
        logging.info("(%d/%d) Aligning " + name, idx, len(js.keys()))
        batch = [(name, js[name])]
        feat, label = load_inputs_and_targets(batch)
        feat = feat[0]
        with torch.no_grad():
            # Encode input frames
            enc_output = model.encode(
                torch.as_tensor(feat).to(device)).unsqueeze(0)
            # Apply ctc layer to obtain log character probabilities
            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
        # Prepare the text for aligning
        ground_truth_mat, utt_begin_indices = prepare_text(
            config, text[name], char_list)
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat)
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(config, utt_begin_indices,
                                                char_probs, timings,
                                                text[name])
        # Write to "segments" file
        for i, boundary in enumerate(segments):
            utt_segment = (f"{segment_names[name][i]} {name} {boundary[0]:.2f}"
                           f" {boundary[1]:.2f} {boundary[2]:.9f}\n")
            args.output.write(utt_segment)
    return 0
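Each line written to args.output follows the format built in the f-string above: segment name, audio name, start and end times in seconds (two decimals), and the confidence score (nine decimals). With invented values, one output line might read:

utt1_0001 recording1 0.26 4.12 -0.151234567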