Esempio n. 1
0
def supervision_set():
    """Build a ``SupervisionSet`` holding one fully annotated segment.

    The single segment ("segment-1" of "recording-1") carries text, language,
    speaker and gender metadata plus a word-level alignment stored under the
    ``'word'`` key.
    """
    # (symbol, start, duration) for every word of the transcript.
    word_timings = (
        ('transcript', 0.1, 0.08),
        ('of', 0.18, 0.02),
        ('the', 0.2, 0.03),
        ('first', 0.23, 0.07),
        ('segment', 0.3, 0.1),
    )
    words = [
        AlignmentItem(symbol=symbol, start=start, duration=duration)
        for symbol, start, duration in word_timings
    ]
    segment = SupervisionSegment(
        id='segment-1',
        recording_id='recording-1',
        channel=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
        alignment={'word': words},
    )
    return SupervisionSet.from_segments([segment])
Esempio n. 2
0
def supervisions():
    """Return two supervisions of recording "rec", each with a word alignment.

    The first segment belongs to speaker "SpkA" and aligns the words "a" and
    "b"; the second belongs to "SpkB" and aligns a single word "a". Both
    segments use the id "sup".
    """
    spk_a_words = [
        AlignmentItem(symbol="a", start=0, duration=0.1),
        AlignmentItem(symbol="b", start=0.2, duration=0.2),
    ]
    spk_a = SupervisionSegment(
        "sup",
        "rec",
        start=0,
        duration=0.5,
        speaker="SpkA",
        alignment={"word": spk_a_words},
    )

    spk_b_words = [AlignmentItem(symbol="a", start=0.6, duration=0.2)]
    spk_b = SupervisionSegment(
        "sup",
        "rec",
        start=0.6,
        duration=0.2,
        speaker="SpkB",
        alignment={"word": spk_b_words},
    )

    return [spk_a, spk_b]
Esempio n. 3
0
def supervision_set():
    """Create a single-segment ``SupervisionSet`` with a word-level alignment.

    The segment describes "recording-1", channel 0, and its alignment (key
    ``"word"``) lists one ``AlignmentItem`` per word of the transcript.
    """
    # Parallel columns: word symbols, their start times and their durations.
    tokens = ("transcript", "of", "the", "first", "segment")
    onsets = (0.1, 0.18, 0.2, 0.23, 0.3)
    lengths = (0.08, 0.02, 0.03, 0.07, 0.1)

    aligned_words = []
    for token, onset, length in zip(tokens, onsets, lengths):
        aligned_words.append(
            AlignmentItem(symbol=token, start=onset, duration=length)
        )

    first_segment = SupervisionSegment(
        id="segment-1",
        recording_id="recording-1",
        channel=0,
        start=0.1,
        duration=0.3,
        text="transcript of the first segment",
        language="english",
        speaker="Norman Dyhrentfurth",
        gender="male",
        alignment={"word": aligned_words},
    )
    return SupervisionSet.from_segments([first_segment])
Esempio n. 4
0
def supervisions():
    """Return a list of two aligned supervisions for recording 'rec'.

    Segment one (speaker "SpkA") aligns the words 'a' and 'b'; segment two
    (speaker "SpkB") aligns the word 'a'. Both segments share the id 'sup'.
    """

    def word(symbol, start, duration):
        # Small factory to keep the segment definitions below compact.
        return AlignmentItem(symbol=symbol, start=start, duration=duration)

    return [
        SupervisionSegment(
            'sup',
            'rec',
            start=0,
            duration=0.5,
            speaker="SpkA",
            alignment={'word': [word('a', 0, 0.1), word('b', 0.2, 0.2)]},
        ),
        SupervisionSegment(
            'sup',
            'rec',
            start=0.6,
            duration=0.2,
            speaker="SpkB",
            alignment={'word': [word('a', 0.6, 0.2)]},
        ),
    ]
Esempio n. 5
0
def external_alignment() -> Dict[str, List[AlignmentItem]]:
    """Return a word-level alignment for "transcript of the first segment".

    The result maps the alignment type ``'word'`` to one ``AlignmentItem``
    per word, each built positionally from the three values listed below.
    """
    # Each row is passed positionally to AlignmentItem, in this order.
    rows = (
        ("transcript", 0.1, 0.08),
        ("of", 0.18, 0.02),
        ("the", 0.2, 0.03),
        ("first", 0.23, 0.07),
        ("segment", 0.3, 0.1),
    )
    return {'word': [AlignmentItem(*row) for row in rows]}
Esempio n. 6
0
def test_create_supervision_segment_with_all_metadata():
    """A segment with every metadata field populated can be constructed.

    The test passes as long as the constructor does not raise; the created
    object is intentionally discarded.
    """
    word_alignment = [
        AlignmentItem(symbol="wysokie", start=0.0, duration=0.05),
        AlignmentItem(symbol="szczyty", start=0.05, duration=0.05),
    ]
    SupervisionSegment(
        id="X",
        recording_id="X",
        start=0.0,
        duration=0.1,
        channel=0,
        text="wysokie szczyty",
        language="polish",
        speaker="Janusz",
        gender="male",
        alignment={"word": word_alignment},
    )
Esempio n. 7
0
def test_create_supervision_segment_with_all_metadata():
    """Smoke test: building a segment with all metadata fields must not raise."""
    # Every field is passed by keyword, so the dict order is irrelevant.
    metadata = dict(
        id='X',
        recording_id='X',
        start=0.0,
        duration=0.1,
        channel=0,
        text='wysokie szczyty',
        language='polish',
        speaker='Janusz',
        gender='male',
    )
    metadata['alignment'] = {
        'word': [
            AlignmentItem(symbol='wysokie', start=0.0, duration=0.05),
            AlignmentItem(symbol='szczyty', start=0.05, duration=0.05),
        ]
    }
    SupervisionSegment(**metadata)
Esempio n. 8
0
 def _with_alignment(self, cut: MonoCut,
                     text: str) -> Dict[str, List[AlignmentItem]]:
     """Create an evenly spaced "subword" alignment of ``text`` over ``cut``.

     ``text`` is cut into consecutive chunks of at most 3 characters, and
     ``cut.duration`` is divided equally between the chunks, the first one
     starting at 0.

     :param cut: the cut whose ``duration`` is distributed over the chunks.
     :param text: the text to split; an empty string raises
         ``ZeroDivisionError`` because there are no chunks to divide by.
     :return: a dict with the single key ``"subword"`` holding the items.
     """
     chunk_size = 3
     pieces = []
     for offset in range(0, len(text), chunk_size):
         pieces.append(text[offset:offset + chunk_size])

     step = cut.duration / len(pieces)

     items = []
     for position, piece in enumerate(pieces):
         items.append(
             AlignmentItem(symbol=piece, start=position * step, duration=step))
     return {"subword": items}
Esempio n. 9
0
def dummy_alignment(text: str = "irrelevant",
                    start: float = 0.0,
                    duration: float = 1.0) -> "Dict[str, List[AlignmentItem]]":
    """Create an evenly spaced "subword" alignment for ``text``.

    ``text`` is split into consecutive chunks of at most 3 characters and
    ``duration`` is shared equally between them, beginning at ``start``.

    :param text: the text to split into subwords; must be non-empty, since an
        empty string yields no chunks and raises ``ZeroDivisionError``.
    :param start: start time of the first subword.
    :param duration: total time span distributed over all subwords.
    :return: a dict with the single key ``'subword'`` mapping to the list of
        ``AlignmentItem`` objects (not a single ``AlignmentItem``).
    """
    # The return annotation is quoted so it needs no typing names at runtime.
    subwords = [text[i:i + 3]
                for i in range(0, len(text), 3)]  # Create subwords of 3 chars
    dur = duration / len(subwords)
    alignment = [
        AlignmentItem(symbol=sub, start=start + i * dur, duration=dur)
        for i, sub in enumerate(subwords)
    ]
    return {'subword': alignment}
Esempio n. 10
0
def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    """Parse a word-alignment text file into per-utterance alignment items.

    Every non-blank line must contain exactly three whitespace-separated
    fields: the utterance id, a comma-separated list of words and a
    comma-separated list of timestamps. Double quotes in the last two fields
    are removed. Each timestamp is used as the end time of the word at the
    same position; the first word starts at 0.0 and every following word
    starts where the previous one ended.

    :param ali_path: path to the alignment file.
    :return: a dict mapping each utterance id to its list of
        ``AlignmentItem`` objects, with durations rounded to 8 decimal places.
    :raises ValueError: if a non-blank line does not have exactly three
        fields, or a timestamp cannot be converted to ``float``.
    """
    alignments = {}
    for line in Path(ali_path).read_text().splitlines():
        # Blank lines carry no data; skipping them avoids an unpacking error.
        if not line.strip():
            continue
        utt_id, words, timestamps = line.split()
        words = words.replace('"', "").split(",")
        end_times = [float(t) for t in timestamps.replace('"', "").split(",")]
        # Word i spans from the previous word's end (0.0 for the first) to its own end.
        start_times = [0.0] + end_times
        alignments[utt_id] = [
            AlignmentItem(
                symbol=word, start=start, duration=round(end - start, ndigits=8)
            )
            for word, start, end in zip(words, start_times, end_times)
        ]
    return alignments
Esempio n. 11
0
    def align_ctm(
            self, cuts: Union[CutSet, AnyCut]
    ) -> List[Union[List[AlignmentItem], None]]:
        """
        Perform forced alignment and convert the per-frame symbol IDs into a
        CTM-like list of ``AlignmentItem`` (start, duration, symbol) per cut:
        a leading silence item, one item per run of frames sharing a phone,
        and a trailing silence item that extends to ``cut.duration``.

        :param cuts: a ``CutSet`` or a single cut; a single cut is wrapped
            into a one-element ``CutSet`` first.
        :return: one entry per cut, in iteration order. An entry is ``None``
            when the cut was skipped, i.e. its alignment has no speech frames
            or the speech region ends after ``cut.duration``; a warning is
            logged in both cases.
        """
        # TODO: I am not sure that this method is extracting the alignment 100% correctly:
        #       need to revise...
        # TODO: when K2/Snowfall has a standard way of indicating what is silence,
        #       or we update the model, update the constants below.
        EPS = 0
        SIL = 1
        non_speech = {EPS, SIL}

        def to_s(n: int) -> float:
            # Convert a number of frames to seconds.
            FRAME_SHIFT = 0.04  # 0.01 * 4 subsampling
            return round(n * FRAME_SHIFT, ndigits=3)

        if isinstance(cuts, (Cut, MixedCut)):
            cuts = CutSet.from_cuts([cuts])

        # Uppercase and remove punctuation
        cuts = cuts.map_supervisions(self.normalize_text)
        alignments = self.align(cuts).tolist()

        ctm_alis = []
        for cut, alignment in zip(cuts, alignments):
            # First we determine the silence regions at the beginning and the end:
            # we assume that every SIL and <eps> before the first phone, and after the last phone,
            # are representing silence.
            speech_idxs = [
                idx for idx, s in enumerate(alignment) if s not in non_speech
            ]
            if not speech_idxs:
                # Nothing but SIL/<eps>: there is no speech region to convert.
                logging.warning(
                    f"No speech frames found in the alignment. Skipping cut {cut.id}"
                )
                ctm_alis.append(None)
                continue
            first_speech_idx = speech_idxs[0]
            # Exclusive end, so that the last speech frame is part of the region
            # (slicing up to the last index itself would drop the final phone).
            speech_end_idx = speech_idxs[-1] + 1

            speech_end_timestamp = to_s(speech_end_idx)
            if speech_end_timestamp > cut.duration:
                logging.warning(
                    f"speech_end_timestamp > cut.duration. Skipping cut {cut.id}"
                )
                ctm_alis.append(None)
                continue

            speech_ali = alignment[first_speech_idx:speech_end_idx]
            ctm_ali = [
                AlignmentItem(start=0.0,
                              duration=to_s(first_speech_idx),
                              symbol=self.lexicon.phones[SIL])
            ]

            # Then, we iterate over the speech region: since the K2 model uses 2-state HMM
            # topology that allows blank (<eps>) to follow a phone symbol, we treat <eps>
            # as continuation of the "previous" phone.
            # TODO: I think this implementation is wrong in that it merges repeating phones...
            #       Will fix.
            # TODO: I think it could be simplified by using some smart semi-ring and FSA operations...
            start = first_speech_idx
            prev_s = speech_ali[0]
            curr_s = prev_s
            cntr = 1
            for s in speech_ali[1:]:
                curr_s = s if s != EPS else curr_s
                if curr_s != prev_s:
                    ctm_ali.append(
                        AlignmentItem(start=to_s(start),
                                      duration=to_s(cntr),
                                      symbol=self.lexicon.phones[prev_s]))
                    start = start + cntr
                    prev_s = curr_s
                    cntr = 1
                else:
                    cntr += 1
            # Flush the last run; it always spans at least one frame.
            ctm_ali.append(
                AlignmentItem(start=to_s(start),
                              duration=to_s(cntr),
                              symbol=self.lexicon.phones[prev_s]))

            # Trailing silence fills the remainder of the cut.
            ctm_ali.append(
                AlignmentItem(start=speech_end_timestamp,
                              duration=round(cut.duration -
                                             speech_end_timestamp,
                                             ndigits=8),
                              symbol=self.lexicon.phones[SIL]))
            ctm_alis.append(ctm_ali)

        return ctm_alis