def supervision_set():
    """Fixture: a SupervisionSet holding one fully-populated segment with a word alignment."""
    word_alignment = [
        AlignmentItem(symbol=sym, start=beg, duration=dur)
        for sym, beg, dur in [
            ('transcript', 0.1, 0.08),
            ('of', 0.18, 0.02),
            ('the', 0.2, 0.03),
            ('first', 0.23, 0.07),
            ('segment', 0.3, 0.1),
        ]
    ]
    segment = SupervisionSegment(
        id='segment-1',
        recording_id='recording-1',
        channel=0,
        start=0.1,
        duration=0.3,
        text='transcript of the first segment',
        language='english',
        speaker='Norman Dyhrentfurth',
        gender='male',
        alignment={'word': word_alignment},
    )
    return SupervisionSet.from_segments([segment])
def supervisions():
    """Fixture: two non-overlapping supervision segments (SpkA then SpkB), each with a word alignment."""
    first = SupervisionSegment(
        "sup",
        "rec",
        start=0,
        duration=0.5,
        speaker="SpkA",
        alignment={
            "word": [
                AlignmentItem(symbol="a", start=0, duration=0.1),
                AlignmentItem(symbol="b", start=0.2, duration=0.2),
            ]
        },
    )
    second = SupervisionSegment(
        "sup",
        "rec",
        start=0.6,
        duration=0.2,
        speaker="SpkB",
        alignment={"word": [AlignmentItem(symbol="a", start=0.6, duration=0.2)]},
    )
    return [first, second]
def supervision_set():
    """Fixture: SupervisionSet with a single richly-annotated segment, including word-level timing."""
    words = []
    for sym, beg, dur in (
        ("transcript", 0.1, 0.08),
        ("of", 0.18, 0.02),
        ("the", 0.2, 0.03),
        ("first", 0.23, 0.07),
        ("segment", 0.3, 0.1),
    ):
        words.append(AlignmentItem(symbol=sym, start=beg, duration=dur))
    return SupervisionSet.from_segments(
        [
            SupervisionSegment(
                id="segment-1",
                recording_id="recording-1",
                channel=0,
                start=0.1,
                duration=0.3,
                text="transcript of the first segment",
                language="english",
                speaker="Norman Dyhrentfurth",
                gender="male",
                alignment={"word": words},
            )
        ]
    )
def supervisions():
    """Fixture: a SpkA segment followed by a SpkB segment, both carrying word alignments."""
    spk_a = SupervisionSegment(
        'sup',
        'rec',
        start=0,
        duration=0.5,
        speaker="SpkA",
        alignment={
            'word': [
                AlignmentItem(symbol='a', start=0, duration=0.1),
                AlignmentItem(symbol='b', start=0.2, duration=0.2),
            ]
        },
    )
    spk_b = SupervisionSegment(
        'sup',
        'rec',
        start=0.6,
        duration=0.2,
        speaker="SpkB",
        alignment={'word': [AlignmentItem(symbol='a', start=0.6, duration=0.2)]},
    )
    return [spk_a, spk_b]
def external_alignment() -> Dict[str, List[AlignmentItem]]:
    """Fixture: a word-level alignment for the text 'transcript of the first segment'."""
    items = []
    for sym, beg, dur in (
        ("transcript", 0.1, 0.08),
        ("of", 0.18, 0.02),
        ("the", 0.2, 0.03),
        ("first", 0.23, 0.07),
        ("segment", 0.3, 0.1),
    ):
        items.append(AlignmentItem(sym, beg, dur))
    return {'word': items}
def test_create_supervision_segment_with_all_metadata():
    """Smoke test: the SupervisionSegment constructor accepts every optional metadata field."""
    word_alignment = [
        AlignmentItem(symbol="wysokie", start=0.0, duration=0.05),
        AlignmentItem(symbol="szczyty", start=0.05, duration=0.05),
    ]
    SupervisionSegment(
        id="X",
        recording_id="X",
        start=0.0,
        duration=0.1,
        channel=0,
        text="wysokie szczyty",
        language="polish",
        speaker="Janusz",
        gender="male",
        alignment={"word": word_alignment},
    )
def test_create_supervision_segment_with_all_metadata():
    """Smoke test: constructing a SupervisionSegment with the full set of metadata succeeds."""
    metadata = dict(
        id='X',
        recording_id='X',
        start=0.0,
        duration=0.1,
        channel=0,
        text='wysokie szczyty',
        language='polish',
        speaker='Janusz',
        gender='male',
    )
    SupervisionSegment(
        alignment={
            'word': [
                AlignmentItem(symbol='wysokie', start=0.0, duration=0.05),
                AlignmentItem(symbol='szczyty', start=0.05, duration=0.05),
            ]
        },
        **metadata,
    )
def _with_alignment(self, cut: MonoCut, text: str) -> Dict[str, List[AlignmentItem]]:
    """Build a dummy 'subword' alignment: split *text* into 3-character pieces and
    spread them evenly across the cut's duration."""
    pieces = [text[pos:pos + 3] for pos in range(0, len(text), 3)]
    step = cut.duration / len(pieces)
    return {
        "subword": [
            AlignmentItem(symbol=piece, start=idx * step, duration=step)
            for idx, piece in enumerate(pieces)
        ]
    }
def dummy_alignment(
    text: str = "irrelevant", start: float = 0.0, duration: float = 1.0
) -> Dict[str, List[AlignmentItem]]:
    """Create a dummy 'subword' alignment for testing.

    Splits *text* into 3-character subwords and distributes them uniformly
    over ``[start, start + duration)``.

    :param text: the text to split into subword symbols (must be non-empty).
    :param start: offset of the first alignment item, in seconds.
    :param duration: total span covered by the alignment, in seconds.
    :return: a dict mapping the alignment type ('subword') to its items.

    BUG FIX: the return annotation previously said ``AlignmentItem`` although
    the function returns a dict of alignment-item lists.
    """
    subwords = [text[i:i + 3] for i in range(0, len(text), 3)]  # 3-char subword pieces
    dur = duration / len(subwords)
    alignment = [
        AlignmentItem(symbol=sub, start=start + i * dur, duration=dur)
        for i, sub in enumerate(subwords)
    ]
    return {'subword': alignment}
def parse_alignments(ali_path: Pathlike) -> Dict[str, List[AlignmentItem]]:
    """Parse a word-alignment file into per-utterance AlignmentItem lists.

    Each non-blank line is expected to have three whitespace-separated fields:
    ``<utt_id> "<w1>,<w2>,..." "<t1>,<t2>,..."`` where the timestamps mark the
    *end* of each word; word starts are derived from the previous end (0.0 for
    the first word).

    :param ali_path: path to the alignment file.
    :return: a dict mapping utterance id to its list of word alignment items.

    BUG FIX: blank or whitespace-only lines previously raised ``ValueError``
    during tuple unpacking; they are now skipped.
    """
    alignments = {}
    for line in Path(ali_path).read_text().splitlines():
        if not line.strip():
            continue  # tolerate blank lines in the alignment file
        utt_id, words, timestamps = line.split()
        words = words.replace('"', "").split(",")
        # Prepend 0.0 so each word's start is the previous word's end time.
        timestamps = [0.0] + list(map(float, timestamps.replace('"', "").split(",")))
        alignments[utt_id] = [
            AlignmentItem(
                symbol=word, start=start, duration=round(end - start, ndigits=8)
            )
            for word, start, end in zip(words, timestamps, timestamps[1:])
        ]
    return alignments
def align_ctm(self, cuts: Union[CutSet, AnyCut]) -> List[List[AlignmentItem]]:
    """
    Perform forced alignment and parse the phones into a CTM-like format:

    >>> [[0.0, 0.12, 'SIL'], [0.12, 0.2, 'AH0'], ...]

    :param cuts: a CutSet or a single cut (a single cut is wrapped in a CutSet).
    :return: one list of AlignmentItem per cut; an entry is ``None`` when the
        decoded speech region extends past the cut's duration.
    """
    # TODO: I am not sure that this method is extracting the alignment 100% correctly:
    # need to revise...
    # TODO: when K2/Snowfall has a standard way of indicating what is silence,
    # or we update the model, update the constants below.
    EPS = 0
    SIL = 1
    non_speech = {EPS, SIL}

    def to_s(n: int) -> float:
        # Convert a frame index to seconds.
        FRAME_SHIFT = 0.04  # 0.01 * 4 subsampling
        return round(n * FRAME_SHIFT, ndigits=3)

    if isinstance(cuts, (Cut, MixedCut)):
        cuts = CutSet.from_cuts([cuts])
    # Uppercase and remove punctuation
    cuts = cuts.map_supervisions(self.normalize_text)
    alignments = self.align(cuts).tolist()
    ctm_alis = []
    for cut, alignment in zip(cuts, alignments):
        # First we determine the silence regions at the beginning and the end:
        # we assume that every SIL and <eps> before the first phone, and after the last phone,
        # are representing silence.
        first_speech_idx = [
            idx for idx, s in enumerate(alignment) if s not in non_speech
        ][0]
        last_speech_idx = [
            idx for idx, s in reversed(list(enumerate(alignment))) if s not in non_speech
        ][0]
        speech_ali = alignment[first_speech_idx:last_speech_idx]
        # Leading silence covers everything before the first speech frame.
        ctm_ali = [
            AlignmentItem(start=0.0, duration=to_s(first_speech_idx),
                          symbol=self.lexicon.phones[SIL])
        ]
        # Then, we iterate over the speech region: since the K2 model uses 2-state HMM
        # topology that allows blank (<eps>) to follow a phone symbol, we treat <eps>
        # as continuation of the "previous" phone.
        # TODO: I think this implementation is wrong in that it merges repeating phones...
        # Will fix.
        # TODO: I think it could be simplified by using some smart semi-ring and FSA operations...
        start = first_speech_idx
        prev_s = speech_ali[0]
        curr_s = speech_ali[0]
        cntr = 1  # number of consecutive frames assigned to prev_s
        for s in speech_ali[1:]:
            # <eps> continues the current phone; any other symbol may start a new one.
            curr_s = s if s != EPS else curr_s
            if curr_s != prev_s:
                ctm_ali.append(
                    AlignmentItem(start=to_s(start), duration=to_s(cntr),
                                  symbol=self.lexicon.phones[prev_s]))
                start = start + cntr
                prev_s = curr_s
                cntr = 1
            else:
                cntr += 1
        if cntr:
            # Flush the last phone run.
            ctm_ali.append(
                AlignmentItem(start=to_s(start), duration=to_s(cntr),
                              symbol=self.lexicon.phones[prev_s]))
        speech_end_timestamp = to_s(last_speech_idx)
        if speech_end_timestamp > cut.duration:
            # BUG FIX: the message previously said "<=", contradicting the condition
            # that triggers this branch.
            logging.warning(
                f"speech_end_timestamp > cut.duration. Skipping cut {cut.id}"
            )
            ctm_alis.append(None)
            continue
        # Trailing silence covers everything after the last speech frame.
        ctm_ali.append(
            AlignmentItem(start=speech_end_timestamp,
                          duration=round(cut.duration - speech_end_timestamp, ndigits=8),
                          symbol=self.lexicon.phones[SIL]))
        ctm_alis.append(ctm_ali)
    return ctm_alis