def test_deterministic(self, prep_org_data):
    """Check that loading and processing ELAN utterances is deterministic.

    The "rf" and "xv" tiers are read twice from the same directory and both
    results are pushed through the same stages in lockstep: BKW filtering,
    duplicate removal, label segmentation and empty-text removal. The two
    lists must compare equal right after loading and after every stage.
    """
    first = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    second = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    assert first == second

    # Stage 1: keep only the utterances accepted by the BKW filter.
    first = list(filter(bkw.bkw_filter, first))
    second = list(filter(bkw.bkw_filter, second))
    assert first == second

    # Stage 2: drop duplicates.
    first = utterance.remove_duplicates(first)
    second = utterance.remove_duplicates(second)
    assert first == second

    # Stage 3: segment the labels of every utterance.
    first = list(map(bkw.bkw_label_segmenter.segment_labels, first))
    second = list(map(bkw.bkw_label_segmenter.segment_labels, second))
    assert first == second

    # Stage 4: drop utterances whose text is empty.
    first = utterance.remove_empty_text(first)
    second = utterance.remove_empty_text(second)
    assert first == second
def test_utterances_from_dir(self, prep_org_data):
    """Pin the utterance counts obtained from the BKW ELAN files.

    For each tier selection the test checks the number of utterances
    loaded, the number left after removing empty text, the number left
    after removing duplicates, and the number left after applying both
    (empty-text removal first, then duplicate removal).
    """
    # (tiers, loaded, non-empty, de-duplicated, non-empty and de-duplicated)
    expected_counts = [
        (["xv"], 1036, 1035, 1029, 1028),
        (["rf"], 1242, 631, 1239, 631),
        (["rf", "xv"], 2278, 1666, 1899, 1291),
    ]
    for tiers, n_loaded, n_nonempty, n_unique, n_both in expected_counts:
        loaded = elan.utterances_from_dir(prep_org_data, tiers)
        assert len(loaded) == n_loaded
        assert len(utterance.remove_empty_text(loaded)) == n_nonempty
        assert len(utterance.remove_duplicates(loaded)) == n_unique
        nonempty = utterance.remove_empty_text(loaded)
        assert len(utterance.remove_duplicates(nonempty)) == n_both
def test_poly_durations(self, prep_org_data):
    """Print the duration of the whole utterance list and of its first item.

    ``utterance.duration`` is called once with the full list loaded from
    the "rf" and "xv" tiers and once with a single utterance. Nothing is
    asserted; the test only fails if one of the calls raises.
    """
    loaded = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])

    whole = utterance.duration(loaded)
    print("Total duration of utterances is {}".format(whole))

    head = utterance.duration(loaded[0])
    print("Total duration of the first utterance is {}".format(head))
def test_explore_code_switching(self, prep_org_data):
    """Run ``bkw.explore_code_switching`` on cleaned "rf"/"xv" utterances.

    Duplicates are removed first, then utterances with empty text, and the
    result is passed on together with the path
    ``tgt_dir / "codeswitched.txt"``. Nothing is asserted.
    """
    loaded = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    deduped = utterance.remove_duplicates(loaded)
    cleaned = utterance.remove_empty_text(deduped)

    # NOTE(review): ``tgt_dir`` is looked up as a bare name, not through
    # ``self`` -- confirm it is defined at module level.
    out_path = tgt_dir / "codeswitched.txt"
    bkw.explore_code_switching(cleaned, out_path)
def test_speaker_durations(self, prep_org_data):
    """Print a table of per-speaker durations in minutes, longest first.

    The "rf"/"xv" utterances are narrowed step by step (empty text removed,
    duplicates removed, BKW filter applied), printing the remaining count
    after each step. Utterances whose ``utterance.duration`` is 10000 or
    more are then dropped (presumably milliseconds, since the per-speaker
    totals are treated as milliseconds below). Nothing is asserted.
    """
    pool = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    print(len(pool))
    pool = utterance.remove_empty_text(pool)
    print(len(pool))
    pool = utterance.remove_duplicates(pool)
    print(len(pool))
    pool = list(filter(bkw.bkw_filter, pool))
    print(len(pool))
    pool = [item for item in pool if utterance.duration(item) < 10000]

    running_total = 0
    row = "{:20}{:10}"
    minutes_fmt = "{:<10.3f}"
    print(row.format("Speaker", "Duration"))

    ranked = sorted(
        utterance.speaker_durations(pool),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for speaker, duration in ranked:
        # Convert the per-speaker total from milliseconds to minutes.
        as_millis = duration * ureg.milliseconds
        minutes = as_millis.to(ureg.minutes).magnitude
        print(row.format(speaker, minutes_fmt.format(minutes)))
        running_total += minutes

    print(row.format("Total", minutes_fmt.format(running_total)))
def test_empty_wav(self, prep_org_data):
    """Fail if ``utterance.remove_too_short`` drops any "rf"/"xv" utterance.

    Written to track down the origin of an empty wav: when the filtered
    list differs from the loaded one, the utterances that were removed are
    pretty-printed before the test fails.
    """
    loaded = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])
    kept = utterance.remove_too_short(loaded)
    if kept != loaded:
        dropped = set(loaded) - set(kept)
        report = pprint.pformat(dropped)
        print("set(utterances) - set(filtered): {}:\n".format(report))
        assert False
def test_speaker_id(self, prep_org_data):
    """Check that every utterance has a speaker and count distinct speakers.

    Loads the "rf" and "xv" tiers, collects the tier identifier of every
    utterance whose ``speaker`` is ``None`` and the set of all speakers
    seen. The test passes when no utterance lacks a speaker and the number
    of distinct speakers equals ``NUM_SPEAKERS``.
    """
    utterances = elan.utterances_from_dir(prep_org_data, ["rf", "xv"])

    no_speaker_tiers = set()
    speakers = set()
    for utter in utterances:
        # Identity is the correct test for the ``None`` singleton; ``==``
        # could be redirected by a custom ``__eq__``.
        if utter.speaker is None:
            # The prefix with its last extension stripped serves as the
            # tier identifier.
            no_speaker_tiers.add(splitext(utter.prefix)[0])
        else:
            speakers.add(utter.speaker)

    # Report the offending tiers on failure instead of a bare length check.
    assert not no_speaker_tiers, "Tiers without a speaker: {}".format(
        sorted(no_speaker_tiers))
    # NOTE(review): ``NUM_SPEAKERS`` is looked up as a bare name, not through
    # ``self`` -- confirm it is defined at module level.
    assert len(speakers) == NUM_SPEAKERS