def __init__(self, profile, bounds):
    self.speak_id = profile["xml:id"]
    self.ageGroup = profile["ageGroup"]
    self.role = profile["role"]
    self.sex = profile["sex"]
    self.soc = profile["soc"]
    self.dialect = profile["dialect"]
    self.phoneInterval = textgrid.IntervalTier(
        "{} - phones".format(self.speak_id), bounds[0], bounds[1])
    self.wordInterval = textgrid.IntervalTier(
        "{} - words".format(self.speak_id), bounds[0], bounds[1])
def convert(word_file_a, phone_file_a, word_file_b, phone_file_b, textgrid_file):
    """
    Given the word and phone files for speakers A and B, converts them into
    one TextGrid file.

    Parameters
    ----------
    word_file_a : str
        path to .A.phonwords file
    phone_file_a : str
        path to .A.phones file
    word_file_b : str
        path to .B.phonwords file
    phone_file_b : str
        path to .B.phones file
    textgrid_file : str
        path to the desired resulting TextGrid
    """
    error_count = 0
    tup_a = get_lists(phone_file_a, word_file_a)
    tup_b = get_lists(phone_file_b, word_file_b)
    (phone_list_a, word_list_a, final_phone_end_a, final_word_end_a,
     phone_length_a, word_length_a) = tup_a
    (phone_list_b, word_list_b, final_phone_end_b, final_word_end_b,
     phone_length_b, word_length_b) = tup_b
    phones_words = [phone_list_a, word_list_a, phone_list_b, word_list_b]
    all_tiers = []
    textgrid = tg.TextGrid()
    phone_tierA = tg.IntervalTier(name="phones_A")
    word_tierA = tg.IntervalTier(name="words_A")
    all_tiers.append(phone_tierA)
    all_tiers.append(word_tierA)
    phone_tierB = tg.IntervalTier(name="phones_B")
    word_tierB = tg.IntervalTier(name="words_B")
    all_tiers.append(phone_tierB)
    all_tiers.append(word_tierB)
    for i, tier in enumerate(all_tiers):
        for element in phones_words[i]:
            try:
                tier.add(float(element.start), float(element.end), element.label)
            except ValueError:
                error_count += 1
        textgrid.append(tier)
    # textgrid.write(textgrid_file)
    return (error_count, int(phone_length_b) + int(phone_length_a))
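# Usage sketch for convert() above (the Switchboard-style .phonwords/.phones
# file names are hypothetical, not files shipped with this code). This
# version only counts bad intervals; writing the grid is commented out.
errors, n_phones = convert("sw2001.A.phonwords", "sw2001.A.phones",
                           "sw2001.B.phonwords", "sw2001.B.phones",
                           "sw2001.TextGrid")
print("{} bad intervals out of {} phones".format(errors, n_phones))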
def chunked_vad(grid, rate, frames, wav, get_extra=True, return_qual=False):
    '''VAD of just utterances'''
    utterances = chunk_utterances(grid)
    word = grid[grid.getNames().index("word")]
    phones = grid[grid.getNames().index("phone")]
    ratio_list = []
    if OUTPUT_UTTERANCES:
        quality = textgrid.IntervalTier(name="quality",
                                        minTime=word.minTime,
                                        maxTime=word.maxTime)
    for u in utterances:
        u_words = word[u]
        u_duration = u_words[-1].maxTime - u_words[0].minTime
        dur = second_to_frame(u_duration, rate)
        offset = second_to_frame(u_words[0].minTime - word.minTime, rate)
        ratio = get_section_vad(frames, word, dur, rate,
                                offset=offset, utterance=u_duration)
        ratio_list += [((u_words[0].minTime, u_words[-1].maxTime), ratio)]
        if OUTPUT_UTTERANCES:
            quality.add(u_words[0].minTime, u_words[-1].maxTime,
                        "{:.2f}".format(ratio[0]))
    if get_extra:
        swipe_feat = get_custom_features(wav, quality, phones)
        for i in range(len(swipe_feat[0][1])):
            swipe_features = textgrid.IntervalTier(
                name="swipe_features_{}".format(i),
                minTime=grid.minTime,
                maxTime=grid.maxTime)
            for phone in phones:
                for (feat_phone, feature) in swipe_feat:
                    if phone == feat_phone:
                        swipe_features.add(phone.minTime, phone.maxTime,
                                           str(feature[i]))
                        break
            grid.append(swipe_features)
    if return_qual:
        return quality
    if OUTPUT_UTTERANCES:
        grid.append(quality)
    if OUTPUT_FINE:
        grid.append(get_fine_detail(frames, word, rate))
    grid.write(
        f"{OUTPUT_DIR}/{grid.name.split('.TextGrid')[0]}_chunked.TextGrid")
    return ratio_list
def export_textgrid(config, path, wav_path=None):
    with CorpusContext(config) as c:
        discourses = c.discourses
        levels = c.hierarchy.annotation_types
        for d in discourses:
            grid = tg.TextGrid()
            tier = tg.IntervalTier()
            q = c.query_graph(c.phone)
            q = q.filter(c.phone.discourse.name == d)
            q = q.order_by(c.phone.begin)
            res = q.all()
            for phone in res:
                tier.add(phone.begin / SAM_RATE, phone.end / SAM_RATE,
                         phone.label)
            grid.append(tier)
            speaker = d.split("_")[0]
            just_file = d.split("_")[1]
            filename = just_file + ".TextGrid"
            per_speaker_path = os.path.join(path, speaker)
            if not os.path.exists(per_speaker_path):
                os.mkdir(per_speaker_path)
            grid.write(os.path.join(per_speaker_path, filename))
            top_filename = just_file + ".top"
            export_tops(res, os.path.join(per_speaker_path, top_filename))
            if wav_path is not None:
                try:
                    print("running ", just_file)
                    path_to_wav = wav_path[just_file]
                    mfcc(path_to_wav, os.path.join(per_speaker_path),
                         "mfcc_16khz.conf")
                except KeyError:
                    pass
def save_textgrid(self, output_file: str):
    tg = textgrid.TextGrid()
    words = textgrid.IntervalTier('words')
    phones = textgrid.IntervalTier('phones')
    for word in self.words:
        words.add(word.start / 16000, word.end / 16000, word.label)
    # pad from the final word to the end of the audio
    if word.end < self.duration:
        words.add(word.end / 16000, self.duration / 16000, '')
    for phone in self.base_units:
        phones.add(phone.start / 16000, phone.end / 16000, phone.label)
    tg.append(words)
    tg.append(phones)
    with open(output_file, 'w') as f:
        tg.write(f)
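# Hedged usage sketch: this save_textgrid() assumes an aligner result object
# exposing .words, .base_units, and .duration in samples at 16 kHz; the
# `alignment` variable and output path here are hypothetical.
# alignment.save_textgrid("utt0001.TextGrid")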
def save_textgrid(self, filename):
    """Save alignment as textgrid"""
    # Construct phoneme tier
    phon_tier = textgrid.IntervalTier('phone', self.start(), self.end())
    for phoneme in self.phonemes():
        mark = 'sil' if str(phoneme) == pypar.SILENCE else str(phoneme)
        phon_tier.add(phoneme.start(), phoneme.end(), mark)

    # Construct word tier
    word_tier = textgrid.IntervalTier('word', self.start(), self.end())
    for word in self:
        word_tier.add(word.start(), word.end(), str(word))

    # Construct textgrid
    grid = textgrid.TextGrid(Path(filename).stem, self.start(), self.end())
    grid.extend([phon_tier, word_tier])

    # Save
    grid.write(filename)
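# Usage sketch (hypothetical object): any pypar-style alignment whose words
# and phonemes expose start()/end() in seconds can be exported this way, e.g.
# alignment.save_textgrid("speech.TextGrid")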
def examine_phonemes(grid):
    '''Iterates over the phones and flags unrealistic ones'''
    phones = grid[grid.getNames().index("phone")]
    bad_phones = textgrid.IntervalTier(name="bad_phones",
                                       minTime=phones.minTime,
                                       maxTime=phones.maxTime)
    bad_phones.add(phones.minTime, phones.maxTime, "good")
    for phone in phones:
        if phone.mark not in ["{LG}", "sil", "ns", "sp"] \
                and phone.duration() > 2:
            # This phoneme is not realistic
            overwrite_interval(bad_phones,
                               phone.minTime - BAD_PHONE_BUFFER,
                               phone.maxTime + BAD_PHONE_BUFFER,
                               "bad")
    grid.append(bad_phones)
    return grid
def extract_from_textgrid(label_path):
    error_num = 0
    phoneme_num = 0
    syllable_num = 0
    target_seg_tier = textgrid.IntervalTier('phoneme')
    py_grid = textgrid.TextGrid.fromFile(label_path)
    source_tier = py_grid.tiers[0]
    assert source_tier is not None
    duration = source_tier.maxTime - source_tier.minTime
    temp_list = []
    for i, interval in enumerate(source_tier):
        phn = interval.mark.strip()
        # check phn
        if phn not in get_all_phon():
            print(str(interval) + ' in file: ' + label_path)
            error_num += 1
        else:
            target_seg_tier.addInterval(interval)
            phoneme_num += 1
        # check pinyin
        if phn in get_shengmu():
            temp_list.append(phn)
            continue
        elif len(temp_list) > 0:
            temp_list.append(phn)
            dict_key = str(temp_list)
            if dict_key not in pp_dict_reverse:
                print(str(interval) + ' in file: ' + label_path)
                error_num += 1
            else:
                syllable_num += 1
            temp_list.clear()
        else:
            pass
            # dict_key = str([phn])
            # if dict_key not in pp_dict_reverse:
            #     print(interval)
    return duration, phoneme_num, syllable_num, error_num
def processAli(ali_ls):
    int_list = []
    for counter, line in enumerate(ali_ls, 0):
        if counter > 0:  # skip the header line
            l_list = line[:-1].split("\t")
            phon_start = decimal.Decimal(l_list[4])
            phon_end = phon_start + decimal.Decimal(l_list[5])
            phon_lab = "SIL" if l_list[6] == "SIL" else l_list[6].split("_")[0]
            int_list.append((phon_start, phon_end, phon_lab))
    phon_tier = textgrid.IntervalTier(name="PHONEMES", minTime=0,
                                      maxTime=int_list[-1][1])
    for i in int_list:
        if i[0] == i[1]:  # skip zero-length intervals
            continue
        phon_tier.add(i[0], i[1], phon_dict[i[2]])
    phon_textgrid = textgrid.TextGrid(name=None, minTime=0,
                                      maxTime=int_list[-1][1])
    phon_textgrid.append(phon_tier)
    return phon_textgrid
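# Hypothetical input row for processAli() above (tab-separated; only columns
# at indices 4-6 are read: start time, duration, and a label such as "a_B"
# or "SIL" that must map through phon_dict; the other columns are
# placeholders here). The first line of ali_ls is treated as a header.
#
#   utt001	spk1	1	ok	0.250	0.080	a_B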
def get_fine_detail(frames, word, rate):
    '''Print exactly where the VAD detected speech'''
    frame_width = second_to_frame(SAMPLE_SIZE / 1000, rate)
    minTime = float(word.minTime)
    # NB: despite the name, maxTime here holds the tier's duration
    maxTime = float(word.maxTime - word.minTime)
    fine_detail = textgrid.IntervalTier(name="fine_detail",
                                        minTime=word.minTime,
                                        maxTime=word.maxTime)
    last_mark = VAD[2].is_speech(frames[0:frame_width], rate)
    start = word.minTime
    duration = second_to_frame(maxTime, rate)
    for t in range(0, duration, frame_width):
        inter = word.indexContaining(t / rate / 2 + minTime)
        if inter is not None:
            speech = VAD[2].is_speech(frames[t:t + frame_width], rate)
            if last_mark != speech and start != t / rate / 2 + minTime:
                fine_detail.add(start,
                                min(t / rate / 2 + minTime, word.maxTime),
                                str(last_mark))
                start = t / rate / 2 + minTime
            last_mark = speech
    return fine_detail
def write_textgrid(phone_sequence, sample_rate, filename):
    """
    Given a sequence of tuples of the form (label, start_sample, end_sample),
    output a .TextGrid file showing the phone labels and boundaries.

    Requires Kyle Gorman's textgrid library for Python, available at
    https://github.com/kylebgorman/textgrid.
    (Do pip install git+http://github.com/kylebgorman/textgrid.git
    for easy installation)
    """
    sample_dur = 1 / sample_rate
    end_time = phone_sequence[-1][2] * sample_dur
    print("end_time=", end_time)
    my_textgrid = tg.TextGrid(name=filename, maxTime=end_time)
    my_tier = tg.IntervalTier(name='phones', maxTime=end_time)
    for phone in phone_sequence:
        my_tier.add(minTime=phone[1] * sample_dur,
                    maxTime=phone[2] * sample_dur,
                    mark=str(phone[0]))
    my_textgrid.append(my_tier)
    my_textgrid.write(filename)
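# Minimal usage sketch for write_textgrid(); the phone labels and sample
# indices below are made up for illustration (8000 samples at 16 kHz = 0.5 s).
seq = [("sil", 0, 1600), ("AH", 1600, 4800), ("sil", 4800, 8000)]
write_textgrid(seq, 16000, "example.TextGrid")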
def read_textgrid_tier(textgrid_filename, vot_tier):
    # read TextGrid
    textgrid = tg.TextGrid()
    textgrid.read(textgrid_filename)

    # extract tier names
    tier_names = textgrid.getNames()

    # check if the VOT tier is one of the tiers in the TextGrid
    vots = tg.IntervalTier(minTime=textgrid.minTime, maxTime=textgrid.maxTime)
    if vot_tier in tier_names:
        # run over all intervals in the tier
        for interval in textgrid.getFirst(vot_tier):
            if re.search(r'\S', interval.mark):
                vots.addInterval(interval)
    else:
        logging.error("Tier %s not found in TextGrid %s" %
                      (vot_tier, textgrid_filename))
        logging.error("(If you think the tier is there, perhaps there's "
                      "extra whitespace in the tier name?)")
    return vots
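# Usage sketch for read_textgrid_tier() (file and tier names hypothetical):
# collects every non-empty interval from the "vot" tier, or logs an error
# if the tier is absent.
vots = read_textgrid_tier("speaker01.TextGrid", "vot")
for interval in vots:
    print(interval.minTime, interval.maxTime, interval.mark)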
def makeData(from_file, to_file, core):
    for counter, wp in enumerate(wavpaths[from_file:to_file + 1], 1):
        rate, sig = scipy.io.wavfile.read(wp)
        np_mfcc = python_speech_features.mfcc(sig, rate, winlen=window,
                                              winstep=step)
        np_mfcc_d = python_speech_features.delta(np_mfcc, 2)
        np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2)
        np_mfcc_all = np.append(np.append(np_mfcc, np_mfcc_d, axis=1),
                                np_mfcc_dd, axis=1)
        # print(np_mfcc_all.shape)
        wn = wp.split("/")[-1]
        # get corpus info
        if wn[0] in ["F", "M"]:
            corpus = "ifa"
        elif wn[0] == "D":
            corpus = "ifadv"
        elif wn[0] == "p":
            corpus = "ecsd"
        else:
            corpus = "cgn-" + wn[0]
        sent_id = ".".join(wn.split(".")[:-1])
        print(core, counter, "/", to_file - from_file, sent_id)
        tg_path = af_path + chunk_folder + sent_id + ".TextGrid"
        tg = textgrid.TextGrid()
        with makeTempFile(tg_path) as tempf:
            tg.read(tempf.name)
        intervals = tg.tiers[2].intervals
        end_time = round(intervals[-1].maxTime, 3)
        start_time = round(intervals[0].minTime, 3)
        # get number of syllables for each word
        word_intervals = tg.tiers[0].intervals
        word_syls = []
        word_segs = []
        label_dictionary = {key: [] for key in features}
        for w in word_intervals:
            n_syls = 0
            word_start = round(w.minTime, 3)
            word_end = round(w.maxTime, 3)
            first_seg = tg.tiers[2].indexContaining(word_start + 0.001)
            assert first_seg == 0 or intervals[first_seg - 1].mark[-1] == "#"
            last_seg = tg.tiers[2].indexContaining(word_end - 0.001)
            assert intervals[last_seg].mark[-1] == "#"
            n_segs = last_seg + 1 - first_seg
            word_segs.append(n_segs)
            for seg in intervals[first_seg:last_seg + 1]:
                if seg.mark.strip("#") in vowels:
                    n_syls += 1
            word_syls.append(n_syls)
        assert len(word_syls) == len(word_intervals)
        comments = tg.tiers[1].intervals
        # print(start_time, end_time)
        classes = np.zeros((0, num_cols_per_frame))
        int_i = 0
        num_frames = np_mfcc_all.shape[0]
        useable_frame_indices = []
        for frame in range(1, num_frames + 1):
            frame_s = round(start_time + (frame - 1) * step, 3)
            frame_e = frame_s + window
            if frame_e > end_time:
                # because '0' samples can be appended to sig so it can be
                # divided by an integer of frames
                frame_e = end_time
            intvl = intervals[int_i]
            if frame_s < round(intvl.minTime, 3):
                print(frame_s, round(intvl.minTime, 3))
            assert frame_s >= round(intvl.minTime, 3)
            if frame_e <= round(intvl.maxTime, 3):
                # calculate the proportion of the frame that is within the
                # useable centre of the interval; this needs to be done
                # separately for the nasalization feature, which should be
                # aligned with end of intervals:
                # for feat in ["@", "n", "~"]
                #     pass feat to getFeatureLabel
                #     which returns labels for each feat
                # construct label_list after for loop completes
                label_list = []
                for feat in features:
                    label = getFeatureLabel(frame_s, frame_e, feat, int_i,
                                            tg.tiers[2], tg.tiers[0],
                                            word_syls, word_segs, comments)
                    label_list.append(label)
                    label_dictionary[feat].append(label)
                if sum(label_list) > -3:
                    useable_frame_indices.append(frame - 1)
                row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
            else:
                assert frame_e > round(intvl.maxTime, 3)
                proportions = [(round(intvl.maxTime, 3) - frame_s, int_i)]
                new_int = intvl
                new_int_i = int_i
                next_int_i = int_i
                while frame_e > round(new_int.maxTime, 3):
                    new_int_i += 1
                    new_int = intervals[new_int_i]
                    overlap = (frame_e - round(new_int.minTime, 3)) \
                        if frame_e <= round(new_int.maxTime, 3) \
                        else (round(new_int.maxTime, 3) - round(new_int.minTime, 3))
                    proportions.append((overlap, new_int_i))
                    if (frame_s + step) >= round(new_int.minTime, 3):
                        next_int_i = new_int_i
                best_int_i = max(proportions)[1]
                # calculate the proportion of the frame that is within the
                # useable centre of the interval
                label_list = []
                for feat in features:
                    label = getFeatureLabel(frame_s, frame_e, feat, best_int_i,
                                            tg.tiers[2], tg.tiers[0],
                                            word_syls, word_segs, comments)
                    label_list.append(label)
                    label_dictionary[feat].append(label)
                if sum(label_list) > -3:
                    useable_frame_indices.append(frame - 1)
                row = np.array([np.append(np_mfcc_all[frame - 1, ], label_list)])
                classes = np.append(classes, row, axis=0)
                int_i = next_int_i
        # print(useable_frame_indices)
        for old_row in range(classes.shape[0]):
            if (old_row >= 2 * frame_window) and \
                    ((old_row - frame_window) in useable_frame_indices):
                new_labels = classes[old_row - frame_window,
                                     num_cols_per_frame - len(features):]
                new_feat = classes[old_row - (2 * frame_window):old_row + 1,
                                   :num_cols_per_frame - len(features)].flatten()
                new_row = np.array([
                    np.append(np.append(new_feat, corpora[corpus]), new_labels)
                ])
                # samples = np.append(samples, new_row, axis=0)
                with open(af_path + "AF_en" +
                          str(int(core) + running_cores) + ".csv", "a") as f:
                    np.savetxt(f, new_row, fmt='%.5e', delimiter=",")
        if produce_output_tg:
            tg_out = textgrid.TextGrid(minTime=float(start_time))
            for af in features:
                tier = textgrid.IntervalTier(name=af, minTime=float(start_time))
                prev_class = -1
                min_time = float(start_time)
                # in reality we can't draw clear boundaries between labels;
                # we would need a 0, 1, and -1 tier for each feature to show
                # overlapping frames
                max_time = min_time + (window - step) / 2 + step
                for frame_n, lab_cl in enumerate(label_dictionary[af], 1):
                    if prev_class != lab_cl:
                        if frame_n != 1:
                            tier.add(min_time, max_time, str(prev_class))
                            min_time = max_time
                    max_time = float(start_time) + (window - step) / 2 + frame_n * step
                    if len(label_dictionary[af]) == frame_n:
                        max_time += (window - step) / 2
                        tier.add(min_time, max_time, str(lab_cl))
                        break
                    prev_class = lab_cl
                tg_out.append(tier)
            with open(af_path + "en_training_labels_tgs/" + sent_id +
                      ".TextGrid", "w") as f:
                tg_out.write(f)
# for the purposes of testing,
# just match the one file
if name.endswith(".TextGrid"):
    print("Processing {}".format(name))
    tg = textgrid.TextGrid()
    tg.read(os.path.join(root, name))
    # also make a new TG to write to
    ntg = textgrid.TextGrid()
    ## start iterating through the tiers
    for tier in tg:
        # make new interval tier with speaker name
        ntier = textgrid.IntervalTier(name=tier.name)
        # loop through the intervals
        wordList = []
        intList = []
        for interval in tier.intervals:
            # add non-silence to a word list
            if interval.mark != args.separator:
                wordList.append(interval.mark)
                intList.append(interval)
            else:
                # check it's not just the start
                if len(wordList) > 0:
                    # merge the list together,
def convert(word_file_a, phone_file_a, word_file_b, phone_file_b, textgrid_file):
    """
    Given the word and phone files for speakers A and B, converts them into
    one TextGrid file.

    Parameters
    ----------
    word_file_a : str
        path to .A.phonwords file
    phone_file_a : str
        path to .A.phones file
    word_file_b : str
        path to .B.phonwords file
    phone_file_b : str
        path to .B.phones file
    textgrid_file : str
        path to the desired resulting TextGrid
    """
    tup_a = get_lists(phone_file_a, word_file_a)
    tup_b = get_lists(phone_file_b, word_file_b)
    (phone_list_a, word_list_a, final_phone_end_a, final_word_end_a,
     phone_length_a, word_length_a) = tup_a
    (phone_list_b, word_list_b, final_phone_end_b, final_word_end_b,
     phone_length_b, word_length_b) = tup_b
    phones_words = [phone_list_a, word_list_a, phone_list_b, word_list_b]
    all_tiers = []
    textgrid = tg.TextGrid()
    phone_tierA = tg.IntervalTier(name="A - phone")
    word_tierA = tg.IntervalTier(name="A - word")
    all_tiers.append(phone_tierA)
    all_tiers.append(word_tierA)
    phone_tierB = tg.IntervalTier(name="B - phone")
    word_tierB = tg.IntervalTier(name="B - word")
    all_tiers.append(phone_tierB)
    all_tiers.append(word_tierB)
    finished_tiers = []
    for i, tier in enumerate(all_tiers):
        for element in phones_words[i]:
            try:
                tier.add(float(element.start), float(element.end), element.label)
            except (ValueError, IndexError):
                continue
        finished_tiers.append(tier)
    for i, tier in enumerate(finished_tiers):
        if (i + 1) % 2 == 0 and i > 0:
            fixed = extend_last(finished_tiers[i - 1], finished_tiers[i])
            finished_tiers[i - 1] = fixed[0]
            finished_tiers[i] = fixed[1]
            textgrid.append(finished_tiers[i - 1])
            textgrid.append(finished_tiers[i])
    textgrid.write(textgrid_file)
    filter(lambda x: x.endswith(file_ending), os.listdir(TEMP_DIR)))
for subtitle in subtitle_files:
    # Check there's an audio file associated
    audio = subtitle.replace(file_ending, ".wav")
    if not os.path.isfile(os.path.join(TEMP_DIR, audio)):
        print("Audio file {} is missing".format(audio))
        continue
    speaker = subtitle.split(".")[0]
    # Make sure the subtitles don't overlap, as there are overlapping
    # subtitles in youtube's auto-generated subs
    captions = webvtt.read(os.path.join(TEMP_DIR, subtitle)).captions
    # Even captions have the time of the utterance (as well as the time of
    # the individual words); odd captions have the actual utterance string
    tier = textgrid.IntervalTier(speaker)
    for cap_time, cap_string in zip(captions[::2], captions[1::2]):
        tier.add(cap_time.start_in_seconds, cap_time.end_in_seconds,
                 cap_string.text.strip())
    tg = textgrid.TextGrid()
    tg.append(tier)
    tg.write(os.path.join(TEXTGRID_DIR,
                          subtitle.replace(file_ending, ".TextGrid")))
    shutil.move(os.path.join(TEMP_DIR, audio),
                os.path.join(TEXTGRID_DIR, audio))
if not args.skip_mfa:
    subprocess.run([os.path.join(MFA_BIN, "mfa_align"), TEXTGRID_DIR,
                    args.mfa_dict, args.mfa_model, ALIGNED_DIR, "--verbose"])
def gentle_to_grid(gentle_file, out_file=None):
    "Convert *.json file from Gentle to Praat TextGrid"
    if '*' in gentle_file:
        if out_file is not None:
            raise TypeError("out can not be set during batch-conversion")
        for filename in glob(gentle_file):
            gentle_to_grid(filename)
        return
    gentle_file = Path(gentle_file)
    if out_file is None:
        out_file = gentle_file.with_suffix('.TextGrid')
    else:
        out_file = Path(out_file)
        if out_file.suffix.lower() != '.textgrid':
            out_file = out_file.with_suffix('.TextGrid')
    with gentle_file.open() as fid:
        g = json.load(fid)
    # find valid words
    words = g['words']
    n_issues = 0
    for i, word in enumerate(words):
        if word['case'] == 'success':
            if word['alignedWord'] == '<unk>':
                n_issues += 1
                word['issue'] = 'OOV'
            else:
                word['issue'] = None
        else:
            n_issues += 1
            word['issue'] = word['case']
    # add missing times
    last_end = 0
    not_in_audio_words = []  # buffer
    for word in words:
        if 'start' in word:
            if not_in_audio_words:
                duration = word['start'] - last_end
                for j, word_ in enumerate(not_in_audio_words):
                    word_['start'] = last_end + j * duration
                    word_['end'] = last_end + (j + 1) * duration
                not_in_audio_words = []
            last_end = word['end']
        else:
            not_in_audio_words.append(word)
    for word in not_in_audio_words:
        word['start'] = last_end
        word['end'] = last_end = last_end + 0.100
    # round times
    for word in words:
        word['start'] = round(word['start'], 3)
        word['end'] = round(word['end'], 3)
    # avoid overlapping words
    last_start = words[-1]['end'] + 1
    for word in reversed(words):
        if word['end'] > last_start:
            word['end'] = last_start
            if word['start'] >= word['end']:
                word['start'] = word['end'] - .001
        last_start = word['start']
        # gentle seems to work at 10 ms resolution
        if word['end'] - word['start'] < 0.015 and 'issue' not in word:
            word['issue'] = 'short'
    # log issues
    if n_issues:
        log = fmtxt.Table('rrrll')
        log.cell('Time')
        log.cell('Duration', width=2)
        log.cells('Word', 'Issue')
        log.midrule()
        for word in words:
            if word['issue']:
                duration = word['end'] - word['start']
                d_marker = '*' if duration < 0.015 else ''
                log.cells(f"{word['start']:.3f}", d_marker, f"{duration:.3f}",
                          word['word'], word['issue'])
        print(log)
        log.save_tsv(out_file.with_suffix('.log'))
    # build textgrid
    phone_tier = textgrid.IntervalTier('phones')
    word_tier = textgrid.IntervalTier('words')
    for i, word in enumerate(words):
        t = word['start']
        word_tstop = word['end']
        # add word
        word_tier.add(t, word_tstop, word['word'])
        # make sure we have at least one phone
        phones = word.get('phones', ())
        if not phones:
            phones = ({'phone': '', 'duration': word['end'] - word['start']},)
        # add phones
        for phone in phones:
            tstop = min(round(t + phone['duration'], 3), word_tstop)
            if t >= tstop:
                continue
            mark = phone['phone'].split('_')[0].upper()
            if mark == 'OOV':
                continue
            phone_tier.add(t, tstop, mark)
            t = tstop
    grid = textgrid.TextGrid()
    grid.extend((phone_tier, word_tier))
    grid.write(out_file)
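# Usage sketch for gentle_to_grid() (paths hypothetical). A single file is
# converted next to its source; a glob pattern batch-converts, in which case
# out_file must stay None.
gentle_to_grid("session01.json")        # writes session01.TextGrid
gentle_to_grid("alignments/*.json")     # one TextGrid per matched file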
float_format="%.2f") pd_samples = pd.DataFrame(data=unseen_samples, columns=_CSV_COLUMNS) predict_test_input_fn = create_predict_input_fn(pd_samples, batch_size=batch_size) tg = textgrid.TextGrid(minTime=float(start_time)) for af in ["s"]: classifier = classifiers[af] classifier_output = list( classifier.predict(input_fn=predict_test_input_fn)) predictions = np.array( [item['class_ids'][0] for item in classifier_output]) print(af, predictions) probabilities = np.array( [item['probabilities'] for item in classifier_output]) # construct textgrids tier = textgrid.IntervalTier(name=af, minTime=float(start_time)) prev_class = 99 min_time = float(start_time) max_time = min_time + frame_window * step tier.add( min_time, max_time, "" ) # add the empty interval for the initial buffer (due to windowing over preceding 5 frames) min_time = max_time # create 'probability' tier if af in expected_features: prob_i = expected_features[af] af_probs = list(probabilities[:, prob_i]) # print(af_probs) with open( tens_path + "pred_textgrids/sep_mod/" + fragment_id + "_" + af + ".IntensityTier", "w") as f:
def predict_files(core_num="", start_line=1, end_line=num_index_lines):
    for fp in file_paths[start_line - 1:end_line]:
        fragment_id = ".".join(fp.split(".")[:-1]).split("/")[-1]
        # get corpus info
        if fragment_id[0] in ["F", "M"]:
            corpus = "ifa"
        elif fragment_id[0] == "D":
            corpus = "ifadv"
        elif fragment_id[0] == "p":
            corpus = "ecsd"
        else:
            corpus = "cgn-" + frag_fol
            # corpus = "cgn-" + fragment_id[0]
            # fn_num = int(fragment_id.split("_")[0][2:])
            # corpus = "cgn-o" if fn_num > 1000 and fn_num < 1566 else "cgn-a"
        # print(core_num, fragment_id)
        fragment_id_split = fragment_id.split("_")
        wav = "_".join(fragment_id_split[:-3])
        chan, start_time, end_time = fragment_id_split[-3:]
        # wav, chan, start_time, end_time = fragment_id.split("_")
        rate, sig = scipy.io.wavfile.read(fp)
        np_mfcc = python_speech_features.mfcc(sig, rate, winlen=window,
                                              winstep=step)
        np_mfcc_d = python_speech_features.delta(np_mfcc, 2)
        np_mfcc_dd = python_speech_features.delta(np_mfcc_d, 2)
        np_mfcc_all = np.append(np.append(np_mfcc, np_mfcc_d, axis=1),
                                np_mfcc_dd, axis=1)
        unseen_samples = np.zeros((0, num_feat))
        for old_row in range(np_mfcc_all.shape[0]):
            if old_row >= 2 * frame_window:
                new_feat = np_mfcc_all[old_row - (2 * frame_window):old_row + 1,
                                       :num_feat_per_frame].flatten()
                new_row = np.array([np.append(new_feat, corpora[corpus])])
                # print(unseen_samples.shape, np.array(new_row).shape)
                unseen_samples = np.append(unseen_samples, np.array(new_row),
                                           axis=0)
        # save unseen samples for inspection
        summary_text = pd.DataFrame(np_mfcc).describe()
        summary_text.to_csv(tens_path + "pred_mfcc/" + corpus + "/" +
                            fragment_id + "_sum.csv", float_format="%.2f")
        pd_samples = pd.DataFrame(data=unseen_samples, columns=_CSV_COLUMNS)
        predict_test_input_fn = create_predict_input_fn(pd_samples,
                                                        batch_size=batch_size)
        tg = textgrid.TextGrid(minTime=float(start_time))
        for af in ["s"]:
            classifier = classifiers[af]
            classifier_output = list(
                classifier.predict(input_fn=predict_test_input_fn))
            predictions = np.array(
                [item['class_ids'][0] for item in classifier_output])
            print(af, predictions)
            probabilities = np.array(
                [item['probabilities'] for item in classifier_output])
            # construct textgrids
            tier = textgrid.IntervalTier(name=af, minTime=float(start_time))
            prev_class = 99
            min_time = float(start_time)
            max_time = min_time + frame_window * step
            # add the empty interval for the initial buffer
            # (due to windowing over preceding 5 frames)
            tier.add(min_time, max_time, "")
            min_time = max_time
            # create 'probability' tier
            if af in expected_features:
                prob_i = expected_features[af]
                af_probs = list(probabilities[:, prob_i])
                # print(af_probs)
                with open(tens_path + "pred_textgrids/" + corpus + "/" +
                          fragment_id + "_" + af + ".IntensityTier", "w") as f:
                    f.write('File type = "ooTextFile"\n'
                            'Object class = "IntensityTier"\n\n'
                            '{}\n{}\n{}\n'.format(start_time, end_time,
                                                  len(af_probs)))
                    for frame_i, prob in enumerate(af_probs, 0):
                        f_time = min_time + frame_i * step
                        f.write('{}\n{}\n'.format(f_time, prob))
            for frame_n, pred_cl in enumerate(predictions, 1):
                # print(frame_n, pred_cl, prev_class, min_time, max_time,
                #       end_time, features[af][pred_cl])
                if prev_class != pred_cl:
                    if frame_n != 1:
                        tier.add(min_time, max_time, features[af][prev_class])
                        min_time = max_time
                max_time = float(start_time) + (frame_window * step) + frame_n * step
                # if max_time > float(end_time):
                #     max_time = float(end_time)
                if len(predictions) == frame_n:  # or max_time == float(end_time):
                    max_time += (window - step)
                    tier.add(min_time, max_time, features[af][pred_cl])
                    break
                prev_class = pred_cl
            if max_time < float(end_time):
                # add the empty interval for the final buffer
                # (due to windowing over subsequent 5 frames)
                tier.add(max_time, float(end_time), "")
            tg.append(tier)
            with open(tens_path + "pred_textgrids/" + corpus + "/" +
                      fragment_id + "_" + af + ".TextGrid", "w") as f:
                tg.write(f)
tg = textgrid.TextGrid()
tg.read(os.path.join(root, name))
for tier in tg.getNames():
    # ignore non-transcript tiers
    if re.search('(transcript.*)', tier) is None:
        continue
    # make list of 'transcript' tiers & iterate through them
    transcriptTiers = re.findall('(transcript.*)', tier)
    for item in transcriptTiers:
        pos = tg.getNames().index(item)
        # create interval tier and add cleaned intervals
        wt = textgrid.IntervalTier(name=item)
        tr = tg.getList(item)[0]
        # remove original tier from TextGrid tier list
        tg.tiers.pop(pos)
        for interval in tr:
            try:
                wt.add(interval.minTime, interval.maxTime,
                       strip_punct(interval.mark))
            except Exception as e:
                print(name, e)
        # add new tier to list
        tg.append(wt)
# save textgrid
        new_tiers = [[] for x in tiers]
        for utterance in arg_grid[arg_grid.getNames().index("classification")]:
            if utterance.mark == "good":
                for tier, new_tier in zip(tiers, new_tiers):
                    bounds = (tier.indexContaining(float(utterance.minTime) + 0.0001),
                              tier.indexContaining(float(utterance.maxTime) - 0.0001) + 1)
                    for j in range(bounds[0], bounds[1]):
                        interval = tier.intervals[j]
                        if interval.minTime >= utterance.minTime and \
                                interval.maxTime <= utterance.maxTime:
                            new_tier.append(interval)
        out_grid = textgrid.TextGrid(name=arg.split("/")[-1],
                                     minTime=arg_grid.minTime,
                                     maxTime=arg_grid.maxTime)
        for tier, new_tier in zip(tiers, new_tiers):
            out_tier = textgrid.IntervalTier(tier.name, tier.minTime,
                                             tier.maxTime)
            out_tier.intervals = new_tier
            out_grid.append(out_tier)
        out_grid.write(
            f"{OUTPUT_DIR}/{arg_grid.name.split('.TextGrid')[0]}_cleaned.TextGrid")
        print(f"{i+1}/{len(sys.argv[1:])}")
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception:
        print(f"problematic file, {arg}")
        problematic_files += [arg]

with open("bad_files.txt", "a+") as f:
    for x in problematic_files:
        f.write(f"{x}\n")
def get_utterances_p2_4(file, textgrid_file):
    """
    Read utterances from a .trn file and write them in TextGrid format.
    Collapses utterances with <.15s pauses between them into one utterance.
    TextGrid tiers are based on speakers (each speaker |-> 1 tier).
    For TextGrids in Part 1.

    Parameters
    ----------
    file : str
        path to .trn file
    textgrid_file : str
        path to textgrid file
    """
    with open(file) as f1:
        lines = f1.readlines()
    ordered_tups = []
    speakers = defaultdict(list)
    skipped = 0
    for i, line in enumerate(lines):
        if '\x00' in line:
            line = "c".join(line.split('\x00'))
        splitline = re.split("\t+", line)
        start = splitline[0]
        end = splitline[1]
        if splitline[2].strip().endswith(":"):
            speaker = splitline[2]
            try:
                label = splitline[3]
            except IndexError:
                label = ""
        else:
            speaker = ordered_tups[i - 1][2]
            try:
                label = splitline[2]
            except IndexError:
                label = ""
        label = remove_regex.sub("", label)
        label = sub_regex.sub("", label)
        label = breath_regex.sub("BREATH_WORD_SB", label)
        label = laugh_regex.sub("LAUGH_WORD_SB", label)
        label = re.sub("([a-zA-Z]-)", r"[\1]", label)
        speaker = re.sub(r"[:\s]", "", speaker)
        speaker = re.sub(">env", "env", speaker.lower())
        # if label not in ["", " ", None]:
        ordered_tups.append((start, end, speaker, label))
        speakers[speaker.lower()].append((start, end, label))
    speakers = clean(speakers)
    textgrid = tg.TextGrid()
    for i, speaker in enumerate(speakers.keys()):
        tier = tg.IntervalTier(name="{} - utterance".format(speaker.strip()))
        for j, tup in enumerate(speakers[speaker]):
            try:
                if float(tup[0]) == float(tup[1]):
                    continue
            except ValueError:
                pass
            try:
                tier.add(float(tup[0]), float(tup[1]), tup[2].strip())
            except ValueError:
                try:
                    previous = tier[-1]
                except IndexError:
                    skipped += 1
                    continue
                previous_end = previous.maxTime
                difference = previous.maxTime - float(tup[0])
                if difference < 0:
                    skipped += 1
                    continue
                if float(tup[0]) + difference == float(tup[1]):
                    skipped += 1
                    continue
                tier.add(float(tup[0]) + difference, float(tup[1]),
                         tup[2].strip())
        if len(tier.intervals) > 0:
            textgrid.append(tier)
    textgrid.write(textgrid_file)
    print("skipped: {}".format(skipped))
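# Usage sketch for get_utterances_p2_4() (paths hypothetical): reads a .trn
# transcript and writes one utterance tier per speaker.
get_utterances_p2_4("session01.trn", "session01.TextGrid")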
if name.endswith(".TextGrid"): print("Processing {}".format(name), end="...") ## read textgrid tg = textgrid.TextGrid() tg.read(os.path.join(root, name)) # search textgrid for a tier with a specific name # if no name specified, look at the first tier for Tier in tg: if args.kana_tier is None: args.kana_tier = tg.getNames()[0] if Tier.name == args.kana_tier: romajiTier = textgrid.IntervalTier() if args.speaker_as_dir: path = os.path.basename( os.path.dirname(os.path.join(root, name))) romajiTier.name = path else: romajiTier.name = os.path.splitext(name)[0] # convert kanji to romaji for interval in Tier.intervals: # skip intervals with dummy silence if interval.mark == "#": continue # take interval information from original
elif args.outformat == "textgrid": txg = textgrid.TextGrid() txg.minTime = 0 transcr_wrd_starts = endpoints[0] transcr_wrd_ends = endpoints[1] transcr_wrd_txt = endpoints[2] transcr_ph_starts = endpoints[3] transcr_ph_ends = endpoints[4] transcr_ph_txt = endpoints[5] offsets = decoded_results["output"][0]["offsets"] txg.endTime = offsets[len(offsets) - 1] words_tier = textgrid.IntervalTier("words", 0, offsets[len(offsets) - 1]) phones_tier = textgrid.IntervalTier("phones", 0, offsets[len(offsets) - 1]) for jx in range(0, len(transcr_wrd_txt) - 1): #print(transcr_wrd_txt[jx]+" "+str(transcr_wrd_starts[jx])+" "+str(transcr_wrd_ends[jx])) #if transcr_wrd_starts[jx] == transcr_wrd_ends[jx]: transcr_wrd_ends[jx] += 0.01 words_tier.addInterval( textgrid.Interval(transcr_wrd_starts[jx], transcr_wrd_ends[jx], transcr_wrd_txt[jx])) for px in range(0, len(transcr_ph_txt) - 1): #print(transcr_ph_txt[px]+" "+str(transcr_ph_starts[px])+" "+str(transcr_ph_ends[px])) #if transcr_ph_starts[jx] == transcr_ph_ends[jx]: transcr_ph_ends[jx] += 0.01 #if transcr_ph_txt[px] == " ": transcr_ph_txt[px] = "SIL" phones_tier.addInterval(
import os
import sys

import textgrid
import numpy as np


def linear_classifier(X):
    return (X["word-number"] > 1 and X["duration"] > 1
            and X["mfa_found"] == 1 and X["hnr"] > 5.4
            and X["alignment-diff"] < 0.03)


OUTPUT_DIR = "out_with_labels"
uts = 0
good_uts = 0
for i, arg in enumerate(sys.argv[1:]):
    if not os.path.isfile(arg):
        print("{} is not a valid file".format(arg))
        continue
    arg_grid = textgrid.TextGrid(name=arg.split("/")[-1])
    arg_grid.read(arg)
    classification = textgrid.IntervalTier(name="classification",
                                           minTime=arg_grid.minTime,
                                           maxTime=arg_grid.maxTime)
    word = arg_grid[arg_grid.getNames().index("word")]
    hnr_quality = arg_grid[arg_grid.getNames().index("hnr")]
    mfa_found = arg_grid[arg_grid.getNames().index("mfa_found")]
    for utterance in arg_grid[arg_grid.getNames().index("quality")]:
        bounds = slice(word.indexContaining(float(utterance.minTime) + 0.001),
                       word.indexContaining(float(utterance.maxTime) - 0.001) + 1)
        hnr_index = hnr_quality.indexContaining(
            utterance.minTime + utterance.duration() / 2)
        mfa_index = mfa_found.indexContaining(
            utterance.minTime + utterance.duration() / 2)
        if utterance.mark != "":
            X = {"duration": float(utterance.duration()),
                 "alignment-diff": float(utterance.mark),
                 "hnr": float(hnr_quality[hnr_index].mark),
                 "mfa_found": float(mfa_found[mfa_index].mark),
                 "word-number": len(word[bounds])}
            if X["duration"] > 1 and X["word-number"] > 1:
def main(args):
    model, waveglow, denoiser, hparams = load_tts_vocoder_models(
        args.tacotron_checkpoint_path, args.waveglow_checkpoint_path)
    cmu_dict = load_cmudict(args.cmudict_path)
    # phoneme_pairs, texts = parse_input(args.input_text_file)
    texts = parse_input(args.input_text_file)
    phoneme_pairs = [
        'W V', 'IH0 IY0', 'IH1 IY1', 'IH2 IY2', 'EH0 AE0', 'EH1 AE1',
        'EH2 AE2', 'NG N', 'S TH', 'Z S', 'AA0 AH0', 'AA1 AH1', 'AA2 AH2',
        'UW0 UH0', 'UW1 UH1', 'UW2 UH2', 'DH D'
    ]
    # assert len(phoneme_pairs) == len(texts), \
    #     "Lines of phoneme pairs and texts must be the same, please check " \
    #     "the input text file."
    wav_dir = args.output_dir.joinpath('wav')
    wav_dir.mkdir(exist_ok=True, parents=True)
    trans_dir = args.output_dir.joinpath('transcript')
    trans_dir.mkdir(exist_ok=True, parents=True)
    tg_dir = args.output_dir.joinpath('annotation')
    tg_dir.mkdir(exist_ok=True, parents=True)
    utt_i = 0
    for text in texts:
        for phoneme_pair in phoneme_pairs:
            # Synthesize speech
            phoneme_pair = phoneme_pair.split()
            try:
                text_arpabet = text_to_arpabet(
                    cmu_dict, phoneme_pair, text,
                    swap_phoneme=args.mispronunciation)
            except Exception:
                continue
            if re.sub(r'[^\w]', '', text_arpabet).strip() == \
                    re.sub(r'[^\w]', '', text).strip():
                continue
            sequence = np.array(
                text_to_sequence(text_arpabet, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
            _, mel_outputs_postnet, _, alignments, is_max_steps = \
                model.inference(sequence)
            if is_max_steps:
                continue
            with torch.no_grad():
                wav = waveglow.infer(mel_outputs_postnet, sigma=0.666)
                # wav_denoised = denoiser(wav, strength=0.01)[:, 0].cpu().numpy().T
            output_wav_file = wav_dir.joinpath('{:s}_{:04d}.wav'.format(
                args.prefix, utt_i + 1))
            wavfile.write(output_wav_file, hparams.sampling_rate,
                          wav.cpu().numpy().T)
            # Save transcript
            output_trans_file = trans_dir.joinpath('{:s}_{:04d}.txt'.format(
                args.prefix, utt_i + 1))
            with open(output_trans_file, 'w') as f:
                f.write(text)
            # Generate textgrid
            # The startTime and endTime are just indices, not alignment
            # boundaries. Do not use them.
            tg = textgrid.TextGrid()
            word_tier = textgrid.IntervalTier(name='words')
            text = re.sub(r'[^\w ]', '', text)
            idx = 0
            for word in text.split():
                word_tier.add(float(idx), float(idx + 1), word)
                idx += 1
            idx = 0
            phone_tier = textgrid.IntervalTier(name='phones')
            # phone_tier.add(float(idx), float(idx + 1), 'sil')
            # idx += 1
            for word in text.split():
                arpabet = cmu_dict.lookup(word)[0]
                for phoneme in arpabet.split():
                    if args.mispronunciation:
                        if phoneme == phoneme_pair[0]:
                            phone_tier.add(
                                float(idx), float(idx + 1),
                                phoneme + ',' + phoneme_pair[1] + ',s')
                        elif phoneme == phoneme_pair[1]:
                            phone_tier.add(
                                float(idx), float(idx + 1),
                                phoneme + ',' + phoneme_pair[0] + ',s')
                        else:
                            phone_tier.add(float(idx), float(idx + 1), phoneme)
                    else:
                        phone_tier.add(float(idx), float(idx + 1), phoneme)
                    idx += 1
            # phone_tier.add(float(idx), float(idx + 1), 'sil')
            tg.append(word_tier)
            tg.append(phone_tier)
            tg_file = tg_dir.joinpath('{:s}_{:04d}.TextGrid'.format(
                args.prefix, utt_i + 1))
            tg.write(tg_file)
            print('{:d}: {:s} | {:s} | {:s}'.format(
                utt_i + 1, text, ' '.join(phoneme_pair), text_arpabet))
            utt_i += 1
if not os.path.isfile(arg):
    print("{} is not a valid file".format(arg))
    continue
try:
    grid = textgrid.TextGrid(name=arg.split("/")[-1])
    grid.read(arg)
except (AttributeError, ValueError):
    print(f"{arg.split('/')[-1]} can't load")
    continue
utterances = chunk_utterances(grid)
words = grid[grid.getNames().index("word")]
phones = grid[grid.getNames().index("phone")]
wav = grid.name.split('_')[0]
name = grid.name.split('.TextGrid')[0]
crop_wav(f"{WAV_DIR}/{wav}.wav", f"{OUTPUT_DIR}/{name}.wav",
         grid.minTime, grid.maxTime)
out_grid = textgrid.TextGrid(name)
speaker = textgrid.IntervalTier("speaker", 0, grid.maxTime - grid.minTime)
for i, utterance in enumerate(utterances):
    utterance_text = ""
    maxTime = words[utterance.stop - 1].maxTime - grid.minTime
    minTime = words[utterance.start].minTime - grid.minTime
    for word in words[utterance]:
        if word.mark != "sp":
            utterance_text += " " + remap_words(word, phones)
    speaker.add(minTime, maxTime, utterance_text.strip())
out_grid.append(speaker)
out_grid.write(f"{OUTPUT_DIR}/{grid.name}")
            xmax_preds.append(xmin_proc_win[k] + xmax / 1000.0)
            if prevoicing_decision:
                mark_preds.append("-{:.12g}".format(float(confidence)))
            else:
                mark_preds.append("{:.12g}".format(float(confidence)))
        else:  # negative VOT
            xmin_preds.append(xmin_proc_win[k] + xmax / 1000.0)
            xmax_preds.append(xmin_proc_win[k] + xmin / 1000.0)
            mark_preds.append("neg {:.12g}".format(float(confidence)))
        k += 1

    # add "AutoVOT" tier to textgrid_filename
    textgrid = tg.TextGrid()
    textgrid.read(textgrid_file)
    auto_vot_tier = tg.IntervalTier(name='AutoVOT',
                                    minTime=textgrid.minTime,
                                    maxTime=textgrid.maxTime)
    auto_vot_tier.add(textgrid.minTime, xmin_preds[0], '')
    try:
        for i in range(len(xmin_preds) - 1):
            # instead of mark_preds[i] (confidence number),
            # just put 'pred' in the interval
            auto_vot_tier.add(xmin_preds[i], xmax_preds[i], 'pred')
            auto_vot_tier.add(xmax_preds[i], xmin_preds[i + 1], '')
        # instead of mark_preds[-1] (confidence number),
        # just put 'pred' in the interval
        auto_vot_tier.add(xmin_preds[-1], xmax_preds[-1], 'pred')
        auto_vot_tier.add(xmax_preds[-1], textgrid.maxTime, '')
    except ValueError as e:
        logging.error(
            "Overlapping stops detected, textgrid output stopped at {}".