def get_timestamps(train_files, test_files, output_folder):
    """Force-align matching audio/transcript pairs and save word timestamps.

    Within each list, two files are paired when their base names (the part of
    the file name before the first ``.``) are equal. The member ending in
    ``.txt`` is treated as the transcript, the other one as the audio file.
    Pairs are processed in order of a sort key built from the first two
    ``-``-separated fields of the base name, with the second field left-padded
    with zeros to 10 characters so that e.g. ``x-2`` sorts before ``x-10``.

    For every pair ``align.align(wav, txt)`` is called and the word
    alignments are written as ``word,start,stop`` lines to
    ``<output_folder>/train/<txt name>`` (pairs from ``train_files``) or
    ``<output_folder>/test/<txt name>`` (pairs from ``test_files``).

    Args:
        train_files: Sequence of file paths for the training split.
        test_files: Sequence of file paths for the test split.
        output_folder: Directory under which ``train``/``test`` are created.

    Returns:
        None. Progress and failures are reported on stdout; a failing pair
        does not stop the remaining pairs from being processed.
    """
    for subset, files in (('train', train_files), ('test', test_files)):
        output_path = os.path.join(output_folder, subset)

        pairs = []
        for i, first in enumerate(files):
            stem = os.path.basename(first).split('.')[0]
            for second in files[i + 1:]:
                if os.path.basename(second).split('.')[0] != stem:
                    continue
                fields = stem.split('-')
                if len(fields) > 1:
                    # rjust pads but never truncates, matching the old
                    # "prepend (10 - len) zeros" loop.
                    sort_key = fields[0] + '-' + fields[1].rjust(10, '0')
                else:
                    # A name without '-' used to raise IndexError here and
                    # abort the whole run; sort it by its plain stem instead.
                    sort_key = stem
                if first.endswith('.txt'):
                    pairs.append((second, first, sort_key))
                else:
                    pairs.append((first, second, sort_key))
        pairs.sort(key=lambda pair: pair[-1])

        for wav, txt, _ in pairs:
            try:
                # Reading the transcript first means a missing or non-UTF-8
                # file is reported as a failure before the aligner is run.
                with open(txt, 'r', encoding='utf8') as f:
                    f.read()
                _phoneme_alignments, word_alignments = align.align(wav, txt)
                os.makedirs(output_path, exist_ok=True)
                target = os.path.join(output_path, os.path.basename(txt))
                with open(target, 'w', encoding='utf8') as f:
                    for word, start, stop in word_alignments:
                        f.write(word + ',' + str(start) + ',' + str(stop) + '\n')
                print('Alignment successful', wav, txt)
            except Exception as exc:
                # Exception (not a bare except) so Ctrl-C / SystemExit still
                # stop the batch; the cause is printed instead of discarded.
                print('Alignment failed: ', wav, txt, repr(exc))
            print('-----------------------------')
            print()
# Keep only the transcript (.txt) entries of `items`.
for names in items:
    if names.endswith(".txt"):
        txtfilelist.append(names)

# Iterate through text files, performing aligner and saving Praat as we go
for ff, curr_text in enumerate(txtfilelist):
    # Swap only the extension. str.replace('txt', ...) rewrote every 'txt'
    # substring, so a name such as 'txt01.txt' produced 'wav01.wav'.
    curr_stem = curr_text.rsplit('.', 1)[0]
    curr_wav = curr_stem + '.wav'
    curr_TextGird = curr_stem + '.TextGrid'
    print([str(ff)+': '+curr_text])
    phoneme_alignments, word_alignments = align.align(
        wavdirname+'/'+curr_wav,
        txtdirname+'/'+curr_text,
        TGdirname+'/'+curr_TextGird)

    # Dump both alignment lists into one text file named after the transcript:
    # a 'Phonemes' section followed by a 'Words' section, one entry per line
    # with its fields separated by spaces.
    with open(Path(matdirname, curr_text), 'w') as fp:
        fp.write('Phonemes\n')
        # NOTE(review): entry 0 is skipped in both sections, as in the
        # original code - confirm that this is intentional.
        for pp in range(1, len(phoneme_alignments)):
            fp.write(' '.join('%s' % x for x in phoneme_alignments[pp]))
            fp.write('\n')
        fp.write('Words\n')
        for pp in range(1, len(word_alignments)):
            fp.write(' '.join('%s' % x for x in word_alignments[pp]))
            fp.write('\n')

## For Spanish (unfortunately P2FA doesn't have Spanish support)
txtdirname = r'/home/sakkol/Documents/Forced_Alignment/FORCE/Fast-Spanish/txt'
def test_aligner(self):
    """Run the aligner on the fixture inputs and compare with the reference.

    Calls ``align.align`` with ``self.input_wav``, ``self.input_transcription``
    and ``self.outfile``, then asserts that the file at ``self.outfile`` has
    the same contents as ``self.true_alignment_file``.
    """
    align.align(self.input_wav, self.input_transcription, self.outfile)
    # shallow=False forces a content comparison; the default (shallow=True)
    # accepts two files as equal when their os.stat() signatures match,
    # without reading them.
    self.assertTrue(
        filecmp.cmp(self.outfile, self.true_alignment_file, shallow=False),
        msg='%s differs from expected alignment %s'
            % (self.outfile, self.true_alignment_file))
def _label_frames(ph_align, audio_length, step):
    """Return one phoneme label per frame for ``audio_length`` frames.

    Each ``ph_align`` entry is read as ``(label, start, stop, ...)``. Frame
    ``i`` is placed at time ``i * step``; it gets the label of the current
    entry when ``start <= time < stop``, ``"sp"`` when the current entry has
    not started yet, and the label of the last entry once the alignment has
    been exhausted.
    """
    labels = []
    position = 0
    frame = 0
    while frame < audio_length:
        entry = ph_align[position]
        now = frame * step
        if entry[1] <= now:
            if entry[2] > now:
                labels.append(entry[0])
                frame += 1
            elif position + 1 < len(ph_align):
                # Current phoneme is over: advance and re-test the same frame.
                position += 1
            else:
                # Past the end of the alignment: keep repeating the last label.
                labels.append(entry[0])
                frame += 1
        else:
            labels.append("sp")
            frame += 1
    return labels


def _timing_classes(labels):
    """Return, per frame, the third (0, 1 or 2) of its phoneme run it lies in.

    A run is a maximal stretch of consecutive equal labels. The k-th frame
    (1-based) of a run of length n has relative position ``k / n``; positions
    up to 0.333 map to 0, up to 0.666 to 1, and everything else to 2.
    """
    classes = []
    total = len(labels)
    start = 0
    while start < total:
        run_length = 0
        while (start + run_length < total
               and labels[start + run_length] == labels[start]):
            run_length += 1
        for k in range(1, run_length + 1):
            relative = k / run_length
            if relative <= 0.333:
                classes.append(0)
            elif relative <= 0.666:
                classes.append(1)
            else:
                classes.append(2)
        start += run_length
    return classes


def _neighbour_phonemes(labels):
    """Return ``(previous, following)`` label lists, padded with ``"sp"``."""
    total = len(labels)
    # Index -1 does not raise IndexError in Python, it wraps to the last
    # element; the first frame must be handled explicitly to get "sp".
    previous = ["sp" if i == 0 else labels[i - 1] for i in range(total)]
    following = [labels[i + 1] if i + 1 < total else "sp" for i in range(total)]
    return previous, following


def extract_phoneme_data(args):
    """Build per-frame phoneme features for one audio/lyrics pair.

    Args:
        args: A 3-item sequence ``(audio_file_name, lyrics_file_name,
            audio_length)``, where ``audio_length`` is the number of frames
            to produce.

    Returns:
        A pandas DataFrame with one row per frame and the columns
        ``Phoneme``, ``Phoneme_timings``, ``Pre_phoneme`` and
        ``Post_phoneme``.
    """
    audio_file_name, lyrics_file_name, audio_length = args
    print(audio_file_name)

    # Extract phonemes using Penn's force aligner
    ph_align, _word_align = align.align(audio_file_name, lyrics_file_name)

    # NOTE(review): presumably params.frame_period is in milliseconds and the
    # alignment times are in seconds - confirm against the aligner output.
    step = params.frame_period / 1000

    phoneme_array = _label_frames(ph_align, audio_length, step)
    timing_array = _timing_classes(phoneme_array)
    pre_phoneme_array, post_phoneme_array = _neighbour_phonemes(phoneme_array)

    phoneme_position_data = pd.DataFrame(phoneme_array, columns=['Phoneme'])
    pre_phoneme_position_data = pd.DataFrame(pre_phoneme_array, columns=['Pre_phoneme'])
    post_phoneme_position_data = pd.DataFrame(post_phoneme_array, columns=['Post_phoneme'])
    phoneme_timing_data = pd.DataFrame(timing_array, columns=['Phoneme_timings'])

    phoneme_data = pd.concat([
        phoneme_position_data,
        phoneme_timing_data,
        pre_phoneme_position_data,
        post_phoneme_position_data
    ], axis=1)
    return phoneme_data