-
Notifications
You must be signed in to change notification settings - Fork 0
/
forced_alignment.py
166 lines (160 loc) · 9.49 KB
/
forced_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
# coding=utf-8
import json
import os
from aeneas.executetask import ExecuteTask
from aeneas.task import Task
from moviepy.editor import AudioFileClip, concatenate_videoclips
from audio_handler import write_audio_of_videos_in_parts
from process_transcript import process_csv
class Transcribe_project:
def __init__(self,
video_folder="./videos/",
transcript_inputs_folder = "./transcripts/inputs/",
audio_folder = "./audios/", audio_prefix = "audio_",
transcript_processed_folder = "./transcripts/processed_inputs/", transcript_processed_prefix = "processed_",
output_folder = "./transcripts/outputs/", output_prefix = "output_"):
super().__init__()
self.video_folder = video_folder
self.transcripts_inputs_folder = transcript_inputs_folder
self.audio_folder = audio_folder
self.audio_prefix = audio_prefix
self.transcript_processed_folder = transcript_processed_folder
self.transcript_processed_prefix = transcript_processed_prefix
self.output_folder = output_folder
self.output_prefix = output_prefix
"""
Starts the transcription job by steps:
- STEP 1 : Processing the videos (combining them and extracting the audio)
- STEP 2 : Process the transcript (originally a .csv file) --> write a .text file with all the words
- STEP 3 : Use the audio and text file to force allign it and then write a json file with the results
- STEP 4 : Use the speaker order from above to insert it in the json file
"""
def start_transcribe_job(self, filename, num_parts, lazy_update_audio = True, lazy_update_processed_transcript = True):
## STEP 1: Processing the videos (combining them and extracting the audio)
real_part_length = self.write_audio(filename, num_parts, lazy_update_audio)
## STEP 2: Process the transcript (originally a .csv file) --> write a .text file with all the words
all_info = self.process_transcript(filename, num_parts, lazy_update_processed_transcript)
## STEP 3: Use the audio and text file to force allign it and then write a json file with the results
os.makedirs(f"{self.output_folder}{filename}/", exist_ok=True) #create directory necessary
self.transcribe_all(filename, num_parts)
## STEP 4: Use the speaker order from above to insert it in the json file
self.add_speaker_labels_in_parts(filename, num_parts, all_info, real_part_length)
return f"\nSuccesfully processed the filename: {filename}\n"
"""
Combines the videos found in self.video_folder/filename/
Set lazy_update = True if you already got the audio and don't want to do the process again, as this takes time
For more details in the implementation, see audio_handler.py
"""
def write_audio(self, filename, num_parts, lazy_update):
return write_audio_of_videos_in_parts(filename, num_parts, self.video_folder, self.audio_folder, self.audio_prefix, lazy_update)
"""
Process the .csv file found in self.transcripts_inputs_folder/filename.csv by extracting the words
in all the sentences. It returns a list of the speaker_labels for every word said.
Set lazy_update = Trye if you already got the processed_transcript and you don't want to do it again.
For more details in implementation, see process_transcript.py
"""
def process_transcript(self, filename, num_parts, lazy_update):
return process_csv(filename, num_parts, self.transcripts_inputs_folder, self.transcript_processed_folder, self.transcript_processed_prefix, lazy_update)
'''
Does most of the work of forced allignment. It assumes that an audio file was already created by
self.write_audio and that there is already a processed transcript after running self.process_transcript
'''
def transcribe_all(self, filename, num_parts):
audio_subpart_path = lambda part: get_path(f"{self.audio_folder}{filename}/", f"{self.audio_prefix}{num_parts}_part_", part, "mp3")
transcript_subpart_path = lambda part: get_path(f"{self.transcript_processed_folder}{filename}/", f"{self.transcript_processed_prefix}{num_parts}_part_", part, "txt")
output_subpart_path = lambda part: get_path(f"{self.output_folder}{filename}/", f"{self.output_prefix}{num_parts}_part_", part, "json")
for part in range(1, num_parts+1):
print(f"Writing output for part #{part}")
self.transcribe(
audio_subpart_path(part),
transcript_subpart_path(part),
output_subpart_path(part)
)
def transcribe(self, audio, transcript, output):
config_string = u"task_language=eng|is_text_type=plain|os_task_file_format=json"
task = Task(config_string=config_string)
task.audio_file_path_absolute = audio#get_path(self.audio_folder, self.audio_prefix, filename, "mp3")
task.text_file_path_absolute = transcript#get_path(self.transcript_processed_folder, self.transcript_processed_prefix, filename, "txt")
task.sync_map_file_path_absolute = output#get_path(self.output_folder, self.output_prefix, filename, "json")
print("Processing task...\n")
# process Task
ExecuteTask(task).execute()
print(f"Taks processed. Writing output to {output}")
# output sync map to file
task.output_sync_map_file()
"""
When first starting with this file, run this function to create the necessary directories
you need for this to work. For more details in what are the meanings of this folders, see the README.
"""
def make_necessary_folders(self):
os.makedirs(self.video_folder)
os.makedirs(self.audio_folder)
os.makedirs(self.transcript_processed_folder)
os.makedirs(self.transcripts_inputs_folder)
os.makedirs(self.output_folder)
"""
Uses the given speaker_labels to update our output from calling self.transcribe. The result is that
the output json file contains the label of the speaker for every fragment.
"""
def add_speaker_labels(self, filename, speaker_labels):
output_path = get_path(self.output_folder, self.output_prefix, filename, "json")
with open(output_path, 'r') as f:
out = json.load(f)
all_fragments = out["fragments"]
for i, fragment in enumerate(all_fragments):
fragment["speaker_label"] = speaker_labels[i]
with open(output_path, 'w+') as json_to_write:
json.dump(out, json_to_write, indent=4)
def add_speaker_labels_in_parts(self, filename, num_parts, all_info, real_part_length):
output_subpart_path = lambda part: get_path(f"{self.output_folder}{filename}/", f"{self.output_prefix}{num_parts}_part_", part, "json")
output_path = get_path(f"{self.output_folder}{filename}/", f"{self.output_prefix}_final_{num_parts}_", filename, "json")
with open(output_path, 'w+') as json_to_write:
all_values = {
"fragments":[]
}
for part in range(1, num_parts+1):
with open(output_subpart_path(part)) as f:
all_fragments_subpart = json.load(f)["fragments"]
for i, fragment in enumerate(all_fragments_subpart):
fragment["begin"]= float(fragment["begin"]) + real_part_length* (part-1)
fragment["end"]= float(fragment["end"]) + real_part_length * (part-1)
fragment["speaker_label"] = all_info[part]["speaker_labels"][i] #i-th word of the respective part
all_values["fragments"].extend(all_fragments_subpart)
print(f"Added speaker labels for part #{part}")
json.dump(all_values, json_to_write, indent=4)
'''
Returns the complete path given a folder, prefix, name and type
Folder must end in "/"
For example, create_path("./audios/", "audio_", "file1", "mp3") returns "./audios/audio_file1.mp3"
'''
def get_path(folder, prefix, name, file_type):
return "{}{}{}.{}".format(folder, prefix, name, file_type)
def update_mapping(a, b, num_parts= 1, video_folder="./videos"):
resized_a = str(a) if a >= 10 else f"0{a}"
start = f"p{resized_a}_s{b}"
for video_dir in os.listdir(video_folder):
if video_dir.startswith(start):
print(f"Adding {video_dir} to mappings.json")
with open("./mappings.json") as f:
mappings = json.load(f)
mappings[f"{a}_{b}"] = {
"filename": video_dir,
"num_parts": num_parts
}
with open("./mappings.json", "w+") as json_to_write:
json.dump(mappings, json_to_write, indent=4)
return video_dir
raise Exception(f"Didn't found any videos for family {a} - session #{b}")
if __name__ == "__main__":
project = Transcribe_project()
###### FIRST: IF YOU JUST GOT THIS FILE, please uncomment below (and run) ###########
#project.make_folders() ### HELLO, UNCOMMENT ME :)
###### SECOND: comment line above^ and uncomment this below (and run) #################
family_number, session_number = 16, 2
num_parts = 1
filename = update_mapping(family_number, session_number, num_parts)
response = project.start_transcribe_job(filename, num_parts)
print(response)
###### THIRD: Try to change the filename to other (valid) names and see if it works! ####
########### Your output JSON files will be located in /transcripts/outputs/##############