def run_gentle(key='103/1241/103_1241_000000_000001'):
    """Force-align one utterance with gentle, caching the result as JSON.

    Args:
        key: relative path (without extension) under TRAIN_PATH identifying
            the .normalized.txt / .wav / .json file triple.

    Returns:
        dict: gentle alignment result (served from the .json cache when it
        already exists).
    """
    text_file = f'{TRAIN_PATH}/{key}.normalized.txt'
    audio_file = f'{TRAIN_PATH}/{key}.wav'
    json_file = f'{TRAIN_PATH}/{key}.json'
    # Serve the cached alignment if a previous run already produced one.
    if os.path.isfile(json_file):
        with open(json_file) as r:
            return json.load(r)  # idiomatic: json.load instead of loads(read())
    with open(text_file, encoding="utf-8") as fh:
        transcript = fh.read()
    logging.info("converting audio to 8K sampled wav")
    # (removed a dead local `on_progress` callback that was defined but
    # never passed to transcribe -- progress_cb below is explicitly None)
    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(
            resources,
            transcript,
            nthreads=multiprocessing.cpu_count(),
            disfluency=False,  # do not include disfluencies (uh, um) in alignment
            conservative=False,
            disfluencies={'uh', 'um'})
        result = aligner.transcribe(wavfile, progress_cb=None, logging=logging)
    result_dict = result.to_dict()
    # Cache the alignment so subsequent calls return immediately.
    with open(json_file, 'w') as f:
        json.dump(result_dict, f, indent=2)
    return result_dict
def align_file(transcription, snd_filename):
    """Force-align *transcription* against *snd_filename* with a 10 s limit.

    Args:
        transcription: transcript text to align.
        snd_filename: path to the audio file.

    Returns:
        dict: {snd_filename: gentle result, or None if the alignment timed out}
    """
    # TODO: Add a file of "sound files that did not finish"
    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        # SIGALRM-based timeout: Unix-only and main-thread-only.
        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")

        signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            # Always cancel the pending alarm, even on timeout.
            signal.alarm(0)

    with gentle.resampled(snd_filename) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcription,
                                       nthreads=nthreads)
        try:
            with time_limit(10):
                result = {snd_filename: aligner.transcribe(wavfile)}
        except TimeoutException:
            # BUG FIX: the original formatted this message with an undefined
            # name `fid`, raising NameError whenever the timeout actually
            # fired. Use the known filename instead.
            print(
                "Transcription of {} timed out! Please check that your transcription is accurate."
                .format(snd_filename))
            result = {snd_filename: None}
    return result
def get_gentle_response(self, parsed_txt_path):
    """Align the parsed transcript against this object's audio via gentle.

    Args:
        parsed_txt_path (str): path to the parsed transcript text file.

    Returns:
        list: aligned word dicts (the "duration" field is omitted).
    """
    with open(parsed_txt_path, encoding="utf-8") as transcript_file:
        text = transcript_file.read()
    gentle_resources = gentle.Resources()
    # Filler words gentle should tolerate when aligning.
    ignorable = {'uh', 'um'}
    with gentle.resampled(self.audiopath) as resampled_wav:
        forced_aligner = gentle.ForcedAligner(
            gentle_resources,
            text,
            nthreads=multiprocessing.cpu_count(),
            disfluency=False,
            conservative=False,
            disfluencies=ignorable)
        alignment = forced_aligner.transcribe(resampled_wav)
    words = []
    for aligned_word in alignment.words:
        words.append(aligned_word.as_dict(without="duration"))
    return words
def run_gentle(seg, transcript):
    """
    Takes in a segment
       1. create new text file containing text
       2. create new audio with pydub
       3. run Gentle with these two
       4. delete text file/audio files

    Parameters
    ---------
    seg : Segment object to align with Gentle
    transcript : string holding the relevant transcript for this segment
    """
    # pydub slices in milliseconds; start_audio/end_audio are seconds.
    audio_cut = seg.audio_file[1000 * seg.start_audio:1000 * seg.end_audio]
    audio_cut.export("temp_audio.wav", format="wav")
    try:
        # run Gentle
        resources = gentle.Resources()
        with gentle.resampled("temp_audio.wav") as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile).words
    finally:
        # BUG FIX: delete the cut audio file even when alignment raises,
        # so a failure no longer leaks temp_audio.wav on disk.
        os.remove("temp_audio.wav")
    # fix unaligned-word start/end time data
    fix_unaligned(result, len(audio_cut) / 1000)
    # put gentle timestamps in relation to entire file
    for word in result:
        word.start += seg.start_audio
        word.end += seg.start_audio
    return result
def run_gentle(audio_path: str, text_content: str, tokenization_view: View = None):
    """Force-align *text_content* to the audio at *audio_path* with gentle.

    Args:
        audio_path: path to the source audio file.
        text_content: full transcript text to align.
        tokenization_view: optional view whose TOKEN annotations replace
            gentle's own tokenization (offsets index into text_content).

    Returns:
        gentle transcription result object.
    """
    with gentle.resampled(audio_path) as audio_file:
        resources = gentle.Resources()
        aligner = gentle.ForcedAligner(
            resources,
            text_content,
            nthreads=multiprocessing.cpu_count(),
            disfluencies={'uh', 'um'},
            disfluency=True,
            conservative=False)
        if tokenization_view is not None:
            # NOTE(review): this reaches into gentle's private MetaSentence
            # state (aligner.ms._seq) to substitute the external token
            # sequence -- fragile across gentle versions; confirm on upgrade.
            aligner.ms._seq = []
            for token in tokenization_view.get_annotations(Uri.TOKEN):
                print(token.serialize(pretty=True))
                start = token.properties['start']
                end = token.properties['end']
                # Token text is sliced out of the original transcript by
                # the annotation's character offsets.
                token_text = text_content[start:end]
                kaldi_token = {
                    'start': start,
                    'end': end,
                    # Normalize to the form gentle's kaldi vocab expects.
                    'token': metasentence.kaldi_normalize(token_text,
                                                          aligner.ms.vocab)
                }
                aligner.ms._seq.append(kaldi_token)
        result = aligner.transcribe(audio_file)
    return result
def align(audiopath, text, nthreads=1):
    """Run gentle forced alignment of *text* against *audiopath*.

    Args:
        audiopath: path to the input audio file.
        text: transcript string.
        nthreads: number of worker threads for the aligner.

    Returns:
        gentle transcription result.
    """
    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Starting alignment")
        forced = gentle.ForcedAligner(resources,
                                      text,
                                      nthreads=nthreads,
                                      disfluency=False,
                                      conservative=False)
        outcome = forced.transcribe(wavfile,
                                    progress_cb=on_progress,
                                    logging=logging)
    return outcome
def test_transcriber(self):
    """Smoke-test MultiThreadedTranscriber on a known audio snippet."""
    from gentle import resampled, kaldi_queue, standard_kaldi, Resources
    from gentle.transcriber import MultiThreadedTranscriber

    res = Resources()
    queue = kaldi_queue.build(res, 1)
    transcriber = MultiThreadedTranscriber(queue)
    # Resample a 2.5 s window starting 10.5 s into the clip.
    with resampled('examples/data/lucier.mp3', 10.5, 2.5) as filename:
        words, duration = transcriber.transcribe(filename)
    self.assertEqual(words[0].word, "different")
def align(audiopath, transcript):
    """Align *transcript* to the audio file and return the result as a dict."""
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("starting alignment")
        forced_aligner = gentle.ForcedAligner(resources,
                                              transcript,
                                              nthreads=nthreads,
                                              disfluency=False,
                                              conservative=False)
        transcription = forced_aligner.transcribe(wavfile,
                                                  progress_cb=on_progress,
                                                  logging=logging)
    # Round-trip through JSON text to hand back a plain dict.
    return json.loads(transcription.to_json())
def test_transcriber(self):
    """Transcribe a known clip and check the first recognized word."""
    import subprocess
    from gentle import resampled, kaldi_queue, standard_kaldi, Resources
    from gentle.transcriber import MultiThreadedTranscriber

    # Route kaldi's stderr into stdout so test output stays in one stream.
    standard_kaldi.STDERR = subprocess.STDOUT
    res = Resources()
    queue = kaldi_queue.build(res, 1)
    transcriber = MultiThreadedTranscriber(queue)
    with resampled(self.audio, 10.5, 2.5) as filename:
        words, duration = transcriber.transcribe(filename)
    self.assertEqual(words[0].word, "different")
def align_audio(wav_path, transcript):
    """Force-align *transcript* to *wav_path*; return the result as a dict."""
    with gentle.resampled(wav_path) as wavfile:
        print("starting alignment {}".format(wav_path))
        forced = gentle.ForcedAligner(RESOURCES,
                                      transcript,
                                      nthreads=N_THREADS,
                                      disfluency=False,
                                      conservative=False,
                                      disfluencies=DISFLUENCIES)
        outcome = forced.transcribe(wavfile,
                                    progress_cb=_on_progress,
                                    logging=logging)
    return json.loads(outcome.to_json())
def run_gentle(video_path, vid, result_path):
    """Align a video's subtitle text with its audio and save the JSON result.

    Args:
        video_path: path to the source video/audio file.
        vid: video id used by read_subtitle() to locate the VTT subtitles.
        result_path: destination path for the JSON alignment output.
    """
    vtt_subtitle = read_subtitle(vid)
    # IDIOM FIX: the original indexed vtt_subtitle[i] inside an enumerate
    # loop and built the string with O(n^2) `+=`; join the cue texts
    # directly (each followed by a space, exactly as before).
    transcript = ''.join(sub.text + ' ' for sub in vtt_subtitle)
    transcript = re.sub('\n', ' ', transcript)  # remove newline characters
    # align
    with gentle.resampled(video_path) as wav_file:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=False,
                                       conservative=False,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wav_file, logging=logging)
    # write results
    with open(result_path, 'w', encoding="utf-8") as fh:
        fh.write(result.to_json(indent=2))
def align(args):
    """Align one (audiopath, transcript) pair; return result dict with path.

    Args:
        args: 2-tuple of (audiopath, transcript) -- tuple form keeps this
            usable with multiprocessing map-style APIs.
    """
    audiopath, transcript = args
    with gentle.resampled(audiopath) as wavfile:
        logging.info("Audio file: {}".format(audiopath))
        logging.info("Transcript: <{}...>".format(transcript[:40]))
        forced = gentle.ForcedAligner(resources,
                                      transcript,
                                      nthreads=1,
                                      disfluency=False,
                                      conservative=False)
        transcription = forced.transcribe(wavfile,
                                          progress_cb=on_progress,
                                          logging=logging)
        outcome = json.loads(transcription.to_json())
    # Tag the result with its source path so callers can pair them up.
    outcome['audiopath'] = audiopath
    return outcome
def test_aligner(self):
    """Align a fixed transcript window and verify the first two words."""
    import subprocess
    from gentle import resampled, standard_kaldi, Resources
    from gentle.forced_aligner import ForcedAligner
    from gentle.transcription import Word

    standard_kaldi.STDERR = subprocess.STDOUT
    res = Resources()
    # (local renamed from `align`, which shadowed a module-level name)
    aligner = ForcedAligner(res, self.transcript, nthreads=1)
    with resampled(self.audio, 5.0, 5.0) as filename:
        transcription = aligner.transcribe(filename)
    words = transcription.words
    self.assertEqual(words[0].word, "i")
    self.assertEqual(words[1].word, "am")
    self.assertEqual(words[1].case, Word.SUCCESS)
def align_db(data):
    """Align every row of *data* with gentle, retrying failed rows until done.

    Each row must provide `sentence_path` (audio file) and `transcription`.
    Alignments are written to alignments/<last 4 path components>.json.
    """
    import pathlib
    pending = list(range(len(data)))
    while pending:
        # BUG FIX: the original appended failures to the very list it was
        # iterating AND never removed successes, so its `while True` loop
        # could never reach len(except_i_list) == 0 and never terminated.
        # Collect failures into a fresh list each pass and retry only those.
        failed = []
        for i in tqdm(pending):
            row = data.iloc[i]
            f = row.sentence_path
            transcript = row.transcription
            with gentle.resampled(f) as wavfile:
                aligner = gentle.ForcedAligner(resources, transcript,
                                               nthreads=40)
                try:
                    result = aligner.transcribe(wavfile,
                                                progress_cb=on_progress,
                                                logging=logging)
                except Exception:
                    # Narrowed from a bare `except` so Ctrl-C still works;
                    # failed rows are retried on the next pass.
                    failed.append(i)
                    continue
            # Mirror the last four path components under alignments/.
            output = os.path.join(
                'alignments',
                '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
            pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(
                parents=True, exist_ok=True)
            # Context manager replaces the original's unclosed-on-error
            # open()/close() pair.
            with open(output, 'w') as fh:
                fh.write(result.to_json(indent=2))
            logging.info("output written to %s" % (output))
        print("except_i_list:", failed)
        pending = failed
def get_transcribed_words(textFile, audioFile):
    """Align *audioFile* to the transcript in *textFile*.

    Returns:
        list[Word]: aligned words paired with their root phones (the part of
        each gentle phone label before the '_' position suffix).
    """
    with open(textFile) as file:
        transcript = file.read()
    resources = gentle.Resources()
    with gentle.resampled(audioFile) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript)
        result = aligner.transcribe(wavfile)
    transcribed_words = []
    for word in result.words:
        phones = word.phones
        if phones is not None:
            # gentle phones look like "ah_I"; keep only the part before '_'.
            root_phones = [
                p['phone'][0:p['phone'].index('_')] for p in phones
            ]
            transcribed_words.append(Word(word.word, root_phones))
    return transcribed_words
def align_onefile(data, align_json_path):
    """Align the single utterance identified by *align_json_path*.

    The path encodes .../<speaker>/<emotion>/<id>.json; the matching row of
    *data* (columns: id, speaker, sentence_path, transcription) supplies the
    audio path and transcript. The resulting JSON mirrors the audio path
    under alignments/. Returns None; silently skips on alignment failure.
    """
    import pathlib
    splitted_path = split_path(align_json_path)
    json_file_name = splitted_path[-1]
    # Renamed from `id`, which shadowed the builtin.
    utt_id, _ = os.path.splitext(json_file_name)
    emotion = splitted_path[-2]
    speaker = splitted_path[-3]
    # iloc: df to series
    row = data[(data.id == utt_id) & (data.speaker == speaker)].iloc[0]
    f = row.sentence_path
    transcript = row.transcription
    with gentle.resampled(f) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript, nthreads=40)
        try:
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
        except Exception:
            # Best-effort skip, but log instead of failing silently
            # (original had a bare `except: return`).
            logging.exception("alignment failed for %s", f)
            return
    output = os.path.join(
        'alignments', '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
    pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                          exist_ok=True)
    # Context manager guarantees the handle is closed even on write errors.
    with open(output, 'w') as fh:
        fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
def gentle_solve(self, audio_path, transcript):
    """
    gentle wrapper to solve the forced alignment given audio file and text string
    """
    opts = {
        'log': 'INFO',
        'nthreads': self.num_thread,
        'conservative': True,
        'disfluency': True,
    }
    # Filler words gentle is allowed to insert during alignment.
    fillers = {'uh', 'um'}
    gentle_resources = gentle.Resources()
    with gentle.resampled(audio_path) as wavfile:
        forced = gentle.ForcedAligner(gentle_resources,
                                      transcript,
                                      nthreads=opts['nthreads'],
                                      disfluency=opts['disfluency'],
                                      conservative=opts['conservative'],
                                      disfluencies=fillers)
        alignment = forced.transcribe(wavfile)
    # Drop per-phone detail; callers only need word-level timing dicts.
    return [w.as_dict(without="phones") for w in alignment.words]
def align_db(data):
    """Align each row of *data* (sentence_path, transcription) with gentle,
    writing one JSON file per utterance under alignments/."""
    import pathlib
    for i, row in data.iterrows():
        f = row.sentence_path
        transcript = row.transcription
        with gentle.resampled(f) as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
        # Mirror the last four components of the audio path under alignments/.
        output = os.path.join(
            'alignments',
            '/'.join(f.split('/')[-4:]).split('.')[0] + '.json')
        pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                              exist_ok=True)
        # FIX: context manager closes the file even if the write raises
        # (original used a bare open()/close() pair).
        with open(output, 'w') as fh:
            fh.write(result.to_json(indent=2))
        # `output` from os.path.join is always non-empty, so the original
        # `if output:` guard was dead; log unconditionally.
        logging.info("output written to %s" % (output))
def start_aligning(audiofile, txtfile, output):
    """Align *txtfile* against *audiofile* and write gentle's JSON to *output*.

    Args:
        audiofile: path to the source audio.
        txtfile: path to the UTF-8 transcript.
        output: destination path for the JSON alignment.
    """
    log_level = "INFO"  # can be one of: DEBUG, INFO, WARNING, ERROR, CRITICAL
    logging.getLogger().setLevel(log_level)
    # NOTE: currently unused -- the disfluency arguments in the aligner call
    # below are commented out.
    disfluencies = set(['uh', 'um'])
    with open(txtfile, encoding="utf-8") as fh:
        transcript = fh.read()
    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiofile) as wavfile:
        logging.info("starting alignment")
        aligner = gentle.ForcedAligner(
            resources, transcript, nthreads
        )  #, True, False, disfluencies)#, conservative, disfluencies)
        result = aligner.transcribe(wavfile,
                                    progress_cb=on_progress,
                                    logging=logging)
    # BUG FIX: the original opened the output file with a plain open() and
    # never closed it; the context manager flushes and closes deterministically.
    with open(output, 'w', encoding="utf-8") as out_fh:
        out_fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
# word in the audio. words.sort(key=lambda word: word.start) words.append(transcription.Word(word="__dummy__")) words = [words[i] for i in range(len(words)-1) if not words[i].corresponds(words[i+1])] return words, duration if __name__=='__main__': # full transcription import json import sys import logging logging.getLogger().setLevel('INFO') import gentle from gentle import standard_kaldi from gentle import kaldi_queue resources = gentle.Resources() k_queue = kaldi_queue.build(resources, 3) trans = MultiThreadedTranscriber(k_queue) with gentle.resampled(sys.argv[1]) as filename: words, duration = trans.transcribe(filename) open(sys.argv[2], 'w').write(transcription.Transcription(words=words).to_json())
def data_generator(file_id, min_dur=2, max_dur=(5, 20), randomize=False):
    """Given a file id and random seed, align the audio and text versions
    after dividing into single-speaker utterances, and write out texts of
    unbroken captured strings and their corresponding audio segments when
    the latter are between 2 and max_length seconds.
    """
    # Target segment length: either a per-file random draw (seeded from the
    # last character of the id so reruns are reproducible) or the fixed max.
    if randomize:
        seed = ord(file_id[-1])
        random.seed(seed)
        max_length = random.randint(max_dur[0], max_dur[1])
    else:
        max_length = max_dur[1]
    logger.info("Processing file id {}...".format(file_id))

    # grab audio file from s3
    mp3 = os.path.join(mp3_dir, "{}.mp3".format(file_id))
    wav = os.path.join(mp3_dir, "{}.wav".format(file_id))
    if not os.path.isfile(wav):
        if not os.path.isfile(mp3):
            bucket = boto3.resource("s3").Bucket("cgws")
            logger.info("Downloading file {} from S3...".format(file_id))
            try:
                bucket.download_file("{}.mp3".format(file_id), mp3)
            except:  # NOTE(review): bare except also swallows KeyboardInterrupt
                logger.warning(
                    "Could not download file {} from S3.".format(file_id))
                return
        # Convert to 16 kHz mono wav with sox, discarding sox's own output.
        FNULL = open(os.devnull, 'w')
        subprocess.call([
            "sox", "{}".format(mp3), "-r", "16k", "{}".format(wav), "remix",
            "-"
        ],
                        stdout=FNULL,
                        stderr=FNULL)

    # transcript
    txt_file = os.path.join(records_dir, "{}.txt".format(file_id))
    logger.info("Reading transcript {}...".format(file_id))
    try:
        with open(txt_file, "r") as tr:
            transcript = tr.read()
    except IOError:
        logger.warning("File {} does not exist.".format(txt_file))
        return

    # split transcript by speaker, and get timestamps (as seconds)
    # of the boundaries of each paragraph
    logger.info("Splitting transcript by speaker...")
    paragraphs = []
    times = []
    for paragraph in transcript.split("\n"):
        # Paragraphs that begin with an h:mm:ss.d timestamp mark a speaker turn.
        catch = re.match("\d:\d+:\d+\.\d", paragraph)
        if catch:
            timestamp = catch.group()
            h, m, s = timestamp.split(":")
            time = int(h) * 60 * 60 + int(m) * 60 + float(s)
            paragraphs.append(paragraph)
            times.append(time)
    # Sentinel end time so times[i + 1] is valid for the final paragraph.
    file_end = get_duration(mp3)
    times.append(file_end)
    total_captures, captures_dur = 0, 0

    # taking one speaker at a time, find unbroken alignments up to max_length
    # and write out corresponding files
    for i, paragraph in enumerate(paragraphs):
        logger.info("Cleaning and trimming paragraph {}: \n{}".format(
            i, paragraph))
        paragraph_start, paragraph_end = times[i], times[i + 1]
        # don't bother with short files
        if paragraph_end - paragraph_start < min_dur:
            logger.info("Skipping paragraph {} (too short)...".format(i))
            continue
        if len(paragraph.split()) < 2:
            logger.info("Skipping paragraph {} (too few words)...".format(i))
            continue
        # Clip this speaker's span out of the full wav into /tmp.
        temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0,
                        "/tmp")
        # unique name of json object to read/write
        # NOTE(review): sha1 of a str only works on Python 2; Python 3 needs
        # the formatted string encoded to bytes first -- confirm runtime.
        paragraph_hash = hashlib.sha1("{}{}{}{}".format(
            file_id, paragraph, paragraph_start, paragraph_end)).hexdigest()
        if use_filename_json is True:
            json_file = os.path.join(
                json_out_dir,
                "{}_{}_{}.json".format(file_id, paragraph_start,
                                       paragraph_end))
        else:
            json_file = os.path.join(json_out_dir,
                                     "{}.json".format(paragraph_hash))
        result = None
        # check if json object has been written from a previous run
        if not os.path.isfile(json_file):
            logger.info(
                "JSON file with hash {} not found.".format(paragraph_hash))
            try:
                logger.info("Resampling paragraph {}...".format(i))
                with gentle.resampled(temp_wav) as wav_file:
                    resources = gentle.Resources()
                    cleaned = clean(paragraph)
                    logger.info(
                        "Aligning paragraph {} with gentle...".format(i))
                    aligner = gentle.ForcedAligner(
                        resources,
                        cleaned,
                        nthreads=multiprocessing.cpu_count(),
                        disfluency=False,
                        conservative=False,
                        disfluencies=set(["uh", "um"]))
                    logger.info(
                        "Transcribing audio segment {} with gentle...".format(
                            i))
                    result = aligner.transcribe(wav_file)
            except:  # NOTE(review): bare except -- any error just skips the paragraph
                logger.warning("Paragraph {} - {} ".format(
                    i, sys.exc_info()[2]))
                os.remove(temp_wav)
                continue
            # Cache the alignment JSON for future runs.
            aligned_words = result.to_json()
            with open(json_file, "w") as f:
                f.write(aligned_words)
            if not result:
                logger.info("Empty result for paragraph {}.".format(i))
                os.remove(temp_wav)
                continue
        else:
            logger.info(
                "Found JSON of paragraph {} -- skipping alignment and transcription by gentle"
                .format(i))

        # dictionary of aligned words
        with open(json_file) as f:
            aligned = json.loads(f.read())

        # save all consecutively captured strings
        # and keep track of their start and stop times
        captures = []
        current, start_time, end_time = [], 0, 0
        # loop through every word as returned from gentle
        logger.info("Capturing strings in paragraph {}...".format(i))
        if not "words" in aligned:
            logger.info("No words in paragraph {}.".format(i))
            os.remove(temp_wav)
            continue
        # first two seconds will be skipped even if it contains a capture
        for catch in aligned["words"]:
            # successful capture
            if catch["case"] == "success" and catch[
                    "alignedWord"] != "<unk>" and catch[
                        'start'] > 5 and catch['end'] - catch['start'] > .07:
                # new capture group
                if not current:
                    # begin capturing if it has been two seconds since the last word
                    if catch["start"] - end_time > 1:
                        current = [catch["alignedWord"]]
                        start_time = catch["start"]
                        end_time = catch["end"]
                # continuation of a capture group
                else:
                    # large gap between last capture and this one
                    # likely that something was missing in the transcript
                    if catch["start"] - end_time > 1:
                        save_capture(captures, start_time, end_time, current)
                        current = []
                    # adding this word would equal or exceed max_length
                    elif catch["end"] - start_time >= max_length:
                        save_capture(captures, start_time, end_time, current,
                                     min_dur)
                        current = []
                        # Re-draw the target length per capture when randomizing.
                        if randomize:
                            max_length = random.randint(
                                max_dur[0], max_dur[1])
                    # continue capturing
                    else:
                        current.append(catch["alignedWord"])
                        end_time = catch["end"]
            # a miss after prior success(es)
            elif current:
                save_capture(captures, start_time, end_time, current, min_dur)
                current = []
        # last word was a success but current capture hasn't been saved yet
        if current:
            save_capture(captures, start_time, end_time, current, min_dur)

        # write strings and split audio into consituent segments
        logger.info(
            "Writing text and audio segments from paragraph {}...".format(i))
        for result in captures:
            # Segment filenames encode absolute start/end in centiseconds,
            # zero-padded to 7 digits.
            txt_segment = os.path.join(
                text_out_dir, "{}_{}_{}.txt".format(
                    file_id,
                    "{:07d}".format(int((times[i] + result["start"]) * 100)),
                    "{:07d}".format(int((times[i] + result["end"]) * 100))))
            with open(txt_segment, "w") as f:
                f.write("{}\n".format(result["string"]))
            segment = trim(file_id, temp_wav, result["start"], result["end"],
                           times[i], wav_out_dir)
            # make sure durations match
            segment_dur = get_duration(segment)
            assert segment_dur - result["duration"] <= .01
            total_captures += 1
            captures_dur += segment_dur
        # delete the clip of this speaker
        os.remove(temp_wav)

    # per-file logging
    total_dur = get_duration(mp3)
    logger.info("Wrote {} segments from {}, totalling {} seconds, out of a possible {}, ratio {:.2f}."\
        .format(total_captures,file_id,captures_dur,total_dur,captures_dur/total_dur))
    return
def generate_diphones(audio_file,
                      transcript_file,
                      output_folder,
                      pre_padding=0.0,
                      post_padding=0.0) -> set:
    """Generates the list of diphones for a given audio_file using the
    transcript and store the diphones in the output_folder

    Args:
        :param audio_file:(str) Name of the audio file to segment (.wav)
        :param transcript_file:(str) Name of the text file with the transcript
        :param output_folder:(str) Name of the destination directory to store the diphones
        :param pre_padding:(float) A fraction of audio to clip before the generated diphone
        :param post_padding:(float) A fraction of audio to clip after the generated diphone

    Returns:
        :return set of generated diphones
    """
    nthreads = multiprocessing.cpu_count()
    disfluency = False
    conservative = False
    disfluencies = {'uh', 'um'}
    with open(transcript_file, encoding="utf-8") as fh:
        transcript = fh.read()
    print(transcript)
    resources = gentle.Resources()
    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=disfluency,
                                       conservative=conservative,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wavfile)
    r = json.loads(result.to_json())
    phone_time_list = []
    diphones = set()
    # Walk gentle's word/phone timings, converting seconds to milliseconds,
    # recording each phone as [label, start_ms, end_ms].
    for word in r['words']:
        start = word['start'] * 1000
        for phone in word['phones']:
            diphones.add(phone['phone'])
            phone_time_list.append(
                [phone['phone'], start, start + phone['duration'] * 1000])
            start = start + phone['duration'] * 1000
    for entry in phone_time_list:
        # Cut the phone's span (plus requested padding) out of the audio.
        diphone = segment_audio(audio_file, entry[1], entry[2], pre_padding,
                                post_padding)
        # print('Old ' + str(entry[0]) + ':' + str(len(diphone)))
        # Pad very short clips up to 150 ms so downstream use is consistent.
        if len(diphone) < 150:
            try:
                diphone = ensure_length(diphone, 150)
            except exceptions.CouldntDecodeError:
                print(
                    entry[0],
                    'is very small.........................................................'
                )
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        # NOTE(review): one output file per phone label, so a later
        # occurrence of the same label overwrites the earlier clip.
        output_filename = output_folder + '/' + str(entry[0]) + '.wav'
        diphone.export(output_filename, format='wav')
        print('New ' + str(entry[0]) + ':' + str(len(diphone)))
    return diphones
# Filler words gentle may insert/skip during alignment.
disfluencies = set(['uh', 'um'])


def on_progress(p):
    """Log each progress key/value pair reported by gentle at debug level."""
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources()

logging.info("converting audio to 8K sampled wav")

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)

# Write to the requested output file, or stdout when none was given.
fh = open(args.output, 'w') if args.output else sys.stdout
fh.write(result.to_json(indent=2))
if args.output:
    logging.info("output written to %s" % (args.output))
    # BUG FIX: close the file we opened (never close sys.stdout);
    # the original leaked the handle.
    fh.close()
parser.add_argument('txtfile', type=str, help='transcript text file')

args = parser.parse_args()

log_level = args.log.upper()
logging.getLogger().setLevel(log_level)

# Filler words gentle may insert/skip during alignment.
disfluencies = set(['uh', 'um'])


def on_progress(p):
    """Log each progress key/value pair reported by gentle at debug level."""
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources()

logging.info("converting audio to 8K sampled wav")

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)

# Write to the requested output file, or stdout when none was given.
fh = open(args.output, 'w') if args.output else sys.stdout
fh.write(result.to_json(indent=2))
if args.output:
    logging.info("output written to %s" % (args.output))
    # BUG FIX: close the file we opened (never close sys.stdout);
    # the original leaked the handle.
    fh.close()
audio_path = os.path.join(DIR_PATH, "audio", audio_name) transcript_path = os.path.join(DIR_PATH, "transcripts", transcript_name) if not os.path.isfile(transcript_path): continue # get transcript text transcript_text = "" with open(transcript_path) as f: transcript_text = f.read() # run Gentle print "Running Gentle on", transcript_name resources = gentle.Resources() with gentle.resampled(audio_path) as wavfile: aligner = gentle.ForcedAligner(resources, transcript_text) result = aligner.transcribe(wavfile).words # create gentle_results directory if it doesn't already exist # usually better to use try-catch here, but not worried about race conditions right now if not os.path.exists(os.path.join(DIR_PATH, "gentle_results")): os.makedirs(os.path.join(DIR_PATH, "gentle_results")) # write Gentle output to gentle_results directory with open( os.path.join(DIR_PATH, "gentle_results", transcript_name + ".txt"), "w") as f: output = [] for word in result: output.append({
# Extract every non-disclaimer speaker fragment from the transcript XML into
# the flat transcript text file.
# NOTE(review): `target_file_txt` must already be open for writing at this
# point (opened outside this span) -- it is reopened for reading just below.
parsed_source_xml = ET.parse(source_path_xml)
parsed_root = parsed_source_xml.getroot()
for turn in parsed_root.findall('.//vx:Turn', namespaces):
    if 'DISCLAIMER' != turn.attrib['Speaker']:
        for fragment in turn.findall('.//vx:Fragment', namespaces):
            target_file_txt.write(fragment.text)

# Read back the flattened transcript text.
with open(target_path_txt) as target_file_txt:
    transcript = target_file_txt.read()

# The matching audio lives under audio/ with an .mp3 extension.
source_path_mp3 = source_path_xml.replace('transcripts/extracted',
                                          'audio').replace('.xml', '.mp3')
# Only align when both the audio exists and the transcript is non-empty.
if os.path.isfile(source_path_mp3) and transcript:
    target_path_json = target_path_txt.replace('.txt', '.json')
    with open(target_path_json, 'w') as target_file_json:
        print('converting audio to 8K sampled wav')
        with gentle.resampled(source_path_mp3) as wavfile:
            print('starting alignment for', source_path_xml, ' and ',
                  source_path_mp3)
            aligner = gentle.ForcedAligner(resources,
                                           transcript,
                                           nthreads=nthreads,
                                           disfluency=False,
                                           conservative=False,
                                           disfluencies=disfluencies)
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
            target_file_json.write(result.to_json(indent=2))
            print('finished alignment in', target_path_json)
def on_progress(p):
    # Log each progress key/value pair reported by gentle at debug level.
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


# Base name (no extension) of the paired .txt transcript and .wav audio.
file_name = 'dave/she_will_eat_rice_tomorrow'
with open(file_name + '.txt', encoding="utf-8") as fh:
    transcript = fh.read()
resources = gentle.Resources()
logging.info("converting audio to 8K sampled wav")
with gentle.resampled(file_name + '.wav') as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=nthreads,
                                   disfluency=disfluency,
                                   conservative=conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)
r = json.loads(result.to_json())
# Per-word [word, start_s, end_s] and per-phone [phone, start_ms, end_ms].
dur_list = []
phone_list = []
clip = AudioSegment.from_wav(file_name + '.wav')
for word in r['words']:
    dur_list.append([word['alignedWord'], word['start'], word['end']])
    start = word['start'] * 1000
    for phone in word['phones']:
        # NOTE(review): `start` is not advanced between phones in this view
        # (compare generate_diphones elsewhere in this file); the source may
        # be truncated here -- confirm against the original.
        phone_list.append(
            [phone['phone'], start, start + phone['duration'] * 1000])
if __name__ == '__main__':
    # full transcription
    from Queue import Queue  # NOTE: Python 2 module (named `queue` on Python 3)
    import json
    import sys
    import logging
    logging.getLogger().setLevel('INFO')

    import gentle
    from gentle import standard_kaldi

    resources = gentle.Resources()

    # Pool of kaldi decoder instances shared by the transcriber threads.
    k_queue = Queue()
    for i in range(3):
        k_queue.put(
            standard_kaldi.Kaldi(resources.nnet_gpu_path,
                                 resources.full_hclg_path,
                                 resources.proto_langdir))

    trans = MultiThreadedTranscriber(k_queue)

    with gentle.resampled(sys.argv[1]) as filename:
        out = trans.transcribe(filename)

    # BUG FIX: close the output file deterministically; the original relied
    # on the anonymous handle from open(...).write(...) being garbage-collected.
    with open(sys.argv[2], 'w') as f:
        f.write(transcription.Transcription(words=out).to_json())
paragraph_end)) if os.path.isfile(new_json_file): if args.abort: print(" aborting") break else: if not os.path.isfile(json_file): temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0, "/tmp") if not os.path.isfile(temp_wav): continue try: with gentle.resampled(temp_wav) as wav_file: resources = gentle.Resources() cleaned = clean(paragraph) aligner = gentle.ForcedAligner( resources, cleaned, nthreads=multiprocessing.cpu_count() * args.threads_multiplier, disfluency=False, conservative=False, disfluencies=set(["uh", "um"])) result = aligner.transcribe(wav_file) aligned_words = result.to_json() with open(json_file, "w") as f: f.write(aligned_words)