def run_gentle(seg, transcript):
    """Force-align one segment's audio against its transcript with Gentle.

    Exports the segment's audio slice to a temporary wav, runs Gentle on it
    together with the transcript, deletes the temporary file, repairs timing
    data for unaligned words, then shifts every word timestamp so it is
    relative to the whole source file rather than the segment.

    Parameters
    ----------
    seg : Segment
        object to align with Gentle
    transcript : string
        holding the relevant transcript for this segment
    """
    # pydub slices are addressed in milliseconds
    clip = seg.audio_file[1000 * seg.start_audio:1000 * seg.end_audio]
    clip.export("temp_audio.wav", format="wav")

    # run Gentle on the temporary cut
    res = gentle.Resources()
    with gentle.resampled("temp_audio.wav") as wavfile:
        words = gentle.ForcedAligner(res, transcript).transcribe(wavfile).words

    # the temporary cut is no longer needed
    os.remove("temp_audio.wav")

    # give unaligned words usable start/end times (clip length in seconds)
    fix_unaligned(words, len(clip) / 1000)

    # shift timestamps from segment-relative to file-relative
    for w in words:
        w.start += seg.start_audio
        w.end += seg.start_audio
    return words
def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, nthreads=4,
          ntranscriptionthreads=2, data_dir=get_datadir('webdata'), modelDir='exp'):
    """Build the twisted resource tree for the Gentle web app and run it.

    Serves the static pages, wires up the transcription controller and the
    zip download endpoint, then blocks in the reactor loop.

    NOTE(review): the `data_dir` default is evaluated once at import time.
    """
    logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers)

    # make sure the working directories exist
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    zip_dir = os.path.join(data_dir, 'zip')
    if not os.path.exists(zip_dir):
        os.makedirs(zip_dir)

    # static pages
    root = File(data_dir)
    root.putChild('', File(get_resource('www/index.html')))
    root.putChild('status.html', File(get_resource('www/status.html')))
    root.putChild('preloader.gif', File(get_resource('www/preloader.gif')))

    # NOTE(review): `resources` is not used below; presumably kept so the
    # model directory is validated before the server starts -- confirm
    resources = gentle.Resources(modelDir)

    transcriber = Transcriber(data_dir, nthreads=nthreads,
                              ntranscriptionthreads=ntranscriptionthreads,
                              modelDir=modelDir)
    config = transcriber.config
    logging.info("CONFIG: samplerate %d, silencephones %s, context-width %s",
                 config['samplerate'], config['silencephones'],
                 config['context-width'])

    # dynamic endpoints
    root.putChild('transcriptions', TranscriptionsController(transcriber))
    root.putChild('zip', TranscriptionZipper(zip_dir, transcriber))

    site = Site(root)
    logging.info("about to listen")
    reactor.listenTCP(port, site, interface=interface)
    logging.info("listening")
    reactor.run(installSignalHandlers=installSignalHandlers)
def run_gentle(audio_path: str, text_content: str, tokenization_view: View = None):
    """Run Gentle forced alignment on an audio file and its transcript.

    Args:
        audio_path: path to the audio file to align.
        text_content: full transcript text.
        tokenization_view: optional View whose TOKEN annotations replace
            Gentle's own tokenization so alignment boundaries match the
            upstream tokens.

    Returns:
        Gentle's transcription/alignment result object.
    """
    with gentle.resampled(audio_path) as audio_file:
        resources = gentle.Resources()
        aligner = gentle.ForcedAligner(
            resources, text_content,
            nthreads=multiprocessing.cpu_count(),
            disfluencies={'uh', 'um'},
            disfluency=True,
            conservative=False)
        if tokenization_view is not None:
            # Rebuild the aligner's token sequence from the provided view.
            # NOTE(review): `_seq` is a private attribute of gentle's
            # MetaSentence; this couples to gentle internals.
            # (bug fix: removed leftover debug print of every token)
            aligner.ms._seq = []
            for token in tokenization_view.get_annotations(Uri.TOKEN):
                start = token.properties['start']
                end = token.properties['end']
                token_text = text_content[start:end]
                kaldi_token = {
                    'start': start,
                    'end': end,
                    'token': metasentence.kaldi_normalize(token_text, aligner.ms.vocab)
                }
                aligner.ms._seq.append(kaldi_token)
        result = aligner.transcribe(audio_file)
    return result
def get_gentle_response(self, parsed_txt_path):
    """Align ``self.audiopath`` against a parsed transcript via Gentle.

    Args:
        parsed_txt_path (str): path of the parsed transcript text file

    Returns:
        list: one dict per aligned word (the ``duration`` field is dropped)
    """
    with open(parsed_txt_path, encoding="utf-8") as handle:
        text = handle.read()

    gentle_resources = gentle.Resources()
    # words for gentle to ignore when aligning
    fillers = {'uh', 'um'}

    with gentle.resampled(self.audiopath) as wav:
        forced_aligner = gentle.ForcedAligner(
            gentle_resources,
            text,
            nthreads=multiprocessing.cpu_count(),
            disfluency=False,
            conservative=False,
            disfluencies=fillers)
        alignment = forced_aligner.transcribe(wav)

    return [w.as_dict(without="duration") for w in alignment.words]
def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
    """Set up transcriber state and load shared Gentle resources.

    Args:
        data_dir: directory holding transcription working data.
        nthreads: worker threads used for forced alignment.
        ntranscriptionthreads: threads for full (no-transcript) transcription.
    """
    # status bookkeeping for in-flight transcriptions
    self._status_dicts = {}

    self.data_dir = data_dir
    self.nthreads = nthreads
    self.ntranscriptionthreads = ntranscriptionthreads

    # Gentle models are loaded once and shared by every job
    self.resources = gentle.Resources()
    self.full_transcriber = gentle.FullTranscriber(
        self.resources, nthreads=ntranscriptionthreads)
def align(audiopath, text, nthreads=1):
    """Force-align `text` against the audio at `audiopath` using Gentle.

    Returns Gentle's transcription result; progress is reported through
    the module-level `on_progress` callback.
    """
    res = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wav:
        logging.info("Starting alignment")
        forced = gentle.ForcedAligner(res, text, nthreads=nthreads,
                                      disfluency=False, conservative=False)
        return forced.transcribe(wav, progress_cb=on_progress, logging=logging)
def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2, modelDir='exp'):
    """Set up transcriber state and load Gentle resources from modelDir.

    Args:
        data_dir: directory holding transcription working data.
        nthreads: worker threads used for forced alignment.
        ntranscriptionthreads: threads for full (no-transcript) transcription.
        modelDir: directory containing the Gentle/Kaldi models.
    """
    # status bookkeeping for in-flight transcriptions
    self._status_dicts = {}

    self.data_dir = data_dir
    self.nthreads = nthreads
    self.ntranscriptionthreads = ntranscriptionthreads

    # models are loaded once from modelDir and shared by every job
    self.resources = gentle.Resources(modelDir)
    self.config = self.resources.getConfig()
    self.full_transcriber = gentle.FullTranscriber(
        self.resources, nthreads=ntranscriptionthreads)
def align(audiopath, transcript):
    """Force-align `transcript` against `audiopath` with Gentle.

    Lazily initializes the module-level `resources` on first use so the
    models are loaded only once per process.

    Returns:
        dict: the alignment parsed from Gentle's JSON output.
    """
    import gentle
    global resources
    # load Gentle's models only once per process
    if resources is None:
        resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wavfile:
        logging.info("starting alignment")
        # NOTE(review): `nthreads` is not defined in this function;
        # presumably a module-level setting -- confirm.
        aligner = gentle.ForcedAligner(resources, transcript, nthreads=nthreads,
                                       disfluency=False, conservative=False)
        # bug fix: was `aliger.transcribe(...)`, a guaranteed NameError
        return json.loads(
            aligner.transcribe(wavfile, progress_cb=on_progress,
                               logging=logging).to_json())
def call_gentle_chunk(wav_path, transcript, disfluency=False, conservative=False):
    """Run Gentle forced alignment on a single audio chunk.

    Args:
        wav_path: path of the wav chunk to align.
        transcript: transcript text for this chunk.
        disfluency: include disfluencies ('uh', 'um') in the alignment.
        conservative: use Gentle's conservative alignment mode.

    Returns:
        dict: the alignment parsed from Gentle's JSON output.
    """
    resources = gentle.Resources()
    aligner = gentle.ForcedAligner(resources, transcript,
                                   nthreads=multiprocessing.cpu_count(),
                                   disfluency=disfluency,
                                   conservative=conservative,
                                   disfluencies=('uh', 'um'))
    result = aligner.transcribe(wav_path, progress_cb=_on_progress,
                                logging=logging)
    return json.loads(result.to_json())
def get_transcribed_words(textFile, audioFile):
    """Align a transcript file against an audio file and return Word objects.

    Args:
        textFile: path of the transcript text file.
        audioFile: path of the audio file to align.

    Returns:
        list[Word]: one Word per aligned word that has phone data, carrying
        the word text and its phone roots (e.g. "ah" from "ah_B").
    """
    with open(textFile) as file:
        transcript = file.read()
    resources = gentle.Resources()
    with gentle.resampled(audioFile) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript)
        result = aligner.transcribe(wavfile)
    transcribed_words = []
    for word in result.words:
        phones = word.phones
        if phones is not None:
            # Gentle phones look like "ah_B"; keep only the root before "_".
            # bug fix: partition() tolerates phones without an underscore,
            # where the old index('_') raised ValueError.
            root_phones = [phone['phone'].partition('_')[0] for phone in phones]
            transcribed_words.append(Word(word.word, root_phones))
    return transcribed_words
def gentle_solve(self, audio_path, transcript):
    """
    gentle wrapper to solve the forced alignment given audio file and text string
    """
    # alignment settings; thread count comes from the owning object
    conservative = True
    disfluency = True
    fillers = {'uh', 'um'}

    res = gentle.Resources()
    with gentle.resampled(audio_path) as wav:
        forced_aligner = gentle.ForcedAligner(
            res,
            transcript,
            nthreads=self.num_thread,
            disfluency=disfluency,
            conservative=conservative,
            disfluencies=fillers)
        alignment = forced_aligner.transcribe(wav)

    return [w.as_dict(without="phones") for w in alignment.words]
def start_aligning(audiofile, txtfile, output):
    """Align txtfile against audiofile with Gentle and write JSON to output.

    Args:
        audiofile: path of the audio file to align.
        txtfile: path of the UTF-8 transcript file.
        output: path the JSON alignment result is written to.
    """
    # can be one of the following: (DEBUG, INFO, WARNING, ERROR, or CRITICAL)
    log_level = "INFO"
    logging.getLogger().setLevel(log_level)

    with open(txtfile, encoding="utf-8") as fh:
        transcript = fh.read()

    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiofile) as wavfile:
        logging.info("starting alignment")
        # NOTE(review): `nthreads` is not defined here; presumably a
        # module-level value -- confirm. disfluency/conservative options
        # were left at Gentle's defaults by the original author.
        aligner = gentle.ForcedAligner(resources, transcript, nthreads)
        result = aligner.transcribe(wavfile, progress_cb=on_progress,
                                    logging=logging)

    # bug fix: the output handle was previously opened and never closed
    with open(output, 'w', encoding="utf-8") as out:
        out.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
import json import multiprocessing as mp import logging try: import gentle except ModuleNotFoundError: import sys sys.path.append('/roaming/gentle') import gentle resources = gentle.Resources() def on_progress(p): for k, v in p.items(): logging.debug("%s: %s" % (k, v)) def align_many(audiopaths, transcripts): with mp.Pool(mp.cpu_count()) as pool: result = pool.map(align, zip(audiopaths, transcripts)) return result def align(args): audiopath, transcript = args with gentle.resampled(audiopath) as wavfile: logging.info("Audio file: {}".format(audiopath)) logging.info("Transcript: <{}...>".format(transcript[:40])) aligner = gentle.ForcedAligner(resources, transcript,
import json import logging import multiprocessing import os import gentle import scipy.io.wavfile as sciwav DISFLUENCIES = {'uh', 'um'} # set of disfluencies RESOURCES = gentle.Resources() N_THREADS = multiprocessing.cpu_count() logging.getLogger().setLevel("INFO") def _on_progress(p): for k, v in p.items(): logging.debug("%s: %s" % (k, v)) def _get_key_val_pair(line): line_split = line[:-1].split() word = line_split[0] if word[-1] == ')': word = word.split('(')[0] word = word.lower() key = [word] val = [] for phoneme in line_split[1:]: val.append(phoneme.lower())
def data_generator(file_id, min_dur=2, max_dur=(5, 20), randomize=False):
    """Given a file id and random seed, align the audio and text versions
    after dividing into single-speaker utterances, and write out texts of
    unbroken captured strings and their corresponding audio segments when
    the latter are between 2 and max_length seconds.
    """
    # choose the per-capture maximum length, optionally randomized
    # (seeded from the last character of the file id for reproducibility)
    if randomize:
        seed = ord(file_id[-1])
        random.seed(seed)
        max_length = random.randint(max_dur[0], max_dur[1])
    else:
        max_length = max_dur[1]
    logger.info("Processing file id {}...".format(file_id))

    # grab audio file from s3
    mp3 = os.path.join(mp3_dir, "{}.mp3".format(file_id))
    wav = os.path.join(mp3_dir, "{}.wav".format(file_id))
    if not os.path.isfile(wav):
        if not os.path.isfile(mp3):
            bucket = boto3.resource("s3").Bucket("cgws")
            logger.info("Downloading file {} from S3...".format(file_id))
            try:
                bucket.download_file("{}.mp3".format(file_id), mp3)
            # NOTE(review): bare except hides every failure cause
            except:
                logger.warning(
                    "Could not download file {} from S3.".format(file_id))
                return
        # convert to 16 kHz wav via sox, discarding sox's console output
        # NOTE(review): FNULL is never closed
        FNULL = open(os.devnull, 'w')
        subprocess.call(
            ["sox", "{}".format(mp3), "-r", "16k", "{}".format(wav),
             "remix", "-"],
            stdout=FNULL, stderr=FNULL)

    # transcript
    txt_file = os.path.join(records_dir, "{}.txt".format(file_id))
    logger.info("Reading transcript {}...".format(file_id))
    try:
        with open(txt_file, "r") as tr:
            transcript = tr.read()
    except IOError:
        logger.warning("File {} does not exist.".format(txt_file))
        return

    # split transcript by speaker, and get timestamps (as seconds)
    # of the boundaries of each paragraph
    logger.info("Splitting transcript by speaker...")
    paragraphs = []
    times = []
    for paragraph in transcript.split("\n"):
        # paragraphs are expected to begin with an h:mm:ss.s timestamp
        catch = re.match("\d:\d+:\d+\.\d", paragraph)
        if catch:
            timestamp = catch.group()
            h, m, s = timestamp.split(":")
            time = int(h) * 60 * 60 + int(m) * 60 + float(s)
            paragraphs.append(paragraph)
            times.append(time)
    # the end of the audio bounds the final paragraph
    file_end = get_duration(mp3)
    times.append(file_end)
    total_captures, captures_dur = 0, 0

    # taking one speaker at a time, find unbroken alignments up to max_length
    # and write out corresponding files
    for i, paragraph in enumerate(paragraphs):
        logger.info("Cleaning and trimming paragraph {}: \n{}".format(
            i, paragraph))
        paragraph_start, paragraph_end = times[i], times[i + 1]
        # don't bother with short files
        if paragraph_end - paragraph_start < min_dur:
            logger.info("Skipping paragraph {} (too short)...".format(i))
            continue
        if len(paragraph.split()) < 2:
            logger.info("Skipping paragraph {} (too few words)...".format(i))
            continue
        temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0,
                        "/tmp")

        # unique name of json object to read/write
        # NOTE(review): under Python 3 hashlib.sha1 requires bytes, so this
        # str argument would need .encode() -- confirm the target version
        paragraph_hash = hashlib.sha1("{}{}{}{}".format(
            file_id, paragraph, paragraph_start, paragraph_end)).hexdigest()
        if use_filename_json is True:
            json_file = os.path.join(
                json_out_dir,
                "{}_{}_{}.json".format(file_id, paragraph_start,
                                       paragraph_end))
        else:
            json_file = os.path.join(json_out_dir,
                                     "{}.json".format(paragraph_hash))
        result = None

        # check if json object has been written from a previous run
        if not os.path.isfile(json_file):
            logger.info(
                "JSON file with hash {} not found.".format(paragraph_hash))
            try:
                logger.info("Resampling paragraph {}...".format(i))
                with gentle.resampled(temp_wav) as wav_file:
                    resources = gentle.Resources()
                    cleaned = clean(paragraph)
                    logger.info(
                        "Aligning paragraph {} with gentle...".format(i))
                    aligner = gentle.ForcedAligner(
                        resources,
                        cleaned,
                        nthreads=multiprocessing.cpu_count(),
                        disfluency=False,
                        conservative=False,
                        disfluencies=set(["uh", "um"]))
                    logger.info(
                        "Transcribing audio segment {} with gentle...".format(
                            i))
                    result = aligner.transcribe(wav_file)
            # NOTE(review): bare except; sys.exc_info()[2] is a traceback
            # object, so this log line carries little useful detail
            except:
                logger.warning("Paragraph {} - {} ".format(
                    i, sys.exc_info()[2]))
                os.remove(temp_wav)
                continue
            # cache the alignment so later runs can skip gentle entirely
            aligned_words = result.to_json()
            with open(json_file, "w") as f:
                f.write(aligned_words)
            if not result:
                logger.info("Empty result for paragraph {}.".format(i))
                os.remove(temp_wav)
                continue
        else:
            logger.info(
                "Found JSON of paragraph {} -- skipping alignment and transcription by gentle"
                .format(i))

        # dictionary of aligned words
        with open(json_file) as f:
            aligned = json.loads(f.read())

        # save all consecutively captured strings
        # and keep track of their start and stop times
        captures = []
        current, start_time, end_time = [], 0, 0
        # loop through every word as returned from gentle
        logger.info("Capturing strings in paragraph {}...".format(i))
        if not "words" in aligned:
            logger.info("No words in paragraph {}.".format(i))
            os.remove(temp_wav)
            continue
        # first two seconds will be skipped even if it contains a capture
        # NOTE(review): the condition below actually skips words starting in
        # the first 5 seconds (catch['start'] > 5), not two -- confirm intent
        for catch in aligned["words"]:
            # successful capture
            if catch["case"] == "success" and catch[
                    "alignedWord"] != "<unk>" and catch[
                        'start'] > 5 and catch['end'] - catch['start'] > .07:
                # new capture group
                if not current:
                    # begin capturing if it has been two seconds since the last word
                    if catch["start"] - end_time > 1:
                        current = [catch["alignedWord"]]
                        start_time = catch["start"]
                        end_time = catch["end"]
                # continuation of a capture group
                else:
                    # large gap between last capture and this one
                    # likely that something was missing in the transcript
                    if catch["start"] - end_time > 1:
                        save_capture(captures, start_time, end_time, current)
                        current = []
                    # adding this word would equal or exceed max_length
                    elif catch["end"] - start_time >= max_length:
                        save_capture(captures, start_time, end_time, current,
                                     min_dur)
                        current = []
                        if randomize:
                            max_length = random.randint(max_dur[0], max_dur[1])
                    # continue capturing
                    else:
                        current.append(catch["alignedWord"])
                        end_time = catch["end"]
            # a miss after prior success(es)
            elif current:
                save_capture(captures, start_time, end_time, current, min_dur)
                current = []
        # last word was a success but current capture hasn't been saved yet
        if current:
            save_capture(captures, start_time, end_time, current, min_dur)

        # write strings and split audio into consituent segments
        logger.info(
            "Writing text and audio segments from paragraph {}...".format(i))
        for result in captures:
            # file names carry absolute start/end offsets in centiseconds
            txt_segment = os.path.join(
                text_out_dir, "{}_{}_{}.txt".format(
                    file_id,
                    "{:07d}".format(int((times[i] + result["start"]) * 100)),
                    "{:07d}".format(int((times[i] + result["end"]) * 100))))
            with open(txt_segment, "w") as f:
                f.write("{}\n".format(result["string"]))
            segment = trim(file_id, temp_wav, result["start"], result["end"],
                           times[i], wav_out_dir)
            # make sure durations match
            segment_dur = get_duration(segment)
            assert segment_dur - result["duration"] <= .01
            total_captures += 1
            captures_dur += segment_dur
        # delete the clip of this speaker
        os.remove(temp_wav)

    # per-file logging
    total_dur = get_duration(mp3)
    logger.info("Wrote {} segments from {}, totalling {} seconds, out of a possible {}, ratio {:.2f}."\
        .format(total_captures,file_id,captures_dur,total_dur,captures_dur/total_dur))
    return
def __init__(self, nthreads=4, ntranscriptionthreads=2):
    """Record thread settings and load shared Gentle resources.

    Args:
        nthreads: worker threads used for forced alignment.
        ntranscriptionthreads: threads for full transcription jobs.
    """
    self.ntranscriptionthreads = ntranscriptionthreads
    self.nthreads = nthreads
    # models are loaded once and shared by every job
    self.resources = gentle.Resources()
def generate_diphones(audio_file, transcript_file, output_folder,
                      pre_padding=0.0, post_padding=0.0) -> set:
    """Generates the list of diphones for a given audio_file using the
    transcript and store the diphones in the output_folder

    Args:
        :param audio_file:(str) Name of the audio file to segment (.wav)
        :param transcript_file:(str) Name of the text file with the transcript
        :param output_folder:(str) Name of the destination directory to store the diphones
        :param pre_padding:(float) A fraction of audio to clip before the generated diphone
        :param post_padding:(float) A fraction of audio to clip after the generated diphone
    Returns:
        :return set of generated diphones
    """
    # Gentle alignment settings
    nthreads = multiprocessing.cpu_count()
    disfluency = False
    conservative = False
    disfluencies = {'uh', 'um'}
    with open(transcript_file, encoding="utf-8") as fh:
        transcript = fh.read()
    # NOTE(review): debug print of the whole transcript
    print(transcript)
    resources = gentle.Resources()
    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript,
                                       nthreads=nthreads,
                                       disfluency=disfluency,
                                       conservative=conservative,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wavfile)
    r = json.loads(result.to_json())

    # collect each phone with its absolute [label, start, end], where the
    # times are converted from gentle's seconds to milliseconds
    phone_time_list = []
    diphones = set()
    for word in r['words']:
        start = word['start'] * 1000
        for phone in word['phones']:
            diphones.add(phone['phone'])
            phone_time_list.append(
                [phone['phone'], start, start + phone['duration'] * 1000])
            # the next phone begins where this one ends
            start = start + phone['duration'] * 1000

    # cut one audio clip per phone entry and export it
    for entry in phone_time_list:
        diphone = segment_audio(audio_file, entry[1], entry[2], pre_padding,
                                post_padding)
        # print('Old ' + str(entry[0]) + ':' + str(len(diphone)))
        # pad very short clips up to 150 ms
        if len(diphone) < 150:
            try:
                diphone = ensure_length(diphone, 150)
            except exceptions.CouldntDecodeError:
                print(
                    entry[0],
                    'is very small.........................................................'
                )
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        # NOTE(review): later phones with the same label overwrite earlier
        # files of the same name -- confirm this is intended
        output_filename = output_folder + '/' + str(entry[0]) + '.wav'
        diphone.export(output_filename, format='wav')
        print('New ' + str(entry[0]) + ':' + str(len(diphone)))
    return diphones
# NOTE(review): `args` is presumably an argparse namespace built earlier in
# this script (outside this chunk) -- confirm against the full file.
log_level = args.log.upper()
logging.getLogger().setLevel(log_level)

# filler words gentle may optionally model during alignment
disfluencies = set(['uh', 'um'])


def on_progress(p):
    """Log each progress key/value pair reported by gentle."""
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

# gentle resources built from the configured model directory
resources = gentle.Resources(args.model_dir)
config = resources.getConfig()

logging.info("converting audio to {} sampled wav".format(config['samplerate']))

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources, transcript,
                                   nthreads=args.nthreads,
                                   context_width=config['context-width'],
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile, progress_cb=on_progress,
                                logging=logging)