import json
import os
import shutil
import subprocess
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed
from copy import deepcopy
from datetime import timedelta
from typing import List

import click
import numpy as np
from tabulate import tabulate
from termcolor import colored

# NOTE: the project-local import paths below are assumptions. The helpers and
# constants referenced throughout this module (MAPPINGS, CURRENT_DIR,
# BUILD_ALIGNMENT_TEST_TEMPLATE, cut_fragment_audio, cut_fragments_audio)
# are expected to be defined elsewhere in the package.
import training_speech
from training_speech import exceptions, ffmpeg, sox, utils, vad

# handle on the currently playing audio process, shared across threads
audio_player = None


def download(source_name):
    for s3, local, key in MAPPINGS:
        # releases are never downloaded
        if key in {'releases'}:
            continue
        local = os.path.abspath(local)
        options = ''
        if source_name:
            source = training_speech.get_source(source_name, validate=False)
            options += f' --exclude * --include {source[key]}'
        sync_cmd = f'aws s3 sync {s3} {local}{options}'
        print(sync_cmd)
        subprocess.call(sync_cmd.strip().split(' '))
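
# Illustration of the (s3, local, key) triples iterated by download() and
# upload(). The real MAPPINGS constant lives elsewhere in this module; the
# bucket name and entries below are hypothetical, for illustration only:
#
#     MAPPINGS = [
#         ('s3://some-bucket/mp3/', 'data/mp3/', 'audio'),
#         ('s3://some-bucket/epubs/', 'data/epubs/', 'ebook'),
#         ('s3://some-bucket/releases/', 'data/releases/', 'releases'),
#     ]
#
# With a source whose 'audio' entry is 'book_01.mp3', download('my_source')
# would then print and run a command of the form:
#
#     aws s3 sync s3://some-bucket/mp3/ /abs/path/data/mp3 --exclude * --include book_01.mp3
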
def build_transcript(source_name, yes, add_to_git):
    source = training_speech.get_source(source_name)
    path_to_epub = os.path.join(CURRENT_DIR, 'data/epubs/', source['ebook'])
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    if yes is False and os.path.isfile(path_to_transcript):
        click.confirm(text=f'{path_to_transcript} already exists. Override?', default=False, abort=True)
    with open(path_to_transcript, 'w') as f:
        f.writelines(utils.read_epub(path_to_epub, path_to_xhtmls=source.get('ebook_parts', ['part1.xhtml'])))
    if add_to_git:
        subprocess.call(f'git add {path_to_transcript}'.split(' '))
        click.echo(f'transcript {path_to_transcript} added to git')
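
# Note on the writelines() call above: writelines() does not append newline
# characters, so utils.read_epub is expected to yield newline-terminated
# strings. Hypothetical shape of a call (paths made up):
#
#     utils.read_epub('data/epubs/book.epub', path_to_xhtmls=['part1.xhtml'])
#     # -> ['First sentence.\n', 'Second sentence.\n', ...]
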
def upload(source_name, releases):
    for s3, local, key in MAPPINGS:
        local = os.path.abspath(local)
        options = ' --exclude .gitkeep'
        # uploading a single source never touches releases, and a release
        # upload touches nothing else
        if (source_name and key in {'releases'}) or (releases and key != 'releases'):
            continue
        if key == 'releases':
            options += ' --acl public-read'  # releases are publicly readable
        else:
            options += ' --exclude *.zip'
        if source_name:
            source = training_speech.get_source(source_name)
            options += f' --exclude * --include {source[key]}'
        sync_cmd = f'aws s3 sync {local} {s3}{options}'
        print(sync_cmd)
        subprocess.call(sync_cmd.strip().split(' '))
def check_alignment(source_name, restart, speed, audio_rate, no_cache, fast, start):
    import inquirer
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    if no_cache and os.path.isdir(utils.CACHE_DIR):
        shutil.rmtree(utils.CACHE_DIR)
        os.mkdir(utils.CACHE_DIR)

    # generate wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(from_=path_to_mp3, to=path_to_wav, rate=audio_rate, channels=1)

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    # detect silences through VAD
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=20)

    if not restart and os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )

    def _check_alignment(index: int, alignment: List[dict]):
        click.clear()
        fragment = alignment[index]
        prev_fragments = alignment[max(index - 1, 0):index]
        next_fragments = alignment[index + 1:index + 3]
        print(colored(
            f'\nplaying #{index + 1:03d}: '
            f'@@ {timedelta(seconds=fragment["begin"])} {timedelta(seconds=fragment["end"])} '
            f'({fragment["end"] - fragment["begin"]:0.3f}) @@',
            'yellow',
            attrs=['bold'],
        ))
        for prev_ in prev_fragments:
            print(colored(prev_['text'], 'grey'))
        print(colored(fragment['text'], 'magenta' if fragment.get('warn') else 'green', attrs=['bold']))
        for next_ in next_fragments:
            print(colored(next_['text'], 'grey'))

        todo = set()
        pool = ThreadPoolExecutor()

        def play_audio():
            global audio_player
            path_to_audio = cut_fragment_audio(fragment, path_to_wav)
            with sox.play(path_to_audio, speed=speed) as player:
                audio_player = player

        def play_audio_slow():
            global audio_player
            path_to_audio = cut_fragment_audio(fragment, path_to_wav)
            with sox.play(path_to_audio) as player:
                audio_player = player

        todo.add(pool.submit(play_audio))

        def ask_right_transcript(current: List[str]):
            new_text = click.edit(text='\n'.join(current), require_save=False)
            return [l.strip() for l in new_text.strip().split('\n') if l.strip()]

        def ask_what_next():
            if prev_fragments:
                silence_before, silence_between, silence_after = utils.transition_silences(
                    prev_fragments[-1], fragment, silences)
                can_cut_start_on_prev_silence = silence_before is not None
                can_cut_start_on_next_silence = silence_after is not None
            else:
                can_cut_start_on_prev_silence = can_cut_start_on_next_silence = False
            if next_fragments:
                silence_before, silence_between, silence_after = utils.transition_silences(
                    fragment, next_fragments[0], silences)
                can_cut_end_on_prev_silence = silence_before is not None
                can_cut_end_on_next_silence = silence_after is not None
            else:
                can_cut_end_on_prev_silence = can_cut_end_on_next_silence = False

            try:
                next_: str = inquirer.prompt([
                    inquirer.List(
                        'next',
                        message="\nWhat should I do ?",
                        choices=(
                            ['approve', 'repeat']
                            + (['go_back'] if prev_fragments else [])
                            + ['edit']
                            + (['wrong_start__cut_on_previous_silence'] if can_cut_start_on_prev_silence else [])
                            + (['wrong_start__cut_on_next_silence'] if can_cut_start_on_next_silence else [])
                            + (['wrong_end__cut_on_previous_silence'] if can_cut_end_on_prev_silence else [])
                            + (['wrong_end__cut_on_next_silence'] if can_cut_end_on_next_silence else [])
                            + (['enable'] if fragment.get('disabled') else ['disable'])
                            + ['toggle_fast_mode', 'quit']
                        ),
                    ),
                ])['next']
            except TypeError:
                raise exceptions.QuitException
            except Exception:
                next_ = 'quit'

            global audio_player
            try:
                audio_player.kill()
            except Exception:
                pass

            if next_ == 'repeat':
                todo.add(pool.submit(play_audio_slow))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'toggle_fast_mode':
                raise exceptions.ToggleFastModeException
            elif next_ == 'go_back':
                prev_fragments[-1].pop('disabled', None)
                prev_fragments[-1].pop('approved', None)
                raise exceptions.GoBackException
            elif next_ == 'edit':
                new_transcript = ask_right_transcript(
                    [t['text'] for t in prev_fragments + [fragment] + next_fragments])
                raise exceptions.SplitException(
                    start=index - len(prev_fragments),
                    end=index + len(next_fragments),
                    new_transcript=new_transcript,
                )
            elif next_ == 'wrong_start__cut_on_previous_silence':
                prev_fragment = prev_fragments[-1]
                silence_before, _, _ = utils.transition_silences(prev_fragment, fragment, silences)
                fragment['begin'] = round(max(silence_before[1] - 0.35, silence_before[0]), 3)
                prev_fragment['end'] = round(min(silence_before[0] + 0.35, silence_before[1]), 3)
                cut_fragment_audio(prev_fragment, input_file=path_to_wav)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_start__cut_on_next_silence':
                prev_fragment = prev_fragments[-1]
                _, _, silence_after = utils.transition_silences(prev_fragment, fragment, silences)
                prev_fragment['end'] = round(min(silence_after[0] + 0.35, silence_after[1]), 3)
                fragment['begin'] = round(max(silence_after[1] - 0.35, silence_after[0]), 3)
                cut_fragment_audio(prev_fragment, input_file=path_to_wav)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_end__cut_on_previous_silence':
                next_fragment = next_fragments[0]
                silence_before, _, _ = utils.transition_silences(fragment, next_fragment, silences)
                fragment['end'] = round(min(silence_before[0] + 0.35, silence_before[1]), 3)
                next_fragment['begin'] = round(max(silence_before[1] - 0.35, silence_before[0]), 3)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                cut_fragment_audio(next_fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_end__cut_on_next_silence':
                next_fragment = next_fragments[0]
                _, _, silence_after = utils.transition_silences(fragment, next_fragment, silences)
                fragment['end'] = round(min(silence_after[0] + 0.35, silence_after[1]), 3)
                next_fragment['begin'] = round(max(silence_after[1] - 0.35, silence_after[0]), 3)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                cut_fragment_audio(next_fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'approve':
                fragment['approved'] = True
            elif next_ == 'pass':
                fragment.pop('approved', None)
            elif next_ == 'disable':
                fragment['disabled'] = True
                fragment.pop('approved', None)
            elif next_ == 'enable':
                fragment['approved'] = True
                fragment.pop('disabled', None)
            elif next_ == 'quit':
                raise exceptions.QuitException
            else:
                raise NotImplementedError

        todo.add(pool.submit(ask_what_next))
        while todo:
            # exceptions raised in worker threads surface here via result()
            for future in as_completed(list(todo)):
                todo.remove(future)
                future.result()
        pool.shutdown(wait=True)

    cut_fragments_audio(alignment, input_file=path_to_wav)

    # iterate over successive fragments
    start = start - 1
    i = start
    done = False
    while i < len(alignment) and not done:
        fragment = alignment[i]
        if start != i and (fragment.get('approved') or fragment.get('disabled')):
            click.echo(f'skip fragment#{i} {fragment["text"]}')
            i += 1
            continue
        if fast and i != 0 and i < len(alignment) - 1 and not fragment.get('warn'):
            fragment.update(approved=True, approved_auto=True)
            click.echo(f'approve fragment#{i} {fragment["text"]}')
            i += 1
            continue
        try:
            _check_alignment(index=i, alignment=alignment)
        except exceptions.ToggleFastModeException:
            fast = not fast
            continue
        except exceptions.GoBackException:
            i -= 1
            continue
        except exceptions.QuitException:
            try:
                audio_player.kill()
            except Exception:
                pass
            exit(1)
        except exceptions.SplitException as e:
            fragment.pop('approved', None)
            fragment.pop('disabled', None)
            audio_start: float = alignment[e.start]['begin']
            audio_end: float = alignment[e.end]['end']
            with tempfile.NamedTemporaryFile(suffix='.wav') as file_:
                sox.trim(path_to_wav, file_.name, from_=audio_start, to=audio_end)
                sub_alignment = utils.build_alignment(
                    transcript=e.new_transcript,
                    path_to_audio=file_.name,
                    existing_alignment=[
                        dict(
                            text=f['text'],
                            begin=f['begin'] - audio_start,
                            end=f['end'] - audio_start,
                            approved=f.get('approved', False),
                            disabled=f.get('disabled', False),
                        )
                        for f in alignment[e.start:e.end + 1]
                    ],
                    silences=[
                        [max(s_start - audio_start, 0.), s_end - audio_start]
                        for s_start, s_end in silences
                        if s_end > audio_start and s_start < audio_end
                    ],
                    generate_labels=False,
                    language=source['language'],
                )
            # shift the re-aligned fragments back to absolute positions
            for nf in sub_alignment:
                nf['begin'] += audio_start
                nf['end'] += audio_start
            alignment = alignment[:e.start] + sub_alignment + alignment[e.end + 1:]
            cut_fragments_audio(alignment, input_file=path_to_wav)
            i -= e.start

        # save progress
        with open(path_to_alignment, 'w') as dest:
            to_save = deepcopy(alignment)
            for f in to_save:
                f.pop('warn', None)
            json.dump(obj=to_save, fp=dest, sort_keys=True, indent=2)
        with open(path_to_transcript, 'w') as f:
            f.writelines('\n'.join(f['text'] for f in alignment) + '\n')
        i += 1
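
# Documentary sketch of the fragment dicts manipulated by check_alignment,
# inferred from the keys used above. This class is hypothetical, not part of
# the codebase, and typing.TypedDict requires Python 3.8+:
from typing import TypedDict


class FragmentSketch(TypedDict, total=False):
    text: str            # transcript line carried by this fragment
    begin: float         # start offset in seconds within the source wav
    end: float           # end offset in seconds within the source wav
    approved: bool       # set once a human has validated the cut
    approved_auto: bool  # set when fast mode auto-approves a fragment
    disabled: bool       # excludes the fragment from the dataset
    warn: bool           # flagged as suspicious by build_alignment; stripped before saving
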
def source_stats(source_name):
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    # generate wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(from_=path_to_mp3, to=path_to_wav, rate=16000, channels=1)

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    if os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    # detect silences
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=utils.DEFAULT_VAD_FRAME_DURATION)

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )

    transitions_durations = []
    fragments_durations = []
    for prev_fragment, next_fragment in zip(alignment[:-1], alignment[1:]):
        silence_before, silence_between, silence_after = \
            utils.transition_silences(prev_fragment, next_fragment, silences)
        if silence_between:
            transitions_durations.append(silence_between[1] - silence_between[0])
        fragments_durations.append(next_fragment['end'] - next_fragment['begin'])

    t_mean = np.array(transitions_durations).mean()
    t_std = np.array(transitions_durations).std()
    f_mean = np.array(fragments_durations).mean()
    f_std = np.array(fragments_durations).std()
    print('\n' + tabulate(
        [
            [
                'transition dur (s)',
                len(transitions_durations),
                round(t_mean, 3),
                round(t_std, 3),
                f'{round(t_mean - t_std, 3)} - {round(t_mean + t_std, 3)}',
                f'{round(t_mean - 2 * t_std, 3)} - {round(t_mean + 2 * t_std, 3)}',
                f'{max(0, round(t_mean - 3 * t_std, 3))} - {round(t_mean + 3 * t_std, 3)}',
                min(transitions_durations),
                max(transitions_durations),
            ],
            [
                'fragment dur (s)',
                len(fragments_durations),
                round(f_mean, 3),
                round(f_std, 3),
                f'{round(f_mean - f_std, 3)} - {round(f_mean + f_std, 3)}',
                f'{round(f_mean - 2 * f_std, 3)} - {round(f_mean + 2 * f_std, 3)}',
                f'{max(0, round(f_mean - 3 * f_std, 3))} - {round(f_mean + 3 * f_std, 3)}',
                min(fragments_durations),
                max(fragments_durations),
            ],
        ],
        # band headers follow the one/two/three standard-deviation rule
        headers=['Metric', 'count', 'avg', 'std', '68%', '95%', '99.7%', 'min', 'max'],
        tablefmt='pipe',
    ))
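
# Self-contained sketch of the banding logic behind the 68%/95%/99.7% columns
# printed above: each band is mean +/- k standard deviations, which covers
# roughly those shares of the data when durations are near-normal. The sample
# values below are made up:
def _std_bands_example():
    durations = np.array([0.41, 0.38, 0.52, 0.47, 0.35])
    mean, std = durations.mean(), durations.std()
    for k in (1, 2, 3):
        print(f'+/-{k} std: {mean - k * std:.3f} - {mean + k * std:.3f}')
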
def make_test(source_name, from_id, to_id, audio_rate):
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    # generate wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(from_=path_to_mp3, to=path_to_wav, rate=audio_rate, channels=1)

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    if os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    # detect silences
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=20)

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )

    remaining = alignment[from_id - 1:to_id]
    with tempfile.NamedTemporaryFile(suffix='.wav') as file_:
        ffmpeg.cut(path_to_wav, file_.name, from_=remaining[0]['begin'], to=remaining[-1]['end'])
        with open(file_.name, 'rb') as f:
            file_hash = utils.hash_file(f)[:8]
        path_to_sub_audio = f'tests/assets/{file_hash}.wav'
        shutil.copy(file_.name, os.path.join(CURRENT_DIR, path_to_sub_audio))

    new_alignment = utils.build_alignment(
        transcript=[f['text'] for f in remaining],
        path_to_audio=path_to_sub_audio,
        existing_alignment=[],
        silences=vad.list_silences(path_to_wav=path_to_sub_audio, frame_duration=20),
        generate_labels=True,
    )
    print(BUILD_ALIGNMENT_TEST_TEMPLATE.format(
        file_hash=file_hash,
        source_name=source_name,
        from_=timedelta(seconds=remaining[0]['begin']),
        to=timedelta(seconds=remaining[-1]['end']),
        transcript='\n '.join(f"'{f['text']}'," for f in new_alignment),
        alignment='\n '.join("dict(begin={begin}, end={end}, text='{text}'),".format(**f) for f in new_alignment),
        vad_mode=utils.DEFAULT_VAD_MODE,
        vad_frame_duration=utils.DEFAULT_VAD_FRAME_DURATION,
    ))
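
# Hypothetical invocation (source name and fragment ids made up):
#
#     make_test('my_source', from_id=12, to_id=15, audio_rate=16000)
#
# copies the trimmed wav into tests/assets/<hash>.wav and prints a ready-made
# test case rendered from BUILD_ALIGNMENT_TEST_TEMPLATE.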