Example #1
def download(source_name):
    for s3, local, key in MAPPINGS:
        if key == 'releases':
            continue
        local = os.path.abspath(local)
        options = ''
        if source_name:
            source = training_speech.get_source(source_name, validate=False)
            options += f' --exclude * --include {source[key]}'  # exclude everything, then whitelist this source's files

        sync_cmd = f'aws s3 sync {s3} {local}{options}'
        print(sync_cmd)
        subprocess.call(sync_cmd.strip().split(' '))
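The code unpacks MAPPINGS as (s3_uri, local_dir, key) triples whose keys index into a source record, plus a special 'releases' entry. A minimal sketch of compatible definitions, with invented bucket and file names (only the triple shape and the field names used across these commands come from the code):

# hypothetical values; only the (s3_uri, local_dir, key) shape and the
# 'audio'/'ebook'/'releases' keys are implied by the commands above
MAPPINGS = [
    ('s3://some-bucket/mp3/', 'data/mp3', 'audio'),
    ('s3://some-bucket/epubs/', 'data/epubs', 'ebook'),
    ('s3://some-bucket/releases/', 'data/releases', 'releases'),
]

# a record returned by training_speech.get_source must expose at least the
# fields referenced in these commands (values invented for illustration)
source = {
    'audio': 'some_book.mp3',
    'ebook': 'some_book.epub',
    'ebook_parts': ['part1.xhtml'],
    'language': 'fr',
}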
Example #2
def build_transcript(source_name, yes, add_to_git):
    source = training_speech.get_source(source_name)
    path_to_epub = os.path.join(CURRENT_DIR, 'data/epubs/', source['ebook'])

    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    if not yes and os.path.isfile(path_to_transcript):
        click.confirm(text=f'{path_to_transcript} already exists. Overwrite?', default=False, abort=True)

    with open(path_to_transcript, 'w') as f:
        f.writelines(utils.read_epub(path_to_epub, path_to_xhtmls=source.get('ebook_parts', ['part1.xhtml'])))

    if add_to_git:
        subprocess.call(f'git add {path_to_transcript}'.split(' '))
        click.echo(f'transcript {path_to_transcript} added to git')
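These functions read like Click command bodies with their decorators stripped: click.confirm, click.echo and the yes/add_to_git flags all follow Click conventions. A plausible wiring for build_transcript, as a sketch (the flag spellings are assumptions):

import click

@click.command()
@click.argument('source_name')
@click.option('--yes', is_flag=True, help='overwrite an existing transcript without asking')
@click.option('--add-to-git', is_flag=True, help='git-add the generated transcript')
def build_transcript(source_name, yes, add_to_git):
    ...  # body as above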
Example #3
def upload(source_name, releases):
    for s3, local, key in MAPPINGS:
        local = os.path.abspath(local)
        options = ' --exclude .gitkeep'
        if (source_name and key == 'releases') or (releases and key != 'releases'):
            continue
        if key == 'releases':
            options += ' --acl public-read'
        else:
            options += ' --exclude *.zip'
        if source_name:
            source = training_speech.get_source(source_name)
            options += f' --exclude * --include {source[key]}'

        sync_cmd = f'aws s3 sync {local} {s3}{options}'
        print(sync_cmd)
        subprocess.call(sync_cmd.strip().split(' '))
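The --exclude * --include <pattern> pair works because the AWS CLI evaluates exclude/include filters in order, with later filters taking precedence: everything is excluded first, then the single pattern is whitelisted. Splitting the formatted command on single spaces does break patterns that contain spaces, though; a more defensive variant (hypothetical helper, not part of the repo) builds the argument list directly:

import subprocess
from typing import Optional

def run_sync(src: str, dst: str, include: Optional[str] = None) -> int:
    # building the list directly avoids re-splitting a formatted string, so
    # patterns with spaces survive; '*' reaches the AWS CLI unexpanded
    # because no shell is involved
    cmd = ['aws', 's3', 'sync', src, dst, '--exclude', '.gitkeep']
    if include:
        cmd += ['--exclude', '*', '--include', include]
    print(' '.join(cmd))
    return subprocess.call(cmd)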
Example #4
def check_alignment(source_name, restart, speed, audio_rate, no_cache, fast, start):
    import inquirer  # imported lazily: only this interactive command needs it
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    if no_cache and os.path.isdir(utils.CACHE_DIR):
        shutil.rmtree(utils.CACHE_DIR)
        os.mkdir(utils.CACHE_DIR)

    # generate the wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(
            from_=path_to_mp3,
            to=path_to_wav,
            rate=audio_rate,
            channels=1
        )

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    # detect silences through VAD
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=20)

    if not restart and os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )

    def _check_alignment(index: int, alignment: List[dict]):
        click.clear()
        fragment = alignment[index]
        prev_fragments = alignment[max(index - 1, 0):index]
        next_fragments = alignment[index + 1:index + 3]

        print(colored(
            f'\nplaying #{index + 1:03d}: @@ {timedelta(seconds=fragment["begin"])}  {timedelta(seconds=fragment["end"])} ({fragment["end"] - fragment["begin"]:0.3f}) @@',  # noqa
            'yellow',
            attrs=['bold']
        ))
        if prev_fragments:
            for prev_ in prev_fragments:
                print(colored(prev_['text'], 'grey'))
        print(colored(fragment['text'], 'magenta' if fragment.get('warn') else 'green', attrs=['bold']))
        if next_fragments:
            for next_ in next_fragments:
                print(colored(next_['text'], 'grey'))

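        # audio playback and the interactive prompt each run in a worker
        # thread; 'todo' tracks their futures so the drain loop below can
        # wait on both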
        todo = set()
        pool = ThreadPoolExecutor()

        def play_audio():
            # play the fragment at the user-selected speed
            path_to_audio = cut_fragment_audio(fragment, path_to_wav)
            global audio_player

            with sox.play(path_to_audio, speed=speed) as player:
                audio_player = player

        def play_audio_slow():
            # replay at normal speed (used by 'repeat')
            path_to_audio = cut_fragment_audio(fragment, path_to_wav)
            global audio_player

            with sox.play(path_to_audio) as player:
                audio_player = player

        todo.add(pool.submit(play_audio))

        def ask_right_transcript(current: List[str]):
            new_text = click.edit(text='\n'.join(current), require_save=False)
            return [
                l.strip()
                for l in new_text.strip().split('\n')
                if l.strip()
            ]

        def ask_what_next():
            if prev_fragments:
                silence_before, silence_between, silence_after = utils.transition_silences(
                    prev_fragments[-1],
                    fragment,
                    silences
                )
                can_cut_start_on_prev_silence = silence_before is not None
                can_cut_start_on_next_silence = silence_after is not None
            else:
                can_cut_start_on_prev_silence = can_cut_start_on_next_silence = False
            if next_fragments:
                silence_before, silence_between, silence_after = utils.transition_silences(
                    fragment,
                    next_fragments[0],
                    silences
                )
                can_cut_end_on_prev_silence = silence_before is not None
                can_cut_end_on_next_silence = silence_after is not None
            else:
                can_cut_end_on_prev_silence = can_cut_end_on_next_silence = False

            try:
                next_: str = inquirer.prompt([
                    inquirer.List(
                        'next',
                        message="\nWhat should I do?",
                        choices=(
                                ['approve', 'repeat'] +
                                (['go_back'] if prev_fragments else []) +
                                ['edit'] +
                                (['wrong_start__cut_on_previous_silence'] if can_cut_start_on_prev_silence else []) +
                                (['wrong_start__cut_on_next_silence'] if can_cut_start_on_next_silence else []) +
                                (['wrong_end__cut_on_previous_silence'] if can_cut_end_on_prev_silence else []) +
                                (['wrong_end__cut_on_next_silence'] if can_cut_end_on_next_silence else []) +
                                (['enable'] if fragment.get('disabled') else ['disable']) +
                                ['toggle_fast_mode', 'quit']),
                    ),
                ])['next']
            except TypeError:
                # inquirer.prompt returns None on Ctrl-C, so the ['next'] lookup raises
                raise exceptions.QuitException
            except Exception:
                next_ = 'quit'

            global audio_player
            try:
                audio_player.kill()  # best effort: stop any running playback
            except Exception:
                pass

            if next_ == 'repeat':
                todo.add(pool.submit(play_audio_slow))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'toggle_fast_mode':
                raise exceptions.ToggleFastModeException
            elif next_ == 'go_back':
                prev_fragments[-1].pop('disabled', None)
                prev_fragments[-1].pop('approved', None)
                raise exceptions.GoBackException
            elif next_ == 'edit':
                new_transcript = ask_right_transcript([t['text'] for t in prev_fragments + [fragment] + next_fragments])
                raise exceptions.SplitException(
                    start=index - len(prev_fragments),
                    end=index + len(next_fragments),
                    new_transcript=new_transcript,
                )

            elif next_ == 'wrong_start__cut_on_previous_silence':
                prev_fragment = prev_fragments[-1]
                silence_before, _, _ = utils.transition_silences(prev_fragment, fragment, silences)
                fragment['begin'] = round(max(silence_before[1] - 0.35, silence_before[0]), 3)
                prev_fragment['end'] = round(min(silence_before[0] + 0.35, silence_before[1]), 3)
                cut_fragment_audio(prev_fragment, input_file=path_to_wav)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_start__cut_on_next_silence':
                prev_fragment = prev_fragments[-1]
                _, _, silence_after = utils.transition_silences(prev_fragment, fragment, silences)
                prev_fragment['end'] = round(min(silence_after[0] + 0.35, silence_after[1]), 3)
                fragment['begin'] = round(max(silence_after[1] - 0.35, silence_after[0]), 3)
                cut_fragment_audio(prev_fragment, input_file=path_to_wav)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_end__cut_on_previous_silence':
                next_fragment = next_fragments[0]
                silence_before, _, _ = utils.transition_silences(fragment, next_fragment, silences)
                fragment['end'] = round(min(silence_before[0] + 0.35, silence_before[1]), 3)
                next_fragment['begin'] = round(max(silence_before[1] - 0.35, silence_before[0]), 3)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                cut_fragment_audio(next_fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'wrong_end__cut_on_next_silence':
                next_fragment = next_fragments[0]
                _, _, silence_after = utils.transition_silences(fragment, next_fragment, silences)
                fragment['end'] = round(min(silence_after[0] + 0.35, silence_after[1]), 3)
                next_fragment['begin'] = round(max(silence_after[1] - 0.35, silence_after[0]), 3)
                cut_fragment_audio(fragment, input_file=path_to_wav)
                cut_fragment_audio(next_fragment, input_file=path_to_wav)
                todo.add(pool.submit(play_audio))
                todo.add(pool.submit(ask_what_next))
            elif next_ == 'approve':
                fragment['approved'] = True
            elif next_ == 'pass':
                # not offered in the menu above; kept as a no-op that clears approval
                fragment.pop('approved', None)
            elif next_ == 'disable':
                fragment['disabled'] = True
                fragment.pop('approved', None)
            elif next_ == 'enable':
                fragment['approved'] = True
                fragment.pop('disabled', None)
            elif next_ == 'quit':
                raise exceptions.QuitException
            else:
                raise NotImplementedError

        todo.add(pool.submit(ask_what_next))

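        # handlers may enqueue new futures (repeat playback, follow-up
        # prompts) while we wait, so keep polling until nothing is pending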
        while todo:
            for future in as_completed(list(todo)):
                todo.remove(future)
                future.result()

        pool.shutdown(wait=True)

    cut_fragments_audio(alignment, input_file=path_to_wav)

    # iterate over successive fragments
    start = start - 1  # the CLI numbers fragments from 1
    i = start
    done = False
    while i < len(alignment) and not done:
        fragment = alignment[i]

        if start != i and (fragment.get('approved') or fragment.get('disabled')):
            click.echo(f'skip fragment#{i} {fragment["text"]}')
            i += 1
            continue

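        # fast mode auto-approves every fragment except the first, the last,
        # and any fragment flagged with a warning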
        if fast and i != 0 and i < len(alignment) - 1 and not fragment.get('warn'):
            fragment.update(
                approved=True,
                approved_auto=True,
            )
            click.echo(f'approve fragment#{i} {fragment["text"]}')
            i += 1
            continue

        try:
            _check_alignment(index=i, alignment=alignment)
        except exceptions.ToggleFastModeException:
            fast = not fast
            continue
        except exceptions.GoBackException:
            i -= 1
            continue
        except exceptions.QuitException:
            try:
                audio_player.kill()
            except:
                pass
            exit(1)
        except exceptions.SplitException as e:
            fragment.pop('approved', None)
            fragment.pop('disabled', None)
            audio_start: float = alignment[e.start]['begin']
            audio_end: float = alignment[e.end]['end']

            with tempfile.NamedTemporaryFile(suffix='.wav') as file_:
                sox.trim(path_to_wav, file_.name, from_=audio_start, to=audio_end)

                sub_alignment = utils.build_alignment(
                    transcript=e.new_transcript,
                    path_to_audio=file_.name,
                    existing_alignment=[
                        dict(
                            text=f['text'],
                            begin=f['begin'] - audio_start,
                            end=f['end'] - audio_start,
                            approved=f.get('approved', False),
                            disabled=f.get('disabled', False),
                        )
                        for f in alignment[e.start:e.end+1]
                    ],
                    silences=[
                        [max(s_start - audio_start, 0.), s_end - audio_start]
                        for s_start, s_end in silences
                        if s_end > audio_start and s_start < audio_end
                    ],
                    generate_labels=False,
                    language=source['language'],
                )

            # shift the sub-alignment back to absolute timestamps
            for nf in sub_alignment:
                nf['begin'] += audio_start
                nf['end'] += audio_start

            alignment = (
                    alignment[:e.start] +
                    sub_alignment +
                    alignment[e.end+1:]
            )
            cut_fragments_audio(alignment, input_file=path_to_wav)
            i = e.start - 1  # resume at the first re-aligned fragment (the i += 1 below moves onto it)

        # save progress
        with open(path_to_alignment, 'w') as dest:
            to_save = deepcopy(alignment)
            for f in to_save:
                f.pop('warn', None)
            json.dump(
                obj=to_save,
                fp=dest,
                sort_keys=True,
                indent=2,
            )
        with open(path_to_transcript, 'w') as dest:
            dest.write('\n'.join(f['text'] for f in alignment) + '\n')

        i += 1
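check_alignment drives its whole interactive loop through custom exceptions; their required shape follows directly from how they are raised and caught above, so the assumed exceptions module can be sketched as follows (docstrings and attribute comments are mine):

class QuitException(Exception):
    """Abort the review session."""

class GoBackException(Exception):
    """Step back to the previous fragment."""

class ToggleFastModeException(Exception):
    """Flip the auto-approve fast mode."""

class SplitException(Exception):
    """The user edited the transcript of a window of fragments."""
    def __init__(self, start, end, new_transcript):
        super().__init__()
        self.start = start                    # index of the first affected fragment
        self.end = end                        # index of the last affected fragment
        self.new_transcript = new_transcript  # edited lines to re-align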
Example #5
def source_stats(source_name):
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    # generate the wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(
            from_=path_to_mp3,
            to=path_to_wav,
            rate=16000,
            channels=1
        )

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    if os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    # detect silences
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=utils.DEFAULT_VAD_FRAME_DURATION)

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )

    transitions_durations = []
    # duration of every fragment, including the first one
    fragments_durations = [f['end'] - f['begin'] for f in alignment]
    for prev_fragment, next_fragment in zip(alignment[:-1], alignment[1:]):
        silence_before, silence_between, silence_after = \
            utils.transition_silences(prev_fragment, next_fragment, silences)
        if silence_between:
            transitions_durations.append(silence_between[1] - silence_between[0])
    transitions = np.array(transitions_durations)
    fragments = np.array(fragments_durations)
    t_mean, t_std = transitions.mean(), transitions.std()
    f_mean, f_std = fragments.mean(), fragments.std()
    print('\n' + tabulate(
        [
            [
                'transition dur (s)',
                len(transitions_durations),
                round(t_mean, 3),
                round(t_std, 3),
                f'{round(t_mean - t_std, 3)} - {round(t_mean + t_std, 3)}',
                f'{round(t_mean - 2 * t_std, 3)} - {round(t_mean + 2 * t_std, 3)}',
                f'{max(0, round(t_mean - 3 * t_std, 3))} - {round(t_mean + 3 * t_std, 3)}',
                min(transitions_durations),
                max(transitions_durations),
            ],
            [
                'fragment dur (s)',
                len(fragments_durations),
                round(f_mean, 3),
                round(f_std, 3),
                f'{round(f_mean - f_std, 3)} - {round(f_mean + f_std, 3)}',
                f'{round(f_mean - 2 * f_std, 3)} - {round(f_mean + 2 * f_std, 3)}',
                f'{max(0, round(f_mean - 3 * f_std, 3))} - {round(f_mean + 3 * f_std, 3)}',
                min(fragments_durations),
                max(fragments_durations),
            ],
        ],
        headers=['Metric', 'count', 'avg', 'std', '68%', '95%', '99.7%', 'min', 'max'],
        tablefmt='pipe',
    ))
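The three interval columns are the one-, two- and three-sigma bands around the mean; they would cover roughly 68%, 95% and 99.7% of values if the durations were normally distributed. The row construction can be factored into a helper equivalent to the inline expressions above (hypothetical refactor; it clamps every lower bound at zero, which the original only does for the widest band):

import numpy as np

def stats_row(label, durations):
    arr = np.array(durations)
    mean, std = arr.mean(), arr.std()
    bands = [
        f'{max(0, round(mean - k * std, 3))} - {round(mean + k * std, 3)}'
        for k in (1, 2, 3)
    ]
    return [label, len(arr), round(mean, 3), round(std, 3), *bands,
            arr.min(), arr.max()]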
Example #6
def make_test(source_name, from_id, to_id, audio_rate):
    source = training_speech.get_source(source_name)
    path_to_alignment = os.path.join(CURRENT_DIR, f'data/alignments/{source_name}.json')
    path_to_transcript = os.path.join(CURRENT_DIR, f'data/transcripts/{source_name}.txt')
    path_to_mp3 = os.path.join(CURRENT_DIR, 'data/mp3', source['audio'])

    # generate the wav if it does not exist yet
    with open(path_to_mp3, 'rb') as f:
        file_hash = utils.hash_file(f)
    path_to_wav = os.path.join(utils.CACHE_DIR, f'{file_hash}.wav')
    if not os.path.exists(path_to_wav):
        ffmpeg.convert(
            from_=path_to_mp3,
            to=path_to_wav,
            rate=audio_rate,
            channels=1
        )

    # retrieve transcript
    with open(path_to_transcript) as f:
        transcript = [l.strip() for l in f.readlines()]
    transcript = [l for l in transcript if l]  # rm empty lines

    if os.path.isfile(path_to_alignment):
        with open(path_to_alignment) as f:
            existing_alignment = json.load(f)
    else:
        existing_alignment = []

    # detect silences
    silences = vad.list_silences(path_to_wav=path_to_wav, frame_duration=20)

    alignment = utils.build_alignment(
        transcript=transcript,
        path_to_audio=path_to_wav,
        existing_alignment=existing_alignment,
        silences=silences,
        generate_labels=True,
    )
    remaining = alignment[from_id - 1:to_id]

    with tempfile.NamedTemporaryFile(suffix='.wav') as file_:
        ffmpeg.cut(path_to_wav, file_.name, from_=remaining[0]['begin'], to=remaining[-1]['end'])
        with open(file_.name, 'rb') as f:
            file_hash = utils.hash_file(f)[:8]
        path_to_sub_audio = os.path.join(CURRENT_DIR, f'tests/assets/{file_hash}.wav')
        shutil.copy(file_.name, path_to_sub_audio)

    new_alignment = utils.build_alignment(
        transcript=[f['text'] for f in remaining],
        path_to_audio=path_to_sub_audio,
        existing_alignment=[],
        silences=vad.list_silences(path_to_wav=path_to_sub_audio, frame_duration=20),
        generate_labels=True,
    )

    print(BUILD_ALIGNMENT_TEST_TEMPLATE.format(
        file_hash=file_hash,
        source_name=source_name,
        from_=timedelta(seconds=remaining[0]['begin']),
        to=timedelta(seconds=remaining[-1]['end']),
        transcript='\n        '.join(f"'{f['text']}'," for f in new_alignment),
        alignment='\n        '.join("dict(begin={begin}, end={end}, text='{text}'),".format(**f) for f in new_alignment),
        vad_mode=utils.DEFAULT_VAD_MODE,
        vad_frame_duration=utils.DEFAULT_VAD_FRAME_DURATION,
    ))
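BUILD_ALIGNMENT_TEST_TEMPLATE is only visible through its .format(...) call; a minimal sketch consistent with those placeholders — the test naming and body are assumptions, only the placeholder names are taken from the call above:

BUILD_ALIGNMENT_TEST_TEMPLATE = '''
def test_alignment_{file_hash}():
    # extracted from {source_name} between {from_} and {to}
    transcript = [
        {transcript}
    ]
    expected = [
        {alignment}
    ]
    # ... align with vad_mode={vad_mode} / frame_duration={vad_frame_duration}
    # and compare against expected
'''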