Code Example #1
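The test examples below exercise c_FastDTWBD and assume a common test-module preamble along the lines of this sketch; the exact import of c_FastDTWBD is an assumption, since it depends on how the project's compiled extension is packaged.

import numpy as np
import pytest

# c_FastDTWBD is afaligner's compiled (Cython) implementation of the DTWBD algorithm;
# import it from wherever the built extension module lives in your environment.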
def test_perfect_match():
    s = np.arange(10, dtype='float64').reshape(-1,1)
    t = np.arange(10, dtype='float64').reshape(-1,1)
    distance, path = c_FastDTWBD(s, t, skip_penalty=100, radius=10)
    assert distance == pytest.approx(0.0)
    np.testing.assert_equal(path[:,0], np.arange(10))
    np.testing.assert_equal(path[:,1], np.arange(10))
Code Example #2
def test_all_to_one_match():
    s = 5 * np.ones(10, dtype='float64').reshape(-1,1)
    t = np.array([[5]], dtype='float64')
    distance, path = c_FastDTWBD(s, t, skip_penalty=1, radius=10)
    assert distance == pytest.approx(0.0)
    assert len(path) == 10
    np.testing.assert_equal(path[:,0], np.arange(10))
    np.testing.assert_equal(path[:,1], np.zeros(10))
Code Example #3
def test_perfect_match_in_the_middle():
    skip_penalty = 0.5
    s = np.arange(20, 80, dtype='float64').reshape(-1,1)
    t = np.arange(100, dtype='float64').reshape(-1,1)
    distance, path = c_FastDTWBD(s, t, skip_penalty=skip_penalty, radius=100)
    # s matches t[20:80] exactly, so the only cost is skipping
    # the 40 unmatched frames of t, each at skip_penalty
    assert distance == pytest.approx((len(t) - len(s)) * skip_penalty)
    assert len(path) == len(s)
    np.testing.assert_equal(path[:,0], np.arange(60))
    np.testing.assert_equal(path[:,1], np.arange(20, 80))
Code Example #4
def test_allocate_large_matrix():
    s = np.arange(100000, dtype='float64').reshape(-1,1)
    t = np.arange(100000, dtype='float64').reshape(-1,1)
    c_FastDTWBD(s, t, skip_penalty=0.5, radius=100)
Code Example #5
def test_no_match():
    s = np.arange(10, dtype='float64').reshape(-1,1)
    t = np.arange(10, 20, dtype='float64').reshape(-1,1)
    distance, path = c_FastDTWBD(s, t, skip_penalty=0, radius=10)
    assert distance == pytest.approx(0.0)
    assert len(path) == 0
Code Example #6
File: __init__.py    Project: r4victor/afaligner
def build_sync_map(
    text_paths, audio_paths, tmp_dir,
    sync_map_text_path_prefix, sync_map_audio_path_prefix,
    skip_penalty, radius
):
    """
    This is an algorithm for building a sync map.
    It synthesizes the text and then aligns the synthesized audio with the recorded audio
    using a variation of the DTW (Dynamic Time Warping) algorithm.

    The main features of this algorithm are:
    1) It can handle structural differences at the beginning and at the end of files.
    2) It finds an approximation to an optimal warping path in linear time and space using the FastDTW approach.

    Note that while the algorithm does not require a one-to-one correspondence
    between text and audio files (i.e. the splitting can be done differently),
    the quality of the result is sensitive to the choice of the skip_penalty and radius parameters,
    so it is recommended to have such a correspondence.

    Alignment details:
    Synthesized and recorded audio are represented as sequences of MFCC frames.
    These sequences are aligned using a variation of the DTW algorithm.
    In contrast to classic DTW, this algorithm can be used
    to align sequences with structural differences at the beginning or at the end.

    Steps to build a sync map:
    1) Synthesize the text file and produce a list of anchors.
    Each anchor marks the start of the corresponding text fragment in the synthesized audio.
    2) Get the sequences of MFCC frames of the synthesized and recorded audio.
    3) Get their warping path by calling the alignment algorithm.
    4) Check whether extra content is found and calculate the mapping boundaries.
    5) Map the anchors inside the boundaries to the recorded MFCC sequence using the warping path from step 3.
    6) Start over, considering the following:
    If there is extra content at the end of the synthesized sequence, align it with the next audio file.
    If there is extra content at the end of the recorded sequence, align it with the next text file.
    If both sequences have extra content at the end, align the text tail with the next audio file.
    If none of the above, align the next text and audio files.
    """

    synthesizer = Synthesizer()
    parse_parameters = {'is_text_unparsed_id_regex': 'f[0-9]+'}
    
    sync_map = {}
    process_next_text = True
    process_next_audio = True

    while True:
        if process_next_text:
            try:
                text_path = next(text_paths)
            except StopIteration:
                break

            text_name = get_name_from_path(text_path)
            output_text_name = os.path.join(sync_map_text_path_prefix, text_name)
            textfile = TextFile(text_path, file_format=TextFileFormat.UNPARSED, parameters=parse_parameters)
            textfile.set_language(Language.ENG)
            text_wav_path = os.path.join(tmp_dir, f'{drop_extension(text_name)}_text.wav')
            sync_map[output_text_name] = {}

            # Produce synthesized audio, get anchors
            anchors,_,_ = synthesizer.synthesize(textfile, text_wav_path)
            
            # Get fragments, convert anchor timings to frame indices
            fragments = [a[1] for a in anchors]
            anchors = np.array([int(a[0] / TimeValue('0.040')) for a in anchors])

            # The MFCC frame sequence is laid out in memory as an n x l 2D array,
            # where n is the number of frames and l is the number of MFCCs,
            # i.e. it is C-contiguous, but after dropping the first coefficient it ceases to be C-contiguous.
            # Should decide whether to make a copy or to work around the first coefficient.
            text_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(text_wav_path).all_mfcc.T[:, 1:]
            )
            
        if process_next_audio:
            try:
                audio_path = next(audio_paths)
            except StopIteration:
                break

            audio_name = get_name_from_path(audio_path)
            output_audio_name = os.path.join(sync_map_audio_path_prefix, audio_name)
            audio_wav_path = os.path.join(tmp_dir, f'{drop_extension(audio_name)}_audio.wav')
            subprocess.run(['ffmpeg', '-n', '-i', audio_path, audio_wav_path])

            audio_mfcc_sequence = np.ascontiguousarray(
                AudioFileMFCC(audio_wav_path).all_mfcc.T[:, 1:]
            )
            
            # Keep track of the audio offset to calculate frame timings
            audio_start_frame = 0
        
        n = len(text_mfcc_sequence)
        m = len(audio_mfcc_sequence)

        _, path = c_FastDTWBD(text_mfcc_sequence, audio_mfcc_sequence, skip_penalty, radius=radius)
        
        if len(path) == 0:
            print(
                f'No match between {text_name} and {audio_name}. '
                f'Alignment is terminated. '
                f'Adjust skip_penalty or input files.'
            )
            return {}
        
        # Project path to the text and audio sequences
        text_path_frames = path[:,0]
        audio_path_frames = path[:,1]
        
        last_matched_audio_frame = audio_path_frames[-1]

        # Find first and last matched frames
        first_matched_text_frame = text_path_frames[0]
        last_matched_text_frame = text_path_frames[-1]

        # Map only those fragments that intersect matched frames
        anchors_boundary_indices = np.searchsorted(
            anchors, [first_matched_text_frame, last_matched_text_frame]
        )
        map_anchors_from = max(anchors_boundary_indices[0] - 1, 0)
        map_anchors_to = anchors_boundary_indices[1]
        anchors_to_map = anchors[map_anchors_from:map_anchors_to]
        fragments_to_map = fragments[map_anchors_from:map_anchors_to]

        # Get anchor indices in the path projection to the text sequence
        text_path_anchor_indices = np.searchsorted(text_path_frames, anchors_to_map)
        
        # Get the anchors' frames in the audio sequence, calculate their timings
        anchors_matched_frames = audio_path_frames[text_path_anchor_indices]
        timings = (np.append(anchors_matched_frames, audio_path_frames[-1]) + audio_start_frame) * 0.040
        
        # Map fragment_ids to timings, update mapping of the current text file
        fragment_map = {
            f: {
                'audio_file': output_audio_name,
                'begin_time': time_to_str(bt),
                'end_time': time_to_str(et)
            }
            for f, bt, et in zip(fragments_to_map, timings[:-1], timings[1:])
        }

        sync_map[output_text_name].update(fragment_map)
        
        # Decide whether to process next file or to align the tail of the current one

        if map_anchors_to == len(anchors):
            # Process next text if no fragments are left
            process_next_text = True
        else:
            # Otherwise align tail of the current text
            process_next_text = False
            text_mfcc_sequence = text_mfcc_sequence[last_matched_text_frame:]
            fragments = fragments[map_anchors_to:]
            anchors = anchors[map_anchors_to:] - last_matched_text_frame
            
        if last_matched_audio_frame == m - 1 or not process_next_text:
            # Process the next audio if there are no unmatched audio frames in the tail
            # or if there are more text fragments to map, i.e.
            # we choose to process the next audio when we cannot decide.
            # This strategy is correct if there is no extra content at the end.
            process_next_audio = True
        else:
            # Otherwise align tail of the current audio
            process_next_audio = False
            audio_mfcc_sequence = audio_mfcc_sequence[last_matched_audio_frame:]
            audio_start_frame += last_matched_audio_frame
    
    return sync_map
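
For context, a hypothetical invocation of build_sync_map might look like the sketch below. The directory layout, file extensions, and the skip_penalty/radius values are illustrative assumptions, not values prescribed by the project.

import glob

# build_sync_map pulls paths with next(), so plain iterators are passed in.
text_paths = iter(sorted(glob.glob('book/text/*.xhtml')))
audio_paths = iter(sorted(glob.glob('book/audio/*.mp3')))

sync_map = build_sync_map(
    text_paths, audio_paths, tmp_dir='tmp',   # tmp_dir must exist; intermediate WAV files go there
    sync_map_text_path_prefix='text', sync_map_audio_path_prefix='audio',
    skip_penalty=0.75, radius=100,
)

# The result maps each output text name to its fragment ids, each with the matched
# audio file and formatted begin/end times, as built in fragment_map above.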