Exemple #1
0
def sample_multi_frame_dataset(tmpdir):
    """
    Create a ``feeding.MultiFrameDataset`` on top of two freshly written
    containers (inputs and targets) located in ``tmpdir``.

    Inputs hold 4 values per frame, targets 2 values per frame. Both
    containers store the same number of frames for every utterance.

    Args:
        tmpdir: Temporary directory object exposing ``strpath``.

    Returns:
        The dataset built from the test corpus and both containers, with
        ``4`` passed as third argument.
    """
    inputs_path = os.path.join(tmpdir.strpath, 'inputs.hdf5')
    targets_path = os.path.join(tmpdir.strpath, 'targets.hdf5')

    corpus = resources.create_dataset()
    container_inputs = containers.Container(inputs_path)
    container_targets = containers.Container(targets_path)

    container_inputs.open()
    container_targets.open()

    # (utterance id, number of frames)
    frame_counts = (
        ('utt-1', 15),
        ('utt-2', 20),
        ('utt-3', 11),
        ('utt-4', 3),
        ('utt-5', 4),
    )

    for utt_id, n_frames in frame_counts:
        container_inputs.set(utt_id,
                             np.arange(n_frames * 4).reshape(n_frames, 4))

    for utt_id, n_frames in frame_counts:
        container_targets.set(utt_id,
                              np.arange(n_frames * 2).reshape(n_frames, 2))

    both_containers = [container_inputs, container_targets]
    return feeding.MultiFrameDataset(corpus, both_containers, 4)
    def test_scan_computes_correct_size_for_multiple_containers(self, tmpdir):
        """
        ``_scan`` must report, for every utterance, the byte size of its
        float32 data summed over all three containers.
        """
        utt_ids = ['utt-1', 'utt-2', 'utt-3']
        # Rows stored per utterance, one tuple per container (c1, c2, c3).
        row_counts = [(6, 2, 9), (2, 1, 4), (1, 3, 8)]

        conts = [
            containers.Container(os.path.join(tmpdir.strpath, file_name))
            for file_name in ('c1.h5', 'c2.h5', 'c3.h5')
        ]

        for cont, rows in zip(conts, row_counts):
            cont.open()

            for utt_id, n_rows in zip(utt_ids, rows):
                values = np.random.random((n_rows, 6)).astype(np.float32)
                cont.set(utt_id, values)

        loader = partitioning.PartitioningContainerLoader(
            list(utt_ids), conts, '1000', shuffle=True, seed=88)

        sizes = loader._scan()

        # Every row consists of 6 float32 values.
        row_bytes = 6 * np.dtype(np.float32).itemsize
        expected_sizes = {
            'utt-1': (6 + 2 + 1) * row_bytes,
            'utt-2': (2 + 1 + 3) * row_bytes,
            'utt-3': (9 + 4 + 8) * row_bytes,
        }

        assert sizes == expected_sizes
    def test_get_lengths_returns_correct_lengths_for_multiple_containers(
            self, tmpdir):
        """
        ``_get_all_lengths`` must return, per utterance, a tuple holding the
        number of rows stored in each of the three containers.
        """
        utt_ids = ['utt-1', 'utt-2', 'utt-3']
        # Rows stored per utterance, one tuple per container (c1, c2, c3).
        row_counts = [(6, 2, 9), (2, 1, 4), (1, 3, 8)]

        conts = [
            containers.Container(os.path.join(tmpdir.strpath, file_name))
            for file_name in ('c1.h5', 'c2.h5', 'c3.h5')
        ]

        for cont, rows in zip(conts, row_counts):
            cont.open()

            for utt_id, n_rows in zip(utt_ids, rows):
                values = np.random.random((n_rows, 6)).astype(np.float32)
                cont.set(utt_id, values)

        loader = partitioning.PartitioningContainerLoader(
            list(utt_ids), conts, '1000', shuffle=True, seed=88)

        lengths = loader._get_all_lengths()

        expected_lengths = (
            ('utt-1', (6, 2, 1)),
            ('utt-2', (2, 1, 3)),
            ('utt-3', (9, 4, 8)),
        )

        assert len(lengths) == 3

        for utt_id, per_container in expected_lengths:
            assert lengths[utt_id] == per_container
Exemple #4
0
    def test_init_missing_utterance_in_container_raises_error(self, tmpdir):
        """
        Creating a ``feeding.Dataset`` must raise a ``ValueError`` when the
        container lacks one of the requested utterances ('utt-2' here).
        """
        c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
        c.open()
        c.set('utt-1', data=np.arange(20))
        c.set('utt-3', data=np.arange(20))

        # Hand over the container prepared above. Previously an unrelated,
        # never-opened Container('blub') was passed, which left ``c`` unused
        # and did not exercise the missing-utterance case at all.
        with pytest.raises(ValueError):
            feeding.Dataset(['utt-1', 'utt-2', 'utt-3'], [c])
    def test_reload_creates_different_partitions_on_second_run(self, tmpdir):
        """
        With shuffling enabled, ``reload`` must produce a partitioning that
        differs from the previous one, either in the number of partitions or
        in the utterances assigned to at least one partition.
        """
        c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        c1.open()
        c1.set('utt-1', np.random.random((6, 6)).astype(np.float32))
        c1.set('utt-2', np.random.random((2, 6)).astype(np.float32))
        c1.set('utt-3', np.random.random((9, 6)).astype(np.float32))
        c1.set('utt-4', np.random.random((2, 6)).astype(np.float32))
        c1.set('utt-5', np.random.random((5, 6)).astype(np.float32))

        loader = partitioning.PartitioningContainerLoader(
            ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
            c1,
            '250',
            shuffle=True,
            seed=100)

        partitions_one = loader.partitions
        loader.reload()
        partitions_two = loader.partitions

        # A different number of partitions already proves a change. The
        # former check used ``==`` here, which made the test pass vacuously
        # whenever both runs produced the same number of partitions.
        if len(partitions_one) != len(partitions_two):
            return

        # Same number of partitions: at least one of them has to contain
        # different utterances (or the same ones in a different order).
        assert any(
            x.utt_ids != y.utt_ids
            for x, y in zip(partitions_one, partitions_two))
Exemple #6
0
    def test_next_emits_all_features_if_partition_spans_multiple_data_sets_in_random_order(
            self, tmpdir):
        """
        A shuffled ``FrameIterator`` (seed 333) over three utterances must
        emit all seven frames in the expected pseudo-random order.
        """
        # Every frame is a vector of five identical values.
        frame_values = (
            ('utt-1', (0.1, 0.2)),
            ('utt-2', (0.3, 0.4, 0.5)),
            ('utt-3', (0.6, 0.7)),
        )

        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()

        for utt_id, values in frame_values:
            cont.set(utt_id, np.array([[value] * 5 for value in values]))

        frame_iterator = iterator.FrameIterator(['utt-1', 'utt-2', 'utt-3'],
                                                [cont],
                                                240,
                                                shuffle=True,
                                                seed=333)
        frames = tuple(frame_iterator)

        expected_order = (0.5, 0.3, 0.4, 0.2, 0.1, 0.6, 0.7)

        assert len(frames) == 7

        for position, value in enumerate(expected_order):
            assert np.allclose([value] * 5, frames[position][0])
Exemple #7
0
    def test_next_emits_chunks_with_length(self, tmpdir):
        """
        A shuffled ``MultiFrameIterator`` created with ``return_length=True``
        must emit chunks of at most two frames together with the number of
        frames each chunk contains.
        """
        # Every frame is a vector of five identical values.
        frame_values = (
            ('utt-1', (0.1, 0.2)),
            ('utt-2', (0.3, 0.4, 0.5)),
            ('utt-3', (0.6, )),
        )

        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()

        for utt_id, values in frame_values:
            cont.set(utt_id, np.array([[value] * 5 for value in values]))

        chunk_iterator = iterator.MultiFrameIterator(
            ['utt-1', 'utt-2', 'utt-3'], [cont],
            120,
            2,
            return_length=True,
            shuffle=True,
            seed=6)
        frames = tuple(chunk_iterator)

        # Frame values of every expected chunk, in emission order.
        expected_chunks = ((0.5, ), (0.3, 0.4), (0.6, ), (0.1, 0.2))

        assert len(frames) == 4

        for position, chunk in enumerate(expected_chunks):
            expected_data = [[value] * 5 for value in chunk]
            assert np.allclose(expected_data, frames[position][0])

        for position, chunk in enumerate(expected_chunks):
            assert frames[position][1] == len(chunk)
    def test_reload_creates_correct_partitions(self, tmpdir):
        """
        Without shuffling, five float32 utterances with a partition size of
        '250' must be split into three partitions in utterance order.
        """
        cont = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        cont.open()

        # (utterance id, number of rows); every row has 6 float32 values.
        for utt_id, n_rows in (('utt-1', 6), ('utt-2', 2), ('utt-3', 9),
                               ('utt-4', 2), ('utt-5', 5)):
            cont.set(utt_id,
                     np.random.random((n_rows, 6)).astype(np.float32))

        loader = partitioning.PartitioningContainerLoader(
            ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
            cont,
            '250',
            shuffle=False)

        # (utt_ids, utt_lengths, size) for every expected partition.
        expected = (
            (['utt-1', 'utt-2'], [(6, ), (2, )], 192),
            (['utt-3'], [(9, )], 216),
            (['utt-4', 'utt-5'], [(2, ), (5, )], 168),
        )

        assert len(loader.partitions) == 3

        for index, (utt_ids, utt_lengths, size) in enumerate(expected):
            partition = loader.partitions[index]
            assert partition.utt_ids == utt_ids
            assert partition.utt_lengths == utt_lengths
            assert partition.size == size
Exemple #9
0
    def test_next_emits_no_frames_if_file_is_empty(self, tmpdir):
        """
        A ``FrameIterator`` over an empty container and an empty utterance
        list must not emit anything.
        """
        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()

        emitted = tuple(iterator.FrameIterator([], [cont], 120))

        assert len(emitted) == 0
Exemple #10
0
    def test_next_emits_features_only_from_included_ds_in_random_order(
            self, tmpdir):
        """
        A shuffled ``FrameIterator`` (seed 1236) restricted to 'utt-1' and
        'utt-3' must emit only the frames of these two utterances, although
        the container also holds 'utt-2'.
        """
        # Every frame is a vector of five identical values.
        frame_values = (
            ('utt-1', (0.1, 0.2)),
            ('utt-2', (0.3, 0.4, 0.5)),
            ('utt-3', (0.6, 0.7)),
        )

        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()

        for utt_id, values in frame_values:
            cont.set(utt_id, np.array([[value] * 5 for value in values]))

        frame_iterator = iterator.FrameIterator(['utt-1', 'utt-3'], [cont],
                                                120,
                                                shuffle=True,
                                                seed=1236)
        frames = tuple(frame_iterator)

        expected_order = (0.1, 0.2, 0.7, 0.6)

        assert len(frames) == 4

        for position, value in enumerate(expected_order):
            assert np.allclose([value] * 5, frames[position][0])
Exemple #11
0
def sample_container():
    """
    Yield the opened feature container from the ``sample_files`` resources
    and close it once the consumer resumes the generator.
    """
    path_parts = ['sample_files', 'feat_container']
    cnt = containers.Container(resources.get_resource_path(path_parts))

    cnt.open()
    yield cnt
    cnt.close()
Exemple #12
0
    def test_load_partition_data(self, tmpdir):
        """
        ``load_partition_data`` must return, for every partition index, the
        utterance ids of that partition together with the stored data.
        """
        cont = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        cont.open()

        # (utterance id, number of rows); every row has 6 float32 values.
        row_counts = (('utt-1', 6), ('utt-2', 2), ('utt-3', 9), ('utt-4', 2),
                      ('utt-5', 5))

        stored = {}
        for utt_id, n_rows in row_counts:
            stored[utt_id] = np.random.random((n_rows, 6)).astype(np.float32)

        for utt_id, _ in row_counts:
            cont.set(utt_id, stored[utt_id])

        loader = partitioning.PartitioningContainerLoader(
            ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
            cont,
            '250',
            shuffle=False)

        # Utterance ids expected in partition 0, 1 and 2.
        expected_partitions = (
            ['utt-1', 'utt-2'],
            ['utt-3'],
            ['utt-4', 'utt-5'],
        )

        for part_index, utt_ids in enumerate(expected_partitions):
            part = loader.load_partition_data(part_index)
            assert part.info.utt_ids == utt_ids

            for position, utt_id in enumerate(utt_ids):
                assert np.allclose(part.utt_data[position], stored[utt_id])
Exemple #13
0
    def test_next_emits_no_features_if_data_set_is_empty(self, tmpdir):
        """
        A ``FrameIterator`` over an utterance that stores an empty array must
        not emit anything.
        """
        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()
        cont.set('utt-1', np.array([]))

        emitted = tuple(iterator.FrameIterator(['utt-1'], [cont], 120))

        assert len(emitted) == 0
Exemple #14
0
    def test_init_with_utterance_list(self, tmpdir):
        """
        A ``feeding.Dataset`` created from a list of utterance ids must
        expose exactly these ids via ``utt_ids``.
        """
        container_path = os.path.join(tmpdir.strpath, 'test.h5')
        cont = containers.Container(container_path)
        cont.open()

        for utt_id in ('utt-1', 'utt-2'):
            cont.set(utt_id, data=np.arange(20))

        dataset = feeding.Dataset(['utt-1', 'utt-2'], [cont])

        assert dataset.utt_ids == ['utt-1', 'utt-2']
Exemple #15
0
    def test_container_has_utterances_returns_false_if_one_is_missing(
            self, tmpdir):
        """
        ``Dataset.container_has_utterances`` must be falsy when the container
        lacks one of the given utterances ('utt-2' here).
        """
        container_path = os.path.join(tmpdir.strpath, 'test.h5')
        cont = containers.Container(container_path)
        cont.open()

        for utt_id in ('utt-1', 'utt-3'):
            cont.set(utt_id, data=np.arange(20))

        has_all = feeding.Dataset.container_has_utterances(
            cont, ['utt-1', 'utt-2', 'utt-3'])

        assert not has_all
Exemple #16
0
    def test_raises_error_if_utt_is_missing_in_container(self, tmpdir):
        """
        Creating a ``PartitioningContainerLoader`` must raise a
        ``ValueError`` if a requested utterance ('utt-2' here) is not stored
        in the container.
        """
        cont = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        cont.open()

        # (utterance id, number of rows); 'utt-2' is left out on purpose.
        for utt_id, n_rows in (('utt-1', 6), ('utt-3', 9)):
            cont.set(utt_id,
                     np.random.random((n_rows, 6)).astype(np.float32))

        requested = ['utt-1', 'utt-2', 'utt-3']

        with pytest.raises(ValueError):
            partitioning.PartitioningContainerLoader(
                requested, cont, '250', shuffle=True, seed=88)
Exemple #17
0
    def test_reload_creates_no_partition_with_no_utterances(self, tmpdir):
        """
        A loader created with an empty utterance list must not contain any
        partition.
        """
        cont = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        cont.open()

        no_utterances = []
        loader = partitioning.PartitioningContainerLoader(
            no_utterances, cont, '250', shuffle=False)

        assert len(loader.partitions) == 0
Exemple #18
0
    def test_init_with_corpus_view(self):
        """
        A ``feeding.DataIterator`` created from a subview must expose exactly
        the utterance ids contained in that subview.
        """
        id_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs={'utt-1', 'utt-2', 'utt-4'})
        subview = subset.Subview(resources.create_dataset(),
                                 filter_criteria=[id_filter])

        data_iterator = feeding.DataIterator(subview,
                                             [containers.Container('blub')])

        expected_ids = set(subview.utterances.keys())
        assert set(data_iterator.utt_ids) == expected_ids
Exemple #19
0
    def test_append_with_different_dimension_raises_error(self, tmpdir):
        """
        Appending data whose trailing dimensions differ from the data already
        stored for the utterance must raise a ``ValueError``.
        """
        path = os.path.join(tmpdir.strpath, 'container')
        tmp_container = containers.Container(path)
        tmp_container.open()

        # Close the container even if a call or the expectation below fails,
        # so a failing test does not leave the file open.
        try:
            tmp_container.append('utt-1', np.arange(20).reshape(5, 2, 2))

            # Trailing shape (2, 3) does not match the stored (2, 2).
            with pytest.raises(ValueError):
                tmp_container.append('utt-1', np.arange(42).reshape(7, 2, 3))
        finally:
            tmp_container.close()
Exemple #20
0
    def test_init_with_corpus(self, tmpdir):
        """
        A ``feeding.Dataset`` created from the test corpus must expose the
        ids 'utt-1' to 'utt-5' via ``utt_ids``.
        """
        all_utt_ids = ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5']

        container_path = os.path.join(tmpdir.strpath, 'test.h5')
        cont = containers.Container(container_path)
        cont.open()

        for utt_id in all_utt_ids:
            cont.set(utt_id, data=np.arange(20))

        dataset = feeding.Dataset(resources.create_dataset(), [cont])

        assert dataset.utt_ids == all_utt_ids
Exemple #21
0
def container_dim_x(tmpdir):
    """
    Create and open a container in ``tmpdir`` holding a one-dimensional
    integer range for each of the utterances 'utt-1' to 'utt-5'.

    Args:
        tmpdir: Temporary directory object exposing ``strpath``.

    Returns:
        The opened container (it is not closed here).
    """
    cnt = containers.Container(os.path.join(tmpdir.strpath, 'outputs.hdf5'))
    cnt.open()

    # (utterance id, number of values)
    value_counts = (
        ('utt-1', 6),
        ('utt-2', 8),
        ('utt-3', 4),
        ('utt-4', 2),
        ('utt-5', 6),
    )

    for utt_id, n_values in value_counts:
        cnt.set(utt_id, np.arange(n_values))

    return cnt
Exemple #22
0
    def test_append(self, tmpdir):
        """
        Appending two consecutive slices of an array to the same utterance
        must result in the complete array being stored.
        """
        path = os.path.join(tmpdir.strpath, 'container')
        tmp_container = containers.Container(path)
        tmp_container.open()

        data = np.arange(100).reshape(20, 5)

        # Close the container even if a call or the assertion below fails,
        # so a failing test does not leave the file open.
        try:
            tmp_container.append('utt-1', data[:8])
            tmp_container.append('utt-1', data[8:])

            res = tmp_container.get('utt-1', mem_map=False)

            assert np.array_equal(data, res)
        finally:
            tmp_container.close()
Exemple #23
0
def sample_frame_dataset(tmpdir):
    """
    Create a ``feeding.FrameDataset`` on top of two freshly written
    containers (inputs and targets) located in ``tmpdir``.

    Both containers store 4 values per frame; every target array equals the
    corresponding input array shifted by 10.

    Args:
        tmpdir: Temporary directory object exposing ``strpath``.

    Returns:
        The dataset built from the test corpus and both containers.
    """
    inputs_path = os.path.join(tmpdir.strpath, 'inputs.hdf5')
    targets_path = os.path.join(tmpdir.strpath, 'targets.hdf5')

    corpus = resources.create_dataset()
    container_inputs = containers.Container(inputs_path)
    container_targets = containers.Container(targets_path)

    container_inputs.open()
    container_targets.open()

    # (utterance id, number of frames)
    frame_counts = (
        ('utt-1', 5),
        ('utt-2', 7),
        ('utt-3', 9),
        ('utt-4', 2),
        ('utt-5', 4),
    )

    for utt_id, n_frames in frame_counts:
        container_inputs.set(utt_id,
                             np.arange(n_frames * 4).reshape(n_frames, 4))

    for utt_id, n_frames in frame_counts:
        container_targets.set(
            utt_id,
            np.arange(n_frames * 4).reshape(n_frames, 4) + 10)

    both_containers = [container_inputs, container_targets]
    return feeding.FrameDataset(corpus, both_containers)
Exemple #24
0
    def test_scan_computes_correct_size_for_one_container(self, tmpdir):
        """
        ``_scan`` must report, for every utterance, the byte size of its
        float32 data stored in the single container.
        """
        # (utterance id, number of rows); every row has 6 float32 values.
        row_counts = (('utt-1', 6), ('utt-2', 2), ('utt-3', 9))

        cont = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
        cont.open()

        for utt_id, n_rows in row_counts:
            cont.set(utt_id,
                     np.random.random((n_rows, 6)).astype(np.float32))

        loader = partitioning.PartitioningContainerLoader(
            ['utt-1', 'utt-2', 'utt-3'], cont, '250', shuffle=True, seed=88)

        sizes = loader._scan()

        item_bytes = np.dtype(np.float32).itemsize
        expected_sizes = {
            utt_id: n_rows * 6 * item_bytes
            for utt_id, n_rows in row_counts
        }

        assert sizes == expected_sizes
Exemple #25
0
    def test_init_with_corpus_view(self, tmpdir):
        """
        A ``feeding.Dataset`` created from a subview must expose only the
        utterance ids matched by the subview's filter.
        """
        container_path = os.path.join(tmpdir.strpath, 'test.h5')
        cont = containers.Container(container_path)
        cont.open()

        for utt_id in ('utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'):
            cont.set(utt_id, data=np.arange(20))

        corpus = resources.create_dataset()
        id_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs={'utt-1', 'utt-2', 'utt-4'})
        subview = subset.Subview(corpus, filter_criteria=[id_filter])

        dataset = feeding.Dataset(subview, [cont])

        assert dataset.utt_ids == ['utt-1', 'utt-2', 'utt-4']
Exemple #26
0
    def test_next_emits_all_features_in_sequential_order(self, tmpdir):
        """
        Without shuffling, a ``FrameIterator`` must emit the frames of all
        utterances in the order they are stored.
        """
        # Every frame is a vector of five identical values.
        frame_values = (
            ('utt-1', (0.1, 0.2)),
            ('utt-2', (0.3, 0.4, 0.5)),
        )

        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'features.h5'))
        cont.open()

        for utt_id, values in frame_values:
            cont.set(utt_id, np.array([[value] * 5 for value in values]))

        frame_iterator = iterator.FrameIterator(['utt-1', 'utt-2'], [cont],
                                                120,
                                                shuffle=False)
        frames = tuple(frame_iterator)

        expected_order = (0.1, 0.2, 0.3, 0.4, 0.5)

        assert len(frames) == 5

        for position, value in enumerate(expected_order):
            assert np.allclose([value] * 5, frames[position][0])
Exemple #27
0
    def encode_corpus(self, corpus, output_path):
        """
        Encode all utterances of the given corpus and store them in a :class:`audiomate.container.Container`.

        The container is closed before this method returns, also when
        encoding or storing an utterance raises an exception.

        Args:
            corpus (Corpus): The corpus to process.
            output_path (str): The path to store the container with the encoded data.

        Returns:
            Container: The (closed) container with the encoded data.
        """

        out_container = containers.Container(output_path)
        out_container.open()

        # Make sure the container does not stay open if encoding or writing
        # fails for one of the utterances.
        try:
            for utterance in corpus.utterances.values():
                data = self.encode_utterance(utterance, corpus=corpus)
                out_container.set(utterance.idx, data)
        finally:
            out_container.close()

        return out_container
Exemple #28
0
 def test_init_with_corpus(self):
     """
     A ``feeding.DataIterator`` created from a corpus must expose exactly
     the utterance ids of that corpus.
     """
     corpus = resources.create_dataset()
     unopened = containers.Container('blub')

     data_iterator = feeding.DataIterator(corpus, [unopened])

     expected_ids = set(corpus.utterances.keys())
     assert set(data_iterator.utt_ids) == expected_ids
Exemple #29
0
 def test_init_with_utterance_list(self):
     """
     A ``feeding.DataIterator`` created from a list of utterance ids must
     expose exactly these ids.
     """
     requested_ids = ['utt-1', 'utt-2']
     unopened = containers.Container('blub')

     data_iterator = feeding.DataIterator(requested_ids, [unopened])

     assert set(data_iterator.utt_ids) == {'utt-1', 'utt-2'}