def sample_multi_frame_dataset(tmpdir):
    """Build a MultiFrameDataset (chunk size 4) over five utterances.

    Inputs are (n, 4) arrays, targets are (n, 2) arrays with matching
    frame counts per utterance.
    """
    in_path = os.path.join(tmpdir.strpath, 'inputs.hdf5')
    tgt_path = os.path.join(tmpdir.strpath, 'targets.hdf5')

    corpus = resources.create_dataset()

    inputs = containers.Container(in_path)
    targets = containers.Container(tgt_path)
    inputs.open()
    targets.open()

    frame_counts = [('utt-1', 15), ('utt-2', 20), ('utt-3', 11),
                    ('utt-4', 3), ('utt-5', 4)]

    for utt_id, n in frame_counts:
        inputs.set(utt_id, np.arange(n * 4).reshape(n, 4))

    for utt_id, n in frame_counts:
        targets.set(utt_id, np.arange(n * 2).reshape(n, 2))

    return feeding.MultiFrameDataset(corpus, [inputs, targets], 4)
def test_scan_computes_correct_size_for_multiple_containers(self, tmpdir):
    """_scan sums the per-utterance byte sizes across all containers."""
    per_cont_lengths = {
        'utt-1': (6, 2, 1),
        'utt-2': (2, 1, 3),
        'utt-3': (9, 4, 8),
    }

    conts = []
    for index in range(3):
        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'c{}.h5'.format(index + 1)))
        cont.open()
        for utt_id, lengths in per_cont_lengths.items():
            cont.set(utt_id, np.random.random(
                (lengths[index], 6)).astype(np.float32))
        conts.append(cont)

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3'], conts, '1000', shuffle=True, seed=88)

    sizes = loader._scan()

    item_size = np.dtype(np.float32).itemsize
    assert sizes == {
        utt_id: sum(lengths) * 6 * item_size
        for utt_id, lengths in per_cont_lengths.items()
    }
def test_get_lengths_returns_correct_lengths_for_multiple_containers(
        self, tmpdir):
    """_get_all_lengths reports per-container frame counts per utterance."""
    expected_lengths = {
        'utt-1': (6, 2, 1),
        'utt-2': (2, 1, 3),
        'utt-3': (9, 4, 8),
    }

    conts = []
    for index in range(3):
        cont = containers.Container(
            os.path.join(tmpdir.strpath, 'c{}.h5'.format(index + 1)))
        cont.open()
        for utt_id, lengths in expected_lengths.items():
            cont.set(utt_id, np.random.random(
                (lengths[index], 6)).astype(np.float32))
        conts.append(cont)

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3'], conts, '1000', shuffle=True, seed=88)

    lengths = loader._get_all_lengths()

    assert lengths == expected_lengths
def test_init_missing_utterance_in_container_raises_error(self, tmpdir):
    """Dataset construction fails when the container lacks a requested id.

    Bug fix: the original populated container ``c`` (with 'utt-2'
    deliberately missing) but then passed a fresh, unopened
    ``containers.Container('blub')`` to ``feeding.Dataset`` — the prepared
    fixture was never exercised, so the test could pass for the wrong
    reason. Pass ``[c]`` so the missing-utterance check is actually hit.
    """
    c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    c.open()
    c.set('utt-1', data=np.arange(20))
    c.set('utt-3', data=np.arange(20))  # 'utt-2' deliberately missing

    with pytest.raises(ValueError):
        feeding.Dataset(['utt-1', 'utt-2', 'utt-3'], [c])
def test_reload_creates_different_partitions_on_second_run(self, tmpdir):
    """After reload() with shuffling enabled, the partition layout differs.

    Bug fix: the original computed ``len_changed = len(p1) == len(p2)``
    (equality, i.e. "length is the SAME") and then trivially asserted
    ``True`` in that branch — so whenever both runs produced the same
    number of partitions (the common case) the test checked nothing.
    Now: a differing partition count is itself proof of change; otherwise
    at least one partition must order its utterances differently.
    """
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()
    c1.set('utt-1', np.random.random((6, 6)).astype(np.float32))
    c1.set('utt-2', np.random.random((2, 6)).astype(np.float32))
    c1.set('utt-3', np.random.random((9, 6)).astype(np.float32))
    c1.set('utt-4', np.random.random((2, 6)).astype(np.float32))
    c1.set('utt-5', np.random.random((5, 6)).astype(np.float32))

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
        c1, '250', shuffle=True, seed=100)

    partitions_one = loader.partitions
    loader.reload()
    partitions_two = loader.partitions

    if len(partitions_one) != len(partitions_two):
        # A different partition count already proves the layout changed.
        return

    # Same count: at least one partition must differ in utterance order.
    assert any(x.utt_ids != y.utt_ids
               for x, y in zip(partitions_one, partitions_two))
def test_next_emits_all_features_if_partition_spans_multiple_data_sets_in_random_order(
        self, tmpdir):
    """With shuffle + fixed seed, all 7 frames appear in a known order."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()
    cont.set('utt-1', np.array([[0.1] * 5, [0.2] * 5]))
    cont.set('utt-2', np.array([[0.3] * 5, [0.4] * 5, [0.5] * 5]))
    cont.set('utt-3', np.array([[0.6] * 5, [0.7] * 5]))

    frames = tuple(iterator.FrameIterator(
        ['utt-1', 'utt-2', 'utt-3'], [cont], 240, shuffle=True, seed=333))

    assert len(frames) == 7

    # Expected shuffled order for seed 333.
    for idx, value in enumerate([0.5, 0.3, 0.4, 0.2, 0.1, 0.6, 0.7]):
        assert np.allclose([value] * 5, frames[idx][0])
def test_next_emits_chunks_with_length(self, tmpdir):
    """With return_length=True every chunk carries its frame count."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()
    cont.set('utt-1', np.array([[0.1] * 5, [0.2] * 5]))
    cont.set('utt-2', np.array([[0.3] * 5, [0.4] * 5, [0.5] * 5]))
    cont.set('utt-3', np.array([[0.6] * 5]))

    chunks = tuple(iterator.MultiFrameIterator(
        ['utt-1', 'utt-2', 'utt-3'], [cont], 120, 2,
        return_length=True, shuffle=True, seed=6))

    # (chunk data, frame count) pairs in the shuffled order for seed 6.
    expected = [
        ([[0.5] * 5], 1),
        ([[0.3] * 5, [0.4] * 5], 2),
        ([[0.6] * 5], 1),
        ([[0.1] * 5, [0.2] * 5], 2),
    ]

    assert len(chunks) == 4
    for chunk, (data, length) in zip(chunks, expected):
        assert np.allclose(data, chunk[0])
        assert chunk[1] == length
def test_reload_creates_correct_partitions(self, tmpdir):
    """Without shuffling, utterances pack greedily into '250'-byte partitions."""
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()
    for utt_id, n in [('utt-1', 6), ('utt-2', 2), ('utt-3', 9),
                      ('utt-4', 2), ('utt-5', 5)]:
        c1.set(utt_id, np.random.random((n, 6)).astype(np.float32))

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
        c1, '250', shuffle=False)

    # (utt_ids, utt_lengths, size in bytes) per partition.
    expected = [
        (['utt-1', 'utt-2'], [(6,), (2,)], 192),
        (['utt-3'], [(9,)], 216),
        (['utt-4', 'utt-5'], [(2,), (5,)], 168),
    ]

    assert len(loader.partitions) == len(expected)
    for part, (utt_ids, utt_lengths, size) in zip(loader.partitions, expected):
        assert part.utt_ids == utt_ids
        assert part.utt_lengths == utt_lengths
        assert part.size == size
def test_next_emits_no_frames_if_file_is_empty(self, tmpdir):
    """An iterator over an empty container yields nothing."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()

    frames = tuple(iterator.FrameIterator([], [cont], 120))

    assert len(frames) == 0
def test_next_emits_features_only_from_included_ds_in_random_order(
        self, tmpdir):
    """Only frames of the requested utterances are emitted ('utt-2' skipped)."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()
    cont.set('utt-1', np.array([[0.1] * 5, [0.2] * 5]))
    cont.set('utt-2', np.array([[0.3] * 5, [0.4] * 5, [0.5] * 5]))
    cont.set('utt-3', np.array([[0.6] * 5, [0.7] * 5]))

    frames = tuple(iterator.FrameIterator(
        ['utt-1', 'utt-3'], [cont], 120, shuffle=True, seed=1236))

    assert len(frames) == 4

    # Expected shuffled order for seed 1236.
    for idx, value in enumerate([0.1, 0.2, 0.7, 0.6]):
        assert np.allclose([value] * 5, frames[idx][0])
def sample_container():
    """Yield the shared feature container, closing it after the test."""
    path = resources.get_resource_path(['sample_files', 'feat_container'])
    cont = containers.Container(path)
    cont.open()

    yield cont

    cont.close()
def test_load_partition_data(self, tmpdir):
    """load_partition_data returns each partition's utterances and raw data."""
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()

    frame_counts = [('utt-1', 6), ('utt-2', 2), ('utt-3', 9),
                    ('utt-4', 2), ('utt-5', 5)]
    data = {utt_id: np.random.random((n, 6)).astype(np.float32)
            for utt_id, n in frame_counts}
    for utt_id, _ in frame_counts:
        c1.set(utt_id, data[utt_id])

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'],
        c1, '250', shuffle=False)

    # Expected unshuffled partition layout for the '250'-byte limit.
    expected_parts = [['utt-1', 'utt-2'], ['utt-3'], ['utt-4', 'utt-5']]

    for index, utt_ids in enumerate(expected_parts):
        part = loader.load_partition_data(index)
        assert part.info.utt_ids == utt_ids
        for pos, utt_id in enumerate(utt_ids):
            assert np.allclose(part.utt_data[pos], data[utt_id])
def test_next_emits_no_features_if_data_set_is_empty(self, tmpdir):
    """An utterance whose array is empty contributes no frames."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()
    cont.set('utt-1', np.array([]))

    frames = tuple(iterator.FrameIterator(['utt-1'], [cont], 120))

    assert len(frames) == 0
def test_init_with_utterance_list(self, tmpdir):
    """A Dataset built from an explicit id list keeps that exact order."""
    c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    c.open()
    for utt_id in ['utt-1', 'utt-2']:
        c.set(utt_id, data=np.arange(20))

    ds = feeding.Dataset(['utt-1', 'utt-2'], [c])

    assert ds.utt_ids == ['utt-1', 'utt-2']
def test_container_has_utterances_returns_false_if_one_is_missing(
        self, tmpdir):
    """container_has_utterances is False when any requested id is absent."""
    c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    c.open()
    for utt_id in ['utt-1', 'utt-3']:  # 'utt-2' deliberately missing
        c.set(utt_id, data=np.arange(20))

    has_all = feeding.Dataset.container_has_utterances(
        c, ['utt-1', 'utt-2', 'utt-3'])

    assert not has_all
def test_raises_error_if_utt_is_missing_in_container(self, tmpdir):
    """Loader construction fails when the container lacks a requested id."""
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()
    c1.set('utt-1', np.random.random((6, 6)).astype(np.float32))
    c1.set('utt-3', np.random.random((9, 6)).astype(np.float32))

    # 'utt-2' was never written, so the loader must reject the id list.
    with pytest.raises(ValueError):
        partitioning.PartitioningContainerLoader(
            ['utt-1', 'utt-2', 'utt-3'], c1, '250', shuffle=True, seed=88)
def test_reload_creates_no_partition_with_no_utterances(self, tmpdir):
    """An empty utterance list yields zero partitions."""
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()

    loader = partitioning.PartitioningContainerLoader(
        [], c1, '250', shuffle=False)

    assert len(loader.partitions) == 0
def test_init_with_corpus_view(self):
    """A DataIterator over a subview exposes exactly the subview's ids."""
    corpus = resources.create_dataset()
    subview = subset.Subview(corpus, filter_criteria=[
        subset.MatchingUtteranceIdxFilter(
            utterance_idxs={'utt-1', 'utt-2', 'utt-4'})
    ])

    data_it = feeding.DataIterator(subview, [containers.Container('blub')])

    assert set(data_it.utt_ids) == set(subview.utterances.keys())
def test_append_with_different_dimension_raises_error(self, tmpdir):
    """Appending data whose trailing dimensions differ is rejected."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'container'))
    cont.open()
    cont.append('utt-1', np.arange(20).reshape(5, 2, 2))

    # Second append has shape (7, 2, 3): trailing dims (2, 3) != (2, 2).
    with pytest.raises(ValueError):
        cont.append('utt-1', np.arange(42).reshape(7, 2, 3))

    cont.close()
def test_init_with_corpus(self, tmpdir):
    """A Dataset built from a corpus lists every corpus utterance in order."""
    c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    c.open()

    utt_ids = ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5']
    for utt_id in utt_ids:
        c.set(utt_id, data=np.arange(20))

    ds = feeding.Dataset(resources.create_dataset(), [c])

    assert ds.utt_ids == utt_ids
def container_dim_x(tmpdir):
    """Container with one-dimensional feature data for five utterances."""
    cnt = containers.Container(os.path.join(tmpdir.strpath, 'outputs.hdf5'))
    cnt.open()

    for utt_id, length in [('utt-1', 6), ('utt-2', 8), ('utt-3', 4),
                           ('utt-4', 2), ('utt-5', 6)]:
        cnt.set(utt_id, np.arange(length))

    return cnt
def test_append(self, tmpdir):
    """Two appends to the same key concatenate along the first axis."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'container'))
    cont.open()

    full = np.arange(100).reshape(20, 5)
    cont.append('utt-1', full[:8])
    cont.append('utt-1', full[8:])

    stored = cont.get('utt-1', mem_map=False)
    assert np.array_equal(full, stored)

    cont.close()
def sample_frame_dataset(tmpdir):
    """Build a FrameDataset where targets are the inputs shifted by ten."""
    inputs = containers.Container(os.path.join(tmpdir.strpath, 'inputs.hdf5'))
    targets = containers.Container(os.path.join(tmpdir.strpath, 'targets.hdf5'))

    corpus = resources.create_dataset()

    inputs.open()
    targets.open()

    frame_counts = [('utt-1', 5), ('utt-2', 7), ('utt-3', 9),
                    ('utt-4', 2), ('utt-5', 4)]

    for utt_id, n in frame_counts:
        inputs.set(utt_id, np.arange(n * 4).reshape(n, 4))

    for utt_id, n in frame_counts:
        targets.set(utt_id, np.arange(n * 4).reshape(n, 4) + 10)

    return feeding.FrameDataset(corpus, [inputs, targets])
def test_scan_computes_correct_size_for_one_container(self, tmpdir):
    """_scan returns the raw byte size per utterance for one container."""
    c1 = containers.Container(os.path.join(tmpdir.strpath, 'c1.h5'))
    c1.open()

    frame_counts = {'utt-1': 6, 'utt-2': 2, 'utt-3': 9}
    for utt_id, n in frame_counts.items():
        c1.set(utt_id, np.random.random((n, 6)).astype(np.float32))

    loader = partitioning.PartitioningContainerLoader(
        ['utt-1', 'utt-2', 'utt-3'], c1, '250', shuffle=True, seed=88)

    item_size = np.dtype(np.float32).itemsize
    assert loader._scan() == {
        utt_id: n * 6 * item_size for utt_id, n in frame_counts.items()
    }
def test_init_with_corpus_view(self, tmpdir):
    """A Dataset over a subview restricts utt_ids to the subview's ids."""
    c = containers.Container(os.path.join(tmpdir.strpath, 'test.h5'))
    c.open()
    for utt_id in ['utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5']:
        c.set(utt_id, data=np.arange(20))

    corpus = resources.create_dataset()
    subview = subset.Subview(corpus, filter_criteria=[
        subset.MatchingUtteranceIdxFilter(
            utterance_idxs={'utt-1', 'utt-2', 'utt-4'})
    ])

    ds = feeding.Dataset(subview, [c])

    assert ds.utt_ids == ['utt-1', 'utt-2', 'utt-4']
def test_next_emits_all_features_in_sequential_order(self, tmpdir):
    """Without shuffling, frames come out in utterance and frame order."""
    cont = containers.Container(os.path.join(tmpdir.strpath, 'features.h5'))
    cont.open()
    cont.set('utt-1', np.array([[0.1] * 5, [0.2] * 5]))
    cont.set('utt-2', np.array([[0.3] * 5, [0.4] * 5, [0.5] * 5]))

    frames = tuple(iterator.FrameIterator(
        ['utt-1', 'utt-2'], [cont], 120, shuffle=False))

    assert len(frames) == 5
    for idx, value in enumerate([0.1, 0.2, 0.3, 0.4, 0.5]):
        assert np.allclose([value] * 5, frames[idx][0])
def encode_corpus(self, corpus, output_path):
    """
    Encode all utterances of the given corpus and store them in a
    :class:`audiomate.container.Container`.

    Args:
        corpus (Corpus): The corpus to process.
        output_path (str): The path to store the container with the
                           encoded data.

    Returns:
        Container: The container with the encoded data.
    """
    out_container = containers.Container(output_path)
    out_container.open()

    # Close even if encoding an utterance raises, so the output file
    # isn't left open on the error path (the original leaked the handle).
    try:
        for utterance in corpus.utterances.values():
            data = self.encode_utterance(utterance, corpus=corpus)
            out_container.set(utterance.idx, data)
    finally:
        out_container.close()

    return out_container
def test_init_with_corpus(self):
    """A DataIterator built from a corpus exposes all corpus utterance ids."""
    corpus = resources.create_dataset()

    data_it = feeding.DataIterator(corpus, [containers.Container('blub')])

    assert set(data_it.utt_ids) == set(corpus.utterances.keys())
def test_init_with_utterance_list(self):
    """A DataIterator built from an explicit id list keeps exactly those ids."""
    data_it = feeding.DataIterator(
        ['utt-1', 'utt-2'], [containers.Container('blub')])

    assert {'utt-1', 'utt-2'} == set(data_it.utt_ids)