def test_process_features_online_with_frame_hop_size_change_stores_correct(
        self, processor, tmpdir):
    """Frame/hop sizes scaled by the processor must be persisted in the output container."""
    corpus = resources.create_dataset()
    source_path = os.path.join(tmpdir.strpath, 'in_feats')
    target_path = os.path.join(tmpdir.strpath, 'out_feats')

    source = assets.FeatureContainer(source_path)
    features = np.arange(30).reshape(5, 6)

    # Seed the input container with identical features for every utterance.
    with source:
        source.sampling_rate = 16000
        source.frame_size = 400
        source.hop_size = 160

        for utt_idx in corpus.utterances.keys():
            source.set(utt_idx, features)

    processor.mock_frame_size_scale = 2.0
    processor.mock_hop_size_scale = 2.0
    processor.process_features_online(corpus, source, target_path)

    result = assets.FeatureContainer(target_path)

    with result:
        # 400 * 2.0 and 160 * 2.0 respectively.
        assert result.frame_size == 800
        assert result.hop_size == 320
def test_process_features_online(self, processor, tmpdir):
    """Online processing with the (identity) mock processor copies every utterance unchanged."""
    corpus = resources.create_dataset()
    source_path = os.path.join(tmpdir.strpath, 'in_feats')
    target_path = os.path.join(tmpdir.strpath, 'out_feats')

    source = assets.FeatureContainer(source_path)
    features = np.arange(30).reshape(5, 6)

    with source:
        source.sampling_rate = 16000
        source.frame_size = 400
        source.hop_size = 160

        for utt_idx in corpus.utterances.keys():
            source.set(utt_idx, features)

    processor.process_features_online(corpus, source, target_path)

    result = assets.FeatureContainer(target_path)

    with result:
        assert len(result.keys()) == 5

        # Every utterance must come out exactly as it went in.
        for utt_idx in ('utt-1', 'utt-2', 'utt-3', 'utt-4', 'utt-5'):
            assert np.array_equal(result.get(utt_idx, mem_map=False), features)
def new_feature_container(self, idx, path=None):
    """
    Add a new feature container with the given data.

    Parameters:
        idx (str): An unique identifier within the dataset.
        path (str): The path to store the feature file. If None a default
                    path is used.

    Returns:
        FeatureContainer: The newly added feature-container.
    """
    taken = self._feature_containers.keys()

    # De-duplicate the identifier if it is already in use.
    if idx in taken:
        feat_idx = naming.index_name_if_in_list(idx, taken)
    else:
        feat_idx = idx

    if path is None:
        # A default path can only be derived when the dataset itself
        # lives on disk.
        if not os.path.isdir(self.path):
            raise ValueError(
                'To copy file the dataset needs to have a path.')

        feat_path = os.path.join(self.path, DEFAULT_FEAT_SUBDIR, feat_idx)
    else:
        feat_path = os.path.abspath(path)

    container = assets.FeatureContainer(feat_path)
    self._feature_containers[feat_idx] = container

    return container
def test_process_features_online_ignores_none(self, processor, tmpdir):
    """Chunks for which the processor returns None must simply be skipped."""
    corpus = resources.create_dataset()
    source_path = os.path.join(tmpdir.strpath, 'in_feats')
    target_path = os.path.join(tmpdir.strpath, 'out_feats')

    source = assets.FeatureContainer(source_path)
    features = np.arange(90).reshape(15, 6)

    with source:
        source.sampling_rate = 16000
        source.frame_size = 400
        source.hop_size = 160

        for utt_idx in corpus.utterances.keys():
            source.set(utt_idx, features)

    # Stub out frame processing so every chunk is dropped.
    processor.process_frames = lambda *args, **kwargs: None

    processor.process_features_online(corpus, source, target_path,
                                      chunk_size=4)

    # Reaching this point without an exception is the success criterion.
    assert True
def _process_corpus(self, corpus, output_path, processing_func,
                    frame_size=400, hop_size=160, sr=None):
    """
    Utility function for processing a corpus with a separate processing
    function.

    Args:
        corpus (Corpus): The corpus whose utterances are processed.
        output_path (str): Path to store the resulting feature-container at.
        processing_func (func): Callable invoked once per utterance with
            ``(utterance, feat_container, frame_size, hop_size, sr, corpus)``.
        frame_size (int): Number of samples per frame.
        hop_size (int): Number of samples between frame starts.
        sr (int): Target sampling-rate. If None, all utterances must share
            one native sampling-rate, which is used instead.

    Returns:
        FeatureContainer: The (closed) container holding the processed
        features.

    Raises:
        ValueError: If ``sr`` is None and the utterances have differing
            sampling-rates.
    """
    feat_container = assets.FeatureContainer(output_path)
    feat_container.open()

    sampling_rate = -1

    # FIX: close the container even when processing fails midway, so the
    # underlying file handle is not leaked on error.
    try:
        for utterance in corpus.utterances.values():
            utt_sampling_rate = utterance.sampling_rate

            if sr is None:
                # Without an explicit target rate, all utterances must agree.
                if sampling_rate > 0 and sampling_rate != utt_sampling_rate:
                    raise ValueError(
                        'File {} has a different sampling-rate than the previous ones!'
                        .format(utterance.file.idx))

                sampling_rate = utt_sampling_rate

            processing_func(utterance, feat_container,
                            frame_size, hop_size, sr, corpus)

        # Let the processor report how it changed frame/hop sizes and
        # persist the transformed values as container metadata.
        tf_frame_size, tf_hop_size = self.frame_transform(frame_size, hop_size)
        feat_container.frame_size = tf_frame_size
        feat_container.hop_size = tf_hop_size
        feat_container.sampling_rate = sr or sampling_rate
    finally:
        feat_container.close()

    return feat_container
def process_features_online(self, corpus, input_features, output_path,
                            chunk_size=1):
    """
    Process all features of the given corpus and save the processed
    features in a feature-container. The features are processed in
    **online** mode, chunk by chunk.

    Args:
        corpus (Corpus): The corpus to process the utterances from.
        input_features (FeatureContainer): The feature-container to process
                                           the frames from.
        output_path (str): A path to save the feature-container to.
        chunk_size (int): Number of frames to process per chunk.

    Returns:
        FeatureContainer: The feature-container containing the processed
        features.
    """
    feat_container = assets.FeatureContainer(output_path)
    feat_container.open()
    input_features.open()

    # FIX: guarantee the output container is closed even if processing
    # raises, so a partial run does not leak the file handle.
    try:
        for utterance in corpus.utterances.values():
            sampling_rate = input_features.sampling_rate
            frames = input_features.get(utterance.idx, mem_map=True)
            num_frames = frames.shape[0]
            current_frame = 0

            while current_frame < num_frames:
                to_frame = current_frame + chunk_size
                # BUGFIX: was ``current_frame + chunk_size > num_frames``,
                # which never flagged the final chunk when the utterance
                # length is an exact multiple of chunk_size (e.g. 5 frames
                # with chunk_size=1). Use >= so the last chunk is always
                # marked last=True.
                last = to_frame >= num_frames
                chunk = frames[current_frame:to_frame]

                processed = self.process_frames(chunk, sampling_rate,
                                                current_frame, last=last,
                                                utterance=utterance,
                                                corpus=corpus)

                # A processor may return None to drop a chunk.
                if processed is not None:
                    feat_container.append(utterance.idx, processed)

                current_frame += chunk_size

        # Persist the processor's frame/hop transformation as metadata.
        tf_frame_size, tf_hop_size = self.frame_transform(
            input_features.frame_size, input_features.hop_size)
        feat_container.frame_size = tf_frame_size
        feat_container.hop_size = tf_hop_size
        feat_container.sampling_rate = input_features.sampling_rate
    finally:
        feat_container.close()

    return feat_container
def test_append_with_different_dimension_raises_error(self, tmpdir):
    """Appending data whose trailing dimensions differ must be rejected."""
    path = os.path.join(tmpdir.strpath, 'container')
    container = assets.FeatureContainer(path)
    container.open()

    container.append('utt-1', np.arange(20).reshape(5, 2, 2))

    # Shape (7, 2, 3) conflicts with the stored (., 2, 2) layout.
    with pytest.raises(ValueError):
        container.append('utt-1', np.arange(42).reshape(7, 2, 3))

    container.close()
def test_append(self, tmpdir):
    """Two consecutive appends must concatenate along the first axis."""
    container = assets.FeatureContainer(
        os.path.join(tmpdir.strpath, 'container'))
    container.open()

    data = np.arange(100).reshape(20, 5)
    head, tail = data[:8], data[8:]

    container.append('utt-1', head)
    container.append('utt-1', tail)

    stored = container.get('utt-1', mem_map=False)
    assert np.array_equal(data, stored)

    container.close()
def test_process_corpus_with_frame_hop_size_change_stores_correct(
        self, processor, tmpdir):
    """Scaled frame/hop sizes must be written to the resulting container."""
    corpus = resources.create_dataset()
    feat_path = os.path.join(tmpdir.strpath, 'feats')

    processor.mock_frame_size_scale = 2.5
    processor.mock_hop_size_scale = 5

    processor.process_corpus(corpus, feat_path,
                             frame_size=4096, hop_size=2048)

    fc = assets.FeatureContainer(feat_path)
    fc.open()
    # 4096 * 2.5 == 10240 and 2048 * 5 == 10240.
    assert fc.frame_size == 10240
    assert fc.hop_size == 10240
    fc.close()
def process_features(self, corpus, input_features, output_path):
    """
    Process all features of the given corpus and save the processed
    features in a feature-container. The features are processed in
    **offline** mode, all features of an utterance at once.

    Args:
        corpus (Corpus): The corpus to process the utterances from.
        input_features (FeatureContainer): The feature-container to process
                                           the frames from.
        output_path (str): A path to save the feature-container to.

    Returns:
        FeatureContainer: The feature-container containing the processed
        features.
    """
    feat_container = assets.FeatureContainer(output_path)
    feat_container.open()
    input_features.open()

    # FIX: close the output container even when process_frames raises,
    # so a failed run does not leak the file handle.
    try:
        for utterance in corpus.utterances.values():
            sampling_rate = input_features.sampling_rate
            frames = input_features.get(utterance.idx, mem_map=False)

            # Offline mode: the whole utterance is one chunk, hence
            # offset=0 and last=True.
            processed = self.process_frames(frames, sampling_rate,
                                            offset=0, last=True,
                                            utterance=utterance,
                                            corpus=corpus)
            feat_container.set(utterance.idx, processed)

        # Persist the processor's frame/hop transformation as metadata.
        tf_frame_size, tf_hop_size = self.frame_transform(
            input_features.frame_size, input_features.hop_size)
        feat_container.frame_size = tf_frame_size
        feat_container.hop_size = tf_hop_size
        feat_container.sampling_rate = input_features.sampling_rate
    finally:
        feat_container.close()

    return feat_container
def sample_feature_container():
    """Fixture yielding the sample feature-container, closed after use."""
    container = assets.FeatureContainer(
        resources.get_resource_path(['sample_files', 'feat_container']))
    container.open()

    # FIX: without try/finally the container stays open when the consuming
    # test raises; finalize the fixture unconditionally.
    try:
        yield container
    finally:
        container.close()