def mean_var_norm_per_file(self, h5f, mvn_h5f, vad_file=None):
    # normalize either per channel or on the whole spectrum
    axis = 0 if self.norm_per_channel else None
    dset_name = list(h5py.File(h5f).keys())[0]
    files = h5py.File(h5f)[dset_name]['items']
    reader = h5features.Reader(h5f)
    means_vars = []
    for f in files:
        data = reader.read(from_item=f)
        items, features, times = (data.items(), data.features()[0],
                                  data.labels()[0])

        # VAD
        filtered_features = None
        if vad_file is not None:
            vad_data = read_vad_file(vad_file)
            if str(f) in vad_data:
                filtered_features = self.filter_vad_one_file(
                    features, times, vad_data[str(f)])

        if filtered_features is None:
            mean = np.mean(features, axis=axis)
            std = np.std(features, axis=axis)
        else:
            mean = np.mean(filtered_features, axis=axis)
            std = np.std(filtered_features, axis=axis)

        features = (features - mean) / (std + np.finfo(features.dtype).eps)
        h5features.write(mvn_h5f, '/features/', items, [times], [features])
        means_vars.append((f, mean, std))
    return means_vars
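# A minimal sketch (not part of the original code) showing how the normalized
# file written by mean_var_norm_per_file could be inspected, using the
# h5features.Reader calls that appear in the tests below; the file name is
# hypothetical.
import h5features
import numpy as np

reader = h5features.Reader('h5-normalized.features')
data = reader.read()
for item in data.items():
    feats = data.dict_features()[item]
    # after per-file normalization each item should have ~0 mean and ~1 std
    print(item, np.mean(feats), np.std(feats))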
def run(files, output_path, config_file, save, batch_size=50):
    """Split the file list into batches. Handle arguments.

    Parameters:
    -----------
    batch_size: int, max batch size in number of files (adjust for RAM usage)
    """
    if 'h5' in save:
        import h5features
    # with open(config_file, 'r') as fid:
    #     config = json.load(fid)
    if config_file:
        raise NotImplementedError
    batches = [files[i:i + batch_size]
               for i in range(0, len(files), batch_size)]
    res = {}
    for files_batch in batches:
        new_res = extract_features(files_batch, delta=0)
        if 'np' in save:
            for f in new_res:
                np.save(output_path + f, new_res[f])
        if 'h5' in save:
            # items, times and features are passed in that order
            h5features.write(
                output_path, 'features',
                [os.path.basename(f).split('.')[0] for f in new_res.keys()],
                [np.arange(d.shape[0], dtype=float) / 100 + 0.0125
                 for d in new_res.values()],
                list(new_res.values()))
        res.update(new_res)
def setup(self):
    self.file_v1 = 'v1.0.h5'
    self.file_v2 = 'v1.1.h5'
    self.teardown()  # in case files already exist, remove them

    items, times, features = generate.full(20, 10)
    h5f_1_0.write(self.file_v1, 'features', items, times, features)
    h5f_1_1.write(self.file_v2, 'features', items, times, features)
def transcription2features(phones_file, tra_file, out_file,
                           word_position_dependent=True):
    """
    Kaldi 1-best aligned transcription to h5features

    The format in h5features is frame by frame, as this allows both
    frame-to-frame DTW distance and edit distance to be used (for
    edit_distance the first step would be extracting the phone-level
    sequence from the frame-level sequence, discarding segments that have
    too few frames).

    This avoids problems with long phones if coding only the centerpoint of
    a phone (a long time interval within the phone, but that does not
    include the centerpoint, would have an empty representation).

    Allowing representations indexed by time intervals instead of time
    points could be more elegant when one wants to use edit_distance, but
    this would require some (substantial but not huge) recoding in
    h5features and ABXpy.distances. One would need to check that the time
    intervals have no overlap and are consecutive, and one would need to
    adapt the features reading to provide the sequence of consecutive
    feature vectors with their durations and, for the first and last, their
    degree of overlap with the required time segment.
    """
    phonemap = read_kaldi_phonemap(phones_file, word_position_dependent)
    # get order used to encode the phones as integers in the features files
    phone_order = get_phone_order(phonemap)
    utt_ids = []
    times = []
    features = []
    current_utt = None
    utt_times = []
    utt_features = []
    i = 1
    for utt_id, start, stop, phone in read_kaldi_alignment(phonemap, tra_file):
        print(i)
        i = i + 1
        if current_utt is None:
            current_utt = utt_id
        if utt_id != current_utt:
            utt_ids.append(current_utt)
            times.append(np.array(utt_times))
            nb_phones = len(utt_features)
            # not sure how h5features handles 1-d arrays, so reshaping
            features.append(np.array(utt_features).reshape((nb_phones, 1)))
            current_utt = utt_id
            utt_times = []
            utt_features = []
        else:
            # expanding to frame by frame using ad hoc 10ms window spacing
            # since start and stop are spaced by a multiple of 10ms due to
            # standard window spacing used by kaldi
            nframes = (stop - start) / 0.01
            # ad hoc tolerance
            assert np.abs(nframes - np.round(nframes)) < 1e-7
            nframes = int(np.round(nframes))
            utt_features = utt_features + [phone_order.index(phone)] * nframes
            frame_times = start + 0.01 * np.arange(nframes)
            utt_times = utt_times + list(frame_times)
    h5features.write(out_file, 'features', utt_ids, times, features)
def h5features_stack_fbanks(fbanks_file, stacked_fbanks_file):
    import h5features
    index = h5features.read_index(fbanks_file)
    files = index['files']
    for f in files:
        times, fbanks = h5features.read(
            fbanks_file, 'features', from_internal_file=f, index=index)
        stacked_fbanks = stack_fbanks(fbanks[f])
        h5features.write(stacked_fbanks_file, 'features',
                         [f], [times[f]], [stacked_fbanks])
def generate_features(n_files, n_feat=2, max_frames=3, name='data.features'):
    """Random feature file generator"""
    features = []
    times = []
    files = []
    for i in range(n_files):
        n_frames = np.random.randint(max_frames) + 1
        features.append(np.random.randn(n_frames, n_feat))
        times.append(np.linspace(0, 1, n_frames))
        files.append('s%d' % i)
    h5features.write(name, 'features', files, times, features)
def test_normalization_per_file(self):
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    feature1 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), -1.)])
    feature2 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), 2.)])
    features = [feature1, feature2]
    items = ['file1', 'file2']
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True)
    meansvars = features_generator.mean_var_norm_per_file(h5f, h5f_mean_var)

    assert meansvars[0][0] == 'file1'
    assert all(meansvars[0][1] == np.mean(feature1, axis=0))
    assert all(meansvars[0][2] == np.std(feature1, axis=0))
    assert meansvars[1][0] == 'file2'
    assert all(meansvars[1][1] == np.mean(feature2, axis=0))
    assert all(meansvars[1][2] == np.std(feature2, axis=0))

    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    for file in data.items():
        assert np.mean(data.dict_features()[file]) == pytest.approx(0)
        assert np.std(data.dict_features()[file]) == pytest.approx(1)

    # no per channel
    tmp2 = str(tempdir / 'h5-tmp2')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=False)
    meansvars = features_generator.mean_var_norm_per_file(h5f, tmp2)
    assert meansvars == [
        ('file1', 0, np.std(feature1)),
        ('file2', 1.5, np.std(feature2)),
    ]

    reader = h5features.Reader(tmp2)
    data = reader.read()
    for file in data.items():
        assert np.mean(data.dict_features()[file]) == pytest.approx(0)
        assert np.std(data.dict_features()[file]) == pytest.approx(1)

    shutil.rmtree(str(tempdir))
def _test_wr(self, labeldim):
    """Test retrieving labels and files after a write/read operation."""
    items, t_gold, feat = generate.full(self.nbitems, tformat=labeldim)
    write(self.filename, self.group, items, t_gold, feat)
    t, _ = read(self.filename, self.group)

    assert len(t) == self.nbitems
    if labeldim != 1:
        assert all([tt.shape[1] == labeldim for tt in t.values()])

    # build a dict from gold to compare with t
    d = dict(zip(items, t_gold))
    for dd, tt in zip(d, t):
        assert tt == dd
def generate_features(n_files, n_feat=2, max_frames=3, name='data.features'):
    """Random feature file generator"""
    if os.path.exists(name):
        os.remove(name)

    features = []
    times = []
    files = []
    for i in range(n_files):
        n_frames = np.random.randint(max_frames) + 1
        features.append(np.random.randn(n_frames, n_feat))
        times.append(np.linspace(0, 1, n_frames))
        files.append('s%d' % i)
    h5features.write(name, 'features', files, times, features)
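# A small usage sketch (not from the original sources): generate a random
# feature file with the helper above, then read one item back with the
# h5features.read call used elsewhere in this collection. 'data.features' is
# the helper's default output name; everything else is illustrative.
import h5features

generate_features(5)
times, features = h5features.read('data.features', 'features',
                                  from_internal_file='s0')
print(times['s0'].shape, features['s0'].shape)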
def yaafe2features(wavefiles, out_file, feature_type='MFCC'):
    """Generate features with yaafe and put them in h5features format.

    Whole wavefiles are encoded as internal h5features files. To use them
    with abkhazia's ABX tasks, these need to be segmented according to an
    abkhazia segments.txt (abkhazia/utilities/segment_features.py can be
    used for this).

    Supported feature types:
        - 'MFCC' (default)
        - 'CMSP13' (cubic-root-compressed 13-frequency-channels Mel
          spectrogram)
    """
    assert feature_type in ['MFCC', 'CMSP13'], \
        'Unsupported feature_type {0}'.format(feature_type)

    feature_plan = ya.FeaturePlan(sample_rate=16000)
    if feature_type == 'MFCC':
        feat_name = 'mfcc'
        feature_plan.addFeature(
            '{0}: MFCC blockSize=400 stepSize=160'.format(feat_name))
        # 0.025s + 0.01s
    elif feature_type == 'CMSP13':
        feat_name = 'melsp'
        feature_plan.addFeature(
            '{0}: MelSpectrum MelNbFilters=13 blockSize=400 stepSize=160'
            .format(feat_name))
        # 0.025s + 0.01s
    engine = ya.Engine()
    engine.load(feature_plan.getDataFlow())

    wav_ids = []
    times = []
    features = []
    for wavefile in wavefiles:
        wav_ids.append(p.splitext(p.basename(wavefile))[0])
        afp = ya.AudioFileProcessor()
        afp.processFile(engine, wavefile)
        feat_out = engine.readAllOutputs()[feat_name]
        if feature_type == 'CMSP13':
            # need to add compression by hand
            feat_out = np.power(feat_out, 1 / 3.)
        # times according to:
        # http://yaafe.sourceforge.net/features.html?highlight=mfcc#yaafefeatures.Frames
        nframes = feat_out.shape[0]
        # 0.01 here is ad hoc and dependent on 160 above
        times.append(0.01 * np.arange(nframes))
        features.append(feat_out)
    h5features.write(out_file, 'features', wav_ids, times, features)
def h5features_from_nparray(input_path, h5f, timefunc=None,
                            rm_last_number=False, transpose=False):
    """Convert speech features (such as posteriorgrams) stored as numpy
    arrays to the h5features format.

    Parameters:
    ----------
    input_path: path of the directory containing the features of audio
        files as numpy arrays
    h5f: str. Name of the h5features file to create.
    timefunc: callable. Function that returns timestamps for the
        aforementioned features. By default, it assumes a window length of
        25 ms and a window step of 10 ms.
    rm_last_number: bool, whether or not to remove the last number in each
        file name (the filenames of posteriorgrams have an additional
        number compared to the audio filenames)
    """
    filenames = [
        f for f in listdir(input_path)
        if os.path.splitext(f)[-1] == ".npy"
    ]
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in filenames:
        data = np.load(input_path + f)
        if i == batch_size:
            h5features.write(h5f, "/features/", internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(data)
        times.append(time)
        if rm_last_number:
            name = os.path.splitext(f)[0]
            internal_files.append(os.path.basename(name))
        else:
            internal_files.append(os.path.basename(os.path.splitext(f)[0]))

    if features:
        h5features.write(h5f, "/features/", internal_files, times, features)
def test_normalization(self):
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    features = [np.full((100, 40), 1.0), np.full((150, 40), 2.0)]
    items = ['file1', 'file2']
    times = [
        np.arange(features[0].shape[0], dtype=np.float32) * 0.01 + 0.0025
    ]
    times.append(
        np.arange(features[1].shape[0], dtype=np.float32) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    features_generator = FeaturesGenerator(norm_per_channel=True)
    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    mean, variance = features_generator.mean_variance_normalisation(
        h5f, h5f_mean_var)

    stacked_features = np.vstack(features)
    assert mean == pytest.approx(np.mean(stacked_features, axis=0))
    assert variance == pytest.approx(np.std(stacked_features, axis=0))

    # check that the new file has 0 mean and 1 variance
    dset = list(h5py.File(h5f_mean_var).keys())[0]
    data = h5py.File(h5f_mean_var)[dset]['features'][:]
    means = np.mean(data, axis=0)
    assert np.allclose(means, 0.0, atol=1e-6)
    assert np.std(data, axis=0) == pytest.approx(1.0, abs=1e-6)

    # test normalization across all channels
    tmp2 = str(tempdir / 'h5temp.h5')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_channel=False)
    mean, variance = features_generator.mean_variance_normalisation(
        h5f, tmp2)
    assert mean == pytest.approx(np.mean(stacked_features))
    assert variance == pytest.approx(np.std(np.vstack(features)))

    # check that the new file has 0 mean and 1 variance
    dset = list(h5py.File(tmp2).keys())[0]
    data = h5py.File(tmp2)[dset]['features'][:]
    assert np.mean(data) == pytest.approx(0, abs=1e-6)
    assert np.std(data) == pytest.approx(1)

    shutil.rmtree(str(tempdir))
def any_to_h5features(path, files, h5_filename, h5_groupname,
                      batch_size=500, load=np.load):
    """Append a list of feature files (npz by default) to a h5features file.

    Files must have names relative to the directory given by the 'path'
    argument.

    Parameters
    ----------
    path : str
        Path of the directory where the numpy files are stored.
    files : list of filename
        List of files to convert and append.
    h5_filename : filename
        The output h5features file.
    h5_groupname : str
        Name of the h5 group where to store the numpy files (use
        '/features/' for h5features files).
    batch_size : int
        Size of the writing buffer (in number of npz files). By default 500.
    """
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5_filename, h5_groupname, internal_files,
                             times, features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])

    if features:
        h5features.write(h5_filename, h5_groupname, internal_files, times,
                         features)
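# An illustrative call (not from the original sources): convert a directory of
# .npz files, each expected to contain 'features' and 'time' arrays as the
# loop above requires. Directory and file names here are hypothetical.
import os
import numpy as np

npz_dir = 'features_npz'
npz_files = [f for f in os.listdir(npz_dir) if f.endswith('.npz')]
any_to_h5features(npz_dir, npz_files, 'corpus.features', '/features/',
                  batch_size=500, load=np.load)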
def npz_to_h5features(path, files, h5_filename, h5_groupname, batch_size=500):
    """Append a list of npz files to a h5features file.

    Files must have names relative to the directory given by the 'path'
    argument.

    Parameters
    ----------
    path : str
        Path of the directory where the numpy files are stored.
    files : list of filename
        List of files to convert and append.
    h5_filename : filename
        The output h5features file.
    h5_groupname : str
        Name of the h5 group where to store the numpy files (use
        '/features/' for h5features files).
    batch_size : int
        Size of the writing buffer (in number of npz files). By default 500.
    """
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(
                h5_filename, h5_groupname, internal_files, times, features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = np.load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])

    if features:
        h5features.write(
            h5_filename, h5_groupname, internal_files, times, features)
def h5features_compute(self, files, h5f, featfunc=None, timefunc=None):
    """Compute mfcc or filterbanks (or other features) in h5features format.

    Parameters:
    ----------
    files: list, list of files on which to compute the features. You must
        give the complete relative or absolute path of the wave file
    h5f: str. Name of the h5features file to create.
    featfunc: callable. "do_fbanks" to compute fbanks, "do_mfccs" to
        compute mfccs, or any callable function that returns features given
        a wave file.
    timefunc: callable. Function that returns timestamps for the
        aforementioned features. By default, it assumes a window length of
        25 ms and a window step of 10 ms.
    """
    if featfunc is None:
        featfunc = self.do_fbank

    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = featfunc(f)
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(f)
        times.append(time)
        internal_files.append(os.path.basename(os.path.splitext(f)[0]))

    if features:
        h5features.write(h5f, '/features/', internal_files, times, features)
def npz_to_h5features(path, files, h5_filename, h5_groupname, batch_size=500):
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5_filename, h5_groupname, internal_files,
                             times, features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = np.load(os.path.join(path, f))
        features.append(data['features'])
        times.append(data['time'])
        internal_files.append(os.path.splitext(f)[0])

    if features:
        h5features.write(h5_filename, h5_groupname, internal_files, times,
                         features)
def segment_features(features_file, segments_file, out_file):
    """
    Segment an h5features file containing features for whole wavefiles of
    an abkhazia corpus (or a split of a corpus) into features for segments
    as described in the provided segments.txt file.
    """
    utt_ids, wavefiles, starts, stops = io.read_segments(segments_file)
    if all([e is None for e in starts]) and all([e is None for e in stops]):
        # TODO use a log instead of a print statement
        print("segment_features: segments already match wavefiles, "
              "doing nothing...")
    else:
        # group utterances by wavefile
        data = zip(utt_ids, wavefiles, starts, stops)
        for wav, utts in groupby(data, lambda e: e[1]):
            # TODO use a log instead of a print statement
            print("Segmenting features for file {} by utterance".format(wav))
            # load features for whole wavefile
            wav_id = os.path.splitext(wav)[0]  # TODO fix that
            times, features = h5features.read(features_file,
                                              from_internal_file=wav_id)
            # no need for dict here
            times, features = times[wav_id], features[wav_id]
            utt_ids, utt_times, utt_features = [], [], []
            for utt_id, _, start, stop in utts:
                # select features for the appropriate segment
                utt_ids.append(utt_id)
                indices = np.where(
                    np.logical_and(times >= start, times <= stop))[0]
                # get times relative to the beginning of the utterance
                utt_times.append(times[indices] - start)
                utt_features.append(features[indices, :])
            # write to out_file once for each wavefile
            h5features.write(out_file, 'features', utt_ids, utt_times,
                             utt_features)
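# An illustrative call (not from the original sources). The segments file is
# assumed to follow the abkhazia convention read by io.read_segments, roughly
# one "utt_id wav_file start stop" line per utterance; all paths here are
# hypothetical.
segment_features('whole_wavs.features', 'data/segments.txt',
                 'utterances.features')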
def h5features_compute(files, h5f, featfunc=do_fbank, timefunc=None):
    """Compute mfcc or filterbanks (or other features) in h5features format.

    Parameters:
    ----------
    files: list, list of files on which to compute the features. You must
        give the complete relative or absolute path of the wave file
    h5f: str. Name of the h5features file to create.
    featfunc: callable. "do_fbanks" to compute fbanks, "do_mfccs" to
        compute mfccs, or any callable function that returns features given
        a wave file.
    timefunc: callable. Function that returns timestamps for the
        aforementioned features. By default, it assumes a window length of
        25 ms and a window step of 10 ms.
    """
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        data = featfunc(f)
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(f)
        times.append(time)
        internal_files.append(os.path.basename(os.path.splitext(f)[0]))

    if features:
        h5features.write(h5f, '/features/', internal_files, times, features)
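# A usage sketch (not part of the original sources) for h5features_compute:
# with the defaults it applies do_fbank to each wave file and generates
# timestamps assuming a 25 ms window and a 10 ms step. The wav paths are
# hypothetical.
wav_files = ['corpus/wavs/utt1.wav', 'corpus/wavs/utt2.wav']
h5features_compute(wav_files, 'fbanks.features')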
def features2features(in_file, out_file):
    """
    Convert kaldi input features (mfcc, etc.) to h5features.

    This loads everything into memory, but it would be easy to write an
    incremental version if this poses a problem.

    Input features must be in a single archive in text format, which can be
    obtained using the 'copy-feats' kaldi utility.
    """
    # below is basically a parser for the kaldi vector format, one line at
    # a time
    # parse input text file
    outside_utt = True
    features = []
    utt_ids = []
    times = []
    with codecs.open(in_file, mode='r', encoding='UTF-8') as inp:
        for index, line in enumerate(inp):
            print("Processing line {0}".format(index + 1))
            # / {1}".format(index+1, len(lines)))
            tokens = line.strip().split(u" ")
            if outside_utt:
                assert (len(tokens) == 3
                        and tokens[1] == u""
                        and tokens[2] == u"[")
                utt_id = tokens[0]
                outside_utt = False
                frames = []
            else:
                if tokens[-1] == u"]":
                    # end of utterance
                    outside_utt = True
                    tokens = tokens[:-1]
                frames.append(np.array(tokens, dtype=float))
                if outside_utt:
                    # end of utterance, continued
                    features.append(np.row_stack(frames))
                    # as in kaldi2abkhazia, this is ad hoc and has not been
                    # checked formally
                    times.append(0.0125 + 0.01 * np.arange(len(frames)))
                    utt_ids.append(utt_id)
    h5features.write(out_file, 'features', utt_ids, times, features)
def h5features_from_nparray(input_path, h5f, timefunc=None):
    """Convert speech features (such as posteriorgrams) stored as numpy
    arrays to the h5features format.

    Parameters:
    ----------
    input_path: path of the directory containing the features of audio
        files as numpy arrays
    h5f: str. Name of the h5features file to create.
    timefunc: callable. Function that returns timestamps for the
        aforementioned features. By default, it assumes a window length of
        25 ms and a window step of 10 ms.
    """
    files = [f for f in listdir(input_path) if isfile(join(input_path, f))]
    batch_size = 500
    features = []
    times = []
    internal_files = []
    i = 0
    for f in files:
        if i == batch_size:
            h5features.write(h5f, '/features/', internal_files, times,
                             features)
            features = []
            times = []
            internal_files = []
            i = 0
        i = i + 1
        # load the array itself, not the filename
        data = np.load(join(input_path, f))
        features.append(data)
        if timefunc is None:
            time = np.arange(data.shape[0], dtype=float) * 0.01 + 0.0025
        else:
            time = timefunc(f)
        times.append(time)
        internal_files.append(os.path.basename(os.path.splitext(f)[0]))

    if features:
        h5features.write(h5f, '/features/', internal_files, times, features)
def setup(self):
    items, self.data, feats = generate.full(10, tformat=1)
    self.filename = 'test.h5'
    self.teardown()
    write(self.filename, 'group', items, self.data, feats)
    self.group = h5py.File(self.filename, 'a')['group']
def test_normalization_with_VAD(self):
    # paths
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    vad_file = str(tempdir / 'vad')

    # write VAD data for file 1
    with open(vad_file, 'w') as vad1:
        vad1.write("file,start,stop\n"
                   "file1,0.0025,0.5000\n"
                   "file1,0.7525,1.000\n")

    items = ['file1', 'file2']

    # generate data
    feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    features = [feature1, feature2]
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=True)
    mean, var = features_generator.mean_variance_normalisation(
        h5f, h5f_mean_var, vad_file=vad_file)

    assert mean == pytest.approx(
        np.mean(np.vstack([feature1[:75], feature2]), axis=0))
    assert var == pytest.approx(
        np.std(np.vstack([feature1[:75], feature2]), axis=0))

    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - mean) / var)
    assert data.dict_features()['file2'] == pytest.approx(
        (feature2 - mean) / var)

    # test no per channel
    tmp2 = str(tempdir / 'tmp2.h5')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=False)
    mean, var = features_generator.mean_variance_normalisation(
        h5f, tmp2, vad_file=vad_file)

    assert mean == pytest.approx(
        np.mean(np.vstack([feature1[:75], feature2])))
    assert var == pytest.approx(
        np.std(np.vstack([feature1[:75], feature2])))

    reader = h5features.Reader(tmp2)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - mean) / var)
    assert data.dict_features()['file2'] == pytest.approx(
        (feature2 - mean) / var)

    shutil.rmtree(str(tempdir))
def test_norm_per_file_with_VAD(self):
    # paths
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    vad_path = str(tempdir / 'vad')

    # write VAD data for file 1
    with open(str(vad_path), 'w') as vad1:
        vad1.write("file,start,stop\n"
                   "file1,0.0025,0.5000\n"
                   "file1,0.7525,1.000\n")

    items = ['file1', 'file2']

    # generate data
    feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    features = [feature1, feature2]
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=True)
    meansvars = features_generator.mean_var_norm_per_file(
        h5f, h5f_mean_var, vad_file=str(vad_path))

    assert meansvars[0][0] == 'file1'
    assert all(meansvars[0][1] == np.mean(feature1[:75], axis=0))
    assert all(meansvars[0][2] == np.std(feature1[:75], axis=0))
    assert meansvars[1][0] == 'file2'
    assert all(meansvars[1][1] == np.mean(feature2, axis=0))
    assert all(meansvars[1][2] == np.std(feature2, axis=0))

    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))
    assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
    assert np.std(data.dict_features()['file2']) == pytest.approx(1)

    # test no per channel
    features_generator = FeaturesGenerator(
        normalization=True,
        norm_per_file=True,
        norm_per_channel=False,
    )
    tmp2 = str(tempdir / 'tmp2.h5')
    meansvars = features_generator.mean_var_norm_per_file(
        h5f, tmp2, vad_file=str(vad_path))
    assert meansvars == [
        ('file1', np.mean(feature1[:75]), np.std(feature1[:75])),
        ('file2', np.mean(feature2), np.std(feature2)),
    ]

    reader = h5features.Reader(tmp2)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))
    assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
    assert np.std(data.dict_features()['file2']) == pytest.approx(1)

    shutil.rmtree(str(tempdir))
def lattice2features(phones_file, post_file, out_file, out_phones_file,
                     word_position_dependent=True):
    """
    Convert kaldi lattice posteriors to h5features.

    This loads everything into memory, but it would be easy to write an
    incremental version if this poses a problem.
    """
    phonemap = read_kaldi_phonemap(phones_file, word_position_dependent)
    # get order in which phones will be represented in the dimensions of
    # the posteriorgram
    phone_order = get_phone_order(phonemap)
    d = len(phone_order)  # posteriorgram dimension

    # below is basically a parser for the kaldi matrix format, one line at
    # a time
    # parse input text file
    with codecs.open(post_file, mode='r', encoding='UTF-8') as inp:
        # xreadlines supposed to be more efficient for large files?
        lines = inp.readlines()

    # here it would be nice to use a sparse feature format (need to have it
    # in h5features though); might want to begin by using a sparse numpy
    # matrix format
    features = []
    utt_ids = []
    times = []
    for index, line in enumerate(lines):
        print("Processing line {0} / {1}".format(index + 1, len(lines)))
        tokens = line.strip().split(u" ")
        utt_id, tokens = tokens[0], tokens[1:]
        frames = []
        inside = False
        for token in tokens:
            if token == u"[":
                assert not inside
                inside = True
                frame = []
            elif token == u"]":
                assert inside
                inside = False
                frames.append(frame)
            else:
                assert inside
                frame.append(token)
        utt_features = np.zeros(shape=(len(frames), d), dtype=np.float64)
        for f, frame in enumerate(frames):
            assert len(frame) % 2 == 0
            probas = [float(p) for p in frame[1::2]]
            phones = [phonemap[code] for code in frame[::2]]
            # optimisation 1 would be mapping directly a given code to a
            # given posterior dim
            for phone, proba in zip(phones, probas):
                i = phone_order.index(phone)
                # add to previous proba since different variants of a same
                # phone will map to the same dimension i of the
                # posteriorgram
                utt_features[f, i] = utt_features[f, i] + proba
        # normalize posteriorgrams to correct for rounding or thresholding
        # errors by rescaling globally
        total_proba = np.sum(utt_features, axis=1)
        if np.max(np.abs(total_proba - 1)) >= 1e-5:
            # ad hoc numerical tolerance...
            raise IOError(
                "In utterance {0}, frame {1}: posteriorgram does not sum "
                "to one, difference is {2}".format(
                    utt_id, f, np.max(np.abs(total_proba - 1))))
        utt_features = utt_features / np.tile(total_proba, (d, 1)).T
        features.append(utt_features)
        utt_ids.append(utt_id)
        # as in kaldi2abkhazia, this is ad hoc and has not been checked
        # formally
        times.append(0.0125 + 0.01 * np.arange(len(frames)))
    h5features.write(out_file, 'features', utt_ids, times, features)

    o_ph = codecs.open(out_phones_file, encoding="utf-8", mode="w")
    for ph in phone_order:
        o_ph.write(ph + u'\n')