def test_normalization_per_file(self):
    """Check per-file mean/variance normalization, per channel then global.

    Writes two synthetic feature matrices to an h5features file, runs
    mean_var_norm_per_file twice (per-channel and whole-spectrum) and
    checks both the returned (item, mean, std) triples and that the
    normalized output has zero mean / unit variance within each file.
    """
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')

    # two 200x40 matrices: file1 is half +1 / half -1 (mean 0),
    # file2 is half +1 / half +2 (mean 1.5)
    feature1 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), -1.)])
    feature2 = np.vstack([np.full((100, 40), 1.), np.full((100, 40), 2.)])
    features = [feature1, feature2]
    items = ['file1', 'file2']

    # frame times: 10 ms steps, offset by +2.5 ms
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True)
    meansvars = features_generator.mean_var_norm_per_file(
        h5f, h5f_mean_var)

    # per-channel statistics are returned for each file
    assert meansvars[0][0] == 'file1'
    assert all(meansvars[0][1] == np.mean(feature1, axis=0))
    assert all(meansvars[0][2] == np.std(feature1, axis=0))
    assert meansvars[1][0] == 'file2'
    assert all(meansvars[1][1] == np.mean(feature2, axis=0))
    assert all(meansvars[1][2] == np.std(feature2, axis=0))

    # normalized output must have zero mean / unit variance per file
    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    for file in data.items():
        assert np.mean(data.dict_features()[file]) == pytest.approx(0)
        assert np.std(data.dict_features()[file]) == pytest.approx(1)

    # no per channel
    tmp2 = str(tempdir / 'h5-tmp2')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=False)
    meansvars = features_generator.mean_var_norm_per_file(h5f, tmp2)
    # scalar (whole-spectrum) statistics this time
    assert meansvars == [
        ('file1', 0, np.std(feature1)),
        ('file2', 1.5, np.std(feature2)),
    ]
    reader = h5features.Reader(tmp2)
    data = reader.read()
    for file in data.items():
        assert np.mean(data.dict_features()[file]) == pytest.approx(0)
        assert np.std(data.dict_features()[file]) == pytest.approx(1)
    shutil.rmtree(str(tempdir))
def test_rw_one_frame_2D(tmpdir):
    """Round-trip a dataset holding a single 2D frame."""
    path = os.path.join(str(tmpdir), 'exemple.h5')
    expected = generate.full_data(1, 3, 1, 2)
    h5f.Writer(path).write(expected)
    loaded = h5f.Reader(path).read()
    assert loaded == expected
def read_feats(features_file, align_features_file=None):
    """Load features (and optionally alignment features) from h5features files.

    Parameters
    ----------
    features_file : str
        Path to the h5features file holding the main features.
    align_features_file : str, optional
        Path to the h5features file holding the alignment features; when
        None, no alignment features are loaded.

    Returns
    -------
    (features, align_features, feat_dim) where `features` and
    `align_features` are Features_Accessor instances (`align_features` is
    None when no file was given) and `feat_dim` is the feature dimension.
    """
    with h5features.Reader(features_file, 'features') as fh:
        features = fh.read()  # load all at once here...
    times = features.dict_labels()
    feats = features.dict_features()
    # feature dimension taken from the first item's (nframes, dim) array
    feat_dim = feats[list(feats.keys())[0]].shape[1]
    features = Features_Accessor(times, feats)
    if align_features_file is None:
        align_features = None
    else:
        # BUG FIX: this branch previously re-read `features_file`
        # instead of `align_features_file`, so alignment features
        # silently duplicated the main features.
        with h5features.Reader(align_features_file, 'features') as fh:
            align_features = fh.read()  # load all at once here...
        times = align_features.dict_labels()
        feats = align_features.dict_features()
        align_features = Features_Accessor(times, feats)
    return features, align_features, feat_dim
def test_init_not_hdf(self):
    """A file that is not in HDF5 format must be rejected at construction."""
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(b'This is not a HDF5 file')
    with pytest.raises(IOError) as err:
        h5f.Reader(tmp.name, self.groupname)
    assert 'not a HDF5 file' in str(err.value)
    remove(tmp.name)
def test_read_tofromtimes(tmpdir, dim):
    """Reading with from_time/to_time restricts labels to that interval."""
    filename = os.path.join(str(tmpdir), 'test.h5f')
    group = 'group'
    original = generate.full_data(1, dim, 300)
    h5f.Writer(filename, mode='w').write(original, groupname=group)

    # a plain read returns everything
    assert h5f.Reader(filename, group).read() == original

    # reading over the whole [0, 1] span is equivalent to a plain read
    full_span = h5f.Reader(filename, group).read(from_time=0, to_time=1)
    assert full_span == original

    # a narrower window clips the labels to [0.4, 0.5]
    clipped = h5f.Reader(filename, group).read(from_time=0.4, to_time=0.5)
    assert clipped.labels()[0][0] >= 0.4
    assert clipped.labels()[0][-1] <= 0.5
def test_write_mode(tmpdir, mode, append):
    """Check Writer mode/append combinations leave the data unchanged.

    Appending twice with mode='a' must fail; any other combination must
    rewrite the group and preserve the input data object.
    """
    # BUG FIX: was `os.path.join(str(tmpdir) + 'test.h5')` — a single
    # concatenated argument, placing the file OUTSIDE the tmpdir
    # (e.g. '/tmp/pytest-0test.h5') and defeating test isolation.
    h5file = os.path.join(str(tmpdir), 'test.h5')
    data = generate.full_data(1, 3, 1, 2)
    copy_data = copy.deepcopy(data)
    h5f.Writer(h5file).write(data, append=False)
    assert h5f.Reader(h5file).read() == data

    # write data a second time
    if mode == 'a' and append is True:
        # appending items already present must be refused
        with pytest.raises(IOError):
            h5f.Writer(h5file, mode=mode).write(data, append=append)
    else:
        h5f.Writer(h5file, mode=mode).write(data, append=append)
        # writing must not mutate the input data
        assert data == copy_data
        assert h5f.Reader(h5file).read() == copy_data
def mean_var_norm_per_file(self, h5f, mvn_h5f, vad_file=None):
    """Mean/variance-normalize each file of an h5features archive separately.

    Parameters
    ----------
    h5f : str
        Input h5features file.
    mvn_h5f : str
        Output h5features file receiving the normalized features.
    vad_file : str, optional
        VAD segmentation file; when given, the statistics of a file are
        computed on its speech frames only (the normalization is still
        applied to all frames).

    Returns
    -------
    list of (item, mean, std) tuples, one per file in the archive.
    """
    # normalize either per channel or on the whole spectrum.
    axis = 0 if self.norm_per_channel else None

    # FIX: the HDF5 file was previously opened twice, without an explicit
    # mode and never closed; open it once, read-only, and close properly.
    with h5py.File(h5f, 'r') as fh:
        dset_name = list(fh.keys())[0]
        files = list(fh[dset_name]['items'])

    # FIX: hoist the loop-invariant VAD parsing out of the per-file loop.
    vad_data = read_vad_file(vad_file) if vad_file is not None else None

    reader = h5features.Reader(h5f)
    means_vars = []
    for f in files:
        data = reader.read(from_item=f)
        items = data.items()
        features = data.features()[0]
        times = data.labels()[0]

        # compute statistics on VAD-filtered frames when available
        stats_source = features
        if vad_data is not None and str(f) in vad_data:
            filtered = self.filter_vad_one_file(
                features, times, vad_data[str(f)])
            if filtered is not None:
                stats_source = filtered

        mean = np.mean(stats_source, axis=axis)
        std = np.std(stats_source, axis=axis)

        # eps avoids division by zero on constant channels
        features = (features - mean) / (std + np.finfo(features.dtype).eps)
        h5features.write(mvn_h5f, '/features/', items, [times], [features])
        means_vars.append((f, mean, std))
    return means_vars
def test_features(pitch, ftype, corpus, tmpdir):
    """End-to-end feature extraction on a 3-utterance subcorpus.

    Computes features of type `ftype` (optionally with pitch), converts
    the result to h5features and checks the channel count and that the
    utterance ids are preserved.
    """
    output_dir = str(tmpdir.mkdir('feats'))
    flog = os.path.join(output_dir, 'feats.log')
    log = utils.logger.get_log(flog)

    # keep only 3 utterances for testing speed
    subcorpus = corpus.subcorpus(list(corpus.utts())[0:3])
    assert len(list(subcorpus.utts())) == 3

    # mfcc with few channels
    nbc = 3
    feat = features.Features(subcorpus, output_dir, log=log)
    feat.type = ftype
    feat.njobs = 1
    feat.use_pitch = pitch
    feat.delete_recipe = False
    # the option name depends on the feature type
    feat.features_options.append(
        ('num-ceps' if ftype in ('mfcc', 'plp') else 'num-mel-bins', nbc))

    try:
        feat.compute()
    except RuntimeError as err:
        # on failure, dump the kaldi logs to stdout to ease debugging
        import sys
        sys.stdout.write(open(flog, 'r').read())
        sys.stdout.write(
            open(
                os.path.join(
                    output_dir, 'recipe',
                    'exp/make_mfcc/features/make_mfcc_pitch_features.1.log'),
                'r').read())
        sys.stdout.write(
            open(os.path.join(output_dir, 'recipe/conf/mfcc.conf'),
                 'r').read())
        raise err

    # # actually ERROR is in the vocabulary so this test fails...
    # assert_no_expr_in_log(flog, 'error')

    # basic asserts on files
    assert os.path.isfile(os.path.join(output_dir, 'meta.txt'))
    features.Features.check_features(output_dir)

    # convert to h5features and read it back
    h5 = os.path.join(output_dir, 'feats.h5')
    ark.scp_to_h5f(os.path.join(output_dir, 'feats.scp'), h5)
    data = h5features.Reader(h5, 'features').read()

    # check we have nbc or nbc+3 channels
    dim = data.features()[0].shape[1]
    exp = nbc + 3 if pitch else nbc
    assert dim == exp, 'bad dim: {}, expected {}'.format(dim, exp)

    # check utt_ids in h5f are consistent with corpus
    times = data.dict_labels()
    assert len(times.keys()) == len(subcorpus.utts())
    for t, c in zip(times.keys(), subcorpus.utts()):
        assert t == c
def test_const_on_write(self, tmpdir, mode, append):
    # A Data instance must not change before/after writing it to a
    # group
    # BUG FIX: was `os.path.join(str(tmpdir) + 'test.h5')` — a single
    # concatenated argument, placing the file OUTSIDE the tmpdir and
    # defeating test isolation.
    h5file = os.path.join(str(tmpdir), 'test.h5')

    # first write
    assert self.data.items() == self.items
    h5f.Writer(h5file, mode=mode).write(self.data, append=append)
    assert self.data.items() == self.items
    assert h5f.Reader(h5file).read() == self.data

    # second write of the same data
    if mode == 'a' and append is True:
        # appending already-present items must be refused
        with pytest.raises(IOError):
            h5f.Writer(h5file, mode=mode).write(self.data, append=append)
    else:
        h5f.Writer(h5file, mode=mode).write(self.data, append=append)
        assert self.data.items() == self.items
        assert h5f.Reader(h5file).read() == self.data
def embed(self):
    """ Embed method to embed features based on a saved network

    Reads every modality file in ``self.feature_path``, forwards the
    per-item features of all modalities jointly through the network and
    writes the embeddings to ``self.output_path + "embedded.features"``.
    """
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()

    items = None
    times = None
    features_list = []
    # items/times are taken from the first file read; assumes all
    # modality files share the same items and times — TODO confirm
    for path in self.feature_path:
        with h5features.Reader(path, 'features') as fh:
            features = fh.read()
        features_list.append(features.features())
        check_items = features.items()
        check_times = features.labels()
        if not items:
            items = check_items
        if not times:
            times = check_times
    print("Done loading input feature file")

    # iterate items across all modalities in lockstep
    zipped_feats = zip(*features_list)
    embeddings = []
    for feats in zipped_feats:
        modes_list = []
        for feat in feats:
            # the network expects float32 input
            if feat.dtype != np.float32:
                feat = feat.astype(np.float32)
            feat_torch = Variable(torch.from_numpy(feat), volatile=True)
            if self.cuda:
                feat_torch = feat_torch.cuda()
            modes_list.append(feat_torch)
        emb, _ = self.network(modes_list, modes_list)
        emb = emb.cpu()
        embeddings.append(emb.data.numpy())
        #Register activity on observer
        for observer in self.observers:
            observer.register_status()

    data = h5features.Data(items, times, embeddings, check=True)
    with h5features.Writer(self.output_path + "embedded.features") as fh:
        fh.write(data, 'features')
    #Save observer registers
    for observer in self.observers:
        observer.save(items, times)
def test_from_exemple(tmpdir):
    """Round-trip: data built from generate.full is read back unchanged."""
    path = os.path.join(str(tmpdir), 'exemple.h5')
    parts = generate.full(100)
    written = h5f.Data(parts[0], parts[1], parts[2])
    h5f.Writer(path).write(written, 'group')
    with h5f.Reader(path, 'group') as reader:
        loaded = reader.read()
    assert len(loaded.items()) == 100
    assert written == loaded
def test_h5f_name_of_utterance(tmpdir, data, name):
    """Ark-to-h5features conversion must preserve arbitrary utterance names."""
    data = {name: data['test']}
    ark_path = os.path.join(str(tmpdir), 'ark')
    io.dict_to_ark(ark_path, data)

    # convert the ark file to an h5features file
    h5file = os.path.join(str(tmpdir), 'h5f')
    io.ark_to_h5f([ark_path], h5file)

    # the utterance is retrievable under its original name
    loaded = h5f.Reader(h5file).read()
    assert np.allclose(loaded.dict_features()[name], data[name])
def _load(self, groupname='features'):
    """Load all features from the h5features file into a features collection.

    Parameters
    ----------
    groupname : str, optional
        Name of the group to read in the file (default 'features').

    Returns
    -------
    A features collection mapping each item name to a features instance
    built from its data, labels and properties.
    """
    self._log.info('loading %s', self.filename)
    data = h5features.Reader(self.filename, groupname=groupname).read()
    features = self._features_collection()
    # iterate items/features/labels/properties in lockstep instead of
    # indexing each list by position
    for item, feats, labels, props in zip(
            data.items(), data.features(),
            data.labels(), data.properties()):
        features[item] = self._features(
            feats, labels, properties=props, validate=False)
    return features
def write_kl_to_column(distance_list, PG_file, root):
    """Write DTW-KL divergence distances into the original table.

    For each of the 112 triplets stored in the h5features file
    ``root + PG_file``, computes the KL divergence of OTH-X and TGT-X
    pairs and appends the two resulting columns to `distance_list`.
    Returns the updated table.
    """
    # NOTE(review): plain string concatenation — assumes `root` ends with
    # a path separator; consider os.path.join
    hf5_file = root + PG_file
    times_r, features_r = h5f.read(hf5_file, 'features')
    # full list of item names stored in the file
    items = h5f.Reader(hf5_file, 'features').items.data[0:]
    oth_x_array = np.array([])
    tgt_x_array = np.array([])
    for TRIP_NUM in range(1, 113):
        # select only item names which correspond to same triplet
        trip_id = 'triplet' + str('{0:03}'.format(TRIP_NUM))
        trip_items = [itm for itm in items if trip_id in itm]
        # trace the 01 = OTH, 02 = TGT, 03 = X
        item_oth = [oth for oth in trip_items if '_01' in oth][0]
        item_tgt = [tgt for tgt in trip_items if '_02' in tgt][0]
        item_x = [x for x in trip_items if '_03' in x][0]
        # find vectors
        feat_vector_oth = features_r[item_oth]
        feat_vector_tgt = features_r[item_tgt]
        feat_vector_x = features_r[item_x]
        # time_vector = times_r[item]
        # get KL divergence for TGT-X and OTH-X
        kl_oth_x = dtw_kl_divergence(feat_vector_oth, feat_vector_x)
        kl_tgt_x = dtw_kl_divergence(feat_vector_tgt, feat_vector_x)
        # put them into an array
        oth_x_array = np.append(oth_x_array, kl_oth_x)
        tgt_x_array = np.append(tgt_x_array, kl_tgt_x)
    # append the two distance columns to the table, aligned on its index
    name_othX = PG_file.split('.')[0] + '_oth_x'
    name_tgtX = PG_file.split('.')[0] + '_tgt_x'
    distance_list[name_othX] = pd.Series(oth_x_array,
                                         index=distance_list.index)
    distance_list[name_tgtX] = pd.Series(tgt_x_array,
                                         index=distance_list.index)
    return distance_list
def embed(self):
    """ Embed method to embed features based on a saved network

    Reads the input h5features file, forwards each item through the
    network in mini-batches of roughly ``self.batch_size`` frames and
    writes the stacked embeddings to ``self.output_path``.
    """
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()
    print("Done loading network weights")

    with h5features.Reader(self.feature_path, 'features') as fh:
        features = fh.read()
    items = features.items()
    times = features.labels()
    feats = features.features()
    print("Done loading input feature file")

    embeddings = []
    for feat in feats:
        # the network expects float32 input
        if feat.dtype != np.float32:
            feat = feat.astype(np.float32)
        # split the item's frames into batch-sized chunks; array_split
        # tolerates a count that does not divide the length evenly
        n_batches = len(feat) // self.batch_size + 1
        batches_feat = np.array_split(feat, n_batches)
        outputs = []
        for b_feat in batches_feat:
            feat_torch = Variable(torch.from_numpy(b_feat), volatile=True)
            if self.cuda:
                feat_torch = feat_torch.cuda()
            emb, _ = self.network(feat_torch, feat_torch)
            emb = emb.cpu()
            outputs.append(emb.data.numpy())
        # reassemble the per-batch outputs into one array per item
        outputs = np.vstack(outputs)
        embeddings.append(outputs)

    data = h5features.Data(items, times, embeddings, check=True)
    with h5features.Writer(self.output_path) as fh:
        fh.write(data, 'features')
def embed(self):
    """ Embed method to embed features based on a saved network

    Produces two embeddings per item — speaker and phoneme — and writes
    them to ``self.output_path + '.spk'`` and ``self.output_path +
    '.phn'`` respectively.
    """
    if self.network_path is not None:
        self.network.load_network(self.network_path)
    self.network.eval()
    if self.cuda:
        self.network.cuda()

    with h5features.Reader(self.feature_path, 'features') as fh:
        features = fh.read()
    items = features.items()
    times = features.labels()
    feats = features.features()

    embeddings_spk, embeddings_phn = [], []
    for feat in feats:
        # the network expects float32 input
        if feat.dtype != np.float32:
            feat = feat.astype(np.float32)
        feat_torch = Variable(torch.from_numpy(feat), volatile=True)
        if self.cuda:
            feat_torch = feat_torch.cuda()
        # the network returns a speaker and a phoneme embedding
        emb_spk, emb_phn, _, _ = self.network(feat_torch, feat_torch)
        emb_spk = emb_spk.cpu()
        emb_phn = emb_phn.cpu()
        embeddings_spk.append(emb_spk.data.numpy())
        embeddings_phn.append(emb_phn.data.numpy())

    data_spk = h5features.Data(items, times, embeddings_spk, check=True)
    data_phn = h5features.Data(items, times, embeddings_phn, check=True)
    with h5features.Writer(self.output_path+'.spk') as fh:
        fh.write(data_spk, 'features')
    with h5features.Writer(self.output_path+'.phn') as fh:
        fh.write(data_phn, 'features')
def test_h5f_twice(tmpdir, data):
    """Converting two ark files into one h5f group merges their items."""
    # write the array as an ark file
    ark = os.path.join(str(tmpdir), 'ark')
    ark2 = os.path.join(str(tmpdir), 'ark2')
    io.dict_to_ark(ark, data)
    # second ark holds the same data under '_2'-suffixed names
    io.dict_to_ark(ark2, {k+'_2': v for k, v in data.items()})

    # convert it to h5features file
    h5file = os.path.join(str(tmpdir), 'h5f')
    io.ark_to_h5f([ark, ark2], h5file, 'test')

    # get back data from h5f
    data2 = h5f.Reader(h5file, 'test').read()
    # items from both arks, stored in sorted order
    assert data2.items() == ['test', 'test2', 'test2_2', 'test_2']
    assert data2.dict_labels()['test'].shape[0] == data['test'].shape[0]
    assert data['test'].shape == data2.dict_features()['test'].shape
    assert data['test'].shape == data2.dict_features()['test2_2'].shape
    assert np.allclose(data2.dict_features()['test'], data['test'])
    assert np.allclose(data2.dict_features()['test2_2'], data['test2'])

    # test writing in an existing group
    with pytest.raises(AssertionError):
        io.ark_to_h5f([ark], h5file, 'test')
# Append a second batch of items to 'group2'.
data2 = generate_data(10, base='item2')
writer.write(data2, 'group2', append=True)

# If append is not True, existing data in the group is overwrited.
data3 = generate_data(10, base='item3')
writer.write(data3, 'group2', append=True)  # 120 items
writer.write(data3, 'group2')  # 10 items

##########################
# Reading data from a file
##########################

# Initialize a reader and load the entire group. A notable difference
# with the Writer is that a Reader is attached to a specific group of
# a file. This allows optimized read operations.
rdata = h5f.Reader('exemple.h5', 'group1').read()

# Hopefully we read the same data we just wrote
assert rdata == data

# Some more advance reading facilities
with h5f.Reader('exemple.h5', 'group1') as reader:
    # Same as before, read the whole data
    whole_data = reader.read()

    # Read the first item stored on the group.
    first_item = reader.items.data[0]
    rdata = reader.read(first_item)
    assert len(rdata.items()) == 1

    # Read an interval composed of the 10 first items.
def test_normalization_with_VAD(self):
    """Global mean/variance normalization computed on VAD (speech) frames.

    file1 has a VAD segmentation keeping its first 75 frames; file2 has
    none, so all its frames count. The statistics must match numpy
    computed on the kept frames, and the output must be normalized with
    those statistics.
    """
    # paths
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    vad_file = str(tempdir / 'vad')

    # write VAD data for file 1
    with open(vad_file, 'w') as vad1:
        vad1.write("file,start,stop\n"
                   "file1,0.0025,0.5000\n"
                   "file1,0.7525,1.000\n")

    items = ['file1', 'file2']

    # generate data
    feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    features = [feature1, feature2]
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=True)
    mean, var = features_generator.mean_variance_normalisation(
        h5f, h5f_mean_var, vad_file=vad_file)

    # statistics computed on file1's 75 VAD frames plus all of file2
    assert mean == pytest.approx(
        np.mean(np.vstack([feature1[:75], feature2]), axis=0))
    assert var == pytest.approx(
        np.std(np.vstack([feature1[:75], feature2]), axis=0))

    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - mean) / var)
    assert data.dict_features()['file2'] == pytest.approx(
        (feature2 - mean) / var)

    ## test no per channel
    tmp2 = str(tempdir / 'tmp2.h5')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=False)
    mean, var = features_generator.mean_variance_normalisation(
        h5f, tmp2, vad_file=vad_file)
    # scalar (whole-spectrum) statistics this time
    assert mean == pytest.approx(
        np.mean(np.vstack([feature1[:75], feature2])))
    assert var == pytest.approx(
        np.std(np.vstack([feature1[:75], feature2])))
    reader = h5features.Reader(tmp2)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - mean) / var)
    assert data.dict_features()['file2'] == pytest.approx(
        (feature2 - mean) / var)
    shutil.rmtree(str(tempdir))
def test_norm_per_file_with_VAD(self):
    """Per-file normalization with VAD: file1 stats use speech frames only.

    file1 has a VAD segmentation keeping its first 75 frames; file2 has
    no VAD entry, so its statistics are computed on all frames.
    """
    # paths
    tempdir = Path(tempfile.mkdtemp())
    h5f = str(tempdir / 'h5.features')
    vad_path = str(tempdir / 'vad')

    # write VAD data for file 1
    with open(str(vad_path), 'w') as vad1:
        vad1.write("file,start,stop\n"
                   "file1,0.0025,0.5000\n"
                   "file1,0.7525,1.000\n")

    items = ['file1', 'file2']

    # generate data
    feature1 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    feature2 = np.vstack([np.full((50, 40), 1.0), np.full((50, 40), -1.0)])
    features = [feature1, feature2]
    times = [np.arange(feature1.shape[0], dtype=float) * 0.01 + 0.0025]
    times.append(np.arange(feature2.shape[0], dtype=float) * 0.01 + 0.0025)
    h5features.write(h5f, '/features/', items, times, features)

    h5f_mean_var = str(tempdir / 'h5-normalized.features')
    features_generator = FeaturesGenerator(normalization=True,
                                           norm_per_file=True,
                                           norm_per_channel=True)
    meansvars = features_generator.mean_var_norm_per_file(
        h5f, h5f_mean_var, vad_file=str(vad_path))

    # file1 statistics computed on its 75 VAD frames only
    assert meansvars[0][0] == 'file1'
    assert all(meansvars[0][1] == np.mean(feature1[:75], axis=0))
    assert all(meansvars[0][2] == np.std(feature1[:75], axis=0))
    # file2 has no VAD entry: statistics on all frames
    assert meansvars[1][0] == 'file2'
    assert all(meansvars[1][1] == np.mean(feature2, axis=0))
    assert all(meansvars[1][2] == np.std(feature2, axis=0))

    reader = h5features.Reader(h5f_mean_var)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))
    assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
    assert np.std(data.dict_features()['file2']) == pytest.approx(1)

    # test no per channel
    features_generator = FeaturesGenerator(
        normalization=True,
        norm_per_file=True,
        norm_per_channel=False,
    )
    tmp2 = str(tempdir / 'tmp2.h5')
    meansvars = features_generator.mean_var_norm_per_file(
        h5f, tmp2, vad_file=str(vad_path))
    # scalar (whole-spectrum) statistics this time
    assert meansvars == [
        ('file1', np.mean(feature1[:75]), np.std(feature1[:75])),
        ('file2', np.mean(feature2), np.std(feature2)),
    ]
    reader = h5features.Reader(tmp2)
    data = reader.read()
    assert data.dict_features()['file1'] == pytest.approx(
        (feature1 - np.mean(feature1[:75])) / np.std(feature1[:75]))
    assert np.mean(data.dict_features()['file2']) == pytest.approx(0)
    assert np.std(data.dict_features()['file2']) == pytest.approx(1)
    shutil.rmtree(str(tempdir))
def test_read_time(self):
    """Reading over the [0, 1] time span equals an unconstrained read."""
    reader = h5f.Reader(self.filename, self.groupname)
    windowed = reader.read(from_time=0, to_time=1)
    full = reader.read()
    assert windowed == full
def test_init_basic(self):
    """A freshly opened reader exposes version, format and item count."""
    rdr = h5f.Reader(self.filename, self.groupname)
    assert rdr.version == '1.1'
    assert rdr.dformat == 'dense'
    assert len(rdr.items.data) == self.nitems
def test_groupname_is_none(self):
    """Passing None as the group name still reads the data correctly."""
    loaded = h5f.Reader(self.filename, None).read()
    assert loaded == self.data
def test_read_basic(self):
    """A full read returns exactly the data that was written."""
    loaded = h5f.Reader(self.filename, self.groupname).read()
    assert loaded == self.data
def test_init_not_group(self):
    """A group name absent from the file must raise an IOError."""
    bad_group = self.groupname + 'spam'
    with pytest.raises(IOError) as err:
        h5f.Reader(self.filename, bad_group)
    assert 'not a valid group' in str(err.value)
def test_init_not_file(self):
    """A filename that does not exist must raise an IOError."""
    bad_name = self.filename + 'spam'
    with pytest.raises(IOError) as err:
        h5f.Reader(bad_name, self.groupname)
    assert 'not a HDF5 file' in str(err.value)