def compute_percentile_param(percentile, features_file, comparison='within',
                             samples=100000, with_vad=False, lsh=False):
    """Compute the threshold parameter from a percentile value

    percentile: int or list of int
    """
    assert comparison in set(['general', 'within', 'across'])
    if comparison != 'within':
        raise NotImplementedError
    if not lsh:
        if not with_vad:
            features_dict = h5features.read(features_file)[1]
        else:
            features_dict = {}
            vad = read_vad(with_vad)
            for f, intervals in vad.items():
                feats = [h5features.read(features_file, from_item=f,
                                         from_time=interval[0],
                                         to_time=interval[1])[1][f]
                         for interval in intervals]
                features_dict[f] = np.concatenate(feats, axis=0)
    else:
        features_dict = read_sigs_remove_sils(features_file)
    # else:
    #     features_dict = {}
    #     vad = read_vad(with_vad)
    #     for f, intervals in vad.iteritems():
    #         feats = []
    #         for interval in intervals:
    #             interval = (np.array(interval) * 100).astype(int)
    #             feats.append(np.fromfile(features_file[f]['sig'],
    #                                      dtype=dtype)[interval[0]:interval[1]])
    #         features_dict[f] = np.concatenate(feats, axis=0)
    samples_per_file = samples // len(features_dict)

    def estimate_similarities_distribution(feats1, feats2, n_samples, S=False):
        samples1 = np.random.choice(feats1.shape[0], n_samples)
        samples2 = np.random.choice(feats2.shape[0], n_samples)
        sampled_feats1 = feats1[samples1]
        sampled_feats2 = feats2[samples2]
        if not S:
            fun = do_cosine_similarity
        else:
            fun = lambda x, y: do_norm_hamming_sim(x, y, S)
        return fun(sampled_feats1, sampled_feats2)

    similarities = []
    for feats in features_dict.values():
        S = False
        if lsh:
            S = features_file.stats['S']
        similarities.append(estimate_similarities_distribution(
            feats, feats, samples_per_file, S=S))
    similarities = np.concatenate(similarities)
    return np.percentile(similarities, percentile)
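# A minimal usage sketch (not from the source): 'mfcc.h5f' is a placeholder
# h5features file and do_cosine_similarity is assumed to be defined in the
# surrounding module. With the defaults (no VAD, no LSH) this samples frame
# pairs within each file and returns the 90th percentile of their cosine
# similarities, to be used as the threshold parameter.
threshold = compute_percentile_param(90, 'mfcc.h5f', comparison='within',
                                     samples=50000)
print('90th percentile similarity: {}'.format(threshold))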
def test_read_works(self):
    fname = self.file_v1
    t1, f1 = h5f_1_0.read(fname, 'features')
    t2, f2 = h5f_1_1.read(fname, 'features')
    for tt1, tt2 in zip(t1, t2):
        assert tt1 == tt2
    for ff1, ff2 in zip(f1, f2):
        assert ff1 == ff2
def do_transform(fname):
    times, feats = h5features.read(stacked_fb_file, from_internal_file=fname,
                                   index=index)
    times = times[fname]
    feats = feats[fname]
    X = np.asarray((feats - mean) / std, dtype='float32')
    # times = np.arange(0.01, 0.01*npz.shape[0], 0.01)
    emb_wrd, emb_spkr = transform(X)
    return emb_wrd
def h5features_stack_fbanks(fbanks_file, stacked_fbanks_file):
    import h5features
    index = h5features.read_index(fbanks_file)
    files = index['files']
    for f in files:
        times, fbanks = h5features.read(
            fbanks_file, 'features', from_internal_file=f, index=index)
        stacked_fbanks = stack_fbanks(fbanks[f])
        h5features.write(stacked_fbanks_file, 'features',
                         [f], [times[f]], [stacked_fbanks])
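# Hedged usage sketch: 'fbanks.h5f' and 'stacked_fbanks.h5f' are placeholder
# paths and stack_fbanks() is assumed to be defined in the surrounding module.
# Each item of the input file is read in turn, its filterbank frames are
# stacked, and the result is appended to the output h5features file.
h5features_stack_fbanks('fbanks.h5f', 'stacked_fbanks.h5f')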
def _test_wr(self, labeldim):
    """Test retrieving labels and files after a write/read operation."""
    items, t_gold, feat = generate.full(self.nbitems, tformat=labeldim)
    write(self.filename, self.group, items, t_gold, feat)
    t, _ = read(self.filename, self.group)

    assert len(t) == self.nbitems
    if labeldim != 1:
        assert all([tt.shape[1] == labeldim for tt in t.values()])

    # build a dict from gold to compare with t
    d = dict(zip(items, t_gold))
    for dd, tt in zip(d, t):
        assert tt == dd
def write_kl_to_column(distance_list, PG_file, root):
    """Write distances into original table"""
    hf5_file = root + PG_file
    times_r, features_r = h5f.read(hf5_file, 'features')
    items = h5f.Reader(hf5_file, 'features').items.data[0:]
    oth_x_array = np.array([])
    tgt_x_array = np.array([])

    for TRIP_NUM in range(1, 113):
        # select only item names which correspond to the same triplet
        trip_id = 'triplet' + str('{0:03}'.format(TRIP_NUM))
        trip_items = [itm for itm in items if trip_id in itm]

        # trace the 01 = OTH, 02 = TGT, 03 = X
        item_oth = [oth for oth in trip_items if '_01' in oth][0]
        item_tgt = [tgt for tgt in trip_items if '_02' in tgt][0]
        item_x = [x for x in trip_items if '_03' in x][0]

        # find vectors
        feat_vector_oth = features_r[item_oth]
        feat_vector_tgt = features_r[item_tgt]
        feat_vector_x = features_r[item_x]
        # time_vector = times_r[item]

        # get KL divergence for TGT-X and OTH-X
        kl_oth_x = dtw_kl_divergence(feat_vector_oth, feat_vector_x)
        kl_tgt_x = dtw_kl_divergence(feat_vector_tgt, feat_vector_x)

        # put them into an array
        oth_x_array = np.append(oth_x_array, kl_oth_x)
        tgt_x_array = np.append(tgt_x_array, kl_tgt_x)

    name_othX = PG_file.split('.')[0] + '_oth_x'
    name_tgtX = PG_file.split('.')[0] + '_tgt_x'
    distance_list[name_othX] = pd.Series(oth_x_array, index=distance_list.index)
    distance_list[name_tgtX] = pd.Series(tgt_x_array, index=distance_list.index)
    return distance_list
def segment_features(features_file, segments_file, out_file):
    """Segment an h5features file containing features for whole wavefiles
    of an abkhazia corpus (or a split of a corpus) into features for the
    segments described in the provided segments.txt file.
    """
    utt_ids, wavefiles, starts, stops = io.read_segments(segments_file)
    if all([e is None for e in starts]) and all([e is None for e in stops]):
        # TODO use a log instead of a print statement
        print("segment_features: segments already match wavefiles, "
              "doing nothing...")
    else:
        # group utterances by wavefile
        data = zip(utt_ids, wavefiles, starts, stops)
        for wav, utts in groupby(data, lambda e: e[1]):
            # TODO use a log instead of a print statement
            print("Segmenting features for file {} by utterance".format(wav))
            # load features for the whole wavefile
            wav_id = os.path.splitext(wav)[0]  # TODO fix that
            times, features = h5features.read(features_file,
                                              from_internal_file=wav_id)
            # no need for a dict here
            times, features = times[wav_id], features[wav_id]
            utt_ids, utt_times, utt_features = [], [], []
            for utt_id, _, start, stop in utts:
                # select features for the appropriate segment
                utt_ids.append(utt_id)
                indices = np.where(
                    np.logical_and(times >= start, times <= stop))[0]
                # get times relative to the beginning of the utterance
                utt_times.append(times[indices] - start)
                utt_features.append(features[indices, :])
            # write to out_file once for each wavefile
            h5features.write(out_file, 'features',
                             utt_ids, utt_times, utt_features)
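# Illustrative call with hypothetical abkhazia-style paths: cuts per-wavefile
# features into per-utterance features following the segments.txt file.
segment_features('data/features/mfcc.h5f',
                 'data/split/train/segments.txt',
                 'data/features/mfcc_by_utt.h5f')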
def get_features_flat(fn, gn, feature_index, key, stat):
    features_dict = h5features.read(fn, gn, key, index=feature_index)[1]
    # the read returns a one-item dict: take its single feature array
    features = features_dict[next(iter(features_dict))]
    return stat(features)
def time_f(f):
    return h5features.read(fb_h5f, from_item=f)[0][f]
def aux(f):
    return stack_fbanks(h5features.read(fb_h5f, from_item=f)[1][f],
                        nframes=nframes)
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(self.feature_file, from_item=fileid)[1][fileid]
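# For reference, a minimal sketch (not from the source) of what this accessor
# wraps: h5features.read returns a pair of dicts (times, features) keyed by
# item name, so indexing with [1][fileid] picks out the (n_frames, n_dims)
# feature array of a single item. 'speech.h5f' and 'utt_001' are placeholders.
times, features = h5features.read('speech.h5f', from_item='utt_001')
utt_feats = features['utt_001']  # per-frame feature matrix for that item
utt_times = times['utt_001']     # matching frame center times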
def run_distance_job(job_description, distance_file, distance,
                     feature_files, feature_groups, splitted_features,
                     job_id, normalize):
    if lock is None:
        synchronize = False
    else:
        synchronize = True
    if not splitted_features:
        times = {}
        features = {}
        for feature_file, feature_group in zip(feature_files, feature_groups):
            t, f = h5features.read(feature_file, feature_group)
            assert not set(times.keys()).intersection(t.keys()), (
                "The same file is indexed by (at least) two "
                "different feature files")
            times.update(t)
            features.update(f)
        get_features = Features_Accessor(times, features).get_features_from_raw
    pair_file = job_description['pair_file']
    n_blocks = len(job_description['by'])
    for b in range(n_blocks):
        print('Job %d: computing distances for block %d on %d'
              % (job_id, b, n_blocks))
        # get block spec
        by = job_description['by'][b]
        start = job_description['start'][b]
        stop = job_description['stop'][b]
        if splitted_features:
            # FIXME modify feature_file/feature_group to adapt to 'by'
            # FIXME any change needed when several feature files before
            # splitting ?
            times = {}
            features = {}
            for feature_file, feature_group in zip(feature_files,
                                                   feature_groups):
                t, f = h5features.read(feature_file, feature_group)
                assert not set(times.keys()).intersection(t.keys()), (
                    "The same file is indexed by (at least) two "
                    "different feature files")
                times.update(t)
                features.update(f)
            accessor = Features_Accessor(times, features)
            get_features = accessor.get_features_from_splitted
        # load pandas dataframe containing info for loading the features
        if synchronize:
            lock.acquire()
        store = pandas.HDFStore(pair_file)
        by_db = store['feat_dbs/' + by]
        store.close()
        # load pairs to be computed
        # indexed relatively to the above dataframe
        with h5py.File(pair_file) as fh:
            attrs = fh['unique_pairs'].attrs[by]
            pair_list = fh['unique_pairs/data'][attrs[1] + start:
                                                attrs[1] + stop, 0]
            base = attrs[0]
        if synchronize:
            lock.release()
        A = np.mod(pair_list, base)
        B = pair_list // base
        pairs = np.column_stack([A, B])
        n_pairs = pairs.shape[0]
        # get dataframe with one entry by item involved in this block,
        # indexed by its 'by'-specific index
        by_inds = np.unique(np.concatenate([A, B]))
        items = by_db.iloc[by_inds]
        # get a dictionary whose keys are the 'by' indices
        features = get_features(items)
        dis = np.empty(shape=(n_pairs, 1))
        # FIXME: second dim is 1 because of the way it is stored to disk,
        # but ultimately it shouldn't be necessary anymore
        # (if using axis arg in np2h5, h52np and h5io...)
        for i in range(n_pairs):
            dataA = features[pairs[i, 0]]
            dataB = features[pairs[i, 1]]
            if dataA.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]]),
                    UserWarning)
            if dataB.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]),
                    UserWarning)
            try:
                if normalize is not None:
                    if normalize == 1:
                        normalize = True
                    elif normalize == 0:
                        normalize = False
                    else:
                        print('normalize parameter is neither 1 nor 0, '
                              'using normalization')
                        normalize = True
                    dis[i, 0] = distance(dataA, dataB, normalized=normalize)
                else:
                    dis[i, 0] = distance(dataA, dataB)
            except:
                sys.stderr.write(
                    'Error when calculating the distance between '
                    'item {}, {} - {} and item {}, {} - {}\n'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]],
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]))
                raise
        if synchronize:
            lock.acquire()
        with h5py.File(distance_file) as fh:
            fh['distances/data'][attrs[1] + start:attrs[1] + stop, :] = dis
        if synchronize:
            lock.release()
def get_features(self, segment):
    """return the features associated to a segment = (file, start, end)"""
    fileid, start, end = segment
    return h5features.read(self.feature_file, from_internal_file=fileid,
                           from_time=start, to_time=end,
                           index=self.index)[1][fileid]
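# Hedged usage sketch with placeholder names: the same read, done directly,
# returns only the frames of item 'utt_001' lying between 0.5 s and 1.0 s.
segment_feats = h5features.read('speech.h5f', from_internal_file='utt_001',
                                from_time=0.5, to_time=1.0)[1]['utt_001']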
# Author: Nika Jurov

import h5features as h5f
import numpy as np
import pandas as pd
import sys
import os


hf5_file = sys.argv[1]      # the posterior gram file in .h5 format
TRIPLET_NAME = sys.argv[2]  # which index / utterance do we want
NAMED_PG = sys.argv[3]      # name the extracted PG

times_r, features_r = h5f.read(hf5_file, 'features')
# items = h5f.Reader(hf5_file, 'features').items.data[0:]
# utterance = items[int(NUMBER)]

f = pd.DataFrame(features_r[TRIPLET_NAME])
f['times'] = ["time_" + str('{0:03}'.format(i)) for i in range(0, f.shape[0])]
f.to_csv("model/supervised/posterior_grams/extracted_pgs/"
         + NAMED_PG + ".csv", index=False)

# print "The utterance ID is: " + str(utterance)
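# Example invocation, assuming the script above is saved as extract_pg.py
# (the script name and the input file names are not given in the source):
#   python extract_pg.py posteriors.h5 triplet042_02 triplet042_tgt
# writes model/supervised/posterior_grams/extracted_pgs/triplet042_tgt.csv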
def run_distance_job(job_description, distance_file, distance,
                     feature_files, feature_groups, splitted_features,
                     job_id, distance_file_lock=None):
    if distance_file_lock is None:
        synchronize = False
    else:
        synchronize = True
    if not splitted_features:
        times = {}
        features = {}
        for feature_file, feature_group in zip(feature_files, feature_groups):
            t, f = h5features.read(feature_file, feature_group)
            assert not set(times.keys()).intersection(t.keys()), (
                "The same file is indexed by (at least) two "
                "different feature files")
            times.update(t)
            features.update(f)
        get_features = Features_Accessor(times, features).get_features_from_raw
    pair_file = job_description['pair_file']
    n_blocks = len(job_description['by'])
    for b in range(n_blocks):
        print('Job %d: computing distances for block %d on %d'
              % (job_id, b, n_blocks))
        # get block spec
        by = job_description['by'][b]
        start = job_description['start'][b]
        stop = job_description['stop'][b]
        if splitted_features:
            # FIXME modify feature_file/feature_group to adapt to 'by'
            # FIXME any change needed when several feature files before
            # splitting ?
            times = {}
            features = {}
            for feature_file, feature_group in zip(feature_files,
                                                   feature_groups):
                t, f = h5features.read(feature_file, feature_group)
                assert not set(times.keys()).intersection(t.keys()), (
                    "The same file is indexed by (at least) two "
                    "different feature files")
                times.update(t)
                features.update(f)
            accessor = Features_Accessor(times, features)
            get_features = accessor.get_features_from_splitted
        # load pandas dataframe containing info for loading the features
        store = pandas.HDFStore(pair_file)
        by_db = store['feat_dbs/' + by]
        store.close()
        # load pairs to be computed
        # indexed relatively to the above dataframe
        with h5py.File(pair_file) as fh:
            pair_list = fh['unique_pairs/' + by][start:stop, 0]
            base = fh['unique_pairs'].attrs[by]
        A = np.mod(pair_list, base)
        B = pair_list // base
        pairs = np.column_stack([A, B])
        n_pairs = pairs.shape[0]
        # get dataframe with one entry by item involved in this block,
        # indexed by its 'by'-specific index
        by_inds = np.unique(np.concatenate([A, B]))
        items = by_db.iloc[by_inds]
        # get a dictionary whose keys are the 'by' indices
        features = get_features(items)
        dis = np.empty(shape=(n_pairs, 1))
        # FIXME: second dim is 1 because of the way it is stored to disk,
        # but ultimately it shouldn't be necessary anymore
        # (if using axis arg in np2h5, h52np and h5io...)
        for i in range(n_pairs):
            dataA = features[pairs[i, 0]]
            dataB = features[pairs[i, 1]]
            if dataA.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]]),
                    UserWarning)
            if dataB.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]),
                    UserWarning)
            try:
                dis[i, 0] = distance(dataA, dataB)
            except:
                sys.stderr.write(
                    'Error when calculating the distance between '
                    'item {}, {} - {} and item {}, {} - {}\n'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]],
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]))
                raise
        if synchronize:
            distance_file_lock.acquire()
        with h5py.File(distance_file) as fh:
            fh['distances/' + by][start:stop, :] = dis
        if synchronize:
            distance_file_lock.release()
def launch_lsh(features_file, featsdir, S=64, files=None,
               with_vad=None, split=False):
    """Launch lsh for the specified features and return a dictionary
    containing, for each file, the path to its signature, features and
    VAD files.

    Parameters:
    ----------
    features_file: h5features file name
    featsdir: output folder where the features will be written
    S: int, number of bits of the lsh signature
    files: list, only launch lsh on the specified files
        (must be the basenames of the files, as in the h5features file)
    with_vad: optional VAD file
    """
    if not os.path.isfile(features_file):
        raise ValueError('file {} doesn\'t exist'.format(features_file))
    if S not in VALID_BITS:
        raise ValueError('S={} must be 32 or 64'.format(S))

    def aux(f, feats, S, D, featfile, sigfile, vadfile=None, vad=None):
        with open(featfile, 'wb') as fout:
            fout.write(feats.tobytes())
        # lsh creates the .sig file
        command_ = ('{}/lsh -S {} -D {} -projfile proj_b{}xd{}_seed1 '
                    '-featfile {} -sigfile {}')
        command_ = command_.format(binpath, S, D, S, D, featfile, sigfile)
        if vadfile:
            with open(vadfile, 'w') as fout:
                for interval in vad[f]:
                    fout.write(' '.join(map(str, interval)) + '\n')
            command_ += ' -vadfile {}'.format(vadfile)
        p_ = Popen(command_, shell=True, stdout=PIPE, stderr=PIPE)
        output_, err_ = p_.communicate()
        if 'usage' in err_:
            print(err_)
            raise NameError('Cannot run command: {}'.format(command_))
        if os.stat(sigfile).st_size == 0:
            # warn when lsh produced no results
            warnings.warn('no results from lsh ({} empty)'.format(sigfile))

    vad = {}
    if with_vad:
        vad_ = defaultdict(list)
        with open(with_vad) as fin:
            for line in fin:
                fname, start, end = line.strip().split()
                start, end = map(lambda t: int(float(t) * 100), (start, end))
                vad_[fname].append((start, end))
        vad = dict(vad_)

    # generate a file with DxS normal random values
    # [= numpy.random.normal(0.0, 1.0, D*S)]
    # TODO: can the output file from genproj be different?
    D = h5py.File(features_file)['features']['features'].shape[1]
    proj_f_ = 'proj_b{}xd{}_seed1'.format(S, D)
    command_ = '{}/genproj -S {} -D {} -seed 1'.format(binpath, S, D)
    p_ = Popen(command_, shell=True, stdout=PIPE, stderr=PIPE)
    output_, err_ = p_.communicate()
    if 'unknown' in err_:
        print(err_)
        raise NameError('Cannot run command: {}'.format(command_))
    if os.stat(proj_f_).st_size == 0:
        warnings.warn('genproj didn\'t create file {} or it is empty'
                      .format(proj_f_))

    res = fdict()
    res.stats = {'S': S, 'D': D}
    if not os.path.exists(featsdir):
        try:
            os.makedirs(featsdir)
        except:
            pass
    if files is None:
        files = h5features.read(features_file)[0].keys()
    for f in files:
        spk = get_speaker(f)
        if not split:
            sigfile = os.path.join(featsdir, f + ".sig")
            vadfile = os.path.join(featsdir, f + ".vad")
            featfile = os.path.join(featsdir, f + ".fea")
            feats = h5features.read(features_file, from_item=f)[1][f]
            if with_vad:
                aux(f, feats, S, D, featfile, sigfile, vadfile, vad)
            else:
                aux(f, feats, S, D, featfile, sigfile)
            try:
                res[spk][f] = {'sig': sigfile, 'fea': featfile}
            except KeyError:
                res[spk] = {f: {'sig': sigfile, 'fea': featfile}}
            if with_vad:
                res[spk][f]['vad'] = vadfile
        else:
            intervals = vad[f]
            for i, (start, end) in enumerate(intervals):
                fi = '{}_{}'.format(f, i)
                sigfile = os.path.join(featsdir, fi + ".sig")
                vadfile = os.path.join(featsdir, fi + ".vad")
                featfile = os.path.join(featsdir, fi + ".fea")
                feats = h5features.read(
                    features_file, from_item=f,
                    from_time=float(start) / 100,
                    to_time=float(end) / 100)[1][f]
                aux(fi, feats, S, D, featfile, sigfile)
                try:
                    res[spk][fi] = {'sig': sigfile, 'fea': featfile}
                except KeyError:
                    res[spk] = {fi: {'sig': sigfile, 'fea': featfile}}
    return res
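# Hedged usage sketch: the paths are placeholders and the call assumes the
# lsh/genproj binaries are available under `binpath` and that VALID_BITS,
# get_speaker and fdict are defined in the surrounding module. It computes
# 64-bit signatures for every item of 'mfcc.h5f', writing the .fea/.sig
# (and .vad) files under 'lsh_feats/' and returning their paths per speaker.
signatures = launch_lsh('mfcc.h5f', 'lsh_feats', S=64,
                        with_vad='vad.txt', split=False)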
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(self.feature_file, from_internal_file=fileid,
                           index=self.index)[1][fileid]
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(
        self.feature_file, from_item=fileid)[1][fileid]