Example #1
def compute_percentile_param(percentile, features_file, comparison='within',
                             samples=100000, with_vad=False, lsh=False):
    """Compute the threshold parameter from a percentile value

    percentile: int or list of int
    """
    assert comparison in {'general', 'within', 'across'}
    if comparison != 'within':
        raise NotImplementedError
    if not lsh:
        if not with_vad:
            features_dict = h5features.read(features_file)[1]
        else:
            features_dict = {}
            vad = read_vad(with_vad)
            for f, intervals in vad.items():
                feats = [
                    h5features.read(
                        features_file, from_item=f,
                        from_time=interval[0], to_time=interval[1])[1][f]
                    for interval in intervals]
                features_dict[f] = np.concatenate(feats, axis=0)
    else:
        features_dict = read_sigs_remove_sils(features_file)

    samples_per_file = samples // len(features_dict)

    def estimate_similarities_distribution(feats1, feats2, n_samples, S=False):
        samples1 = np.random.choice(feats1.shape[0], n_samples)
        samples2 = np.random.choice(feats2.shape[0], n_samples)
        sampled_feats1 = feats1[samples1]
        sampled_feats2 = feats2[samples2]
        if not S:
            fun = do_cosine_similarity
        else:
            fun = lambda x, y: do_norm_hamming_sim(x, y, S)
        return fun(sampled_feats1, sampled_feats2)

    similarities = []
    for feats in features_dict.values():
        S = False
        if lsh:
            S = features_file.stats['S']
        similarities.append(estimate_similarities_distribution(
            feats, feats, samples_per_file, S=S))

    similarities = np.concatenate(similarities)
    return np.percentile(similarities, percentile)
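A minimal usage sketch; the file name is an illustrative assumption, and np.percentile returns one threshold per requested percentile:

# Hypothetical call: thresholds for within-speaker comparisons.
# 'feats.h5' is an assumed file name, not from the original code.
thresholds = compute_percentile_param(
    percentile=[50, 90, 99], features_file='feats.h5',
    comparison='within', samples=100000)
print(thresholds)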
Example #2
    def test_read_works(self):
        fname = self.file_v1
        t1, f1 = h5f_1_0.read(fname, 'features')
        t2, f2 = h5f_1_1.read(fname, 'features')

        for tt1, tt2 in zip(t1, t2):
            assert tt1 == tt2
        for ff1, ff2 in zip(f1, f2):
            assert ff1 == ff2
Example #4
def do_transform(fname):
    times, feats = h5features.read(stacked_fb_file,
                                   from_internal_file=fname,
                                   index=index)
    times = times[fname]
    feats = feats[fname]
    # per-dimension normalization with precomputed mean/std (closure variables)
    X = np.asarray((feats - mean) / std, dtype='float32')
    emb_wrd, emb_spkr = transform(X)
    return emb_wrd
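do_transform closes over several names (stacked_fb_file, index, mean, std and the embedding network transform). A minimal sketch of a plausible enclosing setup, with all names assumed rather than taken from the original:

import numpy as np
import h5features

# Assumed setup: global mean/std computed over the whole stacked-fbanks file.
index = h5features.read_index(stacked_fb_file)
_, all_feats = h5features.read(stacked_fb_file, index=index)
stacked = np.concatenate(list(all_feats.values()), axis=0)
mean, std = stacked.mean(axis=0), stacked.std(axis=0)  # per-dimension stats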
Example #5
def h5features_stack_fbanks(fbanks_file, stacked_fbanks_file):
    import h5features
    index = h5features.read_index(fbanks_file)
    files = index['files']
    for f in files:
        times, fbanks = h5features.read(
            fbanks_file, 'features', from_internal_file=f, index=index)
        stacked_fbanks = stack_fbanks(fbanks[f])
        h5features.write(stacked_fbanks_file, 'features', [f],
                         [times[f]], [stacked_fbanks])
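stack_fbanks itself is not shown in these examples; a plausible sketch, assuming it builds a standard context window of nframes consecutive frames with edge padding:

import numpy as np

def stack_fbanks(features, nframes=7):
    # Sketch only: stack each frame with its neighbors, padding the edges
    # by repeating the first/last frame (an assumption about the helper).
    assert nframes % 2 == 1, 'nframes must be odd'
    pad = nframes // 2
    padded = np.pad(features, ((pad, pad), (0, 0)), mode='edge')
    return np.concatenate(
        [padded[i:i + features.shape[0]] for i in range(nframes)], axis=1)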
Example #6
    def _test_wr(self, labeldim):
        """Test retrieving labels and files after a write/read operation."""
        items, t_gold, feat = generate.full(self.nbitems, tformat=labeldim)
        write(self.filename, self.group, items, t_gold, feat)
        t, _ = read(self.filename, self.group)

        assert len(t) == self.nbitems
        if labeldim != 1:
            assert all(tt.shape[1] == labeldim for tt in t.values())

        # build a dict from gold to compare with t
        d = dict(zip(items, t_gold))
        for dd, tt in zip(d, t):
            assert tt == dd
Example #8
def write_kl_to_column(distance_list, PG_file, root):
    """ Write distances into original table """

    hf5_file = root + PG_file
    times_r, features_r = h5f.read(hf5_file, 'features')
    items = h5f.Reader(hf5_file, 'features').items.data[0:]

    oth_x_array = np.array([])
    tgt_x_array = np.array([])

    for TRIP_NUM in range(1, 113):

        # select only item names which correspond to same triplet
        trip_id = 'triplet{0:03d}'.format(TRIP_NUM)
        trip_items = [itm for itm in items if trip_id in itm]

        # trace the 01 = OTH, 02 = TGT, 03 = X
        item_oth = [oth for oth in trip_items if '_01' in oth][0]
        item_tgt = [tgt for tgt in trip_items if '_02' in tgt][0]
        item_x = [x for x in trip_items if '_03' in x][0]

        # find vectors
        feat_vector_oth = features_r[item_oth]
        feat_vector_tgt = features_r[item_tgt]
        feat_vector_x = features_r[item_x]
        # time_vector = times_r[item]

        # get KL divergence for TGT-X and OTH-X
        kl_oth_x = dtw_kl_divergence(feat_vector_oth, feat_vector_x)
        kl_tgt_x = dtw_kl_divergence(feat_vector_tgt, feat_vector_x)

        # put them into an array
        oth_x_array = np.append(oth_x_array, kl_oth_x)
        tgt_x_array = np.append(tgt_x_array, kl_tgt_x)

    name_othX = PG_file.split('.')[0] + '_oth_x'
    name_tgtX = PG_file.split('.')[0] + '_tgt_x'

    distance_list[name_othX] = pd.Series(oth_x_array,
                                         index=distance_list.index)
    distance_list[name_tgtX] = pd.Series(tgt_x_array,
                                         index=distance_list.index)

    return distance_list
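A minimal usage sketch, assuming distance_list is a pandas DataFrame with one row per triplet; the file name and root directory are illustrative assumptions:

# Hypothetical call: 112 triplets, one row each.
distances = pd.DataFrame(index=range(112))
distances = write_kl_to_column(distances, 'posteriors.h5', '/data/pg/')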
Example #9
def segment_features(features_file, segments_file, out_file):
    """
    Segment h5features file containing features for whole wavefiles
    of an abkhazia corpus (or split of a corpus) into features for
    segments as described in the provided segments.txt file.
    """
    utt_ids, wavefiles, starts, stops = io.read_segments(segments_file)
    if all([e is None for e in starts]) and all([e is None for e in stops]):
        # TODO use a log instead of a print statement
        print(
            "segment_features: segments already match wavefiles, "
            "doing nothing...")
    else:
        # Group utterances by wavefiles
        data = zip(utt_ids, wavefiles, starts, stops)
        for wav, utts in groupby(data, lambda e: e[1]):
            # TODO use a log instead of a print statement
            print "Segmenting features for file {} by utterance".format(wav)
            # load features for whole wavefile
            wav_id = os.path.splitext(wav)[0]
            # TODO fix that
            times, features = h5features.read(features_file,
                                              from_internal_file=wav_id)
            # no need for dict here
            times, features = times[wav_id], features[wav_id]

            utt_ids, utt_times, utt_features = [], [], []
            for utt_id, _, start, stop in utts:
                # select features for appropriate segment
                utt_ids.append(utt_id)
                indices = np.where(
                    np.logical_and(times >= start, times <= stop))[0]

                # get times relative to beginning of utterance
                utt_times.append(times[indices] - start)
                utt_features.append(features[indices, :])

            # write to out_file once for each wavefile
            h5features.write(out_file, 'features', utt_ids, utt_times,
                             utt_features)
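For reference, io.read_segments presumably parses an abkhazia-style segments.txt with one utterance per line, the start/stop times being optional when an utterance spans the whole wavefile. An assumed example:

utt001 spkr1.wav 0.00 2.35
utt002 spkr1.wav 2.35 5.10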
Example #10
def get_features_flat(fn, gn, feature_index, key, stat):
    features_dict = h5features.read(fn, gn, key, index=feature_index)[1]
    # features_dict maps the single requested item to its feature matrix
    features = next(iter(features_dict.values()))
    return stat(features)
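A minimal usage sketch for get_features_flat; the file, group and item names are assumptions:

import numpy as np
import h5features

index = h5features.read_index('feats.h5')  # precompute once, reuse per call
m = get_features_flat('feats.h5', 'features', index, 'utt001', np.mean)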
Example #11
def time_f(f):
    return h5features.read(fb_h5f, from_item=f)[0][f]
Example #12
def aux(f):
    return stack_fbanks(h5features.read(fb_h5f, from_item=f)[1][f],
                        nframes=nframes)
Example #13
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(self.feature_file, from_item=fileid)[1][fileid]
Example #14
def run_distance_job(job_description, distance_file, distance, feature_files,
                     feature_groups, splitted_features, job_id, normalize):
    # `lock` is expected to be a module-level global, typically installed by a
    # multiprocessing pool initializer (see the sketch after this example)
    synchronize = lock is not None
    if not splitted_features:
        times = {}
        features = {}
        for feature_file, feature_group in zip(feature_files, feature_groups):
            t, f = h5features.read(feature_file, feature_group)
            assert not (set(times.keys()).intersection(
                t.keys())), ("The same file is indexed by (at least) two "
                             "different feature files")
            times.update(t)
            features.update(f)
        get_features = Features_Accessor(times, features).get_features_from_raw
    pair_file = job_description['pair_file']
    n_blocks = len(job_description['by'])
    for b in range(n_blocks):
        print('Job %d: computing distances for block %d of %d' %
              (job_id, b, n_blocks))
        # get block spec
        by = job_description['by'][b]
        start = job_description['start'][b]
        stop = job_description['stop'][b]
        if splitted_features:
            # FIXME modify feature_file/feature_group to adapt to 'by'
            # FIXME any change needed when several feature files before
            # splitting ?
            times = {}
            features = {}
            for feature_file, feature_group in zip(feature_files,
                                                   feature_groups):
                t, f = h5features.read(feature_file, feature_group)
                assert not (set(times.keys()).intersection(
                    t.keys())), ("The same file is indexed by (at least) two "
                                 "different feature files")
                times.update(t)
                features.update(f)
            accessor = Features_Accessor(times, features)
            get_features = accessor.get_features_from_splitted
        # load pandas dataframe containing info for loading the features
        if synchronize:
            lock.acquire()
        store = pandas.HDFStore(pair_file)
        by_db = store['feat_dbs/' + by]
        store.close()
        # load pairs to be computed
        # indexed relatively to the above dataframe
        with h5py.File(pair_file, 'r') as fh:
            attrs = fh['unique_pairs'].attrs[by]
            pair_list = fh['unique_pairs/data'][
                attrs[1] + start:attrs[1] + stop, 0]
            base = attrs[0]
        if synchronize:
            lock.release()

        A = np.mod(pair_list, base)
        B = pair_list // base
        pairs = np.column_stack([A, B])
        n_pairs = pairs.shape[0]
        # get dataframe with one entry by item involved in this block
        # indexed by its 'by'-specific index
        by_inds = np.unique(np.concatenate([A, B]))
        items = by_db.iloc[by_inds]
        # get a dictionary whose keys are the 'by' indices
        features = get_features(items)
        dis = np.empty(shape=(n_pairs, 1))
        # FIXME: second dim is 1 because of the way it is stored to disk,
        # but ultimately it shouldn't be necessary anymore
        # (if using axis arg in np2h5, h52np and h5io...)
        for i in range(n_pairs):
            dataA = features[pairs[i, 0]]
            dataB = features[pairs[i, 1]]
            if dataA.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]]),
                    UserWarning)
            if dataB.shape[0] == 0:
                warnings.warn(
                    'No features found for file {}, {} - {}'.format(
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]),
                    UserWarning)
            try:
                if normalize is not None:
                    if normalize == 1:
                        normalize = True
                    elif normalize == 0:
                        normalize = False
                    else:
                        print('normalize parameter neither 1 nor 0, '
                              'using normalization')
                        normalize = True
                    dis[i, 0] = distance(dataA, dataB, normalized=normalize)
                else:
                    dis[i, 0] = distance(dataA, dataB)
            except Exception:
                sys.stderr.write(
                    'Error when calculating the distance between '
                    'item {}, {} - {} and item {}, {} - {}\n'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]],
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]))
                raise
        if synchronize:
            lock.acquire()
        with h5py.File(distance_file, 'a') as fh:
            fh['distances/data'][attrs[1] + start:attrs[1] + stop, :] = dis
        if synchronize:
            lock.release()
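In this variant `lock` is read as a module-level global rather than passed in. A minimal sketch of how it could be installed in worker processes; the initializer pattern is an assumption, not shown in the original:

import multiprocessing

lock = None  # module-level global read by run_distance_job

def _init_worker(shared_lock):
    # Runs once per worker process and publishes the shared lock.
    global lock
    lock = shared_lock

# Hypothetical usage:
# pool = multiprocessing.Pool(
#     4, initializer=_init_worker, initargs=(multiprocessing.Lock(),))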
Example #15
def get_features(self, segment):
    """return the features associated to a segment = (file, start, end)"""
    fileid, start, end = segment
    return h5features.read(self.feature_file, from_internal_file=fileid,
                           from_time=start, to_time=end,
                           index=self.index)[1][fileid]
Example #18
# Author: Nika Jurov

import h5features as h5f
import numpy as np
import pandas as pd
import sys
import os

hf5_file = sys.argv[1]  # the posterior gram file in .h5 format
TRIPLET_NAME = sys.argv[2]  # which index / utterance do we want
NAMED_PG = sys.argv[3]  # name the extracted PG

times_r, features_r = h5f.read(hf5_file, 'features')

f = pd.DataFrame(features_r[TRIPLET_NAME])

f['times'] = ['time_{0:03d}'.format(i) for i in range(f.shape[0])]
f.to_csv('model/supervised/posterior_grams/extracted_pgs/'
         + NAMED_PG + '.csv', index=False)
Example #19
def run_distance_job(job_description, distance_file, distance,
                     feature_files, feature_groups, splitted_features,
                     job_id, distance_file_lock=None):
    synchronize = distance_file_lock is not None
    if not splitted_features:
        times = {}
        features = {}
        for feature_file, feature_group in zip(feature_files, feature_groups):
            t, f = h5features.read(feature_file, feature_group)
            assert not(set(times.keys()).intersection(
                t.keys())), ("The same file is indexed by (at least) two "
                             "different feature files")
            times.update(t)
            features.update(f)
        get_features = Features_Accessor(times, features).get_features_from_raw
    pair_file = job_description['pair_file']
    n_blocks = len(job_description['by'])
    for b in range(n_blocks):
        print('Job %d: computing distances for block %d of %d' % (job_id, b,
                                                                  n_blocks))
        # get block spec
        by = job_description['by'][b]
        start = job_description['start'][b]
        stop = job_description['stop'][b]
        if splitted_features:
            # FIXME modify feature_file/feature_group to adapt to 'by'
            # FIXME any change needed when several feature files before
            # splitting ?
            times = {}
            features = {}
            for feature_file, feature_group in zip(feature_files,
                                                   feature_groups):
                t, f = h5features.read(feature_file, feature_group)
                assert not(set(times.keys()).intersection(
                    t.keys())), ("The same file is indexed by (at least) two "
                                 "different feature files")
                times.update(t)
                features.update(f)
            accessor = Features_Accessor(times, features)
            get_features = accessor.get_features_from_splitted
        # load pandas dataframe containing info for loading the features
        store = pandas.HDFStore(pair_file)
        by_db = store['feat_dbs/' + by]
        store.close()
        # load pairs to be computed
        # indexed relatively to the above dataframe
        with h5py.File(pair_file, 'r') as fh:
            pair_list = fh['unique_pairs/' + by][start:stop, 0]
            base = fh['unique_pairs'].attrs[by]
        A = np.mod(pair_list, base)
        B = pair_list // base
        pairs = np.column_stack([A, B])
        n_pairs = pairs.shape[0]
        # get dataframe with one entry by item involved in this block
        # indexed by its 'by'-specific index
        by_inds = np.unique(np.concatenate([A, B]))
        items = by_db.iloc[by_inds]
        # get a dictionary whose keys are the 'by' indices
        features = get_features(items)
        dis = np.empty(shape=(n_pairs, 1))
        # FIXME: second dim is 1 because of the way it is stored to disk,
        # but ultimately it shouldn't be necessary anymore
        # (if using axis arg in np2h5, h52np and h5io...)
        for i in range(n_pairs):
            dataA = features[pairs[i, 0]]
            dataB = features[pairs[i, 1]]
            if dataA.shape[0] == 0:
                warnings.warn('No features found for file {}, {} - {}'
                              .format(items['file'][pairs[i, 0]],
                                      items['onset'][pairs[i, 0]],
                                      items['offset'][pairs[i, 0]]),
                              UserWarning)
            if dataB.shape[0] == 0:
                warnings.warn('No features found for file {}, {} - {}'
                              .format(items['file'][pairs[i, 1]],
                                      items['onset'][pairs[i, 1]],
                                      items['offset'][pairs[i, 1]]),
                              UserWarning)
            try:
                dis[i, 0] = distance(dataA, dataB)
            except Exception:
                sys.stderr.write(
                    'Error when calculating the distance between '
                    'item {}, {} - {} and item {}, {} - {}\n'.format(
                        items['file'][pairs[i, 0]],
                        items['onset'][pairs[i, 0]],
                        items['offset'][pairs[i, 0]],
                        items['file'][pairs[i, 1]],
                        items['onset'][pairs[i, 1]],
                        items['offset'][pairs[i, 1]]))
                raise
        if synchronize:
            distance_file_lock.acquire()
        with h5py.File(distance_file, 'a') as fh:
            fh['distances/' + by][start:stop, :] = dis
        if synchronize:
            distance_file_lock.release()
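Unlike Example #14, this variant receives its lock explicitly, so a standard multiprocessing.Lock can be handed to each job. A minimal sketch; job_desc and the file names are illustrative assumptions:

import multiprocessing

lock = multiprocessing.Lock()
p = multiprocessing.Process(
    target=run_distance_job,
    args=(job_desc, 'distances.h5', distance, ['feats.h5'], ['features'],
          False, 0, lock))
p.start()
p.join()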
Example #20
def launch_lsh(features_file, featsdir, S=64, files=None, with_vad=None,
               split=False):
    """Launch lsh for specified features, return a dictionnary containing for
    all files the path to their signature, features and vad.

    Parameters:
    ----------
    features_file: h5features file name
    featsdir: output folder where the features will be written
    S: int, number of bits of the lsh signature
    files: list, only launch lsh on the specified files.
        (must be the basename of the file, like in the h5features file)
    with_vad: optional VAD file
    
    """

    if not os.path.isfile(features_file):
        raise ValueError('file {} doesn\'t exist'.format(features_file))

    if S not in VALID_BITS:
        raise ValueError('S={} must be 32 or 64'.format(S))

    def aux(f, feats, S, D, featfile, sigfile, vadfile=None, vad=None):
        with open(featfile, 'wb') as fout:
            fout.write(feats.tobytes())
        
        # lsh creates .sig file 
        command_ = '{}/lsh -S {} -D {} -projfile proj_b{}xd{}_seed1 -featfile {} -sigfile {}'
        command_ = command_.format(binpath, S, D, S, D, featfile, sigfile)
        if vadfile:
            with open(vadfile, 'w') as fout:
                for interval in vad[f]:
                    fout.write(' '.join(map(str, interval)) + '\n')
            command_ += ' -vadfile {}'.format(vadfile)
        p_ = Popen(command_, shell=True, stdout=PIPE, stderr=PIPE)
        output_, err_ = p_.communicate()
        err_ = err_.decode(errors='replace')  # communicate() returns bytes
        if 'usage' in err_:
            print(err_)
            raise NameError('Cannot run command: {}'.format(command_))
        if os.stat(sigfile).st_size == 0:  # warn if lsh produced no results
            warnings.warn('no results from lsh ({} empty)'.format(sigfile))

    vad = {}
    if with_vad:
        vad_ = defaultdict(list)
        with open(with_vad) as fin:
            for line in fin:
                fname, start, end = line.strip().split()
                start, end = map(lambda t: int(float(t) * 100), (start, end))
                vad_[fname].append((start, end))
        vad = dict(vad_)

    # generate a file with D*S normal random values
    # [i.e. numpy.random.normal(0.0, 1.0, D*S)]
    # TODO: can the output file from genproj be named differently?
    D = h5py.File(features_file, 'r')['features']['features'].shape[1]
    proj_f_ = 'proj_b{}xd{}_seed1'.format(S, D)
    command_ = '{}/genproj -S {} -D {} -seed 1'.format(binpath, S, D)
    p_ = Popen(command_, shell=True, stdout=PIPE, stderr=PIPE)
    output_, err_ = p_.communicate()
    err_ = err_.decode(errors='replace')  # communicate() returns bytes
    if 'unknown' in err_:
        print(err_)
        raise NameError('Cannot run command: {}'.format(command_))
    
    if os.stat(proj_f_).st_size == 0:
        warnings.warn(
            'genproj did not create file {} (or it is empty)'.format(proj_f_))

    res = fdict()
    res.stats = {'S': S, 'D': D}
    if not os.path.exists(featsdir):
        try:
            os.makedirs(featsdir)
        except OSError:  # the directory may have been created concurrently
            pass

    if files is None:
        files = h5features.read(features_file)[0].keys()
    
    for f in files:
        spk = get_speaker(f)
        if not split:
            sigfile = os.path.join(featsdir, f + ".sig")
            vadfile = os.path.join(featsdir, f + ".vad")
            featfile = os.path.join(featsdir, f + ".fea")
            feats = h5features.read(features_file, from_item=f)[1][f]
            
            if with_vad:
                aux(f, feats, S, D, featfile, sigfile, vadfile, vad)
            else:
                aux(f, feats, S, D, featfile, sigfile)
            
            try:
                res[spk][f] = {'sig': sigfile, 'fea': featfile}
            except KeyError:
                res[spk] = {f: {'sig': sigfile, 'fea': featfile}}
            
            if with_vad:
                res[spk][f]['vad'] = vadfile
        else:
            intervals = vad[f]
            for i, (start, end) in enumerate(intervals):
                fi = '{}_{}'.format(f, i)
                sigfile = os.path.join(featsdir, fi + ".sig")
                vadfile = os.path.join(featsdir, fi + ".vad")
                featfile = os.path.join(featsdir, fi + ".fea")
                feats = h5features.read(
                    features_file, from_item=f,
                    from_time=float(start)/100, to_time=float(end)/100)[1][f]
                aux(fi, feats, S, D, featfile, sigfile)
                try:
                    res[spk][fi] = {'sig': sigfile, 'fea': featfile}
                except KeyError:
                    res[spk] = {fi: {'sig': sigfile, 'fea': featfile}}
    return res
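A minimal usage sketch; the paths are assumptions, and binpath, VALID_BITS, fdict and get_speaker are expected to be defined in the surrounding module:

# Hypothetical call: 64-bit signatures for all files in feats.h5,
# using a VAD file with one "<file> <start> <end>" triple per line.
res = launch_lsh('feats.h5', '/tmp/lsh_out', S=64, with_vad='vad.txt')
for spk in res:
    print(spk, list(res[spk]))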
Example #21
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(self.feature_file, from_internal_file=fileid,
                           index=self.index)[1][fileid]
Example #22
def get_features_from_file(self, fileid):
    """return the features associated to a file"""
    return h5features.read(
        self.feature_file, from_item=fileid)[1][fileid]