Example #1
0
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every entry of an scp file.

    Each utterance is framed with ``getFrames``, transformed (``ifft`` when
    ``args.complex_modulation`` is set, ``dct`` otherwise), split into
    ``nfilters`` bands with the configured filter bank, and an LPC fit per
    band is turned into a modulation spectrum.  The per-utterance matrices
    are finally handed to ``dict2Ark``.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``scp_type`` ('wav' or 'segment'), ``outfile``, ``add_reverb``,
            ``coeff_0``, ``coeff_n``, ``order``, ``fduration``, ``frate``,
            ``nfilters``, ``kaldi_cmd``, ``fbank_type``,
            ``complex_modulation``, ``keep_even``, ``absolute_value``,
            ``compensate_noise`` and ``no_window``.
        srate: Expected sampling rate; 'wav' entries are asserted to match.
        window: Window function forwarded to ``getFrames``; replaced by
            ``sq_wind`` when ``args.no_window`` is set.

    Raises:
        ValueError: On an invalid filter-bank configuration, reverberation
            type or scp type.
        AssertionError: If a successfully loaded 'wav' entry has a sampling
            rate different from ``srate``.

    Note:
        Entries whose audio cannot be read are skipped (best effort); a
        message is written to stderr for each of them.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    add_reverb = args.add_reverb
    coeff_0 = args.coeff_0
    coeff_n = args.coeff_n
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    # Set up the filter bank.  fbank_type is a comma separated string whose
    # first field selects the type and the rest are its parameters.
    fbank_type = args.fbank_type.strip().split(',')
    if args.complex_modulation:
        dur = int(fduration * srate)
    else:
        dur = int(2 * fduration * srate)

    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters,
                            dur,
                            srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError(
                'Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print(
                '%s: Alpha is fixed and will not change as a function of the center frequency...'
                % sys.argv[0])
        fbank = createFbankCochlear(nfilters,
                                    dur,
                                    srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError(
            'Invalid type of filter bank, use mel or cochlear with proper configuration'
        )
    coeff_num = coeff_n - coeff_0 + 1

    # Number of values kept per band.
    if args.keep_even:
        temp = np.arange(0, coeff_num)
        if coeff_0 % 2 == 0:
            # It starts from odd coefficients
            feat_len = temp[1::2].shape[0]
        else:
            feat_len = temp[0::2].shape[0]
    elif args.complex_modulation and not args.absolute_value:
        # Real and imaginary parts are stored side by side.
        feat_len = 2 * coeff_num
    else:
        feat_len = coeff_num

    if args.compensate_noise:
        if args.complex_modulation:
            fmax = coeff_num / (fduration)
        else:
            fmax = coeff_num / (2 * fduration)
        # Weighting axis multiplied onto the modulation spectrum below.
        faxis = np.linspace(0, fmax, coeff_n)

    if args.no_window:
        print('%s: Using square windows' % sys.argv[0])
        window = sq_wind

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    with open(wavs, 'r') as fid:
        all_feats = {}

        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file: nothing to process.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                if inwav[-1] == '|':
                    # Kaldi-style pipe entry: run the command, read its stdout.
                    try:
                        proc = subprocess.run(inwav[:-1],
                                              shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                else:
                    try:
                        sr, signal = read(inwav)
                        skip_rest = False
                    except Exception:
                        skip_rest = True

                # Only check the rate of audio that was actually loaded;
                # otherwise `sr` is unbound or left over from an earlier entry.
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd,
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError(
                    'Invalid type of scp type, it should be either wav or segment'
                )

            if skip_rest:
                # Best effort: keep going, but say which entry was dropped.
                print('%s: Could not read audio for %s, skipping' %
                      (sys.argv[0], uttid),
                      file=sys.stderr)
                continue

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate,
                                             fduration, window)
            ])

            if args.complex_modulation:
                cos_trans = freqAnalysis.ifft(time_frames)
                cos_trans = cos_trans[:, :int(fduration * srate / 2)]
            else:
                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * feat_len))

            print('%s: Computing Features for file: %s, also %d' %
                  (sys.argv[0], uttid, time_frames.shape[0]))
            sys.stdout.flush()
            for i in range(frame_num):

                each_feat = np.zeros([nfilters, feat_len])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    if args.complex_modulation:
                        xlpc, gg = computeLpcFast(
                            band_dct, order,
                            keepreal=False)  # Compute LPC coefficients
                        mod_spec = computeModSpecFromLpc(gg, xlpc, coeff_n)
                        if args.compensate_noise:
                            mod_spec = mod_spec * faxis
                        if args.absolute_value:
                            temp2 = np.abs(mod_spec[coeff_0 - 1:coeff_n])
                        else:
                            temp2 = np.append(
                                np.real(mod_spec[coeff_0 - 1:coeff_n]),
                                np.imag(mod_spec[coeff_0 - 1:coeff_n]))
                    else:
                        xlpc, gg = computeLpcFast(band_dct, order)
                        mod_spec = np.real(
                            computeModSpecFromLpc(gg, xlpc, coeff_n))
                        if args.compensate_noise:
                            mod_spec = mod_spec * faxis
                        if args.absolute_value:
                            temp2 = np.abs(mod_spec[coeff_0 - 1:coeff_n])
                        else:
                            temp2 = mod_spec[coeff_0 - 1:coeff_n]

                    if args.keep_even:
                        if coeff_0 % 2 == 0:
                            each_feat[j, :] = temp2[1::2]
                        else:
                            each_feat[j, :] = temp2[0::2]
                    else:
                        each_feat[j, :] = temp2

                # One row per frame: all bands concatenated.
                each_feat = np.reshape(each_feat, (1, nfilters * feat_len))

                feats[i, :] = each_feat

            all_feats[uttid] = feats

        dict2Ark(all_feats, outfile, kaldi_cmd)
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every segment of a segment file.

    The scp file maps recording ids to a wav path or a Kaldi-style pipe
    command (entry ending in ``|``).  Each line of the segment file is read
    as ``<seg_id> <wav_id> <t_beg> <t_end>`` with times in seconds.  The
    resulting ``{seg_id: features}`` dictionary is handed to ``dict2Ark``.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``segment``, ``outfile``, ``add_reverb``, ``set_unity_gain``,
            ``nmodulations``, ``order``, ``fduration``, ``frate``,
            ``nfilters`` and ``kaldi_cmd``.
        srate: Expected sampling rate of every recording.
        window: Window function forwarded to ``getFrames``.

    Raises:
        ValueError: On an invalid reverberation type, or when a segment
            refers to a recording id that is not in the scp file.
        AssertionError: If a recording's sampling rate differs from ``srate``.
    """
    wavs = args.scp
    segment = args.segment
    outfile = args.outfile
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Map every recording id to its location (path or pipe command).
    wav_locs = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                continue
            # setdefault keeps the first entry of a duplicated id, which is
            # what the former list.index() lookup returned.
            wav_locs.setdefault(tokens[0], ' '.join(tokens[1:]))

    wav_in_buffer = ''  # Id of the recording currently held in memory
    all_feats = {}

    # Compute features for all the segments
    with open(segment, 'r') as fid_s:
        for line_s in fid_s:
            token_s = line_s.strip().split()
            if not token_s:
                continue
            seg_id = token_s[0]
            wav_id = token_s[1]

            # Load the recording only if it is not the one already in memory.
            if wav_in_buffer != wav_id:
                wav_in_buffer = wav_id
                if wav_id not in wav_locs:
                    raise ValueError(
                        'Segment %s refers to recording %s, which is not in the scp file'
                        % (seg_id, wav_id))
                inwav = wav_locs[wav_id]
                if inwav[-1] == '|':
                    proc = subprocess.run(inwav[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal_big = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal_big = read(inwav)
                # Checked for piped input too: `sr` converts the segment
                # times to sample offsets just below.
                assert sr == srate, 'Input file has different sampling rate.'

            t_beg = int(float(token_s[2]) * sr)
            t_end = int(float(token_s[3]) * sr)
            signal = signal_big[t_beg:t_end]
            signal = signal / np.power(2, 15)

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame
                for frame in getFrames(signal, srate, frate, fduration, window)
            ])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * nmodulations))
            print('%s: Computing Features for file: %s and segment: %s' %
                  (sys.argv[0], wav_id, seg_id))
            sys.stdout.flush()
            for i in range(frame_num):
                each_feat = np.zeros([nfilters, nmodulations])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(
                        band_dct, order)  # Compute LPC coefficients
                    if set_unity_gain:
                        gg = 1
                    mod_spec = computeModSpecFromLpc(gg, xlpc, nmodulations)
                    each_feat[j, :] = mod_spec
                # One row per frame: all bands concatenated.
                each_feat = np.reshape(each_feat, (1, nfilters * nmodulations))
                feats[i, :] = each_feat

            all_feats[seg_id] = feats
    dict2Ark(all_feats, outfile, kaldi_cmd)
Example #3
0
def get_modspec(args, srate=16000, window=np.hanning):
    """Return a dict mapping each utterance id to its modulation-spectrum matrix.

    Every line of ``args.scp`` is read as ``<uttid> <wav path or pipe command>``
    (a trailing ``|`` marks a pipe command).  The audio is scaled by 2**15,
    optionally reverberated, framed, DCT-transformed and, per filter-bank
    band, turned into a modulation spectrum through an LPC fit.

    Args:
        args: Namespace-like object.  Attributes read here: ``scp``,
            ``add_reverb``, ``set_unity_gain``, ``nmodulations``, ``order``,
            ``fduration_modspec``, ``frate`` and ``nfilters``.
        srate: Expected sampling rate of every utterance.
        window: Window function forwarded to ``getFrames``.

    Returns:
        dict: ``uttid -> array`` with one row per frame.  A row holds
        ``nfilters * nmodulations`` values, or ``nfilters * (nmodulations - 1)``
        when ``args.set_unity_gain`` is set (the first modulation value of
        each band is dropped in that case).

    Raises:
        ValueError: If ``args.add_reverb`` is set to an unknown type.
        AssertionError: If an utterance's sampling rate differs from ``srate``.
    """
    scp_path = args.scp
    reverb_kind = args.add_reverb
    unity_gain = args.set_unity_gain
    n_mod = args.nmodulations
    lpc_order = args.order
    frame_dur = args.fduration_modspec
    frame_rate = args.frate
    n_bands = args.nfilters

    fbank = createFbank(n_bands, int(2 * frame_dur * srate), srate)

    # Room impulse response, only needed for the two "room" settings.
    if reverb_kind:
        if reverb_kind in ('small_room', 'large_room'):
            if reverb_kind == 'small_room':
                rir_path = './RIR/RIR_SmallRoom1_near_AnglA.wav'
            else:
                rir_path = './RIR/RIR_LargeRoom1_far_AnglA.wav'
            _, rir = read(rir_path)
            rir = rir[:, 1] / np.power(2, 15)
        elif reverb_kind == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # With unity gain the first modulation value of each band is discarded.
    kept = n_mod - 1 if unity_gain else n_mod
    row_len = n_bands * kept
    dct_scale = np.sqrt(2 * int(srate * frame_dur))

    with open(scp_path, 'r') as scp:
        all_feats = {}

        for entry in scp:
            fields = entry.strip().split()
            uttid = fields[0]
            inwav = ' '.join(fields[1:])

            if inwav[-1] == '|':
                piped = subprocess.run(inwav[:-1],
                                       shell=True,
                                       stdout=subprocess.PIPE)
                source = io.BytesIO(piped.stdout)
            else:
                source = inwav
            sr, signal = read(source)
            assert sr == srate, 'Input file has different sampling rate.'

            # Scale the samples by 2**15.
            signal = signal / np.power(2, 15)

            if reverb_kind and reverb_kind != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))
            cos_trans = freqAnalysis.dct(time_frames) / dct_scale

            frame_num, _ = np.shape(cos_trans)
            feats = np.zeros((frame_num, row_len))

            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()

            for frame_idx in range(frame_num):
                band_feats = np.zeros([n_bands, kept])
                for band in range(n_bands):
                    sub_band = fbank[band, 0:-1] * cos_trans[frame_idx, :]
                    # Compute LPC coefficients
                    xlpc, gain = computeLpcFast(sub_band, lpc_order)
                    if unity_gain:
                        gain = 1
                    mod_spec = computeModSpecFromLpc(gain, xlpc, n_mod)
                    if unity_gain:
                        mod_spec = mod_spec[1:]
                    band_feats[band, :] = mod_spec
                # One row per frame: all bands concatenated.
                feats[frame_idx, :] = np.reshape(band_feats, (1, row_len))

            all_feats[uttid] = feats

    return all_feats
Example #4
0
def extractFDLPTfPattern(wavs,
                         outdir,
                         phone_map,
                         phn_file_dir,
                         get_phone_labels=False,
                         only_center=False,
                         ignore_edge=False,
                         order=50,
                         fduration=0.5,
                         frate=100,
                         nfft=20,
                         nfilters=15,
                         srate=16000,
                         window=np.hanning):
    """Extract FDLP time-frequency patterns and save them with ``np.save``.

    Args:
        wavs (string): Path of an scp file; each line is read as
            ``<uttid> <wav path or pipe command ending in '|'>``.
        outdir (string): Passed directly to ``np.save`` as the destination of
            the feature matrix; ``phone_list`` is saved in its directory.
        phone_map (string): Path of a two-column phone map.  The second
            column of every line that does not contain 'sil'/'SIL' forms the
            (sorted, de-duplicated) phone list.
        phn_file_dir (string): Directory holding ``<uttid>.PHN`` files, read
            as ``<begin> <end> <phone>`` per line.
        get_phone_labels (bool): Append the phone index (position in the
            phone list) as a last column.
        only_center (bool): If False, one row of ``nfilters`` values is
            produced per frame.  If True, one row of ``nfilters * nfft``
            values is produced per phone, taken at the phone's mid point.
        ignore_edge (bool): With ``only_center``, drop the first and last
            phone of every utterance.
        order (int): LPC order passed to ``computeLpcFast``.
        fduration (float): Frame duration in seconds.
        frate (int): Frame rate in Hertz.
        nfft (int): Number of ``freqz`` points per band (``only_center``).
        nfilters (int): Number of filters.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function.

    Raises:
        AssertionError: If an utterance's sampling rate differs from ``srate``.
    """
    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    # Get list of phonemes
    phn_list = []
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            fields = line2.strip().split()
            if len(fields) == 2:
                if 'sil' not in fields and 'SIL' not in fields:
                    phn_list.append(fields[1])
    phn_list = sorted(set(phn_list))

    # Width of one output row (the phone label is an extra last column).
    row_width = nfilters * nfft if only_center else nfilters
    if get_phone_labels:
        row_width += 1

    # Per-utterance matrices are collected and stacked once at the end, which
    # avoids re-copying the growing matrix for every utterance.
    blocks = []

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file: nothing to process.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            fname_phn = uttid + '.PHN'

            if not only_center:
                if get_phone_labels:
                    phn_path = os.path.join(phn_file_dir, fname_phn)
                    if not os.path.isfile(phn_path):
                        # NOTE(review): a missing PHN file stops processing of
                        # ALL remaining utterances (the only_center mode just
                        # skips the utterance) - confirm this is intended.
                        break
                    # Read the phone file up front so the handle is closed.
                    with open(phn_path) as phn_file:
                        phn_lines = iter(phn_file.readlines())
                    # Get phoneme information from the first line
                    phn_locs = next(phn_lines, '').strip().split()
                    phone_now = phn_locs[2]
                    # NOTE(review): the first phone's boundaries are used
                    # unscaled while later ones are divided by 160 below -
                    # confirm which unit the PHN files use.
                    phone_end = int(phn_locs[1])
                    beg_frame = int(phn_locs[0])
                else:
                    beg_frame = 0

            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'
            # Scale the samples by 2**15.
            signal = signal / np.power(2, 15)

            if only_center:
                # Get all phones and their center; utterances without a PHN
                # file are skipped.
                phn_path = os.path.join(phn_file_dir, fname_phn)
                if not os.path.isfile(phn_path):
                    continue
                phone_mid = np.empty(0)
                phone_now = np.empty(0)
                with open(phn_path) as phn_file:
                    for line2 in phn_file:
                        phn_locs = line2.strip().split()
                        if phn_locs[2] in phn_list:
                            ind = phn_list.index(phn_locs[2])
                            phone_now = np.append(phone_now, ind)
                            phone_mid = np.append(
                                phone_mid,
                                int(int(phn_locs[0]) + int(phn_locs[1])) / 2)

            time_frames = np.array([
                frame
                for frame in getFrames(signal, srate, frate, fduration, window)
            ])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))
            [frame_num, ndct] = np.shape(cos_trans)

            if not only_center:
                feats = np.zeros([frame_num, row_width])

                print('Computing Features for file: %s' % uttid)

                for i in range(beg_frame, frame_num):
                    each_feat = np.zeros(nfilters)
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(
                            band_dct, order)  # Compute LPC coefficients
                        w, h = freqz(np.sqrt(gg), xlpc, ndct)
                        # Log of the mean magnitude over the 320 points
                        # around the middle of the response.
                        h_mid = np.log10(
                            np.mean(
                                np.abs(h[int(ndct / 2 - 160):int(ndct / 2 +
                                                                 160)])))
                        each_feat[j] = h_mid

                    if get_phone_labels:
                        # Updates to current phoneme
                        if i > phone_end:
                            # Get new phone label
                            phn_line = next(phn_lines, '')
                            if phn_line:
                                phn_locs = phn_line.strip().split()
                                phone_now = phn_locs[2]
                                phone_end = int(int(phn_locs[1]) / 160)
                            else:
                                break  # Break if no more phones are remaining

                        ind = phn_list.index(phone_now)
                        each_feat = np.append(each_feat, ind)
                    feats[i, :] = each_feat
            else:
                if ignore_edge:
                    phone_mid = phone_mid[1:-1]
                    phone_now = phone_now[1:-1]

                only_compute = len(phone_mid)

                feats = np.zeros([only_compute, nfilters * nfft])

                print('Computing Features for file: %s' % uttid)

                for kk in range(only_compute):
                    i = int(np.floor((phone_mid[kk])))
                    each_feat = np.zeros([nfilters, nfft])
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(
                            band_dct, order)  # Compute LPC coefficients
                        w, h = freqz(np.sqrt(gg), xlpc, nfft)
                        each_feat[j, :] = np.log10(np.abs(h))
                    each_feat = np.reshape(each_feat, (1, nfilters * nfft))
                    feats[kk, :] = each_feat
                if get_phone_labels:
                    feats = np.append(feats,
                                      np.reshape(phone_now,
                                                 (len(phone_now), 1)),
                                      axis=1)

            blocks.append(feats)

    # An scp file that yields no utterance gives an empty (0, row_width)
    # matrix instead of an indexing error.
    if blocks:
        all_feats = np.vstack(blocks)
    else:
        all_feats = np.empty((0, row_width))

    # Save the final BIG feature file
    np.save(outdir, all_feats)
    np.save(os.path.join(os.path.dirname(outdir), 'phone_list'), phn_list)
def extractModSpecFeatures(wavs,
                           outdir,
                           phone_map,
                           phn_file_dir,
                           get_phone_labels=True,
                           only_center=True,
                           around_center=1,
                           ignore_edge=False,
                           nmodulations=12,
                           order=50,
                           fduration=0.5,
                           frate=100,
                           nfft=512,
                           nfilters=15,
                           srate=16000,
                           window=np.hanning):
    '''Extract the Modulation Spectral Features and save them with np.save.

    Args:
        wavs (string): Path of an scp file; each line is read as
            "<uttid> <filename or pipe-command>".
        outdir (string): Passed directly to np.save as the destination of
            the feature matrix; 'phone_list' is saved in its directory.
        phone_map (string): Path of a two-column phone map.  The second
            column of every line that does not contain 'sil'/'SIL' forms the
            (sorted, de-duplicated) phone list.
        phn_file_dir (string): Directory holding the "<uttid>.PHN" files,
            read as "<begin> <end> <phone>" per line.
        get_phone_labels (bool): Set True to append the phone index
            (position in the phone list) as a last column.
        only_center (bool): If False, one row is produced per frame.  If
            True, rows are produced only around each phone's mid point.
        around_center (int): With only_center, number of consecutive frames
            taken around each phone's mid point.
        ignore_edge (bool): With only_center, drop the first and last phone
            of every utterance.
        nmodulations (int): Number of modulation values kept per filter.
        order (int): LPC order passed to computeLpcFast.
        fduration (float): Frame duration in seconds.
        frate (int): Frame rate in Hertz.
        nfft (int): Not used by this function.
        nfilters (int): Number of filters.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function.

    Raises:
        AssertionError: If an utterance's sampling rate differs from srate.

    Note:
        It is possible to use a Kaldi like style to read the audio
        using a "pipe-command" e.g.: "sph2pipe -f wav /path/file.wav |"

    '''
    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    # Get list of phonemes
    phn_list = []
    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            fields = line2.strip().split()
            if len(fields) == 2:
                if 'sil' not in fields and 'SIL' not in fields:
                    phn_list.append(fields[1])
    phn_list = sorted(set(phn_list))

    # Width of one output row (the phone label is an extra last column).
    row_width = nmodulations * nfilters
    if get_phone_labels:
        row_width += 1

    # Per-utterance matrices are collected and stacked once at the end, which
    # avoids re-copying the growing matrix for every utterance.
    blocks = []

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            if not tokens:
                # Blank line in the scp file: nothing to process.
                continue
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            fname_phn = uttid + '.PHN'

            if not only_center:
                # Without labels every frame is processed from the start.
                beg_frame = 0
                if get_phone_labels:
                    # Look in phn_file_dir like the only_center mode does;
                    # fall back to the bare name for callers that relied on
                    # the file being in the working directory.
                    phn_path = os.path.join(phn_file_dir, fname_phn)
                    if not os.path.isfile(phn_path) and os.path.isfile(
                            fname_phn):
                        phn_path = fname_phn
                    # Read the phone file up front so the handle is closed.
                    with open(phn_path) as phn_file:
                        phn_lines = iter(phn_file.readlines())
                    # Get phoneme information from the first line
                    phn_locs = next(phn_lines, '').strip().split()
                    phone_now = phn_locs[2]
                    # Boundaries are divided by 160 to get frame indices.
                    phone_end = int(int(phn_locs[1]) / 160)
                    beg_frame = int(int(phn_locs[0]) / 160)

            if inwav[-1] == '|':
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # Scale the samples by 2**15.
            signal = signal / np.power(2, 15)

            if only_center:
                # Get all phones and their center; utterances without a PHN
                # file are skipped.
                phn_path = os.path.join(phn_file_dir, fname_phn)
                if not os.path.isfile(phn_path):
                    continue
                phone_mid = np.empty(0)
                phone_now = np.empty(0)
                with open(phn_path) as phn_file:
                    for line2 in phn_file:
                        phn_locs = line2.strip().split()
                        if phn_locs[2] in phn_list:
                            ind = phn_list.index(phn_locs[2])
                            phone_now = np.append(phone_now, ind)
                            phone_mid = np.append(
                                phone_mid,
                                int(int(phn_locs[0]) + int(phn_locs[1])) / 2)

            time_frames = np.array([
                frame
                for frame in getFrames(signal, srate, frate, fduration, window)
            ])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            if not only_center:
                feats = np.zeros([frame_num, row_width])

                print('Computing Features for file: %s' % uttid)

                for i in range(beg_frame, frame_num):
                    each_feat = np.zeros([nfilters, nmodulations])
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(
                            band_dct, order)  # Compute LPC coefficients
                        mod_spec = computeModSpecFromLpc(
                            gg, xlpc, nmodulations)
                        each_feat[j, :] = mod_spec
                    each_feat = np.reshape(each_feat,
                                           (1, nfilters * nmodulations))

                    if get_phone_labels:
                        # Updates to current phoneme
                        if i > phone_end:
                            # Get new phone label
                            phn_line = next(phn_lines, '')
                            if phn_line:
                                phn_locs = phn_line.strip().split()
                                phone_now = phn_locs[2]
                                phone_end = int(int(phn_locs[1]) / 160)
                            else:
                                break  # Break if no more phones are remaining

                        ind = phn_list.index(phone_now)
                        each_feat = np.append(each_feat, ind)
                    feats[i, :] = each_feat
            else:
                if ignore_edge:
                    phone_mid = phone_mid[1:-1]
                    phone_now = phone_now[1:-1]

                only_compute = len(phone_mid)

                feats = np.zeros([only_compute * around_center, row_width])

                print('Computing Features for file: %s' % uttid)
                for kk in range(only_compute):
                    i_mid = int(np.floor((phone_mid[kk])))
                    for cont in range(around_center):
                        # Frames are taken symmetrically around the mid point.
                        i = i_mid + cont - int((around_center - 1) / 2)
                        each_feat = np.zeros([nfilters, nmodulations])
                        for j in range(nfilters):
                            filt = fbank[j, 0:-1]
                            band_dct = filt * cos_trans[i, :]
                            xlpc, gg = computeLpcFast(
                                band_dct, order)  # Compute LPC coefficients
                            mod_spec = computeModSpecFromLpc(
                                gg, xlpc, nmodulations)
                            each_feat[j, :] = mod_spec
                        each_feat = np.reshape(each_feat,
                                               (1, nfilters * nmodulations))
                        if get_phone_labels:
                            feats[around_center * kk + cont, :] = np.append(
                                each_feat, phone_now[kk])
                        else:
                            feats[around_center * kk + cont, :] = each_feat

            blocks.append(feats)

    # An scp file that yields no utterance gives an empty (0, row_width)
    # matrix instead of an indexing error.
    if blocks:
        all_feats = np.vstack(blocks)
    else:
        all_feats = np.empty((0, row_width))

    # Save the final BIG feature file
    np.save(outdir, all_feats)
    np.save(os.path.join(os.path.dirname(outdir), 'phone_list'), phn_list)
Example #6
0
def extractModSpecFeatures(args, srate=16000, window=np.hanning):
    '''Extract modulation spectral features at the centre of each phoneme.

    For every utterance listed in ``args.scp`` that has a matching
    ``<uttid>.PHN`` file in ``args.phn_file_dir``, one feature vector of
    length ``nfilters * nmodulations`` is computed per retained phoneme,
    using the frame located at the phoneme mid-point. The per-utterance
    matrices are collected in a dict keyed by utterance id and pickled to
    ``args.outfile``; the sorted phoneme list is saved as ``phone_list``
    (numpy format) in the directory of ``args.outfile``.

    Args:
        args: Object exposing the following attributes:
            scp (string): Text file with one "uttid filename-or-pipe" entry
                per line.
            outfile (string): Path of the pickle file to write.
            phn_file (string): Phoneme map file; the second token of every
                two-token line that contains neither 'sil' nor 'SIL' is
                kept as a phoneme.
            phn_file_dir (string): Directory holding the '<uttid>.PHN'
                files.
            get_phone_labels (bool): If true, the index of the phoneme in
                the sorted phoneme list is appended as a last column.
            add_reverb (string): Falsy, 'clean', 'small_room' or
                'large_room'.
            set_unity_gain (bool): If true, the LPC gain is replaced by 1.
            nmodulations (int): Number of modulation coefficients per
                filter.
            order (int): LPC order passed to computeLpcFast.
            fduration (float): Frame duration in seconds.
            frate (int): Frame rate in Hertz.
            nfilters (int): Number of filters in the filter bank.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function handed to getFrames.

    Raises:
        ValueError: If ``args.add_reverb`` is not one of the known values.
        AssertionError: If an audio file is not sampled at ``srate``.

    Note:
        It is possible to use a Kaldi like style to read the audio
        using a "pipe-command" e.g.: "sph2pipe -f wav /path/file.wav |"

    '''

    wavs = args.scp
    outfile = args.outfile
    phone_map = args.phn_file
    phn_file_dir = args.phn_file_dir
    get_phone_labels = args.get_phone_labels
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Get list of phonemes
    phn_list = []

    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            line2 = line2.strip().split()
            if len(line2) == 2:
                if 'sil' not in line2 and 'SIL' not in line2:
                    phn_list.append(line2[1])

    phn_list = list(set(phn_list))
    phn_list.sort()

    all_feats = {}

    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if inwav[-1] == '|':
                # Kaldi style pipe: run the command and parse its stdout
                proc = subprocess.run(inwav[:-1],
                                      shell=True,
                                      stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(proc.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            fname_phn = uttid + '.PHN'
            phn_path = os.path.join(phn_file_dir, fname_phn)

            # Get all phones and their center

            if os.path.isfile(phn_path):
                phone_mid = np.empty(0)
                phone_now = np.empty(0)
                # The context manager guarantees the phone file is closed
                # even if a malformed line raises.
                with open(phn_path) as phn_file:
                    for line2 in phn_file:

                        phn_locs = line2.strip().split()
                        if phn_locs[2] in phn_list:
                            ind = phn_list.index(phn_locs[2])
                            phone_now = np.append(phone_now, ind)
                            # NOTE(review): the mid-point is used below as a
                            # row index into cos_trans, so the first two
                            # columns are presumably frame indices - confirm.
                            phone_mid = np.append(
                                phone_mid,
                                (int(phn_locs[0]) + int(phn_locs[1])) / 2)

                time_frames = np.array([
                    frame for frame in getFrames(signal, srate, frate,
                                                 fduration, window)
                ])

                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

                [frame_num, ndct] = np.shape(cos_trans)

                only_compute = len(phone_mid)

                if get_phone_labels:
                    feats = np.zeros(
                        [only_compute, nmodulations * nfilters + 1])
                else:
                    feats = np.zeros([only_compute, nmodulations * nfilters])

                print('Computing Features for file: %s' % uttid)
                sys.stdout.flush()
                for kk in range(only_compute):
                    i = int(np.floor((phone_mid[kk])))
                    each_feat = np.zeros([nfilters, nmodulations])
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(
                            band_dct, order)  # Compute LPC coefficients
                        if set_unity_gain:
                            gg = 1
                        mod_spec = computeModSpecFromLpc(
                            gg, xlpc, nmodulations)
                        each_feat[j, :] = mod_spec
                    each_feat = np.reshape(each_feat,
                                           (1, nfilters * nmodulations))
                    if get_phone_labels:
                        feats[kk, :] = np.append(each_feat, phone_now[kk])
                    else:
                        feats[kk, :] = each_feat

                all_feats[uttid] = feats

    # Save the final BIG feature file; close the handle explicitly so the
    # pickle is fully flushed to disk before the function returns.
    with open(outfile, 'wb') as out_fid:
        pickle.dump(all_feats, out_fid)
    np.save(os.path.join(os.path.dirname(outfile), 'phone_list'), phn_list)
Example #7
0
def getFeats(args, srate=16000, window=np.hamming):
    """Compute per-band log features from LPC modulation spectra.

    Each utterance listed in ``args.scp`` is split into frames, transformed
    with a DCT, and multiplied by each filter of the filter bank. For every
    (frame, filter) pair the LPC based modulation spectrum is computed,
    masked/weighted, passed through ``fft`` and ``exp`` and overlap-added
    into an ``nfilters x num_output_frames`` matrix. The log of the clipped,
    transposed matrix is stored per utterance and all utterances are
    written with ``dict2Ark``.

    Args:
        args: Object exposing the following attributes:
            scp (string): Text file with one "uttid source" entry per line.
            scp_type (string): 'wav' (file name or pipe command ending in
                '|') or 'segment' (read through ``wav-copy``).
            outfile (string): Output path handed to dict2Ark; also the
                prefix of the optional '.len' file.
            coeff_num (int): Number of modulation coefficients computed.
            coeff_range (string): "low,high"; coefficients with index in
                [low, high] are kept, the others are zeroed.
            order (int): LPC order passed to computeLpcFast.
            fduration (float): Frame duration in seconds.
            frate (int): Output frame rate in Hertz.
            nfilters (int): Number of filters in the filter bank.
            kaldi_cmd: Passed through to dict2Ark.
            add_noise (string): Falsy, 'clean', 'diff' (convolve with a
                fixed kernel) or "<noise>,<level>" where the first field is
                given to load_noise and the second, as a float, to
                add_noise_to_wav.
            add_reverb (string): Falsy, 'clean', 'small_room',
                'medium_room' or 'large_room'.
            lifter_config (string): Optional path of a file whose first
                line holds comma separated lifter weights.
            fbank_type (string): "mel,<warp>" or
                "cochlear,<om_w>,<alp>,<fixed>,<bet>,<warp>".
            odd_mod_zero (bool): If true, odd-indexed coefficients are
                zeroed.
            overlap_fraction (float): Frame overlap fraction.
            gamma_weight (string): "None" or "scale,shape,peak" for a gamma
                shaped weighting of the coefficients.
            write_utt2num_frames (bool): If true, write
                '<outfile>.len' with "uttid num_frames" lines.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function used for framing and for the
            window compensation applied before overlap-add.

    Raises:
        ValueError: On an invalid filter bank, reverberation or scp type.
        AssertionError: If a 'wav' entry is not sampled at ``srate``.

    Note:
        ``args.overlap_fraction`` and ``args.gamma_weight`` are overwritten
        in place (with ``1 - overlap_fraction`` and the split list), so the
        same ``args`` object must not be passed to this function twice.
        Utterances whose audio cannot be read are reported and skipped.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    coeff_num = args.coeff_num
    coeff_range = args.coeff_range
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd
    add_noise = args.add_noise
    add_reverb = args.add_reverb

    if args.lifter_config:
        # Only the first line of the lifter file is used
        with open(args.lifter_config, 'r') as lifter_fid:
            lifter_config = lifter_fid.readline().strip().split(',')
        lifter_config = np.asarray([float(x) for x in lifter_config])

    # Set up mel-filterbank
    fbank_type = args.fbank_type.strip().split(',')
    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, int(2 * fduration * srate), srate, warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError('Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print('%s: Alpha is fixed and will not change as a function of the center frequency...' % sys.argv[0])
        fbank = createFbankCochlear(nfilters, int(2 * fduration * srate), srate, om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]), fixed=int(fbank_type[3]), bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError('Invalid type of filter bank, use mel or cochlear with proper configuration')

    # Ignore odd modulations
    if args.odd_mod_zero:
        print('%s: Ignoring odd modulations... ' % sys.argv[0])
    if add_noise:
        if add_noise == "clean" or add_noise == "diff":
            print('%s: No noise added!' % sys.argv[0])
        else:
            noise_info = add_noise.strip().split(',')
            noise = load_noise(noise_info[0])

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'medium_room':
            sr_r, rir = read('./RIR/RIR_MediumRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Set up mask
    coeff_range = coeff_range.split(',')
    lowpass = int(coeff_range[0])
    highpass = int(coeff_range[1])
    mask = []
    for i in range(coeff_num):
        if i >= lowpass and i <= highpass:
            mask.append(1)
        else:
            mask.append(0)
    mask = np.asarray(mask)
    args.overlap_fraction = 1 - args.overlap_fraction

    # Setup modulation weights
    args.gamma_weight = args.gamma_weight.strip().split(',')
    if not args.gamma_weight[0] == "None":
        print('%s: Adding gamma filter on modulation frequencies...' % sys.argv[0])
        x = np.linspace(0, order - 1, order)
        scale = float(args.gamma_weight[0])
        shape = float(args.gamma_weight[1])
        pk_required = float(args.gamma_weight[2])
        res = 2 * fduration
        pk_required = pk_required * res
        pk = (shape - 1) * scale
        loc = -pk + pk_required
        # NOTE(review): mod_wts has `order` entries but multiplies a vector
        # of `coeff_num` entries below - presumably both are equal; confirm.
        mod_wts = stats.gamma.pdf(x, a=shape, loc=loc, scale=scale) * 3 * scale

    # Quantities that do not depend on the utterance, frame or filter
    frame_len = int(np.round(fduration * frate))  # output frames per window
    half_len = int(np.round(fduration * frate / 2))
    hop_len = int(np.round(fduration * frate * args.overlap_fraction))
    synth_win = np.hanning(frame_len)
    analysis_win = window(frame_len)

    with open(wavs, 'r') as fid:

        all_feats = {}
        if args.write_utt2num_frames:
            all_lens = {}

        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                if inwav[-1] == '|':
                    try:
                        proc = subprocess.run(inwav[:-1], shell=True, stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                        skip_rest = False
                    except Exception:
                        skip_rest = True
                else:
                    try:
                        sr, signal = read(inwav)
                        skip_rest = False
                    except Exception:
                        skip_rest = True

                # Only check the rate when the read succeeded; otherwise
                # `sr` is unbound (or left over from the previous utterance).
                if not skip_rest:
                    assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                    skip_rest = False
                except Exception:
                    skip_rest = True
            else:
                raise ValueError('Invalid type of scp type, it should be either wav or segment')

            # I want to work with numbers from 0 to 1 so....
            # signal = signal / np.power(2, 15)

            if skip_rest:
                # Best effort: report the unreadable utterance and move on
                print('%s: Could not read audio for %s, skipping...' % (sys.argv[0], uttid))
                sys.stdout.flush()
            else:
                if add_noise:
                    if not add_noise == "clean":
                        if add_noise == "diff":
                            a = [1, 2, 3, 2, 0, -2, -5, -2, 0, 2, 3, 2, 1]
                            signal = convolve(signal, a, mode='same')
                        else:
                            signal = add_noise_to_wav(signal, noise, float(noise_info[1]))

                if add_reverb:
                    if not add_reverb == 'clean':
                        signal = addReverb(signal, rir)

                tframes = signal.shape[0]  # Number of samples in the signal

                # Frame rate used for the long analysis windows
                lfr = 1 / (args.overlap_fraction * fduration)
                time_frames = np.array([frame for frame in
                                        getFrames(signal, srate, lfr, fduration, window)])

                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(2 * int(srate * fduration))

                [frame_num, ndct] = np.shape(cos_trans)

                feats = np.zeros((nfilters, int(np.ceil(tframes * frate / srate))))
                ptr = int(0)  # Write position of the current window in feats

                print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
                sys.stdout.flush()

                for i in range(0, frame_num):
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(band_dct, order)  # Compute LPC coefficients
                        ms = computeModSpecFromLpc(gg, xlpc, coeff_num)
                        ms = ms * mask
                        if args.lifter_config:
                            ms = ms * lifter_config
                        if not args.gamma_weight[0] == "None":
                            ms = ms * mod_wts
                        if args.odd_mod_zero:
                            ms[1::2] = 0
                        ms = fft(ms, 2 * int(fduration * frate))
                        ms = np.abs(np.exp(ms))
                        # Undo the analysis window and apply a Hanning window
                        ms = ms[0:frame_len] * synth_win / analysis_win

                        if i == 0:
                            # First window: only its second half is used
                            if feats.shape[1] < half_len:
                                feats[j, :] += ms[half_len:half_len + feats.shape[1]]
                            else:
                                feats[j, ptr:ptr + half_len] += ms[half_len:]
                        elif i == frame_num - 1 or i == frame_num - 2:
                            # Last windows: truncate to the room left
                            if ms.shape[0] >= feats.shape[1] - ptr:
                                feats[j, ptr:] += ms[:feats.shape[1] - ptr]
                            else:
                                feats[j, ptr:ptr + frame_len] += ms
                        else:
                            feats[j, ptr:ptr + frame_len] += ms

                    if i == 0:
                        ptr = int(ptr + hop_len - half_len)
                    else:
                        # NOTE(review): the hop is randomly lengthened by 0
                        # or 1, which makes the output non-deterministic;
                        # presumably this compensates rounding of hop_len -
                        # confirm the intent.
                        ptr = int(ptr + hop_len + randrange(2))

                all_feats[uttid] = np.log(np.clip(feats.T, a_max=None, a_min=0.00000000000001))
                if args.write_utt2num_frames:
                    all_lens[uttid] = feats.shape[1]

        dict2Ark(all_feats, outfile, kaldi_cmd)
        if args.write_utt2num_frames:
            with open(outfile + '.len', 'w+') as file:
                for key, lens in all_lens.items():
                    p = "{:s} {:d}".format(key, lens)
                    file.write(p)
                    file.write("\n")
def extractModSpecFeatures(args, srate=16000, window=np.hamming):
    """Extract frame-level modulation spectral features inside phonemes.

    For every utterance listed in ``args.scp`` that has a phone file in
    ``args.phn_file_dir``, a ``nfilters * nmodulations`` feature vector is
    computed for each frame, and the frames lying in ``[begin, end)`` of
    each phone-file entry are stacked into one matrix per utterance. The
    matrices are collected in a dict keyed by utterance id and pickled to
    ``args.outfile``. The sorted phoneme list is saved as ``phone_list``
    (numpy format) in the current working directory.

    Args:
        args: Object exposing the following attributes:
            scp (string): Text file with one "uttid filename-or-pipe" entry
                per line.
            outfile (string): Path of the pickle file to write.
            add_reverb (string): Falsy, 'clean', 'small_room' or
                'large_room'.
            set_unity_gain (bool): If true, the LPC gain is replaced by 1.
            nmodulations (int): Number of modulation coefficients per
                filter.
            order (int): LPC order passed to computeLpcFast.
            fduration (float): Frame duration in seconds.
            frate (int): Frame rate in Hertz.
            nfilters (int): Number of filters in the filter bank.
            phn_file (string): Phoneme map file; the second token of every
                two-token line that contains neither 'sil' nor 'SIL' is
                kept as a phoneme.
            phn_file_dir (string): Directory holding the '.PHN' files.
            get_phone_labels (bool): If true, the index of the phoneme in
                the sorted phoneme list is appended as a last column.
        srate (int): Expected sampling rate of the audio.
        window (function): Windowing function handed to getFrames.

    Raises:
        ValueError: If ``args.add_reverb`` is not one of the known values,
            or, with ``get_phone_labels``, if a phone-file label is missing
            from the phoneme list.
        AssertionError: If an audio file is not sampled at ``srate``.

    Note:
        Utterances without a phone file, or with an empty one, are skipped
        and do not appear in the output dict.
    """

    wavs = args.scp
    outfile = args.outfile
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    phone_map = args.phn_file
    phn_file_dir = args.phn_file_dir
    get_phone_labels = args.get_phone_labels

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
        sys.stdout.flush()

    # Get list of phonemes
    phn_list = []

    with open(phone_map, 'r') as fid2:
        for line2 in fid2:
            line2 = line2.strip().split()
            if len(line2) == 2:
                if 'sil' not in line2 and 'SIL' not in line2:
                    phn_list.append(line2[1])

    phn_list = list(set(phn_list))
    phn_list.sort()
    np.save('phone_list', phn_list)

    with open(wavs, 'r') as fid:

        all_feats = {}
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            # Prefer the phone file named after the utterance id without its
            # last two characters when that file exists.
            fname_phn = uttid + '.PHN'
            fname_phn_base = uttid[0:-2] + '.PHN'

            if isfile(join(phn_file_dir, fname_phn_base)):
                fname_phn = fname_phn_base

            # Get all the locations of phonemes
            phone_now = np.empty(0)
            phone_end = np.empty(0)
            phone_beg = np.empty(0)

            if isfile(join(phn_file_dir, fname_phn)):
                print('%s: Computing Features for file: %s' %
                      (sys.argv[0], uttid))
                sys.stdout.flush()
                with open(join(phn_file_dir, fname_phn)) as phn_file:

                    for phn_line in phn_file:

                        phn_locs = phn_line.strip().split()

                        # Get phoneme information

                        phone_now = np.append(phone_now, phn_locs[2])
                        phone_end = np.append(phone_end, phn_locs[1])
                        phone_beg = np.append(phone_beg, phn_locs[0])
                if np.size(phone_end) == 0:
                    print('%s: Corrupted Phone file.. hence skipped...' %
                          sys.argv[0])
                    continue

                if inwav[-1] == '|':
                    # Kaldi style pipe: run the command and parse its stdout
                    proc = subprocess.run(inwav[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal = read(inwav)
                assert sr == srate, 'Input file has different sampling rate.'

                signal = signal / np.power(2, 15)

                if add_reverb:
                    if not add_reverb == 'clean':
                        signal = addReverb(signal, rir)

                time_frames = np.array([
                    frame for frame in getFrames(signal, srate, frate,
                                                 fduration, window)
                ])

                cos_trans = dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

                [frame_num, ndct] = np.shape(cos_trans)

                # Main feature computation loop

                feats = np.zeros((frame_num, nfilters * nmodulations))
                sys.stdout.flush()
                for i in range(frame_num):
                    each_feat = np.zeros([nfilters, nmodulations])
                    for j in range(nfilters):
                        filt = fbank[j, 0:-1]
                        band_dct = filt * cos_trans[i, :]
                        xlpc, gg = computeLpcFast(
                            band_dct, order)  # Compute LPC coefficients
                        if set_unity_gain:
                            gg = 1
                        mod_spec = computeModSpecFromLpc(
                            gg, xlpc, nmodulations)
                        each_feat[j, :] = mod_spec
                    each_feat = np.reshape(each_feat,
                                           (1, nfilters * nmodulations))
                    feats[i, :] = each_feat

                # Uninitialised seed row so np.vstack has something to stack
                # onto; it is dropped again after the loop.
                if not get_phone_labels:
                    now_feats = np.empty(nfilters * nmodulations)
                else:
                    now_feats = np.empty(nfilters * nmodulations + 1)

                for num, phn in enumerate(phone_now):

                    now_frames = feats[
                        int(phone_beg[num]):int(phone_end[num]), :]
                    if get_phone_labels:
                        # NOTE(review): every phone-file label is looked up,
                        # including ones filtered out of phn_list (e.g.
                        # 'sil'); such a label raises ValueError here -
                        # confirm the phone files never contain them.
                        ind = phn_list.index(phn)
                        fr_num = now_frames.shape[0]
                        now_frames = np.concatenate(
                            (now_frames, np.tile(ind, (fr_num, 1))), axis=1)

                    now_feats = np.vstack([now_feats, now_frames])
                now_feats = now_feats[1:, :]
                # Store only features computed for THIS utterance; storing
                # outside this branch would reuse the previous utterance's
                # matrix (or fail) when the phone file is missing.
                all_feats[uttid] = now_feats

        # Close the handle explicitly so the pickle is fully flushed
        with open(outfile, 'wb') as out_fid:
            pickle.dump(all_feats, out_fid)