Exemple #1
0
def split_dict_and_save_ark(in_dict, split_num, data_folder, name, kaldi_cmd):
    """Split a dictionary into pieces and write each piece as an ark file.

    The keys of ``in_dict`` are cut into pieces of ``ceil(len / split_num)``
    keys with ``chunks`` and every piece is written with ``dict2Ark`` to
    ``<data_folder>/<name>/normalized.<k>`` (``k`` starting at 1).

    Args:
        in_dict: Mapping whose entries are to be written.
        split_num: Requested number of pieces.
        data_folder: Parent directory of the output directory.
        name: Name of the output directory created inside ``data_folder``.
        kaldi_cmd: Passed through unchanged to ``dict2Ark``.

    Note:
        Fewer than ``split_num`` files are written when the keys do not fill
        every piece (for instance 4 keys with ``split_num=3`` give two pieces
        of two keys). Indexing the missing pieces used to raise
        ``IndexError``.
    """
    all_keys = list(in_dict.keys())
    # Never let the chunk size drop to zero (empty in_dict).
    dict_size = max(1, int(np.ceil(len(all_keys) / split_num)))
    split_keys = list(chunks(all_keys, dict_size))

    out_dir = join(data_folder, name)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)

    # Only visit the pieces that were actually produced.
    for i in range(min(split_num, len(split_keys))):
        ark_file_name = join(out_dir, 'normalized.' + str(i + 1))
        dict2Ark(sub_dict(in_dict, split_keys[i]), ark_file_name, kaldi_cmd)
            for utt_id, mat in kaldi_io.read_mat_ark(cmd)
        }
        all_feats.append(feat_dict)

    post_dict = {}
    # I assume 3 streams from here
    for utt_id in all_feats[0]:
        batch_x1 = all_feats[0][utt_id][None, :, :]
        batch_x2 = all_feats[1][utt_id][None, :, :]
        batch_x3 = all_feats[2][utt_id][None, :, :]
        batch_l = Variable(torch.IntTensor([batch_x1.shape[1]]))
        out = model([batch_x1, batch_x2, batch_x3], batch_l)
        if config.prior:
            post_dict[utt_id] = lsm(
                out[0, :, :]).data.numpy() - config.prior_weight * prior
        else:
            if config.add_softmax:
                post_dict[utt_id] = sm(out[0, :, :]).data.numpy()
            else:
                post_dict[utt_id] = out[0, :, :].data.numpy()

    return post_dict


if __name__ == '__main__':
    # Parse the options, compute the outputs for every utterance and
    # write them to the absolute path of the requested save file.
    config = get_args()
    post_dict = get_output(config)
    save_path = os.path.abspath(config.save_file)
    dict2Ark(post_dict, save_path, kaldi_cmd='copy-feats')
Exemple #3
0
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every entry of an scp file.

    Each line of ``args.scp`` holds an utterance id followed by a wav
    location: a path, a command ending in ``|`` whose stdout is the wav
    (``scp_type == 'wav'``), or an argument string for ``wav-copy``
    (``scp_type == 'segment'``). Every readable signal is framed with
    ``getFrames``, transformed with ``freqAnalysis.ifft`` (complex
    modulation) or ``freqAnalysis.dct``, multiplied by each filter of the
    filter bank, fitted with ``computeLpcFast`` and turned into a modulation
    spectrum with ``computeModSpecFromLpc``. Coefficients ``coeff_0`` to
    ``coeff_n`` of every band are concatenated into one row per frame and
    all utterances are written with ``dict2Ark``.

    Args:
        args: Parsed options. Reads ``scp``, ``scp_type``, ``outfile``,
            ``add_reverb``, ``coeff_0``, ``coeff_n``, ``order``,
            ``fduration``, ``frate``, ``nfilters``, ``kaldi_cmd``,
            ``fbank_type``, ``complex_modulation``, ``keep_even``,
            ``absolute_value``, ``compensate_noise`` and ``no_window``.
        srate: Sampling rate every ``wav`` entry must have.
        window: Window callable handed to ``getFrames``; replaced by
            ``sq_wind`` when ``args.no_window`` is set.

    Raises:
        ValueError: For a badly configured filter bank, an unknown
            reverberation type or an unknown ``scp_type``.
        AssertionError: If a ``wav`` entry was read successfully but its
            sampling rate differs from ``srate``.

    Note:
        Entries that cannot be read are skipped with a message on stderr.
        The sampling-rate check now runs only after a successful read;
        before, a failed read reached the check with ``sr`` unbound (first
        entry) or left over from the previous entry.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    add_reverb = args.add_reverb
    coeff_0 = args.coeff_0
    coeff_n = args.coeff_n
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    # Set up the filter bank ("mel,<warp>" or
    # "cochlear,<om_w>,<alp>,<fixed>,<bet>,<warp>").
    fbank_type = args.fbank_type.strip().split(',')
    if args.complex_modulation:
        dur = int(fduration * srate)
    else:
        dur = int(2 * fduration * srate)

    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters,
                            dur,
                            srate,
                            warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError(
                'Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print(
                '%s: Alpha is fixed and will not change as a function of the center frequency...'
                % sys.argv[0])
        fbank = createFbankCochlear(nfilters,
                                    dur,
                                    srate,
                                    om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]),
                                    fixed=int(fbank_type[3]),
                                    bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError(
            'Invalid type of filter bank, use mel or cochlear with proper configuration'
        )
    coeff_num = coeff_n - coeff_0 + 1

    # Number of values kept per band.
    if args.keep_even:
        temp = np.arange(0, coeff_num)
        if coeff_0 % 2 == 0:
            # It starts from odd coefficients
            feat_len = temp[1::2].shape[0]
        else:
            feat_len = temp[0::2].shape[0]
    elif args.complex_modulation:
        if args.absolute_value:
            feat_len = coeff_num
        else:
            # Real and imaginary parts are stored side by side.
            feat_len = 2 * coeff_num
    else:
        feat_len = coeff_num

    if args.compensate_noise:
        if args.complex_modulation:
            fmax = coeff_num / (fduration)
        else:
            fmax = coeff_num / (2 * fduration)
        # Per-coefficient weights multiplied onto the modulation spectrum.
        faxis = np.linspace(0, fmax, coeff_n)

    if args.no_window:
        print('%s: Using square windows' % sys.argv[0])
        window = sq_wind

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    with open(wavs, 'r') as fid:
        all_feats = {}

        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                # A trailing '|' marks a command whose stdout is the wav.
                is_pipe = inwav[-1] == '|'
                try:
                    if is_pipe:
                        proc = subprocess.run(inwav[:-1],
                                              shell=True,
                                              stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                    else:
                        sr, signal = read(inwav)
                except Exception as err:
                    print('%s: Could not read %s (%s), skipping' %
                          (sys.argv[0], uttid, err),
                          file=sys.stderr)
                    continue

                # Only meaningful once the read has succeeded.
                assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                # NOTE(review): the sampling rate is not checked for
                # segment inputs, as in the original code.
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd,
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                except Exception as err:
                    print('%s: Could not read %s (%s), skipping' %
                          (sys.argv[0], uttid, err),
                          file=sys.stderr)
                    continue
            else:
                raise ValueError(
                    'Invalid type of scp type, it should be either wav or segment'
                )

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame for frame in getFrames(signal, srate, frate,
                                             fduration, window)
            ])

            if args.complex_modulation:
                cos_trans = freqAnalysis.ifft(time_frames)
                cos_trans = cos_trans[:, :int(fduration * srate / 2)]
            else:
                cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                    2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * feat_len))

            print('%s: Computing Features for file: %s, also %d' %
                  (sys.argv[0], uttid, time_frames.shape[0]))
            sys.stdout.flush()
            for i in range(frame_num):

                each_feat = np.zeros([nfilters, feat_len])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]

                    # Compute LPC coefficients and the modulation spectrum
                    if args.complex_modulation:
                        xlpc, gg = computeLpcFast(band_dct,
                                                  order,
                                                  keepreal=False)
                        mod_spec = computeModSpecFromLpc(gg, xlpc, coeff_n)
                    else:
                        xlpc, gg = computeLpcFast(band_dct, order)
                        mod_spec = np.real(
                            computeModSpecFromLpc(gg, xlpc, coeff_n))

                    if args.compensate_noise:
                        mod_spec = mod_spec * faxis

                    coeffs = mod_spec[coeff_0 - 1:coeff_n]
                    if args.absolute_value:
                        temp2 = np.abs(coeffs)
                    elif args.complex_modulation:
                        temp2 = np.append(np.real(coeffs), np.imag(coeffs))
                    else:
                        temp2 = coeffs

                    if args.keep_even:
                        if coeff_0 % 2 == 0:
                            each_feat[j, :] = temp2[1::2]
                        else:
                            each_feat[j, :] = temp2[0::2]
                    else:
                        each_feat[j, :] = temp2

                each_feat = np.reshape(each_feat, (1, nfilters * feat_len))

                feats[i, :] = each_feat

            all_feats[uttid] = feats

        dict2Ark(all_feats, outfile, kaldi_cmd)
Exemple #4
0
                        help='Set LPC gain to 1 (True)')
    parser.add_argument('--kaldi_cmd',
                        help='Kaldi command to use to get ark files')

    args = parser.parse_args()

    start_time = time.time()
    print('%s: Computing MFCC features' % sys.argv[0])
    sys.stdout.flush()

    all_mfcc = get_mfcc(args)

    print('%s: Computing Modulation Spectral features' % sys.argv[0])
    sys.stdout.flush()

    all_modspec = get_modspec(args)

    print('%s: Combining Modulation Spectral features' % sys.argv[0])
    sys.stdout.flush()

    all_feats = {}
    for uttid in list(all_mfcc.keys()):
        mfcc = all_mfcc[uttid]
        modspec = all_modspec[uttid]
        all_feats[uttid] = np.concatenate((modspec, mfcc), axis=1)

    dict2Ark(all_feats, args.outfile, args.kaldi_cmd)
    time_note = 'Execution Time: {t:.3f} seconds'.format(t=time.time() -
                                                         start_time)
    print(time_note)
    sys.stdout.flush()
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for the segments of a segment file.

    ``args.scp`` maps wav ids to wav locations (a path, or a command ending
    in ``|`` whose stdout is the wav). Each line of ``args.segment`` is
    ``<segment-id> <wav-id> <start> <end>``; start and end are multiplied by
    the sampling rate of the wav to obtain sample indices. Every segment is
    cut from its wav, scaled by 2**-15, optionally reverberated, framed with
    ``getFrames``, transformed with ``freqAnalysis.dct``, filtered band by
    band and converted to ``nmodulations`` values per band. The result is
    written with ``dict2Ark``, keyed by segment id.

    Args:
        args: Parsed options. Reads ``scp``, ``segment``, ``outfile``,
            ``add_reverb``, ``set_unity_gain``, ``nmodulations``, ``order``,
            ``fduration``, ``frate``, ``nfilters`` and ``kaldi_cmd``.
        srate: Sampling rate every wav must have.
        window: Window callable handed to ``getFrames``.

    Raises:
        ValueError: For an unknown reverberation type, or when a segment
            refers to a wav id that is not listed in ``args.scp``.
        AssertionError: If a wav has a sampling rate different from
            ``srate``. Piped inputs are now checked as well; before, only
            plain file paths were.
    """
    wavs = args.scp
    segment = args.segment
    outfile = args.outfile
    add_reverb = args.add_reverb
    set_unity_gain = args.set_unity_gain
    nmodulations = args.nmodulations
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd

    fbank = createFbank(nfilters, int(2 * fduration * srate), srate)

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    wav_in_buffer = ''  # Id of the wav file that is currently in RAM

    # Load the location of every wav file, keyed by its id. setdefault keeps
    # the first location when an id is listed more than once, which is what
    # the former list.index() lookup returned.
    wav_locs = {}
    with open(wavs, 'r') as fid:
        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])
            wav_locs.setdefault(uttid, inwav)

    # Compute features for all the segments
    all_feats = {}
    with open(segment, 'r') as fid_s:
        for line_s in fid_s:
            token_s = line_s.strip().split()
            seg_id, wav_id = token_s[0], token_s[1]

            # Load the wav file only when it is not the one already in RAM
            if wav_in_buffer != wav_id:
                if wav_id not in wav_locs:
                    raise ValueError(
                        'Segment %s refers to unknown wav id %s' %
                        (seg_id, wav_id))
                inwav = wav_locs[wav_id]
                if inwav[-1] == '|':
                    proc = subprocess.run(inwav[:-1],
                                          shell=True,
                                          stdout=subprocess.PIPE)
                    sr, signal_big = read(io.BytesIO(proc.stdout))
                else:
                    sr, signal_big = read(inwav)
                # The framing below uses srate, so both kinds of input must
                # match it.
                assert sr == srate, 'Input file has different sampling rate.'
                wav_in_buffer = wav_id

            t_beg = int(float(token_s[2]) * sr)
            t_end = int(float(token_s[3]) * sr)
            signal = signal_big[t_beg:t_end]
            signal = signal / np.power(2, 15)

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            time_frames = np.array([
                frame
                for frame in getFrames(signal, srate, frate, fduration, window)
            ])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(
                2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            feats = np.zeros((frame_num, nfilters * nmodulations))
            print('%s: Computing Features for file: %s and segment: %s' % (sys.argv[0],wav_id,seg_id))
            sys.stdout.flush()
            for i in range(frame_num):
                each_feat = np.zeros([nfilters, nmodulations])
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    # Compute LPC coefficients
                    xlpc, gg = computeLpcFast(band_dct, order)
                    if set_unity_gain:
                        gg = 1
                    mod_spec = computeModSpecFromLpc(gg, xlpc, nmodulations)
                    each_feat[j, :] = mod_spec
                each_feat = np.reshape(each_feat, (1, nfilters * nmodulations))
                feats[i, :] = each_feat

            all_feats[seg_id] = feats
    dict2Ark(all_feats, outfile, kaldi_cmd)
Exemple #6
0
def getFeats(args, srate=16000, window=np.hamming):
    """Compute log features by overlap-adding per-frame band envelopes.

    Each line of ``args.scp`` holds an utterance id followed by a wav
    location (a path, a command ending in ``|``, or a ``wav-copy`` argument
    string when ``scp_type == 'segment'``). For every readable signal and
    every filter-bank band, the modulation spectrum from
    ``computeModSpecFromLpc`` is masked to the requested coefficient range,
    optionally weighted (lifter, gamma weights, odd coefficients zeroed),
    passed through ``fft`` and ``exp``, and the magnitude is overlap-added
    into a ``(nfilters, ceil(samples * frate / srate))`` buffer. The log of
    the clipped, transposed buffer is stored per utterance and written with
    ``dict2Ark``. With ``args.write_utt2num_frames`` the frame counts are
    also written to ``<outfile>.len``.

    Args:
        args: Parsed options. Reads ``scp``, ``scp_type``, ``outfile``,
            ``coeff_num``, ``coeff_range``, ``order``, ``fduration``,
            ``frate``, ``nfilters``, ``kaldi_cmd``, ``add_noise``,
            ``add_reverb``, ``lifter_config``, ``fbank_type``,
            ``odd_mod_zero``, ``overlap_fraction``, ``gamma_weight`` and
            ``write_utt2num_frames``.
        srate: Sampling rate every ``wav`` entry must have.
        window: Window callable handed to ``getFrames`` and divided out of
            each frame before overlap-add.

    Raises:
        ValueError: For a badly configured filter bank, an unknown
            reverberation type or an unknown ``scp_type``.
        AssertionError: If a ``wav`` entry was read successfully but its
            sampling rate differs from ``srate``.

    Note:
        ``args`` is no longer modified: ``1 - overlap_fraction`` and the
        split ``gamma_weight`` are kept in locals, so the function can be
        called again with the same ``args``. Entries that cannot be read
        are skipped with a message on stderr, and the sampling-rate check
        runs only after a successful read.
    """
    wavs = args.scp
    scp_type = args.scp_type
    outfile = args.outfile
    coeff_num = args.coeff_num
    coeff_range = args.coeff_range
    order = args.order
    fduration = args.fduration
    frate = args.frate
    nfilters = args.nfilters
    kaldi_cmd = args.kaldi_cmd
    add_noise = args.add_noise
    add_reverb = args.add_reverb

    if args.lifter_config:
        # First line of the file: comma-separated lifter weights.
        with open(args.lifter_config, 'r') as lifter_fid:
            lifter_fields = lifter_fid.readline().strip().split(',')
        lifter_config = np.asarray([float(x) for x in lifter_fields])

    # Set up the filter bank ("mel,<warp>" or
    # "cochlear,<om_w>,<alp>,<fixed>,<bet>,<warp>").
    fbank_type = args.fbank_type.strip().split(',')
    if fbank_type[0] == "mel":
        if len(fbank_type) < 2:
            raise ValueError('Mel filter bank not configured properly....')
        fbank = createFbank(nfilters, int(2 * fduration * srate), srate, warp_fact=float(fbank_type[1]))
    elif fbank_type[0] == "cochlear":
        if len(fbank_type) < 6:
            raise ValueError('Cochlear filter bank not configured properly....')
        if int(fbank_type[3]) == 1:
            print('%s: Alpha is fixed and will not change as a function of the center frequency...' % sys.argv[0])
        fbank = createFbankCochlear(nfilters, int(2 * fduration * srate), srate, om_w=float(fbank_type[1]),
                                    alp=float(fbank_type[2]), fixed=int(fbank_type[3]), bet=float(fbank_type[4]),
                                    warp_fact=float(fbank_type[5]))
    else:
        raise ValueError('Invalid type of filter bank, use mel or cochlear with proper configuration')

    # Ignore odd modulations
    if args.odd_mod_zero:
        print('%s: Ignoring odd modulations... ' % sys.argv[0])
    if add_noise:
        if add_noise == "clean" or add_noise == "diff":
            print('%s: No noise added!' % sys.argv[0])
        else:
            # "<noise spec>,<level>": the first field is loaded here, the
            # second is passed to add_noise_to_wav for each utterance.
            noise_info = add_noise.strip().split(',')
            noise = load_noise(noise_info[0])

    if add_reverb:
        if add_reverb == 'small_room':
            sr_r, rir = read('./RIR/RIR_SmallRoom1_near_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'medium_room':
            sr_r, rir = read('./RIR/RIR_MediumRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'large_room':
            sr_r, rir = read('./RIR/RIR_LargeRoom1_far_AnglA.wav')
            rir = rir[:, 1]
            rir = rir / np.power(2, 15)
        elif add_reverb == 'clean':
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')

    # Set up mask: 1 for coefficient indices inside [lowpass, highpass].
    range_fields = coeff_range.split(',')
    lowpass = int(range_fields[0])
    highpass = int(range_fields[1])
    mask = np.asarray(
        [1 if lowpass <= i <= highpass else 0 for i in range(coeff_num)])

    # Fraction of a frame by which consecutive frames advance.
    hop_fraction = 1 - args.overlap_fraction

    # Setup modulation weights
    gamma_weight = args.gamma_weight.strip().split(',')
    use_gamma = not gamma_weight[0] == "None"
    if use_gamma:
        print('%s: Adding gamma filter on modulation frequencies...' % sys.argv[0])
        x = np.linspace(0, order - 1, order)
        scale = float(gamma_weight[0])
        shape = float(gamma_weight[1])
        pk_required = float(gamma_weight[2])
        res = 2 * fduration
        pk_required = pk_required * res
        pk = (shape - 1) * scale
        loc = -pk + pk_required
        mod_wts = stats.gamma.pdf(x, a=shape, loc=loc, scale=scale) * 3 * scale

    # Quantities that do not depend on the utterance, frame or band.
    lfr = 1 / (hop_fraction * fduration)
    fft_len = 2 * int(fduration * frate)
    win_len = int(np.round(fduration * frate))
    half_len = int(np.round(fduration * frate / 2))
    hop_len = int(np.round(fduration * frate * hop_fraction))
    synth_win = np.hanning(win_len)
    analysis_win = window(win_len)

    with open(wavs, 'r') as fid:

        all_feats = {}
        if args.write_utt2num_frames:
            all_lens = {}

        for line in fid:
            tokens = line.strip().split()
            uttid, inwav = tokens[0], ' '.join(tokens[1:])

            if scp_type == 'wav':
                # A trailing '|' marks a command whose stdout is the wav.
                is_pipe = inwav[-1] == '|'
                try:
                    if is_pipe:
                        proc = subprocess.run(inwav[:-1], shell=True, stdout=subprocess.PIPE)
                        sr, signal = read(io.BytesIO(proc.stdout))
                    else:
                        sr, signal = read(inwav)
                except Exception as err:
                    print('%s: Could not read %s (%s), skipping' % (sys.argv[0], uttid, err),
                          file=sys.stderr)
                    continue

                # Only meaningful once the read has succeeded.
                assert sr == srate, 'Input file has different sampling rate.'
            elif scp_type == 'segment':
                # NOTE(review): the sampling rate is not checked for
                # segment inputs, as in the original code.
                try:
                    cmd = 'wav-copy ' + inwav + ' - '
                    proc = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE)
                    sr, signal = read(io.BytesIO(proc.stdout))
                except Exception as err:
                    print('%s: Could not read %s (%s), skipping' % (sys.argv[0], uttid, err),
                          file=sys.stderr)
                    continue
            else:
                raise ValueError('Invalid type of scp type, it should be either wav or segment')

            if add_noise:
                if not add_noise == "clean":
                    if add_noise == "diff":
                        a = [1, 2, 3, 2, 0, -2, -5, -2, 0, 2, 3, 2, 1]
                        signal = convolve(signal, a, mode='same')
                    else:
                        signal = add_noise_to_wav(signal, noise, float(noise_info[1]))

            if add_reverb:
                if not add_reverb == 'clean':
                    signal = addReverb(signal, rir)

            tframes = signal.shape[0]  # Number of samples in the signal

            time_frames = np.array([frame for frame in
                                    getFrames(signal, srate, lfr, fduration, window)])

            cos_trans = freqAnalysis.dct(time_frames) / np.sqrt(2 * int(srate * fduration))

            [frame_num, ndct] = np.shape(cos_trans)

            # One row per band, one column per output frame.
            feats = np.zeros((nfilters, int(np.ceil(tframes * frate / srate))))
            ptr = int(0)  # Column where the current frame is added

            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()

            for i in range(0, frame_num):
                for j in range(nfilters):
                    filt = fbank[j, 0:-1]
                    band_dct = filt * cos_trans[i, :]
                    xlpc, gg = computeLpcFast(band_dct, order)  # Compute LPC coefficients
                    ms = computeModSpecFromLpc(gg, xlpc, coeff_num)
                    ms = ms * mask
                    if args.lifter_config:
                        ms = ms * lifter_config
                    if use_gamma:
                        ms = ms * mod_wts
                    if args.odd_mod_zero:
                        ms[1::2] = 0
                    ms = fft(ms, fft_len)
                    ms = np.abs(np.exp(ms))
                    # Swap the analysis window for a Hanning window.
                    ms = ms[0:win_len] * synth_win / analysis_win

                    if i == 0:
                        # Only the second half of the first frame is used.
                        if feats.shape[1] < half_len:
                            feats[j, :] += ms[half_len:half_len + feats.shape[1]]
                        else:
                            feats[j, ptr:ptr + half_len] += ms[half_len:]
                    elif i == frame_num - 1 or i == frame_num - 2:
                        # Truncate a frame that would run past the buffer.
                        if ms.shape[0] >= feats.shape[1] - ptr:
                            feats[j, ptr:] += ms[:feats.shape[1] - ptr]
                        else:
                            feats[j, ptr:ptr + win_len] += ms
                    else:
                        feats[j, ptr:ptr + win_len] += ms

                if i == 0:
                    ptr = int(ptr + hop_len - half_len)
                else:
                    # randrange(2) adds 0 or 1 to the hop at random.
                    ptr = int(ptr + hop_len + randrange(2))

            all_feats[uttid] = np.log(np.clip(feats.T, a_max=None, a_min=0.00000000000001))
            if args.write_utt2num_frames:
                all_lens[uttid] = feats.shape[1]

        dict2Ark(all_feats, outfile, kaldi_cmd)
        if args.write_utt2num_frames:
            with open(outfile + '.len', 'w+') as len_file:
                for key, lens in all_lens.items():
                    len_file.write("{:s} {:d}".format(key, lens))
                    len_file.write("\n")
def getFeats(args, srate=16000, window=np.hanning):
    """Compute modulation-spectrum features for every entry of an scp file.

    Each line of ``args.scp`` holds an utterance id followed by a wav
    location: a path, or a command ending in ``|`` whose stdout is the wav.
    Every signal is scaled by 2**-15, optionally reverberated, framed with
    ``getFrames``, transformed with ``freqAnalysis.dct``, filtered band by
    band, fitted with ``computeLpcFast`` and converted with
    ``computeModSpecFromLpc``. With ``args.set_unity_gain`` the LPC gain is
    replaced by 1 and the first modulation value of every band is dropped.
    All utterances are written with ``dict2Ark``.

    Args:
        args: Parsed options. Reads ``scp``, ``outfile``, ``add_reverb``,
            ``set_unity_gain``, ``nmodulations``, ``order``, ``fduration``,
            ``frate``, ``nfilters`` and ``kaldi_cmd``.
        srate: Sampling rate every wav must have.
        window: Window callable handed to ``getFrames``.

    Raises:
        ValueError: For an unknown reverberation type.
        AssertionError: If a wav has a sampling rate different from
            ``srate``.
    """
    scp_path = args.scp
    out_path = args.outfile
    reverb_kind = args.add_reverb
    unity_gain = args.set_unity_gain
    n_mod = args.nmodulations
    lpc_order = args.order
    frame_dur = args.fduration
    frame_rate = args.frate
    n_bands = args.nfilters
    ark_cmd = args.kaldi_cmd

    filter_bank = createFbank(n_bands, int(2 * frame_dur * srate), srate)

    if reverb_kind:
        if reverb_kind == 'small_room':
            rir_path = './RIR/RIR_SmallRoom1_near_AnglA.wav'
        elif reverb_kind == 'large_room':
            rir_path = './RIR/RIR_LargeRoom1_far_AnglA.wav'
        elif reverb_kind == 'clean':
            rir_path = None
            print('%s: No reverberation added!' % sys.argv[0])
        else:
            raise ValueError('Invalid type of reverberation!')
        if rir_path is not None:
            # Second channel of the impulse response, scaled by 2**-15.
            _, rir = read(rir_path)
            rir = rir[:, 1] / np.power(2, 15)

    # Values kept per band: the first one is dropped with unity gain.
    band_dim = n_mod - 1 if unity_gain else n_mod
    row_len = n_bands * band_dim

    all_feats = {}
    with open(scp_path, 'r') as scp:
        for entry in scp:
            fields = entry.strip().split()
            uttid = fields[0]
            inwav = ' '.join(fields[1:])

            if inwav[-1] == '|':
                # Run the command and parse its stdout as a wav file.
                piped = subprocess.run(inwav[:-1],
                                       shell=True,
                                       stdout=subprocess.PIPE)
                sr, signal = read(io.BytesIO(piped.stdout))
            else:
                sr, signal = read(inwav)
            assert sr == srate, 'Input file has different sampling rate.'

            # I want to work with numbers from 0 to 1 so....
            signal = signal / np.power(2, 15)

            if reverb_kind and reverb_kind != 'clean':
                signal = addReverb(signal, rir)

            time_frames = np.array(
                list(getFrames(signal, srate, frame_rate, frame_dur, window)))

            norm = np.sqrt(2 * int(srate * frame_dur))
            cos_trans = freqAnalysis.dct(time_frames) / norm

            frame_num, _ = np.shape(cos_trans)
            feats = np.zeros((frame_num, row_len))

            print('%s: Computing Features for file: %s' % (sys.argv[0], uttid))
            sys.stdout.flush()
            for i in range(frame_num):
                each_feat = np.zeros([n_bands, band_dim])
                for j in range(n_bands):
                    band_dct = filter_bank[j, 0:-1] * cos_trans[i, :]
                    # Compute LPC coefficients
                    xlpc, gg = computeLpcFast(band_dct, lpc_order)
                    gain = 1 if unity_gain else gg
                    mod_spec = computeModSpecFromLpc(gain, xlpc, n_mod)
                    if unity_gain:
                        mod_spec = mod_spec[1:]
                    each_feat[j, :] = mod_spec
                feats[i, :] = np.reshape(each_feat, (1, row_len))

            all_feats[uttid] = feats

        dict2Ark(all_feats, out_path, ark_cmd)