def get_mfcc_feat(filename, winstep=0.01, nfilt=40, numcep=13, preemph=0.95, appendEnergy=True, begin_index=-1,
                  end_index=-1):
    """
    Compute MFCC features from an audio signal.

    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstral coefficients to return, default 13
    :param nfilt: the number of filters in the filterbank, default 40
    :param preemph: apply a preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.95.
    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :param begin_index: index of the frame where the recording begins (-1 to use the whole signal)
    :param end_index: index of the frame where the recording ends
    :return mfcc_feat: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
    :return fbank_feat: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    :return normalization_feature: the 39-dimensional feature after appending finite differences and normalizing
    """
    (rate, signal) = wav.read(filename)
    if begin_index != -1:
        # convert frame indices to sample indices, assuming 44.1 kHz audio and 20 ms frames
        signal = signal[int(begin_index * 44100 * 0.02):int(end_index * 44100 * 0.02)]
    # print(signal)
    # print(type(signal[0]))
    if rate == 44100:
        winlen = 0.02
        nfft = 1024
    elif rate == 16000:
        winlen = 0.025
        nfft = 512
    else:
        raise ValueError("unsupported sample rate: %d Hz" % rate)
    mfcc_feat = mfcc.mfcc(signal=signal, samplerate=rate, winlen=winlen, winstep=winstep, nfft=nfft,
                          nfilt=nfilt, numcep=numcep, preemph=preemph, appendEnergy=appendEnergy)
    numpy.savetxt("result.txt", mfcc_feat, delimiter=",")
    fbank_feat = mfcc.logfbank(signal=signal, samplerate=rate, winlen=winlen, winstep=winstep, nfft=nfft,
                               nfilt=nfilt, preemph=preemph)
    # numpy.savetxt("result2.txt", fbank_feat, delimiter=",")
    normalization_feature = mfcc.normalization(mfcc_feat)
    return mfcc_feat, fbank_feat, normalization_feature
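A minimal usage sketch for the function above, assuming a 16 kHz mono WAV; the path is a placeholder:

# Hypothetical driver for get_mfcc_feat; "sample_16k.wav" is a placeholder path.
mfcc_feat, fbank_feat, norm_feat = get_mfcc_feat("sample_16k.wav")
print(mfcc_feat.shape)   # (NUMFRAMES, 13) cepstral features
print(fbank_feat.shape)  # (NUMFRAMES, 40) log filterbank energies
print(norm_feat.shape)   # per-frame 39-dimensional normalized features (per the docstring)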
Example #2
    def get_class_proba_sound(clf_gb, clf_rf, data, interval, numFrames):

        coeff = data.shape[0] // numFrames  # integer division so the result can be used in slice indices
        features_nb = 2587
        X = np.zeros((len(interval), features_nb))
        for i, inter in enumerate(interval):
            name, (beg, end) = inter
            space = end - beg - 9
            #print 'space:', space
            limit = 40
            data_interval = np.zeros(40 * coeff)
            if (space > limit):
                data_interval = data[(beg - 1) * coeff:(beg + limit - 1) *
                                     coeff]
            else:
                data_interval[:space * coeff] = data[(beg - 1) *
                                                     coeff:(end - 10) * coeff]
            ceps, mspec, spec = mf.mfcc(data_interval)
            #print 'ceps, mspec, spec', ceps.shape, mspec.shape, spec.shape
            X[i, :2587] = ceps.reshape(2587)
            #X[i, 2587:10547] = mspec.reshape(7960)
            #X[i, 10547:22835] = spec.reshape(12288)
        X = np.nan_to_num(X)
        X = X.clip(min=-100)

        return clf_gb.predict_proba(X), clf_rf.predict_proba(X)
 def __init__(self, audio):
     self.audio = audio
     self.window_length = 0.025
     self.window_step = 0.01
     self.num_cepstrals = 12
     self.nfft = 512
     self.mfcc = mfcc.mfcc()
    def record(self, button):
        text, ok = QtWidgets.QInputDialog().getText(self,
                                                    "Emotion Recognition",
                                                    "Enter Your Name:")

        if ok and text:
            self.speaker = text
            print "Speaker: " + self.speaker
            wFile = "Test/" + self.speaker + ".wav"
            start_record(self, wFile, 5, self.state)
            state = "File saved as " + self.speaker
            self.repaint()
            print 'File saved as ' + self.speaker + ".wav"
            self.add_files()
            fFile = "feat/Test/" + self.speaker + ".htk"
            try:
                # call MFCC feature extraction subroutine
                f, E, fs = mfcc(wFile, self.winlen, self.ovrlen, self.pre_coef,
                                self.nfilter, self.nftt)
                # VAD part
                if self.opts == 1:
                    f = vad_thr(f, E)
                    # Energy threshold based VAD [comment this  line if you would like to plugin the rVAD labels]
                elif self.opts == 0:
                    l = numpy.loadtxt('..corresponding vad label file')
                    # [Plugin the VAD label generated by rVAD matlab]

                    if (len(f) - len(l)) == 1:
                        # 1-[end-frame] correction [matlab/python]
                        l = numpy.append(l, l[-1:, ])
                    elif (len(f) - len(l)) == -1:
                        l = numpy.delete(l, -1)

                    if (len(l)
                            == len(f)) and (len(numpy.argwhere(l == 1)) != 0):
                        idx = numpy.where(l == 1)
                        f = f[idx]
                    else:
                        print "mismatch frames between: label and feature files or no voice-frame in VAD"
                        exit()
                    # Zero mean unit variance  normalize after VAD
                f = cmvn(f)

                # write the VAD+normalized features  in file
                if not os.path.exists(os.path.dirname(fFile)):
                    # create directory for the feature file
                    os.makedirs(os.path.dirname(fFile))

                writehtk(fFile, f, 0.01)

            except:
                print("Fail1..%s ---> %s\n" % (wFile, fFile))

            if button.text() == "Live Detection":
                self.test_emotion(name=self.speaker)
        else:
            self.state = "Not valid Name"
            self.repaint()

        return
def collect_mfcc(arg, dirname, fnames):
    for fname in fnames:
        if fname.endswith(".wav"):
            signal, FS = waveio.wave_from_file(os.path.join(dirname, fname))
            feature = list(mfcc.mfcc(signal, FS))
            feature_collection[fname] = []
            for point in feature:
                feature_collection[fname].append(lbg.cluster_point(point, mu))
Example #6
    def calc_mfcc(arg, dirname, fnames):

        for fname in fnames:
            if fname.endswith(".wav"):
                signal, FS = waveio.wave_from_file(os.path.join(
                    dirname, fname))
                feature = list(mfcc(signal, FS))
                print "File: %s generate %d number of MFCC" % (fname,
                                                               len(feature))
                feature_collection["feature"] += feature
def make_fake_mfcc(construct_record):
    """制作fake录音\n
    输入:fake录音的字符串描述\n
    输出:(label,fake_mfcc_sequence)元组\n
    e.g 输入123,输出(['1','2','3'],123的mfcc_sequence)"""
    label = []
    return_mfcc_sequence = []
    for digit in construct_record:
        label.append(digit)
        return_mfcc_sequence += mfcc.mfcc(
            readwave.read_wave("./fake/" + digit + ".wav"), "fm")
    return label, return_mfcc_sequence
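A short usage sketch based on the docstring above; it assumes the ./fake/<digit>.wav recordings exist:

# Hypothetical call: concatenate the MFCC sequences for digits 1, 2 and 3.
label, seq = make_fake_mfcc("123")
print(label)     # ['1', '2', '3']
print(len(seq))  # total frame count, assuming mfcc.mfcc returns one vector per frame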
Example #9
 def loop(self):
     self.working = True
     while self.working:
         frames = self.input_audio.read()
         if self.input_frames is not None:
             self.input_frames = numpy.append(self.input_frames, frames)
         mfcc_features = mfcc.mfcc(frames)
         self.state.step(self, mfcc_features)
         if self.output_queue is not None:
             output = self.network.capture_output(1)
             self.output_queue.put(output)
     self.input_audio.close()
Example #10
def compare(arg, dirname, fnames):
    for fname in fnames:
        if test in fname:
            # Get MFCC for each test sample (read the wave file once, reuse signal and sample rate)
            signal, fs = waveio.wave_from_file(os.path.join(dirname, fname))
            test_features = mfcc(signal=signal, fs=fs)
            # Record overhead and score for each test sample versus each template
            # (each sample gets a dict keyed by template name)
            overhead[fname]= {}
            score[fname] = {}
            for template in template_feature.keys():
                time_start = time()
                score[fname][template] = dtw(template_feature[template], test_features)
                overhead[fname][template] = time() - time_start
def training(nfiltbank):
    nSpeaker = 1
    nCentroid = 16
    codebooks_mfcc = np.empty((nSpeaker, nfiltbank, nCentroid))
    directory = 'train'
    fname = str()

    fname = '/shefali1.wav'
    (fs, s) = read(directory + fname)
    mel_coeff = mfcc(s, fs, nfiltbank)
    codebook = lbg(mel_coeff, nCentroid)

    print('Training complete')
    return (codebook)
def training(nfiltbank):
    nSpeaker = 8
    nCentroid = 16
    codebooks = np.empty((nSpeaker, nfiltbank, nCentroid))
    directory = 'train'
    fname = str()

    for i in range(nSpeaker):
        fname = '/s' + str(i + 1) + '.wav'
        (fs, s) = read(directory + fname)
        mel_coeff = mfcc(s, fs, nfiltbank)
        codebooks[i, :, :] = lbg(mel_coeff, nCentroid)

    print('Training complete')
    return (codebooks)
def predict(model_name, filename):

    samples, sample_rate = audio.decode_wav(filename)
    coefficients = mfcc(samples, sample_rate)
    coefficients = tf.reshape(tf.cast(coefficients, tf.float32),
                              [1, 98, 13, 1])
    model = tf.keras.models.load_model(model_name)
    prediction = model.predict(coefficients)

    # threshold on the highest class probability, not on the argmax index
    if np.max(prediction) >= 0.8:
        output = constant.train_commands[np.argmax(prediction)]
    else:
        output = None

    return output
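A hedged usage sketch for predict(); the model and WAV paths are placeholders, and constant.train_commands is assumed to list the trained keywords:

# Hypothetical call; returns a keyword string, or None when the best probability is below 0.8.
word = predict("commands_model.h5", "yes.wav")
print(word if word is not None else "no confident match")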
Example #14
def train(n):
    k = 16
    import wave
    code = []

    for i in range(n):
        wav_file=wave.open('{}'.format(i+1) + '.wav')
        raw_frames = wav_file.readframes(-1)
        num_frames = wav_file.getnframes()
        num_channels = wav_file.getnchannels()
        sample_rate = wav_file.getframerate()
        sample_width = wav_file.getsampwidth()
        temp_buffer = np.empty((num_frames, num_channels, 4), dtype=np.uint8)
        raw_bytes = np.frombuffer(raw_frames, dtype=np.uint8)
        temp_buffer[:, :, :sample_width] = raw_bytes.reshape(-1, num_channels, 
                                                    sample_width)
        temp_buffer[:, :, sample_width:] = \
            (temp_buffer[:, :, sample_width-1:sample_width] >> 7) * 255
        frames = temp_buffer.view('<i4').reshape(temp_buffer.shape[:-1])
        
        # sample_rate = wav_file.getframerate()
        sample_rate = 16000
        signal = frames[::3, 0]
        sample_rate, signal = cut.cut(sample_rate, signal)
#        sample_rate, signal = scipy.io.wavfile.read('{}'.format(i+1) + '.wav')
        v = mfcc(signal, sample_rate)

        code.append(vqlbg(v, k))
        print(i)


    if n == 3:
        d1 = disteu(code[0], code[1])
        d2 = disteu(code[1], code[2])
        d3 = disteu(code[2], code[0])

        dk = []
        dk.append(sum(d1.min(1)) / d1.shape[0])
        dk.append(sum(d2.min(1)) / d2.shape[0])
        dk.append(sum(d3.min(1)) / d3.shape[0])

        dmax = max(dk)
        dmin = min(dk)
    else:
        dmax = 0
        dmin = 0

    return code, dmax, dmin
Example #15
def compare_files(filenames, save_preprocessed_files):
    class Sim:
        def __init__(self, name1, name2):
            self.name1 = name1
            self.name2 = name2

    characteristics = {}
    for filename in filenames:
        print("processing {}".format(filename))
        signal = sound_file.load(filename)
        signal = pre_processing.process(signal)
        if save_preprocessed_files:
            sound_file.save(filename[:-4] + "_p.wav", signal)
        characteristics[filename] = mfcc.mfcc(signal)
    metrics = _metrics_of_characteristics(characteristics, filenames)
    _simple_metrics_comparison(metrics)
    _dbscan_metrics_comparison(metrics)
    return metrics
Example #16
def get_mpc(path,filename,ncep,dataset):
    signal, rate = get_audio_info(path)
    mfcc_feat,mpc_feat,spectrum  = mfcc(signal,fs=rate,nfft=512,nceps=ncep,nwin=512)
    feat = mpc_feat[:,:ncep]
    mean_feat = np.mean(feat, axis=0)
    covvar = np.cov(feat,rowvar=0)
    cov_feat = []
    #cov_feat = np.empty([0,0])
    for i in range(0,covvar.shape[0]):
        for j in range(0,i+1):
            cov_feat.append(covvar[i][j])
    cov_feat = np.array(cov_feat)
    feature_vector = np.append(mean_feat,cov_feat)
    data.append(feature_vector)
    if dataset == 'ismir':
        labels.append(re.split('-', filename, maxsplit=1)[0])
    elif dataset == 'gtzan':
        labels.append(re.split(r'\.', filename, maxsplit=1)[0])
    else:
        print('unrecognised dataset type')
        sys.exit()
Example #17
def test(isOnline, test_data):
    if isOnline:
        test_sequence = mfcc.mfcc(record.record(), "fast_mode")
        hmm(test_sequence, 3, "test0", True)
    else:
        for i in xrange(10):
            ### test sequences from speaker "junyo" (files prefixed junyo_)
            for index in test_data[0]:
                name = ".\\txtDictionary\\junyo_" + str(i) + "_" + str(
                    index) + ".txt"
                sequence = np.loadtxt(name)
                sequence = sequence[1:]
                hmm(sequence, i, name, False)

            ### test sequences from the second speaker (unprefixed files)
            for index in test_data[1]:
                name = ".\\txtDictionary\\" + str(i) + "_" + str(
                    index) + ".txt"
                sequence = np.loadtxt(name)
                sequence = sequence[1:]
                hmm(sequence, i, name, False)
Example #18
def test(name, code):
    import wave
    wav_file = wave.open(name + '.wav')
    raw_frames = wav_file.readframes(-1)
    num_frames = wav_file.getnframes()
    num_channels = wav_file.getnchannels()
    sample_rate = wav_file.getframerate()
    sample_width = wav_file.getsampwidth()
    temp_buffer = np.empty((num_frames, num_channels, 4), dtype=np.uint8)
    raw_bytes = np.frombuffer(raw_frames, dtype=np.uint8)
    temp_buffer[:, :, :sample_width] = raw_bytes.reshape(
        -1, num_channels, sample_width)
    temp_buffer[:, :, sample_width:] = \
        (temp_buffer[:, :, sample_width-1:sample_width] >> 7) * 255
    frames = temp_buffer.view('<i4').reshape(temp_buffer.shape[:-1])

    #sample_rate = wav_file.getframerate()/3
    sample_rate = 16000
    signal = frames[::3, 0]
    sample_rate, signal = cut.cut(sample_rate, signal)
    v = mfcc(signal, sample_rate)

    distmin = float('inf')
    dist = 0

    for i in range(len(code)):
        d = disteu(v, code[i])
        dist += sum(d.min(1)) / d.shape[0]
    dist = dist / len(code)
    # dist = sum(d.min(1)) / d.shape[0]
    #
    # if dist < distmin:
    #     distmin = dist

    print(dist)

    if dist < 4.0:
        return True
    else:
        return False
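A minimal end-to-end sketch combining train() from Example #14 with test() above; it assumes 1.wav..3.wav (enrollment) and probe.wav (verification) sit in the working directory:

# Hypothetical verification run: build codebooks from three recordings, then score a probe.
code, dmax, dmin = train(3)
accepted = test("probe", code)  # compares probe.wav against the trained codebooks
print("accepted" if accepted else "rejected")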
    def GetNumber(self, AudioIn, Fs):

        # compute the MFCC of the input signal
        ReceivedSignal = mfcc.mfcc(AudioIn, Fs)

        value = []  # start template matching against the stored templates
        value.append(dtw.dtw(self.mfcc0, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc1, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc2, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc3, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc4, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc5, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc6, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc7, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc8, ReceivedSignal))
        value.append(dtw.dtw(self.mfcc9, ReceivedSignal))

        value.append(dtw.dtw(self.mfccAuther, ReceivedSignal))

        #print(value)
        number = value.index(min(value))  # index of the closest (minimum-distance) template
        #print(number)
        return number
Example #20
def features(audio, rate, ncp=ncep):
    audio = clip(audio.astype(np.float64))

    feats = np.empty(ncp + nother)

    coefs = mfcc.mfcc(audio,
                      samplerate=rate,
                      numcep=ncp,
                      nfilt=2 * ncp,
                      nfft=4096,
                      winlen=4096 / rate,
                      winstep=4096 / 3 / rate)
    feats[0:ncp] = np.mean(coefs, axis=0)

    fft = np.abs(np.fft.rfft(audio))
    pwr = fft**2
    feats[ncp] = ms.gmean(pwr) / np.mean(pwr)

    freq = np.fft.rfftfreq(len(audio), 1 / rate)
    feats[ncp + 1] = np.sum(fft * freq) / np.sum(fft)

    feats[ncp + 2] = np.sqrt(np.mean(np.square(audio)))

    return feats
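features() packs ncp MFCC means followed by three scalar descriptors: spectral flatness (geometric over arithmetic mean of the power spectrum), spectral centroid, and RMS level. A standalone sketch of just those three descriptors, assuming only numpy and scipy; the noise signal is a placeholder input:

import numpy as np
from scipy.stats import mstats as ms

rate = 16000
audio = np.random.default_rng(0).standard_normal(rate)  # 1 s of white noise as a test signal

fft = np.abs(np.fft.rfft(audio))
pwr = fft ** 2
flatness = ms.gmean(pwr) / np.mean(pwr)       # spectral flatness of the power spectrum
freq = np.fft.rfftfreq(len(audio), 1 / rate)
centroid = np.sum(fft * freq) / np.sum(fft)   # spectral centroid in Hz
rms = np.sqrt(np.mean(np.square(audio)))      # RMS level
print(flatness, centroid, rms)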
                min_cost = node.cost
        for node in trellis:
            if node.cost > min_cost + PRUNING_THRESHOLD:
                node.active = False
                node.cost = sys.float_info.max
        # pruning done, now start next loop, taking in next mfcc in the sequence.
    # collect result
    pointer = trellis[-1].backPointer
    result = []
    while pointer != 0:
        result.insert(0, back_pointer_table[pointer].meaning)
        pointer = back_pointer_table[pointer].previous_index
    for i in range(len(result)):
        ob = result[i].find('\"')
        result[i] = result[i][ob+1]
    return result

if __name__ == '__main__':
    generated_trellis = trellis_generate("network_gen.txt")
    print('trellis construct done')
    # continuous_speech = mfcc.mfcc(record.record(), "fm")
    continuous_speech = mfcc.mfcc(readwave.read_wave("./fake/9.wav"),"fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/1.wav"),"fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/0.wav"),"fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/9.wav"),"fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/6.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/7.wav"), "fm")
    continuous_speech += mfcc.mfcc(readwave.read_wave("./fake/8.wav"), "fm")
    getresult = back_pointer_dtw(continuous_speech, generated_trellis)
    print(getresult)
    def SystemInit(self):

        # load the template signals
        self.sample_rate, self.signal1 = scipy.io.wavfile.read('1.wav')
        self.sample_rate, self.signal2 = scipy.io.wavfile.read('2.wav')
        self.sample_rate, self.signal3 = scipy.io.wavfile.read('3.wav')
        self.sample_rate, self.signal4 = scipy.io.wavfile.read('4.wav')
        self.sample_rate, self.signal5 = scipy.io.wavfile.read('5.wav')
        self.sample_rate, self.signal6 = scipy.io.wavfile.read('6.wav')
        self.sample_rate, self.signal7 = scipy.io.wavfile.read('7.wav')
        self.sample_rate, self.signal8 = scipy.io.wavfile.read('8.wav')
        self.sample_rate, self.signal9 = scipy.io.wavfile.read('9.wav')
        self.sample_rate, self.signal0 = scipy.io.wavfile.read('0.wav')

        self.sample_rate, self.signalAuther = scipy.io.wavfile.read(
            'Auther.wav')

        # compute the MFCCs of the template signals
        self.mfcc0 = mfcc.mfcc(self.signal0, self.sample_rate)
        self.mfcc1 = mfcc.mfcc(self.signal1, self.sample_rate)
        self.mfcc2 = mfcc.mfcc(self.signal2, self.sample_rate)
        self.mfcc3 = mfcc.mfcc(self.signal3, self.sample_rate)
        self.mfcc4 = mfcc.mfcc(self.signal4, self.sample_rate)
        self.mfcc5 = mfcc.mfcc(self.signal5, self.sample_rate)
        self.mfcc6 = mfcc.mfcc(self.signal6, self.sample_rate)
        self.mfcc7 = mfcc.mfcc(self.signal7, self.sample_rate)
        self.mfcc8 = mfcc.mfcc(self.signal8, self.sample_rate)
        self.mfcc9 = mfcc.mfcc(self.signal9, self.sample_rate)

        self.mfccAuther = mfcc.mfcc(self.signalAuther, self.sample_rate)
Example #24
clf = pickle.load(open('model_svm', 'rb'))
# load the instrument names corresponding to the labels
instruments = pickle.load(open('names', 'rb'))

# read the directory containing the test audio files
musicpath = input("Enter the path of the folder containing testing audios :")
print("Computing...")
filename = [name for name in os.listdir(musicpath) if re.match(r'.*wav', name)]
result = []
for i in range(len(filename)):
    fs, music = wavfile.read(os.path.join(musicpath, filename[i]))
    if music.dtype.type not in [np.int16, np.int32, np.float32]:
        raise TypeError(
            'only 16bit,32bit PCM and 32bit floating-point wavefiles are supported'
        )
        # only 16-bit/32-bit PCM and 32-bit float WAV files are supported; 8-bit and 24-bit are not

    # preprocessing
    frameTime = 0.02  # frame duration in seconds
    # f: the framed signal after preprocessing; frameLength: frame length in samples
    f, frameLength = preprocessing(music, fs, frameTime)

    # compute the MFCCs
    ceps = mfcc(f, fs, frameLength)
    result.append(clf.predict(ceps))  # per-frame predictions

for i in range(len(result)):
    result[i] = np.argmax(np.bincount(result[i]))  # majority vote: keep the most frequent per-frame label
    print('The instrument played in "' + filename[i] + '" is detected as: ' +
          instruments[result[i]])
Example #25
import readwave
import mfcc
import numpy as np

for digit in range(0, 10):
    for index in range(0, 5):
        file_name = "./junyo_iso_11_1/" + str(digit) + "_" + str(
            index) + ".wav"
        speech = mfcc.mfcc(readwave.read_wave(file_name), "fm")
        np.savetxt(str(digit) + "_" + str(index) + ".txt", speech)
Example #26
def main():
    sample_rate, signal11 = scipy.io.wavfile.read('1.wav')
    sample_rate, signal12 = scipy.io.wavfile.read('2.wav')
    sample_rate, signal13 = scipy.io.wavfile.read('3.wav')
    sample_rate, signal14 = scipy.io.wavfile.read('4.wav')
    sample_rate, signal15 = scipy.io.wavfile.read('5.wav')
    sample_rate, signal16 = scipy.io.wavfile.read('6.wav')
    sample_rate, signal17 = scipy.io.wavfile.read('7.wav')
    sample_rate, signal18 = scipy.io.wavfile.read('8.wav')
    sample_rate, signal19 = scipy.io.wavfile.read('9.wav')
    sample_rate, signal10 = scipy.io.wavfile.read('0.wav')

    print('ok1')

    mfcc10 = mfcc(signal10, sample_rate)
    mfcc11 = mfcc(signal11, sample_rate)
    mfcc12 = mfcc(signal12, sample_rate)
    mfcc13 = mfcc(signal13, sample_rate)
    mfcc14 = mfcc(signal14, sample_rate)
    mfcc15 = mfcc(signal15, sample_rate)
    mfcc16 = mfcc(signal16, sample_rate)
    mfcc17 = mfcc(signal17, sample_rate)
    mfcc18 = mfcc(signal18, sample_rate)
    mfcc19 = mfcc(signal19, sample_rate)

    print('ok2')
    value = []

    sample_rate, signalinput = scipy.io.wavfile.read('1.wav')
    ReceivedSignal = mfcc(signalinput, sample_rate)

    value.append(dtw(mfcc10, ReceivedSignal))
    value.append(dtw(mfcc11, ReceivedSignal))
    value.append(dtw(mfcc12, ReceivedSignal))
    value.append(dtw(mfcc13, ReceivedSignal))
    value.append(dtw(mfcc14, ReceivedSignal))
    value.append(dtw(mfcc15, ReceivedSignal))
    value.append(dtw(mfcc16, ReceivedSignal))
    value.append(dtw(mfcc17, ReceivedSignal))
    value.append(dtw(mfcc18, ReceivedSignal))
    value.append(dtw(mfcc19, ReceivedSignal))

    print(value)
    number = value.index(min(value))
    print(number)
Example #27
import matplotlib.pyplot as plt
import numpy as np
from mfcc import mfcc 
from read import read
from stft import stft

fname = "sineSweep.wav"
(srate, data) = read(fname, "mono")
N = 1024
X = stft(data, N)
X = np.abs(X)
X = X[:N // 2 + 1]
X = mfcc(X, 44100)
# magnitude to dB conversion
# X = 10 * np.log10(X)
plt.imshow(X[1:], interpolation='nearest', aspect='auto', origin='lower')
plt.show()
Example #28
    getwav = [line.replace('\n', '') for line in wavedata]  # strip trailing '\n' from each line
  print('Get wave data list: ', getwav)
  
  """ Read wave datas """
  wavdata_, fs_, time_shift = read_wav_data(getwav[0])
  

  ''' Compute features
  Used functions:
    MFCC, Delta MFCC, AveMFCC, AveDeltaMFCC
    MFCC: 	mfcc.mfcc(line)
    Delta MFCC:	mfcc.delta(line)
    Averages:	line.mean(axis=1)
  '''
  # MFCC
  wavdata, fs, time_song, ceps = mfcc.mfcc(wavdata_, fs_)
  
  # Delta MFCC
  delta_mfcc = np.empty((0, len(ceps)), int)
  for line in range(len(ceps.T)):
    delta_mfcc = np.append(delta_mfcc, [mfcc.delta(ceps.T[line])], axis=0)

  # MFCC Average
  ave_mfcc = []
  ave_mfcc.append(ceps.T.mean(axis=1))

  # Delta MFCC Average
  ave_DeltaMfcc = []
  ave_DeltaMfcc.append(delta_mfcc.mean(axis=1))

Example #29
    def get_mfcc(filename):

        samples, sample_rate = decode_wav(filename)
        return mfcc(samples, sample_rate)
ubmDir= 'GMM' + str(nmix)
wFile = "Test_dec.wav"
start_record(wFile,3)
fFile= "feat/Test/"+wFile+".htk"

with open(ubmDir + '/' + 'ubm') as f:
    print "lood ubm .. %s" %(f)
    ubm_mu, ubm_cov, ubm_w = pickle.load(f)

winlen, ovrlen, pre_coef, nfilter, nftt = 0.025, 0.02, 0.97, 26 , 512
opts=1

try:

    #call MFCC feature extraction subroutine
    f, E, fs=mfcc(wFile,winlen, ovrlen, pre_coef, nfilter, nftt)


    # VAD part
    if opts == 1:

        f=vad_thr(f,E)       #Energy threshold based VAD [comment this  line if you would like to plugin the rVAD labels]

    elif opts == 0:

        l = numpy.loadtxt('..corresponding vad label file')     # [Plug in the VAD label generated by rVAD matlab]

        if (len(f) - len(l)) ==1: #1-[end-frame] correction [matlab/python]
            l= numpy.append(l,l[-1:,])
        elif (len(f) -len(l)) == -1:
            l=numpy.delete(l,-1)
Example #31
            fft = numpy.fft.rfft(buf.data, n_fft, axis=-1)

            # Power spectrum and filtering
            mspec = numpy.log(numpy.dot(fft.real**2 + fft.imag**2, fbank.T))

            # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
            # The C0 term is removed as it is the constant term
            buf.data = scipy.fftpack.realtransforms.dct(mspec,
                                                        type=2,
                                                        norm='ortho',
                                                        axis=-1)[1:nceps + 1]

            buf.data = buf.data[:, numpy.newaxis]
            next.send(buf)

    except GeneratorExit:
        next.close()


if __name__ == '__main__':
    import numpy
    import ringBuffer
    from testPipeline import audio_reader, sink
    from mfcc import mfcc

    output = sink()
    buf_cep = ringBuffer.ring_buffer(output, 2, 0, 13)
    cep = mfcc(buf_cep)
    buf_sig = ringBuffer.ring_buffer(cep, 200, 80)
    inp = audio_reader(buf_sig, "taaa.wav")
    next(inp)
    def test_emotion(self, name):
        if name:
            value = name
        else:
            file = self.files.currentItem()
            print(file.text())
            value = file.text().split(".")[0]
        fFile = "feat/Test/" + value + ".htk"
        self.emotion = test(fFile, self.Tardest, self.ubm_mu, self.ubm_cov,
                            self.ubm_w)
        self.result = "Detected as: "
        self.repaint()
        start_play(self.emotion + "_dec.wav")
        self.state = "Say Yes or No"
        self.repaint()

        wFile = "Test_dec.wav"
        start_record(self, wFile, 3, self.state)
        fFile = "feat/Test/" + wFile + ".htk"

        try:

            # call MFCC feature extraction subroutine
            f, E, fs = mfcc(wFile, self.winlen, self.ovrlen, self.pre_coef,
                            self.nfilter, self.nftt)

            # VAD part
            if self.opts == 1:
                f = vad_thr(f, E)
                # Energy threshold based VAD [comment this  line if you would like to plugin the rVAD labels]
            elif self.opts == 0:
                l = numpy.loadtxt('..corresponding vad label file')
                # [Plugin the VAD label generated by rVAD matlab]

                if (len(f) - len(l)) == 1:
                    # 1-[end-frame] correction [matlab/python]
                    l = numpy.append(l, l[-1:, ])
                elif (len(f) - len(l)) == -1:
                    l = numpy.delete(l, -1)

                if (len(l) == len(f)) and (len(numpy.argwhere(l == 1)) != 0):
                    idx = numpy.where(l == 1)
                    f = f[idx]
                else:
                    print "mismatch frames between: label and feature files or no voice-frame in VAD"
                    exit()

            # zero-mean, unit-variance normalization after VAD
            f = cmvn(f)

            # write the VAD+normalized features  in file
            if not os.path.exists(os.path.dirname(fFile)):
                # create directory for the feature file
                os.makedirs(os.path.dirname(fFile))

        #print("%s --> %s\n" %(wFile,fFile))

            writehtk(fFile, f, 0.01)

        except:
            print("Fail ..%s ---> %s\n" % (wFile, fFile))

        decision = test_decision(fFile, 'DEC3_Tau10.0', self.ubm_mu,
                                 self.ubm_cov, self.ubm_w)

        if decision == "YES":
            start_play(self.emotion + ".wav")

        self.result = ""
        self.emotion = ""
        self.state = ""
        self.repaint()
        return
def feature_fetch(target, dirname, fnames):
    for fname in fnames:
        if target in fname:
            with open(os.path.join(dirname, fname)) as f:
                signal, FS = waveio.wave_from_file(f)
                samples.append(mfcc.mfcc(signal, FS))
Example #34
import os
import waveio
from mfcc import mfcc
from dtw import dtw
from time import time

voice_dir = "voiceMaterial"
templates = ["strawberry", "apple", "beef", "egg", "lemon", "mushroom", "noodle", "orange", "spaghetti", "spam", "watermelon"]
test = "mushroom"

# Get MFCC for each template, using the 1st sample of them
print "Get MFCC for each template, begin..."
template_feature = {}
for template in templates:
    template_feature[template] = mfcc(signal = waveio.wave_from_file(os.path.join(voice_dir, template+"1.wav"))[0], fs = waveio.wave_from_file(os.path.join(voice_dir, template+"1.wav"))[1])
print "Get MFCC for each template, finish."

overhead = {}
score = {}

def compare(arg, dirname, fnames):
    for fname in fnames:
        if test in fname:
            # Get MFCC for each test sample (read the wave file once, reuse signal and sample rate)
            signal, fs = waveio.wave_from_file(os.path.join(dirname, fname))
            test_features = mfcc(signal=signal, fs=fs)
            # Record overhead and score for each test sample versus each template
            # (each sample gets a dict keyed by template name)
            overhead[fname]= {}
            score[fname] = {}
            for template in template_feature.keys():
                time_start = time()
from scipy.io.wavfile import read
from vqlbg import EUDistance
from mfcc import mfcc
from train_id import training

nSpeaker = 1
nfiltbank = 20
(codebooks) = training(nfiltbank)
directory = 'test'
fname = str()
nCorrect_MFCC = 0


def minDistance(features, codebooks):
    speaker = 0
    distmin = np.inf
    for k in range(np.shape(codebooks)[0]):
        D = EUDistance(features, codebooks[k, :, :])
        dist = np.sum(np.min(D, axis=1)) / (np.shape(D)[0])
        if dist < distmin:
            distmin = dist
            speaker = k
    return speaker


fname = '/s1.wav'
(fs, s) = read(directory + fname)
mel_coefs = mfcc(s, fs, nfiltbank)
sp_mfcc = minDistance(mel_coefs, codebooks)
print('Test speaker matches speaker', sp_mfcc + 1)
Example #36
def get_mfccs(index, paths=csv['path']):
    import mfcc as mf
    mf.set_param_values()
    mfccs = np.asarray([mf.mfcc(i) for i in paths[index]], dtype=np.float32)
    return mfccs
import argparse
import sys
import traceback
import numpy as np
import librosa
import mfcc

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input",
                        default=None,
                        type=str,
                        help="Input path for mp3 file")
    parser.add_argument("--output",
                        default=None,
                        type=str,
                        help="Output path for numpy array")
    args = parser.parse_args()

    try:
        audio, sr = librosa.load(args.input,
                                 sr=16000,
                                 mono=True,
                                 dtype=np.float32)
    except Exception as ex:
        print("Load autio failed:", ex)
        traceback.print_exc(file=sys.stdout)
        sys.exit(-1)

    audio = (audio * 32768).astype(np.int16)
    output = mfcc.mfcc(sr, audio)

    np.save(args.output, output)
Example #38
def train_model_on_sound(wav_list): 
    gestures = {'vattene':0, 'vieniqui':1, 'perfetto':2, 'furbo':3, 'cheduepalle':4,
                'chevuoi':5, 'daccordo':6, 'seipazzo':7, 'combinato':8, 'freganiente':9, 
                'ok':10, 'cosatifarei':11, 'basta':12, 'prendere':13, 'noncenepiu':14,
                'fame':15, 'tantotempo':16, 'buonissimo':17, 'messidaccordo':18, 'sonostufo':19}
    dataX = []
    
    for wav in wav_list:
        path = re.sub(r'_audio\.wav$', '', wav)
        print('\n##############')
        print(path[-25:])
        sample = VideoMat(path, True)
        sk = Skelet(sample)
        rate, data = get_data(wav)
        labels = sample.labels
        coeff = data.shape[0] / sample.numFrames
        
        interval = get_interval(data, sample.numFrames) #comment to use true interval data
        interval = interval_analysis(interval, sk)
        interval = [['', (beg, end)] for name, (beg, end) in interval if end-beg>10]
        
        features_nb = 2588 #Change to add mspec and spec features
        for value in labels:
            if value != 0:
                name, (beg, end) = value
                for inter in interval:
                    name2, (beg2, end2) = inter
                    if beg2>beg-5 and end2<end+5:
                        space = end2 - beg2 - 9
                        limit = 40
                        data_interval = np.zeros(40*coeff)
                        if(space > limit):
                            data_interval = data[(beg2-1)*coeff:(beg2+limit-1)*coeff]
                        else:
                            data_interval[:space*coeff] = data[(beg2-1)*coeff:(end2-10)*coeff]
                        ceps, mspec, spec = mf.mfcc(data_interval)
                        #print ceps.shape, mspec.shape, spec.shape
                        data_tmp = np.zeros(features_nb)
                        data_tmp[0] = gestures[name]
                        data_tmp[1:2588] = ceps.reshape(2587)
                        #data_tmp[2588:10548] = mspec.reshape(7960)
                        #data_tmp[1273:13561] = spec.reshape(12288)
                        data_tmp = np.nan_to_num(data_tmp)
                        dataX.append(copy.copy(data_tmp))
                        break
    print(len(dataX))
    data = np.asarray(dataX)
    Y = data[:, 0]
    X = data[:, 1:2588]
    X = X.clip(min=-100)
    clf = GradientBoostingClassifier(n_estimators=200, verbose=2, max_depth=7, min_samples_leaf=10, min_samples_split=20, random_state=0)
    clf = clf.fit(X, Y) 
    pickle.dump(clf, open('gradient_boosting_model_sound.pkl','wb'))
    
    clf = RandomForestClassifier(n_estimators=300, criterion='entropy', min_samples_split=10, min_samples_leaf=1, verbose=2, random_state=1) #n_jobs=2
    clf = clf.fit(X, Y) 
    pickle.dump(clf, open('random_forest_model_sound.pkl','wb'))
    
    clf = ExtraTreesClassifier(n_estimators=300, min_samples_split=10, min_samples_leaf=1, verbose=2, random_state=1) #n_jobs=2
    clf = clf.fit(X, Y) 
    pickle.dump(clf, open('extra_trees_model_sound.pkl','wb'))
Example #39
RATE=16000
RECORD_SECONDS = 2
CHUNKSIZE = 1024

# initialize portaudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16, channels=2, rate=RATE, input=True, frames_per_buffer=CHUNKSIZE)

frames = []

print "Record"

for _ in range(0, int(RATE / CHUNKSIZE * RECORD_SECONDS)):
    data = stream.read(CHUNKSIZE)
    frames.append(np.frombuffer(data, dtype=np.int16))

print "EIEI"

numpydata = np.hstack(frames)
signal = numpydata.astype(np.int64)

mfcc_feat = mfcc.mfcc(signal, samplerate=RATE, winlen=0.03, winstep=0.01, numcep=12, nfilt=15, lowfreq=300, highfreq=3700)

print(mfcc_feat.shape)

# close stream
stream.stop_stream()
stream.close()
p.terminate()
Example #40
    #~~~~~~~~~~~~~
    # Real world demo
    #~~~~~~~~~~~~~
    from time import time

    M_set = [64, 128, 256]
    for M in M_set:
        print "Generating VQ with M == %d" % M
        timer = time()
        # Train VQ
        mu, clusters = vq_generator(
            dirname="/home/magodo/code/voiceMaterial/word",
            M=M,
            return_clusters=True)
        # Store the trained VQ(tuple of mu(list of 256 different vectors) and clusters(dict of 256 different vector sets))
        import pickle
        with open("VQ.pkl" + "-%d" % M, "wb") as f:
            pickle.dump((mu, clusters), f)

        train_time = (time() - timer) / 60.0
        print "Used %f minutes" % train_time

    # Perform the trained VQ to a demo.wav
    signal, FS = waveio.wave_from_file("demo.wav")
    feature = list(mfcc(signal, FS))
    vq_array = []
    for i in feature:
        vq_array.append(lbg.cluster_point(i, mu))
    print(vq_array)