import os

import numpy as np
import scipy.io.wavfile as wav

from calcmfcc import calcMFCC_delta_delta

labels = []

# rootdir was not defined in the original snippet; it should point at the
# directory tree holding the paired .wav/.label files
rootdir = '/home/pony/github/ASR_libri/libri/cha-level/'
label_dir = '/home/pony/github/ASR_libri/libri/cha-level/mfcc_and_label/label/'
mfcc_dir = '/home/pony/github/ASR_libri/libri/cha-level/mfcc_and_label/mfcc/'

for subdir, dirs, files in os.walk(rootdir):
    for f in files:
        fullFilename = os.path.join(subdir, f)
        filenameNoSuffix = os.path.splitext(fullFilename)[0]
        if f.endswith('.wav'):
            print(fullFilename)
            (rate, sig) = wav.read(fullFilename)
            mfcc = calcMFCC_delta_delta(sig,
                                        rate,
                                        win_length=0.020,
                                        win_step=0.020)
            # transpose mfcc to an array of shape (39, time_length)
            mfcc = np.transpose(mfcc)
            print(mfcc.shape)
            # save the MFCC features to a .npy file named after the utterance
            m_f = mfcc_dir + filenameNoSuffix.split('/')[-1] + '.npy'
            np.save(m_f, mfcc)

            labelFilename = filenameNoSuffix + '.label'
            # use a distinct name for the handle so it does not shadow the
            # loop variable f
            with open(labelFilename, 'r') as lf:
                characters = lf.readline().strip()
            print(characters)
            targets = []
            for c in characters:
                if c == ' ':
                    # assumed completion (the snippet is truncated here):
                    # the usual char-level mapping sends space to 0 and
                    # 'a'..'z' to 1..26
                    targets.append(0)
                else:
                    targets.append(ord(c) - 96)


def wav_to_mfcc(wav_file_path):
    # read a wav file and return its MFCC + delta + delta-delta features
    # as an array of shape (39, time_length)
    rate, sig = wav.read(wav_file_path)
    mfcc = calcMFCC_delta_delta(sig, rate, win_length=0.020, win_step=0.010)
    mfcc = np.transpose(mfcc)
    return mfcc
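For reference, a minimal usage sketch of wav_to_mfcc; the file name
sample.wav is hypothetical:

feat = wav_to_mfcc('sample.wav')  # 'sample.wav' is a made-up example path
print(feat.shape)  # (39, time_length): 13 MFCCs + 13 deltas + 13 delta-deltas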
Example #3
import pickle
import time

import numpy as np
import torch
import torch.nn as nn

from rnn_model import RNN_model  # project-local model class; module name assumed


def classify_one_cycle(model, feat, label2person):
    # reconstructed header: the scraped snippet starts mid-function, so the
    # lines down to model(feat) are inferred from the call sites below
    start_time = time.time()
    model.eval()
    feat = torch.from_numpy(feat).float().unsqueeze(0)  # add a batch dimension
    with torch.no_grad():
        output_logits = model(feat)
    output_probs = nn.functional.softmax(output_logits, dim=1)
    output_label = int(np.argmax(output_probs.squeeze().cpu().numpy()))
    output_probs = output_probs.squeeze().tolist()
    print("output_logits:{}".format(output_logits[0][output_label]))
    end_time = time.time()
    print("Model predicting time {}".format(end_time - start_time))
    #print(output_logits)
    #print(output_probs)
    #print(output_label)
    output_person = label2person[output_label]

    return output_probs, output_label, output_person


if __name__=="__main__":
    import scipy.io.wavfile as wav
    from calcmfcc import calcMFCC_delta_delta

    path = "./storage/Qiaomu.wav"
    (rate, sig) = wav.read(path)
    mfcc_feat = calcMFCC_delta_delta(sig, rate)

    model = RNN_model(14)
    model.load_state_dict(torch.load("models/best_model.pt"))
    with open("models/label_person_mapping.pickle", "rb") as file:
        label_person_mapping = pickle.load(file)
    label2person = label_person_mapping["label2person"]
    print(label2person)
    output_prob, output_label, output_person = classify_one_cycle(model, mfcc_feat, label2person)
    print("output_person:{}".format(output_person))
Example #4
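# The handler below comes from a web.py application: POST sits inside a URL
# handler class (not shown in the snippet), and dir_ is a base-directory
# string defined elsewhere in the original file.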
    def POST(self):
        web.header('Access-Control-Allow-Origin', '*')
        web.header('Access-Control-Allow-Credentials', 'true')
        web.header('Access-Control-Allow-Methods', 'POST, GET, OPTIONS')
        data = web.input()

        if data.type == 'audio/mpeg':
            with open(dir_ + 'storage/' + str(data.url)[-6:] + '.mp3', 'wb') as wf:
                wf.write(data.file)

        elif data.type == 'audio/wav':
            # read the uploaded sample and make a working copy
            sample_name = data.filename
            with open(dir_ + 'storage/' + sample_name, 'rb') as sf:
                wav_bytes = sf.read()

            file_name = 'curr.wav'
            with open(dir_ + 'storage/' + file_name, 'wb') as wf:
                wf.write(wav_bytes)
            print("audio received ...")

            # make the audio single-channel [uncomment if passing dual-channel audio]
            #silence_classf.make_one_channel(dir_+'storage',file_name)

            # voice separation: needed because ambient noise keeps the silence
            # detection from working properly
            #new_fn_bg,new_fn_fg=silence_classf.vocal_separation(dir_+'storage',file_name)
            #sil_file_name=new_fn_fg

            # compute MFCC features and run the speaker classifier
            (rate, sig) = wav.read(dir_ + 'storage/' + file_name)
            start_time = time.time()
            mfcc_feat = calcMFCC_delta_delta(sig, rate)
            mid_time = time.time()
            print("Signal processing time:{} s".format(mid_time - start_time))

            model = RNN_model(15)
            if torch.cuda.is_available():
                device = torch.device('cuda:0')
            else:
                device = torch.device('cpu')
            model.load_state_dict(
                torch.load("models/best_model.pt", map_location=device))
            with open("models/label_person_mapping_new.pickle", "rb") as file:
                label_person_mapping = pickle.load(file)
            label2person = label_person_mapping["label2person"]
            output_prob, output_label, output_person = classify_one_cycle(
                model, mfcc_feat, label2person)
            end_time = time.time()
            target_prob = output_prob[output_label]
            if "_" in output_person:
                output_person = output_person.split(
                    "_")[0] + " " + output_person.split("_")[1]
            print("output_person:{}".format(output_person))
            print("output_probabilities:{}".format(target_prob))
            print("Model predicting time:{} s".format(end_time - mid_time))

            return json.dumps({
                'msg': "Your file is uploaded!",
                'output_person': output_person,
                "output_probability": round(target_prob, 4),
                "total_time": round(end_time - start_time, 3),
            })
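A minimal client-side sketch for exercising this handler with the requests
library; the host and route are assumptions, since the web.py URL mapping is
not part of the snippet, and the form-field names (type, filename, file)
mirror the ones the handler reads from web.input():

import requests

with open("Qiaomu.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8080/upload",  # hypothetical host and route
        data={
            "type": "audio/wav",
            "filename": "Qiaomu.wav",
            "file": f.read(),  # raw bytes, matching wf.write(data.file)
        },
    )
print(resp.json())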