Python fast_icRM Examples, utils.fast_icRM Python Examples

Example #1

0

Show file

File: AO_predict_video.py Project: alexmak916/FYP2

def run_predict(part_one=PART_ONE_RANGE, part_two=PART_TWO_RANGE):
    """Predict per-speaker audio for a series of pre-computed mixes.

    Loads the pretrained model from ``MODEL_PATH`` once, then walks the two
    index ranges in lockstep. For every pair ``(i, j)`` it loads
    ``mix-<i>-<j>.npy`` from ``dir_path_mix``, trims or zero-pads it to 298
    frames, predicts the masks and writes one wav file per speaker into
    ``dir_path_pred``. Mix files that do not exist are reported and skipped.

    Args:
        part_one: ``(start, stop)`` pair; ``range(start, stop)`` supplies the
            first index of each mix file name.
        part_two: ``(start, stop)`` pair; ``range(start, stop)`` supplies the
            second index. The two ranges are zipped, so iteration stops at
            the shorter one.
    """
    # Load pretrained model
    loss_func = audio_loss(gamma=gamma_loss,
                           beta=beta_loss,
                           num_speaker=num_people)
    AO_model = load_model(MODEL_PATH,
                          custom_objects={
                              "tf": tf,
                              'loss_func': loss_func
                          })

    # Load audio data
    loaded_file = 0
    for i, j in zip(range(part_one[0], part_one[1]),
                    range(part_two[0], part_two[1])):
        # Guard only the load: a FileNotFoundError raised later (for example
        # by wavfile.write when the output directory is missing) must not be
        # reported as a missing mix file and silently swallowed.
        try:
            audio_data = np.load(dir_path_mix + "mix-%05d-%05d.npy" % (i, j))
        except FileNotFoundError:
            print('mix-%05d-%05d.npy is not found' % (i, j))
            continue

        loaded_file += 1
        print(audio_data.shape)

        # Check shape - first dim should be 298: trim longer inputs and
        # zero-pad shorter ones.
        audio_data = audio_data[:298]
        if len(audio_data) < 298:
            a = np.zeros((298, 257, 2))
            a[:len(audio_data), :, :] = audio_data
            audio_data = a
        print(audio_data.shape)
        # Add a leading axis of size 1 before handing the mix to predict().
        mix_expand = np.expand_dims(audio_data, axis=0)
        print(mix_expand.shape)

        print("===== Completed processing audio =====")

        # Predict data
        cRMs = AO_model.predict(mix_expand)
        cRMs = cRMs[0]

        print("===== Completed predicting cRMs =====")

        # Save output as wav: one mask per speaker on the last axis.
        for k in range(num_people):
            cRM = cRMs[:, :, :, k]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(audio_data, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path_pred + '%05d-%05d_pred_output%d.wav' % (
                i, j, k)
            wavfile.write(filename, 16000, T)
            print("%05d-%05d_pred_output%d.wav created" % (i, j, k))

        print("===== Completed saving output ===== \n")

    print('num of processed audio : %d' % loaded_file)

Example #2

0

Show file

File: predict_video.py Project: ktam069/speech_separation_modified

def run_predict(video_name=VIDEO_NAME):
    """Run the audio-visual model on one preprocessed video.

    Reads ``preprocessed-<video_name>.npy``, trims or zero-pads it to 298
    frames, gathers one face-embedding file per speaker from
    ``dir_path_face_embs`` (speakers whose file cannot be loaded keep an
    all-zero embedding), predicts the masks with the model stored at
    ``MODEL_PATH`` and writes one wav file per speaker into
    ``dir_path_pred``.

    Args:
        video_name: name fragment used to build the input, embedding and
            output file names.
    """
    # ---- Audio input ----
    spectrogram = np.load('preprocessed-%s.npy' % video_name)
    print(spectrogram.shape)
    # TODO: check shape - first dim should be 298
    spectrogram = spectrogram[:298]
    frame_count = len(spectrogram)
    if frame_count < 298:
        # Too short: copy into a zero buffer of the full length.
        padded = np.zeros((298, 257, 2))
        padded[:frame_count, :, :] = spectrogram
        spectrogram = padded
    print(spectrogram.shape)
    batched_mix = np.expand_dims(spectrogram, axis=0)
    print(batched_mix.shape)

    # ---- Visual input ----
    embeddings = np.zeros((1, 75, 1, 1792, num_people))
    print(embeddings.shape)
    for speaker in range(num_people):
        # Currently does not use the correct face input for both speakers
        # (uses the same images for both right now).
        try:
            emb_file = dir_path_face_embs + "%s_face_emb_p%d.npy" % (
                video_name, speaker)
            embeddings[0, :, :, :, speaker] = np.load(emb_file)
        except Exception as e:
            # Best effort: leave this speaker's slot as zeros and carry on.
            print('No face embedding for speaker', speaker, "\n", e)
    # TODO: use Google Vision AI to find the face embedding for each speaker

    # ---- Pretrained model ----
    custom = {
        "tf": tf,
        'loss_func': audio_loss(gamma=gamma_loss,
                                beta=beta_loss,
                                num_speaker=num_people),
    }
    AV_model = load_model(MODEL_PATH, custom_objects=custom)

    # ---- Prediction ----
    masks = AV_model.predict([batched_mix, embeddings])[0]

    # ---- Write one wav per speaker ----
    for speaker in range(num_people):
        speaker_mask = masks[:, :, :, speaker]
        assert speaker_mask.shape == (298, 257, 2)
        separated = utils.fast_icRM(spectrogram, speaker_mask)
        waveform = utils.fast_istft(separated, power=False)
        out_path = dir_path_pred + 'pred_%s_output%d.wav' % (video_name,
                                                              speaker)
        wavfile.write(out_path, 16000, waveform)

Example #3

0

Show file

File: test.py Project: sjpberge/Looking-to-Listen

# Load the trained model; `tf` is handed to load_model through custom_objects.
av_model = load_model(model_path, custom_objects={'tf': tf})
if num_gpu > 1:
    # Multi-GPU path: predictions go through the ModelMGPU wrapper.
    parallel = ModelMGPU(av_model, num_gpu)
    for line in test_file:
        mix, single_idxs, face_emb = get_data_name(line, people, database,
                                                   face_emb)
        # Add a leading axis of size 1 before handing the mix to predict().
        mix_ex = np.expand_dims(mix, axis=0)
        cRMs = parallel.predict([mix_ex, face_emb])
        cRMs = cRMs[0]
        prefix = ''
        for idx in single_idxs:
            prefix += idx + '-'
        # The per-speaker masks sit on the LAST axis (see the indexing and the
        # shape assert below), so iterate over that axis. len(cRMs) would be
        # the first axis and overrun both the mask axis and single_idxs.
        for i in range(cRMs.shape[-1]):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            # Fixed typo: the helper is fast_istft, not fase_istft.
            T = utils.fast_istft(F, power=False)
            filename = result_path + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)

if num_gpu <= 1:
    for line in test_file:
        mix, single_idxs, face_emb = get_data_name(line, people, database,
                                                   face_emb)
        mix_ex = np.expand_dims(mix, axis=0)
        cRMs = av_model.predict([mix_ex, face_emb])
        cRMs = cRMs[0]
        prefix = ''
        for idx in single_idxs:
            prefix += idx + '-'
        for i in range(len(cRMs)):