def _fit_to_model_frames(audio_data, num_frames=298):
    """Crop or zero-pad the time axis so the spectrogram has exactly
    `num_frames` frames.

    The model input is assumed to be (298, 257, 2) — TODO confirm against
    the training pipeline; 257 freq bins x (real, imag) is what the padding
    shape below encodes.
    """
    audio_data = audio_data[:num_frames]
    if len(audio_data) < num_frames:
        padded = np.zeros((num_frames, 257, 2))
        padded[:len(audio_data), :, :] = audio_data
        audio_data = padded
    return audio_data


def run_predict(part_one=PART_ONE_RANGE, part_two=PART_TWO_RANGE):
    """Run the pretrained audio-only model over a range of pre-mixed inputs.

    For each (i, j) pair zipped from the two ranges, loads
    `dir_path_mix + "mix-%05d-%05d.npy"`, normalizes it to 298 time frames,
    predicts complex ratio masks (cRMs) for `num_people` speakers, and writes
    one 16 kHz wav per speaker into `dir_path_pred`. Missing input files are
    reported and skipped.

    Args:
        part_one: (start, stop) range for the first mix index.
        part_two: (start, stop) range for the second mix index.
    """
    # Load the pretrained model once, before the loop.
    loss_func = audio_loss(gamma=gamma_loss, beta=beta_loss,
                           num_speaker=num_people)
    AO_model = load_model(MODEL_PATH,
                          custom_objects={"tf": tf, 'loss_func': loss_func})

    loaded_file = 0
    for i, j in zip(range(part_one[0], part_one[1]),
                    range(part_two[0], part_two[1])):
        # Keep the try minimal: only np.load can raise FileNotFoundError here;
        # the original wrapped the whole body, which obscured that intent.
        try:
            audio_data = np.load(dir_path_mix + "mix-%05d-%05d.npy" % (i, j))
        except FileNotFoundError:
            print('mix-%05d-%05d.npy is not found' % (i, j))
            continue
        loaded_file += 1
        print(audio_data.shape)

        audio_data = _fit_to_model_frames(audio_data)
        print(audio_data.shape)
        mix_expand = np.expand_dims(audio_data, axis=0)  # add batch dim
        print(mix_expand.shape)
        print("===== Completed processing audio =====")

        cRMs = AO_model.predict(mix_expand)
        cRMs = cRMs[0]  # drop the batch dimension
        print("===== Completed predicting cRMs =====")

        # Apply each speaker's mask to the mix and synthesize a wav.
        for k in range(num_people):
            cRM = cRMs[:, :, :, k]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(audio_data, cRM)
            T = utils.fast_istft(F, power=False)
            filename = dir_path_pred + '%05d-%05d_pred_output%d.wav' % (
                i, j, k)
            wavfile.write(filename, 16000, T)
            print("%05d-%05d_pred_output%d.wav created" % (i, j, k))
        print("===== Completed saving output ===== \n")

    print('num of processed audio : %d' % loaded_file)
def run_predict(video_name=VIDEO_NAME):
    """Separate speakers from one preprocessed video with the AV model.

    Loads the mixed-audio spectrogram `preprocessed-<video_name>.npy`,
    normalizes it to 298 time frames, loads per-speaker face embeddings from
    `dir_path_face_embs`, predicts complex ratio masks with the pretrained
    audio-visual model, and writes one 16 kHz wav per speaker into
    `dir_path_pred`.

    Args:
        video_name: basename used to locate the preprocessed audio and the
            face-embedding files.
    """
    # --- Load audio data ---
    audio_data = np.load('preprocessed-%s.npy' % video_name)
    print(audio_data.shape)
    # Model expects exactly 298 time frames: crop, then zero-pad if short.
    audio_data = audio_data[:298]
    if len(audio_data) < 298:
        padded = np.zeros((298, 257, 2))
        padded[:len(audio_data), :, :] = audio_data
        audio_data = padded
    print(audio_data.shape)
    mix_expand = np.expand_dims(audio_data, axis=0)  # add batch dim
    print(mix_expand.shape)

    # --- Load visual data (face embeddings, one slot per speaker) ---
    face_embs = np.zeros((1, 75, 1, 1792, num_people))
    print(face_embs.shape)
    for i in range(num_people):
        # Best-effort: a missing/unreadable embedding leaves zeros for that
        # speaker instead of aborting the whole prediction. Broad Exception
        # is deliberate here for that reason.
        try:
            face_embs[0, :, :, :, i] = np.load(
                dir_path_face_embs + "%s_face_emb_p%d.npy" % (video_name, i))
        except Exception as e:
            print('No face embedding for speaker', i, "\n", e)
    # TODO: use Google Vision AI to find the face embedding for each speaker.

    # --- Load pretrained model ---
    loss_func = audio_loss(gamma=gamma_loss, beta=beta_loss,
                           num_speaker=num_people)
    AV_model = load_model(MODEL_PATH,
                          custom_objects={"tf": tf, 'loss_func': loss_func})

    # --- Predict complex ratio masks ---
    cRMs = AV_model.predict([mix_expand, face_embs])
    cRMs = cRMs[0]  # drop the batch dimension

    # --- Apply each mask and save per-speaker output wavs ---
    for j in range(num_people):
        cRM = cRMs[:, :, :, j]
        assert cRM.shape == (298, 257, 2)
        F = utils.fast_icRM(audio_data, cRM)
        T = utils.fast_istft(F, power=False)
        filename = dir_path_pred + 'pred_%s_output%d.wav' % (video_name, j)
        wavfile.write(filename, 16000, T)
# NOTE(review): top-level script chunk (no enclosing def visible); the chunk
# is TRUNCATED at the end of the visible source — see the final loop below.
# Load the audio-visual model; wrap it for multi-GPU prediction if available.
av_model = load_model(model_path, custom_objects={'tf': tf})
if num_gpu > 1:
    parallel = ModelMGPU(av_model, num_gpu)
    for line in test_file:
        # face_emb is passed in and reassigned — presumably get_data_name
        # uses it as a buffer/lookup; confirm against its definition.
        mix, single_idxs, face_emb = get_data_name(line, people, database,
                                                   face_emb)
        mix_ex = np.expand_dims(mix, axis=0)  # add batch dim
        cRMs = parallel.predict([mix_ex, face_emb])
        cRMs = cRMs[0]  # drop the batch dimension
        # NOTE(review): `prefix` is built but never used below — looks like it
        # was meant to be part of the output filename; confirm intent.
        prefix = ''
        for idx in single_idxs:
            prefix += idx + '-'
        # NOTE(review): len(cRMs) is the size of the FIRST axis (298 frames),
        # but the loop indexes the LAST axis — elsewhere in this file the
        # speaker loop uses range(num_people). Likely a bug; confirm.
        for i in range(len(cRMs)):
            cRM = cRMs[:, :, :, i]
            assert cRM.shape == (298, 257, 2)
            F = utils.fast_icRM(mix, cRM)
            # NOTE(review): 'fase_istft' looks like a typo for 'fast_istft'
            # (cf. the other prediction functions in this file) — this would
            # raise AttributeError at runtime; confirm and fix.
            T = utils.fase_istft(F, power=False)
            filename = result_path + str(single_idxs[i]) + '.wav'
            wavfile.write(filename, 16000, T)
# Single-GPU / CPU path: same pipeline using the unwrapped model.
if num_gpu <= 1:
    for line in test_file:
        mix, single_idxs, face_emb = get_data_name(line, people, database,
                                                   face_emb)
        mix_ex = np.expand_dims(mix, axis=0)
        cRMs = av_model.predict([mix_ex, face_emb])
        cRMs = cRMs[0]
        prefix = ''
        for idx in single_idxs:
            prefix += idx + '-'
        for i in range(len(cRMs)):
            # NOTE(review): SOURCE is truncated here — the body of this loop
            # is missing from the visible chunk (it presumably mirrors the
            # num_gpu > 1 branch above); do not ship without restoring it.