def load_all_test_data(self):
    """Load the four stem wav files for every test track and cache them.

    For each track in ``self.test_track_list`` the four stems
    (bass / drums / other / vocals) are read from
    ``self.test_data_folder_path/<track>/<stem>.wav`` at sample rate
    ``self.fs`` and appended to the corresponding instance list.

    Returns:
        Tuple of the four per-track lists:
        (test_all_bass_list, test_all_drums_list,
         test_all_other_list, test_all_vocals_list).
    """
    # Data-driven loop replaces four copy-pasted read/append stanzas.
    stems = (
        ('bass.wav', self.test_all_bass_list),
        ('drums.wav', self.test_all_drums_list),
        ('other.wav', self.test_all_other_list),
        ('vocals.wav', self.test_all_vocals_list),
    )
    for track_name in self.test_track_list:
        track_folder_path = self.test_data_folder_path + '/' + track_name
        for file_name, dest_list in stems:
            # The sample rate returned by AudioModule.read is discarded;
            # presumably it always equals self.fs -- TODO confirm.
            data, _fs = AudioModule.read(
                track_folder_path + '/' + file_name, self.fs)
            dest_list.append(data)
    return (self.test_all_bass_list, self.test_all_drums_list,
            self.test_all_other_list, self.test_all_vocals_list)
def __call__(self):
    """Run separation inference on the whole test set and score it.

    Restores a trained mini-U-Net checkpoint, mixes the four test stems
    into mixture waveforms, estimates the vocal track for every mixture
    window-by-window, and evaluates the estimates with BSS-Eval
    (SDR / SIR / SAR), printing mean and median scores.

    Returns:
        (est_audio_list, test_target_list, test_mixed_list): estimated
        waveforms, reference vocal stems, and the input mixtures.
    """
    # Load all test stems (bass/drums/other/vocals per track).
    provider = DataProvider()
    test_bass_list, test_drums_list, test_other_list, test_vocals_list = provider.load_all_test_data()
    # Build the inference graph: placeholder is (Batch, Sample).
    tf_mix = tf.placeholder(tf.float32, (None, self.sample_len))
    tf_est_source = self.__model(tf_mix)
    # GPU config: pin to device 0 and grow memory on demand.
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            visible_device_list='0',  # specify GPU number
            allow_growth=True
        )
    )
    # NOTE(review): import_meta_graph imports the checkpointed graph on top
    # of the graph just built by self.__model -- presumably the variable
    # names line up so restore() populates the live model; verify against
    # the training script.
    saver = tf.train.import_meta_graph('./../results/model/mini-U-Net/mini-U-Net_1999.ckpt.meta')
    with tf.Session(config=config) as sess:
        saver.restore(sess, './../results/model/mini-U-Net/mini-U-Net_1999.ckpt')
        # Sum the four stems of each track into one mixture waveform.
        test_mixed_list = []
        for bass, drums, other, vocals in zip(test_bass_list, test_drums_list, test_other_list, test_vocals_list):
            test_mixed_list.append(AudioModule.mixing(
                bass,
                drums,
                other,
                vocals
            ))
        # The separation target of this evaluator is the vocal stem.
        test_target_list = test_vocals_list
        # Inference mode: disables dropout / batch-norm update behaviour.
        tf.keras.backend.set_learning_phase(0)
        est_start = time.time()
        for mix in test_mixed_list:
            # Split the mixture into fixed-length (zero-padded) windows and
            # run the model on each window separately.
            cutted_mix_array = provider.test_data_split_and_pad(mix, self.sample_len)
            tmp_est_data_array = np.zeros((len(cutted_mix_array), self.sample_len))
            for index, mix_packet in enumerate(cutted_mix_array):
                mix_packet = mix_packet.reshape(1, -1)  # add batch axis
                est_source = sess.run(tf_est_source,
                                      feed_dict={
                                          tf_mix: mix_packet[:, :]
                                      })
                tmp_est_data_array[index, :] = est_source
            # Concatenate per-window estimates back into one waveform row.
            self.est_audio_list.append(tmp_est_data_array.reshape(1, -1))
        est_end = time.time()
        print("excuted time", est_end - est_start)
        evaluate_start = time.time()
        for est, target, mix in zip(self.est_audio_list, test_target_list, test_mixed_list):
            target = target.reshape(1, -1)
            mix = mix.reshape(1, -1)
            # Two-source BSS-Eval setup: row 0 = vocals, row 1 = the
            # accompaniment obtained as mixture minus vocals. The estimate
            # is truncated to the reference length (it was zero-padded).
            est_array = np.zeros((2, target.shape[1]))
            est_array[0, :] = est[:, :target.shape[1]]
            est_array[1, :] = mix[:, :target.shape[1]] - est[:, :target.shape[1]]
            target_array = np.zeros((2, target.shape[1]))
            target_array[0, :] = target
            target_array[1, :] = mix[:, :target.shape[1]] - target
            # perm (the permutation chosen by BSS-Eval) is not used.
            sdr, sir, sar, perm = mir_eval.separation.bss_eval_sources(target_array,
                                                                       est_array)
            # Keep only the vocal-channel scores (index 0).
            self.sdr_list.append(sdr[0])
            self.sir_list.append(sir[0])
            self.sar_list.append(sar[0])
        print('sdr mean', np.mean(self.sdr_list))
        print('sir mean', np.mean(self.sir_list))
        print('sar mean', np.mean(self.sar_list))
        print('sdr median', np.median(self.sdr_list))
        print('sir median', np.median(self.sir_list))
        print('sar median', np.median(self.sar_list))
        evaluate_end = time.time()
        print('evaluate time', evaluate_end - evaluate_start)
    return self.est_audio_list, test_target_list, test_mixed_list
def __call__(self):
    """Train the U-Net separator with the bass stem as target.

    Loads every training track, splits stems into train/validation sets,
    builds the TF graph, then for each of ``self.epoch_num`` epochs:
    re-generates augmented data, mixes stems, runs mini-batch updates,
    logs validation loss, plots spectrograms/loss curves, and
    periodically checkpoints the network.
    """
    # Load all training stems (bass/drums/other/vocals per track).
    provider = DataProvider()
    bass_list, drums_list, other_list, vocals_list = provider.load_all_train_data()
    # Split each stem list into train / validation subsets.
    train_bass_list, valid_bass_list = provider.split_to_train_valid(bass_list)
    train_drums_list, valid_drums_list = provider.split_to_train_valid(drums_list)
    train_other_list, valid_other_list = provider.split_to_train_valid(other_list)
    train_vocals_list, valid_vocals_list = provider.split_to_train_valid(vocals_list)
    # Define the model graph.
    tf_lr = tf.placeholder(tf.float32)  # learning rate
    tf_mix = tf.placeholder(tf.float32, (None, self.sample_len))  # (Batch, Sample)
    tf_target = tf.placeholder(tf.float32, (None, self.sample_len))  # (Batch, Sample)
    tf_train_step, tf_loss, tf_target_spec, tf_mag_mix_spec, tf_ori_mix_spec, tf_est_masks, tf_est_spec = self.__model(tf_mix, tf_target, tf_lr)
    # GPU config: pin to device 0 and grow memory on demand.
    config = tf.ConfigProto(
        gpu_options=tf.GPUOptions(
            visible_device_list='0',  # specify GPU number
            allow_growth=True
        )
    )
    with tf.Session(config=config) as sess:
        init = tf.global_variables_initializer()
        sess.run(init)
        print("Start Training")
        net_saver = NetSaver(saver_folder_name='UNet_other_sources_bass', saver_file_name='u_net_bass')
        # NOTE(review): EarlyStopping is instantiated but never consulted
        # below, so training always runs the full self.epoch_num epochs.
        early_stopping = EarlyStopping()
        for epoch in range(self.epoch_num):
            sys.stdout.flush()
            print('epoch:' + str(epoch))
            start = time.time()
            # Fresh random augmentation every epoch. ("Argument" is
            # presumably a misspelling of "Augmentation" -- TODO confirm.)
            train_data_argument = DataArgument(self.fs, self.sec, self.train_data_num)
            train_arg_bass_array = train_data_argument(train_bass_list)
            train_arg_drums_array = train_data_argument(train_drums_list)
            train_arg_other_array = train_data_argument(train_other_list)
            train_arg_vocals_array = train_data_argument(train_vocals_list)
            valid_data_argument = DataArgument(self.fs, self.sec, self.valid_data_num)
            valid_arg_bass_array = valid_data_argument(valid_bass_list)
            valid_arg_drums_array = valid_data_argument(valid_drums_list)
            valid_arg_other_array = valid_data_argument(valid_other_list)
            valid_arg_vocals_array = valid_data_argument(valid_vocals_list)
            # Number of full mini-batches; remainder samples are dropped.
            self.train_iter = int(len(train_arg_bass_array) / self.batch_size)
            self.valid_iter = int(len(valid_arg_bass_array) / self.batch_size)
            # Mix the four stems into mixture arrays.
            train_mixed_array = AudioModule.mixing(
                train_arg_bass_array,
                train_arg_drums_array,
                train_arg_other_array,
                train_arg_vocals_array
            )
            # The separation target of this trainer is the bass stem.
            train_target_array = train_arg_bass_array
            valid_mixed_array = AudioModule.mixing(
                valid_arg_bass_array,
                valid_arg_drums_array,
                valid_arg_other_array,
                valid_arg_vocals_array
            )
            valid_target_array = valid_arg_bass_array
            # --- training ---
            tf.keras.backend.set_learning_phase(1)  # training mode
            for train_time in range(self.train_iter):
                sess.run(tf_train_step,
                         feed_dict={
                             tf_mix: train_mixed_array[train_time*self.batch_size:(train_time+1)*self.batch_size, :self.sample_len],
                             tf_target: train_target_array[train_time*self.batch_size:(train_time+1)*self.batch_size, :self.sample_len],
                             tf_lr: self.lr_init
                         })
            # --- validation ---
            tmp_valid_loss_list = []
            tf.keras.backend.set_learning_phase(0)  # inference mode
            for valid_time in range(self.valid_iter):
                # tf_lr is fed a dummy 0. here: only tf_loss is evaluated,
                # so no parameter update takes place.
                valid_loss = sess.run(tf_loss,
                                      feed_dict={
                                          tf_mix: valid_mixed_array[valid_time*self.batch_size:(valid_time+1)*self.batch_size, :self.sample_len],
                                          tf_target: valid_target_array[valid_time*self.batch_size:(valid_time+1)*self.batch_size, :self.sample_len],
                                          tf_lr: 0.
                                      })
                tmp_valid_loss_list.append(valid_loss)
            self.valid_loss_list.append(np.mean(tmp_valid_loss_list))
            # Visualise the first training example's spectrograms.
            vmin = -70  # dB floor for spectrogram plots
            vmax = 0    # dB ceiling for spectrogram plots
            target_spec, mag_mix_spec, ori_spec_mix, est_mask, est_spec = sess.run(
                [tf_target_spec, tf_mag_mix_spec, tf_ori_mix_spec, tf_est_masks, tf_est_spec],
                feed_dict={
                    tf_mix: train_mixed_array[0:1, :self.sample_len],
                    tf_target: train_target_array[0:1, :self.sample_len],
                    tf_lr: 0.
                })
            # Squeeze the trailing axis -- assumes the model emits a final
            # singleton (channel) dimension; TODO confirm.
            est_mask = np.squeeze(est_mask, axis=-1)
            target_spec = np.squeeze(target_spec, axis=-1)
            mag_mix_spec = np.squeeze(mag_mix_spec, axis=-1)
            est_spec = np.squeeze(est_spec, axis=-1)
            print("original spec mix")
            visualize_spec.plot_spec(ori_spec_mix[0], self.fs, self.sec, vmax, vmin)
            print("magnitude spec mix")
            visualize_spec.plot_log_spec(mag_mix_spec[0], self.fs, self.sec, 10, -10)
            print("target spec")
            visualize_spec.plot_spec(target_spec[0], self.fs, self.sec, vmax, vmin)
            print("est mask")
            visualize_spec.plot_log_spec(est_mask[0], self.fs, self.sec, 1, 0)
            print("est spec")
            visualize_spec.plot_spec(est_spec[0], self.fs, self.sec, vmax, vmin)
            visualize_loss.plot_loss(self.valid_loss_list)
            end = time.time()
            print(' excute time', end - start)
            # Checkpoint at epoch 0 and every 9th epoch thereafter.
            if epoch % 9 == 0:
                net_saver(sess, step=epoch)
est_array[ 1, :] = mix[:, :target.shape[1]] - est[:, :target.shape[1]] target_array = np.zeros((2, target.shape[1])) target_array[0, :] = target target_array[1, :] = mix[:, :target.shape[1]] - target sdr, sir, sar, perm = mir_eval.separation.bss_eval_sources( target_array, est_array) self.sdr_list.append(sdr[0]) self.sir_list.append(sir[0]) self.sar_list.append(sar[0]) print('sdr mean', np.mean(self.sdr_list)) print('sir mean', np.mean(self.sir_list)) print('sar mean', np.mean(self.sar_list)) print('sdr median', np.median(self.sdr_list)) print('sir median', np.median(self.sir_list)) print('sar median', np.median(self.sar_list)) evaluate_end = time.time() print('evaluate time', evaluate_end - evaluate_start) return self.est_audio_list, test_target_list, test_mixed_list if __name__ == '__main__': test = Test() est_list, target_list, mixed_list = test() file_path = './../results/audio/UNet/singing_voice_separation/' AudioModule.to_pickle(est_list, file_path + 'est_list')