def load_songs(self):
    """Load train/val songs into memory.

    Vocal tracks are always loaded (their singing-voice indices may need to
    be extracted).  Mixture tracks are kept in memory only probabilistically
    when ``in_memory_percentage`` < 1; otherwise the placeholder ``[-1]`` is
    stored so the file can be loaded lazily later.
    """
    # 'subset' instead of 'set' so the builtin is not shadowed.
    for subset in ['train', 'val']:
        for condition in ['mixture', 'vocals']:
            for filepath in self.file_paths[subset][condition]:
                if condition == 'vocals':
                    sequence = util.load_wav(filepath, self.sample_rate)
                    self.sequences[subset][condition].append(sequence)
                    self.num_sequences_in_memory += 1
                    if self.extract_voice_percent > 0:
                        self.voice_indices[subset].append(
                            util.get_sequence_with_singing_indices(sequence))
                else:
                    # (p - 0.5) * 2 maps p <= 0.5 to "keep nothing" and
                    # p == 1 to "keep everything" (the == 1 case is also
                    # short-circuited explicitly).
                    if self.in_memory_percentage == 1 or np.random.uniform(
                            0, 1) <= (self.in_memory_percentage - 0.5) * 2:
                        sequence = util.load_wav(filepath, self.sample_rate)
                        self.sequences[subset][condition].append(sequence)
                        self.num_sequences_in_memory += 1
                    else:
                        # Placeholder: sequence stays on disk, loaded on demand.
                        self.sequences[subset][condition].append([-1])
def inference(config, cla):
    """Denoise one wav file or a directory of wav files.

    In normal mode a single model is built up front; in one-shot mode a model
    is built per file, sized to that file's (odd) length.  Output wavs are
    written under a fresh folder inside ``<training path>/samples``.
    """
    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    if not bool(cla.one_shot):
        model = models.DenoisingWavenet(config,
                                        target_field_length=cla.target_field_length,
                                        load_checkpoint=cla.load_checkpoint,
                                        print_model_summary=cla.print_model_summary)
        print('Performing inference..')
    else:
        print('Performing one-shot inference..')

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    # If input_path is a single wav file, set filenames to that one file and
    # reduce the input paths to their parent directories.
    if cla.noisy_input_path.endswith('.wav'):
        filenames = [cla.noisy_input_path.rsplit('/', 1)[-1]]
        cla.noisy_input_path = cla.noisy_input_path.rsplit('/', 1)[0] + '/'
        if cla.clean_input_path is not None:
            cla.clean_input_path = cla.clean_input_path.rsplit('/', 1)[0] + '/'
    else:
        if not cla.noisy_input_path.endswith('/'):
            cla.noisy_input_path += '/'
        filenames = [filename for filename in os.listdir(cla.noisy_input_path)
                     if filename.endswith('.wav')]

    clean_input = None
    for filename in filenames:
        noisy_input = util.load_wav(cla.noisy_input_path + filename,
                                    config['dataset']['sample_rate'])
        if cla.clean_input_path is not None:
            if not cla.clean_input_path.endswith('/'):
                cla.clean_input_path += '/'
            clean_input = util.load_wav(cla.clean_input_path + filename,
                                        config['dataset']['sample_rate'])

        # 'inputs' rather than 'input' so the builtin is not shadowed.
        inputs = {'noisy': noisy_input, 'clean': clean_input}

        output_filename_prefix = filename[0:-4] + '_'

        if config['model']['condition_encoding'] == 'one_hot':
            condition_input = util.one_hot_encode(int(cla.condition_value), 29)[0]
        else:
            condition_input = util.binary_encode(int(cla.condition_value), 29)[0]

        if bool(cla.one_shot):
            if len(inputs['noisy']) % 2 == 0:  # If input length is even, remove one sample
                inputs['noisy'] = inputs['noisy'][:-1]
                if inputs['clean'] is not None:
                    inputs['clean'] = inputs['clean'][:-1]
            # One-shot: model sized exactly to this input.
            model = models.DenoisingWavenet(config,
                                            load_checkpoint=cla.load_checkpoint,
                                            input_length=len(inputs['noisy']),
                                            print_model_summary=cla.print_model_summary)

        print("Denoising: " + filename)
        denoise.denoise_sample(model, inputs, condition_input, batch_size,
                               output_filename_prefix,
                               config['dataset']['sample_rate'], output_folder_path)
def load_directory(self, directory_path, condition):
    """Scan a directory of wav files and collect per-file data.

    Returns ``(sequences, file_paths, speakers, speech_onset_offset_indices,
    regain_factors)``.  Clean files are always loaded and analysed; other
    files are loaded probabilistically according to ``in_memory_percentage``
    and otherwise stored as the placeholder ``[-1]`` for lazy loading.
    """
    wav_names = [name for name in os.listdir(directory_path)
                 if name.endswith('.wav')]

    speakers = []
    file_paths = []
    speech_onset_offset_indices = []
    regain_factors = []
    sequences = []

    for wav_name in wav_names:
        speaker_name = wav_name[:4]
        speakers.append(speaker_name)

        full_path = os.path.join(directory_path, wav_name)

        if condition == 'clean':
            sequence = util.load_wav(full_path, self.sample_rate)
            sequences.append(sequence)
            self.num_sequences_in_memory += 1
            regain_factors.append(self.regain / util.rms(sequence))
            # When extract_voice is enabled, record the onset/offset indices
            # of the non-silent (speech) segment so silence can be trimmed.
            if self.extract_voice:
                speech_onset_offset_indices.append(
                    util.get_subsequence_with_speech_indices(sequence))
        else:
            if self.in_memory_percentage == 1 or np.random.uniform(
                    0, 1) <= (self.in_memory_percentage - 0.5) * 2:
                sequence = util.load_wav(full_path, self.sample_rate)
                sequences.append(sequence)
                self.num_sequences_in_memory += 1
            else:
                sequences.append([-1])  # placeholder: load lazily later

        if speaker_name not in self.speaker_mapping:
            self.speaker_mapping[speaker_name] = len(self.speaker_mapping) + 1

        file_paths.append(full_path)

    return sequences, file_paths, speakers, speech_onset_offset_indices, regain_factors
def load_directory(self, filenames, spk):
    """Load every wav in ``filenames`` and stack them into an ndarray.

    ``spk`` is accepted for interface compatibility but is not used here.
    """
    loaded = [util.load_wav(path, self.sample_rate) for path in tqdm(filenames)]
    return np.array(loaded)
def encode_dataset(self):
    """Encode the s1/s2 wav pairs of ``self.mode`` into a TFRecord file.

    Utterances shorter than ``input_length`` are zero-padded at a random
    offset; longer ones are cut into half-overlapping windows of
    ``input_length``.  Prints a summary count at the end.
    """
    print('Encoding from {} into {}'.format(self.path, self.tfr))
    print('Input length : {}'.format(self.input_length))
    total = 0
    less_than_target = 0
    with tf.python_io.TFRecordWriter(self.tfr) as writer:
        filenames = os.listdir(os.path.join(self.path, self.mode, 's1'))
        for filename in tqdm(filenames):
            s1 = util.load_wav(os.path.join(self.path, self.mode, 's1', filename), self.sample_rate)
            s2 = util.load_wav(os.path.join(self.path, self.mode, 's2', filename), self.sample_rate)

            def write(_s1, _s2):
                # One Example holds both sources as float lists.
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={
                            "s1": tf.train.Feature(float_list=tf.train.FloatList(value=_s1)),
                            "s2": tf.train.Feature(float_list=tf.train.FloatList(value=_s2))
                        }))
                writer.write(example.SerializeToString())

            if len(s1) < self.input_length:
                # Place the short signal at a random offset inside a zero
                # buffer.  np.random.randint's upper bound is exclusive, so
                # +1 preserves the inclusive range of the deprecated
                # np.random.random_integers this replaces.
                b = np.random.randint(0, self.input_length - len(s1) + 1)
                s1_pad = np.zeros(self.input_length)
                s1_pad[b:b + len(s1)] = s1
                s2_pad = np.zeros(self.input_length)
                # NOTE(review): assumes len(s2) == len(s1); a longer s2 would
                # overflow this slice — confirm upstream alignment.
                s2_pad[b:b + len(s2)] = s2
                write(s1_pad, s2_pad)
                less_than_target += 1
            else:
                stride = self.input_length // 2
                for i in range(0, len(s1) - self.input_length, stride):
                    s1_pad = s1[i:i + self.input_length]
                    s2_pad = s2[i:i + self.input_length]
                    write(s1_pad, s2_pad)
                    total += 1
    print('total example : {}, less than target : {}'.format(total + less_than_target, less_than_target))
def test(config, cla):
    """Evaluate separation metrics (SDR / SI-SNR / PESQ) over a test set.

    Loads the generator checkpoint, separates every mixture wav in
    ``cla.mix_input_path`` against the s1/s2 references, logs per-file
    metrics and finally the set-wide means.
    """
    log_file = os.path.join(config['training']['path'], cla.ckpt_name,
                            'log_' + cla.test_set)
    ckpt_dir = os.path.join(config['training']['path'], cla.ckpt_name)
    if not os.path.exists(ckpt_dir):
        os.mkdir(ckpt_dir)
    output_path = os.path.join(config['training']['path'], 'sample')

    sess = tf.Session()
    G = hparams.get_model(config['model']['type'])(config, sess)
    G_save_path = os.path.join(config['training']['path'], 'generat.ckpt')
    G.load(G_save_path, cla.ckpt_name)

    if not cla.mix_input_path.endswith('/'):
        cla.mix_input_path += '/'
    filenames = [name for name in os.listdir(cla.mix_input_path)
                 if name.endswith('.wav')]

    sdr_sum = []
    sisnr_sum = []
    pesq_sum = []
    for name in filenames:
        util.myprint(log_file, name)
        mix_audio = util.load_wav(cla.mix_input_path + name,
                                  config['dataset']['sample_rate'])
        clean_1 = util.load_wav(cla.clean_input_path + 's1/' + name,
                                config['dataset']['sample_rate'])
        clean_2 = util.load_wav(cla.clean_input_path + 's2/' + name,
                                config['dataset']['sample_rate'])
        sdr, sisnr, pesq, pit_ch = separate.separate_sample(
            sess, G, config, mix_audio, clean_1, clean_2)
        util.myprint(log_file, ' sdr: {}, {}'.format(sdr[0], sdr[1]))
        util.myprint(log_file, ' sisnr: {}, {}'.format(sisnr[0], sisnr[1]))
        util.myprint(log_file, ' pesq: {}, {}'.format(pesq[0], pesq[1]))
        sdr_sum.append(sdr)
        sisnr_sum.append(sisnr)
        pesq_sum.append(pesq)

    sdr_sum = np.array(sdr_sum)
    sisnr_sum = np.array(sisnr_sum)
    pesq_sum = np.array(pesq_sum)
    util.myprint(log_file, 'test sdr : {}'.format(np.mean(sdr_sum)))
    util.myprint(log_file, 'test sisnr : {}'.format(np.mean(sisnr_sum)))
    util.myprint(log_file, 'test pesq : {}'.format(np.mean(pesq_sum)))
def retrieve_sequence(self, set, condition, sequence_num):
    """Return one sequence as an ndarray, loading it from disk if needed.

    A stored length-1 placeholder means the wav was not kept in memory:
    it is loaded now and cached back while the in-memory quota allows.
    """
    stored = self.sequences[set][condition][sequence_num]
    if len(stored) != 1:
        # Already resident in memory.
        return np.array(stored)

    sequence = util.load_wav(self.file_paths[set][condition][sequence_num],
                             self.sample_rate)
    in_memory_fraction = (float(self.num_sequences_in_memory) /
                          self.get_num_sequences_in_dataset())
    if in_memory_fraction < self.in_memory_percentage:
        # Still under quota: replace the placeholder with the real data.
        self.sequences[set][condition][sequence_num] = sequence
        self.num_sequences_in_memory += 1
    return np.array(sequence)
def load_directory(self, filenames, spk):
    """Load wav files and gather speaker metadata for speaker group ``spk``.

    The speaker id comes from the file basename: the first '_'-separated
    token when ``spk == 'a'``, otherwise the third.  Returns
    ``(sequences, speakers, speech_onset_offset_indices, regain_factors)``;
    regain_factors is currently always empty (its computation is disabled).
    """
    speakers = []
    speech_onset_offset_indices = []
    regain_factors = []
    sequences = []

    for path in filenames:
        name_parts = path.split('/')[-1].split('_')
        speaker_name = name_parts[0][:3] if spk == 'a' else name_parts[2][:3]
        speakers.append(speaker_name)

        sequence = util.load_wav(path, self.sample_rate)
        sequences.append(sequence)
        self.num_sequences_in_memory += 1

        if self.extract_voice:
            # Indices of the sub-sequence without leading/trailing silence.
            speech_onset_offset_indices.append(
                util.get_subsequence_with_speech_indices(sequence))

        if speaker_name not in self.speaker_mapping:
            self.speaker_mapping[speaker_name] = len(self.speaker_mapping) + 1

    return sequences, speakers, speech_onset_offset_indices, regain_factors
def test(config, cla):
    """Evaluate a DenoisingWavenet separator on a directory of mixture wavs.

    For each file, separates the mixture against the s1/s2 references,
    prints per-speaker SDR (with gender from spk_info.json) and finally
    the mean SDR over the whole set.
    """
    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    model = models.DenoisingWavenet(
        config,
        target_field_length=cla.target_field_length,
        load_checkpoint=cla.load_checkpoint,
        print_model_summary=cla.print_model_summary)

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    if not cla.noisy_input_path.endswith('/'):
        cla.noisy_input_path += '/'
    filenames = [filename for filename in os.listdir(cla.noisy_input_path)
                 if filename.endswith('.wav')]

    with open('spk_info.json') as f:
        spk_info = json.load(f)

    sdr = []
    # dict.get replaces the verbose "x if key in d else default" checks.
    n_output = config['training'].get('n_output', 2)
    n_speaker = config['training'].get('n_speaker', 2)
    # e.g. {'ch1': {'M': 0, 'F': 0}, 'ch2': {'M': 0, 'F': 0}}
    gender_stat = {'ch' + str(i + 1): {'M': 0, 'F': 0} for i in range(n_output)}

    for filename in filenames:
        noisy_input = util.load_wav(cla.noisy_input_path + filename,
                                    config['dataset']['sample_rate'])
        if cla.clean_input_path is not None:
            if not cla.clean_input_path.endswith('/'):
                cla.clean_input_path += '/'
            clean_input_1 = util.load_wav(
                cla.clean_input_path + 's1/' + filename,
                config['dataset']['sample_rate'])
            clean_input_2 = util.load_wav(
                cla.clean_input_path + 's2/' + filename,
                config['dataset']['sample_rate'])

        # 'inputs' rather than 'input' so the builtin is not shadowed.
        inputs = {
            'noisy': noisy_input,
            'clean_1': clean_input_1,
            'clean_2': clean_input_2
        }

        # Filenames look like '<spk1>..._<spk2>...wav'; first 3 chars of the
        # relevant tokens are the speaker ids.
        output_filename_prefix = filename[0:-4] + '_'
        spk1 = output_filename_prefix.split('_')[0][:3]
        spk2 = output_filename_prefix.split('_')[2][:3]
        spk_name = [spk1, spk2]
        spk_gender = [spk_info[spk1], spk_info[spk2]]

        condition_input = None
        print(filename)
        _sdr, ch_gender, pit_idx = denoise.denoise_sample(
            model, inputs, condition_input, batch_size,
            output_filename_prefix, config['dataset']['sample_rate'],
            n_speaker, n_output, output_folder_path,
            spk_gender=spk_gender, use_pit=cla.use_pit, pad=cla.zero_pad)

        # NOTE(review): the reversed print order for an (F, M) pair looks
        # intentional — confirm against denoise_sample's channel convention.
        if spk_gender[0] == 'F' and spk_gender[1] == 'M':
            order = range(1, -1, -1)
        else:
            order = range(2)
        for i in order:
            print('{} {}: sdr={}, idx={}'.format(spk_gender[i], spk_name[i],
                                                 _sdr[i], pit_idx[i]))
        sdr.append(_sdr)

    sdr = np.array(sdr)
    print('Testing SDR:', np.mean(sdr))
    print(gender_stat)
# Harmonicity analysis setup and frame loop.
# NOTE(review): frame_width is defined earlier in the file (not visible here).
hann = np.hanning(frame_width)  # analysis window of frame_width samples
spacing = 1024      # hop size between successive frames (samples)
bitrate = 44100     # used as the sample rate (Hz) in hz_to_fourier below
f_0 = 50            # lower bound of the candidate frequency band (Hz)
f_1 = 2000          # upper bound of the candidate frequency band (Hz)
f_r = 10
power_thresh = 10

# Expect exactly one CLI argument: the input wav path.
if len(sys.argv) == 2:
    input_file = sys.argv[1]
else:
    print("usage: ./harmonicity.py input.wav")
    sys.exit()

data = util.load_wav(input_file)

all_weights = []
fft_len = frame_width * 4            # zero-padded FFT length (4x the frame)
zeropad = np.zeros(frame_width * 3)  # zeros appended to each window to reach fft_len
best_frequencies = []

# Fourier bin indices bracketing the [f_0, f_1] search band at the padded
# FFT resolution.
k0 = int(np.floor(util.hz_to_fourier(f_0, frame_width * 4, bitrate)))
k1 = int(np.ceil(util.hz_to_fourier(f_1, frame_width * 4, bitrate)))

# iterate through frames
for i in tqdm(range(0, int((len(data) - frame_width) / spacing))):
    # spectrum generation and preprocessing
    frame = data[i * spacing:i * spacing + frame_width]
    window = frame * hann
    raw_fft = fft(np.concatenate((window, zeropad)))
def inference(config, cla):
    """Separate sources from mixture wavs with a separation Wavenet model.

    Supports the 'singing-voice' and 'multi-instrument' model types; in
    one-shot mode a model is (re)built per input file.
    """
    if cla.batch_size is not None:
        batch_size = int(cla.batch_size)
    else:
        batch_size = config['training']['batch_size']

    if cla.target_field_length is not None:
        cla.target_field_length = int(cla.target_field_length)

    if not bool(cla.one_shot):
        if config['model']['type'] == 'singing-voice':
            model = models.SingingVoiceSeparationWavenet(
                config,
                target_field_length=cla.target_field_length,
                load_checkpoint=cla.load_checkpoint,
                print_model_summary=cla.print_model_summary)
        elif config['model']['type'] == 'multi-instrument':
            model = models.MultiInstrumentSeparationWavenet(
                config,
                target_field_length=cla.target_field_length,
                load_checkpoint=cla.load_checkpoint,
                print_model_summary=cla.print_model_summary)
        # Python-2-only print statements converted to calls: identical
        # output under Python 2 for a single argument, valid Python 3 syntax.
        print('Performing inference..')
    else:
        print('Performing one-shot inference..')

    samples_folder_path = os.path.join(config['training']['path'], 'samples')
    output_folder_path = get_valid_output_folder_path(samples_folder_path)

    # If input_path is a single wav file, then set filenames to single
    # element with wav filename.
    if cla.mixture_input_path.endswith('.wav'):
        filenames = [cla.mixture_input_path.rsplit('/', 1)[-1]]
        cla.mixture_input_path = cla.mixture_input_path.rsplit('/', 1)[0] + '/'
    else:
        if not cla.mixture_input_path.endswith('/'):
            cla.mixture_input_path += '/'
        filenames = [filename for filename in os.listdir(cla.mixture_input_path)
                     if filename.endswith('.wav')]

    for filename in filenames:
        mixture_input = util.load_wav(cla.mixture_input_path + filename,
                                      config['dataset']['sample_rate'])
        # 'inputs' rather than 'input' so the builtin is not shadowed.
        inputs = {'mixture': mixture_input}

        output_filename_prefix = filename[0:-4]

        if bool(cla.one_shot):
            if len(inputs['mixture']) % 2 == 0:  # If input length is even, remove one sample
                inputs['mixture'] = inputs['mixture'][:-1]
            # NOTE(review): unlike the denoising one-shot path, no
            # input_length is passed to the constructor here — confirm
            # that is intended.
            if config['model']['type'] == 'singing-voice':
                model = models.SingingVoiceSeparationWavenet(
                    config,
                    target_field_length=cla.target_field_length,
                    load_checkpoint=cla.load_checkpoint,
                    print_model_summary=cla.print_model_summary)
            elif config['model']['type'] == 'multi-instrument':
                model = models.MultiInstrumentSeparationWavenet(
                    config,
                    target_field_length=cla.target_field_length,
                    load_checkpoint=cla.load_checkpoint,
                    print_model_summary=cla.print_model_summary)

        print("Separating: " + filename)
        separate.separate_sample(model, inputs, batch_size,
                                 output_filename_prefix,
                                 config['dataset']['sample_rate'],
                                 output_folder_path, config['model']['type'])