def __init__(
        self,
        frame_length=3,
        sample_rate=44100,
        num_worker=1,
        MUSDB18_PATH="",
        BIG_DATA=False,
        additional_background_data=[],
        additional_vocal_data=[],
):
    """Collect extra background/vocal file lists and open the MUSDB18 train subset.

    additional_background_data / additional_vocal_data are lists of "list files",
    each containing one audio path per line (read via self.readList). They are
    only iterated here, never mutated.
    """
    # Fixed seed so every run draws the same random chunk sequence.
    np.random.seed(1)
    self.wh = WaveHandler()
    self.BIG_DATA = BIG_DATA
    self.sample_rate = sample_rate
    self.frame_length = frame_length
    self.num_worker = num_worker
    # Flatten every list-file into one flat list of audio paths.
    self.music_folders = [p for list_file in additional_background_data
                          for p in self.readList(list_file)]
    self.vocal_folders = [p for list_file in additional_vocal_data
                          for p in self.readList(list_file)]
    self.bac_file_num = len(self.music_folders)
    self.voc_file_num = len(self.vocal_folders)
    # MUSDB18 training tracks (expects the decoded .wav layout).
    self.mus = musdb.DB(MUSDB18_PATH, is_wav=True, subsets='train')
    # Candidate semitone offsets used for chorus-style pitch augmentation.
    self.pitch_shift_high = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
    self.pitch_shift_low = [-2.0, -3.0, -4.0, -5.0, -6.0, -7.0]
def read_wav(estimate_fname, target_fname):
    """Load an estimate/target wav pair, trim both to their common length,
    and return each reshaped to (1, n_samples, 1)."""
    from util.wave_util import WaveHandler
    handler = WaveHandler()
    estimate = handler.read_wave(estimate_fname, channel=1)
    truth = handler.read_wave(target_fname, channel=2)
    # The two files may differ slightly in length; keep only the shared prefix.
    n = min(estimate.shape[0], truth.shape[0])
    estimate = estimate[:n].reshape((1, n, 1))
    truth = truth[:n].reshape((1, n, 1))
    return estimate, truth
def get_total_time_in_folder(path):
    """Print and return the summed duration (in seconds) of every file in *path*.

    Args:
        path: Directory path; must end with '/'.

    Returns:
        Total duration in seconds (also printed in s / min / h).

    Raises:
        ValueError: if *path* does not end with '/'.
    """
    # endswith() also rejects an empty string (path[-1] would raise IndexError).
    if not path.endswith('/'):
        raise ValueError("Error: path should end with /")
    wh = WaveHandler()
    # Sum durations directly; the old enumerate() counter was never used.
    total_time = sum(wh.get_duration(path + fname) for fname in os.listdir(path))
    print("total: ")
    print(total_time, "s")
    print(total_time / 60, "min")
    print(total_time / 3600, "h")
    # Previously the total was only printed; returning it makes the helper reusable.
    return total_time
def delete_unproper_training_data(path):
    """Delete every .wav file under *path* that fails the format check.

    wh.get_channels_sampwidth_and_sample_rate returns a tuple whose first
    element is truthy for acceptable files; failing files are printed and removed.

    Args:
        path: Directory path; must end with '/'.

    Raises:
        ValueError: if *path* does not end with '/'.
    """
    # endswith() also rejects an empty string (path[-1] would raise IndexError).
    if not path.endswith('/'):
        raise ValueError("Error: path should end with /")
    wh = WaveHandler()
    for fname in os.listdir(path):
        file_pth = path + fname
        # endswith('.wav') is stricter than split('.')[-1] == 'wav': a file
        # literally named "wav" (no dot) no longer matches.
        if not file_pth.endswith('.wav'):
            continue
        judge = wh.get_channels_sampwidth_and_sample_rate(file_pth)
        if not judge[0]:
            print(fname, "Unproper! params:", judge[1])
            os.remove(file_pth)
def seg_data():
    """Cut every song in 441_song_data into 20 overlapping 5%-length segments
    (starting at 0%, 5%, ..., 95% of the file) and save them to seg_song_data.

    Output names are "<basename>_<start>.wav", e.g. "song_0.05.wav".
    """
    wh = WaveHandler()
    # Renamed from `dir` (shadowed the builtin) and the inner `seg_data`
    # (shadowed this very function).
    src_dir = Config.datahub_root + "song/441_song_data/"
    seg_dir = Config.datahub_root + "song/seg_song_data/"
    for fname in os.listdir(src_dir):
        print("Doing segmentation on ", fname + "...")
        data = wh.read_wave(src_dir + fname, channel=2)
        length = data.shape[0]
        # 20 evenly spaced starts; each chunk spans 5% of the file.
        for start in np.linspace(0, 0.95, 20):
            chunk = data[int(start * length):int((start + 0.05) * length)]
            wh.save_wave(chunk,
                         seg_dir + fname.split('.')[-2] + "_" + str('%.2f' % start) + ".wav",
                         channels=2)
class VocalFilter():
    """Heuristics for estimating how 'vocal-free' an audio file is, based on the
    fraction of smoothed samples that stay below an energy threshold."""

    def __init__(self):
        # WebRTC voice-activity detector (mode is configured in myVad).
        self.vad = webrtcvad.Vad()
        self.wh = WaveHandler()
        # 1-second smoothing kernel, every tap equal to 1/4410*5.
        # ("kernal" spelling kept: it is part of the instance interface.)
        self.kernal = np.ones(44100 * 1) / 4410 * 5
        # Smoothed-amplitude level below which a sample counts as silent.
        self.threshold = 20

    def normalize(self, frames):
        """Scale *frames* so its maximum becomes 1."""
        return frames / np.max(frames)

    def variance(self, frames):
        """Return the variance of *frames*."""
        return np.var(frames)

    def flattern(self, arr, smooth=44100 * 2):
        """In-place forward running mean over windows of *smooth* samples.

        NOTE(review): because *arr* is overwritten while iterating, later
        windows read already-smoothed values — confirm this is intended.
        """
        for i in range(arr.shape[0]):
            arr[i] = np.sum(arr[i:i + smooth]) / smooth

    def conv(self, arr, ker):
        """Full convolution of *arr* with kernel *ker*."""
        return scipy.signal.convolve(arr, ker)

    def calculate_variance(self, fpath, name=""):
        """Return the fraction of the smoothed, non-negative signal at or
        below self.threshold; optionally plot when *name* is given."""
        self.frames = self.wh.read_wave(fpath)
        self.frames = self.frames[self.frames >= 0.0]  # keep the non-negative half-wave
        self.frames = self.conv(self.frames, self.kernal)
        # frames = self.flattern(frames)
        length = self.wh.get_framesLength(fpath)
        # Count samples NOT above the threshold (i.e. 'silent' samples).
        zero_count = np.sum(~(self.frames > self.threshold))
        if name != "":
            plotWav(self.frames, name)
        # Keep a short excerpt (10k samples starting at 100 s) for inspection.
        self.frames = self.frames[44100 * 100:44100 * 100 + 10000]
        return zero_count / length

    def filter_music(self, pth):
        """Compute the silence ratio of every file under *pth*.

        Returns {filename: ratio}; previously the mapping was built into a
        local named `dict` (shadowing the builtin) and then discarded.
        """
        if pth[-1] != '/':
            raise ValueError("Error: Path should end with /")
        ratios = {}
        for each in os.listdir(pth):
            fpath = pth + each
            ratio = self.calculate_variance(fpath)
            ratios[each] = ratio
            print(ratio, each)
        return ratios

    def myVad(self):
        """Smoke-test webrtcvad on a synthetic 10 ms frame at 16 kHz."""
        self.vad.set_mode(0)
        sample_rate = 16000
        frame_duration = 10
        frame = b'\x10\x20' * int(sample_rate * frame_duration / 1000)
        print('Contains speech: %s' % (self.vad.is_speech(frame, sample_rate)))
def get_total_time_in_txt(txtpath):
    """Sum the durations of all audio files listed (one path per line) in *txtpath*.

    Unreadable files are reported and skipped (best-effort).

    Returns:
        (total_hours, count): total duration in hours, and the number of files
        successfully measured.
    """
    wh = WaveHandler()
    cnt = 0
    files = readList(txtpath)
    total_time = 0
    for file in files:
        # Narrowed from a bare `except:`; still best-effort, but no longer
        # swallows KeyboardInterrupt/SystemExit.
        try:
            total_time += wh.get_duration(file)
            cnt += 1
        except Exception:
            print("error:", file)
    print(
        txtpath.split('/')[-1].split('.')[-2], ",",
        str(total_time / 3600) + "h,", cnt, ", " + txtpath)
    return total_time / 3600, cnt
def __init__(
        self,
        frame_length=Config.frame_length,
        sample_rate=Config.sample_rate,
        num_worker=Config.num_workers,
        sampleNo=20000,
        mu=Config.mu,
        empty_every_n=50,
        sigma=Config.sigma,
        alpha_low=Config.alpha_low,
        alpha_high=Config.alpha_high  # If alpha_high get a value greater than 0.5, it would have probability to overflow
):
    """Build background/vocal path lists from Config and pre-draw the
    truncated-normal gain distribution used for energy scaling."""
    np.random.seed(1)  # deterministic sampling across runs
    self.sample_rate = sample_rate
    self.frame_length = frame_length
    self.sample_length = int(self.sample_rate * self.frame_length)
    self.cnt = 0
    self.data_counter = 0
    self.empty_every_n = empty_every_n
    self.num_worker = num_worker
    self.wh = WaveHandler()
    # Gather every background / vocal audio path from the Config list-files.
    self.music_folders = []
    for list_file in Config.background_data:
        self.music_folders += self.readList(list_file)
    self.vocal_folders = []
    for list_file in Config.vocal_data:
        self.vocal_folders += self.readList(list_file)
    # Pre-sampled normal draws, truncated to (alpha_low, alpha_high); these act
    # as per-item energy scaling factors for vocal vs. background.
    self.sampleNo = sampleNo
    draws = np.random.normal(mu, sigma, sampleNo)
    self.normal_distribution = draws[(draws > alpha_low) & (draws < alpha_high)]
    # sampleNo shrinks to the number of draws that survived truncation.
    self.sampleNo = self.normal_distribution.shape[0]
def eval_spleeter():
    """Evaluate Spleeter separation output against the MUSDB18-HQ test set.

    For every test track, loads the reference vocals/background and Spleeter's
    vocals/accompaniment, aligns each pair via unify(), computes SDR per stem,
    prints per-track scores, and finally prints the averages.
    """
    from evaluate.si_sdr_numpy import sdr
    wh = WaveHandler()
    output_test_pth = Config.datahub_root + "musdb18hq/spleeter_out/test/"
    mus_test_pth = Config.datahub_root + "musdb18hq/test/"
    vocal = []
    background = []
    # Bug fix: the loop iterated over `musdb_test_pth`, which is undefined in
    # this function (NameError); the intended directory is `mus_test_pth`.
    for each in sorted(os.listdir(mus_test_pth)):
        mus_dir = mus_test_pth + each + "/"
        out_dir = output_test_pth + each + "/output/combined/"
        mus_vocal = wh.read_wave(mus_dir + "vocals.wav")
        mus_background = wh.read_wave(mus_dir + "background.wav")
        output_vocal = wh.read_wave(out_dir + "vocals.wav")
        output_background = wh.read_wave(out_dir + "accompaniment.wav")
        # Trim estimate/reference pairs to a common length before scoring.
        output_vocal, mus_vocal = unify(output_vocal, mus_vocal)
        output_background, mus_background = unify(output_background, mus_background)
        v = sdr(output_vocal, mus_vocal)
        b = sdr(output_background, mus_background)
        vocal.append(v)
        background.append(b)
        print("FileName: ", each, "\tSDR-BACKGROUND: ", b, "\tSDR-VOCAL: ", v)
    print("AVG-SDR-VOCAL", sum(vocal) / len(vocal))
    print("AVG-SDR-BACKGROUND", sum(background) / len(background))
def __init__(self):
    """Prepare the VAD, wave handler, smoothing kernel, and silence threshold."""
    # WebRTC voice-activity detector; aggressiveness mode is set by callers.
    self.vad = webrtcvad.Vad()
    # Project wave reader/writer.
    self.wh = WaveHandler()
    # Smoothed-amplitude level below which a sample counts as silent.
    self.threshold = 20
    # 1-second smoothing kernel; every tap equals 1/4410*5.
    self.kernal = np.ones(44100 * 1) / 4410 * 5
Config.trail_name + "/" + "data_background.txt") write_list( Config.vocal_data, Config.project_root + "saved_models/" + Config.trail_name + "/" + "data_vocal.txt") # Cache for data freq_bac_loss_cache = [] freq_voc_loss_cache = [] freq_cons_loss_cache = [] best_sdr_vocal, best_sdr_background = Config.best_sdr_vocal, Config.best_sdr_background # exclude_dict = load_json("config/json/ExcludeData.json") # exclude_start_point,vocal_sisdr_min,vocal_sisdr_max,background_sisdr_min,background_sisdr_max = exclude_dict["start_exclude_point"],exclude_dict["vocal_sisdr"][0],exclude_dict["vocal_sisdr"][1],exclude_dict["background_sisdr"][0],exclude_dict["background_sisdr"][1] wh = WaveHandler() loss = torch.nn.L1Loss() if (not Config.start_point == 0): model = torch.load(Config.load_model_path + "/model" + str(Config.start_point) + ".pkl", map_location=Config.device) else: if (Config.split_band): model = Spleeter(channels=2, unet_inchannels=2 * Config.subband, unet_outchannels=2 * Config.subband).cuda( Config.device) else: model = Spleeter(channels=2, unet_inchannels=2, unet_outchannels=2).cuda(Config.device)
class WavenetDataloader(Dataset):
    """Dynamic remixing dataloader.

    Each __getitem__ call randomly picks one background file and one vocal
    file (paths listed in the Config list-files), crops one chunk of
    `frame_length` seconds from each, balances their energies, scales both by
    gains drawn from a pre-sampled truncated normal, and returns tensors
    (background, vocal, background + vocal) plus the two source paths.
    """

    def __init__(
            self,
            frame_length=Config.frame_length,
            sample_rate=Config.sample_rate,
            num_worker=Config.num_workers,
            sampleNo=20000,
            mu=Config.mu,
            empty_every_n=50,
            sigma=Config.sigma,
            alpha_low=Config.alpha_low,
            alpha_high=Config.alpha_high  # If alpha_high get a value greater than 0.5, it would have probability to overflow
    ):
        # Fixed seed: the random crop sequence is reproducible run-to-run.
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.frame_length = frame_length
        # self.music_folders = self.readList(Config.musdb_train_background)
        # Flatten every configured list-file into one flat list of paths.
        self.music_folders = []
        for each in Config.background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in Config.vocal_data:
            self.vocal_folders += self.readList(each)
        # prev_data_size = len(self.vocal_folders)
        # if(Config.exclude_list != ""):
        #     for each in self.readList(Config.exclude_list):
        #         self.vocal_folders.remove(each)
        #     print(prev_data_size-len(self.vocal_folders)," songs were removed from vocal datasets")
        # Chunk length in samples.
        self.sample_length = int(self.sample_rate * self.frame_length)
        self.cnt = 0
        self.data_counter = 0
        self.empty_every_n = empty_every_n
        self.sampleNo = sampleNo
        self.num_worker = num_worker
        self.wh = WaveHandler()
        # This alpha is to balance the energy between vocal and background
        # Also, this alpha is used to simulate different energy level between vocal and background
        self.normal_distribution = np.random.normal(mu, sigma, sampleNo)
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution > alpha_low]
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution < alpha_high]
        # sampleNo shrinks to the number of draws that survived truncation.
        self.sampleNo = self.normal_distribution.shape[0]

    def update_empty_n(self):
        # Placeholder: previously reloaded empty_every_n from a JSON config.
        # dataloader_dict = load_json(Config.project_root+"config/json/Dataloader.json")
        # self.empty_every_n = dataloader_dict["empty_every_n"]
        pass

    def __getitem__(self, item):
        # `item` is ignored: every access draws an independent random mix.
        self.data_counter += 1
        # np.random.seed(os.getpid()+self.cnt)
        # Select background(background only) and vocal file randomly
        self.cnt += self.num_worker
        while (True):
            random_music = np.random.randint(0, len(self.music_folders))
            random_vocal = np.random.randint(0, len(self.vocal_folders))
            music_fname = self.music_folders[random_music]
            vocal_fname = self.vocal_folders[random_vocal]
            music_length = self.wh.get_duration(music_fname)
            vocal_length = self.wh.get_duration(vocal_fname)
            # Re-draw until both files are longer than one chunk
            # (durations are in seconds at this point).
            if ((music_length - self.frame_length) <= 0
                    or (vocal_length - self.frame_length) <= 0):
                continue
            else:
                music_sr, vocal_sr = self.wh.get_sample_rate(
                    music_fname), self.wh.get_sample_rate(vocal_fname)
                # Convert durations from seconds to sample counts.
                music_length, vocal_length = music_length * music_sr, vocal_length * vocal_sr
                break
        background_start = np.random.randint(0,
                                             music_length - self.sample_length)
        # print(background_start,background_start + self.frame_length * music_sr,self.wh.get_channels(music_fname))
        background_crop = self.wh.read_wave(
            music_fname,
            portion_start=background_start,
            portion_end=background_start + self.frame_length * music_sr,
            channel=self.wh.get_channels(music_fname),
            sample_rate=self.sample_rate)
        # if (self.cnt % self.empty_every_n == 0):
        #     return background_crop,np.zeros(background_crop.shape).astype(np.int16),background_crop,(music_fname,"_empty_")
        vocal_start = np.random.randint(
            0, vocal_length - self.frame_length * vocal_sr)
        vocal_crop = self.wh.read_wave(
            vocal_fname,
            portion_start=vocal_start,
            portion_end=vocal_start + self.frame_length * vocal_sr,
            channel=self.wh.get_channels(vocal_fname),
            sample_rate=self.sample_rate).astype(np.float32)
        max_background = np.max(np.abs(background_crop))
        max_vocal = np.max(np.abs(vocal_crop))
        # To avoid magnify the blank vocal: only rescale the vocal up to the
        # background's peak when the vocal is not (near-)silent.
        if (not max_vocal == 0 and (max_background / max_vocal) < 50):
            vocal_crop /= max_vocal
            background_crop, vocal_crop = background_crop, (
                vocal_crop * max_background).astype(np.int16)
        # Paired gains drawn from opposite ends of the pre-sampled distribution.
        alpha_vocal = self.normal_distribution[self.data_counter % self.sampleNo]
        alpha_background = self.normal_distribution[-(self.data_counter % self.sampleNo)]
        background_crop, vocal_crop = background_crop * alpha_background, vocal_crop * alpha_vocal
        # NOTE(review): casting back to int16 here assumes the scaled values fit
        # the int16 range (see the alpha_high comment above) — confirm.
        background_crop, vocal_crop = background_crop.astype(
            np.int16), vocal_crop.astype(np.int16)
        b, v, s = torch.Tensor(background_crop), torch.Tensor(
            vocal_crop), torch.Tensor(background_crop + vocal_crop)
        # if(not Config.time_domain_loss):
        #     b,v,s = stft(b.float(),Config.sample_rate),stft(v.float(),Config.sample_rate),stft(s.float(),Config.sample_rate)
        return b, v, s, (music_fname, vocal_fname)

    def __len__(self):
        # Actually infinite due to the random dynamic sampling; this only
        # fixes the nominal epoch size.
        return int(36000 / Config.frame_length)

    def readList(self, fname):
        # Read one path per line, stripping the trailing newline.
        result = []
        with open(fname, "r") as f:
            for each in f.readlines():
                each = each.strip('\n')
                result.append(each)
        return result
class WavenetDataloader(Dataset):
    """MUSDB18 (+ optional extra data) dynamic-mixing dataloader.

    Each item returns (background, vocal, background + vocal, (bac_name,
    voc_name)). With BIG_DATA enabled, most items come from the extra
    background/vocal file lists; otherwise everything comes from MUSDB18
    chunks. Vocals are occasionally augmented with pitch-shifted "chorus"
    copies.
    """

    def __init__(
            self,
            frame_length=3,
            sample_rate=44100,
            num_worker=1,
            MUSDB18_PATH="",
            BIG_DATA=False,
            additional_background_data=(),
            additional_vocal_data=(),
    ):
        """Collect extra data lists and open the MUSDB18 train subset.

        Args:
            frame_length: chunk length in seconds.
            sample_rate: audio sample rate in Hz.
            num_worker: number of dataloader workers (stored, not used here).
            MUSDB18_PATH: root of the decoded (.wav) MUSDB18 dataset.
            BIG_DATA: if True, sample mostly from the additional data lists.
            additional_background_data / additional_vocal_data: iterables of
                list-file paths, one audio path per line. (Defaults changed
                from mutable [] to (); they are only iterated, so behavior is
                unchanged.)
        """
        # Fixed seed for reproducible chunk sequences.
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.wh = WaveHandler()
        self.BIG_DATA = BIG_DATA
        self.music_folders = []
        for each in additional_background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in additional_vocal_data:
            self.vocal_folders += self.readList(each)
        self.frame_length = frame_length
        self.bac_file_num = len(self.music_folders)
        self.voc_file_num = len(self.vocal_folders)
        self.num_worker = num_worker
        self.mus = musdb.DB(MUSDB18_PATH, is_wav=True, subsets='train')
        # Semitone offsets for chorus-style pitch augmentation.
        self.pitch_shift_high = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
        self.pitch_shift_low = [-2.0, -3.0, -4.0, -5.0, -6.0, -7.0]

    def random_trunk(self):
        """Pick a random MUSDB track and set a random frame_length-second chunk.

        One track is skipped by name — presumably a known-bad file; TODO confirm.
        """
        track = random.choice(self.mus.tracks)
        while (track.name == "Alexander Ross - Goodbye Bolero"):
            track = random.choice(self.mus.tracks)
        track.chunk_duration = self.frame_length
        track.chunk_start = random.uniform(
            0, track.duration - track.chunk_duration)
        return track

    def random_bac_trunk(self):
        """Return (normalized random chunk, path) from the extra background list."""
        fname = self.music_folders[np.random.randint(0, self.bac_file_num)]
        return self.wh.random_chunk(fname, self.frame_length, normalize=True), fname

    def random_voc_trunk(self):
        """Return (normalized random chunk, path) from the extra vocal list."""
        fname = self.vocal_folders[np.random.randint(0, self.voc_file_num)]
        return self.wh.random_chunk(fname, self.frame_length, normalize=True), fname

    def switch_pitch_high(self, vocal):
        """Pitch-shift both channels up by a random positive semitone offset.

        Bug fix: `np.float` was removed in NumPy 1.20 (AttributeError on
        modern NumPy); np.float64 is the dtype it aliased.
        """
        shift = np.random.choice(self.pitch_shift_high)
        p_vocal = np.zeros(shape=vocal.shape, dtype=np.float64)
        # todo we assume all data are not mono (stereo, 2 channels) — confirm
        p_vocal[:, 0] = librosa.effects.pitch_shift(vocal[:, 0].astype(np.float64),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        p_vocal[:, 1] = librosa.effects.pitch_shift(vocal[:, 1].astype(np.float64),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        return p_vocal

    def switch_pitch_low(self, vocal):
        """Pitch-shift both channels down by a random negative semitone offset.

        Same np.float -> np.float64 fix as switch_pitch_high; also dropped the
        dead `end = time.time()` leftover (`time` was never imported).
        """
        shift = np.random.choice(self.pitch_shift_low)
        p_vocal = np.zeros(shape=vocal.shape)
        # todo we assume all data are not mono (stereo, 2 channels) — confirm
        p_vocal[:, 0] = librosa.effects.pitch_shift(vocal[:, 0].astype(np.float64),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        p_vocal[:, 1] = librosa.effects.pitch_shift(vocal[:, 1].astype(np.float64),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        return p_vocal

    def generate_chorus(self, vocal):
        """Blend the vocal with pitch-shifted copies to simulate a chorus.

        The random coin selects up-shift, down-shift, or both; the mix weight
        is tied to the same coin. (Local `protion` typo fixed to `portion`.)
        """
        coin = np.random.random()
        if coin < 0.4:
            portion = 0.3 + coin
            return portion * vocal + (1 - portion) * self.switch_pitch_high(vocal)
        elif coin < 0.8:
            portion = (coin - 0.4) + 0.3
            return portion * vocal + (1 - portion) * self.switch_pitch_low(vocal)
        else:
            portion = (coin - 0.8) + 0.3
            portion_chorus = (1 - portion) / 2
            return portion * vocal + portion_chorus * self.switch_pitch_low(
                vocal) + portion_chorus * self.switch_pitch_high(vocal)

    def get_upper(self):
        """Return a random target peak level in [0.3, 0.5)."""
        return np.random.random() * 0.2 + 0.3

    def unify_energy(self, audio):
        """Scale *audio* so its maximum becomes 0.4; near-silent input is
        returned unchanged to avoid amplifying noise."""
        upper = 0.4
        val_max = np.max(audio)
        if val_max < 0.001:
            return audio
        else:
            return audio * (upper / val_max)

    def __getitem__(self, item):
        # `item` is ignored: every access draws an independent random mix.
        if self.BIG_DATA:
            dice = np.random.random()
        else:
            dice = -1
        # Simplified from `dice == -1 or dice < 0.05`: -1 < 0.05 already,
        # so without BIG_DATA this branch is always taken.
        if dice < 0.05:
            keys = ['bass', 'drums', 'other', 'accompaniment']
            track_bac = self.random_trunk()
            track_voc = self.random_trunk()
            bac_target = random.choice(keys)
            b = self.unify_energy(track_bac.targets[bac_target].audio)
            v = self.unify_energy(track_voc.targets['vocals'].audio)
            if dice < 0.02:
                v = self.generate_chorus(v)
            return b, v, b + v, (bac_target + "-" + track_bac.name,
                                 "vocals-" + track_voc.name)
        else:
            track_bac, name_bac = self.random_bac_trunk()
            track_voc, name_voc = self.random_voc_trunk()
            track_voc, track_bac = self.unify_energy(
                track_voc), self.unify_energy(track_bac)
            if dice < 0.45:
                track_voc = self.generate_chorus(track_voc)
            return track_bac, track_voc, track_voc + track_bac, (name_bac, name_voc)

    def __len__(self):
        # Effectively infinite due to random dynamic sampling; this only
        # fixes the nominal epoch size.
        return int(36000 / self.frame_length)

    def readList(self, fname):
        """Return the lines of *fname* with trailing newlines stripped."""
        with open(fname, "r") as f:
            return [line.strip('\n') for line in f.readlines()]