def __init__(
        self,
        frame_length=3,
        sample_rate=44100,
        num_worker=1,
        MUSDB18_PATH="",
        BIG_DATA=False,
        additional_background_data=[],
        additional_vocal_data=[],
    ):
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.wh = WaveHandler()
        self.BIG_DATA = BIG_DATA
        self.music_folders = []
        for each in additional_background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in additional_vocal_data:
            self.vocal_folders += self.readList(each)
        self.frame_length = frame_length
        self.bac_file_num = len(self.music_folders)
        self.voc_file_num = len(self.vocal_folders)

        self.num_worker = num_worker
        self.mus = musdb.DB(MUSDB18_PATH, is_wav=True, subsets='train')
        self.pitch_shift_high = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
        self.pitch_shift_low = [-2.0, -3.0, -4.0, -5.0, -6.0, -7.0]
def read_wav(estimate_fname, target_fname):
    from util.wave_util import WaveHandler
    wh = WaveHandler()
    estimate = wh.read_wave(estimate_fname, channel=1)
    truth = wh.read_wave(target_fname, channel=2)
    # Trim both signals to the shorter length so they can be compared sample-wise.
    min_length = min(estimate.shape[0], truth.shape[0])
    estimate, truth = estimate[:min_length].reshape(
        (1, min_length, 1)), truth[:min_length].reshape((1, min_length, 1))
    return estimate, truth
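A minimal usage sketch; the two wav paths below are hypothetical:

estimate, truth = read_wav("separated_vocals.wav", "ground_truth_vocals.wav")
print(estimate.shape, truth.shape)  # both are (1, min_length, 1)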
Example 3
def get_total_time_in_folder(path):
    if not path.endswith('/'):
        raise ValueError("Error: path should end with /")
    wh = WaveHandler()
    total_time = 0
    for fname in os.listdir(path):
        total_time += wh.get_duration(path + fname)
    print("total: ")
    print(total_time, "s")
    print(total_time / 60, "min")
    print(total_time / 3600, "h")
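A usage sketch with a hypothetical folder; the trailing "/" is required by the check above:

get_total_time_in_folder("/data/datahub/song/441_song_data/")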
Example 4
def delete_unproper_training_data(path):
    if not path.endswith('/'):
        raise ValueError("Error: path should end with /")
    wh = WaveHandler()
    files = os.listdir(path)
    for each in files:
        file_pth = path + each
        if file_pth.endswith('.wav'):
            judge = wh.get_channels_sampwidth_and_sample_rate(file_pth)
            if (not judge[0]):
                print(each, "Improper! params:", judge[1])
                os.remove(file_pth)
Example 5
def seg_data():
    wh = WaveHandler()
    src_dir = Config.datahub_root + "song/441_song_data/"
    seg_dir = Config.datahub_root + "song/seg_song_data/"
    for fname in os.listdir(src_dir):
        print("Doing segmentation on", fname, "...")
        unseg_f = src_dir + fname
        data = wh.read_wave(unseg_f, channel=2)
        length = data.shape[0]
        # Split each file into 20 segments, each covering 5% of the file.
        for start in np.linspace(0, 0.95, 20):
            segment = data[int(start * length):int((start + 0.05) * length)]
            wh.save_wave(segment,
                         seg_dir + fname.split('.')[-2] + "_" +
                         str('%.2f' % start) + ".wav",
                         channels=2)
Example 6
class VocalFilter():
    def __init__(self):
        self.vad = webrtcvad.Vad()
        self.wh = WaveHandler()
        self.kernal = np.ones(44100 * 1) / 4410 * 5  # 1-second smoothing kernel
        self.threshold = 20
        # self.kernal = np.append(np.linspace(0,1,44100*1.5),np.linspace(1,0,44100*1.5))
    def normalize(self,frames):
        return frames/np.max(frames)

    def variance(self,frames):
        return np.var(frames)

    def flattern(self, arr, smooth=44100 * 2):
        # In-place moving-average smoothing over a `smooth`-sample window.
        for i in range(arr.shape[0]):
            arr[i] = np.sum(arr[i:i + smooth]) / smooth

    def conv(self,arr,ker):
        return scipy.signal.convolve(arr, ker)

    def calculate_variance(self,fpath,name = ""):
        self.frames = self.wh.read_wave(fpath)
        self.frames = self.frames[self.frames>=0.0]
        self.frames = self.conv(self.frames,self.kernal)
        # frames = self.flattern(frames)
        length = self.wh.get_framesLength(fpath)
        zero_count = np.sum(~(self.frames>self.threshold))
        if(name != ""):plotWav(self.frames,name)
        self.frames = self.frames[44100*100:44100*100+10000]
        return zero_count/length

    def filter_music(self, pth):
        if not pth.endswith('/'):
            raise ValueError("Error: Path should end with /")
        ratios = {}
        for each in os.listdir(pth):
            fpath = pth + each
            ratio = self.calculate_variance(fpath)
            ratios[each] = ratio
            print(ratio, each)
        return ratios

    def myVad(self):
        self.vad.set_mode(0)
        sample_rate = 16000
        frame_duration = 10  # ms
        # Dummy 16-bit PCM frame used as a webrtcvad smoke test.
        frame = b'\x10\x20' * int(sample_rate * frame_duration / 1000)
        print('Contains speech: %s' % (self.vad.is_speech(frame, sample_rate)))
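A usage sketch, assuming a folder of wav files (the path is hypothetical); filter_music prints and returns the near-silence ratio of each file:

vf = VocalFilter()
ratios = vf.filter_music("/data/datahub/vocal_candidates/")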
Example 7
def get_total_time_in_txt(txtpath):
    wh = WaveHandler()
    cnt = 0
    files = readList(txtpath)
    total_time = 0
    for file in files:
        try:
            total_time += wh.get_duration(file)
            cnt += 1
        except Exception:
            print("error:", file)

    # print(total_time,"s")
    # print(total_time/60,"min")
    print(
        txtpath.split('/')[-1].split('.')[-2], ",",
        str(total_time / 3600) + "h,", cnt, ", " + txtpath)
    return total_time / 3600, cnt
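A usage sketch, assuming a hypothetical list file that holds one wav path per line:

hours, file_count = get_total_time_in_txt("config/lists/vocal_data.txt")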
Example 8
def __init__(
        self,
        frame_length=Config.frame_length,
        sample_rate=Config.sample_rate,
        num_worker=Config.num_workers,
        sampleNo=20000,
        mu=Config.mu,
        empty_every_n=50,
        sigma=Config.sigma,
        alpha_low=Config.alpha_low,
        # If alpha_high is greater than 0.5, the mix may overflow.
        alpha_high=Config.alpha_high
    ):
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.frame_length = frame_length
        # self.music_folders = self.readList(Config.musdb_train_background)
        self.music_folders = []
        for each in Config.background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in Config.vocal_data:
            self.vocal_folders += self.readList(each)
        # prev_data_size = len(self.vocal_folders)
        # if(Config.exclude_list != ""):
        #     for each in self.readList(Config.exclude_list):
        #         self.vocal_folders.remove(each)
        # print(prev_data_size-len(self.vocal_folders)," songs were removed from vocal datasets")
        self.sample_length = int(self.sample_rate * self.frame_length)
        self.cnt = 0
        self.data_counter = 0
        self.empty_every_n = empty_every_n
        self.sampleNo = sampleNo
        self.num_worker = num_worker
        self.wh = WaveHandler()
        # This alpha balances the energy between vocal and background.
        # It also simulates different energy levels between vocal and background.
        self.normal_distribution = np.random.normal(mu, sigma, sampleNo)
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution > alpha_low]
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution < alpha_high]
        self.sampleNo = self.normal_distribution.shape[0]
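A standalone sketch of the alpha sampling above: draw from N(mu, sigma), then keep only values inside (alpha_low, alpha_high). The concrete numbers are illustrative, not taken from Config:

import numpy as np

mu, sigma, alpha_low, alpha_high = 0.3, 0.1, 0.1, 0.5
alphas = np.random.normal(mu, sigma, 20000)
alphas = alphas[(alphas > alpha_low) & (alphas < alpha_high)]
print(alphas.shape[0], "usable alpha values")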
Example 9
def eval_spleeter():
    from evaluate.sdr import sdr_evaluate
    wh = WaveHandler()
    from evaluate.si_sdr_numpy import sdr, si_sdr
    output_test_pth = Config.datahub_root + "musdb18hq/spleeter_out/test/"
    mus_test_pth = Config.datahub_root + "musdb18hq/test/"

    vocal = []
    background = []
    #
    # for each in os.listdir(mus_train_pth):
    #     mus_dir = mus_train_pth + each + "/"
    #     out_dir = output_train_pth + each + "/output/combined/"
    #     # try:
    #     mus_vocal = wh.read_wave(mus_dir + "vocals.wav")
    #     mus_background = wh.read_wave(mus_dir + "background.wav")
    #     output_vocal = wh.read_wave(out_dir + "vocals.wav")
    #     output_background = wh.read_wave(out_dir + "accompaniment.wav")
    #
    #     output_vocal, mus_vocal = unify(output_vocal, mus_vocal)
    #     output_background, mus_background = unify(output_background, mus_background)
    #
    #     v = sdr(output_vocal, mus_vocal)
    #     b = sdr(output_background, mus_background)
    #     vocal.append(v)
    #     background.append(b)
    #     print("FileName: ",each, "\tSDR-VOCAL: ",v,"SDR-BACKGROUND: " ,b)

    for each in sorted(os.listdir(mus_test_pth)):
        mus_dir = mus_test_pth + each + "/"
        out_dir = output_test_pth + each + "/output/combined/"
        # try:
        mus_vocal = wh.read_wave(mus_dir + "vocals.wav")
        mus_background = wh.read_wave(mus_dir + "background.wav")
        output_vocal = wh.read_wave(out_dir + "vocals.wav")
        output_background = wh.read_wave(out_dir + "accompaniment.wav")

        output_vocal, mus_vocal = unify(output_vocal, mus_vocal)
        output_background, mus_background = unify(output_background,
                                                  mus_background)

        v = sdr(output_vocal, mus_vocal)
        b = sdr(output_background, mus_background)
        vocal.append(v)
        background.append(b)
        print("FileName: ", each, "\tSDR-BACKGROUND: ", b, "\tSDR-VOCAL: ", v)
        # except:
        #     print("Error",each)
    print("AVG-SDR-VOCAL", sum(vocal) / len(vocal))
    print("AVG-SDR-BACKGROUND", sum(background) / len(background))
Example 10
def __init__(self):
    self.vad = webrtcvad.Vad()
    self.wh = WaveHandler()
    self.kernal = np.ones(44100 * 1) / 4410 * 5  # 1-second smoothing kernel
    self.threshold = 20
Example 11
    Config.trail_name + "/" + "data_background.txt")
write_list(
    Config.vocal_data, Config.project_root + "saved_models/" +
    Config.trail_name + "/" + "data_vocal.txt")

# Cache for data
freq_bac_loss_cache = []
freq_voc_loss_cache = []
freq_cons_loss_cache = []

best_sdr_vocal, best_sdr_background = Config.best_sdr_vocal, Config.best_sdr_background

# exclude_dict = load_json("config/json/ExcludeData.json")
# exclude_start_point,vocal_sisdr_min,vocal_sisdr_max,background_sisdr_min,background_sisdr_max = exclude_dict["start_exclude_point"],exclude_dict["vocal_sisdr"][0],exclude_dict["vocal_sisdr"][1],exclude_dict["background_sisdr"][0],exclude_dict["background_sisdr"][1]

wh = WaveHandler()
loss = torch.nn.L1Loss()

if (not Config.start_point == 0):
    model = torch.load(Config.load_model_path + "/model" +
                       str(Config.start_point) + ".pkl",
                       map_location=Config.device)
else:
    if (Config.split_band):
        model = Spleeter(channels=2,
                         unet_inchannels=2 * Config.subband,
                         unet_outchannels=2 * Config.subband).cuda(
                             Config.device)
    else:
        model = Spleeter(channels=2, unet_inchannels=2,
                         unet_outchannels=2).cuda(Config.device)
Example 12
class WavenetDataloader(Dataset):
    def __init__(
        self,
        frame_length=Config.frame_length,
        sample_rate=Config.sample_rate,
        num_worker=Config.num_workers,
        sampleNo=20000,
        mu=Config.mu,
        empty_every_n=50,
        sigma=Config.sigma,
        alpha_low=Config.alpha_low,
        # If alpha_high is greater than 0.5, the mix may overflow.
        alpha_high=Config.alpha_high
    ):
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.frame_length = frame_length
        # self.music_folders = self.readList(Config.musdb_train_background)
        self.music_folders = []
        for each in Config.background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in Config.vocal_data:
            self.vocal_folders += self.readList(each)
        # prev_data_size = len(self.vocal_folders)
        # if(Config.exclude_list != ""):
        #     for each in self.readList(Config.exclude_list):
        #         self.vocal_folders.remove(each)
        # print(prev_data_size-len(self.vocal_folders)," songs were removed from vocal datasets")
        self.sample_length = int(self.sample_rate * self.frame_length)
        self.cnt = 0
        self.data_counter = 0
        self.empty_every_n = empty_every_n
        self.sampleNo = sampleNo
        self.num_worker = num_worker
        self.wh = WaveHandler()
        # This alpha balances the energy between vocal and background.
        # It also simulates different energy levels between vocal and background.
        self.normal_distribution = np.random.normal(mu, sigma, sampleNo)
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution > alpha_low]
        self.normal_distribution = self.normal_distribution[
            self.normal_distribution < alpha_high]
        self.sampleNo = self.normal_distribution.shape[0]

    def update_empty_n(self):
        # dataloader_dict = load_json(Config.project_root+"config/json/Dataloader.json")
        # self.empty_every_n = dataloader_dict["empty_every_n"]
        pass

    def __getitem__(self, item):
        self.data_counter += 1
        # np.random.seed(os.getpid()+self.cnt)
        # Select background(background only) and vocal file randomly
        self.cnt += self.num_worker
        while (True):
            random_music = np.random.randint(0, len(self.music_folders))
            random_vocal = np.random.randint(0, len(self.vocal_folders))
            music_fname = self.music_folders[random_music]
            vocal_fname = self.vocal_folders[random_vocal]
            music_length = self.wh.get_duration(music_fname)
            vocal_length = self.wh.get_duration(vocal_fname)
            if ((music_length - self.frame_length) <= 0
                    or (vocal_length - self.frame_length) <= 0):
                continue
            else:
                music_sr, vocal_sr = self.wh.get_sample_rate(
                    music_fname), self.wh.get_sample_rate(vocal_fname)
                music_length, vocal_length = music_length * music_sr, vocal_length * vocal_sr
                break
        background_start = np.random.randint(0,
                                             music_length - self.sample_length)
        # print(background_start,background_start + self.frame_length * music_sr,self.wh.get_channels(music_fname))
        background_crop = self.wh.read_wave(
            music_fname,
            portion_start=background_start,
            portion_end=background_start + self.frame_length * music_sr,
            channel=self.wh.get_channels(music_fname),
            sample_rate=self.sample_rate)
        # if (self.cnt % self.empty_every_n == 0):
        #     return background_crop,np.zeros(background_crop.shape).astype(np.int16),background_crop,(music_fname,"_empty_")

        vocal_start = np.random.randint(
            0, vocal_length - self.frame_length * vocal_sr)
        vocal_crop = self.wh.read_wave(
            vocal_fname,
            portion_start=vocal_start,
            portion_end=vocal_start + self.frame_length * vocal_sr,
            channel=self.wh.get_channels(vocal_fname),
            sample_rate=self.sample_rate).astype(np.float32)
        max_background = np.max(np.abs(background_crop))
        max_vocal = np.max(np.abs(vocal_crop))
        # Avoid magnifying a blank (near-silent) vocal crop.
        if (max_vocal != 0 and (max_background / max_vocal) < 50):
            vocal_crop /= max_vocal
            background_crop, vocal_crop = background_crop, (
                vocal_crop * max_background).astype(np.int16)
            alpha_vocal = self.normal_distribution[self.data_counter %
                                                   self.sampleNo]
            alpha_background = self.normal_distribution[-(self.data_counter %
                                                          self.sampleNo)]
            background_crop, vocal_crop = background_crop * alpha_background, vocal_crop * alpha_vocal

        background_crop, vocal_crop = background_crop.astype(
            np.int16), vocal_crop.astype(np.int16)
        b, v, s = torch.Tensor(background_crop), torch.Tensor(
            vocal_crop), torch.Tensor(background_crop + vocal_crop)
        # if(not Config.time_domain_loss):
        #     b,v,s = stft(b.float(),Config.sample_rate),stft(v.float(),Config.sample_rate),stft(s.float(),Config.sample_rate)

        return b, v, s, (music_fname, vocal_fname)

    def __len__(self):
        # Effectively infinite due to the random dynamic sampling.
        return int(36000 / Config.frame_length)

    def readList(self, fname):
        result = []
        with open(fname, "r") as f:
            for each in f.readlines():
                each = each.strip('\n')
                result.append(each)
        return result
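A usage sketch for the class above, assuming Config and its data list files are set up; the dataset plugs into a standard PyTorch DataLoader:

from torch.utils.data import DataLoader

dataset = WavenetDataloader()
loader = DataLoader(dataset, batch_size=4, num_workers=dataset.num_worker)
background, vocal, mixture, names = next(iter(loader))
print(background.shape, vocal.shape, mixture.shape)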
class WavenetDataloader(Dataset):
    def __init__(
        self,
        frame_length=3,
        sample_rate=44100,
        num_worker=1,
        MUSDB18_PATH="",
        BIG_DATA=False,
        additional_background_data=[],
        additional_vocal_data=[],
    ):
        np.random.seed(1)
        self.sample_rate = sample_rate
        self.wh = WaveHandler()
        self.BIG_DATA = BIG_DATA
        self.music_folders = []
        for each in additional_background_data:
            self.music_folders += self.readList(each)
        self.vocal_folders = []
        for each in additional_vocal_data:
            self.vocal_folders += self.readList(each)
        self.frame_length = frame_length
        self.bac_file_num = len(self.music_folders)
        self.voc_file_num = len(self.vocal_folders)

        self.num_worker = num_worker
        self.mus = musdb.DB(MUSDB18_PATH, is_wav=True, subsets='train')
        self.pitch_shift_high = [2.0, 3.0, 4.0, 5.0, 6.0, 7.0]
        self.pitch_shift_low = [-2.0, -3.0, -4.0, -5.0, -6.0, -7.0]

    def random_trunk(self):
        track = random.choice(self.mus.tracks)
        # Redraw if we picked the excluded track.
        while (track.name == "Alexander Ross - Goodbye Bolero"):
            track = random.choice(self.mus.tracks)
        track.chunk_duration = self.frame_length
        track.chunk_start = random.uniform(
            0, track.duration - track.chunk_duration)
        return track

    def random_bac_trunk(self):
        fname = self.music_folders[np.random.randint(0, self.bac_file_num)]
        return self.wh.random_chunk(fname, self.frame_length,
                                    normalize=True), fname

    def random_voc_trunk(self):
        fname = self.vocal_folders[np.random.randint(0, self.voc_file_num)]
        return self.wh.random_chunk(fname, self.frame_length,
                                    normalize=True), fname

    def switch_pitch_high(self, vocal):
        shift = np.random.choice(self.pitch_shift_high)
        p_vocal = np.zeros(shape=vocal.shape, dtype=float)
        # TODO: we assume all data are stereo (not mono).
        p_vocal[:, 0] = librosa.effects.pitch_shift(vocal[:, 0].astype(float),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        p_vocal[:, 1] = librosa.effects.pitch_shift(vocal[:, 1].astype(float),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        return p_vocal

    def switch_pitch_low(self, vocal):
        shift = np.random.choice(self.pitch_shift_low)
        p_vocal = np.zeros(shape=vocal.shape)
        # TODO: we assume all data are stereo (not mono).
        p_vocal[:, 0] = librosa.effects.pitch_shift(vocal[:, 0].astype(float),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        p_vocal[:, 1] = librosa.effects.pitch_shift(vocal[:, 1].astype(float),
                                                    sr=self.sample_rate,
                                                    n_steps=shift)
        return p_vocal

    def generate_chorus(self, vocal):
        coin = np.random.random()
        if (coin < 0.4):
            # Mix the vocal with a pitched-up copy.
            portion = 0.3 + coin
            return portion * vocal + (1 -
                                      portion) * self.switch_pitch_high(vocal)
        elif (coin < 0.8):
            # Mix the vocal with a pitched-down copy.
            portion = (coin - 0.4) + 0.3
            return portion * vocal + (1 -
                                      portion) * self.switch_pitch_low(vocal)
        else:
            # Mix the vocal with both pitched-up and pitched-down copies.
            portion = (coin - 0.8) + 0.3
            portion_chorus = (1 - portion) / 2
            return portion * vocal + portion_chorus * self.switch_pitch_low(
                vocal) + portion_chorus * self.switch_pitch_high(vocal)

    def get_upper(self):
        return np.random.random() * 0.2 + 0.3

    def unify_energy(self, audio):
        # Scale the clip so its peak reaches `upper`; leave near-silent clips alone.
        upper = 0.4
        val_max = np.max(audio)
        if (val_max < 0.001):
            return audio
        else:
            return audio * (upper / val_max)

    def __getitem__(self, item):
        if (self.BIG_DATA):
            dice = np.random.random()
        else:
            dice = -1  # Without BIG_DATA, always take the MUSDB branch below.
        if (dice == -1 or dice < 0.05):
            keys = ['bass', 'drums', 'other', 'accompaniment']
            track_bac = self.random_trunk()
            track_voc = self.random_trunk()
            bac_target = random.choice(keys)
            b = self.unify_energy(track_bac.targets[bac_target].audio)
            v = self.unify_energy(track_voc.targets['vocals'].audio)
            if (dice < 0.02):
                v = self.generate_chorus(v)
            return b, v, b + v, (bac_target + "-" + track_bac.name,
                                 "vocals-" + track_voc.name)
        else:
            track_bac, name_bac = self.random_bac_trunk()
            track_voc, name_voc = self.random_voc_trunk()
            track_voc, track_bac = self.unify_energy(
                track_voc), self.unify_energy(track_bac)
            if (dice < 0.45):
                track_voc = self.generate_chorus(track_voc)
            return track_bac, track_voc, track_voc + track_bac, (name_bac,
                                                                 name_voc)

    def __len__(self):
        # Effectively infinite due to the random dynamic sampling.
        return int(36000 / self.frame_length)

    def readList(self, fname):
        result = []
        with open(fname, "r") as f:
            for each in f.readlines():
                each = each.strip('\n')
                result.append(each)
        return result
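A usage sketch for this MUSDB-backed variant; MUSDB18_PATH below is a hypothetical local path to a decoded (wav) MUSDB18 train set:

dataset = WavenetDataloader(MUSDB18_PATH="/data/musdb18hq", BIG_DATA=False)
background, vocal, mixture, names = dataset[0]
print(background.shape, vocal.shape, names)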