Example #1
import librosa
from librosa import core, decompose


def time_stretch_hpss(audio, f):
    # Relies on the companion helper `time_stretch_sola` defined alongside it.

    if f == 1.0:
        return audio

    stft = core.stft(audio)

    # Split the spectrogram into harmonic and percussive components (HPSS)
    stft_harm, stft_perc = decompose.hpss(stft, kernel_size=31)

    # OLA the percussive part
    # make sure the signals properly overlap
    y_perc = librosa.util.fix_length(core.istft(stft_perc, dtype=audio.dtype),
                                     len(audio))
    y_perc = time_stretch_sola(y_perc, f, wsola=True)

    # Phase-vocode the harmonic part
    stft_stretch = core.phase_vocoder(stft_harm, 1.0 / f)
    # Inverse STFT of the harmonic part, matched to the percussive length
    y_harm = librosa.util.fix_length(
        core.istft(stft_stretch, dtype=y_perc.dtype), len(y_perc))

    # Add them together
    return y_harm + y_perc
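A quick usage sketch (the file path is hypothetical; assumes librosa is installed and the companion helper time_stretch_sola is importable from the same module). Since the harmonic part is vocoded at rate 1/f, values of f above 1.0 lengthen the signal:

import librosa

y, sr = librosa.load('input.wav', sr=None)  # hypothetical mono file
y_stretched = time_stretch_hpss(y, 1.25)    # f > 1 lengthens the signal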
Example #2
        def get_data(train=True):
            # Sample a batch of aligned source excerpts, with optional augmentation
            batch_out = []
            interval = int(self.clip_sec * self.samplerate) * 2
            for batch_idx in range(self.batch_size):
                # Draw a random recording, then a random crop from it
                data = train_data_wav if train else test_data_wav
                rec_idx = np.random.randint(len(data))
                crop_idx = np.random.randint(len(data[rec_idx][0]) - interval)
                sources = [i[crop_idx:crop_idx + interval] for i in data[rec_idx]]

                # Random pitch-shift augmentation (training only)
                if config.pitch_aug and train:
                    n_steps = pitch_shift_list[np.random.randint(len(pitch_shift_list))]
                    if n_steps != 0:
                        sources = [pitch_shift(i, self.samplerate, n_steps=n_steps) for i in sources]

                sources = [from_polar(to_stft(i, self.nfft)) for i in sources]
                # Random tempo augmentation via phase vocoding (training only)
                if config.bpm_aug and train:
                    rate = stretch_rate_list[np.random.randint(len(stretch_rate_list))]
                    if rate != 1.0:
                        for i in range(len(sources)):
                            # Rebuild the complex STFT from its real/imag channels,
                            # stretch it, then split back into two channels
                            augmented = phase_vocoder(sources[i][:, :, 0] + 1j * sources[i][:, :, 1], rate=rate)
                            sources[i] = np.array([np.real(augmented), np.imag(augmented)]).transpose(1, 2, 0)
                # Random gain in [0.75, 1.25) (training only)
                if config.amp_aug and train:
                    sources = [i * (0.75 + (np.random.random() * 0.5)) for i in sources]
                sources = random_crop(sources, self.ydim)
                batch_out.append(sources)

            # Stack to [num_sources, batch, ...]
            batch_out = np.array(batch_out).transpose(1, 0, 2, 3, 4)
            if train and true_wp(config.shuffle_sources_aug_prob) == 1.0:
                # Shuffle each source across the batch to decorrelate the mix
                for source_i in range(self.num_sources):
                    np.random.shuffle(batch_out[source_i])
            return batch_out
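For reference, a minimal sketch of the phase_vocoder rate convention this augmentation relies on, using the same positional-rate librosa API as these examples. The stretch changes the number of STFT frames, not the sample rate:

import numpy as np
import librosa

y = np.random.randn(22050).astype(np.float32)   # 1 s of noise at 22.05 kHz
D = librosa.stft(y)
D_fast = librosa.core.phase_vocoder(D, 2.0)     # about half the frames (faster)
D_slow = librosa.core.phase_vocoder(D, 0.5)     # about twice the frames (slower)
print(D.shape[1], D_fast.shape[1], D_slow.shape[1])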
Example #3
def eval(net1, net2, speech_file_loc, melody_file_loc):
    # Evaluates the result of net1 and net2 on a given speech file and melody file.
    # speech_file_loc and melody_file_loc are paths to the respective audio files.
    network1, network2 = net1.eval(), net2.eval()
    # Read input audio
    orig_speech = core.load(speech_file_loc, sr)[0]
    inp_speech = DL.remove_silent_frames(orig_speech)
    stft_inp = core.stft(inp_speech,
                         n_fft=nfft,
                         hop_length=hop,
                         win_length=wlen)

    # Extract melody and create its image
    melody = utils.MelodyExt.melody_extraction(melody_file_loc,
                                               'runtime_folder/ref_melody')[0]
    ref_pc = melody[:, 1]
    ref_time = melody[:, 0]
    const = hop * 1.0 / sr  # STFT frame period in seconds
    new_sampling_time = np.arange(const, ref_time[-1], const)
    interp_melody = np.interp(new_sampling_time, ref_time, ref_pc)
    n_frames = new_sampling_time.shape[0]
    idx1 = (1.0 * interp_melody * nfft / sr).astype(int)
    idx2 = np.arange(n_frames)
    pc = np.zeros([1 + nfft // 2, n_frames])  # integer division for the bin count
    pc[idx1, idx2] = 1
    pc[-1] = pc[0]  # frames mapped to bin 0 (unvoiced) are flagged in the last row
    pc[0] = 0

    # Complete input preprocessing
    rate = stft_inp.shape[1] * 1.0 / n_frames
    stft_inp = core.phase_vocoder(stft_inp, rate,
                                  hop)  # Stretch input speech to target length
    n_frames += 8 - n_frames % 8  # Round the frame count up to a multiple of 8
    # Append zeros to make it suitable for network
    stft_inp = np.concatenate(
        [stft_inp,
         np.zeros([stft_inp.shape[0], n_frames - stft_inp.shape[1]])],
        axis=1)
    pc = np.concatenate(
        [pc, np.zeros([pc.shape[0], n_frames - pc.shape[1]])], axis=1)
    stft_inp = np.log(1 + np.abs(stft_inp))  # Log-magnitude input feature
    stft_inp = torch.from_numpy(stft_inp).float().unsqueeze(0)  # Make tensors
    pc = torch.from_numpy(pc).float().unsqueeze(0)

    # Extract output
    encode2 = network2(Variable(pc.to(device)))
    pred, encode1 = network1(Variable(stft_inp.to(device)), encode2)
    pred = pred[0].cpu().data.numpy()
    pred[pred < 0] = 0
    pred = np.exp(pred) - 1  # Invert the log(1 + x) compression
    time_pred = 3.0 * utils.gl_rec(pred, hop, wlen, core.istft(
        pred, hop, wlen))  # Adding a multiplier to increase loudness
    return time_pred
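The stretch-to-target-length step above recurs in several of the examples below; a hedged helper sketch of that pattern (assumes librosa and a complex STFT input; the helper name is hypothetical):

import numpy as np
import librosa

def stretch_to_frames(stft_matrix, n_target, hop_length):
    # Stretch with the phase vocoder, then trim or zero-pad to n_target frames
    rate = stft_matrix.shape[1] * 1.0 / n_target
    stretched = librosa.core.phase_vocoder(stft_matrix, rate, hop_length)
    if stretched.shape[1] >= n_target:
        return stretched[:, :n_target]
    return np.pad(stretched, ((0, 0), (0, n_target - stretched.shape[1])))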
Example #4
import numpy as np
from librosa import core


def change_speed(input_signal, rate):
    """Change the playback speed of an audio signal

    Parameters
    ----------
    input_signal : numpy.ndarray
        Input array; must have a numerical (integer or float) dtype.
    rate : numeric
        Desired rate of change to the speed.
        To increase the speed, pass in a value greater than 1.0.
        To decrease the speed, pass in a value between 0.0 and 1.0.

    Returns
    -------
    numpy.ndarray representing the audio signal with changed speed.

    """

    if input_signal.dtype.kind not in 'iuf':
        raise TypeError(
            "'input_signal' must be an array of integers or floats")

    if rate <= 0:
        raise ValueError('rate must be a positive number')

    # Convert an integer signal to floats in [-1.0, 1.0]
    if input_signal.dtype.kind in 'iu':
        info = np.iinfo(input_signal.dtype)
        abs_max = 2**(info.bits - 1)
        offset = info.min + abs_max
        input_signal = (input_signal.astype('float32') - offset) / abs_max

    # Transform signal to frequency domain
    frequency_domain_signal = core.stft(input_signal)

    # Change speed with the phase vocoding method
    fds_changed_speed = core.phase_vocoder(frequency_domain_signal, rate)

    # Transform frequency domain signal back to time domain
    output_signal = core.istft(fds_changed_speed, dtype=input_signal.dtype)

    return output_signal
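Example usage with 16-bit integer samples (a hypothetical random buffer), which exercises the integer-to-float conversion branch:

import numpy as np

pcm = (np.random.randn(22050) * 2000).astype(np.int16)  # stand-in int16 audio
faster = change_speed(pcm, 1.5)    # roughly two-thirds the original length
slower = change_speed(pcm, 0.75)   # roughly four-thirds the original length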
Example #5
    def __getitem__(self, samp_info):

        usr = samp_info[0]  # Which user
        snum = samp_info[1]  # Which song of the user
        inp_start = float(
            samp_info[4]) * self.sr  # Start index of the time-domain signal
        inp_end = float(
            samp_info[5]) * self.sr  # End index of the time-domain signal
        lines_read = samp_info[6]
        lines_sung = samp_info[7]

        file_path = self.root_dir + str(usr) + '/read/' + str(snum) + '.wav'
        inp_audio = self.all_audio[file_path][int(inp_start):int(inp_end)]
        inp_audio = remove_silent_frames(inp_audio)

        rps = np.random.uniform(-1.0, 1.0)  # Random pitch shift in semitones
        inp_rps = librosa.effects.pitch_shift(inp_audio, self.sr, n_steps=rps)
        stft_inp = core.stft(inp_audio,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)
        stft_rps = core.stft(inp_rps,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)

        out_start = float(
            samp_info[2]) * self.sr  # Start index of the time signal
        out_end = float(samp_info[3]) * self.sr  # End index of the time signal

        file_path = self.root_dir + str(usr) + '/sing/' + str(snum) + '.wav'
        out_audio = self.all_audio[file_path][int(out_start):int(out_end)]
        out_rps = librosa.effects.pitch_shift(out_audio, self.sr, n_steps=rps)
        stft_out = core.stft(out_audio,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)
        stft_rps_out = core.stft(out_rps,
                                 n_fft=self.nfft,
                                 hop_length=self.hop,
                                 win_length=self.wlen)

        # Stretch the read-speech STFTs to the length of the sung STFT
        rate = stft_inp.shape[1] * 1.0 / stft_out.shape[1]
        stft_inp_orig = stft_inp.copy()
        stft_inp = core.phase_vocoder(stft_inp, rate, self.hop)
        stft_inp = stft_inp[:, :stft_out.shape[1]]
        stft_rps = core.phase_vocoder(stft_rps, rate, self.hop)
        stft_rps = stft_rps[:, :stft_out.shape[1]]

        # Frame-level phoneme labels aligned to the sung audio
        phn_matrix = np.zeros(stft_out.shape[1]).astype(int)
        hop_time = self.hop * 1.0 / self.sr
        for idx in range(0, len(lines_read)):
            phn_start, phn_end = extract_time(lines_sung[idx])
            # The phone label is the last token on the annotation line
            if lines_sung[idx][-3] == ' ':
                cur_phn = lines_sung[idx][-2:-1]
            elif lines_sung[idx][-4] == ' ':
                cur_phn = lines_sung[idx][-3:-1]

            if cur_phn[-1] == ' ':
                cur_phn = cur_phn[0]
            start_time = float(samp_info[2])
            end_time = float(samp_info[3])
            # Keep only phones that sustain for more than a few milliseconds
            if phn_end - phn_start > 0.005:
                start_idx = int((phn_start - start_time) / hop_time)
                end_idx = int((phn_end - start_time) / hop_time)
                phn_matrix[start_idx:end_idx] = int(phn_dict[cur_phn])

        return [
            np.abs(stft_inp),
            np.abs(stft_out),
            pitch_max(np.abs(stft_inp)),
            pitch_pyin(int(out_start), usr, snum, stft_out.shape), rate,
            stft_inp_orig, stft_inp, stft_out,
            np.abs(stft_rps),
            pitch_max(np.abs(stft_rps)),
            np.abs(stft_rps_out),
            self.fld.index(usr), phn_matrix
        ]
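A minimal sketch of the paired augmentation used above: the same random shift in semitones is applied to the read and the sung signal so the pair stays consistent (assumes librosa; the signals here are hypothetical stand-ins):

import numpy as np
import librosa

sr = 22050
read_y = np.random.randn(sr).astype(np.float32)   # stand-in for read speech
sing_y = np.random.randn(sr).astype(np.float32)   # stand-in for singing

rps = np.random.uniform(-1.0, 1.0)
read_shifted = librosa.effects.pitch_shift(read_y, sr, n_steps=rps)
sing_shifted = librosa.effects.pitch_shift(sing_y, sr, n_steps=rps)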
Example #6
    def __getitem__(self, samp_info):

        usr = samp_info[0]  # Which user
        snum = samp_info[1]  # Which song out of the user's 4 songs
        inp_start = float(
            samp_info[4]) * self.sr  # Start index of the time-domain signal
        inp_end = float(
            samp_info[5]) * self.sr  # End index of the time-domain signal
        lines_read = samp_info[6]
        lines_sung = samp_info[7]

        inp_audio = np.array([])
        file_path = self.root_dir + str(usr) + '/read/' + str(snum) + '.wav'
        inp_full = self.all_audio[file_path]
        # Stretch each read line to the duration of its sung counterpart
        for idx in range(0, len(lines_read)):
            r_start, r_end = extract_time(lines_read[idx])
            s_start, s_end = extract_time(lines_sung[idx])
            stretch_rate = (r_end - r_start) / (1e-3 + s_end - s_start)
            inp_phn = inp_full[int(r_start * self.sr):int(r_end * self.sr)]
            inp_phn_stretch = librosa.effects.time_stretch(
                inp_phn, stretch_rate)
            inp_audio = np.append(inp_audio, inp_phn_stretch)

        rps = np.random.uniform(-1.0, 1.0)  # Random pitch shift in semitones
        inp_rps = librosa.effects.pitch_shift(inp_audio, self.sr, n_steps=rps)
        stft_inp = core.stft(inp_audio,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)
        stft_rps = core.stft(inp_rps,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)

        out_start = float(
            samp_info[2]) * self.sr  # Start index of the time signal
        out_end = float(samp_info[3]) * self.sr  # End index of the time signal

        file_path = self.root_dir + str(usr) + '/sing/' + str(snum) + '.wav'
        out_audio = self.all_audio[file_path][int(out_start):int(out_end)]
        out_rps = librosa.effects.pitch_shift(out_audio, self.sr, n_steps=rps)
        stft_out = core.stft(out_audio,
                             n_fft=self.nfft,
                             hop_length=self.hop,
                             win_length=self.wlen)
        stft_rps_out = core.stft(out_rps,
                                 n_fft=self.nfft,
                                 hop_length=self.hop,
                                 win_length=self.wlen)

        # Stretch the input to the output length (comment out the two
        # phase_vocoder blocks below if this is not wanted)
        rate = stft_inp.shape[1] * 1.0 / stft_out.shape[1]
        stft_inp_orig = stft_inp.copy()
        stft_inp = core.phase_vocoder(stft_inp, rate, self.hop)
        stft_inp = stft_inp[:, :stft_out.shape[1]]
        stft_rps = core.phase_vocoder(stft_rps, rate, self.hop)
        stft_rps = stft_rps[:, :stft_out.shape[1]]

        # Frame-level phoneme labels aligned to the sung audio
        phn_matrix = np.zeros(stft_out.shape[1]).astype(int)
        hop_time = self.hop * 1.0 / self.sr
        for idx in range(0, len(lines_read)):
            phn_start, phn_end = extract_time(lines_sung[idx])
            # The phone label is the last token on the annotation line
            if lines_sung[idx][-3] == ' ':
                cur_phn = lines_sung[idx][-2:-1]
            elif lines_sung[idx][-4] == ' ':
                cur_phn = lines_sung[idx][-3:-1]

            if cur_phn[-1] == ' ':
                cur_phn = cur_phn[0]
            start_time = float(samp_info[2])
            end_time = float(samp_info[3])
            # Keep only phones that sustain for more than a few milliseconds
            if phn_end - phn_start > 0.005:
                start_idx = int((phn_start - start_time) / hop_time)
                end_idx = int((phn_end - start_time) / hop_time)
                phn_matrix[start_idx:end_idx] = int(phn_dict[cur_phn])

        return [
            np.abs(stft_inp),
            np.abs(stft_out),
            pitch_max(np.abs(stft_inp)),
            pitch_pyin(int(out_start), usr, snum, stft_out.shape), rate,
            stft_inp_orig, stft_inp, stft_out,
            np.abs(stft_rps),
            pitch_max(np.abs(stft_rps)),
            np.abs(stft_rps_out),
            self.fld.index(usr), phn_matrix
        ]  # Input, Output, Original Input-Output length ratio
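The per-line stretching at the top of this example can be factored into a helper; a sketch under the same assumptions (librosa available; the span lists, in seconds, are hypothetical inputs):

import numpy as np
import librosa

def align_read_to_sung(read_y, read_spans, sung_spans, sr):
    # Stretch each read segment to the duration of its sung counterpart
    pieces = []
    for (r0, r1), (s0, s1) in zip(read_spans, sung_spans):
        seg = read_y[int(r0 * sr):int(r1 * sr)]
        rate = (r1 - r0) / (1e-3 + s1 - s0)  # rate > 1 shortens, < 1 lengthens
        pieces.append(librosa.effects.time_stretch(seg, rate))
    return np.concatenate(pieces)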
Example #7
    def __getitem__(self, index):
        tf = self.id_list[index]
        read_file = self.get_read(tf)

        read_audio = f'{read_file[:-4]}.wav'
        song_audio = f'{tf[:-4]}.wav'

        with open(tf, 'rb') as f:
            song_txt = f.read().splitlines()
        with open(read_file, 'rb') as f:
            read_txt = f.read().splitlines()

        melody_name = "_".join([tf.split('/')[-3], tf.split('/')[-1]])[:-4]
        melody = np.load(f'../sp2si-code/melody_contour/{melody_name}.npy')

        index_begin = randint(0, len(read_txt) - 40) + 1
        index_end = index_begin + 10

        song_begin = float(song_txt[index_begin].split()[0])
        song_end = float(song_txt[index_end].split()[1])

        # Adjust the excerpt until it lasts between 5 and 8 seconds
        song_dur = song_end - song_begin
        while song_dur < 5 and index_end < len(read_txt) - 2:
            index_end += 1
            song_end = float(song_txt[index_end].split()[1])
            song_dur = song_end - song_begin
        while song_dur > 8:
            index_end -= 1
            song_end = float(song_txt[index_end].split()[1])
            song_dur = song_end - song_begin

        read_begin = float(read_txt[index_begin].split()[0])
        read_end = float(read_txt[index_end].split()[1])
        read_dur = read_end - read_begin

        read_audio = core.load(read_audio,
                               sr=self.sr,
                               mono=True,
                               offset=read_begin,
                               duration=read_dur)[0]
        song_audio = core.load(song_audio,
                               sr=self.sr,
                               mono=True,
                               offset=song_begin,
                               duration=song_dur)[0]

        read_stft = core.stft(read_audio,
                              n_fft=self.nfft,
                              hop_length=self.hop,
                              win_length=self.wlen)

        song_stft = core.stft(song_audio,
                              n_fft=self.nfft,
                              hop_length=self.hop,
                              win_length=self.wlen)

        song_stft = song_stft[..., :-1]
        melody = torch.from_numpy(melody).unsqueeze(0).unsqueeze(0)

        # Resample the contour from its extraction frame rate to this STFT's
        melody = F.interpolate(melody,
                               scale_factor=(22050 / 16000 / 2),
                               mode='nearest')
        melody = torch.squeeze(melody)

        # Slice the contour frames covered by this excerpt; convert Hz to MIDI
        pitch = melody[int(song_begin * self.sr /
                           self.hop):int(song_begin * self.sr / self.hop) +
                       song_stft.shape[1]]
        pitch = pitch.cpu().numpy()
        pitch = librosa.core.hz_to_midi(pitch)

        # Stretch the read STFT to the sung length
        rate = read_stft.shape[1] / song_stft.shape[1]
        read_stft = core.phase_vocoder(read_stft, rate, self.hop)
        read_stft = read_stft[:, :song_stft.shape[1]]

        read_stft = np.abs(read_stft)
        song_stft = np.abs(song_stft)

        if args.feat_type == "mel":
            read_stft = np.matmul(mel_basis, read_stft)
            song_stft = np.matmul(mel_basis, song_stft)

        read_mag = np.log10(np.clip(read_stft, a_min=1e-5, a_max=100000))
        song_mag = np.log10(np.clip(song_stft, a_min=1e-5, a_max=100000))

        return song_mag, read_mag, pitch, read_audio
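A hedged sketch of the contour resampling step above: nearest-neighbour interpolation converts the melody contour from its extraction frame rate to this loader's STFT frame rate (the 22050 / 16000 / 2 factor is specific to this setup; the contour tensor here is a hypothetical stand-in):

import torch
import torch.nn.functional as F

contour = torch.rand(1, 1, 1000)  # [batch, channel, frames], hypothetical
resampled = F.interpolate(contour, scale_factor=22050 / 16000 / 2,
                          mode='nearest')
contour = resampled.squeeze()     # back to a 1-D contour tensor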
Example #8
    def __getitem__(self, index):
        tf = self.id_list[index]
        read_file = self.get_read(tf)

        read_audio = f'{read_file[:-4]}.wav'
        song_audio = f'{tf[:-4]}.wav'
        
        with open(tf, 'rb') as f:
            song_txt = f.read().splitlines()
        with open(read_file, 'rb') as f:
            read_txt = f.read().splitlines()

        melody_name = "_".join([tf.split('/')[-3],
                            tf.split('/')[-1]])[:-4]
        melody = np.load(f'../sp2si-code/melody_contour/{melody_name}.npy')
        
        index_begin = randint(0,len(read_txt) - 40) + 1
        index_end = index_begin + 30


        song_begin = float(song_txt[index_begin].split()[0])
        song_end = float(song_txt[index_end].split()[1])
        

        # Adjust the excerpt until it lasts between 7 and 10 seconds
        song_dur = song_end - song_begin
        while song_dur < 7 and index_end < len(read_txt) - 2:
            index_end += 1
            song_end = float(song_txt[index_end].split()[1])
            song_dur = song_end - song_begin
        while song_dur > 10:
            index_end -= 1
            song_end = float(song_txt[index_end].split()[1])
            song_dur = song_end - song_begin

        read_begin = float(read_txt[index_begin].split()[0])
        read_end = float(read_txt[index_end].split()[1])
        read_dur = read_end - read_begin

        
        read_audio = core.load(read_audio, sr=self.sr, mono=True,
                               offset=read_begin, duration=read_dur)[0]
        song_audio = core.load(song_audio, sr=self.sr, mono=True,
                               offset=song_begin, duration=song_dur)[0]

        # Build matching (start, end) sample spans for the sung and read lines,
        # merging any line shorter than 0.2 s into the next one
        index_sing_dur = []
        index_read_dur = []
        t = self.sr
        rsi = float(read_txt[index_begin].split()[0])
        ssi = float(song_txt[index_begin].split()[0])
        i = index_begin
        while i < index_end + 1:
            s_begin, s_end, _ = song_txt[i].split()
            r_begin, r_end, _ = read_txt[i].split()
            s_begin, s_end, r_begin, r_end = float(s_begin), float(s_end), float(r_begin), float(r_end)
            while (s_end - s_begin < 0.2 or r_end - r_begin < 0.2) and i < index_end + 1:
                i = i + 1
                _, s_end, _ = song_txt[i].split()
                _, r_end, _ = read_txt[i].split()
                s_end, r_end = float(s_end), float(r_end)
            index_read_dur.append(((r_begin - rsi) * t, (r_end - rsi) * t))
            index_sing_dur.append(((s_begin - ssi) * t, (s_end - ssi) * t))
            i = i + 1
        
        # Time-stretch each read segment to the duration of its sung counterpart
        read_audio_list = []
        for i in range(len(index_read_dur)):
            r_begin, r_end = index_read_dur[i]
            s_begin, s_end = index_sing_dur[i]
            if r_end - r_begin == 0 or s_end - s_begin == 0:
                read_audio_list.append(read_audio[int(r_begin):int(r_end)])
                continue
            rate = (s_end - s_begin) / (r_end - r_begin)
            read_audio_slice = librosa.effects.time_stretch(
                read_audio[int(r_begin):int(r_end)], 1 / rate)
            read_audio_list.append(read_audio_slice)

        read_audio = np.concatenate(read_audio_list, axis=0)

        read_stft = core.stft(read_audio, n_fft=self.nfft,
                              hop_length=self.hop, win_length=self.wlen)
        song_stft = core.stft(song_audio, n_fft=self.nfft,
                              hop_length=self.hop, win_length=self.wlen)

        song_stft = song_stft[..., :-1]

        # Stretch the read STFT to the sung length
        rate = read_stft.shape[1] / song_stft.shape[1]
        read_stft = core.phase_vocoder(read_stft, rate, self.hop)
        read_stft = read_stft[:, :song_stft.shape[1]]

        read_stft = np.abs(read_stft)
        song_stft = np.abs(song_stft)
        
        if args.feat_type == "mel":
            read_stft = np.matmul(mel_basis, read_stft)
            song_stft = np.matmul(mel_basis, song_stft)

        read_mag = np.log10(np.clip(read_stft, a_min=1e-5, a_max=100000))
        song_mag = np.log10(np.clip(song_stft, a_min=1e-5, a_max=100000))

        return song_mag, read_mag, read_audio
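The final log-magnitude compression is the same as in the previous example; a small sketch of the (lossy) inverse, assuming the same constants:

import numpy as np

mag = np.abs(np.random.randn(513, 100))  # stand-in magnitude STFT
log_mag = np.log10(np.clip(mag, a_min=1e-5, a_max=100000))
approx = np.power(10.0, log_mag)         # exact for values above the 1e-5 floor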