Example #1
def SavespecArg(y_mix, y_inst, fname, shift, stretch):
    Savespec(y_mix, y_inst, fname)
    for sh in shift:
        y_mix_shift = pitch_shift(y_mix, C.SR, sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift%d" % (fname, sh))

        y_mix_shift = pitch_shift(y_mix, C.SR, -sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, -sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift-%d" % (fname, sh))

    for st in stretch:
        y_mix_stretch = time_stretch(y_mix, st)
        y_inst_stretch = time_stretch(y_inst, st)
        Savespec(y_mix_stretch, y_inst_stretch,
                 "%s_stretch%d" % (fname, int(st * 10)))
Example #2
def stretch_sound(x, rate=1.1):
    input_length = len(x)
    x = time_stretch(x, rate)
    if len(x) > input_length:
        return x[:input_length]
    else:
        return np.pad(x, (0, max(0, input_length - len(x))), "constant")
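A quick sanity check for stretch_sound, assuming an older librosa whose time_stretch still accepts the rate positionally (as this snippet does); the input array is synthetic:

import numpy as np
from librosa.effects import time_stretch

x = np.random.randn(16000).astype(np.float32)
y = stretch_sound(x, rate=1.2)   # 1.2 speeds up, so the result is padded back out
assert len(y) == len(x)          # output length always matches the input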
Example #3
def edit_sounds(path, pitch, length):
    sr = 44100
    output_path = get_random_name('wav')
    speed = HIGHLIGHT_LENGTH / length
    y, sr = librosa.load(path, sr=sr)
    write_wav(output_path, time_stretch(pitch_shift(y, sr, pitch), speed), sr)
    return output_path
Example #4
def get_time_stretches(audio_signal):
    versions = []
    for t in time_stretch_factors:
        if t == 1:
            versions.append(audio_signal)
        else:
            versions.append(time_stretch(audio_signal, t))
    return versions
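get_time_stretches reads a module-level time_stretch_factors sequence; a hypothetical definition and call, with illustrative factor values:

time_stretch_factors = [0.9, 1.0, 1.1]   # hypothetical; 1.0 passes the signal through

# versions = get_time_stretches(audio_signal)   # one array per factor, original included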
Example #5
def generate_time_stretch_augmentation(target_directory, input_directory):
    audio_file_extension = ".wav"

    if not os.path.exists(target_directory):
        os.makedirs(target_directory)

    for root, dirs, files in os.walk(input_directory):
        for file in files:
            if file.endswith(audio_file_extension):
                audio_path = os.path.join(root, file)
                signal, sample_rate = librosa.load(audio_path, sr=None)

                librosa.output.write_wav(
                    target_directory + '/' + 'slower' + file,
                    time_stretch(signal, 1.2), sample_rate)

                librosa.output.write_wav(
                    target_directory + '/' + 'faster' + file,
                    time_stretch(signal, 0.88), sample_rate)
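librosa.output.write_wav was removed in librosa 0.8, so Example #5 only runs on older releases; on current versions the two writes can be done with the soundfile package instead. A drop-in sketch for the loop body above, assuming soundfile is installed:

import soundfile as sf

sf.write(os.path.join(target_directory, 'slower' + file),
         time_stretch(signal, rate=1.2), sample_rate)
sf.write(os.path.join(target_directory, 'faster' + file),
         time_stretch(signal, rate=0.88), sample_rate)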
Example #6
    def time_stretch(self, rate, **kwargs):
        """
        Time-stretch the audio time series and return the new object.

        This method is a wrapper around librosa's ``time_stretch`` function.
        """
        new = self.copy()
        new.data = time_stretch(new.data, rate, **kwargs)

        return new
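The wrapper in Example #6 keeps the original object intact by stretching a copy; a usage sketch, assuming a container class with .data and .copy() as the method implies (the class name here is hypothetical):

# clip = AudioSeries(y)              # hypothetical container holding .data
# faster = clip.time_stretch(1.5)    # new object at 1.5x speed; clip is untouched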
Example #7
def LoadAudio_Arg(fname, shift_steps, stretch_rate):
    # The parameters were originally named pitch_shift/time_stretch, which
    # shadowed the identically named librosa functions and broke both calls
    # below; they are renamed here to avoid the collision.
    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    y = pitch_shift(y, C.SR, shift_steps)
    y = time_stretch(y, stretch_rate)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase
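A hypothetical call for Example #7, assuming the config module C also defines FFT_SIZE and H (the STFT hop length):

# mag, phase = LoadAudio_Arg("mix.wav", shift_steps=2, stretch_rate=1.1)
# mag is the magnitude spectrogram scaled to [0, 1]; phase has unit modulus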
Example #8
    def transform(self, x, sr):
        """Applies the audio conversion.

            # Arguments
                x: numpy array.
                    The audio signal.
                sr: int.
                    Audio sample rate.

            # Returns
                A transformed version (x, sr) of the input.
        """
        input_length = len(x)

        if self.stretch:
            x = time_stretch(x, self.stretch)
            if len(x) > input_length:
                x = x[:input_length]
            else:
                x = np.pad(x, (0, max(0, int(input_length - len(x)))),
                           "constant")

        if self.shift:
            x = np.pad(x, (int(self.shift * sr),
                           0) if np.random.random() < 0.5 else
                       (0, int(self.shift * sr)), 'constant')
            if len(x) > input_length:
                x = x[:input_length]

        if self.noise:
            x = x + self.noise * np.random.randn(len(x))

        if self.bg_noise_dir:
            # assuming the stdlib glob, which takes a single pattern: build the
            # directory pattern here; the returned paths can be loaded directly
            bg_noise_data = glob(os.path.join(self.bg_noise_dir, "*.wav"))
            index_chosen_bg_file = int(len(bg_noise_data) * np.random.random())
            x_bg, sr_bg = load_audio(bg_noise_data[index_chosen_bg_file])
            x_bg_rand = x_bg[int(len(x_bg) * np.random.random()):]
            while input_length > len(x_bg_rand):
                x_bg_rand = np.concatenate([x_bg_rand, x_bg])

            # the loop above guarantees len(x_bg_rand) >= input_length
            x_bg = x_bg_rand[:input_length]

            x = x + (x_bg * self.bg_noise_volume)

        if self.volume:
            x = x * self.volume

        return x, sr
Example #9
    def __call__(self, signal):
        rate = random.uniform(self.scale_min, self.scale_max)
        signal = (signal / (2**15)).astype(np.float32)
        stretched_signal = time_stretch(signal, rate)
        stretched_signal = (stretched_signal * (2**15)).astype(np.int16)
        new_length = len(stretched_signal)

        if new_length < len(signal):
            padding = np.zeros(len(signal) - new_length, dtype=np.int16)
            index = np.random.randint(0, len(padding) + 1)
            return np.concatenate(
                (padding[:index], stretched_signal, padding[index:]))
        index = np.random.randint(0, new_length - len(signal) + 1)
        return stretched_signal[index:index + len(signal)]
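A sketch of how this __call__ might be exercised, assuming the owning class (name hypothetical) stores scale_min/scale_max and the input is 16-bit PCM:

# aug = RandomStretch(scale_min=0.8, scale_max=1.2)   # hypothetical class
# out = aug(np.zeros(16000, dtype=np.int16))
# assert out.dtype == np.int16 and len(out) == 16000  # dtype and length preserved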
Example #10
def main():
    tta_speed = 0.9  # slow down (i.e. < 1.0)
    samples_per_sec = 16000
    test_fns = sorted(glob('data/test/audio/*.wav'))
    tta_dir = 'data/tta_test/audio'
    for fn in tqdm(test_fns):
        basename = bn(fn)
        rate, data = wf.read(fn)
        assert len(data) == samples_per_sec
        data = np.float32(data) / 32767
        data = effects.time_stretch(data, tta_speed)
        data = data[-samples_per_sec:]
        out_fn = jp(tta_dir, basename)
        wf.write(out_fn, rate, np.int16(data * 32767))
Example #11
def stretch_wave(wave: np.array, speed_factor: float = 0.8) -> np.array:
    """
    @topic: Change the speed of audio wave by given speed_factor.
    @inpit: wave: audio wave, speed_factor: the factor of speed stretch.
    @return: wave_stretch: stretched audio wave with the same length as original wave.
    """
    wave_stretch = time_stretch(wave, speed_factor)
    # pruning or padding the strethed wave
    if speed_factor < 1:  # 音频长度被拉长
        wave_stretch = wave_stretch[:len(wave)]  # 截取wave的同等长度
    elif speed_factor > 1:  # 音频长度被缩短
        pad_zero = np.array([0. for _ in range(len(wave) - len(wave_stretch))])
        wave_stretch = np.hstack((wave_stretch, pad_zero))  # 后面补零
    assert len(wave_stretch) == len(wave)
    return wave_stretch
Example #12
    def apply(self, waveform, **params):
        assert waveform.shape[1] <= self.max_duration * self.sr, \
            'waveform length > max_duration*sr'
        assert waveform.shape[0] == 1, 'waveform should have 1-channel'
        assert waveform.shape[1] > 0, 'waveform is empty'
        waveform = waveform.clone()

        rate = np.random.uniform(
            self.min_rate,
            self.max_rate)  # rate < 1.0 -- slow down, rate > 1.0 -- speed up

        if waveform.shape[1] / rate >= self.max_duration * self.sr - 1000:
            rate = np.random.uniform(
                1., 2.
            )  # If length is greater than max_duration then we increase speed up to 2 times
        waveform = time_stretch(waveform[0].numpy(), rate)

        return torch.tensor(waveform, dtype=torch.float).unsqueeze(0)
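A hypothetical call for Example #12, assuming the transform instance carries sr, max_duration and the min_rate/max_rate range, and the input is a 1 x N float tensor:

# import torch
# out = aug.apply(torch.zeros(1, 16000))   # 1 x M float32 tensor, M roughly N / rate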
Example #13
def time_stret(x, sr=16000, length=1):
    # stretch by a random rate in [0.5, 1.5)
    x = effects.time_stretch(x, np.random.uniform(.5, 1.5, [1])[0])
    # treat the stretched signal as spanning `length` seconds and resample it
    # back toward sr samples per second, then crop or pad to exactly sr * length
    x = librosa.resample(x, x.shape[0] / length, sr)
    if x.shape[0] > (sr * length):
        return x[:(sr * length)]
    return np.pad(x, [0, (sr * length) - x.shape[0]])
Example #14
def speed_tune(sample: np.ndarray, max_tune: float) -> np.ndarray:
    rate = random.uniform(1 - max_tune, 1 + max_tune)
    return time_stretch(sample, rate)
Example #15
def change_tempo(audio, factor=1.0):
    return time_stretch(audio, factor)
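For both helpers above, the rate semantics come straight from librosa: a factor above 1.0 shortens the audio (faster) and one below 1.0 lengthens it (slower). A tiny check, assuming an older librosa that accepts the rate positionally as these snippets do:

import numpy as np

y = np.random.randn(22050).astype(np.float32)
assert len(change_tempo(y, 2.0)) < len(y)   # 2x tempo -> roughly half the samples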
Example #16
def main():
    # guard against an empty argv before indexing sys.argv[1]
    if len(sys.argv) < 2 or (len(sys.argv) < 3
                             and sys.argv[1] not in ('train', 'datagen')):
        print(USAGE_STRING)
    else:
        mode = sys.argv[1]
        if mode == 'test':
            fname = sys.argv[2]
            # Minimum window duration
            MIN_DURATION = 20
            spec = Spectrogram(2048, 128, 6)

            # TODO: Load the classifier weights

            # Use scipy's wavfile.read(...) to process our input
            # and re-sample it to 22050Hz (same as training examples)
            rate, data = io.read(fname, mmap=True, backend='scipy', rate=22050)

            # Ensure that window size is an even number
            wsize = (rate * MIN_DURATION) // 1000
            if wsize % 2 == 1:
                wsize += 1

            # Compute the speech segments to feed into classifier
            # This returns the speech samples to feed into the classifier by default
            speech_segments = segmentation.segment_speech(
                data, np.ones(len(data)), wsize)
            cepstra = []
            for segment in speech_segments:
                ts_ratio = len(segment) / (0.7 * rate)
                segment = segment / np.max(segment)
                segment = time_stretch(segment, ts_ratio)
                cep = spec.compute_mel_cepstrum(segment.astype('float32'), 54,
                                                (0, 8000))
                if cep.shape[0] != cep.shape[1]:
                    interpolant = interp1d(np.linspace(0, 1, cep.shape[1]),
                                           cep,
                                           axis=1)
                    cep = interpolant(np.linspace(0, 1, cep.shape[0]))
                cepstra.append(cep.ravel())

            # Feed cepstra into network sequentially:
            network = pickle.load(open(TRAINED_DUMP, "rb"))
            labels = pickle.load(open(LABELS_FILE, 'rb'))
            predictions = [network.predict(cep) for cep in cepstra]

            # Get the index of the 1 in each one-hot vector
            predictions = [np.argmax(p) for p in predictions]

            # Convert to words
            predictions = [labels['label_names'][p] for p in predictions]

            print("\n\nNetwork's estimate:\n\n", predictions)

            # Call the classifier
        elif mode == 'train':
            # denseffn.main()
            data_file = gzip.open(DATA_PKL_GZ)
            print('== Loading datasets...')
            data = pickle.load(data_file)
            train_x = data['train_x']
            train_y = data['train_y']
            validate_x = data['validate_x']
            validate_y = data['validate_y']
            print('== Done loading.')

            # Input dimension (2916 for current dataset)
            idim = train_x[0].shape[0]

            # Output dimension (1011)
            odim = train_y[0].shape[0]

            # Hidden layer dimensions
            hdims = (100, )

            network = denseffn.DenseFFN(ACT_FUNC, idim, *hdims, odim)

            print(
                "Training network with validation for params: EPOCHS={}, RATE={}, ACTIVATION={}"
                .format(EPOCHS, RATE, ACT_FUNC))
            result = network.train(train_x,
                                   train_y,
                                   validate_x,
                                   validate_y,
                                   epochs=EPOCHS,
                                   rate=RATE)

            print("\tTraining results={}".format(result))

            # Dump training results that we can use for classification later
            nnet_file = open(TRAINED_DUMP, 'wb')
            pickle.dump(network, nnet_file, protocol=pickle.HIGHEST_PROTOCOL)
            nnet_file.close()

        elif mode == 'datagen':
            generate_dataset.generate_data()

        else:
            print(USAGE_STRING)
Example #17
def create_dataset_for_one_song(
    song_name,
    wav_path,
    y,
    sr,
    k,
    idx,
    npy_dataset_name,
    crop_size,
    beat_dir_path,
):
    shifts = [-12, -6, 0, 6, 12]
    for shift in tqdm(shifts, desc=song_name):
        shifted_y = y if shift == 0 else pitch_shift(y, sr, n_steps=shift)
        save_name1 = song_name if shift == 0 else f"{song_name}shift{shift}"
        for stretch_i in range(5):  # one unstretched pass plus four stretch rates
            if stretch_i == 0:
                save_name2 = f"{save_name1}original" if shift == 0 else save_name1
                beat_path = f"{beat_dir_path}{song_name}.BEAT.TXT"
                melspec = convertAudio2MelSpec(wav_path)  # 100Hz
                activation, downbeats = convertBeatText2Activation(
                    beat_path, song_length=len(melspec), units="ms"
                )
                bpm = convertBeatText2Bpm(beat_path, len(melspec))
                max_bpm = np.max(bpm)
            else:
                max_rate = 300 / (max_bpm + 10)
                stretch_rates = [None, 0.5, 0.75, max_rate / 2, max_rate - 0.01]
                # stretch_rate = random.choice(np.arange(0.5, max_rate, 0.05))
                stretch_rate = stretch_rates[stretch_i]
                rounded_rate = np.round(stretch_rate, decimals=2)
                save_name2 = f"{save_name1}stretch{stretch_i}x{rounded_rate}"
                stretched_y = time_stretch(shifted_y, stretch_rate)
                melspec = convertAudio2MelSpec(None, True, stretched_y, sr)  # 100Hz
                stretched_activation = stretch_beat(activation, stretch_rate)
                stretched_bpm = stretch_bpm(bpm, stretch_rate)
                stretched_downbeats = (np.rint(downbeats / stretch_rate)).astype(
                    np.int64
                )
            beattheta = activation2beattheta(
                activation if stretch_i == 0 else stretched_activation
            )
            bartheta = activation2bartheta(
                activation if stretch_i == 0 else stretched_activation,
                downbeats if stretch_i == 0 else stretched_downbeats,
            )
            assert np.max(bartheta) > 0.5
            assert np.max(beattheta) > 0.5
            features = [
                ["melspec", melspec],
                ["activation", activation if stretch_i == 0 else stretched_activation],
                ["bpm", bpm if stretch_i == 0 else stretched_bpm],
                ["beattheta", beattheta],
                ["downbeattheta", bartheta],
            ]
            for feature in features:
                if feature[0] == "bpm":
                    feature[1][feature[1] > 300] = 300
                feature[1] = np.ascontiguousarray(feature[1])
                feature_length = len(feature[1])
                cropped_features = (
                    [feature[1]]
                    if feature_length < crop_size
                    else librosa.util.frame(
                        x=feature[1],
                        frame_length=crop_size,
                        hop_length=crop_size,
                        axis=0,
                    )
                )
                for index, cropped_feature in enumerate(cropped_features):
                    cropped_feature[cropped_feature < 0] = 0
                    fname = feature[0]
                    if fname == "activation":
                        fname = "beat"
                    elif fname == "melspec":
                        fname = f"melspec_sr{sr}_nfft1024"
                    if "original" in save_name2:
                        path = gen_path(k, "test", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    if idx > 25 and (idx - 1) % 25 < 5 and "shift" not in save_name2:
                        path = gen_path(k, "valid", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    else:
                        path = gen_path(k, "train", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
Example #18
    def change_speed(self):
        speed_change = np.random.uniform(low=0.9, high=1.1)
        tmp = time_stretch(self.X, speed_change)
        minlen = min(self.orig.shape[0], tmp.shape[0])
        self.X *= 0
        self.X[0:minlen] = tmp[0:minlen]
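change_speed mutates self.X in place rather than returning a value; a sketch of the expected object state, assuming X and orig are equal-length 1-D float arrays (the attribute layout is inferred, not from the original source):

# s.X, s.orig = y.copy(), y   # hypothetical state before the call
# s.change_speed()            # s.X is zeroed then overwritten; its length never changes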
Example #19
def pitch(st, log_level, input_file, output_file, quantize_bits,
          skip_normalize, skip_quantize, skip_input_filter, skip_output_filter,
          skip_time_stretch, custom_time_stretch):

    log = logging.getLogger(__name__)
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    log.addHandler(sh)

    valid_levels = list(log_levels.keys())
    if (not log_level) or (log_level.upper() not in valid_levels):
        log.warning(
            f'Invalid log-level: "{log_level}", log-level set to "INFO", '
            f'valid log levels are {valid_levels}')
        log_level = 'INFO'

    log_level = log_levels[log_level]
    log.setLevel(log_level)

    log.info(f'loading: "{input_file}" at oversampled rate: {INPUT_SR}')
    y, s = load(input_file, sr=INPUT_SR)
    log.info('done loading')

    midrise, midtread = calc_quantize_function(quantize_bits, log)

    if skip_input_filter:
        log.info('skipping input anti aliasing filter')
    else:
        y = filter_input(y, log)

    resampled = scipy_resample(y, INPUT_SR, TARGET_SR, RESAMPLE_MULTIPLIER,
                               log)

    if skip_quantize:
        log.info('skipping quantize')
    else:
        # simulate analog -> digital conversion
        # TODO: midtread/midrise option?
        resampled = quantize(resampled, midtread, quantize_bits, log)

    pitched = adjust_pitch(resampled, st, skip_time_stretch, log)

    if skip_time_stretch:
        ratio = len(pitched) / len(resampled)
        log.info(
            '"skipping" time stretch: stretching back to original length...')
        pitched = time_stretch(pitched, ratio)

    if custom_time_stretch:
        # the f-prefix was missing here, so the ratio was never interpolated
        log.info(f'running custom time stretch of ratio: {custom_time_stretch}')
        pitched = time_stretch(pitched, custom_time_stretch)

    # oversample again (default factor of 4) to simulate ZOH
    # TODO: retest output against freq aliased sinc fn
    post_zero_order_hold = zero_order_hold(pitched, ZOH_MULTIPLIER, log)

    # TODO: try using scipy resample here?
    output = resample(np.asfortranarray(post_zero_order_hold),
                      TARGET_SR * ZOH_MULTIPLIER, OUTPUT_SR)

    if skip_output_filter:
        log.info('skipping output eq filter')
    else:
        output = filter_output(output, log)  # eq filter

    log.info(f'writing {output_file}, at sample rate {OUTPUT_SR} '
             f'with skip_normalize set to {skip_normalize}')

    if '.mp3' in output_file:
        write_mp3(output_file, output, OUTPUT_SR, not skip_normalize)
    else:
        af.write(output_file, output, OUTPUT_SR, '16bit', not skip_normalize)

    log.info(f'done! output_file at: {output_file}')
    return
Example #20
    def _speed_tune(self, sample):
        rate_ = np.random.uniform(1 - self.speed_tune, 1 + self.speed_tune)
        return time_stretch(sample.astype('float'), rate_)