def SavespecArg(y_mix, y_inst, fname, shift, stretch):
    Savespec(y_mix, y_inst, fname)
    for sh in shift:
        y_mix_shift = pitch_shift(y_mix, C.SR, sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift%d" % (fname, sh))
        y_mix_shift = pitch_shift(y_mix, C.SR, -sh)
        y_inst_shift = pitch_shift(y_inst, C.SR, -sh)
        Savespec(y_mix_shift, y_inst_shift, "%s_shift-%d" % (fname, sh))
    for st in stretch:
        y_mix_stretch = time_stretch(y_mix, st)
        y_inst_stretch = time_stretch(y_inst, st)
        Savespec(y_mix_stretch, y_inst_stretch,
                 "%s_stretch%d" % (fname, int(st * 10)))

def stretch_sound(x, rate=1.1):
    input_length = len(x)
    x = time_stretch(x, rate)
    if len(x) > input_length:
        return x[:input_length]
    else:
        return np.pad(x, (0, max(0, input_length - len(x))), "constant")

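A quick way to check the contract of stretch_sound: slowing down (rate < 1 lengthens, so the output is truncated) and speeding up (rate > 1 shortens, so the output is zero-padded) should both return exactly the input length. A minimal sketch, assuming the same `np` and `time_stretch` imports the snippet above relies on (the positional `rate` argument requires librosa < 0.10):

import numpy as np
from librosa.effects import time_stretch

x = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000).astype(np.float32)
for rate in (0.8, 1.25):
    assert len(stretch_sound(x, rate)) == len(x)  # length is always preserved
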
def edit_sounds(path, pitch, length):
    sr = 44100
    output_path = get_random_name('wav')
    speed = HIGHLIGHT_LENGTH / length
    y, sr = librosa.load(path, sr=sr)
    write_wav(output_path, time_stretch(pitch_shift(y, sr, pitch), speed), sr)
    return output_path

def get_time_stretches(audio_signal):
    versions = []
    for t in time_stretch_factors:
        if t == 1:
            versions.append(audio_signal)
        else:
            versions.append(time_stretch(audio_signal, t))
    return versions

def generate_time_stretch_augmentation(target_directory, input_directory):
    audio_file_extension = ".wav"
    if not os.path.exists(target_directory):
        os.makedirs(target_directory)
    for root, dirs, files in os.walk(input_directory):
        for file in files:
            if file.endswith(audio_file_extension):
                audio_path = os.path.join(root, file)
                signal, sample_rate = librosa.load(audio_path, sr=None)
                # time_stretch rate < 1.0 slows the audio down, rate > 1.0
                # speeds it up, so "slower" gets 0.88 and "faster" gets 1.2
                librosa.output.write_wav(
                    target_directory + '/' + 'slower' + file,
                    time_stretch(signal, 0.88), sample_rate)
                librosa.output.write_wav(
                    target_directory + '/' + 'faster' + file,
                    time_stretch(signal, 1.2), sample_rate)

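Note that librosa.output.write_wav was removed in librosa 0.8.0, so the snippet above only runs on older releases. A hedged sketch of the same augmentation on current librosa, writing with the soundfile package instead (write_stretched is a hypothetical helper, not from the source):

import os
import librosa
import soundfile as sf

def write_stretched(audio_path, target_directory, rate, tag):
    # rate < 1.0 slows the audio down, rate > 1.0 speeds it up
    signal, sample_rate = librosa.load(audio_path, sr=None)
    stretched = librosa.effects.time_stretch(signal, rate=rate)
    sf.write(os.path.join(target_directory, tag + os.path.basename(audio_path)),
             stretched, sample_rate)
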
def time_stretch(self, rate, **kwargs):
    """
    Time-stretch the audio time series and return a new object.

    This method is a wrapper over librosa_'s ``time_stretch`` function.
    """
    new = self.copy()
    new.data = time_stretch(new.data, rate, **kwargs)
    return new

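For context, a minimal sketch of a host class this wrapper could live on. The Audio class and its fields here are illustrative assumptions, not the source's API; they only mirror the copy()/data contract the method relies on:

import copy
import numpy as np
from librosa.effects import time_stretch

class Audio:
    """Toy container with the copy()/data contract the wrapper expects."""
    def __init__(self, data, sr):
        self.data = data  # 1-D float time series
        self.sr = sr      # sample rate; time stretching leaves it unchanged

    def copy(self):
        return copy.deepcopy(self)

    def time_stretch(self, rate, **kwargs):
        new = self.copy()
        new.data = time_stretch(new.data, rate=rate, **kwargs)
        return new

clip = Audio(np.random.randn(22050).astype(np.float32), sr=22050)
faster = clip.time_stretch(2.0)  # ~half the samples; `clip` is left untouched
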
def LoadAudio_Arg(fname, pitch_shift_steps, time_stretch_rate):
    # The arguments are renamed here: the original names shadowed librosa's
    # pitch_shift/time_stretch functions, so the calls below would have tried
    # to call the numeric arguments themselves.
    y, sr = load(fname, sr=C.SR)
    if sr != C.SR:
        y = resample(y, sr, C.SR)
    y = pitch_shift(y, C.SR, pitch_shift_steps)
    y = time_stretch(y, time_stretch_rate)
    spec = stft(y, n_fft=C.FFT_SIZE, hop_length=C.H, win_length=C.FFT_SIZE)
    mag = np.abs(spec)
    mag /= np.max(mag)
    phase = np.exp(1.j * np.angle(spec))
    return mag, phase

def transform(self, x, sr):
    """Applies the audio conversion.

    # Arguments
        x: numpy array. The audio signal.
        sr: int. Audio sample rate.

    # Returns
        A transformed version (x, sr) of the input.
    """
    input_length = len(x)
    if self.stretch:
        x = time_stretch(x, self.stretch)
        if len(x) > input_length:
            x = x[:input_length]
        else:
            x = np.pad(x, (0, max(0, int(input_length - len(x)))), "constant")
    if self.shift:
        # pad at the start or the end with equal probability
        x = np.pad(x,
                   (int(self.shift * sr), 0) if np.random.random() < 0.5
                   else (0, int(self.shift * sr)),
                   'constant')
        if len(x) > input_length:
            x = x[:input_length]
    if self.noise:
        x = x + self.noise * np.random.randn(len(x))
    if self.bg_noise_dir:
        # NOTE: assumes a project-local glob(directory, pattern) helper,
        # not the single-argument stdlib glob.glob
        bg_noise_data = glob(self.bg_noise_dir, "*.wav")
        index_chosen_bg_file = int(len(bg_noise_data) * np.random.random())
        x_bg, sr_bg = load_audio(
            os.path.join(self.bg_noise_dir, bg_noise_data[index_chosen_bg_file]))
        # start at a random offset and tile until the noise covers the input
        x_bg_rand = x_bg[int(len(x_bg) * np.random.random()):]
        while input_length > len(x_bg_rand):
            x_bg_rand = np.concatenate([x_bg_rand, x_bg])
        x_bg = x_bg_rand[:input_length]
        x = x + (x_bg * self.bg_noise_volume)
    if self.volume:
        x = x * self.volume
    return x, sr

def __call__(self, signal):
    rate = random.uniform(self.scale_min, self.scale_max)
    signal = (signal / (2**15)).astype(np.float32)
    stretched_signal = time_stretch(signal, rate)
    stretched_signal = (stretched_signal * (2**15)).astype(np.int16)
    new_length = len(stretched_signal)
    if new_length < len(signal):
        # shorter than the input: pad with zeros split at a random point
        padding = np.zeros(len(signal) - new_length, dtype=np.int16)
        index = np.random.randint(0, len(padding) + 1)
        return np.concatenate(
            (padding[:index], stretched_signal, padding[index:]))
    # longer than the input: take a random crop of the original length
    index = np.random.randint(0, new_length - len(signal) + 1)
    return stretched_signal[index:index + len(signal)]

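The division and multiplication by 2**15 above exist because librosa expects float input in [-1, 1) while the signal arrives as 16-bit PCM. A compressed sketch of that round trip on its own (illustrative values, not the source's data):

import numpy as np
from librosa.effects import time_stretch

pcm = (np.sin(2 * np.pi * 440 * np.arange(16000) / 16000) * 2**14).astype(np.int16)
as_float = (pcm / 2**15).astype(np.float32)           # int16 -> [-1, 1) floats
stretched = time_stretch(as_float, rate=1.2)
back_to_int16 = (stretched * 2**15).astype(np.int16)  # rescale to PCM range
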
def main():
    tta_speed = 0.9  # slow down (i.e. < 1.0)
    samples_per_sec = 16000
    test_fns = sorted(glob('data/test/audio/*.wav'))
    tta_dir = 'data/tta_test/audio'
    for fn in tqdm(test_fns):
        basename = bn(fn)  # bn/jp are likely aliases for os.path.basename/join
        rate, data = wf.read(fn)
        assert len(data) == samples_per_sec
        data = np.float32(data) / 32767
        data = effects.time_stretch(data, tta_speed)
        # slowing down lengthens the clip, so keep only the last second
        data = data[-samples_per_sec:]
        out_fn = jp(tta_dir, basename)
        wf.write(out_fn, rate, np.int16(data * 32767))

def stretch_wave(wave: np.ndarray, speed_factor: float = 0.8) -> np.ndarray:
    """
    @topic: Change the speed of an audio wave by the given speed_factor.
    @input: wave: audio wave; speed_factor: the time-stretch factor.
    @return: wave_stretch: stretched audio wave with the same length as the
             original wave.
    """
    wave_stretch = time_stretch(wave, speed_factor)
    # Truncate or pad the stretched wave back to the original length.
    if speed_factor < 1:  # the audio was lengthened
        wave_stretch = wave_stretch[:len(wave)]  # truncate to the original length
    elif speed_factor > 1:  # the audio was shortened
        pad_zero = np.zeros(len(wave) - len(wave_stretch))
        wave_stretch = np.hstack((wave_stretch, pad_zero))  # zero-pad at the end
    assert len(wave_stretch) == len(wave)
    return wave_stretch

def apply(self, waveform, **params):
    assert waveform.shape[1] <= self.max_duration * self.sr, \
        'waveform length > max_duration*sr'
    assert waveform.shape[0] == 1, 'waveform should have 1-channel'
    assert waveform.shape[1] > 0, 'waveform is empty'
    waveform = waveform.clone()
    # rate < 1.0 -- slow down, rate > 1.0 -- speed up
    rate = np.random.uniform(self.min_rate, self.max_rate)
    if waveform.shape[1] / rate >= self.max_duration * self.sr - 1000:
        # If the result would exceed max_duration, speed up by up to 2x instead
        rate = np.random.uniform(1., 2.)
    waveform = time_stretch(waveform[0].numpy(), rate)
    return torch.tensor(waveform, dtype=torch.float).unsqueeze(0)

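Because librosa works on NumPy arrays rather than tensors, the method above unwraps the single channel, stretches it, and rewraps the result. The core round trip in isolation, as a sketch assuming torch and librosa are installed:

import torch
from librosa.effects import time_stretch

waveform = torch.randn(1, 16000)  # (channels, samples), 1-channel float tensor
stretched = time_stretch(waveform[0].numpy(), rate=0.9)  # numpy in, numpy out
waveform = torch.tensor(stretched, dtype=torch.float).unsqueeze(0)
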
def time_stret(x, sr=16000, length=1):
    x = effects.time_stretch(x, np.random.uniform(.5, 1.5))
    # treat the stretched clip as spanning `length` seconds, so its implied
    # sample rate is len(x) / length; resample back to sr to land near
    # sr * length samples
    x = librosa.resample(x, x.shape[0] / length, sr)
    if x.shape[0] > (sr * length):
        return x[:(sr * length)]
    return np.pad(x, [0, (sr * length) - x.shape[0]])

def speed_tune(sample: np.ndarray, max_tune: float) -> np.ndarray:
    rate = random.uniform(1 - max_tune, 1 + max_tune)
    return time_stretch(sample, rate)

def change_tempo(audio, factor=1.0):
    return time_stretch(audio, factor)

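A reminder of the rate semantics shared by every snippet here: factors above 1.0 shorten the audio (faster) and factors below 1.0 lengthen it (slower), with pitch unchanged. A minimal check using change_tempo, assuming the same numpy and time_stretch imports as above:

import numpy as np

audio = np.random.randn(22050).astype(np.float32)  # ~1 s at 22.05 kHz
assert len(change_tempo(audio, 2.0)) < len(audio)  # faster -> fewer samples
assert len(change_tempo(audio, 0.5)) > len(audio)  # slower -> more samples
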
def main():
    # guard len(sys.argv) first so a bare invocation cannot raise IndexError
    if len(sys.argv) < 2 or (sys.argv[1] not in ('train', 'datagen')
                             and len(sys.argv) < 3):
        print(USAGE_STRING)
    else:
        mode = sys.argv[1]
        if mode == 'test':
            fname = sys.argv[2]
            # Minimum window duration
            MIN_DURATION = 20
            spec = Spectrogram(2048, 128, 6)
            # TODO: Load the classifier weights
            # Use scipy's wavfile.read(...) to process our input
            # and re-sample it to 22050Hz (same as the training examples)
            rate, data = io.read(fname, mmap=True, backend='scipy', rate=22050)
            # Ensure that the window size is an even number
            wsize = (rate * MIN_DURATION) // 1000
            if wsize % 2 == 1:
                wsize += 1
            # Compute the speech segments to feed into the classifier
            speech_segments = segmentation.segment_speech(
                data, np.ones(len(data)), wsize)
            cepstra = []
            for segment in speech_segments:
                ts_ratio = len(segment) / (0.7 * rate)
                segment = segment / np.max(segment)
                segment = time_stretch(segment, ts_ratio)
                cep = spec.compute_mel_cepstrum(segment.astype('float32'),
                                                54, (0, 8000))
                if cep.shape[0] != cep.shape[1]:
                    interpolant = interp1d(np.linspace(0, 1, cep.shape[1]),
                                           cep, axis=1)
                    cep = interpolant(np.linspace(0, 1, cep.shape[0]))
                cepstra.append(cep.ravel())
            # Feed the cepstra into the network sequentially
            network = pickle.load(open(TRAINED_DUMP, "rb"))
            labels = pickle.load(open(LABELS_FILE, 'rb'))
            predictions = [network.predict(cep) for cep in cepstra]
            # Get the index of the 1 in each one-hot prediction vector
            predictions = [np.argmax(p) for p in predictions]
            # Convert to words
            predictions = [labels['label_names'][p] for p in predictions]
            print("\n\nNetwork's estimate:\n\n", predictions)
        elif mode == 'train':
            data_file = gzip.open(DATA_PKL_GZ)
            print('== Loading datasets...')
            data = pickle.load(data_file)
            train_x = data['train_x']
            train_y = data['train_y']
            validate_x = data['validate_x']
            validate_y = data['validate_y']
            print('== Done loading.')
            # Input dimension (2916 for the current dataset)
            idim = train_x[0].shape[0]
            # Output dimension (1011)
            odim = train_y[0].shape[0]
            # Hidden layer dimensions
            hdims = (100,)
            network = denseffn.DenseFFN(ACT_FUNC, idim, *hdims, odim)
            print("Training network with validation for params: "
                  "EPOCHS={}, RATE={}, ACTIVATION={}".format(
                      EPOCHS, RATE, ACT_FUNC))
            result = network.train(train_x, train_y, validate_x, validate_y,
                                   epochs=EPOCHS, rate=RATE)
            print("\tTraining results={}".format(result))
            # Dump the trained network so it can be used for classification later
            with open(TRAINED_DUMP, 'wb') as nnet_file:
                pickle.dump(network, nnet_file,
                            protocol=pickle.HIGHEST_PROTOCOL)
        elif mode == 'datagen':
            generate_dataset.generate_data()
        else:
            print(USAGE_STRING)

def create_dataset_for_one_song(
    song_name,
    wav_path,
    y,
    sr,
    k,
    idx,
    npy_dataset_name,
    crop_size,
    beat_dir_path,
):
    shifts = [-12, -6, 0, 6, 12]
    for shift in tqdm(shifts, desc=song_name):
        shifted_y = y if shift == 0 else pitch_shift(y, sr, n_steps=shift)
        save_name1 = song_name if shift == 0 else f"{song_name}shift{shift}"
        for stretch_i in range(5):  # stretch 5 times
            if stretch_i == 0:
                save_name2 = f"{save_name1}original" if shift == 0 else save_name1
                beat_path = f"{beat_dir_path}{song_name}.BEAT.TXT"
                melspec = convertAudio2MelSpec(wav_path)  # 100Hz
                activation, downbeats = convertBeatText2Activation(
                    beat_path, song_length=len(melspec), units="ms"
                )
                bpm = convertBeatText2Bpm(beat_path, len(melspec))
                max_bpm = np.max(bpm)
            else:
                max_rate = 300 / (max_bpm + 10)
                stretch_rates = [None, 0.5, 0.75, max_rate / 2, max_rate - 0.01]
                # stretch_rate = random.choice(np.arange(0.5, max_rate, 0.05))
                stretch_rate = stretch_rates[stretch_i]
                rounded_rate = np.round(stretch_rate, decimals=2)
                save_name2 = f"{save_name1}stretch{stretch_i}x{rounded_rate}"
                stretched_y = time_stretch(shifted_y, stretch_rate)
                melspec = convertAudio2MelSpec(None, True, stretched_y, sr)  # 100Hz
                stretched_activation = stretch_beat(activation, stretch_rate)
                stretched_bpm = stretch_bpm(bpm, stretch_rate)
                stretched_downbeats = (np.rint(downbeats / stretch_rate)).astype(
                    np.int64
                )
            beattheta = activation2beattheta(
                activation if stretch_i == 0 else stretched_activation
            )
            bartheta = activation2bartheta(
                activation if stretch_i == 0 else stretched_activation,
                downbeats if stretch_i == 0 else stretched_downbeats,
            )
            assert np.max(bartheta) > 0.5
            assert np.max(beattheta) > 0.5
            features = [
                ["melspec", melspec],
                ["activation", activation if stretch_i == 0 else stretched_activation],
                ["bpm", bpm if stretch_i == 0 else stretched_bpm],
                ["beattheta", beattheta],
                ["downbeattheta", bartheta],
            ]
            for feature in features:
                if feature[0] == "bpm":
                    feature[1][feature[1] > 300] = 300
                feature[1] = np.ascontiguousarray(feature[1])
                feature_length = len(feature[1])
                cropped_features = (
                    [feature[1]]
                    if feature_length < crop_size
                    else librosa.util.frame(
                        x=feature[1],
                        frame_length=crop_size,
                        hop_length=crop_size,
                        axis=0,
                    )
                )
                for index, cropped_feature in enumerate(cropped_features):
                    cropped_feature[cropped_feature < 0] = 0
                    fname = feature[0]
                    if fname == "activation":
                        fname = "beat"
                    elif fname == "melspec":
                        fname = f"melspec_sr{sr}_nfft1024"
                    if "original" in save_name2:
                        path = gen_path(k, "test", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    elif idx > 25 and (idx - 1) % 25 < 5 and "shift" not in save_name2:
                        path = gen_path(k, "valid", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )
                    else:
                        path = gen_path(k, "train", fname, npy_dataset_name)
                        np.save(
                            f"{path}{save_name2}-{str(index)}.{feature[0]}",
                            cropped_feature,
                        )

def change_speed(self):
    speed_change = np.random.uniform(low=0.9, high=1.1)
    tmp = time_stretch(self.X, speed_change)
    minlen = min(self.orig.shape[0], tmp.shape[0])
    self.X *= 0
    self.X[0:minlen] = tmp[0:minlen]

def pitch(st, log_level, input_file, output_file, quantize_bits, skip_normalize,
          skip_quantize, skip_input_filter, skip_output_filter,
          skip_time_stretch, custom_time_stretch):
    log = logging.getLogger(__name__)
    sh = logging.StreamHandler()
    sh.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
    log.addHandler(sh)
    valid_levels = list(log_levels.keys())
    if (not log_level) or (log_level.upper() not in valid_levels):
        log.warning(f'Invalid log-level: "{log_level}", log-level set to "INFO", '
                    f'valid log levels are {valid_levels}')
        log_level = 'INFO'
    log_level = log_levels[log_level]
    log.setLevel(log_level)

    log.info(f'loading: "{input_file}" at oversampled rate: {INPUT_SR}')
    y, s = load(input_file, sr=INPUT_SR)
    log.info('done loading')

    midrise, midtread = calc_quantize_function(quantize_bits, log)

    if skip_input_filter:
        log.info('skipping input anti aliasing filter')
    else:
        y = filter_input(y, log)

    resampled = scipy_resample(y, INPUT_SR, TARGET_SR, RESAMPLE_MULTIPLIER, log)

    if skip_quantize:
        log.info('skipping quantize')
    else:
        # simulate analog -> digital conversion
        # TODO: midtread/midrise option?
        resampled = quantize(resampled, midtread, quantize_bits, log)

    pitched = adjust_pitch(resampled, st, skip_time_stretch, log)

    if skip_time_stretch:
        ratio = len(pitched) / len(resampled)
        log.info('"skipping" time stretch: stretching back to original length...')
        pitched = time_stretch(pitched, ratio)

    if custom_time_stretch:
        log.info(f'running custom time stretch of ratio: {custom_time_stretch}')
        pitched = time_stretch(pitched, custom_time_stretch)

    # oversample again (default factor of 4) to simulate ZOH
    # TODO: retest output against freq aliased sinc fn
    post_zero_order_hold = zero_order_hold(pitched, ZOH_MULTIPLIER, log)

    # TODO: try using scipy resample here?
    output = resample(np.asfortranarray(post_zero_order_hold),
                      TARGET_SR * ZOH_MULTIPLIER, OUTPUT_SR)

    if skip_output_filter:
        log.info('skipping output eq filter')
    else:
        output = filter_output(output, log)  # eq filter

    log.info(f'writing {output_file}, at sample rate {OUTPUT_SR} '
             f'with skip_normalize set to {skip_normalize}')

    if '.mp3' in output_file:
        write_mp3(output_file, output, OUTPUT_SR, not skip_normalize)
    else:
        af.write(output_file, output, OUTPUT_SR, '16bit', not skip_normalize)

    log.info(f'done! output_file at: {output_file}')
    return

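The skip_time_stretch branch above is the classic resampling pitch-shifter trick: resampling changes pitch and duration together, so a stretch by len(pitched) / len(resampled) restores (approximately) the original duration while keeping the new pitch. A self-contained sketch of that idea with illustrative values, not this script's helpers:

import numpy as np
import librosa
from librosa.effects import time_stretch

y = np.sin(2 * np.pi * 220 * np.arange(44100) / 44100).astype(np.float32)
# Halving the sample count while keeping the nominal playback rate raises
# the pitch one octave and halves the duration...
shifted = librosa.resample(y, orig_sr=44100, target_sr=22050)
# ...so stretching by the length ratio (here ~0.5) restores the duration.
restored = time_stretch(shifted, rate=len(shifted) / len(y))
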
def _speed_tune(self, sample):
    rate_ = np.random.uniform(1 - self.speed_tune, 1 + self.speed_tune)
    return time_stretch(sample.astype('float'), rate_)