def __getitem__(self, idx):
    if torch.is_tensor(idx):
        # Ensure the elements are always scalars
        idx = idx.tolist()
    audio_file_name = self.nsynth_meta_df.iloc[idx].note_str + '.wav'
    audio_pitch = self.nsynth_meta_df.iloc[idx].pitch
    audio_data, _ = librosa.load(os.path.join(self.audio_dir, audio_file_name), sr=self.sr)
    # Start offset in samples, derived from the 'part' index (16 kHz audio).
    mult = 0.25 + ((self.nsynth_meta_df.iloc[idx].part - 1) * 0.5)
    start_location = int(16000 * mult)
    audio_data = audio_data[start_location:start_location + self.sample_length]
    # n_fft = (sample_length - 1) * 2 yields exactly sample_length frequency bins,
    # so the waveform and spectrogram can be stacked along the feature axis.
    audio_data_stft = librosa.stft(audio_data, n_fft=(self.sample_length - 1) * 2)
    audio_data = librosa.mu_compress(audio_data, quantize=False)
    audio_data = librosa.util.normalize(audio_data)
    new_sample = np.concatenate([
        np.reshape(audio_data, (self.sample_length, 1)) * 2,
        np.abs(audio_data_stft),
        np.angle(audio_data_stft)
    ], axis=1)
    return new_sample, self.classes.index(audio_pitch)
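# Hedged shape check (not from the source): with a hypothetical sample_length of
# 257, n_fft = 512 gives 257 frequency bins, so the mu-compressed waveform, STFT
# magnitude, and STFT phase all align on axis 0 and concatenate on axis 1.
import numpy as np
import librosa

sample_length = 257
audio = np.random.uniform(-1.0, 1.0, sample_length).astype(np.float32)
stft = librosa.stft(audio, n_fft=(sample_length - 1) * 2)      # (257, n_frames)
wave = librosa.util.normalize(librosa.mu_compress(audio, quantize=False))
feat = np.concatenate([wave.reshape(sample_length, 1) * 2,
                       np.abs(stft), np.angle(stft)], axis=1)
print(feat.shape)                                              # (257, 1 + 2 * n_frames)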
def _mulaw_compression(wav):
    """Compress the waveform using mu-law compression."""
    wav = np.pad(wav, (cfg.win_length // 2, ), mode="reflect")
    # Trim so the length is a whole number of hops past one window.
    wav = wav[:((wav.shape[0] - cfg.win_length) // cfg.hop_length + 1) * cfg.hop_length]
    # Quantized mu-law codes lie in [-2**(bits-1), 2**(bits-1) - 1];
    # the offset shifts them into the unsigned range [0, 2**bits - 1].
    wav = 2**(cfg.num_bits - 1) + librosa.mu_compress(wav, mu=2**cfg.num_bits - 1)
    return wav
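# A minimal sketch of the inverse (not from the source; it assumes the same cfg
# fields, and _mulaw_decompression is a hypothetical name): remove the unsigned
# offset, then expand the signed mu-law codes back to floats in [-1, 1].
import librosa

def _mulaw_decompression(codes):
    signed = codes - 2**(cfg.num_bits - 1)
    return librosa.mu_expand(signed, mu=2**cfg.num_bits - 1, quantize=True)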
def getAllSamples(wavsDir, generatedWavsDir):
    realAudioList, generatedAudioList = [], []
    for wav, generatedWav in zip(os.listdir(wavsDir), os.listdir(generatedWavsDir)):
        # First load measures the duration; the second takes a random 2 s crop.
        audio, _ = librosa.load(wavsDir + '/' + wav, sr=24000)
        duration = librosa.get_duration(y=audio, sr=24000)
        offset = np.random.randint(0, duration - 2)
        audio, _ = librosa.load(wavsDir + '/' + wav, sr=24000, offset=int(offset), duration=2)
        quantizedAudio = librosa.mu_compress(audio)
        realAudioList.append(quantizedAudio)
        fakeAudio, _ = librosa.load(generatedWavsDir + '/' + generatedWav, sr=24000)
        generatedAudioList.append(fakeAudio)
    return realAudioList, generatedAudioList
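# Note (not from the source): librosa.get_duration can also read the duration
# directly from the file (filename= in older releases, path= in librosa >= 0.10),
# which avoids decoding each file twice just to measure its length.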
def test_mu_law_correctness(quantization_channels):
    # Test reconstruction: decode then re-encode should be the identity.
    mu_src = np.arange(0, quantization_channels).astype(int)
    src = KPB.mu_law_decoding(mu_src, quantization_channels=quantization_channels)
    mu_src_recon = KPB.mu_law_encoding(src, quantization_channels=quantization_channels)
    np.testing.assert_equal(mu_src, mu_src_recon)

    # Test against librosa: map its [-1, 1] output onto integer bins.
    resol = 1 / (2 ** 16)
    src = np.arange(-1.0, 1.0, resol).astype(np.float32)
    mu = quantization_channels - 1
    mu_src_ref = librosa.mu_compress(src, mu=mu, quantize=False)
    mu_src_ref = (mu_src_ref + 1.0) / 2.0 * mu + 0.5
    mu_src_ref = mu_src_ref.astype(int)
    mu_src_kapre = KPB.mu_law_encoding(
        tf.convert_to_tensor(src), quantization_channels=quantization_channels
    )
    np.testing.assert_equal(mu_src_ref, mu_src_kapre.numpy())
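# Hedged sketch (not part of the test suite): librosa's own round trip, which
# the reference mapping above relies on.
import numpy as np
import librosa

x = np.linspace(-1.0, 1.0, 9)
y = librosa.mu_compress(x, mu=255, quantize=False)    # compand to [-1, 1]
x_rec = librosa.mu_expand(y, mu=255, quantize=False)  # invert, up to float error
np.testing.assert_allclose(x, x_rec, atol=1e-9)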
def getDataset(wavsDir, textDir):
    audioList, textList = [], []
    for wav in os.listdir(wavsDir):
        audio, _ = librosa.load(wavsDir + '/' + wav, sr=24000)
        duration = librosa.get_duration(y=audio, sr=24000)
        offset = np.random.randint(0, duration - 2)
        audio, _ = librosa.load(wavsDir + '/' + wav, sr=24000, offset=int(offset), duration=2)
        quantizedAudio = librosa.mu_compress(audio)
        # 2 s at 24 kHz is exactly 48000 samples; trim (rather than drop the last
        # sample) so the reshape below always sees 48000 elements.
        quantizedAudio = tf.reshape(quantizedAudio[:48000], (1, 48000, 1))
        audioList.append(quantizedAudio)
        textFile = wav.split('.')[0] + ".txt"
        with open(os.path.join(textDir, textFile), 'r', encoding='utf-8') as f:
            content = f.read()
        textList.append(content)
    audioDataset = tf.concat(audioList, axis=0)
    textDataset = BERT_MODEL(textList)
    return audioDataset, textDataset
def mulaw_encode(samples):
    # Rescale to -1.0..1.0. Encode to -128..127. Return 0..255.
    return (librosa.mu_compress(samples / peak_amplitude(samples), quantize=True)
            + 128).astype('uint8')
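# A plausible decoder (not from the source; mulaw_decode is a hypothetical name,
# and the original peak must be carried separately since encoding discards it).
import numpy as np
import librosa

def mulaw_decode(encoded, peak):
    # uint8 0..255 -> signed codes -128..127 -> floats -1.0..1.0 -> original scale.
    return librosa.mu_expand(encoded.astype(np.int16) - 128, quantize=True) * peak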
def mu_compress(wav, hop_length=200, frame_length=800, bits=10):
    wav = np.pad(wav, (frame_length // 2, ), mode="reflect")
    # Trim to a whole number of hops past one frame.
    wav = wav[:((wav.shape[0] - frame_length) // hop_length + 1) * hop_length]
    # Shift the signed codes into the unsigned range [0, 2**bits - 1].
    wav = 2**(bits - 1) + librosa.mu_compress(wav, mu=2**bits - 1)
    return wav
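# Hedged usage sketch: with the defaults above (bits=10), the returned codes are
# integers in [0, 1023], ready for an embedding lookup or categorical loss.
import numpy as np

wav = np.random.uniform(-1.0, 1.0, 16000)
codes = mu_compress(wav)
assert codes.min() >= 0 and codes.max() <= 2**10 - 1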
def mu_compress(x: np.ndarray, p):
    "Mu compress from C, W in [-1., 1.] to C, W in [-1., 1.]"
    return librosa.mu_compress(x, mu=p.n_classes - 1, quantize=False)
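# A plausible counterpart (not from the source): the matching expansion with the
# same mu, assuming the same parameter object p.
def mu_expand(x: np.ndarray, p):
    "Mu expand from C, W in [-1., 1.] to C, W in [-1., 1.]"
    return librosa.mu_expand(x, mu=p.n_classes - 1, quantize=False)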
def __init__(self, track_list, x_len, y_len=1, bitrate=16, twos_comp=True,
             num_classes=256, store_tracks=False, encoder=None, is_scalar=False,
             sample_step=2000, gc=None, class_label=None):
    self.data = []
    self.tracks = []
    self.receptive_field = x_len
    self.y_len = y_len
    self.num_channels = 1
    self.num_classes = num_classes
    self.bitrate = bitrate
    self.datarange = (-2**(bitrate - 1), 2**(bitrate - 1) - 1)
    self.twos_comp = twos_comp
    self.is_scalar = is_scalar
    self.data_len = 0
    self.sample_step = sample_step
    self.tracks_buckets = {}
    self.class_label = class_label
    self.gc = gc

    if encoder is None:
        self.encoder = MuEncoder(self.datarange)

    if not track_list:
        raise FileNotFoundError(
            'The data directory contains no file with postfix .wav')
    self.data_root = os.path.dirname(track_list[0])

    self.track_list = track_list
    store_tracks = True
    for idx, track in enumerate(track_list):
        audio, dtype, sample_rate = self._load_audio_from_wav(track)
        audio = self.trim_silence(audio)
        audio = audio.reshape(-1, 1)
        # Default mu=255 quantization gives codes in [-128, 127];
        # the +128 offset shifts them to [0, 255].
        audio = librosa.mu_compress(audio) + 128

        if store_tracks:
            self.tracks.append({
                'name': track,
                'audio': audio,
                'sample_rate': sample_rate
            })

        # Sampling is non-overlapping (stride = receptive_field), so a larger
        # x_len yields fewer training examples.
        for i in range(0, len(audio) - self.receptive_field - y_len,
                       self.receptive_field):
            self.data.append({
                'file_idx': idx,
                'start_idx': i,
            })

    self.dtype = np.dtype('int16')
    self.sample_rate = 16000
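# Hedged sketch (not from the source): a hypothetical __getitem__ showing how the
# (file_idx, start_idx) entries built above could be turned into (input, target)
# segments, assuming store_tracks left the encoded audio in self.tracks.
def __getitem__(self, idx):
    entry = self.data[idx]
    audio = self.tracks[entry['file_idx']]['audio']
    start = entry['start_idx']
    x = audio[start:start + self.receptive_field]
    y = audio[start + self.receptive_field:start + self.receptive_field + self.y_len]
    return x, y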