def collect_data_for_one_sample(self, sample_name):
    # Get root dirs
    unprocessed_dir = os.path.join(self.data_root_dir, 'mix_clean')
    groundtruth_spk0_dir = os.path.join(self.data_root_dir, 's1')
    groundtruth_spk1_dir = os.path.join(self.data_root_dir, 's2')

    # Get full file paths
    unprocessed_file = os.path.join(unprocessed_dir, sample_name + '.wav')
    groundtruth_spk0_file = os.path.join(groundtruth_spk0_dir, sample_name + '.wav')
    groundtruth_spk1_file = os.path.join(groundtruth_spk1_dir, sample_name + '.wav')

    # Collect signals
    unprocessed, _ = load(unprocessed_file, sr=self.samplerate_hz)
    groundtruth_spk0, _ = load(groundtruth_spk0_file, sr=self.samplerate_hz)
    groundtruth_spk1, _ = load(groundtruth_spk1_file, sr=self.samplerate_hz)

    # Determine start point of segment
    if unprocessed.size > self.samples_per_utterance:
        max_shift = unprocessed.size - self.samples_per_utterance
        start_point = np.random.randint(max_shift)
    else:
        start_point = 0

    # Cut out the segment (zero-pad if the file is too short)
    unprocessed = fix_length(unprocessed[start_point:], self.samples_per_utterance)
    groundtruth_spk0 = fix_length(groundtruth_spk0[start_point:], self.samples_per_utterance)
    groundtruth_spk1 = fix_length(groundtruth_spk1[start_point:], self.samples_per_utterance)
    groundtruth = np.stack([groundtruth_spk0, groundtruth_spk1], axis=0)
    return unprocessed, groundtruth
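
# A minimal sketch of the pad-or-crop invariant the method above relies on:
# both branches end up with exactly `samples_per_utterance` samples. The
# target length and the toy signals here are hypothetical, not from the
# original dataset.
import numpy as np
from librosa.util import fix_length

samples_per_utterance = 8_000
long_sig = np.random.randn(12_000)   # longer than the target: random crop
short_sig = np.random.randn(5_000)   # shorter than the target: zero-pad tail

start_point = np.random.randint(long_sig.size - samples_per_utterance)
cropped = fix_length(long_sig[start_point:], size=samples_per_utterance)
padded = fix_length(short_sig, size=samples_per_utterance)
assert cropped.size == padded.size == samples_per_utterance
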
def separate_single_mixture(self, mixture):
    original_length = mixture.size
    mixture_padded = fix_length(mixture, self.max_length)
    speaker_signals_padded = self.tasnet.model.predict(
        np.expand_dims(mixture_padded, axis=0))
    speaker_signals = fix_length(speaker_signals_padded[0, :, :],
                                 original_length, axis=1)
    return speaker_signals
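
# A hedged sketch of the pad -> predict -> trim pattern above. `fake_model`
# stands in for the TasNet Keras model (it just returns two copies of the
# input), and the lengths are arbitrary illustration values.
import numpy as np
from librosa.util import fix_length

max_length = 32_000
mixture = np.random.randn(20_000)

def fake_model(batch):                       # placeholder for model.predict
    return np.stack([batch, batch], axis=1)  # shape (1, 2, max_length)

padded = fix_length(mixture, size=max_length)
out = fake_model(padded[np.newaxis, :])
speakers = fix_length(out[0], size=mixture.size, axis=1)  # trim to input length
assert speakers.shape == (2, mixture.size)
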
def transform_audio(self, y):
    '''Compute the Mel spectrogram

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, n_mels)
            The Mel spectrogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    # melspectrogram returns power (|.|^2) values by default; the sqrt
    # brings them back to a magnitude-like scale.
    mel = np.sqrt(melspectrogram(y=y, sr=self.sr,
                                 n_fft=self.n_fft,
                                 hop_length=self.hop_length,
                                 n_mels=self.n_mels,
                                 fmax=self.fmax)).astype(np.float32)
    mel = fix_length(mel, n_frames)

    if self.log:
        mel = amplitude_to_db(mel, ref=np.max)

    return {'mag': mel.T[self.idx]}
def resample(y, orig_sr, target_sr, res_type='kaiser_best', fix=True,
             scale=False, **kwargs):
    # First, validate the audio buffer
    util.valid_audio(y, mono=False)

    if orig_sr == target_sr:
        return y

    ratio = float(target_sr) / orig_sr
    n_samples = int(np.ceil(y.shape[-1] * ratio))

    if res_type == 'scipy':
        y_hat = scipy.signal.resample(y, n_samples, axis=-1)
    else:
        y_hat = resampy.resample(y, orig_sr, target_sr, filter=res_type, axis=-1)

    if fix:
        y_hat = util.fix_length(y_hat, n_samples, **kwargs)

    if scale:
        y_hat /= np.sqrt(ratio)

    return np.ascontiguousarray(y_hat, dtype=y.dtype)
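
# Usage sketch for the resample helper above (it mirrors librosa's own
# resample). Assumes resampy is installed for the default 'kaiser_best'
# filter; the tone parameters are arbitrary illustration values.
import numpy as np

orig_sr, target_sr = 44_100, 22_050
t = np.linspace(0, 1.0, orig_sr, endpoint=False)
y = np.sin(2 * np.pi * 440.0 * t).astype(np.float32)

y_hat = resample(y, orig_sr, target_sr)
# With fix=True the output has exactly ceil(len(y) * target_sr / orig_sr) samples.
assert y_hat.shape[-1] == int(np.ceil(y.shape[-1] * target_sr / orig_sr))
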
def transform_audio(self, y):
    '''Compute the STFT magnitude and phase.

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT magnitude
        data['phase'] : np.ndarray, shape=(n_frames, 1 + n_fft//2)
            STFT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    D = stft(y, hop_length=self.hop_length, n_fft=self.n_fft)
    D = fix_length(D, n_frames)

    mag, phase = magphase(D)
    if self.log:
        mag = amplitude_to_db(mag, ref=np.max)

    return {'mag': mag.T[self.idx].astype(np.float32),
            'phase': np.angle(phase.T)[self.idx].astype(np.float32)}
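
# A small check of the magphase convention the method above relies on:
# D == mag * phase, so np.angle(phase) recovers the STFT phase. Toy signal
# and arbitrary parameters.
import numpy as np
from librosa import stft, magphase

y = np.random.randn(2048).astype(np.float32)
D = stft(y, n_fft=512, hop_length=128)
mag, phase = magphase(D)
assert np.allclose(D, mag * phase, atol=1e-6)
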
def to_stft(seq, nfft):
    """
    :param seq: Raw audio
    :param nfft: FFT window size for the STFT
    :return: STFT of the input seq, broken down into magnitude in one
             channel and phase in the other.
    """
    # Pad by nfft/2 so the final samples are still covered by a full frame
    nfft_padlen = int(len(seq) + nfft / 2)
    stft = lc.stft(fix_length(seq, nfft_padlen), n_fft=nfft)
    return np.array([np.abs(stft), np.angle(stft)]).transpose(1, 2, 0)
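
# Hypothetical usage of to_stft; assumes `import librosa as lc` and
# `from librosa.util import fix_length`, as the snippet above does. For a
# one-second 16 kHz buffer and nfft=512 the result stacks magnitude and
# phase in the last axis.
import numpy as np

seq = np.random.randn(16_000).astype(np.float32)
spec = to_stft(seq, nfft=512)
print(spec.shape)   # (257, n_frames, 2): 1 + nfft//2 bins, channels last
assert spec.shape[0] == 1 + 512 // 2 and spec.shape[2] == 2
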
def __getitem__(self, index):
    file_path, class_label = self.file_names[index]
    (seq, _) = load(file_path, sr=self.sample_rate, mono=True)
    seq = fix_length(seq, size=self.audio_length, mode='edge')
    return torch.cat((
        torch.LongTensor([class_label]),
        # torch.LongTensor(self.overlap_len)
        #      .fill_(utils.q_zero(self.q_levels)),
        utils.linear_quantize(torch.from_numpy(seq), self.q_levels)))
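
# fix_length forwards extra keyword arguments to np.pad, so mode='edge'
# (as used above) repeats the final sample instead of zero-padding, which
# avoids a hard discontinuity before quantization. Toy values only.
import numpy as np
from librosa.util import fix_length

seq = np.array([0.1, 0.2, 0.3])
print(fix_length(seq, size=6, mode='edge'))   # [0.1 0.2 0.3 0.3 0.3 0.3]
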
def to_image(seq, nfft):
    '''
    Spectrogram computation for a sequence seq

    Returns
    -------
    abs (magnitude) and angle (phase) of the spectrogram
    '''
    nfft_padlen = int(len(seq) + nfft / 2)
    stft = lc.stft(fix_length(seq, nfft_padlen), n_fft=nfft)
    return np.array([np.abs(stft), np.angle(stft)]).transpose(1, 2, 0)
def get_annotations(self, file_name, features, time_resolution):
    label_file = self.wav_to_labels[file_name]
    labels = read_csv(label_file, delimiter='\t', header=None)
    labels.columns = ['event_onset', 'event_offset', 'event_label']

    event_roll = event_list_to_event_roll(labels.to_dict('records'),
                                          self.label_list, time_resolution)

    if event_roll.shape[0] > features.shape[0]:
        event_roll = event_roll[:features.shape[0]]
    else:
        event_roll = fix_length(event_roll, features.shape[0], axis=0)

    assert event_roll.shape[0] == features.shape[0]
    return event_roll
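
# Sketch of the alignment step above: the event roll is cropped or
# zero-padded along axis 0 until it matches the feature frame count.
# The shapes here are hypothetical.
import numpy as np
from librosa.util import fix_length

features = np.zeros((100, 64))    # 100 frames of hypothetical features
event_roll = np.zeros((97, 10))   # 97 frames of labels

event_roll = fix_length(event_roll, size=features.shape[0], axis=0)
assert event_roll.shape[0] == features.shape[0]
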
def predict(file):
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.load(os.path.join(outdir, encoder_filename))

    with open(os.path.join(outdir, model_filename), "r") as model_json_handle:
        model_json = model_json_handle.read()
    model = model_from_json(model_json)
    model.load_weights(os.path.join(outdir, model_weights_filename))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer='adam')

    file_path = os.path.join(os.getcwd(), file)
    y, sr = librosa.load(file_path, res_type='kaiser_fast')
    prediction_feature = np.array([get_mfcc(y, sr)])

    # Note: predict_proba only exists on older Keras Sequential models;
    # on recent versions use model.predict instead.
    predicted_proba_vector = model.predict_proba(prediction_feature)
    predicted_proba = predicted_proba_vector[0]

    fixed_size = 44100
    centroid = spectral_centroid(y=y, sr=sr)
    frequency = np.average(centroid)
    centroid = fix_length(centroid, size=fixed_size)
    length = librosa.get_duration(y=y, sr=sr)

    result = {
        'file_path': file_path,
        'classes': {},
        'position': {
            'frequency': frequency,
            'length': length
        }
    }
    for i in range(len(predicted_proba)):
        category = label_encoder.inverse_transform(np.array([i]))
        result['classes'][category[0]] = format(predicted_proba[i], '.32f')
    return result
def getClassSplit(self, class_num=0, seq_len=64):
    file_path = ''
    for key in self.class_mapping:
        if self.class_mapping[key] == class_num:
            file_path = join(self.path, key)
    files = listdir(file_path)

    result = None
    pick_one = files[np.random.randint(0, len(files))]
    pick_one = join(file_path, pick_one)
    seq, _ = load(pick_one, sr=self.sample_rate, mono=True)
    seq = fix_length(seq, size=self.audio_length, mode='edge')

    while result is None or len(result) != seq_len:
        # Valid start indices run from 0 to len(seq) - seq_len inclusive.
        # (The original `len(seq - seq_len - 1)` subtracted from the array's
        # values instead of its length, so it did not bound the index.)
        start_idx = np.random.randint(0, len(seq) - seq_len + 1)
        result = seq[start_idx:start_idx + seq_len]
    return result
def transform_audio(self, y):
    '''Compute the HCQT

    Parameters
    ----------
    y : np.ndarray
        The audio buffer

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins, n_harmonics)
            The CQT magnitude
        data['dphase'] : np.ndarray, shape = mag.shape
            The CQT phase differential
    '''
    cqtm, phase = [], []

    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    for h in self.harmonics:
        C = cqt(y=y, sr=self.sr,
                hop_length=self.hop_length,
                fmin=self.fmin * h,
                n_bins=(self.n_octaves * self.over_sample * 12),
                bins_per_octave=(self.over_sample * 12))
        C = fix_length(C, n_frames)

        C, P = magphase(C)
        if self.log:
            C = amplitude_to_db(C, ref=np.max)
        cqtm.append(C)
        phase.append(P)

    cqtm = to_dtype(np.asarray(cqtm), self.dtype)
    phase = np.angle(np.asarray(phase))

    dphase = to_dtype(phase_diff(self._index(phase), self.conv), self.dtype)

    return {'mag': self._index(cqtm), 'dphase': dphase}
def transform_audio(self, y, func):
    '''Compute the transform

    Parameters
    ----------
    y : np.ndarray
        The audio buffer
    func : callable
        A CQT-like transform with the same signature as librosa's `cqt`

    Returns
    -------
    data : dict
        data['mag'] : np.ndarray, shape = (n_frames, n_bins)
            The CQT magnitude
        data['phase'] : np.ndarray, shape = mag.shape
            The CQT phase
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    C = func(y=y, sr=self.sr,
             hop_length=self.hop_length,
             fmin=self.fmin,
             n_bins=(self.n_octaves * self.over_sample * 12),
             bins_per_octave=(self.over_sample * 12))
    C = fix_length(C, n_frames)

    cqtm, phase = magphase(C)
    if self.log:
        cqtm = amplitude_to_db(cqtm, ref=np.max)

    return {'mag': cqtm.T.astype(np.float32)[self.idx],
            'phase': np.angle(phase).T.astype(np.float32)[self.idx]}
def loadTransform(audioChunk):
    if audioChunk.endswith(".wav"):
        windowNum = 2
        sound = load(audioChunk, sr=16000)
        window = windowNum * 16000   # hop of 2 s at 16 kHz

        # Slice the signal into 80,000-sample (5 s) chunks, advancing by
        # `window` samples, and zero-pad the final partial chunk.
        X = []
        a, b = 0, 80000
        while len(sound[0]) - b >= 0:
            X.append(sound[0][a:b])
            a += window
            b += window
        X.append(sound[0][a:b])
        X[-1] = fix_length(X[-1], 80000)

        os.remove(audioChunk)
        return np.array(X)
    else:
        newFile = convertFile(audioChunk)
        try:
            return loadTransform(newFile)
        except FileNotFoundError:
            return "Error with processing your request"
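
# Quick check of the chunking arithmetic in loadTransform: 80,000-sample
# (5 s at 16 kHz) windows advanced by 32,000 samples (2 s), with the last
# partial window zero-padded. Synthetic buffer, illustration only.
import numpy as np
from librosa.util import fix_length

sound = np.random.randn(200_000)
window, chunk_len = 2 * 16_000, 80_000
a, b, X = 0, chunk_len, []
while len(sound) - b >= 0:
    X.append(sound[a:b])
    a += window
    b += window
X.append(fix_length(sound[a:b], size=chunk_len))
assert all(len(x) == chunk_len for x in X)
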
def transform_audio(self, y):
    '''Compute the tempogram

    Parameters
    ----------
    y : np.ndarray
        Audio buffer

    Returns
    -------
    data : dict
        data['tempogram'] : np.ndarray, shape=(n_frames, win_length)
            The tempogram
    '''
    n_frames = self.n_frames(get_duration(y=y, sr=self.sr))

    tgram = tempogram(y=y, sr=self.sr,
                      hop_length=self.hop_length,
                      win_length=self.win_length).astype(np.float32)
    tgram = fix_length(tgram, n_frames)
    return {'tempogram': tgram.T[self.idx]}
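
# Hypothetical usage of the tempogram feature: a click track with two clicks
# per second (120 BPM) should concentrate tempogram energy near the 120 BPM
# lag. Illustration only; parameters are arbitrary.
import numpy as np
from librosa import clicks
from librosa.feature import tempogram

sr = 22_050
y = clicks(times=np.arange(0, 10, 0.5), sr=sr,
           click_duration=0.05, length=10 * sr)
tgram = tempogram(y=y, sr=sr, hop_length=512, win_length=384)
print(tgram.shape)   # (384, n_frames)
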
def hcqt(y, sr=22050, hop_size=256, fmin=32.7, bins_per_octave=60,
         n_octaves=6, harmonics=(0.5, 1, 2, 3, 4, 5)):
    """
    Harmonic CQT. Compute CQT at harmonics of `fmin`.
    See librosa for cqt params.
    """
    cqt_mag, cqt_phase = [], []
    n_frames = time_to_frames(get_duration(y=y, sr=sr), sr=sr,
                              hop_length=hop_size)
    for h in harmonics:
        y_cqt = cqt(y=y, sr=sr, hop_length=hop_size, fmin=fmin * h,
                    n_bins=n_octaves * bins_per_octave,
                    bins_per_octave=bins_per_octave,
                    res_type='kaiser_best')
        y_cqt = fix_length(y_cqt, n_frames)
        y_cqt_mag, y_cqt_phase = magphase(y_cqt)
        cqt_mag.append(y_cqt_mag)
        cqt_phase.append(y_cqt_phase)

    cqt_mag = np.asarray(cqt_mag).astype(np.float32)
    cqt_phase = np.angle(np.asarray(cqt_phase)).astype(np.float32)
    return cqt_mag, cqt_phase
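
# Hypothetical usage of hcqt on five seconds of a 220 Hz tone, restricted to
# two harmonics to keep the example light; each harmonic contributes one
# (n_bins, n_frames) magnitude plane.
import numpy as np

sr = 22050
t = np.arange(5 * sr) / sr
y = np.sin(2 * np.pi * 220.0 * t).astype(np.float32)
mag, phase = hcqt(y, sr=sr, harmonics=(1, 2))
print(mag.shape)   # (2, 360, n_frames): 2 harmonics, 6 octaves x 60 bins
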
fertility_griffin = '/home/jan/synthesized_audio/fertility_griffin'

# TODO: load each file, find the max length, pad all the rest to that same
# length, and save.
for f in files:
    dv = load(os.path.join(deepvoice, f))[0]
    et = load(os.path.join(efficient_tts, f))[0]
    t2 = load(os.path.join(tacotron2, f))[0]
    fert = load(os.path.join(fertility, f))[0]
    fert_grif = load(os.path.join(fertility_griffin, f))[0]
    ref = load(os.path.join(reference, f))[0]

    max_len = max(len(dv), len(et), len(t2), len(ref), len(fert), len(fert_grif))

    # Pad to the common length and duplicate the mono channel to stereo
    dv = fix_length(dv, max_len)[:, None].repeat(2, axis=1)
    et = fix_length(et, max_len)  # [:, None].repeat(2, axis=1)
    t2 = fix_length(t2, max_len)[:, None].repeat(2, axis=1)
    ref = fix_length(ref, max_len)[:, None].repeat(2, axis=1)
    fert = fix_length(fert, max_len)[:, None].repeat(2, axis=1)
    fert_grif = fix_length(fert_grif, max_len)[:, None].repeat(2, axis=1)

    sf.write(os.path.join(deepvoice, f), dv, 22050, subtype='PCM_16')
    sf.write(os.path.join(efficient_tts, f), et, 22050, subtype='PCM_16')
    sf.write(os.path.join(tacotron2, f), t2, 22050, subtype='PCM_16')
    sf.write(os.path.join(reference, f), ref, 22050, subtype='PCM_16')
    sf.write(os.path.join(fertility, f), fert, 22050, subtype='PCM_16')
    sf.write(os.path.join(fertility_griffin, f), fert_grif, 22050, subtype='PCM_16')