def predict(self, path):
    """Run inference on a single utterance and return the decoded text.

    Args:
        path: Either a filesystem path to a ``.wav`` file (string containing
            ``'.wav'``) or already-loaded raw audio samples, which are passed
            straight to ``feature.load_audio(wav=...)``.

    Returns:
        The decoded transcript for the first (and only) batch element.
    """
    self.eval()
    # BUG FIX: the original ``if path.find('.wav'):`` was inverted in both
    # edge cases -- str.find returns -1 (truthy) when '.wav' is absent and
    # 0 (falsy) when the path starts with '.wav'.  Use a membership test,
    # and guard with isinstance so raw (non-string) audio input does not
    # crash on the attribute lookup.
    if isinstance(path, str) and '.wav' in path:
        wav = feature.load_audio(wav_path=path, wav=None)
    else:
        wav = feature.load_audio(wav=path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add batch dimension -> (1, freq, time); TODO confirm layout
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    text = self.decode(out, out_len)
    self.train()  # restore training mode for the caller
    return text[0]
def __getitem__(self, index):
    """Return the (spectrogram, encoded transcript) pair at *index*.

    The transcript is mapped character-by-character through ``self.labels``;
    characters without a mapping (or mapped to a falsy id) are dropped.
    """
    wav_name, raw_text = self.idx[index]
    audio = feature.load_audio(os.path.join(dataPath, wav_name))
    spect = feature.spectrogram(audio)
    encoded = [self.labels.get(ch) for ch in raw_text]
    transcript = [label for label in encoded if label]
    return spect, transcript
def __getitem__(self, index):
    """Fetch one example: spectrogram plus vocabulary-encoded transcript.

    Characters missing from ``self.vocabulary`` (or mapped to a falsy id)
    are silently skipped.
    """
    wav_path, raw_text = self.idx[index]
    spect = feature.spectrogram(feature.load_audio(wav_path))
    ids = (self.vocabulary.get(ch) for ch in raw_text)
    transcript = [i for i in ids if i]
    return spect, transcript
def predict(self, path):
    """Run inference on the audio file at *path* and return the decoded text.

    Args:
        path: Path to an audio file readable by ``feature.load_audio``.

    Returns:
        The decoded transcript for the single batch element.
    """
    self.eval()
    wav = feature.load_audio(path)
    spec = feature.spectrogram(wav)
    spec.unsqueeze_(0)  # add batch dimension -> (1, freq, time); TODO confirm layout
    # FIX: removed dead local ``x_lens = spec.size(-1)`` -- it was never used.
    out = self.cnn(spec)
    out_len = torch.tensor([out.size(-1)])
    text = self.decode(out, out_len)
    self.train()  # restore training mode for the caller
    return text[0]
def predict(f):
    """Decode the audio file *f* with the module-level model and decoder.

    Runs the CNN under ``no_grad``, softmaxes over the vocabulary axis,
    reorders to batch-time-vocab, and translates the top beam to text.
    """
    spec = feature.spectrogram(feature.load_audio(f))
    spec.unsqueeze_(0)  # batch of one
    with torch.no_grad():
        probs = F.softmax(model.cnn(spec), 1)
        probs_len = torch.tensor([probs.size(-1)])
        probs = probs.permute(0, 2, 1)  # B * T * V
        print("decoding")
        out, score, offset, out_len = decoder.decode(probs, probs_len)
        return translate(model.vocabulary, out[0][0], out_len[0][0])
def main():
    """Load a track, compute its onset-detection function, and print onsets."""
    file_path = 'music_data/shortName.flac'
    y = f.load_audio(file_path)

    # COMPUTE SPECTROGRAM (window 2048, hop 1024, 40 mel bands)
    log_mel_spectrogram = f.compute_spectrogram(y, 2048, 1024, 40)

    # COMPUTE ONSET DETECTION FUNCTION
    # skip normalization
    # norm_spectrogram = o.normalize_frequencies(log_mel_spectrogram)
    odf = o.compute_odf(log_mel_spectrogram)  # o.compute_odf(norm_spectrogram)

    # DETECT ONSETS above a fixed threshold
    peaks = o.apply_threshold(odf, 1500)

    print(odf)
    # print(peaks)
    for frame, strength in enumerate(peaks):
        if strength > 0:
            # presumably 100 frames per second -- TODO confirm hop/sr ratio
            print(frame / 100, ' ', strength)
'street_fold{}_train.txt'.format(1)) evaluate_file = os.path.join(evaluation_setup_folder, 'street_fold{}_evaluate.txt'.format(1)) desc_dict = feature.load_desc_file(train_file, __class_labels) #make dict desc_dict.update(feature.load_desc_file( evaluate_file, __class_labels)) # contains labels for all the audio in the dataset #till here labels are stored for only 1st fold # Extract features for all audio files, and save it along with labels for audio_filename in os.listdir(audio_folder): audio_file = os.path.join(audio_folder, audio_filename) print('Extracting features and label for : {}'.format(audio_file)) y, sr = feature.load_audio( audio_file, mono=is_mono, fs=sr) #y = audio data [shape=(signal_length, channel)] mbe = None #now we extract mel band energies for mono or binaural audio if is_mono: mbe = feature.extract_mbe(y, sr, nfft, nb_mel_bands).T else: for ch in range(y.shape[0]): #for each channel extract mbe mbe_ch = feature.extract_mbe(y[ch, :], sr, nfft, nb_mel_bands).T if mbe is None: mbe = mbe_ch else: mbe = np.concatenate((mbe, mbe_ch), 1)