def get_audio_embedding(path):
    """Compute an L2-normalized PANNs audio embedding for the file at *path*.

    Loads the audio at 32 kHz mono (what PANNs models expect), runs
    AudioTagging inference, and normalizes the resulting embedding vector.

    Args:
        path: Path to an audio file readable by librosa.

    Returns:
        list[float] | None: The normalized embedding for the single-item
        batch, or None if loading or inference fails.
    """
    try:
        audio, _ = librosa.core.load(path, sr=32000, mono=True)
        audio = audio[None, :]  # (batch_size, segment_samples)
        # NOTE(review): the model is re-instantiated on every call, which is
        # expensive; consider caching it (cf. Voice2vec). Left as-is to
        # preserve the original behavior and interface.
        at = AudioTagging(checkpoint_path=None, device='cuda')
        _, embedding = at.inference(audio)
        embedding = embedding / np.linalg.norm(embedding)
        return embedding.tolist()[0]
    except Exception as e:
        # Fix: the caught exception was previously discarded, hiding the
        # failure cause; include it in the diagnostic output.
        print("error with embedding:", path, e)
        return None
class Voice2vec(object):
    """Thin wrapper around a PANNs AudioTagging model that converts an
    audio file into an embedding vector.

    The model is built once at construction time and reused across calls.
    """

    def __init__(self, device='cuda'):
        # Instantiate the tagging model a single time for reuse.
        self.at = AudioTagging(checkpoint_path=None, device=device)
        # self.sed = SoundEventDetection(checkpoint_path=None, device='cuda')

    def get_embedding(self, audio_path):
        """Return the model's embedding for the audio file at *audio_path*."""
        batch = self.get_audio(audio_path)
        clipwise_output, embedding = self.at.inference(batch)
        return embedding

    @staticmethod
    def get_audio(audio_path):
        """Load *audio_path* as 32 kHz mono and prepend a batch axis."""
        waveform, _ = librosa.core.load(audio_path, sr=32000, mono=True)
        # (batch_size, segment_samples)
        return waveform[None, :]
def pann(filepath,
         checkpoint_path='/Users/jacksongoode/panns_data/MobileNetV2_mAP=0.383.pth',
         device='cuda'):
    """Return True if the top class predicted for *filepath* is a music class.

    Loads the audio at 32 kHz mono, runs a PANNs MobileNetV2 tagger, and
    checks whether the highest-probability class index falls in the music
    range of the label set.

    Args:
        filepath: Path to an audio file readable by librosa.
        checkpoint_path: PANNs MobileNetV2 checkpoint to load. Parameterized;
            the default preserves the previously hard-coded user path.
        device: Inference device ('cuda' or 'cpu').

    Returns:
        bool: True when the arg-max class index is in [137, 283), described
        in the original code as the range of music classes.
    """
    audio, _ = librosa.core.load(filepath, sr=32000, mono=True)
    audio = audio[None, :]  # (batch_size, segment_samples)
    at = AudioTagging(checkpoint_path=checkpoint_path, device=device,
                      model='MobileNet')
    clipwise_output, _ = at.inference(audio)
    # np.argmax replaces the quadratic `probs.index(max(probs))` list idiom;
    # both return the index of the first maximum. (np is imported at module
    # level — see its use elsewhere in this file.)
    top_class = int(np.argmax(clipwise_output[0]))
    # 137 <= idx < 283: range of music classes in the label set.
    musical = 137 <= top_class < 283
    if musical:
        print("\nIt's music!")
    else:
        print('\nNot music!')
    return musical
def generate(data_dir, batch_size=256, device="cuda", sr=16000):
    """Run PANNs audio tagging over a dataset and persist each clip's
    predicted class index as a one-element .npy file.

    Args:
        data_dir: Root directory of the dataset.
        batch_size: Batch size for the data loader.
        device: Inference device passed to AudioTagging.
        sr: Sample rate the data loader resamples audio to.
    """
    save_dir = _make_save_dir(data_dir=data_dir)
    tagger = AudioTagging(device=device)
    loader = _build_data_loader(data_dir=data_dir, batch_size=batch_size, sr=sr)

    # First pass: collect the arg-max class index for every clip, in order.
    predicted = []
    for batch in tqdm(loader):
        clipwise, _ = tagger.inference(batch["audio"])
        predicted.extend(clipwise.argmax(axis=1).tolist())

    # Second pass: write one label file per clip, named by position.
    for idx, label in enumerate(predicted):
        out_path = os.path.join(save_dir, f"{idx}_audio_label.npy")
        np.save(out_path, np.array([label]))
# NOTE(review): this collapsed line is the tail of a plotting helper whose
# `def` header lies outside this chunk (it appends plot handles, adds a
# legend/axis labels, and saves the figure), fused with the script's demo
# entry point. The demo loads a 32 kHz mono clip, runs PANNs audio tagging,
# prints the tagging result, then runs sound event detection and plots the
# framewise output. Content preserved verbatim; presumably `labels`,
# `plt`, and the print/plot helpers are defined earlier in the file — TODO
# confirm against the full source.
lines.append(line) plt.legend(handles=lines) plt.xlabel('Frames') plt.ylabel('Probability') plt.ylim(0, 1.) plt.savefig(out_fig_path) print('Save fig to {}'.format(out_fig_path)) if __name__ == '__main__': """Example of using panns_inferece for audio tagging and sound evetn detection. """ device = 'cpu' # 'cuda' | 'cpu' audio_path = 'resources/R9_ZSCveAHg_7s.wav' (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True) audio = audio[None, :] # (batch_size, segment_samples) print('------ Audio tagging ------') at = AudioTagging(checkpoint_path=None, device=device) (clipwise_output, embedding) = at.inference(audio) """clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)""" print_audio_tagging_result(clipwise_output[0]) print('------ Sound event detection ------') sed = SoundEventDetection(checkpoint_path=None, device=device) framewise_output = sed.inference(audio) """(batch_size, time_steps, classes_num)""" plot_sound_event_detection_result(framewise_output[0])