Example 1
import librosa
import numpy as np
from panns_inference import AudioTagging


def get_audio_embedding(path):
    try:
        # PANNs models expect 32 kHz mono input
        audio, _ = librosa.core.load(path, sr=32000, mono=True)
        audio = audio[None, :]  # (batch_size, segment_samples)
        at = AudioTagging(checkpoint_path=None, device='cuda')
        _, embedding = at.inference(audio)
        # L2-normalize so a dot product between embeddings is a cosine similarity
        embedding = embedding / np.linalg.norm(embedding)
        embedding = embedding.tolist()[0]
        return embedding
    except Exception as e:
        print("error with embedding:", path, e)
        return None
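A minimal usage sketch of the function above (the file names are hypothetical): because the embedding is L2-normalized before it is returned, the dot product of two embeddings is their cosine similarity.

import numpy as np

# Hypothetical paths, for illustration only
emb_a = get_audio_embedding('clip_a.wav')
emb_b = get_audio_embedding('clip_b.wav')
if emb_a is not None and emb_b is not None:
    # Embeddings come back L2-normalized, so a dot product is the cosine similarity
    print('cosine similarity: {:.3f}'.format(float(np.dot(emb_a, emb_b))))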
Example 2
import librosa
from panns_inference import AudioTagging


def pann(filepath):
    musical = False
    (audio, _) = librosa.core.load(filepath, sr=32000, mono=True)
    audio = audio[None, :]  # (batch_size, segment_samples)

    # Note: the stock panns_inference AudioTagging takes a model *instance*;
    # passing model='MobileNet' as a string assumes a modified wrapper.
    at = AudioTagging(checkpoint_path='/Users/jacksongoode/panns_data/MobileNetV2_mAP=0.383.pth',
                      device='cuda', model='MobileNet')
    (clipwise_output, embedding) = at.inference(audio)
    probs = list(clipwise_output[0])
    if probs.index(max(probs)) in range(137, 283):  # range of music-related classes in AudioSet
        print("\nIt's music!")
        musical = True
    else:
        print('\nNot music!')

    return musical
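Instead of hard-coding an AudioSet index range, the predicted class can be resolved to a name through the label list that panns_inference exports; a minimal sketch:

import numpy as np
from panns_inference import labels  # the 527 AudioSet class names, in output order


def top_class_name(clipwise_output):
    # clipwise_output: (classes_num,) probabilities for one clip
    return labels[int(np.argmax(clipwise_output))]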
Example 3
import os

import numpy as np
from panns_inference import AudioTagging
from tqdm import tqdm


def generate(data_dir, batch_size=256, device="cuda", sr=16000):
    # _make_save_dir and _build_data_loader are helpers defined elsewhere in
    # the source module. Note that the default PANNs checkpoint expects 32 kHz
    # audio; sr=16000 here follows the source.
    save_dir = _make_save_dir(data_dir=data_dir)

    inferenced_labels = []
    panns = AudioTagging(device=device)
    data_loader = _build_data_loader(data_dir=data_dir,
                                     batch_size=batch_size,
                                     sr=sr)

    # Tag each batch and keep the index of the highest-scoring class per clip
    for data_dict in tqdm(data_loader):
        inferences, _ = panns.inference(data_dict["audio"])
        inferenced_labels += inferences.argmax(axis=1).tolist()

    # Write one .npy file per clip holding its predicted class index
    for idx, inferenced_label in enumerate(inferenced_labels):
        np.save(
            os.path.join(save_dir, f"{idx}_audio_label.npy"),
            np.array([inferenced_label]),
        )
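A minimal sketch of reading the saved predictions back, assuming the save_dir layout produced by generate() above; the stored index maps to a class name via panns_inference.labels:

import os

import numpy as np
from panns_inference import labels


def load_label(save_dir, idx):
    # Each file holds a single class index for clip number idx
    class_index = int(np.load(os.path.join(save_dir, f"{idx}_audio_label.npy"))[0])
    return labels[class_index]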
Example 4
import librosa
from panns_inference import AudioTagging


class Voice2vec(object):

    def __init__(self, device='cuda'):
        # Load the default PANNs checkpoint once; reused for every clip
        self.at = AudioTagging(checkpoint_path=None, device=device)
        # self.sed = SoundEventDetection(checkpoint_path=None, device='cuda')

    def get_embedding(self, audio_path):
        audio = self.get_audio(audio_path)
        (clipwise_output, embedding) = self.at.inference(audio)
        return embedding

    @staticmethod
    def get_audio(audio_path):
        audio, _ = librosa.core.load(audio_path, sr=32000, mono=True)
        return audio[None, :]  # (batch_size, segment_samples)
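A short usage sketch (the audio path is hypothetical). Unlike Example 1, the model is loaded once in the constructor, so repeated get_embedding calls avoid reloading the checkpoint:

v2v = Voice2vec(device='cpu')  # or 'cuda' if a GPU is available
embedding = v2v.get_embedding('speaker_sample.wav')  # hypothetical path
print(embedding.shape)  # (1, embedding_size); 2048 for the default Cnn14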
Example 5
    def __init__(self, device='cuda'):
        self.at = AudioTagging(checkpoint_path=None, device=device)
Example 6
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    print('Save fig to {}'.format(out_fig_path))


if __name__ == '__main__':
    """Example of using panns_inferece for audio tagging and sound evetn detection.
    """
    device = 'cpu'  # 'cuda' | 'cpu'
    audio_path = 'resources/R9_ZSCveAHg_7s.wav'
    (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True)
    audio = audio[None, :]  # (batch_size, segment_samples)

    print('------ Audio tagging ------')
    at = AudioTagging(checkpoint_path=None, device=device)
    (clipwise_output, embedding) = at.inference(audio)
    """clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""

    print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(audio)
    """(batch_size, time_steps, classes_num)"""

    plot_sound_event_detection_result(framewise_output[0])
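The helper print_audio_tagging_result is referenced above but truncated out of the snippet; a minimal sketch consistent with the panns_inference README, printing the ten highest-scoring classes:

import numpy as np
from panns_inference import labels


def print_audio_tagging_result(clipwise_output):
    """clipwise_output: (classes_num,)"""
    sorted_indexes = np.argsort(clipwise_output)[::-1]
    for k in range(10):
        print('{}: {:.3f}'.format(labels[sorted_indexes[k]],
                                  clipwise_output[sorted_indexes[k]]))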
Example 7
print("<--- Audio Tagging ---->")

model = Transfer_Cnn14(sample_rate,
                       window_size,
                       hop_size,
                       mel_bins,
                       fmin,
                       fmax,
                       classes_num,
                       freeze_base=0)
cp = "checkpoints/main/holdout_fold=1/Transfer_Cnn14/pretrain=True/loss_type=clip_nll/augmentation=none/batch_size=1/freeze_base=False/1000_iterations.pth"

at = AudioTagging(model=model, checkpoint_path=cp, device='cuda')
preds = []
bird_name = "black-faced_antbird"
count = 0
for ii, filename in enumerate(glob.glob(audio_path)):
    print("Iteration ", ii)
    audio, _ = librosa.load(filename, sr=32000, mono=True)
    audio = audio[None, :]
    (clipwise_output, embedding) = at.inference(audio)

    nm = print_audio_tagging_result(clipwise_output[0])
    if nm == bird_name:
        count += 1

print("Accuracy: ", count / (ii + 1))