Example #1
0
def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    """Return utterance embeddings and their labels, loading or computing them.

    The mode is selected by ``from_path``:

    * ``from_path`` given: ``embeds.npy`` and ``labels.npy`` are read from that
      directory and returned as NumPy arrays; the other arguments are ignored.
    * ``from_path`` is ``None``: every ``.flac`` file found recursively under
      ``data_path`` is embedded with a ``VoiceEncoder`` restored from
      ``ckpt_path``; the results are written to ``save_path`` and returned as
      Python lists.

    All required arguments are validated, and the output directory is created,
    *before* the encoder is loaded, so a missing argument or directory cannot
    discard a finished embedding run.

    :param from_path: directory containing previously saved ``embeds.npy`` and
        ``labels.npy``.
    :param ckpt_path: encoder checkpoint to restore; required when computing.
    :param data_path: root directory searched recursively for ``.flac`` files.
    :param save_path: directory the computed arrays are saved to; required
        when computing. Created (with parents) if it does not exist.
    :returns: ``(X, y)`` -- embeddings and labels. NOTE: these are lists when
        freshly computed and ``numpy.ndarray`` objects when loaded from disk.
    :raises ValueError: if computing and ``ckpt_path`` or ``save_path`` is
        ``None``.
    """
    if from_path is not None:
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only point from_path at files this project produced.
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
        return X, y

    # Fail fast: check everything the compute path needs before doing any
    # expensive work (model loading, embedding the whole dataset).
    if ckpt_path is None:
        raise ValueError('No checkpoint path provided')
    if save_path is None:
        raise ValueError('No save_path provided')

    # Heavy third-party imports are only needed on the compute path.
    from resemblyzer import preprocess_wav, VoiceEncoder
    from tqdm import tqdm

    # Make sure the destination exists now rather than discovering a missing
    # directory when np.save runs after the full embedding pass.
    save_dir = Path(save_path)
    save_dir.mkdir(parents=True, exist_ok=True)

    device = torch.device('cuda')
    encoder = VoiceEncoder(device=device, loss_device=device)
    encoder.load_ckpt(ckpt_path, device=device)
    encoder.eval()
    wav_fpaths = list(Path(data_path).glob("**/*.flac"))

    # Preprocess and collect the encoded utterance and label for each file
    X = []
    y = []
    for wav_fpath in tqdm(wav_fpaths):
        wav = preprocess_wav(wav_fpath)
        X.append(encoder.embed_utterance(wav).cpu().numpy())
        # The label is the name of the file's grandparent directory
        # (presumably the speaker id in the dataset layout -- confirm).
        y.append(wav_fpath.parent.parent.stem)

    # Save for testing
    np.save(save_dir / 'embeds.npy', X)
    np.save(save_dir / 'labels.npy', y)
    return X, y
Example #2
0
class Predictor:
    """Pairs a ``VoiceEncoder`` with an ``MLP`` classifier for inference.

    Both models are restored from checkpoints and switched to eval mode when
    the object is constructed.
    """

    def __init__(self,
                 clf_ckpt_path='exp/clv/mlp/mlp_best_val_loss.pt',
                 enc_ckpt_path='ckpt/pretrained.pt',
                 device=torch.device('cuda'),
                 num_class=381,
                 verbose=False):
        """
        Build the encoder and the classifier and load their checkpoints.

        :param clf_ckpt_path: checkpoint file for the MLP classifier.
        :param enc_ckpt_path: checkpoint file for the voice encoder.
        :param device: torch device passed to the encoder constructor and to
            both ``load_ckpt`` calls.
        :param num_class: number of classes the classifier is built with.
        :param verbose: when true, print how long model loading took.
        """
        start = timer()

        # Encoder first, then classifier; each is fully set up before being
        # attached to the instance.
        voice_encoder = VoiceEncoder(device=device, loss_device=device)
        voice_encoder.load_ckpt(enc_ckpt_path, device)
        voice_encoder.eval()
        self.encoder = voice_encoder

        mlp = MLP(num_class=num_class)
        mlp.load_ckpt(clf_ckpt_path, device)
        mlp.eval()
        self.classifier = mlp

        if not verbose:
            return
        print(
            f'Encoder and classifier models loaded successfully in {timer() - start}s'
        )

    def preprocess(self, f):
        """
        Run ``preprocess_wav`` on a waveform and return its result.

        :param f: either a filepath to an audio file or the waveform as a numpy array of floats.
        """
        return preprocess_wav(f)

    def predict(self, audio, topk=2):
        """
        Embed one utterance and return the classifier's top-k predictions.

        :param audio: preprocessed waveform.
        :param topk: number of highest-probability classes to keep.
        :returns: tuple ``(top_probs, top_classes)`` as produced by the
            classifier's ``predict``.
        """
        # The classifier is handed a batch, so add a leading batch dimension.
        batch = self.encoder.embed_utterance(audio).unsqueeze(dim=0)
        probs, classes = self.classifier.predict(batch, topk=topk)
        return probs, classes