def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    """
    Return utterance embeddings and labels, loaded from a cache or freshly computed.

    When ``from_path`` is given, ``embeds.npy`` and ``labels.npy`` are loaded from
    that directory and all other arguments are ignored. Otherwise every ``.flac``
    file found recursively under ``data_path`` is preprocessed, embedded with the
    encoder restored from ``ckpt_path``, and labelled with the stem of its
    grandparent directory; both lists are then saved into ``save_path``.

    All required arguments are validated, and the output directory is created,
    *before* any file is encoded, so a configuration mistake cannot throw away a
    completed encoding run.

    :param from_path: directory containing previously saved ``embeds.npy`` and
        ``labels.npy``. If not None, the cached arrays are returned.
    :param ckpt_path: encoder checkpoint passed to ``VoiceEncoder.load_ckpt``.
        Required when ``from_path`` is None.
    :param data_path: root directory searched recursively for ``*.flac`` files.
        Required when ``from_path`` is None.
    :param save_path: directory that receives ``embeds.npy`` and ``labels.npy``.
        Required when ``from_path`` is None; created if it does not exist.
    :return: tuple ``(X, y)``. Loaded from cache these are the objects returned
        by ``np.load``; freshly computed they are Python lists (one embedding
        array and one label string per file).
    :raises Exception: if ``from_path`` is None and ``ckpt_path`` or
        ``save_path`` is None.
    :raises TypeError: if ``from_path`` is None and ``data_path`` is None.
    """
    if from_path is not None:
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, which can execute code from a crafted file. Only point
        # from_path at caches you produced yourself.
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
        return X, y

    # Fail fast: check everything the encoding branch needs before doing any
    # expensive work (model loading, per-file embedding).
    if ckpt_path is None:
        raise Exception('No checkpoint path provided')
    if save_path is None:
        raise Exception('No save_path provided')
    if data_path is None:
        # Path(None) would raise TypeError further down; keep that type.
        raise TypeError('No data_path provided')

    # Imported lazily so that loading from cache does not need these packages.
    from resemblyzer import preprocess_wav, VoiceEncoder
    from tqdm import tqdm

    # Make sure the destination exists now rather than discovering a bad path
    # only when np.save runs after the whole dataset has been encoded.
    out_dir = Path(save_path)
    out_dir.mkdir(parents=True, exist_ok=True)

    # The encoder is always placed on the CUDA device in this branch.
    device = torch.device('cuda')
    encoder = VoiceEncoder(device=device, loss_device=device)
    encoder.load_ckpt(ckpt_path, device=device)
    encoder.eval()

    wav_fpaths = list(Path(data_path).glob("**/*.flac"))

    # Preprocess each utterance and collect its embedding and label.
    X = []
    y = []
    for wav_fpath in tqdm(wav_fpaths):
        wav = preprocess_wav(wav_fpath)
        X.append(encoder.embed_utterance(wav).cpu().numpy())
        # Label is the stem of the directory two levels above the file.
        y.append(wav_fpath.parent.parent.stem)

    # Save for testing
    np.save(Path(out_dir, 'embeds.npy'), X)
    np.save(Path(out_dir, 'labels.npy'), y)

    return X, y
class Predictor:
    """
    Bundles a voice encoder and an MLP classifier for inference.

    Both models are restored from checkpoints and switched to eval mode when
    the object is constructed.
    """

    def __init__(self,
                 clf_ckpt_path='exp/clv/mlp/mlp_best_val_loss.pt',
                 enc_ckpt_path='ckpt/pretrained.pt',
                 device=torch.device('cuda'),
                 num_class=381,
                 verbose=False):
        """
        Load the encoder and classifier checkpoints.

        :param clf_ckpt_path: checkpoint passed to the classifier's ``load_ckpt``.
        :param enc_ckpt_path: checkpoint passed to the encoder's ``load_ckpt``.
        :param device: device handed to the encoder constructor and to both
            ``load_ckpt`` calls.
        :param num_class: number of classes the ``MLP`` is built with.
        :param verbose: if true, print how long loading took.
        """
        began = timer()

        # Encoder first, then classifier; each is put in eval mode once loaded.
        encoder = VoiceEncoder(device=device, loss_device=device)
        encoder.load_ckpt(enc_ckpt_path, device)
        encoder.eval()
        self.encoder = encoder

        classifier = MLP(num_class=num_class)
        classifier.load_ckpt(clf_ckpt_path, device)
        classifier.eval()
        self.classifier = classifier

        if verbose:
            elapsed = timer() - began
            print(
                f'Encoder and classifier models loaded successfully in {elapsed}s'
            )

    def preprocess(self, f):
        """
        Run ``preprocess_wav`` on a waveform held on disk or in memory.

        :param f: either a filepath to an audio file or the waveform as a
            numpy array of floats.
        :return: whatever ``preprocess_wav`` returns for ``f``.
        """
        processed = preprocess_wav(f)
        return processed

    def predict(self, audio, topk=2):
        """
        Predict the ``topk`` classes with the highest probabilities.

        :param audio: preprocessed waveform.
        :param topk: number of highest-probability classes to keep.
        :return: the ``(top_probs, top_classes)`` pair produced by the
            classifier's ``predict``.
        """
        # Embed the utterance, then add a leading dimension of size 1 before
        # passing it to the classifier.
        utterance_embed = self.encoder.embed_utterance(audio)
        batch = utterance_embed.unsqueeze(dim=0)
        probs, classes = self.classifier.predict(batch, topk=topk)
        return probs, classes