def main(wav, ckpt, prominence, outpath):
    print(f"running inference on: {wav}")
    print(f"running inference using ckpt: {ckpt}")
    print("\n\n", 90 * "-")

    ckpt = torch.load(ckpt, map_location=lambda storage, loc: storage)
    hp = Namespace(**dict(ckpt["hparams"]))

    # load weights and peak detection params
    model = NextFrameClassifier(hp)
    weights = ckpt["state_dict"]
    weights = {k.replace("NFC.", ""): v for k, v in weights.items()}
    model.load_state_dict(weights)
    peak_detection_params = dill.loads(ckpt["peak_detection_params"])["cpc_1"]
    if prominence is not None:
        print(f"overriding prominence with {prominence}")
        peak_detection_params["prominence"] = prominence

    # load data
    wav_name = wav.split("/")[-1]
    audio, sr = torchaudio.load(wav)
    assert sr == 16000, "model was trained with audio sampled at 16khz, please downsample."
    audio = audio[0]
    audio = audio.unsqueeze(0)

    # run inference
    preds = model(audio)  # get scores
    preds = preds[1][0]  # get scores of positive pairs
    num_features = preds.size(1)
    preds = replicate_first_k_frames(preds, k=1, dim=1)  # padding
    preds = 1 - max_min_norm(preds)  # normalize scores (good for visualizations)
    preds = detect_peaks(x=preds,
                         lengths=[preds.shape[1]],
                         prominence=peak_detection_params["prominence"],
                         width=peak_detection_params["width"],
                         distance=peak_detection_params["distance"])  # run peak detection on scores
    mult = audio.size(1) / num_features
    preds = preds[0] * mult / sr  # transform frame indexes to seconds

    if not os.path.exists(outpath):
        os.makedirs(outpath)
    create_textgrid(preds,
                    audio.size(1) / sr,
                    os.path.join(outpath, wav_name.replace(".wav", ".TextGrid")))

    print("predicted boundaries (in seconds):")
    print(preds)
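# Worked example of the frame-to-seconds conversion used above: the model emits
# one score per feature frame, so a peak index from `detect_peaks` is scaled by
# `mult` (audio samples per frame) and then divided by the sample rate. All
# numbers below are hypothetical.
audio_samples = 32000                  # 2 s of audio at 16 kHz
num_features = 200                     # frames emitted by the model for this clip
sr = 16000
mult = audio_samples / num_features    # 160 samples per frame (a 10 ms hop)
peak_frame = 50                        # hypothetical peak index
boundary_sec = peak_frame * mult / sr  # 50 * 160 / 16000 = 0.5 s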
def predict(self, wav_path: Path) -> np.ndarray:
    logging.debug(f"running inference on: {wav_path}")

    # load data
    audio, sr = torchaudio.load(str(wav_path))
    assert sr == 16000, "model was trained with audio sampled at 16khz, please downsample."
    audio = audio[0]
    audio = audio.unsqueeze(0)

    # run inference
    preds = self.model(audio)  # get scores
    preds = preds[1][0]  # get scores of positive pairs
    preds = replicate_first_k_frames(preds, k=1, dim=1)  # padding
    preds = 1 - max_min_norm(preds)  # normalize scores (good for visualizations)
    preds = detect_peaks(x=preds,
                         lengths=[preds.shape[1]],
                         prominence=self.peak_detection_params["prominence"],
                         width=self.peak_detection_params["width"],
                         distance=self.peak_detection_params["distance"])  # run peak detection on scores
    preds = preds[0] * 160 / sr  # transform frame indexes to seconds (assumes a fixed 160-sample, i.e. 10 ms, hop)

    logging.debug("predicted boundaries (in seconds):")
    logging.debug(preds)
    return preds
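# A minimal wrapper class the `predict` method above could live on. The class
# name `Segmentor` and its constructor are assumptions; the loading logic
# simply mirrors `main` above.
from pathlib import Path
from argparse import Namespace
import dill
import torch

class Segmentor:
    def __init__(self, ckpt_path):
        # load checkpoint on CPU and rebuild the model, as in `main`
        ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
        hp = Namespace(**dict(ckpt["hparams"]))
        self.model = NextFrameClassifier(hp)
        weights = {k.replace("NFC.", ""): v for k, v in ckpt["state_dict"].items()}
        self.model.load_state_dict(weights)
        self.peak_detection_params = dill.loads(ckpt["peak_detection_params"])["cpc_1"]

# usage sketch: boundaries = Segmentor("model.ckpt").predict(Path("sample.wav"))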
def forward(self, data_batch, batch_i, mode):
    loss = 0

    # TRAIN
    audio, seg, phonemes, length, fname = data_batch
    preds = self.NFC(audio)
    NFC_loss = self.NFC.loss(preds, length)
    self.stats['nfc_loss'][mode].update(NFC_loss.item())
    loss += NFC_loss

    # INFERENCE
    if mode == "test" or (mode == "val" and self.hp.early_stop_metric == "val_max_rval"):
        positives = 0
        for t in self.NFC.pred_steps:
            p = preds[t][0]  # scores of positive pairs for prediction step t
            p = replicate_first_k_frames(p, k=t, dim=1)  # pad to align with frame indexes
            positives = positives + p  # accumulate scores across prediction steps
            scores = 1 - max_min_norm(positives)  # normalize the running sum
            self.pr[f'cpc_{t}'][mode].update(seg, scores, length)

    loss_key = "loss" if mode == "train" else f"{mode}_loss"
    return OrderedDict({loss_key: loss})
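# The score-processing helpers used throughout this section are assumed to
# behave roughly as sketched below; this is a plausible reconstruction for
# reference, not the repository's actual implementation.
import torch

def max_min_norm(x, eps=1e-8):
    # min-max normalize each sequence along the time dimension so scores fall
    # in [0, 1]; the callers then invert with `1 - ...` so that boundary
    # frames become peaks for `detect_peaks`
    x_min = x.min(dim=1, keepdim=True)[0]
    x_max = x.max(dim=1, keepdim=True)[0]
    return (x - x_min) / (x_max - x_min + eps)

def replicate_first_k_frames(x, k, dim):
    # prepend a copy of the first k frames so that scores for prediction
    # step k line up with the original frame indexes
    return torch.cat([x.narrow(dim, 0, k), x], dim=dim)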