Code example #1
import os
from argparse import Namespace

import dill
import torch
import torchaudio

# NextFrameClassifier and the helper functions below are project-level code;
# the module paths are assumed here and may differ in the actual repository.
from next_frame_classifier import NextFrameClassifier
from utils import replicate_first_k_frames, max_min_norm, detect_peaks, create_textgrid


def main(wav, ckpt, prominence, outpath):
    print(f"running inference on: {wav}")
    print(f"running inference using ckpt: {ckpt}")
    print("\n\n", 90 * "-")

    ckpt = torch.load(ckpt, map_location=lambda storage, loc: storage)
    hp = Namespace(**dict(ckpt["hparams"]))

    # load weights and peak detection params
    model = NextFrameClassifier(hp)
    weights = ckpt["state_dict"]
    weights = {k.replace("NFC.", ""): v for k,v in weights.items()}
    model.load_state_dict(weights)
    model.eval()  # switch to eval mode for inference
    peak_detection_params = dill.loads(ckpt['peak_detection_params'])['cpc_1']
    if prominence is not None:
        print(f"overriding prominence with {prominence}")
        peak_detection_params["prominence"] = prominence

    # load data
    wav_name = wav.split("/")[-1]
    audio, sr = torchaudio.load(wav)
    assert sr == 16000, "model was trained with audio sampled at 16khz, please downsample."
    audio = audio[0]
    audio = audio.unsqueeze(0)

    # run inference
    preds = model(audio)  # get scores
    preds = preds[1][0]  # get scores of positive pairs
    num_features = preds.size(1)
    preds = replicate_first_k_frames(preds, k=1, dim=1)  # padding
    preds = 1 - max_min_norm(preds)  # normalize scores (good for visualizations)
    
    preds = detect_peaks(x=preds,
                         lengths=[preds.shape[1]],
                         prominence=peak_detection_params["prominence"],
                         width=peak_detection_params["width"],
                         distance=peak_detection_params["distance"])  # run peak detection on scores

    mult = audio.size(1) / num_features  # audio samples per model frame
    preds = preds[0] * mult / sr  # convert frame indices to seconds

    if not os.path.exists(outpath):
        os.makedirs(outpath)
    create_textgrid(preds, audio.size(1) / sr, os.path.join(outpath, wav_name.replace(".wav", ".TextGrid")))
    print("predicted boundaries (in seconds):")
    print(preds)
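
A minimal command-line entry point that could drive main(); the flag names and defaults below are assumptions for illustration, not part of the original script.

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="predict phoneme boundaries for a single wav")
    parser.add_argument("--wav", required=True, help="path to a 16 kHz mono wav file")
    parser.add_argument("--ckpt", required=True, help="path to a trained checkpoint")
    parser.add_argument("--prominence", type=float, default=None, help="optional peak-prominence override")
    parser.add_argument("--outpath", default="out", help="directory for the generated TextGrid")
    args = parser.parse_args()
    main(args.wav, args.ckpt, args.prominence, args.outpath)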
Code example #2
    def predict(self, wav_path: Path) -> np.ndarray:
        logging.debug(f"running inference on: {wav_path}")
        audio, sr = torchaudio.load(str(wav_path))
        assert sr == 16000, "model was trained with audio sampled at 16khz, please downsample."
        audio = audio[0]
        audio = audio.unsqueeze(0)

        # run inference
        preds = self.model(audio)  # get scores
        preds = preds[1][0]  # get scores of positive pairs
        preds = replicate_first_k_frames(preds, k=1, dim=1)  # padding
        preds = 1 - max_min_norm(preds)  # normalize scores (good for visualizations)
        preds = detect_peaks(x=preds,
                             lengths=[preds.shape[1]],
                             prominence=self.peak_detection_params["prominence"],
                             width=self.peak_detection_params["width"],
                             distance=self.peak_detection_params["distance"])  # run peak detection on scores
        preds = preds[0] * 160 / sr  # convert frame indices to seconds (160 = samples per model frame at 16 kHz)

        logging.debug("predicted boundaries (in seconds):")
        return preds
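
A hedged usage sketch for the predict() method above, assuming it belongs to a wrapper class (here called Segmenter) whose constructor loads self.model and self.peak_detection_params from a checkpoint, and that logging, torchaudio, numpy, and the helper functions are imported at module level; the class name and constructor signature are assumptions.

from pathlib import Path

segmenter = Segmenter(ckpt_path=Path("checkpoint.ckpt"))  # hypothetical wrapper class
boundaries = segmenter.predict(Path("utterance.wav"))     # boundary times in seconds (np.ndarray)
print(boundaries)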
Code example #3
    def forward(self, data_batch, batch_i, mode):
        loss = 0

        # TRAIN
        audio, seg, phonemes, length, fname = data_batch
        preds = self.NFC(audio)
        NFC_loss = self.NFC.loss(preds, length)
        self.stats['nfc_loss'][mode].update(NFC_loss.item())
        loss += NFC_loss

        # INFERENCE
        if mode == "test" or (mode == "val"
                              and self.hp.early_stop_metric == "val_max_rval"):
            positives = 0
            for t in self.NFC.pred_steps:
                p = preds[t][0]
                p = replicate_first_k_frames(p, k=t, dim=1)
                positives += p
            positives = 1 - max_min_norm(positives)
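            # note: `t` below is whatever the last prediction step was in the loop above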
            self.pr[f'cpc_{t}'][mode].update(seg, positives, length)

        loss_key = "loss" if mode == "train" else f"{mode}_loss"
        return OrderedDict({loss_key: loss})
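
The mode-dependent loss key suggests this forward() is shared by per-mode step methods of a PyTorch-Lightning-style module; a minimal sketch of that dispatch, with the step-method wiring assumed rather than taken from the original code.

    def training_step(self, data_batch, batch_i):
        return self.forward(data_batch, batch_i, mode="train")

    def validation_step(self, data_batch, batch_i):
        return self.forward(data_batch, batch_i, mode="val")

    def test_step(self, data_batch, batch_i):
        return self.forward(data_batch, batch_i, mode="test")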