Example #1

import os
import tarfile
from shutil import copyfile

# NeMo model classes (import paths as of NeMo 1.x)
from nemo.collections.asr.models import EncDecClassificationModel, EncDecCTCModel, EncDecSpeakerLabelModel
from nemo.utils import logging

def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type="asr",
):
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise ValueError(
            "Available model types are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully exported ONNX file")

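    # Bundle the model config from the original .nemo archive together with
    # the exported ONNX graph into a single .enemo tar archive.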
    with tarfile.open(nemo_file, "r") as archive:
        archive.extract("./model_config.yaml")
        with tarfile.open(enemo_file, "w") as enemo_archive:
            enemo_archive.add("./model_config.yaml")
            copyfile(onnx_file, "model_graph.onnx")
            enemo_archive.add("model_graph.onnx")
            os.remove("model_graph.onnx")  # clean up the temporary copy
    os.remove("./model_config.yaml")  # clean up the extracted config
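A minimal invocation sketch for the function above; the file paths are placeholders, not part of the original:

if __name__ == "__main__":
    # Hypothetical paths, for illustration only.
    main("model.nemo", "model.enemo", "model.onnx", model_type="asr")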
Example #2

import os
import tarfile

from nemo.collections.asr.models import EncDecClassificationModel, EncDecCTCModel, EncDecSpeakerLabelModel
from nemo.utils import logging

def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise ValueError(
            "Available model types are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully exported ONNX file")

    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')
        with tarfile.open(enemo_file, 'w') as enemo_archive:
            enemo_archive.add('./model_config.yaml')
            # tarfile.TarInfo defaults to size=0, which would add an empty
            # member; set the real size and read the ONNX file in binary mode
            tarinfo = tarfile.TarInfo("model_graph.onnx")
            tarinfo.size = os.path.getsize(onnx_file)
            with open(onnx_file, 'rb') as onnx_fh:
                enemo_archive.addfile(tarinfo, onnx_fh)
    os.remove('./model_config.yaml')  # clean up the extracted config
Example #3

from argparse import ArgumentParser

import pytorch_lightning as pl
from omegaconf import OmegaConf

from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="speakerrecognition_speakernet",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="path to the SpeakerNet config YAML file; provides the train and validation datasets as well as the trainer parameters",
    )

    parser.add_argument(
        "--freeze_encoder",
        # argparse's type=bool treats any non-empty string (including "False")
        # as True, so parse the flag value explicitly
        type=lambda s: s.lower() in ("true", "1", "yes"),
        required=False,
        default=True,
        help="True if the SpeakerNet encoder parameters should be frozen while fine-tuning",
    )

    args = parser.parse_args()

    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(f"Using local speaker model from checkpoint {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)

    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test_ds from the fine-tuning config")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)

    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

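    # Optionally freeze the encoder so only the decoder/classifier weights are updated.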
    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
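A hypothetical command-line invocation (the script name and config path are placeholders):

python speaker_finetune.py --pretrained_model speakerrecognition_speakernet --finetune_config_file conf/speaker_finetune.yaml --freeze_encoder True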
Example #4

from nemo.collections.asr.models import EncDecClassificationModel, EncDecCTCModel, EncDecSpeakerLabelModel
from nemo.utils import logging

def main(
    nemo_file, onnx_file, model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise ValueError("Available model types are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("Successfully exported ONNX file")
Example #5

import json

import numpy as np
import torch
from omegaconf import OmegaConf

# NeMo import paths as of NeMo 1.x; WaveformFeaturizer lived under
# nemo.collections.asr.parts.features in older releases
from nemo.collections.asr.data.audio_to_label import AudioToSpeechLabelDataset
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.collections.asr.parts.preprocessing.features import WaveformFeaturizer
from nemo.utils import logging

def main(cfg):

    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    enrollment_manifest = cfg.data.enrollment_manifest
    test_manifest = cfg.data.test_manifest
    out_manifest = cfg.data.out_manifest
    sample_rate = cfg.data.sample_rate

    backend = cfg.backend.backend_model.lower()

    if backend == 'cosine_similarity':
        model_path = cfg.backend.cosine_similarity.model_path
        batch_size = cfg.backend.cosine_similarity.batch_size
        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

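        # Extract speaker embeddings for the enrollment and test sets.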
        enroll_embs, _, enroll_truelabels, enroll_id2label = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            enrollment_manifest,
            batch_size,
            sample_rate,
            device=device,
        )

        test_embs, _, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            test_manifest,
            batch_size,
            sample_rate,
            device=device,
        )

        # length normalize
        enroll_embs = enroll_embs / (np.linalg.norm(
            enroll_embs, ord=2, axis=-1, keepdims=True))
        test_embs = test_embs / (np.linalg.norm(
            test_embs, ord=2, axis=-1, keepdims=True))

        # Build one reference embedding per enrolled speaker by averaging
        # that speaker's enrollment embeddings.
        reference_embs = []
        keyslist = list(enroll_id2label.keys())
        for label_id in keyslist:
            # np.where returns a tuple of index arrays; take the first so
            # len() counts matches instead of always evaluating to 1
            indices = np.where(enroll_truelabels == label_id)[0]
            embedding = enroll_embs[indices].sum(axis=0).squeeze() / len(indices)
            reference_embs.append(embedding)

        reference_embs = np.asarray(reference_embs)

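        # With length-normalized embeddings, the dot product equals cosine
        # similarity; pick the enrolled speaker with the highest score.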
        scores = np.matmul(test_embs, reference_embs.T)
        matched_labels = scores.argmax(axis=-1)

    elif backend == 'neural_classifier':
        model_path = cfg.backend.neural_classifier.model_path
        batch_size = cfg.backend.neural_classifier.batch_size

        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

        featurizer = WaveformFeaturizer(sample_rate=sample_rate)
        dataset = AudioToSpeechLabelDataset(
            manifest_filepath=enrollment_manifest,
            labels=None,
            featurizer=featurizer)
        enroll_id2label = dataset.id2label

        if speaker_model.decoder.final.out_features != len(enroll_id2label):
            raise ValueError(
                "Number of labels mismatch. Make sure the neural classifier was trained or fine-tuned with labels from the enrollment manifest_filepath."
            )

        _, test_logits, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            test_manifest,
            batch_size,
            sample_rate,
            device=device,
        )
        matched_labels = test_logits.argmax(axis=-1)

    else:
        raise ValueError(
            "backend.backend_model must be 'cosine_similarity' or 'neural_classifier'")

    with open(test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = enroll_id2label[matched_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info("Inference labels have been written to the {} manifest file".format(out_manifest))
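In NeMo scripts, a Hydra entry point like this is usually wrapped with the hydra_runner decorator so the config is parsed from the command line; a minimal sketch (the config path and name are assumptions):

from nemo.core.config import hydra_runner

@hydra_runner(config_path="conf", config_name="speaker_identification_infer")
def main(cfg):
    ...  # body as above

if __name__ == "__main__":
    main()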
Example #6

import json
import os
import sys
from argparse import ArgumentParser

import numpy as np
import torch
from torch.cuda.amp import autocast
from tqdm import tqdm

from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.utils import logging

def main():
    parser = ArgumentParser()
    # required=True together with a default is contradictory; the model path
    # must be supplied, so drop the dead default
    parser.add_argument(
        "--spkr_model", type=str, required=True, help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--train_manifest", type=str, required=True, help="path to train manifest file to match labels"
    )
    parser.add_argument(
        "--test_manifest", type=str, required=True, help="path to test manifest file to perform inference"
    )
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    can_gpu = torch.cuda.is_available()  # checked below before moving model/batches to GPU

    if args.spkr_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.spkr_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.spkr_model)
    else:
        logging.error("Please pass a trained .nemo file")
        sys.exit()

    labels = []
    with open(args.train_manifest, 'rb') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            item = json.loads(line)
            labels.append(item['label'])

    labels_map = sorted(set(labels))
    label2id, id2label = {}, {}
    for label_id, label in enumerate(labels_map):
        label2id[label] = label_id
        id2label[label_id] = label

    speaker_model.setup_test_data(
        test_data_layer_params={
            'sample_rate': 16000,
            'manifest_filepath': args.test_manifest,
            'labels': labels_map,
            'batch_size': args.batch_size,
            'trim_silence': False,
            'shuffle': False,
        }
    )
    if can_gpu:
        speaker_model = speaker_model.cuda()
    speaker_model.eval()

    all_labels = []
    all_logits = []
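    # Run batched inference, collecting logits and reference labels.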
    for test_batch in tqdm(speaker_model.test_dataloader()):
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            audio_signal, audio_signal_len, labels, _ = test_batch
            logits, _ = speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)

            all_logits.extend(logits.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_logits, true_labels = np.asarray(all_logits), np.asarray(all_labels)
    infer_labels = all_logits.argmax(axis=1)

    out_manifest = os.path.splitext(os.path.basename(args.test_manifest))[0] + '_infer.json'
    out_manifest = os.path.join(os.path.dirname(args.test_manifest), out_manifest)
    with open(args.test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = id2label[infer_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info("Inference labels have been written to the {} manifest file".format(out_manifest))
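Example invocation (the script name and file paths are placeholders):

python speaker_infer.py --spkr_model titanet_large.nemo --train_manifest train_manifest.json --test_manifest test_manifest.json --batch_size 32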