Example #1
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="speakerrecognition_speakernet",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="path to speakernet config yaml file to load train, validation dataset and also for trainer parameters",
    )

    # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument(
        "--freeze_encoder",
        type=bool,
        required=False,
        default=True,
        help="True if the SpeakerNet encoder parameters need to be frozen while fine-tuning",
    )

    args = parser.parse_args()

    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(f"Using local speaker model from checkpoint {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)

    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test ds")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)

    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
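These snippets omit their import headers. A minimal set that would make Example #1 runnable (the NeMo module paths are real; the script name in the invocation below is hypothetical):

import pytorch_lightning as pl
from argparse import ArgumentParser
from omegaconf import OmegaConf
from nemo.collections.asr.models import EncDecSpeakerLabelModel
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager

# hypothetical invocation:
# python speaker_finetune.py --pretrained_model titanet_large \
#     --finetune_config_file conf/titanet-finetune.yaml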
Example #2
def main(cfg):

    logging.info(f'Hydra config: {cfg.pretty()}')
    trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(speaker_model)
    model_path = os.path.join(log_dir, '..', 'spkr.nemo')
    speaker_model.save_to(model_path)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        trainer = pl.Trainer(gpus=gpu)
        if speaker_model.prepare_test(trainer):
            trainer.test(speaker_model)
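Mains like this are normally wrapped with NeMo's Hydra decorator so cfg is built from a YAML config plus CLI overrides. A minimal sketch, assuming a conf/ directory and config name (both are placeholders here):

from nemo.core.config import hydra_runner

@hydra_runner(config_path="conf", config_name="SpeakerNet_recognition_3x2x512")
def main(cfg):
    ...  # body as in the example above

if __name__ == '__main__':
    main()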
Example #3
def speaker_label_model():
    preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
    encoder = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 64,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [
                {
                    'filters': 512,
                    'repeat': 1,
                    'kernel': [1],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.0,
                    'residual': False,
                    'separable': False,
                }
            ],
        },
    }

    decoder = {
        'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
        'params': {'feat_in': 512, 'num_classes': 2, 'pool_mode': 'xvector', 'emb_sizes': [1024]},
    }

    modelConfig = DictConfig(
        {'preprocessor': DictConfig(preprocessor), 'encoder': DictConfig(encoder), 'decoder': DictConfig(decoder)}
    )
    speaker_model = EncDecSpeakerLabelModel(cfg=modelConfig)
    return speaker_model
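A quick smoke test of the constructed model; this is a sketch that assumes 16 kHz input and the preprocessor's default 64 mel features (matching feat_in=64 above):

import torch

model = speaker_label_model()
model.eval()
audio = torch.randn(2, 16000)            # two dummy 1-second waveforms
lengths = torch.tensor([16000, 16000])   # per-sample lengths
with torch.no_grad():
    logits, embs = model.forward(input_signal=audio, input_signal_length=lengths)
print(logits.shape)  # expected (2, 2): batch x num_classes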
Example #4
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type="asr",
):
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, "r") as archive:
        archive.extract("./model_config.yaml")
        with tarfile.open(enemo_file, "w") as enemo_archive:
            enemo_archive.add("./model_config.yaml")
            copyfile(onnx_file, "model_graph.onnx")
            enemo_archive.add("model_graph.onnx")
            os.remove("model_graph.onnx")  # cleanup extra file
Example #5
def main(cfg):

    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(speaker_model)
    if not trainer.fast_dev_run:
        model_path = os.path.join(log_dir, '..', 'spkr.nemo')
        speaker_model.save_to(model_path)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        trainer = pl.Trainer(devices=1, accelerator=cfg.trainer.accelerator)
        if speaker_model.prepare_test(trainer):
            trainer.test(speaker_model)
Example #6
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')
        with tarfile.open(enemo_file, 'w') as enemo_archive:
            enemo_archive.add('./model_config.yaml')
            # addfile() needs the size set on the TarInfo, and the source opened in binary mode
            tarinfo = tarfile.TarInfo("model_graph.onnx")
            tarinfo.size = os.path.getsize(onnx_file)
            with open(onnx_file, "rb") as onnx_f:
                enemo_archive.addfile(tarinfo, onnx_f)
Example #7
    def test_ecapa_enc_dec(self):
        preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
        encoder = {
            'cls': 'nemo.collections.asr.modules.ECAPAEncoder',
            'params': {
                'feat_in': 80,
                'filters': [4, 4, 4, 4, 3],
                'kernel_sizes': [5, 3, 3, 3, 1],
                'dilations': [1, 1, 1, 1, 1],
                'scale': 2,
            },
        }

        decoder = {
            'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
            'params': {'feat_in': 3, 'num_classes': 2, 'pool_mode': 'attention', 'emb_sizes': 192},
        }

        modelConfig = DictConfig(
            {'preprocessor': DictConfig(preprocessor), 'encoder': DictConfig(encoder), 'decoder': DictConfig(decoder)}
        )
        speaker_model = EncDecSpeakerLabelModel(cfg=modelConfig)
        speaker_model.train()
        # TODO: make proper config and assert correct number of weights

        # Check to/from config_dict:
        confdict = speaker_model.to_config_dict()
        instance2 = EncDecSpeakerLabelModel.from_config_dict(confdict)
        self.assertTrue(isinstance(instance2, EncDecSpeakerLabelModel))
Example #8
def main(cfg):

    # add paths to manifests to config
    cfg.model.train_ds.manifest_filepath = '/Users/xujinghua/speaker-verification-with-NeMo/data/train.json'
    cfg.model.validation_ds.manifest_filepath = '/Users/xujinghua/speaker-verification-with-NeMo/data/train.json'

    # an4 test files have a different set of speakers
    # cfg.model.test_ds.manifest_filepath = '/Users/xujinghua/NeMo/data/an4/wav/an4_clstk/dev.json'

    cfg.model.decoder.num_classes = 74

    os.environ["OMP_NUM_THREADS"] = '1'

    # tutorial default setting: flags
    # modify some trainer configs for this demo
    # Checks if we have GPU available and uses it
    cuda = 1 if torch.cuda.is_available() else 0
    cfg.trainer.gpus = cuda

    # Reduces maximum number of epochs to 5 for quick demonstration
    cfg.trainer.max_epochs = 5

    # Remove distributed training flags
    cfg.trainer.accelerator = None

    logging.info(f'Hydra config: {cfg.pretty()}')
    trainer = pl.Trainer(**cfg.trainer)
    log_dir = exp_manager(trainer, cfg.get("exp_manager", None))
    speaker_model = EncDecSpeakerLabelModel(cfg=cfg.model, trainer=trainer)
    trainer.fit(speaker_model)

    if not trainer.fast_dev_run:
        model_path = os.path.join(log_dir, '..', 'spkr.nemo')
        speaker_model.save_to(model_path)

    # no need for testing

Example #9
def conformer_model():
    preprocessor = {
        'cls':
        'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': dict({})
    }
    encoder = {
        'cls': 'nemo.collections.asr.modules.ConformerEncoder',
        'params': {
            'feat_in': 80,
            'feat_out': -1,
            'n_layers': 2,
            'd_model': 256,
            'subsampling': 'striding',
            'subsampling_factor': 4,
            'subsampling_conv_channels': 512,
            'ff_expansion_factor': 4,
            'self_attention_model': 'rel_pos',
            'n_heads': 8,
            'att_context_size': [-1, -1],
            'xscaling': True,
            'untie_biases': True,
            'pos_emb_max_len': 500,
            'conv_kernel_size': 31,
            'dropout': 0.1,
            'dropout_emb': 0.0,
            'dropout_att': 0.1,
        },
    }

    decoder = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoder',
        'params': {
            'feat_in': 256,
            'num_classes': 1024,
            'vocabulary': list(chr(i % 28) for i in range(0, 1024))
        },
    }

    modelConfig = DictConfig({
        'preprocessor': DictConfig(preprocessor),
        'encoder': DictConfig(encoder),
        'decoder': DictConfig(decoder)
    })
    model = EncDecSpeakerLabelModel(cfg=modelConfig)
    return model
Example #10
def main(
    nemo_file, onnx_file, model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")
Example #11
    def test_constructor(self):
        preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
        encoder = {
            'cls': 'nemo.collections.asr.modules.ConvASREncoder',
            'params': {
                'feat_in': 64,
                'activation': 'relu',
                'conv_mask': True,
                'jasper': [
                    {
                        'filters': 512,
                        'repeat': 1,
                        'kernel': [1],
                        'stride': [1],
                        'dilation': [1],
                        'dropout': 0.0,
                        'residual': False,
                        'separable': False,
                    }
                ],
            },
        }

        decoder = {
            'cls': 'nemo.collections.asr.modules.SpeakerDecoder',
            'params': {'feat_in': 512, 'num_classes': 2, 'pool_mode': 'xvector', 'emb_sizes': [1024]},
        }

        modelConfig = DictConfig(
            {'preprocessor': DictConfig(preprocessor), 'encoder': DictConfig(encoder), 'decoder': DictConfig(decoder)}
        )
        speaker_model = EncDecSpeakerLabelModel(cfg=modelConfig)
        speaker_model.train()
        # TODO: make proper config and assert correct number of weights

        # Check to/from config_dict:
        confdict = speaker_model.to_config_dict()
        instance2 = EncDecSpeakerLabelModel.from_config_dict(confdict)
        self.assertTrue(isinstance(instance2, EncDecSpeakerLabelModel))
Example #12
def citrinet_model():
    preprocessor = {'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor', 'params': dict({})}
    encoder = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 80,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [
                {
                    'filters': 512,
                    'repeat': 1,
                    'kernel': [5],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.0,
                    'residual': False,
                    'separable': True,
                    'se': True,
                    'se_context_size': -1,
                },
                {
                    'filters': 512,
                    'repeat': 5,
                    'kernel': [11],
                    'stride': [2],
                    'dilation': [1],
                    'dropout': 0.1,
                    'residual': True,
                    'separable': True,
                    'se': True,
                    'se_context_size': -1,
                    'stride_last': True,
                    'residual_mode': 'stride_add',
                },
                {
                    'filters': 512,
                    'repeat': 5,
                    'kernel': [13],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.1,
                    'residual': True,
                    'separable': True,
                    'se': True,
                    'se_context_size': -1,
                },
                {
                    'filters': 640,
                    'repeat': 1,
                    'kernel': [41],
                    'stride': [1],
                    'dilation': [1],
                    'dropout': 0.0,
                    'residual': True,
                    'separable': True,
                    'se': True,
                    'se_context_size': -1,
                },
            ],
        },
    }

    decoder = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoder',
        'params': {'feat_in': 640, 'num_classes': 1024, 'vocabulary': list(chr(i % 28) for i in range(0, 1024))},
    }

    modelConfig = DictConfig(
        {'preprocessor': DictConfig(preprocessor), 'encoder': DictConfig(encoder), 'decoder': DictConfig(decoder)}
    )
    citri_model = EncDecSpeakerLabelModel(cfg=modelConfig)
    return citri_model
Example #13
def main(cfg):

    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    enrollment_manifest = cfg.data.enrollment_manifest
    test_manifest = cfg.data.test_manifest
    out_manifest = cfg.data.out_manifest
    sample_rate = cfg.data.sample_rate

    backend = cfg.backend.backend_model.lower()

    if backend == 'cosine_similarity':
        model_path = cfg.backend.cosine_similarity.model_path
        batch_size = cfg.backend.cosine_similarity.batch_size
        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

        enroll_embs, _, enroll_truelabels, enroll_id2label = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            enrollment_manifest,
            batch_size,
            sample_rate,
            device=device,
        )

        test_embs, _, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            test_manifest,
            batch_size,
            sample_rate,
            device=device,
        )

        # length normalize
        enroll_embs = enroll_embs / np.linalg.norm(enroll_embs, ord=2, axis=-1, keepdims=True)
        test_embs = test_embs / np.linalg.norm(test_embs, ord=2, axis=-1, keepdims=True)

        # reference embedding
        reference_embs = []
        keyslist = list(enroll_id2label.keys())
        for label_id in keyslist:
            # np.where returns a tuple of index arrays; take [0] so len() counts utterances
            indices = np.where(enroll_truelabels == label_id)[0]
            embedding = enroll_embs[indices].sum(axis=0).squeeze() / len(indices)
            reference_embs.append(embedding)

        reference_embs = np.asarray(reference_embs)

        scores = np.matmul(test_embs, reference_embs.T)
        matched_labels = scores.argmax(axis=-1)

    elif backend == 'neural_classifier':
        model_path = cfg.backend.neural_classifier.model_path
        batch_size = cfg.backend.neural_classifier.batch_size

        if model_path.endswith('.nemo'):
            speaker_model = EncDecSpeakerLabelModel.restore_from(model_path)
        else:
            speaker_model = EncDecSpeakerLabelModel.from_pretrained(model_path)

        featurizer = WaveformFeaturizer(sample_rate=sample_rate)
        dataset = AudioToSpeechLabelDataset(
            manifest_filepath=enrollment_manifest,
            labels=None,
            featurizer=featurizer)
        enroll_id2label = dataset.id2label

        if speaker_model.decoder.final.out_features != len(enroll_id2label):
            raise ValueError(
                "Number of labels mismatch: make sure the neural classifier was trained or fine-tuned with the labels from the enrollment manifest_filepath"
            )

        _, test_logits, _, _ = EncDecSpeakerLabelModel.get_batch_embeddings(
            speaker_model,
            test_manifest,
            batch_size,
            sample_rate,
            device=device,
        )
        matched_labels = test_logits.argmax(axis=-1)

    with open(test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = enroll_id2label[matched_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info("Inference labels have been written to {} manifest file".format(out_manifest))
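The enrollment and test manifests read above are NeMo-style JSON-lines files. A minimal sketch that writes one hypothetical entry per line:

import json

entry = {"audio_filepath": "/data/spk1_utt1.wav", "offset": 0, "duration": 3.1, "label": "spk1"}
with open("enrollment.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(entry) + "\n")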
Example #14
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--spkr_model", type=str, default="titanet_large", required=True, help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--train_manifest", type=str, required=True, help="path to train manifest file to match labels"
    )
    parser.add_argument(
        "--test_manifest", type=str, required=True, help="path to test manifest file to perform inference"
    )
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()
    torch.set_grad_enabled(False)
    can_gpu = torch.cuda.is_available()  # used below; autocast is assumed imported from torch.cuda.amp

    if args.spkr_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.spkr_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(restore_path=args.spkr_model)
    else:
        logging.error(f"Please pass a trained .nemo file")
        sys.exit()

    labels = []
    with open(args.train_manifest, 'rb') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            item = json.loads(line)
            labels.append(item['label'])

    labels_map = sorted(set(labels))
    label2id, id2label = {}, {}
    for label_id, label in enumerate(labels_map):
        label2id[label] = label_id
        id2label[label_id] = label

    speaker_model.setup_test_data(
        test_data_layer_params={
            'sample_rate': 16000,
            'manifest_filepath': args.test_manifest,
            'labels': labels_map,
            'batch_size': args.batch_size,
            'trim_silence': False,
            'shuffle': False,
        }
    )
    if can_gpu:
        speaker_model = speaker_model.cuda()
    speaker_model.eval()

    all_labels = []
    all_logits = []
    for test_batch in tqdm(speaker_model.test_dataloader()):
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            audio_signal, audio_signal_len, labels, _ = test_batch
            logits, _ = speaker_model.forward(input_signal=audio_signal, input_signal_length=audio_signal_len)

            all_logits.extend(logits.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_logits, true_labels = np.asarray(all_logits), np.asarray(all_labels)
    infer_labels = all_logits.argmax(axis=1)

    out_manifest = os.path.basename(args.test_manifest).split('.')[0] + '_infer.json'
    out_manifest = os.path.join(os.path.dirname(args.test_manifest), out_manifest)
    with open(args.test_manifest, 'rb') as f1, open(out_manifest, 'w', encoding='utf-8') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = id2label[infer_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info("Inference labels have been written to {} manifest file".format(out_manifest))