Code Example #1
def main(cfg):
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        trainer = pl.Trainer(gpus=gpu)
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
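In the NeMo example scripts, an entrypoint like this is normally wrapped with the hydra_runner decorator so that cfg is populated from a YAML config; a minimal sketch of that wiring (the config_path and config_name values below are assumptions, not taken from the example):

from nemo.core.config import hydra_runner

@hydra_runner(config_path="conf", config_name="matchboxnet_3x1x64_v1")  # hypothetical config location/name
def main(cfg):
    ...  # training and test logic as in Code Example #1

if __name__ == '__main__':
    main()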
Code Example #2
File: vad_utils.py  Project: NVIDIA/NeMo
def init_vad_model(model_path: str):
    """
    Initiate VAD model with model path
    """
    if model_path.endswith('.nemo'):
        logging.info(f"Using local VAD model from {model_path}")
        vad_model = EncDecClassificationModel.restore_from(restore_path=model_path)
    elif model_path.endswith('.ckpt'):
        vad_model = EncDecClassificationModel.load_from_checkpoint(checkpoint_path=model_path)
    else:
        logging.info(f"Using NGC cloud VAD model {model_path}")
        vad_model = EncDecClassificationModel.from_pretrained(model_name=model_path)
    return vad_model
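A quick usage sketch for this helper; the paths and model name below are placeholders, since init_vad_model simply dispatches on the file extension:

# hypothetical paths/names
vad_model = init_vad_model("/models/local_vad.nemo")  # restores from a local .nemo archive
# vad_model = init_vad_model("vad_marblenet")         # anything without .nemo/.ckpt is treated as an NGC pretrained name
vad_model.eval()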
Code Example #3
File: speech_to_label.py  Project: ggrunin/NeMo
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecClassificationModel(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
Code Example #4
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type="asr",
):
    if model_type == "asr":
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == "speech_label":
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == "speaker":
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, "r") as archive:
        archive.extract("./model_config.yaml")
        with tarfile.open(enemo_file, "w") as enemo_archive:
            enemo_archive.add("./model_config.yaml")
            copyfile(onnx_file, "model_graph.onnx")
            enemo_archive.add("model_graph.onnx")
            os.remove("model_graph.onnx")  # cleanup extra file
Code Example #5
File: convasr_to_enemo.py  Project: zt706/NeMo
def main(
    nemo_file,
    enemo_file,
    onnx_file,
    model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError(
            "Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")

    with tarfile.open(nemo_file, 'r') as archive:
        archive.extract('./model_config.yaml')
        with tarfile.open(enemo_file, 'w') as enemo_archive:
            enemo_archive.add('./model_config.yaml')
            # addfile() needs the member size set on the TarInfo and a binary file handle,
            # otherwise an empty model_graph.onnx ends up in the archive
            onnx_info = tarfile.TarInfo("model_graph.onnx")
            onnx_info.size = os.path.getsize(onnx_file)
            with open(onnx_file, "rb") as onnx_fh:
                enemo_archive.addfile(onnx_info, onnx_fh)
Code Example #6
def speech_classification_model():
    preprocessor = {
        'cls': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor',
        'params': dict({}),
    }
    encoder = {
        'cls': 'nemo.collections.asr.modules.ConvASREncoder',
        'params': {
            'feat_in': 64,
            'activation': 'relu',
            'conv_mask': True,
            'jasper': [{
                'filters': 32,
                'repeat': 1,
                'kernel': [1],
                'stride': [1],
                'dilation': [1],
                'dropout': 0.0,
                'residual': False,
                'separable': True,
                'se': True,
                'se_context_size': -1,
            }],
        },
    }

    decoder = {
        'cls': 'nemo.collections.asr.modules.ConvASRDecoderClassification',
        'params': {
            'feat_in': 32,
            'num_classes': 30,
        },
    }

    modelConfig = DictConfig({
        'preprocessor': DictConfig(preprocessor),
        'encoder': DictConfig(encoder),
        'decoder': DictConfig(decoder),
        'labels': ListConfig(["dummy_cls_{}".format(i + 1) for i in range(30)]),
    })
    model = EncDecClassificationModel(cfg=modelConfig)
    return model
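In the NeMo unit tests this builder is exposed as a pytest fixture, which is how the test in Code Example #7 receives its speech_classification_model argument; a minimal sketch of that wiring (an assumption about the surrounding test file, not shown above):

import pytest

@pytest.fixture()
def speech_classification_model():
    # build and return the small EncDecClassificationModel exactly as in Code Example #6
    ...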
Code Example #7
    def test_constructor(self, speech_classification_model):
        asr_model = speech_classification_model.train()

        conv_cnt = (64 * 32 * 1 + 32) + (64 * 1 * 1 + 32)  # separable kernel + bias + pointwise kernel + bias
        bn_cnt = (4 * 32) * 2  # 2 * moving averages
        dec_cnt = 32 * 30 + 30  # fc + bias

        param_count = conv_cnt + bn_cnt + dec_cnt
        assert asr_model.num_weights == param_count

        # Check to/from config_dict:
        confdict = asr_model.to_config_dict()
        instance2 = EncDecClassificationModel.from_config_dict(confdict)

        assert isinstance(instance2, EncDecClassificationModel)
Code Example #8
def main(
    nemo_file, onnx_file, model_type='asr',
):
    if model_type == 'asr':
        logging.info("Preparing ASR model")
        model = EncDecCTCModel.restore_from(nemo_file)
    elif model_type == 'speech_label':
        logging.info("Preparing Speech Label Classification model")
        model = EncDecClassificationModel.restore_from(nemo_file)
    elif model_type == 'speaker':
        logging.info("Preparing Speaker Recognition model")
        model = EncDecSpeakerLabelModel.restore_from(nemo_file)
    else:
        raise NameError("Available model names are asr, speech_label and speaker")

    logging.info("Writing onnx file")
    model.export(onnx_file, onnx_opset_version=12)
    logging.info("succesfully ported onnx file")
Code Example #9
File: vad_infer.py  Project: researchase/NeMo
def main():
    parser = ArgumentParser()
    parser.add_argument("--vad_model",
                        type=str,
                        default="MatchboxNet-VAD-3x2",
                        required=False,
                        help="Pass: '******'")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help=
        "Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="vad_frame",
                        help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    parser.add_argument("--normalize_audio", type=bool, default=False)
    parser.add_argument("--num_workers", type=float, default=20)
    parser.add_argument("--split_duration", type=float, default=400)
    parser.add_argument(
        "--dont_auto_split",
        default=False,
        action='store_true',
        help="Disable automatically splitting manifest entries by split_duration; splitting helps avoid potential CUDA out-of-memory issues.",
    )

    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(
            restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(
            model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # Prepare manifest for streaming VAD
    manifest_vad_input = args.dataset
    if not args.dont_auto_split:
        logging.info("Split long audio file to avoid CUDA memory issue")
        logging.debug(
            "Try smaller split_duration if you still have CUDA memory issue")
        config = {
            'manifest_filepath': manifest_vad_input,
            'time_length': args.time_length,
            'split_duration': args.split_duration,
            'num_workers': args.num_workers,
        }
        manifest_vad_input = prepare_manifest(config)
    else:
        logging.warning(
            "If you encounter CUDA memory issue, try splitting manifest entry by split_duration to avoid it."
        )

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': manifest_vad_input,
            'labels': [
                'infer',
            ],
            'num_workers': args.num_workers,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
            'normalize_audio': args.normalize_audio,
        })

    vad_model = vad_model.to(device)
    vad_model.eval()

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    data = []
    for line in open(args.dataset, 'r'):
        file = json.loads(line)['audio_filepath'].split("/")[-1]
        data.append(file.split(".wav")[0])
    logging.info(f"Inference on {len(data)} audio files/json lines!")

    status = get_vad_stream_status(data)
    for i, test_batch in enumerate(vad_model.test_dataloader()):
        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0],
                                  input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]

            if status[i] == 'start':
                to_save = pred[:-trunc]
            elif status[i] == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status[i] == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred

            all_len += len(to_save)
            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))
        del test_batch
        if status[i] == 'end' or status[i] == 'single':
            logging.debug(
                f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
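Each .frame file written above holds one speech probability per line at shift_length resolution; a minimal post-processing sketch that thresholds those probabilities into (start, end) speech segments (the helper name, the 0.5 threshold and the 0.01 s shift are assumptions matching the defaults above, not part of NeMo's API):

def frames_to_segments(frame_file, threshold=0.5, shift=0.01):
    """Collapse per-frame speech probabilities into (start_sec, end_sec) speech segments."""
    probs = [float(line) for line in open(frame_file)]
    segments, start = [], None
    for idx, p in enumerate(probs):
        if p >= threshold and start is None:
            start = idx * shift  # speech onset
        elif p < threshold and start is not None:
            segments.append((start, idx * shift))
            start = None
    if start is not None:  # speech runs to the end of the file
        segments.append((start, len(probs) * shift))
    return segments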
Code Example #10
File: vad_infer.py  Project: wgfi110/NeMo
def main():
    parser = ArgumentParser()
    parser.add_argument("--vad_model",
                        type=str,
                        default="MatchboxNet-VAD-3x2",
                        required=False,
                        help="Pass: '******'")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help=
        "Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="vad_frame",
                        help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(
            restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(
            model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': [
                'infer',
            ],
            'num_workers': 20,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
        })

    vad_model = vad_model.to(device)
    vad_model.eval()

    data = []
    for line in open(args.dataset, 'r'):
        file = json.loads(line)['audio_filepath'].split("/")[-1]
        data.append(file.split(".wav")[0])
    print(f"Inference on {len(data)} audio files/json lines!")

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    for i, test_batch in enumerate(vad_model.test_dataloader()):
        if i == 0:
            status = 'start' if data[i] == data[i + 1] else 'single'
        elif i == len(data) - 1:
            status = 'end' if data[i] == data[i - 1] else 'single'
        else:
            if data[i] != data[i - 1] and data[i] == data[i + 1]:
                status = 'start'
            elif data[i] == data[i - 1] and data[i] == data[i + 1]:
                status = 'next'
            elif data[i] == data[i - 1] and data[i] != data[i + 1]:
                status = 'end'
            else:
                status = 'single'
        print(data[i], status)

        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0],
                                  input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]

            if status == 'start':
                to_save = pred[:-trunc]
            elif status == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred
            all_len += len(to_save)

            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))

        del test_batch
        if status == 'end' or status == 'single':
            print(f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0