Example #1
def main():
    args = get_args()
    if args.pretrained_name is None:
        model = PunctuationCapitalizationModel.restore_from(args.model_path)
    else:
        model = PunctuationCapitalizationModel.from_pretrained(
            args.pretrained_name)
    if args.input_manifest is None:
        texts = []
        with args.input_text.open() as f:
            for line in f:
                texts.append(line.strip())
    else:
        manifest = load_manifest(args.input_manifest)
        text_key = "pred_text" if "pred_text" in manifest[0] else "text"
        texts = []
        for item in manifest:
            texts.append(item[text_key])
    processed_texts = model.add_punctuation_capitalization(
        texts,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        step=args.step,
        margin=args.margin,
    )
    if args.output_manifest is None:
        with args.output_text.open('w') as f:
            for t in processed_texts:
                f.write(t + '\n')
    else:
        with args.output_manifest.open('w') as f:
            for item, t in zip(manifest, processed_texts):
                item[text_key] = t
                f.write(json.dumps(item) + '\n')
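For quick experiments, the model call that this script wraps can also be used directly. The following is a minimal sketch, assuming NeMo is installed and the NGC pretrained name punctuation_en_bert (used as a default elsewhere on this page) can be downloaded:

from nemo.collections.nlp.models import PunctuationCapitalizationModel

# Download a pretrained checkpoint from NGC and run it on a couple of lowercase queries.
model = PunctuationCapitalizationModel.from_pretrained("punctuation_en_bert")
queries = [
    'how are you',
    'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
]
results = model.add_punctuation_capitalization(queries, batch_size=len(queries))
for query, result in zip(queries, results):
    print(f'{query} -> {result}')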
Example #2
    def __init__(self, torch_device=None):
        if torch_device is None:
            if torch.cuda.is_available():
                torch_device = torch.device('cuda')
            else:
                torch_device = torch.device('cpu')

        self.file_config = path.join(WORK_DIR, _MODEL_CONFIG)
        self.file_checkpoints = path.join(WORK_DIR, _MODEL_WEIGHTS)

        model_config = OmegaConf.load(self.file_config)
        OmegaConf.set_struct(model_config, True)

        if isinstance(model_config, DictConfig):
            self.config = OmegaConf.to_container(model_config, resolve=True)
            self.config = OmegaConf.create(self.config)
            OmegaConf.set_struct(self.config, True)

        # PunctuationCapitalizationModel.super().__set_model_restore_state(_MODEL_IS_RESTORED)
        instance = PunctuationCapitalizationModel(cfg=self.config)

        self.model_instance = instance
        self.model_instance.to(torch_device)
        self.model_instance.load_state_dict(
            torch.load(self.file_checkpoints, map_location=torch_device),
            strict=False)
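A usage sketch for a wrapper built around this __init__; the class name Punctuator is a hypothetical placeholder (the original snippet does not show the class definition), and WORK_DIR is assumed to contain the unpacked model config and weights:

# Hypothetical driver code; `Punctuator` stands in for whatever class owns the __init__ above.
punctuator = Punctuator()          # picks CUDA automatically when it is available
punctuator.model_instance.eval()   # switch the restored model to inference mode
restored = punctuator.model_instance.add_punctuation_capitalization(
    ['what can i do for you today'])
print(restored[0])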
Example #3
def main():
    args = get_args()
    if args.pretrained_name is None:
        model = PunctuationCapitalizationModel.restore_from(args.model_path)
    else:
        model = PunctuationCapitalizationModel.from_pretrained(
            args.pretrained_name)
    if args.device is None:
        if torch.cuda.is_available():
            model = model.cuda()
        else:
            model = model.cpu()
    else:
        model = model.to(args.device)
    if args.input_manifest is None:
        texts = []
        with args.input_text.open() as f:
            for line in f:
                texts.append(line.strip())
    else:
        manifest = load_manifest(args.input_manifest)
        text_key = "pred_text" if "pred_text" in manifest[0] else "text"
        texts = []
        for item in manifest:
            texts.append(item[text_key])
    processed_texts = model.add_punctuation_capitalization(
        texts,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        step=args.step,
        margin=args.margin,
        return_labels=args.save_labels_instead_of_text,
    )
    if args.output_manifest is None:
        args.output_text.parent.mkdir(exist_ok=True, parents=True)
        with args.output_text.open('w') as f:
            for t in processed_texts:
                f.write(t + '\n')
    else:
        args.output_manifest.parent.mkdir(exist_ok=True, parents=True)
        with args.output_manifest.open('w') as f:
            for item, t in zip(manifest, processed_texts):
                item[text_key] = t
                f.write(json.dumps(item) + '\n')
Example #4
    def test_PunctuationCapitalization(self):
        # TODO: Switch to using named configs because here we don't really care about weights
        pn = PunctuationCapitalizationModel.from_pretrained(
            model_name='punctuation_en_distilbert')
        self.__test_restore_elsewhere(
            model=pn,
            attr_for_eq_check=set([
                "punct_classifier.log_softmax",
                "capit_classifier.log_softmax",
            ]))
Example #5
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU and '
        'no DDP to obtain accurate results')

    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config')
    gpu = 1 if cfg.trainer.gpus != 0 else 0

    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)

    if not cfg.pretrained_model:
        raise ValueError(
            'To run the evaluation and inference script a pre-trained model or .nemo file must be provided. '
            f'Choose from {PunctuationCapitalizationModel.list_available_models()} or "pretrained_model"="your_model.nemo"'
        )

    if os.path.exists(cfg.pretrained_model):
        model = PunctuationCapitalizationModel.restore_from(
            cfg.pretrained_model)
    elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(
    ):
        model = PunctuationCapitalizationModel.from_pretrained(
            cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide path to the pre-trained .nemo file or choose from {PunctuationCapitalizationModel.list_available_models()}'
        )

    data_dir = cfg.model.dataset.get('data_dir', None)

    if data_dir is None:
        logging.error(
            'No dataset directory provided. Skipping evaluation. '
            'To run evaluation on a file, specify path to the directory that contains test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.'
        )
    elif not os.path.exists(data_dir):
        logging.error(
            f'{data_dir} is not found, skipping evaluation on the test set.')
    else:
        model.update_data_dir(data_dir=data_dir)
        model._cfg.dataset = cfg.model.dataset

        if not hasattr(cfg.model, 'test_ds'):
            logging.error(
                'model.test_ds was not found in the config, skipping evaluation'
            )
        elif model.prepare_test(trainer):
            model.setup_test_data(cfg.model.test_ds)
            trainer.test(model)
        else:
            logging.error(
                'Skipping the evaluation. The trainer is not setup properly.')

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(
        queries, batch_size=len(queries), max_seq_length=512)

    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')

    logging.info(f'Results are saved at {exp_dir}')
Example #6
def get_args():
    default_model_parameter = "pretrained_name"
    default_model = "punctuation_en_bert"
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "The script restores punctuation and capitalization in text. Long strings are split into "
        "segments of length `--max_seq_length`. `--max_seq_length` is the length which includes [CLS] and [SEP] "
        "tokens. Parameter `--step` controls segment overlapping. `--step` is the distance between the beginnings of "
        "consecutive segments. Model outputs for tokens near the borders of tensors are less accurate and can be "
        "discarded before final predictions are computed. Parameter `--margin` is the number of discarded outputs near "
        "segment borders. Probabilities of tokens in overlapping parts of segments are multiplied before selecting the "
        "best prediction. Default values of the parameters `--max_seq_length`, `--step`, and `--margin` are optimal for "
        "the IWSLT 2019 test dataset.",
    )
    input_ = parser.add_mutually_exclusive_group(required=True)
    input_.add_argument(
        "--input_manifest",
        "-m",
        type=Path,
        help=
        "Path to a NeMo manifest file which needs punctuation and capitalization. If the first element "
        "of the manifest contains the key 'pred_text', 'pred_text' values are passed for tokenization. Otherwise 'text' "
        "values are passed for punctuation and capitalization. Exactly one of the parameters `--input_manifest` and "
        "`--input_text` should be provided.",
    )
    input_.add_argument(
        "--input_text",
        "-t",
        type=Path,
        help=
        "Path to a file with text which needs punctuation and capitalization. Exactly one of the parameters "
        "`--input_manifest` and `--input_text` should be provided.",
    )
    output = parser.add_mutually_exclusive_group(required=True)
    output.add_argument(
        "--output_manifest",
        "-M",
        type=Path,
        help=
        "Path to the output NeMo manifest. Text with restored punctuation and capitalization will be saved in "
        "'pred_text' elements if the 'pred_text' key is present in the input manifest. Otherwise text with restored "
        "punctuation and capitalization will be saved in 'text' elements. Exactly one of the parameters "
        "`--output_manifest` and `--output_text` should be provided.",
    )
    output.add_argument(
        "--output_text",
        "-T",
        type=Path,
        help=
        "Path to a file where text with restored punctuation and capitalization will be saved. Exactly one of the "
        "parameters `--output_manifest` and `--output_text` should be provided.",
    )
    model = parser.add_mutually_exclusive_group(required=False)
    model.add_argument(
        "--pretrained_name",
        "-p",
        help=
        f"The name of an NGC pretrained model. No more than one of the parameters `--pretrained_name` and "
        f"`--model_path` should be provided. If neither of the parameters `--pretrained_name` and `--model_path` is "
        f"provided, the script is run with `--{default_model_parameter}={default_model}`.",
        choices=[
            m.pretrained_model_name
            for m in PunctuationCapitalizationModel.list_available_models()
        ],
    )
    model.add_argument(
        "--model_path",
        "-P",
        type=Path,
        help=
        f"Path to a .nemo checkpoint of a punctuation and capitalization model. No more than one of the parameters "
        f"`--pretrained_name` and `--model_path` should be provided. If neither of the parameters `--pretrained_name` "
        f"and `--model_path` is provided, the script is run with `--{default_model_parameter}={default_model}`.",
    )
    parser.add_argument(
        "--max_seq_length",
        "-L",
        type=int,
        default=64,
        help=
        "Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] tokens.",
    )
    parser.add_argument(
        "--step",
        "-s",
        type=int,
        default=8,
        help=
        "Relative shift of consecutive segments into which long queries are split. Long queries are split into "
        "segments which can overlap. Parameter `step` controls such overlapping. Imagine that queries are "
        "tokenized into characters, `max_seq_length=5`, and `step=2`. In such a case the query 'hello' is tokenized "
        "into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.",
    )
    parser.add_argument(
        "--margin",
        "-g",
        type=int,
        default=16,
        help=
        "The number of subtokens at the beginning and the end of segments whose output probabilities are not used "
        "for prediction computation. The first segment does not have a left margin and the last segment does not have "
        "a right margin. For example, if the input sequence is tokenized into characters, `max_seq_length=5`, `step=1`, "
        "and `margin=1`, then the query 'hello' will be tokenized into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], "
        "['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments are passed to the "
        "model. Before final predictions are computed, margins are removed. In the next list, subtokens whose logits "
        "are not used for final prediction computation are marked with an asterisk: `[['[CLS]'*, 'h', 'e', 'l'*, "
        "'[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', 'o', '[SEP]'*]]`.",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=int,
        default=128,
        help="Number of segments which are processed simultaneously.",
    )
    parser.add_argument(
        "--save_labels_instead_of_text",
        "-B",
        action="store_true",
        help=
        "If this option is set, punctuation and capitalization labels are saved instead of text with restored "
        "punctuation and capitalization. Labels are saved in the format described here: https://docs.nvidia.com/deeplearning/nemo/"
        "user-guide/docs/en/main/nlp/punctuation_and_capitalization.html#nemo-data-format",
    )
    parser.add_argument(
        "--device",
        "-d",
        choices=['cpu', 'cuda'],
        help=
        "Which device to use. If the device is not set and CUDA is available, the GPU will be used. If the device is "
        "not set and CUDA is not available, the CPU is used.",
    )
    args = parser.parse_args()
    if args.input_manifest is None and args.output_manifest is not None:
        parser.error("--output_manifest requires --input_manifest")
    if args.pretrained_name is None and args.model_path is None:
        setattr(args, default_model_parameter, default_model)
    for name in [
            "input_manifest", "input_text", "output_manifest", "output_text",
            "model_path"
    ]:
        if getattr(args, name) is not None:
            setattr(args, name, getattr(args, name).expanduser())
    return args
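The splitting behaviour described by the --max_seq_length and --step help strings above can be reproduced with a few lines of plain Python. This is a toy illustration of the character-level example from the help text, not the NeMo implementation:

def split_into_segments(tokens, max_seq_length, step):
    # `max_seq_length` counts the [CLS] and [SEP] tokens, so a segment holds
    # max_seq_length - 2 content tokens; consecutive segments start `step` tokens apart.
    content = max_seq_length - 2
    starts = list(range(0, max(len(tokens) - content, 0) + 1, step))
    if starts[-1] + content < len(tokens):
        starts.append(len(tokens) - content)  # make sure the tail of the query is covered
    return [['[CLS]'] + tokens[s:s + content] + ['[SEP]'] for s in starts]

print(split_into_segments(list('hello'), max_seq_length=5, step=2))
# [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]
print(split_into_segments(list('hello'), max_seq_length=5, step=1))
# [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]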
Example #7
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            model = PunctuationCapitalizationModel.restore_from(
                cfg.pretrained_model)
        elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(
        ):
            model = PunctuationCapitalizationModel.from_pretrained(
                cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from {PunctuationCapitalizationModel.list_available_models()}'
            )

        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            if not os.path.exists(data_dir):
                raise ValueError(f'{data_dir} is not found')

            # we can also do finetuning of the pretrained model but we would need to update the data dir
            model.update_data_dir(data_dir)
            # setup train and validation Pytorch DataLoaders
            model.setup_training_data()
            model.setup_validation_data()
            logging.info(f'Using config file of the pretrained model')
        else:
            raise ValueError(
                'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file '
                'with "model.dataset.data_dir" argument')

    trainer.fit(model)
Example #8
def main(cfg: DictConfig) -> None:
    torch.manual_seed(42)
    cfg = OmegaConf.merge(
        OmegaConf.structured(PunctuationCapitalizationConfig()), cfg)
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    if not cfg.do_training and not cfg.do_testing:
        raise ValueError(
            "At least one of config parameters `do_training` and `do_testing` has to `true`."
        )
    if cfg.do_training:
        if cfg.model.get('train_ds') is None:
            raise ValueError(
                '`model.train_ds` config section is required if `do_training` config item is `True`.'
            )
    if cfg.do_testing:
        if cfg.model.get('test_ds') is None:
            raise ValueError(
                '`model.test_ds` config section is required if `do_testing` config item is `True`.'
            )

    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            model = PunctuationCapitalizationModel.restore_from(
                cfg.pretrained_model)
        elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names(
        ):
            model = PunctuationCapitalizationModel.from_pretrained(
                cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from '
                f'{PunctuationCapitalizationModel.list_available_models()}')
        model.update_config_after_restoring_from_checkpoint(
            class_labels=cfg.model.class_labels,
            common_dataset_parameters=cfg.model.common_dataset_parameters,
            train_ds=cfg.model.get('train_ds') if cfg.do_training else None,
            validation_ds=cfg.model.get('validation_ds')
            if cfg.do_training else None,
            test_ds=cfg.model.get('test_ds') if cfg.do_testing else None,
            optim=cfg.model.get('optim') if cfg.do_training else None,
        )
        model.set_trainer(trainer)
        if cfg.do_training:
            model.setup_training_data()
            model.setup_validation_data()
            model.setup_optimization()
        else:
            model.setup_test_data()
    if cfg.do_training:
        trainer.fit(model)
    if cfg.do_testing:
        trainer.test(model)
Example #9
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            model.update_data_dir(data_dir)
            model.setup_training_data()
            model.setup_validation_data()
            logging.info(f'Using config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                f'Data dir should be specified for training/finetuning. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )

    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)

    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results'
    )
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries)

    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
Example #10
import time
from nemo.collections.nlp.models import PunctuationCapitalizationModel
import argparse
import sys
import torch

# to get the list of pre-trained models
# PunctuationCapitalizationModel.list_available_models()

# Read text sentences from input file
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model", "-m", type=str, required=False, default="punctuation_en_distilbert", help="Path to the model",
    choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()]
)
parser.add_argument(
    "--inpath", "-i", type=str, required=True, help="Input Path"
)
parser.add_argument(
    "--outpath", "-o", type=str, required=True, help="Output Path"
)
parser.add_argument(
    "--batch_size", "-b", type=int, default=128, help="Number of segments which are processed simultaneously.",
)
parser.add_argument(
    "--device",
    "-d",
    choices=['cpu', 'cuda'],
    help="Which device to use. If device is not set and CUDA is available, then GPU will be used. If device is "
    "not set and CUDA is not available, then CPU is used.",
)
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        # TODO: Remove strict, when lightning has persistent parameter support for add_state()
        model = PunctuationCapitalizationModel.from_pretrained(
            cfg.pretrained_model, strict=False)
        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            # we can also do finetuning of the pretrained model but it will require
            # setting up train and validation Pytorch DataLoaders
            model.setup_training_data(data_dir=data_dir)
            # evaluation could be done on multiple files, use model.validation_ds.ds_items to specify multiple
            # data directories if needed
            model.setup_validation_data(data_dirs=data_dir)
            logging.info(f'Using config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                f'Data dir should be specified for training/finetuning. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )

    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)

    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results')
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries)

    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
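Example #10 defines the argument parser, but the main() shown after it belongs to a Hydra-based training script, so the inference loop that consumes the parsed arguments is not shown. Below is a minimal sketch of how those arguments might be used; the file handling is an assumption, not part of the original script:

# Hypothetical continuation of the argparse-based script above.
args = parser.parse_args()

model = PunctuationCapitalizationModel.from_pretrained(args.model)
if args.device == 'cuda' or (args.device is None and torch.cuda.is_available()):
    model = model.cuda()
else:
    model = model.cpu()

with open(args.inpath) as f:
    texts = [line.strip() for line in f if line.strip()]

start = time.time()
results = model.add_punctuation_capitalization(texts, batch_size=args.batch_size)
print(f'Processed {len(texts)} lines in {time.time() - start:.2f} s', file=sys.stderr)

with open(args.outpath, 'w') as f:
    for line in results:
        f.write(line + '\n')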