def main():
    args = get_args()
    if args.pretrained_name is None:
        model = PunctuationCapitalizationModel.restore_from(args.model_path)
    else:
        model = PunctuationCapitalizationModel.from_pretrained(args.pretrained_name)
    if args.input_manifest is None:
        texts = []
        with args.input_text.open() as f:
            # Read every line of the input file, not just the first one.
            for line in f:
                texts.append(line.strip())
    else:
        manifest = load_manifest(args.input_manifest)
        text_key = "pred_text" if "pred_text" in manifest[0] else "text"
        texts = []
        for item in manifest:
            texts.append(item[text_key])
    processed_texts = model.add_punctuation_capitalization(
        texts,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        step=args.step,
        margin=args.margin,
    )
    if args.output_manifest is None:
        with args.output_text.open('w') as f:
            for t in processed_texts:
                f.write(t + '\n')
    else:
        with args.output_manifest.open('w') as f:
            for item, t in zip(manifest, processed_texts):
                item[text_key] = t
                f.write(json.dumps(item) + '\n')
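# `load_manifest` is called above but not shown in this snippet. A minimal sketch,
# assuming the usual NeMo manifest layout of one JSON object per line:
def load_manifest(manifest_path):
    items = []
    with manifest_path.open() as f:
        for line in f:
            line = line.strip()
            if line:
                items.append(json.loads(line))
    return items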
def __init__(self, torch_device=None):
    if torch_device is None:
        if torch.cuda.is_available():
            torch_device = torch.device('cuda')
        else:
            torch_device = torch.device('cpu')
    self.file_config = path.join(WORK_DIR, _MODEL_CONFIG)
    self.file_checkpoints = path.join(WORK_DIR, _MODEL_WEIGHTS)
    model_config = OmegaConf.load(self.file_config)
    OmegaConf.set_struct(model_config, True)
    if isinstance(model_config, DictConfig):
        self.config = OmegaConf.to_container(model_config, resolve=True)
        self.config = OmegaConf.create(self.config)
        OmegaConf.set_struct(self.config, True)
    # PunctuationCapitalizationModel.super().__set_model_restore_state(_MODEL_IS_RESTORED)
    instance = PunctuationCapitalizationModel(cfg=self.config)
    self.model_instance = instance
    self.model_instance.to(torch_device)
    # Load the checkpoint onto the selected device; strict=False tolerates missing or unexpected keys.
    self.model_instance.load_state_dict(
        torch.load(self.file_checkpoints, map_location=torch_device), strict=False
    )
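# A minimal usage sketch of the wrapper above. `RestoredPunctuator` is a hypothetical
# name for the class that owns this __init__; WORK_DIR, _MODEL_CONFIG, and _MODEL_WEIGHTS
# are assumed to point at the contents of an unpacked .nemo archive.
#
#   punctuator = RestoredPunctuator()  # picks CUDA if available, else CPU
#   results = punctuator.model_instance.add_punctuation_capitalization(['how are you'])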
def main():
    args = get_args()
    if args.pretrained_name is None:
        model = PunctuationCapitalizationModel.restore_from(args.model_path)
    else:
        model = PunctuationCapitalizationModel.from_pretrained(args.pretrained_name)
    if args.device is None:
        if torch.cuda.is_available():
            model = model.cuda()
        else:
            model = model.cpu()
    else:
        # Respect an explicitly requested device.
        model = model.to(args.device)
    if args.input_manifest is None:
        texts = []
        with args.input_text.open() as f:
            for line in f:
                texts.append(line.strip())
    else:
        manifest = load_manifest(args.input_manifest)
        text_key = "pred_text" if "pred_text" in manifest[0] else "text"
        texts = []
        for item in manifest:
            texts.append(item[text_key])
    processed_texts = model.add_punctuation_capitalization(
        texts,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        step=args.step,
        margin=args.margin,
        return_labels=args.save_labels_instead_of_text,
    )
    if args.output_manifest is None:
        args.output_text.parent.mkdir(exist_ok=True, parents=True)
        with args.output_text.open('w') as f:
            for t in processed_texts:
                f.write(t + '\n')
    else:
        args.output_manifest.parent.mkdir(exist_ok=True, parents=True)
        with args.output_manifest.open('w') as f:
            for item, t in zip(manifest, processed_texts):
                item[text_key] = t
                f.write(json.dumps(item) + '\n')
def test_PunctuationCapitalization(self):
    # TODO: Switch to using named configs because here we don't really care about weights
    pn = PunctuationCapitalizationModel.from_pretrained(model_name='punctuation_en_distilbert')
    # Check one attribute per classification head: the model has both a punctuation
    # head and a capitalization head.
    self.__test_restore_elsewhere(
        model=pn,
        attr_for_eq_check=set(["punct_classifier.log_softmax", "capit_classifier.log_softmax"]),
    )
def main(cfg: DictConfig) -> None:
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer '
        'with single GPU and no DDP to obtain accurate results'
    )
    if not hasattr(cfg.model, 'test_ds'):
        raise ValueError('model.test_ds was not found in the config')
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(
        gpus=gpu,
        precision=cfg.trainer.precision,
        amp_level=cfg.trainer.amp_level,
        logger=False,
        checkpoint_callback=False,
    )
    exp_dir = exp_manager(trainer, cfg.exp_manager)
    if not cfg.pretrained_model:
        raise ValueError(
            'To run evaluation and inference script a pre-trained model or .nemo file must be provided. '
            f'Choose from {PunctuationCapitalizationModel.list_available_models()} or '
            '"pretrained_model"="your_model.nemo"'
        )
    if os.path.exists(cfg.pretrained_model):
        model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model)
    elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names():
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
    else:
        raise ValueError(
            f'Provide path to the pre-trained .nemo file or choose from '
            f'{PunctuationCapitalizationModel.list_available_models()}'
        )
    data_dir = cfg.model.dataset.get('data_dir', None)
    if data_dir is None:
        logging.error(
            'No dataset directory provided. Skipping evaluation. '
            'To run evaluation on a file, specify path to the directory that contains '
            'test_ds.text_file and test_ds.labels_file with "model.dataset.data_dir" argument.'
        )
    elif not os.path.exists(data_dir):
        logging.error(f'{data_dir} is not found, skipping evaluation on the test set.')
    else:
        model.update_data_dir(data_dir=data_dir)
        model._cfg.dataset = cfg.model.dataset
        if not hasattr(cfg.model, 'test_ds'):
            logging.error('model.test_ds was not found in the config, skipping evaluation')
        elif model.prepare_test(trainer):
            model.setup_test_data(cfg.model.test_ds)
            trainer.test(model)
        else:
            logging.error('Skipping the evaluation. The trainer is not set up properly.')

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries, batch_size=len(queries), max_seq_length=512)
    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
    logging.info(f'Results are saved at {exp_dir}')
def get_args():
    default_model_parameter = "pretrained_name"
    default_model = "punctuation_en_bert"
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="The script is for restoring punctuation and capitalization in text. Long strings are split "
        "into segments of length `--max_seq_length`. `--max_seq_length` is the length which includes [CLS] and "
        "[SEP] tokens. Parameter `--step` controls segments overlapping. `--step` is a distance between "
        "beginnings of consequent segments. Model outputs for tokens near the borders of tensors are less "
        "accurate and can be discarded before final predictions computation. Parameter `--margin` is the number "
        "of discarded outputs near segments borders. Probabilities of tokens in overlapping parts of segments "
        "are multiplied before selecting the best prediction. Default values of parameters `--max_seq_length`, "
        "`--step`, and `--margin` are optimal for IWSLT 2019 test dataset.",
    )
    input_ = parser.add_mutually_exclusive_group(required=True)
    input_.add_argument(
        "--input_manifest",
        "-m",
        type=Path,
        help="Path to the file with NeMo manifest which needs punctuation and capitalization. If the first "
        "element of manifest contains key 'pred_text', 'pred_text' values are passed for tokenization. "
        "Otherwise 'text' values are passed for punctuation and capitalization. Exactly one parameter of "
        "`--input_manifest` and `--input_text` should be provided.",
    )
    input_.add_argument(
        "--input_text",
        "-t",
        type=Path,
        help="Path to file with text which needs punctuation and capitalization. Exactly one parameter of "
        "`--input_manifest` and `--input_text` should be provided.",
    )
    output = parser.add_mutually_exclusive_group(required=True)
    output.add_argument(
        "--output_manifest",
        "-M",
        type=Path,
        help="Path to output NeMo manifest. Text with restored punctuation and capitalization will be saved in "
        "'pred_text' elements if 'pred_text' key is present in the input manifest. Otherwise text with restored "
        "punctuation and capitalization will be saved in 'text' elements. Exactly one parameter of "
        "`--output_manifest` and `--output_text` should be provided.",
    )
    output.add_argument(
        "--output_text",
        "-T",
        type=Path,
        help="Path to file with text with restored punctuation and capitalization. Exactly one parameter of "
        "`--output_manifest` and `--output_text` should be provided.",
    )
    model = parser.add_mutually_exclusive_group(required=False)
    model.add_argument(
        "--pretrained_name",
        "-p",
        help=f"The name of NGC pretrained model. No more than one of parameters `--pretrained_name` and "
        f"`--model_path` should be provided. If neither of parameters `--pretrained_name` and `--model_path` "
        f"are provided, then the script is run with `--{default_model_parameter}={default_model}`.",
        choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()],
    )
    model.add_argument(
        "--model_path",
        "-P",
        type=Path,
        help=f"Path to .nemo checkpoint of punctuation and capitalization model. No more than one of parameters "
        f"`--pretrained_name` and `--model_path` should be provided. If neither of parameters "
        f"`--pretrained_name` and `--model_path` are provided, then the script is run with "
        f"`--{default_model_parameter}={default_model}`.",
    )
    parser.add_argument(
        "--max_seq_length",
        "-L",
        type=int,
        default=64,
        help="Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] "
        "tokens.",
    )
    parser.add_argument(
        "--step",
        "-s",
        type=int,
        default=8,
        help="Relative shift of consequent segments into which long queries are split. Long queries are split "
        "into segments which can overlap. Parameter `step` controls such overlapping. Imagine that queries are "
        "tokenized into characters, `max_seq_length=5`, and `step=2`. In such a case query 'hello' is tokenized "
        "into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.",
    )
    parser.add_argument(
        "--margin",
        "-g",
        type=int,
        default=16,
        help="A number of subtokens in the beginning and the end of segments whose output probabilities are not "
        "used for prediction computation. The first segment does not have left margin and the last segment does "
        "not have right margin. For example, if input sequence is tokenized into characters, `max_seq_length=5`, "
        "`step=1`, and `margin=1`, then query 'hello' will be tokenized into segments `[['[CLS]', 'h', 'e', "
        "'l', '[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments "
        "are passed to the model. Before final predictions computation, margins are removed. In the next list, "
        "subtokens whose logits are not used for final predictions computation are marked with an asterisk: "
        "`[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', "
        "'o', '[SEP]'*]]`.",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=int,
        default=128,
        help="Number of segments which are processed simultaneously.",
    )
    parser.add_argument(
        "--save_labels_instead_of_text",
        "-B",
        action="store_true",
        help="If this option is set, save punctuation and capitalization labels instead of text with restored "
        "punctuation and capitalization. Labels are saved in the format described here "
        "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/"
        "punctuation_and_capitalization.html#nemo-data-format",
    )
    parser.add_argument(
        "--device",
        "-d",
        choices=['cpu', 'cuda'],
        help="Which device to use. If device is not set and CUDA is available, GPU will be used. If device is "
        "not set and CUDA is not available, CPU is used.",
    )
    args = parser.parse_args()
    if args.input_manifest is None and args.output_manifest is not None:
        parser.error("--output_manifest requires --input_manifest")
    if args.pretrained_name is None and args.model_path is None:
        setattr(args, default_model_parameter, default_model)
    for name in ["input_manifest", "input_text", "output_manifest", "output_text", "model_path"]:
        if getattr(args, name) is not None:
            setattr(args, name, getattr(args, name).expanduser())
    return args
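# The `--step` and `--margin` help strings above describe how long queries are windowed
# into overlapping segments. A minimal standalone sketch of that windowing, for
# illustration only (`split_into_segments` is a hypothetical helper, not the model's
# internal tokenization):
def split_into_segments(tokens, max_seq_length=5, step=2):
    body = max_seq_length - 2  # two positions are reserved for [CLS] and [SEP]
    segments = []
    start = 0
    while True:
        segments.append(['[CLS]'] + tokens[start:start + body] + ['[SEP]'])
        if start + body >= len(tokens):
            break
        start += step
    return segments

# split_into_segments(list('hello')) reproduces the example in the `--step` help text:
# [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]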
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model)
        elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names():
            model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from '
                f'{PunctuationCapitalizationModel.list_available_models()}'
            )
        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            if not os.path.exists(data_dir):
                raise ValueError(f'{data_dir} is not found')
            # We can also do finetuning of the pretrained model, but we would need to update the data dir.
            model.update_data_dir(data_dir)
            # Set up train and validation PyTorch DataLoaders.
            model.setup_training_data()
            model.setup_validation_data()
            logging.info('Using config file of the pretrained model')
        else:
            raise ValueError(
                'Specify a valid dataset directory that contains test_ds.text_file and test_ds.labels_file '
                'with "model.dataset.data_dir" argument'
            )
    trainer.fit(model)
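# These training entry points are normally launched through Hydra from the command line.
# A hedged example; the script name and override values below are illustrative, not taken
# from this snippet:
#
#   python punctuation_capitalization_train.py \
#       model.dataset.data_dir=/path/to/data \
#       trainer.gpus=1 \
#       pretrained_model=punctuation_en_bert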
def main(cfg: DictConfig) -> None:
    torch.manual_seed(42)
    # Fill in any missing fields with defaults from the structured dataclass config.
    cfg = OmegaConf.merge(OmegaConf.structured(PunctuationCapitalizationConfig()), cfg)
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    if not cfg.do_training and not cfg.do_testing:
        raise ValueError("At least one of config parameters `do_training` and `do_testing` has to be `true`.")
    if cfg.do_training:
        if cfg.model.get('train_ds') is None:
            raise ValueError('`model.train_ds` config section is required if `do_training` config item is `True`.')
    if cfg.do_testing:
        if cfg.model.get('test_ds') is None:
            raise ValueError('`model.test_ds` config section is required if `do_testing` config item is `True`.')
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        if os.path.exists(cfg.pretrained_model):
            model = PunctuationCapitalizationModel.restore_from(cfg.pretrained_model)
        elif cfg.pretrained_model in PunctuationCapitalizationModel.get_available_model_names():
            model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
        else:
            raise ValueError(
                f'Provide path to the pre-trained .nemo file or choose from '
                f'{PunctuationCapitalizationModel.list_available_models()}'
            )
        model.update_config_after_restoring_from_checkpoint(
            class_labels=cfg.model.class_labels,
            common_dataset_parameters=cfg.model.common_dataset_parameters,
            train_ds=cfg.model.get('train_ds') if cfg.do_training else None,
            validation_ds=cfg.model.get('validation_ds') if cfg.do_training else None,
            test_ds=cfg.model.get('test_ds') if cfg.do_testing else None,
            optim=cfg.model.get('optim') if cfg.do_training else None,
        )
        model.set_trainer(trainer)
        if cfg.do_training:
            model.setup_training_data()
            model.setup_validation_data()
            model.setup_optimization()
        else:
            model.setup_test_data()
    if cfg.do_training:
        trainer.fit(model)
    if cfg.do_testing:
        trainer.test(model)
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model)
        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            model.update_data_dir(data_dir)
            model.setup_training_data()
            model.setup_validation_data()
            logging.info('Using config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                f'Data dir should be specified for training/finetuning. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )
    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results'
    )
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries)
    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')
import argparse
import sys
import time

import torch

from nemo.collections.nlp.models import PunctuationCapitalizationModel

# To get the list of pre-trained models:
# PunctuationCapitalizationModel.list_available_models()

# Read text sentences from input file
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model",
    "-m",
    type=str,
    required=False,
    default="punctuation_en_distilbert",
    help="Name of the pretrained model",
    choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()],
)
parser.add_argument(
    "--inpath", "-i", type=str, required=True, help="Input path"
)
parser.add_argument(
    "--outpath", "-o", type=str, required=True, help="Output path"
)
parser.add_argument(
    "--batch_size",
    "-b",
    type=int,
    default=128,
    help="Number of segments which are processed simultaneously.",
)
parser.add_argument(
    "--device",
    "-d",
    choices=['cpu', 'cuda'],
    help="Which device to use. If device is not set and CUDA is available, then GPU will be used. If device is "
    "not set and CUDA is not available, then CPU is used.",
)
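# The snippet above stops after the argument definitions. A minimal sketch of how these
# arguments might be consumed downstream (an assumed continuation, not the original script):
args = parser.parse_args()
model = PunctuationCapitalizationModel.from_pretrained(args.model)
if args.device == 'cuda' or (args.device is None and torch.cuda.is_available()):
    model = model.cuda()
else:
    model = model.cpu()
with open(args.inpath) as f:
    lines = [line.strip() for line in f if line.strip()]
start = time.time()
results = model.add_punctuation_capitalization(lines, batch_size=args.batch_size)
print(f"Processed {len(lines)} lines in {time.time() - start:.2f} s", file=sys.stderr)
with open(args.outpath, 'w') as f:
    for r in results:
        f.write(r + '\n')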
def main(cfg: DictConfig) -> None:
    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    do_training = True
    if not cfg.pretrained_model:
        logging.info(f'Config: {OmegaConf.to_yaml(cfg)}')
        model = PunctuationCapitalizationModel(cfg.model, trainer=trainer)
    else:
        logging.info(f'Loading pretrained model {cfg.pretrained_model}')
        # TODO: Remove strict, when lightning has persistent parameter support for add_state()
        model = PunctuationCapitalizationModel.from_pretrained(cfg.pretrained_model, strict=False)
        data_dir = cfg.model.dataset.get('data_dir', None)
        if data_dir:
            # We can also do finetuning of the pretrained model, but it will require
            # setting up train and validation PyTorch DataLoaders.
            model.setup_training_data(data_dir=data_dir)
            # Evaluation could be done on multiple files; use model.validation_ds.ds_items
            # to specify multiple data directories if needed.
            model.setup_validation_data(data_dirs=data_dir)
            logging.info('Using config file of the pretrained model')
        else:
            do_training = False
            logging.info(
                f'Data dir should be specified for training/finetuning. '
                f'Using pretrained {cfg.pretrained_model} model weights and skipping finetuning.'
            )
    if do_training:
        trainer.fit(model)
        if cfg.model.nemo_path:
            model.save_to(cfg.model.nemo_path)
    logging.info(
        'During evaluation/testing, it is currently advisable to construct a new Trainer with single GPU '
        'and no DDP to obtain accurate results'
    )
    gpu = 1 if cfg.trainer.gpus != 0 else 0
    trainer = pl.Trainer(gpus=gpu)
    model.set_trainer(trainer)

    # run an inference on a few examples
    queries = [
        'we bought four shirts one pen and a mug from the nvidia gear store in santa clara',
        'what can i do for you today',
        'how are you',
    ]
    inference_results = model.add_punctuation_capitalization(queries)
    for query, result in zip(queries, inference_results):
        logging.info(f'Query : {query}')
        logging.info(f'Result: {result.strip()}\n')