Example #1

import argparse
from pathlib import Path

from nemo.collections.nlp.models import PunctuationCapitalizationModel


def get_args():
    default_model_parameter = "pretrained_name"
    default_model = "punctuation_en_bert"
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "The script restores punctuation and capitalization in text. Long strings are split into segments of "
        "length `--max_seq_length`; this length includes the [CLS] and [SEP] tokens. Parameter `--step` controls "
        "segment overlap: it is the distance between the beginnings of consecutive segments. Model outputs for "
        "tokens near segment borders are less accurate and can be discarded before final predictions are "
        "computed. Parameter `--margin` is the number of discarded outputs near segment borders. Probabilities "
        "of tokens in overlapping parts of segments are multiplied before the best prediction is selected. "
        "Default values of `--max_seq_length`, `--step`, and `--margin` are optimal for the IWSLT 2019 test "
        "dataset.",
    )
    input_ = parser.add_mutually_exclusive_group(required=True)
    input_.add_argument(
        "--input_manifest",
        "-m",
        type=Path,
        help=
        "Path to a NeMo manifest file which needs punctuation and capitalization. If the first element of the "
        "manifest contains the key 'pred_text', then 'pred_text' values are processed. Otherwise 'text' values "
        "are processed. Exactly one of `--input_manifest` and `--input_text` should be provided.",
    )
    input_.add_argument(
        "--input_text",
        "-t",
        type=Path,
        help=
        "Path to a file with text which needs punctuation and capitalization. Exactly one of `--input_manifest` "
        "and `--input_text` should be provided.",
    )
    output = parser.add_mutually_exclusive_group(required=True)
    output.add_argument(
        "--output_manifest",
        "-M",
        type=Path,
        help=
        "Path to the output NeMo manifest. Text with restored punctuation and capitalization is saved in "
        "'pred_text' elements if the 'pred_text' key is present in the input manifest; otherwise it is saved in "
        "'text' elements. Exactly one of `--output_manifest` and `--output_text` should be provided.",
    )
    output.add_argument(
        "--output_text",
        "-T",
        type=Path,
        help=
        "Path to a file where text with restored punctuation and capitalization is saved. Exactly one of "
        "`--output_manifest` and `--output_text` should be provided.",
    )
    model = parser.add_mutually_exclusive_group(required=False)
    model.add_argument(
        "--pretrained_name",
        "-p",
        help=
        f"The name of an NGC pretrained model. No more than one of `--pretrained_name` and `--model_path` "
        f"should be provided. If neither `--pretrained_name` nor `--model_path` is provided, then the script "
        f"runs with `--{default_model_parameter}={default_model}`.",
        choices=[
            m.pretrained_model_name
            for m in PunctuationCapitalizationModel.list_available_models()
        ],
    )
    model.add_argument(
        "--model_path",
        "-P",
        type=Path,
        help=
        f"Path to a .nemo checkpoint of a punctuation and capitalization model. No more than one of "
        f"`--pretrained_name` and `--model_path` should be provided. If neither `--pretrained_name` nor "
        f"`--model_path` is provided, then the script runs with `--{default_model_parameter}={default_model}`.",
    )
    parser.add_argument(
        "--max_seq_length",
        "-L",
        type=int,
        default=64,
        help=
        "Length of segments into which queries are split. `--max_seq_length` includes [CLS] and [SEP] tokens.",
    )
    parser.add_argument(
        "--step",
        "-s",
        type=int,
        default=8,
        help=
        "Shift between the beginnings of consecutive segments into which long queries are split. Long queries "
        "are split into segments which can overlap, and `--step` controls the amount of overlap. For example, if "
        "queries are tokenized into characters, `max_seq_length=5`, and `step=2`, then the query 'hello' is "
        "split into segments `[['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`.",
    )
    parser.add_argument(
        "--margin",
        "-g",
        type=int,
        default=16,
        help=
        "The number of subtokens at the beginning and end of each segment whose output probabilities are not "
        "used for prediction computation. The first segment has no left margin and the last segment has no "
        "right margin. For example, if the input sequence is tokenized into characters, `max_seq_length=5`, "
        "`step=1`, and `margin=1`, then the query 'hello' is split into segments `[['[CLS]', 'h', 'e', 'l', "
        "'[SEP]'], ['[CLS]', 'e', 'l', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]`. These segments are "
        "passed to the model. Before final predictions are computed, the margins are removed. In the following "
        "list, subtokens whose logits are not used for final predictions are marked with an asterisk: "
        "`[['[CLS]'*, 'h', 'e', 'l'*, '[SEP]'*], ['[CLS]'*, 'e'*, 'l', 'l'*, '[SEP]'*], ['[CLS]'*, 'l'*, 'l', "
        "'o', '[SEP]'*]]`.",
    )
    parser.add_argument(
        "--batch_size",
        "-b",
        type=int,
        default=128,
        help="Number of segments which are processed simultaneously.",
    )
    parser.add_argument(
        "--save_labels_instead_of_text",
        "-B",
        action="store_true",
        help=
        "If this option is set, punctuation and capitalization labels are saved instead of text with restored "
        "punctuation and capitalization. Labels are saved in the format described at "
        "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/nlp/"
        "punctuation_and_capitalization.html#nemo-data-format",
    )
    parser.add_argument(
        "--device",
        "-d",
        choices=['cpu', 'cuda'],
        help=
        "Which device to use. If the device is not set, the GPU is used when CUDA is available; otherwise the "
        "CPU is used.",
    )
    args = parser.parse_args()
    if args.input_manifest is None and args.output_manifest is not None:
        parser.error("--output_manifest requires --input_manifest")
    if args.pretrained_name is None and args.model_path is None:
        setattr(args, default_model_parameter, default_model)
    for name in [
            "input_manifest", "input_text", "output_manifest", "output_text",
            "model_path"
    ]:
        if getattr(args, name) is not None:
            setattr(args, name, getattr(args, name).expanduser())
    return args
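
The `--step` and `--margin` help texts above describe how long queries are split into overlapping segments. Below is a minimal sketch of that splitting, using character-level tokens purely for illustration; `split_into_segments` is a hypothetical helper, and the real script delegates this work to NeMo internals.

def split_into_segments(tokens, max_seq_length=5, step=2):
    # Each segment holds max_seq_length - 2 content tokens plus [CLS] and [SEP].
    content = max_seq_length - 2
    segments = []
    for start in range(0, len(tokens) - content + 1, step):
        segments.append(["[CLS]"] + tokens[start:start + content] + ["[SEP]"])
    return segments

# Reproduces the example from the `--step` help text:
print(split_into_segments(list("hello"), max_seq_length=5, step=2))
# [['[CLS]', 'h', 'e', 'l', '[SEP]'], ['[CLS]', 'l', 'l', 'o', '[SEP]']]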
Example #2
import argparse
import sys
import time

import torch

from nemo.collections.nlp.models import PunctuationCapitalizationModel

# To get the list of pre-trained models:
# PunctuationCapitalizationModel.list_available_models()

# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    "--model", "-m", type=str, required=False, default="punctuation_en_distilbert",
    help="Name of the pretrained model to use.",
    choices=[m.pretrained_model_name for m in PunctuationCapitalizationModel.list_available_models()]
)
parser.add_argument(
    "--inpath", "-i", type=str, required=True, help="Path to the input text file."
)
parser.add_argument(
    "--outpath", "-o", type=str, required=True, help="Path to the output text file."
)
parser.add_argument(
    "--batch_size", "-b", type=int, default=128, help="Number of segments which are processed simultaneously.",
)
parser.add_argument(
    "--device",
    "-d",
    choices=['cpu', 'cuda'],
    help="Which device to use. If device is not set and CUDA is available, then GPU will be used. If device is "
    "not set and CUDA is not available, then CPU is used.",
)
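
The snippet above ends before the parsed arguments are used. Below is a plausible continuation, a sketch that assumes the standard NeMo 1.x inference API (`PunctuationCapitalizationModel.from_pretrained` and `add_punctuation_capitalization`); the file handling and timing are illustrative, not the original author's code.

args = parser.parse_args()

# Prefer the device chosen on the command line; otherwise use CUDA when available.
device = args.device or ("cuda" if torch.cuda.is_available() else "cpu")

# Download (or load from cache) the pretrained model and move it to the device.
model = PunctuationCapitalizationModel.from_pretrained(args.model)
model = model.to(device)

# Read input sentences, one per line, skipping empty lines.
with open(args.inpath, "r", encoding="utf-8") as f:
    queries = [line.strip() for line in f if line.strip()]

start = time.time()
results = model.add_punctuation_capitalization(queries, batch_size=args.batch_size)
print(f"Processed {len(queries)} lines in {time.time() - start:.2f} s", file=sys.stderr)

# Write the restored text, one sentence per line.
with open(args.outpath, "w", encoding="utf-8") as f:
    for line in results:
        f.write(line + "\n")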