Code example #1
File: huggingface_utils.py Project: quuhua911/NeMo
def get_huggingface_pretrained_lm_models_list(include_external: bool = False) -> List[str]:
    """
    Returns the list of pretrained HuggingFace language models

    Args:
        include_external: If True, include all HuggingFace model names, not only the language models supported in NeMo.

    Returns the list of HuggingFace models
    """

    huggingface_models = []
    if include_external:
        huggingface_models = list(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())
    else:
        for model in HUGGINGFACE_MODELS:
            model_names = HUGGINGFACE_MODELS[model]["pretrained_model_list"]
            huggingface_models.extend(model_names)
    return huggingface_models
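
A minimal usage sketch for the function above (the counts printed depend on the installed NeMo and transformers versions):

# NeMo-supported language models only, then the full HuggingFace catalogue.
nemo_models = get_huggingface_pretrained_lm_models_list()
all_models = get_huggingface_pretrained_lm_models_list(include_external=True)
print(f"{len(nemo_models)} NeMo-supported, {len(all_models)} total")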
Code example #2
import logging
import random

import numpy as np
import torch

import ray
from ray.util.sgd.torch import TrainingOperator
from ray.util.sgd import TorchTrainer
from ray.util.sgd.torch.examples.transformers.utils import (
    evaluate, load_and_cache_examples, save_and_evaluate_checkpoints)
from transformers import (ALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                          MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING)

# Mixed-precision training via apex is optional; fall back gracefully if
# it is not installed.
try:
    from apex import amp
except ImportError:
    amp = None

MODEL_CONFIG_CLASSES = list(MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# Flatten the archive-map keys into one tuple, keeping only checkpoint
# names that start with a supported model type (e.g. "bert", "roberta").
ALL_MODELS = sum(
    (tuple(key for key in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()
           if key.startswith(conf.model_type))
     for conf in MODEL_CONFIG_CLASSES),
    (),
)

logger = logging.getLogger(__name__)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
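
A short usage sketch for set_seed, assuming an argparse-style namespace with a seed attribute (the seed value is arbitrary):

import argparse

args = argparse.Namespace(seed=42)
set_seed(args)  # random, numpy and torch now produce reproducible draws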

Code example #3
File: tokit.py Project: binshengliu/irtools
def get_all_models():
    return list(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())
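
As a hedged illustration, the returned checkpoint names can be filtered by prefix (the "bert" prefix here is just an example):

bert_checkpoints = [m for m in get_all_models() if m.startswith("bert")]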
Code example #4
File: dm.py Project: georgepar/slp
class PLDataModuleFromCorpus(PLDataModuleFromDatasets):
    accepted_tokenizers: List[str] = ["tokenized", "spacy"] + list(
        ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys())

    def __init__(
        self,
        train: List,
        train_labels: Optional[List] = None,
        val: Optional[List] = None,
        val_labels: Optional[List] = None,
        test: Optional[List] = None,
        test_labels: Optional[List] = None,
        val_percent: float = 0.2,
        test_percent: float = 0.2,
        batch_size: int = 64,
        batch_size_eval: Optional[int] = None,
        seed: Optional[int] = None,
        num_workers: int = 1,
        pin_memory: bool = True,
        drop_last: bool = False,
        shuffle_eval: bool = False,
        sampler_train: Optional[Sampler] = None,
        sampler_val: Optional[Sampler] = None,
        sampler_test: Optional[Sampler] = None,
        batch_sampler_train: Optional[BatchSampler] = None,
        batch_sampler_val: Optional[BatchSampler] = None,
        batch_sampler_test: Optional[BatchSampler] = None,
        collate_fn: Optional[Callable[..., Any]] = None,
        language_model: bool = False,
        tokenizer: str = "spacy",
        no_test_set: bool = False,
        **corpus_args,
    ):
        """Wrap raw corpus in a LightningDataModule

        * This handles the selection of the appropriate corpus class based on the tokenizer argument.
        * If language_model=True it uses the appropriate dataset from slp.data.datasets.
        * Uses PLDataModuleFromDatasets to split off validation and test sets when they are not provided

        Args:
            train (List): Raw train corpus
            train_labels (Optional[List]): Train labels. Defaults to None.
            val (Optional[List]): Raw validation corpus. Defaults to None.
            val_labels (Optional[List]): Validation labels. Defaults to None.
            test (Optional[List]): Raw test corpus. Defaults to None.
            test_labels (Optional[List]): Test labels. Defaults to None.
            val_percent (float): Fraction of the train set used for validation if no validation set is given. Defaults to 0.2.
            test_percent (float): Fraction of the train set used for the test set if no test set is given. Defaults to 0.2.
            batch_size (int): Training batch size. Defaults to 64.
            batch_size_eval (Optional[int]): Validation and test batch size. Defaults to None.
            seed (Optional[int]): Seed for deterministic run. Defaults to None.
            num_workers (int): Number of workers in the DataLoader. Defaults to 1.
            pin_memory (bool): Pin tensors to GPU memory. Defaults to True.
            drop_last (bool): Drop last incomplete batch. Defaults to False.
            sampler_train (Sampler): Sampler for train loader. Defaults to None.
            sampler_val (Sampler): Sampler for validation loader. Defaults to None.
            sampler_test (Sampler): Sampler for test loader. Defaults to None.
            batch_sampler_train (BatchSampler): Batch sampler for train loader. Defaults to None.
            batch_sampler_val (BatchSampler): Batch sampler for validation loader. Defaults to None.
            batch_sampler_test (BatchSampler): Batch sampler for test loader. Defaults to None.
            shuffle_eval (bool): Shuffle validation and test dataloaders. Defaults to False.
            collate_fn (Callable[..., Any]): Collator function. Defaults to None.
            language_model (bool): Use corpus for Language Modeling. Defaults to False.
            tokenizer (str): Select one of the cls.accepted_tokenizers. Defaults to "spacy".
            no_test_set (bool): Do not create a test set. Useful for hyperparameter tuning. Defaults to False.
            **corpus_args (kwargs): Extra arguments to be passed to the corpus. See
                slp/data/corpus.py
        Raises:
            ValueError: If train labels are not provided and language_model is False.
            ValueError: If tokenizer is not one of the accepted_tokenizers.
        """
        self.language_model = language_model
        self.tokenizer = tokenizer
        self.corpus_args = corpus_args

        train_data, val_data, test_data = self._zip_corpus_and_labels(
            train, val, test, train_labels, val_labels, test_labels)

        self.no_test_set = no_test_set
        super(PLDataModuleFromCorpus, self).__init__(
            train_data,  # type: ignore
            val=val_data,  # type: ignore
            test=test_data,  # type: ignore
            val_percent=val_percent,
            test_percent=test_percent,
            batch_size=batch_size,
            batch_size_eval=batch_size_eval,
            seed=seed,
            num_workers=num_workers,
            pin_memory=pin_memory,
            drop_last=drop_last,
            shuffle_eval=shuffle_eval,
            sampler_train=sampler_train,
            sampler_val=sampler_val,
            sampler_test=sampler_test,
            batch_sampler_train=batch_sampler_train,
            batch_sampler_val=batch_sampler_val,
            batch_sampler_test=batch_sampler_test,
            collate_fn=collate_fn,
            no_test_set=no_test_set,
        )

    def setup(self, stage=None):
        if self.setup_has_run:
            return

        super(PLDataModuleFromCorpus, self).setup(stage=stage)

        train_corpus, train_labels = zip(*self.train)  # type: ignore
        val_corpus, val_labels = zip(*self.val)  # type: ignore

        test_corpus, test_labels = None, None

        if not self.no_test_set:
            test_corpus, test_labels = zip(*self.test)  # type: ignore

        self.train_corpus, self.val_corpus, self.test_corpus = self._create_corpora(
            train_corpus, val_corpus, test_corpus, self.corpus_args)

        to_tensor = ToTensor(device="cpu")

        if self.language_model:
            self.train = CorpusLMDataset(self.train_corpus).map(to_tensor)
            self.val = CorpusLMDataset(self.val_corpus).map(to_tensor)

            if not self.no_test_set:
                self.test = CorpusLMDataset(self.test_corpus).map(to_tensor)
        else:
            self.train = CorpusDataset(self.train_corpus,
                                       train_labels).map(to_tensor)
            self.val = CorpusDataset(self.val_corpus,
                                     val_labels).map(to_tensor)

            if not self.no_test_set:
                self.test = CorpusDataset(self.test_corpus,
                                          test_labels).map(to_tensor)

    def _zip_corpus_and_labels(self, train, val, test, train_labels,
                               val_labels, test_labels):

        if not self.language_model and train_labels is None:
            raise ValueError(
                "You should provide train labels if not performing language modeling"
            )

        if self.language_model:
            train_labels = train

            if val is not None:
                val_labels = val

            if test is not None:
                test_labels = test

        train_data = (list(zip(train, train_labels))
                      if train_labels is not None else train)
        val_data = None

        if val is not None:
            val_data = list(zip(val,
                                val_labels)) if val_labels is not None else val
        test_data = None

        if test is not None:
            test_data = (list(zip(test, test_labels))
                         if test_labels is not None else test)

        return train_data, val_data, test_data

    def _select_corpus_cls(self, corpus_args):
        if self.tokenizer not in self.accepted_tokenizers:
            raise ValueError(
                f"tokenizer kwarg in {self.__class__.__name__} should be one of {self.accepted_tokenizers}"
            )

        if self.tokenizer == "spacy":
            logger.info(
                'Selecting WordCorpus because tokenizer="spacy" was provided')
            corpus_cls = WordCorpus  # type: ignore
        elif self.tokenizer == "tokenized":
            logger.info(
                'Selecting TokenizedCorpus because tokenizer="tokenized" was provided'
            )
            corpus_cls = TokenizedCorpus  # type: ignore
        else:
            logger.info(
                "Selecting HfCorpus because a huggingface tokenizer was provided"
            )
            corpus_cls = HfCorpus  # type: ignore
            corpus_args["tokenizer_model"] = self.tokenizer

        return corpus_cls, corpus_args

    def _force_train_vocab_on_val_and_test(self, corpus_args, train_corpus):
        if self.tokenizer in {"spacy", "tokenized"}:
            # Force train vocabulary on val & test
            corpus_args["word2idx"] = train_corpus.word2idx

            if self.tokenizer == "spacy":
                corpus_args["embeddings"] = train_corpus.embeddings
                corpus_args["idx2word"] = train_corpus.word2idx

            logger.info(
                "Forcing vocabulary from training set for validation and test sets."
            )

        return corpus_args

    def _create_corpora(self, train_corpus, val_corpus, test_corpus,
                        corpus_args):
        corpus_cls, corpus_args = self._select_corpus_cls(corpus_args)

        train_corpus = corpus_cls(train_corpus, **corpus_args)  # type: ignore

        corpus_args = self._force_train_vocab_on_val_and_test(
            corpus_args, train_corpus)

        val_corpus = corpus_cls(val_corpus, **corpus_args)  # type: ignore

        if not self.no_test_set:
            test_corpus = corpus_cls(test_corpus,
                                     **corpus_args)  # type: ignore
        else:
            test_corpus = None

        return train_corpus, val_corpus, test_corpus

    @property
    def embeddings(self) -> Optional[np.ndarray]:
        """Embeddings matrix

        Returns:
            Optional[np.ndarray]: Embeddings matrix
        """
        emb: Optional[np.ndarray] = self.train_corpus.embeddings

        return emb

    @property
    def vocab_size(self) -> int:
        """Number of tokens in the vocabulary

        Returns:
            int: Number of tokens in the vocabulary
        """
        vsz: int = self.train_corpus.vocab_size

        return vsz

    @classmethod
    def add_argparse_args(cls, parent_parser):
        """Augment input parser with arguments for data loading and corpus processing

        Args:
            parent_parser (argparse.ArgumentParser): Parser created by the user

        Returns:
            argparse.ArgumentParser: Augmented parser
        """
        parser = super(PLDataModuleFromCorpus,
                       cls).add_argparse_args(parent_parser)
        parser.add_argument(
            "--tokenizer",
            dest="data.tokenizer",
            type=str.lower,
            # Corpus can already be tokenized, you can use spacy for word tokenization or any tokenizer from hugging face
            choices=cls.accepted_tokenizers,
            default="spacy",
            help="Token type. The tokenization will happen at this level.",
        )

        # Only when tokenizer == spacy
        parser.add_argument(
            "--limit-vocab",
            dest="data.limit_vocab_size",
            type=int,
            default=-1,
            help=
            "Limit vocab size. -1 means use the whole vocab. Applicable only when --tokenizer=spacy",
        )

        parser.add_argument(
            "--embeddings-file",
            dest="data.embeddings_file",
            type=dir_path,
            default=None,
            help=
            "Path to file with pretrained embeddings. Applicable only when --tokenizer=spacy",
        )

        parser.add_argument(
            "--embeddings-dim",
            dest="data.embeddings_dim",
            type=int,
            default=50,
            help=
            "Embedding dim of pretrained embeddings. Applicable only when --tokenizer=spacy",
        )

        parser.add_argument(
            "--lang",
            dest="data.lang",
            type=str,
            default="en_core_web_md",
            help=
            "Language for spacy tokenizer, e.g. en_core_web_md. Applicable only when --tokenizer=spacy",
        )

        parser.add_argument(
            "--no-add-specials",
            dest="data.add_special_tokens",
            action="store_false",
            help="Do not add special tokens for hugging face tokenizers",
        )

        # Generic args
        parser.add_argument(
            "--lower",
            dest="data.lower",
            action="store_true",
            help="Convert to lowercase.",
        )

        parser.add_argument(
            "--prepend-bos",
            dest="data.prepend_bos",
            action="store_true",
            help="Prepend [BOS] token",
        )

        parser.add_argument(
            "--append-eos",
            dest="data.append_eos",
            action="store_true",
            help="Append [EOS] token",
        )

        parser.add_argument(
            "--max-sentence-length",
            dest="data.max_len",
            type=int,
            default=-1,
            help=
            "Maximum allowed sentence length. -1 means use the whole sentence",
        )

        return parser
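
Finally, a minimal construction sketch for the data module above. The corpus, the labels, and the "bert-base-uncased" tokenizer name are illustrative placeholders, not values taken from the slp project:

# Hypothetical raw corpus; labels are required when language_model=False.
train_texts = [f"sample sentence number {i}" for i in range(10)]
train_labels = [i % 2 for i in range(10)]

dm = PLDataModuleFromCorpus(
    train_texts,
    train_labels=train_labels,
    batch_size=2,
    tokenizer="bert-base-uncased",  # any entry in accepted_tokenizers works
)
dm.setup()
print(dm.vocab_size)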