Example no. 1
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        SKLearnClassifier parameters:

        - ``estimator_path`` (:obj:`str`): Path to an estimator pickled by joblib.
          The pickle will be loaded, and the resulting object will be used as the estimator.
          If not provided, a default pipeline composed of a TF-IDF vectorizer and a
          logistic regression will be used.
        """
        estimator = None

        for name, value in params.items():
            if name == "estimator_path":
                assert_type(name, value, str)
                estimator = SKLearnClassifier._load_estimator(Path(value))
                SKLearnClassifier._validate_estimator(estimator)
            else:
                raise ValueError(f"Unknown param '{name}'")

        if estimator is None:
            self.estimator = _SafeEstimator(
                make_default_tfidf_logistic_regression())
        else:
            self.estimator = _SafeEstimator(estimator)
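A minimal usage sketch for the snippet above, assuming scikit-learn and joblib are installed; the pipeline, file path, and the direct ``init`` call are illustrative, not the library's only entry point.

# Hypothetical illustration: pickle any scikit-learn estimator with joblib,
# then point SKLearnClassifier at it via "estimator_path".
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

estimator = make_pipeline(TfidfVectorizer(), LogisticRegression())
joblib.dump(estimator, "my_estimator.joblib")

clf = SKLearnClassifier()
clf.init({"estimator_path": "my_estimator.joblib"})

# Omitting the param falls back to the default TF-IDF + logistic regression pipeline.
clf_default = SKLearnClassifier()
clf_default.init({})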
Example no. 2
0
    def _validate_params(self):
        assert_param_required("data_proportions", self.params)
        proportions = self.params["data_proportions"]
        assert_type("data_proportions", proportions, list)
        for p in proportions:
            assert_type("data_proportion", p, float)
            assert_proportion("data_proportion", p)
Example no. 3
0
    def _validate_params(self):
        assert_param_required("vocab_size", self.params)
        assert_type("vocab_size", self.params["vocab_size"], int)

        assert_param_required("sample_size", self.params)
        assert_type("sample_size", self.params["sample_size"], float)

        assert_param_required("window_len_poolings", self.params)
        window_len_poolings = self.params["window_len_poolings"]
        assert_type("window_len_poolings", window_len_poolings, list)
        for w, p in window_len_poolings:
            assert_type("window_len", w, (int, type(None)))
            assert_type("pooling", p, (str, type(None)))

            if p is not None:
                # This raises an exception if p isn't a valid pooling method
                WindowPooling(p)
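A hypothetical params dict that satisfies these checks; the pooling name "mean" is an assumption and must be a valid ``WindowPooling`` value in practice.

params = {
    "vocab_size": 20000,                  # int
    "sample_size": 0.75,                  # float
    "window_len_poolings": [              # list of (window_len, pooling) pairs
        (128, "mean"),                    # hypothetical pooling name
        (None, None),                     # both entries may be None
    ],
}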
Example no. 4
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        For more info on fastText parameter semantics, see
        `the docs <https://fasttext.cc/docs/en/options.html>`__.  The fastText
        `supervised tutorial <https://fasttext.cc/docs/en/supervised-tutorial.html>`__ has
        some more detailed explanation.

        fastText parameters:

        - ``word_ngrams`` (:obj:`int`): Max length of word n-grams.
        - ``lr`` (:obj:`float`): Learning rate.
        - ``dim`` (:obj:`int`): Dimension of learned vectors.
        - ``ws`` (:obj:`int`): Context window size.
        - ``fasttext_model`` (:obj:`str`): Name of a pretrained fastText model to use.
          See :obj:`FASTTEXT_VECTOR_ARCHIVES` for a listing of available pretrained models.
        """
        self.word_ngrams = 1
        self.lr = 0.1
        self.ws = 5
        self.fasttext_model = None
        # Default to dimensionality of the passed model, if any;
        # otherwise, default to 100
        if "fasttext_model" in params:
            self.dim = _parse_dim(params["fasttext_model"])
        else:
            self.dim = 100

        for name, value in params.items():
            if name == "word_ngrams":
                assert_type(name, value, int)
                self.word_ngrams = value
            elif name == "lr":
                assert_type(name, value, float)
                self.lr = value
            elif name == "dim":
                assert_type(name, value, int)
                self.dim = value
            elif name == "ws":
                assert_type(name, value, int)
                self.ws = value
            elif name == "fasttext_model":
                assert_in(name, value, set(FASTTEXT_VECTOR_ARCHIVES.keys()))
                self.fasttext_model = value
            else:
                raise ValueError(f"Unknown param '{name}'")

        if (self.fasttext_model is not None
                and f"{self.dim}d" not in self.fasttext_model):
            raise ValueError(
                "When using pretrained vectors, 'dim' must match the"
                f" dimensionality of the vectors; 'dim' value of {self.dim}"
                f" is incompatible with vectors {self.fasttext_model}.")
Example no. 5
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        Transformer parameters:

        - ``transformer_model`` (:obj:`str`): Name of a transformer model architecture to use.
          For training/prediction, the value should be one such that
          ``from pytorch_transformers import <value>ForSequenceClassification`` is
          a valid import.  For example, value = "Bert" ->
          ``from pytorch_transformers import BertForSequenceClassification``.  Note this means
          only a subset of the pytorch_transformers models are supported for these tasks -- search
          `the docs <https://huggingface.co/pytorch-transformers/search.html?q=forsequenceclassification&check_keywords=yes&area=default>`__ to see which ones you can use.
          For embedding generation, the import is ``<value>Model``, so any pytorch_transformer
          model is supported.
        - ``transformer_weights`` (:obj:`str`): Name of the pretrained weights to use.
          See the `pytorch-transformers docs <https://huggingface.co/pytorch-transformers/pretrained_models.html>`__
          for supported values.  These depend on the ``transformer_model`` chosen.
        - ``config_overrides`` (:obj:`dict`): Dictionary of keys and values that will
          override config for the model.
        - ``max_seq_length``: Truncate all sequences to this length after tokenization.
          Used to save memory.
        - ``lr``: Learning rate for the AdamW optimizer.
        - ``adam_eps``: Epsilon value for the AdamW optimizer.

        Note that gobbli relies on pytorch-transformers to perform validation on these parameters,
        so initialization errors may not be caught until model runtime.
        """
        self.transformer_model = "Bert"
        self.transformer_weights = "bert-base-uncased"
        self.config_overrides = {}  # type: Dict[str, Any]
        self.max_seq_length = 128
        self.lr = 5e-5
        self.adam_eps = 1e-8

        for name, value in params.items():
            if name == "transformer_model":
                self.transformer_model = value
            elif name == "transformer_weights":
                self.transformer_weights = value
            elif name == "config_overrides":
                assert_type(name, value, dict)
                self.config_overrides = value
            elif name == "max_seq_length":
                assert_type(name, value, int)
                self.max_seq_length = value
            elif name == "lr":
                assert_type(name, value, float)
                self.lr = value
            elif name == "adam_eps":
                assert_type(name, value, float)
                self.adam_eps = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example no. 6
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        spaCy parameters:

        - ``model`` (:obj:`str`): Name of a spaCy model to use.
          Available values are in `the spaCy model docs <https://spacy.io/models>`__ and
          `the spacy-transformers docs <https://github.com/explosion/spacy-transformers>`__.
        - ``architecture`` (:obj:`str`): Model architecture to use.
          Available values are in `the spaCy API docs <https://spacy.io/api/textcategorizer#architectures>`__.
          This is ignored if using a spacy-transformers model.
        - ``dropout`` (:obj:`float`): Dropout proportion for training.
        - ``full_pipeline`` (:obj:`bool`): If True, enable the full spaCy language pipeline
          (including tagging, parsing, and named entity recognition) for the TextCategorizer
          model used in training and prediction.  This makes training/prediction much slower
          but theoretically provides more information to the model.  This is ignored if using a
          spacy-transformers model.

        Note that gobbli relies on spaCy to perform validation on these parameters,
        so initialization errors may not be caught until model runtime.
        """
        self.model = "en_core_web_lg"
        self.architecture = "ensemble"
        self.dropout = 0.2
        self.full_pipeline = False

        for name, value in params.items():
            if name == "model":
                self.model = value
            elif name == "architecture":
                self.architecture = value
            elif name == "dropout":
                assert_type(name, value, float)
                self.dropout = value
            elif name == "full_pipeline":
                assert_type(name, value, bool)
                self.full_pipeline = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example no. 7
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        MarianMT parameters:

        - ``batch_size``: Number of documents to run through the Marian model at once.
        - ``target_languages``: List of target languages to translate texts to and back.
          See :attr:`MarianMT.ALL_TARGET_LANGUAGES` for a full list of possible values. You may
          only augment texts up to the number of languages specified, since each language
          will be used at most once.  So if you want to augment 5 times, you need to specify
          at least 5 languages when initializing the model.
        """
        self.batch_size = 32
        # Current default - top 5 languages in Wikipedia which are also available
        # in the list of target languages
        # https://en.wikipedia.org/wiki/List_of_Wikipedias#List
        self.target_languages = [
            "french", "german", "japanese", "russian", "italian"
        ]

        for name, value in params.items():
            if name == "batch_size":
                assert_type(name, value, int)
                if value < 1:
                    raise ValueError("batch_size must be >= 1")
                self.batch_size = value
            elif name == "target_languages":
                assert_type(name, value, list)
                for target in value:
                    if target not in MarianMT.LANGUAGE_CODE_MAPPING:
                        raise ValueError(
                            f"invalid target language '{target}'. Valid values are "
                            f"{list(MarianMT.LANGUAGE_CODE_MAPPING.keys())}")
                self.target_languages = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example no. 8
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        BERT parameters:

        - ``max_seq_length`` (:obj:`int`): The maximum total input sequence length after
          WordPiece tokenization.  Sequences longer than this will be truncated,
          and sequences shorter than this will be padded.  Default: 128
        - ``bert_model`` (:obj:`str`): Name of a pretrained BERT model to use.
          See :obj:`BERT_MODEL_ARCHIVES` for a listing of available BERT models.
        """
        self.max_seq_length = 128
        self.bert_model = "bert-base-uncased"

        for name, value in params.items():
            if name == "max_seq_length":
                assert_type(name, value, int)
                self.max_seq_length = value
            elif name == "bert_model":
                assert_in(name, value, set(BERT_MODEL_ARCHIVES.keys()))
                self.bert_model = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example no. 9
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        BERTMaskedLM parameters:

        - ``bert_model`` (:obj:`str`): Name of a pretrained BERT model to use.
          See the `pytorch-transformers <https://huggingface.co/pytorch-transformers/pretrained_models.html>`__
          docs for supported values.
        - ``diversity``: 0 < diversity <= 1; determines the likelihood of selecting replacement words
          based on their predicted probability.
          At 1, the most probable words are most likely to be selected
          as replacements.  As diversity decreases, likelihood of selection becomes less
          dependent on predicted probability.
        - ``n_probable``: The number of probable tokens to consider for replacement.
        - ``batch_size``: Number of documents to run through the BERT model at once.
        """
        self.bert_model = "bert-base-uncased"
        self.diversity = 0.8
        self.batch_size = 32
        self.n_probable = 5

        for name, value in params.items():
            if name == "bert_model":
                self.bert_model = value
            elif name == "diversity":
                assert_type(name, value, float)
                if not 0 < value <= 1:
                    raise ValueError("diversity must be > 0 and <= 1")
                self.diversity = value
            elif name == "batch_size":
                assert_type(name, value, int)
                if value < 1:
                    raise ValueError("batch_size must be >= 1")
                self.batch_size = value
            elif name == "n_probable":
                assert_type(name, value, int)
                if value < 1:
                    raise ValueError("n_probable must be >= 1")
                self.n_probable = value
            else:
                raise ValueError(f"Unknown param '{name}'")
Example no. 10
0
    def __init__(
        self,
        # Can't make this type more restrictive since gensim might not be
        # available, and we need to make the union include a gensim type
        model: Any,
        tokenizer: Union[str, TokenizeMethod,
                         Callable[[List[str]],
                                  List[List[str]]]] = TokenizeMethod.SPLIT,
        n_similar: int = 10,
        diversity: float = 0.8,
    ):
        try:
            import gensim
            from gensim.scripts.glove2word2vec import glove2word2vec
        except ImportError:
            raise ImportError(
                "word2vec-based data augmentation requires gensim to be installed."
            )

        if isinstance(model, str):
            # Download and extract pretrained weights from a public source
            assert_in("word2vec model", model, set(WORD2VEC_MODELS.keys()))
            archive_name, filename = WORD2VEC_MODELS[model]
            archive_url = _WORD2VEC_MODEL_ARCHIVES[archive_name]

            LOGGER.debug(f"Downloading word2vec model '{model}'")
            # Some downloads aren't contained in archives
            if is_archive(Path(archive_url)):
                extract_dir = download_archive(archive_url,
                                               self.data_dir(),
                                               junk_paths=True)
                model_file = extract_dir / filename
            else:
                model_file = download_file(archive_url)

            if model.startswith("glove"):
                LOGGER.debug(f"Converting GloVe format to word2vec format")
                # Need to convert the downloaded file to word2vec format,
                # since GloVe vectorsr are formatted slightly differently
                with tempfile.NamedTemporaryFile() as f:
                    tempfile_path = Path(f.name)
                    glove2word2vec(model_file, tempfile_path)
                    shutil.copy2(tempfile_path, model_file)

            LOGGER.debug(f"Loading word2vec model '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                model_file)
            LOGGER.debug(f"word2vec model loaded")
        elif isinstance(model, Path):
            LOGGER.debug(f"Loading word2vec model from path '{model}'")
            self._model = gensim.models.KeyedVectors.load_word2vec_format(
                str(model))
            LOGGER.debug(f"word2vec model loaded")
        elif isinstance(model,
                        (gensim.models.Word2Vec, gensim.models.KeyedVectors)):
            self._model = model
        else:
            raise TypeError(
                f"unsupported type for initializing word2vec model: '{type(model)}'"
            )

        assert_type("n_similar", n_similar, int)
        if n_similar <= 0:
            raise ValueError("n_similar must be > 0")
        self.n_similar = n_similar

        assert_type("diversity", diversity, float)
        if not 0 < diversity <= 1:
            raise ValueError("diversity must be > 0 and <= 1")
        self.diversity = diversity

        if isinstance(tokenizer, str):
            tokenizer = TokenizeMethod[tokenizer]

        if isinstance(tokenizer, TokenizeMethod):
            # Avoid mypy error when passing a partially-applied function created by
            # functools.partial
            self.tokenizer = cast(
                Callable[[List[str]], List[List[str]]],
                functools.partial(tokenize, tokenizer),
            )
        elif callable(tokenizer):
            self.tokenizer = tokenizer
        else:
            raise TypeError(
                f"unsupported type for tokenizer: '{type(tokenizer)}'")
Example no. 11
0
    def _validate_params(self):
        assert_param_required("percent_multipliers", self.params)
        percent_multipliers = self.params["percent_multipliers"]
        assert_type("percent_multipliers", percent_multipliers, list)
        for (p, m) in percent_multipliers:
            assert_type("percent", p, float)
            assert_proportion("percent", p)
            assert_type("multiplier", m, (int, float))

        assert_type("param_grid", self.params.get("param_grid", {}), dict)

        assert_param_required("model_name", self.params)
        assert_type("model_name", self.params["model_name"], str)
        assert_valid_model(self.params["model_name"])

        assert_param_required("augment_probability", self.params)
        assert_type("augment_probability", p, float)
        assert_proportion("augment_probability", p)

        assert_param_required("preprocess_func", self.params)
        assert_in("preprocess_func", self.params["preprocess_func"],
                  PREPROCESS_FUNCS)
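A hypothetical params dict satisfying these checks; the ``model_name`` and ``preprocess_func`` values are illustrative and must respectively pass ``assert_valid_model`` and appear in ``PREPROCESS_FUNCS``.

params = {
    "percent_multipliers": [(0.05, 5), (0.25, 2.0), (1.0, 1)],   # (float proportion, int or float)
    "param_grid": {"lr": [1e-4, 5e-5]},                          # optional; defaults to {}
    "model_name": "Transformer",                                 # hypothetical model name
    "augment_probability": 0.5,
    "preprocess_func": "lowercase",                              # hypothetical key
}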
Example no. 12
0
    def init(self, params: Dict[str, Any]):
        """
        See :meth:`gobbli.model.base.BaseModel.init`.

        Transformer parameters:

        - ``transformer_model`` (:obj:`str`): Name of a transformer model architecture to use.
          For training/prediction, the value should be one such that
          ``from transformers import <value>ForSequenceClassification`` is
          a valid import.  For example, value = "Bert" ->
          ``from transformers import BertForSequenceClassification``.  Note this means
          only a subset of the transformers models are supported for these tasks -- search
          `the docs <https://huggingface.co/transformers/search.html?q=forsequenceclassification&check_keywords=yes&area=default>`__ to see which ones you can use.
          For embedding generation, the import is ``<value>Model``, so any transformer
          model is supported.
        - ``transformer_weights`` (:obj:`str`): Name of the pretrained weights to use.
          See the `transformers docs <https://huggingface.co/transformers/pretrained_models.html>`__
          for supported values.  These depend on the ``transformer_model`` chosen.
        - ``config_overrides`` (:obj:`dict`): Dictionary of keys and values that will
          override config for the model.
        - ``max_seq_length``: Truncate all sequences to this length after tokenization.
          Used to save memory.
        - ``lr``: Learning rate for the AdamW optimizer.
        - ``adam_eps``: Epsilon value for the AdamW optimizer.
        - ``gradient_accumulation_steps``: Number of iterations to accumulate gradients before
          updating the model.  Used to allow larger effective batch sizes for models too big to
          fit a large batch on the GPU.  The "effective batch size" is
          ``gradient_accumulation_steps`` * :paramref:`TrainInput.params.train_batch_size`.
          If you encounter memory errors while training, try decreasing the batch size and
          increasing ``gradient_accumulation_steps``. For example, if a training batch size of
          32 causes memory errors, try decreasing batch size to 16 and increasing
          ``gradient_accumulation_steps`` to 2.  If you still have problems with memory, you can
          drop batch size to 8 and ``gradient_accumulation_steps`` to 4, and so on.

        Note that gobbli relies on transformers to perform validation on these parameters,
        so initialization errors may not be caught until model runtime.
        """
        self.transformer_model = "Bert"
        self.transformer_weights = "bert-base-uncased"
        self.config_overrides = {}  # type: Dict[str, Any]
        self.max_seq_length = 128
        self.lr = 5e-5
        self.adam_eps = 1e-8
        self.gradient_accumulation_steps = 1

        for name, value in params.items():
            if name == "transformer_model":
                self.transformer_model = value
            elif name == "transformer_weights":
                self.transformer_weights = value
            elif name == "config_overrides":
                assert_type(name, value, dict)
                self.config_overrides = value
            elif name == "max_seq_length":
                assert_type(name, value, int)
                self.max_seq_length = value
            elif name == "lr":
                assert_type(name, value, float)
                self.lr = value
            elif name == "adam_eps":
                assert_type(name, value, float)
                self.adam_eps = value
            elif name == "gradient_accumulation_steps":
                assert_type(name, value, int)
                self.gradient_accumulation_steps = value
            else:
                raise ValueError(f"Unknown param '{name}'")