Example 1
    def __init__(self, config: Dict, vocab: Optional[Vocabulary] = None):
        super().__init__(vocab=vocab or vocabulary.create_empty_vocabulary())

        # Saves the config in the PyTorch Lightning checkpoints
        self.save_hyperparameters("config")

        config = PipelineConfiguration.from_dict(config)
        tokenizer = config.build_tokenizer()
        featurizer = config.features.compile_featurizer(tokenizer)
        embedder = config.build_embedder(self.vocab)
        head = config.head.compile(backbone=ModelBackbone(
            self.vocab,
            featurizer=featurizer,
            embedder=embedder,
            encoder=config.encoder,
        ))

        self.name = config.name
        self._head = None
        self.set_head(head)

        self.file_path: Optional[str] = None

        self.optimizer: Optional[torch.optim.Optimizer] = None
        # The lr_scheduler dict follows the Lightning format:
        # https://pytorch-lightning.readthedocs.io/en/stable/common/optimizers.html#learning-rate-scheduling
        self.lr_scheduler: Optional[Dict] = None

        self.best_metrics: Optional[Dict[str, torch.Tensor]] = None
        # These are set by our trainer to determine the best_metrics:
        # which metric should be monitored?
        self.monitor: Optional[str] = None
        # should the metric increase ("max") or decrease ("min")?
        self.monitor_mode: Optional[str] = None
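For context, `save_hyperparameters("config")` is standard PyTorch Lightning behaviour: the named init argument is stored under `self.hparams` and serialized into every checkpoint, which is what later allows `load_from_checkpoint` to rebuild the model without passing the config again. A minimal standalone sketch, where the `ToyModel` class and its config keys are illustrative and not part of the library above:

import torch
from pytorch_lightning import LightningModule


class ToyModel(LightningModule):
    """Illustrative only; mirrors how the config is stored above."""

    def __init__(self, config: dict):
        super().__init__()
        # Stored under self.hparams and written into every checkpoint,
        # so ToyModel.load_from_checkpoint(path) can rebuild the module.
        self.save_hyperparameters("config")
        self.linear = torch.nn.Linear(config["in_dim"], config["out_dim"])


model = ToyModel(config={"in_dim": 4, "out_dim": 2})
assert model.hparams.config == {"in_dim": 4, "out_dim": 2}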
Example 2
def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer
Example 3
    def from_config(
        cls,
        config: Union[PipelineConfiguration, dict],
    ) -> "Pipeline":
        """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

        Parameters
        ----------
        config
            A `PipelineConfiguration` object or a configuration dict

        Returns
        -------
        pipeline
            A configured pipeline
        """
        if isinstance(config, PipelineConfiguration):
            config = config.as_dict()

        model = PipelineModel(config=config)

        if not isinstance(model, PipelineModel):
            raise TypeError(f"Cannot load model. Wrong format of {model}")

        cls._add_transformers_vocab_if_needed(model)

        return cls(model, PipelineConfiguration.from_dict(config))
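Since `from_config` immediately flattens a `PipelineConfiguration` with `as_dict()`, both input types converge on the same code path; the two calls below should therefore build equivalent pipelines:

# pipeline_config as built in the test examples above.
pl_from_object = Pipeline.from_config(pipeline_config)
pl_from_dict = Pipeline.from_config(pipeline_config.as_dict())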
Example 4
    def from_config(
        cls,
        config: Union[PipelineConfiguration, dict],
        vocab_path: Optional[str] = None,
    ) -> "Pipeline":
        """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

        Parameters
        ----------
        config: `Union[PipelineConfiguration, dict]`
            A `PipelineConfiguration` object or a configuration dict
        vocab_path: `Optional[str]`
            If provided, the pipeline vocabulary will be loaded from this path

        Returns
        -------
        pipeline: `Pipeline`
            A configured pipeline
        """
        if isinstance(config, dict):
            config = PipelineConfiguration.from_dict(config)

        vocab = (Vocabulary.from_files(vocab_path)
                 if vocab_path is not None else None)
        model = PipelineModel.from_params(Params({"config": config}), vocab=vocab)
        if not isinstance(model, PipelineModel):
            raise TypeError(f"Cannot load model. Wrong format of {model}")

        cls._add_transformers_vocab_if_needed(model)

        return cls(model, config)
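A short usage sketch for the `vocab_path` variant. The path is hypothetical, and `Vocabulary.from_files` expects a directory in AllenNLP's vocabulary format, such as one written by `Vocabulary.save_to_files`:

# Reuse the vocabulary computed in a previous run (hypothetical path).
pl = Pipeline.from_config(pipeline_config, vocab_path="runs/exp-1/vocabulary")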
Example 5
def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True)

    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features,
                                            char=char_features)

    encoder_spec = Seq2SeqEncoderConfiguration(type="gru",
                                               hidden_size=2,
                                               num_layers=1,
                                               bidirectional=True)

    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)

    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
            pl.backbone.featurizer(sample_text),
            pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(map(lambda t: isinstance(t, Token), text.tokens))
                assert sample_text == " ".join([t.text for t in text.tokens])
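The `strip_spaces` cleaning rule suggests that surrounding whitespace should not survive featurization. A quick check in the spirit of the assertions above; that the rule behaves this way is an assumption, not something verified here:

padded_instance = pl.backbone.featurizer("  My simple text  ")
for text_field in padded_instance["record"]:
    # If strip_spaces does what its name suggests, the padded input
    # tokenizes the same as the clean one.
    assert " ".join(t.text for t in text_field.tokens) == "My simple text"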
Example 6
    def from_yaml(cls, path: str) -> "Pipeline":
        """Creates a pipeline from a config yaml file

        Parameters
        ----------
        path
            The path to a YAML configuration file

        Returns
        -------
        pipeline
            A configured pipeline
        """
        pipeline_configuration = PipelineConfiguration.from_yaml(path)

        return cls.from_config(pipeline_configuration)
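A YAML file for this entry point might mirror the configuration built programmatically in Example 2. The field names below are inferred from the `PipelineConfiguration` arguments; the exact YAML layout is an assumption rather than a verified schema:

import tempfile
import textwrap

config_yaml = textwrap.dedent(
    """\
    name: no_word_features
    features:
      char:
        embedding_dim: 2
        encoder: {type: gru, hidden_size: 2, num_layers: 1, bidirectional: true}
        dropout: 0.1
    head:
      type: TextClassification
      labels: [duplicate, not_duplicate]
      pooler: {type: boe}
    """
)

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
    f.write(config_yaml)

pl = Pipeline.from_yaml(f.name)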
Example 7
    def from_yaml(cls,
                  path: str,
                  vocab_path: Optional[str] = None) -> "Pipeline":
        """Creates a pipeline from a config yaml file

        Parameters
        ----------
        path : `str`
            The path to a YAML configuration file
        vocab_path : `Optional[str]`
            If provided, the pipeline vocab will be loaded from this path

        Returns
        -------
        pipeline: `Pipeline`
            A configured pipeline
        """
        pipeline_configuration = PipelineConfiguration.from_yaml(path)

        return cls.from_config(pipeline_configuration, vocab_path=vocab_path)
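Usage then combines a YAML config with a previously stored vocabulary (both paths hypothetical):

pl = Pipeline.from_yaml("pipeline.yaml", vocab_path="runs/exp-1/vocabulary")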
Example 8
    def from_config(
        cls,
        config: Union[PipelineConfiguration, dict],
        vocab_path: Optional[str] = None,
    ) -> "Pipeline":
        """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

        Parameters
        ----------
        config: `Union[PipelineConfiguration, dict]`
            A `PipelineConfiguration` object or a configuration dict
        vocab_path: `Optional[str]`
            If provided, the pipeline vocabulary will be loaded from this path

        Returns
        -------
        pipeline: `Pipeline`
            A configured pipeline
        """
        if isinstance(config, dict):
            config = PipelineConfiguration.from_dict(config)
        return _BlankPipeline(config=config,
                              vocab=vocabulary.load_vocabulary(vocab_path))
Example 9
def _config_from_archive(archive: Archive) -> PipelineConfiguration:
    config = archive.config["model"]["config"]
    return PipelineConfiguration.from_params(config)
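`Archive` here is AllenNLP's model archive type, so a caller would typically obtain one through its standard `load_archive` helper; a hedged usage sketch with a hypothetical archive path:

from allennlp.models.archival import load_archive

archive = load_archive("output/model.tar.gz")  # hypothetical path
pipeline_config = _config_from_archive(archive)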