Example #1
 def test_forward_runs_with_non_bijective_mapping_with_dict(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
     options_file = str(elmo_fixtures_path / 'options.json')
     weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
     params = Params({
         "token_embedders": {
             "words": {
                 "type": "embedding",
                 "num_embeddings": 20,
                 "embedding_dim": 2,
             },
             "elmo": {
                 "type": "elmo_token_embedder",
                 "options_file": options_file,
                 "weight_file": weight_file
             },
         },
         "embedder_to_indexer_map": {
             # pass arguments to `ElmoTokenEmbedder.forward` by dict
             "elmo": {
                 "inputs": "elmo",
                 "word_inputs": "words"
             },
             "words": ["words"]
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {
         'words': (torch.rand(3, 6) * 20).long(),
         'elmo': (torch.rand(3, 6, 50) * 15).long(),
     }
     token_embedder(inputs)
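Snippets like this one are lifted from test classes, so they assume self.vocab and self.FIXTURES_ROOT fixtures that are set up elsewhere. A minimal stand-in for that context might look like the following sketch (the module paths are AllenNLP's; the examples that use embedder_to_indexer_map additionally need an older, pre-1.0 release):

    import torch
    from allennlp.common import Params
    from allennlp.data import Vocabulary
    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

    # Stand-in for the self.vocab fixture the test snippets rely on.
    vocab = Vocabulary()
    for token in ["1", "2", "3", "4"]:
        vocab.add_token_to_namespace(token)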
Example #2
 def setUp(self):
     super(TestBasicTextFieldEmbedder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1")
     self.vocab.add_token_to_namespace("2")
     self.vocab.add_token_to_namespace("3")
     self.vocab.add_token_to_namespace("4")
     params = Params({
         "token_embedders": {
             "words1": {
                 "type": "embedding",
                 "embedding_dim": 2
             },
             "words2": {
                 "type": "embedding",
                 "embedding_dim": 5
             },
             "words3": {
                 "type": "embedding",
                 "embedding_dim": 3
             }
         }
     })
     self.token_embedder = BasicTextFieldEmbedder.from_params(
         vocab=self.vocab, params=params)
     self.inputs = {
         "words1": torch.LongTensor([[0, 2, 3, 5]]),
         "words2": torch.LongTensor([[1, 4, 3, 2]]),
         "words3": torch.LongTensor([[1, 5, 1, 2]])
     }
 def test_forward_works_on_higher_order_input(self):
     params = Params({
         "words": {
             "type": "embedding",
             "num_embeddings": 20,
             "embedding_dim": 2,
         },
         "characters": {
             "type": "character_encoding",
             "embedding": {
                 "embedding_dim": 4,
                 "num_embeddings": 15,
             },
             "encoder": {
                 "type": "cnn",
                 "embedding_dim": 4,
                 "num_filters": 10,
                 "ngram_filter_sizes": [3],
             },
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {
         'words': Variable(torch.rand(3, 4, 5, 6) * 20).long(),
         'characters': Variable(torch.rand(3, 4, 5, 6, 7) * 15).long(),
     }
     assert token_embedder(inputs, num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
 def setUp(self):
     super(TestBasicTextFieldEmbedder, self).setUp()
     self.vocab = Vocabulary()
     self.vocab.add_token_to_namespace("1")
     self.vocab.add_token_to_namespace("2")
     self.vocab.add_token_to_namespace("3")
     self.vocab.add_token_to_namespace("4")
     params = Params({
         "words1": {
             "type": "embedding",
             "embedding_dim": 2
         },
         "words2": {
             "type": "embedding",
             "embedding_dim": 5
         },
         "words3": {
             "type": "embedding",
             "embedding_dim": 3
         }
     })
     self.token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     self.inputs = {
         "words1": Variable(torch.LongTensor([[0, 2, 3, 5]])),
         "words2": Variable(torch.LongTensor([[1, 4, 3, 2]])),
         "words3": Variable(torch.LongTensor([[1, 5, 1, 2]]))
     }
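The two older variants above wrap their input tensors in torch.autograd.Variable. Since PyTorch 0.4, Variable is a no-op wrapper because plain tensors carry autograd state themselves, so the same inputs can be written directly as tensors, for example:

    import torch

    inputs = {
        "words1": torch.LongTensor([[0, 2, 3, 5]]),
        "words2": torch.LongTensor([[1, 4, 3, 2]]),
        "words3": torch.LongTensor([[1, 5, 1, 2]]),
    }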
Example #5
    def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
        assert bert_vectors.requires_grad == (train_parameters
                                              or not last_layer_only)
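test_end_to_end receives train_parameters and last_layer_only as arguments, so in the original suite it is presumably driven by a pytest parametrize decorator that this listing has dropped; a sketch of that wiring (the parameter values are illustrative):

    import pytest

    @pytest.mark.parametrize("train_parameters", [True, False])
    @pytest.mark.parametrize("last_layer_only", [True, False])
    def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
        ...  # body as in the snippet above

The final requires_grad assertion works out because setting last_layer_only to False adds a trainable scalar mix over the transformer's layers, so the output needs gradients even when the transformer weights themselves are frozen.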
Example #6
 def test_forward_runs_with_non_bijective_mapping_with_dict(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
     options_file = str(elmo_fixtures_path / "options.json")
     weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
     params = Params({
         "token_embedders": {
             "words": {
                 "type": "embedding",
                 "num_embeddings": 20,
                 "embedding_dim": 2
             },
             "elmo": {
                 "type": "elmo_token_embedder",
                 "options_file": options_file,
                 "weight_file": weight_file,
             },
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                         params=params)
     inputs = {
         "words": {
             "tokens": (torch.rand(3, 6) * 20).long()
         },
         "elmo": {
             "tokens": (torch.rand(3, 6, 50) * 15).long()
         },
     }
     token_embedder(inputs)
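Note that the input layout here differs from Example #1: from AllenNLP 1.0 onwards a text field is passed around as a nested dictionary (indexer name -> tensor name -> tensor), so each embedder key maps to another dict rather than to a bare tensor. A minimal sketch of that layout, mirroring the shapes used above:

    import torch

    inputs = {
        "words": {"tokens": torch.randint(0, 20, (3, 6))},
        "elmo": {"tokens": torch.randint(0, 15, (3, 6, 50))},
    }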
Example #7
 def test_forward_works_on_higher_order_input(self):
     params = Params({
         "token_embedders": {
             "words": {
                 "type": "embedding",
                 "num_embeddings": 20,
                 "embedding_dim": 2,
             },
             "characters": {
                 "type": "character_encoding",
                 "embedding": {
                     "embedding_dim": 4,
                     "num_embeddings": 15,
                 },
                 "encoder": {
                     "type": "cnn",
                     "embedding_dim": 4,
                     "num_filters": 10,
                     "ngram_filter_sizes": [3],
                 },
             }
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                         params=params)
     inputs = {
         'words': (torch.rand(3, 4, 5, 6) * 20).long(),
         'characters': (torch.rand(3, 4, 5, 6, 7) * 15).long(),
     }
     assert token_embedder(inputs,
                           num_wrapping_dims=2).size() == (3, 4, 5, 6, 12)
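The asserted trailing dimension of 12 is simply the concatenation of the two token embedders' output sizes (a 2-dimensional word embedding plus 10 CNN filters), while num_wrapping_dims=2 tells the embedder that two extra list dimensions sit between the batch and sequence axes. A small self-contained sketch of the concatenation behaviour (the dimensions here are illustrative):

    from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
    from allennlp.modules.token_embedders import Embedding

    embedder = BasicTextFieldEmbedder({
        "a": Embedding(num_embeddings=10, embedding_dim=2),
        "b": Embedding(num_embeddings=10, embedding_dim=5),
    })
    assert embedder.get_output_dim() == 7  # 2 + 5, concatenated on the last axis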
Example #8
 def test_forward_runs_with_bijective_and_non_bijective_mapping(self):
     params = Params({
         "token_embedders": {
             "bert": {
                 "type": "pretrained_transformer",
                 "model_name": "bert-base-uncased"
             },
             "token_characters": {
                 "type": "character_encoding",
                 "embedding": {
                     "embedding_dim": 5
                 },
                 "encoder": {
                     "type": "cnn",
                     "embedding_dim": 5,
                     "num_filters": 5,
                     "ngram_filter_sizes": [5],
                 },
             },
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(vocab=self.vocab,
                                                         params=params)
     inputs = {
         "bert": {
             "token_ids": (torch.rand(3, 5) * 10).long(),
             "mask": (torch.rand(3, 5) * 1).long(),
         },
         "token_characters": {
             "token_characters": (torch.rand(3, 5, 5) * 1).long()
         },
     }
     token_embedder(inputs)
Example #9
 def test_forward_runs_with_non_bijective_mapping(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
     options_file = str(elmo_fixtures_path / 'options.json')
     weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
     params = Params({
         "words": {
             "type": "embedding",
             "num_embeddings": 20,
             "embedding_dim": 2,
         },
         "elmo": {
             "type": "elmo_token_embedder",
             "options_file": options_file,
             "weight_file": weight_file
         },
         "embedder_to_indexer_map": {
             "words": ["words"],
             "elmo": ["elmo", "words"]
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {
         'words': (torch.rand(3, 6) * 20).long(),
         'elmo': (torch.rand(3, 6, 50) * 15).long(),
     }
     token_embedder(inputs)
 def test_forward_runs_with_non_bijective_mapping(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / 'elmo'
     options_file = str(elmo_fixtures_path / 'options.json')
     weight_file = str(elmo_fixtures_path / 'lm_weights.hdf5')
     params = Params({
         "token_embedders": {
             "words": {
                 "type": "embedding",
                 "num_embeddings": 20,
                 "embedding_dim": 2,
             },
             "elmo": {
                 "type": "elmo_token_embedder",
                 "options_file": options_file,
                 "weight_file": weight_file
             },
         },
         "embedder_to_indexer_map": {"words": ["words"], "elmo": ["elmo", "words"]}
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {
         'words': (torch.rand(3, 6) * 20).long(),
         'elmo': (torch.rand(3, 6, 50) * 15).long(),
     }
     token_embedder(inputs)
Example #11
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNer, self).__init__(vocab=vocab, regularizer=regularizer)

        # Base Text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # NER Stuffs
        ############
        ner_params = params.pop("ner")

        # Encoder
        encoder_ner_params = ner_params.pop("encoder")
        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
        self._encoder_ner = encoder_ner

        # Tagger NER - CRF Tagger
        tagger_ner_params = ner_params.pop("tagger")
        tagger_ner = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_ner,
            label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
            constraint_type=tagger_ner_params.pop("constraint_type", None),
            dropout=tagger_ner_params.pop("dropout", None),
            regularizer=regularizer)
        self._tagger_ner = tagger_ner

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #12
    def test_end_to_end_t5(
        self,
        train_parameters: bool,
        last_layer_only: bool,
        gradient_checkpointing: bool,
    ):
        tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
        token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params(
            {
                "token_embedders": {
                    "bert": {
                        "type": "pretrained_transformer",
                        "model_name": "patrickvonplaten/t5-tiny-random",
                        "train_parameters": train_parameters,
                        "last_layer_only": last_layer_only,
                        "gradient_checkpointing": gradient_checkpointing,
                        "sub_module": "encoder",
                    }
                }
            }
        )
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 8, 64)
        assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
    def test_end_to_end(self):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased"
                }
            },
            "embedder_to_indexer_map": {
                "bert": ["bert", "mask"]
            },
            "allow_unmatched_keys": True,
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"].shape == (2, max_length)

        assert tokens["mask"].tolist() == [[1, 1, 1, 1, 1, 1, 1, 1, 1],
                                           [1, 1, 1, 1, 1, 1, 1, 0, 0]]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
    def test_long_sequence_splitting_end_to_end(self):
        # Mostly the same as the end_to_end test (except for adding max_length=4),
        # because we don't want this splitting behavior to change input/output format.

        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4)

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "max_length": 4,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        # Adds n_segments * 2 special tokens
        segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length
        assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)

        assert tokens["bert"]["mask"].tolist() == [
            [1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 0, 0],
        ]
        assert tokens["bert"]["segment_concat_mask"].tolist() == [
            [1] * segment_concat_length,
            [1] * (segment_concat_length - 4) +
            [0] * 4,  # 4 is hard-coded length difference
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
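The segment_concat_length arithmetic above comes from the indexer splitting each sequence into ceil(max_length / 4) windows and adding two special tokens per window. Spelled out for the longest sentence (9 wordpieces including [CLS] and [SEP]), it is roughly:

    import math

    max_length = 9                               # longest tokenized sentence
    window = 4                                   # max_length given to the indexer/embedder
    n_segments = math.ceil(max_length / window)  # 3 windows
    segment_concat_length = max_length + 2 * n_segments
    assert segment_concat_length == 15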
Example #15
    def test_old_from_params_new_from_params(self):
        old_params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(
                params=old_params, vocab=self.vocab)

        new_params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                }
            }
        })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params,
                                                          vocab=self.vocab)
        assert old_embedder._token_embedders.keys(
        ) == new_embedder._token_embedders.keys()

        assert new_embedder(self.inputs).size() == (1, 4, 10)
Example #16
    def setUp(self):
        super().setUp()

        token_indexers = {"tokens": SingleIdTokenIndexer(namespace="tokens")}
        tokenizer = WordTokenizer()

        list_of_sentences = [
                ["words1 words2 words3", "words1 words2"],
                ["words1", "words1 words2"]
        ]

        paragraphs = [
                ["words1 words2 words3 words1 words2"],
                ["words1 words1 words2"]
        ]

        instances = []
        for sentences, paragraph in zip(list_of_sentences, paragraphs):
            sentences_tokens = [tokenizer.tokenize(sentence) for sentence in sentences]
            sentences_text_fields = [TextField(sentence_tokens, token_indexers)
                                     for sentence_tokens in sentences_tokens]
            sentences_field = ListField(sentences_text_fields)

            fields: Dict[str, Field] = {}
            fields['sentences'] = sentences_field
            paragraph_tokens = [token for sentence_tokens in sentences_tokens for token in sentence_tokens]
            paragraph_text_field = TextField(paragraph_tokens, token_indexers)
            fields['paragraph'] = paragraph_text_field
            instances.append(Instance(fields))

        vocab = Vocabulary.from_instances(instances)
        batch = Batch(instances)
        batch.index_instances(vocab)

        tensor_dict = batch.as_tensor_dict()

        sentences = tensor_dict["sentences"]
        paragraph = tensor_dict["paragraph"]

        text_field_embedder_params = Params({
                "token_embedders": {
                        "tokens": {
                                "type": "embedding",
                                "embedding_dim": 3
                                },
                        }
                })
        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                                 params=text_field_embedder_params)

        self.embedded_sentences = text_field_embedder(sentences, num_wrapping_dims=1)
        self.sentences_mask = get_text_field_mask(sentences, num_wrapping_dims=1).float()
        self.sentences_lengths = self.sentences_mask.sum(dim=-1)

        self.embedded_paragraph = text_field_embedder(paragraph)
        self.paragraph_mask = get_text_field_mask(paragraph).float()
    def test_old_from_params_new_from_params(self):
        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        # Allow loading the parameters in the old format
        with pytest.warns(DeprecationWarning):
            old_embedder = BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        # But also allow loading the parameters in the new format
        new_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert old_embedder._token_embedders.keys() == new_embedder._token_embedders.keys() #pylint: disable=protected-access

        assert new_embedder(self.inputs).size() == (1, 4, 10)
Example #18
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerCoref, self).__init__(vocab=vocab, regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ##############
        # Coref Stuffs
        ##############
        coref_params = params.pop("coref")

        # Encoder
        encoder_coref_params = coref_params.pop("encoder")
        encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
        self._encoder_coref = encoder_coref

        # Tagger: Coreference
        tagger_coref_params = coref_params.pop("tagger")
        eval_on_gold_mentions = tagger_coref_params.pop_bool(
            "eval_on_gold_mentions", False)
        init_params = tagger_coref_params.pop("initializer", None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        tagger_coref = CoreferenceCustom(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            context_layer=self._encoder_coref,
            mention_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("mention_feedforward")),
            antecedent_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("antecedent_feedforward")),
            feature_size=tagger_coref_params.pop_int("feature_size"),
            max_span_width=tagger_coref_params.pop_int("max_span_width"),
            spans_per_word=tagger_coref_params.pop_float("spans_per_word"),
            max_antecedents=tagger_coref_params.pop_int("max_antecedents"),
            lexical_dropout=tagger_coref_params.pop_float(
                "lexical_dropout", 0.2),
            initializer=initializer,
            regularizer=regularizer,
            eval_on_gold_mentions=eval_on_gold_mentions,
        )
        self._tagger_coref = tagger_coref
        if eval_on_gold_mentions:
            self._tagger_coref._eval_on_gold_mentions = True

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #19
    def test_old_from_params_new_from_params(self):

        old_params = Params({
            "words1": {
                "type": "embedding",
                "embedding_dim": 2
            },
            "words2": {
                "type": "embedding",
                "embedding_dim": 5
            },
            "words3": {
                "type": "embedding",
                "embedding_dim": 3
            }
        })

        with pytest.warns(DeprecationWarning):
            BasicTextFieldEmbedder.from_params(params=old_params,
                                               vocab=self.vocab)

        new_params = Params({
            "token_embedders": {
                "words1": {
                    "type": "embedding",
                    "embedding_dim": 2
                },
                "words2": {
                    "type": "embedding",
                    "embedding_dim": 5
                },
                "words3": {
                    "type": "embedding",
                    "embedding_dim": 3
                }
            }
        })

        token_embedder = BasicTextFieldEmbedder.from_params(params=new_params,
                                                            vocab=self.vocab)
        assert token_embedder(self.inputs).size() == (1, 4, 10)
Example #20
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'QAMultiChoiceMaxAttention':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate",
                                            "max")
        share_encoders = params.pop("share_encoders", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(
                question_encoder_params)
        else:
            question_encoder = None

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate",
                                              "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(
                    choice_encoder_params)
            else:
                choice_encoder = None

        # question to choice attention
        att_question_to_choice_params = params.get("att_question_to_choice")
        att_question_to_choice = SimilarityFunction.from_params(
            att_question_to_choice_params)

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   question_encoder=question_encoder,
                   choice_encoder=choice_encoder,
                   initializer=initializer,
                   aggregate_choice=choice_enc_aggregate,
                   aggregate_question=question_enc_aggregate,
                   embeddings_dropout_value=embeddings_dropout_value,
                   att_question_to_choice=att_question_to_choice)
    def test_end_to_end_for_first_sub_token_embedding(self,
                                                      sub_token_mode: str):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, False],
            [True, True, True, True, True, True],
        ]

        assert tokens["bert"]["offsets"].tolist() == [
            [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
            [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)

        assert bert_vectors.size() == (2, max(len(sentence1),
                                              len(sentence2)), 768)
        assert not torch.isnan(bert_vectors).any()
    def test_old_from_params_new_from_params(self):

        old_params = Params({
                "words1": {
                        "type": "embedding",
                        "embedding_dim": 2
                        },
                "words2": {
                        "type": "embedding",
                        "embedding_dim": 5
                        },
                "words3": {
                        "type": "embedding",
                        "embedding_dim": 3
                        }
                })

        with pytest.warns(DeprecationWarning):
            BasicTextFieldEmbedder.from_params(params=old_params, vocab=self.vocab)

        new_params = Params({
                "token_embedders": {
                        "words1": {
                                "type": "embedding",
                                "embedding_dim": 2
                                },
                        "words2": {
                                "type": "embedding",
                                "embedding_dim": 5
                                },
                        "words3": {
                                "type": "embedding",
                                "embedding_dim": 3
                                }
                        }
                })

        token_embedder = BasicTextFieldEmbedder.from_params(params=new_params, vocab=self.vocab)
        assert token_embedder(self.inputs).size() == (1, 4, 10)
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'StackedNNAggregateCustom':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        share_encoders = params.pop("share_encoders", False)

        # premise encoder
        premise_encoder_params = params.pop("premise_encoder", None)
        premise_enc_aggregate = params.pop("premise_encoder_aggregate", "max")
        if premise_encoder_params is not None:
            premise_encoder = Seq2SeqEncoder.from_params(
                premise_encoder_params)
        else:
            premise_encoder = None

        # hypothesis encoder
        if share_encoders:
            hypothesis_enc_aggregate = premise_enc_aggregate
            hypothesis_encoder = premise_encoder
        else:
            hypothesis_encoder_params = params.pop("hypothesis_encoder", None)
            hypothesis_enc_aggregate = params.pop(
                "hypothesis_encoder_aggregate", "max")

            if hypothesis_encoder_params is not None:
                hypothesis_encoder = Seq2SeqEncoder.from_params(
                    hypothesis_encoder_params)
            else:
                hypothesis_encoder = None

        aggregate_feedforward = FeedForward.from_params(
            params.pop('aggregate_feedforward'))

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   aggregate_feedforward=aggregate_feedforward,
                   premise_encoder=premise_encoder,
                   hypothesis_encoder=hypothesis_encoder,
                   initializer=initializer,
                   aggregate_hypothesis=hypothesis_enc_aggregate,
                   aggregate_premise=premise_enc_aggregate,
                   embeddings_dropout_value=embeddings_dropout_value,
                   share_encoders=share_encoders)
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'QAMultiChoice_OneVsRest_Choices_v1':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate", "max")
        share_encoders = params.pop("share_encoders", False)

        # condition the choices or facts encoding on quesiton output states
        choices_init_from_question_states = params.pop("choices_init_from_question_states", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(question_encoder_params)
        else:
            question_encoder = None

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate", "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(choice_encoder_params)
            else:
                choice_encoder = None

        use_choice_sum_instead_of_question = params.get("use_choice_sum_instead_of_question", False)
        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None
                       else InitializerApplicator())

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   question_encoder=question_encoder,
                   choice_encoder=choice_encoder,
                   initializer=initializer,
                   aggregate_choice=choice_enc_aggregate,
                   aggregate_question=question_enc_aggregate,
                   embeddings_dropout_value=embeddings_dropout_value,
                   share_encoders=share_encoders,
                   choices_init_from_question_states=choices_init_from_question_states,
                   use_choice_sum_instead_of_question=use_choice_sum_instead_of_question,
                   params=params)
Example #25
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'LstmSwag':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=embedder_params)
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))

        initializer = InitializerApplicator.from_params(
            params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(
            params.pop('regularizer', []))
        params.assert_empty(cls.__name__)
        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   encoder=encoder,
                   initializer=initializer,
                   regularizer=regularizer)
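A from_params hook like the one above is normally fed a Params object deserialized from an experiment configuration. A hypothetical sketch of a matching config fragment (the keys and sizes are illustrative, not taken from a real experiment file):

    from allennlp.common import Params

    params = Params({
        "text_field_embedder": {
            "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 50}}
        },
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 25},
    })
    # model = LstmSwag.from_params(vocab=vocab, params=params)  # vocab built elsewhere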
Example #26
    def test_token_without_wordpieces(self):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", "", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "", "great"]
        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]
        vocab = Vocabulary()
        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"]["offsets"].tolist() == [
            [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
            [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
        ]

        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, max(len(sentence1),
                                              len(sentence2)), 768)
        assert not torch.isnan(bert_vectors).any()
        assert all(bert_vectors[0, 1] == 0)
        assert all(bert_vectors[1, 1] == 0)
Example #27
 def test_forward_runs_with_non_bijective_mapping_with_null(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / "elmo"
     options_file = str(elmo_fixtures_path / "options.json")
     weight_file = str(elmo_fixtures_path / "lm_weights.hdf5")
     params = Params({
         "token_embedders": {
             "elmo": {
                 "type": "elmo_token_embedder",
                 "options_file": options_file,
                 "weight_file": weight_file,
             }
         },
         "embedder_to_indexer_map": {
             # ignore `word_inputs` in `ElmoTokenEmbedder.forward`
             "elmo": ["elmo", None]
         },
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {"elmo": (torch.rand(3, 6, 50) * 15).long()}
     token_embedder(inputs)
Example #28
 def test_forward_runs_with_forward_params(self):
     elmo_fixtures_path = self.FIXTURES_ROOT / "elmo_multilingual" / "es"
     options_file = str(elmo_fixtures_path / "options.json")
     weight_file = str(elmo_fixtures_path / "weights.hdf5")
     params = Params({
         "token_embedders": {
             "elmo": {
                 "type": "elmo_token_embedder_multilang",
                 "options_files": {
                     "es": options_file
                 },
                 "weight_files": {
                     "es": weight_file
                 },
             }
         }
     })
     token_embedder = BasicTextFieldEmbedder.from_params(self.vocab, params)
     inputs = {"elmo": (torch.rand(3, 6, 50) * 15).long()}
     kwargs = {"lang": "es"}
     token_embedder(inputs, **kwargs)
    def test_throws_error_on_incorrect_sub_token_mode(self,
                                                      sub_token_mode: str):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        with pytest.raises(ConfigurationError):
            token_embedder(tokens)
Example #30
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerRelation, self).__init__(vocab=vocab,
                                            regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############################
        # Relation Extraction Stuffs
        ############################
        relation_params = params.pop("relation")

        # Encoder
        encoder_relation_params = relation_params.pop("encoder")
        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
        self._encoder_relation = encoder_relation

        # Tagger: Relation
        tagger_relation_params = relation_params.pop("tagger")
        tagger_relation = RelationExtractor(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            context_layer=self._encoder_relation,
            d=tagger_relation_params.pop_int("d"),
            l=tagger_relation_params.pop_int("l"),
            n_classes=tagger_relation_params.pop("n_classes"),
            activation=tagger_relation_params.pop("activation"),
        )
        self._tagger_relation = tagger_relation

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #31
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):

        super(LayerNerEmdCoref, self).__init__(vocab=vocab,
                                               regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # NER Stuffs
        ############
        ner_params = params.pop("ner")

        # Encoder
        encoder_ner_params = ner_params.pop("encoder")
        encoder_ner = Seq2SeqEncoder.from_params(encoder_ner_params)
        self._encoder_ner = encoder_ner

        # Tagger NER - CRF Tagger
        tagger_ner_params = ner_params.pop("tagger")
        tagger_ner = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_ner,
            label_namespace=tagger_ner_params.pop("label_namespace", "labels"),
            constraint_type=tagger_ner_params.pop("constraint_type", None),
            dropout=tagger_ner_params.pop("dropout", None),
            regularizer=regularizer)
        self._tagger_ner = tagger_ner

        ############
        # EMD Stuffs
        ############
        emd_params = params.pop("emd")

        # Encoder
        encoder_emd_params = emd_params.pop("encoder")
        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
        self._encoder_emd = encoder_emd

        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner])
        self._shortcut_text_field_embedder = shortcut_text_field_embedder

        # Tagger: EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        tagger_emd = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder,
            encoder=self._encoder_emd,
            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
            constraint_type=tagger_emd_params.pop("constraint_type", None),
            dropout=tagger_ner_params.pop("dropout", None),
            regularizer=regularizer)
        self._tagger_emd = tagger_emd

        ##############
        # Coref Stuffs
        ##############
        coref_params = params.pop("coref")

        # Encoder
        encoder_coref_params = coref_params.pop("encoder")
        encoder_coref = Seq2SeqEncoder.from_params(encoder_coref_params)
        self._encoder_coref = encoder_coref

        shortcut_text_field_embedder_coref = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_ner, self._encoder_emd])
        self._shortcut_text_field_embedder_coref = shortcut_text_field_embedder_coref

        # Tagger: Coreference
        tagger_coref_params = coref_params.pop("tagger")
        eval_on_gold_mentions = tagger_coref_params.pop_bool(
            "eval_on_gold_mentions", False)
        init_params = tagger_coref_params.pop("initializer", None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        tagger_coref = CoreferenceCustom(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_coref,
            context_layer=self._encoder_coref,
            mention_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("mention_feedforward")),
            antecedent_feedforward=FeedForward.from_params(
                tagger_coref_params.pop("antecedent_feedforward")),
            feature_size=tagger_coref_params.pop_int("feature_size"),
            max_span_width=tagger_coref_params.pop_int("max_span_width"),
            spans_per_word=tagger_coref_params.pop_float("spans_per_word"),
            max_antecedents=tagger_coref_params.pop_int("max_antecedents"),
            lexical_dropout=tagger_coref_params.pop_float(
                "lexical_dropout", 0.2),
            initializer=initializer,
            regularizer=regularizer,
            eval_on_gold_mentions=eval_on_gold_mentions)
        self._tagger_coref = tagger_coref
        if eval_on_gold_mentions:
            self._tagger_coref._eval_on_gold_mentions = True

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #32
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: RegularizerApplicator = None):
        super(JointBiaffine, self).__init__(vocab=vocab,
                                            regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        # Encoder
        encoder_params = params.pop("encoder")
        encoder = Seq2SeqEncoder.from_params(encoder_params)
        self._encoder = encoder

        self._group_shared_matrix = torch.FloatTensor()

        self._tag_representation_dim = params.pop('tag_representation_dim')
        self._arc_representation_dim = params.pop('arc_representation_dim')

        self._dropout = params.pop('dropout')
        self._input_dropout = params.pop('input_dropout')

        ############
        # DSP Stuffs
        ############
        dsp_params = params.pop("dsp")

        init_params = dsp_params.pop("initializer", None)
        self._initializer = (InitializerApplicator.from_params(init_params)
                             if init_params is not None else
                             InitializerApplicator())
        pos_params = dsp_params.pop("pos_tag_embedding")
        self._pos_tag_embedding = Embedding.from_params(vocab, pos_params)

        # Tagger DSP - Biaffine Tagger
        tagger_dsp = BiaffineParser(
            vocab=vocab,
            task_type='dsp',
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder,
            tag_representation_dim=self._tag_representation_dim,
            arc_representation_dim=self._arc_representation_dim,
            pos_tag_embedding=self._pos_tag_embedding,
            dropout=self._dropout,
            input_dropout=self._input_dropout,
            initializer=self._initializer)
        self._tagger_dsp = tagger_dsp

        # arc shared
        self._arc_attention = tagger_dsp.arc_attention
        self._head_arc_feedforward = tagger_dsp.head_arc_feedforward
        self._child_arc_feedforward = tagger_dsp.child_arc_feedforward

        ############
        # SRL Stuffs
        ############
        srl_params = params.pop("srl")

        init_params = srl_params.pop("initializer", None)
        self._initializer = (InitializerApplicator.from_params(init_params)
                             if init_params is not None else
                             InitializerApplicator())
        pos_params = srl_params.pop("pos_tag_embedding")
        self._pos_tag_embedding = Embedding.from_params(vocab, pos_params)

        # Tagger: EMD - CRF Tagger
        tagger_srl = BiaffineParser(
            vocab=vocab,
            task_type='srl',
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder,
            tag_representation_dim=self._tag_representation_dim,
            arc_representation_dim=self._arc_representation_dim,
            pos_tag_embedding=self._pos_tag_embedding,
            dropout=self._dropout,
            input_dropout=self._input_dropout,
            initializer=self._initializer)
        tagger_srl.arc_attention = self._arc_attention
        tagger_srl.head_arc_feedforward = self._head_arc_feedforward
        tagger_srl.child_arc_feedforward = self._child_arc_feedforward
        self._tagger_srl = tagger_srl

        logger.info("Multi-Task Learning Model has been instantiated.")
Example #33
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: Optional[RegularizerApplicator] = None):

        super(LayerPOSChunkDeppar, self).__init__(vocab=vocab,
                                                  regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # POS Stuffs
        ############
        pos_params = params.pop("pos")

        # Encoder
        encoder_pos_params = pos_params.pop("encoder")
        encoder_pos = Seq2SeqEncoder.from_params(encoder_pos_params)
        self._encoder_pos = encoder_pos

        # Tagger POS - Simple Tagger
        tagger_pos_params = pos_params.pop("tagger")
        # This simple tagger can later be replaced with a dedicated POS model
        tagger_pos = PosSimpleTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_pos,
            label_namespace=tagger_pos_params.pop("label_namespace", "labels"),
            regularizer=regularizer,
        )
        self._tagger_pos = tagger_pos

        ############
        # Chunk Stuffs
        ############
        chunk_params = params.pop("chunk")

        # Encoder
        encoder_chunk_params = chunk_params.pop("encoder")
        encoder_chunk = Seq2SeqEncoder.from_params(encoder_chunk_params)
        self._encoder_chunk = encoder_chunk

        shortcut_text_field_embedder = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_pos])
        self._shortcut_text_field_embedder = shortcut_text_field_embedder

        # Tagger: Chunk - CRF Tagger
        tagger_chunk_params = chunk_params.pop("tagger")
        tagger_chunk = ChunkSimpleTagger(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder,
            encoder=self._encoder_chunk,
            label_namespace=tagger_chunk_params.pop("label_namespace",
                                                    "labels"),
            label_encoding=tagger_chunk_params.pop("label_encoding", None),
            regularizer=regularizer,
        )
        self._tagger_chunk = tagger_chunk

        ###########
        # Dependency Parsing Stuffs
        ###########
        deppar_params = params.pop("deppar")

        # Encoder
        encoder_deppar_params = deppar_params.pop("encoder")
        encoder_deppar = Seq2SeqEncoder.from_params(encoder_deppar_params)
        self._encoder_deppar = encoder_deppar

        shortcut_text_field_embedder_deppar = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_pos, self._encoder_chunk])
        self._shortcut_text_field_embedder_deppar = shortcut_text_field_embedder_deppar

        # Parser: Dependency Parser - Biaffine Parser
        parser_deppar_params = deppar_params.pop("parser")
        embedding_deppar_params = deppar_params.pop("pos_tag_embedding")
        pos_namespace = embedding_deppar_params.pop("vocab_namespace")
        embedding = Embedding(
            # Size the POS tag embedding on its own namespace rather than 'tokens'.
            num_embeddings=vocab.get_vocab_size(pos_namespace),
            embedding_dim=embedding_deppar_params.pop_int("embedding_dim"),
            vocab_namespace=pos_namespace)
        init_params = parser_deppar_params.pop("initializer", None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        tagger_deppar = BiaffineDependencyParser(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_deppar,
            encoder=self._encoder_deppar,
            tag_representation_dim=parser_deppar_params.pop_int(
                "tag_representation_dim"),
            arc_representation_dim=parser_deppar_params.pop_int(
                "arc_representation_dim"),
            pos_tag_embedding=embedding,
            use_mst_decoding_for_validation=parser_deppar_params.pop(
                "use_mst_decoding_for_validation"),
            dropout=parser_deppar_params.pop_float("dropout"),
            input_dropout=parser_deppar_params.pop_float("input_dropout"),
            initializer=initializer,
            regularizer=regularizer)
        self._tagger_deppar = tagger_deppar

        logger.info("Multi-Task Learning Model has been instantiated.")
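
For reference, a minimal sketch of the Params layout this constructor consumes. Only the key names mirror what the code above pops; the encoder types, dimensions, and namespaces are illustrative assumptions, as is the premise that ShortcutConnectTextFieldEmbedder's output dimension equals the base embedding size plus the output sizes of the previous encoders.

# `vocab` is an existing Vocabulary built from the training data.
params = Params({
    "text_field_embedder": {
        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 50}}
    },
    "pos": {
        "encoder": {"type": "lstm", "input_size": 50, "hidden_size": 100},
        "tagger": {"label_namespace": "pos_labels"},
    },
    "chunk": {
        # Shortcut embedder output assumed to be 50 (base) + 100 (POS encoder).
        "encoder": {"type": "lstm", "input_size": 150, "hidden_size": 100},
        "tagger": {"label_namespace": "chunk_labels", "label_encoding": "BIO"},
    },
    "deppar": {
        # 50 (base) + 100 (POS encoder) + 100 (chunk encoder) + 25 (POS tag embedding).
        "encoder": {"type": "lstm", "input_size": 275, "hidden_size": 200},
        "pos_tag_embedding": {"embedding_dim": 25, "vocab_namespace": "pos"},
        "parser": {
            "tag_representation_dim": 100,
            "arc_representation_dim": 100,
            "use_mst_decoding_for_validation": True,
            "dropout": 0.3,
            "input_dropout": 0.3,
        },
    },
})
model = LayerPOSChunkDeppar(vocab=vocab, params=params)
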
Beispiel #34
0
    @classmethod
    def from_params(cls, vocab: Vocabulary,
                    params: Params) -> 'QAMultiChoiceKnowReader_v1':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab, embedder_params)

        embeddings_dropout_value = params.pop("embeddings_dropout", 0.0)

        # whether we want to use knowledge
        use_knowledge = params.pop("use_knowledge", True)
        use_ctx2facts_retrieval_map_as_mask = params.pop(
            "use_ctx2facts_retrieval_map_as_mask", False)

        # question encoder
        question_encoder_params = params.pop("question_encoder", None)
        question_enc_aggregate = params.pop("question_encoder_aggregate",
                                            "max")
        share_encoders = params.pop("share_encoders", False)

        # condition the choices or facts encoding on question output states
        choices_init_from_question_states = params.pop(
            "choices_init_from_question_states", False)
        facts_init_from_question_states = params.pop(
            "facts_init_from_question_states", False)

        if question_encoder_params is not None:
            question_encoder = Seq2SeqEncoder.from_params(
                question_encoder_params)
        else:
            question_encoder = None

        knowledge_encoder = None
        knowledge_enc_aggregate = "max"

        if share_encoders:
            choice_encoder = question_encoder
            choice_enc_aggregate = question_enc_aggregate

            if use_knowledge:
                knowledge_encoder = question_encoder
                knowledge_enc_aggregate = question_enc_aggregate
        else:
            # choice encoder
            choice_encoder_params = params.pop("choice_encoder", None)
            choice_enc_aggregate = params.pop("choice_encoder_aggregate",
                                              "max")

            if choice_encoder_params is not None:
                choice_encoder = Seq2SeqEncoder.from_params(
                    choice_encoder_params)
            else:
                choice_encoder = None

            if use_knowledge:
                knowledge_encoder_params = params.pop("knowledge_encoder",
                                                      None)
                knowledge_enc_aggregate = params.pop(
                    "knowledge_encoder_aggregate", "max")

                if knowledge_encoder_params is not None:
                    knowledge_encoder = Seq2SeqEncoder.from_params(
                        knowledge_encoder_params)
                else:
                    knowledge_encoder = None

        know_interactions_params = params.get("know_interactions")
        know_interactions_aggregate_ffw_params = know_interactions_params.get(
            'aggregate_feedforward')

        # The aggregate feedforward's input_dim is inferred automatically from
        # the number of configured knowledge interactions.
        update_params(know_interactions_aggregate_ffw_params, {
            "input_dim":
            len(know_interactions_params.get("interactions", []))
        })
        know_aggregate_feedforward = FeedForward.from_params(
            know_interactions_aggregate_ffw_params)

        init_params = params.pop('initializer', None)
        initializer = (InitializerApplicator.from_params(init_params)
                       if init_params is not None else InitializerApplicator())

        return cls(
            vocab=vocab,
            text_field_embedder=text_field_embedder,
            question_encoder=question_encoder,
            choice_encoder=choice_encoder,
            use_knowledge=use_knowledge,
            facts_encoder=knowledge_encoder,
            know_aggregate_feedforward=know_aggregate_feedforward,
            initializer=initializer,
            aggregate_choice=choice_enc_aggregate,
            aggregate_question=question_enc_aggregate,
            aggregate_facts=knowledge_enc_aggregate,
            embeddings_dropout_value=embeddings_dropout_value,
            share_encoders=share_encoders,
            choices_init_from_question_states=choices_init_from_question_states,
            facts_init_from_question_states=facts_init_from_question_states,
            use_ctx2facts_retrieval_map_as_mask=
            use_ctx2facts_retrieval_map_as_mask,
            params=params)
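
A hedged sketch of a config that could drive this from_params method. The key names mirror what the method pops or gets above; the encoder and feedforward settings, and the format of the "interactions" list, are illustrative assumptions.

# `vocab` is an existing Vocabulary; `from_params` is assumed to be exposed as
# a classmethod, as is conventional for AllenNLP models.
params = Params({
    "text_field_embedder": {
        "token_embedders": {"tokens": {"type": "embedding", "embedding_dim": 100}}
    },
    "embeddings_dropout": 0.2,
    "use_knowledge": True,
    "question_encoder": {"type": "lstm", "input_size": 100, "hidden_size": 128},
    "question_encoder_aggregate": "max",
    # With share_encoders=True the choice and knowledge encoders reuse the
    # question encoder, so no separate encoder blocks are needed.
    "share_encoders": True,
    "know_interactions": {
        "interactions": [["ctx", "ctx+kn"]],  # hypothetical interaction spec
        "aggregate_feedforward": {
            # input_dim is injected automatically from len(interactions).
            "num_layers": 1,
            "hidden_dims": 1,
            "activations": "linear",
        },
    },
})
model = QAMultiChoiceKnowReader_v1.from_params(vocab=vocab, params=params)
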
Beispiel #35
0
    def __init__(self,
                 vocab: Vocabulary,
                 params: Params,
                 regularizer: Optional[RegularizerApplicator] = None):

        super(LayerEmdRelation, self).__init__(vocab=vocab,
                                               regularizer=regularizer)

        # Base text Field Embedder
        text_field_embedder_params = params.pop("text_field_embedder")
        text_field_embedder = BasicTextFieldEmbedder.from_params(
            vocab=vocab, params=text_field_embedder_params)
        self._text_field_embedder = text_field_embedder

        ############
        # EMD Stuffs
        ############
        emd_params = params.pop("emd")

        # Encoder
        encoder_emd_params = emd_params.pop("encoder")
        encoder_emd = Seq2SeqEncoder.from_params(encoder_emd_params)
        self._encoder_emd = encoder_emd

        # Tagger EMD - CRF Tagger
        tagger_emd_params = emd_params.pop("tagger")
        tagger_emd = CrfTagger(
            vocab=vocab,
            text_field_embedder=self._text_field_embedder,
            encoder=self._encoder_emd,
            label_namespace=tagger_emd_params.pop("label_namespace", "labels"),
            label_encoding=tagger_emd_params.pop("label_encoding", None),
            dropout=tagger_emd_params.pop("dropout", None),
            regularizer=regularizer,
        )
        self._tagger_emd = tagger_emd

        ############################
        # Relation Extraction Stuffs
        ############################
        relation_params = params.pop("relation")

        # Encoder
        encoder_relation_params = relation_params.pop("encoder")
        encoder_relation = Seq2SeqEncoder.from_params(encoder_relation_params)
        self._encoder_relation = encoder_relation

        shortcut_text_field_embedder_relation = ShortcutConnectTextFieldEmbedder(
            base_text_field_embedder=self._text_field_embedder,
            previous_encoders=[self._encoder_emd])
        self._shortcut_text_field_embedder_relation = shortcut_text_field_embedder_relation

        # Tagger: Relation
        tagger_relation_params = relation_params.pop("tagger")
        tagger_relation = RelationExtractor(
            vocab=vocab,
            text_field_embedder=self._shortcut_text_field_embedder_relation,
            context_layer=self._encoder_relation,
            d=tagger_relation_params.pop_int("d"),
            l=tagger_relation_params.pop_int("l"),
            n_classes=tagger_relation_params.pop("n_classes"),
            activation=tagger_relation_params.pop("activation"),
        )
        self._tagger_relation = tagger_relation

        logger.info("Multi-Task Learning Model has been instantiated.")
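
The shortcut connection implies a dimensionality contract between the base embedder, the EMD encoder, and the relation encoder. A small sanity check, assuming ShortcutConnectTextFieldEmbedder concatenates the base embeddings with each previous encoder's output and that `model` is a hypothetical LayerEmdRelation instance:

base_dim = model._text_field_embedder.get_output_dim()
emd_dim = model._encoder_emd.get_output_dim()
# The relation tagger sees base embeddings concatenated with EMD encoder states,
# so the relation encoder must accept base_dim + emd_dim features.
assert model._shortcut_text_field_embedder_relation.get_output_dim() == base_dim + emd_dim
assert model._encoder_relation.get_input_dim() == base_dim + emd_dim
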