Example #1
0
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        """
        Build the layers of a BIO sequence tagger.

        Params
        ------
        vocab: the vocabulary; its "tokens" namespace sizes the look-up embedding and
                its "labels" namespace sizes the tag projection, the CRF and the F1 metric.
        bert_embedder: if given, it is stored and used as the embedding layer. Otherwise a
                1024-dimensional look-up embedding plus a bidirectional LSTM
                (hidden size 512 per direction) are created.
        encoder: optional contextual encoder. When given, its output dimension is the
                input size of the tag projection.
        dropout: dropout rate. A falsy value (None or 0) disables dropout.
        use_crf: if True, a CRF constrained to valid BIO transitions is created.
        """
        super().__init__(vocab)

        if bert_embedder is not None:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        # Pick the feature size that feeds the tag projection. Previously the
        # no-encoder case always dereferenced ``bert_embedder``, which crashed
        # with AttributeError when neither an encoder nor a BERT embedder was given.
        if encoder is not None:
            hidden2tag_in_dim = encoder.get_output_dim()
        elif bert_embedder is not None:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        else:
            # NOTE(review): assumes forward() feeds the BiLSTM output to hidden2tag
            # in this configuration; the look-up embedding has the same width (1024),
            # so the size is identical either way.
            hidden2tag_in_dim = self.rnn.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))

        # A falsy rate (None or 0) means "no dropout layer".
        self.dropout = torch.nn.Dropout(dropout) if dropout else None

        self.use_crf = use_crf
        if use_crf:
            # Restrict the CRF to transitions that are legal under BIO tagging.
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )

        # Span-level F1 over the "labels" namespace; the listed classes are
        # excluded from the metric.
        self.f1 = SpanBasedF1Measure(vocab,
                                     tag_namespace="labels",
                                     ignore_classes=["news/type", "negation",
                                                     "demonstrative_reference",
                                                     "timer/noun", "timer/attributes"],
                                     label_encoding="BIO")
    def test_stacked_bidirectional_lstm_can_build_from_params(self):
        """A stacked bidirectional LSTM can be constructed from ``Params``."""
        input_size, hidden_size = 5, 9
        config = {
            "type": "stacked_bidirectional_lstm",
            "input_size": input_size,
            "hidden_size": hidden_size,
            "num_layers": 3,
        }
        built = Seq2SeqEncoder.from_params(Params(config))

        assert built.get_input_dim() == input_size
        # The expected output width is twice the hidden size (18).
        assert built.get_output_dim() == 2 * hidden_size
        assert built.is_bidirectional
Example #3
0
    def test_stacked_bidirectional_lstm_can_build_from_params(self):
        """Build a stacked bidirectional LSTM from ``Params`` and check its reported dims."""
        settings = dict(type="stacked_bidirectional_lstm",
                        input_size=5,
                        hidden_size=9,
                        num_layers=3)
        lstm_stack = Seq2SeqEncoder.from_params(Params(settings))

        # Same three checks as before: input width, output width, direction flag.
        expected_input_dim = 5
        expected_output_dim = 18
        assert lstm_stack.get_input_dim() == expected_input_dim
        assert lstm_stack.get_output_dim() == expected_output_dim
        assert lstm_stack.is_bidirectional
Example #4
0
    vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200

    model_params = Params({
        'type': 'lstm',
        'input_size': EMBEDDING_DIM,
        'hidden_size': HIDDEN_DIM
    })

    token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                embedding_dim=EMBEDDING_DIM)
    word_embedding = BasicTextFieldEmbedder({'tokens': token_embedding})
    lstm = Seq2SeqEncoder.from_params(model_params)

    model = POSTagger(word_embedding, lstm, vocab)

    optimizer = optim.Adam(model.parameters())

    iterator = BucketIterator(batch_size=64,
                              sorting_keys=[('sentence', 'num_tokens')])
    iterator.index_with(vocab)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      patience=10,
Example #5
0
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True,
                 add_random_noise: bool = False,
                 add_attack_noise: bool = False,
                 do_noise_normalization: bool = True,
                 noise_norm: Optional[float] = None,
                 noise_loss_prob: Optional[float] = None,
                 add_noise_for: str = "ov",
                 rnn_after_embeddings: bool = False,
                 open_vocabulary_slots: Optional[List[str]] = None,
                 metrics_for_each_slot_type: bool = False) -> None:
        """
        Params
        ------
        vocab: the allennlp Vocabulary object, will be automatically passed
        bert_embedder: the pretrained BERT embedder. If it is not None, the pretrained BERT
                embedding (parameter fixed) will be used as the embedding layer. Otherwise, a look-up
                embedding matrix will be initialized with the embedding size 1024. The default is None.
        encoder: the contextual encoder used after the embedding layer. If set to None, no contextual
                encoder will be used.
        dropout: the dropout rate, won't be set in all our experiments. A falsy value (None or 0)
                disables dropout.
        use_crf: if set to True, CRF will be used at the end of the model (as output layer). Otherwise,
                a softmax layer (with cross-entropy loss) will be used.
        add_random_noise: whether to add random noise to slots. Cannot be set simultaneously
                with add_attack_noise. This setting is used as baseline in our experiments.
        add_attack_noise: whether to add adversarial attack noise to slots. Cannot be set simultaneously
                with add_random_noise.
        do_noise_normalization: if set to True, the normalization will be applied to gradients w.r.t.
                token embeddings. Otherwise, the gradients won't be normalized.
        noise_norm: the normalization norm (L2) applied to gradients. Required when either noise
                option is enabled.
        noise_loss_prob: the alpha hyperparameter to balance the loss from normal forward and adversarial
                forward. See the paper for more details. Must lie in [0, 1]; required when either noise
                option is enabled.
        add_noise_for: if set to "ov", the noise will only be applied to open-vocabulary slots. If set
                to "all", the noise will be applied to all slots (both open-vocabulary and normal slots).
                Only validated when a noise option is enabled.
        rnn_after_embeddings: if set to True, an additional BiLSTM layer will be applied after the embedding
                layer. Default is False. Only takes effect when bert_embedder is None.
        open_vocabulary_slots: the list of open-vocabulary slots. If not set, will be set to open-vocabulary
                slots of Snips dataset by default.
        metrics_for_each_slot_type: whether to log metrics for each slot type. Default is False.
        """
        super().__init__(vocab)

        if bert_embedder is not None:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens":
                Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn_after_embeddings = rnn_after_embeddings
            if rnn_after_embeddings:
                self.rnn = Seq2SeqEncoder.from_params(
                    Params({
                        "type": "lstm",
                        "input_size": 1024,
                        "hidden_size": 512,
                        "bidirectional": True,
                        "batch_first": True
                    }))

        self.encoder = encoder

        # Pick the feature size that feeds the tag projection. Previously the
        # no-encoder case always dereferenced ``bert_embedder``, which crashed
        # with AttributeError when neither an encoder nor a BERT embedder was given.
        if encoder is not None:
            hidden2tag_in_dim = encoder.get_output_dim()
        elif bert_embedder is not None:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        elif rnn_after_embeddings:
            # NOTE(review): assumes forward() applies self.rnn before hidden2tag
            # in this configuration — confirm against forward().
            hidden2tag_in_dim = self.rnn.get_output_dim()
        else:
            hidden2tag_in_dim = self.basic_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(
            torch.nn.Linear(in_features=hidden2tag_in_dim,
                            out_features=vocab.get_vocab_size("labels")))

        # A falsy rate (None or 0) means "no dropout layer".
        self.dropout = torch.nn.Dropout(dropout) if dropout else None

        self.use_crf = use_crf
        if use_crf:
            # Restrict the CRF to transitions that are legal under BIO tagging.
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels"))
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True)

        # default open_vocabulary slots: for SNIPS dataset
        open_vocabulary_slots = open_vocabulary_slots or [
            "playlist", "entity_name", "poi", "restaurant_name",
            "geographic_poi", "album", "track", "object_name", "movie_name"
        ]
        self.f1 = OVSpecSpanBasedF1Measure(
            vocab,
            tag_namespace="labels",
            ignore_classes=[],
            label_encoding="BIO",
            open_vocabulary_slots=open_vocabulary_slots)

        self.add_random_noise = add_random_noise
        self.add_attack_noise = add_attack_noise
        assert not (add_random_noise and
                    add_attack_noise), "both random and attack noise applied"
        if add_random_noise or add_attack_noise:
            # The noise hyperparameters are only validated and stored when some
            # form of noise is enabled.
            self.do_noise_normalization = do_noise_normalization
            assert noise_norm is not None, \
                "noise_norm must be set when noise is enabled"
            assert noise_loss_prob is not None and 0. <= noise_loss_prob <= 1., \
                "noise_loss_prob must be set to a value in [0, 1] when noise is enabled"
            self.noise_norm = noise_norm
            self.noise_loss_prob = noise_loss_prob
            assert add_noise_for in ["ov", "all"], \
                "add_noise_for must be either 'ov' or 'all'"
            self.ov_noise_only = (add_noise_for == "ov")

        self.metrics_for_each_slot_type = metrics_for_each_slot_type