Exemple #1
0
    def __init__(self):
        """Assemble the CopyNet semantic parser and keep its parts on ``self``.

        The vocabulary is loaded from a fixed archive path. An embedder, a
        dataset reader, an LSTM encoder, an attention module and the decoding
        settings are then created, stored as attributes, and passed to
        ``CopyNetSeq2Seq``, which is stored as ``self.semantic_parser``.
        """
        # Width shared by the source embedding and the LSTM input.
        source_embedding_dim = 310
        # Width shared by the LSTM hidden state and both attention dimensions.
        encoder_hidden_dim = 128

        # CopyNet model initialization parameters.
        # `from_files` is a classmethod, so it is called on the class directly
        # instead of on a freshly built (and immediately discarded) instance.
        self.vocabulary = Vocabulary.from_files(
            "C:/Users/Selma/PycharmProjects/ROS2SemanticParser/"
            "CN_model_weights/no_embedds/model.tar.gz")

        source_token_embedding = Embedding(
            num_embeddings=self.vocabulary.get_vocab_size('source_tokens'),
            embedding_dim=source_embedding_dim)
        self.source_embedder = BasicTextFieldEmbedder(
            token_embedders={'tokens': source_token_embedding})

        self.dataset_reader = CopyNetDatasetReader(
            target_namespace="target_tokens")

        self.encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(input_size=source_embedding_dim,
                          hidden_size=encoder_hidden_dim,
                          num_layers=1,
                          batch_first=True))
        self.attention = BilinearAttention(vector_dim=encoder_hidden_dim,
                                           matrix_dim=encoder_hidden_dim)

        # Decoding settings.
        self.beam_size = 5
        self.max_decoding_steps = 200
        self.target_embedding_dim = 150

        self.semantic_parser = CopyNetSeq2Seq(
            vocab=self.vocabulary,
            source_embedder=self.source_embedder,
            encoder=self.encoder,
            attention=self.attention,
            beam_size=self.beam_size,
            max_decoding_steps=self.max_decoding_steps,
            target_embedding_dim=self.target_embedding_dim)
Exemple #2
0
    def init_model(self) -> Model:
        """Build the sequence tagger on top of a pretrained transformer.

        The transformer named by ``self.config.model_name`` embeds the
        ``tokens`` field. A ``PassThroughEncoder`` sized to the embedder's
        output dimension is used as the encoder of a ``SimpleTagger`` with
        span F1 enabled and ``BMES`` label encoding. The vocabulary comes
        from ``self.vocab``.

        Returns:
            Model: the tagger, moved to ``self.config.device``.
        """
        bert_text_field_embedder = PretrainedTransformerEmbedder(
            model_name=self.config.model_name)
        tagger = SimpleTagger(
            vocab=self.vocab,
            text_field_embedder=BasicTextFieldEmbedder(
                token_embedders={
                    'tokens': bert_text_field_embedder
                }
            ),
            encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
            verbose_metrics=True,
            calculate_span_f1=True,
            label_encoding="BMES",
        )

        tagger.to(device=self.config.device)
        return tagger
Exemple #3
0
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        """Set up the embedding, projection, CRF and metric layers of the tagger.

        Args:
            vocab: vocabulary; its ``tokens`` and ``labels`` namespaces size
                the look-up embedding and the tag projection.
            bert_embedder: pretrained BERT embedder. When omitted, a 1024-dim
                look-up embedding and a bidirectional LSTM are created
                instead.
            encoder: optional contextual encoder; stored as ``self.encoder``.
            dropout: dropout rate; a falsy value (``None`` or ``0``) leaves
                ``self.dropout`` as ``None``.
            use_crf: when true, a BIO-constrained CRF layer is created.
        """
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        # Pick the input width of the tag projection from whichever layer
        # produces the final token representation.
        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        elif bert_embedder:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        else:
            # Neither an encoder nor BERT was given: previously this path
            # dereferenced ``bert_embedder`` (None) and raised AttributeError.
            # The look-up embedding is 1024 wide, and the BiLSTM built above
            # (2 x 512) has the same output width.
            hidden2tag_in_dim = self.basic_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )

        # Span F1 over BIO tags; the listed slot types are excluded.
        self.f1 = SpanBasedF1Measure(vocab,
                                     tag_namespace="labels",
                                     ignore_classes=["news/type","negation",
                                                     "demonstrative_reference",
                                                     "timer/noun","timer/attributes"],
                                     label_encoding="BIO")
def main():
    """Export the inner model of a ``JointIntentSlotDepsModel`` to ONNX.

    The vocabulary and the ``best.th`` weights are read from
    ``args.serialization_dir``. The model is rebuilt from hard-coded
    hyperparameters, the weights are loaded on CPU, and
    ``model._inner_model`` is exported to ``args.output_pt``.

    Returns:
        int: always 0.
    """
    args = parse_args()
    vocabulary = os.path.join(args.serialization_dir, 'vocabulary')
    vocab = Vocabulary.from_files(vocabulary)
    embedding = EmbeddingV2(
        False,
        num_embeddings=23378,
        embedding_dim=200,
        padding_index=0,
        trainable=False)
    token_embedders = {'tokens': embedding}
    basic_text_field_embedder = BasicTextFieldEmbedder(token_embedders)
    transformer = Transformer(
        attention_dropout_prob=0.1,
        attention_type="dot_product",
        dropout_prob=0.1,
        input_size=200,
        intermediate_act_fn="gelu",
        intermediate_size=3072,
        key_depth=1024,
        max_position_embeddings=256,
        memory_size=200,
        num_heads=16,
        num_hidden_layers=6,
        type_vocab_size=2,
        use_fp16=False,
        value_depth=1024,
        use_token_type=False,
        use_position_embeddings=True)
    model = JointIntentSlotDepsModel(
        text_field_embedder=basic_text_field_embedder,
        transformer=transformer,
        vocab=vocab,
        label_encoding="BIO",
        constrain_crf_decoding=True,
        calculate_span_f1=True,
        include_start_end_transitions=True,
        use_fp16=False)

    # Example inputs handed to the exporter: one sequence of 14 positions,
    # each a 200-wide float vector, plus an all-ones mask.
    dummy_input = torch.ones(1, 14, 200, dtype=torch.float)
    dummy_mask = torch.ones(1, 14, dtype=torch.float)

    # NOTE(review): forward pass before the weights are loaded; the result is
    # not used. Kept in case the call is needed for its side effects - confirm.
    model._inner_model(dummy_input, dummy_mask)

    model_state = torch.load(
        os.path.join(args.serialization_dir, 'best.th'),
        map_location=torch.device('cpu'))
    model.load_state_dict(model_state)

    torch.onnx.export(
        model=model._inner_model,
        args=(dummy_input, dummy_mask),
        f=args.output_pt,
        verbose=True,
        export_params=True)
    return 0
Exemple #5
0
def test_concat_position_embeddings():
    """Exercise ``util.concat_position_embeddings`` in three situations.

    1. Position indexes and an embedder are given: the embedding width is
       appended to the last dimension of the encoded text.
    2. Neither is given: the encoded text comes back unchanged.
    3. Only the embedder is given: a ``ValueError`` is raised.
    """
    batch_size, number_targets = 2, 2
    text_seq_length, encoded_dim = 3, 4
    position_dim = 5

    # One (sequence x feature) matrix per batch-item/target pair.
    encoded = torch.Tensor([
        [[0.5, 0.3, 0.2, 0.6], [0.2, 0.3, 0.4, 0.7], [0.5, 0.4, 0.6, 0.2]],
        [[0.4, 0.5, 0.3, 0.7], [0.3, 0.1, 0.2, 0.0], [0.0, 0.0, 0.0, 0.0]],
        [[0.5, 0.3, 0.2, 0.3], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
        [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
    ])
    flat_shape = (batch_size * number_targets, text_seq_length)
    assert encoded.shape == flat_shape + (encoded_dim,)

    raw_positions = torch.tensor(
        [[[2, 1, 2], [1, 2, 0]],
         [[1, 0, 0], [0, 0, 0]]],
        dtype=torch.long)
    assert raw_positions.shape == (batch_size, number_targets,
                                   text_seq_length)
    position_field = {'position_tokens': {'tokens': raw_positions}}

    position_embedder = BasicTextFieldEmbedder({
        'position_tokens':
        Embedding(num_embeddings=3,
                  embedding_dim=position_dim,
                  trainable=False)
    })
    embedded_positions = position_embedder(position_field)
    assert embedded_positions.shape == (batch_size, number_targets,
                                        text_seq_length, position_dim)

    # Normal case: the position embedding is concatenated onto the features.
    combined = util.concat_position_embeddings(encoded, position_field,
                                               position_embedder)
    assert combined.shape == flat_shape + (encoded_dim + position_dim,)

    # Without indexes and embedder the input must be returned as it is.
    untouched = util.concat_position_embeddings(encoded, None, None)
    assert torch.all(torch.eq(untouched, encoded))

    # An embedder without position indexes is rejected.
    with pytest.raises(ValueError):
        util.concat_position_embeddings(encoded, None, position_embedder)
Exemple #6
0
    def init_model(self) -> Model:
        """Create the text classifier backed by a pretrained transformer.

        The transformer named by ``self.config.model_name`` embeds the
        ``tokens`` field, and a ``ClsPooler`` sized to the embedder's output
        dimension is used as the seq2vec encoder of a ``BasicClassifier``
        built with ``self.vocab``.

        Returns:
            Model: the classifier, moved to ``self.config.device``.
        """
        transformer_embedder = PretrainedTransformerEmbedder(
            model_name=self.config.model_name)

        text_field_embedder = BasicTextFieldEmbedder(
            token_embedders={'tokens': transformer_embedder})
        pooler = ClsPooler(
            embedding_dim=transformer_embedder.get_output_dim())

        classifier = BasicClassifier(vocab=self.vocab,
                                     text_field_embedder=text_field_embedder,
                                     seq2vec_encoder=pooler)
        classifier.to(device=self.config.device)
        return classifier
def test_sequence_tagging_reader():
    """Read the Weibo corpus, build a BERT-based tagger and train it.

    The train and dev files are read with a ``SequenceTaggingDatasetReader``,
    the vocabulary is built from the training instances, and a
    ``SimpleTagger`` over a pretrained transformer is trained with outputs
    written under ``./output``.
    """
    model_name = 'bert-base-chinese'

    indexer = PretrainedTransformerIndexer(model_name=model_name)
    reader = SequenceTaggingDatasetReader(token_indexers={"tokens": indexer})

    # The "test" split points at the dev corpus, exactly like the dev split.
    train_instances = list(reader.read('./data/weibo/train.corpus'))
    dev_instances = list(reader.read('./data/weibo/dev.corpus'))
    test_instances = list(reader.read('./data/weibo/dev.corpus'))

    vocab: Vocabulary = Vocabulary.from_instances(train_instances)
    assert vocab.get_namespaces() is not None

    embedder = PretrainedTransformerEmbedder(model_name=model_name)
    text_field_embedder = BasicTextFieldEmbedder(
        token_embedders={'tokens': embedder})
    encoder = PassThroughEncoder(embedder.get_output_dim())
    tagger = SimpleTagger(vocab=vocab,
                          text_field_embedder=text_field_embedder,
                          encoder=encoder,
                          calculate_span_f1=True,
                          label_encoding="BMES")

    train_data_loader, dev_data_loader = build_data_loaders(
        train_instances, dev_instances)
    train_data_loader.index_with(vocab)
    dev_data_loader.index_with(vocab)

    trainer = build_trainer(model=tagger,
                            serialization_dir='./output',
                            train_loader=train_data_loader,
                            dev_loader=dev_data_loader)
    print("Starting training")
    trainer.train()
    print("Finished training")
Exemple #8
0
 def init_crf_model(self) -> Model:
     """Build the tagger model on top of a pretrained transformer.

     The transformer named by ``self.config.model_name`` embeds the
     ``tokens`` field, and a ``PassThroughEncoder`` sized to the embedder's
     output dimension is used as the encoder of a ``SimpleTagger`` with span
     F1 enabled and ``BMES`` label encoding.

     NOTE(review): despite the method name, the model built here is a
     ``SimpleTagger``; no CRF layer is created in this method - confirm
     whether a CRF tagger was intended.

     Returns:
         Model: the tagger, moved to ``self.config.device``.
     """
     bert_text_field_embedder = PretrainedTransformerEmbedder(
         model_name=self.config.model_name)
     tagger = SimpleTagger(
         vocab=self.vocab,
         text_field_embedder=BasicTextFieldEmbedder(
             token_embedders={
                 'tokens': bert_text_field_embedder
             }
         ),
         encoder=PassThroughEncoder(bert_text_field_embedder.get_output_dim()),
         verbose_metrics=True,
         calculate_span_f1=True,
         label_encoding="BMES",
     )

     tagger.to(device=self.config.device)
     return tagger
Exemple #9
0
def main():
    """Build a ``JointIntentSlotDepsModel`` and save its initial state dict.

    The first part constructs the model from the hard-coded hyperparameters
    below and writes ``model.state_dict()`` to ``/tmp/model.th``.

    NOTE(review): everything from the ``PretrainedTransformerTokenizer`` line
    onwards looks like a separate snippet fused onto this function: it uses
    ``model_string``, which is not defined anywhere in the visible code, and
    the final ``Trainer(`` call is cut off before its closing parenthesis.
    """
    # Hyperparameters. `attention_dropout_prob`, `attention_type`,
    # `dropout_prob`, `use_token_type` and `use_position_embeddings` are
    # assigned here but not passed to anything in the visible code.
    embedding_dim = 200
    num_embeddings = 26729
    attention_dropout_prob = 0.1
    attention_type = "dot_product"
    dropout_prob = 0.1
    input_size = 200
    intermediate_act_fn = "gelu"
    intermediate_size = 3072
    key_depth = 1024
    max_position_embeddings = 256
    memory_size = 200
    num_heads = 16
    num_hidden_layers = 6
    type_vocab_size = 2
    use_fp16 = False
    value_depth = 1024
    use_token_type = False
    use_position_embeddings = True
    vocabulary = './vocabulary'

    vocab = Vocabulary.from_files(vocabulary)
    # The first nine arguments are positional; the parameter names of
    # `Transformer` are not visible here, so the order matters.
    transformer = Transformer(use_fp16,
                              num_hidden_layers,
                              intermediate_size,
                              intermediate_act_fn,
                              num_heads,
                              input_size,
                              memory_size,
                              key_depth,
                              value_depth,
                              max_position_embeddings=max_position_embeddings,
                              type_vocab_size=type_vocab_size)
    embedding = Embedding(num_embeddings, embedding_dim)
    text_field_embedder = BasicTextFieldEmbedder({'tokens': embedding})
    model = JointIntentSlotDepsModel(vocab, use_fp16, text_field_embedder,
                                     transformer)
    # Persist the state dict of the freshly constructed model (no training
    # or weight loading happens before this point).
    torch.save(model.state_dict(), '/tmp/model.th')
    # NOTE(review): `model_string` is undefined in the visible code.
    tokenizer = PretrainedTransformerTokenizer(model_string, do_lowercase=True)
    token_indexer = PretrainedTransformerIndexer(model_string,
                                                 do_lowercase=True)

    reader = SSTDatasetReader(tokenizer, {"tokens": token_indexer})

    train_dataset = reader.read('sst/trees/train.txt')
    val_dataset = reader.read('sst/trees/dev.txt')

    print(train_dataset[0])

    # `vocab` and `model` are rebound below, replacing the objects built above.
    vocab = Vocabulary.from_instances(train_dataset + val_dataset)

    bert_token_embedder = PretrainedTransformerEmbedder(model_string)
    bert_textfield_embedder = BasicTextFieldEmbedder(
        {"tokens": bert_token_embedder})

    model = BertClassifier(vocab,
                           bert_textfield_embedder,
                           freeze_encoder=False)

    iterator = BucketIterator(sorting_keys=[("tokens", "num_tokens")],
                              batch_size=32)
    iterator.index_with(vocab)

    # NOTE(review): this call is truncated in the source - the remaining
    # arguments and the closing parenthesis are missing.
    trainer = Trainer(model=model,
                      optimizer=optim.Adam(model.parameters()),
                      serialization_dir='/tmp/test',
                      iterator=iterator,
                      train_dataset=train_dataset,
                      validation_dataset=val_dataset,
Exemple #11
0
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True,
                 add_random_noise: bool = False,
                 add_attack_noise: bool = False,
                 do_noise_normalization: bool = True,
                 noise_norm: Optional[float] = None,
                 noise_loss_prob: Optional[float] = None,
                 add_noise_for: str = "ov",
                 rnn_after_embeddings: bool = False,
                 open_vocabulary_slots: Optional[List[str]] = None,
                 metrics_for_each_slot_type: bool = False) -> None:
        """
        Params
        ------
        vocab: the allennlp Vocabulary object, will be automatically passed
        bert_embedder: the pretrained BERT embedder. If it is not None, the pretrained BERT
                embedding (parameter fixed) will be used as the embedding layer. Otherwise, a look-up
                embedding matrix will be initialized with the embedding size 1024. The default is None.
        encoder: the contextual encoder used after the embedding layer. If set to None, no contextual
                encoder will be used.
        dropout: the dropout rate, won't be set in all our experiments.
        use_crf: if set to True, CRF will be used at the end of the model (as output layer). Otherwise,
                a softmax layer (with cross-entropy loss) will be used.
        add_random_noise: whether to add random noise to slots. Cannot be set simultaneously
                with add_attack_noise. This setting is used as baseline in our experiments.
        add_attack_noise: whether to add adversarial attack noise to slots. Cannot be set simultaneously
                with add_random_noise.
        do_noise_normalization: if set to True, the normalization will be applied to gradients w.r.t.
                token embeddings. Otherwise, the gradients won't be normalized.
        noise_norm: the normalization norm (L2) applied to gradients. Required when either kind
                of noise is enabled.
        noise_loss_prob: the alpha hyperparameter to balance the loss from normal forward and adversarial
                forward. See the paper for more details. Should be set from 0 to 1. Required when
                either kind of noise is enabled.
        add_noise_for: if set to "ov", the noise will only be applied to open-vocabulary slots. If set
                to "all", the noise will be applied to all slots (both open-vocabulary and normal slots).
        rnn_after_embeddings: if set to True, an additional BiLSTM layer will be applied after the embedding
                layer. Only takes effect when no bert_embedder is given. Default is False.
        open_vocabulary_slots: the list of open-vocabulary slots. If not set, will be set to open-vocabulary
                slots of Snips dataset by default.
        metrics_for_each_slot_type: whether to log metrics for each slot type. Default is False.
        """
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens":
                Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn_after_embeddings = rnn_after_embeddings
            if rnn_after_embeddings:
                self.rnn = Seq2SeqEncoder.from_params(
                    Params({
                        "type": "lstm",
                        "input_size": 1024,
                        "hidden_size": 512,
                        "bidirectional": True,
                        "batch_first": True
                    }))

        self.encoder = encoder

        # Pick the input width of the tag projection from whichever layer
        # produces the final token representation.
        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        elif bert_embedder:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        elif rnn_after_embeddings:
            hidden2tag_in_dim = self.rnn.get_output_dim()
        else:
            # Neither an encoder nor BERT was given: previously this path
            # dereferenced ``bert_embedder`` (None) and raised AttributeError.
            hidden2tag_in_dim = self.basic_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(
            torch.nn.Linear(in_features=hidden2tag_in_dim,
                            out_features=vocab.get_vocab_size("labels")))

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels"))
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True)

        # default open_vocabulary slots: for SNIPS dataset
        open_vocabulary_slots = open_vocabulary_slots or [
            "playlist", "entity_name", "poi", "restaurant_name",
            "geographic_poi", "album", "track", "object_name", "movie_name"
        ]
        self.f1 = OVSpecSpanBasedF1Measure(
            vocab,
            tag_namespace="labels",
            ignore_classes=[],
            label_encoding="BIO",
            open_vocabulary_slots=open_vocabulary_slots)

        self.add_random_noise = add_random_noise
        self.add_attack_noise = add_attack_noise
        assert not (add_random_noise and
                    add_attack_noise), "both random and attack noise applied"
        # The noise settings are validated and stored only when one of the
        # two noise modes is switched on.
        if add_random_noise or add_attack_noise:
            self.do_noise_normalization = do_noise_normalization
            assert noise_norm is not None
            assert noise_loss_prob is not None and 0. <= noise_loss_prob <= 1.
            self.noise_norm = noise_norm
            self.noise_loss_prob = noise_loss_prob
            assert add_noise_for in ["ov", "all"]
            self.ov_noise_only = (add_noise_for == "ov")

        self.metrics_for_each_slot_type = metrics_for_each_slot_type