Example #1
    def __init__(self, general_embeddings, domain_embeddings, input_size, hidden_size, aspect_tag_classes,
                 polarity_tag_classes, k, dropout=0.5):
        super(DualCrossSharedLSTM, self).__init__()
        # from_pretrained is a classmethod: calling it on the class keeps padding_idx
        # and avoids building a throwaway nn.Embedding whose result is discarded.
        self.general_embedding = nn.Embedding.from_pretrained(general_embeddings,
                                                              freeze=True,
                                                              padding_idx=0)
        self.domain_embedding = nn.Embedding.from_pretrained(domain_embeddings,
                                                             freeze=True,
                                                             padding_idx=0)
        self.general_embedding.weight.requires_grad = False
        self.domain_embedding.weight.requires_grad = False

        self.dropout = dropout

        self.aspect_rnn1 = DynamicRNN(input_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.polarity_rnn1 = DynamicRNN(input_size, hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        self.csu = Cross_Shared_Unit(k, 2 * hidden_size)

        self.aspect_rnn2 = DynamicRNN(hidden_size * 2, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
        self.polarity_rnn2 = DynamicRNN(hidden_size * 2, hidden_size, num_layers=1, batch_first=True, bidirectional=True)

        self.aspect_hidden2tag = nn.Linear(2 * hidden_size, aspect_tag_classes)
        self.polarity_hidden2tag = nn.Linear(2 * hidden_size, polarity_tag_classes)

        self.aspect_crf = ConditionalRandomField(aspect_tag_classes)
        self.polarity_crf = ConditionalRandomField(polarity_tag_classes)

        self.dropout_layer = nn.Dropout(dropout)
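The embedding setup above relies on PyTorch's nn.Embedding.from_pretrained classmethod. A minimal standalone sketch (vocabulary size and dimensions are made up) showing that the returned embedding is frozen and keeps its padding index:

import torch
import torch.nn as nn

# Hypothetical 5-token vocabulary with 3-dimensional pretrained vectors; row 0 is the pad token.
pretrained = torch.randn(5, 3)
pretrained[0].zero_()

emb = nn.Embedding.from_pretrained(pretrained, freeze=True, padding_idx=0)
print(emb.weight.requires_grad)              # False: frozen, as intended in the model above
print(emb(torch.tensor([[0, 2, 4]])).shape)  # torch.Size([1, 3, 3])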
Example #2
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        use_sep: bool = True,
        with_crf: bool = False,
        self_attn: Seq2SeqEncoder = None,
        bert_dropout: float = 0.1,
        sci_sum: bool = False,
        additional_feature_size: int = 0,
    ) -> None:
        super(SeqClassificationModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.vocab = vocab
        self.use_sep = use_sep
        self.with_crf = with_crf
        self.sci_sum = sci_sum
        self.self_attn = self_attn
        self.additional_feature_size = additional_feature_size

        self.dropout = torch.nn.Dropout(p=bert_dropout)

        # define loss
        if self.sci_sum:
            self.loss = torch.nn.MSELoss(
                reduction='none')  # labels are rouge scores
            self.labels_are_scores = True
            self.num_labels = 1
        else:
            self.loss = torch.nn.CrossEntropyLoss(ignore_index=-1,
                                                  reduction='none')
            self.labels_are_scores = False
            self.num_labels = self.vocab.get_vocab_size(namespace='labels')
            # define accuracy metrics
            self.label_accuracy = CategoricalAccuracy()
            self.all_f1_metrics = FBetaMeasure(beta=1.0, average='micro')
            self.label_f1_metrics = {}

            # define F1 metrics per label
            for label_index in range(self.num_labels):
                label_name = self.vocab.get_token_from_index(
                    namespace='labels', index=label_index)
                self.label_f1_metrics[label_name] = F1Measure(label_index)

        encoded_sentence_dim = text_field_embedder._token_embedders['bert'].output_dim

        ff_in_dim = encoded_sentence_dim if self.use_sep else self_attn.get_output_dim()
        ff_in_dim += self.additional_feature_size

        self.time_distributed_aggregate_feedforward = TimeDistributed(
            Linear(ff_in_dim, self.num_labels))

        if self.with_crf:
            self.crf = ConditionalRandomField(
                self.num_labels,
                constraints=None,
                include_start_end_transitions=True)
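TimeDistributed(Linear(ff_in_dim, num_labels)) applies the wrapped layer to every position of a (batch, num_sentences, dim) tensor. A rough plain-PyTorch sketch of that behaviour with illustrative sizes (AllenNLP's TimeDistributed folds the time dimension into the batch before calling the module and unfolds it afterwards):

import torch
from torch.nn import Linear

batch_size, num_sentences, ff_in_dim, num_labels = 2, 7, 768, 5
encoded = torch.randn(batch_size, num_sentences, ff_in_dim)
linear = Linear(ff_in_dim, num_labels)

# Fold (B, T, D) -> (B*T, D), apply the layer, then restore the time dimension.
logits = linear(encoded.reshape(-1, ff_in_dim)).reshape(batch_size, num_sentences, num_labels)
print(logits.shape)  # torch.Size([2, 7, 5])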
Example #3
    def __init__(
        self,
        input_dim,
        num_tags,
        low_val=-5,
        high_val=5,
        incl_start_end=True,
        name=None,
    ):
        super(SpanScorerCRF, self).__init__()

        self.input_dim = input_dim
        self.num_tags = num_tags
        self.low_val = low_val
        self.high_val = high_val
        self.incl_start_end = incl_start_end
        self.name = name

        self.span_to_seq, self.seq_to_span = label_map(num_tags)

        self.num_tags_seq = len(self.seq_to_span)
        self.num_tags_span = len(self.span_to_seq)

        # Linear projection layer
        self.projection = nn.Linear(input_dim, self.num_tags_seq)

        # Create event-specific CRF
        self.crf = ConditionalRandomField(
            num_tags=self.num_tags_seq,
            include_start_end_transitions=incl_start_end)
Example #4
    def __init__(self, 
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True) -> None:
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens": Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn = Seq2SeqEncoder.from_params(Params({     
                "type": "lstm",
                "input_size": 1024,
                "hidden_size": 512,
                "bidirectional": True,
                "batch_first": True
            }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(torch.nn.Linear(
            in_features=hidden2tag_in_dim,
            out_features=vocab.get_vocab_size("labels")))
        
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        
        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels")
            )
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True
            )
        
        self.f1 = SpanBasedF1Measure(vocab, 
                                     tag_namespace="labels",
                                     ignore_classes=["news/type","negation",
                                                     "demonstrative_reference",
                                                     "timer/noun","timer/attributes"],
                                     label_encoding="BIO")
Example #5
 def __init__(self, model_path, vocab: Vocabulary):
     super().__init__(vocab)
     self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
         model_path)
     config = BertConfig.from_pretrained(model_path)
     bert_model = BertForPreTraining(config)
     self.bert = bert_model.bert
     tags = vocab.get_index_to_token_vocabulary("tags")
     num_tags = len(tags)
     constraints = allowed_transitions(constraint_type="BMES", labels=tags)
     self.projection = torch.nn.Linear(768, num_tags)
     self.crf = ConditionalRandomField(num_tags=num_tags,
                                       constraints=constraints,
                                       include_start_end_transitions=False)
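A short sketch of how the BMES constraints above are produced, assuming AllenNLP's allowed_transitions: it takes the index-to-label dict and returns the (from_tag, to_tag) index pairs the CRF is allowed to use, with two extra virtual indices for the start and end states.

from allennlp.modules.conditional_random_field import (ConditionalRandomField,
                                                        allowed_transitions)

# Hypothetical BMES tag vocabulary, standing in for vocab.get_index_to_token_vocabulary("tags").
tags = {0: "B", 1: "M", 2: "E", 3: "S"}

constraints = allowed_transitions(constraint_type="BMES", labels=tags)
# (0, 1), i.e. B -> M, is allowed; B -> S is invalid under BMES and is absent from the list.
print(sorted(constraints))

crf = ConditionalRandomField(num_tags=len(tags),
                             constraints=constraints,
                             include_start_end_transitions=False)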
Example #6
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: str,
        dropout: float = 0.0,
        requires_grad: str = "none",
        use_crf: bool = False,
        pos_weight: float = 1.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ):

        super(BertMiddleModel, self).__init__(vocab, regularizer)
        self._vocabulary = vocab
        self._bert_model = BertModel.from_pretrained(bert_model)
        self._dropout = torch.nn.Dropout(p=dropout)
        self._classification_layer = torch.nn.Linear(
            self._bert_model.config.hidden_size, 2)

        self._use_crf = use_crf

        self._pos_weight = torch.Tensor([1 / (1 - pos_weight), 1 / pos_weight])
        self._pos_weight = torch.nn.Parameter(self._pos_weight /
                                              self._pos_weight.min())
        self._pos_weight.requires_grad = False

        if use_crf:
            self._crf = ConditionalRandomField(num_tags=2)

        self.embedding_layers = ["BertEmbedding"]

        if requires_grad in ["none", "all"]:
            for param in self._bert_model.parameters():
                param.requires_grad = requires_grad == "all"
        else:
            model_name_regexes = requires_grad.split(",")
            for name, param in self._bert_model.named_parameters():
                found = any([regex in name for regex in model_name_regexes])
                param.requires_grad = found

        for n, v in self._bert_model.named_parameters():
            if n.startswith("classifier"):
                v.requires_grad = True

        self._token_prf = F1Measure(1)

        initializer(self)
Example #7
    def __init__(self,
                 num_input_features: '(int) number of input features',
                 hidden_size: '(int) number of hidden features; the outputs will also have hidden_size features',
                 num_layers: '(int) number of recursions',
                 dropout_gru,
                 bidirectional: '(bool) if True, use bidirectional GRU',
                 tags: "(dict[int: str]) example: {0:'I', 1:'B', 2:'O', 3:'<PAD>'}",
                 dropout_FCN: '(double)'):
        super().__init__()
        self.gru = nn.GRU(input_size=num_input_features, hidden_size=hidden_size,
                          num_layers=num_layers, batch_first=True, dropout=dropout_gru,
                          bidirectional=bidirectional)

        all_transition = allowed_transitions('BIO', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
        self.linear = nn.Linear(hidden_size * 2, hidden_size)
        self.BN = nn.BatchNorm1d(num_layers)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.BN2 = nn.BatchNorm1d(num_layers)
        self.crf = ConditionalRandomField(len(tags), all_transition)
        self.dropout = nn.Dropout(dropout_FCN)
Example #8
    def __init__(self, args):
        super(BiLSTM_CRF, self).__init__()

        self.name = args.name
        self.hidden_size = args.hidden_size
        self.num_tags = args.num_tags
        self.embedding = nn.Embedding(args.embed_size, args.embed_dim)

        self.crf = ConditionalRandomField(self.num_tags, args.condtraints)
        self.lstm = nn.LSTM(input_size=args.embed_dim,
                            hidden_size=args.hidden_size // 2,
                            num_layers=1,
                            bidirectional=True)
        self.linear = nn.Linear(self.hidden_size, self.num_tags)

        self.device = args.device
        self.dropout = nn.Dropout(args.dropout)
Example #9
class BiLSTM_CRF(nn.Module):
    def __init__(self, args):
        super(BiLSTM_CRF, self).__init__()

        self.name = args.name
        self.hidden_size = args.hidden_size
        self.num_tags = args.num_tags
        self.embedding = nn.Embedding(args.embed_size, args.embed_dim)

        self.crf = ConditionalRandomField(self.num_tags, args.condtraints)
        self.lstm = nn.LSTM(input_size=args.embed_dim,
                            hidden_size=args.hidden_size // 2,
                            num_layers=1,
                            bidirectional=True)
        self.linear = nn.Linear(self.hidden_size, self.num_tags)

        self.device = args.device
        self.dropout = nn.Dropout(args.dropout)

    def get_logits(self, sequences):
        batch_size = sequences.shape[0]
        sequences = sequences.transpose(0, 1)

        embedded = self.embedding(
            sequences)  # (sequence_len, batch_size, embedding_size)

        h0 = torch.randn(2,
                         batch_size,
                         self.hidden_size // 2,
                         device=sequences.device)
        c0 = torch.randn(2,
                         batch_size,
                         self.hidden_size // 2,
                         device=sequences.device)

        outputs, _ = self.lstm(embedded, (h0, c0))

        outputs = self.dropout(outputs)

        outputs = outputs.transpose(
            0, 1)  # (batch_size, sequence_len, hidden_size)

        logits = self.linear(outputs)

        return logits

    def forward(self, sequences: torch.Tensor, tags: torch.Tensor,
                mask) -> torch.Tensor:
        logits = self.get_logits(sequences)
        log_likelihood = self.crf(logits, tags, mask)
        loss = -log_likelihood
        return loss

    def predict(self, sequences, mask):
        logits = self.get_logits(sequences)
        best_path = self.crf.viterbi_tags(logits, mask)
        tags_pred = [tags for tags, score in best_path]
        return tags_pred
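A hedged usage sketch for the BiLSTM_CRF above. The args values are invented for illustration, the transition constraints are left as None, and condtraints mirrors the attribute name the constructor actually reads:

from types import SimpleNamespace
import torch

# Hypothetical configuration; sizes are arbitrary.
args = SimpleNamespace(name="bilstm_crf", hidden_size=128, num_tags=5,
                       embed_size=1000, embed_dim=64, condtraints=None,
                       device="cpu", dropout=0.1)
model = BiLSTM_CRF(args)

sequences = torch.randint(0, args.embed_size, (2, 7))   # (batch_size, sequence_len)
gold_tags = torch.randint(0, args.num_tags, (2, 7))
mask = torch.ones(2, 7, dtype=torch.bool)

loss = model(sequences, gold_tags, mask)   # negative CRF log-likelihood
loss.backward()
print(model.predict(sequences, mask))      # one list of predicted tag ids per batch element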
Example #10
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        seq2seq_encoder: Seq2SeqEncoder,
        feedforward_encoder: Seq2SeqEncoder,
        dropout: float = 0.0,
        use_crf: bool = False,
        pos_weight: float = 1.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ):

        super(BertMiddleModel, self).__init__(vocab, regularizer)
        self._vocabulary = vocab
        self._text_field_embedder = text_field_embedder
        self._seq2seq_encoder = seq2seq_encoder
        self._dropout = torch.nn.Dropout(p=dropout)

        self._feedforward_encoder = feedforward_encoder
        self._classifier_input_dim = feedforward_encoder.get_output_dim()

        self._classification_layer = torch.nn.Linear(
            self._classifier_input_dim, 2)

        self._use_crf = use_crf

        self._pos_weight = torch.Tensor([1 / (1 - pos_weight), 1 / pos_weight])
        self._pos_weight = torch.nn.Parameter(self._pos_weight /
                                              self._pos_weight.min())
        self._pos_weight.requires_grad = False

        if use_crf:
            self._crf = ConditionalRandomField(num_tags=2)

        self._token_prf = F1Measure(1)

        initializer(self)
Example #11
    def __init__(self, config):
        super(RobertaForSequentialSequenceClassification,
              self).__init__(config)
        self.num_labels = config.num_labels

        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        self.sigm = nn.Sigmoid()

        ### SSC attributes
        self.use_sep = True
        self.with_crf = False
        self.sci_sum = False
        self.dropout = torch.nn.Dropout(p=0.1)

        # define loss
        if self.sci_sum:
            self.loss = torch.nn.MSELoss(
                reduction='none')  # labels are rouge scores
            self.labels_are_scores = True
            self.num_labels = 1
        else:
            self.loss = torch.nn.CrossEntropyLoss(
                ignore_index=-1,
                reduction='none')  #weight=torch.tensor([.20, .80]),
            self.labels_are_scores = False
            self.num_labels = 2
            # define accuracy metrics
            self.label_accuracy = CategoricalAccuracy()
            self.label_f1_metrics = {}

            # define F1 metrics per label
            self.label_vocab = {0: 0, 1: 1}
            for label_index in range(self.num_labels):
                label_name = self.label_vocab[label_index]
                self.label_f1_metrics[label_name] = F1Measure(label_index)

        encoded_sentence_dim = 768

        ff_in_dim = encoded_sentence_dim  #if self.use_sep else self_attn.get_output_dim()
        #ff_in_dim += self.additional_feature_size

        self.time_distributed_aggregate_feedforward = TimeDistributed(
            Linear(ff_in_dim, self.num_labels))

        if self.with_crf:
            self.crf = ConditionalRandomField(
                self.num_labels,
                constraints=None,
                include_start_end_transitions=True)
Example #12
 def __init__(self, vocab_size, labels_num, tag2id, embedding_size=32, single_backbone_kwargs=None,
              context_backbone_kwargs=None):
     super().__init__()
     # Avoid the mutable-default-argument pitfall for both kwargs dicts.
     if single_backbone_kwargs is None:
         single_backbone_kwargs = {}
     if context_backbone_kwargs is None:
         context_backbone_kwargs = {}
     self.embedding_size = embedding_size
     self.char_embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
     self.single_token_backbone = StackedConv1d(embedding_size, **single_backbone_kwargs)
     self.context_backbone = StackedConv1d(embedding_size, **context_backbone_kwargs)
     self.global_pooling = nn.AdaptiveMaxPool1d(1)
     self.out = nn.Conv1d(embedding_size, labels_num, 1)
     self.labels_num = labels_num
     STATE_TRANSITIONS_CONSTRAINTS = get_state_transitions_constraints(tag2id)
     self.crf = ConditionalRandomField(len(tag2id), constraints=STATE_TRANSITIONS_CONSTRAINTS)
Example #13
    def __init__(self, vocab: Vocabulary, embedding_dim=300, embedder_type=None, bert_trainable=True, **kwargs):
        super().__init__(vocab)
        for k in kwargs:
            self.__setattr__(k, kwargs[k])

        text_field_embedder = get_embeddings(embedder_type, self.vocab, embedding_dim, bert_trainable)
        embedding_dim = text_field_embedder.get_output_dim()

        encoder = PytorchSeq2SeqWrapper(
            torch.nn.LSTM(embedding_dim, self.num_rnn_units, batch_first=True, bidirectional=True, dropout=self.dropout_rate))

        self.label_namespace = label_namespace = 'ner_bio_labels'
        self.num_tags = self.vocab.get_vocab_size(label_namespace)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder
        self.dropout = torch.nn.Dropout(self.dropout_rate)

        output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        self.label_encoding = label_encoding = 'BIO'
        labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
        constraints = allowed_transitions(self.label_encoding, labels)

        self.include_start_end_transitions = True
        self.crf = ConditionalRandomField(
            self.num_tags, constraints,
            include_start_end_transitions=True
        )

        self._f1_metric = SpanBasedF1Measure(self.vocab,
                                             tag_namespace=label_namespace,
                                             label_encoding=label_encoding)
        self._verbose_metrics = False
Example #14
class JointClassifier(Model):
    """
    Classifies NER tags and RE classes jointly. Label encoding is expected to be 'BIO'.

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
    ner_tag_embedder : ``Embedding``, required
        Used to embed decoded ner tags as input to the relation scorer.
    encoder : ``Seq2SeqEncoder``
        An encoder that will learn the major logic of the task.
    relation_scorer : ``RelationScorer``
        A subtask model, that performs scoring of relations between entities.
    ner_tag_namespace : ``str``
        The vocabulary namespace of ner tags.
    evaluated_ner_labels : ``List[str]``, optional (default=``None``)
        The list of ner tag types that are to be used for f1 score computation.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    """
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 relation_scorer: RelationScorer,
                 ner_tag_namespace: str = 'tags',
                 evaluated_ner_labels: List[str] = None,
                 re_loss_weight: float = 1.0,
                 ner_tag_embedder: TokenEmbedder = None,
                 use_aux_ner_labels: bool = False,
                 aux_coarse_namespace: str = 'coarse_tags',
                 aux_modifier_namespace: str = 'modifier_tags',
                 aux_loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab=vocab, regularizer=regularizer)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder

        # NER subtask 2
        self._ner_label_encoding = 'BIO'
        self._ner_tag_namespace = ner_tag_namespace
        ner_input_dim = self.encoder.get_output_dim()
        num_ner_tags = self.vocab.get_vocab_size(ner_tag_namespace)
        self.tag_projection_layer = TimeDistributed(
            Linear(ner_input_dim, num_ner_tags))

        self._use_aux_ner_labels = use_aux_ner_labels
        if self._use_aux_ner_labels:
            self._coarse_tag_namespace = aux_coarse_namespace
            self._num_coarse_tags = self.vocab.get_vocab_size(
                self._coarse_tag_namespace)
            self._coarse_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_coarse_tags))
            self._modifier_tag_namespace = aux_modifier_namespace
            self._num_modifier_tags = self.vocab.get_vocab_size(
                self._modifier_tag_namespace)
            self._modifier_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_modifier_tags))
            self._coarse_acc = CategoricalAccuracy()
            self._modifier_acc = CategoricalAccuracy()
            self._aux_loss_weight = aux_loss_weight

        self.ner_accuracy = CategoricalAccuracy()
        if evaluated_ner_labels is None:
            ignored_classes = None
        else:
            assert self._ner_label_encoding == 'BIO', 'expected BIO encoding'
            all_ner_tags = self.vocab.get_token_to_index_vocabulary(
                ner_tag_namespace).keys()
            ner_tag_classes = set(
                [bio_tag[2:] for bio_tag in all_ner_tags if len(bio_tag) > 2])
            ignored_classes = list(
                set(ner_tag_classes).difference(evaluated_ner_labels))
        self.ner_f1 = SpanBasedF1Measure(
            vocabulary=vocab,
            tag_namespace=ner_tag_namespace,
            label_encoding=self._ner_label_encoding,
            ignore_classes=ignored_classes)

        # Use constrained crf decoding with the BIO labeling scheme
        ner_labels = self.vocab.get_index_to_token_vocabulary(
            ner_tag_namespace)
        constraints = allowed_transitions(self._ner_label_encoding, ner_labels)

        self.crf = ConditionalRandomField(num_ner_tags,
                                          constraints,
                                          include_start_end_transitions=True)

        # RE subtask 3
        self.ner_tag_embedder = ner_tag_embedder
        self.relation_scorer = relation_scorer
        self._re_loss_weight = re_loss_weight

        initializer(self)

    @overrides
    def forward(
            self,
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            relation_root_idxs: torch.LongTensor = None,
            relations: torch.LongTensor = None,
            binary_coref: torch.FloatTensor = None,
            spacy_patterns: torch.FloatTensor = None,
            coarse_tags: torch.LongTensor = None,
            modifier_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ,no-member
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : torch.LongTensor
            An integer tensor containing the gold ner tag label indexes.
        relation_root_idxs : torch.LongTensor, optional (default = None)
            An integer tensor containing the gold relation head indexes for training.
        relations : torch.LongTensor, optional (default = None)
            An integer tensor containing the gold relation label indexes for training.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Additional information such as the original words and the entity ids.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)

        encoder_input_tensors = [embedded_text_input]
        if binary_coref is not None:
            encoder_input_tensors.append(binary_coref.unsqueeze(2))
        if spacy_patterns is not None:
            encoder_input_tensors.append(spacy_patterns.permute(0, 2, 1))
        if len(encoder_input_tensors) > 1:
            encoder_input = torch.cat(encoder_input_tensors, dim=2)
        else:
            encoder_input = encoder_input_tensors[0]

        # Shape: batch x seq_len x emb_dim
        encoded_text = self.encoder(encoder_input, mask)

        ner_logits = self.tag_projection_layer(encoded_text)
        best_ner_paths = self.crf.viterbi_tags(ner_logits, mask)

        # Just get the tags and ignore the score.
        predicted_ner_tags = []
        predicted_ner_tags_tensor = torch.zeros_like(mask)
        for ner_path, _ in best_ner_paths:
            batch_idx = len(predicted_ner_tags)
            predicted_ner_tags.append(ner_path)
            for token_idx, ner_tag_idx in enumerate(ner_path):
                predicted_ner_tags_tensor[batch_idx, token_idx] = ner_tag_idx
        # predicted_ner_tags = [x for x, y in best_ner_paths]

        output_dict = {
            "ner_logits": ner_logits,
            "mask": mask,
            "tags": predicted_ner_tags
        }

        if self._use_aux_ner_labels:
            coarse_logits = self._coarse_projection_layer(encoded_text)
            modifier_logits = self._modifier_projection_layer(encoded_text)

        if self.ner_tag_embedder is not None:
            embedded_tags = self.ner_tag_embedder(predicted_ner_tags_tensor)
            encoded_sequence = torch.cat([encoded_text, embedded_tags], dim=2)
        else:
            encoded_sequence = torch.cat([
                encoded_text, ner_logits,
                predicted_ner_tags_tensor.unsqueeze(2).float()
            ],
                                         dim=2)

        re_output = self.relation_scorer(encoded_sequence, mask,
                                         relation_root_idxs, relations)

        # Add a prefix for relation extraction logits
        output_dict['re_logits'] = re_output['logits']
        output_dict['relation_scores'] = re_output['relation_scores']

        if tags is not None:
            # Add negative log-likelihood as loss
            log_likelihood = self.crf(ner_logits, tags, mask)

            # It's not clear why, but pylint seems to think `log_likelihood` is tuple
            # (in fact, it's a torch.Tensor), so we need a disable.
            output_dict["ner_loss"] = -log_likelihood  # pylint: disable=invalid-unary-operand-type

            # Represent viterbi tags as "class probabilities" that we can
            # feed into the metrics
            class_probabilities = torch.zeros_like(ner_logits)
            for i, instance_tags in enumerate(predicted_ner_tags):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1

            self.ner_accuracy(class_probabilities, tags, mask.float())
            self.ner_f1(class_probabilities, tags, mask.float())

            output_dict['loss'] = output_dict[
                'ner_loss'] + self._re_loss_weight * re_output['loss']

            if self._use_aux_ner_labels:
                assert coarse_tags is not None and modifier_tags is not None, 'Auxiliary losses require auxiliary input'
                self._coarse_acc(coarse_logits, coarse_tags, mask.float())
                self._modifier_acc(modifier_logits, modifier_tags,
                                   mask.float())
                coarse_loss = sequence_cross_entropy_with_logits(
                    coarse_logits, coarse_tags, mask)
                modifier_loss = sequence_cross_entropy_with_logits(
                    modifier_logits, modifier_tags, mask)
                output_dict['loss'] += self._aux_loss_weight * (coarse_loss +
                                                                modifier_loss)

        # Attach metadata
        if metadata is not None:
            for key in metadata[0]:
                output_dict[key] = [x[key] for x in metadata]

        return output_dict

    @overrides
    def decode(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        output_dict = self.relation_scorer.decode(output_dict)
        # for key in ['relations', 'heads', 'head_offsets']:
        #     if key in re_output_dict:
        #         output_dict[key] = re_output_dict[key]
        output_dict["tags"] = [[
            self.vocab.get_token_from_index(tag, self._ner_tag_namespace)
            for tag in instance_tags
        ] for instance_tags in output_dict["tags"]]
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        re_metrics = self.relation_scorer.get_metrics(reset=reset)
        joint_metrics = {
            'ner_acc': self.ner_accuracy.get_metric(reset=reset),
            'ner_f1':
            self.ner_f1.get_metric(reset=reset)['f1-measure-overall'],
            're_acc': re_metrics['re_acc'],
        }
        if 're_f1' in re_metrics:
            joint_metrics['re_f1'] = re_metrics['re_f1']
        if self._use_aux_ner_labels:
            joint_metrics['coarse_acc'] = self._coarse_acc.get_metric(
                reset=reset)
            joint_metrics['modifier_acc'] = self._modifier_acc.get_metric(
                reset=reset)
        return joint_metrics
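A compact sketch of the two CRF calls the forward pass above depends on, assuming AllenNLP's ConditionalRandomField: calling the module returns the summed log-likelihood of the gold tags (hence the negation to get a loss), and viterbi_tags returns one (tag_sequence, score) pair per batch element.

import torch
from allennlp.modules.conditional_random_field import ConditionalRandomField

num_tags, batch_size, seq_len = 4, 2, 5      # illustrative sizes
crf = ConditionalRandomField(num_tags)

logits = torch.randn(batch_size, seq_len, num_tags)
gold_tags = torch.randint(0, num_tags, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)

loss = -crf(logits, gold_tags, mask)          # negative log-likelihood, as in the model above
best_paths = crf.viterbi_tags(logits, mask)   # [(tag_sequence, viterbi_score), ...]
predicted_tags = [path for path, score in best_paths]
print(loss.item(), predicted_tags)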
Example #15
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 relation_scorer: RelationScorer,
                 ner_tag_namespace: str = 'tags',
                 evaluated_ner_labels: List[str] = None,
                 re_loss_weight: float = 1.0,
                 ner_tag_embedder: TokenEmbedder = None,
                 use_aux_ner_labels: bool = False,
                 aux_coarse_namespace: str = 'coarse_tags',
                 aux_modifier_namespace: str = 'modifier_tags',
                 aux_loss_weight: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab=vocab, regularizer=regularizer)

        self.text_field_embedder = text_field_embedder
        self.encoder = encoder

        # NER subtask 2
        self._ner_label_encoding = 'BIO'
        self._ner_tag_namespace = ner_tag_namespace
        ner_input_dim = self.encoder.get_output_dim()
        num_ner_tags = self.vocab.get_vocab_size(ner_tag_namespace)
        self.tag_projection_layer = TimeDistributed(
            Linear(ner_input_dim, num_ner_tags))

        self._use_aux_ner_labels = use_aux_ner_labels
        if self._use_aux_ner_labels:
            self._coarse_tag_namespace = aux_coarse_namespace
            self._num_coarse_tags = self.vocab.get_vocab_size(
                self._coarse_tag_namespace)
            self._coarse_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_coarse_tags))
            self._modifier_tag_namespace = aux_modifier_namespace
            self._num_modifier_tags = self.vocab.get_vocab_size(
                self._modifier_tag_namespace)
            self._modifier_projection_layer = TimeDistributed(
                Linear(ner_input_dim, self._num_modifier_tags))
            self._coarse_acc = CategoricalAccuracy()
            self._modifier_acc = CategoricalAccuracy()
            self._aux_loss_weight = aux_loss_weight

        self.ner_accuracy = CategoricalAccuracy()
        if evaluated_ner_labels is None:
            ignored_classes = None
        else:
            assert self._ner_label_encoding == 'BIO', 'expected BIO encoding'
            all_ner_tags = self.vocab.get_token_to_index_vocabulary(
                ner_tag_namespace).keys()
            ner_tag_classes = set(
                [bio_tag[2:] for bio_tag in all_ner_tags if len(bio_tag) > 2])
            ignored_classes = list(
                set(ner_tag_classes).difference(evaluated_ner_labels))
        self.ner_f1 = SpanBasedF1Measure(
            vocabulary=vocab,
            tag_namespace=ner_tag_namespace,
            label_encoding=self._ner_label_encoding,
            ignore_classes=ignored_classes)

        # Use constrained crf decoding with the BIO labeling scheme
        ner_labels = self.vocab.get_index_to_token_vocabulary(
            ner_tag_namespace)
        constraints = allowed_transitions(self._ner_label_encoding, ner_labels)

        self.crf = ConditionalRandomField(num_ner_tags,
                                          constraints,
                                          include_start_end_transitions=True)

        # RE subtask 3
        self.ner_tag_embedder = ner_tag_embedder
        self.relation_scorer = relation_scorer
        self._re_loss_weight = re_loss_weight

        initializer(self)
Example #16
class CWSModel(Model):
    def __init__(self, model_path, vocab: Vocabulary):
        super().__init__(vocab)
        self.pretrained_tokenizer = BertForPreTraining.from_pretrained(
            model_path)
        config = BertConfig.from_pretrained(model_path)
        bert_model = BertForPreTraining(config)
        self.bert = bert_model.bert
        tags = vocab.get_index_to_token_vocabulary("tags")
        num_tags = len(tags)
        constraints = allowed_transitions(constraint_type="BMES", labels=tags)
        self.projection = torch.nn.Linear(768, num_tags)
        self.crf = ConditionalRandomField(num_tags=num_tags,
                                          constraints=constraints,
                                          include_start_end_transitions=False)

    def forward(self,
                tokens,
                attention_mask,
                token_type_ids,
                length,
                tags=None,
                metadata=None) -> Dict[str, torch.Tensor]:
        """

        :param tokens:
        :param attention_mask:
        :param token_type_ids:
        :param length: TODO: (batch, 1) or (batch,)? Not a big deal; just add a view(-1) or view(-1, 1) at the end.
        :param tags:
        :param metadata:
        :return:
        """
        output_dict = dict()
        input_ids = tokens['tokens']['tokens']
        bert_outputs = self.bert(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)
        bert_outputs = bert_outputs[0]  # (batch, sequence, hidden_size)

        # bert_outputs includes the two special tokens CLS and SEP, and tags and
        # attention_mask were built to line up with tokens, so bert_outputs, tags and
        # attention_mask all need these two tokens stripped.
        # In AllenNLP's CRF the tag at the first position (tag[0]) is always consumed,
        # so the tag corresponding to CLS must not be fed in; tags at later positions
        # can be removed via the mask.
        # At predict time the last position still has to be dropped manually (based on length).
        bert_outputs = bert_outputs[:, 1:, :]
        logits = self.projection(bert_outputs)
        log_likelihood = torch.nn.functional.log_softmax(logits, -1)
        attention_mask = attention_mask[:, 1:]
        if tags is not None:
            tags = tags[:, 1:]

            loss = -self.crf(log_likelihood, tags, attention_mask)
            output_dict['loss'] = loss

        # Run Viterbi decoding
        best_path = self.crf.viterbi_tags(logits, attention_mask)
        output_dict['best_path'] = best_path

        output_dict['metadata'] = metadata

        output_dict['input_ids'] = input_ids[:, 1:]  # slicing already applied
        output_dict['attention_mask'] = attention_mask
        if tags is not None:
            output_dict['tags'] = tags
        best_path = [
            path[0][:mask.sum()]
            for path, mask in zip(best_path, attention_mask)
        ]
        output_dict['best_path'] = best_path
        return output_dict

    def make_output_human_readable(
            self, output_dict: Dict[str,
                                    torch.Tensor]) -> Dict[str, torch.Tensor]:
        text_predict_tags = [[
            self.vocab.get_token_from_index(idx, 'tags') for idx in path
        ] for path in output_dict['best_path']]
        output_dict.update({'text_predict_tags': text_predict_tags})
        return output_dict
Example #17
class BertMiddleModel(Model):
    def __init__(
        self,
        vocab: Vocabulary,
        bert_model: str,
        dropout: float = 0.0,
        requires_grad: str = "none",
        use_crf: bool = False,
        pos_weight: float = 1.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ):

        super(BertMiddleModel, self).__init__(vocab, regularizer)
        self._vocabulary = vocab
        self._bert_model = BertModel.from_pretrained(bert_model)
        self._dropout = torch.nn.Dropout(p=dropout)
        self._classification_layer = torch.nn.Linear(
            self._bert_model.config.hidden_size, 2)

        self._use_crf = use_crf

        self._pos_weight = torch.Tensor([1 / (1 - pos_weight), 1 / pos_weight])
        self._pos_weight = torch.nn.Parameter(self._pos_weight /
                                              self._pos_weight.min())
        self._pos_weight.requires_grad = False

        if use_crf:
            self._crf = ConditionalRandomField(num_tags=2)

        self.embedding_layers = ["BertEmbedding"]

        if requires_grad in ["none", "all"]:
            for param in self._bert_model.parameters():
                param.requires_grad = requires_grad == "all"
        else:
            model_name_regexes = requires_grad.split(",")
            for name, param in self._bert_model.named_parameters():
                found = any([regex in name for regex in model_name_regexes])
                param.requires_grad = found

        for n, v in self._bert_model.named_parameters():
            if n.startswith("classifier"):
                v.requires_grad = True

        self._token_prf = F1Measure(1)

        initializer(self)

    def forward(self,
                document,
                query=None,
                rationale=None,
                metadata=None,
                label=None) -> Dict[str, Any]:
        input_ids = document["bert"]
        input_mask = (input_ids != 0).long()
        starting_offsets = document["bert-starting-offsets"]  # (B, T)

        last_hidden_states, _ = self._bert_model(
            input_ids,
            attention_mask=input_mask,
            position_ids=document["bert-position-ids"])

        token_embeddings, span_mask = generate_embeddings_for_pooling(
            last_hidden_states, starting_offsets,
            document["bert-ending-offsets"])

        token_embeddings = util.masked_max(token_embeddings,
                                           span_mask.unsqueeze(-1),
                                           dim=2)
        token_embeddings = token_embeddings * document["mask"].unsqueeze(-1)

        logits = self._classification_layer(self._dropout(token_embeddings))
        assert logits.shape[0:2] == starting_offsets.shape

        if self._use_crf:
            best_paths = self._crf.viterbi_tags(logits, mask=document["mask"])
            best_paths = [b[0] for b in best_paths]
            best_paths = [
                x + [0] * (logits.shape[1] - len(x)) for x in best_paths
            ]
            best_paths = torch.Tensor(best_paths).to(
                logits.device) * document["mask"]
        else:
            best_paths = (logits[:, :, 1] > 0.5).long() * document["mask"]

        output_dict = {}

        output_dict["predicted_rationales"] = best_paths
        output_dict["mask"] = document["mask"]
        output_dict["metadata"] = metadata

        if rationale is not None:
            if self._use_crf:
                output_dict["loss"] = -self._crf(logits, rationale,
                                                 document["mask"])
            else:
                output_dict["loss"] = ((F.cross_entropy(
                    logits.view(-1, logits.shape[-1]),
                    rationale.view(-1),
                    reduction="none",
                    weight=self._pos_weight,
                ) * document["mask"].view(-1)).sum(-1).mean())

            best_paths = best_paths.unsqueeze(-1)
            best_paths = torch.cat([1 - best_paths, best_paths], dim=-1)
            self._token_prf(best_paths, rationale, document["mask"])
        return output_dict

    def extract_rationale(self, output_dict):
        rationales = []
        sentences = [x["tokens"] for x in output_dict["metadata"]]
        predicted_rationales = output_dict["predicted_rationales"].cpu(
        ).data.numpy()
        for path, words in zip(predicted_rationales, sentences):
            path = list(path)[:len(words)]
            words = [x.text for x in words]
            starts, ends = [], []
            # Pad the path with explicit 0 boundaries so spans that start at the first
            # token or end at the last token are detected: a 0 -> 1 transition opens a
            # span and a 1 -> 0 transition closes it (end index is exclusive).
            padded = [0] + path + [0]
            for i in range(1, len(padded)):
                if padded[i - 1:i + 1] == [0, 1]:
                    starts.append(i - 1)
                if padded[i - 1:i + 1] == [1, 0]:
                    ends.append(i - 1)

            assert len(starts) == len(ends)
            spans = list(zip(starts, ends))

            rationales.append({
                "document":
                " ".join([w for i, w in zip(path, words) if i == 1]),
                "spans": [{
                    "span": (s, e),
                    "value": 1
                } for s, e in spans],
                "metadata":
                None,
            })

        return rationales

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metrics = self._token_prf.get_metric(reset)
        return dict(zip(["p", "r", "f1"], metrics))

    def decode(self, output_dict):
        rationales = self.extract_rationale(output_dict)
        new_output_dict = {}

        new_output_dict['rationale'] = rationales
        new_output_dict['document'] = [r['document'] for r in rationales]

        if 'query' in output_dict['metadata'][0]:
            output_dict['query'] = [
                m['query'] for m in output_dict['metadata']
            ]

        for m in output_dict["metadata"]:
            if 'convert_tokens_to_instance' in m:
                del m["convert_tokens_to_instance"]

        new_output_dict['label'] = [
            m['label'] for m in output_dict['metadata']
        ]
        new_output_dict['metadata'] = output_dict['metadata']

        return new_output_dict
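A toy check of the rationale-to-span conversion used in extract_rationale above; the path is hand-made and only exercises the 0/1 transition logic:

path = [1, 1, 0, 0, 1]           # hypothetical binary rationale path over 5 tokens
padded = [0] + path + [0]        # explicit boundaries, as in extract_rationale
starts = [i - 1 for i in range(1, len(padded)) if padded[i - 1:i + 1] == [0, 1]]
ends = [i - 1 for i in range(1, len(padded)) if padded[i - 1:i + 1] == [1, 0]]
print(list(zip(starts, ends)))   # [(0, 2), (4, 5)]: tokens 0-1 and token 4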
Example #18
class AttentiveCrfTagger(Model):
    """
    The ``CrfTagger`` encodes a sequence of text with a ``Seq2SeqEncoder``,
    then uses a Conditional Random Field model to predict a tag for each token in the sequence.

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the tokens ``TextField`` we get as input to the model.
    encoder : ``Seq2SeqEncoder``
        The encoder that we will use in between embedding tokens and predicting output tags.
    label_namespace : ``str``, optional (default=``labels``)
        This is needed to compute the SpanBasedF1Measure metric.
        Unless you did something unusual, the default value should be what you want.
    feedforward : ``FeedForward``, optional, (default = None).
        An optional feedforward layer to apply after the encoder.
    label_encoding : ``str``, optional (default=``None``)
        Label encoding to use when calculating span f1 and constraining
        the CRF at decoding time. Valid options are "BIO", "BIOUL", "IOB1", "BMES".
        Required if ``calculate_span_f1`` or ``constrain_crf_decoding`` is true.
    include_start_end_transitions : ``bool``, optional (default=``True``)
        Whether to include start and end transition parameters in the CRF.
    constrain_crf_decoding : ``bool``, optional (default=``None``)
        If ``True``, the CRF is constrained at decoding time to
        produce valid sequences of tags. If this is ``True``, then
        ``label_encoding`` is required. If ``None`` and
        label_encoding is specified, this is set to ``True``.
        If ``None`` and label_encoding is not specified, it defaults
        to ``False``.
    calculate_span_f1 : ``bool``, optional (default=``None``)
        Calculate span-level F1 metrics during training. If this is ``True``, then
        ``label_encoding`` is required. If ``None`` and
        label_encoding is specified, this is set to ``True``.
        If ``None`` and label_encoding is not specified, it defaults
        to ``False``.
    dropout:  ``float``, optional (default=``None``)
    verbose_metrics : ``bool``, optional (default = False)
        If true, metrics will be returned per label class in addition
        to the overall statistics.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 attention=None,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not provided
        # (i.e., they're None), set them to True if label_encoding is provided
        # and False if it isn't.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")


        initializer(self)

    @overrides
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None,
                # pylint: disable=unused-argument
                **kwargs) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : ``Dict[str, torch.LongTensor]``, required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : ``torch.LongTensor``, optional (default = ``None``)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:

        logits : ``torch.FloatTensor``
            The logits that are the output of the ``tag_projection_layer``
        mask : ``torch.LongTensor``
            The text field mask for the input tokens
        tags : ``List[List[int]]``
            The predicted tags using the Viterbi algorithm.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised. Only computed if gold label ``tags`` are provided.
        """

        embedded_text_input = self.text_field_embedder(tokens)
        mask = util.get_text_field_mask(tokens)

        if self.dropout:
            embedded_text_input = self.dropout(embedded_text_input)

        encoded_text = self.encoder(embedded_text_input, mask)

        if self.dropout:
            encoded_text = self.dropout(encoded_text)

        if self._feedforward is not None:
            encoded_text = self._feedforward(encoded_text)


        logits = self.tag_projection_layer(encoded_text)
        best_paths = self.crf.viterbi_tags(logits, mask)

        # Just get the tags and ignore the score.
        predicted_tags = [x for x, y in best_paths]

        output = {"logits": logits, "mask": mask, "tags": predicted_tags}

        if tags is not None:
            # Add negative log-likelihood as loss
            log_likelihood = self.crf(logits, tags, mask)
            output["loss"] = -log_likelihood

            # Represent viterbi tags as "class probabilities" that we can
            # feed into the metrics
            class_probabilities = logits * 0.
            for i, instance_tags in enumerate(predicted_tags):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1

            for metric in self.metrics.values():
                metric(class_probabilities, tags, mask.float())
            if self.calculate_span_f1:
                self._f1_metric(class_probabilities, tags, mask.float())
        if metadata is not None:
            output["words"] = [x["words"] for x in metadata]
        return output

    @overrides
    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Converts the tag ids to the actual tags.
        ``output_dict["tags"]`` is a list of lists of tag_ids,
        so we use an ugly nested list comprehension.
        """
        output_dict["tags"] = [
                [self.vocab.get_token_from_index(tag, namespace=self.label_namespace)
                 for tag in instance_tags]
                for instance_tags in output_dict["tags"]
        ]

        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metrics_to_return = {metric_name: metric.get_metric(reset) for
                             metric_name, metric in self.metrics.items()}

        if self.calculate_span_f1:
            f1_dict = self._f1_metric.get_metric(reset=reset)
            if self._verbose_metrics:
                metrics_to_return.update(f1_dict)
            else:
                metrics_to_return.update({
                        x: y for x, y in f1_dict.items() if
                        "overall" in x})
        return metrics_to_return
Example #19
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 feedforward: Optional[FeedForward] = None,
                 label_encoding: Optional[str] = None,
                 include_start_end_transitions: bool = True,
                 attention=None,
                 constrain_crf_decoding: bool = None,
                 calculate_span_f1: bool = None,
                 dropout: Optional[float] = None,
                 verbose_metrics: bool = False,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self._verbose_metrics = verbose_metrics
        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None
        self._feedforward = feedforward

        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = self.encoder.get_output_dim()
        self.tag_projection_layer = TimeDistributed(Linear(output_dim,
                                                           self.num_tags))

        # If constrain_crf_decoding and calculate_span_f1 are not
        # provided (i.e., they're None), set them to True
        # if label_encoding is provided and False if it isn't.
        if constrain_crf_decoding is None:
            constrain_crf_decoding = label_encoding is not None
        if calculate_span_f1 is None:
            calculate_span_f1 = label_encoding is not None

        self.label_encoding = label_encoding
        if constrain_crf_decoding:
            if not label_encoding:
                raise ConfigurationError("constrain_crf_decoding is True, but "
                                         "no label_encoding was specified.")
            labels = self.vocab.get_index_to_token_vocabulary(label_namespace)
            constraints = allowed_transitions(label_encoding, labels)
        else:
            constraints = None

        self.include_start_end_transitions = include_start_end_transitions
        self.crf = ConditionalRandomField(
                self.num_tags, constraints,
                include_start_end_transitions=include_start_end_transitions
        )

        self.metrics = {
                "accuracy": CategoricalAccuracy(),
                "accuracy3": CategoricalAccuracy(top_k=3)
        }
        self.calculate_span_f1 = calculate_span_f1
        if calculate_span_f1:
            if not label_encoding:
                raise ConfigurationError("calculate_span_f1 is True, but "
                                         "no label_encoding was specified.")
            self._f1_metric = SpanBasedF1Measure(vocab,
                                                 tag_namespace=label_namespace,
                                                 label_encoding=label_encoding)

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")
        if feedforward is not None:
            check_dimensions_match(encoder.get_output_dim(), feedforward.get_input_dim(),
                                   "encoder output dim", "feedforward input dim")


        initializer(self)
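The tagger above follows the usual AllenNLP CRF recipe: project encoder states to per-tag logits, use the negated CRF log-likelihood as the training loss, and Viterbi-decode at prediction time. A minimal stand-alone sketch of that recipe (the tag set, shapes, and tensors below are made up for illustration; the exact mask dtype expected varies slightly across AllenNLP versions):

import torch
from allennlp.modules.conditional_random_field import (
    ConditionalRandomField, allowed_transitions)

# Hypothetical BIO tag vocabulary: index -> tag string.
tags = {0: "O", 1: "B-ENT", 2: "I-ENT"}
constraints = allowed_transitions("BIO", tags)

crf = ConditionalRandomField(num_tags=len(tags),
                             constraints=constraints,
                             include_start_end_transitions=True)

batch_size, seq_len = 2, 5
logits = torch.randn(batch_size, seq_len, len(tags))    # stand-in for tag_projection_layer output
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
gold = torch.randint(0, len(tags), (batch_size, seq_len))

# Training: the CRF returns the log-likelihood of the gold paths, so the loss is its negation.
loss = -crf(logits, gold, mask)

# Prediction: viterbi_tags returns (tag_sequence, score) pairs; keep only the tag sequences.
predicted_tags = [best for best, score in crf.viterbi_tags(logits, mask)]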
Example #20
0
class DualCrossSharedRNN(nn.Module):
    def __init__(self,
                 general_embeddings,
                 domain_embeddings,
                 input_size,
                 hidden_size,
                 aspect_tag_classes,
                 polarity_tag_classes,
                 k,
                 dropout=0.5):
        super(DualCrossSharedRNN, self).__init__()
        self.general_embedding = nn.Embedding(
            num_embeddings=general_embeddings.size(0),
            embedding_dim=general_embeddings.size(1),
            padding_idx=0).from_pretrained(general_embeddings)
        self.domain_embedding = nn.Embedding(
            num_embeddings=domain_embeddings.size(0),
            embedding_dim=domain_embeddings.size(1),
            padding_idx=0).from_pretrained(domain_embeddings)
        self.general_embedding.weight.requires_grad = False
        self.domain_embedding.weight.requires_grad = False
        self.dropout = dropout
        self.hidden_size = hidden_size
        self.aspect_rnn1 = ReGU(input_size,
                                hidden_size,
                                num_layers=1,
                                bidirectional=True)
        self.polarity_rnn1 = ReGU(input_size,
                                  hidden_size,
                                  num_layers=1,
                                  bidirectional=True)
        self.csu = Cross_Shared_Unit(k, 2 * hidden_size)
        self.aspect_rnn2 = ReGU(2 * hidden_size,
                                hidden_size,
                                num_layers=1,
                                bidirectional=True)
        self.polarity_rnn2 = ReGU(2 * hidden_size,
                                  hidden_size,
                                  num_layers=1,
                                  bidirectional=True)
        self.aspect_hidden2tag = nn.Linear(2 * hidden_size, aspect_tag_classes)
        self.polarity_hidden2tag = nn.Linear(2 * hidden_size,
                                             polarity_tag_classes)
        self.aspect_crf = ConditionalRandomField(aspect_tag_classes)
        self.polarity_crf = ConditionalRandomField(polarity_tag_classes)
        self.dropout_layer = nn.Dropout(dropout)

    def forward(self,
                features,
                aspect_tags,
                polarity_tags,
                mask,
                testing=False,
                crf=True):
        batch = features.size(0)
        general_features = self.general_embedding(features)
        domain_features = self.domain_embedding(features)
        features = torch.cat((general_features, domain_features), dim=2)
        states = torch.zeros(1, 2, batch, self.hidden_size).to(features.device)
        features = self.dropout_layer(features)
        aspect_hidden, _ = self.aspect_rnn1(features, states)
        polarity_hidden, _ = self.polarity_rnn1(features, states)
        aspect_hidden, polarity_hidden = self.csu(aspect_hidden,
                                                  polarity_hidden,
                                                  max_pooling=False)
        aspect_hidden, _ = self.aspect_rnn2(aspect_hidden, states)
        polarity_hidden, _ = self.polarity_rnn2(polarity_hidden, states)
        aspect_logit = self.aspect_hidden2tag(aspect_hidden)
        polarity_logit = self.polarity_hidden2tag(polarity_hidden)
        if crf:
            if not testing:
                aspect_score = -self.aspect_crf(aspect_logit, aspect_tags,
                                                mask)
                polarity_score = -self.polarity_crf(polarity_logit,
                                                    polarity_tags, mask)
                return aspect_score + polarity_score
            else:
                aspect_path = self.aspect_crf.viterbi_tags(aspect_logit, mask)
                polarity_path = self.polarity_crf.viterbi_tags(
                    polarity_logit, mask)
                return aspect_path, polarity_path
        else:
            return aspect_logit, polarity_logit
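DualCrossSharedRNN trains two task-specific CRFs and sums their negative log-likelihoods; ReGU and Cross_Shared_Unit are project-specific modules not shown here, so the sketch below only exercises the two-CRF loss combination and the per-task Viterbi decoding with toy logits (all sizes are illustrative):

import torch
from allennlp.modules.conditional_random_field import ConditionalRandomField

batch_size, seq_len = 2, 6
aspect_classes, polarity_classes = 4, 5                 # toy tag-set sizes

aspect_crf = ConditionalRandomField(aspect_classes)
polarity_crf = ConditionalRandomField(polarity_classes)

# Stand-ins for the logits produced by the stacked RNN / cross-shared layers.
aspect_logit = torch.randn(batch_size, seq_len, aspect_classes)
polarity_logit = torch.randn(batch_size, seq_len, polarity_classes)
aspect_tags = torch.randint(0, aspect_classes, (batch_size, seq_len))
polarity_tags = torch.randint(0, polarity_classes, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)

# Joint training loss: sum of the two tasks' negative log-likelihoods,
# mirroring `aspect_score + polarity_score` in the forward() above.
loss = (-aspect_crf(aspect_logit, aspect_tags, mask)
        - polarity_crf(polarity_logit, polarity_tags, mask))

# At test time each task is decoded independently with Viterbi.
aspect_paths = aspect_crf.viterbi_tags(aspect_logit, mask)
polarity_paths = polarity_crf.viterbi_tags(polarity_logit, mask)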
class SeqClassificationModel(Model):
    """
    Question answering model where answers are sentences
    """
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        use_sep: bool = True,
        with_crf: bool = False,
        self_attn: Seq2SeqEncoder = None,
        bert_dropout: float = 0.1,
        sci_sum: bool = False,
        additional_feature_size: int = 0,
    ) -> None:
        super(SeqClassificationModel, self).__init__(vocab)

        self.track_embedding_list = []
        self.track_embedding = {}
        self.text_field_embedder = text_field_embedder
        self.vocab = vocab
        self.use_sep = use_sep
        self.with_crf = with_crf
        self.sci_sum = sci_sum
        self.self_attn = self_attn
        self.additional_feature_size = additional_feature_size

        self.dropout = torch.nn.Dropout(p=bert_dropout)

        # define loss
        if self.sci_sum:
            self.loss = torch.nn.MSELoss(
                reduction='none')  # labels are rouge scores
            self.labels_are_scores = True
            self.num_labels = 1
        else:
            self.loss = torch.nn.CrossEntropyLoss(ignore_index=-1,
                                                  reduction='none')
            self.labels_are_scores = False
            self.num_labels = self.vocab.get_vocab_size(namespace='labels')
            # define accuracy metrics
            self.label_accuracy = CategoricalAccuracy()
            self.label_f1_metrics = {}

            # define F1 metrics per label
            for label_index in range(self.num_labels):
                label_name = self.vocab.get_token_from_index(
                    namespace='labels', index=label_index)
                self.label_f1_metrics[label_name] = F1Measure(label_index)

        encoded_sentence_dim = text_field_embedder._token_embedders[
            'bert'].output_dim

        ff_in_dim = (encoded_sentence_dim
                     if self.use_sep else self_attn.get_output_dim())
        ff_in_dim += self.additional_feature_size

        self.time_distributed_aggregate_feedforward = TimeDistributed(
            Linear(ff_in_dim, self.num_labels))

        if self.with_crf:
            self.crf = ConditionalRandomField(
                self.num_labels,
                constraints=None,
                include_start_end_transitions=True)
        self.track_embedding["init_info"] = {
            "ff_in_dim": ff_in_dim,
            "encoded_sentence_dim": encoded_senetence_dim,
            "sci_sum": self.sci_sum,
            "use_sep": self.use_sep,
            "with_crf": self.with_crf,
            "additional_feature_size": self.additional_feature_size
        }
        self.t_board_writer = SummaryWriter()
        self.t_board_writer.add_graph(self)

    def forward(
        self,  # type: ignore
        sentences: torch.LongTensor,
        labels: torch.IntTensor = None,
        confidences: torch.Tensor = None,
        additional_features: torch.Tensor = None,
    ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        TODO: add description

        Returns
        -------
        An output dictionary consisting of:
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        # ===========================================================================================================
        # Layer 1: For each sentence, participant pair: create a Glove embedding for each token
        # Input: sentences
        # Output: embedded_sentences
        # print(sentences)
        # Convert the raw input tensors to plain lists so they can be JSON-serialized later.
        sentences_conv = {}
        for key, val in sentences.items():
            sentences_conv[key] = val.cpu().data.numpy().tolist()
        self.track_embedding["Transformation_0"] = {
            "sentences": sentences_conv
        }
        # embedded_sentences: batch_size, num_sentences, sentence_length, embedding_size
        embedded_sentences = self.text_field_embedder(sentences)
        self.track_embedding["Transformation_1"] = {
            "size": list(embedded_sentences.size()),
            "dim": embedded_sentences.dim()
        }

        # Kacper: Basically a padding mask for bert
        mask = get_text_field_mask(sentences, num_wrapping_dims=1).float()
        batch_size, num_sentences, _, _ = list(embedded_sentences.size())

        if self.use_sep:
            # The following code collects vectors of the SEP tokens from all the examples in the batch,
            # and arrange them in one list. It does the same for the labels and confidences.
            # TODO: replace 103 with '[SEP]'
            # Kacper: This is an important step where we get SEP tokens to later do sentence classification
            # Kacper: We take a location of SEP tokens from the sentences to get a mask
            sentences_mask = sentences[
                'bert'] == 103  # mask for all the SEP tokens in the batch
            # Kacper: We use this mask to get the respective embeddings from the output layer of bert
            embedded_sentences = embedded_sentences[
                sentences_mask]  # given batch_size x num_sentences_per_example x sent_len x vector_len
            # returns num_sentences_per_batch x vector_len
            self.track_embedding["Transformation_2"] = {
                "size": list(embedded_sentences.size()),
                "dim": embedded_sentences.dim()
            }
            # Indexing with the boolean SEP mask flattens the selected embeddings, so the
            # 4-dim (batch, sentences, tokens, hidden) tensor becomes a
            # 2-dim (num_sep_tokens_in_batch, hidden) tensor.
            assert embedded_sentences.dim() == 2
            num_sentences = embedded_sentences.shape[0]
            # Kacper: all SEP embeddings from the batch are gathered into one array; since the
            # loss is averaged over them anyway, the rest of the model can treat the data as a
            # single example containing that many sentences, i.e. a batch of size 1.
            batch_size = 1
            embedded_sentences = embedded_sentences.unsqueeze(
                dim=0)  # Kacper: We batch all sentences in one array
            self.track_embedding["Transformation_3"] = {
                "size": list(embedded_sentences.size()),
                "dim": embedded_sentences.dim()
            }
            # Kacper: Dropout layer is between filtered embeddings and linear layer
            embedded_sentences = self.dropout(embedded_sentences)
            self.track_embedding["Transformation_4"] = {
                "size": list(embedded_sentences.size()),
                "dim": embedded_sentences.dim()
            }
            # Kacper: we provide the labels for training (for each sentence)
            if labels is not None:
                if self.labels_are_scores:
                    labels_mask = labels != 0.0  # mask for all the labels in the batch (no padding)
                else:
                    labels_mask = labels != -1  # mask for all the labels in the batch (no padding)

                labels = labels[
                    labels_mask]  # given batch_size x num_sentences_per_example return num_sentences_per_batch
                assert labels.dim() == 1
                if confidences is not None:
                    confidences = confidences[labels_mask]
                    assert confidences.dim() == 1
                if additional_features is not None:
                    additional_features = additional_features[labels_mask]
                    assert additional_features.dim() == 2

                num_labels = labels.shape[0]
                # Kacper: this might be useful to consider in my code as well
                if num_labels != num_sentences:  # bert truncates long sentences, so some of the SEP tokens might be gone
                    assert num_labels > num_sentences  # truncation can only drop SEP tokens, so there must be more labels than sentences
                    logger.warning(
                        f'Found {num_labels} labels but {num_sentences} sentences'
                    )
                    labels = labels[:
                                    num_sentences]  # Ignore some labels. This is ok for training but bad for testing.
                    # We are ignoring this problem for now.
                    # TODO: fix, at least for testing

                # do the same for `confidences`
                if confidences is not None:
                    num_confidences = confidences.shape[0]
                    if num_confidences != num_sentences:
                        assert num_confidences > num_sentences
                        confidences = confidences[:num_sentences]

                # and for `additional_features`
                if additional_features is not None:
                    num_additional_features = additional_features.shape[0]
                    if num_additional_features != num_sentences:
                        assert num_additional_features > num_sentences
                        additional_features = additional_features[:
                                                                  num_sentences]

                # similar to `embedded_sentences`, add an additional dimension that corresponds to batch_size=1
                labels = labels.unsqueeze(dim=0)
                if confidences is not None:
                    confidences = confidences.unsqueeze(dim=0)
                if additional_features is not None:
                    additional_features = additional_features.unsqueeze(dim=0)
        else:
            # ['CLS'] token
            # Kacper: this shouldnt be the case for our project
            embedded_sentences = embedded_sentences[:, :, 0, :]
            embedded_sentences = self.dropout(embedded_sentences)
            batch_size, num_sentences, _ = list(embedded_sentences.size())
            sent_mask = (mask.sum(dim=2) != 0)
            embedded_sentences = self.self_attn(embedded_sentences, sent_mask)

        if additional_features is not None:
            embedded_sentences = torch.cat(
                (embedded_sentences, additional_features), dim=-1)

        # Kacper: we unwrap the time dimension of a tensor into the 1st dimension (batch),
        # Kacper: apply a linear layer and wrap the the time dimension back
        # Kacper: I would suspect it is happening only for embeddings related to the [SEP] tokens
        label_logits = self.time_distributed_aggregate_feedforward(
            embedded_sentences)
        # label_logits: batch_size, num_sentences, num_labels
        self.track_embedding["logits"] = {
            "size": list(label_logits.size()),
            "dim": label_logits.dim()
        }
        #print(self.track_embedding)
        self.track_embedding_list.append(deepcopy(self.track_embedding))
        with open(path_json, 'w') as json_out:
            json.dump(self.track_embedding_list, json_out)

        if self.labels_are_scores:
            label_probs = label_logits
        else:
            label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        # Create output dictionary for the trainer
        # Compute loss and epoch metrics
        output_dict = {"action_probs": label_probs}

        # =====================================================================

        if self.with_crf:
            # Layer 4 = CRF layer across labels of sentences in an abstract
            mask_sentences = (labels != -1)
            best_paths = self.crf.viterbi_tags(label_logits, mask_sentences)
            # Just get the tags and ignore the score.
            predicted_labels = [x for x, y in best_paths]

            label_loss = 0.0
        if labels is not None:
            # Compute cross entropy loss
            # Kacper: reshape logits to (batch_size * num_sentences, num_labels)
            flattened_logits = label_logits.view((batch_size * num_sentences),
                                                 self.num_labels)
            # Make the labels contiguous in memory and flatten them to one dimension
            flattened_gold = labels.contiguous().view(
                -1)  # Kacper: gold labels

            if not self.with_crf:
                # Kacper: We are only interested in this part of the code since we don't use crf
                # Kacper: Get a loss (MSE if sci_sum is True or Crossentropy)
                label_loss = self.loss(flattened_logits.squeeze(),
                                       flattened_gold)
                if confidences is not None:
                    label_loss = label_loss * confidences.type_as(
                        label_loss).view(-1)
                label_loss = label_loss.mean()  # Kacper: Get a mean loss
                # Kacper: Get a probabilities from the logits
                flattened_probs = torch.softmax(flattened_logits, dim=-1)
            else:
                # Kacper: We are not interested in this if statement branch (for our project)
                clamped_labels = torch.clamp(labels, min=0)
                log_likelihood = self.crf(label_logits, clamped_labels,
                                          mask_sentences)
                label_loss = -log_likelihood
                # compute categorical accuracy
                crf_label_probs = label_logits * 0.
                for i, instance_labels in enumerate(predicted_labels):
                    for j, label_id in enumerate(instance_labels):
                        crf_label_probs[i, j, label_id] = 1
                flattened_probs = crf_label_probs.view(
                    (batch_size * num_sentences), self.num_labels)

            if not self.labels_are_scores:
                # Kacper: this will be a case for us as well because labels are numerical for Pubmed data
                evaluation_mask = (flattened_gold != -1)
                # Kacper: CategoricalAccuracy is computed in this case
                self.label_accuracy(flattened_probs.float().contiguous(),
                                    flattened_gold.squeeze(-1),
                                    mask=evaluation_mask)

                # compute F1 per label
                for label_index in range(self.num_labels):
                    label_name = self.vocab.get_token_from_index(
                        namespace='labels', index=label_index)
                    metric = self.label_f1_metrics[label_name]
                    metric(flattened_probs,
                           flattened_gold,
                           mask=evaluation_mask)

        if labels is not None:
            output_dict["loss"] = label_loss
        output_dict['action_logits'] = label_logits
        return output_dict

    def get_metrics(self, reset: bool = False):
        # Kacper: this function has to be implemented due to AllenNLP's API requirements,
        # Kacper: so the model can be run automatically from a config file
        metric_dict = {}

        if not self.labels_are_scores:
            type_accuracy = self.label_accuracy.get_metric(reset)
            metric_dict['acc'] = type_accuracy

            average_F1 = 0.0
            for name, metric in self.label_f1_metrics.items():
                metric_val = metric.get_metric(reset)
                metric_dict[name + 'F'] = metric_val[2]
                average_F1 += metric_val[2]

            average_F1 /= len(self.label_f1_metrics.items())
            metric_dict['avgF'] = average_F1

        return metric_dict
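The [SEP]-gathering step in forward() relies on PyTorch boolean-mask indexing, which flattens the selected vectors into a 2-D tensor; that is why the dimension check above expects 2. A small sketch with made-up shapes, reusing the hard-coded id 103 from the snippet above (the actual [SEP] id depends on the BERT vocabulary):

import torch

# Made-up shapes: 2 abstracts, 3 sentences each, 6 wordpieces per sentence, hidden size 8.
batch_size, num_sentences, sent_len, hidden = 2, 3, 6, 8
SEP_ID = 103  # same hard-coded id as above; the real id depends on the BERT vocabulary

token_ids = torch.full((batch_size, num_sentences, sent_len), 500, dtype=torch.long)
token_ids[:, :, -1] = SEP_ID                       # pretend every sentence ends with [SEP]
embedded = torch.randn(batch_size, num_sentences, sent_len, hidden)

sep_mask = token_ids == SEP_ID                     # (batch, sentences, tokens), bool
sep_embeddings = embedded[sep_mask]                # boolean indexing flattens the result to 2-D

print(sep_embeddings.shape)                        # torch.Size([6, 8]): (total SEP tokens, hidden)
sep_embeddings = sep_embeddings.unsqueeze(0)       # re-add a batch dimension of size 1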
Example #22
0
class SlotTaggingModel(Model):
    def __init__(self,
                 vocab: Vocabulary,
                 bert_embedder: Optional[PretrainedBertEmbedder] = None,
                 encoder: Optional[Seq2SeqEncoder] = None,
                 dropout: Optional[float] = None,
                 use_crf: bool = True,
                 add_random_noise: bool = False,
                 add_attack_noise: bool = False,
                 do_noise_normalization: bool = True,
                 noise_norm: Optional[float] = None,
                 noise_loss_prob: Optional[float] = None,
                 add_noise_for: str = "ov",
                 rnn_after_embeddings: bool = False,
                 open_vocabulary_slots: Optional[List[str]] = None,
                 metrics_for_each_slot_type: bool = False) -> None:
        """
        Params
        ------
        vocab: the allennlp Vocabulary object, will be automatically passed
        bert_embedder: the pretrained BERT embedder. If it is not None, the pretrained BERT
                embedding (parameter fixed) will be used as the embedding layer. Otherwise, a look-up
                embedding matrix will be initialized with the embedding size 1024. The default is None.
        encoder: the contextual encoder used after the embedding layer. If set to None, no contextual
                encoder will be used.
        dropout: the dropout rate; not set in our experiments.
        use_crf: if set to True, a CRF will be used at the end of the model (as the output layer). Otherwise,
                a softmax layer (with cross-entropy loss) will be used.
        add_random_noise: whether to add random noise to slots. Cannot be set simultaneously
                with add_attack_noise. This setting is used as a baseline in our experiments.
        add_attack_noise: whether to add adversarial attack noise to slots. Cannot be set simultaneously
                with add_random_noise.
        do_noise_normalization: if set to True, normalization will be applied to the gradients w.r.t.
                token embeddings. Otherwise, the gradients won't be normalized.
        noise_norm: the normalization norm (L2) applied to gradients.
        noise_loss_prob: the alpha hyperparameter to balance the loss from normal forward and adversarial
                forward. See the paper for more details. Should be set from 0 to 1.
        add_noise_for: if set to ov, the noise will only be applied to open-vocabulary slots. Otherwise,
                the noise will be applied to all slots (both open-vocabulary and normal slots).
        rnn_after_embeddings: if set to True, an additional BiLSTM layer will be applied after the embedding
                layer. Default is False.
        open_vocabulary_slots: the list of open-vocabulary slots. If not set, will be set to open-vocabulary
                slots of Snips dataset by default.
        metrics_for_each_slot_type: whether to log metrics for each slot type. Default is False.
        """
        super().__init__(vocab)

        if bert_embedder:
            self.use_bert = True
            self.bert_embedder = bert_embedder
        else:
            self.use_bert = False
            self.basic_embedder = BasicTextFieldEmbedder({
                "tokens":
                Embedding(vocab.get_vocab_size(namespace="tokens"), 1024)
            })
            self.rnn_after_embeddings = rnn_after_embeddings
            if rnn_after_embeddings:
                self.rnn = Seq2SeqEncoder.from_params(
                    Params({
                        "type": "lstm",
                        "input_size": 1024,
                        "hidden_size": 512,
                        "bidirectional": True,
                        "batch_first": True
                    }))

        self.encoder = encoder

        if encoder:
            hidden2tag_in_dim = encoder.get_output_dim()
        else:
            hidden2tag_in_dim = bert_embedder.get_output_dim()
        self.hidden2tag = TimeDistributed(
            torch.nn.Linear(in_features=hidden2tag_in_dim,
                            out_features=vocab.get_vocab_size("labels")))

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
        else:
            self.dropout = None

        self.use_crf = use_crf
        if use_crf:
            crf_constraints = allowed_transitions(
                constraint_type="BIO",
                labels=vocab.get_index_to_token_vocabulary("labels"))
            self.crf = ConditionalRandomField(
                num_tags=vocab.get_vocab_size("labels"),
                constraints=crf_constraints,
                include_start_end_transitions=True)

        # default open_vocabulary slots: for SNIPS dataset
        open_vocabulary_slots = open_vocabulary_slots or [
            "playlist", "entity_name", "poi", "restaurant_name",
            "geographic_poi", "album", "track", "object_name", "movie_name"
        ]
        self.f1 = OVSpecSpanBasedF1Measure(
            vocab,
            tag_namespace="labels",
            ignore_classes=[],
            label_encoding="BIO",
            open_vocabulary_slots=open_vocabulary_slots)

        self.add_random_noise = add_random_noise
        self.add_attack_noise = add_attack_noise
        assert not (add_random_noise and
                    add_attack_noise), "both random and attack noise applied"
        if add_random_noise or add_attack_noise:
            self.do_noise_normalization = do_noise_normalization
            assert noise_norm is not None
            assert noise_loss_prob is not None and 0. <= noise_loss_prob <= 1.
            self.noise_norm = noise_norm
            self.noise_loss_prob = noise_loss_prob
            assert add_noise_for in ["ov", "all"]
            self.ov_noise_only = (add_noise_for == "ov")

        self.metrics_for_each_slot_type = metrics_for_each_slot_type

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                slot_labels: torch.Tensor = None,
                ov_slot_mask: torch.Tensor = None,
                slot_mask: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """
        Params
        ------
        sentence: a Dict contains tensors of token ids (in "tokens" key) or (If use BERT as embedding
                layer) BERT BPE ids, offsets, segment ids. This parameter is the output of
                TextField.as_tensors(), see ~allennlp.data.fields.text_field.TextField for details.
                Each field should have shape (batch_size, seq_length)
        slot_labels: slot label ids (in BIO format), of shape (batch_size, seq_length)
        ov_slot_mask: binary mask, 1 for tokens of open-vocabulary slots, 0 for otherwise (non-slot tokens
                 and tokens of normal slots). Of shape (batch_size, seq_length)
        slot_mask: binary mask, 1 for tokens of slots (all slots), 0 for non-slot tokens (i.e. the O tag).
                Of shape (batch_size, seq_length)
        
        Return a Dict (str -> torch.Tensor), which contains fields:
                mask - the mask matrix of ``sentence``, shape: (batch_size, seq_length)
                embeddings - the embedded tokens, shape: (batch_size, seq_length, embed_size)
                encoder_out - the output of contextual encoder, shape: (batch_size, seq_length, num_features)
                tag_logits - the output of tag projection layer, shape: (batch_size, seq_length, num_tags)
                predicted_tags - the output of CRF layer (use viterbi algorithm to obtain best paths),
                             shape: (batch_size, seq_length)
        """
        output = {}

        mask = get_text_field_mask(sentence)
        output["mask"] = mask

        if self.use_bert:
            embeddings = self.bert_embedder(sentence["bert"],
                                            sentence["bert-offsets"],
                                            sentence["bert-type-ids"])
            if self.dropout:
                embeddings = self.dropout(embeddings)
            output["embeddings"] = embeddings
        else:
            embeddings = self.basic_embedder(sentence)
            if self.dropout:
                embeddings = self.dropout(embeddings)
            output["embeddings"] = embeddings
            if self.rnn_after_embeddings:
                embeddings = self.rnn(embeddings, mask)
                if self.dropout:
                    embeddings = self.dropout(embeddings)
                output["rnn_out"] = embeddings

        if not self.training:  # when predict or evaluate, no need for adding noise
            output.update(self._inner_forward(embeddings, mask, slot_labels))
        elif not self.add_random_noise and not self.add_attack_noise:  # for baseline
            output.update(self._inner_forward(embeddings, mask, slot_labels))
        else:  # add random noise or attack noise for open-vocabulary slots
            if self.add_random_noise:  # add random noise
                unnormalized_noise = torch.randn(
                    embeddings.shape).to(device=embeddings.device)
            else:  # add attack noise
                normal_loss = self._inner_forward(embeddings, mask,
                                                  slot_labels)["loss"]
                embeddings.retain_grad(
                )  # we need to get gradient w.r.t embeddings
                normal_loss.backward(retain_graph=True)
                unnormalized_noise = embeddings.grad.detach_()
                for p in self.parameters():
                    if p.grad is not None:
                        p.grad.detach_()
                        p.grad.zero_()
            if self.do_noise_normalization:  # do normalization
                norm = unnormalized_noise.norm(p=2, dim=-1)
                normalized_noise = unnormalized_noise / (
                    norm.unsqueeze(dim=-1) + 1e-10)  # add 1e-10 to avoid NaN
            else:  # no normalization
                normalized_noise = unnormalized_noise
            if self.ov_noise_only:  # add noise to open-vocabulary slots only
                ov_slot_noise = self.noise_norm * normalized_noise * ov_slot_mask.unsqueeze(
                    dim=-1).float()
            else:  # add noise to all slots
                ov_slot_noise = self.noise_norm * normalized_noise * slot_mask.unsqueeze(
                    dim=-1).float()
            output["ov_slot_noise"] = ov_slot_noise
            noise_embeddings = embeddings + ov_slot_noise  # semantics decoupling using noise
            normal_sample_loss = self._inner_forward(
                embeddings, mask, slot_labels)["loss"]  # normal forward
            noise_sample_loss = self._inner_forward(
                noise_embeddings, mask,
                slot_labels)["loss"]  # adversarial forward
            loss = normal_sample_loss * (
                1 - self.noise_loss_prob
            ) + noise_sample_loss * self.noise_loss_prob
            output["loss"] = loss
        return output

    def _inner_forward(self, embeddings: torch.Tensor, mask: torch.Tensor,
                       slot_labels: torch.Tensor) -> Dict[str, torch.Tensor]:
        """
        Forward from **embedding space** to a loss or predicted-tags.
        """
        output = {}

        if self.encoder:
            encoder_out = self.encoder(embeddings, mask)
            if self.dropout:
                encoder_out = self.dropout(encoder_out)
            output["encoder_out"] = encoder_out
        else:
            encoder_out = embeddings

        tag_logits = self.hidden2tag(encoder_out)
        output["tag_logits"] = tag_logits

        if self.use_crf:
            best_paths = self.crf.viterbi_tags(tag_logits, mask)
            predicted_tags = [x for x, y in best_paths
                              ]  # get the tags and ignore the score
            predicted_score = [y for _, y in best_paths]
            output["predicted_tags"] = predicted_tags
            output["predicted_score"] = predicted_score
        else:
            output["predicted_tags"] = torch.argmax(tag_logits, dim=-1)  # pylint: disable=no-member

        if slot_labels is not None:
            if self.use_crf:
                log_likelihood = self.crf(tag_logits, slot_labels,
                                          mask)  # returns log-likelihood
                output[
                    "loss"] = -1.0 * log_likelihood  # add negative log-likelihood as loss

                # Represent viterbi tags as "class probabilities" that we can
                # feed into the metrics
                class_probabilities = tag_logits * 0.
                for i, instance_tags in enumerate(predicted_tags):
                    for j, tag_id in enumerate(instance_tags):
                        class_probabilities[i, j, tag_id] = 1
                self.f1(class_probabilities, slot_labels, mask.float())
            else:
                output["loss"] = sequence_cross_entropy_with_logits(
                    tag_logits, slot_labels, mask)
                self.f1(tag_logits, slot_labels, mask.float())

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metric = self.f1.get_metric(reset)

        results = {}

        if self.metrics_for_each_slot_type:
            results.update(metric)
        else:
            results.update({
                "precision": metric["precision-overall"],
                "precision-ov": metric["precision-ov"],
                "recall": metric["recall-overall"],
                "recall-ov": metric["recall-ov"],
                "f1": metric["f1-measure-overall"],
                "f1-ov": metric["f1-measure-ov"]
            })

        return results
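The adversarial branch of SlotTaggingModel.forward follows a fast-gradient-style recipe: back-propagate the clean loss to the token embeddings, L2-normalize that gradient, scale it by noise_norm, add it only at (open-vocabulary) slot positions, and mix the clean and perturbed losses with noise_loss_prob. A self-contained sketch of that recipe with a toy linear tagger standing in for the model's encoder/CRF stack (all names and sizes below are illustrative):

import torch
import torch.nn as nn
import torch.nn.functional as F

batch, seq_len, dim, num_tags = 2, 5, 16, 4
noise_norm, noise_loss_prob = 1.5, 0.5             # the two hyperparameters used above

embed = nn.Embedding(100, dim)
tagger = nn.Linear(dim, num_tags)                  # toy stand-in for encoder + hidden2tag (+ CRF)

token_ids = torch.randint(0, 100, (batch, seq_len))
labels = torch.randint(0, num_tags, (batch, seq_len))
ov_slot_mask = torch.randint(0, 2, (batch, seq_len))   # 1 = token of an open-vocabulary slot

def tag_loss(embeddings):
    logits = tagger(embeddings)
    return F.cross_entropy(logits.view(-1, num_tags), labels.view(-1))

embeddings = embed(token_ids)
embeddings.retain_grad()                           # we need the gradient w.r.t. a non-leaf tensor
normal_loss = tag_loss(embeddings)
normal_loss.backward(retain_graph=True)

grad = embeddings.grad.detach()
for p in list(embed.parameters()) + list(tagger.parameters()):
    if p.grad is not None:
        p.grad.detach_()
        p.grad.zero_()                             # discard grads from the noise-finding pass

# L2-normalize the gradient, scale it, and perturb only the masked slot tokens.
noise = noise_norm * grad / (grad.norm(p=2, dim=-1, keepdim=True) + 1e-10)
noise = noise * ov_slot_mask.unsqueeze(-1).float()

loss = ((1 - noise_loss_prob) * tag_loss(embeddings)
        + noise_loss_prob * tag_loss(embeddings + noise))
loss.backward()                                    # gradients for the actual parameter update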
Example #23
0
class gru_crf(nn.Module):
    def __init__(self,
                 num_input_features: '(int) number of input features',
                 hidden_size: '(int) number of hidden features; the outputs will also have hidden_size features',
                 num_layers: '(int) number of recursion',
                 dropout_gru,
                 bidirectional: '(bool) if True, use bidirectional GRU',
                 tags: "(dict[int: str]) example: {0:'I', 1:'B', 2:'O', 3:'<PAD>'}",
                 dropout_FCN: '(double)',
                 drop_GRU_out):
        super().__init__()
        self.gru = nn.GRU(input_size = num_input_features, hidden_size = hidden_size, \
                                  num_layers = num_layers, batch_first = True, dropout = dropout_gru, \
                                  bidirectional = bidirectional)
        #self.gru = WeightDropGRU(input_size = num_input_features, hidden_size = hidden_size, \
        #                         num_layers = num_layers, batch_first = True, dropout = dropout_gru, \
        #                         bidirectional = bidirectional, weight_dropout=drop_weight)
        all_transition=allowed_transitions('BIO', tags)
        #self.crf = CRF(num_tags=len(tags), batch_first= True)
        self.linear = nn.Linear(hidden_size*2, hidden_size)
        self.BN = nn.BatchNorm1d(num_layers)
        self.linear2 = nn.Linear(hidden_size, len(tags))
        self.BN2 = nn.BatchNorm1d(num_layers)
        self.crf = ConditionalRandomField(len(tags), all_transition)
        self.dropout = dropout_FCN
        self.drop_GRU_out = drop_GRU_out
        
    def forward(self,
                samples,
                target: '(torch.tensor) shape=(...,) the target tags to be used',
                mask: '(bool tensor) True for non-pad elements'):
        length = samples[1]
        samples = samples[0]
        batch_size, words, _ = samples.size()
        tmp_t = time()
        #print(samples.size())
        tmp_compute = F.dropout(self.gru(samples)[0], p=self.dropout)
        #print('pass inference gru')
        tmp_compute = tmp_compute.view(batch_size, words, -1)
        #print('pass reshape gru')
#         print(f'total GRU time: {time() - tmp_t}')
        index_to_cut = max(length).item()  # get_longest_seq_len(mask)
        # Cut the padded tail beyond the longest sequence in the batch
        tmp_compute = F.dropout(F.relu(self.BN(self.linear(tmp_compute))), p=self.drop_GRU_out)
        tmp_compute = F.relu(self.BN2(self.linear2(tmp_compute)))
        tmp_compute = F.dropout(tmp_compute[:, :index_to_cut, :], p=self.dropout)
        target = target[:, :index_to_cut]
        mask = mask[:, :index_to_cut]
        #print(tmp_compute.size())
        # AllenNLP's CRF returns the log-likelihood of the gold tags,
        # so negate it to obtain the negative log-likelihood loss.
        nll_loss = -self.crf(tmp_compute, target.long(), mask)
#         print(f'total CRF time: {time() - tmp_t}')
        return nll_loss  # /length
    def predict(self, samples, mask):
        length = samples[1]
        samples = samples[0]
        batch_size, words, _ = samples.size()
        tmp_t = time()
        tmp_compute = self.gru(samples)[0].view(batch_size, words, -1)
#         print(f'total GRU time: {time() - tmp_t}')
        index_to_cut = max(length).item()  # get_longest_seq_len(mask)
        # Cut the padded tail beyond the longest sequence in the batch
        tmp_compute = F.relu(self.BN(self.linear(tmp_compute)))
        tmp_compute = F.relu(self.BN2(self.linear2(tmp_compute)))
        tmp_compute = tmp_compute[:, :index_to_cut, :]
        mask = mask[:, :index_to_cut]
        #print(tmp_compute.size())
        tmp_t = time()
        tmp_tags = self.crf.viterbi_tags(tmp_compute,mask)
#         print(f'total CRF prediction time: {time() - tmp_t}')
        return tmp_tags
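gru_crf expects samples to be a (padded_features, lengths) pair and trims the logits, targets, and mask back to the longest real sequence before calling the CRF. A small sketch of that padding-and-trimming bookkeeping in plain PyTorch (names and shapes are illustrative, not taken from the repository):

import torch
from torch.nn.utils.rnn import pad_sequence

# Three feature sequences of different lengths, feature dim 4.
seqs = [torch.randn(n, 4) for n in (5, 3, 2)]
lengths = torch.tensor([s.size(0) for s in seqs])

padded = pad_sequence(seqs, batch_first=True)                     # (batch, 5, 4)

dataset_max_len = 8                                               # targets/mask padded dataset-wide
targets = torch.zeros(len(seqs), dataset_max_len, dtype=torch.long)
mask = torch.arange(dataset_max_len)[None, :] < lengths[:, None]  # True for non-pad positions

# Trim everything back to the longest sequence in this batch, as forward() does.
index_to_cut = int(lengths.max())
targets = targets[:, :index_to_cut]
mask = mask[:, :index_to_cut]
assert padded.size(1) == targets.size(1) == mask.size(1) == index_to_cut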
Example #25
0
class RNNTagger(nn.Module):
    def __init__(self,
                 n_vocab,
                 unigram_embed_size,
                 rnn_unit_type,
                 rnn_bidirection,
                 rnn_batch_first,
                 rnn_n_layers,
                 rnn_hidden_size,
                 mlp_n_layers,
                 mlp_hidden_size,
                 n_labels,
                 use_crf=True,
                 crf_top_k=1,
                 embed_dropout=0.0,
                 rnn_dropout=0.0,
                 mlp_dropout=0.0,
                 pretrained_unigram_embed_size=0,
                 pretrained_embed_usage=ModelUsage.NONE):
        super(RNNTagger, self).__init__()
        self.n_vocab = n_vocab
        self.unigram_embed_size = unigram_embed_size

        self.rnn_unit_type = rnn_unit_type
        self.rnn_bidirection = rnn_bidirection
        self.rnn_batch_first = rnn_batch_first
        self.rnn_n_layers = rnn_n_layers
        self.rnn_hidden_size = rnn_hidden_size

        self.mlp_n_layers = mlp_n_layers
        self.mlp_hidden_size = mlp_hidden_size
        self.n_labels = n_labels
        self.use_crf = use_crf
        self.crf_top_k = crf_top_k

        self.embed_dropout = embed_dropout
        self.rnn_dropout = rnn_dropout
        self.mlp_dropout = mlp_dropout

        self.pretrained_unigram_embed_size = pretrained_unigram_embed_size
        self.pretrained_embed_usage = pretrained_embed_usage

        self.unigram_embed = None
        self.pretrained_unigram_embed = None
        self.rnn = None
        self.mlp = None
        self.crf = None
        self.cross_entropy_loss = None

        print('### Parameters', file=sys.stderr)

        # embeddings layer(s)

        print('# Embedding dropout ratio={}'.format(self.embed_dropout),
              file=sys.stderr)
        self.unigram_embed, self.pretrained_unigram_embed = models.util.construct_embeddings(
            n_vocab, unigram_embed_size, pretrained_unigram_embed_size,
            pretrained_embed_usage)
        if self.pretrained_embed_usage != ModelUsage.NONE:
            print('# Pretrained embedding usage: {}'.format(
                self.pretrained_embed_usage),
                  file=sys.stderr)
        print('# Unigram embedding matrix: W={}'.format(
            self.unigram_embed.weight.shape),
              file=sys.stderr)
        embed_size = self.unigram_embed.weight.shape[1]
        if self.pretrained_unigram_embed is not None:
            if self.pretrained_embed_usage == ModelUsage.CONCAT:
                embed_size += self.pretrained_unigram_embed_size
                print('# Pretrained unigram embedding matrix: W={}'.format(
                    self.pretrained_unigram_embed.weight.shape),
                      file=sys.stderr)

        # recurrent layers

        self.rnn_unit_type = rnn_unit_type
        self.rnn = models.util.construct_RNN(unit_type=rnn_unit_type,
                                             embed_size=embed_size,
                                             hidden_size=rnn_hidden_size,
                                             n_layers=rnn_n_layers,
                                             batch_first=rnn_batch_first,
                                             dropout=rnn_dropout,
                                             bidirectional=rnn_bidirection)
        rnn_output_size = rnn_hidden_size * (2 if rnn_bidirection else 1)

        # MLP

        print('# MLP', file=sys.stderr)
        mlp_in = rnn_output_size
        self.mlp = MLP(input_size=mlp_in,
                       hidden_size=mlp_hidden_size,
                       n_layers=mlp_n_layers,
                       output_size=n_labels,
                       dropout=mlp_dropout,
                       activation=nn.Identity)

        # Inference layer (CRF/softmax)

        if self.use_crf:
            self.crf = ConditionalRandomField(n_labels)
            print('# CRF cost: {}'.format(self.crf.transitions.shape),
                  file=sys.stderr)
        else:
            self.softmax_cross_entropy = nn.CrossEntropyLoss()

    """
    us: batch of unigram sequences
    ls: batch of label sequences
    """

    # unigram and label
    def forward(self, us, ls=None, calculate_loss=True, decode=False):
        lengths = self.extract_lengths(us)
        us, ls = self.pad_features(us, ls)
        xs = self.extract_features(us)
        rs = self.rnn_output(xs, lengths)
        ys = self.mlp(rs)
        loss, ps = self.predict(ys,
                                ls=ls,
                                lengths=lengths,
                                calculate_loss=calculate_loss,
                                decode=decode)
        return loss, ps

    def extract_lengths(self, ts):
        device = ts[0].device
        return torch.tensor([t.shape[0] for t in ts], device=device)

    def pad_features(self, us, ls):
        batch_first = self.rnn_batch_first
        us = pad_sequence(us, batch_first=batch_first)
        ls = pad_sequence(ls, batch_first=batch_first) if ls else None

        return us, ls

    def extract_features(self, us):
        xs = []

        for u in us:
            ue = self.unigram_embed(u)
            if self.pretrained_unigram_embed is not None:
                if self.pretrained_embed_usage == ModelUsage.ADD:
                    pe = self.pretrained_unigram_embed(u)
                    ue = ue + pe
                elif self.pretrained_embed_usage == ModelUsage.CONCAT:
                    pe = self.pretrained_unigram_embed(u)
                    ue = torch.cat((ue, pe), 1)
            ue = F.dropout(ue, p=self.embed_dropout)
            xe = ue
            xs.append(xe)

        if self.rnn_batch_first:
            xs = torch.stack(xs, dim=0)
        else:
            xs = torch.stack(xs, dim=1)

        return xs

    def rnn_output(self, xs, lengths=None):
        if self.rnn_unit_type == 'lstm':
            hs, (hy, cy) = self.rnn(xs, lengths)
        else:
            hs, hy = self.rnn(xs)
        return hs

    def predict(self,
                rs,
                ls=None,
                lengths=None,
                calculate_loss=True,
                decode=False):
        if self.crf:
            return self.predict_crf(rs, ls, lengths, calculate_loss, decode)
        else:
            return self.predict_softmax(rs, ls, calculate_loss)

    def predict_softmax(self, ys, ls=None, calculate_loss=True):
        ps = []
        loss = torch.tensor(0, dtype=torch.float, device=ys.device)
        if ls is None:
            ls = [None] * len(ys)
        for y, l in zip(ys, ls):
            if calculate_loss:
                loss += self.softmax_cross_entropy(y, l)
            ps.append([torch.argmax(yi.data) for yi in y])

        return loss, ps

    def predict_crf(self,
                    hs,
                    ls=None,
                    lengths=None,
                    calculate_loss=True,
                    decode=False):
        device = hs.device
        if lengths is None:
            lengths = torch.tensor([h.shape[0] for h in hs], device=device)
        mask = get_mask_from_sequence_lengths(lengths, max_length=max(lengths))
        if not decode or self.crf_top_k == 1:
            ps = self.crf.viterbi_tags(hs, mask)
            ps, score = zip(*ps)
        else:
            ps = []
            psks = self.crf.viterbi_tags(hs, mask, top_k=self.crf_top_k)
            for psk in psks:
                psk, score = zip(*psk)
                ps.append(psk)

        if calculate_loss:
            log_likelihood = self.crf(hs, ls, mask)
            loss = -1 * log_likelihood / len(lengths)
        else:
            loss = torch.tensor(0.0, dtype=torch.float, device=device)

        return loss, ps

    def decode(self, us):
        with torch.no_grad():
            _, ps = self.forward(us, calculate_loss=False, decode=True)
        return ps
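predict_crf builds the CRF mask from sequence lengths and can ask for the k best paths. A short sketch of that decoding step, assuming an AllenNLP version whose viterbi_tags accepts top_k (as the code above does); the tag count and shapes are made up:

import torch
from allennlp.modules.conditional_random_field import ConditionalRandomField
from allennlp.nn.util import get_mask_from_sequence_lengths

num_tags = 5
crf = ConditionalRandomField(num_tags)

# Two sequences with true lengths 7 and 4 inside a padded batch of length 7.
logits = torch.randn(2, 7, num_tags)
lengths = torch.tensor([7, 4])
mask = get_mask_from_sequence_lengths(lengths, max_length=int(lengths.max()))

# 1-best decoding: one (tag_sequence, viterbi_score) pair per instance.
best = crf.viterbi_tags(logits, mask)
paths, scores = zip(*best)

# k-best decoding: a list of k (tag_sequence, score) pairs per instance.
for instance in crf.viterbi_tags(logits, mask, top_k=3):
    for tags, score in instance:
        print(len(tags), score)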
Ejemplo n.º 26
0
    def __init__(self,
                 n_vocab,
                 unigram_embed_size,
                 rnn_unit_type,
                 rnn_bidirection,
                 rnn_batch_first,
                 rnn_n_layers,
                 rnn_hidden_size,
                 mlp_n_layers,
                 mlp_hidden_size,
                 n_labels,
                 use_crf=True,
                 crf_top_k=1,
                 embed_dropout=0.0,
                 rnn_dropout=0.0,
                 mlp_dropout=0.0,
                 pretrained_unigram_embed_size=0,
                 pretrained_embed_usage=ModelUsage.NONE):
        super(RNNTagger, self).__init__()
        self.n_vocab = n_vocab
        self.unigram_embed_size = unigram_embed_size

        self.rnn_unit_type = rnn_unit_type
        self.rnn_bidirection = rnn_bidirection
        self.rnn_batch_first = rnn_batch_first
        self.rnn_n_layers = rnn_n_layers
        self.rnn_hidden_size = rnn_hidden_size

        self.mlp_n_layers = mlp_n_layers
        self.mlp_hidden_size = mlp_hidden_size
        self.n_labels = n_labels
        self.use_crf = use_crf
        self.crf_top_k = crf_top_k

        self.embed_dropout = embed_dropout
        self.rnn_dropout = rnn_dropout
        self.mlp_dropout = mlp_dropout

        self.pretrained_unigram_embed_size = pretrained_unigram_embed_size
        self.pretrained_embed_usage = pretrained_embed_usage

        self.unigram_embed = None
        self.pretrained_unigram_embed = None
        self.rnn = None
        self.mlp = None
        self.crf = None
        self.cross_entropy_loss = None

        print('### Parameters', file=sys.stderr)

        # embeddings layer(s)

        print('# Embedding dropout ratio={}'.format(self.embed_dropout),
              file=sys.stderr)
        self.unigram_embed, self.pretrained_unigram_embed = models.util.construct_embeddings(
            n_vocab, unigram_embed_size, pretrained_unigram_embed_size,
            pretrained_embed_usage)
        if self.pretrained_embed_usage != ModelUsage.NONE:
            print('# Pretrained embedding usage: {}'.format(
                self.pretrained_embed_usage),
                  file=sys.stderr)
        print('# Unigram embedding matrix: W={}'.format(
            self.unigram_embed.weight.shape),
              file=sys.stderr)
        embed_size = self.unigram_embed.weight.shape[1]
        if self.pretrained_unigram_embed is not None:
            if self.pretrained_embed_usage == ModelUsage.CONCAT:
                embed_size += self.pretrained_unigram_embed_size
                print('# Pretrained unigram embedding matrix: W={}'.format(
                    self.pretrained_unigram_embed.weight.shape),
                      file=sys.stderr)

        # recurrent layers

        self.rnn = models.util.construct_RNN(unit_type=rnn_unit_type,
                                             embed_size=embed_size,
                                             hidden_size=rnn_hidden_size,
                                             n_layers=rnn_n_layers,
                                             batch_first=rnn_batch_first,
                                             dropout=rnn_dropout,
                                             bidirectional=rnn_bidirection)
        rnn_output_size = rnn_hidden_size * (2 if rnn_bidirection else 1)

        # MLP

        print('# MLP', file=sys.stderr)
        mlp_in = rnn_output_size
        self.mlp = MLP(input_size=mlp_in,
                       hidden_size=mlp_hidden_size,
                       n_layers=mlp_n_layers,
                       output_size=n_labels,
                       dropout=mlp_dropout,
                       activation=nn.Identity)

        # Inference layer (CRF/softmax)

        if self.use_crf:
            self.crf = ConditionalRandomField(n_labels)
            print('# CRF cost: {}'.format(self.crf.transitions.shape),
                  file=sys.stderr)
        else:
            self.softmax_cross_entropy = nn.CrossEntropyLoss()
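As a quick check of what the '# CRF cost' line prints: AllenNLP's ConditionalRandomField (which the viterbi_tags/top_k usage above suggests is the class in play) stores an n_labels x n_labels transition matrix, plus separate start/end transition vectors when include_start_end_transitions is left at its default of True.

from allennlp.modules.conditional_random_field import ConditionalRandomField

crf = ConditionalRandomField(num_tags=5)
print(crf.transitions.shape)        # torch.Size([5, 5])
print(crf.start_transitions.shape)  # torch.Size([5])
print(crf.end_transitions.shape)    # torch.Size([5])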
Ejemplo n.º 27
0
class SpanScorerCRF(nn.Module):
    '''
    Span extractor
    '''
    def __init__(
        self,
        input_dim,
        num_tags,
        low_val=-5,
        high_val=5,
        incl_start_end=True,
        name=None,
    ):
        super(SpanScorerCRF, self).__init__()

        self.input_dim = input_dim
        self.num_tags = num_tags
        self.low_val = low_val
        self.high_val = high_val
        self.incl_start_end = incl_start_end
        self.name = name

        self.span_to_seq, self.seq_to_span = label_map(num_tags)

        self.num_tags_seq = len(self.seq_to_span)
        self.num_tags_span = len(self.span_to_seq)

        # Linear projection layer
        self.projection = nn.Linear(input_dim, self.num_tags_seq)

        # Create event-specific CRF
        self.crf = ConditionalRandomField(
            num_tags=self.num_tags_seq,
            include_start_end_transitions=incl_start_end)

    def forward(self,
                seq_tensor,
                seq_mask,
                span_map,
                span_indices,
                verbose=False):
        '''
        Calculate logits
        '''
        # Dimensionality
        batch_size, max_seq_len, input_dim = tuple(seq_tensor.shape)

        # Project input tensor sequence to logits
        seq_scores = self.projection(seq_tensor)
        '''
        Decoding sequence tags
        '''

        # Viterbi decode
        best_paths = self.crf.viterbi_tags(logits=seq_scores, mask=seq_mask)
        seq_pred, score = zip(*best_paths)
        seq_pred = list(seq_pred)
        '''
        Convert sequence tags to span predictions
        '''
        # Get spans from sequence tags
        #   Converts list of list of predicted label indices to
        #   tensor of size (batch_size, num_spans)
        span_pred = seq_tags_to_spans(seq_tags=seq_pred,
                                      span_map=span_map,
                                      seq_tag_map=self.seq_to_span)

        span_pred = span_pred.to(seq_tensor.device)

        # Get scores from labels
        span_pred = F.one_hot(span_pred,
                              num_classes=self.num_tags_span).float()

        #print('crf seq  pos: ', sum([int(w > 0) for W in seq_pred for w in W]))
        #print('crf span pos: ', (span_pred > 0).sum().tolist())
        #print(span_pred)
        return (seq_scores, span_pred)

    def loss(self, span_labels, seq_scores, seq_mask, span_map):

        batch_size, max_len, embed_dim = tuple(seq_scores.shape)

        seq_labels = get_seq_labels(span_labels=span_labels,
                                    span_map=span_map,
                                    span_to_seq=self.span_to_seq,
                                    max_len=max_len)

        seq_mask[:, 0] = True

        # Get loss (negative log likelihood)
        loss = -self.crf(inputs=seq_scores, tags=seq_labels, mask=seq_mask)
        #print('loss', loss)

        return loss
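The seq_mask[:, 0] = True line above matters because AllenNLP's CRF likelihood assumes right-padded masks (the first timestep's emission scores are always consumed). A standalone sketch of the same loss pattern, with made-up shapes:

import torch
from allennlp.modules.conditional_random_field import ConditionalRandomField

crf = ConditionalRandomField(num_tags=3)
seq_scores = torch.randn(2, 6, 3)                    # projected logits
seq_labels = torch.randint(0, 3, (2, 6))
seq_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                         [0, 1, 1, 0, 0, 0]]).bool()

seq_mask[:, 0] = True                                # guard against a masked first step
loss = -crf(inputs=seq_scores, tags=seq_labels, mask=seq_mask)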
Ejemplo n.º 28
0
def default_crf() -> ConditionalRandomField:
    include_start_end_transitions = True
    constraints = allowed_transitions('BIO', {0: 'O', 1: 'B', 2: 'I'})
    return ConditionalRandomField(3, constraints,
                                  include_start_end_transitions)
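A small sanity check of the constrained CRF above (tag ids 0=O, 1=B, 2=I): with the BIO constraints in place, Viterbi decoding should never emit an I tag that does not follow a B or an I. The snippet assumes default_crf and its AllenNLP imports are in scope.

import torch

crf = default_crf()
logits = torch.randn(4, 8, 3)
mask = torch.ones(4, 8, dtype=torch.bool)

for tags, _score in crf.viterbi_tags(logits, mask):
    # Walk consecutive tag pairs, treating the position before the sequence as O.
    for prev, curr in zip([0] + tags, tags):
        assert not (prev == 0 and curr == 2), "I must follow B or I"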
Ejemplo n.º 29
0
class BertMiddleModel(Model):
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        seq2seq_encoder: Seq2SeqEncoder,
        feedforward_encoder: Seq2SeqEncoder,
        dropout: float = 0.0,
        use_crf: bool = False,
        pos_weight: float = 1.0,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
    ):

        super(BertMiddleModel, self).__init__(vocab, regularizer)
        self._vocabulary = vocab
        self._text_field_embedder = text_field_embedder
        self._seq2seq_encoder = seq2seq_encoder
        self._dropout = torch.nn.Dropout(p=dropout)

        self._feedforward_encoder = feedforward_encoder
        self._classifier_input_dim = feedforward_encoder.get_output_dim()

        self._classification_layer = torch.nn.Linear(
            self._classifier_input_dim, 2)

        self._use_crf = use_crf

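        # NOTE: pos_weight is treated as the positive-class fraction and must
        # lie strictly between 0 and 1; the default of 1.0 would divide by zero below.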
        self._pos_weight = torch.Tensor([1 / (1 - pos_weight), 1 / pos_weight])
        self._pos_weight = torch.nn.Parameter(self._pos_weight /
                                              self._pos_weight.min())
        self._pos_weight.requires_grad = False

        if use_crf:
            self._crf = ConditionalRandomField(num_tags=2)

        self._token_prf = F1Measure(1)

        initializer(self)

    def forward(self,
                document,
                query=None,
                rationale=None,
                metadata=None,
                label=None) -> Dict[str, Any]:
        embedded_text = self._text_field_embedder(document)
        mask = util.get_text_field_mask(document).float()

        embedded_text = self._seq2seq_encoder(embedded_text, mask=mask)
        embedded_text = self._dropout(self._feedforward_encoder(embedded_text))

        logits = self._classification_layer(embedded_text)

        if self._use_crf:
            best_paths = self._crf.viterbi_tags(logits, mask=document["mask"])
            best_paths = [b[0] for b in best_paths]
            best_paths = [
                x + [0] * (logits.shape[1] - len(x)) for x in best_paths
            ]
            best_paths = torch.Tensor(best_paths).to(
                logits.device) * document["mask"]
        else:
            best_paths = (logits[:, :, 1] > 0.5).long() * document["mask"]

        output_dict = {}

        output_dict["predicted_rationales"] = best_paths
        output_dict["mask"] = document["mask"]
        output_dict["metadata"] = metadata

        if rationale is not None:
            if self._use_crf:
                output_dict["loss"] = -self._crf(logits, rationale,
                                                 document["mask"])
            else:
                output_dict["loss"] = ((F.cross_entropy(
                    logits.view(-1, logits.shape[-1]),
                    rationale.view(-1),
                    reduction="none",
                    weight=self._pos_weight,
                ) * document["mask"].view(-1)).sum(-1).mean())

            best_paths = best_paths.unsqueeze(-1)
            best_paths = torch.cat([1 - best_paths, best_paths], dim=-1)
            self._token_prf(best_paths, rationale, document["mask"])
        return output_dict

    def extract_rationale(self, output_dict):
        rationales = []
        sentences = [x["tokens"] for x in output_dict["metadata"]]
        predicted_rationales = output_dict["predicted_rationales"].cpu(
        ).data.numpy()
        for path, words in zip(predicted_rationales, sentences):
            path = list(path)[:len(words)]
            words = [x.text for x in words]
            starts, ends = [], []
            path.append(0)
            # Prepend a sentinel 0 so a rationale starting at token 0 is also
            # detected; a 0 -> 1 transition opens a span and 1 -> 0 closes it
            # (end index exclusive).
            padded = [0] + path
            for i in range(1, len(padded)):
                if padded[i - 1:i + 1] == [0, 1]:
                    starts.append(i - 1)
                if padded[i - 1:i + 1] == [1, 0]:
                    ends.append(i - 1)

            assert len(starts) == len(ends)
            spans = list(zip(starts, ends))

            rationales.append({
                "document":
                " ".join([w for i, w in zip(path, words) if i == 1]),
                "spans": [{
                    "span": (s, e),
                    "value": 1
                } for s, e in spans],
                "metadata":
                None,
            })

        return rationales

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metrics = self._token_prf.get_metric(reset)
        return dict(zip(["p", "r", "f1"], metrics))

    def decode(self, output_dict):
        rationales = self.extract_rationale(output_dict)
        new_output_dict = {}

        new_output_dict['rationale'] = rationales
        new_output_dict['document'] = [r['document'] for r in rationales]

        if 'query' in output_dict['metadata'][0]:
            output_dict['query'] = [
                m['query'] for m in output_dict['metadata']
            ]

        for m in output_dict["metadata"]:
            if 'convert_tokens_to_instance' in m:
                del m["convert_tokens_to_instance"]

        new_output_dict['label'] = [
            m['label'] for m in output_dict['metadata']
        ]
        new_output_dict['metadata'] = output_dict['metadata']

        return new_output_dict
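For reference, a standalone sketch of the decode-and-pad pattern used in forward above: viterbi_tags returns variable-length Python lists, which are zero-padded back to the logits length and re-masked before being stacked (shapes here are made up).

import torch
from allennlp.modules.conditional_random_field import ConditionalRandomField

crf = ConditionalRandomField(num_tags=2)
logits = torch.randn(3, 7, 2)
mask = torch.tensor([[1] * 7, [1] * 5 + [0] * 2, [1] * 4 + [0] * 3]).bool()

# One variable-length tag list per batch element.
paths = [tags for tags, _score in crf.viterbi_tags(logits, mask)]
# Zero-pad to the full sequence length, then zero out padded positions.
paths = [p + [0] * (logits.shape[1] - len(p)) for p in paths]
rationales = torch.tensor(paths, device=logits.device) * mask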
Ejemplo n.º 30
0
class SeqClassificationModel(Model):
    """
    Question answering model where answers are sentences
    """
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        use_sep: bool = True,
        with_crf: bool = False,
        self_attn: Seq2SeqEncoder = None,
        bert_dropout: float = 0.1,
        sci_sum: bool = False,
        additional_feature_size: int = 0,
    ) -> None:
        super(SeqClassificationModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.vocab = vocab
        self.use_sep = use_sep
        self.with_crf = with_crf
        self.sci_sum = sci_sum
        self.self_attn = self_attn
        self.additional_feature_size = additional_feature_size

        self.dropout = torch.nn.Dropout(p=bert_dropout)

        # define loss
        if self.sci_sum:
            self.loss = torch.nn.MSELoss(
                reduction='none')  # labels are rouge scores
            self.labels_are_scores = True
            self.num_labels = 1
        else:
            self.loss = torch.nn.CrossEntropyLoss(ignore_index=-1,
                                                  reduction='none')
            self.labels_are_scores = False
            self.num_labels = self.vocab.get_vocab_size(namespace='labels')
            # define accuracy metrics
            self.label_accuracy = CategoricalAccuracy()
            self.all_f1_metrics = FBetaMeasure(beta=1.0, average='micro')
            self.label_f1_metrics = {}

            # define F1 metrics per label
            for label_index in range(self.num_labels):
                label_name = self.vocab.get_token_from_index(
                    namespace='labels', index=label_index)
                self.label_f1_metrics[label_name] = F1Measure(label_index)

        encoded_senetence_dim = text_field_embedder._token_embedders[
            'bert'].output_dim

        ff_in_dim = encoded_senetence_dim if self.use_sep else self_attn.get_output_dim(
        )
        ff_in_dim += self.additional_feature_size

        self.time_distributed_aggregate_feedforward = TimeDistributed(
            Linear(ff_in_dim, self.num_labels))

        if self.with_crf:
            self.crf = ConditionalRandomField(
                self.num_labels,
                constraints=None,
                include_start_end_transitions=True)

    def forward(
        self,  # type: ignore
        sentences: torch.LongTensor,
        labels: torch.IntTensor = None,
        confidences: torch.Tensor = None,
        additional_features: torch.Tensor = None,
    ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        TODO: add description

        Returns
        -------
        An output dictionary consisting of:
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        # ===========================================================================================================
        # Layer 1: For each sentence, participant pair: create a Glove embedding for each token
        # Input: sentences
        # Output: embedded_sentences

        # embedded_sentences: batch_size, num_sentences, sentence_length, embedding_size
        embedded_sentences = self.text_field_embedder(sentences)
        mask = get_text_field_mask(sentences, num_wrapping_dims=1).float()
        batch_size, num_sentences, _, _ = embedded_sentences.size()

        if self.use_sep:
            # The following code collects vectors of the SEP tokens from all the examples in the batch,
            # and arrange them in one list. It does the same for the labels and confidences.
            # TODO: replace 103 with '[SEP]'
            sentences_mask = sentences[
                'bert'] == 103  # mask for all the SEP tokens in the batch
            embedded_sentences = embedded_sentences[
                sentences_mask]  # given batch_size x num_sentences_per_example x sent_len x vector_len
            # returns num_sentences_per_batch x vector_len
            assert embedded_sentences.dim() == 2
            num_sentences = embedded_sentences.shape[0]
            # for the rest of the code in this model to work, think of the data we have as one example
            # with so many sentences and a batch of size 1
            batch_size = 1
            embedded_sentences = embedded_sentences.unsqueeze(dim=0)
            embedded_sentences = self.dropout(embedded_sentences)

            if labels is not None:
                if self.labels_are_scores:
                    labels_mask = labels != 0.0  # mask for all the labels in the batch (no padding)
                else:
                    labels_mask = labels != -1  # mask for all the labels in the batch (no padding)

                labels = labels[
                    labels_mask]  # given batch_size x num_sentences_per_example return num_sentences_per_batch
                assert labels.dim() == 1
                if confidences is not None:
                    confidences = confidences[labels_mask]
                    assert confidences.dim() == 1
                if additional_features is not None:
                    additional_features = additional_features[labels_mask]
                    assert additional_features.dim() == 2

                num_labels = labels.shape[0]
                if num_labels != num_sentences:  # bert truncates long sentences, so some of the SEP tokens might be gone
                    assert num_labels > num_sentences  # there can only be more labels than (truncated) sentences, never fewer
                    logger.warning(
                        f'Found {num_labels} labels but {num_sentences} sentences'
                    )
                    labels = labels[:
                                    num_sentences]  # Ignore some labels. This is ok for training but bad for testing.
                    # We are ignoring this problem for now.
                    # TODO: fix, at least for testing

                # do the same for `confidences`
                if confidences is not None:
                    num_confidences = confidences.shape[0]
                    if num_confidences != num_sentences:
                        assert num_confidences > num_sentences
                        confidences = confidences[:num_sentences]

                # and for `additional_features`
                if additional_features is not None:
                    num_additional_features = additional_features.shape[0]
                    if num_additional_features != num_sentences:
                        assert num_additional_features > num_sentences
                        additional_features = additional_features[:
                                                                  num_sentences]

                # similar to `embedded_sentences`, add an additional dimension that corresponds to batch_size=1
                labels = labels.unsqueeze(dim=0)
                if confidences is not None:
                    confidences = confidences.unsqueeze(dim=0)
                if additional_features is not None:
                    additional_features = additional_features.unsqueeze(dim=0)
        else:
            # ['CLS'] token
            embedded_sentences = embedded_sentences[:, :, 0, :]
            embedded_sentences = self.dropout(embedded_sentences)
            batch_size, num_sentences, _ = embedded_sentences.size()
            sent_mask = (mask.sum(dim=2) != 0)
            embedded_sentences = self.self_attn(embedded_sentences, sent_mask)

        if additional_features is not None:
            embedded_sentences = torch.cat(
                (embedded_sentences, additional_features), dim=-1)

        label_logits = self.time_distributed_aggregate_feedforward(
            embedded_sentences)
        # label_logits: batch_size, num_sentences, num_labels

        if self.labels_are_scores:
            label_probs = label_logits
        else:
            label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        # Create output dictionary for the trainer
        # Compute loss and epoch metrics
        output_dict = {"action_probs": label_probs}

        # =====================================================================

        if self.with_crf:
            # Layer 4 = CRF layer across labels of sentences in an abstract
            mask_sentences = (labels != -1)
            best_paths = self.crf.viterbi_tags(label_logits, mask_sentences)
            # Just get the tags and ignore the scores.
            predicted_labels = [x for x, y in best_paths]
            # print(f"len(predicted_labels):{len(predicted_labels)}, (predicted_labels):{predicted_labels}")

            label_loss = 0.0
        if labels is not None:
            # Compute cross entropy loss
            flattened_logits = label_logits.view((batch_size * num_sentences),
                                                 self.num_labels)
            flattened_gold = labels.contiguous().view(-1)

            if not self.with_crf:
                label_loss = self.loss(flattened_logits.squeeze(),
                                       flattened_gold)
                if confidences is not None:
                    label_loss = label_loss * confidences.type_as(
                        label_loss).view(-1)
                label_loss = label_loss.mean()
                flattened_probs = torch.softmax(flattened_logits, dim=-1)
            else:
                clamped_labels = torch.clamp(labels, min=0)
                log_likelihood = self.crf(label_logits, clamped_labels,
                                          mask_sentences)
                label_loss = -log_likelihood
                # compute categorical accuracy
                crf_label_probs = label_logits * 0.
                for i, instance_labels in enumerate(predicted_labels):
                    for j, label_id in enumerate(instance_labels):
                        crf_label_probs[i, j, label_id] = 1
                flattened_probs = crf_label_probs.view(
                    (batch_size * num_sentences), self.num_labels)

            if not self.labels_are_scores:
                evaluation_mask = (flattened_gold != -1)
                self.label_accuracy(flattened_probs.float().contiguous(),
                                    flattened_gold.squeeze(-1),
                                    mask=evaluation_mask)

                self.all_f1_metrics(flattened_probs,
                                    flattened_gold,
                                    mask=evaluation_mask)

                # compute F1 per label
                for label_index in range(self.num_labels):
                    label_name = self.vocab.get_token_from_index(
                        namespace='labels', index=label_index)
                    metric = self.label_f1_metrics[label_name]
                    metric(flattened_probs,
                           flattened_gold,
                           mask=evaluation_mask)

        if labels is not None:
            output_dict["loss"] = label_loss
        output_dict['action_logits'] = label_logits
        return output_dict

    def get_metrics(self, reset: bool = False):
        metric_dict = {}

        if not self.labels_are_scores:
            type_accuracy = self.label_accuracy.get_metric(reset)
            metric_dict['acc'] = type_accuracy
            type_f1 = self.all_f1_metrics.get_metric(reset)
            metric_dict['F1'] = type_f1['fscore']

            average_F1 = 0.0
            for name, metric in self.label_f1_metrics.items():
                metric_val = metric.get_metric(reset)
                metric_dict[name + 'F'] = metric_val[2]
                average_F1 += metric_val[2]

            average_F1 /= len(self.label_f1_metrics)
            metric_dict['avgF'] = average_F1

        return metric_dict
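Finally, a minimal illustration of the [SEP]-gathering step in forward above (103 is the [SEP] id in the vocabulary this model was written for; adjust for your tokenizer): boolean indexing flattens every [SEP] position in the batch into a single matrix of sentence vectors, which is then treated as one example with batch size 1.

import torch

hidden_dim = 4
token_ids = torch.tensor([[101, 7, 8, 103, 9, 103],
                          [101, 5, 103, 0, 0, 0]])
embedded = torch.randn(*token_ids.shape, hidden_dim)

sep_mask = token_ids == 103                   # batch x seq_len
sentence_vectors = embedded[sep_mask]         # num_sep_tokens x hidden_dim
assert sentence_vectors.shape == (3, hidden_dim)

# The model then treats these as one "example" containing many sentences.
sentence_vectors = sentence_vectors.unsqueeze(0)   # 1 x num_sep_tokens x hidden_dim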