class ImageCaptioningDecoder(nn.Module):
    def __init__(self,
                 vocab_size: int = 70,
                 encoder_dim: int = 512,
                 embedding_dim: int = 64,
                 attention_dim: int = 64,
                 decoder_dim: int = 64):
        super(ImageCaptioningDecoder, self).__init__()
        
        self._vocab_size = vocab_size
        self._encoder_dim = encoder_dim
        self._embedding_dim = embedding_dim
        self._attention_dim = attention_dim
        self._decoder_dim = decoder_dim
        
        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = ImageCaptioningAttention(self._encoder_dim, self._decoder_dim, self._attention_dim)
        self._decoder_cell = nn.LSTMCell(self._embedding.get_output_dim() + self._attention.get_output_dim(), self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)
        
    def forward(self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor, predicted_indices: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim), e.g. (64, 64) for a batch of 64
        embedding = self._embedding(predicted_indices).float().view(-1, self._embedding_dim)

        # Shape: (batch_size, encoder_dim) (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        # TODO: change this so teacher forcing is not used at every step
        # Shape: (batch_size, decoder_dim) (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1), (h, c))
        
        # Get output predictions (one per character in vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights
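
# Rough usage sketch (not from the original source): drives the decoder above in a
# greedy loop. All shapes, the start-token index 0, and the step count are
# assumptions, and ImageCaptioningAttention must be available as used in __init__.
def _greedy_decode_sketch(decoder: ImageCaptioningDecoder,
                          x: torch.Tensor,
                          max_steps: int = 20) -> torch.Tensor:
    batch_size = x.size(0)
    h = x.new_zeros(batch_size, decoder._decoder_dim)    # initial LSTM hidden state
    c = x.new_zeros(batch_size, decoder._decoder_dim)    # initial LSTM cell state
    tokens = x.new_zeros(batch_size, dtype=torch.long)   # assumed start-token index 0
    outputs = []
    for _ in range(max_steps):
        h, c, preds, _ = decoder(x, h, c, tokens)
        tokens = preds.argmax(dim=-1)                    # greedy choice feeds the next step
        outputs.append(tokens)
    return torch.stack(outputs, dim=1)                   # (batch_size, max_steps)
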
class MultiscaleDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(MultiscaleDecoder, self).__init__(vocab=vocab)

        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._dropout = nn.Dropout(0.1)
        # Output size of state cell must be decoder dim since state is transformed by the state cell
        self._state_cell = nn.GRUCell(self._embedding.get_output_dim(),
                                      self._decoder_dim)

        self._attention = attention
        self._decoder_cell = nn.GRUCell(self._attention.get_output_dim(),
                                        self._decoder_dim)

        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(
        self, x: torch.Tensor, h: torch.Tensor,
        predicted_indices: torch.Tensor, sum_attention_weights_0: torch.Tensor,
        sum_attention_weights_1: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)
        embedding = self._dropout(embedding)

        # Shape: (batch_size, decoder_dim)
        h = self._state_cell(embedding, h)

        # Shapes: (batch_size, encoder_dim), (batch_size, h * w, 1),
        # plus the two updated running attention-weight sums
        attention, attention_weights, sum_attention_weights_0, sum_attention_weights_1 = self._attention(
            x, h, sum_attention_weights_0, sum_attention_weights_1)

        # TODO: change this so teacher forcing is not used at every step
        # Shape: (batch_size, decoder_dim)
        h = self._decoder_cell(attention, h)

        # Get output predictions (one per character in vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, preds, attention_weights, sum_attention_weights_0, sum_attention_weights_1

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
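
# Rough usage sketch (not from the original source): shows how the per-scale
# attention-weight sums are threaded through a greedy decoding loop. Argument
# names, shapes, and the step count are assumptions; the sums follow the
# (batch_size, h * w, 1) convention used in the comments above.
def _multiscale_greedy_decode_sketch(decoder: MultiscaleDecoder,
                                     x: torch.Tensor,
                                     h: torch.Tensor,
                                     start_tokens: torch.Tensor,
                                     num_pixels_0: int,
                                     num_pixels_1: int,
                                     max_steps: int = 20) -> torch.Tensor:
    batch_size = x.size(0)
    sum_w0 = x.new_zeros(batch_size, num_pixels_0, 1)    # running attention mass, scale 0
    sum_w1 = x.new_zeros(batch_size, num_pixels_1, 1)    # running attention mass, scale 1
    tokens = start_tokens
    outputs = []
    for _ in range(max_steps):
        h, preds, weights, sum_w0, sum_w1 = decoder(x, h, tokens, sum_w0, sum_w1)
        tokens = preds.argmax(dim=-1)                     # greedy choice feeds the next step
        outputs.append(tokens)
    return torch.stack(outputs, dim=1)                    # (batch_size, max_steps)
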
Example #3
    def __init__(
        self,
        vocab: Vocabulary,
        text_field_embedder: TextFieldEmbedder,
        span_extractor: SpanExtractor,
        encoder: Seq2SeqEncoder,
        feedforward: FeedForward = None,
        pos_tag_embedding: Embedding = None,
        initializer: InitializerApplicator = InitializerApplicator(),
        regularizer: Optional[RegularizerApplicator] = None,
        evalb_directory_path: str = DEFAULT_EVALB_DIR,
    ) -> None:
        super().__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.span_extractor = span_extractor
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.feedforward_layer = TimeDistributed(
            feedforward) if feedforward else None
        self.pos_tag_embedding = pos_tag_embedding or None
        if feedforward is not None:
            output_dim = feedforward.get_output_dim()
        else:
            output_dim = span_extractor.get_output_dim()

        self.tag_projection_layer = TimeDistributed(
            Linear(output_dim, self.num_classes))

        representation_dim = text_field_embedder.get_output_dim()
        if pos_tag_embedding is not None:
            representation_dim += pos_tag_embedding.get_output_dim()
        check_dimensions_match(
            representation_dim,
            encoder.get_input_dim(),
            "representation dim (tokens + optional POS tags)",
            "encoder input dim",
        )
        check_dimensions_match(
            encoder.get_output_dim(),
            span_extractor.get_input_dim(),
            "encoder input dim",
            "span extractor input dim",
        )
        if feedforward is not None:
            check_dimensions_match(
                span_extractor.get_output_dim(),
                feedforward.get_input_dim(),
                "span extractor output dim",
                "feedforward input dim",
            )

        self.tag_accuracy = CategoricalAccuracy()

        if evalb_directory_path is not None:
            self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
        else:
            self._evalb_score = None
        initializer(self)
Example #4
    def __init__(self,
                 vocab,
                 embedding_in: Embedding,
                 embedding_out: Embedding = None,
                 cuda_device=-1,
                 dropout: float = 0.3):
        super().__init__(vocab)
        self.embedding_in = embedding_in        
        self.embedding_out = embedding_out
        # Size of the 'tags_in' vocabulary, used as the output dimension of the projection below
        self.emb_dimension = self.vocab.get_vocab_size('tags_in')

        self.linear1 = nn.Linear(in_features=embedding_in.get_output_dim(), out_features=self.emb_dimension)
        self.dropout = nn.Dropout(dropout)
        self.bn = nn.BatchNorm1d(self.emb_dimension)
        self.init_emb()
class ImageCaptioningDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(ImageCaptioningDecoder, self).__init__(vocab=vocab)

        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = attention
        self._decoder_cell = nn.LSTMCell(
            self._embedding.get_output_dim() +
            self._attention.get_output_dim(), self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(
        self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor,
        predicted_indices: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)

        # Shape: (batch_size, encoder_dim) (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        # TODO: change this so teacher forcing is not used at every step
        # Shape: (batch_size, decoder_dim) (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1),
                                  (h, c))

        # Get output predictions (one per character in vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 span_extractor: SpanExtractor,
                 encoder: Seq2SeqEncoder,
                 feedforward_layer: FeedForward = None,
                 pos_tag_embedding: Embedding = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 evalb_directory_path: str = None) -> None:
        super(SpanConstituencyParser, self).__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.span_extractor = span_extractor
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.feedforward_layer = TimeDistributed(feedforward_layer) if feedforward_layer else None
        self.pos_tag_embedding = pos_tag_embedding or None
        if feedforward_layer is not None:
            output_dim = feedforward_layer.get_output_dim()
        else:
            output_dim = span_extractor.get_output_dim()

        self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

        representation_dim = text_field_embedder.get_output_dim()
        if pos_tag_embedding is not None:
            representation_dim += pos_tag_embedding.get_output_dim()
        check_dimensions_match(representation_dim,
                               encoder.get_input_dim(),
                               "representation dim (tokens + optional POS tags)",
                               "encoder input dim")
        check_dimensions_match(encoder.get_output_dim(),
                               span_extractor.get_input_dim(),
                               "encoder input dim",
                               "span extractor input dim")
        if feedforward_layer is not None:
            check_dimensions_match(span_extractor.get_output_dim(),
                                   feedforward_layer.get_input_dim(),
                                   "span extractor output dim",
                                   "feedforward input dim")

        self.tag_accuracy = CategoricalAccuracy()

        if evalb_directory_path is not None:
            self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
        else:
            self._evalb_score = None
        initializer(self)
Example #7
    def __init__(self,
                 vocab: Vocabulary,
                 kg_model: Model = None,
                 entity_embedding: Embedding = None,
                 concat_entity_embedder: EntityEmbedder = None,
                 contextual_embedding_dim: int = None,
                 span_encoder_config: Dict[str, int] = None,
                 margin: float = 0.2,
                 decode_threshold: float = 0.0,
                 loss_type: str = 'margin',
                 max_sequence_length: int = 512,
                 dropout: float = 0.1,
                 output_feed_forward_hidden_dim: int = 100,
                 initializer_range: float = 0.02,
                 include_null_embedding_in_dot_attention: bool = False,
                 namespace: str = 'entity',
                 regularizer: RegularizerApplicator = None):

        super().__init__(vocab,
                         margin=margin,
                         decode_threshold=decode_threshold,
                         loss_type=loss_type,
                         namespace=namespace,
                         regularizer=regularizer)

        num_embeddings_passed = sum(
            [kg_model is not None, entity_embedding is not None, concat_entity_embedder is not None]
        )
        if num_embeddings_passed != 1:
            raise ValueError("Linking model needs either a kg factorisation model or an entity embedding.")

        elif kg_model is not None:
            entity_embedding = kg_model.get_entity_embedding()
            entity_embedding_dim = entity_embedding.embedding_dim

        elif entity_embedding is not None:
            entity_embedding_dim = entity_embedding.get_output_dim()

        elif concat_entity_embedder is not None:
            entity_embedding_dim = concat_entity_embedder.get_output_dim()
            set_requires_grad(concat_entity_embedder, False)
            entity_embedding = concat_entity_embedder

        if loss_type == 'margin':
            weighted_entity_threshold = decode_threshold
        else:
            weighted_entity_threshold = None

        null_entity_id = self.vocab.get_token_index('@@NULL@@', namespace)
        assert null_entity_id != self.vocab.get_token_index('@@UNKNOWN@@', namespace)

        self.disambiguator = EntityDisambiguator(
            contextual_embedding_dim,
            entity_embedding_dim=entity_embedding_dim,
            entity_embeddings=entity_embedding,
            max_sequence_length=max_sequence_length,
            span_encoder_config=span_encoder_config,
            dropout=dropout,
            output_feed_forward_hidden_dim=output_feed_forward_hidden_dim,
            initializer_range=initializer_range,
            weighted_entity_threshold=weighted_entity_threshold,
            include_null_embedding_in_dot_attention=include_null_embedding_in_dot_attention,
            null_entity_id=null_entity_id)
# Create the text feature vectors
source_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size(namespace="tokens"), embedding_dim=512)
source_text_embedder = BasicTextFieldEmbedder(
    token_embedders={"tokens": source_embedding})
target_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size(namespace="target_tokens"),
    embedding_dim=512)

# Sequence-to-sequence model (here a Transformer encoder and decoder)
encoder = PytorchTransformer(input_dim=source_text_embedder.get_output_dim(),
                             feedforward_hidden_dim=512,
                             num_layers=4,
                             num_attention_heads=8)
decoder_net = StackedSelfAttentionDecoderNet(
    decoding_dim=target_embedding.get_output_dim(),
    target_embedding_dim=target_embedding.get_output_dim(),
    feedforward_hidden_dim=512,
    num_layers=4,
    num_attention_heads=8)
decoder = AutoRegressiveSeqDecoder(vocab=vocab,
                                   decoder_net=decoder_net,
                                   max_decoding_steps=128,
                                   target_embedder=target_embedding,
                                   beam_size=args.beam_size,
                                   target_namespace='target_tokens')
model = ComposedSeq2Seq(vocab=vocab,
                        source_text_embedder=source_text_embedder,
                        encoder=encoder,
                        decoder=decoder)
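
# Rough usage sketch (not from the original source): one manual training step with the
# model built above. `batch` is assumed to be a dict produced by an AllenNLP data
# loader, containing "source_tokens" and "target_tokens"; when targets are provided,
# the model's output dict includes a "loss" entry.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
model.train()
output_dict = model(**batch)
loss = output_dict["loss"]
optimizer.zero_grad()
loss.backward()
optimizer.step()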