class ImageCaptioningDecoder(nn.Module):
    def __init__(self,
                 vocab_size: int = 70,
                 encoder_dim: int = 512,
                 embedding_dim: int = 64,
                 attention_dim: int = 64,
                 decoder_dim: int = 64):
        super(ImageCaptioningDecoder, self).__init__()
        self._vocab_size = vocab_size
        self._encoder_dim = encoder_dim
        self._embedding_dim = embedding_dim
        self._attention_dim = attention_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = ImageCaptioningAttention(self._encoder_dim,
                                                   self._decoder_dim,
                                                   self._attention_dim)
        self._decoder_cell = nn.LSTMCell(
            self._embedding.get_output_dim() + self._attention.get_output_dim(),
            self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    def forward(
        self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor,
        predicted_indices: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim), e.g. (64, 64) with the defaults
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)

        # Shapes: (batch_size, encoder_dim), (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        # TODO: change to not use teacher forcing all the time
        # Shapes: (batch_size, decoder_dim), (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1),
                                  (h, c))

        # Get output predictions (one per character in the vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights
class MultiscaleDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(MultiscaleDecoder, self).__init__(vocab=vocab)
        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._dropout = nn.Dropout(0.1)
        # Output size of the state cell must be decoder_dim, since the state is
        # transformed by the state cell.
        self._state_cell = nn.GRUCell(self._embedding.get_output_dim(),
                                      self._decoder_dim)
        self._attention = attention
        self._decoder_cell = nn.GRUCell(self._attention.get_output_dim(),
                                        self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(
        self, x: torch.Tensor, h: torch.Tensor,
        predicted_indices: torch.Tensor,
        sum_attention_weights_0: torch.Tensor,
        sum_attention_weights_1: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)
        embedding = self._dropout(embedding)

        # Shape: (batch_size, decoder_dim)
        h = self._state_cell(embedding, h)

        # Shapes: (batch_size, encoder_dim), (batch_size, h * w, 1)
        attention, attention_weights, sum_attention_weights_0, sum_attention_weights_1 = self._attention(
            x, h, sum_attention_weights_0, sum_attention_weights_1)

        # TODO: change to not use teacher forcing all the time
        # Shape: (batch_size, decoder_dim)
        h = self._decoder_cell(attention, h)

        # Get output predictions (one per character in the vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, preds, attention_weights, sum_attention_weights_0, sum_attention_weights_1

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    span_extractor: SpanExtractor,
    encoder: Seq2SeqEncoder,
    feedforward: FeedForward = None,
    pos_tag_embedding: Embedding = None,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
    evalb_directory_path: str = DEFAULT_EVALB_DIR,
) -> None:
    super().__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.span_extractor = span_extractor
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.encoder = encoder
    self.feedforward_layer = TimeDistributed(feedforward) if feedforward else None
    self.pos_tag_embedding = pos_tag_embedding or None
    if feedforward is not None:
        output_dim = feedforward.get_output_dim()
    else:
        output_dim = span_extractor.get_output_dim()

    self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

    representation_dim = text_field_embedder.get_output_dim()
    if pos_tag_embedding is not None:
        representation_dim += pos_tag_embedding.get_output_dim()

    check_dimensions_match(
        representation_dim,
        encoder.get_input_dim(),
        "representation dim (tokens + optional POS tags)",
        "encoder input dim",
    )
    check_dimensions_match(
        encoder.get_output_dim(),
        span_extractor.get_input_dim(),
        "encoder output dim",
        "span extractor input dim",
    )
    if feedforward is not None:
        check_dimensions_match(
            span_extractor.get_output_dim(),
            feedforward.get_input_dim(),
            "span extractor output dim",
            "feedforward input dim",
        )
    self.tag_accuracy = CategoricalAccuracy()

    if evalb_directory_path is not None:
        self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
    else:
        self._evalb_score = None
    initializer(self)
def __init__(self,
             vocab,
             embedding_in: Embedding,
             embedding_out: Embedding = None,
             cuda_device=-1,
             dropout: float = 0.3):
    super().__init__(vocab)
    self.embedding_in = embedding_in
    self.embedding_out = embedding_out
    # Note: despite the name, this is the size of the 'tags_in' vocabulary,
    # used as the output dimension of the projection layer below.
    self.emb_dimension = self.vocab.get_vocab_size('tags_in')
    self.linear1 = nn.Linear(in_features=embedding_in.get_output_dim(),
                             out_features=self.emb_dimension)
    self.dropout = nn.Dropout(dropout)
    self.bn = nn.BatchNorm1d(self.emb_dimension)
    self.init_emb()
class ImageCaptioningDecoder(CaptioningDecoder):
    def __init__(self,
                 vocab: Vocabulary,
                 attention: CaptioningAttention,
                 embedding_dim: int = 256,
                 decoder_dim: int = 256):
        super(ImageCaptioningDecoder, self).__init__(vocab=vocab)
        self._vocab_size = self.vocab.get_vocab_size()
        self._embedding_dim = embedding_dim
        self._decoder_dim = decoder_dim

        self._embedding = Embedding(self._vocab_size, self._embedding_dim)
        self._attention = attention
        self._decoder_cell = nn.LSTMCell(
            self._embedding.get_output_dim() + self._attention.get_output_dim(),
            self._decoder_dim)
        self._linear = nn.Linear(self._decoder_dim, self._vocab_size)

    @overrides
    def forward(
        self, x: torch.Tensor, h: torch.Tensor, c: torch.Tensor,
        predicted_indices: torch.Tensor
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        # Shape: (batch_size, embedding_dim)
        embedding = self._embedding(predicted_indices).float().view(
            -1, self._embedding_dim)

        # Shapes: (batch_size, encoder_dim), (batch_size, h * w, 1)
        attention, attention_weights = self._attention(x, h)

        # TODO: change to not use teacher forcing all the time
        # Shapes: (batch_size, decoder_dim), (batch_size, decoder_dim)
        h, c = self._decoder_cell(torch.cat([attention, embedding], dim=1),
                                  (h, c))

        # Get output predictions (one per character in the vocab)
        # Shape: (batch_size, vocab_size)
        preds = self._linear(h)

        return h, c, preds, attention_weights

    @overrides
    def get_output_dim(self) -> int:
        return self._vocab_size

    @overrides
    def get_input_dim(self) -> int:
        return self._decoder_dim
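# --- Usage sketch (not from the original code) -------------------------------
# The decoder above is stepped one token at a time. Below is a minimal greedy
# decoding loop, assuming `decoder` is a constructed ImageCaptioningDecoder,
# `x` holds encoder features of shape (batch_size, h * w, encoder_dim), and
# `start_index`, `end_index`, and `max_decoding_steps` come from the
# surrounding training code. All of those names, and the zero-initialised LSTM
# state, are illustrative assumptions.
import torch

batch_size = x.size(0)
h = x.new_zeros(batch_size, decoder.get_input_dim())  # initial hidden state
c = x.new_zeros(batch_size, decoder.get_input_dim())  # initial cell state
predicted_indices = torch.full((batch_size,), start_index,
                               dtype=torch.long, device=x.device)

predictions = []
for _ in range(max_decoding_steps):
    # One step: attend over the image features, update the LSTM state, and
    # score every token in the vocabulary.
    h, c, preds, attention_weights = decoder(x, h, c, predicted_indices)
    predicted_indices = preds.argmax(dim=-1)  # greedy choice per example
    predictions.append(predicted_indices)
    if (predicted_indices == end_index).all():
        break

# Shape: (batch_size, num_steps_taken)
predictions = torch.stack(predictions, dim=1)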
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             span_extractor: SpanExtractor,
             encoder: Seq2SeqEncoder,
             feedforward_layer: FeedForward = None,
             pos_tag_embedding: Embedding = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             evalb_directory_path: str = None) -> None:
    super(SpanConstituencyParser, self).__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.span_extractor = span_extractor
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.encoder = encoder
    self.feedforward_layer = TimeDistributed(feedforward_layer) if feedforward_layer else None
    self.pos_tag_embedding = pos_tag_embedding or None
    if feedforward_layer is not None:
        output_dim = feedforward_layer.get_output_dim()
    else:
        output_dim = span_extractor.get_output_dim()
    self.tag_projection_layer = TimeDistributed(Linear(output_dim, self.num_classes))

    representation_dim = text_field_embedder.get_output_dim()
    if pos_tag_embedding is not None:
        representation_dim += pos_tag_embedding.get_output_dim()

    check_dimensions_match(representation_dim,
                           encoder.get_input_dim(),
                           "representation dim (tokens + optional POS tags)",
                           "encoder input dim")
    check_dimensions_match(encoder.get_output_dim(),
                           span_extractor.get_input_dim(),
                           "encoder output dim",
                           "span extractor input dim")
    if feedforward_layer is not None:
        check_dimensions_match(span_extractor.get_output_dim(),
                               feedforward_layer.get_input_dim(),
                               "span extractor output dim",
                               "feedforward input dim")
    self.tag_accuracy = CategoricalAccuracy()

    if evalb_directory_path is not None:
        self._evalb_score = EvalbBracketingScorer(evalb_directory_path)
    else:
        self._evalb_score = None
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             kg_model: Model = None,
             entity_embedding: Embedding = None,
             concat_entity_embedder: EntityEmbedder = None,
             contextual_embedding_dim: int = None,
             span_encoder_config: Dict[str, int] = None,
             margin: float = 0.2,
             decode_threshold: float = 0.0,
             loss_type: str = 'margin',
             max_sequence_length: int = 512,
             dropout: float = 0.1,
             output_feed_forward_hidden_dim: int = 100,
             initializer_range: float = 0.02,
             include_null_embedding_in_dot_attention: bool = False,
             namespace: str = 'entity',
             regularizer: RegularizerApplicator = None):
    super().__init__(vocab,
                     margin=margin,
                     decode_threshold=decode_threshold,
                     loss_type=loss_type,
                     namespace=namespace,
                     regularizer=regularizer)

    num_embeddings_passed = sum(
        [kg_model is not None, entity_embedding is not None, concat_entity_embedder is not None]
    )
    if num_embeddings_passed != 1:
        raise ValueError(
            "Linking model needs exactly one of: a kg factorisation model, "
            "an entity embedding, or a concat entity embedder."
        )
    elif kg_model is not None:
        entity_embedding = kg_model.get_entity_embedding()
        entity_embedding_dim = entity_embedding.embedding_dim
    elif entity_embedding is not None:
        entity_embedding_dim = entity_embedding.get_output_dim()
    elif concat_entity_embedder is not None:
        entity_embedding_dim = concat_entity_embedder.get_output_dim()
        set_requires_grad(concat_entity_embedder, False)
        entity_embedding = concat_entity_embedder

    if loss_type == 'margin':
        weighted_entity_threshold = decode_threshold
    else:
        weighted_entity_threshold = None

    null_entity_id = self.vocab.get_token_index('@@NULL@@', namespace)
    assert null_entity_id != self.vocab.get_token_index('@@UNKNOWN@@', namespace)

    self.disambiguator = EntityDisambiguator(
        contextual_embedding_dim,
        entity_embedding_dim=entity_embedding_dim,
        entity_embeddings=entity_embedding,
        max_sequence_length=max_sequence_length,
        span_encoder_config=span_encoder_config,
        dropout=dropout,
        output_feed_forward_hidden_dim=output_feed_forward_hidden_dim,
        initializer_range=initializer_range,
        weighted_entity_threshold=weighted_entity_threshold,
        include_null_embedding_in_dot_attention=include_null_embedding_in_dot_attention,
        null_entity_id=null_entity_id)
# Build the text feature vectors.
source_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size(namespace="tokens"),
    embedding_dim=512)
source_text_embedder = BasicTextFieldEmbedder(
    token_embedders={"tokens": source_embedding})
target_embedding = Embedding(
    num_embeddings=vocab.get_vocab_size(namespace="target_tokens"),
    embedding_dim=512)

# Sequence-to-sequence model (a Transformer encoder and an autoregressive
# Transformer decoder; an LSTM encoder could be swapped in instead).
encoder = PytorchTransformer(input_dim=source_text_embedder.get_output_dim(),
                             feedforward_hidden_dim=512,
                             num_layers=4,
                             num_attention_heads=8)
decoder_net = StackedSelfAttentionDecoderNet(
    decoding_dim=target_embedding.get_output_dim(),
    target_embedding_dim=target_embedding.get_output_dim(),
    feedforward_hidden_dim=512,
    num_layers=4,
    num_attention_heads=8)
decoder = AutoRegressiveSeqDecoder(vocab=vocab,
                                   decoder_net=decoder_net,
                                   max_decoding_steps=128,
                                   target_embedder=target_embedding,
                                   beam_size=args.beam_size,
                                   target_namespace='target_tokens')
model = ComposedSeq2Seq(vocab=vocab,
                        source_text_embedder=source_text_embedder,
                        encoder=encoder,
                        decoder=decoder)
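# --- Imports assumed by the snippet above (approximate) ----------------------
# The snippet assumes `vocab` (a built Vocabulary) and `args` (providing
# beam_size) already exist. The module paths below are an assumption and may
# differ slightly between allennlp / allennlp-models releases.
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import PytorchTransformer
from allennlp_models.generation import (
    AutoRegressiveSeqDecoder,
    ComposedSeq2Seq,
    StackedSelfAttentionDecoderNet,
)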