def build(self):
    self._is_direct_features_input = self.config.direct_features_input

    # Encoders
    self.text_encoder = build_text_encoder(self.config.text_encoder)
    self.image_encoder = build_image_encoder(
        self.config.image_encoder, self._is_direct_features_input
    )

    # Projectors
    image_proj_config = deepcopy(self.config.image_projection)
    self.image_proj = build_classifier_layer(image_proj_config)

    text_proj_config = deepcopy(self.config.text_projection)
    self.text_proj = build_classifier_layer(text_proj_config)

    # Aggregators
    self.image_pool = AttnPool1d(self.config.final_hidden_size, 1)
    self.text_pool = AttnPool1d(self.config.final_hidden_size, 1)

    # Shared transformer
    transformer_layer = torch.nn.TransformerEncoderLayer(
        self.config.final_hidden_size, 4, 2048, dropout=0.1, activation="relu"
    )
    self.shared_transformer = torch.nn.TransformerEncoder(
        transformer_layer, num_layers=2
    )

    # Position embeddings - Image
    self.image_pos_emb = PositionEmbeddingSine(self.config.final_hidden_size // 2)
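# A minimal sketch (not this repo's forward pass) of how the modules built
# above could interact: both modalities run through the *same* transformer
# weights, with sinusoidal position embeddings added to the image features.
# `hidden`, the shapes, and the 1-D positional code below are assumptions;
# the repo's PositionEmbeddingSine operates on 2-D feature maps.
import torch

hidden = 256  # stands in for config.final_hidden_size
layer = torch.nn.TransformerEncoderLayer(
    hidden, 4, 2048, dropout=0.1, activation="relu"
)
shared = torch.nn.TransformerEncoder(layer, num_layers=2)

img = torch.randn(49, 2, hidden)  # (seq, batch, hidden): e.g. a 7x7 feature grid
txt = torch.randn(20, 2, hidden)  # 20 projected token features

# 1-D sinusoidal position embedding, a simplified stand-in
pos = torch.arange(img.size(0), dtype=torch.float).unsqueeze(1)
div = torch.exp(
    torch.arange(0, hidden, 2, dtype=torch.float)
    * (-torch.log(torch.tensor(10000.0)) / hidden)
)
pe = torch.zeros(img.size(0), 1, hidden)
pe[:, 0, 0::2] = torch.sin(pos * div)
pe[:, 0, 1::2] = torch.cos(pos * div)

img_out = shared(img + pe)  # shared weights, image branch
txt_out = shared(txt)       # shared weights, text branch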
def __init__(self, embedding_dim: int, **kwargs):
    super().__init__()
    hidden_dim = kwargs.get("hidden_dim", 512)

    # Two parallel embedding branches over the same inputs: an SGA branch
    # whose output is attention-pooled, and a CBN branch
    self.sga = SGAEmbedding(embedding_dim, **kwargs)
    self.sga_pool = AttnPool1d(hidden_dim, 1)
    self.cbn = CBNEmbedding(embedding_dim, **kwargs)
    self.out_dim = hidden_dim
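# AttnPool1d is used above (and in build) to collapse a variable-length
# sequence into a fixed number of vectors. A hypothetical stand-in, assuming
# it learns one softmax attention distribution per output slot (the second
# constructor argument) and returns the weighted sums:
import torch
from torch import nn

class AttnPool1dSketch(nn.Module):
    def __init__(self, num_features: int, num_attn: int = 1):
        super().__init__()
        self.linear = nn.Linear(num_features, num_attn)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq_len, num_features)
        attn = torch.softmax(self.linear(x), dim=1)  # normalize over seq_len
        return attn.transpose(1, 2) @ x              # (batch, num_attn, num_features)

pooled = AttnPool1dSketch(512, 1)(torch.randn(2, 49, 512))
print(pooled.shape)  # torch.Size([2, 1, 512])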
def __init__(self, hidden_dim: int, embedding_dim: int, **kwargs):
    super().__init__()
    num_attn = kwargs["num_attn"]
    num_layers = kwargs["num_layers"]
    dropout = kwargs.get("dropout", 0.1)
    num_attn_pool = kwargs.get("num_attn_pool", 1)
    num_feat = kwargs.get("num_feat", -1)

    # Contextualize word embeddings with a single-layer LSTM
    self.lstm = nn.LSTM(
        input_size=embedding_dim,
        hidden_size=hidden_dim,
        num_layers=1,
        batch_first=True,
    )
    # Stack of self-attention blocks applied on top of the LSTM states
    self.self_attns = nn.ModuleList(
        [SelfAttention(hidden_dim, num_attn, dropout) for _ in range(num_layers)]
    )
    self.attn_pool = None
    self.num_feat = num_feat
    self.text_out_dim = hidden_dim
    if num_attn_pool > 0:
        # Optional attention pooling over the output sequence
        self.attn_pool = AttnPool1d(hidden_dim, num_feat * num_attn_pool)
        self.text_out_dim = hidden_dim * num_attn_pool
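# An illustrative forward pass for an encoder shaped like the one above,
# with torch's nn.MultiheadAttention standing in for the repo's
# SelfAttention block; dimensions are assumptions.
import torch
from torch import nn

embedding_dim, hidden_dim, num_attn, num_layers = 300, 512, 8, 2

lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1, batch_first=True)
self_attns = nn.ModuleList(
    [
        nn.MultiheadAttention(hidden_dim, num_attn, dropout=0.1, batch_first=True)
        for _ in range(num_layers)
    ]
)

words = torch.randn(2, 14, embedding_dim)  # (batch, tokens, embedding_dim)
x, _ = lstm(words)                         # (batch, tokens, hidden_dim)
for attn in self_attns:
    x, _ = attn(x, x, x)                   # each block re-attends over all tokens
print(x.shape)                             # torch.Size([2, 14, 512])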