Ejemplo n.º 1
0
    def build(self):
        """Create the model's sub-modules from ``self.config``.

        In order, this sets up: the text and image encoders, one projection
        layer per modality, one attention-pooling aggregator per modality,
        a two-layer transformer encoder shared by both modalities, and the
        sine position embedding used for images.
        """
        cfg = self.config
        self._is_direct_features_input = cfg.direct_features_input

        # Encoders (the image encoder is told whether features arrive directly)
        self.text_encoder = build_text_encoder(cfg.text_encoder)
        self.image_encoder = build_image_encoder(
            cfg.image_encoder, self._is_direct_features_input
        )

        # Projectors: each builder is handed its own deep copy of the
        # projection config rather than the object stored on self.config.
        self.image_proj = build_classifier_layer(deepcopy(cfg.image_projection))
        self.text_proj = build_classifier_layer(deepcopy(cfg.text_projection))

        hidden_size = cfg.final_hidden_size

        # Aggregators
        self.image_pool = AttnPool1d(hidden_size, 1)
        self.text_pool = AttnPool1d(hidden_size, 1)

        # Shared transformer: two stacked copies of a single encoder layer
        encoder_layer = torch.nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=4,
            dim_feedforward=2048,
            dropout=0.1,
            activation="relu",
        )
        self.shared_transformer = torch.nn.TransformerEncoder(
            encoder_layer, num_layers=2
        )

        # Position embeddings - Image (constructed with half the hidden size)
        self.image_pos_emb = PositionEmbeddingSine(hidden_size // 2)
Ejemplo n.º 2
0
    def _build_encoders(self, config):
        text_encoder = None
        if config.get("text_encoder", None):
            text_encoder = build_text_encoder(config.text_encoder)

        modal_encoder = None
        if config.get("modal_encoder", None):
            modal_encoder = self._build_modal_encoder(config.modal_encoder)

        return (text_encoder, modal_encoder)