Code example #1
    def build(self):
        self._is_direct_features_input = self.config.direct_features_input
        # Encoders
        self.text_encoder = build_text_encoder(self.config.text_encoder)
        self.image_encoder = build_image_encoder(
            self.config.image_encoder, self._is_direct_features_input
        )

        # Projectors
        image_proj_config = deepcopy(self.config.image_projection)
        self.image_proj = build_classifier_layer(image_proj_config)

        text_proj_config = deepcopy(self.config.text_projection)
        self.text_proj = build_classifier_layer(text_proj_config)

        # Aggregators
        self.image_pool = AttnPool1d(self.config.final_hidden_size, 1)
        self.text_pool = AttnPool1d(self.config.final_hidden_size, 1)

        # Shared transformer
        transformer_layer = torch.nn.TransformerEncoderLayer(
            self.config.final_hidden_size, 4, 2048, dropout=0.1, activation="relu"
        )
        self.shared_transformer = torch.nn.TransformerEncoder(
            transformer_layer, num_layers=2
        )

        # Position embeddings - Image
        self.image_pos_emb = PositionEmbeddingSine(self.config.final_hidden_size // 2)
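
The shared transformer in example #1 is a plain `torch.nn.TransformerEncoder`. A minimal standalone sketch of that block, assuming `final_hidden_size = 512` (any size divisible by the 4 attention heads works):

    import torch

    # Same layer hyperparameters as above: 4 heads, 2048-dim feedforward
    layer = torch.nn.TransformerEncoderLayer(
        512, 4, 2048, dropout=0.1, activation="relu"
    )
    encoder = torch.nn.TransformerEncoder(layer, num_layers=2)

    # By default the encoder expects (sequence, batch, features)
    tokens = torch.randn(20, 2, 512)
    out = encoder(tokens)  # output keeps the input shape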
Code example #2
    def build(self):
        self.base = FusionBase(self.config)
        num_features = self.config.num_features
        if not self._is_direct_features_input:
            num_features = self.config.modal_encoder.params.num_output_features

        # As the in_dim is dynamically calculated we need to copy classifier_config
        modal_classifier_config = deepcopy(self.config.modal_classifier)
        modal_classifier_config.params.in_dim = (num_features *
                                                 self.config.modal_hidden_size)
        self.modal_classifier = build_classifier_layer(modal_classifier_config)

        text_classifier_config = deepcopy(self.config.text_classifier)
        text_classifier_config.params.in_dim = self.config.text_hidden_size
        self.text_classifier = build_classifier_layer(text_classifier_config)
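
Several of these snippets share the same idiom: `deepcopy` the classifier config before overwriting `params.in_dim`, so the mutation does not leak into the shared model config. A minimal sketch of that idiom, assuming an OmegaConf-style config as MMF uses (the numbers are placeholders, not repo values):

    from copy import deepcopy
    from omegaconf import OmegaConf

    config = OmegaConf.create(
        {"type": "mlp", "params": {"in_dim": 512, "out_dim": 2, "num_layers": 0}}
    )

    # Mutate a copy; the original config object stays untouched
    local = deepcopy(config)
    local.params.in_dim = 100 * 2048  # e.g. num_features * modal_hidden_size
    assert config.params.in_dim == 512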
Code example #3
    def build(self):
        self.vision_module = build_image_encoder(self.config.image_encoder)
        self.classifier = build_classifier_layer(self.config.classifier)
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        self.dropout = torch.nn.Dropout(self.config.dropout)
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
Code example #4
    def build(self):
        self.base = UnimodalBase(self.config)
        self._is_direct_features_input = self.config.direct_features_input
        num_features = self.config.modal_encoder.params.num_output_features

        # As the in_dim is dynamically calculated we need to copy classifier_config
        classifier_config = deepcopy(self.config.classifier)
        classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
        self.classifier = build_classifier_layer(classifier_config)
Code example #5
File: unimodal.py Project: facebookresearch/mmf
    def build(self):
        self.base = UnimodalBase(self.config)
        self._is_direct_features_input = self.config.direct_features_input
        if self.config.get("freeze_base", False):
            for param in self.base.parameters():
                param.requires_grad = False

        num_features = self.config.modal_encoder.params.num_output_features

        # As the in_dim is dynamically calculated we need to copy classifier_config
        classifier_config = deepcopy(self.config.classifier)
        classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
        self.classifier = build_classifier_layer(classifier_config)
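
The `freeze_base` branch in example #5 uses the standard PyTorch freezing idiom: setting `requires_grad = False` keeps the submodule in the forward pass but excludes its weights from gradient updates. A self-contained sketch (the module names here are illustrative, not from the repo):

    import torch

    base = torch.nn.Linear(8, 8)   # stands in for the frozen base encoder
    head = torch.nn.Linear(8, 2)   # stays trainable

    for param in base.parameters():
        param.requires_grad = False

    # Only pass trainable parameters to the optimizer
    optimizer = torch.optim.Adam(
        p for p in list(base.parameters()) + list(head.parameters())
        if p.requires_grad
    )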
Code example #6
    def build(self):
        """
       Config's image_encoder attribute will used to build an MMF image
       encoder. This config in yaml will look like:
 
       # "type" parameter specifies the type of encoder we are using here.
       # In this particular case, we are using resnet152
       type: resnet152
    
       # Parameters are passed to underlying encoder class by
       # build_image_encoder
       params:
         # Specifies whether to use a pretrained version
         pretrained: true
         # Pooling type, use max to use AdaptiveMaxPool2D
         pool_type: avg
    
         # Number of output features from the encoder, -1 for original
         # otherwise, supports between 1 to 9
         num_output_features: 1
       """
        self.vision_module = build_image_encoder(self.config.image_encoder)
        """
       For classifer, configuration would look like:
       # Specifies the type of the classifier, in this case mlp
       type: mlp
       # Parameter to the classifier passed through build_classifier_layer
       params:
         # Dimension of the tensor coming into the classifier
         in_dim: 512
         # Dimension of the tensor going out of the classifier
         out_dim: 2
         # Number of MLP layers in the classifier
         num_layers: 0
       """
        self.classifier = build_classifier_layer(self.config.classifier)

        # ProjectionEmbedding takes in params directly as it is a module.
        # So, pass in kwargs, which are in_dim, out_dim and module,
        # whose value would be "linear" as we want a linear layer
        self.language_module = ProjectionEmbedding(
            **self.config.text_encoder.params)
        # The dropout value now comes from the config
        self.dropout = torch.nn.Dropout(self.config.dropout)
        # As with ProjectionEmbedding, the fusion layer's params (which are
        # params for a linear layer) now come from the config
        self.fusion = torch.nn.Linear(**self.config.fusion.params)
        self.relu = torch.nn.ReLU()
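
Example #6 only builds the modules; a `forward` that wires them together might look like the following. This is a hedged sketch based on the comments above, not code from the repo: the flattening and the concatenation order are assumptions:

    def forward(self, image, text):
        # Encode each modality and flatten to (batch, features)
        image_features = torch.flatten(self.vision_module(image), start_dim=1)
        text_features = torch.flatten(self.language_module(text), start_dim=1)

        # Fuse by concatenation, then the linear fusion layer from build()
        fused = self.fusion(torch.cat([text_features, image_features], dim=1))
        return self.classifier(self.dropout(self.relu(fused)))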
Code example #7
    def build(self):
        self.base = FusionBase(self.config)
        num_features = self.config.num_features
        if not self._is_direct_features_input:
            num_features = self.config.modal_encoder.params.num_output_features

        # As the in_dim is dynamically calculated we need to copy classifier_config
        classifier_config = deepcopy(self.config.classifier)
        classifier_config.params.in_dim = num_features * self.config.modal_hidden_size
        classifier_config.params.in_dim += self.config.text_hidden_size
        self.classifier = build_classifier_layer(classifier_config)

        if self.config.freeze_text or self.config.freeze_complete_base:
            for p in self.base.text.parameters():
                p.requires_grad = False

        if self.config.freeze_modal or self.config.freeze_complete_base:
            for p in self.base.modal.parameters():
                p.requires_grad = False
Code example #8
File: unimodal.py Project: facebookresearch/mmf
    def build(self):
        self.base = UnimodalBase(self.config)
        # As the in_dim is dynamically calculated we need to copy classifier_config
        classifier_config = deepcopy(self.config.classifier)
        classifier_config.params.in_dim = self.config.text_hidden_size
        self.classifier = build_classifier_layer(classifier_config)