Code example #1
    # Constructor of VqaVilbert (the full class is shown in Code example #3).
    def __init__(self,
                 vocab: Vocabulary,
                 text_embeddings: TransformerEmbeddings,
                 image_embeddings: ImageFeatureEmbeddings,
                 encoder: BiModalEncoder,
                 pooled_output_dim: int,
                 fusion_method: str = "sum",
                 dropout: float = 0.1,
                 label_namespace: str = "answers",
                 *,
                 ignore_text: bool = False,
                 ignore_image: bool = False) -> None:
        super().__init__(
            vocab,
            text_embeddings,
            image_embeddings,
            encoder,
            pooled_output_dim,
            fusion_method,
            dropout,
            label_namespace,
            is_multilabel=True,
            ignore_text=ignore_text,
            ignore_image=ignore_image,
        )

        from allennlp.training.metrics import F1MultiLabelMeasure
        from allennlp_models.vision.metrics.vqa import VqaMeasure

        self.f1_metric = F1MultiLabelMeasure(average="micro")
        self.vqa_metric = VqaMeasure()
Code example #2
    # Constructor of VqaHead (the full class is shown in Code example #4).
    def __init__(self, vocab: Vocabulary, embedding_dim: int, label_namespace: str = "answers"):
        from allennlp_models.vision.metrics.vqa import VqaMeasure
        from allennlp.training.metrics import F1MultiLabelMeasure

        super().__init__(vocab)

        num_labels = vocab.get_vocab_size(label_namespace)
        self.label_namespace = label_namespace
        self.classifier = torch.nn.Linear(embedding_dim, num_labels)

        self.f1_metric = F1MultiLabelMeasure(average="micro")
        self.vqa_metric = VqaMeasure()
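
The classifier width in this head is taken directly from the size of the "answers" vocabulary namespace, and the later masking logic relies on AllenNLP putting a padding token at index 0 and an OOV token at index 1 of that namespace. A minimal sketch of that behaviour, using a toy vocabulary invented here purely for illustration:

from allennlp.data import Vocabulary

# Toy vocabulary; in a real run this comes from the VQA dataset reader.
vocab = Vocabulary()
for answer in ["yes", "no", "2", "blue"]:
    vocab.add_token_to_namespace(answer, namespace="answers")

num_labels = vocab.get_vocab_size("answers")
print(num_labels)                                # 6: four answers plus padding and OOV
print(vocab.get_token_from_index(0, "answers"))  # @@PADDING@@
print(vocab.get_token_from_index(1, "answers"))  # @@UNKNOWN@@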
Code example #3
from typing import Dict, Optional

import torch
from overrides import overrides

from allennlp.data import TextFieldTensors, Vocabulary
from allennlp.modules.transformer import (
    TransformerEmbeddings,
    ImageFeatureEmbeddings,
    BiModalEncoder,
)
from allennlp.nn import util
from allennlp_models.vision.models.vision_text_model import VisionTextModel


class VqaVilbert(VisionTextModel):
    """
    Model for VQA task based on the VilBERT paper.

    # Parameters

    vocab : `Vocabulary`
    text_embeddings : `TransformerEmbeddings`
    image_embeddings : `ImageFeatureEmbeddings`
    encoder : `BiModalEncoder`
    pooled_output_dim : `int`
    fusion_method : `str`, optional (default = `"sum"`)
    dropout : `float`, optional (default = `0.1`)
    label_namespace : `str`, optional (default = `"answers"`)
    """
    def __init__(self,
                 vocab: Vocabulary,
                 text_embeddings: TransformerEmbeddings,
                 image_embeddings: ImageFeatureEmbeddings,
                 encoder: BiModalEncoder,
                 pooled_output_dim: int,
                 fusion_method: str = "sum",
                 dropout: float = 0.1,
                 label_namespace: str = "answers",
                 *,
                 ignore_text: bool = False,
                 ignore_image: bool = False) -> None:
        super().__init__(
            vocab,
            text_embeddings,
            image_embeddings,
            encoder,
            pooled_output_dim,
            fusion_method,
            dropout,
            label_namespace,
            is_multilabel=True,
            ignore_text=ignore_text,
            ignore_image=ignore_image,
        )

        from allennlp.training.metrics import F1MultiLabelMeasure
        from allennlp_models.vision.metrics.vqa import VqaMeasure

        self.f1_metric = F1MultiLabelMeasure(average="micro")
        self.vqa_metric = VqaMeasure()

    @overrides
    def forward(
        self,  # type: ignore
        box_features: torch.Tensor,
        box_coordinates: torch.Tensor,
        box_mask: torch.Tensor,
        question: TextFieldTensors,
        labels: Optional[torch.Tensor] = None,
        label_weights: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:

        return super().forward(
            box_features,
            box_coordinates,
            box_mask,
            text=question,
            labels=labels,
            label_weights=label_weights,
        )

    @overrides
    def _compute_loss_and_metrics(
        self,
        batch_size: int,
        outputs: torch.Tensor,  # actually the output dict from forward(); "logits" is read from it below
        label: torch.Tensor,
        label_weights: Optional[torch.Tensor] = None,
    ):
        if label is not None and label_weights is not None:
            logits = outputs["logits"]
            label_mask = label > 1  # 0 is padding, 1 is OOV, which we want to ignore

            weighted_labels = util.masked_index_replace(
                logits.new_zeros(logits.size() + (1, )),
                label.clamp(min=0),
                label_mask,
                label_weights.unsqueeze(-1),
            ).squeeze(-1)

            # weighted_labels now has shape (batch_size, num_labels).  We need to ignore the first
            # two columns of this in our loss function and accuracy metric.  The first column is a
            # padding label, and the second column is an OOV label.  We want the loss function to
            # be computed on every other label.
            binary_label_mask = weighted_labels.new_ones(logits.size())
            binary_label_mask[:, 0] = 0
            binary_label_mask[:, 1] = 0

            outputs["loss"] = (
                torch.nn.functional.binary_cross_entropy_with_logits(
                    logits,
                    weighted_labels,
                    weight=binary_label_mask,
                    reduction="sum") / batch_size)

            self.f1_metric(logits, weighted_labels, binary_label_mask.bool())
            self.vqa_metric(logits, label, label_weights)

        return outputs

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        result = self.f1_metric.get_metric(reset)
        result["vqa_score"] = self.vqa_metric.get_metric(reset)["score"]
        return result

    @overrides
    def make_output_human_readable(
        self, output_dict: Dict[str, torch.Tensor]
    ) -> Dict[str, torch.Tensor]:
        batch_tokens = []
        for batch_index, batch in enumerate(output_dict["probs"]):
            tokens = {}
            for i, prob in enumerate(batch):
                tokens[self.vocab.get_token_from_index(i, self.label_namespace)] = float(prob)
            batch_tokens.append(tokens)
        output_dict["tokens"] = batch_tokens
        return output_dict

    default_predictor = "vilbert_vqa"
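
The least obvious part of `_compute_loss_and_metrics` is the scatter from the per-question answer annotations into a dense `(batch_size, num_labels)` target. Below is a minimal sketch with made-up values (one instance, six answer labels, up to three annotated answers per question); treating a negative index as an empty answer slot is an assumption suggested by the `clamp(min=0)` call, not something stated in this code:

import torch
from allennlp.nn import util

logits = torch.zeros(1, 6)                       # (batch_size, num_labels)
label = torch.tensor([[2, 3, -1]])               # answer-vocab indices; -1 assumed to mark an empty slot
label_weights = torch.tensor([[1.0, 0.3, 0.0]])  # per-answer VQA scores

label_mask = label > 1                           # drops padding (0), OOV (1), and the empty slot

weighted_labels = util.masked_index_replace(
    logits.new_zeros(logits.size() + (1,)),
    label.clamp(min=0),
    label_mask,
    label_weights.unsqueeze(-1),
).squeeze(-1)
print(weighted_labels)                           # tensor([[0.0, 0.0, 1.0, 0.3, 0.0, 0.0]])

# Columns 0 and 1 (padding and OOV) are then zeroed out of the BCE loss:
binary_label_mask = weighted_labels.new_ones(logits.size())
binary_label_mask[:, 0] = 0
binary_label_mask[:, 1] = 0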
Code example #4
from typing import Dict, Optional

import torch
from overrides import overrides

from allennlp.data import Vocabulary
from allennlp.models.heads.head import Head


class VqaHead(Head):
    def __init__(self,
                 vocab: Vocabulary,
                 embedding_dim: int,
                 label_namespace: str = "answers"):
        from allennlp_models.vision.metrics.vqa import VqaMeasure
        from allennlp.training.metrics import F1MultiLabelMeasure

        super().__init__(vocab)

        num_labels = vocab.get_vocab_size(label_namespace)
        self.label_namespace = label_namespace
        self.classifier = torch.nn.Linear(embedding_dim, num_labels)

        self.f1_metric = F1MultiLabelMeasure(average="micro")
        self.vqa_metric = VqaMeasure()

    @overrides
    def forward(
        self,  # type: ignore
        encoded_boxes: torch.Tensor,
        encoded_boxes_mask: torch.Tensor,
        encoded_boxes_pooled: torch.Tensor,
        encoded_text: torch.Tensor,
        encoded_text_mask: torch.Tensor,
        encoded_text_pooled: torch.Tensor,
        pooled_boxes_and_text: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        label_weights: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        logits = self.classifier(pooled_boxes_and_text)

        output = {
            "logits": logits,
            "probs": torch.sigmoid(logits),
        }

        if labels is not None and label_weights is not None:
            label_mask = labels > 1  # 0 is padding, 1 is OOV, which we want to ignore

            from allennlp.nn import util

            weighted_labels = util.masked_index_replace(
                logits.new_zeros(logits.size() + (1, )),
                labels.clamp(min=0),
                label_mask,
                label_weights.unsqueeze(-1),
            ).squeeze(-1)

            # weighted_labels now has shape (batch_size, num_labels).  We need to ignore the first
            # two columns of this in our loss function and accuracy metric.  The first column is a
            # padding label, and the second column is an OOV label.  We want the loss function to
            # be computed on every other label.
            binary_label_mask = weighted_labels.new_ones(logits.size())
            binary_label_mask[:, 0] = 0
            binary_label_mask[:, 1] = 0

            output["loss"] = (
                torch.nn.functional.binary_cross_entropy_with_logits(
                    logits,
                    weighted_labels,
                    weight=binary_label_mask,
                    reduction="sum",
                )
                / logits.size(0)
            )

            self.f1_metric(logits, weighted_labels, binary_label_mask.bool())
            self.vqa_metric(logits, labels, label_weights)

        return output

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        result = self.f1_metric.get_metric(reset)
        result["vqa"] = self.vqa_metric.get_metric(reset)["score"]
        return result
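
Finally, a hedged usage sketch of the head in isolation. The vocabulary, dimensions, and tensors below are all made up for illustration, and only `pooled_boxes_and_text` actually feeds the classifier; in practice every argument comes from the ViLBERT backbone. The `VqaHead` class from Code example #4 is assumed to be in scope:

import torch
from allennlp.data import Vocabulary

vocab = Vocabulary()
for answer in ["yes", "no", "2", "blue"]:
    vocab.add_token_to_namespace(answer, namespace="answers")

head = VqaHead(vocab, embedding_dim=16)

batch_size, num_boxes, num_tokens, dim = 2, 4, 7, 16
output = head(
    encoded_boxes=torch.zeros(batch_size, num_boxes, dim),
    encoded_boxes_mask=torch.ones(batch_size, num_boxes, dtype=torch.bool),
    encoded_boxes_pooled=torch.zeros(batch_size, dim),
    encoded_text=torch.zeros(batch_size, num_tokens, dim),
    encoded_text_mask=torch.ones(batch_size, num_tokens, dtype=torch.bool),
    encoded_text_pooled=torch.zeros(batch_size, dim),
    pooled_boxes_and_text=torch.randn(batch_size, dim),
    labels=torch.tensor([[2, 3], [4, 2]]),                 # answer-vocab indices (made up)
    label_weights=torch.tensor([[1.0, 0.3], [0.6, 0.0]]),  # per-answer VQA scores (made up)
)
print(output["probs"].shape)  # torch.Size([2, 6]): one probability per answer label
print(float(output["loss"]))  # masked BCE, averaged over the batch
print(head.get_metrics())     # micro precision/recall/fscore plus the "vqa" score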