    def __init__(self, text_query, image_embed_dim, text_embed_dim, use_bert, name):
        super().__init__()
        # img model
        # pretrained=True loads ImageNet-pretrained weights
        img_model = torchvision.models.resnet18(pretrained=True)
        self.name = name

        class GlobalAvgPool2d(torch.nn.Module):

            def forward(self, x):
                # (1, 1) is the output spatial size
                # e.g. if x.shape == (512, 3, 64, 64), the output shape is (512, 3, 1, 1)
                # i.e. the mean over the (64, 64) spatial dimensions
                return F.adaptive_avg_pool2d(x, (1, 1))

        img_model.avgpool = GlobalAvgPool2d()
        # fc --> fully connected layer, i.e. a linear layer
        # Linear(in, out): the two arguments are the input and output dimensions
        # ResNet-18's final feature dimension is 512, so this assumes image_embed_dim == 512
        img_model.fc = torch.nn.Sequential(
            torch.nn.Linear(image_embed_dim, image_embed_dim))
        self.img_model = img_model

        # text model
        self.text_model = text_model.TextLSTMModel(
            texts_to_build_vocab=text_query,
            word_embed_dim=text_embed_dim,
            lstm_hidden_dim=text_embed_dim)
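
The comments above describe adaptive average pooling; a quick standalone check of that behaviour (shapes chosen arbitrarily for illustration):

import torch
import torch.nn.functional as F

x = torch.randn(8, 512, 7, 7)          # a batch of ResNet-18-style feature maps
y = F.adaptive_avg_pool2d(x, (1, 1))   # average over the 7x7 spatial grid
print(y.shape)                         # torch.Size([8, 512, 1, 1])
assert torch.allclose(y.squeeze(), x.mean(dim=(2, 3)))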
Example #2
    def __init__(self, embed_dim, texts):
        super(ImageTextEncodeTransformModel, self).__init__()

        self.snorm = torch_functions.NormalizationLayer(normalize_scale=4.0,
                                                        learn_scale=True)

        # image
        self.img_encoder = torchvision.models.resnet50(pretrained=True)
        self.img_encoder.fc = torch.nn.Sequential(
            torch.nn.Dropout(0.2), torch.nn.Linear(2048, 2048),
            torch.nn.BatchNorm1d(2048), torch.nn.Dropout(0.2), torch.nn.ReLU(),
            torch.nn.Linear(2048, embed_dim))

        # text
        self.text_encoder = text_model.TextLSTMModel(
            texts_to_build_vocab=texts,
            word_embed_dim=256,
            lstm_hidden_dim=embed_dim)
        self.text_encoder.fc_output = torch.nn.Sequential(
            torch.nn.Dropout(0.1), torch.nn.Linear(embed_dim, 2048),
            torch.nn.BatchNorm1d(2048), torch.nn.Dropout(0.1), torch.nn.ReLU(),
            torch.nn.Linear(2048, embed_dim))

        # transformer
        self.transformer = MTirgTransform(embed_dim)
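
torch_functions.NormalizationLayer is not shown in these snippets. Below is a minimal sketch of what such a layer typically does, assuming it L2-normalizes the embedding and multiplies by a scale factor initialized from normalize_scale (the class name is hypothetical):

import torch

class NormalizationLayerSketch(torch.nn.Module):
    """Hypothetical stand-in for torch_functions.NormalizationLayer."""

    def __init__(self, normalize_scale=4.0, learn_scale=True):
        super().__init__()
        # learnable scale factor; frozen when learn_scale is False
        self.scale = torch.nn.Parameter(
            torch.tensor(float(normalize_scale)), requires_grad=learn_scale)

    def forward(self, x):
        # scale * x / ||x||_2, computed row-wise over the embedding dimension
        return self.scale * x / x.norm(dim=1, keepdim=True)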
Example #3
    def __init__(self, texts, opt):
        super(ImgEncoderTextEncoderBase, self).__init__()
        img_encoder = opt.img_encoder
        text_encoder = opt.text_encoder
        embed_dim = opt.embed_dim

        # global average pooling over the spatial dimensions;
        # defined before the encoder branches below that use it
        class GlobalAvgPool2d(torch.nn.Module):
            def forward(self, x):
                return F.adaptive_avg_pool2d(x, (1, 1))

        if img_encoder == 'efficientnet':
            img_model = EfficientNet.from_pretrained('efficientnet-b0')
            img_model._fc = torch.nn.Sequential(
                torch.nn.Linear(1280, embed_dim))
        elif img_encoder == 'resnet18':
            img_model = torchvision.models.resnet18(pretrained=True)
            img_model.avgpool = GlobalAvgPool2d()
            img_model.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
        elif img_encoder == 'resnet50':
            img_model = torchvision.models.resnet50(pretrained=True)
            img_model.avgpool = GlobalAvgPool2d()
            img_model.fc = torch.nn.Sequential(torch.nn.Linear(
                2048, embed_dim))
        elif img_encoder == 'resnet101':
            img_model = torchvision.models.resnet101(pretrained=True)
            img_model.avgpool = GlobalAvgPool2d()
            img_model.fc = torch.nn.Sequential(torch.nn.Linear(
                2048, embed_dim))
        else:
            print('Invalid image encoder', img_encoder)
            print('available: efficientnet, resnet18, resnet50, resnet101')
            sys.exit()

        self.img_model = img_model
        self.embed_dim = embed_dim

        if text_encoder == 'lstm':
            self.text_model = text_model.TextLSTMModel(
                texts_to_build_vocab=texts,
                word_embed_dim=512,
                lstm_hidden_dim=embed_dim)
        elif text_encoder == 'dualenc':
            self.text_model = text_model.TextDualencModel(
                texts_to_build_vocab=texts,
                embed_dim=embed_dim,
                word_embed_dim=300,
                lstm_hidden_dim=512)
        else:
            print('Invalid text encoder', text_encoder)
            print('available: lstm, dualenc')
            sys.exit()
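
A usage sketch for this constructor, assuming the repo's text_model module is importable and texts is the list of training captions used to build the vocabulary (all values below are placeholders):

from argparse import Namespace

opt = Namespace(img_encoder='resnet18', text_encoder='lstm', embed_dim=512)
texts = ['replace the red dress with a blue one', 'make the sleeves longer']
model = ImgEncoderTextEncoderBase(texts, opt)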
Example #4
    def __init__(self, vocab_size, embed_dim=512):
        super(ImgEncoderTextEncoderBase, self).__init__()
        # img model
        img_model = torchvision.models.resnet18(pretrained=True)

        class GlobalAvgPool2d(torch.nn.Module):
            def forward(self, x):
                return F.adaptive_avg_pool2d(x, (1, 1))

        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
        self.img_model = img_model

        # text model
        self.text_model = text_model.TextLSTMModel(vocab_size,
                                                   word_embed_dim=embed_dim,
                                                   lstm_hidden_dim=embed_dim)
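
A quick shape check of the head swap used here (embed_dim is an illustrative value; the replaced fc maps ResNet-18's 512-dimensional feature vector to the embedding size):

import torch
import torchvision

embed_dim = 256
m = torchvision.models.resnet18(pretrained=True)
m.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
m.eval()
with torch.no_grad():
    out = m(torch.randn(2, 3, 224, 224))
print(out.shape)  # torch.Size([2, 256])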
Example #5
    def __init__(self, texts, embed_dim):
        super(ImgEncoderTextEncoderBase, self).__init__()

        img_model = torchvision.models.resnet18(pretrained=False)
        img_model.load_state_dict(torch.load(Path1 +
                                             r'\resnet18-5c106cde.pth'))

        class GlobalAvgPool2d(torch.nn.Module):
            def forward(self, x):
                return F.adaptive_avg_pool2d(x, (1, 1))

        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(torch.nn.Linear(512, embed_dim))
        self.img_model = img_model

        # text model
        self.text_model = text_model.TextLSTMModel(texts_to_build_vocab=texts,
                                                   word_embed_dim=embed_dim,
                                                   lstm_hidden_dim=embed_dim)
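
The checkpoint path above is built with a Windows-style raw string. A hedged alternative, assuming Path1 is the repo's (unshown) directory holding the downloaded resnet18-5c106cde.pth weights:

import os
import torch
import torchvision

Path1 = './checkpoints'  # placeholder for the repo's actual checkpoint directory
ckpt = os.path.join(Path1, 'resnet18-5c106cde.pth')

img_model = torchvision.models.resnet18(pretrained=False)
# map_location='cpu' keeps the load working on machines without a GPU
img_model.load_state_dict(torch.load(ckpt, map_location='cpu'))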
Example #6
    def __init__(self, text_query, image_embed_dim, text_embed_dim, use_bert,
                 name):
        super().__init__()
        # img model
        img_model = torchvision.models.resnet18(pretrained=True)
        self.name = name

        class GlobalAvgPool2d(torch.nn.Module):
            def forward(self, x):
                return F.adaptive_avg_pool2d(x, (1, 1))

        img_model.avgpool = GlobalAvgPool2d()
        img_model.fc = torch.nn.Sequential(
            torch.nn.Linear(image_embed_dim, image_embed_dim))
        self.img_model = img_model

        # text model
        self.text_model = text_model.TextLSTMModel(
            texts_to_build_vocab=text_query,
            word_embed_dim=text_embed_dim,
            lstm_hidden_dim=text_embed_dim)