Example #1
    def test_gradients(self):
        cbp = CompactBilinearPooling(128, 128, 160).cuda()
        # float32 gradcheck needs the coarse eps=1 to survive rounding error;
        # Example #12 below shows the stricter double-precision variant
        x = torch.autograd.Variable(torch.rand(4, 128).cuda(),
                                    requires_grad=True)
        y = torch.autograd.Variable(torch.rand(4, 128).cuda(),
                                    requires_grad=True)
        self.assertTrue(torch.autograd.gradcheck(cbp, (x, y), eps=1))
Example #2
    def __init__(self, block, layers, num_classes=128):
        self.inplanes = 64
        super(ResNetCBP, self).__init__()
        self.conv1 = nn.Conv2d(3,
                               64,
                               kernel_size=7,
                               stride=2,
                               padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7)

        self.cbp = CompactBilinearPooling(512 * block.expansion,
                                          512 * block.expansion, 8192)
        self.fc_action = nn.Linear(8192, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
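Only the constructor is shown; a minimal forward consistent with it (a sketch following the standard ResNet flow, with CBP applied as a self-fusion of the pooled descriptor, not the repository's actual code) might look like:

    def forward(self, x):
        x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
        x = self.layer4(self.layer3(self.layer2(self.layer1(x))))
        x = self.avgpool(x)            # (B, 512 * expansion, 1, 1)
        x = x.view(x.size(0), -1)      # (B, 512 * expansion)
        x = self.cbp(x, x)             # self-fusion into an 8192-d descriptor
        return self.fc_action(x)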
Example #3
    def test_pooling(self):
        mcb = CompactBilinearPooling(2048, 2048, 16000).double().cuda()

        # Create 4 arrays of positive reals
        x = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(),
                                    requires_grad=True)
        y = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(),
                                    requires_grad=True)
        z = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(),
                                    requires_grad=True)
        w = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(),
                                    requires_grad=True)

        # Compute the full bilinear pooling for each pair of arrays
        bp_xy = bilinear_pooling(x, y).data.cpu().numpy()
        bp_zw = bilinear_pooling(z, w).data.cpu().numpy()

        # Dot product between the two pooled results, per sample
        kernel_bp = np.sum(bp_xy * bp_zw, axis=1)

        # Repeat the computation with compact bilinear pooling
        cbp_xy = mcb(x, y).data.cpu().numpy()
        cbp_zw = mcb(z, w).data.cpu().numpy()

        kernel_cbp = np.sum(cbp_xy * cbp_zw, axis=1)

        # The ratio between the two dot products should be close to one.
        ratio = kernel_cbp / kernel_bp

        np.testing.assert_almost_equal(ratio, np.ones_like(ratio), decimal=1)
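The bilinear_pooling helper is not part of the snippet. A minimal version consistent with its use here (an assumption) is the flattened per-sample outer product; its pairwise dot products factorize as <x, z> * <y, w>, which is exactly the kernel CBP approximates, hence the ratio check above:

    def bilinear_pooling(x, y):
        # full bilinear pooling: per-sample outer product, flattened
        # (B, n) x (B, m) -> (B, n * m)
        return torch.einsum('bi,bj->bij', x, y).reshape(x.size(0), -1)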
Example #4
    def __init__(self,
                 LSTM_dim,
                 LSTM_cell_num,
                 LSTM_bidirectional,
                 text_embed_dim,
                 image_embed_dim,
                 decoder_dim,
                 num_classes=2,
                 lstm_dropout=0):
        super().__init__()

        self.LSTM = torch.nn.LSTM(input_size=text_embed_dim,
                                  hidden_size=LSTM_dim,
                                  num_layers=LSTM_cell_num,
                                  bidirectional=LSTM_bidirectional,
                                  batch_first=True,
                                  dropout=lstm_dropout)

        lstm_out = LSTM_dim
        if LSTM_bidirectional:
            lstm_out *= 2

        self.mcb = CompactBilinearPooling(lstm_out, image_embed_dim,
                                          decoder_dim)

        self.decoder = torch.nn.Linear(in_features=decoder_dim,
                                       out_features=decoder_dim)
        self.classifier = torch.nn.Linear(in_features=decoder_dim,
                                          out_features=num_classes)
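The forward pass is outside this snippet; a plausible sketch given the constructor, assuming text_embeds of shape (B, T, text_embed_dim) and image_embeds of shape (B, image_embed_dim):

    def forward(self, text_embeds, image_embeds):
        _, (h_n, _) = self.LSTM(text_embeds)
        if self.LSTM.bidirectional:
            # concatenate the last layer's two directional hidden states
            text_repr = torch.cat([h_n[-2], h_n[-1]], dim=1)
        else:
            text_repr = h_n[-1]
        fused = self.mcb(text_repr, image_embeds)    # (B, decoder_dim)
        return self.classifier(torch.relu(self.decoder(fused)))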
Example #5
    def __init__(self, opt, use_maxout=False):
        super(MCBTopDownCore, self).__init__()
        self.drop_prob_lm = opt.drop_prob_lm

        self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size * 2,
                                    opt.rnn_size)  # we, fc, h^2_t-1
        self.lang_lstm = nn.LSTMCell(opt.rnn_size * 2,
                                     opt.rnn_size)  # h^1_t, \hat v
        self.attention = Attention(opt)
        # print('rnn_size:', opt.rnn_size) # 512
        # print('fc_feats:', opt.fc_feat_size) # 2048
        # print('input_encoding_size:', opt.input_encoding_size) # 512
        self.mcb1 = CompactBilinearPooling(
            opt.rnn_size, opt.input_encoding_size,
            opt.input_encoding_size + opt.rnn_size).cuda()
        self.mcb2 = CompactBilinearPooling(
            opt.rnn_size + opt.input_encoding_size, opt.rnn_size,
            3 * opt.rnn_size).cuda()
Example #6
    def test_multigpu(self):
        mcb = CompactBilinearPooling(2048, 2048, 16000).cuda()
        parallel_mcb = nn.DataParallel(mcb)

        x = torch.autograd.Variable(torch.rand(8, 2048).cuda(),
                                    requires_grad=True)

        # a single input suggests the module defaults y to x (self-fusion)
        z = parallel_mcb(x)

        z.sum().backward()
Example #7
    def __init__(self,
                 backbone_name,
                 label_num,
                 inc=2048,
                 outc=6024,
                 backbone_unfreeze_layers='all'):
        super(CompactBilinearCNNModel, self).__init__()
        self.backbone = Backbone[backbone_name](needs_flat=False)
        unfreeze_backbone(self.backbone, backbone_unfreeze_layers)
        self.mcb = CompactBilinearPooling(inc, inc, outc)
        self.c_bilinear_fc = nn.Linear(outc, label_num)
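With needs_flat=False the backbone presumably returns a spatial feature map, and the usual CBP recipe fuses per location, sum-pools, then applies signed square-root and L2 normalization. A sketch under those assumptions (not the repository's actual forward; it also assumes the pooling layer accepts channels-last inputs with extra leading dimensions):

    def forward(self, images):
        feats = self.backbone(images)                    # assume (B, inc, H, W)
        feats = feats.permute(0, 2, 3, 1)                # channels last
        fused = self.mcb(feats, feats).sum(dim=(1, 2))   # sum over locations
        fused = torch.sign(fused) * torch.sqrt(fused.abs() + 1e-10)
        fused = torch.nn.functional.normalize(fused)
        return self.c_bilinear_fc(fused)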
Example #8
    def __init__(self, opt):
        super(MCBLSTMCore, self).__init__()
        print('using *MCB* LSTMCore')
        self.input_encoding_size = opt.input_encoding_size
        self.rnn_size = opt.rnn_size
        self.drop_prob_lm = opt.drop_prob_lm

        # Build a LSTM
        self.h2h = nn.Linear(self.rnn_size, 5 * self.rnn_size)
        self.dropout = nn.Dropout(self.drop_prob_lm)
        self.mcb = CompactBilinearPooling(opt.rnn_size,
                                          opt.input_encoding_size,
                                          opt.rnn_size).cuda()
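Only the constructor survives here. In the captioning LSTM cores this resembles, a single linear map produces five rnn_size blocks: three sigmoid gates plus a two-way maxout candidate. One hypothetical step consistent with the shapes above (the real forward is not in the snippet):

    def forward(self, xt, state):
        prev_h, prev_c = state[0][-1], state[1][-1]
        fused = self.mcb(prev_h, xt)                    # (B, rnn_size)
        all_sums = self.h2h(fused)                      # (B, 5 * rnn_size)
        gates = torch.sigmoid(all_sums[:, :3 * self.rnn_size])
        in_gate, forget_gate, out_gate = gates.chunk(3, dim=1)
        # maxout over the last two blocks gives the candidate cell state
        in_transform = torch.max(
            all_sums[:, 3 * self.rnn_size:4 * self.rnn_size],
            all_sums[:, 4 * self.rnn_size:])
        next_c = forget_gate * prev_c + in_gate * in_transform
        next_h = out_gate * torch.tanh(next_c)
        return self.dropout(next_h), (next_h.unsqueeze(0), next_c.unsqueeze(0))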
Example #9
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])

    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)

    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att, q_net, v_net,
                     cls_net, attr_net, classifier, mcb)
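BaseModel's forward is not shown; given this builder, the joint step presumably swaps the usual element-wise product for CBP before classification, roughly like this sketch (q_hidden and v_attended are assumed names for the question encoding and the attention-weighted image features):

    joint = mcb(q_net(q_hidden), v_net(v_attended))   # (B, 16000)
    logits = classifier(joint)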
Example #10
    def __init__(self, cfg):
        super(BottomUp, self).__init__()
        self.cfg = cfg
        q_dim = cfg['rnn_dim'] * 2 if cfg['rnn_bidirection'] else cfg['rnn_dim']
        self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
        self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
        self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'],
                                       cfg['rnn_dim'],
                                       cfg['rnn_layer'],
                                       cfg['rnn_type'],
                                       keep_seq=False,
                                       bidirectional=cfg['rnn_bidirection'])
        self.v_att = NewAttention(cfg['v_dim'], q_dim, cfg['fused_dim'])
        if cfg['fuse_type'] == 'LinearSum':
            self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim],
                                              cfg['fused_dim'],
                                              dropout_input=cfg['dropout'])
        elif cfg['fuse_type'] == 'MFB':
            self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim],
                                        cfg['fused_dim'],
                                        mm_dim=1000,
                                        factor=5,
                                        dropout_input=cfg['dropout'])
        elif cfg['fuse_type'] == 'MLB':
            self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim],
                                        cfg['fused_dim'],
                                        mm_dim=2 * cfg['fused_dim'],
                                        dropout_input=cfg['dropout'])
        elif cfg['fuse_type'] == 'MFH':
            self.fuse_net = fusions.MFH([cfg['v_dim'], q_dim],
                                        cfg['fused_dim'],
                                        mm_dim=1000,
                                        factor=5,
                                        dropout_input=cfg['dropout'])
        elif cfg['fuse_type'] == 'MCB':
            from compact_bilinear_pooling import CompactBilinearPooling
            self.fuse_net = CompactBilinearPooling(cfg['v_dim'], q_dim,
                                                   cfg['fused_dim'])
        # self.fuse_net = fusions.MCB([cfg['v_dim'], q_dim], cfg['fused_dim'], dropout_output=cfg['dropout'])
        self.classifier = SimpleClassifier(cfg['fused_dim'],
                                           cfg['classifier_hid_dim'],
                                           cfg['classes'], 0.5)
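One call-site difference worth noting: the block-library fusions are invoked with a list of tensors, while CompactBilinearPooling takes two positional tensors, so a forward presumably branches on fuse_type. A sketch (v_weighted and q are assumed names for the attended visual and question features):

        if self.cfg['fuse_type'] == 'MCB':
            fused = self.fuse_net(v_weighted, q)
        else:
            fused = self.fuse_net([v_weighted, q])
        logits = self.classifier(fused)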
Example #11
    def __init__(self,
                 num_classes=400,
                 spatial_squeeze=True,
                 final_endpoint='Mixed_5c',
                 name='inception_i3d',
                 in_channels=3,
                 dropout_keep_prob=0.5):
        """Initializes I3D model instance.
        Args:
          num_classes: The number of outputs in the logit layer (default 400, which
              matches the Kinetics dataset).
          spatial_squeeze: Whether to squeeze the spatial dimensions for the logits
              before returning (default True).
          final_endpoint: The model contains many possible endpoints.
              `final_endpoint` specifies the last endpoint for the model to be built
              up to. In addition to the output at `final_endpoint`, all the outputs
              at endpoints up to `final_endpoint` will also be returned, in a
              dictionary. `final_endpoint` must be one of
              InceptionI3d.VALID_ENDPOINTS (default 'Mixed_5c' in this variant).
          name: A string (optional). The name of this module.
          in_channels: Number of input channels (default 3).
          dropout_keep_prob: Passed straight to ``nn.Dropout``, which treats it
              as the *drop* probability despite the "keep_prob" name.
        Raises:
          ValueError: if `final_endpoint` is not recognized.
        """
        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError('Unknown final endpoint %s' % final_endpoint)

        super(InceptionI3d, self).__init__()
        self._num_classes = num_classes
        self._spatial_squeeze = spatial_squeeze
        self._final_endpoint = final_endpoint
        self.logits = None

        self.end_points = {}
        end_point = 'Conv3d_1a_7x7'
        self.end_points[end_point] = Unit3D(in_channels=in_channels,
                                            output_channels=64,
                                            kernel_shape=[7, 7, 7],
                                            stride=(2, 2, 2),
                                            padding=(3, 3, 3),
                                            name=name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'MaxPool3d_2a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
        if self._final_endpoint == end_point:
            return

        end_point = 'Conv3d_2b_1x1'
        self.end_points[end_point] = Unit3D(in_channels=64,
                                            output_channels=64,
                                            kernel_shape=[1, 1, 1],
                                            padding=0,
                                            name=name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Conv3d_2c_3x3'
        self.end_points[end_point] = Unit3D(in_channels=64,
                                            output_channels=192,
                                            kernel_shape=[3, 3, 3],
                                            padding=1,
                                            name=name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'MaxPool3d_3a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_3b'
        self.end_points[end_point] = InceptionModule(192,
                                                     [64, 96, 128, 16, 32, 32],
                                                     name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_3c'
        self.end_points[end_point] = InceptionModule(
            256, [128, 128, 192, 32, 96, 64], name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'MaxPool3d_4a_3x3'
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_4b'
        self.end_points[end_point] = InceptionModule(
            128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_4c'
        self.end_points[end_point] = InceptionModule(
            192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_4d'
        self.end_points[end_point] = InceptionModule(
            160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_4e'
        self.end_points[end_point] = InceptionModule(
            128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_4f'
        self.end_points[end_point] = InceptionModule(
            112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128],
            name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'MaxPool3d_5a_2x2'
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_5b'
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128],
            name + end_point)
        if self._final_endpoint == end_point:
            return

        end_point = 'Mixed_5c'
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128],
            name + end_point)
        # if self._final_endpoint == end_point:
        #    return

        end_point = 'Logits'
        self.avg_pool = nn.AvgPool3d(kernel_size=[4, 7, 7], stride=(1, 1, 1))

        self.dropout = nn.Dropout(dropout_keep_prob)
        self.logits = Unit3D(in_channels=384 + 384 + 128 + 128,
                             output_channels=self._num_classes,
                             kernel_shape=[1, 1, 1],
                             padding=0,
                             activation_fn=None,
                             use_batch_norm=False,
                             use_bias=True,
                             name='logits')

        self.cbp = CompactBilinearPooling(832, 2, 832)
        self.bn_cbp = nn.BatchNorm3d(832)
        self.bn_flow = nn.BatchNorm3d(2)
        self.build()
        self.logger = None
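The constructor registers cbp, bn_cbp, and bn_flow but never uses them in this snippet; the 832 channels match the Mixed_4f/MaxPool3d_5a stage and the 2-channel input suggests an optical-flow field. A hypothetical voxel-wise fusion consistent with those shapes (feats and flow are assumed names; the model's real forward is not shown):

        # hypothetical fusion step, assuming the pooling layer broadcasts
        # over leading dimensions
        flow = self.bn_flow(flow)                          # (B, 2, T, H, W)
        fused = self.cbp(feats.permute(0, 2, 3, 4, 1),     # feats: (B, 832, T, H, W)
                         flow.permute(0, 2, 3, 4, 1))
        fused = self.bn_cbp(fused.permute(0, 4, 1, 2, 3))  # back to channels first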
Example #12
    def test_gradients(self):
        cbp = CompactBilinearPooling(128, 128, 160).double()
        x = torch.rand(4, 128).double().requires_grad_()
        y = torch.rand(4, 128).double().requires_grad_()
        self.assertTrue(torch.autograd.gradcheck(cbp, (x, y)))
Example #13
        # (this snippet picks up mid-way through a test_pooling like Example #3's)
        kernel_bp = np.sum(bp_xy * bp_zw, axis=1)

        # Repeat the computation with compact bilinear pooling
        cbp_xy = mcb(x, y).cpu().numpy()
        cbp_zw = mcb(z, w).cpu().numpy()

        kernel_cbp = np.sum(cbp_xy * cbp_zw, axis=1)

        # The ratio between the two dot products should be close to one.
        ratio = kernel_cbp / kernel_bp

        np.testing.assert_almost_equal(ratio, np.ones_like(ratio), decimal=1)

    def test_gradients(self):
        cbp = CompactBilinearPooling(128, 128, 160).double()
        x = torch.rand(4, 128).double().requires_grad_()
        y = torch.rand(4, 128).double().requires_grad_()
        self.assertTrue(torch.autograd.gradcheck(cbp, (x, y)))


if __name__ == '__main__':
    # run the smoke test first: unittest.main() calls sys.exit() by default,
    # so anything placed after it would never execute
    input_size = 2048
    output_size = 16000
    mcb = CompactBilinearPooling(input_size, input_size, output_size).cuda()
    x = torch.rand(4, input_size).cuda()
    y = torch.rand(4, input_size).cuda()
    z = mcb(x, y)
    print(z)   # shape (4, 16000)

    unittest.main()