def test_gradients(self):
    cbp = CompactBilinearPooling(128, 128, 160).cuda()
    x = torch.autograd.Variable(torch.rand(4, 128).cuda(), requires_grad=True)
    y = torch.autograd.Variable(torch.rand(4, 128).cuda(), requires_grad=True)
    # eps=1 gives a coarse finite-difference check; the inputs here are
    # single-precision CUDA tensors.
    self.assertTrue(torch.autograd.gradcheck(cbp, (x, y), eps=1))
def __init__(self, block, layers, num_classes=128):
    self.inplanes = 64
    super(ResNetCBP, self).__init__()
    self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                           bias=False)
    self.bn1 = nn.BatchNorm2d(64)
    self.relu = nn.ReLU(inplace=True)
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
    self.layer1 = self._make_layer(block, 64, layers[0])
    self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
    self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
    self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
    self.avgpool = nn.AvgPool2d(7)
    self.cbp = CompactBilinearPooling(512 * block.expansion,
                                      512 * block.expansion, 8192)
    self.fc_action = nn.Linear(8192, num_classes)

    # He initialization for convolution weights; batch-norm scale/shift start at 1/0.
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            m.weight.data.normal_(0, math.sqrt(2. / n))
        elif isinstance(m, nn.BatchNorm2d):
            m.weight.data.fill_(1)
            m.bias.data.zero_()
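# The snippet above only shows the constructor. A minimal, hypothetical sketch
# of how the compact bilinear layer could sit after the convolutional trunk is
# given below; the self-pooling call self.cbp(x, x) and the overall forward
# structure are assumptions, not code from the original model.
def forward(self, x):
    x = self.maxpool(self.relu(self.bn1(self.conv1(x))))
    x = self.layer4(self.layer3(self.layer2(self.layer1(x))))
    x = self.avgpool(x).view(x.size(0), -1)  # (batch, 512 * block.expansion)
    x = self.cbp(x, x)                       # self-pooling -> (batch, 8192)
    return self.fc_action(x)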
def test_pooling(self):
    mcb = CompactBilinearPooling(2048, 2048, 16000).double().cuda()

    # Create 4 arrays of positive reals
    x = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(), requires_grad=True)
    y = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(), requires_grad=True)
    z = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(), requires_grad=True)
    w = torch.autograd.Variable(torch.rand(4, 2048).double().cuda(), requires_grad=True)

    # Compute the real bilinear pooling for each pair of arrays
    bp_xy = bilinear_pooling(x, y).data.cpu().numpy()
    bp_zw = bilinear_pooling(z, w).data.cpu().numpy()

    # Compute the dot product of the results
    kernel_bp = np.sum(bp_xy * bp_zw, axis=1)

    # Repeat the computation with compact bilinear pooling
    cbp_xy = mcb(x, y).data.cpu().numpy()
    cbp_zw = mcb(z, w).data.cpu().numpy()
    kernel_cbp = np.sum(cbp_xy * cbp_zw, axis=1)

    # The ratio between the two dot products should be close to one.
    ratio = kernel_cbp / kernel_bp
    np.testing.assert_almost_equal(ratio, np.ones_like(ratio), decimal=1)
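# The test above relies on a `bilinear_pooling` reference helper that is not
# shown in this snippet. A minimal sketch of what such a helper typically
# computes, the flattened outer product of the two feature vectors, is given
# below; the exact implementation used by the test is an assumption.
def bilinear_pooling(x, y):
    # x: (batch, d1), y: (batch, d2) -> (batch, d1 * d2)
    return torch.bmm(x.unsqueeze(2), y.unsqueeze(1)).view(x.size(0), -1)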
def __init__(self, LSTM_dim, LSTM_cell_num, LSTM_bidirectional, text_embed_dim,
             image_embed_dim, decoder_dim, num_classes=2, lstm_dropout=0):
    super().__init__()
    self.LSTM = torch.nn.LSTM(input_size=text_embed_dim,
                              hidden_size=LSTM_dim,
                              num_layers=LSTM_cell_num,
                              bidirectional=LSTM_bidirectional,
                              batch_first=True,
                              dropout=lstm_dropout)
    lstm_out = LSTM_dim
    if LSTM_bidirectional:
        lstm_out *= 2
    self.mcb = CompactBilinearPooling(lstm_out, image_embed_dim, decoder_dim)
    self.decoder = torch.nn.Linear(in_features=decoder_dim, out_features=decoder_dim)
    self.classifier = torch.nn.Linear(in_features=decoder_dim, out_features=num_classes)
def __init__(self, opt, use_maxout=False):
    super(MCBTopDownCore, self).__init__()
    self.drop_prob_lm = opt.drop_prob_lm

    self.att_lstm = nn.LSTMCell(opt.input_encoding_size + opt.rnn_size * 2,
                                opt.rnn_size)  # we, fc, h^2_t-1
    self.lang_lstm = nn.LSTMCell(opt.rnn_size * 2, opt.rnn_size)  # h^1_t, \hat v
    self.attention = Attention(opt)

    # print('rnn_size:', opt.rnn_size)                         # 512
    # print('fc_feats:', opt.fc_feat_size)                     # 2048
    # print('input_encoding_size:', opt.input_encoding_size)   # 512

    self.mcb1 = CompactBilinearPooling(
        opt.rnn_size, opt.input_encoding_size,
        opt.input_encoding_size + opt.rnn_size).cuda()
    self.mcb2 = CompactBilinearPooling(
        opt.rnn_size + opt.input_encoding_size, opt.rnn_size,
        3 * opt.rnn_size).cuda()
def test_multigpu(self):
    mcb = CompactBilinearPooling(2048, 2048, 16000).cuda()
    parallel_mcb = nn.DataParallel(mcb)
    x = torch.autograd.Variable(torch.rand(8, 2048).cuda(), requires_grad=True)
    z = parallel_mcb(x)
    z.sum().backward()
def __init__(self, backbone_name, label_num, inc=2048, outc=6024,
             backbone_unfreeze_layers='all'):
    super(CompactBilinearCNNModel, self).__init__()
    self.backbone = Backbone[backbone_name](needs_flat=False)
    unfreeze_backbone(self.backbone, backbone_unfreeze_layers)
    self.mcb = CompactBilinearPooling(inc, inc, outc)
    self.c_bilinear_fc = nn.Linear(outc, label_num)
def __init__(self, opt):
    super(MCBLSTMCore, self).__init__()
    print('using *MCB* LSTMCore')
    self.input_encoding_size = opt.input_encoding_size
    self.rnn_size = opt.rnn_size
    self.drop_prob_lm = opt.drop_prob_lm

    # Build an LSTM
    self.h2h = nn.Linear(self.rnn_size, 5 * self.rnn_size)
    self.dropout = nn.Dropout(self.drop_prob_lm)
    self.mcb = CompactBilinearPooling(opt.rnn_size, opt.input_encoding_size,
                                      opt.rnn_size).cuda()
def build_baseline0_newatt(dataset, num_hid):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, 0.0)
    q_emb = QuestionEmbedding(300, num_hid, 1, False, 0.0)
    v_att = NewAttention(dataset.v_dim, q_emb.num_hid, num_hid)
    cls_att = NewAttention(dataset.cls_dim, q_emb.num_hid, num_hid)
    attr_att = NewAttention(dataset.attr_dim, q_emb.num_hid, num_hid)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    cls_net = FCNet([dataset.cls_dim, num_hid])
    attr_net = FCNet([dataset.attr_dim, num_hid])
    fusion_dim = 16000
    mcb = CompactBilinearPooling(num_hid, num_hid, fusion_dim)
    classifier = SimpleClassifier(fusion_dim, num_hid * 2,
                                  dataset.num_ans_candidates, 0.5)
    return BaseModel(w_emb, q_emb, v_att, cls_att, attr_att,
                     q_net, v_net, cls_net, attr_net, classifier, mcb)
def __init__(self, cfg):
    super(BottomUp, self).__init__()
    self.cfg = cfg
    q_dim = cfg['rnn_dim'] * 2 if cfg['rnn_bidirection'] else cfg['rnn_dim']

    self.w_emb = WordEmbedding(cfg['n_vocab'], cfg['word_embedding_dim'])
    self.w_emb.init_embedding(cfg['word_dic_file'], cfg['embedding_file'])
    self.q_emb = QuestionEmbedding(cfg['word_embedding_dim'], cfg['rnn_dim'],
                                   cfg['rnn_layer'], cfg['rnn_type'],
                                   keep_seq=False,
                                   bidirectional=cfg['rnn_bidirection'])
    self.v_att = NewAttention(cfg['v_dim'], q_dim, cfg['fused_dim'])

    if cfg['fuse_type'] == 'LinearSum':
        self.fuse_net = fusions.LinearSum([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                          dropout_input=cfg['dropout'])
    if cfg['fuse_type'] == 'MFB':
        self.fuse_net = fusions.MFB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=1000, factor=5,
                                    dropout_input=cfg['dropout'])
    if cfg['fuse_type'] == 'MLB':
        self.fuse_net = fusions.MLB([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=2 * cfg['fused_dim'],
                                    dropout_input=cfg['dropout'])
    if cfg['fuse_type'] == 'MFH':
        self.fuse_net = fusions.MFH([cfg['v_dim'], q_dim], cfg['fused_dim'],
                                    mm_dim=1000, factor=5,
                                    dropout_input=cfg['dropout'])
    if cfg['fuse_type'] == 'MCB':
        from compact_bilinear_pooling import CompactBilinearPooling
        self.fuse_net = CompactBilinearPooling(cfg['v_dim'], q_dim, cfg['fused_dim'])
        # self.fuse_net = fusions.MCB([cfg['v_dim'], q_dim], cfg['fused_dim'],
        #                             dropout_output=cfg['dropout'])

    self.classifier = SimpleClassifier(cfg['fused_dim'], cfg['classifier_hid_dim'],
                                       cfg['classes'], 0.5)
def __init__(self, num_classes=400, spatial_squeeze=True,
             final_endpoint='Mixed_5c', name='inception_i3d', in_channels=3,
             dropout_keep_prob=0.5):
    """Initializes I3D model instance.

    Args:
      num_classes: The number of outputs in the logit layer (default 400,
          which matches the Kinetics dataset).
      spatial_squeeze: Whether to squeeze the spatial dimensions for the
          logits before returning (default True).
      final_endpoint: The model contains many possible endpoints.
          `final_endpoint` specifies the last endpoint for the model to be
          built up to. In addition to the output at `final_endpoint`, all
          the outputs at endpoints up to `final_endpoint` will also be
          returned, in a dictionary. `final_endpoint` must be one of
          InceptionI3d.VALID_ENDPOINTS (default 'Mixed_5c').
      name: A string (optional). The name of this module.

    Raises:
      ValueError: if `final_endpoint` is not recognized.
    """
    if final_endpoint not in self.VALID_ENDPOINTS:
        raise ValueError('Unknown final endpoint %s' % final_endpoint)

    super(InceptionI3d, self).__init__()
    self._num_classes = num_classes
    self._spatial_squeeze = spatial_squeeze
    self._final_endpoint = final_endpoint
    self.logits = None

    if self._final_endpoint not in self.VALID_ENDPOINTS:
        raise ValueError('Unknown final endpoint %s' % self._final_endpoint)

    self.end_points = {}

    end_point = 'Conv3d_1a_7x7'
    self.end_points[end_point] = Unit3D(in_channels=in_channels,
                                        output_channels=64,
                                        kernel_shape=[7, 7, 7],
                                        stride=(2, 2, 2), padding=(3, 3, 3),
                                        name=name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'MaxPool3d_2a_3x3'
    self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3],
                                                      stride=(1, 2, 2),
                                                      padding=0)
    if self._final_endpoint == end_point:
        return

    end_point = 'Conv3d_2b_1x1'
    self.end_points[end_point] = Unit3D(in_channels=64, output_channels=64,
                                        kernel_shape=[1, 1, 1], padding=0,
                                        name=name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Conv3d_2c_3x3'
    self.end_points[end_point] = Unit3D(in_channels=64, output_channels=192,
                                        kernel_shape=[3, 3, 3], padding=1,
                                        name=name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'MaxPool3d_3a_3x3'
    self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[1, 3, 3],
                                                      stride=(1, 2, 2),
                                                      padding=0)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_3b'
    self.end_points[end_point] = InceptionModule(192,
                                                 [64, 96, 128, 16, 32, 32],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_3c'
    self.end_points[end_point] = InceptionModule(256,
                                                 [128, 128, 192, 32, 96, 64],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'MaxPool3d_4a_3x3'
    self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[3, 3, 3],
                                                      stride=(2, 2, 2),
                                                      padding=0)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_4b'
    self.end_points[end_point] = InceptionModule(128 + 192 + 96 + 64,
                                                 [192, 96, 208, 16, 48, 64],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_4c'
    self.end_points[end_point] = InceptionModule(192 + 208 + 48 + 64,
                                                 [160, 112, 224, 24, 64, 64],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_4d'
    self.end_points[end_point] = InceptionModule(160 + 224 + 64 + 64,
                                                 [128, 128, 256, 24, 64, 64],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_4e'
    self.end_points[end_point] = InceptionModule(128 + 256 + 64 + 64,
                                                 [112, 144, 288, 32, 64, 64],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_4f'
    self.end_points[end_point] = InceptionModule(112 + 288 + 64 + 64,
                                                 [256, 160, 320, 32, 128, 128],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'MaxPool3d_5a_2x2'
    self.end_points[end_point] = MaxPool3dSamePadding(kernel_size=[2, 2, 2],
                                                      stride=(2, 2, 2),
                                                      padding=0)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_5b'
    self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128,
                                                 [256, 160, 320, 32, 128, 128],
                                                 name + end_point)
    if self._final_endpoint == end_point:
        return

    end_point = 'Mixed_5c'
    self.end_points[end_point] = InceptionModule(256 + 320 + 128 + 128,
                                                 [384, 192, 384, 48, 128, 128],
                                                 name + end_point)
    # if self._final_endpoint == end_point:
    #     return

    end_point = 'Logits'
    self.avg_pool = nn.AvgPool3d(kernel_size=[4, 7, 7], stride=(1, 1, 1))
    self.dropout = nn.Dropout(dropout_keep_prob)
    self.logits = Unit3D(in_channels=384 + 384 + 128 + 128,
                         output_channels=self._num_classes,
                         kernel_shape=[1, 1, 1],
                         padding=0,
                         activation_fn=None,
                         use_batch_norm=False,
                         use_bias=True,
                         name='logits')

    self.cbp = CompactBilinearPooling(832, 2, 832)
    self.bn_cbp = nn.BatchNorm3d(832)
    self.bn_flow = nn.BatchNorm3d(2)

    self.build()
    self.logger = None
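# A minimal construction sketch for the variant above (hypothetical usage, not
# part of the original snippet). Note that self.cbp fuses an 832-channel
# feature stream with a 2-channel stream; given bn_flow, the 2-channel input
# appears to be optical flow, though that is an inference.
i3d_cbp = InceptionI3d(num_classes=400, final_endpoint='Mixed_5c', in_channels=3)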
def test_gradients(self):
    cbp = CompactBilinearPooling(128, 128, 160).double()
    x = torch.rand(4, 128).double().requires_grad_()
    y = torch.rand(4, 128).double().requires_grad_()
    self.assertTrue(torch.autograd.gradcheck(cbp, (x, y)))
    kernel_bp = np.sum(bp_xy * bp_zw, axis=1)

    # Repeat the computation with compact bilinear pooling
    cbp_xy = mcb(x, y).cpu().numpy()
    cbp_zw = mcb(z, w).cpu().numpy()
    kernel_cbp = np.sum(cbp_xy * cbp_zw, axis=1)

    # The ratio between the two dot products should be close to one.
    ratio = kernel_cbp / kernel_bp
    np.testing.assert_almost_equal(ratio, np.ones_like(ratio), decimal=1)

def test_gradients(self):
    cbp = CompactBilinearPooling(128, 128, 160).double()
    x = torch.rand(4, 128).double().requires_grad_()
    y = torch.rand(4, 128).double().requires_grad_()
    self.assertTrue(torch.autograd.gradcheck(cbp, (x, y)))


if __name__ == '__main__':
    unittest.main()


input_size = 2048
output_size = 16000
mcb = CompactBilinearPooling(input_size, input_size, output_size).cuda()

x = torch.rand(4, input_size).cuda()
y = torch.rand(4, input_size).cuda()

z = mcb(x, y)
print(z)