Example no. 1
    def __init__(self,
                 classes,
                 embed_dim,
                 inputs_dim,
                 hidden_dim,
                 direction='backward',
                 dropout=0.2,
                 pass_root=False):
        """
        Initializes the decoder Tree-LSTM
        :param classes: object classes
        :param embed_dim: dimension of the class embeddings
        :param inputs_dim: dimension of the input features
        :param hidden_dim: hidden dim of the decoder
        :param direction: 'backward' (root to children) or 'foreward' (children to root;
            the misspelling is the string the code expects)
        :param dropout: dropout probability
        :param pass_root: whether the root node is passed through the tree LSTM
        """
        super(DecoderHrTreeLSTM, self).__init__()
        self.classes = classes
        self.hidden_size = hidden_dim
        self.inputs_dim = inputs_dim
        self.nms_thresh = 0.5
        self.dropout = dropout
        self.pass_root = pass_root
        # generate the class-embedding layer ('start' token not included here)
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=embed_dim)
        self.obj_embed = nn.Embedding(len(self.classes), embed_dim)
        self.obj_embed.weight.data = embed_vecs
        # generate out layer
        self.out = nn.Linear(self.hidden_size, len(self.classes))
        torch.nn.init.xavier_normal_(self.out.weight, gain=1.0)
        self.out.bias.data.fill_(0.0)

        if direction == 'backward':  # a root to all children
            self.input_size = inputs_dim
            self.decoderTreeLSTM = HrTreeLSTM_Backward(
                self.input_size,
                self.hidden_size,
                self.pass_root,
                is_pass_embed=True,
                embed_layer=self.obj_embed,
                embed_out_layer=self.out)
        elif direction == 'foreward':  # multi children to a root
            self.input_size = inputs_dim
            self.decoderTreeLSTM = HrTreeLSTM_Foreward(
                self.input_size,
                self.hidden_size,
                self.pass_root,
                is_pass_embed=True,
                embed_layer=self.obj_embed,
                embed_out_layer=self.out)
        else:
            raise ValueError('Invalid decoder LSTM direction: {}'.format(direction))
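
A minimal, self-contained sketch of the output-head initialization pattern used above (in-place Xavier init plus a zeroed bias); the sizes below are placeholders, not values from the original code.

import torch
import torch.nn as nn

hidden_dim, n_classes = 512, 151  # placeholder sizes

out = nn.Linear(hidden_dim, n_classes)
nn.init.xavier_normal_(out.weight, gain=1.0)  # in-place Xavier init of the classifier weights
out.bias.data.fill_(0.0)                      # zero the bias, as in the decoder above

h = torch.randn(4, hidden_dim)                # a batch of decoder hidden states
logits = out(h)                               # (4, n_classes) class scores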
Example no. 2
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 embed_dim=200,
                 hidden_dim=256,
                 obj_dim=2048,
                 nl_obj=2,
                 nl_edge=2,
                 dropout_rate=0.2,
                 order='confidence',
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True):
        super(LinearizedContext, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        assert mode in MODES
        self.mode = mode

        self.nl_obj = nl_obj
        self.nl_edge = nl_edge

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.dropout_rate = dropout_rate
        self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
        self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge

        assert order in ('size', 'confidence', 'random', 'leftright')
        self.order = order

        # EMBEDDINGS
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()

        self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed2.weight.data = embed_vecs.clone()

        # This probably doesn't help it much
        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])
        self.conver_fusion_feature = nn.Sequential(*[
            nn.BatchNorm1d(self.embed_dim + 128,
                           momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(self.embed_dim + 128, 4096),  # self.obj_dim +
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])
        self.decoder_lin_ = nn.Linear(4096, self.num_classes)
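
A rough, runnable sketch of the embedding/fusion path defined above: a class embedding and a 4-dimensional box encoding are projected, concatenated, and pushed through the 4096-d fusion layer and the final classifier. The batch-norm momentum, the assumption that boxes arrive as 4 normalized coordinates, and all sizes are placeholders.

import torch
import torch.nn as nn

embed_dim, n_classes = 200, 151          # placeholder sizes
pos_embed = nn.Sequential(
    nn.BatchNorm1d(4, momentum=0.001),   # stand-in for BATCHNORM_MOMENTUM / 10.0
    nn.Linear(4, 128),
    nn.ReLU(inplace=True),
    nn.Dropout(0.1),
)
obj_embed = nn.Embedding(n_classes, embed_dim)
fusion = nn.Sequential(
    nn.BatchNorm1d(embed_dim + 128, momentum=0.001),
    nn.Linear(embed_dim + 128, 4096),
    nn.ReLU(inplace=True),
    nn.Dropout(0.1),
)
decoder_lin = nn.Linear(4096, n_classes)

boxes = torch.rand(8, 4)                              # assumed: box coordinates normalized to [0, 1]
labels = torch.randint(0, n_classes, (8,))
feats = torch.cat((obj_embed(labels), pos_embed(boxes)), dim=1)
obj_logits = decoder_lin(fusion(feats))               # (8, n_classes)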
Example no. 3
    def __init__(self,
                 classes,
                 embed_dim,
                 inputs_dim,
                 hidden_dim,
                 recurrent_dropout_probability=0.2,
                 use_highway=True,
                 use_input_projection_bias=True):
        """
        Initializes the RNN
        :param embed_dim: Dimension of the embeddings
        :param encoder_hidden_dim: Hidden dim of the encoder, for attention purposes
        :param hidden_dim: Hidden dim of the decoder
        :param vocab_size: Number of words in the vocab
        :param bos_token: To use during decoding (non teacher forcing mode))
        :param bos: beginning of sentence token
        :param unk: unknown token (not used)
        """
        super(DecoderRNN, self).__init__()

        self.classes = classes
        embed_vecs = obj_edge_vectors(['start'] + self.classes, wv_dim=100)
        self.obj_embed = nn.Embedding(len(self.classes), embed_dim)
        self.obj_embed.weight.data = embed_vecs
        self.hidden_size = hidden_dim
        self.inputs_dim = inputs_dim
        self.nms_thresh = 0.3

        self.recurrent_dropout_probability = recurrent_dropout_probability
        self.use_highway = use_highway
        # We do the projections for all the gates all at once, so if we are
        # using highway layers, we need some extra projections, which is
        # why the sizes of the Linear layers change here depending on this flag.
        if use_highway:
            self.input_linearity = torch.nn.Linear(
                self.input_size,
                6 * self.hidden_size,
                bias=use_input_projection_bias)
            self.state_linearity = torch.nn.Linear(self.hidden_size,
                                                   5 * self.hidden_size,
                                                   bias=True)
        else:
            self.input_linearity = torch.nn.Linear(
                self.input_size,
                4 * self.hidden_size,
                bias=use_input_projection_bias)
            self.state_linearity = torch.nn.Linear(self.hidden_size,
                                                   4 * self.hidden_size,
                                                   bias=True)

        self.out = nn.Linear(self.hidden_size, len(self.classes))
        self.reset_parameters()
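
The comment above explains why the projections are 6x and 5x the hidden size when highway connections are used. Below is a self-contained sketch of one cell step under that layout; the gate ordering is an assumption in the style of AllenNLP-like highway LSTMs, not taken from the original code.

import torch
import torch.nn as nn

input_size, hidden = 512, 256                       # placeholder sizes
input_linearity = nn.Linear(input_size, 6 * hidden, bias=True)
state_linearity = nn.Linear(hidden, 5 * hidden, bias=True)

x = torch.randn(4, input_size)                      # inputs at one timestep
h_prev, c_prev = torch.zeros(4, hidden), torch.zeros(4, hidden)

pi = input_linearity(x)                             # 6 chunks: i, f, g, o, highway gate, highway input
ps = state_linearity(h_prev)                        # 5 chunks: i, f, g, o, highway gate
i = torch.sigmoid(pi[:, 0 * hidden:1 * hidden] + ps[:, 0 * hidden:1 * hidden])
f = torch.sigmoid(pi[:, 1 * hidden:2 * hidden] + ps[:, 1 * hidden:2 * hidden])
g = torch.tanh(pi[:, 2 * hidden:3 * hidden] + ps[:, 2 * hidden:3 * hidden])
o = torch.sigmoid(pi[:, 3 * hidden:4 * hidden] + ps[:, 3 * hidden:4 * hidden])
r = torch.sigmoid(pi[:, 4 * hidden:5 * hidden] + ps[:, 4 * hidden:5 * hidden])  # highway gate
c = f * c_prev + i * g
h = o * torch.tanh(c)
out = r * h + (1 - r) * pi[:, 5 * hidden:6 * hidden]  # mix LSTM output with the projected input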
Example no. 4
    def __init__(self, classes, mode='sgdet', embed_dim=20, obj_dim=4096):
        super(LC, self).__init__()
        self.classes = classes
        self.embed_dim = embed_dim
        self.obj_dim = obj_dim
        self.mode = mode
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()

        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])

        self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128,
                                     self.num_classes)
Example no. 5
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgcls',
                 embed_dim=200,
                 hidden_dim=256,
                 obj_dim=4096,
                 pooling_dim=4096,
                 ctx_dim=512):
        super(LinearizedContext, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        assert mode in MODES
        self.mode = mode
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.pooling_dim = pooling_dim
        self.ctx_dim = ctx_dim

        # EMBEDDINGS
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)  # K0
        self.obj_embed.weight.data = embed_vecs.clone()
        self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)  # K1
        self.obj_embed2.weight.data = embed_vecs.clone()

        # Object-Relational Embedding
        self.RE1 = RelationalEmbedding(input_dim=self.obj_dim +
                                       self.embed_dim + self.ctx_dim,
                                       output_dim=self.hidden_dim)
        self.RE2 = RelationalEmbedding(input_dim=self.hidden_dim,
                                       output_dim=self.num_classes)

        # Edge-Relational Embedding
        self.RE3 = RelationalEmbedding(input_dim=self.embed_dim +
                                       self.hidden_dim,
                                       output_dim=self.hidden_dim)
        self.RE4 = RelationalEmbedding(input_dim=self.hidden_dim,
                                       output_dim=self.pooling_dim * 2)
Example no. 6
    def __init__(self, classes, rel_classes, mode='sgdet',
                 embed_dim=200, obj_dim=2048, order='confidence'):
        super(O_NODE, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        assert mode in MODES
        self.mode = mode

        self.embed_dim = embed_dim

        self.obj_dim = obj_dim

        # ---------- add sgdet NMS settings
        self.nms_filter_duplicates = True
        self.max_per_img = 64  # default 64
        self.thresh = 0.01  # 0.001 for training, 0.01 for testing

        assert order in ('size', 'confidence', 'random', 'leftright')
        self.order = order

        # EMBEDDINGS
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()


        # This probably doesn't help it much
        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])

        self.decoder_lin1 = nn.Linear(self.obj_dim + self.embed_dim + 128, 1024)
        self.decoder_lin2 = odeBlock(odeFunc2(use_cuda=True))
        self.decoder_lin3 = nn.Linear(1024, self.num_classes)
Example no. 7
    def __init__(self, method, embed_objs, subj_pred_obj_pairs,
                 L=0.2, topk=5, alpha=2, uniform=False, degree_smoothing=1,
                 data_dir=None, obj_classes=None, triplet2str=None):

        self.method = method
        if embed_objs is None:
            embed_objs = obj_edge_vectors(obj_classes,
                                          wv_dir=data_dir,
                                          wv_dim=200,
                                          avg_words=True)[0]
            embed_objs = embed_objs / torch.norm(embed_objs, 2, dim=1, keepdim=True)

        self.obj_pairwise = pairwise_similarity(embed_objs)
        self.subj_pred_obj_pairs = subj_pred_obj_pairs
        self.L = L
        self.topk = topk
        self.alpha = alpha
        self.uniform = uniform
        self.degree_smoothing = degree_smoothing
        self.n_obj_classes = self.obj_pairwise.shape[0]
        self.obj_classes = obj_classes
        self.triplet2str = triplet2str
        if self.method == 'neigh':
            assert self.topk > 0, self.topk
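
pairwise_similarity is not shown in this snippet; since the embeddings above are L2-normalized row-wise, a plausible minimal equivalent is a cosine-similarity matrix, sketched here with random placeholder embeddings.

import torch

emb = torch.randn(151, 200)                           # placeholder object-class embeddings
emb = emb / torch.norm(emb, 2, dim=1, keepdim=True)   # row-wise L2 normalization, as above
sim = emb @ emb.t()                                   # (151, 151) cosine similarities
topk_vals, topk_idx = sim.topk(5, dim=1)              # e.g. the 5 most similar classes per class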
Example no. 8
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 num_gpus=1,
                 use_vision=False,
                 require_overlap_det=True,
                 embed_dim=200,
                 hidden_dim=4096,
                 use_resnet=False,
                 thresh=0.01,
                 use_proposals=False,
                 use_bias=True,
                 limit_vision=True,
                 depth_model=None,
                 pretrained_depth=False,
                 active_features=None,
                 frozen_features=None,
                 use_embed=False,
                 **kwargs):
        """
        :param classes: object classes
        :param rel_classes: relationship classes. None if we're not using rel mode
        :param mode: (sgcls, predcls, or sgdet)
        :param num_gpus: how many GPUs to use
        :param use_vision: enable the contribution of union of bounding boxes
        :param require_overlap_det: whether two objects must intersect
        :param embed_dim: word2vec embeddings dimension
        :param hidden_dim: dimension of the fusion hidden layer
        :param use_resnet: use resnet as faster-rcnn's backbone
        :param thresh: faster-rcnn related threshold (Threshold for calling it a good box)
        :param use_proposals: whether to use region proposal candidates
        :param use_bias: enable frequency bias
        :param limit_vision: use truncated version of UoBB features
        :param depth_model: provided architecture for depth feature extraction
        :param pretrained_depth: whether the depth feature extractor should be initialized with ImageNet weights
        :param active_features: what set of features should be enabled (e.g. 'vdl' : visual, depth, and location features)
        :param frozen_features: what set of features should be frozen (e.g. 'd' : depth)
        :param use_embed: use word2vec embeddings
        """
        RelModelBase.__init__(self, classes, rel_classes, mode, num_gpus,
                              require_overlap_det, active_features,
                              frozen_features)
        self.pooling_size = 7
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = 2048 if use_resnet else 4096
        self.use_vision = use_vision
        self.use_bias = use_bias
        self.limit_vision = limit_vision

        # -- Store depth related parameters
        assert depth_model in DEPTH_MODELS
        self.depth_model = depth_model
        self.pretrained_depth = pretrained_depth
        self.depth_pooling_dim = DEPTH_DIMS[self.depth_model]
        self.use_embed = use_embed
        self.detector = nn.Module()
        features_size = 0

        # -- Check whether ResNet is selected as faster-rcnn's backbone
        if use_resnet:
            raise ValueError(
                "The current model does not support ResNet as the Faster-RCNN's backbone."
            )
        """ *** DIFFERENT COMPONENTS OF THE PROPOSED ARCHITECTURE *** 
        This is the part where the different components of the proposed relation detection 
        architecture are defined. In the case of RGB images, we have class probability distribution
        features, visual features, and the location ones. If we are considering depth images as well,
        we augment depth features too. """

        # -- Visual features
        if self.has_visual:
            # -- Define faster R-CNN network and its related feature extractors
            self.detector = ObjectDetector(
                classes=classes,
                mode=('proposals' if use_proposals else 'refinerels')
                if mode == 'sgdet' else 'gtbox',
                use_resnet=use_resnet,
                thresh=thresh,
                max_per_img=64,
            )
            self.roi_fmap_obj = load_vgg(pretrained=False).classifier

            # -- Define union features
            if self.use_vision:
                # -- UoBB pooling module
                self.union_boxes = UnionBoxesAndFeats(
                    pooling_size=self.pooling_size,
                    stride=16,
                    dim=1024 if use_resnet else 512)

                # -- UoBB feature extractor
                roi_fmap = [
                    Flattener(),
                    load_vgg(use_dropout=False,
                             use_relu=False,
                             use_linear=self.hidden_dim == 4096,
                             pretrained=False).classifier,
                ]
                if self.hidden_dim != 4096:
                    roi_fmap.append(nn.Linear(4096, self.hidden_dim))
                self.roi_fmap = nn.Sequential(*roi_fmap)

            # -- Define visual features hidden layer
            self.visual_hlayer = nn.Sequential(*[
                xavier_init(nn.Linear(self.obj_dim * 2, self.FC_SIZE_VISUAL)),
                nn.ReLU(inplace=True),
                nn.Dropout(0.8)
            ])
            self.visual_scale = ScaleLayer(1.0)
            features_size += self.FC_SIZE_VISUAL

        # -- Location features
        if self.has_loc:
            # -- Define location features hidden layer
            self.location_hlayer = nn.Sequential(*[
                xavier_init(nn.Linear(self.LOC_INPUT_SIZE, self.FC_SIZE_LOC)),
                nn.ReLU(inplace=True),
                nn.Dropout(0.1)
            ])
            self.location_scale = ScaleLayer(1.0)
            features_size += self.FC_SIZE_LOC

        # -- Class features
        if self.has_class:
            if self.use_embed:
                # -- Define class embeddings
                embed_vecs = obj_edge_vectors(self.classes,
                                              wv_dim=self.embed_dim)
                self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
                self.obj_embed.weight.data = embed_vecs.clone()

            classme_input_dim = self.embed_dim if self.use_embed else self.num_classes
            # -- Define Class features hidden layer
            self.classme_hlayer = nn.Sequential(*[
                xavier_init(
                    nn.Linear(classme_input_dim * 2, self.FC_SIZE_CLASS)),
                nn.ReLU(inplace=True),
                nn.Dropout(0.1)
            ])
            self.classme_scale = ScaleLayer(1.0)
            features_size += self.FC_SIZE_CLASS

        # -- Depth features
        if self.has_depth:
            # -- Initialize depth backbone
            self.depth_backbone = DepthCNN(depth_model=self.depth_model,
                                           pretrained=self.pretrained_depth)

            # -- Create a relation head which is used to carry on the feature extraction
            # from RoIs of depth features
            self.depth_rel_head = self.depth_backbone.get_classifier()

            # -- Define depth features hidden layer
            self.depth_rel_hlayer = nn.Sequential(*[
                xavier_init(
                    nn.Linear(self.depth_pooling_dim * 2, self.FC_SIZE_DEPTH)),
                nn.ReLU(inplace=True),
                nn.Dropout(0.6),
            ])
            self.depth_scale = ScaleLayer(1.0)
            features_size += self.FC_SIZE_DEPTH

        # -- Initialize frequency bias if needed
        if self.use_bias:
            self.freq_bias = FrequencyBias()

        # -- *** Fusion layer *** --
        # -- A hidden layer for concatenated features (fusion features)
        self.fusion_hlayer = nn.Sequential(*[
            xavier_init(nn.Linear(features_size, self.hidden_dim)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1)
        ])

        # -- Final FC layer which predicts the relations
        self.rel_out = xavier_init(
            nn.Linear(self.hidden_dim, self.num_rels, bias=True))

        # -- Freeze the user specified features
        if self.frz_visual:
            self.freeze_module(self.detector)
            self.freeze_module(self.roi_fmap_obj)
            self.freeze_module(self.visual_hlayer)
            if self.use_vision:
                self.freeze_module(self.roi_fmap)
                self.freeze_module(self.union_boxes.conv)

        if self.frz_class:
            self.freeze_module(self.classme_hlayer)

        if self.frz_loc:
            self.freeze_module(self.location_hlayer)

        if self.frz_depth:
            self.freeze_module(self.depth_backbone)
            self.freeze_module(self.depth_rel_head)
            self.freeze_module(self.depth_rel_hlayer)
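
The comment block above describes how visual, location, class, and depth features are fused, with features_size accumulating the per-branch hidden sizes before the fusion layer. A toy, self-contained sketch of that final fusion step, assuming all four branches are active; the FC_SIZE_* values and the per-branch activations are placeholders.

import torch
import torch.nn as nn

FC_VIS, FC_LOC, FC_CLS, FC_DEP = 512, 256, 256, 512   # placeholders for the FC_SIZE_* constants
hidden_dim, num_rels, n_pairs = 4096, 51, 6

visual_h = torch.randn(n_pairs, FC_VIS)      # would come from visual_hlayer and visual_scale
location_h = torch.randn(n_pairs, FC_LOC)    # from location_hlayer and location_scale
classme_h = torch.randn(n_pairs, FC_CLS)     # from classme_hlayer and classme_scale
depth_h = torch.randn(n_pairs, FC_DEP)       # from depth_rel_hlayer and depth_scale

features_size = FC_VIS + FC_LOC + FC_CLS + FC_DEP
fusion_hlayer = nn.Sequential(
    nn.Linear(features_size, hidden_dim),
    nn.ReLU(inplace=True),
    nn.Dropout(0.1),
)
rel_out = nn.Linear(hidden_dim, num_rels, bias=True)

rel_dists = rel_out(fusion_hlayer(
    torch.cat((visual_h, location_h, classme_h, depth_h), dim=1)))  # (n_pairs, num_rels)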
Example no. 9
    def __init__(self, classes, rel_classes, mode='sgdet',
                 embed_dim=200, hidden_dim=256, obj_dim=2048,
                 nl_obj=2, nl_edge=2, dropout_rate=0.2, order='confidence',
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True):
        super(LinearizedContext, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        assert mode in MODES
        self.mode = mode

        self.nl_obj = nl_obj
        self.nl_edge = nl_edge

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.dropout_rate = dropout_rate
        self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
        self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge

        assert order in ('size', 'confidence', 'random', 'leftright')
        self.order = order
        # print('LIN CONTEXT : Start')
        # EMBEDDINGS
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()

        self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed2.weight.data = embed_vecs.clone()
        # print('LIN CONTEXT : 0')
        # This probably doesn't help it much
        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])
        # print('LIN CONTEXT : 1')
        if self.nl_obj > 0:
            # print('LIN CONTEXT : 1.1')
            self.obj_ctx_rnn = AlternatingHighwayLSTM(
                input_size=self.obj_dim+self.embed_dim+128,
                hidden_size=self.hidden_dim,
                num_layers=self.nl_obj,
                recurrent_dropout_probability=dropout_rate)
            # print('LIN CONTEXT : 1.5')
            decoder_inputs_dim = self.hidden_dim
            if self.pass_in_obj_feats_to_decoder:
                decoder_inputs_dim += self.obj_dim + self.embed_dim
            self.decoder_rnn = DecoderRNN(self.classes, embed_dim=self.embed_dim,
                                          inputs_dim=decoder_inputs_dim,
                                          hidden_dim=self.hidden_dim,
                                          recurrent_dropout_probability=dropout_rate)
        else:
            self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128, self.num_classes)
        # print('LIN CONTEXT : 2')
        if self.nl_edge > 0:
            input_dim = self.embed_dim
            if self.nl_obj > 0:
                input_dim += self.hidden_dim
            if self.pass_in_obj_feats_to_edge:
                input_dim += self.obj_dim
            self.edge_ctx_rnn = AlternatingHighwayLSTM(input_size=input_dim,
                                                       hidden_size=self.hidden_dim,
                                                       num_layers=self.nl_edge,
                                                       recurrent_dropout_probability=dropout_rate)
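
A quick dimension check of the decoder input assembled above: with pass_in_obj_feats_to_decoder=True the decoder consumes the object-context output concatenated with the raw object and embedding features. How the forward pass builds that tensor is an assumption; only the sizes are taken from the code.

import torch

obj_dim, embed_dim, hidden_dim, n_obj = 2048, 200, 256, 10

obj_ctx = torch.randn(n_obj, hidden_dim)                   # output of obj_ctx_rnn
obj_feats = torch.randn(n_obj, obj_dim + embed_dim)        # ROI features + class embedding

decoder_inputs = torch.cat((obj_ctx, obj_feats), dim=1)
assert decoder_inputs.shape[1] == hidden_dim + obj_dim + embed_dim  # == decoder_inputs_dim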
Example no. 10
    def __init__(
            self,
            obj_classes,
            rel_classes,
            embed_dim=200,
            hidden_dim=64,
            n_ch=512,
            pool_sz=7,
            fmap_sz=38,
            losses=('D', 'G', 'rec'),
            SN=True,
            BN=True,
            n_layers_G=5,
            vis_cond=None,
            init_embed=False,
            largeD=False,
            data_dir='',  # to load word embeddings
            device='cuda'):
        """
        :param embed_dim: Dimension for all embeddings
        :param obj_dim:
        """
        super(GAN, self).__init__()
        self.obj_classes = obj_classes
        self.rel_classes = rel_classes
        self.embed_dim = embed_dim
        self.n_ch = n_ch
        self.obj_dim = pool_sz**2 * n_ch
        self.pool_sz = pool_sz
        self.fmap_sz = fmap_sz
        self.losses = losses
        self.SN = SN
        self.BN = BN
        self.vis_cond = vis_cond
        self.largeD = largeD
        self.h5_data = None
        self.device = device
        if vis_cond is not None:
            self.h5_data = h5py.File(vis_cond, mode='r')

        self.G_obj_embed = nn.Embedding(len(self.obj_classes), self.embed_dim)
        self.G_rel_embed = nn.Embedding(len(self.rel_classes), self.embed_dim)

        if SN:
            conv = lambda n_in, n_out, ks, pad: spectral_norm(
                nn.Conv2d(n_in, n_out, kernel_size=ks, padding=pad))
        else:
            conv = lambda n_in, n_out, ks, pad: nn.Conv2d(
                n_in, n_out, kernel_size=ks, padding=pad)

        def cond_discriminator(n_classes):  # input is 512x7x7
            return nn.Sequential(
                conv(n_ch + n_classes, n_ch // 2, 3, 0),  # ->256x5x5
                nn.ReLU(),
                conv(n_ch // 2, n_ch // 4, 3, 0),  # ->128x3x3
                nn.ReLU(),
                conv(n_ch // 4, n_ch // 8, 1, 0),  # ->64x3x3
                nn.ReLU(),
                conv(n_ch // 8, 1, 3, 0),  # ->1x1x1
                nn.Flatten())

        # Discriminators (must start with D_) ----------------------------------
        self.D_nodes = cond_discriminator(len(self.obj_classes))
        self.D_edges = cond_discriminator(len(self.rel_classes))
        self.D_global = nn.Sequential(
            conv(n_ch, n_ch // 2, 3, 0),  # 512x38x38->256x36x36
            nn.LeakyReLU(0.2),
            conv(n_ch // 2, n_ch //
                 2, 1, 0) if largeD else nn.Identity(),  # ->256x36x36
            nn.LeakyReLU(0.2) if largeD else nn.Identity(),
            nn.AvgPool2d(2, ceil_mode=True)
            if fmap_sz > 24 else nn.Identity(),  # ->256x18x18
            conv(n_ch // 2, n_ch // 2, 3, 0),  # ->256x16x16
            nn.LeakyReLU(0.2),
            conv(n_ch // 2, n_ch //
                 2, 1, 0) if largeD else nn.Identity(),  # ->256x16x16
            nn.LeakyReLU(0.2) if largeD else nn.Identity(),
            nn.AvgPool2d(2),  # ->256x8x8
            conv(n_ch // 2, n_ch // 4, 3, 0),  # ->128x6x6
            nn.LeakyReLU(0.2),
            conv(n_ch // 4, n_ch //
                 4, 1, 0) if largeD else nn.Identity(),  # ->128x6x6
            nn.LeakyReLU(0.2) if largeD else nn.Identity(),
            nn.AvgPool2d(2),  # ->128x3x3
            conv(n_ch // 4, 1, 3, 0),  # ->128x1x1
            nn.Flatten())
        print('Global Discriminator:', self.D_global)

        # Generators (must start with G_) --------------------------------------

        # Graph Convolutional Network (returns 32x7x7 features)
        self.G_gcn = GraphTripleConvNet(
            input_dim=self.embed_dim + 4,
            input_edge_dim=self.embed_dim,
            output_dim=hidden_dim // 2 * pool_sz * pool_sz,
            num_layers=n_layers_G,
            hidden_dim=hidden_dim,
            pooling='avg',
            mlp_normalization='batch' if BN else 'none')

        # Post process GCN features with conv layers to make them more "spatial"
        self.G_node = nn.Sequential(
            nn.Conv2d(hidden_dim // 2, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
            nn.ReLU())

        # To transform hidden features concatenated with visual features
        self.G_proj = nn.Conv2d(hidden_dim + int(vis_cond is not None) * n_ch,
                                hidden_dim,
                                kernel_size=1)

        # To generate large global features
        self.G_refine = RefinementNetwork(dims=(hidden_dim, n_ch // 4,
                                                n_ch // 2, n_ch),
                                          normalization='batch',
                                          activation='leakyrelu-0.2')

        # Predefine 0,1 labels to prevent resource consuming creation of large vectors for every batch during training
        n_max = 50000  # some random big number
        self.y_real_, self.y_fake_ = torch.ones(n_max,
                                                1).to(device), torch.zeros(
                                                    n_max, 1).to(device)
        self.y_real = lambda n: Variable(self.y_real_[:n])
        self.y_fake = lambda n: Variable(self.y_fake_[:n])

        # Load the Glove-based language model to use for SG perturbations and initializing the GAN input embeddings
        embed_objs, word_vectors = obj_edge_vectors(self.obj_classes,
                                                    wv_dir=data_dir,
                                                    wv_dim=embed_dim,
                                                    word_vectors=None,
                                                    avg_words=True)

        self.embed_objs = (
            embed_objs /
            torch.norm(embed_objs, 2, dim=1, keepdim=True)).to(device)

        if init_embed:
            # Initialize learnable GAN embeddings with the Glove ones
            # Using this led to worse results in our experiments, so we don't use it
            assert self.G_obj_embed.weight.shape == self.embed_objs.shape, (
                self.G_obj_embed.weight.shape, self.embed_objs.shape)
            self.G_obj_embed.weight.data = self.embed_objs.clone()

            embed_rels = obj_edge_vectors(self.rel_classes,
                                          wv_dim=embed_dim,
                                          word_vectors=word_vectors,
                                          avg_words=True)[0]

            self.embed_rels = (
                embed_rels /
                torch.norm(embed_rels, 2, dim=1, keepdim=True)).to(device)
            assert self.G_rel_embed.weight.shape == self.embed_rels.shape, (
                self.G_rel_embed.weight.shape, self.embed_rels.shape)
            self.G_rel_embed.weight.data = self.embed_rels.clone()
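
cond_discriminator takes 512x7x7 ROI features concatenated with a per-class conditioning map. The tiling of a one-hot label map over the 7x7 grid below is an assumption about how that conditioning is built; the convolution stack mirrors the one above, without spectral norm.

import torch
import torch.nn as nn

n_ch, n_classes, pool_sz = 512, 151, 7
feats = torch.randn(4, n_ch, pool_sz, pool_sz)            # per-node ROI features
labels = torch.randint(0, n_classes, (4,))

# assumed conditioning: tile a one-hot class map over the 7x7 grid and concatenate
onehot = torch.zeros(4, n_classes, 1, 1).scatter_(1, labels.view(4, 1, 1, 1), 1.0)
cond_in = torch.cat((feats, onehot.expand(-1, -1, pool_sz, pool_sz)), dim=1)

D_nodes = nn.Sequential(                                   # same shapes as cond_discriminator
    nn.Conv2d(n_ch + n_classes, n_ch // 2, 3), nn.ReLU(),  # ->256x5x5
    nn.Conv2d(n_ch // 2, n_ch // 4, 3), nn.ReLU(),         # ->128x3x3
    nn.Conv2d(n_ch // 4, n_ch // 8, 1), nn.ReLU(),         # ->64x3x3
    nn.Conv2d(n_ch // 8, 1, 3), nn.Flatten())              # ->1x1x1
score = D_nodes(cond_in)                                   # (4, 1) real/fake logits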
Example no. 11
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 num_gpus=1,
                 use_vision=True,
                 require_overlap_det=True,
                 embed_dim=200,
                 hidden_dim=256,
                 pooling_dim=2048,
                 nl_obj=1,
                 nl_edge=2,
                 use_resnet=False,
                 order='confidence',
                 thresh=0.01,
                 use_proposals=False,
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True,
                 rec_dropout=0.0,
                 use_bias=True,
                 use_tanh=True,
                 limit_vision=True):
        """
        :param classes: Object classes
        :param rel_classes: Relationship classes. None if we're not using rel mode
        :param mode: (sgcls, predcls, or sgdet)
        :param num_gpus: how many GPUs to use
        :param use_vision: Whether to use vision in the final product
        :param require_overlap_det: Whether two objects must intersect
        :param embed_dim: Dimension for all embeddings
        :param hidden_dim: LSTM hidden size
        :param obj_dim:
        """
        super(RelModel, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        self.num_gpus = num_gpus
        assert mode in MODES
        self.mode = mode

        self.pooling_size = 7
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = 2048 if use_resnet else 4096
        self.pooling_dim = pooling_dim

        self.use_bias = use_bias
        self.use_vision = use_vision
        self.use_tanh = use_tanh
        self.limit_vision = limit_vision
        self.require_overlap = require_overlap_det and self.mode == 'sgdet'
        self.hook_for_grad = False
        self.gradients = []

        self.detector = ObjectDetector(
            classes=classes,
            mode=('proposals' if use_proposals else 'refinerels')
            if mode == 'sgdet' else 'gtbox',
            use_resnet=use_resnet,
            thresh=thresh,
            max_per_img=64,
        )
        self.ort_embedding = torch.autograd.Variable(
            get_ort_embeds(self.num_classes, 200).cuda())
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()

        # This probably doesn't help it much
        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        ])

        self.context = LinearizedContext(
            self.classes,
            self.rel_classes,
            mode=self.mode,
            embed_dim=self.embed_dim,
            hidden_dim=self.hidden_dim,
            obj_dim=self.obj_dim,
            nl_obj=nl_obj,
            nl_edge=nl_edge,
            dropout_rate=rec_dropout,
            order=order,
            pass_in_obj_feats_to_decoder=pass_in_obj_feats_to_decoder,
            pass_in_obj_feats_to_edge=pass_in_obj_feats_to_edge)

        # Image Feats (You'll have to disable if you want to turn off the features from here)
        self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size,
                                              stride=16,
                                              dim=1024 if use_resnet else 512)

        self.merge_obj_feats = nn.Sequential(
            nn.Linear(self.obj_dim + self.embed_dim + 128, self.hidden_dim),
            nn.ReLU())

        # self.trans = nn.Sequential(nn.Linear(self.hidden_dim, self.hidden_dim//4),
        #                             LayerNorm(self.hidden_dim//4), nn.ReLU(),
        #                             nn.Linear(self.hidden_dim//4, self.hidden_dim))

        self.get_phr_feats = nn.Linear(self.pooling_dim, self.hidden_dim)

        self.embeddings4lstm = nn.Embedding(self.num_classes, self.embed_dim)

        self.lstm = nn.LSTM(input_size=self.hidden_dim + self.embed_dim,
                            hidden_size=self.hidden_dim,
                            num_layers=1)

        self.obj_mps1 = Message_Passing4OBJ(self.hidden_dim)
        # self.obj_mps2 = Message_Passing4OBJ(self.hidden_dim)
        self.get_boxes_encode = Boxes_Encode(64)

        if use_resnet:
            self.roi_fmap = nn.Sequential(
                resnet_l4(relu_end=False),
                nn.AvgPool2d(self.pooling_size),
                Flattener(),
            )
        else:
            roi_fmap = [
                Flattener(),
                load_vgg(use_dropout=False,
                         use_relu=False,
                         use_linear=pooling_dim == 4096,
                         pretrained=False).classifier,
            ]
            if pooling_dim != 4096:
                roi_fmap.append(nn.Linear(4096, pooling_dim))
            self.roi_fmap = nn.Sequential(*roi_fmap)
            self.roi_fmap_obj = load_vgg(pretrained=False).classifier

        ###################################
        # self.obj_classify_head = nn.Linear(self.pooling_dim, self.num_classes)

        # self.post_emb_s = nn.Linear(self.pooling_dim, self.pooling_dim//2)
        # self.post_emb_s.weight = torch.nn.init.xavier_normal(self.post_emb_s.weight, gain=1.0)
        # self.post_emb_o = nn.Linear(self.pooling_dim, self.pooling_dim//2)
        # self.post_emb_o.weight = torch.nn.init.xavier_normal(self.post_emb_o.weight, gain=1.0)
        # self.merge_obj_high = nn.Linear(self.hidden_dim, self.pooling_dim//2)
        # self.merge_obj_high.weight = torch.nn.init.xavier_normal(self.merge_obj_high.weight, gain=1.0)
        # self.merge_obj_low = nn.Linear(self.pooling_dim + 5 + self.embed_dim, self.pooling_dim//2)
        # self.merge_obj_low.weight = torch.nn.init.xavier_normal(self.merge_obj_low.weight, gain=1.0)
        # self.rel_compress = nn.Linear(self.pooling_dim//2 + 64, self.num_rels, bias=True)
        # self.rel_compress.weight = torch.nn.init.xavier_normal(self.rel_compress.weight, gain=1.0)
        # self.freq_gate = nn.Linear(self.pooling_dim//2 + 64, self.num_rels, bias=True)
        # self.freq_gate.weight = torch.nn.init.xavier_normal(self.freq_gate.weight, gain=1.0)

        self.post_emb_s = nn.Linear(self.pooling_dim, self.pooling_dim)
        torch.nn.init.xavier_normal_(self.post_emb_s.weight, gain=1.0)
        self.post_emb_o = nn.Linear(self.pooling_dim, self.pooling_dim)
        torch.nn.init.xavier_normal_(self.post_emb_o.weight, gain=1.0)
        self.merge_obj_high = nn.Linear(self.hidden_dim, self.pooling_dim)
        torch.nn.init.xavier_normal_(self.merge_obj_high.weight, gain=1.0)
        self.merge_obj_low = nn.Linear(self.pooling_dim + 5 + self.embed_dim,
                                       self.pooling_dim)
        torch.nn.init.xavier_normal_(self.merge_obj_low.weight, gain=1.0)
        self.rel_compress = nn.Linear(self.pooling_dim + 64,
                                      self.num_rels,
                                      bias=True)
        torch.nn.init.xavier_normal_(self.rel_compress.weight, gain=1.0)
        self.freq_gate = nn.Linear(self.pooling_dim + 64,
                                   self.num_rels,
                                   bias=True)
        torch.nn.init.xavier_normal_(self.freq_gate.weight, gain=1.0)
        # self.ranking_module = nn.Sequential(nn.Linear(self.pooling_dim + 64, self.hidden_dim), nn.ReLU(), nn.Linear(self.hidden_dim, 1))
        if self.use_bias:
            self.freq_bias = FrequencyBias()
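
How rel_compress, freq_gate and the frequency bias are combined happens in the forward pass, which is not shown; the gated combination below is an assumption, and the frequency table is a random stand-in for FrequencyBias.

import torch
import torch.nn as nn

pooling_dim, num_rels, n_classes, n_pairs = 4096, 51, 151, 6

rel_feats = torch.randn(n_pairs, pooling_dim + 64)        # union features + 64-d box encoding
rel_compress = nn.Linear(pooling_dim + 64, num_rels, bias=True)
freq_gate = nn.Linear(pooling_dim + 64, num_rels, bias=True)

# stand-in for FrequencyBias: a per-(subject, object) class-pair prior over predicates
freq_table = torch.randn(n_classes * n_classes, num_rels)
pair_labels = torch.randint(0, n_classes, (n_pairs, 2))
freq_prior = freq_table[pair_labels[:, 0] * n_classes + pair_labels[:, 1]]

# assumed combination: visual logits plus a gated frequency prior
rel_dists = rel_compress(rel_feats) + torch.sigmoid(freq_gate(rel_feats)) * freq_prior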
Example no. 12
    def __init__(self, classes, rel_classes, mode='sgdet',
                 embed_dim=200, hidden_dim=256, obj_dim=2048,
                 nl_obj=2, nl_edge=2, dropout_rate=0.2, order='confidence',
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True,
                 use_rl_tree=True, draw_tree=False):
        super(LinearizedContext, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        assert mode in MODES
        self.mode = mode

        self.nl_obj = nl_obj
        self.nl_edge = nl_edge

        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.dropout_rate = dropout_rate
        self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
        self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge
        self.use_rl_tree = use_rl_tree
        self.draw_tree = draw_tree

        assert order in ('size', 'confidence', 'random', 'leftright')
        self.order = order

        # EMBEDDINGS
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()
        self.virtual_node_embed = nn.Embedding(1, self.embed_dim) # used to encode Root Node

        self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed2.weight.data = embed_vecs.clone()

        # This probably doesn't help it much
        self.pos_embed = nn.Sequential(*[
            nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
            nn.Linear(4, 128),
            nn.ReLU(inplace=True),
            #nn.Dropout(0.1),
        ])

        # generate tree
        self.rl_input_size = 256
        self.rl_hidden_size = 256
        self.feat_preprocess_net = gen_tree.RLFeatPreprocessNet(self.obj_dim, self.embed_dim, 8, 6, self.rl_input_size)
        self.rl_sub = nn.Linear(self.rl_input_size, self.rl_hidden_size)
        self.rl_obj = nn.Linear(self.rl_input_size, self.rl_hidden_size)
        self.rl_scores = nn.Linear(self.rl_hidden_size * 3 + 3, 1)  # (left child score, right child score)
        # init
        torch.nn.init.xavier_normal_(self.rl_sub.weight, gain=1.0)
        self.rl_sub.bias.data.zero_()
        torch.nn.init.xavier_normal_(self.rl_obj.weight, gain=1.0)
        self.rl_obj.bias.data.zero_()
        torch.nn.init.xavier_normal_(self.rl_scores.weight, gain=1.0)
        self.rl_scores.bias.data.zero_()

        # whether draw tree
        if self.draw_tree:
            self.draw_tree_count = 0
            self.draw_tree_max = 600

        if self.nl_obj > 0:
            self.obj_tree_lstm = tree_lstm.MultiLayer_BTreeLSTM(self.obj_dim+self.embed_dim+128, self.hidden_dim, self.nl_obj, dropout_rate)

            decoder_inputs_dim = self.hidden_dim
            if self.pass_in_obj_feats_to_decoder:
                decoder_inputs_dim += self.obj_dim + self.embed_dim

            self.decoder_tree_lstm = DecoderTreeLSTM(classes, embed_dim=100,  # embed_dim=self.embed_dim
                                                     inputs_dim=decoder_inputs_dim,
                                                     hidden_dim=self.hidden_dim,
                                                     direction='backward',
                                                     dropout=dropout_rate,
                                                     pass_root=False,
                                                     not_rl=not self.use_rl_tree)
        else:
            self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128, self.num_classes)

        if self.nl_edge > 0:
            input_dim = self.embed_dim
            if self.nl_obj > 0:
                input_dim += self.hidden_dim
            if self.pass_in_obj_feats_to_edge:
                input_dim += self.obj_dim

            self.edge_tree_lstm = tree_lstm.MultiLayer_BTreeLSTM(input_dim, self.hidden_dim, self.nl_edge, dropout_rate)
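
rl_sub, rl_obj and rl_scores above are presumably used to score candidate parent/child attachments while building the tree. A toy scoring step is sketched below; the elementwise-product term and the three extra scalar features are guesses that only serve to match the rl_hidden_size * 3 + 3 input width.

import torch
import torch.nn as nn

rl_input_size, rl_hidden_size, n_obj = 256, 256, 5
rl_sub = nn.Linear(rl_input_size, rl_hidden_size)
rl_obj = nn.Linear(rl_input_size, rl_hidden_size)
rl_scores = nn.Linear(rl_hidden_size * 3 + 3, 1)

feats = torch.randn(n_obj, rl_input_size)          # output of feat_preprocess_net
i, j = 0, 1                                        # one candidate (parent, child) pair
sub_h, obj_h = rl_sub(feats[i]), rl_obj(feats[j])
extra = torch.zeros(3)                             # placeholder pairwise features (e.g. geometry)
pair_feat = torch.cat((sub_h, obj_h, sub_h * obj_h, extra))
score = rl_scores(pair_feat)                       # scalar attachment score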
Example no. 13
    def __init__(self,
                 classes,
                 rel_classes,
                 embed_dim,
                 obj_dim,
                 inputs_dim,
                 hidden_dim,
                 pooling_dim,
                 recurrent_dropout_probability=0.2,
                 use_highway=True,
                 use_input_projection_bias=True,
                 use_vision=True,
                 use_bias=True,
                 use_tanh=True,
                 limit_vision=True,
                 sl_pretrain=False,
                 num_iter=-1):
        """
        Initializes the RNN
        :param embed_dim: Dimension of the embeddings
        :param encoder_hidden_dim: Hidden dim of the encoder, for attention purposes
        :param hidden_dim: Hidden dim of the decoder
        :param vocab_size: Number of words in the vocab
        :param bos_token: To use during decoding (non teacher forcing mode))
        :param bos: beginning of sentence token
        :param unk: unknown token (not used)
        """
        super(DecoderRNN, self).__init__()

        self.rel_embedding_dim = 100
        self.classes = classes
        self.rel_classes = rel_classes
        embed_vecs = obj_edge_vectors(['start'] + self.classes, wv_dim=100)
        self.obj_embed = nn.Embedding(len(self.classes), embed_dim)
        self.obj_embed.weight.data = embed_vecs

        embed_rels = obj_edge_vectors(self.rel_classes,
                                      wv_dim=self.rel_embedding_dim)
        self.rel_embed = nn.Embedding(len(self.rel_classes),
                                      self.rel_embedding_dim)
        self.rel_embed.weight.data = embed_rels

        self.embed_dim = embed_dim
        self.obj_dim = obj_dim
        self.hidden_size = hidden_dim
        self.inputs_dim = inputs_dim
        self.pooling_dim = pooling_dim
        self.nms_thresh = 0.3

        self.use_vision = use_vision
        self.use_bias = use_bias
        self.use_tanh = use_tanh
        self.limit_vision = limit_vision
        self.sl_pretrain = sl_pretrain
        self.num_iter = num_iter

        self.recurrent_dropout_probability = recurrent_dropout_probability
        self.use_highway = use_highway
        # We do the projections for all the gates all at once, so if we are
        # using highway layers, we need some extra projections, which is
        # why the sizes of the Linear layers change here depending on this flag.
        if use_highway:
            self.input_linearity = torch.nn.Linear(
                self.input_size,
                6 * self.hidden_size,
                bias=use_input_projection_bias)
            self.state_linearity = torch.nn.Linear(self.hidden_size,
                                                   5 * self.hidden_size,
                                                   bias=True)
        else:
            self.input_linearity = torch.nn.Linear(
                self.input_size,
                4 * self.hidden_size,
                bias=use_input_projection_bias)
            self.state_linearity = torch.nn.Linear(self.hidden_size,
                                                   4 * self.hidden_size,
                                                   bias=True)

        # self.obj_in_lin = torch.nn.Linear(self.rel_embedding_dim, self.rel_embedding_dim, bias=True)

        self.out = nn.Linear(self.hidden_size, len(self.classes))
        self.reset_parameters()

        # For relation predication
        embed_vecs2 = obj_edge_vectors(self.classes, wv_dim=embed_dim)
        self.obj_embed2 = nn.Embedding(self.num_classes, embed_dim)
        self.obj_embed2.weight.data = embed_vecs2.clone()

        # self.post_lstm = nn.Linear(self.hidden_dim, self.pooling_dim * 2)
        self.post_lstm = nn.Linear(self.obj_dim + 2 * self.embed_dim + 128,
                                   self.pooling_dim * 2)
        # Initialize to sqrt(1/2n) so that the outputs all have mean 0 and variance 1.
        # (Half of the contribution comes from the LSTM, half from the embedding.
        # In practice the pre-LSTM activations tend to have stdev 0.1, so this is multiplied by 10.)
        self.post_lstm.weight.data.normal_(
            0, 10.0 * math.sqrt(1.0 / self.hidden_size))  # NOTE: this scale may need more consideration
        self.post_lstm.bias.data.zero_()

        self.rel_compress = nn.Linear(self.pooling_dim,
                                      self.num_rels,
                                      bias=True)
        torch.nn.init.xavier_normal_(self.rel_compress.weight, gain=1.0)
        if self.use_bias:
            self.freq_bias = FrequencyBias()

            # simple relation model
            from dataloaders.visual_genome import VG
            from lib.get_dataset_counts import get_counts, box_filter
            fg_matrix, bg_matrix = get_counts(train_data=VG.splits(
                num_val_im=5000,
                filter_non_overlap=True,
                filter_duplicate_rels=True,
                use_proposals=False)[0],
                                              must_overlap=True)
            prob_matrix = fg_matrix.astype(np.float32)
            prob_matrix[:, :, 0] = bg_matrix

            # TRYING SOMETHING NEW.
            prob_matrix[:, :, 0] += 1
            prob_matrix /= np.sum(prob_matrix, 2)[:, :, None]
            # prob_matrix /= float(fg_matrix.max())

            prob_matrix[:, :, 0] = 0  # Zero out BG
            self.prob_matrix = prob_matrix
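
The statistics assembled at the end of this example act as a per-pair predicate prior. A toy reconstruction with random counts (the real counts come from get_counts over the training split):

import numpy as np

n_classes, num_rels = 151, 51
fg_matrix = np.random.randint(0, 20, size=(n_classes, n_classes, num_rels)).astype(np.float32)
bg_matrix = np.random.randint(0, 20, size=(n_classes, n_classes)).astype(np.float32)

prob_matrix = fg_matrix
prob_matrix[:, :, 0] = bg_matrix       # background counts go into predicate index 0
prob_matrix[:, :, 0] += 1              # smoothing, as above
prob_matrix /= np.sum(prob_matrix, 2)[:, :, None]
prob_matrix[:, :, 0] = 0               # zero out the background predicate

subj_cls, obj_cls = 5, 17                    # placeholder class indices
rel_prior = prob_matrix[subj_cls, obj_cls]   # distribution over foreground predicates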
Example no. 14
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 num_gpus=1,
                 use_vision=True,
                 require_overlap_det=False,
                 embed_dim=200,
                 hidden_dim=256,
                 obj_dim=2048,
                 pooling_dim=4096,
                 nl_obj=1,
                 nl_edge=2,
                 use_resnet=True,
                 order='confidence',
                 thresh=0.01,
                 use_proposals=False,
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True,
                 rec_dropout=0.0,
                 use_bias=True,
                 use_tanh=True,
                 limit_vision=True,
                 spatial_dim=128,
                 mp_iter_num=1,
                 trim_graph=True):
        """
        Args:
            mp_iter_num: integer, number of message passing iterations
            trim_graph: boolean, whether to trim the graph in the relation proposal network
        """
        super(FckModel, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        self.num_gpus = num_gpus
        assert mode in MODES
        self.mode = mode

        self.pooling_size = 7
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.pooling_dim = 2048 if use_resnet else 4096
        self.spatial_dim = spatial_dim

        self.use_bias = use_bias
        self.use_vision = use_vision
        self.use_tanh = use_tanh
        self.limit_vision = limit_vision
        self.require_overlap = require_overlap_det and self.mode == 'sgdet'

        self.mp_iter_num = mp_iter_num
        self.trim_graph = trim_graph

        classes_word_vec = obj_edge_vectors(self.classes, wv_dim=embed_dim)
        self.classes_word_embedding = nn.Embedding(self.num_classes, embed_dim)
        self.classes_word_embedding.weight.data = classes_word_vec.clone()
        self.classes_word_embedding.weight.requires_grad = False

        #fg_matrix, bg_matrix = get_counts()
        #rel_obj_distribution = fg_matrix / (fg_matrix.sum(2)[:, :, None] + 1e-5)
        #rel_obj_distribution = torch.FloatTensor(rel_obj_distribution)
        #rel_obj_distribution = rel_obj_distribution.view(-1, self.num_rels)
        #
        #self.rel_obj_distribution = nn.Embedding(rel_obj_distribution.size(0), self.num_rels)
        ## (#obj_class * #obj_class, #rel_class)
        #self.rel_obj_distribution.weight.data = rel_obj_distribution

        if mode == 'sgdet':
            if use_proposals:
                obj_detector_mode = 'proposals'
            else:
                obj_detector_mode = 'refinerels'
        else:
            obj_detector_mode = 'gtbox'

        self.detector = ObjectDetector(
            classes=classes,
            mode=obj_detector_mode,
            use_resnet=use_resnet,
            thresh=thresh,
            max_per_img=64,
        )

        self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size,
                                              stride=16,
                                              dim=1024 if use_resnet else 512,
                                              use_feats=False)
        self.spatial_fc = nn.Sequential(*[
            nn.Linear(4, spatial_dim),
            nn.BatchNorm1d(spatial_dim, momentum=BATCHNORM_MOMENTUM / 10.),
            nn.ReLU(inplace=True)
        ])
        self.word_fc = nn.Sequential(*[
            nn.Linear(2 * embed_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim, momentum=BATCHNORM_MOMENTUM / 10.),
            nn.ReLU(inplace=True)
        ])
        # union box feats
        feats_dim = obj_dim + spatial_dim + hidden_dim
        self.relpn_fc = nn.Linear(feats_dim, 2)
        self.relcnn_fc1 = nn.Sequential(
            *[nn.Linear(feats_dim, feats_dim),
              nn.ReLU(inplace=True)])

        # v2 model---------
        self.box_mp_fc = nn.Sequential(*[
            nn.Linear(obj_dim, obj_dim),
        ])
        self.sub_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])

        self.obj_rel_mp_fc = nn.Sequential(*[
            nn.Linear(feats_dim, obj_dim),
        ])

        self.mp_atten_fc = nn.Sequential(*[
            nn.Linear(feats_dim + obj_dim, obj_dim),
            nn.ReLU(inplace=True),
            nn.Linear(obj_dim, 1)
        ])
        # v2 model----------

        self.cls_fc = nn.Linear(obj_dim, self.num_classes)

        self.relcnn_fc2 = nn.Linear(feats_dim, self.num_rels)

        # v3 model -----------

        self.mem_module = MemoryRNN(classes=classes,
                                    rel_classes=rel_classes,
                                    inputs_dim=feats_dim,
                                    hidden_dim=hidden_dim,
                                    recurrent_dropout_probability=.0)
        # v3 model -----------

        if use_resnet:
            # deprecated branch
            self.roi_fmap = nn.Sequential(
                resnet_l4(relu_end=False),
                nn.AvgPool2d(self.pooling_size),
                Flattener(),
            )
        else:
            roi_fmap = [
                load_vgg(
                    use_dropout=False,
                    use_relu=False,
                    use_linear=self.obj_dim == 4096,
                    pretrained=False,
                ).classifier,
                nn.Linear(self.pooling_dim, self.obj_dim)
            ]
            self.roi_fmap = nn.Sequential(*roi_fmap)
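
relpn_fc scores each candidate object pair with two logits, and trim_graph suggests pruning unlikely edges before message passing. A minimal sketch of that proposal-and-trim step under those assumptions; the top-k value is a placeholder.

import torch
import torch.nn as nn

obj_dim, spatial_dim, hidden_dim = 2048, 128, 256
feats_dim, n_cand = obj_dim + spatial_dim + hidden_dim, 40

rel_feats = torch.randn(n_cand, feats_dim)          # fused per-candidate-pair features
relpn_fc = nn.Linear(feats_dim, 2)

rel_scores = torch.softmax(relpn_fc(rel_feats), dim=1)[:, 1]   # P(pair has a relation)
keep = rel_scores.topk(min(20, n_cand)).indices     # "trim_graph": keep only the top edges
trimmed_feats = rel_feats[keep]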
Example no. 15
    def __init__(self,
                 vocabs,
                 vocab_size,
                 input_encoding_size,
                 rnn_type='lstm',
                 rnn_size=512,
                 num_layers=1,
                 drop_prob_lm=0.5,
                 seq_length=16,
                 seq_per_img=5,
                 fc_feat_size=4096,
                 att_feat_size=512,
                 num_relation=20,
                 object_classes=None,
                 predicate_classes=None,
                 triplet_embed_dim=-1,
                 embed_triplet=True,
                 freq_bl=False):
        super(RelCaptionModel, self).__init__()
        self.vocabs = vocabs
        self.vocabs['0'] = '__SENTSIGN__'  ## ix
        self.vocabs = {i: self.vocabs[str(i)] for i in range(len(self.vocabs))}
        vocab_list = [self.vocabs[i] for i in range(len(self.vocabs))]
        self.vocab_size = vocab_size + 1  # including all the words and <UNK>, and 0 for <start>/<end>

        self.input_encoding_size = input_encoding_size
        self.rnn_type = rnn_type
        self.rnn_size = rnn_size
        self.num_layers = num_layers
        self.drop_prob_lm = drop_prob_lm
        self.seq_length = seq_length
        self.fc_feat_size = fc_feat_size
        self.ss_prob = 0.0  # Schedule sampling probability
        self.num_relation_per_img = num_relation
        self.seq_per_img = seq_per_img
        self.embed_triplet = embed_triplet
        self.triplet_embed_dim = triplet_embed_dim

        self.freq_bl = freq_bl

        self.linear = nn.Linear(self.fc_feat_size, self.num_layers *
                                self.rnn_size)  # feature to rnn_size
        embed_vec = obj_edge_vectors(vocab_list,
                                     wv_dim=self.input_encoding_size)
        self.embed = nn.Embedding(self.vocab_size, self.input_encoding_size)
        self.embed.weight.data = embed_vec.clone()

        if self.embed_triplet:
            assert object_classes is not None and predicate_classes is not None
            object_embed_vec = obj_edge_vectors(object_classes,
                                                wv_dim=self.triplet_embed_dim)
            predicate_embed_vec = obj_edge_vectors(
                predicate_classes, wv_dim=self.triplet_embed_dim)
            self.object_embed = nn.Embedding(len(object_classes),
                                             self.triplet_embed_dim)
            self.object_embed.weight.data = object_embed_vec.clone()
            self.predicate_embed = nn.Embedding(len(predicate_classes),
                                                self.triplet_embed_dim)
            self.predicate_embed.weight.data = predicate_embed_vec.clone()

        self.logit = nn.Linear(self.rnn_size, self.vocab_size)
        self.dropout = nn.Dropout(self.drop_prob_lm)

        self.core = RelCaptionCore(input_encoding_size, rnn_type, rnn_size,
                                   num_layers, drop_prob_lm, fc_feat_size,
                                   att_feat_size, triplet_embed_dim,
                                   embed_triplet)

        if self.freq_bl:
            self.freq_matrix, _ = get_counts(train_data=VG200(
                mode='train', filter_duplicate_rels=False, num_val_im=1000),
                                             must_overlap=True)
        else:
            self.freq_matrix = None

        self.init_weights()
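
How the object and predicate embeddings enter the captioning core is not shown here; a common choice, assumed below, is to encode each detected (subject, predicate, object) triplet by concatenating the three embedding vectors.

import torch
import torch.nn as nn

triplet_embed_dim, n_obj_cls, n_pred_cls = 300, 151, 51    # placeholder sizes
object_embed = nn.Embedding(n_obj_cls, triplet_embed_dim)
predicate_embed = nn.Embedding(n_pred_cls, triplet_embed_dim)

# one detected relation triplet: (subject class, predicate class, object class)
subj, pred, obj = torch.tensor([12]), torch.tensor([31]), torch.tensor([57])
triplet_feat = torch.cat(
    (object_embed(subj), predicate_embed(pred), object_embed(obj)), dim=1)  # (1, 3 * triplet_embed_dim)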
Esempio n. 17
0
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 num_gpus=1,
                 require_overlap_det=True,
                 embed_dim=200,
                 use_resnet=False,
                 order='confidence',
                 thresh=0.01,
                 use_proposals=False):
        """
        :param classes: Object classes
        :param rel_classes: Relationship classes. None if we're not using rel mode
        :param mode: (sgcls, predcls, or sgdet)
        """
        super(NODIS, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        self.num_gpus = num_gpus
        assert mode in MODES
        self.mode = mode

        self.pooling_size = 7
        self.embed_dim = embed_dim

        self.obj_dim = 2048 if use_resnet else 4096

        self.order = 'random'

        self.require_overlap = require_overlap_det and self.mode == 'sgdet'

        self.detector = ObjectDetector(
            classes=classes,
            mode=('proposals' if use_proposals else 'refinerels') if mode == 'sgdet' else 'gtbox',
            use_resnet=use_resnet,
            thresh=thresh,
            max_per_img=64,
        )

        self.context = O_NODE(self.classes, self.rel_classes, mode=self.mode,
                              embed_dim=self.embed_dim, obj_dim=self.obj_dim,
                              order=order)

        # Image features (disable this module if you want to turn off the union-box features)
        self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size, stride=16,
                                              dim=1024 if use_resnet else 512)

        if use_resnet:
            self.roi_fmap = nn.Sequential(
                resnet_l4(relu_end=False),
                nn.AvgPool2d(self.pooling_size),
                Flattener(),
            )
        else:
            self.roi_fmap_obj = load_vgg(pretrained=False).classifier
            self.roi_avg_pool = nn.AvgPool2d(kernel_size=7)  # stride defaults to the kernel size
        ###################################
        embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
        self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed.weight.data = embed_vecs.clone()

        self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
        self.obj_embed2.weight.data = embed_vecs.clone()

        self.lstm_visual = nn.LSTM(input_size=1536, hidden_size=512)
        self.lstm_semantic = nn.LSTM(input_size=400, hidden_size=512)
        self.odeBlock = odeBlock(odeFunc1(bidirectional=True))

        self.fc_predicate = nn.Sequential(nn.Linear(1024, 512),
                                          nn.ReLU(inplace=False),
                                          nn.Linear(512, 51),  # 51 predicate classes (incl. background)
                                          nn.ReLU(inplace=False))
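The declared dimensions suggest how this relation head fits together: visual features (1536-d) and semantic features (400-d, i.e. two 200-d class embeddings) each run through an LSTM with a 512-d hidden state, and the 1024-d concatenation feeds fc_predicate, which outputs 51 predicate logits. The sketch below is an assumption; the real forward pass also routes features through odeBlock, which is omitted here, so this only illustrates how the sizes line up.

import torch
import torch.nn as nn

N = 12                                        # number of object pairs in one image (assumed)
lstm_visual = nn.LSTM(input_size=1536, hidden_size=512)
lstm_semantic = nn.LSTM(input_size=400, hidden_size=512)
fc_predicate = nn.Sequential(nn.Linear(1024, 512), nn.ReLU(),
                             nn.Linear(512, 51), nn.ReLU())

vis_seq = torch.randn(N, 1, 1536)             # per-pair visual features, (seq_len, batch, feat)
sem_seq = torch.randn(N, 1, 400)              # subject/object embeddings, 2 * embed_dim

vis_out, _ = lstm_visual(vis_seq)             # -> (N, 1, 512)
sem_out, _ = lstm_semantic(sem_seq)           # -> (N, 1, 512)
pair_feats = torch.cat([vis_out, sem_out], dim=-1).squeeze(1)  # -> (N, 1024)
rel_logits = fc_predicate(pair_feats)         # -> (N, 51) predicate scores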
Esempio n. 18
0
    def __init__(self,
                 classes,
                 rel_classes,
                 mode='sgdet',
                 num_gpus=1,
                 use_vision=True,
                 require_overlap_det=True,
                 embed_dim=200,
                 hidden_dim=256,
                 obj_dim=2048,
                 pooling_dim=4096,
                 nl_obj=1,
                 nl_edge=2,
                 use_resnet=True,
                 order='confidence',
                 thresh=0.01,
                 use_proposals=False,
                 pass_in_obj_feats_to_decoder=True,
                 pass_in_obj_feats_to_edge=True,
                 rec_dropout=0.0,
                 use_bias=True,
                 use_tanh=True,
                 limit_vision=True,
                 spatial_dim=128,
                 graph_constrain=True,
                 mp_iter_num=1):
        """
        Args:
            mp_iter_num: integer, number of message passing iteration
        """
        super(FckModel, self).__init__()
        self.classes = classes
        self.rel_classes = rel_classes
        self.num_gpus = num_gpus
        assert mode in MODES
        self.mode = mode

        self.pooling_size = 7
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.obj_dim = obj_dim
        self.pooling_dim = 2048 if use_resnet else 4096
        self.spatial_dim = spatial_dim

        self.use_bias = use_bias
        self.use_vision = use_vision
        self.use_tanh = use_tanh
        self.limit_vision = limit_vision
        self.require_overlap = require_overlap_det and self.mode == 'sgdet'

        self.graph_cons = graph_constrain
        self.mp_iter_num = mp_iter_num

        classes_word_vec = obj_edge_vectors(self.classes, wv_dim=embed_dim)
        self.classes_word_embedding = nn.Embedding(self.num_classes, embed_dim)
        self.classes_word_embedding.weight.data = classes_word_vec.clone()
        self.classes_word_embedding.weight.requires_grad = False

        # the last column is a dirty bit
        self.rel_mem = nn.Embedding(self.num_rels, self.obj_dim + 1)
        self.rel_mem.weight.data[:, -1] = 0

        if mode == 'sgdet':
            if use_proposals:
                obj_detector_mode = 'proposals'
            else:
                obj_detector_mode = 'refinerels'
        else:
            obj_detector_mode = 'gtbox'

        self.detector = ObjectDetector(
            classes=classes,
            mode=obj_detector_mode,
            use_resnet=use_resnet,
            thresh=thresh,
            max_per_img=64,
        )
        self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size,
                                              stride=16,
                                              dim=1024 if use_resnet else 512,
                                              use_feats=False)
        self.spatial_fc = nn.Sequential(*[
            nn.Linear(4, spatial_dim),
            nn.BatchNorm1d(spatial_dim, momentum=BATCHNORM_MOMENTUM / 10.),
            nn.ReLU(inplace=True)
        ])
        self.word_fc = nn.Sequential(*[
            nn.Linear(2 * embed_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim, momentum=BATCHNORM_MOMENTUM / 10.),
            nn.ReLU(inplace=True)
        ])
        # union box feats
        feats_dim = obj_dim + spatial_dim + hidden_dim
        self.relpn_fc = nn.Linear(feats_dim, 2)
        self.relcnn_fc1 = nn.Sequential(
            *[nn.Linear(feats_dim, feats_dim),
              nn.ReLU(inplace=True)])
        self.box_mp_fc = nn.Sequential(*[
            nn.Linear(obj_dim, obj_dim),
        ])
        self.sub_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])

        self.obj_rel_mp_fc = nn.Sequential(*[
            nn.Linear(feats_dim, obj_dim),
        ])

        self.mp_atten_fc = nn.Sequential(*[
            nn.Linear(feats_dim + obj_dim, obj_dim),
            nn.ReLU(inplace=True),
            nn.Linear(obj_dim, 1)
        ])

        self.cls_fc = nn.Linear(obj_dim, self.num_classes)
        self.relcnn_fc2 = nn.Linear(
            feats_dim, self.num_rels if self.graph_cons else 2 * self.num_rels)

        if use_resnet:
            # deprecated
            self.roi_fmap = nn.Sequential(
                resnet_l4(relu_end=False),
                nn.AvgPool2d(self.pooling_size),
                Flattener(),
            )
        else:
            roi_fmap = [
                load_vgg(
                    use_dropout=False,
                    use_relu=False,
                    use_linear=self.obj_dim == 4096,
                    pretrained=False,
                ).classifier,
                nn.Linear(self.pooling_dim, self.obj_dim)
            ]
            self.roi_fmap = nn.Sequential(*roi_fmap)
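A minimal sketch (assumed usage, not the model's actual forward pass) of how the relation-proposal branch above can score candidate object pairs: union-box appearance features, spatial features from spatial_fc, and class-embedding features from word_fc are concatenated into a feats_dim-dimensional pair feature, and relpn_fc turns it into a two-way relation/no-relation score. BatchNorm layers are omitted for brevity.

import torch
import torch.nn as nn

obj_dim, spatial_dim, hidden_dim, embed_dim = 2048, 128, 256, 200   # constructor defaults
feats_dim = obj_dim + spatial_dim + hidden_dim

spatial_fc = nn.Sequential(nn.Linear(4, spatial_dim), nn.ReLU())            # BatchNorm omitted
word_fc = nn.Sequential(nn.Linear(2 * embed_dim, hidden_dim), nn.ReLU())    # BatchNorm omitted
relpn_fc = nn.Linear(feats_dim, 2)

P = 20                                         # number of candidate pairs (assumed)
union_feats = torch.randn(P, obj_dim)          # pooled union-box appearance features
rel_boxes = torch.rand(P, 4)                   # normalized union-box coordinates
pair_embeds = torch.randn(P, 2 * embed_dim)    # subject + object class embeddings

pair_feats = torch.cat([union_feats,
                        spatial_fc(rel_boxes),
                        word_fc(pair_embeds)], dim=1)         # -> (P, feats_dim)
rel_prob = relpn_fc(pair_feats).softmax(dim=1)[:, 1]          # P(pair has a relation)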