Beispiel #1
0
  def __init__(self, input_dim, output_dim=None, hidden_dim=512,
               pooling='avg', mlp_normalization='none',model_type=None):
    """Graph attention convolution layer.

    Args:
      input_dim: size of the incoming object vectors.
      output_dim: size of the produced vectors; defaults to input_dim.
      hidden_dim: width of the internal MLPs.
      pooling: neighbourhood aggregation, either 'sum' or 'avg'.
      mlp_normalization: batch-norm option forwarded to build_mlp.
      model_type: unused here; kept for interface compatibility.
    """
    super(GraphAttnConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim

    assert pooling in ('sum', 'avg'), 'Invalid pooling "%s"' % pooling
    self.pooling = pooling

    # Edge MLP over concatenated (subject, predicate, object) vectors.
    edge_dims = [d for d in (3 * input_dim, hidden_dim,
                             2 * hidden_dim + output_dim) if d is not None]
    self.net1 = build_mlp(edge_dims, batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)

    # Node MLP refining pooled per-object features into output vectors.
    self.net2 = build_mlp([hidden_dim, hidden_dim, output_dim],
                          batch_norm=mlp_normalization)
    self.net2.apply(_init_weights)

    self.initial_obj_projection_layer = nn.Linear(self.output_dim,
                                                  self.hidden_dim)
    nn.init.kaiming_normal_(self.initial_obj_projection_layer.weight)

    # Linear transform applied when computing attention similarities.
    self.W_sim = nn.Linear(self.hidden_dim, self.hidden_dim)
    nn.init.kaiming_normal_(self.W_sim.weight)
Beispiel #2
0
    def __init__(self,
                 obj_input_dim,
                 object_output_dim,
                 predicate_input_dim,
                 predicate_output_dim,
                 hidden_dim,
                 num_attributes,
                 pooling='avg',
                 mlp_normalization='none',
                 predicates_transitive_weights=None,
                 return_new_p_vecs=True):
        """One graph convolution step over (subject, predicate, object) triples.

        Args:
            obj_input_dim: size of incoming object vectors.
            object_output_dim: size of outgoing object vectors.
            predicate_input_dim: size of incoming predicate vectors.
            predicate_output_dim: size of outgoing predicate vectors.
            hidden_dim: width of the internal MLPs.
            num_attributes: number of attribute categories (stored only).
            pooling: 'sum' or 'avg' neighbourhood aggregation.
            mlp_normalization: batch-norm option for build_mlp.
            predicates_transitive_weights: optional learned predicate weights.
            return_new_p_vecs: whether updated predicate vectors are emitted.
        """
        super(GraphTripleConv, self).__init__()

        self.return_new_p_vecs = return_new_p_vecs
        self.hidden_dim = hidden_dim
        self.num_attributes = num_attributes
        self.predicate_output_dim = predicate_output_dim
        assert pooling in ('sum', 'avg'), 'Invalid pooling "%s"' % pooling
        self.pooling = pooling

        # net1 consumes one concatenated triple and produces candidate
        # subject/object features plus the new predicate vector.
        triple_in = 2 * obj_input_dim + predicate_input_dim
        triple_out = 2 * hidden_dim + self.predicate_output_dim
        self.net1 = build_mlp([triple_in, hidden_dim, triple_out],
                              batch_norm=mlp_normalization,
                              final_nonlinearity='relu')
        self.net1.apply(_init_weights)

        # net2 maps pooled per-object features to the output object vectors.
        self.net2 = build_mlp([hidden_dim, hidden_dim, object_output_dim],
                              batch_norm=mlp_normalization,
                              final_nonlinearity='relu')
        self.net2.apply(_init_weights)
        self.predicates_transitive_weights = predicates_transitive_weights
Beispiel #3
0
    def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
               gconv_dim=128, gconv_hidden_dim=512,
               gconv_pooling='avg', gconv_num_layers=5,
               refinement_dims=(1024, 512, 256, 128, 64),
               normalization='batch', activation='leakyrelu-0.2',
               mask_size=None, mlp_normalization='none', layout_noise_dim=32,
               context_embedding_dim=0, **kwargs):
        """Scene-graph-to-image model: embeddings, GCN stack, box/mask heads
        and a cascaded refinement network with optional context channels.
        """
        super(Sg2ImModelGB, self).__init__()

        # Arguments dropped in older revisions (vec_noise_dim, gconv_mode,
        # box_anchor, decouple_obj_predictions) still arrive here sometimes;
        # warn instead of failing.
        if kwargs:
            print('WARNING: Model got unexpected kwargs ', kwargs)

        self.vocab = vocab
        self.image_size = image_size
        self.layout_noise_dim = layout_noise_dim
        self.context_embedding_dim = context_embedding_dim

        n_objects = len(vocab['object_idx_to_name'])
        n_predicates = len(vocab['pred_idx_to_name'])
        self.embedding = SGEmbedding(n_objects, n_predicates, embedding_dim,
                                     gconv_dim)

        self.gconv_net = GraphTripleConvNet(
            input_dim=gconv_dim,
            hidden_dim=gconv_hidden_dim,
            pooling=gconv_pooling,
            num_layers=gconv_num_layers - 1,
            mlp_normalization=mlp_normalization,
        )

        # Bounding-box regression head: 4 coordinates per object.
        self.box_net = build_mlp([gconv_dim, gconv_hidden_dim, 4],
                                 batch_norm=mlp_normalization)

        self.mask_net = None
        if mask_size is not None and mask_size > 0:
            self.mask_net = self._build_mask_net(n_objects, gconv_dim,
                                                 mask_size)

        # Auxiliary relationship classifier over paired object features.
        # NOTE(review): input width uses gconv_dim; unclear whether it should
        # be 2 * embedding_dim instead — confirm against callers.
        self.rel_aux_net = build_mlp(
            [2 * gconv_dim + 8, gconv_hidden_dim, n_predicates],
            batch_norm=mlp_normalization)

        # A context network + noise/layout projection used to exist here; it
        # was removed rather than disabled.
        self.refinement_net = RefinementNetwork(
            dims=(gconv_dim + layout_noise_dim + context_embedding_dim,)
            + refinement_dims,
            normalization=normalization,
            activation=activation,
        )
Beispiel #4
0
  def __init__(self, input_dim, output_dim=None, hidden_dim=512,
               pooling='avg', mlp_normalization='none',model_type=None):
    """GraphSAGE-style graph convolution with an LSTM aggregator.

    Args:
      input_dim: size of the incoming object vectors.
      output_dim: size of the produced vectors; defaults to input_dim.
      hidden_dim: width of the internal MLP and the LSTM state.
      pooling: neighbourhood aggregation, either 'sum' or 'avg'.
      mlp_normalization: batch-norm option forwarded to build_mlp.
      model_type: unused here; kept for interface compatibility.
    """
    super(GraphSageLSTMConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim

    assert pooling in ('sum', 'avg'), 'Invalid pooling "%s"' % pooling
    self.pooling = pooling

    # Edge MLP over concatenated (subject, predicate, object) vectors.
    edge_dims = [d for d in (3 * input_dim, hidden_dim,
                             2 * hidden_dim + output_dim) if d is not None]
    self.net1 = build_mlp(edge_dims, batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)

    # Maps a 2 * hidden_dim feature down to the output size.
    self.output_linear_layer = nn.Linear(self.hidden_dim * 2, self.output_dim)
    nn.init.kaiming_normal_(self.output_linear_layer.weight)

    self.initial_obj_projection_layer = nn.Linear(self.output_dim,
                                                  self.hidden_dim)
    nn.init.kaiming_normal_(self.initial_obj_projection_layer.weight)

    ### RNN component: single-layer LSTM over per-object sequences. ###
    self.object_lstm = nn.LSTM(input_size=self.hidden_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=1)
Beispiel #5
0
    def __init__(self, opt):
        """Multi-scale discriminator wrapper.

        Registers opt.num_D NLayerDiscriminator sub-modules plus the
        embedding / representation / encoder modules shared with the
        generator side.
        """
        super().__init__()
        self.opt = opt
        self.attribute_embedding = AttributeEmbeddings(
            self.opt.vocab['attributes'],
            self.opt.embedding_dim,
            use_attr_fc_gen=True)

        # Compact representation head over mask features.
        self.repr_input = opt.g_mask_dim
        self.repr_net = build_mlp([self.repr_input, 64, opt.rep_size],
                                  batch_norm=opt.mlp_normalization)

        self.image_encoder = AppearanceEncoder(
            vocab=self.opt.vocab,
            arch='C4-64-2,C4-128-2,C4-256-2',
            normalization=opt.appearance_normalization,
            activation=opt.a_activation,
            padding='valid',
            vecs_size=opt.g_mask_dim)  # Ignore
        self.fake_pool = VectorPool(opt.pool_size)  # Ignore

        # One discriminator per scale, registered by index.
        for idx in range(opt.num_D):
            self.add_module('discriminator_%d' % idx,
                            NLayerDiscriminator(opt))
Beispiel #6
0
  def __init__(self, input_dim, output_dim=None, hidden_dim=512,
               pooling='avg', mlp_normalization='none',model_type=None):
    """Random-walk variant of the graph triple convolution.

    Args:
      input_dim: size of the incoming object vectors.
      output_dim: size of the produced vectors; defaults to input_dim.
      hidden_dim: width of the internal MLPs.
      pooling: neighbourhood aggregation, either 'sum' or 'avg'.
      mlp_normalization: batch-norm option forwarded to build_mlp.
      model_type: unused here; kept for interface compatibility.
    """
    super(GraphTripleRandomWalkConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim

    assert pooling in ('sum', 'avg'), 'Invalid pooling "%s"' % pooling
    self.pooling = pooling

    # Edge MLP over concatenated (subject, predicate, object) vectors.
    edge_dims = [d for d in (3 * input_dim, hidden_dim,
                             2 * hidden_dim + output_dim) if d is not None]
    self.net1 = build_mlp(edge_dims, batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)

    # Node MLP refining pooled per-object features into output vectors.
    self.net2 = build_mlp([hidden_dim, hidden_dim, output_dim],
                          batch_norm=mlp_normalization)
    self.net2.apply(_init_weights)
Beispiel #7
0
 def __init__(self,
              inp_nc,
              out_nc,
              nlayers=4,
              normalization='none',
              activation='relu'):
     """MLP mapping (inp_nc + out_nc)-dim inputs to out_nc outputs.

     Stacks `nlayers` layers of width inp_nc + out_nc followed by a linear
     out_nc projection with no final nonlinearity.
     """
     super().__init__()
     widths = (inp_nc + out_nc, ) * nlayers + (out_nc, )
     self.net = build_mlp(widths,
                          batch_norm=normalization,
                          activation=activation,
                          final_nonlinearity=False)
Beispiel #8
0
  def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
               gconv_dim=128, gconv_hidden_dim=512,
               gconv_pooling='avg', gconv_num_layers=5,
               refinement_dims=(1024, 512, 256, 128, 64),
               normalization='batch', activation='leakyrelu-0.2',
               mask_size=None, mlp_normalization='none', layout_noise_dim=0,
               **kwargs):
    """Scene-graph-to-image model: embeddings, GCN stack and box/mask heads."""
    super(Sg2ImModel, self).__init__()

    # Arguments dropped in older revisions (vec_noise_dim, gconv_mode,
    # box_anchor, decouple_obj_predictions) may still be passed; warn only.
    if kwargs:
      print('WARNING: Model got unexpected kwargs ', kwargs)

    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim

    n_objs = len(vocab['object_idx_to_name'])
    n_preds = len(vocab['pred_idx_to_name'])
    # One extra object row — presumably a special/padding index; mirrors the
    # other Sg2Im variants in this file.
    self.obj_embeddings = nn.Embedding(n_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(n_preds, embedding_dim)

    if gconv_num_layers == 0:
      # Degenerate configuration: a single linear projection, no GCN.
      self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
      self.gconv = GraphTripleConv(
          input_dim=embedding_dim,
          output_dim=gconv_dim,
          hidden_dim=gconv_hidden_dim,
          pooling=gconv_pooling,
          mlp_normalization=mlp_normalization,
      )

    self.gconv_net = None
    if gconv_num_layers > 1:
      # Remaining layers all operate on gconv_dim-sized vectors.
      self.gconv_net = GraphTripleConvNet(
          input_dim=gconv_dim,
          hidden_dim=gconv_hidden_dim,
          pooling=gconv_pooling,
          num_layers=gconv_num_layers - 1,
          mlp_normalization=mlp_normalization,
      )

    # Bounding-box regression head: 4 coordinates per object.
    self.box_net = build_mlp([gconv_dim, gconv_hidden_dim, 4],
                             batch_norm=mlp_normalization)

    self.mask_net = None
    if mask_size is not None and mask_size > 0:
      self.mask_net = self._build_mask_net(n_objs, gconv_dim, mask_size)
Beispiel #9
0
    def __init__(self, opt):
        """SPADE-style generator with attribute embeddings and an appearance
        encoder.

        Args:
            opt: options namespace; reads ngf, use_vae, z_dim / semantic_nc,
                num_upsampling_layers, g_mask_dim, rep_size, vocab and the
                normalization / activation settings.
        """
        super().__init__()
        self.attribute_embedding = AttributeEmbeddings(opt.vocab['attributes'],
                                                       opt.embedding_dim)
        self.opt = opt
        base = opt.ngf
        self.sw, self.sh = self.compute_latent_vector_size(opt)

        if opt.use_vae:
            # VAE path: sample z and project it to the coarsest feature map.
            self.fc = nn.Linear(opt.z_dim, 16 * base * self.sw * self.sh)
        else:
            # Deterministic path: start from a downsampled segmentation map
            # instead of a random z.
            self.fc = nn.Conv2d(self.opt.semantic_nc, 16 * base, 3, padding=1)

        self.head_0 = SPADEResnetBlock(16 * base, 16 * base, opt)

        self.G_middle_0 = SPADEResnetBlock(16 * base, 16 * base, opt)
        self.G_middle_1 = SPADEResnetBlock(16 * base, 16 * base, opt)

        # Upsampling trunk halves the channel count at every stage.
        self.up_0 = SPADEResnetBlock(16 * base, 8 * base, opt)
        self.up_1 = SPADEResnetBlock(8 * base, 4 * base, opt)
        self.up_2 = SPADEResnetBlock(4 * base, 2 * base, opt)
        self.up_3 = SPADEResnetBlock(2 * base, 1 * base, opt)

        final_nc = base

        if opt.num_upsampling_layers == 'most':
            # One extra upsampling block for the highest-resolution setting.
            self.up_4 = SPADEResnetBlock(1 * base, base // 2, opt)
            final_nc = base // 2

        self.conv_img = nn.Conv2d(final_nc, 3, 3, padding=1)

        self.up = nn.Upsample(scale_factor=2)

        # Compact representation head over mask features.
        self.repr_input = opt.g_mask_dim
        self.repr_net = build_mlp([self.repr_input, 64, opt.rep_size],
                                  batch_norm=opt.mlp_normalization)

        self.image_encoder = AppearanceEncoder(
            vocab=self.opt.vocab,
            arch='C4-64-2,C4-128-2,C4-256-2',
            normalization=opt.appearance_normalization,
            activation=opt.a_activation,
            padding='valid',
            vecs_size=opt.g_mask_dim)
Beispiel #10
0
    def __init__(self,
                 feat_dim,
                 pose_dim,
                 nlayers=2,
                 normalization='none',
                 activation='relu'):
        """Encoder combining ResNet-18 features with an MLP head.

        Args:
            feat_dim: size of the feature vector concatenated with the pose.
            pose_dim: size of the pose vector (also the MLP layer width).
            nlayers: number of pose_dim-wide layers in the MLP head.
            normalization: batch-norm option for build_mlp.
            activation: nonlinearity name for build_mlp.
        """
        super().__init__()
        self.pose_dim = pose_dim
        self.feat_dim = feat_dim

        # ImageNet-pretrained ResNet-18 with its final FC layer removed.
        backbone = models.resnet18(pretrained=True)
        self.enc = nn.Sequential(*list(backbone.children())[:-1])

        head_dims = (feat_dim + pose_dim, ) + (pose_dim, ) * nlayers
        self.linear = build_mlp(head_dims,
                                batch_norm=normalization,
                                activation=activation,
                                final_nonlinearity=True)
Beispiel #11
0
    def __init__(
            self,
            vocab,
            image_size=(64, 64),
            embedding_dim=64,
            gconv_dim=128,
            gconv_hidden_dim=512,
            gconv_pooling='avg',
            gconv_num_layers=5,
            refinement_dims=(1024, 512, 256, 128, 64),
            normalization='batch',
            activation='leakyrelu-0.2',
            mask_size=None,
            mlp_normalization='none',
            layout_noise_dim=0,
            sg_context_dim=0,  #None, 
            sg_context_dim_d=0,  #None, 
            gcnn_pooling='avg',
            triplet_box_net=False,
            triplet_mask_size=0,
            triplet_embedding_size=0,
            use_bbox_info=False,
            triplet_superbox_net=False,
            **kwargs):
        """Sg2Im variant with triplet-level heads.

        Builds, in order: object/predicate embeddings, a GraphTripleConv
        stack, a per-object box head (optionally augmented with extra bbox
        info), optional triplet heads (box / embedding / mask / superbox),
        an optional mask head, optional scene-graph context projections, an
        auxiliary relationship classifier, and the refinement network.
        """
        super(Sg2ImModel, self).__init__()

        # We used to have some additional arguments:
        # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
        if len(kwargs) > 0:
            print('WARNING: Model got unexpected kwargs ', kwargs)

        self.vocab = vocab
        self.image_size = image_size
        self.layout_noise_dim = layout_noise_dim
        self.sg_context_dim = sg_context_dim
        self.sg_context_dim_d = sg_context_dim_d
        self.gcnn_pooling = gcnn_pooling
        # NOTE(review): the two bool flags below are overwritten with module
        # (or None) values further down; only the module values survive.
        self.triplet_box_net = triplet_box_net
        self.triplet_mask_size = triplet_mask_size
        self.triplet_embedding_size = triplet_embedding_size
        self.use_bbox_info = use_bbox_info
        self.triplet_superbox_net = triplet_superbox_net

        num_objs = len(vocab['object_idx_to_name'])
        num_preds = len(vocab['pred_idx_to_name'])
        # One extra object row — presumably a special/padding index; verify
        # against the vocab construction.
        self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
        self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)

        if gconv_num_layers == 0:
            # Degenerate configuration: a single linear projection, no GCN.
            self.gconv = nn.Linear(embedding_dim, gconv_dim)
        elif gconv_num_layers > 0:
            gconv_kwargs = {
                'input_dim': embedding_dim,
                'output_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                'pooling': gconv_pooling,
                'mlp_normalization': mlp_normalization,
            }
            self.gconv = GraphTripleConv(**gconv_kwargs)

        self.gconv_net = None
        if gconv_num_layers > 1:
            # Remaining layers all operate on gconv_dim-sized vectors.
            gconv_kwargs = {
                'input_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                'pooling': gconv_pooling,
                'num_layers': gconv_num_layers - 1,
                'mlp_normalization': mlp_normalization,
            }
            self.gconv_net = GraphTripleConvNet(**gconv_kwargs)

        if self.use_bbox_info:
            box_net_dim = 4 + 1  # augment with addition info abt bbox
        else:
            box_net_dim = 4
        box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
        self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)

        # triplet-related nets (replace the constructor-arg bool flags above)
        self.triplet_box_net = None
        self.triplet_embed_net = None
        self.triplet_mask_net = None
        self.triplet_superbox_net = None

        # output dimension: two boxes (subject + object), 4 coords each
        triplet_box_net_dim = 8
        if triplet_box_net:
            # input dimension is 3*128 for concatenated triplet
            triplet_box_net_layers = [
                3 * gconv_dim, gconv_hidden_dim, triplet_box_net_dim
            ]
            self.triplet_box_net = build_mlp(triplet_box_net_layers,
                                             batch_norm=mlp_normalization)

        # triplet embedding
        if self.triplet_embedding_size > 0:
            # input dimsn is 3*128 for concatenated triplet, output dimsn is triplet_embed_dim
            triplet_embed_layers = [
                3 * gconv_dim, gconv_hidden_dim, triplet_embedding_size
            ]
            self.triplet_embed_net = build_mlp(triplet_embed_layers,
                                               batch_norm=mlp_normalization)

        if self.triplet_mask_size > 0:
            # input dimsn is 3*gconv_dim for concatenated triplet, output dimsn is triplet_mask_size
            #self.triplet_mask_net = self._build_mask_net(num_objs, 3*gconv_dim, self.triplet_mask_size)
            self.triplet_mask_net = self._build_triplet_mask_net(
                num_objs, 3 * gconv_dim, self.triplet_mask_size)

        triplet_superbox_net_dim = 4
        if triplet_superbox_net:
            # input dimension is 3*128 for concatenated triplet
            triplet_superbox_net_layers = [
                3 * gconv_dim, gconv_hidden_dim, triplet_superbox_net_dim
            ]
            self.triplet_superbox_net = build_mlp(triplet_superbox_net_layers,
                                                  batch_norm=mlp_normalization)

        self.mask_net = None
        if mask_size is not None and mask_size > 0:
            self.mask_net = self._build_mask_net(num_objs, gconv_dim,
                                                 mask_size)

        ###########################
        # Optional scene-graph context projections (generator + discriminator).
        self.sg_context_net = None
        self.sg_context_net_d = None
        if sg_context_dim is not None and sg_context_dim > 0:
            H, W = self.image_size
            self.sg_context_net = nn.Linear(gconv_dim, sg_context_dim)
            self.sg_context_net_d = nn.Linear(gconv_dim, sg_context_dim_d)
            # sg_context_net_layers = [gconv_dim, sg_context_dim]
            # sg_context_net_layers = [gconv_dim, sg_context_dim_d]
            # self.sg_context_net = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
            # self.sg_context_net_d = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
        #######################

        # Auxiliary relationship classifier over paired embeddings + 8 extra
        # features (presumably the two 4-d boxes — confirm against forward()).
        rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
        self.rel_aux_net = build_mlp(rel_aux_layers,
                                     batch_norm=mlp_normalization)

        # NOTE(review): unlike the guard above, this assumes sg_context_dim is
        # an int (default 0); passing None would raise a TypeError here.
        if sg_context_dim > 0:
            refinement_kwargs = {
                'dims': (gconv_dim + sg_context_dim + layout_noise_dim, ) +
                refinement_dims,
                'normalization': normalization,
                'activation': activation,
            }
        else:
            refinement_kwargs = {
                'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
                'normalization': normalization,
                'activation': activation,
            }
        self.refinement_net = RefinementNetwork(**refinement_kwargs)
Beispiel #12
0
    def __init__(self, opt):
        """Scene-graph-to-layout model: attribute/predicate embeddings, a
        GraphTripleConv stack, and box / mask heads.

        Args:
            opt: options namespace; consumed through vars(opt) for vocab,
                embedding and gconv sizes, mask settings, and learned_init.
        """
        super(Sg2LayoutModel, self).__init__()
        args = vars(opt)
        self.vocab = args["vocab"]
        self.image_size = args["image_size"]
        self.layout_noise_dim = args["layout_noise_dim"]
        self.mask_noise_dim = args.get("mask_noise_dim")
        self.args = args

        self.attribute_embedding = AttributeEmbeddings(
            self.vocab['attributes'], args["embedding_dim"])
        num_preds = len(self.vocab['pred_idx_to_name'])
        self.pred_embeddings = nn.Embedding(num_preds, args["embedding_dim"])
        num_attributes = len(self.vocab['attributes'].keys())

        # Softly-learned candidate weights for transitive / converse relations.
        self.trans_candidates_weights = get_predicates_weights(
            num_preds, opt.learned_init)
        self.converse_candidates_weights = get_predicates_weights(
            (num_preds, num_preds), opt.learned_init)

        # The first layer consumes concatenated attribute embeddings; every
        # later layer consumes the previous layer's object/predicate outputs.
        obj_input_dim = len(
            self.vocab['attributes'].keys()) * args["embedding_dim"]
        first_layer_cfg = {
            "obj_input_dim": obj_input_dim,
            "object_output_dim": args["gconv_dim"],
            "predicate_input_dim": args["embedding_dim"],
            "predicate_output_dim": args["gconv_dim"],
            "hidden_dim": args["gconv_hidden_dim"],
            "num_attributes": num_attributes,
            "mlp_normalization": args["mlp_normalization"],
            "pooling": args["gconv_pooling"],
            "predicates_transitive_weights": self.trans_candidates_weights,
        }
        later_layer_cfg = dict(
            first_layer_cfg,
            obj_input_dim=first_layer_cfg["object_output_dim"],
            predicate_input_dim=args["gconv_dim"])
        layer_cfgs = ([first_layer_cfg] +
                      [later_layer_cfg] * (args["gconv_num_layers"] - 1))

        self.gconvs = nn.ModuleList()
        for cfg in layer_cfgs:
            self.gconvs.append(GraphTripleConv(**cfg))

        # Box regression head (4 coordinates) on the final object vectors.
        object_output_dim = layer_cfgs[-1]["object_output_dim"]
        self.box_net = build_mlp(
            [object_output_dim, args["gconv_hidden_dim"], 4],
            batch_norm=args["mlp_normalization"],
            final_nonlinearity=None)

        # Optional mask generation head.
        self.mask_net = None
        if args["mask_size"] is not None and args["mask_size"] > 0:
            self.mask_net = self._build_mask_net(args['g_mask_dim'],
                                                 args["mask_size"])
Beispiel #13
0
  def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
               gconv_dim=128, gconv_hidden_dim=512,
               gconv_pooling='avg', gconv_num_layers=5,
               refinement_dims=(1024, 512, 256, 128, 64),
               normalization='batch', activation='leakyrelu-0.2',
               mask_size=None, mlp_normalization='none', layout_noise_dim=0,
               sg_context_dim=0, #None, 
               sg_context_dim_d=0, #None, 
               gcnn_pooling='avg',
               triplet_box_net=False,
               triplet_mask_size=0,
               triplet_embedding_size=0,
               use_bbox_info=False,
               triplet_superbox_net=False,
               use_masked_sg=False,
               **kwargs):
    """Sg2Im variant with triplet heads, masked scene-graph support and many
    auxiliary prediction networks.

    Builds, in order: trainable and frozen object/predicate embeddings, a
    positional box embedding, a GraphTripleConv stack, box / triplet /
    mask heads, scene-graph context projections, auxiliary relationship,
    subject/object and class predictors, predicate-mask and predicate
    grounding heads, a triplet context net, and the refinement network.
    """
    super(Sg2ImModel, self).__init__()

    # We used to have some additional arguments: 
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if len(kwargs) > 0:
      print('WARNING: Model got unexpected kwargs ', kwargs)

    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    self.sg_context_dim = sg_context_dim 
    self.sg_context_dim_d = sg_context_dim_d 
    self.gcnn_pooling = gcnn_pooling 
    # NOTE(review): the two bool flags below are overwritten with module (or
    # None) values further down; only the module values survive __init__.
    self.triplet_box_net = triplet_box_net 
    self.triplet_mask_size = triplet_mask_size
    self.triplet_embedding_size = triplet_embedding_size
    self.use_bbox_info = use_bbox_info
    self.triplet_superbox_net = triplet_superbox_net
    self.use_masked_sg = use_masked_sg
    # hack to deal with vocabs with differing # of predicates
    self.mask_pred = 46 # vocab['idx_to_pred_name'][46] = 'none'
    #self.mask_pred = vocab['pred_name_to_idx']['none']
    self.embedding_dim = embedding_dim 
  
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
 
    # One extra row each: a spare object index and the MASK predicate slot.
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    #self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)  
    self.pred_embeddings = nn.Embedding(num_preds + 1 , embedding_dim)  # MASK
  
    # Frozen embedding layers (kept fixed for retrieval-style comparisons).
    # NOTE(review): requires_grad is set on the Module, not on .weight, so
    # the weights themselves may still receive gradients — confirm intent.
    self.fr_obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.fr_pred_embeddings = nn.Embedding(num_preds + 1, embedding_dim)  
    self.fr_obj_embeddings.requires_grad = False
    self.fr_pred_embeddings.requires_grad = False

    # Positional embeddings for bounding boxes (spatio-semantic retrieval).
    bbox_dim = 4
    self.positional_embeddings = nn.Linear(bbox_dim, embedding_dim)

    if gconv_num_layers == 0:
      # Degenerate configuration: a single linear projection, no GCN.
      self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
      gconv_kwargs = {
        'input_dim': embedding_dim,
        'output_dim': gconv_dim,
        'hidden_dim': gconv_hidden_dim,
        'pooling': gconv_pooling,
        'mlp_normalization': mlp_normalization,
      }
      self.gconv = GraphTripleConv(**gconv_kwargs)

    self.gconv_net = None
    if gconv_num_layers > 1:
      # Remaining layers all operate on gconv_dim-sized vectors.
      gconv_kwargs = {
        'input_dim': gconv_dim,
        'hidden_dim': gconv_hidden_dim,
        'pooling': gconv_pooling,
        'num_layers': gconv_num_layers - 1,
        'mlp_normalization': mlp_normalization,
      }
      self.gconv_net = GraphTripleConvNet(**gconv_kwargs)

    if self.use_bbox_info:
      box_net_dim = 4 + 1 # augment with addition info abt bbox
    else:
      box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)

    # Triplet-related nets (replace the constructor-arg bool flags above).
    self.triplet_box_net = None
    self.triplet_embed_net = None
    self.triplet_mask_net = None
    self.triplet_superbox_net = None
    self.pred_ground_net = None

    # Output dimension: two boxes (subject + object), 4 coordinates each.
    triplet_box_net_dim = 8
    if triplet_box_net:
      # input dimension is 3*gconv_dim for the concatenated triplet
      triplet_box_net_layers = [3*gconv_dim, gconv_hidden_dim, triplet_box_net_dim]
      self.triplet_box_net = build_mlp(triplet_box_net_layers, batch_norm=mlp_normalization)

    # triplet embedding 
    if self.triplet_embedding_size > 0: 
      # input dimn is 3*gconv_dim for concatenated triplet, output is triplet_embedding_size
      triplet_embed_layers = [3*gconv_dim, gconv_hidden_dim, triplet_embedding_size]
      self.triplet_embed_net = build_mlp(triplet_embed_layers, batch_norm=mlp_normalization)

    if self.triplet_mask_size > 0:
      # input dimsn is 3*gconv_dim for concatenated triplet, output dimsn is triplet_mask_size
      #self.triplet_mask_net = self._build_mask_net(num_objs, 3*gconv_dim, self.triplet_mask_size)
      self.triplet_mask_net = self._build_triplet_mask_net(num_objs, 3*gconv_dim, self.triplet_mask_size)

    triplet_superbox_net_dim = 4
    if triplet_superbox_net:
      # input dimension is 3*gconv_dim for the concatenated triplet
      triplet_superbox_net_layers = [3*gconv_dim, gconv_hidden_dim, triplet_superbox_net_dim]
      self.triplet_superbox_net = build_mlp(triplet_superbox_net_layers, batch_norm=mlp_normalization)

    self.mask_net = None
    if mask_size is not None and mask_size > 0:
      self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)

    ###########################
    # Optional scene-graph context projections (generator + discriminator).
    self.sg_context_net = None
    self.sg_context_net_d = None
    if sg_context_dim is not None and sg_context_dim > 0:
      H, W = self.image_size
      self.sg_context_net = nn.Linear(gconv_dim, sg_context_dim)
      self.sg_context_net_d = nn.Linear(gconv_dim, sg_context_dim_d) 
      # sg_context_net_layers = [gconv_dim, sg_context_dim]
      # sg_context_net_layers = [gconv_dim, sg_context_dim_d]
      # self.sg_context_net = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
      # self.sg_context_net_d = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
    ####################### 

    # Auxiliary relationship classifier over paired embeddings + 8 extra
    # features (presumably the two 4-d boxes — confirm against forward()).
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)

    # subject prediction network
    subj_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_objs]
    self.subj_aux_net = build_mlp(subj_aux_layers, batch_norm=mlp_normalization)
    
    # object prediction network
    obj_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_objs]
    self.obj_aux_net = build_mlp(obj_aux_layers, batch_norm=mlp_normalization)
   
    # Object class prediction network (linear probe; the MLP variant and its
    # unused layer list are kept for reference).
    obj_class_aux_layers = [embedding_dim, gconv_hidden_dim, num_objs]
    #self.obj_class_aux_net = build_mlp(obj_class_aux_layers, batch_norm=mlp_normalization)
    self.obj_class_aux_net = nn.Linear(embedding_dim, num_objs) 

    # relationship embedding network
    self.rel_embed_aux_net = nn.Linear(embedding_dim, embedding_dim) 
    # relationship class prediction network 
    self.rel_class_aux_net = nn.Linear(embedding_dim, num_preds) 

    # predicate mask prediction network
    pred_mask_layers = [2 * embedding_dim, gconv_hidden_dim, num_preds]
    self.pred_mask_net = build_mlp(pred_mask_layers, batch_norm=mlp_normalization)

    # Predicate grounding head: one 4-d box per relationship vector.
    pred_ground_net_dim = 4
    pred_ground_net_layers = [gconv_dim, gconv_hidden_dim, pred_ground_net_dim]
    self.pred_ground_net = build_mlp(pred_ground_net_layers, batch_norm=mlp_normalization)

    # Triplet context head: maps 4*gconv_dim context features back to a
    # 3*gconv_dim triplet representation.
    triplet_context_layers = [4*gconv_dim, gconv_hidden_dim, 3*gconv_dim]
    #self.triplet_context_net = nn.Linear(4*gconv_dim, 3*gconv_dim) 
    #triplet_context_layers = [3*gconv_dim, gconv_hidden_dim, 4]
    #self.triplet_context_net = nn.Linear(4*gconv_dim, 3*gconv_dim) 
    self.triplet_context_net = build_mlp(triplet_context_layers, batch_norm=mlp_normalization)

    # NOTE(review): assumes sg_context_dim is an int (default 0); passing
    # None would raise a TypeError here despite the None-guard used earlier.
    if sg_context_dim > 0:
      refinement_kwargs = {
      'dims': (gconv_dim + sg_context_dim + layout_noise_dim,) + refinement_dims,
      'normalization': normalization,
      'activation': activation,
    }
    else:
      refinement_kwargs = {
        'dims': (gconv_dim + layout_noise_dim,) + refinement_dims,
        'normalization': normalization,
        'activation': activation,
      }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)
Beispiel #14
0
    def __init__(self,
                 vocab,
                 image_size=(64, 64),
                 embedding_dim=64,
                 gconv_dim=128,
                 gconv_hidden_dim=512,
                 gconv_pooling='avg',
                 gconv_num_layers=5,
                 refinement_dims=(1024, 512, 256, 128, 64),
                 normalization='batch',
                 activation='leakyrelu-0.2',
                 mask_size=None,
                 mlp_normalization='none',
                 layout_noise_dim=0,
                 **kwargs):
        """Scene-graph-to-image reflection model.

        Builds object/predicate embeddings, a GraphTripleConv stack, box and
        (optional) mask regression heads, an auxiliary relationship
        classifier, and a cascaded refinement network.

        Args:
            vocab: dict with 'object_idx_to_name' and 'pred_idx_to_name'.
            image_size: (H, W) of the generated images.
            embedding_dim: size of object/predicate embedding vectors.
            gconv_dim: object vector size inside the GCN stack.
            gconv_hidden_dim: hidden width of the GCN MLPs.
            gconv_pooling: 'sum' or 'avg' neighbourhood aggregation.
            gconv_num_layers: number of graph-convolution layers.
            refinement_dims: channel widths of the refinement network.
            normalization / activation: refinement network settings.
            mask_size: side length of predicted masks; None/0 disables them.
            mlp_normalization: batch-norm option for build_mlp.
            layout_noise_dim: extra noise channels added to the layout.
            **kwargs: unexpected leftovers; reported with a warning.
        """
        super(ReflectionModel, self).__init__()

        if len(kwargs) > 0:
            print("WARNING: Model got unexpected kwargs ", kwargs)

        self.vocab = vocab
        self.image_size = image_size
        self.layout_noise_dim = layout_noise_dim

        num_objs = len(vocab['object_idx_to_name'])
        num_preds = len(vocab['pred_idx_to_name'])
        self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
        self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)

        if gconv_num_layers == 0:
            # Degenerate configuration: a single linear projection, no GCN.
            self.gconv = nn.Linear(embedding_dim, gconv_dim)
        elif gconv_num_layers > 0:
            gconv_kwargs = {
                'input_dim': embedding_dim,
                'output_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                # BUG FIX: this previously passed gconv_num_layers - 1 (an
                # int) as 'pooling'; GraphTripleConv expects 'sum' or 'avg'
                # and asserts on it. All sibling models pass gconv_pooling.
                'pooling': gconv_pooling,
                'mlp_normalization': mlp_normalization,
            }
            self.gconv = GraphTripleConv(**gconv_kwargs)

        self.gconv_net = None
        if gconv_num_layers > 1:
            gconv_kwargs = {
                'input_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                'pooling': gconv_pooling,
                'num_layers': gconv_num_layers - 1,
                'mlp_normalization': mlp_normalization,
            }
            self.gconv_net = GraphTripleConvNet(**gconv_kwargs)

        # Network for regressing bounding boxes (4 coordinates per object).
        box_net_dim = 4
        box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
        self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)

        # Network for regressing segmentation masks (optional).
        self.mask_net = None
        if mask_size is not None and mask_size > 0:
            self.mask_net = self._build_mask_net(num_objs, gconv_dim,
                                                 mask_size)

        # Auxiliary relationship classifier over paired embeddings + 8 extra
        # features (presumably the two 4-d boxes — confirm against forward()).
        rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
        self.rel_aux_net = build_mlp(rel_aux_layers,
                                     batch_norm=mlp_normalization)

        # Define cascaded refinement network.
        refinement_kwargs = {
            'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
        self.refinement_net = RefinementNetwork(**refinement_kwargs)
Beispiel #15
0
    def __init__(self,
                 vocab,
                 image_size=(64, 64),
                 embedding_dim=64,
                 gconv_dim=128,
                 gconv_hidden_dim=512,
                 gconv_pooling='avg',
                 gconv_num_layers=5,
                 refinement_dims=(1024, 512, 256, 128, 64),
                 normalization='batch',
                 activation='leakyrelu-0.2',
                 mask_size=None,
                 mlp_normalization='none',
                 layout_noise_dim=0,
                 **kwargs):
        """Assemble the Sg2Im pipeline.

        Stages: object/predicate embeddings -> graph convolutions ->
        per-object heads (box regression, optional mask, auxiliary relation
        classifier) -> cascaded refinement network producing the image.
        """
        super(Sg2ImModel, self).__init__()

        # Legacy arguments (vec_noise_dim, gconv_mode, box_anchor,
        # decouple_obj_predictions) are no longer supported; warn and ignore.
        if kwargs:
            print('WARNING: Model got unexpected kwargs ', kwargs)

        self.vocab = vocab  # category/predicate name tables
        self.image_size = image_size  # (H, W)
        self.layout_noise_dim = layout_noise_dim  # extra noise channels

        n_objects = len(vocab['object_idx_to_name'])
        n_predicates = len(vocab['pred_idx_to_name'])
        self.obj_embeddings = nn.Embedding(n_objects + 1, embedding_dim)
        self.pred_embeddings = nn.Embedding(n_predicates, embedding_dim)

        # Graph convolution happens in two pieces: self.gconv maps the
        # embedding dimension into gconv_dim, and the optional self.gconv_net
        # applies the remaining layers at a constant width of gconv_dim.
        if gconv_num_layers == 0:
            self.gconv = nn.Linear(embedding_dim, gconv_dim)
        elif gconv_num_layers > 0:
            self.gconv = GraphTripleConv(
                input_dim=embedding_dim,
                output_dim=gconv_dim,
                hidden_dim=gconv_hidden_dim,
                pooling=gconv_pooling,
                mlp_normalization=mlp_normalization,
            )

        self.gconv_net = None
        if gconv_num_layers > 1:
            self.gconv_net = GraphTripleConvNet(
                input_dim=gconv_dim,
                hidden_dim=gconv_hidden_dim,
                pooling=gconv_pooling,
                num_layers=gconv_num_layers - 1,
                mlp_normalization=mlp_normalization,
            )

        # Bounding-box regression head: 4 coordinates per object.
        self.box_net = build_mlp([gconv_dim, gconv_hidden_dim, 4],
                                 batch_norm=mlp_normalization)

        # Optional segmentation-mask head.
        self.mask_net = None
        if mask_size is not None and mask_size > 0:
            self.mask_net = self._build_mask_net(n_objects, gconv_dim,
                                                 mask_size)

        # Auxiliary relation classifier over a pair of object embeddings
        # plus 8 box coordinates (2 boxes x 4).
        self.rel_aux_net = build_mlp(
            [2 * embedding_dim + 8, gconv_hidden_dim, n_predicates],
            batch_norm=mlp_normalization)

        # Cascaded refinement network rendering the final image from the
        # (optionally noise-augmented) layout.
        self.refinement_net = RefinementNetwork(
            dims=(gconv_dim + layout_noise_dim,) + refinement_dims,
            normalization=normalization,
            activation=activation,
        )
Beispiel #16
0
    def __init__(self,
                 vocab,
                 image_size=(64, 64),
                 embedding_dim=64,
                 gconv_dim=128,
                 gconv_hidden_dim=512,
                 gconv_pooling='avg',
                 gconv_num_layers=5,
                 refinement_dims=(1024, 512, 256, 128, 64),
                 normalization='batch',
                 activation='leakyrelu-0.2',
                 mask_size=None,
                 mlp_normalization='none',
                 layout_noise_dim=0,
                 model_type=None,
                 **kwargs):
        """Build the Sg2Im model with a selectable graph-convolution variant.

        Args:
            vocab: dict with 'object_idx_to_name' and 'pred_idx_to_name' lists.
            model_type: which graph-convolution implementation to use; one of
                'baseline', 'random-walk-baseline', 'rnn-baseline',
                'graphsage-maxpool', 'graphsage-lstm', 'graphsage-mean',
                'gat-baseline'. Required when gconv_num_layers > 0.
            (remaining arguments as in the base Sg2Im constructor)

        Raises:
            ValueError: if gconv_num_layers > 0 and model_type is not a
                recognized variant name.
        """
        super(Sg2ImModel, self).__init__()

        # We used to have some additional arguments:
        # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
        if len(kwargs) > 0:
            print('WARNING: Model got unexpected kwargs ', kwargs)

        self.vocab = vocab
        self.image_size = image_size
        self.layout_noise_dim = layout_noise_dim

        num_objs = len(vocab['object_idx_to_name'])
        num_preds = len(vocab['pred_idx_to_name'])
        self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
        self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)

        if gconv_num_layers == 0:
            self.gconv = nn.Linear(embedding_dim, gconv_dim)
        elif gconv_num_layers > 0:
            gconv_kwargs = {
                'input_dim': embedding_dim,
                'output_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                'pooling': gconv_pooling,
                'mlp_normalization': mlp_normalization,
                'model_type': model_type,
            }
            # Dispatch table instead of an if/elif chain; an unknown
            # model_type previously left the constructor as None and crashed
            # later with an opaque "'NoneType' object is not callable".
            constructors = {
                'baseline': GraphTripleConv,
                'random-walk-baseline': GraphTripleRandomWalkConv,
                'rnn-baseline': GraphTripleRnnConv,
                'graphsage-maxpool': GraphSageMaxPoolConv,
                'graphsage-lstm': GraphSageLSTMConv,
                'graphsage-mean': GraphSageMeanConv,
                'gat-baseline': GraphAttnConv,
            }
            if model_type not in constructors:
                raise ValueError(
                    'Invalid model_type "%s"; expected one of %s'
                    % (model_type, sorted(constructors)))
            print("gconv_kwargs", gconv_kwargs)
            print("model_type", model_type)
            self.gconv = constructors[model_type](**gconv_kwargs)

        # Remaining graph-conv layers run at a constant width of gconv_dim.
        self.gconv_net = None
        if gconv_num_layers > 1:
            gconv_kwargs = {
                'input_dim': gconv_dim,
                'hidden_dim': gconv_hidden_dim,
                'pooling': gconv_pooling,
                'num_layers': gconv_num_layers - 1,
                'mlp_normalization': mlp_normalization,
                'model_type': model_type,
            }
            self.gconv_net = GraphTripleConvNet(**gconv_kwargs)

        # Bounding-box regression head: 4 coordinates per object.
        box_net_dim = 4
        box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
        self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)

        # Optional segmentation-mask head.
        self.mask_net = None
        if mask_size is not None and mask_size > 0:
            self.mask_net = self._build_mask_net(num_objs, gconv_dim,
                                                 mask_size)

        # Auxiliary relation classifier over a pair of object embeddings
        # plus 8 box coordinates (2 boxes x 4).
        rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
        self.rel_aux_net = build_mlp(rel_aux_layers,
                                     batch_norm=mlp_normalization)

        # Cascaded refinement network rendering the final image.
        refinement_kwargs = {
            'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
        self.refinement_net = RefinementNetwork(**refinement_kwargs)