def __init__(self, input_dim, output_dim=None, hidden_dim=512, pooling='avg', mlp_normalization='none', model_type=None):
    """Graph attention convolution layer.

    Builds the two message-passing MLPs (net1 over concatenated triples,
    net2 over pooled object features) plus the linear projections used by
    the attention computation (initial object projection and W_sim).
    """
    super(GraphAttnConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    assert pooling in ['sum', 'avg'], 'Invalid pooling "%s"' % pooling
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.pooling = pooling
    # net1: concatenated (subject, predicate, object) triple ->
    # two hidden_dim candidate vectors plus an output_dim predicate vector.
    triple_dims = [d for d in [3 * input_dim, hidden_dim, 2 * hidden_dim + output_dim] if d is not None]
    self.net1 = build_mlp(triple_dims, batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)
    # net2: pooled per-object features -> output_dim.
    self.net2 = build_mlp([hidden_dim, hidden_dim, output_dim], batch_norm=mlp_normalization)
    self.net2.apply(_init_weights)
    self.initial_obj_projection_layer = nn.Linear(self.output_dim, self.hidden_dim)
    nn.init.kaiming_normal_(self.initial_obj_projection_layer.weight)
    self.W_sim = nn.Linear(self.hidden_dim, self.hidden_dim)
    nn.init.kaiming_normal_(self.W_sim.weight)
def __init__(self, obj_input_dim, object_output_dim, predicate_input_dim, predicate_output_dim, hidden_dim, num_attributes, pooling='avg', mlp_normalization='none', predicates_transitive_weights=None, return_new_p_vecs=True):
    """Single graph convolution over (subject, predicate, object) triples.

    net1 consumes a concatenated triple and emits candidate subject/object
    vectors plus the new predicate vector; net2 maps pooled per-object
    candidates to the object output space.
    """
    super(GraphTripleConv, self).__init__()
    assert pooling in ['sum', 'avg'], 'Invalid pooling "%s"' % pooling
    self.pooling = pooling
    self.return_new_p_vecs = return_new_p_vecs
    self.hidden_dim = hidden_dim
    self.num_attributes = num_attributes
    self.predicate_output_dim = predicate_output_dim
    triple_in = 2 * obj_input_dim + predicate_input_dim
    self.net1 = build_mlp([triple_in, hidden_dim, 2 * hidden_dim + self.predicate_output_dim],
                          batch_norm=mlp_normalization,
                          final_nonlinearity='relu')
    self.net1.apply(_init_weights)
    self.net2 = build_mlp([hidden_dim, hidden_dim, object_output_dim],
                          batch_norm=mlp_normalization,
                          final_nonlinearity='relu')
    self.net2.apply(_init_weights)
    # Optional soft weights for transitive predicate candidates.
    self.predicates_transitive_weights = predicates_transitive_weights
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64, gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg', gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64), normalization='batch', activation='leakyrelu-0.2', mask_size=None, mlp_normalization='none', layout_noise_dim=32, context_embedding_dim=0, **kwargs): super(Sg2ImModelGB, self).__init__() # We used to have some additional arguments: # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions if len(kwargs) > 0: print('WARNING: Model got unexpected kwargs ', kwargs) self.vocab = vocab self.image_size = image_size self.layout_noise_dim = layout_noise_dim self.context_embedding_dim = context_embedding_dim num_objs = len(vocab['object_idx_to_name']) num_preds = len(vocab['pred_idx_to_name']) self.embedding = SGEmbedding(num_objs, num_preds, embedding_dim, gconv_dim) gconv_kwargs = { 'input_dim': gconv_dim, 'hidden_dim': gconv_hidden_dim, 'pooling': gconv_pooling, 'num_layers': gconv_num_layers - 1, 'mlp_normalization': mlp_normalization, } self.gconv_net = GraphTripleConvNet(**gconv_kwargs) box_net_dim = 4 box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim] self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization) self.mask_net = None if mask_size is not None and mask_size > 0: self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size) # SHould it be 2* embedding_dim? rel_aux_layers = [2 * gconv_dim + 8, gconv_hidden_dim, num_preds] self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization) # Add context network # self.context_network = Context() # self.noise_layout = nn.Linear(context_embedding_dim + layout_noise_dim, # (context_embedding_dim + layout_noise_dim)*64*64) refinement_kwargs = { 'dims': (gconv_dim + layout_noise_dim + context_embedding_dim,) + refinement_dims, 'normalization': normalization, 'activation': activation, } self.refinement_net = RefinementNetwork(**refinement_kwargs)
def __init__(self, input_dim, output_dim=None, hidden_dim=512, pooling='avg', mlp_normalization='none', model_type=None):
    """GraphSAGE-style graph convolution with an LSTM aggregator."""
    super(GraphSageLSTMConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    assert pooling in ['sum', 'avg'], 'Invalid pooling "%s"' % pooling
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.pooling = pooling
    # net1: concatenated (subject, predicate, object) triple -> candidates.
    candidate_dims = [3 * input_dim, hidden_dim, 2 * hidden_dim + output_dim]
    self.net1 = build_mlp([d for d in candidate_dims if d is not None],
                          batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)
    # Maps a 2 * hidden_dim feature back down to output_dim.
    self.output_linear_layer = nn.Linear(self.hidden_dim * 2, self.output_dim)
    nn.init.kaiming_normal_(self.output_linear_layer.weight)
    self.initial_obj_projection_layer = nn.Linear(self.output_dim, self.hidden_dim)
    nn.init.kaiming_normal_(self.initial_obj_projection_layer.weight)
    ### RNN Component ###
    self.object_lstm = nn.LSTM(input_size=self.hidden_dim,
                               hidden_size=self.hidden_dim,
                               num_layers=1)
def __init__(self, opt):
    """Multi-scale discriminator wrapper.

    Builds attribute embeddings, a small representation MLP, an appearance
    encoder, a vector pool for fakes, and opt.num_D sub-discriminators
    registered as discriminator_0 ... discriminator_{num_D-1}.
    """
    super().__init__()
    self.opt = opt
    self.attribute_embedding = AttributeEmbeddings(
        self.opt.vocab['attributes'], self.opt.embedding_dim, use_attr_fc_gen=True)
    self.repr_input = opt.g_mask_dim
    # g_mask_dim -> 64 -> rep_size
    self.repr_net = build_mlp([self.repr_input, 64, opt.rep_size],
                              batch_norm=opt.mlp_normalization)
    self.image_encoder = AppearanceEncoder(
        vocab=self.opt.vocab,
        arch='C4-64-2,C4-128-2,C4-256-2',
        normalization=opt.appearance_normalization,
        activation=opt.a_activation,
        padding='valid',
        vecs_size=opt.g_mask_dim)  # Ignore
    self.fake_pool = VectorPool(opt.pool_size)  # Ignore
    for i in range(opt.num_D):
        self.add_module('discriminator_%d' % i, NLayerDiscriminator(opt))
def __init__(self, input_dim, output_dim=None, hidden_dim=512, pooling='avg', mlp_normalization='none', model_type=None):
    """Random-walk variant of the triple graph convolution layer."""
    super(GraphTripleRandomWalkConv, self).__init__()
    output_dim = input_dim if output_dim is None else output_dim
    assert pooling in ['sum', 'avg'], 'Invalid pooling "%s"' % pooling
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.pooling = pooling
    # net1: concatenated (subject, predicate, object) triple ->
    # two hidden_dim candidates plus an output_dim predicate vector.
    dims = [3 * input_dim, hidden_dim, 2 * hidden_dim + output_dim]
    self.net1 = build_mlp([d for d in dims if d is not None],
                          batch_norm=mlp_normalization)
    self.net1.apply(_init_weights)
    # net2: pooled per-object features -> output_dim.
    self.net2 = build_mlp([hidden_dim, hidden_dim, output_dim],
                          batch_norm=mlp_normalization)
    self.net2.apply(_init_weights)
def __init__(self, inp_nc, out_nc, nlayers=4, normalization='none', activation='relu'):
    """MLP of nlayers layers of width inp_nc + out_nc, ending in an
    out_nc layer with no final nonlinearity."""
    super().__init__()
    hidden = inp_nc + out_nc
    dims = tuple([hidden] * nlayers + [out_nc])
    self.net = build_mlp(dims,
                         batch_norm=normalization,
                         activation=activation,
                         final_nonlinearity=False)
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64, gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg', gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64), normalization='batch', activation='leakyrelu-0.2', mask_size=None, mlp_normalization='none', layout_noise_dim=0, **kwargs):
    """Scene-graph-to-image model: embeddings, graph convolutions, and
    box/mask prediction heads."""
    super(Sg2ImModel, self).__init__()
    # We used to have some additional arguments:
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if kwargs:
        print('WARNING: Model got unexpected kwargs ', kwargs)
    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    # +1 object slot — presumably a null/padding index; confirm with callers.
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)
    if gconv_num_layers == 0:
        # No graph convolution: a plain linear projection of embeddings.
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        self.gconv = GraphTripleConv(
            input_dim=embedding_dim,
            output_dim=gconv_dim,
            hidden_dim=gconv_hidden_dim,
            pooling=gconv_pooling,
            mlp_normalization=mlp_normalization)
    self.gconv_net = None
    if gconv_num_layers > 1:
        self.gconv_net = GraphTripleConvNet(
            input_dim=gconv_dim,
            hidden_dim=gconv_hidden_dim,
            pooling=gconv_pooling,
            num_layers=gconv_num_layers - 1,
            mlp_normalization=mlp_normalization)
    # Box regression head: 4 coordinates per object.
    self.box_net = build_mlp([gconv_dim, gconv_hidden_dim, 4],
                             batch_norm=mlp_normalization)
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
def __init__(self, opt):
    """SPADE-style generator.

    Builds the latent/seg-map entry layer, a trunk of SPADEResnetBlocks
    with progressively halved channel counts, the final RGB conv, and the
    auxiliary representation / appearance-encoder modules.
    """
    super().__init__()
    self.attribute_embedding = AttributeEmbeddings(opt.vocab['attributes'], opt.embedding_dim)
    self.opt = opt
    nf = opt.ngf  # base channel width
    # Spatial size of the initial latent feature map.
    self.sw, self.sh = self.compute_latent_vector_size(opt)
    if opt.use_vae:
        # In case of VAE, we will sample from random z vector
        self.fc = nn.Linear(opt.z_dim, 16 * nf * self.sw * self.sh)
    else:
        # Otherwise, we make the network deterministic by starting with
        # downsampled segmentation map instead of random z
        self.fc = nn.Conv2d(self.opt.semantic_nc, 16 * nf, 3, padding=1)
    # Trunk: head, two middle blocks, then upsampling blocks that halve
    # the channel count at each stage (16nf -> nf).
    self.head_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)
    self.G_middle_0 = SPADEResnetBlock(16 * nf, 16 * nf, opt)
    self.G_middle_1 = SPADEResnetBlock(16 * nf, 16 * nf, opt)
    self.up_0 = SPADEResnetBlock(16 * nf, 8 * nf, opt)
    self.up_1 = SPADEResnetBlock(8 * nf, 4 * nf, opt)
    self.up_2 = SPADEResnetBlock(4 * nf, 2 * nf, opt)
    self.up_3 = SPADEResnetBlock(2 * nf, 1 * nf, opt)
    final_nc = nf
    if opt.num_upsampling_layers == 'most':
        # One extra upsampling stage halves the final channel count.
        self.up_4 = SPADEResnetBlock(1 * nf, nf // 2, opt)
        final_nc = nf // 2
    self.conv_img = nn.Conv2d(final_nc, 3, 3, padding=1)
    self.up = nn.Upsample(scale_factor=2)
    # Representation MLP: g_mask_dim -> 64 -> rep_size.
    self.repr_input = opt.g_mask_dim
    rep_hidden_size = 64
    repr_layers = [self.repr_input, rep_hidden_size, opt.rep_size]
    self.repr_net = build_mlp(repr_layers, batch_norm=opt.mlp_normalization)
    appearance_encoder_kwargs = {
        'vocab': self.opt.vocab,
        'arch': 'C4-64-2,C4-128-2,C4-256-2',
        'normalization': opt.appearance_normalization,
        'activation': opt.a_activation,
        'padding': 'valid',
        'vecs_size': opt.g_mask_dim
    }
    self.image_encoder = AppearanceEncoder(**appearance_encoder_kwargs)
def __init__(self, feat_dim, pose_dim, nlayers=2, normalization='none', activation='relu'):
    """Image encoder for pose regression: a pretrained ResNet-18 trunk
    (classifier head removed) followed by an MLP down to pose_dim."""
    super().__init__()
    self.pose_dim = pose_dim
    self.feat_dim = feat_dim
    # ResNet-18 backbone without its final fully-connected layer.
    backbone = models.resnet18(pretrained=True)
    self.enc = nn.Sequential(*list(backbone.children())[:-1])
    # (feat_dim + pose_dim) -> pose_dim, repeated nlayers times.
    mlp_dims = (feat_dim + pose_dim, ) + (pose_dim, ) * nlayers
    self.linear = build_mlp(mlp_dims,
                            batch_norm=normalization,
                            activation=activation,
                            final_nonlinearity=True)
def __init__(
        self,
        vocab,
        image_size=(64, 64),
        embedding_dim=64,
        gconv_dim=128,
        gconv_hidden_dim=512,
        gconv_pooling='avg',
        gconv_num_layers=5,
        refinement_dims=(1024, 512, 256, 128, 64),
        normalization='batch',
        activation='leakyrelu-0.2',
        mask_size=None,
        mlp_normalization='none',
        layout_noise_dim=0,
        sg_context_dim=0,  #None,
        sg_context_dim_d=0,  #None,
        gcnn_pooling='avg',
        triplet_box_net=False,
        triplet_mask_size=0,
        triplet_embedding_size=0,
        use_bbox_info=False,
        triplet_superbox_net=False,
        **kwargs):
    """Scene-graph-to-image model with optional triplet heads.

    In addition to the standard embedding -> graph-conv -> box/mask
    pipeline, this variant can build auxiliary heads over concatenated
    (subject, predicate, object) triplet features: a triplet box
    regressor, a triplet embedding net, a triplet mask net, and a
    triplet superbox regressor.
    """
    super(Sg2ImModel, self).__init__()
    # We used to have some additional arguments:
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if len(kwargs) > 0:
        print('WARNING: Model got unexpected kwargs ', kwargs)
    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    self.sg_context_dim = sg_context_dim
    self.sg_context_dim_d = sg_context_dim_d
    self.gcnn_pooling = gcnn_pooling
    self.triplet_box_net = triplet_box_net
    self.triplet_mask_size = triplet_mask_size
    self.triplet_embedding_size = triplet_embedding_size
    self.use_bbox_info = use_bbox_info
    self.triplet_superbox_net = triplet_superbox_net
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        gconv_kwargs = {
            'input_dim': embedding_dim,
            'output_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv = GraphTripleConv(**gconv_kwargs)
    self.gconv_net = None
    if gconv_num_layers > 1:
        gconv_kwargs = {
            'input_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'num_layers': gconv_num_layers - 1,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv_net = GraphTripleConvNet(**gconv_kwargs)
    if self.use_bbox_info:
        box_net_dim = 4 + 1  # augment with additional info about the bbox
    else:
        box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)
    # triplet-related nets (stay None unless enabled by the flags below;
    # note this overwrites the bool/flag attributes assigned above)
    self.triplet_box_net = None
    self.triplet_embed_net = None
    self.triplet_mask_net = None
    self.triplet_superbox_net = None
    # output dimension (8 — presumably two 4-coordinate boxes; confirm)
    triplet_box_net_dim = 8
    if triplet_box_net:
        # input dimension is 3*128 for concatenated triplet
        triplet_box_net_layers = [
            3 * gconv_dim, gconv_hidden_dim, triplet_box_net_dim
        ]
        self.triplet_box_net = build_mlp(triplet_box_net_layers,
                                         batch_norm=mlp_normalization)
    # triplet embedding
    if self.triplet_embedding_size > 0:
        # input dim is 3*gconv_dim for concatenated triplet, output dim is triplet_embedding_size
        triplet_embed_layers = [
            3 * gconv_dim, gconv_hidden_dim, triplet_embedding_size
        ]
        self.triplet_embed_net = build_mlp(triplet_embed_layers,
                                           batch_norm=mlp_normalization)
    if self.triplet_mask_size > 0:
        # input dim is 3*gconv_dim for concatenated triplet, output dim is triplet_mask_size
        #self.triplet_mask_net = self._build_mask_net(num_objs, 3*gconv_dim, self.triplet_mask_size)
        self.triplet_mask_net = self._build_triplet_mask_net(
            num_objs, 3 * gconv_dim, self.triplet_mask_size)
    triplet_superbox_net_dim = 4
    if triplet_superbox_net:
        # input dimension is 3*128 for concatenated triplet
        triplet_superbox_net_layers = [
            3 * gconv_dim, gconv_hidden_dim, triplet_superbox_net_dim
        ]
        self.triplet_superbox_net = build_mlp(triplet_superbox_net_layers,
                                              batch_norm=mlp_normalization)
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
    ###########################
    self.sg_context_net = None
    self.sg_context_net_d = None
    if sg_context_dim is not None and sg_context_dim > 0:
        H, W = self.image_size  # NOTE(review): H, W unused here
        self.sg_context_net = nn.Linear(gconv_dim, sg_context_dim)
        self.sg_context_net_d = nn.Linear(gconv_dim, sg_context_dim_d)
        # sg_context_net_layers = [gconv_dim, sg_context_dim]
        # sg_context_net_layers = [gconv_dim, sg_context_dim_d]
        # self.sg_context_net = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
        # self.sg_context_net_d = build_mlp(sg_context_net_layers, batch_norm=mlp_normalization)
    #######################
    # Auxiliary predicate predictor: two object embeddings + 8 box coords.
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)
    if sg_context_dim > 0:
        refinement_kwargs = {
            'dims': (gconv_dim + sg_context_dim + layout_noise_dim, ) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
    else:
        refinement_kwargs = {
            'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)
def __init__(self, opt):
    """Scene-graph-to-layout model.

    Builds attribute/predicate embeddings, learned transitive/converse
    predicate weights, a stack of GraphTripleConv layers, a box regression
    MLP, and an optional mask net.
    """
    super(Sg2LayoutModel, self).__init__()
    args = vars(opt)
    self.vocab = args["vocab"]
    self.image_size = args["image_size"]
    self.layout_noise_dim = args["layout_noise_dim"]
    self.mask_noise_dim = args.get("mask_noise_dim")
    self.args = args
    self.attribute_embedding = AttributeEmbeddings(
        self.vocab['attributes'], args["embedding_dim"])
    num_preds = len(self.vocab['pred_idx_to_name'])
    self.pred_embeddings = nn.Embedding(num_preds, args["embedding_dim"])
    num_attributes = len(self.vocab['attributes'].keys())
    # Predicate prior weights; initialization controlled by opt.learned_init
    # (see get_predicates_weights).
    self.trans_candidates_weights = get_predicates_weights(
        num_preds, opt.learned_init)
    self.converse_candidates_weights = get_predicates_weights(
        (num_preds, num_preds), opt.learned_init)
    # Object input is the concatenation of one embedding per attribute type.
    obj_input_dim = len(
        self.vocab['attributes'].keys()) * args["embedding_dim"]
    first_graph_conv_layer = {
        "obj_input_dim": obj_input_dim,
        "object_output_dim": args["gconv_dim"],
        "predicate_input_dim": args["embedding_dim"],
        "predicate_output_dim": args["gconv_dim"],
        "hidden_dim": args["gconv_hidden_dim"],
        "num_attributes": num_attributes,
        "mlp_normalization": args["mlp_normalization"],
        "pooling": args["gconv_pooling"],
        "predicates_transitive_weights": self.trans_candidates_weights  # learned softly
    }
    # Subsequent layers consume the previous layer's object/predicate outputs.
    general_graph_conv_layer = first_graph_conv_layer.copy()
    general_graph_conv_layer.update({
        "obj_input_dim": first_graph_conv_layer["object_output_dim"],
        "predicate_input_dim": args["gconv_dim"]
    })
    layers = [
        first_graph_conv_layer
    ] + [general_graph_conv_layer] * (args["gconv_num_layers"] - 1)
    self.gconvs = nn.ModuleList()
    for layer in layers:
        self.gconvs.append(GraphTripleConv(**layer))
    object_output_dim = layers[-1]["object_output_dim"]
    # Box regression head: 4 coordinates per object, no final nonlinearity.
    box_net_dim = 4
    box_net_layers = [
        object_output_dim, args["gconv_hidden_dim"], box_net_dim
    ]
    self.box_net = build_mlp(box_net_layers,
                             batch_norm=args["mlp_normalization"],
                             final_nonlinearity=None)
    # masks generation
    self.mask_net = None
    if args["mask_size"] is not None and args["mask_size"] > 0:
        self.mask_net = self._build_mask_net(args['g_mask_dim'], args["mask_size"])
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
             gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg',
             gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64),
             normalization='batch', activation='leakyrelu-0.2',
             mask_size=None, mlp_normalization='none', layout_noise_dim=0,
             sg_context_dim=0,  #None,
             sg_context_dim_d=0,  #None,
             gcnn_pooling='avg',
             triplet_box_net=False,
             triplet_mask_size=0,
             triplet_embedding_size=0,
             use_bbox_info=False,
             triplet_superbox_net=False,
             use_masked_sg=False,
             **kwargs):
    """Sg2Im variant with masked-SG support, frozen reference embeddings,
    and a collection of auxiliary triplet / grounding heads.

    Fix: the "frozen" embedding tables are now actually frozen.  Setting
    ``requires_grad`` on an ``nn.Module`` only creates a plain attribute
    and leaves the weights trainable; it must be set on the ``weight``
    parameter.
    """
    super(Sg2ImModel, self).__init__()
    # We used to have some additional arguments:
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if len(kwargs) > 0:
        print('WARNING: Model got unexpected kwargs ', kwargs)
    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    self.sg_context_dim = sg_context_dim
    self.sg_context_dim_d = sg_context_dim_d
    self.gcnn_pooling = gcnn_pooling
    self.triplet_box_net = triplet_box_net
    self.triplet_mask_size = triplet_mask_size
    self.triplet_embedding_size = triplet_embedding_size
    self.use_bbox_info = use_bbox_info
    self.triplet_superbox_net = triplet_superbox_net
    self.use_masked_sg = use_masked_sg
    # hack to deal with vocabs with differing # of predicates
    self.mask_pred = 46  # vocab['idx_to_pred_name'][46] = 'none'
    #self.mask_pred = vocab['pred_name_to_idx']['none']
    self.embedding_dim = embedding_dim
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    # +1 predicate slot for the MASK token.
    self.pred_embeddings = nn.Embedding(num_preds + 1, embedding_dim)
    # Frozen embedding layers (reference copies that must not train).
    # BUG FIX: was `self.fr_obj_embeddings.requires_grad = False`, a no-op
    # attribute assignment on the Module; freeze the weight parameters.
    self.fr_obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.fr_pred_embeddings = nn.Embedding(num_preds + 1, embedding_dim)
    self.fr_obj_embeddings.weight.requires_grad = False
    self.fr_pred_embeddings.weight.requires_grad = False
    # Positional embeddings for bounding boxes (for spatio-semantic retrieval).
    bbox_dim = 4
    self.positional_embeddings = nn.Linear(bbox_dim, embedding_dim)
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        gconv_kwargs = {
            'input_dim': embedding_dim,
            'output_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv = GraphTripleConv(**gconv_kwargs)
    self.gconv_net = None
    if gconv_num_layers > 1:
        gconv_kwargs = {
            'input_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'num_layers': gconv_num_layers - 1,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv_net = GraphTripleConvNet(**gconv_kwargs)
    if self.use_bbox_info:
        box_net_dim = 4 + 1  # augment with additional info about the bbox
    else:
        box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)
    # triplet-related nets (stay None unless enabled by the flags below)
    self.triplet_box_net = None
    self.triplet_embed_net = None
    self.triplet_mask_net = None
    self.triplet_superbox_net = None
    self.pred_ground_net = None
    # output dimension (8 — presumably two 4-coordinate boxes; confirm)
    triplet_box_net_dim = 8
    if triplet_box_net:
        # input dimension is 3*gconv_dim for concatenated triplet
        triplet_box_net_layers = [3 * gconv_dim, gconv_hidden_dim, triplet_box_net_dim]
        self.triplet_box_net = build_mlp(triplet_box_net_layers,
                                         batch_norm=mlp_normalization)
    # triplet embedding
    if self.triplet_embedding_size > 0:
        # input dim is 3*gconv_dim for concatenated triplet
        triplet_embed_layers = [3 * gconv_dim, gconv_hidden_dim, triplet_embedding_size]
        self.triplet_embed_net = build_mlp(triplet_embed_layers,
                                           batch_norm=mlp_normalization)
    if self.triplet_mask_size > 0:
        #self.triplet_mask_net = self._build_mask_net(num_objs, 3*gconv_dim, self.triplet_mask_size)
        self.triplet_mask_net = self._build_triplet_mask_net(
            num_objs, 3 * gconv_dim, self.triplet_mask_size)
    triplet_superbox_net_dim = 4
    if triplet_superbox_net:
        # input dimension is 3*gconv_dim for concatenated triplet
        triplet_superbox_net_layers = [3 * gconv_dim, gconv_hidden_dim, triplet_superbox_net_dim]
        self.triplet_superbox_net = build_mlp(triplet_superbox_net_layers,
                                              batch_norm=mlp_normalization)
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
    ###########################
    self.sg_context_net = None
    self.sg_context_net_d = None
    if sg_context_dim is not None and sg_context_dim > 0:
        H, W = self.image_size
        self.sg_context_net = nn.Linear(gconv_dim, sg_context_dim)
        self.sg_context_net_d = nn.Linear(gconv_dim, sg_context_dim_d)
    #######################
    # predicate prediction from two object embeddings + 8 box coords
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)
    # subject prediction network
    subj_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_objs]
    self.subj_aux_net = build_mlp(subj_aux_layers, batch_norm=mlp_normalization)
    # object prediction network
    obj_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_objs]
    self.obj_aux_net = build_mlp(obj_aux_layers, batch_norm=mlp_normalization)
    # object class prediction network (linear head; MLP variant kept for reference)
    #self.obj_class_aux_net = build_mlp([embedding_dim, gconv_hidden_dim, num_objs], batch_norm=mlp_normalization)
    self.obj_class_aux_net = nn.Linear(embedding_dim, num_objs)
    # relationship embedding network
    self.rel_embed_aux_net = nn.Linear(embedding_dim, embedding_dim)
    # relationship class prediction network
    self.rel_class_aux_net = nn.Linear(embedding_dim, num_preds)
    # predicate mask prediction network
    pred_mask_layers = [2 * embedding_dim, gconv_hidden_dim, num_preds]
    self.pred_mask_net = build_mlp(pred_mask_layers, batch_norm=mlp_normalization)
    # predicate grounding: regress a box (4 coords) per relationship
    pred_ground_net_dim = 4
    pred_ground_net_layers = [gconv_dim, gconv_hidden_dim, pred_ground_net_dim]
    self.pred_ground_net = build_mlp(pred_ground_net_layers, batch_norm=mlp_normalization)
    # triplet context: 4*gconv_dim -> 3*gconv_dim
    #self.triplet_context_net = nn.Linear(4*gconv_dim, 3*gconv_dim)
    triplet_context_layers = [4 * gconv_dim, gconv_hidden_dim, 3 * gconv_dim]
    self.triplet_context_net = build_mlp(triplet_context_layers, batch_norm=mlp_normalization)
    if sg_context_dim > 0:
        refinement_kwargs = {
            'dims': (gconv_dim + sg_context_dim + layout_noise_dim,) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
    else:
        refinement_kwargs = {
            'dims': (gconv_dim + layout_noise_dim,) + refinement_dims,
            'normalization': normalization,
            'activation': activation,
        }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
             gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg',
             gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64),
             normalization='batch', activation='leakyrelu-0.2',
             mask_size=None, mlp_normalization='none', layout_noise_dim=0,
             **kwargs):
    """Reflection model: embeddings -> graph convolution -> box/mask
    regression -> cascaded refinement network.

    Fix: the first GraphTripleConv was given ``'pooling':
    gconv_num_layers - 1`` (an int) instead of the pooling mode string.
    """
    super(ReflectionModel, self).__init__()
    if len(kwargs) > 0:
        print("WARNING: Model got unexpected kwargs ", kwargs)
    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        gconv_kwargs = {
            'input_dim': embedding_dim,
            'output_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            # BUG FIX: was gconv_num_layers - 1, which would fail the
            # sibling layers' `pooling in ['sum', 'avg']` validation.
            'pooling': gconv_pooling,
            'mlp_normalization': mlp_normalization
        }
        self.gconv = GraphTripleConv(**gconv_kwargs)
    self.gconv_net = None
    if gconv_num_layers > 1:
        gconv_kwargs = {
            'input_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'num_layers': gconv_num_layers - 1,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv_net = GraphTripleConvNet(**gconv_kwargs)
    # Network for regressing bounding boxes (4 coords) using an MLP.
    box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)
    # Network for regressing segmentation masks.
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
    # Auxiliary predicate predictor: two object embeddings + 8 box coords.
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)
    # Cascaded refinement network producing the final image.
    refinement_kwargs = {
        'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
        'normalization': normalization,
        'activation': activation,
    }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
             gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg',
             gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64),
             normalization='batch', activation='leakyrelu-0.2',
             mask_size=None, mlp_normalization='none', layout_noise_dim=0,
             **kwargs):
    """Scene-graph-to-image model: embeddings, graph convolutions,
    box/mask heads, an auxiliary relationship predictor, and the
    cascaded refinement network."""
    super(Sg2ImModel, self).__init__()
    # We used to have some additional arguments:
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if len(kwargs) > 0:
        print('WARNING: Model got unexpected kwargs ', kwargs)
    self.vocab = vocab  # dictionary/dataframe
    self.image_size = image_size  # H, W
    self.layout_noise_dim = layout_noise_dim  # scalar
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)
    ## Load graph convolution model
    # There are 2 graph convolution models: self.gconv and self.gconv_net.
    # self.gconv is required; self.gconv_net is an additional (optional)
    # model after self.gconv. The reason is to separate the one that
    # accepts embeddings from the ones that follow with constant input_dim.
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        gconv_kwargs = {
            'input_dim': embedding_dim,
            'output_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv = GraphTripleConv(**gconv_kwargs)
    self.gconv_net = None
    if gconv_num_layers > 1:
        gconv_kwargs = {
            'input_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'num_layers': gconv_num_layers - 1,
            'mlp_normalization': mlp_normalization,
        }
        self.gconv_net = GraphTripleConvNet(**gconv_kwargs)
    ## BBOX model: regresses 4 coordinates per object.
    box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)
    # MASK model
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
    # AUX model
    # TODO: what task? retrieving the relation between 2 nodes?
    # (input is two object embeddings + 8 box coordinates; output num_preds)
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)
    # Refinement model
    refinement_kwargs = {
        'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
        'normalization': normalization,
        'activation': activation,
    }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)
def __init__(self, vocab, image_size=(64, 64), embedding_dim=64,
             gconv_dim=128, gconv_hidden_dim=512, gconv_pooling='avg',
             gconv_num_layers=5, refinement_dims=(1024, 512, 256, 128, 64),
             normalization='batch', activation='leakyrelu-0.2',
             mask_size=None, mlp_normalization='none', layout_noise_dim=0,
             model_type=None, **kwargs):
    """Sg2Im variant whose graph-convolution layer type is selected by
    ``model_type``.

    Fix: an unrecognized ``model_type`` previously left the constructor
    as None and crashed later with an opaque
    "'NoneType' object is not callable"; it now raises ValueError.
    """
    super(Sg2ImModel, self).__init__()
    # We used to have some additional arguments:
    # vec_noise_dim, gconv_mode, box_anchor, decouple_obj_predictions
    if len(kwargs) > 0:
        print('WARNING: Model got unexpected kwargs ', kwargs)
    self.vocab = vocab
    self.image_size = image_size
    self.layout_noise_dim = layout_noise_dim
    num_objs = len(vocab['object_idx_to_name'])
    num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(num_objs + 1, embedding_dim)
    self.pred_embeddings = nn.Embedding(num_preds, embedding_dim)
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    elif gconv_num_layers > 0:
        gconv_kwargs = {
            'input_dim': embedding_dim,
            'output_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'mlp_normalization': mlp_normalization,
            'model_type': model_type,
        }
        # Dispatch table replaces the original if/elif chain.
        model_constructors = {
            'baseline': GraphTripleConv,
            'random-walk-baseline': GraphTripleRandomWalkConv,
            'rnn-baseline': GraphTripleRnnConv,
            'graphsage-maxpool': GraphSageMaxPoolConv,
            'graphsage-lstm': GraphSageLSTMConv,
            'graphsage-mean': GraphSageMeanConv,
            'gat-baseline': GraphAttnConv,
        }
        if model_type not in model_constructors:
            raise ValueError('Invalid model_type "%s"' % model_type)
        print("gconv_kwargs", gconv_kwargs)
        print("model_type", model_type)
        self.gconv = model_constructors[model_type](**gconv_kwargs)
    self.gconv_net = None
    if gconv_num_layers > 1:
        gconv_kwargs = {
            'input_dim': gconv_dim,
            'hidden_dim': gconv_hidden_dim,
            'pooling': gconv_pooling,
            'num_layers': gconv_num_layers - 1,
            'mlp_normalization': mlp_normalization,
            'model_type': model_type,
        }
        self.gconv_net = GraphTripleConvNet(**gconv_kwargs)
    # Box regression head: 4 coordinates per object.
    box_net_dim = 4
    box_net_layers = [gconv_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)
    self.mask_net = None
    if mask_size is not None and mask_size > 0:
        self.mask_net = self._build_mask_net(num_objs, gconv_dim, mask_size)
    # Auxiliary predicate predictor: two object embeddings + 8 box coords.
    rel_aux_layers = [2 * embedding_dim + 8, gconv_hidden_dim, num_preds]
    self.rel_aux_net = build_mlp(rel_aux_layers, batch_norm=mlp_normalization)
    refinement_kwargs = {
        'dims': (gconv_dim + layout_noise_dim, ) + refinement_dims,
        'normalization': normalization,
        'activation': activation,
    }
    self.refinement_net = RefinementNetwork(**refinement_kwargs)