def __init__(self, input_dim, attributes_dim=0, output_dim=None, hidden_dim=512,
             pooling='avg', mlp_normalization='none'):
    """Create the two MLPs (``net1`` and ``net2``) owned by this layer.

    Args:
        input_dim: Base feature width; contributes ``3 * input_dim`` to the
            first entry of ``net1``'s layer-size list.
        attributes_dim: Contributes ``2 * attributes_dim`` to the first entry
            of ``net1``'s layer-size list. Defaults to 0.
        output_dim: Output feature width. Falls back to ``input_dim`` when
            ``None``.
        hidden_dim: Hidden width shared by both MLPs.
        pooling: Either ``'sum'`` or ``'avg'``; stored on ``self.pooling``.
        mlp_normalization: Passed to ``build_mlp`` as its ``batch_norm``
            argument for both MLPs.

    Raises:
        AssertionError: If ``pooling`` is neither ``'sum'`` nor ``'avg'``.
    """
    super(GraphTripleConv, self).__init__()

    self.input_dim = input_dim
    self.output_dim = input_dim if output_dim is None else output_dim
    self.hidden_dim = hidden_dim

    assert pooling in ('sum', 'avg'), 'Invalid pooling "%s"' % pooling
    self.pooling = pooling

    # Layer-size list for net1: the last entry is 2 * hidden_dim + output_dim.
    # NOTE(review): presumably two hidden-sized vectors plus one output-sized
    # vector are split out of net1's result -- confirm against forward().
    net1_dims = [
        3 * input_dim + 2 * attributes_dim,
        hidden_dim,
        2 * hidden_dim + self.output_dim,
    ]
    net1_dims = [dim for dim in net1_dims if dim is not None]
    first_mlp = build_mlp(net1_dims, batch_norm=mlp_normalization)
    first_mlp.apply(_init_weights)
    self.net1 = first_mlp

    # net2 is built only after net1 is fully initialised so the order of
    # random-number consumption stays the same.
    net2_dims = [hidden_dim, hidden_dim, self.output_dim]
    second_mlp = build_mlp(net2_dims, batch_norm=mlp_normalization)
    second_mlp.apply(_init_weights)
    self.net2 = second_mlp
def __init__(self, vocab, image_size=(64, 64), embedding_dim=128, gconv_dim=128,
             gconv_hidden_dim=512, gconv_pooling='avg', gconv_num_layers=5,
             mask_size=32, mlp_normalization='none', appearance_normalization='',
             activation='', n_downsample_global=4, box_dim=128,
             use_attributes=False, box_noise_dim=64, mask_noise_dim=64,
             pool_size=100, rep_size=32):
    """Build every sub-network of the model.

    Sub-modules are created in a fixed order (embeddings, graph convolution,
    box net, mask net, representation net, appearance encoder, generator).

    Args:
        vocab: Mapping that must provide ``'object_idx_to_name'`` and
            ``'pred_idx_to_name'`` (their lengths size the embedding tables)
            and, when ``use_attributes`` is true, ``'num_attributes'``.
            Also forwarded to ``AppearanceEncoder``.
        image_size: Stored on ``self.image_size``; not otherwise used here.
        embedding_dim: Width of the object and predicate embeddings and the
            input width of the first graph-convolution layer.
        gconv_dim: Output width of the graph convolution.
        gconv_hidden_dim: Hidden width of the graph convolution layers and of
            ``box_net``.
        gconv_pooling: Pooling mode forwarded to the graph convolution layers.
        gconv_num_layers: ``0`` uses a single ``nn.Linear`` in place of graph
            convolution; ``1`` uses one ``GraphTripleConv``; larger values add
            a ``GraphTripleConvNet`` with ``gconv_num_layers - 1`` layers.
            Must not be negative.
        mask_size: Forwarded to ``mask_net``.
        mlp_normalization: Forwarded as ``batch_norm`` to every ``build_mlp``
            call and as ``mlp_normalization`` to the graph convolution layers.
        appearance_normalization: ``normalization`` option of the appearance
            encoder.
        activation: ``activation`` option of the appearance encoder.
        n_downsample_global: Forwarded to ``define_G``.
        box_dim: Input width of ``box_net``.
        use_attributes: When true, ``vocab['num_attributes']`` is used as the
            attribute width of the first graph-convolution layer.
        box_noise_dim: Stored on ``self.box_noise_dim``; not otherwise used
            here.
        mask_noise_dim: Stored, and added to ``gconv_dim`` to form
            ``self.g_mask_dim``.
        pool_size: Capacity argument passed to ``VectorPool``.
        rep_size: Output width of ``repr_net``; also added to the number of
            object classes to form the generator's input channel count.

    Raises:
        ValueError: If ``gconv_num_layers`` is negative.
    """
    super(Model, self).__init__()

    # Fail fast: without this check a negative value matches neither branch
    # of the gconv selection below and ``self.gconv`` is never assigned.
    if gconv_num_layers < 0:
        raise ValueError(
            'gconv_num_layers must be non-negative, got %r' % (gconv_num_layers,))

    self.vocab = vocab
    self.image_size = image_size
    self.use_attributes = use_attributes
    self.box_noise_dim = box_noise_dim
    self.mask_noise_dim = mask_noise_dim
    self.object_size = 64
    self.fake_pool = VectorPool(pool_size)

    # Embedding tables are sized from the index->name tables of the vocab.
    self.num_objs = len(vocab['object_idx_to_name'])
    self.num_preds = len(vocab['pred_idx_to_name'])
    self.obj_embeddings = nn.Embedding(self.num_objs, embedding_dim)
    self.pred_embeddings = nn.Embedding(self.num_preds, embedding_dim)

    attributes_dim = vocab['num_attributes'] if use_attributes else 0

    # First graph stage: plain linear projection when graph convolution is
    # disabled, otherwise a single triple-convolution layer.
    if gconv_num_layers == 0:
        self.gconv = nn.Linear(embedding_dim, gconv_dim)
    else:
        self.gconv = GraphTripleConv(
            input_dim=embedding_dim,
            attributes_dim=attributes_dim,
            output_dim=gconv_dim,
            hidden_dim=gconv_hidden_dim,
            pooling=gconv_pooling,
            mlp_normalization=mlp_normalization,
        )

    # Remaining graph-convolution layers, if more than one was requested.
    self.gconv_net = None
    if gconv_num_layers > 1:
        self.gconv_net = GraphTripleConvNet(
            input_dim=gconv_dim,
            hidden_dim=gconv_hidden_dim,
            pooling=gconv_pooling,
            num_layers=gconv_num_layers - 1,
            mlp_normalization=mlp_normalization,
        )

    # Box head: box_dim -> gconv_hidden_dim -> 4 outputs.
    box_net_dim = 4
    self.box_dim = box_dim
    box_net_layers = [self.box_dim, gconv_hidden_dim, box_net_dim]
    self.box_net = build_mlp(box_net_layers, batch_norm=mlp_normalization)

    # Mask head input width is gconv_dim + mask_noise_dim.
    self.g_mask_dim = gconv_dim + mask_noise_dim
    self.mask_net = mask_net(self.g_mask_dim, mask_size)

    # Representation head: g_mask_dim -> 64 -> rep_size.
    self.repr_input = self.g_mask_dim
    rep_hidden_size = 64
    repr_layers = [self.repr_input, rep_hidden_size, rep_size]
    self.repr_net = build_mlp(repr_layers, batch_norm=mlp_normalization)

    appearance_encoder_kwargs = {
        'vocab': vocab,
        'arch': 'C4-64-2,C4-128-2,C4-256-2',
        'normalization': appearance_normalization,
        'activation': activation,
        'padding': 'valid',
        'vecs_size': self.g_mask_dim,
    }
    self.image_encoder = AppearanceEncoder(**appearance_encoder_kwargs)

    # Generator input channels: number of object classes plus rep_size;
    # output is 3 channels.
    netG_input_nc = self.num_objs + rep_size
    output_nc = 3
    ngf = 64
    n_blocks_global = 9
    norm = 'instance'
    self.layout_to_image = define_G(netG_input_nc, output_nc, ngf,
                                    n_downsample_global, n_blocks_global, norm)