def __init__(self, classes, embed_dim, inputs_dim, hidden_dim, direction='backward',
             dropout=0.2, pass_root=False):
    """
    Initializes the hierarchical tree-LSTM decoder.
    :param classes: list of object classes
    :param embed_dim: dimension of the class embeddings
    :param inputs_dim: dimension of the input features
    :param hidden_dim: hidden dim of the decoder
    :param direction: 'backward' (a root to all children) or 'foreward'
        (multiple children to a root; spelling kept to match HrTreeLSTM_Foreward)
    :param dropout: dropout probability
    :param pass_root: whether to pass the root node through the LSTM
    """
    super(DecoderHrTreeLSTM, self).__init__()
    self.classes = classes
    self.hidden_size = hidden_dim
    self.inputs_dim = inputs_dim
    self.nms_thresh = 0.5
    self.dropout = dropout
    self.pass_root = pass_root

    # generate embed layer (no 'start' token here)
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=embed_dim)
    self.obj_embed = nn.Embedding(len(self.classes), embed_dim)
    self.obj_embed.weight.data = embed_vecs

    # generate out layer
    self.out = nn.Linear(self.hidden_size, len(self.classes))
    torch.nn.init.xavier_normal_(self.out.weight, gain=1.0)  # in-place init; do not reassign the Parameter
    self.out.bias.data.fill_(0.0)

    if direction == 'backward':  # a root to all children
        self.input_size = inputs_dim
        self.decoderTreeLSTM = HrTreeLSTM_Backward(self.input_size, self.hidden_size,
                                                   self.pass_root, is_pass_embed=True,
                                                   embed_layer=self.obj_embed,
                                                   embed_out_layer=self.out)
    elif direction == 'foreward':  # multiple children to a root
        self.input_size = inputs_dim
        self.decoderTreeLSTM = HrTreeLSTM_Foreward(self.input_size, self.hidden_size,
                                                   self.pass_root, is_pass_embed=True,
                                                   embed_layer=self.obj_embed,
                                                   embed_out_layer=self.out)
    else:
        raise ValueError('Unknown decoder tree-LSTM direction: {}'.format(direction))
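# A minimal, self-contained sketch of the embedding-initialization pattern used
# above: an nn.Embedding whose rows are overwritten with pretrained word vectors,
# plus a xavier-initialized output layer. `fake_word_vectors` is a hypothetical
# stand-in for the repo's obj_edge_vectors (which loads GloVe vectors).
import torch
import torch.nn as nn

def fake_word_vectors(names, wv_dim=200):
    # Stand-in: random vectors instead of real GloVe lookups.
    return torch.randn(len(names), wv_dim)

classes = ['__background__', 'person', 'dog']
embed_dim = 200
obj_embed = nn.Embedding(len(classes), embed_dim)
obj_embed.weight.data = fake_word_vectors(classes, wv_dim=embed_dim)

out = nn.Linear(256, len(classes))
nn.init.xavier_normal_(out.weight, gain=1.0)  # in-place; keeps the Parameter intact
out.bias.data.zero_()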
def __init__(self, classes, rel_classes, mode='sgdet', embed_dim=200, hidden_dim=256,
             obj_dim=2048, nl_obj=2, nl_edge=2, dropout_rate=0.2, order='confidence',
             pass_in_obj_feats_to_decoder=True, pass_in_obj_feats_to_edge=True):
    super(LinearizedContext, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    assert mode in MODES
    self.mode = mode
    self.nl_obj = nl_obj
    self.nl_edge = nl_edge
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.dropout_rate = dropout_rate
    self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
    self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge
    assert order in ('size', 'confidence', 'random', 'leftright')
    self.order = order

    # EMBEDDINGS
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()
    self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed2.weight.data = embed_vecs.clone()

    # This probably doesn't help it much
    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])

    self.conver_fusion_feature = nn.Sequential(*[
        nn.BatchNorm1d(self.embed_dim + 128, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(self.embed_dim + 128, 4096),  # self.obj_dim +
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])
    self.decoder_lin_ = nn.Linear(4096, self.num_classes)
def __init__(self, classes, embed_dim, inputs_dim, hidden_dim,
             recurrent_dropout_probability=0.2, use_highway=True,
             use_input_projection_bias=True):
    """
    Initializes the decoder RNN.
    :param classes: list of object classes
    :param embed_dim: dimension of the class embeddings
    :param inputs_dim: dimension of the input features
    :param hidden_dim: hidden dim of the decoder
    :param recurrent_dropout_probability: dropout on the recurrent connection
    :param use_highway: use highway connections inside the LSTM cell
    :param use_input_projection_bias: add a bias to the input projection
    """
    super(DecoderRNN, self).__init__()
    self.classes = classes
    # One extra embedding row for the 'start' token prepended below.
    embed_vecs = obj_edge_vectors(['start'] + self.classes, wv_dim=100)
    self.obj_embed = nn.Embedding(len(self.classes) + 1, embed_dim)
    self.obj_embed.weight.data = embed_vecs

    self.hidden_size = hidden_dim
    self.inputs_dim = inputs_dim
    self.nms_thresh = 0.3

    self.recurrent_dropout_probability = recurrent_dropout_probability
    self.use_highway = use_highway
    # We do the projections for all the gates all at once, so if we are
    # using highway layers, we need some extra projections, which is
    # why the sizes of the Linear layers change here depending on this flag.
    # (self.input_size is a property defined elsewhere on this class.)
    if use_highway:
        self.input_linearity = torch.nn.Linear(self.input_size, 6 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size, 5 * self.hidden_size,
                                               bias=True)
    else:
        self.input_linearity = torch.nn.Linear(self.input_size, 4 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size, 4 * self.hidden_size,
                                               bias=True)
    self.out = nn.Linear(self.hidden_size, len(self.classes))
    self.reset_parameters()
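# A minimal sketch of why the fused projections above are sized 6*h / 5*h with
# highway connections (vs 4*h / 4*h without): all gates are computed in one
# matmul and then split with torch.chunk. The gate names follow the standard
# highway-LSTM formulation and are illustrative, not the repo's exact cell.
import torch
import torch.nn as nn

input_size, hidden_size = 32, 64
input_linearity = nn.Linear(input_size, 6 * hidden_size)
state_linearity = nn.Linear(hidden_size, 5 * hidden_size)

x = torch.randn(8, input_size)
h = torch.randn(8, hidden_size)

proj_x = input_linearity(x)  # (8, 6*h): 4 LSTM gates + highway gate + highway input
proj_h = state_linearity(h)  # (8, 5*h): 4 LSTM gates + highway gate
xi, xf, xc, xo, xhw, xhw_in = torch.chunk(proj_x, 6, dim=1)
hi, hf, hc, ho, hhw = torch.chunk(proj_h, 5, dim=1)
i = torch.sigmoid(xi + hi)          # input gate
f = torch.sigmoid(xf + hf)          # forget gate
hw = torch.sigmoid(xhw + hhw)       # highway gate blends the LSTM output with xhw_in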
def __init__(self, classes, mode='sgdet', embed_dim=20, obj_dim=4096):
    super(LC, self).__init__()
    self.classes = classes
    self.embed_dim = embed_dim
    self.obj_dim = obj_dim
    self.mode = mode

    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()

    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])

    self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128,
                                 self.num_classes)
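# The pos_embed block above (repeated across several models in this file) lifts
# 4-d normalized box coordinates to a 128-d feature. A minimal sketch, assuming
# boxes are (x1, y1, x2, y2) already scaled to [0, 1]; 0.01 is a stand-in for the
# repo's BATCHNORM_MOMENTUM constant.
import torch
import torch.nn as nn

pos_embed = nn.Sequential(
    nn.BatchNorm1d(4, momentum=0.01 / 10.0),  # whiten the raw coordinates
    nn.Linear(4, 128),
    nn.ReLU(inplace=True),
    nn.Dropout(0.1),
)
boxes = torch.rand(16, 4)        # 16 boxes, normalized coords
pos_feats = pos_embed(boxes)     # -> (16, 128), concatenated with obj/embed feats downstream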
def __init__(self, classes, rel_classes, mode='sgcls', embed_dim=200, hidden_dim=256,
             obj_dim=4096, pooling_dim=4096, ctx_dim=512):
    super(LinearizedContext, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    assert mode in MODES
    self.mode = mode
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.pooling_dim = pooling_dim
    self.ctx_dim = ctx_dim

    # EMBEDDINGS
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)  # K0
    self.obj_embed.weight.data = embed_vecs.clone()
    self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)  # K1
    self.obj_embed2.weight.data = embed_vecs.clone()

    # Object-Relational Embedding
    self.RE1 = RelationalEmbedding(input_dim=self.obj_dim + self.embed_dim + self.ctx_dim,
                                   output_dim=self.hidden_dim)
    self.RE2 = RelationalEmbedding(input_dim=self.hidden_dim,
                                   output_dim=self.num_classes)
    # Edge-Relational Embedding
    self.RE3 = RelationalEmbedding(input_dim=self.embed_dim + self.hidden_dim,
                                   output_dim=self.hidden_dim)
    self.RE4 = RelationalEmbedding(input_dim=self.hidden_dim,
                                   output_dim=self.pooling_dim * 2)
def __init__(self, classes, rel_classes, mode='sgdet', embed_dim=200, obj_dim=2048,
             order='confidence'):
    super(O_NODE, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    assert mode in MODES
    self.mode = mode
    self.embed_dim = embed_dim
    self.obj_dim = obj_dim

    # sgdet NMS settings
    self.nms_filter_duplicates = True
    self.max_per_img = 64   # default: 64
    self.thresh = 0.01      # 0.001 for training, 0.01 for testing

    assert order in ('size', 'confidence', 'random', 'leftright')
    self.order = order

    # EMBEDDINGS
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()

    # This probably doesn't help it much
    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])

    self.decoder_lin1 = nn.Linear(self.obj_dim + self.embed_dim + 128, 1024)
    self.decoder_lin2 = odeBlock(odeFunc2(use_cuda=True))
    self.decoder_lin3 = nn.Linear(1024, self.num_classes)
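# odeBlock/odeFunc2 above (the NODIS-style decoder) are not defined in this
# file; a minimal sketch of the usual pattern, assuming the torchdiffeq package.
# The block integrates a learned dynamics function over a fixed time span and
# returns the final state, so it can sit between two Linear layers just like
# decoder_lin2 does above.
import torch
import torch.nn as nn
from torchdiffeq import odeint  # assumption: torchdiffeq is installed

class ODEFunc(nn.Module):
    def __init__(self, dim=1024):
        super(ODEFunc, self).__init__()
        self.net = nn.Sequential(nn.Linear(dim, dim), nn.Tanh())

    def forward(self, t, y):  # dy/dt = f(t, y)
        return self.net(y)

class ODEBlock(nn.Module):
    def __init__(self, func):
        super(ODEBlock, self).__init__()
        self.func = func
        self.t = torch.tensor([0.0, 1.0])

    def forward(self, x):
        # odeint returns the state at every time in self.t; keep the endpoint.
        return odeint(self.func, x, self.t)[-1]

block = ODEBlock(ODEFunc(1024))
out = block(torch.randn(8, 1024))  # -> (8, 1024)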
def __init__(self, method, embed_objs, subj_pred_obj_pairs, L=0.2, topk=5, alpha=2,
             uniform=False, degree_smoothing=1, data_dir=None, obj_classes=None,
             triplet2str=None):
    self.method = method
    if embed_objs is None:
        embed_objs = obj_edge_vectors(obj_classes, wv_dir=data_dir, wv_dim=200,
                                      avg_words=True)[0]
    embed_objs = embed_objs / torch.norm(embed_objs, 2, dim=1, keepdim=True)
    self.obj_pairwise = pairwise_similarity(embed_objs)
    self.subj_pred_obj_pairs = subj_pred_obj_pairs
    self.L = L
    self.topk = topk
    self.alpha = alpha
    self.uniform = uniform
    self.degree_smoothing = degree_smoothing
    self.n_obj_classes = self.obj_pairwise.shape[0]
    self.obj_classes = obj_classes
    self.triplet2str = triplet2str
    if self.method == 'neigh':
        assert self.topk > 0, self.topk
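# After the L2-normalization above, pairwise_similarity reduces to a single
# matrix product giving cosine similarities between all class embeddings. A
# minimal sketch (the repo's pairwise_similarity may differ in details):
import torch

embed_objs = torch.randn(151, 200)  # e.g. VG's 151 object classes
embed_objs = embed_objs / torch.norm(embed_objs, 2, dim=1, keepdim=True)
obj_pairwise = embed_objs @ embed_objs.t()            # (151, 151) cosine similarities
topk_vals, topk_idx = obj_pairwise.topk(5, dim=1)     # 5 most similar classes per class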
def __init__(self, classes, rel_classes, mode='sgdet', num_gpus=1, use_vision=False,
             require_overlap_det=True, embed_dim=200, hidden_dim=4096, use_resnet=False,
             thresh=0.01, use_proposals=False, use_bias=True, limit_vision=True,
             depth_model=None, pretrained_depth=False, active_features=None,
             frozen_features=None, use_embed=False, **kwargs):
    """
    :param classes: object classes
    :param rel_classes: relationship classes; None if we're not using rel mode
    :param mode: (sgcls, predcls, or sgdet)
    :param num_gpus: how many GPUs to use
    :param use_vision: enable the contribution of the union of bounding boxes
    :param require_overlap_det: whether two objects must intersect
    :param embed_dim: word2vec embedding dimension
    :param hidden_dim: dimension of the fusion hidden layer
    :param use_resnet: use ResNet as Faster R-CNN's backbone
    :param thresh: Faster R-CNN related threshold (threshold for calling it a good box)
    :param use_proposals: whether to use region proposal candidates
    :param use_bias: enable frequency bias
    :param limit_vision: use truncated version of UoBB features
    :param depth_model: provided architecture for depth feature extraction
    :param pretrained_depth: whether the depth feature extractor should be
        initialized with ImageNet weights
    :param active_features: which set of features should be enabled
        (e.g. 'vdl': visual, depth, and location features)
    :param frozen_features: which set of features should be frozen (e.g. 'd': depth)
    :param use_embed: use word2vec embeddings
    """
    RelModelBase.__init__(self, classes, rel_classes, mode, num_gpus,
                          require_overlap_det, active_features, frozen_features)
    self.pooling_size = 7
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = 2048 if use_resnet else 4096
    self.use_vision = use_vision
    self.use_bias = use_bias
    self.limit_vision = limit_vision

    # -- Store depth related parameters
    assert depth_model in DEPTH_MODELS
    self.depth_model = depth_model
    self.pretrained_depth = pretrained_depth
    self.depth_pooling_dim = DEPTH_DIMS[self.depth_model]
    self.use_embed = use_embed
    self.detector = nn.Module()
    features_size = 0

    # -- Check whether ResNet is selected as Faster R-CNN's backbone
    if use_resnet:
        raise ValueError(
            "The current model does not support ResNet as the Faster-RCNN's backbone.")

    """
    *** DIFFERENT COMPONENTS OF THE PROPOSED ARCHITECTURE ***
    This is the part where the different components of the proposed relation
    detection architecture are defined. In the case of RGB images, we have class
    probability distribution features, visual features, and the location ones.
    If we are considering depth images as well, we augment depth features too.
    """
    # -- Visual features
    if self.has_visual:
        # -- Define Faster R-CNN network and its related feature extractors
        self.detector = ObjectDetector(
            classes=classes,
            mode=('proposals' if use_proposals else 'refinerels')
            if mode == 'sgdet' else 'gtbox',
            use_resnet=use_resnet,
            thresh=thresh,
            max_per_img=64,
        )
        self.roi_fmap_obj = load_vgg(pretrained=False).classifier

        # -- Define union features
        if self.use_vision:
            # -- UoBB pooling module
            self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size,
                                                  stride=16,
                                                  dim=1024 if use_resnet else 512)
            # -- UoBB feature extractor
            roi_fmap = [
                Flattener(),
                load_vgg(use_dropout=False, use_relu=False,
                         use_linear=self.hidden_dim == 4096,
                         pretrained=False).classifier,
            ]
            if self.hidden_dim != 4096:
                roi_fmap.append(nn.Linear(4096, self.hidden_dim))
            self.roi_fmap = nn.Sequential(*roi_fmap)

        # -- Define visual features hidden layer
        self.visual_hlayer = nn.Sequential(*[
            xavier_init(nn.Linear(self.obj_dim * 2, self.FC_SIZE_VISUAL)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.8)
        ])
        self.visual_scale = ScaleLayer(1.0)
        features_size += self.FC_SIZE_VISUAL

    # -- Location features
    if self.has_loc:
        # -- Define location features hidden layer
        self.location_hlayer = nn.Sequential(*[
            xavier_init(nn.Linear(self.LOC_INPUT_SIZE, self.FC_SIZE_LOC)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1)
        ])
        self.location_scale = ScaleLayer(1.0)
        features_size += self.FC_SIZE_LOC

    # -- Class features
    if self.has_class:
        if self.use_embed:
            # -- Define class embeddings
            embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
            self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
            self.obj_embed.weight.data = embed_vecs.clone()
        classme_input_dim = self.embed_dim if self.use_embed else self.num_classes
        # -- Define class features hidden layer
        self.classme_hlayer = nn.Sequential(*[
            xavier_init(nn.Linear(classme_input_dim * 2, self.FC_SIZE_CLASS)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1)
        ])
        self.classme_scale = ScaleLayer(1.0)
        features_size += self.FC_SIZE_CLASS

    # -- Depth features
    if self.has_depth:
        # -- Initialize depth backbone
        self.depth_backbone = DepthCNN(depth_model=self.depth_model,
                                       pretrained=self.pretrained_depth)
        # -- Create a relation head which is used to carry on the feature
        # extraction from RoIs of depth features
        self.depth_rel_head = self.depth_backbone.get_classifier()
        # -- Define depth features hidden layer
        self.depth_rel_hlayer = nn.Sequential(*[
            xavier_init(nn.Linear(self.depth_pooling_dim * 2, self.FC_SIZE_DEPTH)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.6),
        ])
        self.depth_scale = ScaleLayer(1.0)
        features_size += self.FC_SIZE_DEPTH

    # -- Initialize frequency bias if needed
    if self.use_bias:
        self.freq_bias = FrequencyBias()

    # -- *** Fusion layer ***
    # -- A hidden layer for the concatenated (fusion) features
    self.fusion_hlayer = nn.Sequential(*[
        xavier_init(nn.Linear(features_size, self.hidden_dim)),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1)
    ])
    # -- Final FC layer which predicts the relations
    self.rel_out = xavier_init(nn.Linear(self.hidden_dim, self.num_rels, bias=True))

    # -- Freeze the user-specified features
    if self.frz_visual:
        self.freeze_module(self.detector)
        self.freeze_module(self.roi_fmap_obj)
        self.freeze_module(self.visual_hlayer)
        if self.use_vision:
            self.freeze_module(self.roi_fmap)
            self.freeze_module(self.union_boxes.conv)
    if self.frz_class:
        self.freeze_module(self.classme_hlayer)
    if self.frz_loc:
        self.freeze_module(self.location_hlayer)
    if self.frz_depth:
        self.freeze_module(self.depth_backbone)
        self.freeze_module(self.depth_rel_head)
        self.freeze_module(self.depth_rel_hlayer)
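# freeze_module and ScaleLayer are repo utilities not shown here. A minimal
# sketch of what such a freezer typically does (an assumption about this repo's
# helper, but the standard PyTorch idiom):
import torch.nn as nn

def freeze_module(module: nn.Module):
    """Exclude a submodule from gradient computation and keep its BN stats fixed."""
    for param in module.parameters():
        param.requires_grad = False
    module.eval()  # freezes BatchNorm running statistics and disables dropout

frozen_head = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.1))
freeze_module(frozen_head)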
def __init__(self, classes, rel_classes, mode='sgdet', embed_dim=200, hidden_dim=256,
             obj_dim=2048, nl_obj=2, nl_edge=2, dropout_rate=0.2, order='confidence',
             pass_in_obj_feats_to_decoder=True, pass_in_obj_feats_to_edge=True):
    super(LinearizedContext, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    assert mode in MODES
    self.mode = mode
    self.nl_obj = nl_obj
    self.nl_edge = nl_edge
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.dropout_rate = dropout_rate
    self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
    self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge
    assert order in ('size', 'confidence', 'random', 'leftright')
    self.order = order

    # EMBEDDINGS
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()
    self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed2.weight.data = embed_vecs.clone()

    # This probably doesn't help it much
    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])

    if self.nl_obj > 0:
        self.obj_ctx_rnn = AlternatingHighwayLSTM(
            input_size=self.obj_dim + self.embed_dim + 128,
            hidden_size=self.hidden_dim,
            num_layers=self.nl_obj,
            recurrent_dropout_probability=dropout_rate)
        decoder_inputs_dim = self.hidden_dim
        if self.pass_in_obj_feats_to_decoder:
            decoder_inputs_dim += self.obj_dim + self.embed_dim
        self.decoder_rnn = DecoderRNN(self.classes, embed_dim=self.embed_dim,
                                      inputs_dim=decoder_inputs_dim,
                                      hidden_dim=self.hidden_dim,
                                      recurrent_dropout_probability=dropout_rate)
    else:
        self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128,
                                     self.num_classes)

    if self.nl_edge > 0:
        input_dim = self.embed_dim
        if self.nl_obj > 0:
            input_dim += self.hidden_dim
        if self.pass_in_obj_feats_to_edge:
            input_dim += self.obj_dim
        self.edge_ctx_rnn = AlternatingHighwayLSTM(
            input_size=input_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.nl_edge,
            recurrent_dropout_probability=dropout_rate)
def __init__(self,
             obj_classes,
             rel_classes,
             embed_dim=200,
             hidden_dim=64,
             n_ch=512,
             pool_sz=7,
             fmap_sz=38,
             losses=('D', 'G', 'rec'),
             SN=True,
             BN=True,
             n_layers_G=5,
             vis_cond=None,
             init_embed=False,
             largeD=False,
             data_dir='',  # to load word embeddings
             device='cuda'):
    """
    :param embed_dim: dimension for all embeddings
    :param obj_dim: derived below as pool_sz**2 * n_ch
    """
    super(GAN, self).__init__()
    self.obj_classes = obj_classes
    self.rel_classes = rel_classes
    self.embed_dim = embed_dim
    self.n_ch = n_ch
    self.obj_dim = pool_sz**2 * n_ch
    self.pool_sz = pool_sz
    self.fmap_sz = fmap_sz
    self.losses = losses
    self.SN = SN
    self.BN = BN
    self.vis_cond = vis_cond
    self.largeD = largeD
    self.h5_data = None
    self.device = device
    if vis_cond is not None:
        self.h5_data = h5py.File(vis_cond, mode='r')

    self.G_obj_embed = nn.Embedding(len(self.obj_classes), self.embed_dim)
    self.G_rel_embed = nn.Embedding(len(self.rel_classes), self.embed_dim)

    if SN:
        conv = lambda n_in, n_out, ks, pad: spectral_norm(
            nn.Conv2d(n_in, n_out, kernel_size=ks, padding=pad))
    else:
        conv = lambda n_in, n_out, ks, pad: nn.Conv2d(
            n_in, n_out, kernel_size=ks, padding=pad)

    def cond_discriminator(n_classes):
        # input is 512x7x7
        return nn.Sequential(
            conv(n_ch + n_classes, n_ch // 2, 3, 0),  # ->256x5x5
            nn.ReLU(),
            conv(n_ch // 2, n_ch // 4, 3, 0),  # ->128x3x3
            nn.ReLU(),
            conv(n_ch // 4, n_ch // 8, 1, 0),  # ->64x3x3
            nn.ReLU(),
            conv(n_ch // 8, 1, 3, 0),  # ->1x1x1
            nn.Flatten())

    # Discriminators (must start with D_) ----------------------------------
    self.D_nodes = cond_discriminator(len(self.obj_classes))
    self.D_edges = cond_discriminator(len(self.rel_classes))
    self.D_global = nn.Sequential(
        conv(n_ch, n_ch // 2, 3, 0),  # 512x38x38->256x36x36
        nn.LeakyReLU(0.2),
        conv(n_ch // 2, n_ch // 2, 1, 0) if largeD else nn.Identity(),  # ->256x36x36
        nn.LeakyReLU(0.2) if largeD else nn.Identity(),
        nn.AvgPool2d(2, ceil_mode=True) if fmap_sz > 24 else nn.Identity(),  # ->256x18x18
        conv(n_ch // 2, n_ch // 2, 3, 0),  # ->256x16x16
        nn.LeakyReLU(0.2),
        conv(n_ch // 2, n_ch // 2, 1, 0) if largeD else nn.Identity(),  # ->256x16x16
        nn.LeakyReLU(0.2) if largeD else nn.Identity(),
        nn.AvgPool2d(2),  # ->256x8x8
        conv(n_ch // 2, n_ch // 4, 3, 0),  # ->128x6x6
        nn.LeakyReLU(0.2),
        conv(n_ch // 4, n_ch // 4, 1, 0) if largeD else nn.Identity(),  # ->128x6x6
        nn.LeakyReLU(0.2) if largeD else nn.Identity(),
        nn.AvgPool2d(2),  # ->128x3x3
        conv(n_ch // 4, 1, 3, 0),  # ->1x1x1
        nn.Flatten())
    print('Global Discriminator:', self.D_global)

    # Generators (must start with G_) --------------------------------------
    # Graph Convolutional Network (returns 32x7x7 features)
    self.G_gcn = GraphTripleConvNet(
        input_dim=self.embed_dim + 4,
        input_edge_dim=self.embed_dim,
        output_dim=hidden_dim // 2 * pool_sz * pool_sz,
        num_layers=n_layers_G,
        hidden_dim=hidden_dim,
        pooling='avg',
        mlp_normalization='batch' if BN else 'none')

    # Post-process GCN features with conv layers to make them more "spatial"
    self.G_node = nn.Sequential(
        nn.Conv2d(hidden_dim // 2, hidden_dim, kernel_size=3, padding=1),
        nn.ReLU(),
        nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, padding=1),
        nn.ReLU())

    # To transform hidden features concatenated with visual features
    self.G_proj = nn.Conv2d(hidden_dim + int(vis_cond is not None) * n_ch,
                            hidden_dim, kernel_size=1)

    # To generate large global features
    self.G_refine = RefinementNetwork(dims=(hidden_dim, n_ch // 4, n_ch // 2, n_ch),
                                      normalization='batch',
                                      activation='leakyrelu-0.2')

    # Predefine 0/1 labels to avoid the costly creation of large label vectors
    # for every batch during training
    n_max = 50000  # some arbitrarily big number
    self.y_real_, self.y_fake_ = (torch.ones(n_max, 1).to(device),
                                  torch.zeros(n_max, 1).to(device))
    self.y_real = lambda n: Variable(self.y_real_[:n])
    self.y_fake = lambda n: Variable(self.y_fake_[:n])

    # Load the GloVe-based language model used for SG perturbations and for
    # initializing the GAN input embeddings
    embed_objs, word_vectors = obj_edge_vectors(self.obj_classes,
                                                wv_dir=data_dir,
                                                wv_dim=embed_dim,
                                                word_vectors=None,
                                                avg_words=True)
    self.embed_objs = (embed_objs /
                       torch.norm(embed_objs, 2, dim=1, keepdim=True)).to(device)

    embed_rels = obj_edge_vectors(self.rel_classes,
                                  wv_dim=embed_dim,
                                  word_vectors=word_vectors,
                                  avg_words=True)[0]
    self.embed_rels = (embed_rels /
                       torch.norm(embed_rels, 2, dim=1, keepdim=True)).to(device)

    if init_embed:
        # Initialize the learnable GAN embeddings with the GloVe ones.
        # Using this led to worse results in our experiments, so we don't use it.
        assert self.G_obj_embed.weight.shape == self.embed_objs.shape, (
            self.G_obj_embed.weight.shape, self.embed_objs.shape)
        self.G_obj_embed.weight.data = self.embed_objs.clone()
        assert self.G_rel_embed.weight.shape == self.embed_rels.shape, (
            self.G_rel_embed.weight.shape, self.embed_rels.shape)
        self.G_rel_embed.weight.data = self.embed_rels.clone()
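# A quick, standalone shape check of the conditional-discriminator pattern
# above: one-hot class maps are broadcast over the 7x7 grid, concatenated with
# the 512-channel features, and reduced to a single logit by three unpadded
# convs. Numbers mirror the defaults above; this is a sketch, not the repo's
# module.
import torch
import torch.nn as nn
from torch.nn.utils import spectral_norm

n_ch, n_classes, pool_sz = 512, 151, 7
conv = lambda i, o, k, p: spectral_norm(nn.Conv2d(i, o, kernel_size=k, padding=p))
D = nn.Sequential(
    conv(n_ch + n_classes, n_ch // 2, 3, 0), nn.ReLU(),  # 7x7 -> 5x5
    conv(n_ch // 2, n_ch // 4, 3, 0), nn.ReLU(),         # 5x5 -> 3x3
    conv(n_ch // 4, n_ch // 8, 1, 0), nn.ReLU(),         # 3x3 -> 3x3
    conv(n_ch // 8, 1, 3, 0), nn.Flatten())              # 3x3 -> 1x1 -> (N, 1)

feats = torch.randn(4, n_ch, pool_sz, pool_sz)
onehot = torch.zeros(4, n_classes)
onehot[:, 0] = 1
cond = onehot[:, :, None, None].expand(-1, -1, pool_sz, pool_sz)
logits = D(torch.cat([feats, cond], dim=1))
print(logits.shape)  # torch.Size([4, 1])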
def __init__(self, classes, rel_classes, mode='sgdet', num_gpus=1, use_vision=True,
             require_overlap_det=True, embed_dim=200, hidden_dim=256, pooling_dim=2048,
             nl_obj=1, nl_edge=2, use_resnet=False, order='confidence', thresh=0.01,
             use_proposals=False, pass_in_obj_feats_to_decoder=True,
             pass_in_obj_feats_to_edge=True, rec_dropout=0.0, use_bias=True,
             use_tanh=True, limit_vision=True):
    """
    :param classes: object classes
    :param rel_classes: relationship classes; None if we're not using rel mode
    :param mode: (sgcls, predcls, or sgdet)
    :param num_gpus: how many GPUs to use
    :param use_vision: whether to use vision in the final product
    :param require_overlap_det: whether two objects must intersect
    :param embed_dim: dimension for all embeddings
    :param hidden_dim: LSTM hidden size
    """
    super(RelModel, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    self.num_gpus = num_gpus
    assert mode in MODES
    self.mode = mode

    self.pooling_size = 7
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = 2048 if use_resnet else 4096
    self.pooling_dim = pooling_dim
    self.use_bias = use_bias
    self.use_vision = use_vision
    self.use_tanh = use_tanh
    self.limit_vision = limit_vision
    self.require_overlap = require_overlap_det and self.mode == 'sgdet'
    self.hook_for_grad = False
    self.gradients = []

    self.detector = ObjectDetector(
        classes=classes,
        mode=('proposals' if use_proposals else 'refinerels')
        if mode == 'sgdet' else 'gtbox',
        use_resnet=use_resnet,
        thresh=thresh,
        max_per_img=64,
    )

    self.ort_embedding = torch.autograd.Variable(
        get_ort_embeds(self.num_classes, 200).cuda())
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()

    # This probably doesn't help it much
    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        nn.Dropout(0.1),
    ])

    self.context = LinearizedContext(
        self.classes, self.rel_classes, mode=self.mode,
        embed_dim=self.embed_dim, hidden_dim=self.hidden_dim,
        obj_dim=self.obj_dim, nl_obj=nl_obj, nl_edge=nl_edge,
        dropout_rate=rec_dropout, order=order,
        pass_in_obj_feats_to_decoder=pass_in_obj_feats_to_decoder,
        pass_in_obj_feats_to_edge=pass_in_obj_feats_to_edge)

    # Image feats (you'll have to disable this if you want to turn off the
    # features from here)
    self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size, stride=16,
                                          dim=1024 if use_resnet else 512)

    self.merge_obj_feats = nn.Sequential(
        nn.Linear(self.obj_dim + self.embed_dim + 128, self.hidden_dim), nn.ReLU())
    self.get_phr_feats = nn.Linear(self.pooling_dim, self.hidden_dim)
    self.embeddings4lstm = nn.Embedding(self.num_classes, self.embed_dim)
    self.lstm = nn.LSTM(input_size=self.hidden_dim + self.embed_dim,
                        hidden_size=self.hidden_dim, num_layers=1)
    self.obj_mps1 = Message_Passing4OBJ(self.hidden_dim)
    self.get_boxes_encode = Boxes_Encode(64)

    if use_resnet:
        self.roi_fmap = nn.Sequential(
            resnet_l4(relu_end=False),
            nn.AvgPool2d(self.pooling_size),
            Flattener(),
        )
    else:
        roi_fmap = [
            Flattener(),
            load_vgg(use_dropout=False, use_relu=False,
                     use_linear=pooling_dim == 4096, pretrained=False).classifier,
        ]
        if pooling_dim != 4096:
            roi_fmap.append(nn.Linear(4096, pooling_dim))
        self.roi_fmap = nn.Sequential(*roi_fmap)
    self.roi_fmap_obj = load_vgg(pretrained=False).classifier

    self.post_emb_s = nn.Linear(self.pooling_dim, self.pooling_dim)
    torch.nn.init.xavier_normal_(self.post_emb_s.weight, gain=1.0)
    self.post_emb_o = nn.Linear(self.pooling_dim, self.pooling_dim)
    torch.nn.init.xavier_normal_(self.post_emb_o.weight, gain=1.0)
    self.merge_obj_high = nn.Linear(self.hidden_dim, self.pooling_dim)
    torch.nn.init.xavier_normal_(self.merge_obj_high.weight, gain=1.0)
    self.merge_obj_low = nn.Linear(self.pooling_dim + 5 + self.embed_dim,
                                   self.pooling_dim)
    torch.nn.init.xavier_normal_(self.merge_obj_low.weight, gain=1.0)
    self.rel_compress = nn.Linear(self.pooling_dim + 64, self.num_rels, bias=True)
    torch.nn.init.xavier_normal_(self.rel_compress.weight, gain=1.0)
    self.freq_gate = nn.Linear(self.pooling_dim + 64, self.num_rels, bias=True)
    torch.nn.init.xavier_normal_(self.freq_gate.weight, gain=1.0)

    if self.use_bias:
        self.freq_bias = FrequencyBias()
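# The in-place xavier_normal_ calls above replace the deprecated pattern
# `layer.weight = torch.nn.init.xavier_normal(layer.weight)`, which assigns a
# plain Tensor to a Parameter attribute. A small helper sketch for the same
# init (an assumption about what the repo's own xavier_init helper, used
# elsewhere in this file, roughly looks like):
import torch.nn as nn

def xavier_init(layer: nn.Linear, gain: float = 1.0) -> nn.Linear:
    nn.init.xavier_normal_(layer.weight, gain=gain)  # in-place, keeps the Parameter
    if layer.bias is not None:
        layer.bias.data.zero_()
    return layer

rel_compress = xavier_init(nn.Linear(2048 + 64, 51, bias=True))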
def __init__(self, classes, rel_classes, mode='sgdet', embed_dim=200, hidden_dim=256,
             obj_dim=2048, nl_obj=2, nl_edge=2, dropout_rate=0.2, order='confidence',
             pass_in_obj_feats_to_decoder=True, pass_in_obj_feats_to_edge=True,
             use_rl_tree=True, draw_tree=False):
    super(LinearizedContext, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    assert mode in MODES
    self.mode = mode
    self.nl_obj = nl_obj
    self.nl_edge = nl_edge
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.dropout_rate = dropout_rate
    self.pass_in_obj_feats_to_decoder = pass_in_obj_feats_to_decoder
    self.pass_in_obj_feats_to_edge = pass_in_obj_feats_to_edge
    self.use_rl_tree = use_rl_tree
    self.draw_tree = draw_tree
    assert order in ('size', 'confidence', 'random', 'leftright')
    self.order = order

    # EMBEDDINGS
    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()
    self.virtual_node_embed = nn.Embedding(1, self.embed_dim)  # used to encode the root node
    self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed2.weight.data = embed_vecs.clone()

    # This probably doesn't help it much
    self.pos_embed = nn.Sequential(*[
        nn.BatchNorm1d(4, momentum=BATCHNORM_MOMENTUM / 10.0),
        nn.Linear(4, 128),
        nn.ReLU(inplace=True),
        # nn.Dropout(0.1),
    ])

    # Tree generation
    self.rl_input_size = 256
    self.rl_hidden_size = 256
    self.feat_preprocess_net = gen_tree.RLFeatPreprocessNet(
        self.obj_dim, self.embed_dim, 8, 6, self.rl_input_size)
    self.rl_sub = nn.Linear(self.rl_input_size, self.rl_hidden_size)
    self.rl_obj = nn.Linear(self.rl_input_size, self.rl_hidden_size)
    self.rl_scores = nn.Linear(self.rl_hidden_size * 3 + 3, 1)

    # init
    torch.nn.init.xavier_normal_(self.rl_sub.weight, gain=1.0)
    self.rl_sub.bias.data.zero_()
    torch.nn.init.xavier_normal_(self.rl_obj.weight, gain=1.0)
    self.rl_obj.bias.data.zero_()
    torch.nn.init.xavier_normal_(self.rl_scores.weight, gain=1.0)
    self.rl_scores.bias.data.zero_()

    # whether to draw trees
    if self.draw_tree:
        self.draw_tree_count = 0
        self.draw_tree_max = 600

    if self.nl_obj > 0:
        self.obj_tree_lstm = tree_lstm.MultiLayer_BTreeLSTM(
            self.obj_dim + self.embed_dim + 128, self.hidden_dim, self.nl_obj,
            dropout_rate)
        decoder_inputs_dim = self.hidden_dim
        if self.pass_in_obj_feats_to_decoder:
            decoder_inputs_dim += self.obj_dim + self.embed_dim
        self.decoder_tree_lstm = DecoderTreeLSTM(classes,
                                                 embed_dim=100,  # not self.embed_dim
                                                 inputs_dim=decoder_inputs_dim,
                                                 hidden_dim=self.hidden_dim,
                                                 direction='backward',
                                                 dropout=dropout_rate,
                                                 pass_root=False,
                                                 not_rl=not self.use_rl_tree)
    else:
        self.decoder_lin = nn.Linear(self.obj_dim + self.embed_dim + 128,
                                     self.num_classes)

    if self.nl_edge > 0:
        input_dim = self.embed_dim
        if self.nl_obj > 0:
            input_dim += self.hidden_dim
        if self.pass_in_obj_feats_to_edge:
            input_dim += self.obj_dim
        self.edge_tree_lstm = tree_lstm.MultiLayer_BTreeLSTM(
            input_dim, self.hidden_dim, self.nl_edge, dropout_rate)
def __init__(self, classes, rel_classes, embed_dim, obj_dim, inputs_dim, hidden_dim,
             pooling_dim, recurrent_dropout_probability=0.2, use_highway=True,
             use_input_projection_bias=True, use_vision=True, use_bias=True,
             use_tanh=True, limit_vision=True, sl_pretrain=False, num_iter=-1):
    """
    Initializes the decoder RNN.
    :param classes: list of object classes
    :param rel_classes: list of relationship classes
    :param embed_dim: dimension of the class embeddings
    :param obj_dim: dimension of the object features
    :param inputs_dim: dimension of the input features
    :param hidden_dim: hidden dim of the decoder
    :param pooling_dim: dimension of the pooled (union) features
    """
    super(DecoderRNN, self).__init__()
    self.rel_embedding_dim = 100
    self.classes = classes
    self.rel_classes = rel_classes
    # One extra embedding row for the 'start' token prepended below.
    embed_vecs = obj_edge_vectors(['start'] + self.classes, wv_dim=100)
    self.obj_embed = nn.Embedding(len(self.classes) + 1, embed_dim)
    self.obj_embed.weight.data = embed_vecs
    embed_rels = obj_edge_vectors(self.rel_classes, wv_dim=self.rel_embedding_dim)
    self.rel_embed = nn.Embedding(len(self.rel_classes), self.rel_embedding_dim)
    self.rel_embed.weight.data = embed_rels

    self.embed_dim = embed_dim
    self.obj_dim = obj_dim
    self.hidden_size = hidden_dim
    self.inputs_dim = inputs_dim
    self.pooling_dim = pooling_dim
    self.nms_thresh = 0.3

    self.use_vision = use_vision
    self.use_bias = use_bias
    self.use_tanh = use_tanh
    self.limit_vision = limit_vision
    self.sl_pretrain = sl_pretrain
    self.num_iter = num_iter

    self.recurrent_dropout_probability = recurrent_dropout_probability
    self.use_highway = use_highway
    # We do the projections for all the gates all at once, so if we are
    # using highway layers, we need some extra projections, which is
    # why the sizes of the Linear layers change here depending on this flag.
    if use_highway:
        self.input_linearity = torch.nn.Linear(self.input_size, 6 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size, 5 * self.hidden_size,
                                               bias=True)
    else:
        self.input_linearity = torch.nn.Linear(self.input_size, 4 * self.hidden_size,
                                               bias=use_input_projection_bias)
        self.state_linearity = torch.nn.Linear(self.hidden_size, 4 * self.hidden_size,
                                               bias=True)

    # self.obj_in_lin = torch.nn.Linear(self.rel_embedding_dim, self.rel_embedding_dim, bias=True)

    self.out = nn.Linear(self.hidden_size, len(self.classes))
    self.reset_parameters()

    # For relation prediction
    embed_vecs2 = obj_edge_vectors(self.classes, wv_dim=embed_dim)
    self.obj_embed2 = nn.Embedding(self.num_classes, embed_dim)
    self.obj_embed2.weight.data = embed_vecs2.clone()

    # self.post_lstm = nn.Linear(self.hidden_dim, self.pooling_dim * 2)
    self.post_lstm = nn.Linear(self.obj_dim + 2 * self.embed_dim + 128,
                               self.pooling_dim * 2)
    # Initialize to sqrt(1/2n) so that the outputs all have mean 0 and variance 1.
    # (Half the contribution comes from the LSTM, half from the embedding.)
    # In practice the pre-LSTM stuff tends to have stdev 0.1, so this is multiplied by 10.
    self.post_lstm.weight.data.normal_(
        0, 10.0 * math.sqrt(1.0 / self.hidden_size))  # NOTE: may need more consideration
    self.post_lstm.bias.data.zero_()

    self.rel_compress = nn.Linear(self.pooling_dim, self.num_rels, bias=True)
    torch.nn.init.xavier_normal_(self.rel_compress.weight, gain=1.0)
    if self.use_bias:
        self.freq_bias = FrequencyBias()

    # Simple relation model: empirical predicate distribution per object pair
    from dataloaders.visual_genome import VG
    from lib.get_dataset_counts import get_counts, box_filter
    fg_matrix, bg_matrix = get_counts(train_data=VG.splits(
        num_val_im=5000,
        filter_non_overlap=True,
        filter_duplicate_rels=True,
        use_proposals=False)[0], must_overlap=True)
    prob_matrix = fg_matrix.astype(np.float32)
    prob_matrix[:, :, 0] = bg_matrix

    # TRYING SOMETHING NEW.
    prob_matrix[:, :, 0] += 1
    prob_matrix /= np.sum(prob_matrix, 2)[:, :, None]
    # prob_matrix /= float(fg_matrix.max())

    prob_matrix[:, :, 0] = 0  # zero out BG
    self.prob_matrix = prob_matrix
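# A toy walkthrough of the prob_matrix normalization above: fg_matrix counts
# (subject class, object class, predicate) triples and bg_matrix counts
# co-occurring pairs with no annotated predicate (predicate index 0). Shapes
# here are tiny stand-ins for VG's (151, 151, 51).
import numpy as np

fg = np.zeros((3, 3, 4), dtype=np.float32)
fg[1, 2, 3] = 8.0                      # e.g. (person, dog, 'holding') seen 8 times
bg = np.full((3, 3), 2.0, dtype=np.float32)

prob = fg.copy()
prob[:, :, 0] = bg                     # slot 0 holds the background counts
prob[:, :, 0] += 1                     # smoothing
prob /= prob.sum(2)[:, :, None]        # per-pair distribution over predicates
prob[:, :, 0] = 0                      # zero out BG so only real predicates score
print(prob[1, 2])                      # [0., 0., 0., 8/11]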
def __init__(self, classes, rel_classes, mode='sgdet', num_gpus=1, use_vision=True,
             require_overlap_det=False, embed_dim=200, hidden_dim=256, obj_dim=2048,
             pooling_dim=4096, nl_obj=1, nl_edge=2, use_resnet=True, order='confidence',
             thresh=0.01, use_proposals=False, pass_in_obj_feats_to_decoder=True,
             pass_in_obj_feats_to_edge=True, rec_dropout=0.0, use_bias=True,
             use_tanh=True, limit_vision=True, spatial_dim=128, mp_iter_num=1,
             trim_graph=True):
    """
    Args:
        mp_iter_num: integer, number of message passing iterations
        trim_graph: boolean, trim the graph in the relation proposal network
    """
    super(FckModel, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    self.num_gpus = num_gpus
    assert mode in MODES
    self.mode = mode

    self.pooling_size = 7
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.pooling_dim = 2048 if use_resnet else 4096
    self.spatial_dim = spatial_dim
    self.use_bias = use_bias
    self.use_vision = use_vision
    self.use_tanh = use_tanh
    self.limit_vision = limit_vision
    self.require_overlap = require_overlap_det and self.mode == 'sgdet'
    self.mp_iter_num = mp_iter_num
    self.trim_graph = trim_graph

    classes_word_vec = obj_edge_vectors(self.classes, wv_dim=embed_dim)
    self.classes_word_embedding = nn.Embedding(self.num_classes, embed_dim)
    self.classes_word_embedding.weight.data = classes_word_vec.clone()
    self.classes_word_embedding.weight.requires_grad = False

    # fg_matrix, bg_matrix = get_counts()
    # rel_obj_distribution = fg_matrix / (fg_matrix.sum(2)[:, :, None] + 1e-5)
    # rel_obj_distribution = torch.FloatTensor(rel_obj_distribution)
    # rel_obj_distribution = rel_obj_distribution.view(-1, self.num_rels)
    #
    # (#obj_class * #obj_class, #rel_class)
    # self.rel_obj_distribution = nn.Embedding(rel_obj_distribution.size(0), self.num_rels)
    # self.rel_obj_distribution.weight.data = rel_obj_distribution

    if mode == 'sgdet':
        if use_proposals:
            obj_detector_mode = 'proposals'
        else:
            obj_detector_mode = 'refinerels'
    else:
        obj_detector_mode = 'gtbox'

    self.detector = ObjectDetector(
        classes=classes,
        mode=obj_detector_mode,
        use_resnet=use_resnet,
        thresh=thresh,
        max_per_img=64,
    )

    self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size, stride=16,
                                          dim=1024 if use_resnet else 512,
                                          use_feats=False)
    self.spatial_fc = nn.Sequential(*[
        nn.Linear(4, spatial_dim),
        nn.BatchNorm1d(spatial_dim, momentum=BATCHNORM_MOMENTUM / 10.),
        nn.ReLU(inplace=True)
    ])
    self.word_fc = nn.Sequential(*[
        nn.Linear(2 * embed_dim, hidden_dim),
        nn.BatchNorm1d(hidden_dim, momentum=BATCHNORM_MOMENTUM / 10.),
        nn.ReLU(inplace=True)
    ])
    # union box feats
    feats_dim = obj_dim + spatial_dim + hidden_dim
    self.relpn_fc = nn.Linear(feats_dim, 2)
    self.relcnn_fc1 = nn.Sequential(
        *[nn.Linear(feats_dim, feats_dim), nn.ReLU(inplace=True)])

    # v2 model ---------
    self.box_mp_fc = nn.Sequential(*[nn.Linear(obj_dim, obj_dim)])
    self.sub_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])
    self.obj_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])
    self.mp_atten_fc = nn.Sequential(*[
        nn.Linear(feats_dim + obj_dim, obj_dim),
        nn.ReLU(inplace=True),
        nn.Linear(obj_dim, 1)
    ])
    # v2 model ---------

    self.cls_fc = nn.Linear(obj_dim, self.num_classes)
    self.relcnn_fc2 = nn.Linear(feats_dim, self.num_rels)

    # v3 model ---------
    self.mem_module = MemoryRNN(classes=classes, rel_classes=rel_classes,
                                inputs_dim=feats_dim, hidden_dim=hidden_dim,
                                recurrent_dropout_probability=.0)
    # v3 model ---------

    if use_resnet:
        # deprecated
        self.roi_fmap = nn.Sequential(
            resnet_l4(relu_end=False),
            nn.AvgPool2d(self.pooling_size),
            Flattener(),
        )
    else:
        roi_fmap = [
            load_vgg(
                use_dropout=False,
                use_relu=False,
                use_linear=self.obj_dim == 4096,
                pretrained=False,
            ).classifier,
            nn.Linear(self.pooling_dim, self.obj_dim)
        ]
        self.roi_fmap = nn.Sequential(*roi_fmap)
def __init__(self, vocabs, vocab_size, input_encoding_size, rnn_type='lstm',
             rnn_size=512, num_layers=1, drop_prob_lm=0.5, seq_length=16,
             seq_per_img=5, fc_feat_size=4096, att_feat_size=512, num_relation=20,
             object_classes=None, predicate_classes=None, triplet_embed_dim=-1,
             embed_triplet=True, freq_bl=False):
    super(RelCaptionModel, self).__init__()
    self.vocabs = vocabs
    self.vocabs['0'] = '__SENTSIGN__'
    # Re-key the vocab from string indices to integer indices.
    self.vocabs = {i: self.vocabs[str(i)] for i in range(len(self.vocabs))}
    vocab_list = [self.vocabs[i] for i in range(len(self.vocabs))]
    self.vocab_size = vocab_size + 1  # all words plus <UNK>; index 0 is <start>/<end>
    self.input_encoding_size = input_encoding_size
    self.rnn_type = rnn_type
    self.rnn_size = rnn_size
    self.num_layers = num_layers
    self.drop_prob_lm = drop_prob_lm
    self.seq_length = seq_length
    self.fc_feat_size = fc_feat_size
    self.ss_prob = 0.0  # scheduled sampling probability
    self.num_relation_per_img = num_relation
    self.seq_per_img = seq_per_img
    self.embed_triplet = embed_triplet
    self.triplet_embed_dim = triplet_embed_dim
    self.freq_bl = freq_bl

    self.linear = nn.Linear(self.fc_feat_size,
                            self.num_layers * self.rnn_size)  # feature to rnn_size
    embed_vec = obj_edge_vectors(vocab_list, wv_dim=self.input_encoding_size)
    self.embed = nn.Embedding(self.vocab_size, self.input_encoding_size)
    self.embed.weight.data = embed_vec.clone()

    if self.embed_triplet:
        assert object_classes is not None and predicate_classes is not None
        object_embed_vec = obj_edge_vectors(object_classes,
                                            wv_dim=self.triplet_embed_dim)
        predicate_embed_vec = obj_edge_vectors(predicate_classes,
                                               wv_dim=self.triplet_embed_dim)
        self.object_embed = nn.Embedding(len(object_classes), self.triplet_embed_dim)
        self.object_embed.weight.data = object_embed_vec.clone()
        self.predicate_embed = nn.Embedding(len(predicate_classes),
                                            self.triplet_embed_dim)
        self.predicate_embed.weight.data = predicate_embed_vec.clone()

    self.logit = nn.Linear(self.rnn_size, self.vocab_size)
    self.dropout = nn.Dropout(self.drop_prob_lm)
    self.core = RelCaptionCore(input_encoding_size, rnn_type, rnn_size, num_layers,
                               drop_prob_lm, fc_feat_size, att_feat_size,
                               triplet_embed_dim, embed_triplet)
    if self.freq_bl:
        self.freq_matrix, _ = get_counts(
            train_data=VG200(mode='train', filter_duplicate_rels=False,
                             num_val_im=1000),
            must_overlap=True)
    else:
        self.freq_matrix = None
    self.init_weights()
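# A tiny illustration of the vocab re-keying above: caption vocabularies are
# often loaded from JSON with string keys, so they are converted to int keys
# before building an index-aligned word list. The values here are made up.
vocabs = {'0': '__SENTSIGN__', '1': 'a', '2': 'dog', '3': 'person'}
vocabs = {i: vocabs[str(i)] for i in range(len(vocabs))}
vocab_list = [vocabs[i] for i in range(len(vocabs))]
print(vocab_list)  # ['__SENTSIGN__', 'a', 'dog', 'person']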
def __init__(self, classes, rel_classes, mode='sgdet', num_gpus=1,
             require_overlap_det=True, embed_dim=200, use_resnet=False,
             order='confidence', thresh=0.01, use_proposals=False):
    """
    :param classes: object classes
    :param rel_classes: relationship classes; None if we're not using rel mode
    :param mode: (sgcls, predcls, or sgdet)
    """
    super(NODIS, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    self.num_gpus = num_gpus
    assert mode in MODES
    self.mode = mode

    self.pooling_size = 7
    self.embed_dim = embed_dim
    self.obj_dim = 2048 if use_resnet else 4096
    self.order = 'random'  # NOTE: overrides the `order` argument
    self.require_overlap = require_overlap_det and self.mode == 'sgdet'

    self.detector = ObjectDetector(
        classes=classes,
        mode=('proposals' if use_proposals else 'refinerels')
        if mode == 'sgdet' else 'gtbox',
        use_resnet=use_resnet,
        thresh=thresh,
        max_per_img=64,
    )
    self.context = O_NODE(self.classes, self.rel_classes, mode=self.mode,
                          embed_dim=self.embed_dim, obj_dim=self.obj_dim,
                          order=order)

    # Image feats (you'll have to disable this if you want to turn off the
    # features from here)
    self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size, stride=16,
                                          dim=1024 if use_resnet else 512)

    if use_resnet:
        self.roi_fmap = nn.Sequential(
            resnet_l4(relu_end=False),
            nn.AvgPool2d(self.pooling_size),
            Flattener(),
        )
    else:
        self.roi_fmap_obj = load_vgg(pretrained=False).classifier
        self.roi_avg_pool = nn.AvgPool2d(kernel_size=7)  # stride defaults to kernel_size

    embed_vecs = obj_edge_vectors(self.classes, wv_dim=self.embed_dim)
    self.obj_embed = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed.weight.data = embed_vecs.clone()
    self.obj_embed2 = nn.Embedding(self.num_classes, self.embed_dim)
    self.obj_embed2.weight.data = embed_vecs.clone()

    self.lstm_visual = nn.LSTM(input_size=1536, hidden_size=512)
    self.lstm_semantic = nn.LSTM(input_size=400, hidden_size=512)
    self.odeBlock = odeBlock(odeFunc1(bidirectional=True))
    self.fc_predicate = nn.Sequential(nn.Linear(1024, 512),
                                      nn.ReLU(inplace=False),
                                      nn.Linear(512, 51),
                                      nn.ReLU(inplace=False))
def __init__(self, classes, rel_classes, mode='sgdet', num_gpus=1, use_vision=True,
             require_overlap_det=True, embed_dim=200, hidden_dim=256, obj_dim=2048,
             pooling_dim=4096, nl_obj=1, nl_edge=2, use_resnet=True, order='confidence',
             thresh=0.01, use_proposals=False, pass_in_obj_feats_to_decoder=True,
             pass_in_obj_feats_to_edge=True, rec_dropout=0.0, use_bias=True,
             use_tanh=True, limit_vision=True, spatial_dim=128, graph_constrain=True,
             mp_iter_num=1):
    """
    Args:
        mp_iter_num: integer, number of message passing iterations
    """
    super(FckModel, self).__init__()
    self.classes = classes
    self.rel_classes = rel_classes
    self.num_gpus = num_gpus
    assert mode in MODES
    self.mode = mode

    self.pooling_size = 7
    self.embed_dim = embed_dim
    self.hidden_dim = hidden_dim
    self.obj_dim = obj_dim
    self.pooling_dim = 2048 if use_resnet else 4096
    self.spatial_dim = spatial_dim
    self.use_bias = use_bias
    self.use_vision = use_vision
    self.use_tanh = use_tanh
    self.limit_vision = limit_vision
    self.require_overlap = require_overlap_det and self.mode == 'sgdet'
    self.graph_cons = graph_constrain
    self.mp_iter_num = mp_iter_num

    classes_word_vec = obj_edge_vectors(self.classes, wv_dim=embed_dim)
    self.classes_word_embedding = nn.Embedding(self.num_classes, embed_dim)
    self.classes_word_embedding.weight.data = classes_word_vec.clone()
    self.classes_word_embedding.weight.requires_grad = False

    # the last channel is a dirty bit
    self.rel_mem = nn.Embedding(self.num_rels, self.obj_dim + 1)
    self.rel_mem.weight.data[:, -1] = 0

    if mode == 'sgdet':
        if use_proposals:
            obj_detector_mode = 'proposals'
        else:
            obj_detector_mode = 'refinerels'
    else:
        obj_detector_mode = 'gtbox'

    self.detector = ObjectDetector(
        classes=classes,
        mode=obj_detector_mode,
        use_resnet=use_resnet,
        thresh=thresh,
        max_per_img=64,
    )

    self.union_boxes = UnionBoxesAndFeats(pooling_size=self.pooling_size, stride=16,
                                          dim=1024 if use_resnet else 512,
                                          use_feats=False)
    self.spatial_fc = nn.Sequential(*[
        nn.Linear(4, spatial_dim),
        nn.BatchNorm1d(spatial_dim, momentum=BATCHNORM_MOMENTUM / 10.),
        nn.ReLU(inplace=True)
    ])
    self.word_fc = nn.Sequential(*[
        nn.Linear(2 * embed_dim, hidden_dim),
        nn.BatchNorm1d(hidden_dim, momentum=BATCHNORM_MOMENTUM / 10.),
        nn.ReLU(inplace=True)
    ])
    # union box feats
    feats_dim = obj_dim + spatial_dim + hidden_dim
    self.relpn_fc = nn.Linear(feats_dim, 2)
    self.relcnn_fc1 = nn.Sequential(
        *[nn.Linear(feats_dim, feats_dim), nn.ReLU(inplace=True)])

    self.box_mp_fc = nn.Sequential(*[nn.Linear(obj_dim, obj_dim)])
    self.sub_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])
    self.obj_rel_mp_fc = nn.Sequential(*[nn.Linear(feats_dim, obj_dim)])
    self.mp_atten_fc = nn.Sequential(*[
        nn.Linear(feats_dim + obj_dim, obj_dim),
        nn.ReLU(inplace=True),
        nn.Linear(obj_dim, 1)
    ])

    self.cls_fc = nn.Linear(obj_dim, self.num_classes)
    self.relcnn_fc2 = nn.Linear(
        feats_dim, self.num_rels if self.graph_cons else 2 * self.num_rels)

    if use_resnet:
        # deprecated
        self.roi_fmap = nn.Sequential(
            resnet_l4(relu_end=False),
            nn.AvgPool2d(self.pooling_size),
            Flattener(),
        )
    else:
        roi_fmap = [
            load_vgg(
                use_dropout=False,
                use_relu=False,
                use_linear=self.obj_dim == 4096,
                pretrained=False,
            ).classifier,
            nn.Linear(self.pooling_dim, self.obj_dim)
        ]
        self.roi_fmap = nn.Sequential(*roi_fmap)
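# A minimal sketch of how an attention MLP like mp_atten_fc above is typically
# used during message passing: each object aggregates the relation features it
# participates in, weighted by a learned scalar score. The dimensions and the
# gather/scatter scheme are illustrative assumptions, not the repo's exact
# forward pass.
import torch
import torch.nn as nn

obj_dim, feats_dim = 2048, 2048 + 128 + 256  # obj_dim + spatial_dim + hidden_dim
mp_atten_fc = nn.Sequential(nn.Linear(feats_dim + obj_dim, obj_dim),
                            nn.ReLU(inplace=True),
                            nn.Linear(obj_dim, 1))

obj_feats = torch.randn(5, obj_dim)               # 5 objects
rel_feats = torch.randn(7, feats_dim)             # 7 candidate relations
rel_to_obj = torch.tensor([0, 0, 1, 2, 3, 3, 4])  # subject index of each relation

# Score each (relation, subject) pair, normalize, and aggregate per object.
scores = mp_atten_fc(torch.cat([rel_feats, obj_feats[rel_to_obj]], dim=1))
weights = torch.softmax(scores.squeeze(1), dim=0)  # toy: global softmax for brevity
messages = torch.zeros(5, feats_dim).index_add_(0, rel_to_obj,
                                                weights.unsqueeze(1) * rel_feats)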