def __init__(self, opt):
    """Configure the model from ``opt`` and build all of its sub-modules.

    Args:
        opt: option object. Size/flag attributes are copied onto the
            module, ``opt.cnn_backend`` selects the backbone,
            ``opt.glove_clss`` seeds the label embedding and
            ``opt.self_critical`` enables the reward criterion.
            NOTE: ``opt.beta`` is overwritten with 1 as a side effect.
    """
    super(AttModel, self).__init__()

    # Mirror plain configuration values from opt onto the module.
    for field in (
        'image_crop_size', 'vocab_size', 'detect_size',
        'input_encoding_size', 'rnn_size', 'num_layers', 'drop_prob_lm',
        'seq_length', 'fc_feat_size', 'att_feat_size', 'att_hid_size',
        'finetune_cnn', 'cbs', 'cbs_mode',
    ):
        setattr(self, field, getattr(opt, field))
    self.seq_per_img = 5  # hard-coded, not taken from opt

    # vgg16 uses stride 16; every other backend value falls back to 32.
    backend = opt.cnn_backend
    self.stride = 16 if backend == 'vgg16' else 32
    self.att_size = int(opt.image_crop_size / self.stride)
    self.tiny_value = 1e-8
    # 300 matches the output width of det_fc / loc_fc below; presumably both
    # are concatenated onto the attention feature -- confirm in forward().
    self.pool_feat_size = self.att_feat_size + 300 * 2
    self.ss_prob = 0.0  # Schedule sampling probability
    self.min_value = -1e8
    opt.beta = 1  # forced to 1 on the shared option object
    self.beta = opt.beta

    # Backbone CNN. An unrecognised backend leaves self.cnn unset.
    if backend in ('res101', 'res152'):
        depth = 101 if backend == 'res101' else 152
        self.cnn = resnet(opt, _num_layers=depth,
                          _fixed_block=opt.fixed_block, pretrained=True)
    elif backend == 'vgg16':
        self.cnn = vgg16(opt, pretrained=True)

    def lm_block(layer):
        # layer -> ReLU -> Dropout(drop_prob_lm)
        return nn.Sequential(layer, nn.ReLU(), nn.Dropout(self.drop_prob_lm))

    # Label / location encoders use nn.Dropout's default probability.
    self.det_fc = nn.Sequential(
        nn.Embedding(self.detect_size + 1, 300), nn.ReLU(), nn.Dropout())
    self.loc_fc = nn.Sequential(nn.Linear(5, 300), nn.ReLU(), nn.Dropout())

    self.embed = lm_block(
        nn.Embedding(self.vocab_size + self.detect_size + 1,
                     self.input_encoding_size))
    self.fc_embed = lm_block(nn.Linear(self.fc_feat_size, self.rnn_size))
    self.att_embed = lm_block(nn.Linear(self.att_feat_size, self.rnn_size))
    self.pool_embed = lm_block(nn.Linear(self.pool_feat_size, self.rnn_size))

    self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
    self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)
    self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

    self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)

    # Training criteria.
    self.critLM = utils.LMCriterion(opt)
    self.critBN = utils.BNCriterion(opt)
    self.critFG = utils.FGCriterion(opt)
    if opt.self_critical:
        print("load reward function...")
        self.get_self_critical_reward = get_self_critical_reward(opt)
        self.critRL = utils.RewardCriterion(opt)

    # Load the GloVe vectors into the label embedding and freeze it.
    label_embedding = self.det_fc[0]
    label_embedding.weight.data.copy_(opt.glove_clss)
    for param in label_embedding.parameters():
        param.requires_grad = False
def __init__(self, opt):
    """Configure the model from ``opt``, including the relation encoder.

    Args:
        opt: option object. Size/flag attributes are copied onto the
            module, ``opt.cnn_backend`` selects the backbone,
            ``opt.inplace`` is forwarded to every ReLU,
            ``opt.glove_clss`` seeds the label embedding and
            ``opt.relation_type`` ('implicit', 'spatial' or 'semantic')
            selects which relation encoder becomes ``self.v_relation``.
            NOTE: ``opt.beta`` is overwritten with 1 as a side effect.
    """
    super(AttModel, self).__init__()

    # Mirror plain configuration values from opt onto the module.
    for field in (
        'image_crop_size', 'vocab_size', 'detect_size',
        'input_encoding_size', 'rnn_size', 'num_layers', 'drop_prob_lm',
        'seq_length', 'fc_feat_size', 'att_feat_size', 'att_hid_size',
        'finetune_cnn', 'cbs', 'cbs_mode',
    ):
        setattr(self, field, getattr(opt, field))
    self.seq_per_img = 5  # hard-coded, not taken from opt

    # vgg16 uses stride 16; every other backend value falls back to 32.
    backend = opt.cnn_backend
    self.stride = 16 if backend == 'vgg16' else 32
    self.att_size = int(opt.image_crop_size / self.stride)
    self.tiny_value = 1e-8

    # With a relation encoder the pooled feature starts from relation_dim
    # instead of the raw attention feature size. The extra 300 * 2 matches
    # the det_fc / loc_fc widths -- presumably concatenated; confirm in
    # forward().
    if opt.relation_type in ('implicit', 'spatial', 'semantic'):
        self.pool_feat_size = opt.relation_dim + 300 * 2
    else:
        self.pool_feat_size = self.att_feat_size + 300 * 2

    self.ss_prob = 0.0  # Schedule sampling probability
    self.min_value = -1e8
    opt.beta = 1  # forced to 1 on the shared option object
    self.beta = opt.beta

    # Backbone CNN. An unrecognised backend leaves self.cnn unset.
    if backend in ('res101', 'res152'):
        depth = 101 if backend == 'res101' else 152
        self.cnn = resnet(opt, _num_layers=depth,
                          _fixed_block=opt.fixed_block, pretrained=True)
    elif backend == 'vgg16':
        self.cnn = vgg16(opt, pretrained=True)

    # NOTE: an in-model detector (fasterrcnn_resnet50_fpn, together with
    # ppls_threshold / max_proposal / det_oracle) was sketched here and is
    # currently disabled.

    def relu():
        return nn.ReLU(inplace=opt.inplace)

    def lm_block(layer):
        # layer -> ReLU -> Dropout(drop_prob_lm)
        return nn.Sequential(layer, relu(), nn.Dropout(self.drop_prob_lm))

    # Label / location encoders use nn.Dropout's default probability.
    self.det_fc = nn.Sequential(
        nn.Embedding(self.detect_size + 1, 300), relu(), nn.Dropout())
    self.loc_fc = nn.Sequential(nn.Linear(5, 300), relu(), nn.Dropout())

    self.embed = lm_block(
        nn.Embedding(self.vocab_size + self.detect_size + 1,
                     self.input_encoding_size))
    self.fc_embed = lm_block(nn.Linear(self.fc_feat_size, self.rnn_size))
    self.att_embed = lm_block(nn.Linear(self.att_feat_size, self.rnn_size))
    self.pool_embed = lm_block(nn.Linear(self.pool_feat_size, self.rnn_size))

    self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
    self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)
    self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

    # Per the original note this is the torchvision-style RoIAlign; it
    # replaces the earlier RoIAlignAvg(1, 1, 1.0 / self.stride).
    self.roi_align = RoIAlign((1, 1), 1.0 / self.stride, 0)

    # Training criteria.
    self.critLM = utils.LMCriterion(opt)
    self.critBN = utils.BNCriterion(opt)
    self.critFG = utils.FGCriterion(opt)
    if opt.self_critical:
        print("load reward function...")
        self.get_self_critical_reward = get_self_critical_reward(opt)
        self.critRL = utils.RewardCriterion(opt)

    # Load the GloVe vectors into the label embedding and freeze it.
    label_embedding = self.det_fc[0]
    label_embedding.weight.data.copy_(opt.glove_clss)
    for param in label_embedding.parameters():
        param.requires_grad = False

    # Relation module settings.
    self.nongt_dim = opt.nongt_dim
    self.imp_pos_emb_dim = opt.imp_pos_emb_dim
    self.relation_type = opt.relation_type

    # NOTE: an earlier variant built separate imp_relation / spa_relation /
    # sem_relation encoders behind per-type flags; a single v_relation
    # chosen by relation_type is used instead.
    relation_type = opt.relation_type
    if relation_type == 'implicit':
        self.v_relation = ImplicitRelationEncoder(
            self.att_feat_size, opt.relation_dim, opt.dir_num,
            opt.imp_pos_emb_dim, opt.nongt_dim,
            num_heads=opt.num_heads,
            num_steps=opt.num_steps,
            residual_connection=opt.residual_connection,
            label_bias=opt.label_bias)
    elif relation_type in ('spatial', 'semantic'):
        explicit_kwargs = dict(
            num_heads=opt.num_heads,
            num_steps=opt.num_steps,
            nongt_dim=opt.nongt_dim,
            residual_connection=opt.residual_connection,
            label_bias=opt.label_bias,
            graph_att=opt.graph_attention)
        if relation_type == 'spatial':
            label_num = opt.spa_label_num
            # Only the spatial encoder is given a positional embedding size.
            explicit_kwargs['pos_emb_dim'] = opt.imp_pos_emb_dim
        else:
            label_num = opt.sem_label_num
        self.v_relation = ExplicitRelationEncoder(
            self.att_feat_size, opt.relation_dim, opt.dir_num, label_num,
            **explicit_kwargs)