Ejemplo n.º 1
0
    def __init__(self, opt):
        """Create the AttModel layers and criteria from the options in ``opt``.

        Args:
            opt: Option object. Read here for the size hyper-parameters,
                ``cnn_backend``, ``fixed_block`` (ResNet backends only),
                ``self_critical`` and ``glove_clss``.

        Side effects:
            ``opt.beta`` is overwritten with 1 before being copied to
            ``self.beta``.
        """
        super(AttModel, self).__init__()

        # Hyper-parameters mirrored one-to-one from the options.
        for option_name in ('image_crop_size', 'vocab_size', 'detect_size',
                            'input_encoding_size', 'rnn_size', 'num_layers',
                            'drop_prob_lm', 'seq_length', 'fc_feat_size',
                            'att_feat_size', 'att_hid_size', 'finetune_cnn',
                            'cbs', 'cbs_mode'):
            setattr(self, option_name, getattr(opt, option_name))
        self.seq_per_img = 5

        # vgg16 uses stride 16; every other backend name falls back to 32.
        backend = opt.cnn_backend
        self.stride = 16 if backend == 'vgg16' else 32
        self.att_size = int(opt.image_crop_size / self.stride)

        self.tiny_value = 1e-8
        self.min_value = -1e8
        self.ss_prob = 0.0  # Schedule sampling probability
        # Attention feature plus 2 * 300 extra channels; presumably the
        # 300-d det_fc and loc_fc outputs -- TODO confirm against forward().
        self.pool_feat_size = self.att_feat_size + 300 * 2

        # Whatever beta the caller configured is replaced by 1 on opt itself.
        opt.beta = 1
        self.beta = opt.beta

        # CNN backbone. An unrecognised backend leaves self.cnn undefined.
        resnet_depth = {'res101': 101, 'res152': 152}.get(backend)
        if resnet_depth is not None:
            self.cnn = resnet(opt,
                              _num_layers=resnet_depth,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif backend == 'vgg16':
            self.cnn = vgg16(opt, pretrained=True)

        def relu_block(layer, dropout):
            # layer -> ReLU -> dropout, the pattern shared by every head below.
            return nn.Sequential(layer, nn.ReLU(), dropout)

        # These two heads use nn.Dropout() with its default probability,
        # unlike the heads further down, which use drop_prob_lm.
        self.det_fc = relu_block(nn.Embedding(self.detect_size + 1, 300),
                                 nn.Dropout())
        self.loc_fc = relu_block(nn.Linear(5, 300), nn.Dropout())

        self.embed = relu_block(
            nn.Embedding(self.vocab_size + self.detect_size + 1,
                         self.input_encoding_size),
            nn.Dropout(self.drop_prob_lm))

        # Projections of the fc / attention / pooled features to rnn_size.
        self.fc_embed = relu_block(
            nn.Linear(self.fc_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))
        self.att_embed = relu_block(
            nn.Linear(self.att_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))
        self.pool_embed = relu_block(
            nn.Linear(self.pool_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))

        self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
        self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)
        self.roi_align = RoIAlignAvg(1, 1, 1.0 / self.stride)

        # Criterion objects supplied by the project's utils module.
        self.critLM = utils.LMCriterion(opt)
        self.critBN = utils.BNCriterion(opt)
        self.critFG = utils.FGCriterion(opt)

        # The reward function and its criterion exist only when requested.
        if opt.self_critical:
            print("load reward function...")
            self.get_self_critical_reward = get_self_critical_reward(opt)
            self.critRL = utils.RewardCriterion(opt)

        # Load the GloVe vectors into the label embedding and freeze it.
        label_embedding = self.det_fc[0]
        label_embedding.weight.data.copy_(opt.glove_clss)
        for frozen in label_embedding.parameters():
            frozen.requires_grad = False
Ejemplo n.º 2
0
    def __init__(self, opt):
        """Create the AttModel layers, criteria and optional relation encoder.

        Args:
            opt: Option object. Read here for the size hyper-parameters,
                ``cnn_backend``, ``fixed_block`` (ResNet backends only),
                ``inplace``, ``self_critical``, ``glove_clss``,
                ``relation_type``, ``nongt_dim`` and ``imp_pos_emb_dim``.
                The remaining relation options (``relation_dim``,
                ``dir_num``, ``num_heads``, ...) are read only when
                ``relation_type`` is 'implicit', 'spatial' or 'semantic'.

        Side effects:
            ``opt.beta`` is overwritten with 1 before being copied to
            ``self.beta``.
        """
        super(AttModel, self).__init__()

        # Hyper-parameters mirrored one-to-one from the options.
        for option_name in ('image_crop_size', 'vocab_size', 'detect_size',
                            'input_encoding_size', 'rnn_size', 'num_layers',
                            'drop_prob_lm', 'seq_length', 'fc_feat_size',
                            'att_feat_size', 'att_hid_size', 'finetune_cnn',
                            'cbs', 'cbs_mode'):
            setattr(self, option_name, getattr(opt, option_name))
        self.seq_per_img = 5

        # vgg16 uses stride 16; every other backend name falls back to 32.
        backend = opt.cnn_backend
        self.stride = 16 if backend == 'vgg16' else 32
        self.att_size = int(opt.image_crop_size / self.stride)

        self.tiny_value = 1e-8
        self.min_value = -1e8
        self.ss_prob = 0.0  # Schedule sampling probability

        # With a relation encoder the pooled feature is built on
        # relation_dim; otherwise on the raw attention feature size. The
        # extra 2 * 300 channels are presumably the det_fc and loc_fc
        # outputs -- TODO confirm against forward().
        relation_kinds = ('implicit', 'spatial', 'semantic')
        if opt.relation_type in relation_kinds:
            pooled_base = opt.relation_dim
        else:
            pooled_base = self.att_feat_size
        self.pool_feat_size = pooled_base + 300 * 2

        # Whatever beta the caller configured is replaced by 1 on opt itself.
        opt.beta = 1
        self.beta = opt.beta

        # CNN backbone. An unrecognised backend leaves self.cnn undefined.
        resnet_depth = {'res101': 101, 'res152': 152}.get(backend)
        if resnet_depth is not None:
            self.cnn = resnet(opt,
                              _num_layers=resnet_depth,
                              _fixed_block=opt.fixed_block,
                              pretrained=True)
        elif backend == 'vgg16':
            self.cnn = vgg16(opt, pretrained=True)

        def relu_block(layer, dropout):
            # layer -> ReLU -> dropout, the pattern shared by every head below.
            return nn.Sequential(layer, nn.ReLU(inplace=opt.inplace), dropout)

        # These two heads use nn.Dropout() with its default probability,
        # unlike the heads further down, which use drop_prob_lm.
        self.det_fc = relu_block(nn.Embedding(self.detect_size + 1, 300),
                                 nn.Dropout())
        self.loc_fc = relu_block(nn.Linear(5, 300), nn.Dropout())

        self.embed = relu_block(
            nn.Embedding(self.vocab_size + self.detect_size + 1,
                         self.input_encoding_size),
            nn.Dropout(self.drop_prob_lm))

        # Projections of the fc / attention / pooled features to rnn_size.
        self.fc_embed = relu_block(
            nn.Linear(self.fc_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))
        self.att_embed = relu_block(
            nn.Linear(self.att_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))
        self.pool_embed = relu_block(
            nn.Linear(self.pool_feat_size, self.rnn_size),
            nn.Dropout(self.drop_prob_lm))

        self.ctx2att = nn.Linear(self.rnn_size, self.att_hid_size)
        self.ctx2pool = nn.Linear(self.rnn_size, self.att_hid_size)

        self.logit = nn.Linear(self.rnn_size, self.vocab_size + 1)

        # RoIAlign supersedes the earlier RoIAlignAvg(1, 1, 1.0 / stride);
        # per the original note this is the torchvision version. Arguments
        # are a 1x1 output, a 1/stride scale and a trailing 0 (sampling_ratio
        # if this is torchvision.ops.RoIAlign -- verify against the import).
        self.roi_align = RoIAlign((1, 1), 1.0 / self.stride, 0)

        # Criterion objects supplied by the project's utils module.
        self.critLM = utils.LMCriterion(opt)
        self.critBN = utils.BNCriterion(opt)
        self.critFG = utils.FGCriterion(opt)

        # The reward function and its criterion exist only when requested.
        if opt.self_critical:
            print("load reward function...")
            self.get_self_critical_reward = get_self_critical_reward(opt)
            self.critRL = utils.RewardCriterion(opt)

        # Load the GloVe vectors into the label embedding and freeze it.
        label_embedding = self.det_fc[0]
        label_embedding.weight.data.copy_(opt.glove_clss)
        for frozen in label_embedding.parameters():
            frozen.requires_grad = False

        # Relation module settings, stored regardless of the relation type.
        self.nongt_dim = opt.nongt_dim
        self.imp_pos_emb_dim = opt.imp_pos_emb_dim
        self.relation_type = opt.relation_type

        # Build the encoder for the selected relation type. Any other
        # relation_type leaves self.v_relation undefined.
        relation = opt.relation_type
        if relation in relation_kinds:
            # Keyword arguments common to all three encoder variants.
            encoder_kwargs = dict(
                num_heads=opt.num_heads,
                num_steps=opt.num_steps,
                residual_connection=opt.residual_connection,
                label_bias=opt.label_bias)
            if relation == 'implicit':
                self.v_relation = ImplicitRelationEncoder(
                    self.att_feat_size,
                    opt.relation_dim,
                    opt.dir_num,
                    opt.imp_pos_emb_dim,
                    opt.nongt_dim,
                    **encoder_kwargs)
            else:
                if relation == 'spatial':
                    label_num = opt.spa_label_num
                    # Only the spatial encoder receives pos_emb_dim; the
                    # semantic one keeps the encoder's own default.
                    encoder_kwargs['pos_emb_dim'] = opt.imp_pos_emb_dim
                else:
                    label_num = opt.sem_label_num
                self.v_relation = ExplicitRelationEncoder(
                    self.att_feat_size,
                    opt.relation_dim,
                    opt.dir_num,
                    label_num,
                    nongt_dim=opt.nongt_dim,
                    graph_att=opt.graph_attention,
                    **encoder_kwargs)