Example #1
    def test_model(self):
        image_feat_dim = 40
        txt_embedding_dim = 300
        lstm_dim = 512
        hidden_size = 30
        num_of_loc = 5
        batch_size = 16
        num_vocab = 60
        num_ans_candidates = 35
        joint_embedding_dim = 500
        question_len = 13
        batch_first = True
        image_embedding_model = image_embedding(image_feat_dim, lstm_dim, hidden_size)
        question_embedding_model = QuestionEmbeding(
            num_vocab,
            txt_embedding_dim,
            lstm_dim,
            lstm_layer=2,
            dropout=0.1,
            batch_first=batch_first,
        )
        my_classifier = logit_classifier(
            joint_embedding_dim, num_ans_candidates, image_feat_dim, txt_embedding_dim
        )
        loss = torch.nn.CrossEntropyLoss()

        my_model = top_down_bottom_up_model(
            image_embedding_model, question_embedding_model, my_classifier, loss
        )
        # wrap the random image features as a float tensor so the model
        # receives torch inputs rather than a raw numpy array
        image_feat = Variable(
            torch.from_numpy(
                np.random.rand(batch_size, num_of_loc, image_feat_dim)
            ).float()
        )
        # random token indices in [0, num_vocab); casting torch.rand output
        # straight to LongTensor would truncate every entry to zero
        input_txt = Variable(
            (torch.rand(batch_size, question_len) * num_vocab).long()
        )
        res = my_model(image_feat, input_txt, batch_first)
        self.assertEqual((batch_size, num_ans_candidates), res.shape)
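
These tests are written against imports like the following. The torch and numpy imports are standard; the module paths for the project's own classes are assumptions for illustration and may not match the repository's actual layout:

import unittest

import numpy as np
import torch
from torch.autograd import Variable

# NOTE: placeholder paths; point these at the modules that actually define
# the classes in the codebase under test.
from image_embedding import image_embedding
from question_embedding import QuestionEmbeding
from classifier import logit_classifier
from top_down_bottom_up_model import top_down_bottom_up_model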
Example #2
    def test_image_embedding(self):
        image_feat_dim = 40
        txt_embedding_dim = 50
        hidden_size = 30
        num_of_loc = 5
        batch_size = 16
        my_image_embedding = image_embedding(
            image_feat_dim, txt_embedding_dim, hidden_size
        )
        image_feat = Variable(torch.randn(batch_size, num_of_loc, image_feat_dim))
        txt = Variable(torch.randn(batch_size, txt_embedding_dim))
        res = my_image_embedding(image_feat, txt)
        self.assertEqual((batch_size, image_feat_dim), res.shape)
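
For context, here is a minimal sketch of the kind of module this test exercises: soft top-down attention over image locations, conditioned on the text embedding and reduced to one attended feature vector. Only the constructor signature and the tensor shapes are taken from the test; the internals below are an illustrative assumption, not the repository's implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F


class image_embedding(nn.Module):
    # Sketch: maps (batch, num_loc, image_feat_dim) image features plus a
    # (batch, txt_embedding_dim) text embedding to a (batch, image_feat_dim)
    # attended feature, matching the shapes asserted above.
    def __init__(self, image_feat_dim, txt_embedding_dim, hidden_size):
        super(image_embedding, self).__init__()
        self.img_proj = nn.Linear(image_feat_dim, hidden_size)
        self.txt_proj = nn.Linear(txt_embedding_dim, hidden_size)
        self.att = nn.Linear(hidden_size, 1)

    def forward(self, image_feat, txt_embedding):
        # fuse each location with the (broadcast) text representation
        joint = torch.tanh(
            self.img_proj(image_feat) + self.txt_proj(txt_embedding).unsqueeze(1)
        )
        # one normalized attention weight per image location
        weights = F.softmax(self.att(joint).squeeze(-1), dim=1)
        # attention-weighted sum over locations
        return (weights.unsqueeze(-1) * image_feat).sum(dim=1)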
Example #3
def prepare_model(num_vocab_txt, num_choices, **model_config):
    image_feat_dim = model_config['image_feat_dim']

    # build one question encoder per config entry and track the total
    # dimension of their concatenated outputs
    question_embedding_configs = model_config['question_embedding']
    question_embedding_models = nn.ModuleList()
    final_question_embedding_dim = 0
    for question_embedding_config in question_embedding_configs:
        ques_model_key = question_embedding_config['method']
        ques_model_par = question_embedding_config['par']
        tmp_model = build_question_encoding_module(ques_model_key,
                                                   ques_model_par,
                                                   num_vocab=num_vocab_txt)

        question_embedding_models.append(tmp_model)
        final_question_embedding_dim += tmp_model.text_out_dim

    # chain the image feature encoders; each stage may change the feature dim
    image_feature_encode_list = nn.ModuleList()
    for image_feat_model_par in model_config['image_feature_encoding']:
        image_feat_model = build_image_feature_encoding(
            image_feat_model_par['method'], image_feat_model_par['par'],
            image_feat_dim)
        image_feature_encode_list.append(image_feat_model)
        image_feat_dim = image_feat_model.out_dim

    # generate the list of image attention models, one set per image
    # feature stream
    image_embedding_models_list = nn.ModuleList()
    num_image_feat = model_config['num_image_feat']
    final_image_embedding_dim = 0
    for i_image in range(num_image_feat):
        image_embedding_models = nn.ModuleList()
        image_att_model_list = model_config['image_embedding_models']

        for image_att_model_par in image_att_model_list:
            tmp_img_att_model = build_image_attention_module(
                image_att_model_par,
                image_dim=image_feat_dim,
                ques_dim=final_question_embedding_dim)

            tmp_img_model = image_embedding(tmp_img_att_model)
            final_image_embedding_dim += tmp_img_model.out_dim
            image_embedding_models.append(tmp_img_model)
        image_embedding_models_list.append(image_embedding_models)

    final_image_embedding_dim *= image_feat_dim

    inter_model = None

    # combine the image and question embeddings into a joint representation
    multi_modal_combine = build_modal_combine_module(
        model_config['modal_combine']['method'],
        model_config['modal_combine']['par'], final_image_embedding_dim,
        final_question_embedding_dim)

    joint_embedding_dim = multi_modal_combine.out_dim
    # generate the classifier
    classifier = build_classifier(model_config['classifier']['method'],
                                  model_config['classifier']['par'],
                                  in_dim=joint_embedding_dim,
                                  out_dim=num_choices)

    my_model = vqa_multi_modal_model(image_embedding_models_list,
                                     question_embedding_models,
                                     multi_modal_combine, classifier,
                                     image_feature_encode_list, inter_model)

    # use_cuda is assumed to be a module-level flag, e.g.
    # use_cuda = torch.cuda.is_available()
    if use_cuda:
        my_model = my_model.cuda()

    if torch.cuda.device_count() > 1:
        my_model = nn.DataParallel(my_model)

    return my_model
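
A model_config that satisfies the keys prepare_model reads would look roughly like the sketch below. The key names come straight from the function body; every 'method' string and 'par' payload is a placeholder, since the real registry of module names lives inside the build_* helpers:

# hypothetical configuration sketch; 'method' names and 'par' values are
# placeholders, only the key structure is taken from prepare_model above
model_config = {
    'image_feat_dim': 2048,        # dim of the raw image features
    'num_image_feat': 1,           # number of image feature streams
    'question_embedding': [
        {'method': 'att_que_embed', 'par': {'hidden_size': 1024}},
    ],
    'image_feature_encoding': [
        {'method': 'default_image', 'par': {}},
    ],
    'image_embedding_models': [
        # each entry is handed whole to build_image_attention_module
        {'normalization': 'softmax'},
    ],
    'modal_combine': {
        'method': 'non_linear_elmt_multiply',
        'par': {'hidden_size': 5000},
    },
    'classifier': {
        'method': 'logit_classifier',
        'par': {},
    },
}

my_model = prepare_model(num_vocab_txt=10000, num_choices=3000, **model_config)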