Example 1: LSTM_BASIC
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm'):
        super(LSTM_BASIC, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)

        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.classifier = AnswerModule(ques_feat_size, num_classes, (256, ))
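LSTM_BASIC is the question-only baseline in this set: it stores no image or lidar modules and classifies from the encoded question alone. A minimal instantiation sketch (not from the source; the paths and sizes are assumptions, with 1024/512 chosen to match the dimensions used in the later examples):

from argparse import Namespace

# Hypothetical data locations; load_vocab must find a real vocab file here.
args = Namespace(input_base='data/', vocab='vocab.json')

model = LSTM_BASIC(args,
                   ques_feat_size=1024,      # assumed; matches dim_q in the MUTAN/MLB opts below
                   image_feature_size=512,   # assumed; unused by this baseline
                   lidar_feature_size=1024,  # assumed; unused by this baseline
                   num_classes=34)           # hypothetical answer-vocabulary size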
Example 2: MCB_LIDAR
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 grouping='single_scale'):
        super(MCB_LIDAR, self).__init__()
        self.qa = qa
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
        self.classifier = AnswerModule(8000 + ques_feat_size + 1024,
                                       num_classes, (256, ),
                                       use_batchnorm=True,
                                       dropout=0.5)  #3584 if method is concat
        self.mcb = MCBPolling(512, 8000, n_modalities=2)
        self.linweights = nn.Linear(ques_feat_size, 7)
        self.grouping = grouping
        if self.grouping == 'single_scale':
            self.lidar_module = LidarSsgModule(normal_channel=False)
        if self.grouping == 'multi_scale':
            self.lidar_module = LidarMsgModule(normal_channel=False)
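The pair of if statements selecting the lidar backbone recurs in several constructors below. A table-driven selection is a safer pattern here: it keeps the two options from drifting apart and fails loudly on an unknown grouping. A sketch (a hypothetical helper, not part of the source), assuming only the two module classes already used above:

_LIDAR_MODULES = {
    'single_scale': LidarSsgModule,
    'multi_scale': LidarMsgModule,
}

def build_lidar_module(grouping):
    # Look up the backbone class and instantiate it as the models above do.
    try:
        cls = _LIDAR_MODULES[grouping]
    except KeyError:
        raise ValueError(f'unknown grouping: {grouping!r}') from None
    return cls(normal_channel=False)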
Example 3: MCB
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='concat'):
        super(MCB, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
        self.classifier = AnswerModule(8000,
                                       num_classes, (256, ),
                                       use_batchnorm=True,
                                       dropout=0.5)  #3584 if method is concat
        self.mcb = MCBPolling(512, 8000, n_modalities=2)
        self.linweights = nn.Linear(ques_feat_size, 7)
        self.method = method
Example 4: DAN
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='concat'):
        super(DAN, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size,
                                              self.embeddings,
                                              encoder,
                                              bidirectional=True,
                                              give_last=False)
        self.attention = DANAttention(ques_feat_size, 512, 512, 2)
        self.linweights = nn.Linear(ques_feat_size, 7)
        self.method = method
        self.softmax = nn.Softmax(dim=1)
        self.tanh = nn.Tanh()
        if self.method == 'concat':
            self.classifier = AnswerModule(
                2 * 512 * 7 + ques_feat_size,
                num_classes, (256, ),
                use_batchnorm=True,
                dropout=0.5)  #3584 if method is concat
        else:
            self.classifier = AnswerModule(
                2 * 512, num_classes, (256, ), use_batchnorm=True,
                dropout=0.5)  #3584 if method is concat
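The classifier widths encode DAN's fusion layout: give_last=False with bidirectional=True presumably makes the question module return full 2·512-dimensional states for attention, and the 'concat' branch then sees one such feature per weighted input plus the question feature. Spelled out (sizes from the constructor; ques_feat_size=1024 is an assumption carried over from the other examples):

# Dimension bookkeeping for the 'concat' branch above.
bidirectional_feat = 2 * 512    # attended feature from the bidirectional encoder
num_weighted = 7                # output width of linweights
ques_feat_size = 1024           # assumed
classifier_in = bidirectional_feat * num_weighted + ques_feat_size
print(classifier_in)            # 8192, the first argument to AnswerModule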
Example 5: SAN
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='concat'):
        super(SAN, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
        self.method = method
        if self.method == 'concat':
            self.classifier = AnswerModule(
                3584, num_classes, (256, ), use_batchnorm=True,
                dropout=0.5)  #3584 if method is concat
        if self.method == 'hierarchical':
            self.classifier = AnswerModule(
                512, num_classes, (256, ), use_batchnorm=True,
                dropout=0.5)  #3584 if method is concat
        # self.linweights=nn.Linear(512,7)
        self.linweights = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 7),  #no of glimpses
            nn.Sigmoid())
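Here linweights maps the 512-dimensional attended feature to 7 sigmoid gates, one per glimpse, while the 'hierarchical' classifier consumes a single 512-dimensional feature. One plausible reading of how those gates combine the glimpses (a sketch, not the repository's forward(); the tensor shapes are assumptions):

import torch

def weight_glimpses(glimpse_feats, weights):
    # glimpse_feats: (B, 7, 512); weights: (B, 7) in [0, 1] from the Sigmoid.
    # Gate each glimpse, then sum to the single 512-d feature the
    # 'hierarchical' classifier expects.
    return (glimpse_feats * weights.unsqueeze(-1)).sum(dim=1)  # (B, 512)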
Example 6: CNN_LSTM
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='dot'):
        super(CNN_LSTM, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.image_features_resize = nn.Linear(self.image_feat_size,
                                               ques_feat_size)
        self.method = method
        if self.method == 'dot':
            self.classifier = AnswerModule(ques_feat_size,
                                           num_classes,
                                           use_batchnorm=True,
                                           dropout=0.5)
        if self.method == 'concat':
            self.classifier = AnswerModule(ques_feat_size +
                                           image_feature_size * 7,
                                           num_classes,
                                           use_batchnorm=True,
                                           dropout=0.5)
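The two classifier widths imply the fusion the forward pass must perform: under 'dot' the image feature is projected to ques_feat_size and can be combined elementwise with the question feature, while under 'concat' seven image feature vectors are flattened and appended to it. A sketch of that step under stated assumptions (the real forward() is not shown; image_feats of shape (B, 7, image_feature_size) is an assumption):

import torch

def fuse(ques_feat, image_feats, image_features_resize, method):
    # ques_feat: (B, ques_feat_size); image_feats: (B, 7, image_feature_size)
    if method == 'dot':
        # Pool the 7 views, project to the question size, combine elementwise.
        img = image_features_resize(image_feats.mean(dim=1))
        return ques_feat * img                      # (B, ques_feat_size)
    if method == 'concat':
        # Flatten all 7 views and append the question feature.
        img = image_feats.flatten(start_dim=1)      # (B, 7 * image_feature_size)
        return torch.cat([ques_feat, img], dim=1)   # matches the classifier width
    raise ValueError(method)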
Example 7: LIDAR_MODEL
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='concat',
                 grouping='single_scale'):
        super(LIDAR_MODEL, self).__init__()
        self.qa = qa
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.method = method
        self.grouping = grouping

        if self.grouping == 'single_scale':
            self.lidar_module = LidarSsgModule(normal_channel=False)
        if self.grouping == 'multi_scale':
            self.lidar_module = LidarMsgModule(normal_channel=False)

        if self.method == 'dot':
            self.classifier = AnswerModule(ques_feat_size,
                                           num_classes,
                                           use_batchnorm=True,
                                           dropout=0.5)
        if self.method == 'concat':
            self.classifier = AnswerModule(ques_feat_size + 1024,
                                           num_classes,
                                           use_batchnorm=True,
                                           dropout=0.5)
Example 8: MUTAN
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 method='concat'):
        super(MUTAN, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.opt = dict(
            dim_v=512,
            dim_q=1024,
            attention=dict(
                nb_glimpses=2,
                dim_hv=310,
                dim_hq=310,
                dim_mm=510,
                R=5,
                dropout_v=0.5,
                dropout_q=0.5,
                dropout_mm=0.5,
                activation_v="tanh",
                activation_q="tanh",
                dropout_hv=0,
                dropout_hq=0),
            fusion=dict(
                dim_hv=620,
                dim_hq=310,
                dim_mm=510,
                R=5,
                dropout_v=0.5,
                dropout_q=0.5,
                activation_v="tanh",
                activation_q="tanh",
                dropout_hv=0,
                dropout_hq=0))
        self.attention = MutanAtt(self.opt)
        self.classifier = AnswerModule(self.opt['fusion']['dim_mm'],
                                       num_classes, (),
                                       use_batchnorm=True,
                                       dropout=0.5)  #3584 if method is concat
        self.linweights = nn.Linear(ques_feat_size, 7)
        self.method = method
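The nested opt dictionary is internally consistent in a way worth noting: the fusion branch's visual width (620) equals the attention width (310) times nb_glimpses (2), which is what concatenating the two glimpse outputs produces in the standard MUTAN design, and the classifier consumes fusion dim_mm = 510 features, exactly as passed to AnswerModule above. As a quick check:

# Pure arithmetic over the values in self.opt above.
attention = dict(nb_glimpses=2, dim_hv=310)
fusion = dict(dim_hv=620, dim_mm=510)
assert fusion['dim_hv'] == attention['dim_hv'] * attention['nb_glimpses']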
Example 9: MUTAN_LIDAR
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 grouping='single_scale'):
        super(MUTAN_LIDAR, self).__init__()
        self.qa = qa
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.opt = dict(
            dim_v=512,
            dim_q=1024,
            attention=dict(
                nb_glimpses=2,
                dim_hv=310,
                dim_hq=310,
                dim_mm=510,
                R=5,
                dropout_v=0.5,
                dropout_q=0.5,
                dropout_mm=0.5,
                activation_v="tanh",
                activation_q="tanh",
                dropout_hv=0,
                dropout_hq=0),
            fusion=dict(
                dim_hv=620,
                dim_hq=310,
                dim_mm=510,
                R=5,
                dropout_v=0.5,
                dropout_q=0.5,
                activation_v="tanh",
                activation_q="tanh",
                dropout_hv=0,
                dropout_hq=0))
        self.attention = MutanAtt(self.opt)
        self.classifier = AnswerModule(
            self.opt['fusion']['dim_mm'] + 1024 + ques_feat_size,
            num_classes, (),
            use_batchnorm=True,
            dropout=0.5)  #3584 if method is concat
        self.linweights = nn.Linear(ques_feat_size, 7)

        self.grouping = grouping
        if self.grouping == 'single_scale':
            self.lidar_module = LidarSsgModule(normal_channel=False)
        if self.grouping == 'multi_scale':
            self.lidar_module = LidarMsgModule(normal_channel=False)
Example 10: MLB_LIDAR
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 grouping='single_scale'):
        super(MLB_LIDAR, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.opt = dict(
            dim_v=512,
            dim_q=1024,
            attention=dict(
                nb_glimpses=4,
                dim_h=1200,
                dropout_v=0.5,
                dropout_q=0.5,
                dropout_mm=0.5,
                activation_v="tanh",
                activation_q="tanh",
                activation_mm="tanh"),
            fusion=dict(
                dim_h=1200,
                dropout_v=0.5,
                dropout_q=0.5,
                activation_v="tanh",
                activation_q="tanh"))
        self.attention = MLBAtt(self.opt)
        self.classifier = AnswerModule(
            self.opt['fusion']['dim_h'] * self.opt['attention']['nb_glimpses']
            + ques_feat_size + 1024,
            num_classes, (),
            use_batchnorm=True,
            dropout=0.5)  #3584 if method is concat
        self.linweights = nn.Linear(ques_feat_size, 7)
        # Note: 'grouping' is accepted but unused here; the single-scale
        # module is always instantiated.
        self.lidar_module = LidarSsgModule(normal_channel=False)
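The classifier width is the one place the three inputs meet: dim_h × nb_glimpses from MLB attention, the question feature, and a 1024-dimensional lidar feature (the same + 1024 constant appears in the other lidar-fused models above, so 1024 is presumably the lidar module's global feature size). Spelled out, with ques_feat_size=1024 assumed as elsewhere:

dim_h, nb_glimpses = 1200, 4        # from self.opt above
ques_feat_size, lidar_feat = 1024, 1024
classifier_in = dim_h * nb_glimpses + ques_feat_size + lidar_feat
print(classifier_in)                # 6848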
Example 11: MFB_LIDAR
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 grouping='single_scale'):
        super(MFB_LIDAR, self).__init__()
        self.qa = qa

        # special_words = ["<UNK>"]
        # self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        # word_vectors = get_word2vec(os.path.join(args.input_base, args.ques_vectors))

        # padding = vocab['question_token_to_idx']['<NULL>']
        # D = word2vec.vector_size
        # self.embeddings = get_embeddings(self.vocab, word_vectors, special_words)
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size,
                                              self.embeddings,
                                              encoder,
                                              give_last=False)
        self.attention = CoAtt(ques_feat_size, image_feature_size, 2)
        self.classifier = AnswerModule(
            500 + ques_feat_size + 1024,
            num_classes, (256, ),
            use_batchnorm=True,
            dropout=0.5)  #3500 if method is concat else 500
        self.linweights = nn.Linear(ques_feat_size, 7)
        self.grouping = grouping
        if self.grouping == 'single_scale':
            self.lidar_module = LidarSsgModule(normal_channel=False)
        if self.grouping == 'multi_scale':
            self.lidar_module = LidarMsgModule(normal_channel=False)
Example 12: SAN_LIDAR
    def __init__(self,
                 args,
                 ques_feat_size,
                 image_feature_size,
                 lidar_feature_size,
                 num_classes,
                 qa=None,
                 encoder='lstm',
                 grouping='single_scale'):
        super(SAN_LIDAR, self).__init__()
        self.qa = qa
        self.image_feat_size = image_feature_size
        self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
        N = len(self.vocab['question_token_to_idx'])
        D = 200
        padding = self.vocab['question_token_to_idx']['<NULL>']
        self.embeddings = nn.Embedding(N, D, padding_idx=padding)
        self.question_module = QuestionModule(ques_feat_size, self.embeddings,
                                              encoder)
        self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
        self.classifier = AnswerModule(512,
                                       num_classes, (256, ),
                                       use_batchnorm=True,
                                       dropout=0.5)  #3584 if method is concat
        # self.linweights=nn.Linear(512,7)
        self.linweights = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, 7),  #no of glimpses
            nn.Sigmoid())
        self.grouping = grouping
        if self.grouping == 'single_scale':
            self.lidar_module = LidarSsgModule(normal_channel=False)
        if self.grouping == 'multi_scale':
            self.lidar_module = LidarMsgModule(normal_channel=False)
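Every constructor in this listing repeats the same five lines of vocabulary and embedding setup. A small helper would remove that duplication; this is a sketch, not part of the source, and it assumes only the load_vocab dict layout used above:

import os
import torch.nn as nn

def build_question_embeddings(args, dim=200):
    # Load the vocab and build a padded embedding table, exactly as each
    # model above does inline. Returns (vocab, embeddings).
    vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    n_tokens = len(vocab['question_token_to_idx'])
    padding = vocab['question_token_to_idx']['<NULL>']
    return vocab, nn.Embedding(n_tokens, dim, padding_idx=padding)

Each __init__ could then begin with: self.vocab, self.embeddings = build_question_embeddings(args).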