def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm'):
    super(LSTM_BASIC, self).__init__()
    self.qa = qa
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.classifier = AnswerModule(ques_feat_size, num_classes, (256, ))
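# Note (assumption, not from the original file): every constructor in this
# module shares the same interface and expects `args` to carry `input_base`
# and `vocab`, so that load_vocab(os.path.join(args.input_base, args.vocab))
# resolves to the question/answer vocabulary. An illustrative call, with
# made-up feature sizes:
#
#   model = LSTM_BASIC(args, ques_feat_size=1024, image_feature_size=2048,
#                      lidar_feature_size=1024, num_classes=num_classes)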
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MCB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
    # Classifier input: 8000-d MCB features + question features + 1024-d lidar features.
    self.classifier = AnswerModule(8000 + ques_feat_size + 1024, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)
    self.mcb = MCBPolling(512, 8000, n_modalities=2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(MCB, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = MCBAttention(ques_feat_size, 512, 512, 8000)
    self.classifier = AnswerModule(8000, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)
    self.mcb = MCBPolling(512, 8000, n_modalities=2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(DAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder,
                                          bidirectional=True, give_last=False)
    self.attention = DANAttention(ques_feat_size, 512, 512, 2)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
    self.softmax = nn.Softmax(dim=1)
    self.tanh = nn.Tanh()
    if self.method == 'concat':
        self.classifier = AnswerModule(2 * 512 * 7 + ques_feat_size, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
    else:
        self.classifier = AnswerModule(2 * 512, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(SAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
    self.method = method
    if self.method == 'concat':
        self.classifier = AnswerModule(3584, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)  # 3584 = 512 * 7
    if self.method == 'hierarchical':
        self.classifier = AnswerModule(512, num_classes, (256, ),
                                       use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(256, 7),  # number of glimpses
        nn.Sigmoid())
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='dot'):
    super(CNN_LSTM, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.image_features_resize = nn.Linear(self.image_feat_size, ques_feat_size)
    self.method = method
    if self.method == 'dot':
        self.classifier = AnswerModule(ques_feat_size, num_classes,
                                       use_batchnorm=True, dropout=0.5)
    if self.method == 'concat':
        self.classifier = AnswerModule(ques_feat_size + image_feature_size * 7, num_classes,
                                       use_batchnorm=True, dropout=0.5)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat', grouping='single_scale'):
    super(LIDAR_MODEL, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.method = method
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
    if self.method == 'dot':
        self.classifier = AnswerModule(ques_feat_size, num_classes,
                                       use_batchnorm=True, dropout=0.5)
    if self.method == 'concat':
        self.classifier = AnswerModule(ques_feat_size + 1024, num_classes,
                                       use_batchnorm=True, dropout=0.5)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', method='concat'):
    super(MUTAN, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    # Hyperparameters for the MUTAN attention and fusion blocks.
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(
            nb_glimpses=2, dim_hv=310, dim_hq=310, dim_mm=510, R=5,
            dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
            activation_v="tanh", activation_q="tanh",
            dropout_hv=0, dropout_hq=0),
        fusion=dict(
            dim_hv=620, dim_hq=310, dim_mm=510, R=5,
            dropout_v=0.5, dropout_q=0.5,
            activation_v="tanh", activation_q="tanh",
            dropout_hv=0, dropout_hq=0))
    self.attention = MutanAtt(self.opt)
    self.classifier = AnswerModule(self.opt['fusion']['dim_mm'], num_classes, (),
                                   use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.method = method
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MUTAN_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    # Hyperparameters for the MUTAN attention and fusion blocks.
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(
            nb_glimpses=2, dim_hv=310, dim_hq=310, dim_mm=510, R=5,
            dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
            activation_v="tanh", activation_q="tanh",
            dropout_hv=0, dropout_hq=0),
        fusion=dict(
            dim_hv=620, dim_hq=310, dim_mm=510, R=5,
            dropout_v=0.5, dropout_q=0.5,
            activation_v="tanh", activation_q="tanh",
            dropout_hv=0, dropout_hq=0))
    self.attention = MutanAtt(self.opt)
    # Classifier input: fused features + 1024-d lidar features + question features.
    self.classifier = AnswerModule(self.opt['fusion']['dim_mm'] + 1024 + ques_feat_size,
                                   num_classes, (), use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MLB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    # Hyperparameters for the MLB attention and fusion blocks.
    self.opt = dict(
        dim_v=512,
        dim_q=1024,
        attention=dict(
            nb_glimpses=4, dim_h=1200,
            dropout_v=0.5, dropout_q=0.5, dropout_mm=0.5,
            activation_v="tanh", activation_q="tanh", activation_mm="tanh"),
        fusion=dict(
            dim_h=1200, dropout_v=0.5, dropout_q=0.5,
            activation_v="tanh", activation_q="tanh"))
    self.attention = MLBAtt(self.opt)
    # Classifier input: glimpse-concatenated fusion features + question features + 1024-d lidar features.
    self.classifier = AnswerModule(
        self.opt['fusion']['dim_h'] * self.opt['attention']['nb_glimpses'] + ques_feat_size + 1024,
        num_classes, (), use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(MFB_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder,
                                          give_last=False)
    self.attention = CoAtt(ques_feat_size, image_feature_size, 2)
    self.classifier = AnswerModule(500 + ques_feat_size + 1024, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)  # 3500 if method is concat, else 500
    self.linweights = nn.Linear(ques_feat_size, 7)
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
def __init__(self, args, ques_feat_size, image_feature_size, lidar_feature_size,
             num_classes, qa=None, encoder='lstm', grouping='single_scale'):
    super(SAN_LIDAR, self).__init__()
    self.qa = qa
    self.image_feat_size = image_feature_size
    self.vocab = load_vocab(os.path.join(args.input_base, args.vocab))
    N = len(self.vocab['question_token_to_idx'])
    D = 200
    padding = self.vocab['question_token_to_idx']['<NULL>']
    self.embeddings = nn.Embedding(N, D, padding_idx=padding)
    self.question_module = QuestionModule(ques_feat_size, self.embeddings, encoder)
    self.attention = StackedAttention(ques_feat_size, 512, 512, 2)
    self.classifier = AnswerModule(512, num_classes, (256, ),
                                   use_batchnorm=True, dropout=0.5)
    self.linweights = nn.Sequential(
        nn.Linear(512, 256),
        nn.ReLU(inplace=True),
        nn.Dropout(0.5),
        nn.Linear(256, 7),  # number of glimpses
        nn.Sigmoid())
    self.grouping = grouping
    if self.grouping == 'single_scale':
        self.lidar_module = LidarSsgModule(normal_channel=False)
    if self.grouping == 'multi_scale':
        self.lidar_module = LidarMsgModule(normal_channel=False)
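# Hedged usage sketch (illustrative only; the registry below is not part of the
# original code, and the feature sizes are assumptions). Since all of the models
# above share one constructor signature, a name-to-class map is a simple way to
# select a variant from the command line:
#
#   MODELS = {'lstm_basic': LSTM_BASIC, 'cnn_lstm': CNN_LSTM, 'san': SAN,
#             'dan': DAN, 'mcb': MCB, 'mutan': MUTAN, 'lidar': LIDAR_MODEL,
#             'san_lidar': SAN_LIDAR, 'mcb_lidar': MCB_LIDAR,
#             'mutan_lidar': MUTAN_LIDAR, 'mlb_lidar': MLB_LIDAR,
#             'mfb_lidar': MFB_LIDAR}
#   model = MODELS[args.model](args, ques_feat_size=1024, image_feature_size=2048,
#                              lidar_feature_size=1024, num_classes=num_classes)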