def __init__(self, name='', firststart=False, firststop=False):
    self.name = name
    self.firststart = firststart
    self.firststop = firststop
    self.tstart = self.tstop = None
    self.startcounter = Counter()
    self.stopcounter = Counter()
    self.resetcounter = Counter()
    self.timetotal = _TimeAccumulator()
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa', use_counter=True):
    # w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    # q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    w_emb = AlbertTokenizer.from_pretrained('albert-large-v2')
    q_emb = AlbertModel.from_pretrained('albert-large-v2')
    # freeze the pretrained ALBERT encoder and remember its parameters
    params_set = set()
    for param in q_emb.parameters():
        params_set.add(param)
        param.requires_grad = False
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, .5)
        counter = Counter(objects) if use_counter else None
        return BanModel(dataset, params_set, w_emb, q_emb, v_att, b_net, q_prj, c_prj,
                        classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
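# Usage sketch (illustrative, not from the original source): `dataset` stands in for
# any object exposing `v_dim` and `num_ans_candidates` as read above; requires the
# `transformers` package for the ALBERT weights. The encoder inside the returned model
# is frozen, so only the BAN layers receive gradients.
model = build_ban(dataset, num_hid=1280, gamma=8, task='vqa', use_counter=True)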
def __init__(self, args, dataset, priotize_using_counter=None):
    super(BiResNet, self).__init__()
    # Optional module: counter (defaults to args.use_counter unless overridden;
    # the default must be None for the override check below to work)
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init Bilinear residual network
    b_net = []   # bilinear connect: (X^T U)^T A (Y^T V)
    q_prj = []   # output of bilinear connect + original question -> new question: W q_ + q
    c_prj = []
    for i in range(args.glimpse):
        b_net.append(BCNet(dataset.v_dim, args.hid_dim, args.hid_dim, None, k=1))
        q_prj.append(FCNet([args.hid_dim, args.hid_dim], '', .2))
        if use_counter:
            c_prj.append(FCNet([objects + 1, args.hid_dim], 'ReLU', .0))
    self.b_net = nn.ModuleList(b_net)
    self.q_prj = nn.ModuleList(q_prj)
    self.c_prj = nn.ModuleList(c_prj)
    self.args = args
def build_ban(dataset, num_hid, op='', gamma=4, task='vqa'):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = NewAttention(dataset.v_dim, num_hid, num_hid, dropout=0.2)
    q_net = FCNet([q_emb.num_hid, num_hid])
    v_net = FCNet([dataset.v_dim, num_hid])
    q_att = weight_norm(nn.Linear(num_hid, 1), dim=None)
    if task == 'vqa':
        b_net = []
        q_prj = []
        c_prj = []
        objects = 10  # minimum number of boxes
        for i in range(gamma):
            b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
            q_prj.append(FCNet([num_hid, num_hid], '', .2))
            c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
        classifier = SimpleClassifier(num_hid, num_hid * 2, 3, .5)
        # classifier2 = SimpleClassifier(num_hid, num_hid * 2, dataset.num_ans_candidates, .5)
        counter = Counter(objects)
        return BanModel(dataset, w_emb, q_emb, v_att, q_att, b_net, q_prj, c_prj,
                        q_net, v_net, classifier, counter, op, gamma)
    elif task == 'flickr':
        return BanModel_flickr(w_emb, q_emb, v_att, op, gamma)
def build_ban(num_token, v_dim, num_hid, num_ans, op='', gamma=4, reasoning=False):
    w_emb = WordEmbedding(num_token, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    if not reasoning:
        v_att = BiAttention(v_dim, num_hid, num_hid, gamma)
    else:
        v_att = BiAttention(v_dim, num_hid, num_hid, 1)
    # constructing the model
    b_net = []
    q_prj = []
    c_prj = []
    objects = 36  # minimum number of boxes, originally 10
    for i in range(gamma):
        b_net.append(BCNet(v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifier(num_hid, num_hid * 2, num_ans, .5)
    counter = Counter(objects)
    if not reasoning:
        return BanModel(w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                        counter, op, gamma, num_hid)
    else:
        return BanModel_Reasoning(w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                                  counter, op, gamma, num_hid)
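# Usage sketch (illustrative): this variant takes plain integers, so no dataset object
# is required. The values below mirror common VQA v2 settings (2048-d bottom-up
# features, 3129 answer candidates) and are assumptions, not the repo's defaults.
model = build_ban(num_token=20000, v_dim=2048, num_hid=1280, num_ans=3129,
                  op='c', gamma=8, reasoning=False)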
def __init__(self, **kwargs):
    """
    kwargs: vocab, dim_word, dim_hidden, dim_vision, mid_size,
            glimpses, dropout_prob, device,
    """
    super(Net, self).__init__()
    for k, v in kwargs.items():
        setattr(self, k, v)
    glimpses = 5
    objects = 10
    self.num_classes = len(self.vocab['answer_token_to_idx'])
    self.num_token = len(self.vocab['question_token_to_idx'])
    self.classifier = Classifier(
        in_features=self.mid_size,
        mid_features=self.dim_hidden * 2,
        out_features=self.num_classes,
        drop=self.dropout_prob)
    self.token_embedding = nn.Embedding(self.num_token, self.dim_word)
    for p in self.token_embedding.parameters():
        p.requires_grad = False  # freeze word embeddings
    self.dropout = nn.Dropout(self.dropout_prob)
    self.text = modulars.GRUEncoder(
        dim_word=self.dim_word,
        dim_hidden=self.dim_hidden,
    )
    self.count = Counter(objects)
    self.attention = weight_norm(
        BiAttention(
            v_features=self.dim_vision,
            q_features=self.dim_hidden,
            mid_features=self.dim_hidden,
            glimpses=glimpses,
            drop=0.5,
        ),
        name='h_weight', dim=None)
    self.apply_attention = ApplyAttention(
        v_features=self.dim_vision,
        q_features=self.dim_hidden,
        mid_features=self.dim_hidden,
        glimpses=glimpses,
        num_obj=objects,
        drop=0.2,
    )
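# Usage sketch (illustrative): Net stores every kwarg via setattr, so construction is a
# flat keyword dict. `vocab` must provide 'answer_token_to_idx' and
# 'question_token_to_idx', and `dim_word` sizes the token embedding; all values below
# are assumptions. Note that `glimpses` and `objects` are hardcoded inside __init__,
# so passing them would have no effect on the attention modules.
net = Net(vocab=vocab, dim_word=300, dim_hidden=1024, dim_vision=2048,
          mid_size=1024, dropout_prob=0.5, device='cuda')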
def build_BAN(dataset, args, priotize_using_counter=None):
    # init word embedding module, question embedding module, and Attention network
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, args.op)
    q_emb = QuestionEmbedding(300 if 'c' not in args.op else 600, args.num_hid, 1,
                              False, .0, args.rnn)
    v_att = BiAttention(dataset.v_dim, args.num_hid, args.num_hid, args.gamma)
    # build and load pre-trained MAML model
    if args.maml:
        weight_path = args.RAD_dir + '/' + args.maml_model_path
        print('load initial weights MAML from: %s' % weight_path)
        maml_v_emb = SimpleCNN(weight_path, args.eps_cnn, args.momentum_cnn)
    # build and load pre-trained Auto-encoder model
    if args.autoencoder:
        ae_v_emb = Auto_Encoder_Model()
        weight_path = args.RAD_dir + '/' + args.ae_model_path
        print('load initial weights DAE from: %s' % weight_path)
        ae_v_emb.load_state_dict(torch.load(weight_path))
    # load tfidf weighted embedding
    if hasattr(args, 'tfidf'):
        w_emb = tfidf_loading(args.tfidf, w_emb, args)
    # Optional module: counter for BAN (defaults to args.use_counter unless overridden;
    # the default must be None for the override check below to work)
    use_counter = args.use_counter if priotize_using_counter is None else priotize_using_counter
    if use_counter:
        objects = 10  # minimum number of boxes
        counter = Counter(objects)
    else:
        counter = None
    # init BAN residual network
    b_net = []
    q_prj = []
    c_prj = []
    for i in range(args.gamma):
        b_net.append(BCNet(dataset.v_dim, args.num_hid, args.num_hid, None, k=1))
        q_prj.append(FCNet([args.num_hid, args.num_hid], '', .2))
        if use_counter:
            c_prj.append(FCNet([objects + 1, args.num_hid], 'ReLU', .0))
    # init classifier
    classifier = SimpleClassifier(args.num_hid, args.num_hid * 2,
                                  dataset.num_ans_candidates, args)
    # construct VQA model and return
    if args.maml and args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                         counter, args, maml_v_emb, ae_v_emb)
    elif args.maml:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                         counter, args, maml_v_emb, None)
    elif args.autoencoder:
        return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                         counter, args, None, ae_v_emb)
    return BAN_Model(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                     counter, args, None, None)
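# Usage sketch (illustrative): build_BAN reads its configuration from an argparse-style
# namespace; the fields and values below are assumptions, not the repo's defaults.
# Omitting `tfidf` from the namespace skips tfidf_loading via the hasattr check.
from types import SimpleNamespace
args = SimpleNamespace(op='c', num_hid=1024, gamma=2, rnn='LSTM', maml=False,
                       autoencoder=False, use_counter=True, RAD_dir='data_RAD')
model = build_BAN(dataset, args)  # `dataset` as in the repo's feature dataset class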
def __init__(self):
    super(AttFlat, self).__init__()
    self.mlp = nn.Sequential(
        nn.Linear(hidden_size, flat_mlp_size),
        nn.ReLU(inplace=True),
        nn.Dropout(dropout_rate),
        nn.Linear(flat_mlp_size, 1),
    )
    self.linear_merge = nn.Linear(hidden_size, flat_out_size)
    self.count = Counter(20)
    self.c_lin = nn.Linear(21, flat_out_size)  # counter emits max count + 1 values
    self.relu = nn.ReLU()
    self.bn2 = nn.BatchNorm1d(flat_out_size)
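# Forward sketch (illustrative, not the module's actual forward): shows how the counter
# path above is typically wired. Following the counting module's published interface,
# it takes box coordinates (B, 4, N) and one attention map (B, N) and returns (B, 21),
# i.e. counts 0..20, which c_lin then projects to flat_out_size; shapes are assumptions.
def _count_features(self, boxes, att):
    c = self.count(boxes, att)                 # (B, 21)
    return self.bn2(self.relu(self.c_lin(c)))  # (B, flat_out_size)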
def build_ban_foil(dataset, num_hid, num_ans_candidates, op='', gamma=4):
    w_emb = WordEmbedding(dataset.dictionary.ntoken, 300, .0, op)
    q_emb = QuestionEmbedding(300 if 'c' not in op else 600, num_hid, 1, False, .0)
    v_att = BiAttention(dataset.v_dim, num_hid, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, num_hid, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, num_hid], '', .2))
        c_prj.append(FCNet([objects + 1, num_hid], 'ReLU', .0))
    classifier = SimpleClassifierFoil(num_hid, 64, num_ans_candidates)
    counter = Counter(objects)
    return BanModel(dataset, w_emb, q_emb, v_att, b_net, q_prj, c_prj, classifier,
                    counter, op, gamma)
def build_ban(dataset, num_hid, op='', gamma=4, q_emb_type='bert', on_do_q=False,
              finetune_q=False):
    if 'bert' in q_emb_type:
        q_emb = BertModel.from_pretrained('bert-base-multilingual-cased')
        q_dim = 768
    elif 'rg' in q_emb_type:
        w_dim = 100
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)
    elif 'pkb' in q_emb_type:
        w_dim = 200
        q_dim = num_hid
        q_emb = RnnQuestionEmbedding(dataset.dictionary.ntoken, w_dim, q_dim, op)
    if 'bertrnn' in q_emb_type:
        q_emb = BertRnnQuestionEmbedding(q_emb, 200, num_hid, op)
        q_dim = num_hid
    if not finetune_q:
        # Freeze question embedding
        if isinstance(q_emb, BertModel):
            for p in q_emb.parameters():
                p.requires_grad_(False)
        else:
            for p in q_emb.w_emb.parameters():
                p.requires_grad_(False)
    if not on_do_q:
        # Remove dropout of question embedding
        for m in q_emb.modules():
            if isinstance(m, nn.Dropout):
                m.p = 0.
    v_att = BiAttention(dataset.v_dim, q_dim, num_hid, gamma)
    b_net = []
    q_prj = []
    c_prj = []
    objects = 10  # minimum number of boxes
    for i in range(gamma):
        b_net.append(BCNet(dataset.v_dim, q_dim, num_hid, None, k=1))
        q_prj.append(FCNet([num_hid, q_dim], '', .2))
        c_prj.append(FCNet([objects + 1, q_dim], 'ReLU', .0))
    classifiers = [SimpleClassifier(q_dim, num_hid * 2, dataset.num_ans_candidates, .5),
                   SimpleClassifier(q_dim, num_hid * 2, 1, .5)]
    counter = Counter(objects)
    return BanModel(dataset, q_emb, v_att, b_net, q_prj, c_prj, classifiers, counter,
                    op, gamma)
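# Usage sketch (illustrative): with q_emb_type='bert' the multilingual BERT encoder is
# loaded, frozen (finetune_q=False), and has its dropout zeroed (on_do_q=False);
# `dataset` is assumed to expose `v_dim`, `num_ans_candidates`, and `dictionary`.
model = build_ban(dataset, num_hid=1280, gamma=8, q_emb_type='bert',
                  on_do_q=False, finetune_q=False)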
def __init__(self, words_list):
    super(Net, self).__init__()
    num_hid = 1280
    question_features = num_hid
    vision_features = config.output_features
    glimpses = 12
    objects = 10
    self.text = word_embedding.TextProcessor(
        classes=words_list,
        embedding_features=300,
        lstm_features=question_features,
        use_hidden=False,
        drop=0.0,
    )
    self.count = Counter(objects)
    self.attention = BiAttention(
        v_features=vision_features,
        q_features=question_features,
        mid_features=num_hid,
        glimpses=glimpses,
        drop=0.5,
    )
    self.apply_attention = ApplyAttention(
        v_features=vision_features,
        q_features=question_features,
        mid_features=num_hid,
        glimpses=glimpses,
        num_obj=objects,
        count=self.count,
        drop=0.2,
    )
    self.classifier = Classifier(
        in_features=num_hid,
        mid_features=num_hid * 2,
        out_features=config.max_answers,
        drop=0.5,
    )
def __init__(self, startvalue=0):
    self.incrementcounter = Counter()
    self.accumulated_time = startvalue
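# Clarifying sketch (illustrative): unlike the VQA counting module used in the snippets
# above, the zero-argument Counter() in the two timer snippets is an event counter;
# assuming it is collections.Counter, bumping it looks like:
from collections import Counter
events = Counter()
events['increment'] += 1      # record one event
events.update(['increment'])  # equivalent bump by one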