def __init__(self, num_answers, fn_type="softmax"):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim
    print("Size of Hidden Dimension:", hid_dim)
    fc_dim = int(hid_dim)
    print("Size of FC Dimension:", fc_dim)

    # Type Predictor: scores for the three answer types (yes/no, number, other)
    self.type_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, 3))

    # Normalisation applied to the type scores
    self.sigmoid = nn.Sigmoid()
    self.tanh = nn.Tanh()
    self.softmax = nn.Softmax(dim=-1)
    if fn_type == "tanh":
        self.fn = self.tanh
        print("FN: TANH")
    elif fn_type == "softmax":
        self.fn = self.softmax
        print("FN: SOFTMAX")
    else:
        self.fn = self.sigmoid
        print("FN: SIGMOID")

    # YESNO feedforward
    self.yesno_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # NUMBER feedforward
    self.number_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # OTHER feedforward
    self.other_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # Answering Heads: map a 4 * fc_dim feature back to hid_dim ...
    self.logit_fc1 = nn.Sequential(
        nn.Linear(4 * fc_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, hid_dim))

    # ... then score the answer vocabulary
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
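# A minimal sketch of one plausible forward pass for the type-gated head above.
# The constructor only declares the modules; how the branches are combined is an
# assumption here (pooled LXRT feature concatenated with the three type branches,
# giving the 4 * fc_dim input of logit_fc1), not the author's confirmed design.
# Assumes `torch` is imported and that this method lives on the same class.
import torch

def forward(self, feat, pos, sent):
    x = self.lxrt_encoder(sent, (feat, pos))          # (batch, hid_dim) pooled feature
    type_scores = self.fn(self.type_fc(x))            # (batch, 3) yes-no / number / other
    branches = torch.cat(
        [x, self.yesno_fc(x), self.number_fc(x), self.other_fc(x)], dim=-1)
    fused = self.logit_fc1(branches)                  # (batch, hid_dim)
    logits = self.logit_fc(fused)                     # (batch, num_answers)
    return logits, type_scores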
def __init__(self, num_answers, model_type='full'):
    super().__init__()
    self.model_type = model_type
    self.lxrt_encoder = LXRTEncoder(args,
                                    max_seq_length=MAX_CLF_LENGTH,
                                    model_type=args.model_type)
    hid_dim = self.lxrt_encoder.dim

    if num_answers == 2:
        output_dim = 1
    else:
        output_dim = num_answers

    if self.model_type != 'concat':
        self.logit_fc = nn.Sequential(
            nn.Dropout(args.dropout),
            nn.Linear(hid_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Dropout(args.dropout),
            nn.Linear(hid_dim * 2, output_dim))
    else:
        linear = nn.Linear(hid_dim, output_dim)
        self.logit_fc = nn.Sequential(
            nn.Dropout(args.dropout),
            linear,
        )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
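# Usage note (a sketch, not from the original code): because output_dim collapses
# to a single logit for binary tasks, the training loop presumably pairs this head
# with BCEWithLogitsLoss, and with CrossEntropyLoss when num_answers > 2.
# The helper below is hypothetical.
import torch.nn as nn

def make_criterion(num_answers):
    return nn.BCEWithLogitsLoss() if num_answers == 2 else nn.CrossEntropyLoss()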
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    # Build decoder with attention
    self.decoder = DecoderWithAttention(attention_dim=hid_dim,
                                        embed_dim=hid_dim,
                                        decoder_dim=hid_dim,
                                        vocab_size=vocab_size,
                                        features_dim=hid_dim,
                                        dropout=0.5)

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    self.lstm = nn.LSTM(input_size=hid_dim, hidden_size=hid_dim,
                        num_layers=1, batch_first=True)
    self.linear = nn.Linear(hid_dim, vocab_size)  # BERT's uncased vocabulary has 30,522 tokens
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(
        args,
        max_seq_length=MAX_VQA_LENGTH
    )
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers)
    )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    # from https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.pth
    # https://dl.fbaipublicfiles.com/pythia/detectron_model/detectron_model.yaml
    self.args = SimpleNamespace(
        # model_file='data/faster-rcnn-r101.pth',
        model_file='data/R-50-FPN.pth',
        config_file='data/R-50-FPN.yaml',
        # config_file='../vqa-faster-rcnn/configs/visual_genome_vqa/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512_vqa_test.yaml',
        batch_size=args.batch_size,
        num_features=36,
        feature_name="fc6",
        confidence_threshold=0,
        background=True,
        partition=0)
    self.detection_model = self._build_detection_model()
def __init__(self, num_answers):
    super().__init__()
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_GQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, metric):
    super(MatchingDecoderLV, self).__init__()
    HIDDEN_DECODER_SIZE = 256
    hid_dim = 768

    self.lang_proj = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, HIDDEN_DECODER_SIZE)
    )
    self.vis_proj = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, HIDDEN_DECODER_SIZE)
    )

    self.metric = metric
    assert metric in ['sdp', 'cosine']
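# A minimal sketch of how the two projections might be scored against each other.
# The original forward is not shown, so this is an assumption: 'sdp' is read as
# scaled dot-product and 'cosine' as cosine similarity, with language tokens
# scored against visual regions.
import math
import torch
import torch.nn.functional as F

def forward(self, lang_feats, vis_feats):
    l = self.lang_proj(lang_feats)                   # (batch, n_words, 256)
    v = self.vis_proj(vis_feats)                     # (batch, n_boxes, 256)
    if self.metric == 'sdp':
        scores = torch.matmul(l, v.transpose(-1, -2)) / math.sqrt(l.size(-1))
    else:  # 'cosine'
        l = F.normalize(l, dim=-1)
        v = F.normalize(v, dim=-1)
        scores = torch.matmul(l, v.transpose(-1, -2))
    return scores                                    # (batch, n_words, n_boxes)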
def __init__(self):
    super().__init__()
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=20)
    self.hid_dim = hid_dim = self.lxrt_encoder.dim
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim * 2, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, 2))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, 300))
    self.logit_fc_ans = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.emb_proj = nn.Sequential(
        nn.Linear(300, hid_dim),
        GeLU(),
        BertLayerNorm(hid_dim, eps=1e-12),
        nn.Linear(hid_dim, 300))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, num_blocks):
    super().__init__()

    # Build LXRT encoder
    # TODO: Make a new class in entry file
    self.policy_lxrt_encoder = PolicyLXRTEncoder(
        args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.policy_lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_blocks))
    self.logit_fc.apply(self.policy_lxrt_encoder.model.init_bert_weights)
def __init__(self, model_type='full'):
    super().__init__()
    self.model_type = model_type
    self.lxrt_encoder = LXRTEncoder(args,
                                    max_seq_length=MAX_RANK_LENGTH,
                                    model_type=args.model_type)
    self.hid_dim = hid_dim = self.lxrt_encoder.dim
    if self.model_type != 'concat':
        self.logit_fc = nn.Sequential(
            nn.Linear(hid_dim, hid_dim),
            GeLU(),
            BertLayerNorm(hid_dim, eps=1e-12),
            nn.Linear(hid_dim, 1))
    else:
        self.logit_fc = nn.Sequential(nn.Linear(hid_dim, 1))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    device = torch.device('cpu')
    detection_model = torch.load(args.detection_model, map_location=device)
    self.detection_model = detection_model['model'].float().fuse().eval()
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                                   do_lower_case=True)
def __init__(self, num_answers, attention=False):
    super().__init__()
    print(f"Making {__name__}")
    self.flag = True

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args,
                                    max_seq_length=MAX_VQA_LENGTH,
                                    attention=attention)
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, args, width=100, height=100):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_SEQ_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    num_logits = width * height
    self.width = width
    self.height = height
    self.n_actions = num_logits

    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_logits))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    self.use_detectron = args.use_detectron
    if self.use_detectron:
        print('Detectron will be used.')
        data_path = DATA_PATH
        vg_classes = []
        with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
            for obj in f.readlines():
                vg_classes.append(obj.split(',')[0].lower().strip())
        MetadataCatalog.get("vg").thing_classes = vg_classes

        yaml_file = DETECTRON2_YAML
        cfg = get_cfg()
        cfg.merge_from_file(yaml_file)
        cfg.MODEL.RPN.POST_NMS_TOPK_TEST = 300
        cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.6
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.6
        # VG Weight
        cfg.MODEL.WEIGHTS = "http://nlp.cs.unc.edu/models/faster_rcnn_from_caffe.pkl"
        self.predictor = DefaultPredictor(cfg)
    else:
        print('Resnet will be used.')
        self.cnn = nn.Sequential(
            *(list(models.resnet18(pretrained=True).children())[:-3])).cuda().eval()
        self.cnn2box = nn.Linear(256, 2048)
        self.preprocess = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])
def __init__(self, num_classes=2):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_UTTERANCE_LENGTH)
    hid_dim = self.lxrt_encoder.dim

    # VCSD image features dimensions adjuster
    self.adaptive_pool = nn.AdaptiveAvgPool2d((36, 2048))

    # VCSD Classification head
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_classes))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self, num_answers, finetune_strategy='standard'):
    super().__init__()
    # self.finetune_strategy = finetune_strategy

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args,
                                    max_seq_length=MAX_VQA_LENGTH,
                                    finetune_strategy=finetune_strategy)
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def create_head(self, num_answers):
    hid_dim = self.lxrt_encoder.dim
    if self.logit_fc is None:
        # First call: build the answer head from scratch and initialise it.
        if self.encoder_type == 'lxrt':
            self.logit_fc = nn.Sequential(
                nn.Linear(hid_dim, hid_dim * 2),
                GeLU(),
                BertLayerNorm(hid_dim * 2, eps=1e-12),
                nn.Linear(hid_dim * 2, num_answers))
        else:
            self.logit_fc = nn.Linear(1024, num_answers)
        init_weights = (
            self.lxrt_encoder.model.init_bert_weights
            if not isinstance(self.lxrt_encoder.model, nn.DataParallel)
            else self.lxrt_encoder.model.module.init_bert_weights)
        self.logit_fc.apply(init_weights)
        return
    # Subsequent calls: keep the head but swap in a final layer sized for the
    # new answer vocabulary.
    self.logit_fc[-1] = nn.Linear(hid_dim * 2, num_answers)
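# Hypothetical usage (not from the original code): create_head can be called once
# per answer vocabulary, e.g. across tasks with different label sets. It assumes
# self.logit_fc starts out as None in __init__; `model` and the answer counts
# below are made-up examples.
model.create_head(num_answers=1842)   # first call: builds and initialises the head
model.create_head(num_answers=3129)   # later call: replaces only the final nn.Linear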
def __init__(self, num_answers):
    super().__init__()
    self.lxrt_encoder = LXRTEncoder(
        args,
        max_seq_length=MAX_GQA_LENGTH
    )
    hid_dim = self.lxrt_encoder.dim
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers)
    )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    if args.task_pointer != 'none':
        self.matching_decoder = MatchingDecoderLV(metric='sdp')
def __init__(self, num_answers):
    super().__init__()
    self.lxrt_encoder = LXRTEncoder(
        args,
        max_seq_length=MAX_GQA_LENGTH,
        mode='xl'
    )
    hid_dim = self.lxrt_encoder.dim
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers)
    )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    if args.task_nsp_qfpm or args.task_mlm_qfpm:
        self.qfpm = BertPreTrainingHeads(
            BertConfig(vocab_size_or_config_json_file=30522),
            self.lxrt_encoder.model.bert.embeddings.word_embeddings.weight)
def __init__(self, num_answers):
    super().__init__()

    # Build LXRT encoder
    # lxrt.entry.LXRTEncoder -> LXRTFeatureExtraction -> LXRTModel
    self.lxrt_encoder = LXRTEncoder(
        args,
        max_seq_length=MAX_PVQA_LENGTH
    )
    hid_dim = self.lxrt_encoder.dim

    # VQA Answer heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers)
    )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)
def __init__(self):
    super(KDDModel, self).__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, mode='lx')
    hid_dim = self.lxrt_encoder.dim
    self.config = self.lxrt_encoder.model.config

    # Image-text heads
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, 2))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    # AMSoftmax loss heads
    self.logit_W = torch.nn.Parameter(torch.randn(hid_dim, 2), requires_grad=True)
    nn.init.xavier_normal_(self.logit_W, gain=1)

    # MLM heads
    self.cls = BertPreTrainingHeads(
        self.config,
        self.lxrt_encoder.model.bert.embeddings.word_embeddings.weight)
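# Sketch of the AM-Softmax (additive-margin softmax) scoring that logit_W is
# presumably used for. The original loss code is not shown, so the margin m and
# scale s values are illustrative assumptions, not the author's settings.
import torch
import torch.nn.functional as F

def am_softmax_logits(x, W, labels=None, m=0.35, s=30.0):
    # Cosine similarity between L2-normalised features and class weight columns.
    cos = F.normalize(x, dim=-1) @ F.normalize(W, dim=0)   # (batch, 2)
    if labels is None:
        return s * cos
    # Subtract the margin m from the target-class cosine, then scale by s;
    # the result is fed into a standard cross-entropy loss.
    margin = F.one_hot(labels, num_classes=cos.size(-1)) * m
    return s * (cos - margin)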
def __init__(self, num_answers, fn_type="softmax"):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder(args, max_seq_length=MAX_VQA_LENGTH)
    hid_dim = self.lxrt_encoder.dim
    print("Size of Hidden Dimension:", hid_dim)
    fc_dim = int(hid_dim)
    print("Size of FC Dimension:", fc_dim)

    # Normalisation applied to the type scores
    self.sigmoid = nn.Sigmoid()
    self.tanh = nn.Tanh()
    self.softmax = nn.Softmax(dim=-1)
    if fn_type == "tanh":
        self.fn = self.tanh
        print("FN: TANH")
    elif fn_type == "softmax":
        self.fn = self.softmax
        print("FN: SOFTMAX")
    else:
        self.fn = self.sigmoid
        print("FN: SIGMOID")

    # YN: AND/OR/NOT/NONE Type Predictor
    self.yn_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, 4))

    # AND feedforward
    self.and_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # OR feedforward
    self.or_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # NOT feedforward
    self.not_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # NONE feedforward
    self.none_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, fc_dim),
        GeLU(),
        BertLayerNorm(fc_dim, eps=1e-12))

    # Answering Heads: map a 6 * fc_dim feature back to hid_dim, then score
    # the answer vocabulary.
    self.logit_fc1 = nn.Sequential(
        nn.Linear(6 * fc_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, hid_dim))
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers))
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)