def _build_obj_encoding(self):
    # object appearance feature: Faster R-CNN
    self.obj_faster_rcnn_fc7 = ImageEncoder(
        encoder_type='finetune_faster_rcnn_fpn_fc7',
        in_dim=2048,
        weights_file='detectron/fc6/fc7_w.pkl',
        bias_file='detectron/fc6/fc7_b.pkl',
        model_data_dir=self.config["model_data_dir"]
    )
    # apply smaller lr to pretrained Faster R-CNN fc7
    self.finetune_modules.append({
        'module': self.obj_faster_rcnn_fc7,
        'lr_scale': self.config.lr_scale_frcn,
    })
    self.linear_obj_feat_to_mmt_in = nn.Linear(
        self.config.obj.mmt_in_dim, self.mmt_config.hidden_size
    )

    # object location feature: relative bounding box coordinates (4-dim)
    self.linear_obj_bbox_to_mmt_in = nn.Linear(
        4, self.mmt_config.hidden_size
    )

    self.obj_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)

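# A minimal sketch of how the layers built above are typically combined at
# forward time (assumed here, following the common M4C-style fusion; the
# names `obj_fc7` and `obj_bbox` are hypothetical inputs): project each
# feature, LayerNorm each projection, sum, then apply dropout.
def _fuse_obj_features_sketch(self, obj_fc7, obj_bbox):
    obj_feat = self.obj_feat_layer_norm(
        self.linear_obj_feat_to_mmt_in(obj_fc7))
    obj_box = self.obj_bbox_layer_norm(
        self.linear_obj_bbox_to_mmt_in(obj_bbox))
    return self.obj_drop(obj_feat + obj_box)
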
def _build_ocr_encoding(self):
    self.remove_ocr_fasttext = getattr(
        self.config.ocr, 'remove_ocr_fasttext', False
    )
    self.remove_ocr_phoc = getattr(
        self.config.ocr, 'remove_ocr_phoc', False
    )
    self.remove_ocr_frcn = getattr(
        self.config.ocr, 'remove_ocr_frcn', False
    )
    self.remove_ocr_semantics = getattr(
        self.config.ocr, 'remove_ocr_semantics', False
    )
    self.remove_ocr_bbox = getattr(
        self.config.ocr, 'remove_ocr_bbox', False
    )

    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = ImageEncoder(
        encoder_type='finetune_faster_rcnn_fpn_fc7',
        in_dim=2048,
        weights_file='detectron/fc6/fc7_w.pkl',
        bias_file='detectron/fc6/fc7_b.pkl',
        model_data_dir=self.config["model_data_dir"]
    )
    # apply smaller lr to pretrained Faster R-CNN fc7
    self.finetune_modules.append({
        'module': self.ocr_faster_rcnn_fc7,
        'lr_scale': self.config.lr_scale_frcn,
    })

    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.config.ocr.mmt_in_dim, self.mmt_config.hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(
        4, self.mmt_config.hidden_size
    )

    self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)

def __init__(self, config):
    super().__init__()

    MAX_DEC_LENGTH = 100
    hidden_size = config.hidden_size
    ln_eps = config.layer_norm_eps

    self.position_embeddings = nn.Embedding(MAX_DEC_LENGTH, hidden_size)

    self.ans_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.emb_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.emb_dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, hidden_size, vocab_size, hidden_act="gelu",
             task_name="lm", **kwargs):
    super(BertLMHead, self).__init__()

    self.hidden_size = hidden_size
    self.hidden_act = hidden_act
    self.vocab_size = vocab_size
    self.loss_fct = CrossEntropyLoss(reduction="none", ignore_index=-1)
    self.num_labels = vocab_size  # vocab size
    # TODO: check if weight init is needed
    # self.apply(self.init_bert_weights)
    self.ph_output_type = "per_token"
    self.model_type = "language_modelling"
    self.task_name = task_name
    self.generate_config()

    # NN layers
    # this is the "transform" module in the pytorch-transformers repo
    self.dense = nn.Linear(self.hidden_size, self.hidden_size)
    self.transform_act_fn = ACT2FN[self.hidden_act]
    self.LayerNorm = BertLayerNorm(self.hidden_size, eps=1e-12)
    # this is the "decoder" in the pytorch-transformers repo
    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(vocab_size))

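# A minimal sketch of the usual forward for this head (assumed, based on
# the standard BERT masked-LM head it mirrors): transform the hidden
# states, then project onto the vocabulary with the weight-tied decoder
# plus the output-only bias. The RobertaLMHead in the next snippet follows
# the same transform-then-decode pattern with a GELU activation.
def forward_sketch(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    hidden_states = self.LayerNorm(hidden_states)
    return self.decoder(hidden_states) + self.bias
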
def __init__(self, config):
    super(RobertaLMHead, self).__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.layer_norm = BertLayerNorm(config.hidden_size,
                                    eps=config.layer_norm_eps)

    self.decoder = nn.Linear(config.hidden_size, config.vocab_size,
                             bias=False)
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))

def __init__(self, config):
    super(BertImageEmbeddings, self).__init__()

    self.image_embeddings = nn.Linear(2048, config.hidden_size)
    self.image_location_embeddings = nn.Linear(5, config.hidden_size)
    self.image_type_embeddings = nn.Embedding(1, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

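# A minimal sketch of the usual fusion for this embedding layer (assumed,
# following the ViLBERT-style design it resembles; `img_feats` are 2048-d
# region features and `img_locs` 5-d box coordinates — both names are
# hypothetical): sum the two projections, then LayerNorm and dropout.
def forward_sketch(self, img_feats, img_locs):
    embeddings = (self.image_embeddings(img_feats)
                  + self.image_location_embeddings(img_locs))
    # the single-entry image_type_embeddings above can be added the same way
    return self.dropout(self.LayerNorm(embeddings))
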
def __init__(self, config):
    super(BertOutput, self).__init__()
    self.dense = quantized_linear_setup(
        config, "ffn_output",
        config.intermediate_size, config.hidden_size
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size,
                                   eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config):
    super(BertSelfOutput, self).__init__()
    self.dense = quantized_linear_setup(
        config, 'attention_output',
        config.hidden_size, config.hidden_size
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size,
                                   eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

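# A minimal sketch of the residual forward that BertOutput and
# BertSelfOutput share in standard BERT implementations (assumed here,
# since the snippets above only show __init__): project, dropout, then a
# post-norm residual connection.
def forward_sketch(self, hidden_states, input_tensor):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return self.LayerNorm(hidden_states + input_tensor)
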
def __init__(self, config):
    super().__init__()

    MAX_DEC_LENGTH = 100
    MAX_TYPE_NUM = 5
    hidden_size = config.hidden_size
    ln_eps = config.layer_norm_eps

    self.position_embeddings = nn.Embedding(MAX_DEC_LENGTH, hidden_size)
    self.token_type_embeddings = nn.Embedding(MAX_TYPE_NUM, hidden_size)

    self.ans_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.ocr_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.emb_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    # the default value of 0.1 is used
    self.emb_dropout = nn.Dropout(config.hidden_dropout_prob)

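# A sketch of how these pieces combine (assumed, after the M4C-style
# previous-prediction embedding this module resembles; `_batch_gather` and
# all inputs are hypothetical): previously predicted tokens are looked up
# in the normalized answer-vocab / OCR embeddings, then summed with
# position and token-type embeddings.
def forward_sketch(self, ans_emb, ocr_emb, prev_inds):
    batch_size, seq_len = prev_inds.size()
    ans_emb = self.ans_layer_norm(ans_emb)  # (num_ans, hidden)
    ocr_emb = self.ocr_layer_norm(ocr_emb)  # (batch, num_ocr, hidden)
    vocab = torch.cat(
        [ans_emb.unsqueeze(0).expand(batch_size, -1, -1), ocr_emb], dim=1)
    raw_dec_emb = _batch_gather(vocab, prev_inds)  # hypothetical helper

    position_ids = torch.arange(seq_len, device=prev_inds.device)
    # token type 0 = answer vocab, 1 = OCR (simplified to all zeros here)
    token_type_ids = prev_inds.new_zeros(batch_size, seq_len)
    emb = (self.position_embeddings(position_ids)
           + self.token_type_embeddings(token_type_ids))
    emb = self.emb_dropout(self.emb_layer_norm(emb))
    return raw_dec_emb + emb
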
def __init__(self, in_dim, hid_dim, out_dim, dropout):
    super().__init__()
    # note: the `dropout` argument is accepted but not used in this head
    self.logit_fc = nn.Sequential(
        nn.Linear(in_dim, hid_dim),
        GeLU(),
        BertLayerNorm(hid_dim, eps=1e-12),
        nn.Linear(hid_dim, out_dim),
    )

def _build_ocr_encoding(self):
    # (YK): Todo
    assert self.frcn_encoder_type == "default"

    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = ImageEncoder(
        encoder_type=self.frcn_encoder_type,
        in_dim=2048,
        weights_file="detectron/fc6/fc7_w.pkl",
        bias_file="detectron/fc6/fc7_b.pkl",
        model_data_dir=None,
    )

    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.mmt_config.ocr_feature_size, self.mmt_config.hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(
        4, self.mmt_config.hidden_size
    )

    self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_drop = nn.Dropout(self.mmt_config.ocr_drop)

def _build_obj_encoding(self):
    # object appearance feature: Faster R-CNN
    # (YK) Todo: support for last-layer finetuning
    assert self.frcn_encoder_type == "default"
    self.obj_faster_rcnn_fc7 = ImageEncoder(
        encoder_type=self.frcn_encoder_type,
        in_dim=2048,
    )

    # apply smaller lr to pretrained Faster R-CNN fc7
    # self.finetune_modules.append({
    #     'module': self.obj_faster_rcnn_fc7,
    #     'lr_scale': self.config.lr_scale_frcn,
    # })

    self.linear_obj_feat_to_mmt_in = nn.Linear(
        self.mmt_config.obj_feature_size, self.mmt_config.hidden_size
    )

    # object location feature: relative bounding box coordinates (4-dim)
    self.linear_obj_bbox_to_mmt_in = nn.Linear(
        4, self.mmt_config.hidden_size
    )

    self.obj_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_drop = nn.Dropout(self.mmt_config.obj_drop)

def _build_ocr_encoding(self):
    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = ImageEncoder(
        encoder_type='finetune_faster_rcnn_fpn_fc7',
        in_dim=2048,
        weights_file='detectron/fc6/fc7_w.pkl',
        bias_file='detectron/fc6/fc7_b.pkl',
        model_data_dir=self.config["model_data_dir"]
    )
    self.finetune_modules.append({
        'module': self.ocr_faster_rcnn_fc7,
        'lr_scale': 0.1,
    })

    self.linear_ocr_feat_to_mmt_in = nn.Linear(3002, 768)

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, 768)
    # OCR recognition confidence feature (1-dim)
    self.linear_ocr_conf_to_mmt_in = nn.Linear(1, 768)

    self.ocr_feat_layer_norm = BertLayerNorm(768)
    self.ocr_bbox_layer_norm = BertLayerNorm(768)
    self.ocr_conf_layer_norm = BertLayerNorm(768)
    self.ocr_drop = nn.Dropout(0.1)

def __init__(self, config):
    super(BertEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(config.vocab_size,
                                        config.hidden_size,
                                        padding_idx=0)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.token_type_embeddings_modified = nn.Embedding(
        config.type_vocab_size, config.hidden_size)

    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file
    self.LayerNorm = BertLayerNorm(config.hidden_size,
                                   eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config):
    super(BertEmbeddings, self).__init__()
    self.word_embeddings = quantized_embedding_setup(
        config, 'word_embeddings',
        config.vocab_size, config.hidden_size, padding_idx=0)
    self.position_embeddings = quantized_embedding_setup(
        config, 'position_embeddings',
        config.max_position_embeddings, config.hidden_size)
    self.token_type_embeddings = quantized_embedding_setup(
        config, 'token_type_embeddings',
        config.type_vocab_size, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size,
                                   eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config, newly_added_config):
    super(MAG, self).__init__()
    self.W_hv = nn.Linear(
        newly_added_config["d_visual_in"] + newly_added_config["h_merge_sent"],
        newly_added_config["h_merge_sent"])
    self.W_ha = nn.Linear(
        newly_added_config["d_acoustic_in"] + newly_added_config["h_merge_sent"],
        newly_added_config["h_merge_sent"])

    self.W_v = nn.Linear(newly_added_config["d_visual_in"],
                         newly_added_config["h_merge_sent"])
    self.W_a = nn.Linear(newly_added_config["d_acoustic_in"],
                         newly_added_config["h_merge_sent"])

    self.beta = newly_added_config["beta_shift"]
    self.newly_added_config = newly_added_config

    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-6)
    self.final_dropout = nn.Dropout(config.hidden_dropout_prob)

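# A minimal sketch of the Multimodal Adaptation Gate forward (assumed,
# following the MAG-BERT reference design; `text_emb`, `visual`, and
# `acoustic` are hypothetical inputs): gate a visual/acoustic displacement,
# scale it so its norm cannot exceed beta_shift times the text norm, then
# LayerNorm the shifted text embedding.
def forward_sketch(self, text_emb, visual, acoustic, eps=1e-6):
    g_v = torch.relu(self.W_hv(torch.cat([visual, text_emb], dim=-1)))
    g_a = torch.relu(self.W_ha(torch.cat([acoustic, text_emb], dim=-1)))
    h_m = g_v * self.W_v(visual) + g_a * self.W_a(acoustic)

    # clamp the shift magnitude relative to the text embedding norm
    alpha = torch.clamp(
        self.beta * text_emb.norm(2, dim=-1) / (h_m.norm(2, dim=-1) + eps),
        max=1.0,
    ).unsqueeze(-1)
    return self.final_dropout(self.LayerNorm(text_emb + alpha * h_m))
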
def __init__(self, config):
    super(BertEmbeddingsDialog, self).__init__(config)
    self.word_embeddings = nn.Embedding(config.vocab_size,
                                        config.hidden_size)
    self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                            config.hidden_size)
    self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                              config.hidden_size)
    # add support for additional segment embeddings;
    # supporting 10 additional embeddings as of now
    self.token_type_embeddings_extension = nn.Embedding(
        10, config.hidden_size)
    # specialized embeddings for [SEP] tokens
    self.sep_embeddings = nn.Embedding(50, config.hidden_size)

    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.config = config

def __init__(self, config):
    super(RobertaEmbeddings, self).__init__()
    self.padding_idx = config.padding_idx
    self.word_embeddings = nn.Embedding(config.vocab_size,
                                        config.hidden_size,
                                        padding_idx=config.padding_idx)
    # RoBERTa position ids start at padding_idx + 1, so the table needs
    # padding_idx + max_position_embeddings + 1 rows
    self.position_embeddings = nn.Embedding(
        self.padding_idx + config.max_position_embeddings + 1,
        config.hidden_size,
        padding_idx=config.padding_idx)
    if config.type_vocab_size > 0:
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size,
                                                  config.hidden_size)
    else:
        self.token_type_embeddings = None

    self.LayerNorm = BertLayerNorm(config.hidden_size,
                                   eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

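# A sketch of the position-id construction this layout implies (assumed,
# matching the usual fairseq/RoBERTa convention): non-pad positions are
# numbered incrementally from padding_idx + 1, while pad positions keep
# padding_idx so they map to the zeroed padding row.
def create_position_ids_sketch(input_ids, padding_idx):
    mask = input_ids.ne(padding_idx).long()
    return torch.cumsum(mask, dim=1) * mask + padding_idx
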
def __init__(self, d_model=768, nhead=12, num_layers=4, image_encoder=None):
    super().__init__()
    decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead)
    self.transformer = nn.TransformerDecoder(decoder_layer,
                                             num_layers=num_layers)
    if image_encoder:
        self.image_encoder1 = image_encoder
    else:
        self.image_encoder1 = nn.Sequential(
            nn.Linear(2048, 2048),
            nn.ELU(inplace=True)
        )
    self.image_encoder2 = nn.Sequential(
        nn.Linear(2048, d_model),
        BertLayerNorm(d_model)
    )
    self.update_gate = nn.Sequential(
        nn.Linear(2 * d_model, d_model),
        nn.ELU(inplace=True),
    )

def build(self):
    self.mmt_config = BertConfig(**self.config.mmt)
    self.mmt = MMT(self.mmt_config)

    self.so_to_mmt_in = nn.Linear(3 * 1536, self.mmt_config.hidden_size)
    self.st_to_mmt_in = nn.Linear(3 * 1536, self.mmt_config.hidden_size)
    self.so_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.st_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.so_drop = nn.Dropout(0.1)
    self.st_drop = nn.Dropout(0.1)

    self.linear_go_to_mmt_in = nn.Linear(2048, self.mmt_config.hidden_size)
    self.linear_gt_to_mmt_in = nn.Linear(300, self.mmt_config.hidden_size)
    self.go_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.gt_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.go_drop = nn.Dropout(0.1)
    self.gt_drop = nn.Dropout(0.1)

    self.linear_updated_ocr_to_mmt_in = nn.Linear(
        300, self.mmt_config.hidden_size)
    self.updated_ocr_layer_norm = BertLayerNorm(
        self.mmt_config.hidden_size)
    self.updated_ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)

    self.linear_joint = nn.Linear(1536, 768)

    self.answer_processor = registry.get(
        self._datasets[0] + "_answer_processor")
    self.ocr_ptr_net = OcrPtrNet(**self.config.classifier.ocr_ptr_net)

    # modules requiring custom learning rates (usually for finetuning)
    self.finetune_modules = []

    self._build_txt_encoding()
    self._build_obj_encoding()
    self._build_ocr_encoding()

    self._init_text_embeddings("text")

    # init feature embedding for "image"
    setattr(self, "image_feature_dim", self.config["image_feature_dim"])
    self.feature_embeddings_out_dim = 0
    feature_attn_model_params = self.config["image_feature_embeddings"][0]
    feature_embedding = ImageEmbedding(
        getattr(self, "image_feature_dim"),
        self.text_embeddings_out_dim,
        **feature_attn_model_params)
    self.feature_embeddings_out_dim += feature_embedding.out_dim
    self.feature_embeddings_out_dim *= getattr(self, "image_feature_dim")
    setattr(self, "image_feature_embeddings_out_dim",
            self.feature_embeddings_out_dim)
    del self.feature_embeddings_out_dim
    setattr(self, "image_feature_embedding", feature_embedding)

    # init feature embedding for "context"
    setattr(self, "context_feature_dim", self.config["context_feature_dim"])
    self.feature_embeddings_out_dim = 0
    feature_attn_model_params = self.config["context_feature_embeddings"][0]
    feature_embedding = ImageEmbedding(
        getattr(self, "context_feature_dim"),
        self.text_embeddings_out_dim,
        **feature_attn_model_params)
    self.feature_embeddings_out_dim += feature_embedding.out_dim
    self.feature_embeddings_out_dim *= getattr(self, "context_feature_dim")
    setattr(self, "context_feature_embeddings_out_dim",
            self.feature_embeddings_out_dim)
    del self.feature_embeddings_out_dim
    setattr(self, "context_feature_embedding", feature_embedding)

    self._init_combine_layer("image", "text")

    num_choices = registry.get(self._datasets[0] + "_num_final_outputs")
    self.classifier = ClassifierLayer(
        self.config["classifier"]["type"],
        in_dim=768,
        out_dim=num_choices - 50,
        **self.config["classifier"]["params"])