def __init__(self, config, num_labels):
    super().__init__()
    hid_dim = config.hidden_size
    if config.training_head_type == "nlvr2":
        in_dim = hid_dim * 2
        out_dim = 2
    else:
        in_dim = hid_dim
        out_dim = config.num_labels

    add_gqa = isinstance(num_labels, list)
    if add_gqa:
        self.logit_gqa = nn.Sequential(
            nn.Linear(in_dim, hid_dim * 2),
            GeLU(),
            BertLayerNorm(hid_dim * 2, eps=1e-12),
            nn.Linear(hid_dim * 2, num_labels[1]),
        )
        out_dim = num_labels[0]

    self.logit_fc = nn.Sequential(
        nn.Linear(in_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, out_dim),
    )

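# A minimal forward sketch for the head above, assuming the usual pattern of
# feeding a pooled representation through logit_fc (and logit_gqa when a second
# label set was configured). The method name, signature, and the use_gqa_head
# flag are illustrative assumptions; the actual forward is not in this snippet.
def forward(self, pooled_output, use_gqa_head=False):
    # Route to the auxiliary GQA head only when it was built.
    if use_gqa_head and hasattr(self, "logit_gqa"):
        return self.logit_gqa(pooled_output)
    return self.logit_fc(pooled_output)
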
def _build_ocr_encoding(self):
    self.remove_ocr_fasttext = getattr(
        self.config.ocr, "remove_ocr_fasttext", False
    )
    self.remove_ocr_phoc = getattr(self.config.ocr, "remove_ocr_phoc", False)
    self.remove_ocr_frcn = getattr(self.config.ocr, "remove_ocr_frcn", False)
    self.remove_ocr_semantics = getattr(
        self.config.ocr, "remove_ocr_semantics", False
    )
    self.remove_ocr_bbox = getattr(self.config.ocr, "remove_ocr_bbox", False)

    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = build_image_encoder(
        self._build_encoder_config(), direct_features=True
    )
    self.finetune_modules.append(
        {"module": self.ocr_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
    )

    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.config.ocr.mmt_in_dim, self.mmt_config.hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)

def _build_ocr_encoding(self):
    self.remove_ocr_fasttext = getattr(self.config.ocr, "remove_ocr_fasttext", False)
    self.remove_ocr_phoc = getattr(self.config.ocr, "remove_ocr_phoc", False)
    self.remove_ocr_frcn = getattr(self.config.ocr, "remove_ocr_frcn", False)
    self.remove_ocr_semantics = getattr(self.config.ocr, "remove_ocr_semantics", False)
    self.remove_ocr_bbox = getattr(self.config.ocr, "remove_ocr_bbox", False)

    # OCR appearance feature: Faster R-CNN
    self.ocr_faster_rcnn_fc7 = ImageFeatureEncoder(
        encoder_type="finetune_faster_rcnn_fpn_fc7",
        in_dim=2048,
        weights_file="models/detectron.defaults/fc7_w.pkl",
        bias_file="models/detectron.defaults/fc7_b.pkl",
        model_data_dir=self.config.model_data_dir,
    )
    self.finetune_modules.append({
        "module": self.ocr_faster_rcnn_fc7,
        "lr_scale": self.config.lr_scale_frcn,
    })

    self.linear_ocr_feat_to_mmt_in = nn.Linear(
        self.config.ocr.mmt_in_dim, self.mmt_config.hidden_size
    )

    # OCR location feature: relative bounding box coordinates (4-dim)
    self.linear_ocr_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

    self.ocr_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.ocr_drop = nn.Dropout(self.config.ocr.dropout_prob)

def __init__(self, config, input_size):
    super(MLPWithLayerNorm, self).__init__()
    self.config = config
    self.linear1 = nn.Linear(input_size, config.hidden_size)
    self.non_lin1 = get_activation(self.config.hidden_act)
    self.layer_norm1 = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.non_lin2 = get_activation(self.config.hidden_act)
    self.layer_norm2 = BertLayerNorm(config.hidden_size, eps=1e-12)

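# A minimal forward sketch for MLPWithLayerNorm, assuming each block applies
# linear -> non-linearity -> LayerNorm in the order the layers are declared
# above. Illustrative only; the actual forward is not part of this snippet.
def forward(self, hidden):
    hidden = self.layer_norm1(self.non_lin1(self.linear1(hidden)))
    return self.layer_norm2(self.non_lin2(self.linear2(hidden)))
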
def __init__(self, config):
    super().__init__()
    feat_dim = config.visual_feat_dim
    pos_dim = config.visual_pos_dim

    # Object feature encoding
    self.visn_fc = nn.Linear(feat_dim, config.hidden_size)
    self.visn_layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12)

    # Box position encoding
    self.box_fc = nn.Linear(pos_dim, config.hidden_size)
    self.box_layer_norm = BertLayerNorm(config.hidden_size, eps=1e-12)

    self.dropout = nn.Dropout(config.hidden_dropout_prob)

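# A sketch of how these layers are typically combined in LXMERT-style visual
# feature encoders: project and normalize the appearance features and box
# coordinates separately, then average the two streams. Assumed pattern, not
# taken from this snippet.
def forward(self, visn_input):
    feats, boxes = visn_input
    x = self.visn_layer_norm(self.visn_fc(feats))
    y = self.box_layer_norm(self.box_fc(boxes))
    return self.dropout((x + y) / 2)
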
def __init__(self, config):
    super().__init__()
    MAX_DEC_LENGTH = 100
    MAX_TYPE_NUM = 5
    hidden_size = config.hidden_size
    ln_eps = config.layer_norm_eps

    self.position_embeddings = nn.Embedding(MAX_DEC_LENGTH, hidden_size)
    self.token_type_embeddings = nn.Embedding(MAX_TYPE_NUM, hidden_size)

    self.ans_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.ocr_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.emb_layer_norm = BertLayerNorm(hidden_size, eps=ln_eps)
    self.emb_dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config, vocab: Vocabulary = None):
    super(GLTEmbeddings, self).__init__()
    self.word_embeddings = Embedding(
        config.vocab_size,
        config.hidden_size,
        padding_index=0,
        vocab_namespace='tokens',
    )
    if hasattr(config, 'glove_path') and config.glove_path:
        assert vocab is not None
        self.word_embeddings_glove = Embedding.from_vocab_or_file(
            vocab,
            300,
            pretrained_file=config.glove_path,
            projection_dim=config.hidden_size,
            trainable=False,
        )
        self.word_embeddings_glove._pretrained_file = config.glove_path
        self.use_glove = bool(config.glove_path)
    else:
        self.use_glove = False
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.layer_dropout_prob)
    self.use_position_embeddings = config.use_position_embeddings

def __init__(self, config):
    super(SpanBertSboHead, self).__init__()
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    # Two-layer MLP over the concatenated boundary/position representations
    self.linear1 = nn.Linear(3 * config.hidden_size, config.hidden_size)
    self.layer_norm1 = BertLayerNorm(config.hidden_size)
    self.linear2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.layer_norm2 = BertLayerNorm(config.hidden_size)
    # Prediction
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))

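# A sketch of the span-boundary objective (SBO) pattern this head corresponds
# to in SpanBERT: predict each masked token from the span's two boundary hidden
# states plus a position embedding for the target offset. The signature and
# the gelu activation are assumptions (no activation module is declared above);
# gelu would need to be imported.
def forward(self, left_hidden, right_hidden, target_positions):
    pos_emb = self.position_embeddings(target_positions)
    hidden = torch.cat([left_hidden, right_hidden, pos_emb], dim=-1)
    hidden = self.layer_norm1(gelu(self.linear1(hidden)))
    hidden = self.layer_norm2(gelu(self.linear2(hidden)))
    return self.decoder(hidden) + self.bias
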
def __init__(self, config):
    super(LayoutLMEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(
        config.vocab_size, config.hidden_size, padding_idx=0
    )
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    # the relative position of each word within its bbox
    self.position_relation_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    # the total number of words in the bbox
    self.box_number_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.x_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.y_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.h_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.w_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.token_type_embeddings = nn.Embedding(
        config.type_vocab_size, config.hidden_size
    )

    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, hidden_size, vocab_size, hidden_act="gelu", task_name="lm", **kwargs):
    super(BertLMHead, self).__init__()

    self.hidden_size = hidden_size
    self.hidden_act = hidden_act
    self.vocab_size = vocab_size
    self.loss_fct = CrossEntropyLoss(reduction="none", ignore_index=-1)
    self.num_labels = vocab_size  # vocab size
    # TODO Check if weight init needed!
    # self.apply(self.init_bert_weights)
    self.ph_output_type = "per_token"
    self.model_type = "language_modelling"
    self.task_name = task_name
    self.generate_config()

    # NN Layers
    # this is the "transform" module in the pytorch-transformers repo
    self.dense = nn.Linear(self.hidden_size, self.hidden_size)
    self.transform_act_fn = ACT2FN[self.hidden_act]
    self.LayerNorm = BertLayerNorm(self.hidden_size, eps=1e-12)

    # this is the "decoder" in the pytorch-transformers repo
    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(vocab_size))

def __init__(self, config):
    super(BertEmbeddingsDialog, self).__init__(config)
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.bert_token_type_embeddings = nn.Embedding(3, config.hidden_size)
    # add support for additional segment embeddings; supports 10 additional
    # embeddings as of now
    self.token_type_embeddings_extension = nn.Embedding(10, config.hidden_size)
    # adding specialized embeddings for sep tokens
    self.sep_embeddings = nn.Embedding(50, config.hidden_size)

    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.config = config
    # position_ids (1, len position emb) is contiguous in memory and exported
    # when serialized
    self.register_buffer(
        "position_ids",
        torch.arange(config.max_position_embeddings).expand((1, -1)),
    )

def __init__(self, num_answers, params):
    super().__init__()

    # Build LXRT encoder
    self.lxrt_encoder = LXRTEncoder_(max_seq_length=MAX_VQA_LENGTH, params=params)
    hid_dim = self.lxrt_encoder.dim

    # VQA answer head
    self.logit_fc = nn.Sequential(
        nn.Linear(hid_dim, hid_dim * 2),
        GeLU(),
        BertLayerNorm(hid_dim * 2, eps=1e-12),
        nn.Linear(hid_dim * 2, num_answers),
    )
    self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights)

    if params["adapt_span_enabled"]:
        print("Using Adaptive Variant")
    if params["sparse_enabled"]:
        print("Sparse Enabled")
    if params["layerdrop_enabled"]:
        print(
            "LayerDrop is enabled with dropping rate set to",
            params["layerdrop_num_layers"],
        )

def __init__(self, config):
    super(LayoutlmEmbeddings, self).__init__()
    self.word_embeddings = nn.Embedding(
        config.vocab_size, config.hidden_size, padding_idx=0
    )
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.x_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.y_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )
    self.h_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )  # todo but not in the model's embedding
    self.w_position_embeddings = nn.Embedding(
        config.max_2d_position_embeddings, config.hidden_size
    )  # todo but not in the model's embedding
    self.token_type_embeddings = nn.Embedding(
        config.type_vocab_size, config.hidden_size
    )  # this is the segment embedding

    # self.LayerNorm is not snake-cased to stick with the TensorFlow model
    # variable name and be able to load any TensorFlow checkpoint file
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config, v_feature_size, v_loc_size):
    super(VisualEmbedding, self).__init__()
    self.image_embeddings = nn.Linear(v_feature_size, config.hidden_size)
    self.image_location_embeddings = nn.Linear(v_loc_size, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

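# A plausible forward pass for this visual embedding, following the common
# ViLBERT-style pattern of summing projected appearance and location features
# before LayerNorm and dropout. Assumed, not taken from this snippet.
def forward(self, image_features, image_locations):
    img_emb = self.image_embeddings(image_features)
    loc_emb = self.image_location_embeddings(image_locations)
    return self.dropout(self.LayerNorm(img_emb + loc_emb))
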
def __init__(self, config):
    super(RobertaLMHead, self).__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(config.vocab_size))

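# For reference, the standard RoBERTa LM-head forward: transform, activate,
# normalize, then project to the vocabulary with the bias added separately.
# The gelu function is assumed to be imported; the forward itself is not shown
# in this snippet.
def forward(self, features):
    x = self.dense(features)
    x = gelu(x)
    x = self.layer_norm(x)
    return self.decoder(x) + self.bias
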
def __init__(self, config):
    super(BertOutput, self).__init__()
    self.dense = quantized_linear_setup(
        config, "ffn_output", config.intermediate_size, config.hidden_size
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

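# For context: BertOutput (and the analogous BertSelfOutput later in this
# section) conventionally applies the projection, dropout, and a residual add
# before LayerNorm; the quantized linear acts as a drop-in replacement for
# nn.Linear. A sketch of that standard pattern, assumed rather than shown here.
def forward(self, hidden_states, input_tensor):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.dropout(hidden_states)
    return self.LayerNorm(hidden_states + input_tensor)
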
def __init__(self, config):
    super(BertEmbeddingsPlus, self).__init__()
    self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
    self.position_embeddings = nn.Embedding(
        config.max_position_embeddings, config.hidden_size
    )
    self.token_type_embeddings = nn.Embedding(3, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config):
    super(BertSelfOutput, self).__init__()
    self.dense = quantized_linear_setup(
        config, 'attention_output', config.hidden_size, config.hidden_size
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config):
    super().__init__()
    self.dense = nn.Linear(config.field_hidden_size, config.hidden_size)
    if isinstance(config.hidden_act, str):
        self.transform_act_fn = ACT2FN[config.hidden_act]
    else:
        self.transform_act_fn = config.hidden_act
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)

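# Sketch of the usual BERT "prediction head transform" this module mirrors:
# dense projection, configurable activation, then LayerNorm. Assumed pattern;
# the actual forward is not part of this snippet.
def forward(self, hidden_states):
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    return self.LayerNorm(hidden_states)
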
def __init__(self, config, num_generated_triplets, num_layers, num_classes,
             return_intermediate=False):
    super().__init__()
    self.return_intermediate = return_intermediate
    self.num_generated_triplets = num_generated_triplets
    self.layers = nn.ModuleList(
        [DecoderLayer(config) for _ in range(num_layers)]
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.query_embed = nn.Embedding(num_generated_triplets, config.hidden_size)
    self.decoder2class = nn.Linear(config.hidden_size, num_classes + 1)
    self.decoder2span = nn.Linear(config.hidden_size, 4)

    # Scoring heads for head/tail entity start/end positions
    self.head_start_metric_1 = nn.Linear(config.hidden_size, config.hidden_size)
    self.head_end_metric_1 = nn.Linear(config.hidden_size, config.hidden_size)
    self.tail_start_metric_1 = nn.Linear(config.hidden_size, config.hidden_size)
    self.tail_end_metric_1 = nn.Linear(config.hidden_size, config.hidden_size)
    self.head_start_metric_2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.head_end_metric_2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.tail_start_metric_2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.tail_end_metric_2 = nn.Linear(config.hidden_size, config.hidden_size)
    self.head_start_metric_3 = nn.Linear(config.hidden_size, 1, bias=False)
    self.head_end_metric_3 = nn.Linear(config.hidden_size, 1, bias=False)
    self.tail_start_metric_3 = nn.Linear(config.hidden_size, 1, bias=False)
    self.tail_end_metric_3 = nn.Linear(config.hidden_size, 1, bias=False)

    torch.nn.init.orthogonal_(self.head_start_metric_1.weight, gain=1)
    torch.nn.init.orthogonal_(self.head_end_metric_1.weight, gain=1)
    torch.nn.init.orthogonal_(self.tail_start_metric_1.weight, gain=1)
    torch.nn.init.orthogonal_(self.tail_end_metric_1.weight, gain=1)
    torch.nn.init.orthogonal_(self.head_start_metric_2.weight, gain=1)
    torch.nn.init.orthogonal_(self.head_end_metric_2.weight, gain=1)
    torch.nn.init.orthogonal_(self.tail_start_metric_2.weight, gain=1)
    torch.nn.init.orthogonal_(self.tail_end_metric_2.weight, gain=1)
    torch.nn.init.orthogonal_(self.query_embed.weight, gain=1)

def __init__(self, config, num_labels, dropout=0.3):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.decoder = nn.Linear(config.hidden_size, num_labels, bias=False)
    self.bias = nn.Parameter(torch.zeros(num_labels), requires_grad=True)
    self.dropout = nn.Dropout(p=dropout)
    self.decoder.bias = self.bias

def __init__(self, config, rank):
    super().__init__()
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.low = LowRankProjectionTransform(config.hidden_size, rank)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # low-rank factorization of the hidden-to-hidden projection:
    # hidden -> rank -> hidden
    self.low_dense1 = nn.Linear(config.hidden_size, rank, bias=False)
    self.low_dense2 = nn.Linear(rank, config.hidden_size)
    self.applied = False

def __init__(self, config):
    super().__init__(config)
    self.config = config
    self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    self.dropout = nn.Dropout(0.1)  # this we don't have in the default BertPooler
    self.distribution = "normal"
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.pooler_activation = nn.Tanh()

def __init__(self, input_size, hidden_size, out_size, dropout=0.15):
    super().__init__()
    # NOTE: out_size is accepted but unused here; the decoder maps back to
    # input_size.
    self.dense = nn.Linear(input_size, hidden_size)
    self.layer_norm = BertLayerNorm(hidden_size)
    self.decoder = nn.Linear(hidden_size, input_size, bias=False)
    self.bias = nn.Parameter(torch.zeros(input_size), requires_grad=True)
    self.dropout = nn.Dropout(p=dropout)
    self.decoder.bias = self.bias

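# A hedged usage sketch for this head, assuming the typical order of dense
# projection, LayerNorm, dropout, then decoding back to the input space. No
# activation is declared above, so none is applied; the signature and ordering
# are assumptions.
def forward(self, features):
    x = self.dropout(self.layer_norm(self.dense(features)))
    return self.decoder(x)
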
def __init__(self, cfg, args, tok: BertTokenizer):
    super().__init__(cfg)
    self.clip_embeddings = r2plus1d_18(pretrained=args.from_pretrained)
    self.clip_embeddings.fc = nn.Linear(in_features=512, out_features=cfg.hidden_size)
    self.LayerNorm = BertLayerNorm(cfg.hidden_size, eps=cfg.layer_norm_eps)
    self.dropout = nn.Dropout(cfg.hidden_dropout_prob)
    self.tok = tok
    self.args = args
    if args.fixed_position_embeddings:
        self.position_embeddings = PositionEmbeddings(
            cfg.hidden_size, cfg.hidden_dropout_prob
        )

def _build_obj_encoding(self):
    # object appearance feature: Faster R-CNN
    self.obj_faster_rcnn_fc7 = build_image_encoder(
        self._build_encoder_config(), direct_features=True
    )
    # apply smaller lr to pretrained Faster R-CNN fc7
    self.finetune_modules.append(
        {"module": self.obj_faster_rcnn_fc7, "lr_scale": self.config.lr_scale_frcn}
    )
    self.linear_obj_feat_to_mmt_in = nn.Linear(
        self.config.obj.mmt_in_dim, self.mmt_config.hidden_size
    )

    # object location feature: relative bounding box coordinates (4-dim)
    self.linear_obj_bbox_to_mmt_in = nn.Linear(4, self.mmt_config.hidden_size)

    self.obj_feat_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_bbox_layer_norm = BertLayerNorm(self.mmt_config.hidden_size)
    self.obj_drop = nn.Dropout(self.config.obj.dropout_prob)

def __init__(self, kb, max_mentions: int):
    super(MentionSpanRepresenter, self).__init__()
    # save values
    self.max_mentions = max_mentions
    self.embedd_dim = kb.embedd_dim
    # create all sub-modules
    self.pooler = SelfAttentiveSpanPooler(self.embedd_dim)
    self.span_repr_ln = BertLayerNorm(self.embedd_dim, eps=1e-5)
    # initialize weights
    self.init_weights()

def __init__(self, config):
    super().__init__()
    self.dense = MaskedLinear(
        config.intermediate_size,
        config.hidden_size,
        pruning_method=config.pruning_method,
        mask_init=config.mask_init,
        mask_scale=config.mask_scale,
    )
    self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)

def __init__(self, config):
    super(GLTTokenGrounding, self).__init__()
    self.initial_img_project = nn.Linear(config.input_img_dim, config.hidden_size)
    self.text_project = nn.Linear(config.hidden_size, config.hidden_size)
    self.b_bias = nn.Parameter(torch.zeros(1))
    self.lnorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.pos_project = nn.Linear(6, config.hidden_size)
    self.lnorm_pos = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    self.coreference_jump_gate = nn.Linear(config.hidden_size, 2)
    self.layer = nn.ModuleList(
        [BertLayer(config) for _ in range(config.visual_self_attention_layers)]
    )

def __init__(self, config, dense=True, l_norm=True, dropout=True):
    super(GLTSelfOutput, self).__init__()
    # Each flag is stored first and then, when enabled, replaced by the actual
    # sub-module, so e.g. self.dense ends up as either False or an nn.Linear.
    self.dense = dense
    self.l_norm = l_norm
    self.dropout = dropout
    if dense:
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
    if l_norm:
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
    if dropout:
        self.dropout = nn.Dropout(config.layer_dropout_prob)

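# A sketch of how a forward pass can gate on the flag-or-module attributes
# above; truthiness works because an nn.Module instance is truthy while a
# disabled flag stays False. Assumed usage, not taken from this snippet.
def forward(self, hidden_states, input_tensor):
    if self.dense:
        hidden_states = self.dense(hidden_states)
    if self.dropout:
        hidden_states = self.dropout(hidden_states)
    if self.l_norm:
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
    return hidden_states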