def __init__(self):
    super(AttBasicModel, self).__init__()
    self.ss_prob = 0.0  # scheduled sampling probability
    self.vocab_size = cfg.MODEL.VOCAB_SIZE + 1  # include <BOS>/<EOS>
    self.att_dim = cfg.MODEL.ATT_FEATS_EMBED_DIM \
        if cfg.MODEL.ATT_FEATS_EMBED_DIM > 0 else cfg.MODEL.ATT_FEATS_DIM

    # word embed
    sequential = [nn.Embedding(self.vocab_size, cfg.MODEL.WORD_EMBED_DIM)]
    sequential.append(utils.activation(cfg.MODEL.WORD_EMBED_ACT))
    if cfg.MODEL.WORD_EMBED_NORM:
        sequential.append(nn.LayerNorm(cfg.MODEL.WORD_EMBED_DIM))
    if cfg.MODEL.DROPOUT_WORD_EMBED > 0:
        sequential.append(nn.Dropout(cfg.MODEL.DROPOUT_WORD_EMBED))
    self.word_embed = nn.Sequential(*sequential)

    # global visual feat embed
    sequential = []
    if cfg.MODEL.GVFEAT_EMBED_DIM > 0:
        sequential.append(nn.Linear(cfg.MODEL.GVFEAT_DIM, cfg.MODEL.GVFEAT_EMBED_DIM))
    sequential.append(utils.activation(cfg.MODEL.GVFEAT_EMBED_ACT))
    if cfg.MODEL.DROPOUT_GV_EMBED > 0:
        sequential.append(nn.Dropout(cfg.MODEL.DROPOUT_GV_EMBED))
    self.gv_feat_embed = nn.Sequential(*sequential) if len(sequential) > 0 else None

    # attention feats embed
    sequential = []
    if cfg.MODEL.ATT_FEATS_EMBED_DIM > 0:
        sequential.append(nn.Linear(cfg.MODEL.ATT_FEATS_DIM, cfg.MODEL.ATT_FEATS_EMBED_DIM))
    sequential.append(utils.activation(cfg.MODEL.ATT_FEATS_EMBED_ACT))
    if cfg.MODEL.DROPOUT_ATT_EMBED > 0:
        sequential.append(nn.Dropout(cfg.MODEL.DROPOUT_ATT_EMBED))
    if cfg.MODEL.ATT_FEATS_NORM:
        sequential.append(nn.LayerNorm(cfg.MODEL.ATT_FEATS_EMBED_DIM))
    self.att_embed = nn.Sequential(*sequential) if len(sequential) > 0 else None

    self.dropout_lm = nn.Dropout(cfg.MODEL.DROPOUT_LM) if cfg.MODEL.DROPOUT_LM > 0 else None
    self.logit = nn.Linear(cfg.MODEL.RNN_SIZE, self.vocab_size)
    self.p_att_feats = nn.Linear(self.att_dim, cfg.MODEL.ATT_HIDDEN_SIZE) \
        if cfg.MODEL.ATT_HIDDEN_SIZE > 0 else None

    # bilinear encoder replaces the plain attention projection when enabled
    if cfg.MODEL.BILINEAR.DIM > 0:
        self.p_att_feats = None
        self.encoder_layers = blocks.create(
            cfg.MODEL.BILINEAR.ENCODE_BLOCK,
            embed_dim=cfg.MODEL.BILINEAR.DIM,
            att_type=cfg.MODEL.BILINEAR.ATTTYPE,
            att_heads=cfg.MODEL.BILINEAR.HEAD,
            att_mid_dim=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DIM,
            att_mid_drop=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DROPOUT,
            dropout=cfg.MODEL.BILINEAR.ENCODE_DROPOUT,
            layer_num=cfg.MODEL.BILINEAR.ENCODE_LAYERS)
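# --- Sketch: how ss_prob is typically consumed during training. ---
# A minimal, self-contained illustration of scheduled sampling: with
# probability ss_prob the decoder is fed its own sampled word instead of the
# ground-truth word. Names (`gt_words`, `logprobs`) are illustrative; the
# actual forward pass is not part of this excerpt.
import torch

def mix_scheduled_sampling(gt_words, logprobs, ss_prob):
    # gt_words: (batch,) ground-truth tokens for the current step
    # logprobs: (batch, vocab_size) log-probabilities from the previous step
    use_sample = torch.rand(gt_words.size(0), device=gt_words.device) < ss_prob
    sampled = torch.distributions.Categorical(logits=logprobs).sample()
    return torch.where(use_sample, sampled, gt_words)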
def __init__(self, embed_dim, att_type, att_heads, att_mid_dim,
             att_mid_drop, dropout, layer_num):
    super(LowRankBilinearEncBlock, self).__init__()
    self.layers = nn.ModuleList([])
    self.bifeat_emb = nn.ModuleList([])
    self.layer_norms = nn.ModuleList([])
    for _ in range(layer_num):
        sublayer = LowRankBilinearLayer(
            embed_dim=embed_dim,
            att_type=att_type,
            att_heads=att_heads,
            att_mid_dim=att_mid_dim,
            att_mid_drop=att_mid_drop,
            dropout=dropout)
        self.layers.append(sublayer)
        self.bifeat_emb.append(nn.Sequential(
            nn.Linear(2 * embed_dim, embed_dim),
            utils.activation(cfg.MODEL.BILINEAR.BIFEAT_EMB_ACT),
            nn.Dropout(cfg.MODEL.BILINEAR.ENCODE_BIFEAT_EMB_DROPOUT)))
        self.layer_norms.append(nn.LayerNorm(embed_dim))

    # fuse the input global feature plus one output per layer back to embed_dim
    self.proj = nn.Linear(embed_dim * (layer_num + 1), embed_dim)
    self.layer_norm = nn.LayerNorm(cfg.MODEL.BILINEAR.DIM)
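# --- Sketch: why self.proj takes embed_dim * (layer_num + 1) inputs. ---
# The encoder block keeps the initial global feature plus one refined global
# feature per layer, concatenates them, and fuses the stack back to embed_dim.
# Shapes only; random tensors stand in for the LowRankBilinearLayer outputs.
import torch
import torch.nn as nn

embed_dim, layer_num, batch = 8, 3, 2
proj = nn.Linear(embed_dim * (layer_num + 1), embed_dim)
layer_norm = nn.LayerNorm(embed_dim)

feat_arr = [torch.randn(batch, embed_dim)]          # initial gv_feat
for _ in range(layer_num):
    feat_arr.append(torch.randn(batch, embed_dim))  # one output per layer
gv_feat = layer_norm(proj(torch.cat(feat_arr, dim=-1)))  # (batch, embed_dim)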
def __init__(self, args, submodel):
    super(DWEXTransformer, self).__init__()
    self.vocab_size = cfg.MODEL.VOCAB_SIZE + 1

    # image pretrained (kept for reference; the hard-coded 1024 below matches
    # the DenseNet feature channels this extractor produced)
    # self.image_pretrained_models, self.input_visual_feats = ImageClassification.image_features(
    #     'densenet', fixed_weight=False, pretrained_model=cfg.MODEL.PretrainedImageModel)

    if args.dataset_name == 'IUXRAY':
        num_images = 2
        self.get_visual_features = self.forward_iuxray
    elif args.dataset_name == 'MIMICCXR':
        num_images = 1
        self.get_visual_features = self.forward_mimiccxr
    elif args.dataset_name == 'MIMICCXR_MultiImages':
        num_images = 2
        self.get_visual_features = self.forward_mimiccxr
    else:
        raise ValueError('Unsupported dataset: {}'.format(args.dataset_name))

    # att_feats encoder: fuse the concatenated per-image features
    cnn_sequential = []
    self.input_visual_feats = 1024
    cnn_sequential.append(
        nn.Linear(self.input_visual_feats * num_images, cfg.MODEL.ATT_FEATS_EMBED_DIM))
    cnn_sequential.append(utils.activation(cfg.MODEL.ATT_FEATS_EMBED_ACT))
    if cfg.MODEL.ATT_FEATS_NORM:
        cnn_sequential.append(nn.LayerNorm(cfg.MODEL.ATT_FEATS_EMBED_DIM))
    if cfg.MODEL.DROPOUT_ATT_EMBED > 0:
        cnn_sequential.append(nn.Dropout(cfg.MODEL.DROPOUT_ATT_EMBED))
    gcn_sequential = copy.deepcopy(cnn_sequential)
    self.cnn_embed = nn.Sequential(*cnn_sequential) if len(cnn_sequential) > 0 else None
    self.gcn_embed = nn.Sequential(*gcn_sequential) if len(gcn_sequential) > 0 else None

    self.encoder = Encoder(
        embed_dim=cfg.MODEL.BILINEAR.DIM,
        dropout=cfg.MODEL.BILINEAR.ENCODE_DROPOUT,
        att_type=cfg.MODEL.BILINEAR.ATTTYPE,
        att_heads=cfg.MODEL.BILINEAR.HEAD,
        att_mid_dim=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DIM,
        att_mid_drop=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DROPOUT,
        bifeat_emb_act=cfg.MODEL.BILINEAR.BIFEAT_EMB_ACT,
        bifeat_emb_drop=cfg.MODEL.BILINEAR.ENCODE_BIFEAT_EMB_DROPOUT,
        ff_dropout=cfg.MODEL.BILINEAR.ENCODE_FF_DROPOUT,
        layer_num=cfg.MODEL.BILINEAR.ENCODE_LAYERS)
    self.decoder = Decoder(
        vocab_size=self.vocab_size,
        embed_dim=cfg.MODEL.BILINEAR.DIM,
        dropout=cfg.MODEL.BILINEAR.DECODE_DROPOUT,
        att_type=cfg.MODEL.BILINEAR.ATTTYPE,
        att_heads=cfg.MODEL.BILINEAR.HEAD,
        att_mid_dim=cfg.MODEL.BILINEAR.DECODE_ATT_MID_DIM,
        att_mid_drop=cfg.MODEL.BILINEAR.DECODE_ATT_MID_DROPOUT,
        bifeat_emb_act=cfg.MODEL.BILINEAR.BIFEAT_EMB_ACT,
        bifeat_emb_drop=cfg.MODEL.BILINEAR.DECODE_BIFEAT_EMB_DROPOUT,
        ff_dropout=cfg.MODEL.BILINEAR.DECODE_FF_DROPOUT,
        layer_num=cfg.MODEL.BILINEAR.DECODE_LAYERS)
    self.submodel = submodel
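# --- Sketch: channel-wise fusion of a two-view study (num_images = 2). ---
# The first Linear in cnn_embed expects the per-image features concatenated
# along the channel axis. Shapes are illustrative (49 regions, 1024 channels
# per image, in line with self.input_visual_feats above).
import torch

feats_view0 = torch.randn(2, 49, 1024)  # (batch, regions, channels), image 1
feats_view1 = torch.randn(2, 49, 1024)  # matching regions, image 2
att_feats = torch.cat([feats_view0, feats_view1], dim=-1)  # (2, 49, 2048)
# att_feats now matches nn.Linear(1024 * num_images, ATT_FEATS_EMBED_DIM).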
def __init__(self, embed_dim, dropout, att_type, att_heads,
             att_mid_dim, att_mid_drop, bifeat_emb_act,
             bifeat_emb_drop, ff_dropout):
    super(EncoderLayer, self).__init__()
    self.encoder_attn = LowRank(
        embed_dim=embed_dim,
        att_type=att_type,
        att_heads=att_heads,
        att_mid_dim=att_mid_dim,
        att_mid_drop=att_mid_drop)
    self.dropout = nn.Dropout(dropout)
    self.bifeat_emb = nn.Sequential(
        nn.Linear(2 * embed_dim, embed_dim),
        utils.activation(bifeat_emb_act),
        nn.Dropout(bifeat_emb_drop))
    self.layer_norm = nn.LayerNorm(embed_dim)
    self.ff_layer = blocks.create(
        'FeedForward',
        embed_dim=embed_dim,
        ffn_embed_dim=embed_dim * 4,
        relu_dropout=ff_dropout,
        dropout=ff_dropout)
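# --- Sketch: the concat-residual update bifeat_emb is built for. ---
# The attended global feature is broadcast next to every region feature,
# compressed back to embed_dim, and added residually before LayerNorm.
# Illustrative only; the real EncoderLayer.forward is outside this excerpt.
import torch
import torch.nn as nn

embed_dim = 8
bifeat_emb = nn.Sequential(nn.Linear(2 * embed_dim, embed_dim), nn.ReLU())
layer_norm = nn.LayerNorm(embed_dim)

att_feats = torch.randn(2, 49, embed_dim)   # region features
gv_feat = torch.randn(2, embed_dim)         # attended global feature
gv_exp = gv_feat.unsqueeze(1).expand_as(att_feats)
att_feats = layer_norm(bifeat_emb(torch.cat([att_feats, gv_exp], dim=-1)) + att_feats)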
def __init__(self, vocab_size, embed_dim, dropout, att_type, att_heads,
             att_mid_dim, att_mid_drop, bifeat_emb_act, bifeat_emb_drop,
             ff_dropout, layer_num):
    super(Decoder, self).__init__()
    self.att_heads = att_heads
    self.embed_dim = embed_dim
    self.layers = nn.ModuleList([])
    for i in range(layer_num):
        sublayer = DecoderLayer(
            embed_dim=embed_dim,
            dropout=dropout,
            att_type=att_type,
            att_heads=att_heads,
            att_mid_dim=att_mid_dim,
            att_mid_drop=att_mid_drop,
            bifeat_emb_act=bifeat_emb_act,
            bifeat_emb_drop=bifeat_emb_drop,
            ff_dropout=ff_dropout,
            last_layer=(i == layer_num - 1))
        self.layers.append(sublayer)

    self.dropout = nn.Dropout(cfg.MODEL.DROPOUT_WORD_EMBED)
    self.embed_tokens = nn.Embedding(vocab_size, embed_dim)
    self.embed_scale = math.sqrt(embed_dim)
    self.embed_positions = PositionalEncoding(embed_dim, cfg.MODEL.TRANSFORMER.PE_MAX_LEN)
    self.layer_norm_word = nn.LayerNorm(embed_dim)
    self.generator = nn.Linear(embed_dim, vocab_size)

    self.wbil1 = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        utils.activation(cfg.MODEL.BILINEAR.ACT),
        nn.LayerNorm(embed_dim))
    self.wbil2 = nn.Sequential(
        nn.Linear(embed_dim, embed_dim),
        utils.activation(cfg.MODEL.BILINEAR.ACT),
        nn.LayerNorm(embed_dim))
    self.wbi_drop = nn.Dropout(cfg.MODEL.BILINEAR.DECODE_DROPOUT)
    self.dropout_lm = nn.Dropout(cfg.MODEL.DROPOUT_LM)

    self.proj_norm = nn.Sequential(
        nn.Linear(embed_dim * (layer_num + 1), 2 * embed_dim),
        nn.GLU(),
        nn.LayerNorm(embed_dim))
    self.clear_buffer()
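# --- Sketch: why proj_norm maps to 2 * embed_dim before nn.GLU. ---
# proj_norm fuses the word embedding plus one output per decoder layer.
# nn.GLU computes a * sigmoid(b) over the two halves of its input, so the
# Linear doubles the width and GLU lands back on embed_dim for the LayerNorm.
import torch
import torch.nn as nn

embed_dim, layer_num, batch = 8, 3, 2
proj_norm = nn.Sequential(
    nn.Linear(embed_dim * (layer_num + 1), 2 * embed_dim),
    nn.GLU(),
    nn.LayerNorm(embed_dim))

per_layer = [torch.randn(batch, embed_dim) for _ in range(layer_num + 1)]
fused = proj_norm(torch.cat(per_layer, dim=-1))  # (batch, embed_dim)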
def __init__(self, embed_dim, att_type, att_heads, att_mid_dim, att_mid_drop):
    super(LowRank, self).__init__()
    self.embed_dim = embed_dim
    self.num_heads = att_heads
    self.head_dim = embed_dim // self.num_heads
    self.scaling = self.head_dim ** -0.5

    # GLU halves its input, so project to 2 * embed_dim when it is the activation
    output_dim = 2 * embed_dim if cfg.MODEL.BILINEAR.ACT == 'GLU' else embed_dim

    def in_proj():
        # one projection branch: Linear -> (optional) activation -> GroupNorm
        sequential = [nn.Linear(embed_dim, output_dim)]
        act = utils.activation(cfg.MODEL.BILINEAR.ACT)
        if act is not None:
            sequential.append(act)
        sequential.append(nn.GroupNorm(self.num_heads, embed_dim))
        return nn.Sequential(*sequential)

    self.in_proj_q = in_proj()
    self.in_proj_k = in_proj()
    self.in_proj_v1 = in_proj()
    self.in_proj_v2 = in_proj()

    self.attn_net = layers.create(att_type, att_mid_dim, att_mid_drop)
    self.clear_buffer()
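# --- Sketch: the low-rank bilinear interaction behind the four projections. ---
# Rather than a full bilinear form q^T W k, both sides are linearly projected
# and the interaction is taken element-wise per head. The plain scaled sum
# below is a simple stand-in for the mid-dim attention MLP that attn_net
# (created via layers.create) actually applies.
import torch

batch, heads, head_dim, regions = 2, 4, 8, 49
q = torch.randn(batch, heads, head_dim)            # projected query
k = torch.randn(batch, heads, regions, head_dim)   # projected keys
joint = q.unsqueeze(-2) * k                        # (batch, heads, regions, head_dim)
scores = joint.sum(-1) * head_dim ** -0.5          # scaled per-head logits
weights = scores.softmax(dim=-1)                   # attention over the regions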
def __init__(self, embed_dim, dropout, att_type, att_heads,
             att_mid_dim, att_mid_drop, bifeat_emb_act,
             bifeat_emb_drop, ff_dropout, last_layer=False):
    super(DecoderLayer, self).__init__()
    self.last_layer = last_layer
    self.word_attn = LowRank(
        embed_dim=embed_dim,
        att_type=att_type,
        att_heads=att_heads,
        att_mid_dim=att_mid_dim,
        att_mid_drop=att_mid_drop)
    self.word_dropout = nn.Dropout(dropout)

    self.cross_att = LowRank(
        embed_dim=embed_dim,
        att_type=att_type,
        att_heads=att_heads,
        att_mid_dim=att_mid_dim,
        att_mid_drop=att_mid_drop)
    self.cross_dropout = nn.Dropout(dropout)
    self.layer_norm_cross = nn.LayerNorm(embed_dim)

    if not self.last_layer:
        self.bifeat_emb = nn.Sequential(
            nn.Linear(2 * embed_dim, embed_dim),
            utils.activation(bifeat_emb_act),
            nn.Dropout(bifeat_emb_drop))
        self.layer_norm_x = nn.LayerNorm(embed_dim)
        self.ff_layer = blocks.create(
            'FeedForward',
            embed_dim=embed_dim,
            ffn_embed_dim=embed_dim * 4,
            relu_dropout=ff_dropout,
            dropout=ff_dropout)
    self.layer_norm_gx = nn.LayerNorm(embed_dim)
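# Note on last_layer: only the layers that feed a subsequent DecoderLayer
# need bifeat_emb / layer_norm_x / ff_layer to refresh the word stream; the
# final layer just hands its attended output (normalized by layer_norm_gx)
# to the generator, so those modules are skipped. This reading follows from
# the conditional construction above; the forward itself is not shown here.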
def __init__(self):
    super(XTransformer, self).__init__()
    self.vocab_size = cfg.MODEL.VOCAB_SIZE + 1

    # att_feats encoder
    sequential = []
    sequential.append(nn.Linear(cfg.MODEL.ATT_FEATS_DIM, cfg.MODEL.ATT_FEATS_EMBED_DIM))
    sequential.append(utils.activation(cfg.MODEL.ATT_FEATS_EMBED_ACT))
    if cfg.MODEL.ATT_FEATS_NORM:
        sequential.append(nn.LayerNorm(cfg.MODEL.ATT_FEATS_EMBED_DIM))
    if cfg.MODEL.DROPOUT_ATT_EMBED > 0:
        sequential.append(nn.Dropout(cfg.MODEL.DROPOUT_ATT_EMBED))
    self.att_embed = nn.Sequential(*sequential) if len(sequential) > 0 else None

    self.encoder = Encoder(
        embed_dim=cfg.MODEL.BILINEAR.DIM,
        dropout=cfg.MODEL.BILINEAR.ENCODE_DROPOUT,
        att_type=cfg.MODEL.BILINEAR.ATTTYPE,
        att_heads=cfg.MODEL.BILINEAR.HEAD,
        att_mid_dim=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DIM,
        att_mid_drop=cfg.MODEL.BILINEAR.ENCODE_ATT_MID_DROPOUT,
        bifeat_emb_act=cfg.MODEL.BILINEAR.BIFEAT_EMB_ACT,
        bifeat_emb_drop=cfg.MODEL.BILINEAR.ENCODE_BIFEAT_EMB_DROPOUT,
        ff_dropout=cfg.MODEL.BILINEAR.ENCODE_FF_DROPOUT,
        layer_num=cfg.MODEL.BILINEAR.ENCODE_LAYERS)
    self.decoder = Decoder(
        vocab_size=self.vocab_size,
        embed_dim=cfg.MODEL.BILINEAR.DIM,
        dropout=cfg.MODEL.BILINEAR.DECODE_DROPOUT,
        att_type=cfg.MODEL.BILINEAR.ATTTYPE,
        att_heads=cfg.MODEL.BILINEAR.HEAD,
        att_mid_dim=cfg.MODEL.BILINEAR.DECODE_ATT_MID_DIM,
        att_mid_drop=cfg.MODEL.BILINEAR.DECODE_ATT_MID_DROPOUT,
        bifeat_emb_act=cfg.MODEL.BILINEAR.BIFEAT_EMB_ACT,
        bifeat_emb_drop=cfg.MODEL.BILINEAR.DECODE_BIFEAT_EMB_DROPOUT,
        ff_dropout=cfg.MODEL.BILINEAR.DECODE_FF_DROPOUT,
        layer_num=cfg.MODEL.BILINEAR.DECODE_LAYERS)
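# --- Sketch: typical construction and data flow (hedged pseudocode). ---
# Assuming cfg has been loaded elsewhere, usage would look roughly like:
#
#     model = XTransformer()
#     att_feats = model.att_embed(raw_att_feats)               # project regions
#     gv_feat, att_feats = model.encoder(gv_feat, att_feats, att_mask)
#     logits = model.decoder(gv_feat, att_feats, att_mask, seq)
#
# Call signatures are assumptions; the Encoder/Decoder forward definitions
# are outside this excerpt.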