Example #1
    def __init__(self, config):
        super().__init__(config)

        self.prev_pred_embeddings = PrevPredEmbeddings(config)
        self.encoder = BertEncoder(config)
        # self.apply(self.init_weights)  # old versions of pytorch_transformers
        self.init_weights()
Example #2
    def __init__(self, config):
        super(BertModelDialog, self).__init__(config)

        self.embeddings = BertEmbeddingsDialog(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.init_weights()
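The constructors above only assemble the submodules; in pytorch_transformers-style models the matching forward pass expands the padding mask into an additive bias before handing it to BertEncoder, then pools the first token. Below is a minimal sketch of that call chain, not taken from the example itself: it assumes the usual torch import and that the dialog embeddings accept (input_ids, token_type_ids) like stock BertEmbeddings.

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        # hedged sketch of the standard driver around BertEncoder
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        # (batch, seq) -> (batch, 1, 1, seq); 0 entries become -10000.0 so the
        # softmax inside BertEncoder ignores padded positions
        extended_mask = attention_mask[:, None, None, :].to(dtype=next(self.parameters()).dtype)
        extended_mask = (1.0 - extended_mask) * -10000.0
        head_mask = [None] * self.config.num_hidden_layers

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoder_outputs = self.encoder(embedding_output, extended_mask, head_mask=head_mask)
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output)
        return sequence_output, pooled_output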
Example #3
    def __init__(self, config):
        super(SelfAttn, self).__init__()
        self.config = config
        self.hsize = 64
        self.atom_emb = nn.Embedding(5, 64)
        self.type_emb = nn.Embedding(15, 64)
        self.pos_emb = nn.Linear(3, 256, bias=False)
        self.dist_emb = nn.Linear(1, 64, bias=False)
        self.mu_emb = nn.Linear(1, 32, bias=False)  # dipole_moment

        self.attn = BertEncoder(config)

        def get_reg_layer(output_size):
            return nn.Sequential(
                nn.Linear(config.hidden_size, config.hidden_size),
                nn.LayerNorm(config.hidden_size),
                nn.LeakyReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.hidden_size, config.hidden_size),
                nn.LayerNorm(config.hidden_size),
                nn.LeakyReLU(),
                nn.Dropout(config.hidden_dropout_prob),
                nn.Linear(config.hidden_size, output_size),
            )

        self.reg_layers4 = nn.ModuleList([get_reg_layer(4) for _ in range(9)])
        self.reg_layers1 = nn.ModuleList([get_reg_layer(1) for _ in range(9)])
        # not currently used.
        self.reg_aux = None
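In Example #3, BertEncoder is borrowed as a generic self-attention block over per-atom features rather than over token embeddings, with separate 4-dim and 1-dim regression heads per target group. A rough sketch of how the pieces could be driven follows; the fusion of the atom/type/position/distance embeddings into config.hidden_size is not shown in the example, so the fused tensor and argument names below are assumptions.

    def forward(self, fused_atom_states, atom_mask):
        # hypothetical driver: fused_atom_states is assumed to already have
        # shape (batch, n_atoms, config.hidden_size)
        extended_mask = (1.0 - atom_mask[:, None, None, :].float()) * -10000.0
        head_mask = [None] * self.config.num_hidden_layers
        hidden = self.attn(fused_atom_states, extended_mask, head_mask=head_mask)[0]
        # one 4-dim and one 1-dim regression head per group (9 of each here)
        preds4 = [head(hidden) for head in self.reg_layers4]
        preds1 = [head(hidden) for head in self.reg_layers1]
        return preds4, preds1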
Example #4
 def __init__(self, config, depth=None):
     super(CustomBertModel, self).__init__(config)
     self.depth = depth
     self.embeddings = BertEmbeddings(config)
     self.encoder = BertEncoder(config)
     self.cls = BertPreTrainingHeads(config)
     self.apply(self.init_weights)
Example #5
 def __init__(self, config):
     super(PatientLevelBert, self).__init__()
     self.config = config
     self.embeddings = PatientLevelEmbedding(config)
     self.encoder = BertEncoder(config)
     self.pooler = BertPooler(config)
     self.apply(self.init_weights)
Example #6
    def __init__(self, config):
        super(BertModel, self).__init__(config)

        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)

        self.apply(self.init_weights)
Example #7
    def __init__(self, config, gat_config):
        super().__init__(config)

        self.prev_pred_embeddings = PrevPredEmbeddings(config)
        # self.ggcn = QCGATLayers(config.hidden_size, gat_config.num_gat_layers) #
        self.ggcn = QVGATLayers(config.hidden_size,
                                gat_config.num_gat_layers)  #
        self.encoder = BertEncoder(config)
        # self.apply(self.init_weights)  # old versions of pytorch_transformers
        self.init_weights()
Example #8
 def __init__(self, args, adapter_config):
     super(Adapter, self).__init__()
     self.adapter_config = adapter_config
     self.args = args
     self.down_project = nn.Linear(
         self.adapter_config.project_hidden_size,
         self.adapter_config.adapter_size,
     )
     self.encoder = BertEncoder(self.adapter_config)
     self.up_project = nn.Linear(self.adapter_config.adapter_size, adapter_config.project_hidden_size)
     self.init_weights()
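Example #8 is the familiar bottleneck-adapter layout: project the backbone's hidden states down to adapter_size, run them through a small BertEncoder, and project back up. The sketch below follows that common recipe rather than the repository's actual code; the residual add, the mask argument, and the assumption that adapter_config.hidden_size equals adapter_size are all hedged guesses.

 def forward(self, hidden_states, extended_attention_mask):
     # hypothetical adapter forward: bottleneck -> small BertEncoder -> expand
     down_projected = self.down_project(hidden_states)
     head_mask = [None] * self.adapter_config.num_hidden_layers
     encoded = self.encoder(down_projected, extended_attention_mask, head_mask=head_mask)[0]
     up_projected = self.up_project(encoded)
     # residual connection back onto the backbone's hidden states
     return hidden_states + up_projected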
Example #9
    def __init__(self, config):
        super().__init__(config)

        self.prev_pred_embeddings = PrevPredEmbeddings(config)
        # self.ggcn = GatedGraphConvNet(768) # 40.47 -- 40.76
        # self.ggcn = MultiHeadGraphAttNet(768) # 39.86
        # self.ggcn = BaseGraphAttNet(768) # 39.57
        self.ggcn = QuestionConditionedGAT(768, 0.15)  # 40.99
        self.encoder = BertEncoder(config)
        # self.apply(self.init_weights)  # old versions of pytorch_transformers
        self.init_weights()
Example #10
 def __init__(self, cfg):
     super(LSTMATTNModel, self).__init__()
     self.cfg = cfg
     cate_col_size = len(cfg.cate_cols)
     cont_col_size = len(cfg.cont_cols)
     self.cate_emb = nn.Embedding(cfg.total_cate_size, cfg.emb_size, padding_idx=0)        
     self.cate_proj = nn.Sequential(
         nn.Linear(cfg.emb_size*cate_col_size, cfg.hidden_size//2),
         nn.LayerNorm(cfg.hidden_size//2),
     )        
     self.cont_emb = nn.Sequential(                
         nn.Linear(cont_col_size, cfg.hidden_size//2),
         nn.LayerNorm(cfg.hidden_size//2),
     )
     
     self.encoder = nn.LSTM(cfg.hidden_size, 
                         cfg.hidden_size, 1, dropout=cfg.dropout, batch_first=True)
     
     self.config = BertConfig( 
         3, # not used
         hidden_size=cfg.hidden_size,
         num_hidden_layers=1,
         num_attention_heads=cfg.nheads,
         intermediate_size=cfg.hidden_size,
         hidden_dropout_prob=cfg.dropout,
         attention_probs_dropout_prob=cfg.dropout,
     )
     self.attn = BertEncoder(self.config)                 
     
     def get_reg():
         return nn.Sequential(
             nn.Linear(cfg.hidden_size, cfg.hidden_size),
             nn.LayerNorm(cfg.hidden_size),
             nn.Dropout(cfg.dropout),
             nn.ReLU(),
             nn.Linear(cfg.hidden_size, cfg.hidden_size),
             nn.LayerNorm(cfg.hidden_size),
             nn.Dropout(cfg.dropout),
             nn.ReLU(),
             nn.Linear(cfg.hidden_size, cfg.target_size),
         )
     self.reg_layer = get_reg()
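Example #10 uses BertEncoder purely as a self-attention layer on top of LSTM outputs, which is why a fresh BertConfig is built in-line instead of loading pretrained weights. Here is a hedged sketch of the forward that typically accompanies this layout; the argument names and the last-timestep readout are assumptions, not part of the example.

 def forward(self, cate_x, cont_x, mask):
     batch_size, seq_len = cate_x.size(0), cate_x.size(1)
     # embed the categorical columns, flatten per step, project to hidden_size // 2
     cate_emb = self.cate_emb(cate_x).view(batch_size, seq_len, -1)
     cate_emb = self.cate_proj(cate_emb)
     # project the continuous columns to the other half and concatenate
     cont_emb = self.cont_emb(cont_x)
     seq_emb = torch.cat([cate_emb, cont_emb], dim=-1)

     output, _ = self.encoder(seq_emb)

     # standard additive attention mask for BertEncoder
     extended_mask = (1.0 - mask[:, None, None, :].to(output.dtype)) * -10000.0
     head_mask = [None] * self.config.num_hidden_layers
     sequence_output = self.attn(output, extended_mask, head_mask=head_mask)[0]

     # predict from the last timestep of each sequence
     return self.reg_layer(sequence_output[:, -1])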
Example #11
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        # from pytorch_transformers import RobertaModel
        from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.roberta_model.startswith('roberta'):
            self.roberta = RobertaCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(args.roberta_model)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                args.roberta_model)
        else:
            self.roberta = BertCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(args.roberta_model)
            self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)
        self.config.output_attentions = True
        self.roberta.pooler.dense.weight.requires_grad = False
        self.roberta.pooler.dense.bias.requires_grad = False

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        # self.embed_tokens = embed_tokens
        # self.embed_scale = math.sqrt(embed_dim)

        self.args = args

        # if args.sentence_transformer_arch == 'fairseq':
        #     self.padding_idx = embed_tokens.padding_idx

        #     self.sent_embed_positions = PositionalEmbedding(
        #         1024, embed_dim, self.padding_idx,
        #         left_pad=False,
        #         learned=args.encoder_learned_pos,
        #     )

        #     self.doc_layers = nn.ModuleList([])
        #     self.doc_layers.extend([
        #         TransformerEncoderLayer(args)
        #         for i in range(args.encoder_layers)
        #     ])
        if args.sentence_transformer_arch == 'bert':
            # from pytorch_transformers import RobertaConfig, RobertaTokenizer

            # self.config = RobertaConfig.from_pretrained(args.roberta_model)
            # self.config.output_attentions = True
            # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            embed_dim = self.config.hidden_size
            print('*** padding idx before ***', embed_tokens.padding_idx)
            self.padding_idx = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.pad_token)
            print('*** padding idx after ***', self.padding_idx)

            # let's assume each document has at most 128-self.padding_idx-1 sentences
            # in case of roberta, it is 126
            self.sent_position_embeddings = nn.Embedding(128, embed_dim)
            if args.encoder_layers:
                self.config.num_hidden_layers = args.encoder_layers
            if args.dropout:
                self.config.hidden_dropout_prob = args.dropout
            if args.attention_dropout:
                self.config.attention_probs_dropout_prob = args.attention_dropout
            if args.attn_type == 'attn_score':
                self.sent_encoder = AttnScoreBertEncoder(self.config)
            elif args.attn_type == 'attn_prob':
                self.sent_encoder = BertEncoder(self.config)
            else:
                raise Exception('--attn-type doesn\'t support {} yet !'.format(
                    args.attn_type))
            self.sent_encoder.apply(self._init_weights)

            print('*** sentence encoder config ***')
            print(self.config)
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    args.sentence_transformer_arch))
Example #12
class TransformerEncoder(FairseqEncoder):
    """Transformer encoder."""
    def __init__(self, args, dictionary, embed_tokens, left_pad=False):
        super().__init__(dictionary)
        self.dropout = args.dropout

        # from pytorch_transformers import RobertaModel
        from fairseq.modules.roberta_causal_mask import RobertaCasulMaskModel, BertCasulMaskModel
        from pytorch_transformers.file_utils import PYTORCH_TRANSFORMERS_CACHE
        from pytorch_transformers import RobertaConfig, RobertaTokenizer, BertConfig, BertTokenizer

        if args.roberta_model.startswith('roberta'):
            self.roberta = RobertaCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = RobertaConfig.from_pretrained(args.roberta_model)
            self.tokenizer = RobertaTokenizer.from_pretrained(
                args.roberta_model)
        else:
            self.roberta = BertCasulMaskModel.from_pretrained(
                args.roberta_model,
                cache_dir=PYTORCH_TRANSFORMERS_CACHE /
                'distributed_{}'.format(args.distributed_rank))
            self.config = BertConfig.from_pretrained(args.roberta_model)
            self.tokenizer = BertTokenizer.from_pretrained(args.roberta_model)
        self.config.output_attentions = True
        self.roberta.pooler.dense.weight.requires_grad = False
        self.roberta.pooler.dense.bias.requires_grad = False

        embed_dim = embed_tokens.embedding_dim
        self.padding_idx = embed_tokens.padding_idx

        # self.embed_tokens = embed_tokens
        # self.embed_scale = math.sqrt(embed_dim)

        self.args = args

        # if args.sentence_transformer_arch == 'fairseq':
        #     self.padding_idx = embed_tokens.padding_idx

        #     self.sent_embed_positions = PositionalEmbedding(
        #         1024, embed_dim, self.padding_idx,
        #         left_pad=False,
        #         learned=args.encoder_learned_pos,
        #     )

        #     self.doc_layers = nn.ModuleList([])
        #     self.doc_layers.extend([
        #         TransformerEncoderLayer(args)
        #         for i in range(args.encoder_layers)
        #     ])
        if args.sentence_transformer_arch == 'bert':
            # from pytorch_transformers import RobertaConfig, RobertaTokenizer

            # self.config = RobertaConfig.from_pretrained(args.roberta_model)
            # self.config.output_attentions = True
            # self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            embed_dim = self.config.hidden_size
            print('*** padding idx before ***', embed_tokens.padding_idx)
            self.padding_idx = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.pad_token)
            print('*** padding idx after ***', self.padding_idx)

            # let's assume each document has at most 128-self.padding_idx-1 sentences
            # in case of roberta, it is 126
            self.sent_position_embeddings = nn.Embedding(128, embed_dim)
            if args.encoder_layers:
                self.config.num_hidden_layers = args.encoder_layers
            if args.dropout:
                self.config.hidden_dropout_prob = args.dropout
            if args.attention_dropout:
                self.config.attention_probs_dropout_prob = args.attention_dropout
            if args.attn_type == 'attn_score':
                self.sent_encoder = AttnScoreBertEncoder(self.config)
            elif args.attn_type == 'attn_prob':
                self.sent_encoder = BertEncoder(self.config)
            else:
                raise Exception('--attn-type doesn\'t support {} yet !'.format(
                    args.attn_type))
            self.sent_encoder.apply(self._init_weights)

            print('*** sentence encoder config ***')
            print(self.config)
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    args.sentence_transformer_arch))

    def _init_weights(self, module):
        """ Initialize the weights """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0,
                                       std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self,
                src_tokens,
                segment_ids,
                doc_pad_mask,
                doc_pos_tok,
                cls_pos,
                attention_mask=None):
        # if self.args.sentence_transformer_arch == 'fairseq':
        #     bsz, seqlen = src_tokens.size()

        #     # compute padding mask
        #     attention_mask = src_tokens.ne(self.padding_idx)
        #     # enc_hids, _ = self.bert(src_tokens, segment_ids, attention_mask, output_all_encoded_layers=False)
        #     all_hids = self.roberta(src_tokens, segment_ids, attention_mask)
        #     # print('all_hids', all_hids.size())
        #     enc_hids = all_hids[0]
        #     doc_pos = self.sent_embed_positions(doc_pos_tok)

        #     sent_repr = get_sent_end_repr(enc_hids, cls_pos)

        #     sent_repr = sent_repr + doc_pos
        #     # n_sent x bsz x C
        #     sent_repr = sent_repr.transpose(0, 1)
        #     for doc_layer in self.doc_layers:
        #         sent_repr = doc_layer(sent_repr, doc_pad_mask)

        #     return {
        #         'encoder_out': sent_repr,  # n_sent x bsz x C
        #         'encoder_padding_mask': doc_pad_mask,  # bsz x n_sent
        #     }
        if self.args.sentence_transformer_arch == 'bert':
            bsz, seqlen = src_tokens.size()

            doclen = cls_pos.size(1)
            position_ids = torch.arange(1 + self.padding_idx,
                                        doclen + 1 + self.padding_idx,
                                        dtype=torch.long,
                                        device=cls_pos.device)
            position_ids = position_ids.unsqueeze(0).expand_as(cls_pos)
            doc_pos = self.sent_position_embeddings(position_ids)

            # compute padding mask
            if attention_mask is None:
                attention_mask = src_tokens.ne(self.padding_idx)

            # seq_len = src_tokens.shape[1]
            # while seq_len >= self.roberta.embeddings.position_embeddings.weight.shape[0] - self.roberta.embeddings.padding_idx:
            #     old_num_pos = self.roberta.embeddings.position_embeddings.weight.shape[0]
            #     print('| WARNING: longer than {}, expand the position embedding to {}'.format(old_num_pos, old_num_pos+512))
            #     num_pos = old_num_pos + 512
            #     embed_dim = self.roberta.embeddings.position_embeddings.weight.shape[1]
            #     new_embeddings = torch.nn.Embedding(num_pos, embed_dim)
            #     new_embeddings.to(self.roberta.embeddings.position_embeddings.weight.device)
            #     new_embeddings.to(self.roberta.embeddings.position_embeddings.weight.dtype)
            #     new_embeddings.weight.data[:old_num_pos, :] = self.roberta.embeddings.position_embeddings.weight.data[:old_num_pos, :]
            #     new_embeddings.weight.data[old_num_pos:, :] = self.roberta.embeddings.position_embeddings.weight.data[-512:, :]
            #     self.roberta.embeddings.position_embeddings = new_embeddings
            # self.roberta.embeddings.position_embeddings.weight.fill_(0)

            all_hids = self.roberta(src_tokens, segment_ids, attention_mask)
            enc_hids = all_hids[0]

            sent_repr = get_sent_end_repr(enc_hids, cls_pos)

            sent_repr = sent_repr + doc_pos

            head_mask = [None] * self.config.num_hidden_layers

            extended_doc_mask = doc_pad_mask.unsqueeze(1).unsqueeze(2)
            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
            # masked positions, this operation will create a tensor which is 0.0 for
            # positions we want to attend and -10000.0 for masked positions.
            # Since we are adding it to the raw scores before the softmax, this is
            # effectively the same as removing these entirely.
            extended_doc_mask = extended_doc_mask.to(dtype=next(
                self.parameters()).dtype)  # fp16 compatibility
            extended_doc_mask = extended_doc_mask * -10000.0

            all_hids_doc = self.sent_encoder(sent_repr, extended_doc_mask,
                                             head_mask)
            sent_repr_given_doc = all_hids_doc[0]
            attn_weights = all_hids_doc[1]

            return {
                'encoder_out': sent_repr_given_doc,  # bsz x n_sent x C
                'attn_weights': attn_weights,
                'encoder_doc_mask': doc_pad_mask,  # bsz x n_sent
            }
        else:
            raise Exception(
                '--sentence-transformer-arch doesn\'t support {} yet!'.format(
                    self.args.sentence_transformer_arch))

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        if encoder_out_dict['encoder_out'] is not None:
            encoder_out_dict['encoder_out'] = \
                encoder_out_dict['encoder_out'].index_select(1, new_order)
        if encoder_out_dict['encoder_padding_mask'] is not None:
            encoder_out_dict['encoder_padding_mask'] = \
                encoder_out_dict['encoder_padding_mask'].index_select(0, new_order)
        return encoder_out_dict

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        # return self.embed_positions.max_positions()
        return 10240

    def upgrade_state_dict(self, state_dict):
        '''
        if isinstance(self.embed_positions, SinusoidalPositionalEmbedding):
            if 'encoder.embed_positions.weights' in state_dict:
                del state_dict['encoder.embed_positions.weights']
            if 'encoder.embed_positions._float_tensor' not in state_dict:
                state_dict['encoder.embed_positions._float_tensor'] = torch.FloatTensor()
        '''
        return state_dict
Example #13
    def __init__(self, config):
        super().__init__(config)

        self.encoder = BertEncoder(config)
        # self.apply(self.init_weights)  # old versions of pytorch_transformers
        self.init_weights()
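One recurring detail across these examples is how weight initialization is invoked: older pytorch_transformers releases expected self.apply(self.init_weights), as in Examples #4, #5 and #6, while newer releases provide a self.init_weights() method that applies the initializer to every submodule itself, which is what the commented-out lines in Examples #1, #7, #9 and #13 refer to.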