def test():
    config_ = BertConfig.from_json_file(MODEL_PATH + 'bert_config.json')

    # device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # model.bert.load_state_dict(torch.load(args.init_checkpoint, map_location='cpu'))

    from transformers import BertTokenizer

    # tokenizer = BertTokenizer(MODEL_PATH + 'vocab.txt')
    # inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    # print(inputs)

    model_ = BertModel.from_pretrained(MODEL_PATH, config=config_)

    for name, param in model_.named_parameters():
        print(name)
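
# A hedged follow-up sketch (assuming the same MODEL_PATH directory and the
# standard transformers API): report how many parameters are trainable instead
# of only printing their names.
def count_parameters():
    from transformers import BertModel

    model_ = BertModel.from_pretrained(MODEL_PATH)
    total = sum(p.numel() for p in model_.parameters())
    trainable = sum(p.numel() for p in model_.parameters() if p.requires_grad)
    print('total parameters:    ', total)
    print('trainable parameters:', trainable)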
Example #2
def get_pretrained_model(path, logger, args=None):
    logger.info('load pretrained model in {}'.format(path))
    bert_tokenizer = BertTokenizer.from_pretrained(path)
    
    if args is None or args.hidden_layers == 12:
        bert_config = BertConfig.from_pretrained(path)
        bert_model = BertModel.from_pretrained(path)

    else:
        logger.info('load {} layers bert'.format(args.hidden_layers))
        bert_config = BertConfig.from_pretrained(path, num_hidden_layers=args.hidden_layers)
        bert_model = BertModel(bert_config)
        model_param_list = [p[0] for p in bert_model.named_parameters()]
        load_dict = torch.load(os.path.join(path, 'pytorch_model.bin'),
                               map_location='cpu')
        new_load_dict = {}
        for k, v in load_dict.items():
            k = k.replace('bert.', '')
            if k in model_param_list:
                new_load_dict[k] = v
        new_load_dict['embeddings.position_ids'] = torch.arange(
            bert_config.max_position_embeddings).unsqueeze(dim=0)
        bert_model.load_state_dict(new_load_dict)

    logger.info('load complete')
    return bert_config, bert_tokenizer, bert_model
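
# Hypothetical usage sketch for get_pretrained_model: the checkpoint directory,
# the logger and the args namespace (with its hidden_layers field) are
# placeholders, not part of the original code; the directory is assumed to hold
# vocab.txt, config.json and pytorch_model.bin.
import argparse
import logging

logging.basicConfig(level=logging.INFO)
demo_logger = logging.getLogger('bert_loader')
demo_args = argparse.Namespace(hidden_layers=6)

config, tokenizer, model = get_pretrained_model('./bert-base-uncased',
                                                demo_logger, args=demo_args)
print(config.num_hidden_layers)  # 6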
Example #3
class BertLSTM(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)

        self.bert = BertModel(config)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.lstm = nn.LSTM(input_size=config.hidden_size,
                            hidden_size=config.hidden_size,
                            num_layers=1,
                            dropout=0,
                            batch_first=True,
                            bidirectional=False)

        self.fc = nn.Linear(config.hidden_size * 3, config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.tanh = nn.Tanh()
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "embed":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name:
                    param.requires_grad = True

        if config.unfreeze == "embed_enc0":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name or "encoder.layer.0" in name:
                    param.requires_grad = True

        if config.unfreeze == "embed_enc0_pooler":
            for name, param in self.bert.named_parameters():
                if "embeddings" in name or "encoder.layer.0" in name or "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc0":
            for name, param in self.bert.named_parameters():
                if "encoder.layer.0" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc0_pooler":
            for name, param in self.bert.named_parameters():
                if "encoder.layer.0" in name or "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "embed_pooler":
            for name, param in self.bert.named_parameters():
                if "embed" in name or "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, num_chunks, 3, max_chunk_len]            
        Returns:
            out: [batch_size, output_dim]       
            
        """
        batch_size = doc.shape[0]

        pooled = self.bert(input_ids=doc[0, :, 0],
                           attention_mask=doc[0, :, 1],
                           token_type_ids=doc[0, :, 2])[1].unsqueeze(0)

        for i in range(batch_size - 1):
            # Output of BertModel: (last_hidden_state, pooler_output, hidden_states, attentions)
            # Last layer hidden-state of the first token of the sequence (classification token)
            pool_i = self.bert(input_ids=doc[i + 1, :, 0],
                               attention_mask=doc[i + 1, :, 1],
                               token_type_ids=doc[i + 1, :, 2])[1]
            pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)

        dp = self.dropout(pooled)  # [batch_size, num_chunks, hidden_size]
        # output: [batch_size, num_chunks, n_directions*hidden_size], output features from last layer for each t
        # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
        # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
        output, (h_n, c_n) = self.lstm(dp)

        # Concat pooling
        # h_n = output[:,-1,].squeeze(1)  # [batch_size, hidden_size]
        h_n = h_n.squeeze(0)  # [batch_size, hidden_size]
        h_max = torch.max(output, dim=1).values  # [batch_size, hidden_size]
        h_mean = torch.mean(output, dim=1)  # [batch_size, hidden_size]
        out = torch.cat((h_n, h_max, h_mean),
                        dim=1)  # [batch_size, hidden_size*3]

        out = self.fc(out)  # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)  # [batch_size, num_labels]
        # out = self.tanh(out)   # [batch_size, num_labels]

        return out
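
# A hedged smoke-test sketch for BertLSTM: weights are randomly initialized (no
# pretrained checkpoint) and the tiny config values and chunk sizes below are
# placeholders chosen only to exercise the forward pass.
import torch
from transformers import BertConfig

lstm_config = BertConfig(hidden_size=128, num_hidden_layers=2,
                         num_attention_heads=2, intermediate_size=256,
                         num_labels=2)
lstm_config.unfreeze = "pooler"  # custom attribute read in __init__

lstm_model = BertLSTM(lstm_config).eval()

batch_size, num_chunks, max_chunk_len = 2, 4, 32
input_ids = torch.randint(0, lstm_config.vocab_size,
                          (batch_size, num_chunks, max_chunk_len))
doc = torch.stack([input_ids,
                   torch.ones_like(input_ids),    # attention_mask
                   torch.zeros_like(input_ids)],  # token_type_ids
                  dim=2)  # [batch_size, num_chunks, 3, max_chunk_len]

with torch.no_grad():
    print(lstm_model(doc).shape)  # torch.Size([2, 2])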
Example #4
class BertPoolLSTM(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)

        self.config = config
        self.bert = BertModel(config)

        # self.seq_summary = SequenceSummary(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.lstm = nn.LSTM(input_size=config.hidden_size,
                            hidden_size=config.hidden_size,
                            num_layers=1,
                            dropout=0,
                            batch_first=True,
                            bidirectional=False)

        self.fc = nn.Linear(config.hidden_size, config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, n_chunks, 3, max_chunk_len]     
                 n_chunks is the number of chunks within the batch (same for each doc after PadDoc)
        Returns:
            out: [batch_size, output_dim]       
            
        """
        batch_size = doc.shape[0]

        hidden_pooled_layers = []

        for k in range(batch_size):
            # Each doc is considered as a special 'batch' and each chunk is an element of the special 'bert_batch'
            # n_chunks is the temporary 'bert_batch_size', max_chunk_len corresponds to 'seq_len'
            bert_output_k = self.bert(
                input_ids=doc[k, :, 0],  # [n_chunks, max_chunk_len]
                attention_mask=doc[k, :, 1],
                token_type_ids=doc[k, :, 2])
            # pooled_k = bert_output_k[1].unsqueeze(0)
            hidden_states_k = bert_output_k[
                2]  # each element in the tuple: [n_chunks, max_chunk_len, hidden_size]

            # Average pooling over the last hidden-state layers: pool_layers is
            # used as a slice start, so a negative value such as -4 keeps the
            # last four layers
            hidden_list_k = list(hidden_states_k[self.config.pool_layers:])
            hidden_stack_k = torch.stack(
                hidden_list_k
            )  # [n_pooled_layers, n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers_k = torch.mean(
                hidden_stack_k,
                dim=0)  # [n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers.append(hidden_pooled_layers_k)

        hidden_pooled_layers = torch.stack(
            hidden_pooled_layers
        )  # [batch_size, n_chunks, max_chunk_len, hidden_size]
        # Pooling within each chunk (over 512 word tokens of individual chunk)
        if self.config.pool_method == 'mean':
            hidden_pooled = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'max':
            hidden_pooled = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'mean_max':
            hidden_pooled_mean = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
            hidden_pooled_max = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
            hidden_pooled = torch.cat(
                (hidden_pooled_mean, hidden_pooled_max),
                dim=1)  # [batch_size, n_chunks*2, hidden_size]
        elif self.config.pool_method == 'cls':
            hidden_pooled = hidden_pooled_layers[:, :,
                                                 0, :]  # [batch_size, n_chunks, hidden_size]
        else:  # pool_method is None
            hidden_pooled = hidden_pooled_layers.view(
                batch_size, -1, self.config.hidden_size
            )  # [batch_size, n_chunks*max_chunk_len, hidden_size]

        dp = self.dropout(hidden_pooled)  # [batch_size, ?, hidden_size]
        # ? can be n_chunks, n_chunks*2 or n_chunks*max_chunk_len)
        # output: [batch_size, ?, n_directions*hidden_size], output features from last layer for each t
        # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
        # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
        output, (h_n, c_n) = self.lstm(dp)

        h_n = h_n.squeeze(0)  # [batch_size, hidden_size]

        out = self.fc(h_n)  # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)  # [batch_size, num_labels]

        return out
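
# A hedged smoke-test sketch for BertPoolLSTM (all values are placeholders):
# output_hidden_states must be enabled so that bert_output_k[2] exists, and
# pool_layers acts as a slice start, so a negative value keeps the last layers.
import torch
from transformers import BertConfig

pool_config = BertConfig(hidden_size=128, num_hidden_layers=4,
                         num_attention_heads=2, intermediate_size=256,
                         num_labels=3, output_hidden_states=True)
pool_config.unfreeze = "pooler"
pool_config.pool_layers = -2     # average the last two hidden-state layers
pool_config.pool_method = "mean"

pool_model = BertPoolLSTM(pool_config).eval()

doc = torch.stack([torch.randint(0, pool_config.vocab_size, (2, 4, 32)),
                   torch.ones(2, 4, 32, dtype=torch.long),
                   torch.zeros(2, 4, 32, dtype=torch.long)],
                  dim=2)  # [batch_size, n_chunks, 3, max_chunk_len]
with torch.no_grad():
    print(pool_model(doc).shape)  # torch.Size([2, 3])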
Example #5
class BertPoolConv(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)

        self.config = config
        self.bert = BertModel(config)

        # self.seq_summary = SequenceSummary(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=config.num_filters,
                      kernel_size=(fsize, config.hidden_size))
            for fsize in config.filter_sizes
        ])

        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes),
                            config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, n_chunks, 3, max_chunk_len]     
                 n_chunks is the number of chunks within the batch (same for each doc after PadDoc)
        Returns:
            out: [batch_size, output_dim]       
            
        """
        batch_size = doc.shape[0]

        hidden_pooled_layers = []

        for k in range(batch_size):
            # Each doc is considered as a special 'batch' and each chunk is an element of the special 'bert_batch'
            # n_chunks is the temporary 'bert_batch_size', max_chunk_len corresponds to 'seq_len'
            bert_output_k = self.bert(
                input_ids=doc[k, :, 0],  # [n_chunks, max_chunk_len]
                attention_mask=doc[k, :, 1],
                token_type_ids=doc[k, :, 2])
            # pooled_k = bert_output_k[1].unsqueeze(0)
            hidden_states_k = bert_output_k[
                2]  # each element in the tuple: [n_chunks, max_chunk_len, hidden_size]

            # Average pooling over the last hidden-state layers: pool_layers is
            # used as a slice start, so a negative value such as -4 keeps the
            # last four layers
            hidden_list_k = list(hidden_states_k[self.config.pool_layers:])
            hidden_stack_k = torch.stack(
                hidden_list_k
            )  # [n_pooled_layers, n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers_k = torch.mean(
                hidden_stack_k,
                dim=0)  # [n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers.append(hidden_pooled_layers_k)

        hidden_pooled_layers = torch.stack(
            hidden_pooled_layers
        )  # [batch_size, n_chunks, max_chunk_len, hidden_size]
        # Pooling within each chunk (over 512 word tokens of individual chunk)
        if self.config.pool_method == 'mean':
            hidden_pooled = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'max':
            hidden_pooled = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'mean_max':
            hidden_pooled_mean = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
            hidden_pooled_max = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
            hidden_pooled = torch.cat(
                (hidden_pooled_mean, hidden_pooled_max),
                dim=1)  # [batch_size, n_chunks*2, hidden_size]
        elif self.config.pool_method == 'cls':
            hidden_pooled = hidden_pooled_layers[:, :,
                                                 0, :]  # [batch_size, n_chunks, hidden_size]
        else:  # pool_method is None
            hidden_pooled = hidden_pooled_layers.view(
                batch_size, -1, self.config.hidden_size
            )  # [batch_size, n_chunks*max_chunk_len, hidden_size]

        hidden_pooled = hidden_pooled.unsqueeze(
            1)  # [batch_size, 1, ?, hidden_size]
        hidden_conved = [
            F.relu(conv(hidden_pooled)) for conv in self.convs
        ]  # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1), 1]
        hidden_conved = [
            conv.squeeze(3) for conv in hidden_conved
        ]  # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1)]
        hc_pooled = [
            F.max_pool1d(conv, conv.shape[2]) for conv in hidden_conved
        ]  # hc_pooled[i]: [batch_size, n_filters, 1]
        hc_pooled = [pool.squeeze(2) for pool in hc_pooled
                     ]  # hc_pooled[i]: [batch_size, n_filters]

        cat = torch.cat(hc_pooled,
                        dim=1)  # [batch_size, n_filters * len(filter_sizes)]
        dp = self.dropout(cat)
        out = self.fc(dp)  # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)  # [batch_size, num_labels]

        return out
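
# Configuration sketch for BertPoolConv (values are placeholders): in addition
# to the attributes BertPoolLSTM reads, the convolutional head expects
# num_filters and filter_sizes on the config, and every filter size must not
# exceed the number of chunk vectors it convolves over. Feeding a chunked doc
# tensor of shape [batch_size, n_chunks, 3, max_chunk_len] then yields
# [batch_size, num_labels], as in the previous example.
from transformers import BertConfig

conv_config = BertConfig(hidden_size=128, num_hidden_layers=4,
                         num_attention_heads=2, intermediate_size=256,
                         num_labels=3, output_hidden_states=True)
conv_config.unfreeze = "pooler"
conv_config.pool_layers = -2       # average the last two hidden-state layers
conv_config.pool_method = "max"    # one vector per chunk
conv_config.num_filters = 32       # out_channels of every Conv2d
conv_config.filter_sizes = (2, 3)  # kernel heights along the chunk dimension
conv_model = BertPoolConv(conv_config).eval()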
Example #6
    'hidden_act': 'gelu',
    'hidden_dropout_prob': 0.1,
    'hidden_size': 768,
    'initializer_range': 0.02,
    'intermediate_size': 3072,
    'max_position_embeddings': 512,
    'num_attention_heads': 12,
    'num_hidden_layers': 12,
    'type_vocab_size': 2,
    'vocab_size': 8002
}

if __name__ == "__main__":
    ctx = "cpu"
    # kobert
    kobert_model_file = "./kobert_resources/pytorch_kobert_2439f391a6.params"
    kobert_vocab_file = "./kobert_resources/kobert_news_wiki_ko_cased-ae5711deb3.spiece"

    bertmodel = BertModel(config=BertConfig.from_dict(bert_config))
    bertmodel.load_state_dict(torch.load(kobert_model_file))
    device = torch.device(ctx)
    bertmodel.to(device)
    # bertmodel.eval()

    # for name, param in bertmodel.named_parameters():
    #     print(name, param.shape)

    for name, param in bertmodel.named_parameters():
        if param.requires_grad:
            print(name, param.shape)
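
    # A hedged sanity-check sketch: run one dummy batch through the loaded model
    # (the token ids are random, not real KoBERT SentencePiece ids) and confirm
    # the output shapes match the 8002-token, 768-dim config above.
    bertmodel.eval()
    dummy_ids = torch.randint(0, bert_config['vocab_size'], (1, 32),
                              device=device)
    dummy_mask = torch.ones_like(dummy_ids)
    with torch.no_grad():
        last_hidden, pooled = bertmodel(input_ids=dummy_ids,
                                        attention_mask=dummy_mask)[:2]
    print(last_hidden.shape)  # torch.Size([1, 32, 768])
    print(pooled.shape)       # torch.Size([1, 768])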
Example #7
class DocumentBert(BertPreTrainedModel):
    def __init__(self, bert_model_config: BertConfig):
        super(DocumentBert, self).__init__(bert_model_config)
        self.bert_patent = BertModel(bert_model_config)
        self.bert_tsd = BertModel(bert_model_config)

        for param in self.bert_patent.parameters():
            param.requires_grad = False

        for param in self.bert_tsd.parameters():
            param.requires_grad = False

        self.bert_batch_size = self.bert_patent.config.bert_batch_size
        self.dropout_patent = torch.nn.Dropout(
            p=bert_model_config.hidden_dropout_prob)
        self.dropout_tsd = torch.nn.Dropout(
            p=bert_model_config.hidden_dropout_prob)

        self.lstm_patent = torch.nn.LSTM(bert_model_config.hidden_size,
                                         bert_model_config.hidden_size)
        self.lstm_tsd = torch.nn.LSTM(bert_model_config.hidden_size,
                                      bert_model_config.hidden_size)

        self.output = torch.nn.Linear(bert_model_config.hidden_size * 2,
                                      out_features=1)

    def forward(self,
                patent_batch: torch.Tensor,
                tsd_batch: torch.Tensor,
                device='cuda'):

        #patent
        bert_output_patent = torch.zeros(
            size=(patent_batch.shape[0],
                  min(patent_batch.shape[1], self.bert_batch_size),
                  self.bert_patent.config.hidden_size),
            dtype=torch.float,
            device=device)
        for doc_id in range(patent_batch.shape[0]):
            bert_output_patent[
                doc_id][:self.bert_batch_size] = self.dropout_patent(
                    self.bert_patent(
                        patent_batch[doc_id][:self.bert_batch_size, 0],
                        token_type_ids=patent_batch[doc_id]
                        [:self.bert_batch_size, 1],
                        attention_mask=patent_batch[doc_id]
                        [:self.bert_batch_size, 2])[1])
        output_patent, (_, _) = self.lstm_patent(
            bert_output_patent.permute(1, 0, 2))
        last_layer_patent = output_patent[-1]

        #tsd

        bert_output_tsd = torch.zeros(size=(tsd_batch.shape[0],
                                            min(tsd_batch.shape[1],
                                                self.bert_batch_size),
                                            self.bert_tsd.config.hidden_size),
                                      dtype=torch.float,
                                      device=device)
        for doc_id in range(tsd_batch.shape[0]):
            bert_output_tsd[doc_id][:self.bert_batch_size] = self.dropout_tsd(
                self.bert_tsd(
                    tsd_batch[doc_id][:self.bert_batch_size, 0],
                    token_type_ids=tsd_batch[doc_id][:self.bert_batch_size, 1],
                    attention_mask=tsd_batch[doc_id][:self.bert_batch_size,
                                                     2])[1])
        output_tsd, (_, _) = self.lstm_tsd(bert_output_tsd.permute(1, 0, 2))
        last_layer_tsd = output_tsd[-1]

        x = torch.cat([last_layer_patent, last_layer_tsd], dim=1)
        prediction = torch.sigmoid(self.output(x))

        assert prediction.shape[0] == patent_batch.shape[0]
        return prediction

    def freeze_bert_encoder(self):
        for param in self.bert_patent.parameters():
            param.requires_grad = False
        for param in self.bert_tsd.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        for param in self.bert_patent.parameters():
            param.requires_grad = True
        for param in self.bert_tsd.parameters():
            param.requires_grad = True

    def unfreeze_bert_encoder_last_layers(self):
        for name, param in self.bert_patent.named_parameters():
            if "encoder.layer.11" in name or "pooler" in name:
                param.requires_grad = True
        for name, param in self.bert_tsd.named_parameters():
            if "encoder.layer.11" in name or "pooler" in name:
                param.requires_grad = True

    def unfreeze_bert_encoder_pooler_layer(self):
        for name, param in self.bert_patent.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
        for name, param in self.bert_tsd.named_parameters():
            if "pooler" in name:
                param.requires_grad = True
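
# Hypothetical smoke test for DocumentBert (all sizes are placeholders): the
# config must carry a custom bert_batch_size attribute, and each batch is laid
# out as [n_docs, n_chunks, 3, seq_len] with input_ids / token_type_ids /
# attention_mask at positions 0 / 1 / 2 of the third axis.
import torch
from transformers import BertConfig

doc_config = BertConfig(hidden_size=128, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=256)
doc_config.bert_batch_size = 4

doc_model = DocumentBert(doc_config).eval()

def make_batch(n_docs, n_chunks, seq_len, vocab_size):
    ids = torch.randint(0, vocab_size, (n_docs, n_chunks, seq_len))
    return torch.stack([ids,
                        torch.zeros_like(ids),   # token_type_ids
                        torch.ones_like(ids)],   # attention_mask
                       dim=2)

patent = make_batch(2, 4, 32, doc_config.vocab_size)
tsd = make_batch(2, 4, 32, doc_config.vocab_size)
with torch.no_grad():
    prediction = doc_model(patent, tsd, device='cpu')
print(prediction.shape)  # torch.Size([2, 1])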
Example #8
class BertPoolConv(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super().__init__(config)

        self.config = config
        self.bert = BertModel(config)

        # self.seq_summary = SequenceSummary(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=config.num_filters,
                      kernel_size=(fsize, config.hidden_size))
            for fsize in config.filter_sizes
        ])

        self.fc = nn.Linear(config.num_filters * len(config.filter_sizes),
                            config.num_labels)
        self.fc_bn = nn.BatchNorm1d(config.num_labels)
        self.init_weights()

        # Default: freeze bert
        for name, param in self.bert.named_parameters():
            param.requires_grad = False

        # Unfreeze layers
        if config.unfreeze == "pooler":
            for name, param in self.bert.named_parameters():
                if "pooler" in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name:
                    param.requires_grad = True

        if config.unfreeze == "enc-1_pooler":
            n_layer = sum([
                1 for name, _ in self.bert.named_parameters()
                if "encoder.layer" in name
            ])
            last_layer = "encoder.layer." + str(
                int(n_layer / 16 - 1))  # each encoder layer has 16 named parameters
            for name, param in self.bert.named_parameters():
                if last_layer in name or "pooler" in name:
                    param.requires_grad = True

    def forward(self, doc):
        """
        Input:
            doc: [batch_size, n_chunks, 3, max_chunk_len]     
                 n_chunks is the number of chunks within the batch (same for each doc after PadDoc)
        Returns:
            out: [batch_size, output_dim]       
            
        """
        batch_size = doc.shape[0]

        hidden_pooled_layers = []

        for k in range(batch_size):
            # Each doc is considered as a special 'batch' and each chunk is an element of the special 'bert_batch'
            # n_chunks is the temporary 'bert_batch_size', max_chunk_len corresponds to 'seq_len'
            bert_output_k = self.bert(
                input_ids=doc[k, :, 0],  # [n_chunks, max_chunk_len]
                attention_mask=doc[k, :, 1],
                token_type_ids=doc[k, :, 2])
            # pooled_k = bert_output_k[1].unsqueeze(0)
            hidden_states_k = bert_output_k[
                2]  # each element in the tuple: [n_chunks, max_chunk_len, hidden_size]

            # Average pooling over the last hidden-state layers: pool_layers is
            # used as a slice start, so a negative value such as -4 keeps the
            # last four layers
            hidden_list_k = list(hidden_states_k[self.config.pool_layers:])
            hidden_stack_k = torch.stack(
                hidden_list_k
            )  # [n_pooled_layers, n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers_k = torch.mean(
                hidden_stack_k,
                dim=0)  # [n_chunks, max_chunk_len, hidden_size]
            hidden_pooled_layers.append(hidden_pooled_layers_k)

        hidden_pooled_layers = torch.stack(
            hidden_pooled_layers
        )  # [batch_size, n_chunks, max_chunk_len, hidden_size]
        # Pooling within each chunk (over 512 word tokens of individual chunk)
        if self.config.pool_method == 'mean':
            hidden_pooled = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'max':
            hidden_pooled = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
        elif self.config.pool_method == 'mean_max':
            hidden_pooled_mean = torch.mean(
                hidden_pooled_layers,
                dim=2)  # [batch_size, n_chunks, hidden_size]
            hidden_pooled_max = torch.max(
                hidden_pooled_layers,
                dim=2).values  # [batch_size, n_chunks, hidden_size]
            hidden_pooled = torch.cat(
                (hidden_pooled_mean, hidden_pooled_max),
                dim=1)  # [batch_size, n_chunks*2, hidden_size]
        elif self.config.pool_method == 'cls':
            hidden_pooled = hidden_pooled_layers[:, :,
                                                 0, :]  # [batch_size, n_chunks, hidden_size]
        else:  # pool_method is None
            hidden_pooled = hidden_pooled_layers.view(
                batch_size, -1, self.config.hidden_size
            )  # [batch_size, n_chunks*max_chunk_len, hidden_size]

        hidden_pooled = hidden_pooled.unsqueeze(
            1)  # [batch_size, 1, ?, hidden_size]
        hidden_conved = [
            F.relu(conv(hidden_pooled)) for conv in self.convs
        ]  # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1), 1]
        hidden_conved = [
            conv.squeeze(3) for conv in hidden_conved
        ]  # hidden_conved[i]: [batch_size, n_filters, (?-fsize+1)]
        hc_pooled = [
            F.max_pool1d(conv, conv.shape[2]) for conv in hidden_conved
        ]  # hc_pooled[i]: [batch_size, n_filters, 1]
        hc_pooled = [pool.squeeze(2) for pool in hc_pooled
                     ]  # hc_pooled[i]: [batch_size, n_filters]

        cat = torch.cat(hc_pooled,
                        dim=1)  # [batch_size, n_filters * len(filter_sizes)]
        dp = self.dropout(cat)
        out = self.fc(dp)  # [batch_size, num_labels]
        out = self.fc_bn(out)
        out = F.softmax(out, dim=1)  # [batch_size, num_labels]

        return out


#%%
# class AlbertLinear(AlbertPreTrainedModel):

#     def __init__(self, config: AlbertConfig):
#         super().__init__(config)

#         self.albert = AlbertModel(config)

#         self.dropout = nn.Dropout(config.hidden_dropout_prob)
#         self.fc = nn.Linear(config.hidden_size, config.num_labels)
#         self.fc_bn = nn.BatchNorm1d(config.num_labels)
#         # self.fc = nn.Linear(config.hidden_size * config.n_chunks, config.num_labels)
#         self.init_weights()

#         # Default: freeze albert
#         for name, param in self.albert.named_parameters():
#             param.requires_grad = False

#         # Unfreeze layers
#         if config.unfreeze == "pooler":
#             for name, param in self.albert.named_parameters():
#                 if "pooler" in name:
#                     param.requires_grad = True

#     def forward(self, doc):
#         """
#         Input:
#             doc: [batch_size, num_chunks, 3, max_chunk_len]
#         Returns:
#             out: [batch_size, output_dim]

#         """
#         batch_size = doc.shape[0]

#         pooled = self.albert(input_ids = doc[0,:,0],
#                              attention_mask = doc[0,:,1],
#                              token_type_ids = doc[0,:,2])[1].unsqueeze(0)
#         for i in range(batch_size-1):
#             pool_i = self.albert(input_ids = doc[i+1,:,0],
#                                  attention_mask = doc[i+1,:,1],
#                                  token_type_ids = doc[i+1,:,2])[1]
#             pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)

#         dp = self.dropout(pooled)  # [batch_size, num_chunks, hidden_size]
#         # concat = dp.view(batch_size, -1)  # [batch_size, num_chunks*hidden_size]
#         if self.albert.config.linear_max == True:
#             dp = torch.max(dp, dim=1).values  # [batch_size, hidden_size]
#         else:
#             dp = torch.mean(dp, dim=1)  # [batch_size, hidden_size]
#         # dp = dp.sum(dim=1) # [batch_size, hidden_size]

#         out = self.fc(dp)  # [batch_size, num_labels]
#         out = self.fc_bn(out)
#         out = F.softmax(out, dim=1)  # [batch_size, num_labels]

#         return out

#%%
# class AlbertLSTM(AlbertPreTrainedModel):

#     def __init__(self, config: AlbertConfig):
#         super().__init__(config)

#         self.albert = AlbertModel(config)

#         self.dropout = nn.Dropout(config.hidden_dropout_prob)

#         self.lstm = nn.LSTM(input_size = config.hidden_size, hidden_size = config.hidden_size,
#                             num_layers = 1, dropout = 0,
#                             batch_first = True, bidirectional = False)

#         self.fc = nn.Linear(config.hidden_size, config.num_labels)
#         self.fc_bn = nn.BatchNorm1d(config.num_labels)
#         self.tanh = nn.Tanh()
#         self.init_weights()

#         # Default: freeze albert
#         for name, param in self.albert.named_parameters():
#             param.requires_grad = False

#         # Unfreeze layers
#         if config.unfreeze == "embed":
#             for name, param in self.albert.named_parameters():
#                 if "embeddings" in name:
#                     param.requires_grad = True

#         if config.unfreeze == "embed_enc0":
#             for name, param in self.albert.named_parameters():
#                 if "embeddings" in name or "encoder" in name:
#                     param.requires_grad = True

#         if config.unfreeze == "embed_enc0_pooler":
#             for name, param in self.albert.named_parameters():
#                     param.requires_grad = True

#         if config.unfreeze == "enc0":
#             for name, param in self.albert.named_parameters():
#                 if "encoder" in name:
#                     param.requires_grad = True

#         if config.unfreeze == "enc0_pooler":
#             for name, param in self.albert.named_parameters():
#                 if "encoder" in name or "pooler" in name:
#                     param.requires_grad = True

#         if config.unfreeze == "embed_pooler":
#             for name, param in self.albert.named_parameters():
#                 if "embed" in name or "pooler" in name:
#                     param.requires_grad = True

#         if config.unfreeze == "pooler":
#             for name, param in self.albert.named_parameters():
#                 if "pooler" in name:
#                     param.requires_grad = True

#     def forward(self, doc):
#         """
#         Input:
#             doc: [batch_size, num_chunks, 3, max_chunk_len]
#         Returns:
#             out: [batch_size, output_dim]

#         """
#         batch_size = doc.shape[0]

#         pooled = self.albert(input_ids = doc[0,:,0],
#                              attention_mask = doc[0,:,1],
#                              token_type_ids = doc[0,:,2])[1].unsqueeze(0)

#         for i in range(batch_size-1):
#             # Output of BertModel: (last_hidden_state, pooler_output, hidden_states, attentions)
#             # Last layer hidden-state of the first token of the sequence (classification token)
#             pool_i = self.albert(input_ids = doc[i+1,:,0],
#                                  attention_mask = doc[i+1,:,1],
#                                  token_type_ids = doc[i+1,:,2])[1]
#             pooled = torch.cat((pooled, pool_i.unsqueeze(0)), dim=0)

#         dp = self.dropout(pooled)  # [batch_size, num_chunks, bert_hidden_size]
#         # output: [batch_size, num_chunks, n_directions*hidden_size], output features from last layer for each t
#         # h_n: [n_layers*n_directions, batch_size, hidden_size], hidden state for t=seq_len
#         # c_n: [n_layers*n_directions, batch_size, hidden_size], cell state for t=seq_len
#         output, (h_n, c_n) = self.lstm(dp)

#         # h_n = output[:,-1,].squeeze(1)  # [batch_size, hidden_size]
#         h_n = h_n.squeeze(0)  # [batch_size, hidden_size]

#         out = self.fc(h_n)  # [batch_size, num_labels]
#         out = self.fc_bn(out)
#         out = F.softmax(out, dim=1)  # [batch_size, num_labels]
#         # out = self.tanh(out)   # [batch_size, num_labels]

#         return out
Example #9
    for idx in range(len(attentions)):
        output["bert_layer" + str(idx + 1)] = {
            "hidden_states": hidden_states[idx + 1],
            "attention": attentions[idx]
        }
    output["pred_layer"] = {"pooler_output": pooler_output}
    return output


# loss_model
loss_model = MultiLayerBasedDistillationLoss(
    distill_config=distill_config,
    teacher_output_adaptor=output_adaptor,
    student_output_adaptor=output_adaptor)
# optimizer
param_optimizer = list(student_model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
optimizer = torch.optim.Adam(params=optimizer_grouped_parameters,
                             lr=learning_rate)
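
# Quick sanity-check sketch for the grouping above: every named parameter
# should land in exactly one of the two weight-decay groups.
decay_names = [n for n, _ in param_optimizer
               if not any(nd in n for nd in no_decay)]
no_decay_names = [n for n, _ in param_optimizer
                  if any(nd in n for nd in no_decay)]
assert len(decay_names) + len(no_decay_names) == len(param_optimizer)
print(len(decay_names), 'decayed /', len(no_decay_names), 'not decayed')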
# evaluator
Example #10
class BertBinaryClassification(BertPreTrainedModel):
    def __init__(self, config):
        """
        :param config: a transformers Config object
        """
        self.num_labels = 1
        super().__init__(config, num_labels=self.num_labels)

        # The BertModel without pooling layer
        self.bert = BertModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Input dimension depending on number of tokens encodings
        input_dim = config.hidden_size * 3

        cls_layers = []
        cls_layers.append(nn.Linear(input_dim, config.hidden_size))
        cls_layers.append(nn.GELU())
        cls_layers.append(
            nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps))
        cls_layers.append(nn.Linear(config.hidden_size, self.num_labels))

        # The classifier:
        self.classifier = nn.Sequential(*cls_layers)

        # Used in BCEWithLogitsLoss function
        #  to counteract unbalanced training sets
        # Can be changed with self.set_class_weights
        self.class_weights = torch.ones([1])

        self.init_weights()

    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None):
        outputs = self.bert(input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask,
                            inputs_embeds=inputs_embeds,
                            output_attentions=output_attentions,
                            output_hidden_states=output_hidden_states,
                            return_dict=return_dict)

        # Use either pooled output or sequence output, depending on settings
        bert_output = outputs[0]

        # Get the state of the [CLS] token,
        # and the first token of the mention and candidate entity

        # Position of the first token of the candidate
        # (right after the [SEP] token)
        cand_pos = torch.argmax(token_type_ids, dim=1)

        # Get the embedding of the first token of the candidate over the batch
        cand_tensors = torch.cat([t[i] for t, i in zip(bert_output, cand_pos)
                                  ]).reshape((bert_output.size(0),
                                              bert_output.size(-1)))

        # Flattened input of 3 * hidden_size features
        bert_output = torch.cat(
            [bert_output[:, 0], bert_output[:, 1], cand_tensors], dim=1)

        bert_output = self.dropout(bert_output)
        logits = self.classifier(bert_output)

        loss = None
        if labels is not None:
            # Binary cross entropy loss with class weights and sigmoid
            loss_fn = nn.BCEWithLogitsLoss(pos_weight=self.class_weights)
            loss = loss_fn(logits.view(-1),
                           labels.view(-1).to(dtype=torch.float))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def freeze_layers(self, param_idx: List):
        """
        Freeze layers at provided indices to not train them
        :param param_idx: list of indices of layers to be frozen
        """
        module_params = list(self.named_parameters())
        for param in (module_params[p] for p in param_idx):
            param[1].requires_grad = False

    def freeze_n_transformers(self, n: int = 11):
        """
        Freeze the provided number of encoders in the BERT architecture
         to not train them
        :param n: number of encoders to freeze
        """
        n = min(n, 12)
        n_embedding_params = 5        # embedding tables + embedding LayerNorm weight/bias
        n_params_per_layer = 16       # each BERT encoder layer has 16 named parameters
        emb_layers = list(range(n_embedding_params))
        encoder_layers = list(
            range(n_embedding_params,
                  n_embedding_params + n * n_params_per_layer))
        self.freeze_layers(emb_layers + encoder_layers)

    def freeze_bert(self):
        """
        Freezes all layers in BERT from training
        """
        for _, param in self.bert.named_parameters():
            param.requires_grad = False

    def set_class_weights(self, class_weights):
        """
        Set the self.class_weights used to penalize wrong prediction of
         minority class
        :param class_weights: a pytorch tensor with the positive-class weight,
                              used as pos_weight by the BCEWithLogitsLoss.
        """
        self.class_weights = class_weights
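
# Hypothetical usage sketch for BertBinaryClassification (random token ids,
# untrained weights): the model scores a mention/candidate pair encoded as two
# segments, so token_type_ids must contain a second segment for torch.argmax
# to locate the first candidate token.
import torch
from transformers import BertConfig

bc_config = BertConfig(hidden_size=128, num_hidden_layers=2,
                       num_attention_heads=2, intermediate_size=256)
bc_model = BertBinaryClassification(bc_config).eval()

batch, seq_len, sep = 2, 16, 8
input_ids = torch.randint(0, bc_config.vocab_size, (batch, seq_len))
attention_mask = torch.ones(batch, seq_len, dtype=torch.long)
token_type_ids = torch.zeros(batch, seq_len, dtype=torch.long)
token_type_ids[:, sep:] = 1  # candidate segment starts at position 8
labels = torch.tensor([0, 1])

with torch.no_grad():
    result = bc_model(input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids,
                      labels=labels,
                      return_dict=True)
print(result.loss, result.logits.shape)  # scalar loss, torch.Size([2, 1])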
Example #11
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config
print(configuration)

# Loading the Hugging Face Bert Uncased Base Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2)
model.cuda()

# Optimizer Grouped Parameters

# Don't apply weight decay to any parameters whose names include these tokens.
# (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
# Separate the `weight` parameters from the `bias` parameters.
# - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
# - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
optimizer_grouped_parameters = [
    # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
    {
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    },

    # Filter for parameters which *do* include those.
    {