def __init__(self,
             bert_dir: str,
             vocabulary_builder: VocabularyBuilder,
             dropout: float,
             is_used_crf: bool):
    """
    Set up the pretrained BERT encoder, a linear classifier over the label
    vocabulary and, optionally, a CRF layer.

    :param bert_dir: directory containing the pretrained BERT model
    :param vocabulary_builder: vocabulary builder; only its label
        vocabulary is read here
    :param dropout: dropout for the output of BERT's last layer
    :param is_used_crf: True to use a CRF, False to not use one
    """
    super().__init__()

    self.label_vocabulary = vocabulary_builder.label_vocabulary
    self.dropout = Dropout(dropout)
    self.is_used_crf = is_used_crf

    self.bert = BertModel.from_pretrained(bert_dir)

    # Linear layer from BERT's hidden size to the number of labels.
    self.classifier = Linear(self.bert.config.hidden_size,
                             self.label_vocabulary.label_size)

    if not self.is_used_crf:
        self.crf = None
    else:
        # The CRF only gets the transitions BIO.allowed_transitions returns.
        transition_constraints = BIO.allowed_transitions(
            label_vocabulary=self.label_vocabulary)
        self.crf = ConditionalRandomField(
            num_tags=self.label_vocabulary.label_size,
            constraints=transition_constraints)

    self.reset_parameters()
def test_allowed_transitions():
    """
    Test the allowed-transition index pairs returned by
    ``BIO.allowed_transitions``.

    Each returned pair is first printed as label names, then the full set of
    pairs is compared with the expected transitions. In this test START is
    encoded as ``label_size`` and STOP as ``label_size + 1``.
    """
    label_vocabulary = LabelVocabulary(
        labels=[["B-L1", "I-L1", "B-L2", "I-L2", "O"]],
        padding=LabelVocabulary.PADDING)

    allowed_pairs = BIO.allowed_transitions(label_vocabulary=label_vocabulary)

    def source_label(index):
        # Index -> label name for the "from" side of a pair.
        if index == label_vocabulary.label_size:
            return "START"
        return label_vocabulary.token(index)

    def target_label(index):
        # Index -> label name for the "to" side of a pair.
        if index == label_vocabulary.label_size + 1:
            return "STOP"
        return label_vocabulary.token(index)

    def source_index(label):
        # Label name -> index for the "from" side of a pair.
        if label == "START":
            return label_vocabulary.label_size
        return label_vocabulary.index(label)

    def target_index(label):
        # Label name -> index for the "to" side of a pair.
        if label == "STOP":
            return label_vocabulary.label_size + 1
        return label_vocabulary.index(label)

    # Dump the actual pairs in the same literal form as the expected list.
    for from_idx, to_idx in allowed_pairs:
        from_label = source_label(from_idx)
        to_label = target_label(to_idx)
        print(f"(\"{from_label}\", \"{to_label}\"),")

    expected_label_pairs = [
        ("B-L1", "B-L1"), ("B-L1", "I-L1"), ("B-L1", "B-L2"),
        ("B-L1", "O"), ("B-L1", "STOP"),
        ("I-L1", "B-L1"), ("I-L1", "I-L1"), ("I-L1", "B-L2"),
        ("I-L1", "O"), ("I-L1", "STOP"),
        ("B-L2", "B-L1"), ("B-L2", "B-L2"), ("B-L2", "I-L2"),
        ("B-L2", "O"), ("B-L2", "STOP"),
        ("I-L2", "B-L1"), ("I-L2", "B-L2"), ("I-L2", "I-L2"),
        ("I-L2", "O"), ("I-L2", "STOP"),
        ("O", "B-L1"), ("O", "B-L2"), ("O", "O"), ("O", "STOP"),
        ("START", "B-L1"), ("START", "B-L2"), ("START", "O"),
    ]

    expected_index_pairs = {
        (source_index(from_label), target_index(to_label))
        for from_label, to_label in expected_label_pairs
    }

    ASSERT.assertSetEqual(expected_index_pairs, set(allowed_pairs))
def test_allowed_transitions(self):
    """
    Check ``BIO.allowed_transitions`` against a hand-written table of
    allowed (from_index, to_index) pairs for a two-entity BIO label set.
    """
    bio_labels = ['O', 'B-X', 'I-X', 'B-Y', 'I-Y']
    label_vocabulary = LabelVocabulary(labels=[bio_labels],
                                       padding=LabelVocabulary.PADDING)

    allowed = BIO.allowed_transitions(label_vocabulary=label_vocabulary)

    # from_index -> allowed to_index values; anything not listed is a
    # disallowed transition.
    # Index 5 is the start tag (extra row) and index 6 is the end tag
    # (extra column). Indices 0-4 presumably follow the order of
    # bio_labels -- confirm against LabelVocabulary.
    targets_by_source = {
        0: (0, 1, 3, 6),
        1: (0, 1, 2, 3, 6),
        2: (0, 1, 2, 3, 6),
        3: (0, 1, 3, 4, 6),
        4: (0, 1, 3, 4, 6),
        5: (0, 1, 3),
    }
    expected = {
        (source, target)
        for source, targets in targets_by_source.items()
        for target in targets
    }

    assert set(allowed) == expected
def __init__(self,
             token_vocabulary: Vocabulary,
             token_embedding_dim: int,
             token_embedding_dropout: float,
             gaz_vocabulary: PretrainedVocabulary,
             gaz_word_embedding_dropout: float,
             gaz_word_embedding_dim: int,
             num_lstm_layer: int,
             lstm_hidden_size: int,
             gat_hidden_size: int,
             gat_num_heads: int,
             gat_dropout: float,
             lstm_dropout: float,
             alpha: float,
             fusion_strategy: str,
             label_vocabulary: LabelVocabulary):
    """
    Build the model: token and gaz word embeddings, a bidirectional LSTM
    encoder, three GAT modules (C-, T- and L-graph), a fusion layer and a
    CRF constrained by the BIO allowed transitions.

    :param token_vocabulary: token vocabulary. A ``Vocabulary`` gets a new
        ``Embedding``; a ``PretrainedVocabulary`` gets a frozen embedding
        loaded from its ``embedding_matrix``.
    :param token_embedding_dim: token embedding dimension; also used as the
        LSTM ``input_size``
    :param token_embedding_dropout: dropout value for the token embedding
    :param gaz_vocabulary: gaz vocabulary providing ``embedding_matrix`` and
        ``padding_index`` for the frozen gaz word embedding
    :param gaz_word_embedding_dropout: dropout value for the gaz word
        embedding
    :param gaz_word_embedding_dim: gaz word embedding dimension; must equal
        ``lstm_hidden_size * 2``
    :param num_lstm_layer: number of LSTM layers
    :param lstm_hidden_size: hidden size of the LSTM (per direction)
    :param gat_hidden_size: ``hidden_size`` passed to every GAT
    :param gat_num_heads: ``num_heads`` passed to every GAT
    :param gat_dropout: ``dropout`` passed to every GAT
    :param lstm_dropout: dropout value for the LSTM output
    :param alpha: ``alpha`` passed to every GAT
    :param fusion_strategy: one of ``"m"``, ``"v"`` or ``"n"``
    :param label_vocabulary: label vocabulary
    :raises AssertionError: if ``gaz_word_embedding_dim`` differs from
        ``lstm_hidden_size * 2``
    :raises RuntimeError: if ``fusion_strategy`` is not ``"m"``, ``"v"``
        or ``"n"``
    """
    super().__init__()

    # Both the gaz word vectors and the BiLSTM outputs are used as graph
    # nodes, so their sizes must match. The message reports the value that
    # is actually checked (gaz_word_embedding_dim).
    assert gaz_word_embedding_dim == lstm_hidden_size * 2, \
        f"gaz_word_embedding_dim: {gaz_word_embedding_dim} " \
        f"与 lstm_hidden_size * 2: {lstm_hidden_size * 2} 不相等, 因为二者都会作为图的节点,所以 size 必须一致"

    self.token_vocabulary = token_vocabulary
    self.label_vocabulary = label_vocabulary

    self.token_embedding_dropout = Dropout(token_embedding_dropout)

    # NOTE(review): a token_vocabulary of any other type leaves
    # self.token_embedding undefined.
    # NOTE(review): in the pretrained branch the LSTM below still uses
    # token_embedding_dim as input_size -- presumably it must match the
    # width of embedding_matrix; confirm.
    if isinstance(self.token_vocabulary, Vocabulary):
        self.token_embedding: Embedding = Embedding(
            num_embeddings=self.token_vocabulary.size,
            embedding_dim=token_embedding_dim,
            padding_idx=self.token_vocabulary.padding_index)
    elif isinstance(self.token_vocabulary, PretrainedVocabulary):
        self.token_embedding: Embedding = Embedding.from_pretrained(
            self.token_vocabulary.embedding_matrix,
            freeze=True,
            padding_idx=self.token_vocabulary.padding_index)

    self.gaz_word_embedding = Embedding.from_pretrained(
        gaz_vocabulary.embedding_matrix,
        freeze=True,
        padding_idx=gaz_vocabulary.padding_index)

    self.gaz_word_embedding_dropout = Dropout(gaz_word_embedding_dropout)

    # bilstm
    bilstm = DynamicRnn(rnn=LSTM(input_size=token_embedding_dim,
                                 hidden_size=lstm_hidden_size,
                                 num_layers=num_lstm_layer,
                                 batch_first=True,
                                 bidirectional=True))

    self.bilstm_seq2seq = RnnSeq2Seq(bilstm)

    self.lstm_dropout = Dropout(lstm_dropout)

    self.lstm_encoding_feed_forward = Linear(
        in_features=lstm_hidden_size * 2,
        out_features=self.label_vocabulary.label_size)

    # C-Graph
    self.c_gat = GAT(in_features=2 * lstm_hidden_size,
                     out_features=label_vocabulary.label_size,
                     dropout=gat_dropout,
                     alpha=alpha,
                     num_heads=gat_num_heads,
                     hidden_size=gat_hidden_size)

    # T-Graph
    self.t_gat = GAT(in_features=2 * lstm_hidden_size,
                     out_features=label_vocabulary.label_size,
                     dropout=gat_dropout,
                     alpha=alpha,
                     num_heads=gat_num_heads,
                     hidden_size=gat_hidden_size)

    # L-Graph
    self.l_gat = GAT(in_features=2 * lstm_hidden_size,
                     out_features=label_vocabulary.label_size,
                     dropout=gat_dropout,
                     alpha=alpha,
                     num_heads=gat_num_heads,
                     hidden_size=gat_hidden_size)

    if fusion_strategy == "m":
        self.fusion_layer = MFunsionLayer(
            label_size=label_vocabulary.label_size)
    elif fusion_strategy == "v":
        self.fusion_layer = VFusionLayer(
            label_size=label_vocabulary.label_size)
    elif fusion_strategy == "n":
        self.fusion_layer = NFusionLayer()
    else:
        raise RuntimeError(
            f"fusion_stategy 必须是: m, v, n 之一, 而现在是 {fusion_strategy}")

    # crf
    constraints = BIO.allowed_transitions(
        label_vocabulary=self.label_vocabulary)
    self.crf = ConditionalRandomField(
        num_tags=self.label_vocabulary.label_size,
        constraints=constraints)
def __init__(self,
             token_vocabulary: Vocabulary,
             token_embedding_dim: int,
             token_embedding_dropout: float,
             gaz_vocabulary: PretrainedVocabulary,
             gaz_word_embedding_dim: int,
             gaz_word_embedding_dropout: float,
             label_vocabulary: LabelVocabulary,
             hidden_size: int,
             lstm_dropout: float):
    """
    Build the bidirectional Lattice LSTM model with a CRF on top.

    :param token_vocabulary: token vocabulary
    :param token_embedding_dim: token embedding dimension
    :param token_embedding_dropout: token embedding dropout
    :param gaz_vocabulary: gaz vocabulary
    :param gaz_word_embedding_dim: gaz word embedding dimension
    :param gaz_word_embedding_dropout: gaz word embedding dropout
    :param label_vocabulary: label vocabulary
    :param hidden_size: hidden size of each lattice LSTM; the combined
        output is 2 * hidden_size because both directions are used
    :param lstm_dropout: lstm dropout
    """
    super().__init__()

    self.token_vocabulary = token_vocabulary
    self.label_vocabulary = label_vocabulary

    self.token_embedding_dropout = Dropout(token_embedding_dropout)
    self.lstm_dropout = Dropout(lstm_dropout)

    # NOTE(review): a token vocabulary of any other type leaves
    # self.token_embedding undefined.
    vocabulary = self.token_vocabulary
    if isinstance(vocabulary, Vocabulary):
        self.token_embedding = Embedding(
            num_embeddings=vocabulary.size,
            embedding_dim=token_embedding_dim,
            padding_idx=vocabulary.padding_index)
    elif isinstance(vocabulary, PretrainedVocabulary):
        self.token_embedding = Embedding.from_pretrained(
            vocabulary.embedding_matrix,
            freeze=True,
            padding_idx=vocabulary.padding_index)

    self.gaz_word_embedding = Embedding.from_pretrained(
        gaz_vocabulary.embedding_matrix,
        freeze=True,
        padding_idx=gaz_vocabulary.padding_index)

    # A bidirectional Lattice LSTM is always used. Both directions take the
    # same arguments and differ only in `left2right`.
    shared_lattice_arguments = dict(
        input_dim=token_embedding_dim,
        hidden_dim=hidden_size,
        gaz_word_embedding_dim=gaz_word_embedding_dim,
        gaz_word_embedding=self.gaz_word_embedding,
        gaz_word_embedding_dropout=gaz_word_embedding_dropout)

    # Forward lattice LSTM.
    self.forward_lattice_lstm = LatticeLSTM(left2right=True,
                                            **shared_lattice_arguments)

    # Backward lattice LSTM.
    self.backward_lattice_lstm = LatticeLSTM(left2right=False,
                                             **shared_lattice_arguments)

    # Map the bidirectional lattice LSTM output into label space.
    self.linear = Linear(in_features=(hidden_size * 2),
                         out_features=label_vocabulary.label_size)

    # CRF restricted to the BIO allowed transitions.
    transition_constraints = BIO.allowed_transitions(
        label_vocabulary=self.label_vocabulary)
    self.crf = ConditionalRandomField(
        num_tags=self.label_vocabulary.label_size,
        constraints=transition_constraints)

    self.reset_parameters()
def __init__(self,
             vocabulary_builder: VocabularyBuilder,
             word_embedding_dim: int,
             rnn_type: str,
             hidden_size: int,
             num_layer: int,
             dropout: float,
             is_used_crf: bool):
    """
    Build the model: token embedding, a bidirectional LSTM or GRU encoder,
    a linear layer onto the label vocabulary and, optionally, a CRF.

    :param vocabulary_builder: vocabulary builder providing the token and
        label vocabularies
    :param word_embedding_dim: word embedding dimension; also used as the
        RNN ``input_size``
    :param rnn_type: ``DynamicRnn.LSTM`` or ``DynamicRnn.GRU``
    :param hidden_size: RNN hidden size (per direction)
    :param num_layer: number of RNN layers
    :param dropout: ``dropout`` argument passed to the RNN
    :param is_used_crf: True to use a CRF, False to not use one
    :raises RuntimeError: if ``rnn_type`` is neither ``DynamicRnn.LSTM``
        nor ``DynamicRnn.GRU``
    """
    super().__init__()

    self.word_embedding_dim = word_embedding_dim
    self.token_vocabulary = vocabulary_builder.token_vocabulary
    self.label_vocabulary = vocabulary_builder.label_vocabulary
    self.is_used_crf = is_used_crf

    # NOTE(review): a token vocabulary of any other type leaves
    # self.embedding undefined.
    vocabulary = self.token_vocabulary
    if isinstance(vocabulary, Vocabulary):
        self.embedding = Embedding(
            num_embeddings=vocabulary.size,
            embedding_dim=word_embedding_dim,
            padding_idx=vocabulary.padding_index)
    elif isinstance(vocabulary, PretrainedVocabulary):
        self.embedding = Embedding.from_pretrained(
            vocabulary.embedding_matrix,
            freeze=True,
            padding_idx=vocabulary.padding_index)

    self.hidden_size = hidden_size

    # Choose the recurrent cell class first; LSTM and GRU are constructed
    # with exactly the same arguments.
    if rnn_type == DynamicRnn.LSTM:
        rnn_class = LSTM
    elif rnn_type == DynamicRnn.GRU:
        rnn_class = GRU
    else:
        raise RuntimeError(
            f"rnn_type: {rnn_type} 必须是 {DynamicRnn.LSTM} 或 {DynamicRnn.GRU} "
        )

    recurrent_layer = rnn_class(input_size=word_embedding_dim,
                                hidden_size=hidden_size,
                                num_layers=num_layer,
                                bidirectional=True,
                                dropout=dropout,
                                batch_first=True)

    self.rnn_seq2seq = RnnSeq2Seq(dynamic_rnn=DynamicRnn(rnn=recurrent_layer))

    # in_features is 2 * hidden_size because the RNN is bidirectional.
    self.liner = Linear(in_features=hidden_size * 2,
                        out_features=self.label_vocabulary.label_size)

    if not self.is_used_crf:
        self.crf = None
    else:
        transition_constraints = BIO.allowed_transitions(
            label_vocabulary=self.label_vocabulary)
        self.crf = ConditionalRandomField(
            num_tags=self.label_vocabulary.label_size,
            constraints=transition_constraints)

    self.reset_parameters()