Example #1
    def prepare_input(self, data, labels=None):
        """Prepare input (features and labels) for NER model.
        Besides character embeddings (or BERT embeddings) as the main input, word
        embeddings and other hand-crafted feature embeddings are supported as
        additional inputs.

        Args:
            data: list of tokenized (in char level) texts, like ``[['我', '是', '中', '国', '人']]``
            labels: list of list of str, the corresponding label strings

        Returns:
            features: id matrix
            y: label id matrix if labels is provided, otherwise None

        """
        batch_char_ids, batch_bert_ids, batch_bert_seg_ids, batch_word_ids = [], [], [], []
        batch_label_ids = []
        for i, char_text in enumerate(data):
            if self.use_char:
                if self.use_bert:
                    text_for_char_input = [self.cls_token] + char_text + [self.seq_token]
                else:
                    text_for_char_input = char_text
                char_ids = [
                    self.char_vocab.get(token, self.char_vocab[self.unk_token])
                    for token in text_for_char_input
                ]
                batch_char_ids.append(char_ids)

            if self.use_bert:
                indices, segments = self.bert_tokenizer.encode(
                    first=''.join(char_text), max_len=self.max_len)
                batch_bert_ids.append(indices)
                batch_bert_seg_ids.append(segments)

            if self.use_word:
                word_text = jieba.lcut(''.join(char_text))
                word_ids = self.get_word_ids(word_text)
                batch_word_ids.append(word_ids)

            if labels is not None:
                if self.use_bert:
                    # Wrap the labels with the same special tokens added to the
                    # char input so label and token positions stay aligned.
                    label_str = [self.cls_token] + labels[i] + [self.seq_token]
                else:
                    label_str = labels[i]
                label_ids = [
                    self.label_vocab.get(l, self.get_unk_label_id())
                    for l in label_str
                ]
                label_ids = to_categorical(label_ids,
                                           self.num_class).astype(int)
                batch_label_ids.append(label_ids)

        features = []
        if self.use_char:
            features.append(self.pad_sequence(batch_char_ids))
        if self.use_bert:
            features.append(self.pad_sequence(batch_bert_ids))
            features.append(self.pad_sequence(batch_bert_seg_ids))
        if self.use_word:
            features.append(self.pad_sequence(batch_word_ids))

        if len(features) == 1:
            features = features[0]

        if not batch_label_ids:
            return features, None
        else:
            y = pad_sequences_2d(batch_label_ids,
                                 max_len_1=self.max_len,
                                 max_len_2=self.num_class,
                                 padding=self.padding_mode,
                                 truncating=self.truncating_mode)
            return features, y
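
The snippets on this page call a pad_sequences_2d helper that is not shown. A minimal sketch consistent with how it is used here, assuming zero padding and Keras-style padding/truncating flags (the real implementation may differ), could look like:

import numpy as np

def pad_sequences_2d(batch, max_len_1, max_len_2,
                     padding='post', truncating='post'):
    """Pad/truncate a batch of nested sequences to a fixed
    (batch_size, max_len_1, max_len_2) zero-filled id tensor."""
    result = np.zeros((len(batch), max_len_1, max_len_2), dtype='int32')
    for i, sample in enumerate(batch):
        # Truncate the outer (token) axis first.
        rows = sample[:max_len_1] if truncating == 'post' else sample[-max_len_1:]
        offset = 0 if padding == 'post' else max_len_1 - len(rows)
        for j, seq in enumerate(rows):
            seq = list(seq)
            # Then truncate and pad each inner (char or class) axis.
            seq = seq[:max_len_2] if truncating == 'post' else seq[-max_len_2:]
            if padding == 'post':
                result[i, offset + j, :len(seq)] = seq
            else:
                result[i, offset + j, max_len_2 - len(seq):] = seq
    return result

Under this reading, the y returned by Example #1 has shape (batch_size, max_len, num_class).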
Example #2
    def prepare_input(self, data, labels=None):
        """Prepare input (features and labels) for SPM model.
        Besides character embeddings (or BERT embeddings) as the main input, word
        embeddings and other hand-crafted feature embeddings are supported as
        additional inputs.

        Args:
            data: a pair of parallel text lists ``[texts_a, texts_b]``, like
                ``[['我是中国人', ...], ['我爱中国', ...]]``
            labels: list of str, the corresponding label strings

        Returns:
            features: id matrix
            y: label id matrix if labels is provided, otherwise None

        """
        batch_word_ids_a, batch_char_ids_a, batch_bert_ids_a, batch_bert_seg_ids_a = \
            [], [], [], []
        batch_word_ids_b, batch_char_ids_b, batch_bert_ids_b, batch_bert_seg_ids_b = \
            [], [], [], []
        batch_label_ids = []

        for i, (text_a, text_b) in enumerate(zip(data[0], data[1])):
            if self.use_bert_model:
                indices, segments = self.bert_tokenizer.encode(
                    first=text_a, second=text_b, max_len=self.max_len)
                batch_bert_ids_a.append(indices)
                batch_bert_seg_ids_a.append(segments)

            elif self.use_word:
                word_text_a = jieba.lcut(text_a)
                word_text_b = jieba.lcut(text_b)
                word_ids_a = self.get_word_ids(word_text_a)
                batch_word_ids_a.append(word_ids_a)
                word_ids_b = self.get_word_ids(word_text_b)
                batch_word_ids_b.append(word_ids_b)

                if self.use_char:
                    word_text_a = [list(word) for word in word_text_a]
                    word_text_b = [list(word) for word in word_text_b]
                    char_ids_a = [[
                        self.char_vocab.get(char,
                                            self.char_vocab[self.unk_token])
                        for char in token
                    ] for token in word_text_a]
                    char_ids_b = [[
                        self.char_vocab.get(char,
                                            self.char_vocab[self.unk_token])
                        for char in token
                    ] for token in word_text_b]
                    batch_char_ids_a.append(char_ids_a)
                    batch_char_ids_b.append(char_ids_b)

            else:
                text_a = list(text_a)
                text_b = list(text_b)

                if self.use_char:
                    char_text_a = [self.cls_token] + text_a + [self.seq_token] if self.use_bert \
                        else text_a
                    char_text_b = [self.cls_token] + text_b + [self.seq_token] if self.use_bert \
                        else text_b
                    char_ids_a = [
                        self.char_vocab.get(token,
                                            self.char_vocab[self.unk_token])
                        for token in char_text_a
                    ]
                    batch_char_ids_a.append(char_ids_a)
                    char_ids_b = [
                        self.char_vocab.get(token,
                                            self.char_vocab[self.unk_token])
                        for token in char_text_b
                    ]
                    batch_char_ids_b.append(char_ids_b)

                if self.use_bert:
                    indices_a, segments_a = self.bert_tokenizer.encode(
                        first=''.join(text_a), max_len=self.max_len)
                    batch_bert_ids_a.append(indices_a)
                    batch_bert_seg_ids_a.append(segments_a)

                    indices_b, segments_b = self.bert_tokenizer.encode(
                        first=''.join(text_b), max_len=self.max_len)
                    batch_bert_ids_b.append(indices_b)
                    batch_bert_seg_ids_b.append(segments_b)

            if labels is not None:
                label_ids = self.label_vocab.get(labels[i],
                                                 self.get_unk_label_id())
                label_ids = to_categorical(label_ids,
                                           self.num_class).astype(int)
                batch_label_ids.append(label_ids)

        features_a, features_b = [], []
        if self.use_bert_model:
            features_a.append(self.pad_sequence(batch_bert_ids_a))
            features_a.append(self.pad_sequence(batch_bert_seg_ids_a))

        elif self.use_word:
            features_a.append(self.pad_sequence(batch_word_ids_a))
            features_b.append(self.pad_sequence(batch_word_ids_b))
            if self.use_char:
                features_a.append(
                    pad_sequences_2d(batch_char_ids_a,
                                     max_len_1=self.max_len,
                                     max_len_2=self.max_word_len,
                                     padding=self.padding_mode,
                                     truncating=self.truncating_mode))
                features_b.append(
                    pad_sequences_2d(batch_char_ids_b,
                                     max_len_1=self.max_len,
                                     max_len_2=self.max_word_len,
                                     padding=self.padding_mode,
                                     truncating=self.truncating_mode))

        else:
            if self.use_char:
                features_a.append(self.pad_sequence(batch_char_ids_a))
                features_b.append(self.pad_sequence(batch_char_ids_b))
            if self.use_bert:
                features_a.append(self.pad_sequence(batch_bert_ids_a))
                features_b.append(self.pad_sequence(batch_bert_ids_b))
                features_a.append(self.pad_sequence(batch_bert_seg_ids_a))
                features_b.append(self.pad_sequence(batch_bert_seg_ids_b))

        if len(features_a) == 1:
            features = [features_a[0], features_b[0]]
        else:
            features = features_a + features_b

        if not batch_label_ids:
            return features, None
        else:
            y = np.asarray(batch_label_ids)
            return features, y
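
In the use_word branch above, char ids are nested inside word boundaries, which is why the char features need 2-D padding. A small illustration, using the pad_sequences_2d sketch from earlier on this page (the toy char_vocab is made up, and jieba's segmentation may vary):

import jieba

char_vocab = {'<UNK>': 0, '我': 1, '爱': 2, '中': 3, '国': 4}

text = '我爱中国'
words = jieba.lcut(text)                  # e.g. ['我', '爱', '中国']
char_ids = [[char_vocab.get(c, char_vocab['<UNK>']) for c in word]
            for word in words]            # [[1], [2], [3, 4]]

# Pad to (1, max_len, max_word_len), here (1, 5, 4):
padded = pad_sequences_2d([char_ids], max_len_1=5, max_len_2=4)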
Example #3
    def prepare_input(self,
                      data: List[List[str]],
                      labels: Optional[List[List[str]]] = None
                      ) -> Tuple[np.ndarray, Any]:
        """Prepare input (features and labels) for NER model.
        Besides character embeddings (or BERT embeddings) as the main input, word
        embeddings and other hand-crafted feature embeddings are supported as
        additional inputs.

        Args:
            data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            labels: Optional List of List of str, can be None. The labels of train_data,
                usually in BIO or BIOES format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.

        Returns:
            features: id matrix
            y: label id matrix if labels is provided, otherwise None

        """
        batch_char_ids, batch_bert_ids, batch_bert_seg_ids, batch_word_ids = [], [], [], []
        batch_label_ids = []
        for i, char_text in enumerate(data):
            if self.use_char:
                if self.use_bert:
                    text_for_char_input = [self.cls_token] + char_text + [self.seq_token]
                else:
                    text_for_char_input = char_text
                char_ids = [
                    self.char_vocab.get(token, self.char_vocab[self.unk_token])
                    for token in text_for_char_input
                ]
                batch_char_ids.append(char_ids)

            if self.use_bert:
                indices, segments = self.bert_tokenizer.encode(
                    first_text=''.join(char_text), max_length=self.max_len)
                batch_bert_ids.append(indices)
                batch_bert_seg_ids.append(segments)

            if self.use_word:
                word_text = jieba.lcut(''.join(char_text))
                word_ids = self.get_word_ids(word_text)
                batch_word_ids.append(word_ids)

            if labels is not None:
                if self.use_bert:
                    # Wrap the labels with the same special tokens added to the
                    # char input so label and token positions stay aligned.
                    label_str = [self.cls_token] + labels[i] + [self.seq_token]
                else:
                    label_str = labels[i]
                label_ids = [
                    self.label_vocab.get(l, self.get_unk_label_id())
                    for l in label_str
                ]
                label_ids = tf.keras.utils.to_categorical(
                    label_ids, self.num_class).astype(int)
                batch_label_ids.append(label_ids)

        features = []
        if self.use_char:
            features.append(self.pad_sequence(batch_char_ids))
        if self.use_bert:
            features.append(self.pad_sequence(batch_bert_ids))
            features.append(self.pad_sequence(batch_bert_seg_ids))
        if self.use_word:
            features.append(self.pad_sequence(batch_word_ids))

        if len(features) == 1:
            features = features[0]

        if not batch_label_ids:
            return features, None
        else:
            y = pad_sequences_2d(batch_label_ids,
                                 max_len_1=self.max_len,
                                 max_len_2=self.num_class,
                                 padding=self.padding_mode,
                                 truncating=self.truncating_mode)
            return features, y
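
To make the label side concrete, here is a hedged sketch of the NER label pipeline used above: label strings to ids, ids to one-hot rows, then 2-D padding (label_vocab below is a toy assumption; pad_sequences_2d is the sketch from earlier on this page):

import tensorflow as tf

label_vocab = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
num_class = len(label_vocab)

labels = [['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ['O', 'B-LOC']]
batch_label_ids = [
    tf.keras.utils.to_categorical(
        [label_vocab[l] for l in seq], num_class).astype(int)
    for seq in labels
]

# Pad to (batch_size, max_len, num_class), matching the returned y.
y = pad_sequences_2d(batch_label_ids, max_len_1=8, max_len_2=num_class)
print(y.shape)  # (2, 8, 3)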