Example #1
    def __init__(self,
                 train_data,
                 train_labels,
                 min_count=2,
                 use_word=False,
                 use_char=True,
                 use_bert=False,
                 use_bert_model=False,
                 external_word_dict=None,
                 bert_vocab_file=None,
                 word_embed_type=None,
                 word_embed_dim=300,
                 char_embed_type=None,
                 char_embed_dim=300,
                 label_dict_file=None,
                 max_len=None,
                 max_word_len=None,
                 padding_mode='post',
                 truncating_mode='post'):
        """

        Args:
            train_data: a list of untokenized text pairs
            train_labels: list of str, train_data's labels
            min_count: int, tokens whose frequency is lower than min_count will be ignored
            use_word: whether to use word embedding as input
            use_char: whether to use char embedding as input
            use_bert: whether to use bert embedding as input
            use_bert_model: boolean, whether to use traditional bert model which combines two
                            sentences as one input
            word_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, glove, fasttext)
            word_embed_dim: dimensionality of word embedding
            char_embed_type: same as word_embed_type, only apply when use_char is True
            char_embed_dim: dimensionality of char embedding
            external_word_dict: external word dictionary, only apply when use_word is True
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            label_dict_file: a file with two tab-separated columns; the first column is the raw
                             label name, and the second column is the corresponding
                             human-readable name
            max_len: int, max sequence length
            max_word_len: int, max word length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(SPMPreprocessor, self).__init__(max_len, padding_mode,
                                              truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_word = use_word
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_bert_model = use_bert_model
        self.external_word_dict = external_word_dict
        self.word_embed_type = word_embed_type
        self.char_embed_type = char_embed_type
        self.max_word_len = max_word_len

        self.label_dict = self.load_label_dict(label_dict_file)

        assert not (self.use_bert_model and (self.use_word or self.use_char)), \
            "bert model can not add word or char embedding as additional input"
        assert not (self.use_bert_model
                    and not use_bert), "bert model must use bert embedding"
        assert self.use_word or self.use_char or self.use_bert, "must use word or char or bert " \
                                                                "embedding as main input"
        assert not (self.use_word and self.use_bert), "bert embedding can not be used with word " \
                                                      "embedding"
        special_token = 'bert' if self.use_bert else 'standard'

        train_data_a, train_data_b = self.train_data
        train_data = list(chain(*zip(train_data_a, train_data_b)))

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            word_corpus = self.build_corpus(train_data,
                                            cut_func=lambda x: jieba.lcut(x))
            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim

            if self.max_len is None:
                self.max_len = get_len_from_corpus(word_corpus)
            if self.use_char and self.max_word_len is None:
                self.max_word_len = get_len_from_corpus(
                    list(chain(*word_corpus)))
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        train_data = [list(text) for text in train_data]

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, train_data, char_embed_dim,
                special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
            if self.max_len is None:
                self.max_len = get_len_from_corpus(train_data)
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            self.bert_vocab = {}
            with codecs.open(bert_vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    self.bert_vocab[token] = len(self.bert_vocab)
            self.bert_tokenizer = ChineseBertTokenizer(self.bert_vocab)

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert_model and self.max_len is None:
            # max_len should be provided when using the bert model!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus([
                list(a) + list(b) for a, b in zip(train_data_a, train_data_b)
            ])
        elif not self.use_word and self.use_bert and self.max_len is None:
            # max_len should be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(train_data)

        if self.use_bert:
            # max length is 512 for bert
            self.max_len = min(self.max_len, 512)

    def __init__(self,
                 train_data,
                 train_labels,
                 min_count=2,
                 use_char=True,
                 use_bert=False,
                 use_word=False,
                 external_word_dict=None,
                 label_dict_file=None,
                 bert_vocab_file=None,
                 char_embed_type=None,
                 char_embed_dim=300,
                 word_embed_type=None,
                 word_embed_dim=300,
                 max_len=None,
                 padding_mode='post',
                 truncating_mode='post'):
        """

        Args:
            train_data: a list of tokenized (in char level) texts
            train_labels: list of str, train_data's labels
            min_count: int, tokens whose frequency is lower than min_count will be ignored
            use_char: whether to use char embedding as input
            use_bert: whether to use bert embedding as input
            use_word: whether to use word embedding as additional input
            external_word_dict: external word dictionary, only apply when use_word is True
            label_dict_file: a file with two tab-separated columns; the first column is the raw
                             label name, and the second column is the corresponding
                             human-readable name
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            char_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, glove, fasttext)
            char_embed_dim: dimensionality of char embedding
            word_embed_type: same as char_embed_type, only apply when use_word is True
            word_embed_dim: dimensionality of word embedding
            max_len: int, max sequence length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(TextClassificationPreprocessor,
              self).__init__(max_len, padding_mode, truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_word = use_word
        self.external_word_dict = external_word_dict
        self.char_embed_type = char_embed_type
        self.word_embed_type = word_embed_type

        self.label_dict = self.load_label_dict(label_dict_file)

        assert self.use_char or self.use_bert, "must use char or bert embedding as main input"
        special_token = 'bert' if self.use_bert else 'standard'

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(self.train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, self.train_data,
                char_embed_dim, special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            # lower-case non-Chinese characters
            self.bert_tokenizer = ChineseBertTokenizer(bert_vocab_file)

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            untokenized_texts = [''.join(text) for text in self.train_data]
            word_corpus = self.build_corpus(untokenized_texts,
                                            cut_func=lambda x: jieba.lcut(x))

            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert and self.max_len is None:
            # max_len must be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(self.train_data)

            # make sure max_len is no longer than bert's max length (512);
            # since there are 2 more special tokens (<CLS> and <SEQ>), add 2
            self.max_len = min(self.max_len + 2, 512)
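
For reference, here is a minimal sketch (not part of the original example) of the label_dict_file format that both constructors above accept: a plain-text file whose two tab-separated columns map each raw label name to a human-readable one. The file name and label names below are illustrative assumptions.

import codecs

# write a hypothetical two-column mapping: raw label name -> meaningful name
with codecs.open('label_dict.txt', 'w', 'utf8') as writer:
    writer.write('0\tnegative\n')
    writer.write('1\tpositive\n')

# Passing label_dict_file='label_dict.txt' to either constructor fills self.label_dict with
# {'0': 'negative', '1': 'positive'}, which the label_decode() method shown in the next example
# can use to rename predicted labels.
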
Example #3
class SPMPreprocessor(Preprocessor):
    """SPM preprocessor.
    """
    def __init__(self,
                 train_data,
                 train_labels,
                 min_count=2,
                 use_word=False,
                 use_char=True,
                 use_bert=False,
                 use_bert_model=False,
                 external_word_dict=None,
                 bert_vocab_file=None,
                 word_embed_type=None,
                 word_embed_dim=300,
                 char_embed_type=None,
                 char_embed_dim=300,
                 label_dict_file=None,
                 max_len=None,
                 max_word_len=None,
                 padding_mode='post',
                 truncating_mode='post'):
        """

        Args:
            train_data: a list of untokenized text pairs
            train_labels: list of str, train_data's labels
            min_count: int, tokens whose frequency is lower than min_count will be ignored
            use_word: whether to use word embedding as input
            use_char: whether to use char embedding as input
            use_bert: whether to use bert embedding as input
            use_bert_model: boolean, whether to use traditional bert model which combines two
                            sentences as one input
            word_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, glove, fasttext)
            word_embed_dim: dimensionality of word embedding
            char_embed_type: same as word_embed_type, only apply when use_char is True
            char_embed_dim: dimensionality of char embedding
            external_word_dict: external word dictionary, only apply when use_word is True
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            label_dict_file: a file with two tab-separated columns; the first column is the raw
                             label name, and the second column is the corresponding
                             human-readable name
            max_len: int, max sequence length
            max_word_len: int, max word length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(SPMPreprocessor, self).__init__(max_len, padding_mode,
                                              truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_word = use_word
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_bert_model = use_bert_model
        self.external_word_dict = external_word_dict
        self.word_embed_type = word_embed_type
        self.char_embed_type = char_embed_type
        self.max_word_len = max_word_len

        self.label_dict = self.load_label_dict(label_dict_file)

        assert not (self.use_bert_model and (self.use_word or self.use_char)), \
            "bert model can not add word or char embedding as additional input"
        assert not (self.use_bert_model
                    and not use_bert), "bert model must use bert embedding"
        assert self.use_word or self.use_char or self.use_bert, "must use word or char or bert " \
                                                                "embedding as main input"
        assert not (self.use_word and self.use_bert), "bert embedding can not be used with word " \
                                                      "embedding"
        special_token = 'bert' if self.use_bert else 'standard'

        train_data_a, train_data_b = self.train_data
        train_data = list(chain(*zip(train_data_a, train_data_b)))

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            word_corpus = self.build_corpus(train_data,
                                            cut_func=lambda x: jieba.lcut(x))
            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim

            if self.max_len is None:
                self.max_len = get_len_from_corpus(word_corpus)
            if self.use_char and self.max_word_len is None:
                self.max_word_len = get_len_from_corpus(
                    list(chain(*word_corpus)))
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        train_data = [list(text) for text in train_data]

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, train_data, char_embed_dim,
                special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
            if self.max_len is None:
                self.max_len = get_len_from_corpus(train_data)
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            self.bert_vocab = {}
            with codecs.open(bert_vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    self.bert_vocab[token] = len(self.bert_vocab)
            self.bert_tokenizer = ChineseBertTokenizer(self.bert_vocab)

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert_model and self.max_len is None:
            # max_len should be provided when using the bert model!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus([
                list(a) + list(b) for a, b in zip(train_data_a, train_data_b)
            ])
        elif not self.use_word and self.use_bert and self.max_len is None:
            # max_len should be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(train_data)

        if self.use_bert:
            # max length is 512 for bert
            self.max_len = min(self.max_len, 512)

    def load_word_dict(self):
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000)

    @staticmethod
    def load_label_dict(label_dict_file):
        result_dict = dict()
        if label_dict_file:
            with codecs.open(label_dict_file,
                             encoding='utf-8') as f_label_dict:
                for line in f_label_dict:
                    line_items = line.strip().split('\t')
                    result_dict[line_items[0]] = line_items[1]
            return result_dict
        else:
            return None

    def build_label_vocab(self, labels):
        """Build label vocabulary

        Args:
            labels: list of str, the label strings
        """
        label_count = {}
        for label in labels:
            label_count[label] = label_count.get(label, 0) + 1

        # sorted by frequency, so that the label with the highest frequency will be given
        # id of 0, which is the default id for unknown labels
        sorted_label_count = sorted(label_count.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        sorted_label_count = dict(sorted_label_count)

        label_vocab = {}
        for label in sorted_label_count:
            label_vocab[label] = len(label_vocab)

        id2label = dict((idx, label) for label, idx in label_vocab.items())

        logging.info('Build label vocabulary finished, '
                     'vocabulary size: {}'.format(len(label_vocab)))
        return label_vocab, id2label

    def prepare_input(self, data, labels=None):
        """Prepare input (features and labels) for SPM model.
        Here we not only use character embeddings (or bert embeddings) as main input, but also
        support word embeddings and other hand-crafted features embeddings as additional input.

        Args:
            data: list of text pairs, like ``[['我是中国人', ...], ['我爱中国', ...]]``
            labels: list of str, the corresponding label strings

        Returns:
            features: id matrix
            y: label id matrix (only if labels is provided)

        """
        batch_word_ids_a, batch_char_ids_a, batch_bert_ids_a, batch_bert_seg_ids_a = \
            [], [], [], []
        batch_word_ids_b, batch_char_ids_b, batch_bert_ids_b, batch_bert_seg_ids_b = \
            [], [], [], []
        batch_label_ids = []

        for i, (text_a, text_b) in enumerate(zip(data[0], data[1])):
            if self.use_bert_model:
                indices, segments = self.bert_tokenizer.encode(
                    first=text_a, second=text_b, max_len=self.max_len)
                batch_bert_ids_a.append(indices)
                batch_bert_seg_ids_a.append(segments)

            elif self.use_word:
                word_text_a = jieba.lcut(text_a)
                word_text_b = jieba.lcut(text_b)
                word_ids_a = self.get_word_ids(word_text_a)
                batch_word_ids_a.append(word_ids_a)
                word_ids_b = self.get_word_ids(word_text_b)
                batch_word_ids_b.append(word_ids_b)

                if self.use_char:
                    word_text_a = [list(word) for word in word_text_a]
                    word_text_b = [list(word) for word in word_text_b]
                    char_ids_a = [[
                        self.char_vocab.get(char,
                                            self.char_vocab[self.unk_token])
                        for char in token
                    ] for token in word_text_a]
                    char_ids_b = [[
                        self.char_vocab.get(char,
                                            self.char_vocab[self.unk_token])
                        for char in token
                    ] for token in word_text_b]
                    batch_char_ids_a.append(char_ids_a)
                    batch_char_ids_b.append(char_ids_b)

            else:
                text_a = list(text_a)
                text_b = list(text_b)

                if self.use_char:
                    char_text_a = [self.cls_token] + text_a + [self.seq_token] if self.use_bert \
                        else text_a
                    char_text_b = [self.cls_token] + text_b + [self.seq_token] if self.use_bert \
                        else text_b
                    char_ids_a = [
                        self.char_vocab.get(token,
                                            self.char_vocab[self.unk_token])
                        for token in char_text_a
                    ]
                    batch_char_ids_a.append(char_ids_a)
                    char_ids_b = [
                        self.char_vocab.get(token,
                                            self.char_vocab[self.unk_token])
                        for token in char_text_b
                    ]
                    batch_char_ids_b.append(char_ids_b)

                if self.use_bert:
                    indices_a, segments_a = self.bert_tokenizer.encode(
                        first=''.join(text_a), max_len=self.max_len)
                    batch_bert_ids_a.append(indices_a)
                    batch_bert_seg_ids_a.append(segments_a)

                    indices_b, segments_b = self.bert_tokenizer.encode(
                        first=''.join(text_b), max_len=self.max_len)
                    batch_bert_ids_b.append(indices_b)
                    batch_bert_seg_ids_b.append(segments_b)

            if labels is not None:
                label_ids = self.label_vocab.get(labels[i],
                                                 self.get_unk_label_id())
                label_ids = to_categorical(label_ids,
                                           self.num_class).astype(int)
                batch_label_ids.append(label_ids)

        features_a, features_b = [], []
        if self.use_bert_model:
            features_a.append(self.pad_sequence(batch_bert_ids_a))
            features_a.append(self.pad_sequence(batch_bert_seg_ids_a))

        elif self.use_word:
            features_a.append(self.pad_sequence(batch_word_ids_a))
            features_b.append(self.pad_sequence(batch_word_ids_b))
            if self.use_char:
                features_a.append(
                    pad_sequences_2d(batch_char_ids_a,
                                     max_len_1=self.max_len,
                                     max_len_2=self.max_word_len,
                                     padding=self.padding_mode,
                                     truncating=self.truncating_mode))
                features_b.append(
                    pad_sequences_2d(batch_char_ids_b,
                                     max_len_1=self.max_len,
                                     max_len_2=self.max_word_len,
                                     padding=self.padding_mode,
                                     truncating=self.truncating_mode))

        else:
            if self.use_char:
                features_a.append(self.pad_sequence(batch_char_ids_a))
                features_b.append(self.pad_sequence(batch_char_ids_b))
            if self.use_bert:
                features_a.append(self.pad_sequence(batch_bert_ids_a))
                features_b.append(self.pad_sequence(batch_bert_ids_b))
                features_a.append(self.pad_sequence(batch_bert_seg_ids_a))
                features_b.append(self.pad_sequence(batch_bert_seg_ids_b))

        if len(features_a) == 1:
            features = [features_a[0], features_b[0]]
        else:
            features = features_a + features_b

        if not batch_label_ids:
            return features, None
        else:
            y = np.asarray(batch_label_ids)
            return features, y

    def get_word_ids(self, word_cut):
        """Given a word-level tokenized text, return the corresponding word ids.

        Args:
            word_cut: list of str, like ['我', '是', '中国人']; words that do not appear in the
                      vocabulary are mapped to the unk token id

        Returns: list of int, id sequence

        """
        word_ids = []
        for word in word_cut:
            word_ids.append(
                self.word_vocab.get(word, self.word_vocab[self.unk_token]))
        return word_ids

    def label_decode(self, pred_probs, label_dict=None):
        pred_ids = np.argmax(pred_probs, axis=-1)
        pred_labels = [self.id2label[pred_id] for pred_id in pred_ids]
        if label_dict:
            pred_labels = [label_dict[raw_label] for raw_label in pred_labels]
        return pred_labels

    def get_unk_label_id(self):
        """return a default id for label that does not exist in the label vocab

        Args:
            label: str

        Returns: int

        """
        if 'O' in self.label_vocab:
            return self.label_vocab['O']
        elif 'o' in self.label_vocab:
            return self.label_vocab['o']
        else:
            return 0

    def save(self, preprocessor_file):
        pickle.dump(self, open(preprocessor_file, 'wb'))

    @classmethod
    def load(cls, preprocessor_file):
        p = pickle.load(open(preprocessor_file, 'rb'))
        p.load_word_dict()  # reload external word dict into jieba
        return p
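
# --- Usage sketch (not part of the original example) ---
# A minimal, hypothetical use of the SPMPreprocessor defined above: the sample sentence pairs
# and labels are assumptions. train_data is a pair of untokenized text lists, and prepare_input
# expects the same pair layout. With this char-only configuration the returned features are
# [char_ids_a, char_ids_b] and y is a one-hot label matrix.
texts_a = ['我是中国人', '今天天气不错']
texts_b = ['我爱中国', '今天下雨了']
pair_labels = ['match', 'mismatch']

spm_preprocessor = SPMPreprocessor(train_data=(texts_a, texts_b),
                                   train_labels=pair_labels,
                                   min_count=1,
                                   use_char=True)
spm_features, spm_y = spm_preprocessor.prepare_input((texts_a, texts_b), pair_labels)
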
class TextClassificationPreprocessor(Preprocessor):
    """Text Classification preprocessor.
    """
    def __init__(self,
                 train_data,
                 train_labels,
                 min_count=2,
                 use_char=True,
                 use_bert=False,
                 use_word=False,
                 external_word_dict=None,
                 label_dict_file=None,
                 bert_vocab_file=None,
                 char_embed_type=None,
                 char_embed_dim=300,
                 word_embed_type=None,
                 word_embed_dim=300,
                 max_len=None,
                 padding_mode='post',
                 truncating_mode='post'):
        """

        Args:
            train_data: a list of tokenized (in char level) texts
            train_labels: list of str, train_data's labels
            min_count: int, tokens whose frequency is lower than min_count will be ignored
            use_char: whether to use char embedding as input
            use_bert: whether to use bert embedding as input
            use_word: whether to use word embedding as additional input
            external_word_dict: external word dictionary, only apply when use_word is True
            label_dict_file: a file with two tab-separated columns; the first column is the raw
                             label name, and the second column is the corresponding
                             human-readable name
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            char_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, glove, fasttext)
            char_embed_dim: dimensionality of char embedding
            word_embed_type: same as char_embed_type, only apply when use_word is True
            word_embed_dim: dimensionality of word embedding
            max_len: int, max sequence length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(TextClassificationPreprocessor,
              self).__init__(max_len, padding_mode, truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_word = use_word
        self.external_word_dict = external_word_dict
        self.char_embed_type = char_embed_type
        self.word_embed_type = word_embed_type

        self.label_dict = self.load_label_dict(label_dict_file)

        assert self.use_char or self.use_bert, "must use char or bert embedding as main input"
        special_token = 'bert' if self.use_bert else 'standard'

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(self.train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, self.train_data,
                char_embed_dim, special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            # lower-case non-Chinese characters
            self.bert_tokenizer = ChineseBertTokenizer(bert_vocab_file)

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            untokenized_texts = [''.join(text) for text in self.train_data]
            word_corpus = self.build_corpus(untokenized_texts,
                                            cut_func=lambda x: jieba.lcut(x))

            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert and self.max_len is None:
            # max_len must be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(self.train_data)

            # make sure max_len is no longer than bert's max length (512);
            # since there are 2 more special tokens (<CLS> and <SEQ>), add 2
            self.max_len = min(self.max_len + 2, 512)

    def load_word_dict(self):
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000)

    @staticmethod
    def load_label_dict(label_dict_file):
        result_dict = dict()
        if label_dict_file:
            with codecs.open(label_dict_file,
                             encoding='utf-8') as f_label_dict:
                for line in f_label_dict:
                    line_items = line.strip().split('\t')
                    result_dict[line_items[0]] = line_items[1]
            return result_dict
        else:
            return None

    def build_label_vocab(self, labels):
        """Build label vocabulary

        Args:
            labels: list of str, the label strings
        """
        label_count = {}
        for label in labels:
            label_count[label] = label_count.get(label, 0) + 1

        # sorted by frequency, so that the label with the highest frequency will be given
        # id of 0, which is the default id for unknown labels
        sorted_label_count = sorted(label_count.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        sorted_label_count = dict(sorted_label_count)

        label_vocab = {}
        for label in sorted_label_count:
            label_vocab[label] = len(label_vocab)

        id2label = dict((idx, label) for label, idx in label_vocab.items())

        logging.info('Build label vocabulary finished, '
                     'vocabulary size: {}'.format(len(label_vocab)))
        return label_vocab, id2label

    def prepare_input(self, data, labels=None):
        """Prepare input (features and labels) for text classification model.
        Here we not only use character embeddings (or bert embeddings) as main input, but also
        support word embeddings and other hand-crafted features embeddings as additional input.

        Args:
            data: list of tokenized (in char level) texts, like ``[['我', '是', '中', '国', '人']]``
            labels: list of str, the corresponding label strings

        Returns:
            features: id matrix
            y: label id matrix (only if labels is provided)

        """
        batch_char_ids, batch_bert_ids, batch_bert_seg_ids, batch_word_ids = [], [], [], []
        batch_label_ids = []
        for i, char_text in enumerate(data):
            if self.use_char:
                if self.use_bert:
                    text_for_char_input = [self.cls_token
                                           ] + char_text + [self.seq_token]
                else:
                    text_for_char_input = char_text
                char_ids = [
                    self.char_vocab.get(token, self.char_vocab[self.unk_token])
                    for token in text_for_char_input
                ]
                batch_char_ids.append(char_ids)

            if self.use_bert:
                indices, segments = self.bert_tokenizer.encode(
                    first_text=''.join(char_text), max_length=self.max_len)
                batch_bert_ids.append(indices)
                batch_bert_seg_ids.append(segments)

            if self.use_word:
                word_text = jieba.lcut(''.join(char_text))
                word_ids = self.get_word_ids(word_text)
                batch_word_ids.append(word_ids)

        if labels is not None:
            batch_label_ids = [self.label_vocab.get(l, 0) for l in labels]
            batch_label_ids = tf.keras.utils.to_categorical(
                batch_label_ids, self.num_class).astype(int)

        features = []
        if self.use_char:
            features.append(self.pad_sequence(batch_char_ids))
        if self.use_bert:
            features.append(self.pad_sequence(batch_bert_ids))
            features.append(self.pad_sequence(batch_bert_seg_ids))
        if self.use_word:
            features.append(self.pad_sequence(batch_word_ids))

        if len(features) == 1:
            features = features[0]

        if not list(batch_label_ids):
            return features, None
        else:
            y = batch_label_ids
            return features, y

    def get_word_ids(self, word_cut):
        """Given a word-level tokenized text, return the corresponding word ids in char-level
           sequence. We add the same word id to each character in the word.

        Args:
            word_cut: list of str, like ['我', '是', '中国人']; words that do not appear in the
                      vocabulary are mapped to the unk token id

        Returns: list of int, id sequence

        """
        word_ids = []
        for word in word_cut:
            for _ in word:
                word_ids.append(
                    self.word_vocab.get(word, self.word_vocab[self.unk_token]))
        if self.use_bert:
            word_ids = [self.word_vocab[self.cls_token]] + word_ids + \
                       [self.word_vocab[self.seq_token]]
        return word_ids

    def label_decode(self, pred_probs, label_dict=None):
        pred_ids = np.argmax(pred_probs, axis=-1)
        pred_labels = [self.id2label[pred_id] for pred_id in pred_ids]
        if label_dict:
            pred_labels = [label_dict[raw_label] for raw_label in pred_labels]
        return pred_labels

    def save(self, preprocessor_file):
        pickle.dump(self, open(preprocessor_file, 'wb'))

    @classmethod
    def load(cls, preprocessor_file):
        p = pickle.load(open(preprocessor_file, 'rb'))
        p.load_word_dict()  # reload external word dict into jieba
        return p
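
The sketch below is not part of the original examples; it shows one way the TextClassificationPreprocessor above might be used end to end, assuming the class is importable (the module path, file name, and sample data are hypothetical): build features from char-tokenized texts, then persist and restore the fitted preprocessor with its pickle-based save/load helpers.

from text_classification_preprocessor import TextClassificationPreprocessor  # hypothetical path

train_texts = [list('我爱自然语言处理'), list('今天天气不错')]  # char-level tokenized texts
train_labels = ['tech', 'weather']

tc_preprocessor = TextClassificationPreprocessor(train_data=train_texts,
                                                 train_labels=train_labels,
                                                 min_count=1,
                                                 use_char=True)
features, y = tc_preprocessor.prepare_input(train_texts, train_labels)

tc_preprocessor.save('tc_preprocessor.pkl')  # pickles the whole preprocessor object
restored = TextClassificationPreprocessor.load('tc_preprocessor.pkl')  # re-registers external words in jieba
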
Example #5
    def __init__(self,
                 train_data: Tuple[List[str], List[str]],
                 train_labels: List[str],
                 min_count: int = 2,
                 use_word: bool = False,
                 use_char: bool = True,
                 use_bert: bool = False,
                 use_bert_model: bool = False,
                 external_word_dict: Optional[List[str]] = None,
                 bert_vocab_file: Optional[str] = None,
                 word_embed_type: Optional[str] = None,
                 word_embed_dim: int = 300,
                 char_embed_type: Optional[str] = None,
                 char_embed_dim: int = 300,
                 max_len: Optional[int] = None,
                 max_word_len: Optional[int] = None,
                 padding_mode: str = 'post',
                 truncating_mode: str = 'post') -> None:
        """

        Args:
            train_data: a list of untokenized text pairs
            train_labels: list of str, train_data's labels
            min_count: int, tokens whose frequency is lower than min_count will be ignored
            use_word: boolean, whether to use word embedding as input
            use_char: boolean, whether to use char embedding as input
            use_bert: boolean, whether to use bert embedding as input
            use_bert_model: boolean, whether to use traditional bert model which combines two
                            sentences as one input
            word_embed_type: str, can be a pre-trained embedding filename or pre-trained embedding
                             methods (word2vec, glove, fasttext)
            word_embed_dim: int, dimensionality of word embedding
            char_embed_type: str, same as word_embed_type, only apply when use_char is True
            char_embed_dim: int, dimensionality of char embedding
            external_word_dict: external word dictionary, only apply when use_word is True
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            max_len: int, max sequence length
            max_word_len: int, max word length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(SPMPreprocessor, self).__init__(max_len, padding_mode,
                                              truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_word = use_word
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_bert_model = use_bert_model
        self.external_word_dict = external_word_dict
        self.word_embed_type = word_embed_type
        self.char_embed_type = char_embed_type
        self.max_word_len = max_word_len

        assert not (self.use_bert_model and (self.use_word or self.use_char)), \
            "bert model can not add word or char embedding as additional input"
        assert not (self.use_bert_model
                    and not use_bert), "bert model must use bert embedding"
        assert self.use_word or self.use_char or self.use_bert, "must use word or char or bert " \
                                                                "embedding as main input"
        assert not (self.use_word and self.use_bert), "bert embedding can not be used with word " \
                                                      "embedding"
        special_token = 'bert' if self.use_bert else 'standard'

        train_data_a, train_data_b = self.train_data
        train_data = list(chain(*zip(train_data_a, train_data_b)))

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            word_corpus = self.build_corpus(train_data,
                                            cut_func=lambda x: jieba.lcut(x))
            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim

            if self.max_len is None:
                self.max_len = get_len_from_corpus(word_corpus)
            if self.use_char and self.max_word_len is None:
                self.max_word_len = get_len_from_corpus(
                    list(chain(*word_corpus)))
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        train_data = [list(text) for text in train_data]

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, train_data, char_embed_dim,
                special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
            if self.max_len is None:
                self.max_len = get_len_from_corpus(train_data)
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            # lower-case non-Chinese characters
            self.bert_tokenizer = ChineseBertTokenizer(bert_vocab_file,
                                                       do_lower_case=True)

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert_model and self.max_len is None:
            # max_len should be provided when using the bert model!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus([
                list(a) + list(b) for a, b in zip(train_data_a, train_data_b)
            ])
            self.max_len += 3  # consider 3 more special tokens: <CLS> <SEQ> <SEQ>
        elif not self.use_word and self.use_bert and self.max_len is None:
            # max_len should be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(train_data)
            self.max_len += 2  # consider 2 more special tokens: <CLS> <SEQ>

        if self.use_bert:
            # max length is 512 for bert
            self.max_len = min(self.max_len, 512)

    def __init__(self,
                 train_data: List[List[str]],
                 train_labels: List[List[str]],
                 min_count: int = 2,
                 use_char: bool = True,
                 use_bert: bool = False,
                 use_word: bool = False,
                 external_word_dict: Optional[List[str]] = None,
                 bert_vocab_file: Optional[str] = None,
                 char_embed_type: Optional[str] = None,
                 char_embed_dim: int = 300,
                 word_embed_type: Optional[str] = None,
                 word_embed_dim: int = 300,
                 max_len: Optional[int] = None,
                 padding_mode: str = 'post',
                 truncating_mode: str = 'post') -> None:
        """Build vocabulary, pre-trained embedding.

        Args:
            train_data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            train_labels: List of List of str. The labels of train_data, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
            min_count: int. Tokens whose frequency is lower than min_count will be ignored when
                building the vocabulary.
            use_char: Boolean. Whether to use character embedding as input.
            use_bert: Boolean. Whether to use bert embedding as input.
            use_word: Boolean. Whether to use word embedding as additional input.
            external_word_dict: Optional List of str, can be None. List of words, an external word
                dictionary that will be loaded into jieba. It can be regarded as a kind of
                gazetteer that contains a number of known named entities,
                such as ``['南京市', '长江大桥']``. Only applied when use_word is True.
            bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
            char_embed_type: Optional str, can be None. The type of char embedding: either a
                pre-trained embedding filename that is used to load a pre-trained embedding,
                or an embedding training method (one of {'word2vec', 'fasttext'}) that is used to
                train the character embedding on the dataset. If None, no pre-trained embedding
                is applied and a randomly initialized embedding is used instead.
            char_embed_dim: int. Dimensionality of char embedding.
            word_embed_type: str. Same as char_embed_type, only applied when use_word is True.
            word_embed_dim: int. Dimensionality of word embedding
            max_len: Optional int, can be None. Max length of one sequence. If None, we dynamically
                use the max length of each batch as max_len. However, max_len must be provided
                when using bert as input.
            padding_mode: str. 'pre' or 'post': pad either before or after each sequence, used when
                preparing feature input for ner model.
            truncating_mode: str. 'pre' or 'post': remove values from sequences larger than
                `max_len`, either at the beginning or at the end of the sequences, used when
                preparing feature input for ner model.
        """
        super(NERPreprocessor, self).__init__(max_len, padding_mode,
                                              truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_word = use_word
        self.external_word_dict = external_word_dict
        self.char_embed_type = char_embed_type
        self.word_embed_type = word_embed_type

        assert self.use_char or self.use_bert, "must use char or bert embedding as main input"
        special_token = 'bert' if self.use_bert else 'standard'

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(self.train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, self.train_data,
                char_embed_dim, special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            # lower-case non-Chinese characters
            self.bert_tokenizer = ChineseBertTokenizer(bert_vocab_file,
                                                       do_lower_case=True)

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            untokenized_texts = [''.join(text) for text in self.train_data]
            word_corpus = self.build_corpus(untokenized_texts,
                                            cut_func=lambda x: jieba.lcut(x))

            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert and self.max_len is None:
            # max_len must be provided when using bert as input!
            # We will reset max_len from train_data when max_len is not provided.
            self.max_len = get_len_from_corpus(self.train_data)

            # make sure max_len is no longer than bert's max length (512);
            # since there are 2 more special tokens (<CLS> and <SEQ>), add 2
            self.max_len = min(self.max_len + 2, 512)

class NERPreprocessor(Preprocessor):
    """NER preprocessor, which is used to
    1) build all kinds of vocabularies (char, word, label) from training data;
    2) build pre-trained embedding matrices from the training corpus;
    3) prepare feature input for the ner model;
    4) decode model predictions into tagging sequences.
    """
    def __init__(self,
                 train_data: List[List[str]],
                 train_labels: List[List[str]],
                 min_count: int = 2,
                 use_char: bool = True,
                 use_bert: bool = False,
                 use_word: bool = False,
                 external_word_dict: Optional[List[str]] = None,
                 bert_vocab_file: Optional[str] = None,
                 char_embed_type: Optional[str] = None,
                 char_embed_dim: int = 300,
                 word_embed_type: Optional[str] = None,
                 word_embed_dim: int = 300,
                 max_len: Optional[int] = None,
                 padding_mode: str = 'post',
                 truncating_mode: str = 'post') -> None:
        """Build vocabulary, pre-trained embedding.

        Args:
            train_data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            train_labels: List of List of str. The labels of train_data, usually in BIO or BIOES
                format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.
            min_count: int. Tokens whose frequency is lower than min_count will be ignored when
                building the vocabulary.
            use_char: Boolean. Whether to use character embedding as input.
            use_bert: Boolean. Whether to use bert embedding as input.
            use_word: Boolean. Whether to use word embedding as additional input.
            external_word_dict: Optional List of str, can be None. List of words, an external word
                dictionary that will be loaded into jieba. It can be regarded as a kind of
                gazetteer that contains a number of correct named entities,
                such as ``['南京市', '长江大桥']``. Only applied when use_word is True.
            bert_vocab_file: Optional str, can be None. Path to bert's vocabulary file.
            char_embed_type: Optional str, can be None. The type of char embedding. It can be a
                pre-trained embedding filename that is used to load a pre-trained embedding,
                or an embedding training method (one of {'word2vec', 'fasttext'}) that is used to
                train character embeddings on the dataset. If None, do not apply any pre-trained
                embedding, and use a randomly initialized embedding instead.
            char_embed_dim: int. Dimensionality of char embedding.
            word_embed_type: Optional str, can be None. Same as char_embed_type, only applied when
                use_word is True.
            word_embed_dim: int. Dimensionality of word embedding.
            max_len: Optional int, can be None. Max length of one sequence. If None, we dynamically
                use the max length of each batch as max_len. However, max_len must be provided
                when using bert as input.
            padding_mode: str. 'pre' or 'post': pad either before or after each sequence, used when
                preparing feature input for ner model.
            truncating_mode: str. 'pre' or 'post': remove values from sequences larger than
                `max_len`, either at the beginning or at the end of the sequences, used when
                preparing feature input for ner model.
        """
        super(NERPreprocessor, self).__init__(max_len, padding_mode,
                                              truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_word = use_word
        self.external_word_dict = external_word_dict
        self.char_embed_type = char_embed_type
        self.word_embed_type = word_embed_type

        assert self.use_char or self.use_bert, "must use char or bert embedding as main input"
        special_token = 'bert' if self.use_bert else 'standard'

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(self.train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(
                char_embed_type, self.char_vocab, self.train_data,
                char_embed_dim, special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            # lower-case non-Chinese characters
            self.bert_tokenizer = ChineseBertTokenizer(bert_vocab_file,
                                                       do_lower_case=True)

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            untokenized_texts = [''.join(text) for text in self.train_data]
            word_corpus = self.build_corpus(untokenized_texts,
                                            cut_func=lambda x: jieba.lcut(x))

            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(
                word_embed_type, self.word_vocab, word_corpus, word_embed_dim,
                special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(
            self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert and self.max_len is None:
            # max_len should be provided when using bert as input;
            # if it is not provided, we derive it from train_data.
            self.max_len = get_len_from_corpus(self.train_data)

            # make sure max_len does not exceed bert's max sequence length (512),
            # adding 2 for the two special tokens <CLS> and <SEQ>
            self.max_len = min(self.max_len + 2, 512)
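
    # A minimal usage sketch (hypothetical toy data; bert disabled so that no
    # bert_vocab_file is required; this is not an example from the original source):
    #
    #   preprocessor = NERPreprocessor(
    #       train_data=[['我', '在', '上', '海', '上', '学']],
    #       train_labels=[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O']],
    #       min_count=1,
    #       use_char=True,
    #       use_bert=False)
    #   features, y = preprocessor.prepare_input(
    #       [['我', '在', '上', '海', '上', '学']],
    #       [['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O']])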

    def load_word_dict(self):
        """Load external word dictionary in jieba"""
        if self.external_word_dict:
            for word in self.external_word_dict:
                jieba.add_word(word, freq=1000000)
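
    # For example, with external_word_dict=['南京市', '长江大桥'], jieba is more
    # likely to keep '长江大桥' as one token when cutting '南京市长江大桥'
    # (a sketch; the exact segmentation depends on jieba's built-in dictionary
    # and version):
    #
    #   jieba.lcut('南京市长江大桥')  # e.g. ['南京市', '长江大桥']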

    def build_label_vocab(
            self,
            labels: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """Build label vocabulary.

        Args:
            labels: List of list of str, the label sequences, like
            ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.

        Returns:
            Tuple of 2 dict

        """
        label_count = {}
        for sequence in labels:
            for label in sequence:
                label_count[label] = label_count.get(label, 0) + 1

        # sorted by frequency, so that the label with the highest frequency will be given
        # id of 0, which is the default id for unknown labels
        sorted_label_count = sorted(label_count.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
        sorted_label_count = dict(sorted_label_count)

        label_vocab = {}
        for label in sorted_label_count:
            label_vocab[label] = len(label_vocab)

        id2label = dict((idx, label) for label, idx in label_vocab.items())

        logging.info('Build label vocabulary finished, '
                     'vocabulary size: {}'.format(len(label_vocab)))
        return label_vocab, id2label
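
    # A small sketch of the mapping built above: for labels like
    #   [['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O']]
    # 'O' is the most frequent label and therefore gets id 0, e.g.
    #   label_vocab == {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
    #   id2label    == {0: 'O', 1: 'B-LOC', 2: 'I-LOC'}
    # (labels with equal frequency may be ordered either way).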

    def prepare_input(self,
                      data: List[List[str]],
                      labels: Optional[List[List[str]]] = None
                      ) -> Tuple[np.ndarray, Any]:
        """Prepare input (features and labels) for NER model.
        Here we not only use character embeddings (or bert embeddings) as main input, but also
        support word embeddings and other hand-crafted features embeddings as additional input.

        Args:
            data: List of List of str. List of tokenized (in char level) texts for training,
                like ``[['我', '在', '上', '海', '上', '学'], ...]``.
            labels: Optional List of List of str, can be None. The labels of train_data, usually in
            BIO or BIOES format, like ``[['O', 'O', 'B-LOC', 'I-LOC', 'O', 'O'], ...]``.

        Returns: Tuple:
            features: id matrix
            y: label id matrix only if labels is provided, otherwise None,

        """
        batch_char_ids, batch_bert_ids, batch_bert_seg_ids, batch_word_ids = [], [], [], []
        batch_label_ids = []
        for i, char_text in enumerate(data):
            if self.use_char:
                if self.use_bert:
                    text_for_char_input = [self.cls_token
                                           ] + char_text + [self.seq_token]
                else:
                    text_for_char_input = char_text
                char_ids = [
                    self.char_vocab.get(token, self.char_vocab[self.unk_token])
                    for token in text_for_char_input
                ]
                batch_char_ids.append(char_ids)

            if self.use_bert:
                indices, segments = self.bert_tokenizer.encode(
                    first_text=''.join(char_text), max_length=self.max_len)
                batch_bert_ids.append(indices)
                batch_bert_seg_ids.append(segments)

            if self.use_word:
                word_text = jieba.lcut(''.join(char_text))
                word_ids = self.get_word_ids(word_text)
                batch_word_ids.append(word_ids)

            if labels is not None:
                if self.use_bert:
                    label_str = [self.cls_token] + labels[i] + [self.seq_token]
                else:
                    label_str = labels[i]
                label_ids = [
                    self.label_vocab.get(l, self.get_unk_label_id())
                    for l in label_str
                ]
                label_ids = tf.keras.utils.to_categorical(
                    label_ids, self.num_class).astype(int)
                batch_label_ids.append(label_ids)

        features = []
        if self.use_char:
            features.append(self.pad_sequence(batch_char_ids))
        if self.use_bert:
            features.append(self.pad_sequence(batch_bert_ids))
            features.append(self.pad_sequence(batch_bert_seg_ids))
        if self.use_word:
            features.append(self.pad_sequence(batch_word_ids))

        if len(features) == 1:
            features = features[0]

        if not batch_label_ids:
            return features, None
        else:
            y = pad_sequences_2d(batch_label_ids,
                                 max_len_1=self.max_len,
                                 max_len_2=self.num_class,
                                 padding=self.padding_mode,
                                 truncating=self.truncating_mode)
            return features, y
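
    # Sketch of the returned structure (shapes are illustrative):
    #   - use_char only:       features is a single padded char-id matrix of
    #                          shape (batch_size, max_len)
    #   - use_char + use_bert: features is the list
    #                          [char_ids, bert_token_ids, bert_segment_ids]
    #   - y (when labels are given) is one-hot encoded with shape
    #     (batch_size, max_len, num_class); otherwise y is None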

    def get_word_ids(self, word_cut: List[str]) -> List[int]:
        """Given a word-level tokenized text, return the corresponding word ids in char-level
           sequence. We add the same word id to each character in the word.

        Args:
            word_cut: List of str, like ['我', '是', '中国人']

        Returns: List of int, id sequence

        """
        word_ids = []
        for word in word_cut:
            for _ in word:
                word_ids.append(
                    self.word_vocab.get(word, self.word_vocab[self.unk_token]))
        if self.use_bert:
            word_ids = [self.word_vocab[self.cls_token]] + word_ids + \
                       [self.word_vocab[self.seq_token]]
        return word_ids
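
    # For instance (ignoring the bert special tokens): for
    #   word_cut = ['我', '是', '中国人']
    # the char-level sequence has length 5, so the id of '中国人' is repeated
    # once per character:
    #   [id('我'), id('是'), id('中国人'), id('中国人'), id('中国人')]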

    def label_decode(self,
                     pred_probs: np.ndarray,
                     lengths: Optional[List[int]] = None) -> List[List[str]]:
        """Decode model predictions to label strings

        Args:
            pred_probs: np.ndarray, shaped [num_samples, max_len, num_class], the ner model's
                predictions.
            lengths: Optional List of int. Length of each sample, used to strip padded positions.

        Returns:
            List of List of str, the tagging sequences of each sample.

        """
        pred_ids = np.argmax(pred_probs, axis=-1)
        pred_labels = [[self.id2label[label_id] for label_id in ids]
                       for ids in pred_ids]
        if lengths is not None:
            pred_labels = [
                labels[:length]
                for labels, length in zip(pred_labels, lengths)
            ]
        return pred_labels
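
    # Sketch: for pred_probs of shape (2, 4, num_class) and lengths=[3, 2],
    # argmax picks one label id per position and the padded tail is cut off,
    # yielding e.g. [['O', 'B-LOC', 'I-LOC'], ['O', 'O']].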

    def get_unk_label_id(self):
        """return a default id for label that does not exist in the label vocab

        Returns: int

        """
        if 'O' in self.label_vocab:
            return self.label_vocab['O']
        elif 'o' in self.label_vocab:
            return self.label_vocab['o']
        else:
            return 0  # id of 0 is the label with the highest frequency
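
    # e.g. with label_vocab == {'O': 0, 'B-LOC': 1, 'I-LOC': 2}, any label not in
    # the vocabulary (such as the bert special tokens added to label_str in
    # prepare_input) falls back to the id of 'O'.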

    def save(self, preprocessor_file: str):
        """Save preprocessor to disk

        Args:
            preprocessor_file: str, path to save preprocessor

        Returns:

        """
        pickle.dump(self, open(preprocessor_file, 'wb'))

    @classmethod
    def load(cls, preprocessor_file):
        """Load preprocessor from disk

        Args:
            preprocessor_file: str, path to load preprocessor.

        Returns:

        """
        p = pickle.load(open(preprocessor_file, 'rb'))
        p.load_word_dict()  # reload external word dict into jieba
        return p
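
    # Persistence roundtrip sketch (hypothetical path name):
    #   preprocessor.save('ner_preprocessor.pkl')
    #   restored = NERPreprocessor.load('ner_preprocessor.pkl')
    # load() also re-registers external_word_dict with jieba via load_word_dict().
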
Example #8
0
    def __init__(self, train_data, train_labels, min_count=2, use_char=True, use_bert=False,
                 use_word=False, external_word_dict=None, bert_vocab_file=None,
                 char_embed_type=None, char_embed_dim=300, word_embed_type=None, word_embed_dim=300,
                 max_len=None, padding_mode='post', truncating_mode='post'):
        """

        Args:
            train_data: a list of tokenized (in char level) texts
            train_labels: list of list, train_data's labels
            min_count: int, token of which frequency is lower than min_count will be ignored
            use_char: whether to use char embedding as input
            use_bert: whether to use bert embedding as input
            use_word: whether to use word embedding as additional input
            external_word_dict: external word dictionary, only apply when use_word is True
            bert_vocab_file: vocabulary file of pre-trained bert model, only apply when use_bert is
                             True
            char_embed_type: str, can be a pre-trained embedding filename or a pre-trained embedding
                             method (word2vec, glove, fasttext)
            char_embed_dim: dimensionality of char embedding
            word_embed_type: same as char_embed_type, only apply when use_word is True
            word_embed_dim: dimensionality of word embedding
            max_len: int, max sequence length
            padding_mode: str, 'pre' or 'post', pad either before or after each sequence
            truncating_mode: str, 'pre' or 'post', remove values from sequences larger than
                             `max_len`, either at the beginning or at the end of the sequences
        """
        super(NERPreprocessor, self).__init__(max_len, padding_mode, truncating_mode)

        self.train_data = train_data
        self.train_labels = train_labels
        self.min_count = min_count
        self.use_char = use_char
        self.use_bert = use_bert
        self.use_word = use_word
        self.external_word_dict = external_word_dict
        self.char_embed_type = char_embed_type
        self.word_embed_type = word_embed_type

        assert self.use_char or self.use_bert, "must use char or bert embedding as main input"
        special_token = 'bert' if self.use_bert else 'standard'

        # build char vocabulary and char embedding
        if self.use_char:
            self.char_vocab_count, self.char_vocab, self.id2char = \
                self.build_vocab(self.train_data, self.min_count, special_token)
            self.char_vocab_size = len(self.char_vocab)
            self.char_embeddings = self.build_embedding(char_embed_type, self.char_vocab,
                                                        self.train_data, char_embed_dim,
                                                        special_token)
            if self.char_embeddings is not None:
                self.char_embed_dim = self.char_embeddings.shape[1]
            else:
                self.char_embed_dim = char_embed_dim
        else:
            self.char_vocab_count, self.char_vocab, self.id2char = None, None, None
            self.char_vocab_size = -1
            self.char_embeddings = None
            self.char_embed_dim = -1

        # build bert vocabulary
        if self.use_bert:
            self.bert_vocab = {}
            with codecs.open(bert_vocab_file, 'r', 'utf8') as reader:
                for line in reader:
                    token = line.strip()
                    self.bert_vocab[token] = len(self.bert_vocab)
            self.bert_tokenizer = ChineseBertTokenizer(self.bert_vocab)

        # build word vocabulary and word embedding
        if self.use_word:
            self.load_word_dict()

            untokenized_texts = [''.join(text) for text in self.train_data]
            word_corpus = self.build_corpus(untokenized_texts, cut_func=lambda x: jieba.lcut(x))

            self.word_vocab_count, self.word_vocab, self.id2word = \
                self.build_vocab(word_corpus, self.min_count, special_token)
            self.word_vocab_size = len(self.word_vocab)
            self.word_embeddings = self.build_embedding(word_embed_type, self.word_vocab,
                                                        word_corpus, word_embed_dim,
                                                        special_token)
            if self.word_embeddings is not None:
                self.word_embed_dim = self.word_embeddings.shape[1]
            else:
                self.word_embed_dim = word_embed_dim
        else:
            self.word_vocab_count, self.word_vocab, self.id2word = None, None, None
            self.word_vocab_size = -1
            self.word_embeddings = None
            self.word_embed_dim = -1

        # build label vocabulary
        self.label_vocab, self.id2label = self.build_label_vocab(self.train_labels)
        self.num_class = len(self.label_vocab)

        if self.use_bert and self.max_len is None:
            # max_len should be provided when using bert as input;
            # if it is not provided, we derive it from train_data.
            self.max_len = get_len_from_corpus(self.train_data)