Example #1
    def build_alphabet(cls, input_file, lexi_trees):
        with open(input_file, 'r') as rf:
            intent_corpus = rf.readlines()
            words, chars, feats, labels, lexicons = [], [], [], [], []
            word_alphabet, char_alphabet, feat_alphabet, label_alphabet, lexicon_alphabet = {}, {}, {}, {}, {}
            for raw_line in intent_corpus:
                line = ast.literal_eval(raw_line)
                char, label, word, feat = (line['char'], line['char_label'],
                                           line['word'], line['intent'])
                word = [normalize_word(w) for w in word]
                char = [normalize_word(c) for c in char]
                # Add binary features from the trie (dictionary-tree) lookups
                for w in word:
                    lexi_feat = []
                    for lexi_type, lb in lexi_trees.items():
                        lexi_feat.append(lb.search(w))
                    # Mark each trie type as hit (1) or miss (0)
                    lexi_feat = ''.join('0' if f is None or f == '_STEM_' else '1'
                                        for f in lexi_feat)
                    if lexi_feat not in lexicons:
                        lexicons.append(lexi_feat)
                words.extend(word)
                chars.extend(char)
                labels.extend(label)
                feats.append(feat)
        # Deduplicate while preserving first-seen insertion order
        words = list(Counter(words).keys())
        chars = list(Counter(chars).keys())
        labels = list(Counter(labels).keys())
        feats = list(Counter(feats).keys())

        # Reserve '/unk' for out-of-vocabulary entries
        words = ['/unk'] + words
        chars = ['/unk'] + chars
        feats = ['/unk'] + feats
        lexicons = ['/unk'] + lexicons

        # Indices start at 1; 0 is left unused (commonly reserved for padding)
        for i, v in enumerate(words):
            word_alphabet[v] = i + 1
        for i, v in enumerate(chars):
            char_alphabet[v] = i + 1
        for i, v in enumerate(labels):
            label_alphabet[v] = i + 1
        for i, v in enumerate(feats):
            feat_alphabet[v] = i + 1
        for i, v in enumerate(lexicons):
            lexicon_alphabet[v] = i + 1

        logger.info('intent nums: %s, slot nums: %s' %
                    (len(feat_alphabet), len(label_alphabet)))
        return cls(word_alphabet, char_alphabet, feat_alphabet, label_alphabet,
                   words, chars, feats, labels, lexicon_alphabet, lexi_trees)
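
The example above leans on two helpers that are not shown: normalize_word and the objects stored in lexi_trees, each of which exposes a search method. A minimal sketch of plausible stand-ins, assuming normalize_word collapses digits and each trie returns a label on a hit and None on a miss (names and behavior here are assumptions, not the project's actual code):

    # Hypothetical stand-ins for the helpers assumed by build_alphabet.
    def normalize_word(word):
        # A common normalization: collapse every digit to '0'.
        return ''.join('0' if ch.isdigit() else ch for ch in word)

    class LexiconTree:
        """Minimal word-set 'trie' honoring the hit-or-None contract
        that build_alphabet relies on."""
        def __init__(self, words, label):
            self._words = set(words)
            self._label = label

        def search(self, word):
            # Return the lexicon label on a hit, None on a miss.
            return self._label if word in self._words else None

    lexi_trees = {
        'city': LexiconTree(['北京', '上海'], 'CITY'),
        'food': LexiconTree(['面条'], 'FOOD'),
    }
    # For the word '北京' the loop above yields the feature string '10':
    # a hit in the 'city' lexicon and a miss in the 'food' lexicon.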
Example #2
    def inference(self, text, intent, session_keep, previous_intent):
        # In a multi-turn session where the current intent is empty,
        # fall back to the previous turn's intent.
        if session_keep and intent is None:
            intent = previous_intent
        if intent is None:  # None is stored as the str 'None' in label_alphabet
            intent = 'None'
        instance_ids = []
        char_ids = []
        chars = list(text)
        # Intent feature
        feat_ids = [self.alphabet.get_index(intent, key='intent')]
        # Characters
        for char in chars:
            char_id = self.alphabet.get_index(normalize_word(char), key='char')
            char_ids.append(char_id)
        # Words
        word_list = self.seg.cut(text)
        logger.info('text: %s, word_list: %s' % (text, word_list))
        word_ids, word_feat_ids = [], []
        for word in word_list:
            lexi_feat = []
            for lexi_type, lb in self.lexi_trees.items():
                lexi_feat.append(lb.search(word))
            # Mark each trie type as hit (1) or miss (0)
            lexi_feat = ''.join('0' if f is None or f == '_STEM_' else '1'
                                for f in lexi_feat)
            word_feat_ids.append(self.alphabet.get_index(lexi_feat, 'lexicon'))

            word_id = self.alphabet.get_index(normalize_word(word), key='word')
            word_ids.append(word_id)
        instance_ids.append([char_ids, word_ids, feat_ids, word_feat_ids])
        batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, mask, \
         _, batch_lexi = batch_char_sequence_labeling_with_word(instance_ids, self.configs['gpu'], if_train=False, if_label=False)
        tag_seq = self.model(batch_word, batch_features, batch_wordlen,
                             batch_char, batch_charlen, mask, batch_lexi)
        pred_result = predict_recover_label(tag_seq, mask, self.alphabet,
                                            batch_charrecover)
        pred_result = list(np.array(pred_result).reshape(len(chars), ))
        result = self.slot_concat(chars, pred_result)
        return result
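
inference assumes an alphabet object whose get_index falls back to the '/unk' entry for out-of-vocabulary items. A minimal sketch of that contract, inferred from the '/unk' entries prepended in build_alphabet (the class shape is an assumption, not the project's actual code):

    # Hypothetical sketch of the lookup contract assumed by inference.
    class Alphabet:
        def __init__(self, word_alphabet, char_alphabet, feat_alphabet,
                     label_alphabet, lexicon_alphabet):
            self._maps = {
                'word': word_alphabet,
                'char': char_alphabet,
                'intent': feat_alphabet,
                'char_label': label_alphabet,
                'lexicon': lexicon_alphabet,
            }

        def get_index(self, item, key='word'):
            # Unknown items map to '/unk'; 0 is the final fallback for
            # maps (like labels) that carry no '/unk' entry.
            mapping = self._maps[key]
            return mapping.get(item, mapping.get('/unk', 0))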
Example #3
 def slot_filling(self, text_list, intent):
     instance_ids = []
     word_ids, char_ids, feat_ids = [], [], []
     feat_ids.append([self.alphabet.get_index(intent, 'intent')])
     for word in text_list:
         char_id = []
         word_ids.append(self.alphabet.get_index(normalize_word(word), 'word'))
         for char in word:
             char_id.append(self.alphabet.get_index(normalize_word(char), 'char'))
         char_ids.append(char_id)
         # char_ids.append([self.alphabet.get_index(char, 'char') for char in word])
     instance_ids.append([char_ids, word_ids, feat_ids])
     batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, \
         mask = predict_batchify_sequence_labeling(instance_ids, self.config.gpu)
     with torch.no_grad():
         tag_seq = self.model(batch_word, batch_features, batch_wordlen, batch_char, batch_charlen,
                              batch_charrecover, mask)
     pred_result = predict_recover_label(tag_seq, mask, self.alphabet, batch_wordrecover)
     pred_result = list(np.array(pred_result).reshape(len(text_list),))
     result = self.slot_concat(text_list, pred_result)
     return result
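
Both inference and slot_filling finish by calling self.slot_concat to merge tokens and their predicted tags into slot values. The method is not shown; a plausible sketch, assuming BIO-style tags such as B-city/I-city/O (an assumption about the tag scheme):

    # Hypothetical sketch of slot_concat: merge BIO-tagged tokens into slots.
    def slot_concat(tokens, tags):
        slots, cur_type, cur_value = {}, None, []
        for token, tag in zip(tokens, tags):
            if tag.startswith('B-'):
                if cur_type is not None:
                    slots[cur_type] = ''.join(cur_value)
                cur_type, cur_value = tag[2:], [token]
            elif tag.startswith('I-') and cur_type == tag[2:]:
                cur_value.append(token)
            else:
                if cur_type is not None:
                    slots[cur_type] = ''.join(cur_value)
                cur_type, cur_value = None, []
        if cur_type is not None:
            slots[cur_type] = ''.join(cur_value)
        return slots

    # slot_concat(list('我要去北京'), ['O', 'O', 'O', 'B-city', 'I-city'])
    # -> {'city': '北京'}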
Example #4
 def inference(self, text, intent, session_keep, previous_intent):
     # In a multi-turn session where the current intent is empty,
     # fall back to the previous turn's intent.
     if session_keep and intent is None:
         intent = previous_intent
     if intent is None:  # None is stored as the str 'None' in label_alphabet
         intent = 'None'
     instance_ids = []
     char_ids = []
     chars = list(text)
     feat_ids = [self.alphabet.get_index(intent, key='intent')]
     for char in chars:
         char_id = self.alphabet.get_index(normalize_word(char), key='char')
         char_ids.append(char_id)
     instance_ids.append([char_ids, feat_ids])
     batch_char, batch_features, batch_charlen, batch_charrecover, mask, _ = batch_char_sequence_labeling(
         instance_ids, self.configs['gpu'], if_train=False, if_label=False)
     tag_seq = self.model(batch_char, batch_features, batch_charlen,
                          batch_charrecover, mask)
     pred_result = predict_recover_label(tag_seq, mask, self.alphabet,
                                         batch_charrecover)
     pred_result = list(np.array(pred_result).reshape(len(chars), ))
     result = self.slot_concat(chars, pred_result)
     return result
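
A hypothetical two-turn call sequence for the char-level inference above, showing how session_keep and previous_intent carry the intent forward (the handler object, texts, and intent names are made up for illustration):

    # First turn: the classifier supplies an intent directly.
    first = handler.inference('帮我订一张去北京的机票', intent='book_flight',
                              session_keep=False, previous_intent=None)
    # Second turn: no intent was detected, so inference falls back to
    # the previous turn's intent before looking up the feature id.
    second = handler.inference('明天早上的', intent=None,
                               session_keep=True, previous_intent='book_flight')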
Example #5
    def read_instance(cls, alphabet, input_file, lexi_trees, read_type='word'):
        texts, ids = [], []
        with open(input_file, 'r') as rf:
            intent_corpus = rf.readlines()
            for raw_line in intent_corpus:
                line = ast.literal_eval(raw_line)
                char, word, feat, word_label, char_label = (
                    line['char'], line['word'], line['intent'],
                    line['word_label'], line['char_label'])
                # Add binary features from the trie (dictionary-tree) lookups
                word_feat, word_feat_id = [], []
                for w in word:
                    lexi_feat = []
                    for lexi_type, lb in lexi_trees.items():
                        lexi_feat.append(lb.search(w))
                    # Mark each trie type as hit (1) or miss (0)
                    lexi_feat = ''.join('0' if f is None or f == '_STEM_' else '1'
                                        for f in lexi_feat)
                    word_feat.append(lexi_feat)
                    word_feat_id.append(
                        alphabet.get_index(lexi_feat, 'lexicon'))
                texts.append([char, word, feat, char_label, word_feat])

                if not isinstance(feat, list):
                    feat = [feat]
                # Normalize digits in chars and words
                char = [normalize_word(c) for c in char]
                word = [normalize_word(w) for w in word]
                # Regroup chars by word via a helper so they stay aligned
                char, char_id = generate_char(char, word, alphabet)
                word_id = [alphabet.get_index(w, 'word') for w in word]
                feat_id = [alphabet.get_index(f, 'intent') for f in feat]
                # Use char-level labels when reading at the char level
                if read_type == 'char':
                    label_id = [
                        alphabet.get_index(l, 'char_label') for l in char_label
                    ]
                else:
                    label_id = [
                        alphabet.get_index(l, 'word_label') for l in word_label
                    ]
                ids.append([char_id, word_id, feat_id, label_id, word_feat_id])

        indexes = list(range(len(ids)))
        random.seed(43)  # fixed seed for a reproducible shuffle/split
        random.shuffle(indexes)
        texts = [texts[i] for i in indexes]
        ids = [ids[i] for i in indexes]
        logger.info('indexes: %s' % indexes[:10])

        n = int(len(ids) * 0.1)  # sampling ratio: 10% dev, 10% test
        dev_texts, dev_ids = texts[:n], ids[:n]
        test_texts, test_ids = texts[n:2 * n], ids[n:2 * n]
        train_texts, train_ids = texts[2 * n:], ids[2 * n:]

        # Write the test set to a file for evaluation after training finishes
        # with open(os.path.join(ROOT_PATH, 'models/slot_filling/data/output/test_texts.pkl'), 'wb') as wbf:
        #     pickle.dump(test_texts, wbf)

        char_alphabet_size, word_alphabet_size, feat_alphabet_size, label_alphabet_size = \
            len(alphabet.char_alphabet) + 1, len(alphabet.word_alphabet) + 1, len(alphabet.feat_alphabet) + 1, \
            len(alphabet.label_alphabet) + 1
        lexicon_alphabet_size = len(alphabet.lexicon_alphabet) + 1
        logger.info('train_size:%s, dev_size:%s, test_size:%s' %
                    (len(train_texts), len(dev_texts), len(test_texts)))
        return cls(train_texts, dev_texts, test_texts, train_ids, dev_ids,
                   test_ids, char_alphabet_size, word_alphabet_size,
                   feat_alphabet_size, label_alphabet_size,
                   lexicon_alphabet_size)
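
read_instance depends on a generate_char helper (not shown) to keep the character sequence aligned with the word segmentation. A hypothetical sketch of what it might do, assuming it regroups the flat char list by word and looks each char up in the alphabet (the grouping behavior is an assumption):

    # Hypothetical sketch of generate_char: regroup flat chars by word
    # so char ids line up with the word sequence.
    def generate_char(chars, words, alphabet):
        grouped_chars, grouped_ids = [], []
        cursor = 0
        for word in words:
            span = chars[cursor:cursor + len(word)]  # this word's characters
            cursor += len(word)
            grouped_chars.append(span)
            grouped_ids.append(
                [alphabet.get_index(c, 'char') for c in span])
        return grouped_chars, grouped_ids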