def build_alphabet(cls, input_file, lexi_trees):
    """Build word/char/intent/label/lexicon alphabets from a corpus file.

    Each line of ``input_file`` is a Python-literal dict with keys
    'char', 'char_label', 'word' and 'intent'.  Ids start at 1; index 1
    is the '/unk' entry for the word/char/intent/lexicon alphabets
    (labels get no '/unk').  Returns a new instance via ``cls(...)``.
    """
    with open(input_file, 'r') as rf:
        intent_corpus = rf.readlines()
    words, chars, feats, labels, lexicons = [], [], [], [], []
    seen_lexicons = set()  # O(1) membership; `lexicons` keeps first-seen order
    word_alphabet, char_alphabet, feat_alphabet, label_alphabet, lexicon_alphabet = {}, {}, {}, {}, {}
    for raw in intent_corpus:
        line = ast.literal_eval(raw)
        char, label, word, feat = line['char'], line['char_label'], line['word'], line['intent']
        word = [normalize_word(w) for w in word]
        char = [normalize_word(c) for c in char]
        # Lexicon-tree features: one 0/1 flag per tree, joined into a bit-string.
        for w in word:
            lexi_feat = []
            for lexi_type, lb in lexi_trees.items():
                lexi_feat.append(lb.search(w))
            for n in range(len(lexi_feat)):
                if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                    lexi_feat[n] = 0
                else:
                    lexi_feat[n] = 1
            lexi_feat = ''.join(str(v) for v in lexi_feat)
            if lexi_feat not in seen_lexicons:
                seen_lexicons.add(lexi_feat)
                lexicons.append(lexi_feat)
        words.extend(word)
        chars.extend(char)
        labels.extend(label)
        feats.append(feat)
    # Deduplicate while preserving first-occurrence order.
    words = list(dict.fromkeys(words))
    chars = list(dict.fromkeys(chars))
    labels = list(dict.fromkeys(labels))
    feats = list(dict.fromkeys(feats))
    words = ['/unk'] + words
    chars = ['/unk'] + chars
    feats = ['/unk'] + feats
    lexicons = ['/unk'] + lexicons
    # Ids start at 1; 0 is implicitly left free (presumably for padding — confirm).
    for i, v in enumerate(words):
        word_alphabet[v] = i + 1
    for i, v in enumerate(chars):
        char_alphabet[v] = i + 1
    for i, v in enumerate(labels):
        label_alphabet[v] = i + 1
    for i, v in enumerate(feats):
        feat_alphabet[v] = i + 1
    for i, v in enumerate(lexicons):
        lexicon_alphabet[v] = i + 1
    logger.info('intent nums: %s, slot nums: %s' % (len(feat_alphabet), len(label_alphabet)))
    return cls(word_alphabet, char_alphabet, feat_alphabet, label_alphabet,
               words, chars, feats, labels, lexicon_alphabet, lexi_trees)
def inference(self, text, intent, session_keep, previous_intent):
    """Slot-filling inference for one utterance using char, word and
    lexicon-tree features.

    In a kept multi-turn session with no current intent, the previous
    turn's intent is reused; a still-missing intent is looked up as the
    string 'None' (that is how None is stored in label_alphabet).
    Returns the slots assembled by ``self.slot_concat``.
    """
    # 如果存在多轮对话,且当前intent为空,取上一轮text的意图
    if session_keep and intent is None:
        intent = previous_intent
    if intent is None:
        # label_alphabet stores None as the str 'None'
        intent = 'None'
    instance_ids = []
    char_ids = []
    chars = list(text)
    # intent feature id
    feat_ids = [self.alphabet.get_index(intent, key='intent')]
    # character ids
    for char in chars:
        char_id = self.alphabet.get_index(normalize_word(char), key='char')
        char_ids.append(char_id)
    # word ids plus lexicon-tree bit-string features
    word_list = self.seg.cut(text)
    # Use the module logger (consistent with the rest of the file) instead of print.
    logger.info('text: %s, word_list: %s' % (text, word_list))
    word_ids, word_feat_ids = [], []
    for word in word_list:
        lexi_feat = []
        for lexi_type, lb in self.lexi_trees.items():
            lexi_feat.append(lb.search(word))
        for n in range(len(lexi_feat)):
            if lexi_feat[n] is None or lexi_feat[n] == '_STEM_':
                lexi_feat[n] = 0
            else:
                lexi_feat[n] = 1
        lexi_feat = ''.join(str(v) for v in lexi_feat)
        word_feat_ids.append(self.alphabet.get_index(lexi_feat, 'lexicon'))
        word_id = self.alphabet.get_index(normalize_word(word), key='word')
        word_ids.append(word_id)
    instance_ids.append([char_ids, word_ids, feat_ids, word_feat_ids])
    batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, \
        batch_charrecover, mask, _, batch_lexi = batch_char_sequence_labeling_with_word(
            instance_ids, self.configs['gpu'], if_train=False, if_label=False)
    # Inference only: skip autograd bookkeeping (matches slot_filling's pattern).
    with torch.no_grad():
        tag_seq = self.model(batch_word, batch_features, batch_wordlen, batch_char,
                             batch_charlen, mask, batch_lexi)
    pred_result = predict_recover_label(tag_seq, mask, self.alphabet, batch_charrecover)
    pred_result = list(np.array(pred_result).reshape(len(chars), ))
    result = self.slot_concat(chars, pred_result)
    return result
def slot_filling(self, text_list, intent):
    """Predict slot labels for an already-segmented utterance.

    ``text_list`` is the list of words; ``intent`` selects the intent
    feature.  Returns the slots assembled by ``self.slot_concat``.
    """
    instance_ids = []
    feat_ids = [[self.alphabet.get_index(intent, 'intent')]]
    word_ids, char_ids = [], []
    for token in text_list:
        # Word id first, then its per-character ids (normalized digits).
        word_ids.append(self.alphabet.get_index(normalize_word(token), 'word'))
        char_ids.append([
            self.alphabet.get_index(normalize_word(ch), 'char') for ch in token
        ])
    instance_ids.append([char_ids, word_ids, feat_ids])
    (batch_word, batch_features, batch_wordlen, batch_wordrecover, batch_char,
     batch_charlen, batch_charrecover, mask) = predict_batchify_sequence_labeling(
        instance_ids, self.config.gpu)
    # Prediction only — no gradients required.
    with torch.no_grad():
        tag_seq = self.model(batch_word, batch_features, batch_wordlen,
                             batch_char, batch_charlen, batch_charrecover, mask)
    pred_result = predict_recover_label(tag_seq, mask, self.alphabet, batch_wordrecover)
    pred_result = list(np.array(pred_result).reshape(len(text_list),))
    return self.slot_concat(text_list, pred_result)
def inference(self, text, intent, session_keep, previous_intent):
    """Char-level slot-filling inference for one utterance.

    In a kept multi-turn session with no current intent, the previous
    turn's intent is reused; a still-missing intent is looked up as the
    string 'None' (that is how None is stored in label_alphabet).
    Returns the slots assembled by ``self.slot_concat``.
    """
    # 如果存在多轮对话,且当前intent为空,取上一轮text的意图
    if session_keep and intent is None:
        intent = previous_intent
    if intent is None:
        # label_alphabet stores None as the str 'None'
        intent = 'None'
    instance_ids = []
    char_ids = []
    chars = list(text)
    feat_ids = [self.alphabet.get_index(intent, key='intent')]
    for char in chars:
        char_id = self.alphabet.get_index(normalize_word(char), key='char')
        char_ids.append(char_id)
    instance_ids.append([char_ids, feat_ids])
    batch_char, batch_features, batch_charlen, batch_charrecover, mask, _ = batch_char_sequence_labeling(
        instance_ids, self.configs['gpu'], if_train=False, if_label=False)
    # Inference only: skip autograd bookkeeping (matches slot_filling's pattern).
    with torch.no_grad():
        tag_seq = self.model(batch_char, batch_features, batch_charlen,
                             batch_charrecover, mask)
    pred_result = predict_recover_label(tag_seq, mask, self.alphabet, batch_charrecover)
    pred_result = list(np.array(pred_result).reshape(len(chars), ))
    result = self.slot_concat(chars, pred_result)
    return result
def read_instance(cls, alphabet, input_file, lexi_trees, read_type='word'):
    """Load a corpus file, index it against ``alphabet``, and split it.

    Each input line is a Python-literal dict with keys 'char', 'word',
    'intent', 'word_label' and 'char_label'.  ``read_type`` selects
    char-level or word-level label indexing.  The shuffle is seeded (43)
    so the 80/10/10 train/dev/test split is reproducible.
    """
    texts, ids = [], []
    with open(input_file, 'r') as rf:
        corpus_lines = rf.readlines()
    for raw_line in corpus_lines:
        sample = ast.literal_eval(raw_line)
        char = sample['char']
        word = sample['word']
        feat = sample['intent']
        word_label = sample['word_label']
        char_label = sample['char_label']
        # Lexicon-tree features: one 0/1 flag per tree per word, as a bit-string.
        word_feat, word_feat_id = [], []
        for w in word:
            hits = [lb.search(w) for lexi_type, lb in lexi_trees.items()]
            bit_string = ''.join(
                '0' if h is None or h == '_STEM_' else '1' for h in hits)
            word_feat.append(bit_string)
            word_feat_id.append(alphabet.get_index(bit_string, 'lexicon'))
        # Keep the raw (un-normalized) fields for the text view.
        texts.append([char, word, feat, char_label, word_feat])
        if not isinstance(feat, list):
            feat = [feat]
        # Normalize digits before indexing.
        char = [normalize_word(c) for c in char]
        word = [normalize_word(w) for w in word]
        # Characters must be aligned with their words via the helper.
        char, char_id = generate_char(char, word, alphabet)
        word_id = [alphabet.get_index(w, 'word') for w in word]
        feat_id = [alphabet.get_index(f, 'intent') for f in feat]
        # Labels indexed at the requested granularity.
        if read_type == 'char':
            label_id = [alphabet.get_index(lab, 'char_label') for lab in char_label]
        else:
            label_id = [alphabet.get_index(lab, 'word_label') for lab in word_label]
        ids.append([char_id, word_id, feat_id, label_id, word_feat_id])
    # Reproducible shuffle, then an 80/10/10 train/dev/test split.
    indexes = list(range(len(ids)))
    random.seed(43)
    random.shuffle(indexes)
    texts = [texts[i] for i in indexes]
    ids = [ids[i] for i in indexes]
    logger.info('indexes: %s' % indexes[:10])
    n = int(len(ids) * 0.1)  # size of each held-out slice
    dev_texts, dev_ids = texts[:n], ids[:n]
    test_texts, test_ids = texts[n:2 * n], ids[n:2 * n]
    train_texts, train_ids = texts[2 * n:], ids[2 * n:]
    # Alphabet sizes are +1 to account for the reserved 0 index.
    char_alphabet_size = len(alphabet.char_alphabet) + 1
    word_alphabet_size = len(alphabet.word_alphabet) + 1
    feat_alphabet_size = len(alphabet.feat_alphabet) + 1
    label_alphabet_size = len(alphabet.label_alphabet) + 1
    lexicon_alphabet_size = len(alphabet.lexicon_alphabet) + 1
    logger.info('train_size:%s, dev_size:%s, test_size:%s' %
                (len(train_texts), len(dev_texts), len(test_texts)))
    return cls(train_texts, dev_texts, test_texts, train_ids, dev_ids,
               test_ids, char_alphabet_size, word_alphabet_size,
               feat_alphabet_size, label_alphabet_size, lexicon_alphabet_size)