def train_dst(x_dst, y_dst):
    """Fit a dialog state tracker on the prepared training pairs.

    Args:
        x_dst: DST input samples.
        y_dst: DST target vectors.

    Returns:
        The fitted DialogStateTracker instance.
    """
    tracker = DialogStateTracker()
    tracker.fit(x_dst, y_dst)
    logger.info('\n' + '-' * 30)
    logger.info('DST: trained')
    return tracker
def train_dpl(x_dpl, y_dpl):
    """Fit a dialog policy learner on the prepared training pairs.

    Args:
        x_dpl: DPL input samples.
        y_dpl: DPL target vectors.

    Returns:
        The fitted DialogPolicyLearning instance.
    """
    policy = DialogPolicyLearning()
    policy.fit(x_dpl, y_dpl)
    logger.info('\n' + '-' * 30)
    logger.info('DPL: trained')
    return policy
def load_nlu_data(data_path):
    """Load NLU intents and entities from a single YAML data file.

    The file must hold a top-level ``nlu`` key whose value is a list of
    mappings; each mapping describes either an intent (must carry a
    non-empty ``data`` list) or an entity (must carry ``data``, and either
    ``copyFrom`` or a non-empty ``data`` list).

    Args:
        data_path: path of the YAML data file.

    Returns:
        tuple: ``(intents, entities)``; entities are merged through
        ``entity_merge`` before being returned.

    Raises:
        AssertionError: if the path is missing or the data is malformed.
        Exception: if the file cannot be parsed as YAML.
    """
    assert os.path.exists(data_path), '数据“{}”不存在'.format(data_path)
    entities = []
    intents = []
    # Explicit encoding: the data files contain non-ASCII (Chinese) text,
    # so do not depend on the platform default.
    with open(data_path, 'r', encoding='utf-8') as fp:
        try:
            obj = yaml.load(fp, Loader=Loader)
        except Exception as err:  # noqa
            # Chain the parser error instead of discarding it (the old
            # bare ``except`` lost the original cause entirely).
            raise Exception(
                '数据读取错误,可能不是合法YAML文件 “{}”'.format(data_path)
            ) from err
    assert 'nlu' in obj
    objs = obj.get('nlu')
    assert isinstance(objs, (list, tuple)), \
        '数据文件必须是list or tuple “{}”'.format(data_path)
    for obj in objs:
        if isinstance(obj, dict):
            if 'intent' in obj:
                assert 'data' in obj, \
                    '意图必须包括“data”属性 “{}”'.format(data_path)
                assert isinstance(obj['data'], (list, tuple)) \
                    and obj['data'], \
                    '意图必须包括“data”且长度大于0 “{}”'.format(data_path)
                intents.append(obj)
            elif 'entity' in obj:
                assert 'data' in obj, \
                    '实体必须包括“data”属性 “{}”'.format(data_path)
                assert 'copyFrom' in obj or (
                    isinstance(obj['data'], (list, tuple)) and obj['data']
                ), '有copyFrom,或者有“data”且长度大于0 “{}”'.format(data_path)
                entities.append(obj)
    entities = entity_merge(entities)
    logger.info('读取到了 %s 个intent, %s 个entity',
                len(intents), len(entities))
    return intents, entities
def data_to_iob(intents, entities):
    """Convert data to IOB (inside-outside-beginning) tagging format.

    Args:
        intents: list of intent dicts, each with a ``data`` list whose
            items may carry a ``name`` key marking a slot.
        entities: list of entity dicts, indexed through
            ``get_index_entities_data``.

    Returns:
        tuple: ``(sentences, slots, domains, intents)`` — four parallel
        lists aggregated from the per-intent conversion results.
    """
    np.random.seed(0)  # deterministic sampling in the parallel workers
    index_entities_data = get_index_entities_data(entities)
    # Count how often each slot name occurs across all intent templates.
    slot_count = {}
    for intent in intents:
        for item in intent['data']:
            if 'name' in item:
                slot_name = item['name']
                if slot_name not in slot_count:
                    slot_count[slot_name] = 0
                slot_count[slot_name] += 1
    sentence_result, slot_result, domain_result, intent_result = [], [], [], []
    # Fixed: this used to be an f-string containing a literal '%s';
    # use plain lazy %-formatting like the rest of the file.
    logger.info('parallel job %s', len(intents))
    ret = Parallel(n_jobs=-1, verbose=6)(
        delayed(convert_item)(intent, index_entities_data, slot_count)
        for intent in intents)
    logger.info('parallel job done')
    # Flatten per-intent results into four parallel lists.
    for r1, r2, r3, r4 in ret:
        sentence_result += r1
        slot_result += r2
        domain_result += r3
        intent_result += r4
    logger.info('return IOB data')
    return sentence_result, slot_result, domain_result, intent_result
def main(data_path, model_path, outside_function=None, n_history=3,
         n_times=50):
    """Train Entrance — train all components and pickle them.

    Args:
        data_path: input data directory.
        model_path: output model file path.
        outside_function: external NLG functions (dict). NOTE(review):
            currently not referenced in this function body — confirm
            whether it should be passed on to NLG training.
        n_history: number of history turns to keep.
        n_times: number of story-combination rounds.
    """
    # Fixed: the default used to be a mutable ``{}`` shared across calls.
    if outside_function is None:
        outside_function = {}
    faq = FrequentlyAskedQuestions()
    faq.fit(data_path)
    # NLU Part — include FAQ questions in NLU training when any exist.
    if len(faq):
        nlu = train_nlu(data_path, faq.questions)
    else:
        nlu = train_nlu(data_path)
    # NLG Part
    nlg = train_nlg(data_path)
    logger.info('\n' + '-' * 30)
    logger.info('NLG Intents:')
    logger.info('\n'.join(nlg.intent_list))
    # parse_story returns a dict of this shape:
    # {
    #     'dialog': dialogs,
    #     'user_intent': user_intent_list,
    #     'user_domain': user_domain_list,
    #     'user_slot': user_slot_list,
    #     'sys_intent': sys_intent_list,
    #     'sys_slot': sys_slot_list,
    # }
    stories = parse_story(data_path)
    # Make sure the stories only reference domains/intents/slots that the
    # trained NLU and NLG actually know about.
    for ud in stories['user_domain']:
        assert ud in nlu.domain_list, 'user domain {} not in NLU'.format(ud)
    for ui in stories['user_intent']:
        assert ui in nlu.intent_list, 'user intent {} not in NLU'.format(ui)
    for us in stories['user_slot']:
        assert us in nlu.slot_list, 'user slot {} not in NLU'.format(us)
    for si in stories['sys_intent']:
        assert si in nlg.intent_list, 'sys intent {} not in NLG'.format(si)
    init_state = make_init_state(stories)
    (x_dst, y_dst,
     x_dpl, y_dpl) = build_dialog_train_data(stories['dialog'],
                                             init_state,
                                             n_history=n_history,
                                             n_times=n_times)
    # DST Part
    dst = train_dst(x_dst, y_dst)
    # DPL Part
    dpl = train_dpl(x_dpl, y_dpl)
    data = {
        'init_state': init_state,
        'faq': faq,
        'nlu': nlu,
        'nlg': nlg,
        'dst': dst,
        'dpl': dpl,
    }
    model_path_dir = os.path.dirname(model_path)
    mkdir(model_path_dir)
    with open(model_path, 'wb') as fp:
        pickle.dump(data, fp)
    logger.info('\n')
    logger.info('Train done')
def train_nlu(data_path, faq_questions=None):
    """Train the NLU component from the ``nlu`` subdirectory of data_path.

    Args:
        data_path: data directory; must contain an ``nlu`` subdirectory.
        faq_questions: optional FAQ questions mixed into NLU training.

    Returns:
        The fitted NaturalLanguageUnderstanding instance.
    """
    nlu_path = os.path.join(data_path, 'nlu')
    assert os.path.exists(nlu_path), 'Invalid NLU data path'
    logger.info('Start train NLU')
    model = NaturalLanguageUnderstanding()
    model.fit(nlu_path, faq_questions)
    logger.info('\n' + '-' * 30)
    logger.info('NLU Intents:')
    logger.info('\n'.join(model.intent_list))
    logger.info('NLU Slots')
    logger.info(model.slot_list)
    return model
def fit(self, data_path, faq_questions=None):
    """Fit NLU Module.

    Converts the yml files under the directory into four parallel lists
    of equal length: the sentences themselves, the slots, the domains,
    and the intents. Example for one sentence:

        sentences: [ ['我', '爱', '你'] ]
        slots: [ 'O', 'O', 'O' ]
        domains: [ 'life' ]
        intents: [ 'ask_love' ]

    Side effects: sets self.slot_list, self.ner_slot, self.intent_label,
    self.domain_label, self.intent_list, self.domain_list,
    self.tokenizer, self.intent_clr and self.domain_clr, then logs
    accuracies computed on the training data itself.
    """
    raw_intents, raw_entities = load_nlu_data(data_path)
    sentences, slots, domains, intents = data_to_iob(
        raw_intents, raw_entities)
    # Collect the distinct slot names from the 'B_'-prefixed IOB tags.
    slot_list = []
    for slot in slots:
        for s in slot:
            if s.startswith('B_'):
                if s[2:] not in slot_list:
                    slot_list.append(s[2:])
    self.slot_list = sorted(set(slot_list))
    # Handle the special FAQ intent.
    if faq_questions is None:
        faq_questions = []
    else:
        # Oversample FAQ questions x10.
        # TODO: a not-so-good hyperparameter controlling the training
        # ratio of FAQ versus other dialog data.
        faq_questions = [list(x) for x in faq_questions] * 10
    # Tag space: B and I per slot (hence * 2) plus the Outer tag.
    self.ner_slot = NERSlotFiller(len(self.slot_list) * 2 + 1)
    self.ner_slot.fit(sentences, slots)
    # NOTE(review): evaluated on the same data it was fit on, so this is
    # a training accuracy, not a held-out metric.
    slot_accuracy, _ = self.ner_slot.eval(sentences, slots)
    # Label binarizers always include the FAQ pseudo-class so predict
    # output dimensions stay stable even without FAQ data.
    self.intent_label = LabelBinarizer()
    self.intent_label.fit(intents + [FAQ_INTENT])
    self.domain_label = LabelBinarizer()
    self.domain_label.fit(domains + [FAQ_INTENT])
    self.intent_list = self.intent_label.classes_.tolist()
    self.domain_list = self.domain_label.classes_.tolist()
    # Character-level tokenizer over sentences plus FAQ questions.
    # NOTE(review): assumes self.vocab_size / self.maxlen are set in
    # __init__ — not visible in this chunk.
    self.tokenizer = Tokenizer(num_words=self.vocab_size, char_level=True)
    self.tokenizer.fit_on_texts(sentences + faq_questions)
    seq = self.tokenizer.texts_to_sequences(sentences + faq_questions)
    seq_pad = pad_sequences(seq, maxlen=self.maxlen)
    self.intent_clr = get_model(self.intent_label.classes_.shape[0],
                                n_vocab=self.vocab_size)
    self.domain_clr = get_model(self.domain_label.classes_.shape[0],
                                n_vocab=self.vocab_size)
    # Every FAQ question is labeled with the FAQ pseudo intent/domain.
    y_intent = self.intent_label.transform(
        intents + [FAQ_INTENT] * len(faq_questions))
    y_domain = self.domain_label.transform(
        domains + [FAQ_INTENT] * len(faq_questions))
    if 1 == y_domain.shape[1]:
        # LabelBinarizer emits a single column for a 2-class problem;
        # expand it to one-hot so it matches the model output size.
        y_domain = to_categorical(y_domain, 2)
    self.intent_clr.fit(seq_pad, y_intent)
    self.domain_clr.fit(seq_pad, y_domain)
    # Compare predictions against the training labels (progress bar
    # driven by tqdm over the original sentences).
    loop = tqdm(zip(sentences, domains, intents), total=len(sentences))
    domain_ret = self.domain_label.inverse_transform(
        self.domain_clr.predict_proba(seq_pad))
    intent_ret = self.intent_label.inverse_transform(
        self.intent_clr.predict_proba(seq_pad))
    ret = []
    for (a, b, c), dr, ir in zip(loop, domain_ret, intent_ret):
        ret.append((b == dr, c == ir))
    domain_accuracy, intent_accuracy = (np.sum([x[0] for x in ret]) /
                                        len(sentences),
                                        np.sum([x[1] for x in ret]) /
                                        len(sentences))
    logger.info(
        'domain_accuracy: %s\n' +
        'intent_accuracy: %s\n' +
        'slot_accuracy: %s\n',
        domain_accuracy, intent_accuracy, slot_accuracy)
def build_dialog_train_data(dialogs, init_state, n_history, n_times):
    """Generate Trainning Data, include DST and DPL.

    Builds a long randomized schedule of dialogs separated by 'clean'
    markers that reset the rolling history window, then replays each
    dialog turn by turn to produce (x, y) pairs for the dialog state
    tracker (DST) and the dialog policy learner (DPL).

    Returns:
        tuple of np.ndarray: (x_dst, y_dst, x_dpl, y_dpl)
    """
    x_dst, y_dst = [], []
    x_dpl, y_dpl = [], []
    # Schedule: each round starts with a 'clean' reset marker followed
    # by 5 randomly chosen dialogs replayed back to back.
    # NOTE(review): n_times is not referenced anywhere below and the
    # round count is hard-coded via * 1000 — confirm whether 1000 was
    # meant to be n_times.
    dialog_queue = []
    for i in range(len(dialogs) * 1000):
        dialog_queue.append('clean')
        for i in range(5):
            dialog_queue.append(random.choice(dialogs))
    logger.info('dialog_queue length %s', len(dialog_queue))
    # History starts as n_history copies of the initial state.
    history = []
    for i in range(n_history):
        history.append(init_state.clone())
    for dialog in dialog_queue:
        if dialog == 'clean':
            # Reset the rolling history window between dialog groups.
            history = []
            for i in range(n_history):
                history.append(init_state.clone())
            continue
        for turn in dialog:
            state = history[-1].clone()  # last history
            if 'user' in turn:
                new_state = make_new_state(
                    init_state,
                    turn['domain'],
                    turn['intent'],
                    turn['slots']
                )
                slot_vec = state.slot_vec
                new_slot_vec = new_state.slot_vec
                # DST target: 1.0 for every slot whose value changed
                # between the previous and the new state.
                y = np.array([
                    1. if a != b else 0.
                    for a, b in zip(
                        slot_vec.tolist(), new_slot_vec.tolist())
                ])
                # DST input: the history vectors plus the candidate
                # new state's vector.
                x = np.array([
                    s.vec for s in history
                ] + [new_state.vec])
                x_dst.append(x)
                y_dst.append(y)
                if np.sum(y) > 0:
                    # Oversample (x10) turns where at least one slot
                    # actually changed.
                    for i in range(10):
                        x_dst.append(x)
                        y_dst.append(y)
                # Slide the history window forward by one state.
                history = history[1:] + [new_state]
            if 'sys' in turn:
                x = np.array([s.vec for s in history])
                # Set only to obtain the y vector; does not affect history.
                state.sys_intent = turn['intent']
                y = state.sys_vec
                x_dpl.append(x)
                y_dpl.append(y)
                # Record the system intent on the latest history entry.
                history[-1].sys_intent = turn['intent']
    x_dst = np.array(x_dst)
    y_dst = np.array(y_dst)
    x_dpl = np.array(x_dpl)
    y_dpl = np.array(y_dpl)
    logger.info(
        'dialog train data, x_dst %s y_dst %s x_dpl %s y_dpl %s',
        x_dst.shape, y_dst.shape, x_dpl.shape, y_dpl.shape)
    return x_dst, y_dst, x_dpl, y_dpl