def get_index_entities_regex(self, entities):
    """Convert a list of entities into regular expressions."""
    ret = {}
    for x in entities:
        assert 'entity' in x and isinstance(x['entity'], str), \
            'an entity must have an "entity" attribute of type str'
        # collect the literal values of the entity, flattening nested lists
        data = []
        for item in x['data']:
            if isinstance(item, str):
                data.append(item)
            elif isinstance(item, list):
                for iitem in item:
                    if isinstance(iitem, str):
                        data.append(iitem)
        # cap the number of alternatives to keep the pattern manageable
        if len(data) > LIMIT:
            data = shuffle(data, random_state=0)
            data = data[:LIMIT]
        data = [clean_re(d) for d in data]
        r = '(?:' + '|'.join(data) + ')'
        # a hand-written "regex" attribute extends the alternation
        if 'regex' in x:
            r += '|(?:' + x['regex'] + ')'
        regex = '\\s*(?:' + r + ')\\s*'
        ret[x['entity']] = regex
        LOG.debug('entity: %s regex: %s', x['entity'], regex)
    return ret
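
# A minimal sketch (hypothetical entity data) of the regex shape produced
# above: each entity name maps to an alternation of its escaped values,
# padded with optional whitespace.
def _demo_entities_regex():
    import re
    # e.g. ret['city'] for an entity with data ['北京', '上海']
    regex = '\\s*(?:(?:北京|上海))\\s*'
    assert re.fullmatch(regex, ' 北京 ')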
def unit_test():
    """Unit test."""
    from nlu.utils.data_loader import load_nlu_data
    from nlu.utils.data_iob import data_to_iob
    intents, entities = load_nlu_data('nlu_data')
    # intents = [x for x in intents if x['intent'] == 'lottery_inform']
    # print(intents)
    sentence_result, slot_result, _ = data_to_iob(intents, entities)
    # print(max([len(x) for x in sentence_result]))
    NeuralSlotFiller.cv_eval(sentence_result, slot_result, cv=5)
    # NOTE: early exit; only the cross-validation above runs, the
    # fit/eval/predict smoke test below is kept for manual use
    exit(0)
    eng = NeuralSlotFiller()
    eng.fit(sentence_result, slot_result)
    LOG.debug('neural slot filler fitted')
    metrics = eng.eval(sentence_result, slot_result, progress=True)
    for k, v in metrics.items():
        print(k, v)
    # acc, bad = eng.exact_eval(sentence_result, slot_result)
    # print('exact acc', acc)
    # print('bad count', len(bad))
    print(eng.predict([list('我要买第18138期')]))
def __init__(self, intent, index_entities_regex):
    """Initialize."""
    assert isinstance(intent['intent'], str), 'invalid intent'
    assert intent['intent'].strip(), 'intent must not be empty'
    self.intent = intent['intent'].strip()
    # domain is optional: keep it only if it is a non-empty string
    domain = intent.get('domain')
    self.domain = domain.strip() \
        if isinstance(domain, str) and domain.strip() else None
    self.data = intent['data']
    slot_index = {}

    def _replace(part):
        """Convert one part of the sentence template: if the part is an
        entity, return its regular expression; if it is plain text,
        return the (escaped) text."""
        if 'name' in part:
            slot_name = part['name']
            if slot_name not in slot_index:
                slot_index[slot_name] = 0
            slot_index[slot_name] += 1
            temp = '(?P<{slot_name}{splitor}{index}>{slot_regex})'
            if slot_name in index_entities_regex:
                return temp.format(
                    slot_name=slot_name,
                    splitor=self.slot_name_splitor,
                    index=slot_index[slot_name],
                    slot_regex=index_entities_regex[slot_name])
            # fallback: no known entity regex, match the literal text
            return temp.format(
                slot_name=slot_name,
                splitor=self.slot_name_splitor,
                index=slot_index[slot_name],
                slot_regex=clean_re(part['text']))
        text = part['text']
        place = []

        def choice(x):
            """Swap a [[a|b|c]] choice group for a placeholder and
            remember the escaped alternation it stands for."""
            x = x.group(1).split('|')
            place_id = '__place__{}__'.format(len(place))
            place.append((place_id,
                          '(?:' + '|'.join([clean_re(xx) for xx in x]) + ')'))
            return place_id

        # protect choice groups, escape the rest, then restore them
        text = re.sub(r'\[\[([^\]]+)\]\]', choice, text)
        text = clean_re(text)
        for k, v in place:
            text = text.replace(k, v)
        return text

    self.patten = re.compile(
        '^' + ''.join([_replace(x) for x in self.data]) + '$')
    LOG.debug('patten: %s', self.patten)
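
# A minimal sketch (hypothetical template) of the pattern built above:
# plain-text parts may contain [[a|b]] choice groups, and entity parts
# become named groups like (?P<date__1>...), assuming slot_name_splitor
# is '__'.
def _demo_intent_patten():
    import re
    patten = re.compile('^(?:我要|我想要)买第(?P<date__1>\\d+)期$')
    match = patten.match('我想要买第18138期')
    assert match and match.group('date__1') == '18138'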
def load_nlu_data(data_dir):
    """Load the contents of an NLU data directory.

    The directory should contain intents and entities subdirectories
    holding intent and entity definitions in YAML format.
    """
    assert os.path.exists(data_dir), \
        'data directory "{}" does not exist'.format(data_dir)
    paths = []
    for dirname, _, filenames in os.walk(data_dir):
        filenames = [x for x in filenames if x.endswith('.yml')]
        for filename in filenames:
            path = os.path.join(dirname, filename)
            paths.append(path)
    assert paths, 'no YAML data file found; file names must end with ".yml"'
    entities = []
    intents = []
    for path in paths:
        with open(path, 'r') as fp:
            try:
                objs = yaml.safe_load(fp)
            except yaml.YAMLError:
                raise Exception(
                    'failed to read "{}"; it may not be valid YAML'.format(path))
        assert isinstance(objs, (list, tuple)), \
            'the data file must be a list or tuple "{}"'.format(path)
        for obj in objs:
            if isinstance(obj, dict):
                if 'intent' in obj:
                    assert 'data' in obj, \
                        'an intent must have a "data" attribute "{}"'.format(path)
                    assert isinstance(obj['data'], (list, tuple)) \
                        and obj['data'], \
                        'an intent must have a non-empty "data" "{}"'.format(path)
                    intents.append(obj)
                elif 'entity' in obj:
                    assert 'data' in obj, \
                        'an entity must have a "data" attribute "{}"'.format(path)
                    assert 'copyFrom' in obj \
                        or (isinstance(obj['data'], (list, tuple)) \
                            and obj['data']), \
                        'an entity must have "copyFrom", or a non-empty "data" "{}"'.format(path)
                    entities.append(obj)
    entities = entity_merge(entities)
    LOG.debug('loaded %s intents and %s entities', len(intents), len(entities))
    return intents, entities
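
# A sketch of the YAML layout load_nlu_data expects (hypothetical file
# nlu_data/intents/lottery.yml); each file holds a list of dicts carrying
# either an "intent" or an "entity" key:
#
#   - intent: lottery_inform
#     domain: lottery
#     data:
#       - text: 我要买第
#       - name: date
#         text: '18138'
#       - text: 期
#   - entity: date
#     data:
#       - '18138'
#       - '18139'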
def fit(self, sentence_result, domain_result,
        feature='tfidf', algorithm='LinearSVC'):
    """Fit model."""
    LOG.debug('fit MLIntentClassifier')
    (model_intent, model_domain,
     x_train, y_train_intent, y_train_domain) = self.build_model(
         sentence_result, domain_result, feature, algorithm)
    self.model_intent, self.model_domain = model_intent, model_domain
    self.model_intent.fit(x_train, y_train_intent)
    self.model_domain.fit(x_train, y_train_domain)
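
# Typical usage (a sketch; sentence_result / domain_result come from
# data_to_iob):
#
#   clf = MLIntentClassifier()
#   clf.fit(sentence_result, domain_result,
#           feature='tfidf', algorithm='LinearSVC')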
def load_models(model_dir='./tmp/nlu_model'):
    """Load trained models."""
    config_path = os.path.join(model_dir, 'config.json')
    if not os.path.exists(config_path):
        LOG.error('config_path not exists "%s"', config_path)
        exit(1)
    with open(config_path) as fp:
        pipeline_config = json.load(fp)
    models = []
    for model_name in pipeline_config:
        model_path = os.path.join(model_dir, '{}.pkl'.format(model_name))
        with open(model_path, 'rb') as fp:
            model = pickle.load(fp)
        models.append((model_name, model))
    return models
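
# Typical usage (a sketch; MODELS is the pipeline consumed by web_parse
# below):
#
#   MODELS = load_models('./tmp/nlu_model')
#   for model_name, _model in MODELS:
#       LOG.info('loaded %s', model_name)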
def predict_slot(self, nlu_obj):
    """Recognize entities (slot filling)."""
    tokens = nlu_obj['tokens']
    tokens = [x.lower() for x in tokens]
    ret = self.predict([tokens])
    LOG.debug('crf_slot_filler raw %s', ret)
    crf_ret = get_slots_detail(nlu_obj['tokens'], ret[0])
    nlu_obj['crf_slot_filler'] = {'slots': crf_ret}
    for slot in crf_ret:
        slot['from'] = 'crf_slot_filler'
    if len(nlu_obj['slots']) <= 0:
        nlu_obj['slots'] = crf_ret
    else:
        # only add a CRF slot if it does not overlap any existing slot;
        # two closed intervals [a, b] and [c, d] overlap iff a <= d and c <= b
        for slot in crf_ret:
            is_include = any(
                slot['pos'][0] <= s['pos'][1] and s['pos'][0] <= slot['pos'][1]
                for s in nlu_obj['slots'])
            if not is_include:
                nlu_obj['slots'].append(slot)
    nlu_obj['slots'] = sorted(nlu_obj['slots'], key=lambda x: x['pos'][0])
    return nlu_obj
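
# A quick check of the interval-overlap rule used above, on hypothetical
# slot positions (closed intervals [start, end] of token indices):
def _demo_overlap():
    def overlap(a, b):
        return a[0] <= b[1] and b[0] <= a[1]
    assert overlap((2, 5), (4, 8))      # partial overlap
    assert overlap((2, 5), (3, 4))      # containment
    assert not overlap((2, 5), (6, 8))  # disjoint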
def data_to_iob(intents, entities):
    """Convert the data to IOB (inside-outside-beginning) format."""
    np.random.seed(0)
    index_entities_data = get_index_entities_data(entities)
    keys = sorted([(k, len(v)) for k, v in index_entities_data.items()],
                  key=lambda x: x[1])
    for k, v in keys:
        LOG.debug('kv %s %s', k, v)
    # count how often each slot name appears across all intents
    slot_count = {}
    for intent in intents:
        for item in intent['data']:
            if 'name' in item:
                slot_name = item['name']
                if slot_name not in slot_count:
                    slot_count[slot_name] = 0
                slot_count[slot_name] += 1
    sentence_result, slot_result, domain_result = [], [], []
    LOG.debug('parallel job %s', len(intents))
    ret = Parallel(n_jobs=8, verbose=6)(
        delayed(convert_item)(intent, index_entities_data, slot_count)
        for intent in intents)
    LOG.debug('parallel job done')
    for r1, r2, r3 in ret:
        sentence_result += r1
        slot_result += r2
        domain_result += r3
    # dump the generated samples for inspection
    with open('/tmp/nlu_iob.txt', 'w') as fp:
        for a, b, c in zip(sentence_result, slot_result, domain_result):
            fp.write('\t'.join(a) + '\n')
            fp.write('\t'.join(b) + '\n')
            fp.write(c + '\n')
            fp.write('\n')
    LOG.debug('return IOB data')
    return sentence_result, slot_result, domain_result
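
# The dump written to /tmp/nlu_iob.txt interleaves tokens, IOB tags and
# the domain/intent label, with a blank line between samples. A
# hypothetical record (assuming SPLITOR is '__'):
#
#   我	要	买	第	1	8	1	3	8	期
#   O	O	O	O	B-date	I-date	I-date	I-date	I-date	O
#   lottery__lottery_inform
#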
def build_model(self, sentence_result, domain_result, feature, algorithm):
    """Build the intent and domain models."""
    self.build_vectorizer(feature)
    x_text = [''.join(x).lower() for x in sentence_result]
    y_class_domain = [x.split(SPLITOR)[0] for x in domain_result]
    y_class_intent = [x.split(SPLITOR)[1] for x in domain_result]
    # dump the training pairs for inspection
    with open('/tmp/ml_intent_classifier.tmp', 'w') as fp:
        for x, y in zip(x_text, y_class_intent):
            fp.write('{}\t{}\n'.format(x, y))
    x_train = self.vectorizer.fit_transform(x_text)

    # bijection between intent labels and integer class ids
    intent_class_index = {}
    intent_index_class = {}
    for i, c in enumerate(sorted(set(y_class_intent))):
        intent_class_index[c] = i
        intent_index_class[i] = c
    self.intent_class_index = intent_class_index
    self.intent_index_class = intent_index_class
    LOG.debug('ml_intent_classifier intent class %s', len(intent_class_index))
    y_train_intent = [self.intent_class_index[y] for y in y_class_intent]

    # bijection between domain labels and integer class ids
    domain_class_index = {}
    domain_index_class = {}
    for i, c in enumerate(sorted(set(y_class_domain))):
        domain_class_index[c] = i
        domain_index_class[i] = c
    self.domain_class_index = domain_class_index
    self.domain_index_class = domain_index_class
    LOG.debug('ml_intent_classifier domain class %s', len(domain_class_index))
    y_train_domain = [self.domain_class_index[y] for y in y_class_domain]

    if algorithm == 'RandomForest':
        model_intent, model_domain = [RandomForestClassifier(
            random_state=0, class_weight='balanced', n_jobs=-1)
                                      for _ in range(2)]
    elif algorithm == 'SVC':
        model_intent, model_domain = [SVC(
            random_state=0, probability=True, class_weight='balanced')
                                      for _ in range(2)]
    elif algorithm == 'LinearSVC':
        model_intent, model_domain = [LinearSVC(
            random_state=0, class_weight='balanced')
                                      for _ in range(2)]
    else:
        raise Exception('Unknown algorithm "{}"'.format(algorithm))
    return (model_intent, model_domain,
            x_train, y_train_intent, y_train_domain)
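
# The class/index mappings built above are plain bijections over the
# sorted label set; a minimal sketch with hypothetical labels:
def _demo_class_index():
    classes = sorted({'lottery', 'weather'})
    class_index = {c: i for i, c in enumerate(classes)}
    index_class = {i: c for i, c in enumerate(classes)}
    assert index_class[class_index['weather']] == 'weather'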
def build_model(nlu_data, model_dir, pipline_config):
    """Build the model pipeline and save it to model_dir."""
    models = []
    LOG.info('start build')
    intents, entities = load_nlu_data(nlu_data)
    iob = [None, None, None]

    def _get_iob(iob):
        """Load the IOB data only once."""
        if iob[0] is None:
            LOG.info('build IOB data')
            (sentence_result, slot_result,
             domain_result) = data_to_iob(intents, entities)
            iob = sentence_result, slot_result, domain_result
        return iob

    for item in pipline_config:
        LOG.info('train "%s"', item)
        if item == 'regex_engine':
            reng = RegexEngine(intents, entities)
            models.append(('regex_engine', reng))
        elif item == 'ml_intent_classifier':
            ml_intent = MLIntentClassifier()
            iob = _get_iob(iob)
            sentence_result, _, domain_result = iob
            ml_intent.fit(sentence_result, domain_result)
            models.append(('ml_intent_classifier', ml_intent))
        elif item == 'dl_intent_classifier':
            dl_intent = DLIntentClassifier()
            iob = _get_iob(iob)
            sentence_result, _, domain_result = iob
            dl_intent.fit(sentence_result, domain_result)
            models.append(('dl_intent_classifier', dl_intent))
        elif item == 'crf_slot_filler':
            crf_slot = CRFSlotFiller()
            iob = _get_iob(iob)
            sentence_result, slot_result, _ = iob
            crf_slot.fit(sentence_result, slot_result)
            models.append(('crf_slot_filler', crf_slot))
        elif item == 'neural_slot_filler':
            neural_slot = NeuralSlotFiller()
            iob = _get_iob(iob)
            sentence_result, slot_result, _ = iob
            neural_slot.fit(sentence_result, slot_result)
            models.append(('neural_slot_filler', neural_slot))
        elif item == 'neural_intent_classifier_slot_filler':
            nicsf = NeuralIntentClassifierSlotFiller()
            iob = _get_iob(iob)
            sentence_result, slot_result, domain_result = iob
            y_data = list(zip(slot_result, domain_result))
            nicsf.fit(sentence_result, y_data)
            models.append(('neural_intent_classifier_slot_filler', nicsf))
        else:
            LOG.error('invalid engine "%s"', item)
            raise Exception('invalid engine "%s"' % item)

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    config_path = os.path.join(model_dir, 'config.json')
    with open(config_path, 'w') as fp:
        json.dump(pipline_config, fp, indent=4, ensure_ascii=False)
    for model_name, model in models:
        model_path = os.path.join(model_dir, '{}.pkl'.format(model_name))
        with open(model_path, 'wb') as fp:
            pickle.dump(model, fp)
    LOG.info('trained and saved')
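
# Typical usage (a sketch; pipeline entries must match the engine names
# handled above):
#
#   build_model(
#       nlu_data='nlu_data',
#       model_dir='./tmp/nlu_model',
#       pipline_config=['regex_engine', 'ml_intent_classifier',
#                       'crf_slot_filler'])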
def fit(self, sentence_result, slot_result,
        max_iterations=100, c1=0.17, c2=0.01):
    """Fit model."""
    self.c1 = c1
    self.c2 = c2
    self.max_iterations = max_iterations
    LOG.debug('fit CRFSlotFiller')
    x_train = sentences_to_features(sentence_result)
    y_train = slot_result
    # collect all tag labels except the outside tag 'O'
    labels = set()
    for x in slot_result:
        labels.update(x)
    labels = sorted(labels)
    labels.remove('O')
    LOG.debug('labels: %s', ', '.join(labels))
    self.labels = labels
    # dump the training pairs for inspection
    LOG.debug('CRFSlotFiller try write tmp train data')
    with open('/tmp/crf_slot_filler.tmp', 'w') as fp:
        for x, y in zip(sentence_result, slot_result):
            line = []
            for i, x_i in enumerate(x):
                line.append('{}\t{}'.format(x_i, y[i]))
            fp.write('\n'.join(line) + '\n\n')
    LOG.debug('CRFSlotFiller try write tmp train data done')
    LOG.debug('x_train %d, y_train %d', len(x_train), len(y_train))
    if os.environ.get('CRF') == 'search':
        # randomized search over the c1/c2 regularization weights,
        # scored by weighted flat F1 over the slot labels
        crf = CRF(algorithm='lbfgs', max_iterations=50,
                  all_possible_transitions=True)
        params_space = {
            'c1': scipy.stats.expon(scale=0.5),
            'c2': scipy.stats.expon(scale=0.05),
        }
        f1_score = make_scorer(metrics.flat_f1_score,
                               average='weighted', labels=labels)
        rs = RandomizedSearchCV(crf, params_space, cv=3, verbose=1,
                                n_jobs=2, n_iter=8 * 8, scoring=f1_score)
        rs.fit(x_train, y_train)
        LOG.debug('best params: %s', rs.best_params_)
        LOG.debug('best cv score: %s', rs.best_score_)
        self.crf = rs.best_estimator_
    else:
        crf = CRF(algorithm='lbfgs', c1=c1, c2=c2,
                  max_iterations=max_iterations,
                  all_possible_transitions=True)
        # sanity check: each sentence must align with its tag sequence
        for x, y in zip(x_train, y_train):
            assert len(x) == len(y), '"{}", "{}" diff'.format(
                str([xx['token'] for xx in x]), str(y))
        crf.fit(x_train, y_train)
        self.crf = crf
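
# To run the randomized c1/c2 search above instead of a plain fit, set
# the CRF environment variable before calling fit:
#
#   os.environ['CRF'] = 'search'
#   CRFSlotFiller().fit(sentence_result, slot_result)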
def web_parse(sentence=None):
    """Serve NLU parse requests."""
    if sentence is None:
        return jsonify(success=False, message='sentence is None')
    nlu_obj = {
        'intent': None,
        'domain': None,
        'slots': [],
        'text': sentence,
        'tokens': list(sentence),
    }
    start_time = time.time()
    LOG.debug('start %s models', len(MODELS))
    # run the sentence through every model in the pipeline; each model
    # declares which of the three prediction steps it implements
    for model_name, model in MODELS:
        LOG.debug('through %s model %s', model_name, time.time() - start_time)
        if model.domain_implement:
            LOG.debug('through %s model predict_domain %s',
                      model_name, time.time() - start_time)
            nlu_obj = model.predict_domain(nlu_obj)
        if model.intent_implement:
            LOG.debug('through %s model predict_intent %s',
                      model_name, time.time() - start_time)
            nlu_obj = model.predict_intent(nlu_obj)
        if model.slot_implement:
            LOG.debug('through %s model predict_slot %s',
                      model_name, time.time() - start_time)
            nlu_obj = model.predict_slot(nlu_obj)
    LOG.debug('return %s', time.time() - start_time)
    return APP.response_class(
        response=simplejson.dumps({
            'success': True,
            'result': nlu_obj,
        }, indent=4, ensure_ascii=False),
        status=200,
        mimetype='application/json')
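
# A sketch of a request/response round trip; the actual route and the
# binding of "sentence" are defined elsewhere in this module, so the URL
# below is an assumption:
#
#   $ curl 'http://localhost:5000/parse?sentence=我要买第18138期'
#   {
#       "success": true,
#       "result": {
#           "intent": "lottery_inform",
#           "domain": "lottery",
#           "slots": [...],
#           "text": "我要买第18138期",
#           "tokens": ["我", "要", "买", "第", "1", "8", "1", "3", "8", "期"]
#       }
#   }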