def _build_feature_extractor(self, mode, files):
    print('Build feature extraction...')
    corpus = list()
    for path in files:
        with open(path, 'r') as f:
            for line in f:
                # line = json.loads(line.strip().decode('utf-8'))
                # question = line['question']
                question = line.replace('\t', '').replace(' ', '').strip('\n').decode('utf-8')
                question = QueryUtils.static_remove_cn_punct(str(question))
                tokens = self.cut(question)
                corpus.append(tokens)
    if mode == 'ngram':
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', '我', '我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    elif mode == 'tfidf':
        print_cn('use {0}'.format(mode))
        tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                                           max_df=1.0, min_df=1, sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
def get_train_data(data_path, normal_path, shuffle_path):
    add_dict()
    normal_f = open(normal_path, 'w')
    shuffle_f = open(shuffle_path, 'w')
    # drop_f = open(drop_path)
    with open(data_path, 'r') as f:
        for line in f:
            print_cn(line)
            line = json.loads(line)
            intentions = ','.join(line['intention'])
            questions = line['question']
            for question in questions:
                normal_f.write(intentions + '#' + question + '\n')
                tokens = cut(question)
                if len(tokens) <= 1:
                    continue
                elif len(tokens) == 2:
                    random.shuffle(tokens)
                    shuffle_f.write(intentions + '#' + ''.join(tokens) + '\n')
                else:
                    for _ in range(2):
                        random.shuffle(tokens)
                        shuffle_f.write(intentions + '#' + ''.join(tokens) + '\n')
    normal_f.close()
    shuffle_f.close()
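# Note (added; format inferred from get_train_data above, sample values are
# illustrative only): both output files hold one example per line in the form
# "<intention1>,<intention2>#<question>", e.g.
#     购物,衣服#我想买一件衣服
# shuffle_path additionally repeats each question with its tokens shuffled,
# presumably as a simple data-augmentation step.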
def _build_feature_extraction(self, data_path):
    print('Build feature extraction...')
    corpus = list()
    with open(data_path, 'r') as f:
        reader = csv.reader(f, delimiter='#')
        for line in reader:
            b = line[1].decode('utf-8')
            # b = QueryUtils.static_remove_stop_words(b)
            tokens = QueryUtils.static_jieba_cut(b)
            corpus.append(tokens)
    if self.mode == 'ngram':
        print_cn('Use {0}'.format(self.mode))
        bigram_vectorizer = CountVectorizer(
            ngram_range=(1, 2), min_df=0.0, max_df=1.0, analyzer='char',
            stop_words=[',', '?', '我', '我要'], binary=True)
        self.feature_extractor = bigram_vectorizer.fit(corpus)
    elif self.mode == 'tfidf':
        print_cn('Use {0}'.format(self.mode))
        tfidf_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2),
                                           max_df=1.0, min_df=1, sublinear_tf=True)
        self.feature_extractor = tfidf_vectorizer.fit(corpus)
    elif self.mode == 'fasttext':
        pass
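# A minimal standalone sketch (added; the sample sentences below are illustrative,
# not taken from the training data) of the char 1-2gram CountVectorizer fitted in
# _build_feature_extraction above, and of how an unseen query would be transformed
# with the fitted extractor:
from sklearn.feature_extraction.text import CountVectorizer

_corpus = [u'我 要 买 热水器', u'我 想 取 一 千 块']
_vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='char', binary=True)
_extractor = _vectorizer.fit(_corpus)               # fit() returns the fitted vectorizer
_features = _extractor.transform([u'我 要 取 钱'])  # sparse char n-gram indicators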
def classify():
    try:
        args = request.args
        q = args['q']
        q = urllib.unquote(q).decode('utf8')
        ids = [char2index.get(c, 3) for c in q]
        print_cn(q)
        print(ids)
        inputs_length = [len(ids)] * BATCH_SIZE
        ids_inputs = [ids for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = np.asarray(predicting_embed_inputs)
        answer_logits = tf_sess.run(tf_model.predicting_logits,
                                    feed_dict={
                                        tf_model.encoder_inputs.name: ids_inputs,
                                        tf_model.encoder_inputs_length.name: inputs_length
                                    })
        prediction = recover(answer_logits.tolist()[0], index2char, False)
        # print(answer_logits.tolist()[0])
        print("predict->", prediction)
        print("-----------------------")
        return prediction
    except Exception:
        return None
def predict(dict_path, model_path):
    sess, model = load_tf_session(dict_path, model_path)
    char2index, index2char = init_dict(dict_path)
    while True:
        line = _get_user_input()
        line = line.strip().decode('utf-8')
        ids = [char2index.get(c, 3) for c in line]
        print_cn(line)
        print(ids)
        inputs_length = [len(ids)] * BATCH_SIZE
        ids_inputs = [ids for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
        # predicting_embed_inputs = np.asarray(predicting_embed_inputs)
        answer_logits = sess.run(model.predicting_logits,
                                 feed_dict={
                                     model.encoder_inputs.name: ids_inputs,
                                     model.encoder_inputs_length.name: inputs_length
                                 })
        prediction = recover(answer_logits.tolist()[0], index2char, False)
        # print(answer_logits.tolist()[0])
        print("predict->", prediction)
        print("-----------------------")
def predict(data_path, encoder_vocab_path, decoder_vocab_path, model_path, embedding='word2vec'):
    if embedding == 'word2vec':
        encoder_vocab, embeddings = load_word2vec(encoder_vocab_path)
    else:
        encoder_vocab, embeddings = load_fasttext(encoder_vocab_path)
    decoder_vocab = load_decoder_vocab(decoder_vocab_path)
    global ENC_VOCAB_SIZE
    ENC_VOCAB_SIZE = len(encoder_vocab)
    global DEC_VOCAB_SIZE
    DEC_VOCAB_SIZE = len(decoder_vocab)
    global EMBEDDING_SIZE
    EMBEDDING_SIZE = len(embeddings[0])
    for word in encoder_vocab:
        jieba.add_word(word)
    for word in decoder_vocab:
        jieba.add_word(word)
    model = BeliefRnn(decoder_vocab, False)
    model.build_graph()
    saver = tf.train.Saver()
    # loaded_graph = tf.Graph()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(model.embedding_init, feed_dict={model.embedding_placeholder: embeddings})
        _check_restore_parameters(sess, saver, model_path)
        while True:
            line = _get_user_input()
            predicting_inputs = list(jieba.cut(line.strip().decode('utf-8')))
            ids = []
            for word in predicting_inputs:
                if word in encoder_vocab:
                    ids.append(encoder_vocab.index(word))
                else:
                    ids.append(encoder_vocab.index('#UNK#'))
            print_cn(predicting_inputs)
            predicting_inputs_length = [len(predicting_inputs)] * BATCH_SIZE
            ids_inputs = [ids for _ in range(BATCH_SIZE)]
            # predicting_embed_inputs = [[fasttext_wv(word) for word in predicting_inputs] for _ in range(BATCH_SIZE)]
            # predicting_embed_inputs = np.asarray(predicting_embed_inputs)
            answer_logits = sess.run(model.predicting_logits,
                                     feed_dict={
                                         model.encoder_inputs.name: ids_inputs,
                                         model.encoder_inputs_length.name: predicting_inputs_length
                                     })
            prediction = recover_label(answer_logits.tolist()[0], decoder_vocab)
            # print(answer_logits.tolist()[0])
            print("predict->", prediction)
            print("-----------------------")
def make(self, q, slot):
    b, q = self.quant_fix(q)
    if b:
        cn_util.print_cn(str(q))
        r = self.jieba_cut(''.join(q))  # purify
        return self.all_possible(r, slot)
    else:
        return None
def _request_solr(self, q):
    ## cut q into tokens
    tokens = ['question:' + s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
    q = ' OR '.join(tokens)
    url = self.qa_url.format(q)
    # print('qa_debug:', url)
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
def check_zero_tokens(self, tokens):
    count = 0
    target_len = len(tokens)
    for word in tokens:
        word = word.encode('utf-8')
        if word.strip() in w2v_model:
            count += 1
    if count != target_len:
        print_cn(tokens)
    return count == target_len
def _request_solr(self, q):
    tokenized, exact_q = self.purify_q(q)
    if not self.last_g:
        url = self.i_url % (tokenized, exact_q)
        self.last_g = q
    else:
        last_tkz, last_exact_q = self.purify_q(self.last_g)
        url = self.simple_context_i_url % (tokenized, exact_q, last_tkz, last_exact_q)
        self.last_g = q
    cn_util.print_cn('debug:interactive_url:' + url)
    r = requests.get(url)
    return r
def online(model_path):
    clf = Multilabel_Clf.load(model_path=model_path)
    print('loaded model file...')
    try:
        while True:
            question = raw_input('input something...\n')
            tokens = jieba.cut(question, cut_all=True)
            labels, probs = clf.predict(list(tokens))
            print_cn(labels, probs)
            print('-------------------------')
    except KeyboardInterrupt:
        print('interaction interrupted')
def _request_solr(self, q, key, base_url):
    ## cut q into tokens
    key = '%s:' % key
    tokens = [s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
    if len(tokens) == 0:
        return None
    q = key + "(" + '%20'.join(tokens) + ")"
    url = base_url % q
    cn_util.print_cn(url)
    r = requests.get(url)
    return r
def _request_solr(self, q, key, base_url):
    try:
        ## cut q into tokens
        key = 'fq=%s:' % key
        tokens = [s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
        q = key + '%20'.join(tokens)
        url = base_url.format(q)
        cn_util.print_cn(url)
        r = requests.get(url)
        return r
    except:
        traceback.print_exc()
        return None
def get_w2v_emb(self, tokens):
    embedding = np.zeros((1, 300), dtype=np.float32)
    count = 0
    # print_cn(tokens)
    for word in tokens:
        word = word.encode('utf-8')
        if word.strip() in w2v_model:
            vector = w2v_model[word.strip()]
            result = [v for v in vector]
            embedding = np.add(embedding, np.asarray(result))
            # print embedding
            count += 1
    if count == 0:
        print('get...', count)
        print_cn(tokens)
        return False
    else:
        embedding = np.divide(embedding, count)
        return np.squeeze(embedding)
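# A minimal standalone sketch (added; the name w2v_cosine is illustrative, not part
# of the original file): cosine similarity between two averaged sentence vectors as
# produced by get_w2v_emb above, assuming both calls returned a numpy array rather
# than False.
import numpy as np

def w2v_cosine(emb_a, emb_b):
    # Guard against zero vectors to avoid division by zero.
    denom = np.linalg.norm(emb_a) * np.linalg.norm(emb_b)
    if denom == 0:
        return 0.0
    return float(np.dot(emb_a, emb_b) / denom)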
def select_max_match_with_sim(self, q, r):
    if not self.bt:
        return None
    matched_questions = SolrUtils.get_dynamic_response(r=r, key='question',
                                                       random_hit=False,
                                                       random_field=True,
                                                       keep_array=False,
                                                       facet=True)
    q_tokens = ' '.join(QueryUtils.static_jieba_cut(q))
    matched_questions_tokens = [' '.join(QueryUtils.static_jieba_cut(mqt)) for mqt in matched_questions]
    max_sim = self.bt.getMaxSim(q_tokens, matched_questions_tokens)
    best_sentence = ''.join(max_sim['sentence'].split(' '))
    sim = max_sim['sim']
    cn_util.print_cn(best_sentence, str(sim), '[' + ','.join(matched_questions) + ']')
    if sim > 0.3:
        index = matched_questions.index(best_sentence)
        answer = SolrUtils.get_dynamic_response(r, key='answer', force_hit=index,
                                                random_field=True, random_hit=False)
        return answer
    return None
def _fasttext_vector(self, tokens):
    if not self.weighted:
        try:
            weights = np.ones(shape=len(tokens))
            url = self.fasttext_url_weighted.format(
                ','.join(tokens), ",".join([str(weight) for weight in weights]))
        except:
            traceback.print_exc()
    else:
        try:
            idf_url = "http://10.89.100.14:3032/s/{0}".format("%7C".join(tokens))
            idf_r = requests.get(url=idf_url)
            weights = []
            returned_json = idf_r.json()
            max_weight = 1
            for key, value in returned_json.iteritems():
                if value > max_weight:
                    max_weight = value
            for token in tokens:
                if token not in returned_json:
                    weights.append(str(max_weight))
                else:
                    weights.append(str(returned_json[token]))
            url = self.fasttext_url_weighted.format(','.join(tokens), ','.join(weights))
        except:
            traceback.print_exc()
            url = self.fasttext_url.format(','.join(tokens))
    try:
        r = requests.get(url=url)
        vector = r.json()['vector']
        return vector
    except:
        print_cn(url)
        traceback.print_exc()
        return None
def travel_with_clf(self, node, tokens, gbdt_recursion=True):
    key = None  # word/tag
    next_node = None
    key_found = False
    num_found = False
    value_types = node.value_types
    query = ''.join(tokens)

    if "RANGE" in value_types and gbdt_recursion:
        try:
            slot, proba = self.gbdt.predict(parent_slot=node.slot, input_=query)
            next_node = self.graph.get_global_node(slot=slot)
            num_found = next_node is not None and proba > 0.95
            if num_found:
                cn_util.print_cn('found type by gbdt_range:%s,%s' % (cn_util.cn(slot), proba))
                self.last_slot = node.slot
            else:
                cn_util.print_cn('NOT found type by gbdt_range:%s,%s' % (cn_util.cn(slot), proba))
        except Exception as e:
            print(e.message)
            num_found = False
def _num_answer(self, r):
    return int(r.json()["response"]["numFound"])

def _get_response(self, r, i=0):
    try:
        a = r.json()["response"]["docs"][i]['answer']
        rr = np.random.choice(a, 1)[0]
        x = random.randint(0, min(0, len(a) - 1))
        return rr.encode('utf8')
    except:
        return None

def purify_q(self, q):
    q = self.qu.remove_cn_punct(q)
    pos_q = self.qu.corenlp_cut(q, remove_tags=["CD", "VA", "AD", "VC"])
    return ''.join(pos_q), q

if __name__ == '__main__':
    qa = SimpleQAKernel()
    cn_util.print_cn(qa.kernel(u'得基怎么去')[1])
        try:
            i = i + 1
            components = line.split('##')
            user = components[0]
            if user != current_user:
                current_user = user
                instance.main_kernel.clear_state()
                instance.last_response = None
            question = components[2]
            question = question.split(":")[1]
            answer = instance.kernel(question.decode('utf-8'))
            print('---%d--%s###%s' % (i, question, answer))
        except:
            instance.main_kernel.clear_state()


if __name__ == '__main__':
    kernel = EntryKernel()
    input_file = '../data/sc/test/test.txt'
    # test(input_file, kernel)
    # while True:
    input_ = raw_input()
    input_ = input_.decode('utf-8')
    response = kernel.kernel(input_)
    print_cn(response)
    #
    # response = kernel.kernel(u'我不买实惠的衣服')
    # print(response)
    children = line.split('#')[1].split(':')[2].strip()
    children = children.split(',')
    for i, item in enumerate(children):
        children[i] = Node(children[i])
    return Node(parent, children)


def build_tree(data_path):
    tree = Tree()
    with open(data_path, 'r') as inp:
        for line in inp:
            node = get_node(line)
            tree.add(node)
    return tree


def sort_intention(input_):
    tree = build_tree('../../../data/sc/belief_graph.txt')
    tree.pre_order(tree.root)
    pre_order_list = tree.pre_order_list
    input_ = input_.split(',')
    sorted_intention = sorted(input_, key=lambda x: pre_order_list.index(x))
    return sorted_intention


if __name__ == '__main__':
    test = ['女,购物,衣服', '吃饭,低,有', '辣,吃饭,低,有,女,购物,衣服']
    for t in test:
        print_cn(sort_intention(t))
                q = line[1].encode('utf-8')
                response = self.make(q, '存款')
            elif u'取款' in line[0]:
                q = line[1].encode('utf-8')
                response = self.make(q, '取款')
            elif u'转账' in line[0]:
                q = line[1].encode('utf-8')
                response = self.make(q, '转账')
            if response:
                write = []
                for r in response:
                    w = [line[0], r.strip()]
                    write.append(w)
                for w in write:
                    ww = '\t'.join(w)
                    mm = ww.strip()
                    out.write(mm + '\n')


if __name__ == '__main__':
    qu = QueryUtils()
    jieba.load_userdict("../data/char_table/ext1.dic")
    # qu.process_data('../data/business/intention_pair_q', '../data/business/business_train_v7')
    #
    # print(QueryUtils.static_remove_cn_punct(u'我在电视上见过你,听说你很聪明啊?'))
    # cn_util.print_cn(qu.quant_bucket_fix('一点钱'))
    # cn_util.print_cn(qu.quant_bucket_fix('我要取1千零1百'))
    # cn_util.print_cn(QueryUtils.static_jieba_cut('紫桂焖大排', smart=True, remove_single=True))
    cn_util.print_cn(QueryUtils.static_remove_stop_words('我来高兴哈'))
    # cn_util.print_cn(','.join(jieba.cut_for_search('南京精菜馆'.decode('utf-8'))))
def main():
    model_path = '../model/sc/belief_clf.pkl'
    train_data_path = '../data/sc/train/sale_train0831.txt'
    test_data_path = '../data/sc/train/sale_train0831.txt'
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', choices={'train', 'test'}, default='train',
                        help="mode; defaults to 'train' if not specified")
    args = parser.parse_args()
    if args.m == 'train':
        train(train_data_path, model_path)
    elif args.m == 'test':
        test(test_data_path, model_path)
    else:
        print('Unknown mode, exit.')


if __name__ == '__main__':
    # main()
    model_path = '../model/sc/belief_clf.pkl'
    clf = Multilabel_Clf.load(model_path=model_path)
    inputs = [u"买热水器"]
    for p in inputs:
        # p is already a unicode literal, so it is passed through without decoding
        labels, probs = clf.predict(input_=p)
        cn_util.print_cn(','.join(labels))
class GKernel:
    def __init__(self, graph_path, clf_path):
        # self.tokenizer = CoreNLP()
        self.graph = None
        self.gbdt = None
        self.state_cleared = True
        self._load_graph(graph_path)
        self._load_clf(clf_path)
        self.qu = QueryUtils()

    last_slot = None

    base_url = "http://localhost:11403/solr/business/select?defType=edismax&indent=on&wt=json&rows=1"
    trick_url = "http://localhost:11403/solr/trick/select?defType=edismax&indent=on&wt=json&rows=10"
    # tokenizer_url = "http://localhost:5000/pos?q="

    def kernel(self, query):
        return self.r_walk(query=query)

    def clear_state(self):
        print('state cleared')
        self.state_cleared = True
        self.last_slot = None

    def _load_clf(self, path):
        print('loading gbdt classifier...')
        with open(path, 'rb') as f:
            self.gbdt = pickle.load(f)

    def _load_graph(self, path):
        print('loading logic graph...')
        with open(path, "rb") as input_file:
            self.graph = pickle.load(input_file)

    def num_answer(self, r):
        return int(r.json()["response"]["numFound"])

    def travel_with_clf(self, node, tokens, gbdt_recursion=True):
        key = None  # word/tag
        next_node = None
        key_found = False
        num_found = False
        value_types = node.value_types
        query = ''.join(tokens)

        if "RANGE" in value_types and gbdt_recursion:
            try:
                slot, proba = self.gbdt.predict(parent_slot=node.slot, input_=query)
                next_node = self.graph.get_global_node(slot=slot)
                num_found = next_node is not None and proba > 0.95
                if num_found:
                    cn_util.print_cn('found type by gbdt_range:%s,%s' % (cn_util.cn(slot), proba))
                    self.last_slot = node.slot
                else:
                    cn_util.print_cn('NOT found type by gbdt_range:%s,%s' % (cn_util.cn(slot), proba))
            except Exception as e:
                print(e.message)
                num_found = False

        # last try of RANGE
        if not num_found and "RANGE" in value_types:
            for t in tokens:
                try:
                    t = cn2arab.cn2arab(t)[1].replace(' ', '').replace('\t', '').encode('utf-8')
                    if t.isdigit():
                        try:
                            next_node = node.go(q=float(t), value_type="RANGE")
                            num_found = next_node is not None
                            if num_found:
                                cn_util.print_cn('found type by RANGE: %s' % next_node.slot)
                                self.last_slot = node.slot
                                key = t
                                break
                        except Exception as e:
                            print(e.message)
                            num_found = False
                except Exception as e:
                    print(e.message)
                    num_found = False

        if not key_found and "KEY" in value_types and gbdt_recursion:
            try:
                slot, proba = self.gbdt.predict(parent_slot=node.slot, input_=query)
                next_node = self.graph.get_global_node(slot=slot)
                key_found = next_node is not None and proba > 0.95
                if key_found:
                    cn_util.print_cn('found type by gbdt_key:%s,%s' % (cn_util.cn(slot), proba))
                    self.last_slot = node.slot
                else:
                    cn_util.print_cn('NOT found type by gbdt:%s,%s' % (cn_util.cn(slot), proba))
            except Exception as e:
                print(e.message)
                key_found = False

        # last try of KEY
        if not key_found and not num_found and "KEY" in value_types:
            for t in tokens:
                try:
                    value_types = node.value_types
                    if "KEY" in value_types and not t.isdigit():
                        try:
            response = None
            if u'存款' in line[0]:
                q = line[1].encode('utf-8')
                response = self.make(q, '存款')
            elif u'取款' in line[0]:
                q = line[1].encode('utf-8')
                response = self.make(q, '取款')
            elif u'转账' in line[0]:
                q = line[1].encode('utf-8')
                response = self.make(q, '转账')
            if response:
                write = []
                for r in response:
                    w = [line[0], r.strip()]
                    write.append(w)
                for w in write:
                    ww = '\t'.join(w)
                    mm = ww.strip()
                    out.write(mm + '\n')


if __name__ == '__main__':
    qu = QueryUtils()
    qu.process_data('../data/business/intention_pair_q', '../data/business/business_train_v7')
    # print(QueryUtils.static_remove_cn_punct(u'我在电视上见过你,听说你很聪明啊?'))
    cn_util.print_cn(qu.quant_bucket_fix('一点钱'))
    # cn_util.print_cn(qu.quant_bucket_fix('我要取1千零1百'))
def r_walk_with_pointer_with_clf(self, query, given_slot=None):
    r = None
    response = None
    if given_slot == '#NULL#':
        given_slot = None
        self.last_slot = None
        self.clear_state()
    if self.state_cleared:
        if given_slot:
            self.should_clear_state(self.last_slot)
            url = self.base_url + "&q=exact_question:" + \
                query + "%20AND%20exact_intention:" + given_slot
        else:
            url = self.base_url + "&q=exact_question:" + query
        cn_util.print_cn('exact_try_url %s' % url)
        r = requests.get(url)
        if self.num_answer(r) > 0:
            self.state_cleared = False
            return_slot = self.last_slot = self.get_intention(r)
            self.should_clear_state(self.last_slot)
            return_response = self.get_response(r)
            cn_util.print_cn('clear exact_%s, %s' % (return_slot, self.get_response(r)))
            return return_slot, return_response
        else:
            tokenized = self.qu.corenlp_cut(query, self.qu.remove_tags)
            if len(tokenized) == 0:
                # do trick
                self.clear_state()
                return None, self.trick(query)
            if given_slot:
                cn_util.print_cn('given:%s' % given_slot)
                given = self.graph.get_global_node(given_slot)
            else:
                given = self.graph
            fixed_query_tokens = self.qu.quant_bucket_fix(query)
            node = self.travel_with_clf(given, fixed_query_tokens)
            self.state_cleared = False
            if node != given:
                if self.last_slot == given.slot:
                    url = self.base_url + "&q=exact_intention:" + node.slot
                else:
                    url = self.base_url + "&q=exact_last_intention:" + \
                        self.last_slot + "%20AND%20exact_intention:" + node.slot
                cn_util.print_cn("gbdt_result_url %s" % url)
                r = requests.get(url)
                if self.num_answer(r) > 0:
                    self.last_slot = node.slot
                    self.should_clear_state(node.slot)
                    cn_util.print_cn('clear deepest_ %s, %s' % (node.slot, self.get_response(r)))
                    return node.slot, self.get_response(r)
                else:
                    return None, self.trick(query)
    else:
        if given_slot:
            self.last_slot = given_slot
        else:
            if not self.last_slot:
                parent_slot = self.graph.get_global_node(self.last_slot).parent_node.slot
                cn_util.print_cn('retrace...', self.last_slot, parent_slot)
                self.clear_state()
                slot, response = self.r_walk_with_pointer_with_clf(query, parent_slot)
                return slot, response
        url = self.base_url + "&q=exact_question:" + query + \
            "%20AND%20exact_last_intention:" + self.last_slot
        cn_util.print_cn('non_clear_url_first_try', url)
        r = requests.get(url)
        if self.num_answer(r) > 0:
            self.state_cleared = False
            self.last_slot = slot_ = self.get_intention(r)
            self.should_clear_state(self.last_slot)
            cn_util.print_cn('non clear exact_', slot_, self.get_response(r))
            return slot_, self.get_response(r)
        else:
            node = self.graph.all_nodes[self.last_slot]
            next_node = None
            tks = self.qu.corenlp_cut(query)
            if len(tks) == 0:
                self.clear_state()
                return self.r_walk_with_pointer_with_clf(query)
            fixed_query_tokens = self.qu.quant_bucket_fix(query)
            next_node = self.travel_with_clf(node, fixed_query_tokens)
            if next_node == node:
                # query solr
                parent_slot = self.graph.get_global_node(self.last_slot).parent_node.slot
                cn_util.print_cn('retrace...', self.last_slot, parent_slot)
                self.clear_state()
                return self.r_walk_with_pointer_with_clf(query, parent_slot)
            else:
                url = self.base_url + "&q=last_intention:" + \
                    self.last_slot + "%20AND%20intention:" + next_node.slot
                cn_util.print_cn("non_clear_go_deeper_url:", url)
                r = requests.get(url)
                if self.num_answer(r) > 0:
                    response = self.get_response(r)
                    slot_ = self.get_intention(r)
                    self.last_slot = slot_
                    self.state_cleared = False  # but
                    cn_util.print_cn(str(self.graph.get_global_node(slot_).classified_out_neighbors))
                    self.should_clear_state(slot_)
                    cn_util.print_cn('non clear deepest _', slot_, self.get_response(r))
                    return slot_, self.get_response(r)
                else:
                    # do trick
                    self.clear_state()
                    url = self.concat_solr_request(query=query, base_url=self.trick_url)
                    r = requests.get(url)
                    response = self.get_response(r)
                    cn_util.print_cn("None-CLEAR-Trick", self.get_response(r))
                    return None, response
            else:
                return False, np.random.choice(self.null_anwer, 1)[0]
        except:
            return False, np.random.choice(self.null_anwer, 1)[0]

    def _request_solr(self, q, key, base_url):
        try:
            ## cut q into tokens
            key = 'fq=%s:' % key
            tokens = [s for s in QueryUtils.static_jieba_cut(q, smart=False, remove_single=True)]
            q = key + '%20'.join(tokens)
            url = base_url.format(q)
            cn_util.print_cn(url)
            r = requests.get(url)
            return r
        except:
            traceback.print_exc()
            return None

    def _num_answer(self, r):
        return int(r.json()["response"]["numFound"])


if __name__ == '__main__':
    qa = QAKernel()
    # result = qa.kernel(u'三星手机在哪', u"Omega,一期三楼")
    cn_util.print_cn(qa.kernel(u'星巴克好吃吗', u"Omega,一期三楼")[1])