def similarity(asr_result, db_path):
    '''Recognition results of exactly two characters are matched against the
    WORD_CORRECT table; shorter (or single-repeated-character) results return
    immediately. NOTE: for a UTF-8 byte string, len() counts bytes, not
    characters.'''
    if len(asr_result) < 2 or len(set(asr_result)) < 2:
        return 0.0, u"无匹配".encode("utf-8")  # "no match"
    if len(asr_result) == 2:
        features = feature_names_word
        tf_array = tf_word_array
        id_list = word_id_list
        table = 'WORD_CORRECT'
        flag = 0
    else:
        features = feature_names
        tf_array = tf_sen_array
        id_list = sen_id_list
        table = 'REQ_ANS_DATA_ZH'
        flag = 1

    f, asr_result = isWaidi(asr_result)
    if f:
        return 1.0, asr_result

    asr_result = unicode(asr_result, 'utf-8')
    asr_result = asr_result.strip(u'?')
    asr_result_pinyin = ' '.join(get_pinyin(asr_result))
    print asr_result_pinyin

    # Convert the recognition result into a term-frequency vector over the
    # same feature space as the database records.
    word_list = [asr_result_pinyin]
    try:
        if flag == 0:
            tf_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
        else:
            tf_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
        tf_vectorizer.fit(word_list)
    except ValueError:
        return 0.0, u"无匹配".encode("utf-8")

    feature_list = tf_vectorizer.get_feature_names()
    print feature_list

    vector_start_time = time.time()
    vector = np.zeros(len(tf_array[0]))
    for word in feature_list:
        if word in features:
            vector[features.index(word)] += 1
        # Edit distance fills in near misses: pinyin bigrams may differ by
        # up to 2, single syllables by at most 1.
        for i in range(len(features)):
            if flag == 0:
                leven_cost = difflib_leven_word(word, features[i])
            else:
                leven_cost = difflib_leven_zi(word, features[i])
            if ' ' in word and leven_cost <= 2:
                vector[i] += 1
            elif ' ' not in word and leven_cost <= 1:
                vector[i] += 1
    print vector
    vector_end_time = time.time()
    print "vector construction time:", (vector_end_time - vector_start_time)

    if not vector.any():
        return 0.0, u"无匹配".encode("utf-8")

    print len(tf_array[0])
    cos_list = []  # similarity against every database record
    cos_start_time = time.time()
    for i in range(len(tf_array)):
        csim_value = csim(np.matrix(tf_array[i]), np.matrix(vector))
        cos_list.append(csim_value)
    cos_end_time = time.time()
    print "cosine similarity time:", (cos_end_time - cos_start_time)

    max_index = np.argmax(cos_list)  # index of the best cosine similarity
    ID = id_list[max_index]          # database ID of the best match
    print "similarity:", cos_list[max_index]

    db_start_time = time.time()
    sql = "select name from " + table + " where id = '%s'" % ID
    cx = sqlite3.connect(db_path)
    cu = cx.cursor()
    cu.execute(sql)
    changeText = cu.fetchone()[0]
    if cos_list[max_index] > 0.55:
        # Confident match: bump the hit counter on the matched record.
        sql = "update " + table + " set class = class + 1 where id = '%s'" % ID
        cu.execute(sql)
        cx.commit()
    cu.close()
    cx.close()
    db_end_time = time.time()
    print "database query time:", (db_end_time - db_start_time)
    print "replaced with:", changeText
    return cos_list[max_index], changeText.encode("utf-8")
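# A minimal usage sketch, not part of the original module: the sample ASR
# byte string is an assumption, and the db path is the one used elsewhere in
# this repo. The module-level matrices (tf_sen_array, feature_names,
# sen_id_list, and their word_correct equivalents) must already be built.
if __name__ == '__main__':
    score, text = similarity("什么时候交房",  # hypothetical ASR output
                             "d:/share/ai2.0/bin/correct/correct.db")
    print "score:", score
    print "text:", text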
'''
Convert the contents of a text file to pinyin and insert them into the
REQ_ANS_DATA_ZH table.
'''
import sqlite3
import codecs
import re

from pinyin_test import get_pinyin

cx = sqlite3.connect("d:/share/ai2.0/bin/correct/correct.db")
cu = cx.cursor()
reader = codecs.open("d:/house9.txt", 'r', encoding='utf-8')
i = 1700  # starting ID for the new rows
while 1:
    line = reader.readline().strip()
    if not line:
        break
    line = re.sub(u'[,。?:;!/、]', '', line)  # strip Chinese punctuation
    print(line)
    line_pinyin = ' '.join(get_pinyin(line))
    sql = "insert into REQ_ANS_DATA_ZH(id,name,condition) values(%d,'%s','%s')" % (
        i, line, line_pinyin)
    cu.execute(sql)
    i = i + 1
cx.commit()
cu.close()
cx.close()
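# The string-formatted INSERT above breaks on any line containing a single
# quote. A sketch of the same insert using sqlite3 placeholders, which quote
# values safely (not the original code, just an alternative):
#
#     cu.execute("insert into REQ_ANS_DATA_ZH(id,name,condition) values(?,?,?)",
#                (i, line, line_pinyin))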
    asr_result = re.sub(word, '', asr_result)
    print 'after filter:', asr_result
    return asr_result

'''Build the sentence-vector matrix from the REQ_ANS_DATA_ZH table'''
cx = sqlite3.connect("d:/share/ai2.0/bin/correct/correct.db")
cu = cx.cursor()
cu.execute("select id,name from REQ_ANS_DATA_ZH")
word_list = []
id_list = []
for item in cu.fetchall():
    word = process(item[1])
    word_pinyin = ' '.join(get_pinyin(word))
    if word_pinyin == '':
        continue
    word_list.append(word_pinyin)
    id_list.append(item[0])  # keep the database ID alongside its vector

tf_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))
tf = tf_vectorizer.fit_transform(word_list)
tf_array = tf.toarray()
print len(tf_array[0])
feature_names = tf_vectorizer.get_feature_names()

'''Build the word-vector matrix from the word_correct table'''
cu.execute("select id, pingyin from word_correct")
word_id_list = []
word_list = []
for item in cu.fetchall():
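# csim is called in similarity() above but is not defined in this section; a
# minimal cosine-similarity sketch consistent with how it is called (two 1xN
# np.matrix rows, returning a float) could be:
#
# def csim(a, b):
#     a = np.asarray(a).ravel()
#     b = np.asarray(b).ravel()
#     denom = np.linalg.norm(a) * np.linalg.norm(b)
#     return float(np.dot(a, b) / denom) if denom else 0.0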