# ---- コード例 #1 (score: 0) ----
# ファイル: ED_v7.py  プロジェクト: pinweihelai/ED
def similarity(asr_result, db_path):
    # while 1:
    # asr_result = input("Enter your input: ")
    # time0 = time.time()
    '''对于识别结果只有两个字的走WORD_CORRECT表,一个字的直接返回'''
    if len(asr_result) < 2 or len(set(asr_result)) < 2:
        return 0.0, u"无匹配".encode("utf-8")

    if len(asr_result) == 2:
        features = feature_names_word
        tf_array = tf_word_array
        id_list = word_id_list
        table = 'WORD_CORRECT'
        flag = 0
    elif len(asr_result) > 2:
        features = feature_names
        tf_array = tf_sen_array
        id_list = sen_id_list
        table = 'REQ_ANS_DATA_ZH'
        flag = 1

    # asr_result = re.sub(u'呢', 'ne', asr_result)#防止呢的拼音被轉換成‘ni’
    # asr_result = re.sub(u'还', 'hai', asr_result)#防止還的拼音被轉換成‘huan’
    # asr_result = re.sub(u'嗯', 'en', asr_result)  # 因爲嗯的拼音沒有
    f, asr_result = isWaidi(asr_result)
    if f:
        return 1.0, asr_result
    asr_result = unicode(asr_result, 'utf-8')
    asr_result = asr_result.strip(u'?')
    asr_result_pinyin = ' '.join(get_pinyin(asr_result))
    print asr_result_pinyin
    word_list = []
    # 將語音識別出來的結果與數據庫中所有的記錄一起轉換詞頻向量
    word_list.append(asr_result_pinyin)
    # print(word_list)r

    try:
        if flag == 0:
            tf_vectorizer = CountVectorizer(analyzer='word',
                                            ngram_range=(2, 2))
        else:
            tf_vectorizer = CountVectorizer(analyzer='word',
                                            ngram_range=(1, 1))

        tf_vectorizer.fit(word_list)
    except ValueError:
        return 0.0, u"无匹配".encode("utf-8")
    else:
        # tf_df = pd.DataFrame(tf.toarray())
        print tf_vectorizer.get_feature_names()
        feature_list = tf_vectorizer.get_feature_names()
        # singleWord_list = []
        # twoGram_list = []
        # for item in feature_names:
        #     if ' ' in item:
        #         twoGram_list.append(item)
        #     else:
        #         singleWord_list.append(item)
        vector_start_time = time.time()
        vector = np.zeros(len(tf_array[0]))
        for word in feature_list:
            if word in features:
                vector[features.index(word)] += 1
                #else:
                # if ' ' in word:
                #     continue
                '''如果识别结果是两个字,则单个字不再计算编辑距离'''
            # if flag == 0 and ' ' not in word:
            #     continue
            for i in range(len(features)):
                # if ' ' in feature_names[i]:
                #     continue
                '''优化计算编辑距离效率'''
                # if ' ' in word:
                #     if ' ' not in features[i]:
                #         continue
                # else:
                #     if ' ' in features[i]:
                #         continue
                if flag == 0:
                    leven_cost = difflib_leven_word(word, features[i])
                else:
                    leven_cost = difflib_leven_zi(word, features[i])
                if ' ' in word and leven_cost <= 2:
                    vector[i] += 1
                elif ' ' not in word and leven_cost <= 1:
                    vector[i] += 1

        print vector
        vector_end_time = time.time()
        print "构造向量时间:", (vector_end_time - vector_start_time)
        if vector.any() == 0:
            distance = 0.0
            changeText = u"无匹配"
            return distance, changeText.encode("utf-8")

        print len(tf_array[0])
        cos_list = []  # 保存計算出來的所有相似度
        cos_start_time = time.time()

        for i in range(len(tf_array)):
            csim_value = csim(np.matrix(tf_array[i]), np.matrix(vector))
            # print(cos(tf_array[i], tf_array[-1]))
            cos_list.append(csim_value)
        # time1 = time.time()
        # print "運行時間:",time1-time0
        cos_end_time = time.time()
        print "计算余弦相似度时间:", (cos_end_time - cos_start_time)
        # print cos_list
        max_index = np.argmax(cos_list)  # 獲得餘弦距離最大值的索引
        ID = id_list[max_index]  # 獲得餘弦距離最大值對應數據庫中的ID
        print "相似度為:", cos_list[max_index]  # 返回餘弦距離最大值的索引

        db_start_time = time.time()
        sql = "select name from " + table + " where id = '%s'" % ID
        cx = sqlite3.connect(db_path)
        cu = cx.cursor()
        cu.execute(sql)

        changeText = cu.fetchone()[0]
        if cos_list[max_index] > 0.55:
            sql = "update " + table + " set class = class + 1 where id = '%s'" % ID
            cu.execute(sql)
            cx.commit()
        cu.close()
        cx.close()
        db_end_time = time.time()
        print "查询数据库时间:", (db_end_time - db_start_time)
        print "替換為:", changeText
        return cos_list[max_index], changeText.encode("utf-8")
# ---- コード例 #2 (score: 0) ----
# ファイル: script.py  プロジェクト: pinweihelai/ED
'''
將文本文件中的内容轉化為拼音插入數據庫REQ_ANS_DATA_ZH
'''
import sqlite3
import codecs
import re
from pinyin_test import get_pinyin
# Read one question per line, strip CJK punctuation, convert to pinyin,
# and insert the rows into REQ_ANS_DATA_ZH starting at id 1700.
cx = sqlite3.connect("d:/share/ai2.0/bin/correct/correct.db")
cu = cx.cursor()
reader = codecs.open("d:/house9.txt", 'r', encoding='utf-8')
row_id = 1700
try:
    for raw_line in reader:
        line = raw_line.strip()
        # An empty line (or EOF) ends the import, matching the original
        # readline loop's behavior.
        if not line:
            break
        # Drop full-width punctuation before pinyin conversion.
        line = re.sub(u'[,。?:;!/、]', '', line)
        print(line)

        line_pinyin = ' '.join(get_pinyin(line))

        # Parameterized insert: a quote inside `line` must not break
        # (or inject into) the SQL statement.
        cu.execute(
            "insert into REQ_ANS_DATA_ZH(id,name,condition) values(?,?,?)",
            (row_id, line, line_pinyin))
        row_id += 1

    cx.commit()
finally:
    # Release file handle and DB resources even on failure.
    reader.close()
    cu.close()
    cx.close()
# ---- コード例 #3 (score: 0) ----
            asr_result = re.sub(word, '', asr_result)
            print 'after filter:', asr_result
    return asr_result


'''构造句向量矩阵REQ_ANS_DATA_ZH表'''
# Build the sentence term-frequency matrix from table REQ_ANS_DATA_ZH:
# one pinyin "document" per row, with the row id recorded in parallel so
# a matched vector can be mapped back to its database row.
cx = sqlite3.connect("d:/share/ai2.0/bin/correct/correct.db")
cu = cx.cursor()
cu.execute("select id,name from REQ_ANS_DATA_ZH")
word_list = []  # pinyin documents, one per DB row
id_list = []    # DB ids aligned index-for-index with word_list
for item in cu.fetchall():
    # `process` filters the raw text (defined earlier in this file);
    # rows whose pinyin comes back empty are skipped entirely so the
    # matrix rows stay aligned with id_list.
    word = process(item[1])
    word_pinyin = ' '.join(get_pinyin(word))
    if word_pinyin == '':
        continue
    word_list.append(word_pinyin)
    id_list.append(item[0])  # keep the DB id for this row

# Unigram + bigram counts over pinyin syllables.
tf_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))
tf = tf_vectorizer.fit_transform(word_list)
tf_array = tf.toarray()
print len(tf_array[0])
feature_names = tf_vectorizer.get_feature_names()
'''构造词向量矩阵word_correct表'''
# Start building the word term-frequency data from the word_correct
# table, reusing the cursor opened above.  Note the column is spelled
# "pingyin" (sic) in the schema.  The fetch loop follows below.
cu.execute("select id, pingyin from word_correct")
word_id_list = []
word_list = []
for item in cu.fetchall():