import random
import sqlite3
import time

import jieba


def robot_chat(word):
    start_time = time.time()
    con = sqlite3.connect('weibo_words.db')
    cur = con.cursor()
    word_list = jieba.lcut(word)
    # sort the segmented words by length so the longer (more specific) ones come last
    solved_words = sorted(word_list, key=lambda x: len(x))
    if len(solved_words) >= 5:
        # pick one of the longer words (but not the longest) as the keyword
        solved_words = random.choice(solved_words[-3:-1])
    elif 5 > len(solved_words) >= 1:
        solved_words = solved_words[-1]
    else:
        solved_words = '没有发现相关话题'
    # parameterised LIKE query instead of string formatting, to avoid SQL injection
    cur.execute("SELECT Words FROM words WHERE Words LIKE ?",
                ('%{}%'.format(solved_words),))
    data = cur.fetchall()
    con.close()
    end_time = time.time()
    spend_time = end_time - start_time
    print(f"[Spend_time]:{round(spend_time, 2)}s [Keyword]:{solved_words}")
    if len(data) == 1:
        robot_words = data[0][0]  # fetchall() returns tuples; take the text column
        return robot_words
    elif len(data) > 1:
        robot_words = random.choice(data)[0]
        return robot_words
    else:
        return '小K,听不懂你说的这个{}'.format(word)
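# Hedged usage sketch: assumes a weibo_words.db file exists with a `words` table
# whose `Words` column holds candidate replies; the sample sentence below is
# made up for illustration.
print(robot_chat('今天的天气怎么样'))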
def get_single_answer(q):
    # ask LUIS for the most likely intent of the question
    luis_ret = get_question_type(q)
    intent = luis_ret['topScoringIntent']['intent']
    # QnA Maker knowledge-base ids, keyed by intent name
    kb_dict = {
        u'职务查询': 'd1ed6abf-2e70-4fd1-bd7b-6ccd3135ec74',
        u'复旦大学概况查询': '00716ffa-7171-49d9-83ca-ac5987b1af09',
        u'电话查询': '6f632b07-e80f-4db6-ac5a-eddd53f0df75',
        u'传真号码查询': '507cad69-c46b-4417-877d-901e754d8da8',
        u'网站查询': '28fb93b0-8af0-4846-8c74-eef524c6fd01',
        u'学院概况': '41546723-30a2-4507-85e0-c737428df79f'
    }
    trusted_list = []
    words = jieba.lcut(q)
    words = [i.encode('utf-8') for i in words]
    distinctive_type(trusted_list, words)
    if not trusted_list:
        # no distinctive keyword found: fall back to querying every knowledge base
        trusted_list = kb_dict.keys()
    res = []
    for kb in trusted_list:
        a, score = qna_maker_fetch(q, kb_dict[kb])
        if intent == kb:  # boost the knowledge base that matches the LUIS intent
            score += 10
        res.append((a, score))
        if score > 80:
            break
    # return the answer with the highest score
    res.sort(key=lambda x: -x[1])
    return res[0][0]
def process_question(st, request):
    if type(st) == unicode:
        st = st.encode('utf-8')
    if request:
        request.session.get('prev_ans')  # previous answer from the session (currently unused)
    st = question_preprocess(st)

    l = jieba.lcut(st)
    l = [i.encode('utf-8') for i in l]
    conj_list = ['和', '与', '还有', ',']
    idx = -1
    questions = []
    answers = []

    # find the first conjunction, which marks a compound question
    for i, word in enumerate(l):
        if word in conj_list:
            idx = i
            break
    # build two single questions: drop the word after the conjunction,
    # then drop the word before it
    if idx != -1:
        try:
            bak = copy.deepcopy(l)
            del l[idx + 1]
            del l[idx]
            questions.append(''.join(l))
            l = bak
            del l[idx]
            del l[idx - 1]
            questions.append(''.join(l))

        except IndexError:
            print 'index error'
    else:
        questions.append(st)

    # answer each single question, using the true/false handler when it applies
    for q in questions:
        parsed_l = jieba.lcut(q)
        parsed_l = [i.encode('utf-8') for i in parsed_l]
        tf, ans = TF_question(q, parsed_l)
        if tf:
            ans = TF_handler(q, parsed_l)
            answers.append(ans)
        else:
            ans = get_single_answer(q)
            answers.append(ans)
    return ','.join(answers)
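# Hedged usage sketch (Python 2, matching the snippet above): the helpers
# question_preprocess, TF_question, TF_handler and get_single_answer must be
# importable; the compound question below is made up for illustration.
print process_question('校长的电话和传真', None)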
import jieba
from jieba.analyse import extract_tags as tfidf
from jieba.analyse import textrank


def keyword_tfidf(sentence, rate=1.):
    """
        Build a keyword string from the sentence using TF-IDF.
    :param sentence: str, input sentence
    :param rate: float, 0-1, fraction of the segmented words to keep
    :return: str
    """
    sen_words = jieba.lcut(sentence)
    top_k = int(len(sen_words) * rate)
    keyword = tfidf(sentence, topK=top_k, withWeight=False, withFlag=False)
    # keep the original word order, dropping words that are not keywords
    keyword_sort = [k if k in keyword else '' for k in sen_words]
    return ''.join(keyword_sort)


def keyword_textrank(sentence,
                     rate=1.,
                     allow_pos=('an', 'i', 'j', 'l', 'r', 't', 'n', 'nr', 'ns',
                                'nt', 'nz', 'v', 'vd', 'vn')):
    """
        Build a keyword string from the sentence using TextRank.
    :param sentence: str, input sentence, e.g. '大漠帝国是谁呀,你知道吗'
    :param rate: float, 0-1, e.g. 0.6
    :param allow_pos: tuple of POS tags to keep, e.g. ('ns', 'n', 'vn', 'v')
    :return: str, e.g. '大漠帝国'
    """
    sen_words = jieba.lcut(sentence)
    top_k = int(len(sen_words) * rate)
    keyword = textrank(sentence,
                       topK=top_k,
                       allowPOS=allow_pos,
                       withWeight=False,
                       withFlag=False)
    # keep the original word order, dropping words that are not keywords
    keyword_sort = [k if k in keyword else '' for k in sen_words]
    return ''.join(keyword_sort)
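# Hedged usage sketch for the two keyword extractors above; the sentence comes
# from the docstring example, the rate value is an arbitrary choice.
if __name__ == '__main__':
    sample = '大漠帝国是谁呀,你知道吗'
    print(keyword_tfidf(sample, rate=0.6))
    print(keyword_textrank(sample, rate=0.6))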
import pickle

import jieba
from numpy import dot, linalg, zeros
from tqdm import tqdm


def cos_dis(vector1, vector2):
    # cosine similarity between two vectors
    return dot(vector1, vector2) / (linalg.norm(vector1) * linalg.norm(vector2))


# pre-trained 100-dimensional word vectors, keyed by utf-8 encoded word
word_dict = pickle.load(open("nlpcc_dict_20160605"))
input_path = "/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/nlpcc-iccpol-2016.dbqa.training-data"
out_path = "similarity_0606"
f_input = open(input_path, "r").readlines()
print "total lines of input is {}".format(len(f_input))
dis_list = []
for line in tqdm(f_input):
    each = line.split("\t")
    # sum the word vectors of the question and of the candidate answer,
    # then score the pair by cosine similarity
    question = jieba.lcut(each[0])
    question_vector = zeros(100)
    for word in question:
        one_vec = word_dict[word.encode("utf8")]
        question_vector += one_vec

    answer = jieba.lcut(each[1])
    answer_vector = zeros(100)
    for word in answer:
        one_vec = word_dict[word.encode("utf8")]
        answer_vector += one_vec

    dis = cos_dis(question_vector, answer_vector)
    dis_list.append(dis)
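# Quick sanity check for cos_dis with toy vectors (values made up): parallel
# vectors score 1.0, orthogonal vectors score 0.0.
from numpy import array

assert abs(cos_dis(array([1.0, 2.0]), array([2.0, 4.0])) - 1.0) < 1e-9
assert abs(cos_dis(array([1.0, 0.0]), array([0.0, 1.0]))) < 1e-9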
Example #7
with open(fn, encoding='utf-8') as f:  # fn: path of the text to segment (defined elsewhere)

    # load the stop-word list
    stopwords = [
        line.strip()
        for line in open('chineseStopWord', encoding='UTF-8').readlines()
    ]
    # also treat newline and whitespace characters as stop words
    stopwords.append('\n')
    stopwords.append('\u00A0')  # no-break space
    stopwords.append('\u0020')  # space
    stopwords.append('\u3000')  # ideographic (full-width) space

    # segment each line and drop the stop words
    out = []
    for line in f:
        seg_list = jieba.lcut(line)

        for i in seg_list:
            if i in stopwords:
                continue
            out.append(i)
out3 = out

    # Fragment from a larger loop: `dia` is one raw dialogue and `sent_id` a
    # running sentence counter, both defined outside this excerpt.
    dia = dia.split('\r\n')
    w_dia = ''
    for j in range(len(dia)):
        sent = dia[j][3:-7]  # drop the first 3 and last 7 characters of the raw line

        if 'greeting' in sent:
            # write the sentence id followed by the space-separated segmentation
            w_sent = str(sent_id)
            sent_id = sent_id + 1
            sent_list = jieba.lcut(sent[:sent.index('\t')])
            for word in sent_list:
                w_sent += ' '
                w_sent += word

            w_dia = w_dia + w_sent + '\n'
        elif 'request' in sent:
            pass
        elif 'inform' in sent:
            w_sent = str(sent_id)
            sent_id = sent_id + 1
            sent_list = jieba.lcut(sent[:sent.index('\t')])
            for word in sent_list:
                w_sent += ' '
                w_sent += word
Example #10
        # Fragment from a larger loop: initialise the slot annotations for this
        # sentence, then segment greeting/request sentences.
        slot_count = 'nil\t' + str(sent_id)
        slot_time = 'nil\t' + str(sent_id)
        slot_idnumber = 'nil\t' + str(sent_id)
        slot_destination = 'nil\t' + str(sent_id)
        slot_departure = 'nil\t' + str(sent_id)
        slot_name = 'nil\t' + str(sent_id)

        sent = dia[j][3:-7]  # drop the first 3 and last 7 characters of the raw line

        if 'greeting' in sent:
            w_sent = str(sent_id)
            sent_id = sent_id + 1
            sent_list = jieba.lcut(sent[:sent.index('\t')])
            for word in sent_list:
                w_sent += ' '
                w_sent += word

            w_dia = w_dia + w_sent + '\n'
        elif 'request' in sent:
            # normalise the full-width question mark; str.replace returns a new
            # string, so the result has to be assigned back
            sent = sent.replace('?', '?')
            w_sent = str(sent_id)
            sent_id = sent_id + 1
            sent_list = jieba.lcut(sent[:sent.index('\t')])
            for word in sent_list:
                w_sent += ' '
                w_sent += word

            w_dia = w_dia + w_sent + '\n'
import jieba
import wordcloud
import cv2

# use a map-of-China image as the word-cloud mask
mask = cv2.imread("chinamap.png")
f = open("关于实施乡村振兴战略的意见.txt", "r", encoding="utf-8")
t = f.read()
f.close()
ls = jieba.lcut(t)
txt = " ".join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", mask=mask, width=1000, height=700,
                        background_color="white")
w.generate(txt)
w.to_file("grwordcloud.png")
Example #12
def fc():
    if request.method == 'POST':
        # read the text from the POST form, segment it and return the word list as JSON
        text1 = request.form["str"]
        text2 = jieba.lcut(text1)
        return jsonify(text2)
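# The fragment above omits the Flask app object and the route registration; a
# minimal hedged sketch of how it might be wired up (the '/fc' URL is an
# assumption, the 'str' form field is carried over from the fragment):
from flask import Flask, request, jsonify
import jieba

app = Flask(__name__)


@app.route('/fc', methods=['POST'])
def fc_route():
    text1 = request.form['str']  # raw text from the POST form
    text2 = jieba.lcut(text1)    # segmented word list
    return jsonify(text2)


if __name__ == '__main__':
    app.run()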
Example #13
import re

import jieba
import numpy
import pandas as pd

# fetch 19 pages of comments (GetMovieComment, StoreTo_txt, movieId and page
# are defined elsewhere in this script)
comment_list = []
for i in range(1, 20):
    pageNum = str(page * i)
    comment_temp = GetMovieComment(movieId, pageNum)
    comment_list = comment_list + comment_temp

# keep only the Chinese characters of the concatenated comments
s = ''
for i in range(len(comment_list)):
    s = s + comment_list[i]
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, s)
cleaned_comments = ''.join(filterdata)
StoreTo_txt(cleaned_comments)
print('影评信息保存完毕!')

segment = jieba.lcut(cleaned_comments)
print(segment)
words_df = pd.DataFrame({'segment': segment})
"""Remove stop words"""
stopwords = pd.read_csv("stopwords.txt",
                        index_col=False,
                        quoting=3,
                        sep="\t",
                        names=['stopword'],
                        encoding='utf-8')  # quoting=3 (QUOTE_NONE): never treat quotes specially
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]
"""Word frequency statistics"""
# dict-style aggregation on a grouped column is no longer supported in recent
# pandas, so count the occurrences of each word explicitly
words_stat = words_df.groupby('segment')['segment'].agg(numpy.size).rename('计数').reset_index()
words_stat = words_stat.sort_values(by=["计数"], ascending=False)
print(words_stat.head(30))
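# An equivalent count with the standard library (a sketch, not in the original
# script): same tallies as the pandas groupby above, keyed by word.
from collections import Counter

stopword_set = set(stopwords.stopword)
word_freq = Counter(w for w in segment if w not in stopword_set)
print(word_freq.most_common(30))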
Example #14
import os

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, Dense
from keras.layers import Activation
from keras.layers.embeddings import Embedding
#from keras.layers import LSTM
from keras.layers.recurrent import SimpleRNN
import numpy as np
import jieba

# read the first line of every Tang poem and Song ci file, segment it and
# collect the space-joined text for the tokenizer
str_list = []
path = "C:/Users/Oliver/Desktop/RNN/"
for i in os.listdir(path + "唐诗"):
    with open(path + "唐诗/" + str(i)) as f:
        data = f.readline()
        data = jieba.lcut(data)
        str_list_pre = " ".join(data)
        str_list.append(str_list_pre)  # append the joined string (it was otherwise unused)
for i in os.listdir(path + "宋词"):
    with open(path + "宋词/" + str(i)) as f:
        data = f.readline()
        data = jieba.lcut(data)
        str_list_pre = " ".join(data)
        str_list.append(str_list_pre)
token = Tokenizer(num_words=5000)
token.fit_on_texts(str_list)
print(token.word_index)
X_train_seq = token.texts_to_sequences(str_list)
X_train = sequence.pad_sequences(X_train_seq, maxlen=40, padding="post")
Y_train = [0] * 1191 + [1] * 1186  # labels: 0 for Tang poems, 1 for Song ci
print(len(X_train))
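# The example stops after preparing X_train/Y_train; a hedged sketch of a small
# SimpleRNN classifier that would fit the imports above (the vocabulary size of
# 5000 and maxlen of 40 come from the snippet, the layer sizes are assumptions):
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=40))
model.add(Dropout(0.2))
model.add(SimpleRNN(units=16))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# model.fit(X_train, np.array(Y_train), batch_size=32, epochs=10, validation_split=0.2)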
namelist_question.append('乘客的姓名是?')
namelist_question.append('请问先生怎么称呼?')
namelist_question.append('请问小姐怎么称呼?')
namelist_question.append('请问老人家怎么称呼?')
namelist_question.append('先生您怎么称呼?')
namelist_question.append('小姐您怎么称呼?')
namelist_question.append('先生您叫什么名字?')
namelist_question.append('小姐您叫什么名字?')
namelist_question.append('您的名字?')
namelist_question.append('先生的名字?')
namelist_question.append('小姐的名字?')
namelist_question.append('乘客姓名?')
namelist_question.append('姓名?')
namelist_question.append('名字?')
namelist_question.append('可否请教先生名姓?')
namelist_question.append('小姐芳名可否见告?')
namelist_question.append('麻烦您说一下您的姓名可以吗?')
namelist_question.append('麻烦说下您的名字?谢谢。')
namelist_question.append('请告知姓名,谢谢。')
namelist_question.append('麻烦您告诉我您的名字,非常感谢。')

# segment every question and store it as a space-separated line
namelist_question_cut = []
for ans in namelist_question:
    w_sent = ''
    sent = jieba.lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    namelist_question_cut.append(w_sent)
Example #16
greetinglist_client.append('我想预订宾馆')
greetinglist_client.append('我要预订房间。')
greetinglist_client.append('请帮我预订房间')
greetinglist_client.append('请您帮我预订一下宾馆房间。')
greetinglist_client.append('我需要预订房间。')
greetinglist_client.append('您好,我需要预订宾馆')
greetinglist_client.append('您好,我想预订宾馆')
greetinglist_client.append('您好,我要预订宾馆。')
greetinglist_client.append('您好,请帮我预订房间')
greetinglist_client.append('您好,请您帮我预订一下房间。')
greetinglist_client.append('您好,我需要预订宾馆。')

greetinglist_server_split = []
for ans in greetinglist_server:
    w_sent = ''
    sent = jieba.lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    greetinglist_server_split.append(w_sent)

greetinglist_client_split = []
for ans in greetinglist_client:
    w_sent = ''
    sent = jieba.lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    greetinglist_client_split.append(w_sent)
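# The two loops above repeat the same "segment, then join with leading spaces"
# pattern; a compact equivalent helper (a sketch, not part of the original code):
def split_sentences(sentences):
    return [''.join(' ' + word for word in jieba.lcut(s)) + '\n' for s in sentences]

# e.g. greetinglist_client_split = split_sentences(greetinglist_client)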
Example #17
import re
import pickle
import jieba

# collect every distinct question/answer sentence from the NLPCC DBQA training
# data, then build the word vocabulary by segmenting each sentence
input_path = '/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/nlpcc-iccpol-2016.dbqa.training-data'
# input_path = '/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/NLPCC2016QA-Update/evatestdata2-dbqa.testing-data-answers'
f_input = open(input_path, 'r').readlines()
print 'total lines of input is {}'.format(len(f_input))
sent_set = set()
question_set = set()
ans_set = set()
vocab = set()
for line in f_input:
    each = line.split('\t')
    # question_set.add(each[0])
    # ans_set.add(each[1])
    # if each[1] in ans_set:
    #     print each[1]
    sent_set.add(each[0])
    sent_set.add(each[1])
print 'total num of sents:{}'.format(len(sent_set))
# print len(question_set)
# print len(ans_set)
for sent in sent_set:
    words = jieba.lcut(sent)
    for word in words:
        vocab.add(word)
print len(vocab)
# pickle.dump(vocab,open('vocabSet_in_NLPCC_0701','w'))