import time
import random
import sqlite3

import jieba


def robot_chat(word):
    start_time = time.time()
    con = sqlite3.connect('weibo_words.db')
    cur = con.cursor()
    # segment the input and sort the tokens by length
    word_list = jieba._lcut(word)
    solved_words = sorted(word_list, key=lambda x: len(x))
    # print(solved_words)
    if len(solved_words) >= 5:
        # long input: pick one of the longer tokens at random
        solved_words = random.choice(sorted(word_list, key=lambda x: len(x))[-3:-1])
    elif 5 > len(solved_words) >= 1:
        # short input: keep the single longest token
        solved_words = solved_words[-1]
    else:
        solved_words = '没有发现相关话题'
    # parameterized query instead of string formatting (avoids SQL injection)
    cur.execute("SELECT Words FROM words WHERE Words LIKE ?",
                ('%{}%'.format(solved_words),))
    data = cur.fetchall()
    con.close()
    end_time = time.time()
    spend_time = end_time - start_time
    print(f"[Spend_time]:{round(spend_time, 2)}s [Keyword]:{solved_words}")
    if len(data) == 1:
        return data[0][0]  # fix: unpack the row tuple, as in the branch below
    elif len(data) > 1:
        return random.choice(data)[0]
    else:
        return '小K,听不懂你说的这个{}'.format(word)
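# A minimal usage sketch; it assumes weibo_words.db exists with a table
# words(Words), which the original snippet does not show.
if __name__ == '__main__':
    print(robot_chat('今天天气怎么样'))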
def get_single_answer(q):
    # classify the question with LUIS, then query the matching QnA Maker KBs
    luis_ret = get_question_type(q)
    intent = luis_ret['topScoringIntent']['intent']
    # intent name -> QnA Maker knowledge-base id
    kb_dict = {
        u'职务查询': 'd1ed6abf-2e70-4fd1-bd7b-6ccd3135ec74',
        u'复旦大学概况查询': '00716ffa-7171-49d9-83ca-ac5987b1af09',
        u'电话查询': '6f632b07-e80f-4db6-ac5a-eddd53f0df75',
        u'传真号码查询': '507cad69-c46b-4417-877d-901e754d8da8',
        u'网站查询': '28fb93b0-8af0-4846-8c74-eef524c6fd01',
        u'学院概况': '41546723-30a2-4507-85e0-c737428df79f',
    }
    trusted_list = []
    words = jieba._lcut(q)
    words = [i.encode('utf-8') for i in words]
    distinctive_type(trusted_list, words)
    if not trusted_list:
        trusted_list = kb_dict.keys()
    res = []
    for kb in trusted_list:
        a, score = qna_maker_fetch(q, kb_dict[kb])
        if intent == kb:  # fix: compare intent names; kb_dict.get(intent) is a KB id and never equals a key
            score += 10   # boost the KB that matches the LUIS intent
        res.append((a, score))
        if score > 80:
            break
    # highest score first
    res.sort(key=lambda x: -x[1])
    return res[0][0]
def process_question(st, request):
    if type(st) == unicode:
        st = st.encode('utf-8')
    if request:
        request.session.get('prev_ans')  # note: return value unused as written
    st = question_preprocess(st)
    l = jieba._lcut(st)
    l = [i.encode('utf-8') for i in l]
    # conjunctions that signal a compound question
    conj_list = ['和', '与', '还有', ',']
    idx = -1
    questions = []
    answers = []
    for i, word in enumerate(l):
        if word in conj_list:
            idx = i
            break
    if idx != -1:
        # split "A<conj>B" into two questions: first drop the conjunction plus
        # the word after it, then the conjunction plus the word before it
        try:
            bak = copy.deepcopy(l)
            del l[idx + 1]
            del l[idx]
            questions.append(''.join(l))
            l = bak
            del l[idx]
            del l[idx - 1]
            questions.append(''.join(l))
        except IndexError:
            print 'index error'
    else:
        questions.append(st)
    for q in questions:
        parsed_l = jieba._lcut(q)
        parsed_l = [i.encode('utf-8') for i in parsed_l]
        tf, ans = TF_question(q, parsed_l)
        if tf:
            ans = TF_handler(q, parsed_l)
            answers.append(ans)
        else:
            ans = get_single_answer(q)
            answers.append(ans)
    return ','.join(answers)
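# A hedged usage sketch (Python 2, matching the snippet): a compound question
# is split on the conjunction and each part answered separately. It assumes
# question_preprocess, TF_question, TF_handler and the LUIS/QnA helpers above
# are importable; passing None skips the session lookup.
if __name__ == '__main__':
    print process_question('复旦大学的电话和传真', None)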
def keyword_tfidf(sentence, rate=1.):
    """
    Build a keyword string with TF-IDF; assumes tfidf is bound to
    jieba.analyse.tfidf (an alias of extract_tags).
    :param sentence: str, input sentence
    :param rate: float, 0-1, fraction of tokens to keep as keywords
    :return: str
    """
    sen_words = jieba._lcut(sentence)
    top_k = int(len(sen_words) * rate)
    keyword = tfidf(sentence, topK=top_k, withWeight=False, withFlag=False)
    # keep the keywords in their original sentence order
    keyword_sort = [k if k in keyword else '' for k in sen_words]
    return ''.join(keyword_sort)
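# Minimal usage sketch, assuming: import jieba; from jieba.analyse import tfidf
if __name__ == '__main__':
    # with rate=0.6 only the top 60% of tokens survive as keywords
    print(keyword_tfidf('大漠帝国是谁呀,你知道吗', rate=0.6))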
def keyword_textrank(sentence, rate=1., allow_pos=('an', 'i', 'j', 'l', 'r', 't', 'n',
                                                   'nr', 'ns', 'nt', 'nz', 'v', 'vd', 'vn')):
    """
    Build a keyword string with TextRank; assumes textrank is bound to
    jieba.analyse.textrank.
    :param sentence: str, input sentence, e.g. '大漠帝国是谁呀,你知道吗'
    :param rate: float, 0-1, e.g. 0.6
    :param allow_pos: tuple, POS tags to keep, e.g. ('ns', 'n', 'vn', 'v')
    :return: str, e.g. '大漠帝国'
    """
    sen_words = jieba._lcut(sentence)
    top_k = int(len(sen_words) * rate)
    keyword = textrank(sentence, topK=top_k, allowPOS=allow_pos, withWeight=False,
                       withFlag=False)
    # keep the keywords in their original sentence order
    keyword_sort = [k if k in keyword else '' for k in sen_words]
    return ''.join(keyword_sort)
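# Same usage pattern for the TextRank variant; per the docstring's example,
# '大漠帝国' is the expected keyword, though output varies with jieba version.
if __name__ == '__main__':
    print(keyword_textrank('大漠帝国是谁呀,你知道吗', rate=0.6))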
import pickle

import jieba
from numpy import dot, linalg, zeros
from tqdm import tqdm


def cos_dis(vector1, vector2):
    # cosine similarity of two vectors (despite the name, not a distance)
    return dot(vector1, vector2) / (linalg.norm(vector1) * linalg.norm(vector2))


# 100-d word vectors keyed by utf-8 encoded word
word_dict = pickle.load(open("nlpcc_dict_20160605"))
input_path = "/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/nlpcc-iccpol-2016.dbqa.training-data"
out_path = "similarity_0606"
f_input = open(input_path, "r").readlines()
print "total lines of input is {}".format(len(f_input))
dis_list = []
for line in tqdm(f_input):
    each = line.split("\t")
    # bag-of-words sentence vector: sum of the word vectors
    question = jieba._lcut(each[0])
    question_vector = zeros(100)
    # print question_vector.shape
    for word in question:
        one_vec = word_dict[word.encode("utf8")]
        # print one_vec.shape
        question_vector += one_vec
    answer = jieba._lcut(each[1])
    answer_vector = zeros(100)
    for word in answer:
        one_vec = word_dict[word.encode("utf8")]
        answer_vector += one_vec
    dis = cos_dis(question_vector, answer_vector)
    dis_list.append(dis)
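# A toy check of cos_dis(): parallel vectors score 1.0. Computed by hand,
# dot([1,2,3],[3,2,1]) = 10 and both norms are sqrt(14), so 10/14 = 0.714...
from numpy import array

print cos_dis(array([1., 2., 3.]), array([2., 4., 6.]))  # -> 1.0
print cos_dis(array([1., 2., 3.]), array([3., 2., 1.]))  # -> 0.714...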
with open(fn, encoding='utf-8') as file:  # fn: path to the corpus, defined elsewhere
    # load the stopword list
    stopwords = [
        line.strip()
        for line in open('chineseStopWord', encoding='UTF-8').readlines()
    ]
    stopwords.append('\n')      # newline
    stopwords.append('\u00A0')  # no-break space
    stopwords.append('\u0020')  # ordinary space
    stopwords.append('\u3000')  # full-width (ideographic) space
    # seg_list = jieba._lcut("大事发生大放")
    out = []
    for line in file:
        seg_list = jieba._lcut(line)
        for i in seg_list:
            if i in stopwords:
                continue
            out.append(i)
    out3 = out
dia = dia.split('\r\n')
w_dia = ''
for j in range(len(dia)):
    '''
    if j%2==0:
        sent_w=
    assert 'M' in dia[j]
    '''
    sent = dia[j][3:-7]
    if 'greeting' in sent:
        w_sent = str(sent_id)
        sent_id = sent_id + 1
        sent_list = jieba._lcut(sent[:sent.index('\t')])
        for word in sent_list:
            w_sent += ' '
            w_sent += word
        w_dia = w_dia + w_sent + '\n'
    elif 'request' in sent:
        pass
    elif 'inform' in sent:
        w_sent = str(sent_id)
        sent_id = sent_id + 1
        sent_list = jieba._lcut(sent[:sent.index('\t')])
        for word in sent_list:
            w_sent += ' '
            w_sent += word
# fragment from inside a dialogue loop; dia, j, sent_id and w_dia are defined above
'''
assert 'M' in dia[j]
'''
# one 'nil' placeholder per slot, tagged with the sentence id
slot_count = 'nil\t' + str(sent_id)
slot_time = 'nil\t' + str(sent_id)
slot_idnumber = 'nil\t' + str(sent_id)
slot_destination = 'nil\t' + str(sent_id)
slot_departure = 'nil\t' + str(sent_id)
slot_name = 'nil\t' + str(sent_id)
sent = dia[j][3:-7]
if 'greeting' in sent:
    w_sent = str(sent_id)
    sent_id = sent_id + 1
    sent_list = jieba._lcut(sent[:sent.index('\t')])
    for word in sent_list:
        w_sent += ' '
        w_sent += word
    w_dia = w_dia + w_sent + '\n'
elif 'request' in sent:
    sent = sent.replace('?', '?')  # fix: str.replace returns a new string
    w_sent = str(sent_id)
    sent_id = sent_id + 1
    sent_list = jieba._lcut(sent[:sent.index('\t')])
    for word in sent_list:
        w_sent += ' '
        w_sent += word
    w_dia = w_dia + w_sent + '\n'
import jieba
import wordcloud
import cv2

# use a China-map image as the word-cloud mask
mask = cv2.imread("chinamap.png")
f = open("关于实施乡村振兴战略的意见.txt", "r", encoding="utf-8")
t = f.read()
f.close()
# segment the text and join with spaces, as WordCloud expects
ls = jieba._lcut(t)
txt = " ".join(ls)
w = wordcloud.WordCloud(font_path="msyh.ttc", mask=mask, width=1000, height=700,
                        background_color="white")
w.generate(txt)
w.to_file("grwordcloud.png")
from flask import request, jsonify

import jieba


# assumes a surrounding Flask app and a route such as
# @app.route('/fc', methods=['POST'])
def fc():
    if request.method == 'POST':
        # read the request parameter and return the segmentation as JSON
        text1 = request.form["str"]
        text2 = jieba._lcut(text1)
        return jsonify(text2)
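# A minimal sketch of wiring fc() into an app and exercising it with Flask's
# test client; the Flask() app object and the '/fc' URL are assumptions, not
# part of the original snippet.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/fc', view_func=fc, methods=['POST'])

with app.test_client() as client:
    resp = client.post('/fc', data={'str': '我来到北京清华大学'})
    print(resp.get_json())  # e.g. ['我', '来到', '北京', '清华大学']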
# fragment: movieId, page, GetMovieComment and StoreTo_txt are defined
# elsewhere in the original script
import re

import jieba
import pandas as pd

comment_list = []
for i in range(1, 20):
    pageNum = str(page * i)
    comment_temp = GetMovieComment(movieId, pageNum)
    comment_list = comment_list + comment_temp
# print(comment_list)

# keep only Chinese characters
s = ''.join(comment_list)
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, s)
cleaned_comments = ''.join(filterdata)
StoreTo_txt(cleaned_comments)
print('影评信息保存完毕!')  # "movie reviews saved"

segment = jieba._lcut(cleaned_comments)
print(segment)
words_df = pd.DataFrame({'segment': segment})

# drop stopwords (quoting=3 means csv.QUOTE_NONE, i.e. no quoting at all)
stopwords = pd.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='utf-8')
words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

# word-frequency counts; groupby().agg({name: numpy.size}) was removed from
# recent pandas, and size() gives the same result
words_stat = words_df.groupby('segment').size().reset_index(name='计数')
words_stat = words_stat.sort_values(by=['计数'], ascending=False)
print(words_stat.head(30))
import os

import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dropout, Dense
from keras.layers import Activation
from keras.layers.embeddings import Embedding
# from keras.layers import LSTM
from keras.layers.recurrent import SimpleRNN

str_list = []
path = "C:/Users/Oliver/Desktop/RNN/"
# Tang poems (唐诗) -> label 0
for i in os.listdir(path + "唐诗"):
    with open(path + "唐诗/" + str(i)) as f:
        data = f.readline()
        data = jieba._lcut(data)
        str_list_pre = " ".join(data)
        str_list.append(str_list_pre)  # fix: append the joined string, not the token list
# Song ci (宋词) -> label 1
for i in os.listdir(path + "宋词"):
    with open(path + "宋词/" + str(i)) as f:
        data = f.readline()
        data = jieba._lcut(data)
        str_list_pre = " ".join(data)
        str_list.append(str_list_pre)

token = Tokenizer(num_words=5000)
token.fit_on_texts(str_list)
print(token.word_index)
X_train_seq = token.texts_to_sequences(str_list)
X_train = sequence.pad_sequences(X_train_seq, maxlen=40, padding="post")
Y_train = [0] * 1191 + [1] * 1186
print(len(X_train))
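# The snippet stops before any model is built. A minimal sketch of what the
# imports suggest (an Embedding + SimpleRNN binary classifier); the layer
# sizes and training settings are assumptions, not from the original code.
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=32, input_length=40))
model.add(Dropout(0.2))
model.add(SimpleRNN(units=16))
model.add(Dense(units=1))
model.add(Activation("sigmoid"))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, np.array(Y_train), batch_size=100, epochs=10, validation_split=0.2)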
namelist_question.append('乘客的姓名是?')
namelist_question.append('请问先生怎么称呼?')
namelist_question.append('请问小姐怎么称呼?')
namelist_question.append('请问老人家怎么称呼?')
namelist_question.append('先生您怎么称呼?')
namelist_question.append('小姐您怎么称呼?')
namelist_question.append('先生您叫什么名字?')
namelist_question.append('小姐您叫什么名字?')
namelist_question.append('您的名字?')
namelist_question.append('先生的名字?')
namelist_question.append('小姐的名字?')
namelist_question.append('乘客姓名?')
namelist_question.append('姓名?')
namelist_question.append('名字?')
namelist_question.append('可否请教先生名姓?')
namelist_question.append('小姐芳名可否见告?')
namelist_question.append('麻烦您说一下您的姓名可以吗?')
namelist_question.append('麻烦说下您的名字?谢谢。')
namelist_question.append('请告知姓名,谢谢。')
namelist_question.append('麻烦您告诉我您的名字,非常感谢。')

namelist_question_cut = []
for ans in namelist_question:
    w_sent = ''
    sent = jieba._lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    namelist_question_cut.append(w_sent)
greetinglist_client.append('我想预订宾馆')
greetinglist_client.append('我要预订房间。')
greetinglist_client.append('请帮我预订房间')
greetinglist_client.append('请您帮我预订一下宾馆房间。')
greetinglist_client.append('我需要预订房间。')
greetinglist_client.append('您好,我需要预订宾馆')
greetinglist_client.append('您好,我想预订宾馆')
greetinglist_client.append('您好,我要预订宾馆。')
greetinglist_client.append('您好,请帮我预订房间')
greetinglist_client.append('您好,请您帮我预订一下房间。')
greetinglist_client.append('您好,我需要预订宾馆。')

# greetinglist_server is populated earlier in the original script
greetinglist_server_split = []
for ans in greetinglist_server:
    w_sent = ''
    sent = jieba._lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    greetinglist_server_split.append(w_sent)

greetinglist_client_split = []
for ans in greetinglist_client:
    w_sent = ''
    sent = jieba._lcut(ans)
    for word in sent:
        w_sent += ' '
        w_sent += word
    w_sent += '\n'
    greetinglist_client_split.append(w_sent)
import re
import pickle

import jieba

input_path = '/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/nlpcc-iccpol-2016.dbqa.training-data'
# input_path = '/home/shin/MyGit/Common/MyCommon/NLPCC_dbqa/NLPCC2016QA-Update/evatestdata2-dbqa.testing-data-answers'
f_input = open(input_path, 'r').readlines()
print 'total lines of input is {}'.format(len(f_input))

sent_set = set()
question_set = set()
ans_set = set()
vocab = set()
for line in f_input:
    each = line.split('\t')
    # question_set.add(each[0])
    # ans_set.add(each[1])
    # if each[1] in ans_set:
    #     print each[1]
    sent_set.add(each[0])
    sent_set.add(each[1])
print 'total num of sents:{}'.format(len(sent_set))
# print len(question_set)
# print len(ans_set)

for sent in sent_set:
    words = jieba._lcut(sent)
    for word in words:
        vocab.add(word)
print len(vocab)
# pickle.dump(vocab, open('vocabSet_in_NLPCC_0701', 'w'))