def test_oov_emb(self):
    """Test OOV word embedding.

    Encodes a lone comma and asserts the result equals ``0.0``, then encodes
    a compound word and its two halves and asserts that the string form of
    the compound's embedding equals the string form of the element-wise
    mean of the two halves.
    """
    w = ','
    comma_res = text2vec.encode(w)
    print(w, comma_res)
    self.assertEqual(comma_res, 0.0)

    w = '特价机票'
    r = text2vec.encode(w)
    print(w, r)

    w = '特价'
    r1 = text2vec.encode(w)
    print(w, r1)

    w = '机票'
    r2 = text2vec.encode(w)
    print(w, r2)

    r_average = np.array([r1, r2]).sum(axis=0) / 2.0
    print('r_average:', r_average)

    # Compare the printed representations (as the original test did), but
    # compute them once and use assertEqual so a failure shows both values
    # instead of just "False is not true".
    actual_repr = str(r)
    expected_repr = str(r_average)
    if actual_repr == expected_repr:
        print('same')
    self.assertEqual(actual_repr, expected_repr)
def test_sentence_emb():
    """Print embeddings for several strings and compare a phrase to its parts.

    Each sample string is encoded and printed. Afterwards the element-wise
    mean of the embeddings of '你好' and '吗' is printed and compared, via
    string representation, with the embedding of '你好吗'; the function
    prints "same" or 'diff' accordingly. Nothing is returned or asserted.
    """
    for sample in ('你', '好', '吗', '你好', '你好吗'):
        vector = text2vec.encode(sample)
        print(sample, vector)

    import numpy as np

    part_vectors = [text2vec.encode('你好'), text2vec.encode('吗')]
    mean_vector = np.array(part_vectors).sum(axis=0) / 2.0
    print('average:', mean_vector)

    phrase_vector = text2vec.encode('你好吗')
    verdict = "same" if str(phrase_vector) == str(mean_vector) else 'diff'
    print(verdict)
def test_oov_emb():
    """Encode a comma, a compound word and its two halves, printing each.

    Purely a smoke/inspection routine: it prints every input next to the
    value returned by ``text2vec.encode`` and asserts nothing.
    """
    samples = (',', '特价机票', '特价', '机票')
    for sample in samples:
        encoded = text2vec.encode(sample)
        print(sample, encoded)
def test_encode_text(self):
    """Test the encode result for a full sentence.

    Asserts the embedding has shape ``(200,)`` and that its first three
    components, formatted to three decimals, match the recorded values.
    """
    a = '如何更换花呗绑定银行卡'
    emb = text2vec.encode(a)
    print(a, emb)
    self.assertEqual(emb.shape, (200, ))

    # assertEqual (rather than assertTrue on an == expression) reports the
    # actual and expected strings when the check fails.
    head = ' '.join(["{:.3f}".format(i) for i in emb[:3]])
    self.assertEqual(head, "0.041 -0.126 0.019")
def test_encode_word(self):
    """Test the encode result for a single word.

    Asserts the embedding has shape ``(200,)`` and that its first three
    components, formatted to three decimals, match the recorded values.
    """
    word = '银行卡'
    emb = text2vec.encode(word)
    print(word, emb)
    self.assertEqual(emb.shape, (200, ))

    # assertEqual (rather than assertTrue on an == expression) reports the
    # actual and expected strings when the check fails.
    head = ' '.join(["{:.3f}".format(i) for i in emb[:3]])
    self.assertEqual(head, "0.002 -0.126 0.053")
def test_encode_char(self):
    """Test the encode result for a single character.

    Asserts the result's type is exactly ``np.ndarray``, that its shape is
    ``(200,)``, and that its first three components, formatted to three
    decimals, match the recorded values.
    """
    char = '卡'
    emb = text2vec.encode(char)

    t = type(emb)
    print(t)
    # assertIs keeps the original exact-type check (no subclasses accepted)
    # while naming both types in the failure message.
    self.assertIs(t, np.ndarray)

    print(char, emb, emb.shape)
    self.assertEqual(emb.shape, (200, ))

    # Format the leading components once; print and assert on the same text.
    head = ' '.join(["{:.3f}".format(i) for i in emb[:3]])
    print(head)
    self.assertEqual(head, "0.068 -0.110 -0.048")
@author:XuMing([email protected]) @description: """ import sys import numpy as np sys.path.append('..') import text2vec text2vec.set_log_level('INFO') if __name__ == '__main__': char = '卡' emb = text2vec.encode(char) print(type(emb), emb.shape) print(char, emb) word = '银行卡' print(word, text2vec.encode(word)) a = '如何更换花呗绑定银行卡' emb = text2vec.encode(a) print(a, emb) b = [ '卡', '银行卡', '如何更换花呗绑定银行卡', '如何更换花呗绑定银行卡,如何更换花呗绑定银行卡。如何更换花呗绑定银行卡?。。。这个,如何更换花呗绑定银行卡!' ] res = []
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo script: print the embedding of one sentence, then the
similarity score of each pair among three sentences.
"""
import text2vec

a = '如何更换花呗绑定银行卡'
b = '花呗更改绑定银行卡'
c = '我什么时候开通了花呗'

# Embedding of the first sentence.
emb = text2vec.encode(a)
print(emb)

# Pairwise scores, in the order (a, b), (a, c), (b, c).
for first, second in ((a, b), (a, c), (b, c)):
    s = text2vec.score(first, second)
    print(first, second, s)
def similarProj(request):
    """Return similar fundraising projects and statistics for a user's draft.

    On POST, the view:

    1. loads the stored embedding table and project table from two pickle
       files at fixed paths,
    2. takes the values of ``request.data[0]`` positionally (index 0 is used
       as the project title, 5 as the text to embed, 2 as the funding target,
       3 and 4 as the "userTime" / "userFeedback" values -- the key names
       behind these positions are not visible here; verify against the
       client payload),
    3. appends the embedding of the user's text to the table, computes the
       cosine-similarity matrix and asks ``getSimiliarArticle`` for the
       titles most similar to the last row,
    4. fills ``fundraisings`` (top 3), ``rates`` (top 10), ``points``
       (medians over the top 15 next to the user's own values) and ``chart``
       (projects with a funding target close to the user's).

    Args:
        request: the incoming request; ``request.method`` and
            ``request.data`` are read.

    Returns:
        A ``JsonResponse`` with the keys ``fundraisings``, ``rates``,
        ``points`` and ``chart`` on success; a 400 ``Response`` carrying the
        first argument of any ``ValueError`` raised while processing; a 405
        ``Response`` for non-POST requests.
    """
    if request.method != "POST":
        # Previously this path fell off the end of the function and returned
        # None, which is not a valid view response.
        return Response("Only POST is supported",
                        status.HTTP_405_METHOD_NOT_ALLOWED)

    final_result = {"fundraisings": [], "rates": [], "points": [], "chart": []}
    try:
        # Load the precomputed data.
        text2vec_df = pd.read_pickle(
            "/Users/hw_students/proj02/proj02_data/text2vec_df.pkl")
        df = pd.read_pickle("/Users/hw_students/proj02/proj02_data/df.pkl")

        # Parse the input: flatten the first payload object into an array so
        # its fields can be addressed by position.
        userdata = request.data
        print("userdata:")
        print(userdata)
        unit = np.array(list(userdata[0].values()))

        # Add the user's text as the last row of the embedding table.
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        user_row = pd.DataFrame(text2vec.encode(unit[5])).T
        text2vec_df = pd.concat([text2vec_df, user_row])

        # Add the user's project title as the last title, matching that row.
        print(unit[0])
        titles = df["title"].tolist() + [unit[0]]
        cs = cosine_similarity(text2vec_df)

        # Similar projects. Index -1 selects the user's row; per the original
        # note the helper returns, as a list, all campaigns scoring > 0.9.
        RETURN_NUMBER = 3
        similar_project = getSimiliarArticle(-1, cs, titles)
        new = _select_by_title(df, similar_project[:RETURN_NUMBER])
        final_result["fundraisings"] = campaignlist(new)

        # Funding progress of the ten most similar projects.
        NUMBER = 10
        df_10 = _select_by_title(df, similar_project[:NUMBER])
        final_result["rates"] = _funding_rate_rows(df_10)

        # Where the user's values fall relative to the fifteen most similar.
        CALCULATE_NUMBER = 15
        df_15 = _select_by_title(df, similar_project[:CALCULATE_NUMBER])
        final_result["points"] = {
            "averageTarget": funding_target_med(df_15),
            "userTarget": unit[2],
            "averageTime": days_med(df_15),
            "userTime": unit[3],
            "averageFeedback": cam_count_med(df_15),
            "userFeedback": unit[4],
        }

        # Chart: projects whose funding target is closest to the amount the
        # user entered. int() raises ValueError on a non-numeric amount,
        # which the handler below turns into a 400.
        money = int(unit[2])
        df3 = funding_target_similar(money, df)
        # Split out the reward plans of those projects.
        list3 = campaignlist_origin(df3)
        # Distribution over similar amounts.
        final_result["chart"] = funding_table(list3, df3)

        print(final_result)
        return JsonResponse(final_result, safe=False)
    except ValueError as e:
        return Response(e.args[0], status.HTTP_400_BAD_REQUEST)


def _select_by_title(frame, wanted_titles):
    """Return the rows of ``frame`` whose ``title`` is in ``wanted_titles``."""
    return frame[frame["title"].apply(lambda title: title in wanted_titles)]


def _funding_rate_rows(projects):
    """Build one JSON-serialisable funding-progress dict per row of ``projects``."""
    columns = ("id", "title", "url", "funding_target", "now_funding", "status")
    rows = []
    # Iterate the underlying column arrays in lockstep instead of slicing
    # the frame once per field per row.
    for pid, title, url, target, raised, state in zip(
            *(projects[name].values for name in columns)):
        entry = {"id": int(pid), "title": str(title), "url": str(url)}
        funding_target = int(target)
        now_funding = int(raised)
        entry["amountRaised"] = now_funding
        entry["amountReached"] = funding_target
        # NOTE(review): a funding_target of 0 raises ZeroDivisionError here,
        # which the caller does not catch -- confirm the data excludes it.
        entry["proportion"] = round(now_funding / funding_target * 100, 2)
        entry["status"] = str(state)
        rows.append(entry)
    return rows