def top_N_except_pop(file):
    """
    Top-N accuracy with popular attractions excluded.
    :param file:
    :return:
    """
    for n in range(1, 221):
        f = open(file, "r", encoding="utf-8")
        right = 0
        total = 0
        for line in f:
            line = line.strip()
            s_lst = line.split('\t')
            answer = AName(s_lst[2])
            predict_lst = s_lst[3]
            predict_lst = predict_lst.replace("'", '"')
            predict_lst = json.loads(predict_lst)
            predict_lst = list(predict_lst.keys())
            # skip popular attractions
            if answer in POPULAR_ATTR.keys():
                continue
            total += 1
            if answer in predict_lst[:n]:
                right += 1
        f.close()
        print("top_%s: %s" % (n, right / total))
def av_position_except_pop(file):
    """
    Evaluate the results with popular attractions excluded:
    compute the average recommended position of the correct attraction.
    :param file:
    :return:
    """
    f = open(file, "r", encoding="utf-8")
    pos_lst = list()
    for line in f:
        line = line.strip()
        s_lst = line.split('\t')
        answer = AName(s_lst[2])
        predict_lst = s_lst[3]
        predict_lst = predict_lst.replace("'", '"')
        predict_lst = json.loads(predict_lst)
        predict_lst = list(predict_lst.keys())
        # skip popular attractions
        if answer in POPULAR_ATTR.keys():
            continue
        try:
            pos = predict_lst.index(answer)
            pos_lst.append(pos / len(predict_lst))
        except ValueError:
            # the correct attraction was not recommended at all
            pos_lst.append(1)
    f.close()
    print(pos_lst)
    av_pos = sum(pos_lst) / len(pos_lst)
    print(av_pos)
def init_matrix():
    """
    Build the adjacency matrix from the training data.
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # dimension of the matrix
    nodes_list = gen_nodes_list()
    n = len(nodes_list)
    # initialize the matrix (np.complex was removed in NumPy 1.24+,
    # so use the builtin complex type)
    matrix = np.mat(np.zeros((n, n), dtype=complex))
    sql = "select * from public.train_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    # fill in the training data
    for row in rows:
        classroute = row[2]
        user_id = VName(row[7])
        user_index = nodes_list.index(user_id)
        for att_id in classroute:
            att = AName(att_id)
            att_index = nodes_list.index(att)
            matrix[user_index, att_index] = complex(0.0, 1.0)
            matrix[att_index, user_index] = complex(0.0, -1.0)
    # fill in the test data (all visits except the one to predict)
    sql = "select * from public.test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        classroute = row[2]
        user_id = VName(row[7])
        user_index = nodes_list.index(user_id)
        for att_id in classroute[:-1]:
            att = AName(att_id)
            att_index = nodes_list.index(att)
            matrix[user_index, att_index] = complex(0.0, 1.0)
            matrix[att_index, user_index] = complex(0.0, -1.0)
    cursor.close()
    conn.close()
    return matrix
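
# Why the imaginary part of a matrix power counts paths: a user→attraction
# edge is stored as +1j and the reverse edge as -1j, so every alternating
# length-3 path user→att→user→att contributes (1j) * (-1j) * (1j) = 1j to
# (matrix ** 3); the imaginary part of the (user, attraction) entry is
# therefore the number of such paths. A minimal self-contained sanity
# check of this property (a sketch, not part of the pipeline):
def _check_complex_path_counting():
    # nodes: user u=0, user v=1, attraction a=2, attraction b=3;
    # u visited a; v visited a and b
    m = np.zeros((4, 4), dtype=complex)
    for user, att in [(0, 2), (1, 2), (1, 3)]:
        m[user, att] = complex(0.0, 1.0)
        m[att, user] = complex(0.0, -1.0)
    m3 = np.linalg.matrix_power(m, 3)
    # exactly one length-3 path u→a→v→b, so the imaginary part is 1
    assert m3[0, 3].imag == 1.0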
def novelty(file):
    """
    Average popularity (degree) of the items in the recommendation lists.
    :param file:
    :return:
    """
    f = open(file, "r", encoding="utf-8")
    # collect all recommendation results
    all_list = list()
    for line in f:
        line = line.strip()
        s_lst = line.split('\t')
        answer = AName(s_lst[2])
        predict_lst = s_lst[3]
        predict_lst = predict_lst.replace("'", '"')
        predict_lst = json.loads(predict_lst)
        predict_lst = list(predict_lst.keys())
        all_list.append(predict_lst)
    f.close()
    a_degree = att_degree()
    # print(a_degree)
    for i in range(1, 51):
        nov = 0
        for j in range(len(all_list)):
            for att in all_list[j][:i]:
                nov += a_degree[att] / i
        nov = nov / len(all_list)
        print(i, nov)
def coverage(file):
    """
    Fraction of all items that the recommender shows to at least one user.
    :param file:
    :return:
    """
    f = open(file, "r", encoding="utf-8")
    # collect all recommendation results
    all_list = list()
    for line in f:
        line = line.strip()
        s_lst = line.split('\t')
        answer = AName(s_lst[2])
        predict_lst = s_lst[3]
        predict_lst = predict_lst.replace("'", '"')
        predict_lst = json.loads(predict_lst)
        predict_lst = list(predict_lst.keys())
        all_list.append(predict_lst)
    f.close()
    for i in range(1, 51):
        all_rec_items = set()
        for j in range(len(all_list)):
            all_rec_items = all_rec_items.union(set(all_list[j][:i]))
        # 221 is the total number of attractions
        print(i, len(all_rec_items) / 221)
def inter_diversity(file):
    """
    Overall (inter-user) diversity of the recommendation lists.
    :param file:
    :return:
    """
    f = open(file, "r", encoding="utf-8")
    # collect all recommendation results
    all_list = list()
    for line in f:
        line = line.strip()
        s_lst = line.split('\t')
        answer = AName(s_lst[2])
        predict_lst = s_lst[3]
        predict_lst = predict_lst.replace("'", '"')
        predict_lst = json.loads(predict_lst)
        predict_lst = list(predict_lst.keys())
        all_list.append(predict_lst)
    f.close()
    for L in range(1, 51):
        diver = 0
        for i in range(len(all_list)):
            for j in range(len(all_list)):
                if i == j:
                    continue
                diver += 1 - len(
                    set(all_list[i][:L]).intersection(
                        set(all_list[j][:L]))) / L
        diver = diver / (len(all_list) * (len(all_list) - 1))
        print(L, diver)
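
# What inter_diversity computes, written out: for recommendation lists R_i
# truncated to length L, the score is
#     D(L) = 1 / (N * (N - 1)) * sum_{i != j} (1 - |R_i ∩ R_j| / L),
# i.e. the average pairwise overlap distance. A toy check of a single pair
# (hypothetical lists, not real output):
def _check_inter_diversity_pair():
    r1 = ["a", "b", "c", "d"]
    r2 = ["a", "c", "e", "f"]
    L = 4
    overlap = len(set(r1[:L]).intersection(set(r2[:L])))  # {"a", "c"} -> 2
    assert 1 - overlap / L == 0.5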
def test():
    """
    Run prediction on the test set.
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # build the user/item lookup structures
    user_dict, item_user = form_data()
    print("initialization finished")
    print(time.time())
    sql = "select * from public.test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    f = open("recommend_20.txt", "w", encoding="utf-8")
    i = 0
    for row in rows:
        i += 1
        if i % 100 == 0:
            print("done: " + str(i))
        user = row[7]
        classroute = row[2]
        rec_dict = recommend_by_user(user_dict, item_user, user, 20)
        f.write("%s\t%s\t%s\t%s\n" % (user, classroute[:-1],
                                      AName(classroute[-1]),
                                      json.dumps(rec_dict)))
    f.close()
    cursor.close()
    conn.close()
def recommend_by_user(user_dict, item_user, user, k=10):
    """
    Generate recommendations for one user.
    :param user_dict:
    :param item_user:
    :param user:
    :param k: number of nearest neighbors to use
    :return:
    """
    neighbor_dict = gen_near_neighbors(user_dict, item_user, user)
    # keep the k most similar neighbors
    neighbor_dict = dict(list(neighbor_dict.items())[:k])
    recommend_dict = dict()
    route = list(user_dict[user].keys())
    # candidate items: everything the user has not visited yet
    left_nodes = set(item_user.keys()) - set(route)
    for item in left_nodes:
        up = 0
        down = 0
        for neighbor in neighbor_dict.keys():
            if item in user_dict[neighbor].keys():
                up += neighbor_dict[neighbor] * user_dict[neighbor][item]
                down += neighbor_dict[neighbor]
        if up != 0:
            # similarity-weighted average of the neighbors' scores
            recommend_dict[AName(item)] = up / down
    recommend_dict = dict(
        sorted(recommend_dict.items(), key=lambda x: x[1], reverse=True))
    return recommend_dict
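
# gen_near_neighbors is defined elsewhere in the project; this sketch only
# illustrates the contract recommend_by_user relies on: a dict mapping each
# other user to a similarity score, sorted in descending order. Cosine
# similarity is an assumption here, not necessarily the project's metric.
def gen_near_neighbors_sketch(user_dict, item_user, user):
    from math import sqrt
    target = user_dict[user]
    sims = dict()
    for other, items in user_dict.items():
        if other == user:
            continue
        common = set(target) & set(items)
        if not common:
            continue
        dot = sum(target[i] * items[i] for i in common)
        norm = sqrt(sum(v * v for v in target.values())) * \
            sqrt(sum(v * v for v in items.values()))
        sims[other] = dot / norm
    return dict(sorted(sims.items(), key=lambda x: x[1], reverse=True))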
def gen_nodes_list():
    """
    Build the list of all nodes, visitors as well as attractions.
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    nodes_list = list()
    # visitor nodes first
    sql = "select * from public.route_0320"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        user_id = VName(row[7])
        nodes_list.append(user_id)
    # then attraction nodes
    sql = "select * from public.node_1023"
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        att_id = AName(row[0])
        nodes_list.append(att_id)
    cursor.close()
    conn.close()
    return nodes_list
def get_dis_feature(node_loc_dict, classroute, att_id):
    """
    Distance features between a candidate attraction and the user's
    visiting history: [min_d, mean_d, max_d, last_d].
    """
    # no visiting history
    if len(classroute) == 0:
        return [0, 0, 0, 0]
    # distance from the most recently visited attraction
    last_loc = node_loc_dict[AName(classroute[-1])]
    att_loc = node_loc_dict[att_id]
    last_d = haversine(last_loc[0], last_loc[1], att_loc[0], att_loc[1])
    min_d = last_d
    max_d = last_d
    total_d = 0
    # minimum, mean and maximum distance over the whole history
    for node in classroute:
        temp_loc = node_loc_dict[AName(node)]
        dis = haversine(temp_loc[0], temp_loc[1], att_loc[0], att_loc[1])
        total_d += dis
        if dis < min_d:
            min_d = dis
        if dis > max_d:
            max_d = dis
    mean_d = total_d / len(classroute)
    return [min_d, mean_d, max_d, last_d]
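
# haversine is imported from elsewhere; a minimal reference implementation
# of the great-circle distance it is assumed to compute, matching the call
# order used above (lon1, lat1, lon2, lat2, all in degrees, result in km):
def haversine_sketch(lon1, lat1, lon2, lat2):
    from math import radians, sin, cos, asin, sqrt
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))
    a = sin((lat2 - lat1) / 2) ** 2 + \
        cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    return 2 * asin(sqrt(a)) * 6371  # mean Earth radius in kilometers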
def top_N(file):
    """
    Top-N accuracy over all test cases.
    :param file:
    :return:
    """
    for n in range(1, 221):
        f = open(file, "r", encoding="utf-8")
        right = 0
        total = 0
        for line in f:
            total += 1
            line = line.strip()
            s_lst = line.split('\t')
            answer = AName(s_lst[2])
            predict_lst = s_lst[3]
            predict_lst = predict_lst.replace("'", '"')
            predict_lst = json.loads(predict_lst)
            predict_lst = list(predict_lst.keys())
            if answer in predict_lst[:n]:
                right += 1
        f.close()
        print("top_%s: %s" % (n, right / total))
def cal_precision(result, n):
    """
    Precision of the top-n recommendations.
    :param result:
    :param n:
    :return:
    """
    total = 0
    right = 0
    for user_id in result.keys():
        total += 1
        answer = result[user_id]["answer"]
        recommend = result[user_id]["recommend"]
        if AName(answer) in list(recommend.keys())[:n]:
            right += 1
    return right / total
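
# The computation above, spelled out on hypothetical data: with two users
# and n = 1, only u1's true attraction is ranked first, so precision is 0.5.
def _check_precision_at_n():
    recs = {"u1": ["a1", "a2"], "u2": ["a2", "a3"]}
    answers = {"u1": "a1", "u2": "a3"}
    n = 1
    right = sum(1 for u in recs if answers[u] in recs[u][:n])
    assert right / len(recs) == 0.5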
def predict(extract_fun):
    """
    Predict on the test set with the trained model.
    :param extract_fun:
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # restore the model
    clf = joblib.load("model_" + extract_fun.__name__ + ".pkl")
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    sql = "select * from public.ml_test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    f = open("predict_" + extract_fun.__name__ + ".txt", "w",
             encoding="utf-8")
    for row in rows:
        user_id = VName(row[0])
        att_id = AName(row[1])
        is_link = row[4]
        if extract_fun.__name__ == "extract_direct":
            feature = extract_direct(graph, user_id, att_id)
        elif extract_fun.__name__ == "extract_indirect":
            feature = extract_indirect(graph, prjv_graph, prja_graph,
                                       user_id, att_id)
        result = clf.predict([feature])[0]
        f.write("%s\t%s\t%s\t%s\n" % (user_id, att_id, is_link, result))
    f.close()
    cursor.close()
    conn.close()
def get_node_loc_dict():
    """
    Map each attraction id to its (longitude, latitude).
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    sql = "select * from public.node_1023"
    cursor.execute(sql)
    rows = cursor.fetchall()
    node_loc_dict = dict()
    for row in rows:
        att_id = AName(row[0])
        lon = row[2]
        lat = row[3]
        node_loc_dict[att_id] = (lon, lat)
    cursor.close()
    conn.close()
    return node_loc_dict
def cal_train_distance(save_dis_file="train_distance.csv"):
    """
    Compute and save the distance features of the training set.
    :param save_dis_file: output file for the training distance features
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # attraction coordinates
    node_loc_dict = get_node_loc_dict()
    sql = "select * from public.ml_train_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    distance = []
    title = ["min_d", "mean_d", "max_d", "last_d"]
    for row in rows:
        user_id = VName(row[0])
        att_id = AName(row[1])
        sql = "select * from public.route_0320 where id={user_id}".format(
            user_id=row[0])
        cursor.execute(sql)
        item = cursor.fetchone()
        # drop the last attraction: it is the one to predict
        classroute = item[2][:-1]
        dis_feature = get_dis_feature(node_loc_dict, classroute, att_id)
        distance.append(dis_feature)
    # write to a csv file
    df = pd.DataFrame(distance, columns=title)
    df.to_csv(save_dis_file, encoding="utf-8")
    cursor.close()
    conn.close()
def recommend(test_file, model_path, save_file):
    """
    Predict and recommend with the model generated above.
    :param test_file:
    :param model_path:
    :param save_file:
    :return:
    """
    # load the model
    model = xgb.Booster()
    model.load_model(model_path)
    # build the test data
    test_data = pd.read_csv(test_file)
    anode_list = list(test_data["anode"])
    test_set = test_data.drop("anode", axis=1)
    # wrap it in an xgb matrix
    xgb_test = xgb.DMatrix(test_set)
    # predict
    preds = model.predict(xgb_test)
    conn = get_conn()
    cursor = conn.cursor()
    sql = "select * from public.ml_test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    # all attraction nodes
    a_nodes = list(get_node_id_dict().keys())
    index = 0
    f = open(save_file, "w", encoding="utf-8")
    for row in rows:
        user_id = VName(row[0])
        att_id = AName(row[1])
        is_link = row[4]
        # skip the negative test cases
        if not is_link:
            continue
        sql = "select classroute from public.route_0320 where id={user_id}".format(
            user_id=row[0])
        cursor.execute(sql)
        result = cursor.fetchone()
        classroute = result[0]
        # candidate set: attractions not yet visited
        left_set = set(a_nodes) - set(classroute[0:-1])
        n = len(left_set)
        recommend_list = dict()
        for i in range(n):
            recommend_list[anode_list[index + i]] = preds[index + i]
        index += n
        recommend_list = dict(
            sorted(recommend_list.items(), key=lambda x: x[1], reverse=True))
        f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                      json.dumps(recommend_list,
                                                 cls=MyEncoder)))
    f.close()
    cursor.close()
    conn.close()
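
# Hypothetical invocation: write_test_feature(extract_direct) below produces
# "extract_direct_test.csv"; the model path is a placeholder for whatever
# file the xgboost training step saved.
# recommend("extract_direct_test.csv", "xgb_direct.model",
#           "recommend_xgb_direct.txt")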
def write_test_feature(func, has_sd=0):
    """
    Compute and save the test-set features.
    :param func:
    :param has_sd: whether the indirect features include shortest distance
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    if has_sd:
        file_name = func.__name__ + "_has_sd_test.csv"
    else:
        file_name = func.__name__ + "_test.csv"
    if func.__name__ == "extract_direct":
        title = ["anode", "snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    elif func.__name__ == "extract_indirect":
        if has_sd:
            title = [
                "anode", "snv", "sna", "cn", "jc", "aa", "pa", "sd",
                "prj_cnv", "prj_cna", "prj_jcv", "prj_jca", "prj_aav",
                "prj_aaa", "prj_pav", "prj_paa", "prj_sdv", "prj_sda"
            ]
        else:
            title = [
                "anode", "snv", "sna", "cn", "jc", "aa", "pa", "sd",
                "prj_cnv", "prj_cna", "prj_jcv", "prj_jca", "prj_aav",
                "prj_aaa", "prj_pav", "prj_paa"
            ]
    else:
        print("wrong function")
        return
    # read the data
    sql = "select * from public.ml_test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    # all attractions
    a_nodes = list(get_node_id_dict().keys())
    test_f = []
    i = 0
    for row in rows:
        print(i)
        i += 1
        user_id = VName(row[0])
        att_id = AName(row[1])
        is_link = row[4]
        if not is_link:
            continue
        sql = "select classroute from public.route_0320 where id={user_id}".format(
            user_id=row[0])
        cursor.execute(sql)
        result = cursor.fetchone()
        classroute = result[0]
        # candidate set
        left_set = set(a_nodes) - set(classroute[0:-1])
        for anode in left_set:
            anode = AName(anode)
            if func.__name__ == "extract_direct":
                feature = func(graph, user_id, anode)
            elif func.__name__ == "extract_indirect":
                feature = func(graph, prjv_graph, prja_graph, user_id,
                               anode, has_sd)
            else:
                print("wrong function")
                continue
            line = [anode]
            line.extend(feature)
            test_f.append(line)
    # write to a csv file
    df = pd.DataFrame(test_f, columns=title)
    df.to_csv(file_name, encoding="utf-8")
    print("test features saved")
def write_train_feature(func, have_sd=0):
    """
    Compute the training-set features and write them to a file.
    :param func:
    :param have_sd: whether the indirect features include shortest distance
    :return:
    """
    if have_sd:
        file_name = func.__name__ + "_has_sd_train.csv"
    else:
        file_name = func.__name__ + "_train.csv"
    # read the data
    conn = get_conn()
    cursor = conn.cursor()
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    sql = "select * from public.ml_train_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    print("len_rows:" + str(len(rows)))
    if func.__name__ == "extract_direct":
        title = ["label", "snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    elif func.__name__ == "extract_indirect":
        if have_sd:
            title = [
                "label", "snv", "sna", "cn", "jc", "aa", "pa", "sd",
                "prj_cnv", "prj_cna", "prj_jcv", "prj_jca", "prj_aav",
                "prj_aaa", "prj_pav", "prj_paa", "prj_sdv", "prj_sda"
            ]
        else:
            title = [
                "label", "snv", "sna", "cn", "jc", "aa", "pa", "sd",
                "prj_cnv", "prj_cna", "prj_jcv", "prj_jca", "prj_aav",
                "prj_aaa", "prj_pav", "prj_paa"
            ]
    else:
        print("wrong function")
        return
    i = 0
    train_f = []
    for row in rows:
        i += 1
        print(i)
        user_id = VName(row[0])
        att_id = AName(row[1])
        if func.__name__ == "extract_direct":
            feature = func(graph, user_id, att_id)
        elif func.__name__ == "extract_indirect":
            feature = func(graph, prjv_graph, prja_graph, user_id, att_id,
                           have_sd)
        else:
            print("wrong function")
            continue
        # label: 1 for a positive (linked) sample, 0 for a negative one
        line = [1] if row[4] else [0]
        line.extend(feature)
        train_f.append(line)
    # write to a csv file
    df = pd.DataFrame(train_f, columns=title)
    df.to_csv(file_name, encoding="utf-8")
    print("training features saved")
def train(extract_fun):
    """
    Train the model.
    :param extract_fun:
    :return:
    """
    # read the data
    conn = get_conn()
    cursor = conn.cursor()
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    sql = "select * from public.ml_train_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    print(len(rows))
    # training data
    X_list = list()
    Y_list = list()
    i = 0
    for row in rows:
        user_id = VName(row[0])
        att_id = AName(row[1])
        i += 1
        print(i)
        if extract_fun.__name__ == "extract_direct":
            feature = extract_direct(graph, user_id, att_id)
        elif extract_fun.__name__ == "extract_indirect":
            feature = extract_indirect(graph, prjv_graph, prja_graph,
                                       user_id, att_id)
        else:
            print("wrong function")
            break
        X_list.append(feature)
        if row[4]:
            Y_list.append(1)
        else:
            Y_list.append(-1)
    print("training data generated")
    print(time.time())
    cursor.close()
    conn.close()
    # save X_list and Y_list
    f = open("param_" + extract_fun.__name__ + ".txt", "w", encoding="utf-8")
    f.write(json.dumps(X_list) + "\n")
    f.write(json.dumps(Y_list) + "\n")
    f.close()
    print("training data saved")
    print(time.time())
    clf = svm.SVC(kernel="linear")
    clf.fit(X_list, Y_list)
    print("training finished")
    print(time.time())
    joblib.dump(clf, "model_" + extract_fun.__name__ + ".pkl")
    print("model saved")
    print(time.time())
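
# recommend_test (below) consumes parameters found with GridSearchCV; a
# minimal sketch of that search over the training data saved by train().
# The grid values are illustrative assumptions, not the values actually used.
def grid_search_params(extract_fun):
    from sklearn.model_selection import GridSearchCV
    f = open("param_" + extract_fun.__name__ + ".txt", "r", encoding="utf-8")
    x_list = json.loads(f.readline())
    y_list = json.loads(f.readline())
    f.close()
    grid = {"kernel": ["linear", "rbf"],
            "C": [0.1, 1, 10],
            "gamma": ["scale", 0.01, 0.1]}
    search = GridSearchCV(svm.SVC(), grid, cv=5)
    search.fit(x_list, y_list)
    return search.best_params_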
def recommend_test(extract_fun, tuned_params):
    """
    Check the tuning result using the parameters found with GridSearchCV.
    :param extract_fun:
    :param tuned_params:
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # read the data
    sql = "select * from public.ml_test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    # build the model
    clf = svm.SVC(kernel=tuned_params["kernel"],
                  C=tuned_params["C"],
                  gamma=tuned_params["gamma"])
    f = open("param_" + extract_fun.__name__ + ".txt", "r", encoding="utf-8")
    x_list = json.loads(f.readline())
    y_list = json.loads(f.readline())
    f.close()
    clf.fit(x_list, y_list)
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    # all attractions
    a_nodes = list(get_node_id_dict().keys())
    # record the results
    f = open("recommend_" + extract_fun.__name__ + "_" +
             tuned_params["kernel"] + "_C" + str(tuned_params["C"]) +
             "_gamma" + str(tuned_params["gamma"]) + ".txt",
             "w", encoding="utf-8")
    i = 0
    for row in rows:
        i += 1
        print(i)
        user_id = VName(row[0])
        att_id = AName(row[1])
        is_link = row[4]
        if not is_link:
            continue
        sql = "select classroute from public.route_0320 where id={user_id}".format(
            user_id=row[0])
        cursor.execute(sql)
        result = cursor.fetchone()
        classroute = result[0]
        # candidate set
        left_set = set(a_nodes) - set(classroute[0:-1])
        recommendation = dict()
        for anode in left_set:
            anode = AName(anode)
            if extract_fun.__name__ == "extract_direct":
                feature = extract_direct(graph, user_id, anode)
            elif extract_fun.__name__ == "extract_indirect":
                feature = extract_indirect(graph, prjv_graph, prja_graph,
                                           user_id, anode)
            result = clf.predict([feature])[0]
            # distance to the separating hyperplane as the ranking score
            dis = abs(clf.decision_function([feature]))
            if result == 1:
                recommendation[anode] = dis[0]
        recommendation = dict(
            sorted(recommendation.items(), key=lambda x: x[1], reverse=True))
        f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                      json.dumps(recommendation)))
    f.close()
    cursor.close()
    conn.close()
def recommend_list(extract_fun):
    """
    Recommend with the previously trained model.
    :param extract_fun:
    :return:
    """
    conn = get_conn()
    cursor = conn.cursor()
    # read the data
    sql = "select * from public.ml_test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    # load the model
    clf = joblib.load("model_" + extract_fun.__name__ + ".pkl")
    # bipartite network
    graph = init_graph()
    print("bipartite network built")
    print(time.time())
    # project onto the two node sets
    v_nodes = get_v_nodes()
    a_nodes = get_a_nodes()
    prjv_graph = project(graph, v_nodes)
    prja_graph = project(graph, a_nodes)
    print("projection finished")
    print(time.time())
    # all attractions
    a_nodes = list(get_node_id_dict().keys())
    # record the results
    f = open("recommend_" + extract_fun.__name__ + ".txt", "w",
             encoding="utf-8")
    for row in rows:
        user_id = VName(row[0])
        att_id = AName(row[1])
        is_link = row[4]
        if not is_link:
            continue
        sql = "select classroute from public.route_0320 where id={user_id}".format(
            user_id=row[0])
        cursor.execute(sql)
        result = cursor.fetchone()
        classroute = result[0]
        # candidate set
        left_set = set(a_nodes) - set(classroute[0:-1])
        recommendation = dict()
        for anode in left_set:
            anode = AName(anode)
            if extract_fun.__name__ == "extract_direct":
                feature = extract_direct(graph, user_id, anode)
            elif extract_fun.__name__ == "extract_indirect":
                feature = extract_indirect(graph, prjv_graph, prja_graph,
                                           user_id, anode)
            result = clf.predict([feature])[0]
            dis = abs(clf.decision_function([feature]))
            if result == 1:
                recommendation[anode] = dis[0]
        recommendation = dict(
            sorted(recommendation.items(), key=lambda x: x[1], reverse=True))
        f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1], att_id,
                                      json.dumps(recommendation)))
    f.close()
    cursor.close()
    conn.close()
def recommend_list(route_length=3, coeffs=None):
    """
    Recommend by counting alternating paths in the complex adjacency matrix.
    :param route_length: maximum path length used (3, 5 or 7)
    :param coeffs: optional damping coefficients for the longer paths
    :return:
    """
    nodes_list = gen_nodes_list()
    matrix = init_matrix()
    conn = get_conn()
    cursor = conn.cursor()
    if route_length == 3:
        final_matrix = matrix ** 3
    elif route_length == 5:
        if coeffs:
            final_matrix = (matrix ** 3) + (matrix ** 5) / coeffs[0]
        else:
            final_matrix = (matrix ** 3) + (matrix ** 5) / 20
    elif route_length == 7:
        if coeffs:
            final_matrix = (matrix ** 3) + (matrix ** 5) / coeffs[0] + \
                (matrix ** 7) / coeffs[1]
        else:
            final_matrix = (matrix ** 3) + (matrix ** 5) / 120 + \
                (matrix ** 7) / 5040
    else:
        print("no formula for this route length yet")
        return
    sql = "select * from public.test_set"
    cursor.execute(sql)
    rows = cursor.fetchall()
    file_name = "cn_route_" + str(route_length) + ".txt"
    f = open(file_name, "w", encoding="utf-8")
    # all attractions
    a_nodes = list(get_node_id_dict().keys())
    # recommendation results of all test cases
    record_result = dict()
    for row in rows:
        classroute = row[2]
        user_id = VName(row[7])
        user_index = nodes_list.index(user_id)
        # remaining candidate attractions
        left_nodes = list(set(a_nodes) - set(classroute[:-1]))
        result_dict = dict()
        for node in left_nodes:
            node = AName(node)
            att_index = nodes_list.index(node)
            imag_coeff = final_matrix[user_index, att_index].imag
            if imag_coeff == 0.0:
                continue
            result_dict[node] = imag_coeff
        # sort the results
        result_dict = dict(
            sorted(result_dict.items(), key=lambda x: x[1], reverse=True))
        # record the recommendation
        f.write("%s\t%s\t%s\t%s\n" % (user_id, classroute[:-1],
                                      AName(classroute[-1]),
                                      json.dumps(result_dict)))
        record_result[user_id] = {
            "user_id": user_id,
            "classroute": classroute[:-1],
            "answer": AName(classroute[-1]),
            "recommend": result_dict,
        }
    f.close()
    cursor.close()
    conn.close()
    return record_result
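
# Example end-to-end evaluation with the functions above (the coeffs value
# is illustrative, not a tuned setting):
# record_result = recommend_list(route_length=5, coeffs=[20])
# for n in (1, 5, 10):
#     print(n, cal_precision(record_result, n))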