def init_set():
    """Populate ``public.md_train_set`` and ``public.md_test_set`` from ``public.route``.

    ``int(0.9 * count)`` rows are drawn at random from the route table.  For a
    row that ``id_in_set`` reports as belonging to that sample, every entry of
    its route is inserted into ``public.md_train_set``.  For every other row,
    all entries except the last go to ``public.md_train_set`` and the last
    entry is inserted into ``public.md_test_set``.

    Row layout used here: ``row[2]`` is the route (attraction ids), ``row[4]``
    the matching visit times and ``row[7]`` the user id.

    :return: None
    :raises IndexError: if a row outside the sample has an empty route.
    """
    # The SQL text is loop-invariant, so it is built once up front.
    train_sql = (
        " insert into public.md_train_set(user_id, attraction, attractionstr, visittime)"
        " values(%s, %s, %s, %s); "
    )
    test_sql = (
        " insert into public.md_test_set(user_id, attraction, attractionstr, visittime)"
        " values(%s, %s, %s, %s); "
    )

    conn = get_conn()
    cursor = conn.cursor()
    try:
        # Maps an attraction id to the string stored in the attractionstr column.
        node_dict = get_node_id_dict()

        cursor.execute(r"select count(*) from public.route")
        number = cursor.fetchone()[0]
        print(number)

        cursor.execute(r"select * from public.route order by id")
        rows = cursor.fetchall()

        train_number = int(0.9 * number)
        train_set = random.sample(rows, train_number)

        for row in rows:
            classroute = row[2]
            routetime = row[4]
            user_id = row[7]

            held_out = not id_in_set(train_set, user_id)
            # Held-out routes keep their last visit for the test table.
            train_part = classroute[:-1] if held_out else classroute
            for pos, attraction in enumerate(train_part):
                cursor.execute(
                    train_sql,
                    (user_id, attraction, node_dict[attraction], routetime[pos]),
                )
            if held_out:
                last = classroute[-1]
                cursor.execute(
                    test_sql,
                    (user_id, last, node_dict[last], routetime[-1]),
                )

        conn.commit()
    finally:
        # Release the database resources even when an insert fails.
        cursor.close()
        conn.close()
def recommend_test(extract_fun, tuned_params):
    """Check tuned SVM parameters by writing recommendations for the test set.

    An ``svm.SVC`` is built from ``tuned_params`` and fitted on the two JSON
    lines (features, then labels) stored in ``param_<extractor name>.txt``.
    For every row of ``public.ml_test_set`` whose ``row[4]`` flag is truthy,
    each attraction not already in the user's route (last entry excluded) is
    classified; attractions predicted as ``1`` are kept with the absolute
    value of the decision function and written, sorted in descending order,
    as one tab-separated line of the result file.

    :param extract_fun: feature extractor; only its ``__name__`` is used, to
        pick the parameter file, the output file name and which of
        ``extract_direct`` / ``extract_indirect`` is called.
    :param tuned_params: mapping providing ``"kernel"``, ``"C"`` and ``"gamma"``.
    :return: None
    :raises ValueError: if the extractor name is not one of the two supported
        names (the original code failed later with an unbound ``feature``).
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        raise ValueError("unsupported feature extractor: %s" % fun_name)
    use_direct = fun_name == "extract_direct"

    conn = get_conn()
    cursor = conn.cursor()
    try:
        # Load the test rows.
        cursor.execute("select * from public.ml_test_set")
        rows = cursor.fetchall()

        # Build and fit the model from the stored training features.
        clf = svm.SVC(kernel=tuned_params["kernel"],
                      C=tuned_params["C"],
                      gamma=tuned_params["gamma"])
        with open("param_" + fun_name + ".txt", "r", encoding="utf-8") as param_file:
            x_list = json.loads(param_file.readline())
            y_list = json.loads(param_file.readline())
        clf.fit(x_list, y_list)

        # Bipartite network.
        graph = init_graph()
        print("构建二分网络完成")
        print(time.time())

        # Projections of the bipartite network.
        v_nodes = get_v_nodes()
        a_nodes = get_a_nodes()
        prjv_graph = project(graph, v_nodes)
        prja_graph = project(graph, a_nodes)
        print("投影完成")
        print(time.time())

        # All attraction ids; the set is loop-invariant, so build it once.
        all_attractions = set(get_node_id_dict().keys())

        out_name = ("recommend_" + fun_name + "_" + tuned_params["kernel"]
                    + "_C" + str(tuned_params["C"])
                    + "_gamma" + str(tuned_params["gamma"]) + ".txt")
        with open(out_name, "w", encoding="utf-8") as out_file:
            for count, row in enumerate(rows, start=1):
                print(count)
                user_id = VName(row[0])
                att_id = AName(row[1])
                is_link = row[4]
                # Negative test examples are not evaluated.
                if not is_link:
                    continue

                sql = "select classroute from public.route_0320 where id={user_id}".format(
                    user_id=row[0])
                cursor.execute(sql)
                classroute = cursor.fetchone()[0]

                # Candidates: everything the user has not visited before the
                # final (held-out) stop.
                left_set = all_attractions - set(classroute[0:-1])
                recommendation = dict()
                for candidate in left_set:
                    anode = AName(candidate)
                    if use_direct:
                        feature = extract_direct(graph, user_id, anode)
                    else:
                        feature = extract_indirect(graph, prjv_graph, prja_graph,
                                                   user_id, anode)
                    label = clf.predict([feature])[0]
                    dis = abs(clf.decision_function([feature]))
                    if label == 1:
                        recommendation[anode] = dis[0]

                recommendation = dict(
                    sorted(recommendation.items(), key=lambda x: x[1], reverse=True))
                out_file.write("%s\t%s\t%s\t%s\n" %
                               (user_id, classroute[:-1], att_id,
                                json.dumps(recommendation)))
    finally:
        # Release the database resources even when prediction fails.
        cursor.close()
        conn.close()
def recommend_list(extract_fun):
    """Write recommendations for the test set using a previously saved model.

    The classifier is loaded from ``model_<extractor name>.pkl``.  For every
    row of ``public.ml_test_set`` whose ``row[4]`` flag is truthy, each
    attraction not already in the user's route (last entry excluded) is
    classified; attractions predicted as ``1`` are kept with the absolute
    value of the decision function and written, sorted in descending order,
    as one tab-separated line of ``recommend_<extractor name>.txt``.

    :param extract_fun: feature extractor; only its ``__name__`` is used, to
        pick the model file, the output file name and which of
        ``extract_direct`` / ``extract_indirect`` is called.
    :return: None
    :raises ValueError: if the extractor name is not one of the two supported
        names (the original code failed later with an unbound ``feature``).
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        raise ValueError("unsupported feature extractor: %s" % fun_name)
    use_direct = fun_name == "extract_direct"

    conn = get_conn()
    cursor = conn.cursor()
    try:
        # Load the test rows.
        cursor.execute("select * from public.ml_test_set")
        rows = cursor.fetchall()

        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files produced by a trusted run.
        clf = joblib.load("model_" + fun_name + ".pkl")

        # Bipartite network.
        graph = init_graph()
        print("构建二分网络完成")
        print(time.time())

        # Projections of the bipartite network.
        v_nodes = get_v_nodes()
        a_nodes = get_a_nodes()
        prjv_graph = project(graph, v_nodes)
        prja_graph = project(graph, a_nodes)
        print("投影完成")
        print(time.time())

        # All attraction ids; the set is loop-invariant, so build it once.
        all_attractions = set(get_node_id_dict().keys())

        with open("recommend_" + fun_name + ".txt", "w", encoding="utf-8") as out_file:
            for row in rows:
                user_id = VName(row[0])
                att_id = AName(row[1])
                is_link = row[4]
                # Negative test examples are not evaluated.
                if not is_link:
                    continue

                sql = "select classroute from public.route_0320 where id={user_id}".format(
                    user_id=row[0])
                cursor.execute(sql)
                classroute = cursor.fetchone()[0]

                # Candidates: everything the user has not visited before the
                # final (held-out) stop.
                left_set = all_attractions - set(classroute[0:-1])
                recommendation = dict()
                for candidate in left_set:
                    anode = AName(candidate)
                    if use_direct:
                        feature = extract_direct(graph, user_id, anode)
                    else:
                        feature = extract_indirect(graph, prjv_graph, prja_graph,
                                                   user_id, anode)
                    label = clf.predict([feature])[0]
                    dis = abs(clf.decision_function([feature]))
                    if label == 1:
                        recommendation[anode] = dis[0]

                recommendation = dict(
                    sorted(recommendation.items(), key=lambda x: x[1], reverse=True))
                out_file.write("%s\t%s\t%s\t%s\n" %
                               (user_id, classroute[:-1], att_id,
                                json.dumps(recommendation)))
    finally:
        # Release the database resources even when prediction fails.
        cursor.close()
        conn.close()
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import sys

# Make the parent directory importable so the ``postgresql`` module resolves.
sys.path.append("../")
from postgresql import get_conn, get_node_id_dict

if __name__ == "__main__":
    # Manual check: open a connection and a cursor, then print the node-id
    # dictionary.  Both handles are closed again instead of being leaked.
    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            print(get_node_id_dict())
        finally:
            cursor.close()
    finally:
        conn.close()
def write_test_feature(func, has_sd=0):
    """Compute the features of every test candidate and save them as CSV.

    For every row of ``public.ml_test_set`` whose ``row[4]`` flag is truthy,
    one feature line is produced for each attraction that is not in the
    user's route (last entry excluded).  Each line starts with the candidate
    node followed by the values returned by ``func``.  The lines are written
    to ``<func name>_has_sd_test.csv`` when ``has_sd`` is truthy, otherwise to
    ``<func name>_test.csv``.

    :param func: feature extractor named ``extract_direct`` (called as
        ``func(graph, user, node)``) or ``extract_indirect`` (called as
        ``func(graph, prjv_graph, prja_graph, user, node, has_sd)``).
    :param has_sd: truthy to use the ``_has_sd`` file name and, for
        ``extract_indirect``, the two extra ``prj_sd*`` columns.
    :return: None; for any other extractor name an error message is printed
        and nothing is read or written.
    """
    func_name = func.__name__
    base_title = ["anode", "snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    prj_title = [
        "prj_cnv", "prj_cna", "prj_jcv", "prj_jca",
        "prj_aav", "prj_aaa", "prj_pav", "prj_paa",
    ]

    # Validate the extractor before touching the database so the early return
    # cannot leak a connection.
    if func_name == "extract_direct":
        title = base_title
    elif func_name == "extract_indirect":
        title = base_title + prj_title
        if has_sd:
            title = title + ["prj_sdv", "prj_sda"]
    else:
        print("函数错误")
        return
    use_direct = func_name == "extract_direct"

    if has_sd:
        file_name = func_name + "_has_sd_test.csv"
    else:
        file_name = func_name + "_test.csv"

    test_f = []
    conn = get_conn()
    cursor = conn.cursor()
    try:
        # Load the test rows.
        cursor.execute("select * from public.ml_test_set")
        rows = cursor.fetchall()

        # Bipartite network.
        graph = init_graph()
        print("构建二分网络完成")
        print(time.time())

        # Projections of the bipartite network.
        v_nodes = get_v_nodes()
        a_nodes = get_a_nodes()
        prjv_graph = project(graph, v_nodes)
        prja_graph = project(graph, a_nodes)
        print("投影完成")
        print(time.time())

        # All attraction ids; the set is loop-invariant, so build it once.
        all_attractions = set(get_node_id_dict().keys())

        for count, row in enumerate(rows):
            print(count)
            user_id = VName(row[0])
            is_link = row[4]
            # Negative test examples are not evaluated.
            if not is_link:
                continue

            sql = "select classroute from public.route_0320 where id={user_id}".format(
                user_id=row[0])
            cursor.execute(sql)
            classroute = cursor.fetchone()[0]

            # Candidates: everything the user has not visited before the
            # final (held-out) stop.
            left_set = all_attractions - set(classroute[0:-1])
            for candidate in left_set:
                anode = AName(candidate)
                if use_direct:
                    feature = func(graph, user_id, anode)
                else:
                    feature = func(graph, prjv_graph, prja_graph,
                                   user_id, anode, has_sd)
                test_f.append([anode, *feature])
    finally:
        # The original never closed these handles.
        cursor.close()
        conn.close()

    # Write the collected feature lines to the CSV file.
    df = pd.DataFrame(test_f, columns=title)
    df.to_csv(file_name, encoding="utf-8")
    print("测试特征保存完成")
def init_set():
    """Populate ``ml_graph_set``, ``ml_train_set`` and ``ml_test_set``.

    Rows of ``public.train_set``: ``int(0.2 * count)`` rows are drawn at
    random.  A row that ``id_in_set`` reports as outside that sample has its
    whole route inserted into ``public.ml_graph_set``.  A row inside the
    sample is treated as a labelled example: all route entries but the last
    go to ``public.ml_graph_set``, the last entry is inserted into
    ``public.ml_train_set`` with ``islink=True``, and two attractions drawn at
    random from those absent from the route are inserted into the same table
    with ``islink=False`` and no visit time.

    Rows of ``public.test_set``: every row is treated as a labelled example
    in the same way, with ``public.ml_test_set`` as the label table.

    Row layout used here: ``row[2]`` is the route (attraction ids), ``row[4]``
    the matching visit times and ``row[7]`` the user id.

    :return: None
    :raises ValueError: from ``random.sample`` when fewer than two attractions
        are absent from a labelled route.
    :raises IndexError: if a labelled row has an empty route.
    """
    graph_sql = (
        " insert into public.ml_graph_set(user_id, attraction, attractionstr, visittime, islink)"
        " values(%s, %s, %s, %s, %s); "
    )
    train_label_sql = (
        " insert into public.ml_train_set(user_id, attraction, attractionstr, visittime, islink)"
        " values(%s, %s, %s, %s, %s); "
    )
    test_label_sql = (
        " insert into public.ml_test_set(user_id, attraction, attractionstr, visittime, islink)"
        " values(%s, %s, %s, %s, %s); "
    )

    conn = get_conn()
    cursor = conn.cursor()
    try:
        # Maps an attraction id to the string stored in the attractionstr column.
        node_dict = get_node_id_dict()
        all_nodes = set(node_dict.keys())

        def add_graph_links(user_id, attractions, routetime):
            # Insert each given visit as a positive link of the graph table.
            for pos, attraction in enumerate(attractions):
                cursor.execute(
                    graph_sql,
                    (user_id, attraction, node_dict[attraction], routetime[pos], True),
                )

        def add_labelled_route(label_sql, user_id, classroute, routetime):
            # Graph links for all but the last visit, then one positive and
            # two negative examples in the label table.
            add_graph_links(user_id, classroute[:-1], routetime)
            last = classroute[-1]
            cursor.execute(
                label_sql,
                (user_id, last, node_dict[last], routetime[-1], True),
            )
            left_nodes = all_nodes - set(classroute)
            # random.sample no longer accepts a set (TypeError on Python
            # 3.11+), so the candidates are materialised as a list first.
            for neg in random.sample(list(left_nodes), 2):
                cursor.execute(label_sql, (user_id, neg, node_dict[neg], None, False))

        cursor.execute(r"select count(*) from public.train_set")
        number = cursor.fetchone()[0]
        print(number)

        # Read train_set and fill ml_train_set and ml_graph_set.
        cursor.execute("select * from public.train_set")
        rows = cursor.fetchall()
        train_number = int(0.2 * number)
        train_set = random.sample(rows, train_number)
        for row in rows:
            classroute, routetime, user_id = row[2], row[4], row[7]
            if id_in_set(train_set, user_id):
                add_labelled_route(train_label_sql, user_id, classroute, routetime)
            else:
                add_graph_links(user_id, classroute, routetime)
        conn.commit()

        # Read test_set and fill ml_test_set and ml_graph_set.
        cursor.execute("select * from public.test_set")
        for row in cursor.fetchall():
            add_labelled_route(test_label_sql, row[7], row[2], row[4])
        conn.commit()
    finally:
        # Release the database resources even when an insert fails.
        cursor.close()
        conn.close()
def recommend(test_file, model_path, save_file):
    """Score the saved test features with an XGBoost model and write rankings.

    ``test_file`` is read as CSV; its ``anode`` column names the candidate of
    each line and the remaining columns are passed to the model.  The
    predictions are consumed in order: for every row of
    ``public.ml_test_set`` whose ``row[4]`` flag is truthy, the next ``n``
    predictions belong to that row, where ``n`` is the number of attractions
    not in the user's route (last entry excluded).  Each such row produces one
    tab-separated line in ``save_file`` with the candidates sorted by
    descending score.

    :param test_file: path of the CSV file holding the test features.
    :param model_path: path of the model loaded into ``xgb.Booster``.
    :param save_file: path of the text file to write.
    :return: None
    :raises IndexError: if the feature file holds fewer lines than the test
        rows require.
    """
    # Load the model.
    model = xgb.Booster()
    model.load_model(model_path)

    # Build the prediction input; "anode" is an identifier, not a feature.
    test_data = pd.read_csv(test_file)
    anode_list = list(test_data["anode"])
    test_set = test_data.drop("anode", axis=1)
    xgb_test = xgb.DMatrix(test_set)

    # Predict every candidate line in one call.
    preds = model.predict(xgb_test)

    conn = get_conn()
    cursor = conn.cursor()
    try:
        cursor.execute("select * from public.ml_test_set ")
        rows = cursor.fetchall()

        # All attraction ids; the set is loop-invariant, so build it once.
        all_attractions = set(get_node_id_dict().keys())

        index = 0
        with open(save_file, "w", encoding="utf-8") as out_file:
            for row in rows:
                user_id = VName(row[0])
                att_id = AName(row[1])
                is_link = row[4]
                # Negative test examples have no feature lines; skip them.
                if not is_link:
                    continue

                sql = "select classroute from public.route_0320 where id={user_id}".format(user_id=row[0])
                cursor.execute(sql)
                classroute = cursor.fetchone()[0]

                # Number of candidates that were written for this row.
                n = len(all_attractions - set(classroute[0:-1]))
                scores = dict()
                for offset in range(index, index + n):
                    scores[anode_list[offset]] = preds[offset]
                index += n

                ranked = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))
                out_file.write("%s\t%s\t%s\t%s\n" %
                               (user_id, classroute[:-1], att_id,
                                json.dumps(ranked, cls=MyEncoder)))
    finally:
        # Release the database resources even when scoring fails.
        cursor.close()
        conn.close()
def recommend_list(route_length=3, coeffs=None):
    """Rank unvisited attractions for every test route from powers of a matrix.

    A score matrix is built from ``init_matrix()``: ``matrix ** 3`` for length
    3, plus ``matrix ** 5`` (and ``matrix ** 7``) divided by a coefficient for
    lengths 5 and 7.  For every row of ``public.test_set`` the imaginary part
    of the entry at (user, attraction) is taken as the score of each
    attraction not in the route (last entry excluded); zero scores are
    dropped.  One tab-separated line per row is written to
    ``cn_route_<route_length>.txt``.

    :param route_length: 3, 5 or 7; selects the formula.
    :param coeffs: optional divisors; ``coeffs[0]`` divides the fifth power
        and ``coeffs[1]`` the seventh.  When empty or ``None`` the built-in
        constants are used.
    :return: dict keyed by user node with the keys ``"user_id"``,
        ``"classroute"``, ``"answer"`` and ``"recommend"``; ``None`` (after
        printing a message) when ``route_length`` is unsupported.
    :raises ValueError: from ``list.index`` when a user or attraction node is
        missing from the node list.
    """
    nodes_list = gen_nodes_list()
    matrix = init_matrix()

    # Choose the formula before opening any resource so the unsupported-length
    # return cannot leak a connection.
    if route_length == 3:
        final_matrix = (matrix ** 3)
    elif route_length == 5:
        # NOTE(review): the default 20 here differs from the 120 used for the
        # same term when route_length == 7 -- confirm which is intended.
        fifth = coeffs[0] if coeffs else 20
        final_matrix = (matrix ** 3) + (matrix ** 5) / fifth
    elif route_length == 7:
        if coeffs:
            fifth, seventh = coeffs[0], coeffs[1]
        else:
            fifth, seventh = 120, 5040
        final_matrix = (matrix ** 3) + (matrix ** 5) / fifth + (matrix ** 7) / seventh
    else:
        print("暂时没有对应的公式")
        return

    conn = get_conn()
    cursor = conn.cursor()
    try:
        cursor.execute("select * from public.test_set")
        rows = cursor.fetchall()
    finally:
        # The original never closed these handles.
        cursor.close()
        conn.close()

    # All attraction ids; the set is loop-invariant, so build it once.
    all_attractions = set(get_node_id_dict().keys())

    # Collects the recommendation of every test case.
    record_result = dict()
    file_name = "cn_route_" + str(route_length) + ".txt"
    with open(file_name, "w", encoding="utf-8") as out_file:
        for row in rows:
            classroute = row[2]
            user_id = VName(row[7])
            user_index = nodes_list.index(user_id)

            # Attractions still open for recommendation.
            left_nodes = list(all_attractions - set(classroute[:-1]))
            result_dict = dict()
            for candidate in left_nodes:
                node = AName(candidate)
                att_index = nodes_list.index(node)
                imag_coeff = final_matrix[user_index, att_index].imag
                if imag_coeff != 0.0:
                    result_dict[node] = imag_coeff

            # Highest score first.
            result_dict = dict(sorted(result_dict.items(), key=lambda x: x[1], reverse=True))

            out_file.write("%s\t%s\t%s\t%s\n" %
                           (user_id, classroute[:-1], AName(classroute[-1]),
                            json.dumps(result_dict)))
            # NOTE(review): "classroute" stores only the last stop while the
            # file line stores classroute[:-1] -- confirm which is intended.
            record_result[user_id] = {"user_id": user_id,
                                      "classroute": classroute[-1],
                                      "answer": AName(classroute[-1]),
                                      "recommend": result_dict}
    return record_result