def predict(extract_fun):
    """Predict links for every row of ``public.ml_test_set`` with a saved model.

    Loads ``model_<extract_fun.__name__>.pkl``, rebuilds the bipartite graph
    and its two projections, and writes one tab-separated line per test row
    (``VName(row[0])``, ``AName(row[1])``, ``row[4]``, predicted label) to
    ``predict_<extract_fun.__name__>.txt``.

    :param extract_fun: feature extractor; only its ``__name__`` is inspected
        and it must be ``"extract_direct"`` or ``"extract_indirect"``.
    :raises ValueError: if ``extract_fun`` has any other name (checked before
        any database or file access).
    :return: None
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        # Fail fast: previously this surfaced as an unbound ``feature`` inside
        # the row loop, after the expensive graph construction.
        raise ValueError("unsupported feature extractor: %s" % fun_name)

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Restore the trained model.
            # NOTE(review): joblib.load unpickles the file; only load model
            # files that this project produced itself.
            clf = joblib.load("model_" + fun_name + ".pkl")

            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("投影完成")
            print(time.time())

            cursor.execute("select * from public.ml_test_set")
            rows = cursor.fetchall()

            out_name = "predict_" + fun_name + ".txt"
            with open(out_name, "w", encoding="utf-8") as f:
                for row in rows:
                    user_id = VName(row[0])
                    att_id = AName(row[1])
                    is_link = row[4]
                    if fun_name == "extract_direct":
                        feature = extract_direct(graph, user_id, att_id)
                    else:
                        feature = extract_indirect(
                            graph, prjv_graph, prja_graph, user_id, att_id)
                    result = clf.predict([feature])[0]
                    f.write("%s\t%s\t%s\t%s\n"
                            % (user_id, att_id, is_link, result))
        finally:
            cursor.close()
    finally:
        conn.close()
def _add_visit_edges(graph, user_id, attraction_ids):
    """Add or reinforce one weighted edge per attraction id for ``user_id``.

    A new edge gets ``weight=1``; an existing edge has its weight incremented.
    """
    # Assumes VName/AName are deterministic naming helpers, so the node name
    # can be computed once and reused.
    v_node = VName(user_id)
    for attraction_id in attraction_ids:
        a_node = AName(attraction_id)
        if graph.has_edge(v_node, a_node):
            graph[v_node][a_node]["weight"] += 1
        else:
            graph.add_edge(v_node, a_node, weight=1)


def init_graph():
    """Build the weighted bipartite graph from the train and test tables.

    Nodes from ``get_v_nodes()`` are tagged ``bipartite=0`` and nodes from
    ``get_a_nodes()`` are tagged ``bipartite=1``. Edges come from column 7
    (named ``user_id`` here) and column 2 (``classroute``, iterated as a
    sequence of attraction ids) of ``public.train_set`` and
    ``public.test_set``. For test rows the last element of ``classroute`` is
    left out of the graph.

    :return: the populated ``networkx.Graph``; the cursor and connection are
        closed before returning, also when an error occurs.
    """
    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            graph = networkx.Graph()

            # Visitor-side nodes.
            for node in get_v_nodes():
                graph.add_node(node, bipartite=0)
            # Attraction-side nodes.
            for node in get_a_nodes():
                graph.add_node(node, bipartite=1)

            cursor.execute("select * from public.train_set")
            for row in cursor.fetchall():
                _add_visit_edges(graph, row[7], row[2])

            cursor.execute("select * from public.test_set")
            for row in cursor.fetchall():
                # The final route element is excluded for test rows.
                _add_visit_edges(graph, row[7], row[2][:-1])
        finally:
            cursor.close()
    finally:
        conn.close()
    return graph
def train(extract_fun):
    """Train a linear SVM on ``public.ml_train_set`` and persist it.

    Steps: build the bipartite graph and its projections, extract one feature
    vector per row, label it ``1`` when ``row[4]`` is truthy and ``-1``
    otherwise, dump features and labels as two JSON lines to
    ``param_<extract_fun.__name__>.txt``, fit ``svm.SVC(kernel="linear")`` and
    save it to ``model_<extract_fun.__name__>.pkl``.

    :param extract_fun: feature extractor; only its ``__name__`` is inspected
        and it must be ``"extract_direct"`` or ``"extract_indirect"``.
    :raises ValueError: if ``extract_fun`` has any other name (checked before
        any database or file access, so no output file is overwritten).
    :return: None
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        raise ValueError("unsupported feature extractor: %s" % fun_name)

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("网络投影完成")
            print(time.time())

            cursor.execute("select * from public.ml_train_set")
            rows = cursor.fetchall()
            print(len(rows))

            # Training data: features and their +1 / -1 labels.
            X_list = []
            Y_list = []
            for i, row in enumerate(rows, start=1):
                user_id = VName(row[0])
                att_id = AName(row[1])
                print(i)  # progress counter
                if fun_name == "extract_direct":
                    feature = extract_direct(graph, user_id, att_id)
                else:
                    feature = extract_indirect(
                        graph, prjv_graph, prja_graph, user_id, att_id)
                X_list.append(feature)
                Y_list.append(1 if row[4] else -1)
            print("生成训练数据")
            print(time.time())
        finally:
            cursor.close()
    finally:
        conn.close()

    # Record X_list and Y_list, one JSON document per line.
    with open("param_" + fun_name + ".txt", "w", encoding="utf-8") as f:
        f.write(json.dumps(X_list) + "\n")
        f.write(json.dumps(Y_list) + "\n")
    print("训练数据保存成功")
    print(time.time())

    clf = svm.SVC(kernel="linear")
    clf.fit(X_list, Y_list)
    print("训练数据结束")
    print(time.time())

    joblib.dump(clf, "model_" + fun_name + ".pkl")
    print("保存模型")
    print(time.time())
def recommend_test(extract_fun, tuned_params):
    """Refit an SVC with tuned hyper-parameters and write recommendations.

    The classifier is built from ``tuned_params["kernel"]``,
    ``tuned_params["C"]`` and ``tuned_params["gamma"]`` and fitted on the two
    JSON lines (features, labels) stored in
    ``param_<extract_fun.__name__>.txt``.

    For every row of ``public.ml_test_set`` whose ``row[4]`` is truthy, the
    user's ``classroute`` is read from ``public.route_0320``. The candidates
    are all keys of ``get_node_id_dict()`` minus ``classroute[:-1]``.
    Candidates predicted as ``1`` are kept and ordered by the absolute
    decision-function value, largest first. One tab-separated line is written
    per such row to
    ``recommend_<name>_<kernel>_C<C>_gamma<gamma>.txt``.

    :param extract_fun: feature extractor; only its ``__name__`` is inspected
        and it must be ``"extract_direct"`` or ``"extract_indirect"``.
    :param tuned_params: mapping with the keys ``"kernel"``, ``"C"`` and
        ``"gamma"``.
    :raises ValueError: if ``extract_fun`` has any other name (checked before
        any database or file access).
    :return: None
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        raise ValueError("unsupported feature extractor: %s" % fun_name)

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Read the test rows.
            cursor.execute("select * from public.ml_test_set")
            rows = cursor.fetchall()

            # Build and fit the model on the saved training data.
            clf = svm.SVC(kernel=tuned_params["kernel"],
                          C=tuned_params["C"],
                          gamma=tuned_params["gamma"])
            param_name = "param_" + fun_name + ".txt"
            with open(param_name, "r", encoding="utf-8") as f:
                x_list = json.loads(f.readline())
                y_list = json.loads(f.readline())
            clf.fit(x_list, y_list)

            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("投影完成")
            print(time.time())

            # Every attraction id known to the node-id dictionary.
            all_attractions = list(get_node_id_dict().keys())

            # Result file.
            out_name = ("recommend_" + fun_name + "_" + tuned_params["kernel"]
                        + "_C" + str(tuned_params["C"])
                        + "_gamma" + str(tuned_params["gamma"]) + ".txt")
            with open(out_name, "w", encoding="utf-8") as f:
                for i, row in enumerate(rows, start=1):
                    print(i)  # progress counter
                    user_id = VName(row[0])
                    att_id = AName(row[1])
                    if not row[4]:
                        continue

                    sql = ("select classroute from public.route_0320 "
                           "where id={user_id}").format(user_id=row[0])
                    cursor.execute(sql)
                    classroute = cursor.fetchone()[0]

                    # Candidates: everything except the route minus its
                    # last element.
                    left_set = set(all_attractions) - set(classroute[0:-1])
                    recommendation = dict()
                    for anode in left_set:
                        anode = AName(anode)
                        if fun_name == "extract_direct":
                            feature = extract_direct(graph, user_id, anode)
                        else:
                            feature = extract_indirect(
                                graph, prjv_graph, prja_graph, user_id, anode)
                        if clf.predict([feature])[0] == 1:
                            dis = abs(clf.decision_function([feature]))
                            recommendation[anode] = dis[0]

                    recommendation = dict(
                        sorted(recommendation.items(),
                               key=lambda x: x[1], reverse=True))
                    f.write("%s\t%s\t%s\t%s\n"
                            % (user_id, classroute[:-1], att_id,
                               json.dumps(recommendation)))
        finally:
            cursor.close()
    finally:
        conn.close()
def recommend_list(extract_fun):
    """Write ranked recommendations using the previously saved model.

    Loads ``model_<extract_fun.__name__>.pkl``. For every row of
    ``public.ml_test_set`` whose ``row[4]`` is truthy, the user's
    ``classroute`` is read from ``public.route_0320``. The candidates are all
    keys of ``get_node_id_dict()`` minus ``classroute[:-1]``. Candidates
    predicted as ``1`` are kept and ordered by the absolute decision-function
    value, largest first. One tab-separated line is written per such row to
    ``recommend_<extract_fun.__name__>.txt``.

    :param extract_fun: feature extractor; only its ``__name__`` is inspected
        and it must be ``"extract_direct"`` or ``"extract_indirect"``.
    :raises ValueError: if ``extract_fun`` has any other name (checked before
        any database or file access).
    :return: None
    """
    fun_name = extract_fun.__name__
    if fun_name not in ("extract_direct", "extract_indirect"):
        raise ValueError("unsupported feature extractor: %s" % fun_name)

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Read the test rows.
            cursor.execute("select * from public.ml_test_set")
            rows = cursor.fetchall()

            # Load the model.
            # NOTE(review): joblib.load unpickles the file; only load model
            # files that this project produced itself.
            clf = joblib.load("model_" + fun_name + ".pkl")

            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("投影完成")
            print(time.time())

            # Every attraction id known to the node-id dictionary.
            all_attractions = list(get_node_id_dict().keys())

            # Result file.
            out_name = "recommend_" + fun_name + ".txt"
            with open(out_name, "w", encoding="utf-8") as f:
                for row in rows:
                    user_id = VName(row[0])
                    att_id = AName(row[1])
                    if not row[4]:
                        continue

                    sql = ("select classroute from public.route_0320 "
                           "where id={user_id}").format(user_id=row[0])
                    cursor.execute(sql)
                    classroute = cursor.fetchone()[0]

                    # Candidates: everything except the route minus its
                    # last element.
                    left_set = set(all_attractions) - set(classroute[0:-1])
                    recommendation = dict()
                    for anode in left_set:
                        anode = AName(anode)
                        if fun_name == "extract_direct":
                            feature = extract_direct(graph, user_id, anode)
                        else:
                            feature = extract_indirect(
                                graph, prjv_graph, prja_graph, user_id, anode)
                        if clf.predict([feature])[0] == 1:
                            dis = abs(clf.decision_function([feature]))
                            recommendation[anode] = dis[0]

                    recommendation = dict(
                        sorted(recommendation.items(),
                               key=lambda x: x[1], reverse=True))
                    f.write("%s\t%s\t%s\t%s\n"
                            % (user_id, classroute[:-1], att_id,
                               json.dumps(recommendation)))
        finally:
            cursor.close()
    finally:
        conn.close()
def write_train_feature(func, have_sd=0):
    """Extract training-set features and write them to a CSV file.

    One CSV row is produced per row of ``public.ml_train_set``: a ``label``
    column (``1`` when ``row[4]`` is truthy, else ``0``) followed by the
    feature vector returned by ``func``. The file is named
    ``<func.__name__>_has_sd_train.csv`` when ``have_sd`` is truthy and
    ``<func.__name__>_train.csv`` otherwise.

    :param func: feature extractor named ``"extract_direct"`` (called as
        ``func(graph, user, attraction)``) or ``"extract_indirect"`` (called
        as ``func(graph, prjv_graph, prja_graph, user, attraction, have_sd)``).
    :param have_sd: for the indirect extractor, whether the two extra columns
        ``prj_sdv`` and ``prj_sda`` are expected in the feature vector.
    :return: None. For any other function name, an error message is printed
        and the function returns before touching the database.
    """
    func_name = func.__name__
    title = ["label", "snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    if func_name == "extract_indirect":
        title += ["prj_cnv", "prj_cna", "prj_jcv", "prj_jca",
                  "prj_aav", "prj_aaa", "prj_pav", "prj_paa"]
        if have_sd:
            title += ["prj_sdv", "prj_sda"]
    elif func_name != "extract_direct":
        # Checked before opening the connection so nothing is leaked.
        print("函数错误")
        return

    suffix = "_has_sd_train.csv" if have_sd else "_train.csv"
    file_name = func_name + suffix

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("网络投影完成")
            print(time.time())

            cursor.execute("select * from public.ml_train_set")
            rows = cursor.fetchall()
            print("len_rows:" + str(len(rows)))

            train_f = []
            for i, row in enumerate(rows, start=1):
                print(i)  # progress counter
                user_id = VName(row[0])
                att_id = AName(row[1])
                if func_name == "extract_direct":
                    feature = func(graph, user_id, att_id)
                else:
                    feature = func(graph, prjv_graph, prja_graph,
                                   user_id, att_id, have_sd)
                label = 1 if row[4] else 0
                train_f.append([label, *feature])
        finally:
            cursor.close()
    finally:
        conn.close()

    # Write to the CSV file.
    df = pd.DataFrame(train_f, columns=title)
    df.to_csv(file_name, encoding="utf-8")
    print("训练特征保存完成")
def write_test_feature(func, has_sd=0):
    """Extract test-set candidate features and write them to a CSV file.

    For every row of ``public.ml_test_set`` whose ``row[4]`` is truthy, the
    user's ``classroute`` is read from ``public.route_0320``. The candidates
    are all keys of ``get_node_id_dict()`` minus ``classroute[:-1]``. One CSV
    row is produced per candidate: an ``anode`` column (``AName(candidate)``)
    followed by the feature vector returned by ``func``. The file is named
    ``<func.__name__>_has_sd_test.csv`` when ``has_sd`` is truthy and
    ``<func.__name__>_test.csv`` otherwise.

    :param func: feature extractor named ``"extract_direct"`` (called as
        ``func(graph, user, attraction)``) or ``"extract_indirect"`` (called
        as ``func(graph, prjv_graph, prja_graph, user, attraction, has_sd)``).
    :param has_sd: for the indirect extractor, whether the two extra columns
        ``prj_sdv`` and ``prj_sda`` are expected in the feature vector.
    :return: None. For any other function name, an error message is printed
        and the function returns before touching the database.
    """
    func_name = func.__name__
    title = ["anode", "snv", "sna", "cn", "jc", "aa", "pa", "sd"]
    if func_name == "extract_indirect":
        title += ["prj_cnv", "prj_cna", "prj_jcv", "prj_jca",
                  "prj_aav", "prj_aaa", "prj_pav", "prj_paa"]
        if has_sd:
            title += ["prj_sdv", "prj_sda"]
    elif func_name != "extract_direct":
        # Checked before opening the connection so nothing is leaked.
        print("函数错误")
        return

    suffix = "_has_sd_test.csv" if has_sd else "_test.csv"
    file_name = func_name + suffix

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            # Read the test rows.
            cursor.execute("select * from public.ml_test_set")
            rows = cursor.fetchall()

            # Bipartite network.
            graph = init_graph()
            print("构建二分网络完成")
            print(time.time())

            # Projections onto each node set.
            v_nodes = get_v_nodes()
            a_nodes = get_a_nodes()
            prjv_graph = project(graph, v_nodes)
            prja_graph = project(graph, a_nodes)
            print("投影完成")
            print(time.time())

            # Every attraction id known to the node-id dictionary.
            all_attractions = list(get_node_id_dict().keys())

            test_f = []
            # The progress counter is zero-based in this function.
            for i, row in enumerate(rows):
                print(i)
                user_id = VName(row[0])
                if not row[4]:
                    continue

                sql = ("select classroute from public.route_0320 "
                       "where id={user_id}").format(user_id=row[0])
                cursor.execute(sql)
                classroute = cursor.fetchone()[0]

                # Candidates: everything except the route minus its
                # last element.
                left_set = set(all_attractions) - set(classroute[0:-1])
                for anode in left_set:
                    anode = AName(anode)
                    if func_name == "extract_direct":
                        feature = func(graph, user_id, anode)
                    else:
                        feature = func(graph, prjv_graph, prja_graph,
                                       user_id, anode, has_sd)
                    test_f.append([anode, *feature])
        finally:
            cursor.close()
    finally:
        conn.close()

    # Write to the CSV file.
    df = pd.DataFrame(test_f, columns=title)
    df.to_csv(file_name, encoding="utf-8")
    print("测试特征保存完成")