def write_tmp_file(tmp_file, ds_g, ds_N):
    """Write one '<node>\\t<value>' record per node of the direct-superior
    graph ds_g into tmp_file (input format of the trust-rank job) and
    return the flushed file handle.  ds_N is the node count used for the
    1/N seed value.

    NOTE(review): an identical write_tmp_file is defined again later in
    this module and shadows this one at import time — confirm which copy
    is intended to survive.
    """
    # TODO (translated author note): needs a rewrite — (node, outlinks)
    # should be the set of nodes reachable FROM `node`, not just its
    # direct out-edges.
    ds_new_attribute_dict = read_attribute_dict('ds_g')
    #new_attribute_dict = read_attribute_dict('g')
    #new_attribute_dict = json.loads(new_attribute_dict)
    #print 'len(new_attribute_dict):', len(new_attribute_dict.keys())
    # read_attribute_dict returns a JSON string; decode it once up front.
    ds_new_attribute_dict = json.loads(ds_new_attribute_dict)
    for node in ds_g.nodes():
        # direct_superior_user links leaving `node`
        outlinks = ds_g.out_edges(nbunch=[node ])
        if not outlinks:
            # Dangling node: seed-only record 'tr_results,1/N,N'.
            value = 'tr_results,%s,%s' % (1.0 / ds_N, ds_N)
            tmp_file.write('%s\t%s\n' % (node, value))
        else:
            outlinks_list = compute_s2(node, outlinks, ds_new_attribute_dict)
            outlinks_str = ','.join(outlinks_list)
            # value = pr_results,1/n,n,str(uid1),str(i(uid1,v)),str(uid2),str(i(uid2,v))
            value = 'tr_results,%s,%s,' % (1.0 / ds_N, ds_N)
            value += outlinks_str
            tmp_file.write('%s\t%s\n' % (node, value))
            print '(node,value):', node, value
    tmp_file.flush()
    return tmp_file
def write_tmp_file(tmp_file, ds_g, ds_N): # 这里要重新写,要组成(node, outlinks)应该是从node出来的等够到达的点的集合 ds_new_attribute_dict = read_attribute_dict('ds_g') #new_attribute_dict = read_attribute_dict('g') #new_attribute_dict = json.loads(new_attribute_dict) #print 'len(new_attribute_dict):', len(new_attribute_dict.keys()) ds_new_attribute_dict = json.loads(ds_new_attribute_dict) for node in ds_g.nodes(): outlinks = ds_g.out_edges(nbunch=[node]) # 找到节点node对应的direct_superior_user if not outlinks: value = 'tr_results,%s,%s' % (1.0 / ds_N, ds_N) tmp_file.write('%s\t%s\n' % (node, value)) else: outlinks_list = compute_s2(node, outlinks, ds_new_attribute_dict) outlinks_str = ','.join(outlinks_list) value = 'tr_results,%s,%s,' % (1.0 / ds_N, ds_N) # value=pr_results,1/n,n,str(uid1),str(i(uid1,v)),str(uid2),str(i(uid2,v)) value += outlinks_str tmp_file.write('%s\t%s\n' % (node, value)) print '(node,value):', node, value tmp_file.flush() return tmp_file
def make_network_graph(current_date, topic_id, topic, window_size, all_uid_pr,
                       pr_data, ds_all_uid_pr, ds_pr_data, real_topic_id,
                       key_user_labeled=True):
    """Render the source-repost network and the direct-superior repost
    network of one topic/date/window as serialized gexf XML.

    Loads the four persisted graphs (<key>_g / _gg / _ds_dg / _ds_udg),
    prunes self-loops and low-degree nodes, community-partitions the two
    undirected graphs, snowball-samples both networks, computes quota
    statistics, and returns a 2-tuple of XML strings
    (source network gexf, direct-superior network gexf).
    Returns '' when either graph is empty.

    NOTE(review): an identical make_network_graph is defined again later
    in this module and shadows this one at import time.
    """
    date = current_date
    '''
    key_users对应的是源头转发网络的pagerank前10的用户,ds_key_users对应的是直接上级转发网络的pagerank前10的用户
    '''
    # key_users / ds_key_users: the top-10 pagerank users of the
    # source-repost and direct-superior repost networks respectively.
    if key_user_labeled:
        key_users = read_key_users(current_date, window_size, topic, top_n=10)
        ds_key_users = ds_read_key_users(current_date, window_size, topic, top_n=10)
    else:
        key_users = []
        ds_key_users = []
    '''
    读取图结构,并从数据库中获取new_attribute_dict, ds_new_attribute_dict
    '''
    # Load the persisted graph structures and fetch the JSON-encoded
    # attribute dicts from the database.
    key = str(real_topic_id) + '_' + str(date) + '_' + str(window_size)
    G = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_g_graph.gexf')
    gg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
    ds_dg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_dg_graph.gexf')
    ds_udg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_udg_graph.gexf')
    new_attribute_dict = json.loads(read_attribute_dict('g'))
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))
    # community detection, http://perso.crans.org/aynaud/communities/, undirected graph
    import community
    N = len(G)
    print 'len_G_N:', N
    ds_N = len(ds_dg)
    print 'len_ds_dg_N:', ds_N
    # Bail out early when either network is empty.
    if (not N) or (not ds_N):
        return ''
    # Degrees are captured BEFORE pruning, so rendered node sizes reflect
    # the full graphs.
    node_degree = nx.degree(G)
    ds_node_degree = nx.degree(ds_dg)
    G.remove_edges_from(G.selfloop_edges())
    gg.remove_edges_from(gg.selfloop_edges())
    ds_dg.remove_edges_from(ds_dg.selfloop_edges())
    ds_udg.remove_edges_from(ds_udg.selfloop_edges())
    # Drop low-degree nodes (translated author note: keep nodes with
    # degree >= 1).
    G = cut_network(G, nx.degree(G), cut_degree)
    gg = cut_network(gg, nx.degree(gg), cut_degree)
    ds_dg = cut_network(ds_dg, nx.degree(ds_dg), cut_degree)
    ds_udg = cut_network(ds_udg, nx.degree(ds_udg), cut_degree)
    print 'after cut_network:'
    print 'len(G):', len(G)
    print 'len(ds_dg):', len(ds_dg)
    # The partitions are computed on freshly re-read (unpruned) undirected
    # graphs, not on the cut ones above.
    p_gg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
    p_ds_udg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_udg_graph.gexf')
    partition = community.best_partition(p_gg)
    ds_partition = community.best_partition(p_ds_udg)  # community-partition the direct-superior repost network
    print 'start snowball sampling'
    new_G, new_gg = SnowballSampling(G, gg, topic, network_type)
    ds_new_G, ds_new_gg = SnowballSampling(ds_dg, ds_udg, topic, ds_network_type)
    print 'sampling complicated'
    # The Local Bridge computation needs efficiency work, so it is
    # disabled for now.
    '''
    print 'get local bridge start:'
    GetLocalBridge(gg)
    GetLocalBridge(ds_udg)
    print 'local bridge complicated'
    '''
    print 'start computing quota'
    #new_G = G
    #new_gg = gg
    #ds_new_G = ds_dg
    #ds_new_gg = ds_udg
    compute_quota(new_G, new_gg, date, window_size, topic, all_uid_pr, network_type)  # compute quota
    compute_quota(ds_new_G, ds_new_gg, date, window_size, topic, ds_all_uid_pr, ds_network_type)
    print 'quota computed complicated'
    # Build the gexf documents.
    '''
    将生成gexf文件的部分作为一个函数,将相关的参数传入。以此简洁化两个不同不同网络的gexf生成过程
    '''
    gexf = make_gexf('hxq', 'Source Network', new_G, node_degree, key_users,
                     all_uid_pr, pr_data, partition, new_attribute_dict)
    ds_gexf = make_ds_gexf('hxq', 'Direct Superior Network', ds_new_G,
                           ds_node_degree, ds_key_users, ds_all_uid_pr,
                           ds_pr_data, ds_partition, ds_new_attribute_dict)
    # Superseded inline gexf construction, kept commented out for reference.
    '''
    gexf = Gexf("Yang Han", "Topic Network")
    node_id = {}
    graph = gexf.addGraph("directed", "static", "demp graph")
    graph.addNodeAttribute('name', type='string', force_id='name')
    graph.addNodeAttribute('location', type='string', force_id='location') # 添加地理位置属性
    graph.addNodeAttribute('timestamp', type='int', force_id='timestamp')
    graph.addNodeAttribute('pagerank', type='string', force_id='pagerank')
    graph.addNodeAttribute('acategory', type='string', force_id='acategory')
    graph.addNodeAttribute('text', type='string', force_id='text')
    graph.addNodeAttribute('reposts_count', type='string', force_id='reposts_count') # 新添加的属性
    graph.addNodeAttribute('comments_count', type='string', force_id='comments_count')
    graph.addNodeAttribute('attitude_count', type='string', force_id='attitude_count')
    pos = nx.spring_layout(G) # 定义一个布局 pos={node:[v...]/(v...)}
    node_counter = 0
    edge_counter = 0
    for node in G.nodes():
        x, y = pos[node] # 返回位置(x,y)
        degree = node_degree[node]
        if node not in node_id: # {node:排名}
            node_id[node] = node_counter
            node_counter += 1
        uid = node # 节点就是用户名
        if uid in key_users: # 根据是否为关键用户添加不同的节点
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='255', g='51', b='51', size=str(degree))
        else:
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='0', g='204', b='204', size=str(degree))
        cluster_id = str(partition[node])
        _node.addAttribute('acategory', cluster_id)
        #print 'al_uid_pr:', all_uid_pr
        pr = str(all_uid_pr[str(uid)])
        _node.addAttribute('pagerank', pr)
        #print 'pagarank_uid:', uid
        try:
            text_add = new_attribute_dict[uid][0][0] # 添加节点属性--text
            _node.addAttribute('text', json.dumps(text_add))
            reposts_count_add = new_attribute_dict[uid][0][1]
            _node.addAttribute('reposts_count', str(reposts_count_add)) # 添加节点属性--reposts_count
            comment_count_add = new_attribute_dict[uid][0][2]
            _node.addAttribute('comments_count', str(comment_count_add)) # 添加节点属性--comment_count
            attitude_count_add = new_attribute_dict[uid][0][3]
            if attitude_count_add == None:
                attitude_count_add = u'未知'
            _node.addAttribute('attitude_count', attitude_count_add) # 添加节点属性--attitude_count
        except KeyError:
            _node.addAttribute('text', u'未知')
            _node.addAttribute('reposts_count', u'未知')
            _node.addAttribute('comments_count', u'未知')
            _node.addAttribute('attitude_count', u'未知')
        user_info = acquire_user_by_id(uid) # 获取对应的用户信息,添加属性
        if user_info:
            _node.addAttribute('name', user_info['name'])
            _node.addAttribute('location', user_info['location'])
        else:
            _node.addAttribute('name', u'未知')
            _node.addAttribute('location', u'未知')
        #_node.addAttribute('timestamp', str(uid_ts[uid]))
    for edge in G.edges():
        start, end = edge # (repost_uid, source_uid)
        start_id = node_id[start]
        end_id = node_id[end]
        graph.addEdge(str(edge_counter), str(start_id), str(end_id))
        edge_counter += 1
    '''
    # Serialize both gexf documents to XML strings.
    return etree.tostring(gexf.getXML(), pretty_print=True, encoding='utf-8',
                          xml_declaration=True), etree.tostring(
        ds_gexf.getXML(), pretty_print=True, encoding='utf-8',
        xml_declaration=True)
def make_network_graph(current_date, topic_id, topic, window_size, all_uid_pr,
                       pr_data, ds_all_uid_pr, ds_pr_data, real_topic_id,
                       key_user_labeled=True):
    """Render the source-repost network and the direct-superior repost
    network of one topic/date/window as serialized gexf XML.

    Loads the four persisted graphs (<key>_g / _gg / _ds_dg / _ds_udg),
    prunes self-loops and low-degree nodes, community-partitions the two
    undirected graphs, snowball-samples both networks, computes quota
    statistics, and returns a 2-tuple of XML strings
    (source network gexf, direct-superior network gexf).
    Returns '' when either graph is empty.

    NOTE(review): this is a duplicate of an earlier make_network_graph in
    this module; being defined later, this copy is the one in effect.
    """
    date = current_date
    '''
    key_users对应的是源头转发网络的pagerank前10的用户,ds_key_users对应的是直接上级转发网络的pagerank前10的用户
    '''
    # key_users / ds_key_users: the top-10 pagerank users of the
    # source-repost and direct-superior repost networks respectively.
    if key_user_labeled:
        key_users = read_key_users(current_date, window_size, topic, top_n=10)
        ds_key_users = ds_read_key_users(current_date, window_size, topic, top_n=10)
    else:
        key_users = []
        ds_key_users = []
    '''
    读取图结构,并从数据库中获取new_attribute_dict, ds_new_attribute_dict
    '''
    # Load the persisted graph structures and fetch the JSON-encoded
    # attribute dicts from the database.
    key = str(real_topic_id) + '_' + str(date) + '_' + str(window_size)
    G = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_g_graph.gexf')
    gg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
    ds_dg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_dg_graph.gexf')
    ds_udg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_udg_graph.gexf')
    new_attribute_dict = json.loads(read_attribute_dict('g'))
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))
    # community detection, http://perso.crans.org/aynaud/communities/, undirected graph
    import community
    N = len(G)
    print 'len_G_N:', N
    ds_N = len(ds_dg)
    print 'len_ds_dg_N:', ds_N
    # Bail out early when either network is empty.
    if (not N) or (not ds_N):
        return ''
    # Degrees are captured BEFORE pruning, so rendered node sizes reflect
    # the full graphs.
    node_degree = nx.degree(G)
    ds_node_degree = nx.degree(ds_dg)
    G.remove_edges_from(G.selfloop_edges())
    gg.remove_edges_from(gg.selfloop_edges())
    ds_dg.remove_edges_from(ds_dg.selfloop_edges())
    ds_udg.remove_edges_from(ds_udg.selfloop_edges())
    # Drop low-degree nodes (translated author note: keep nodes with
    # degree >= 1).
    G = cut_network(G, nx.degree(G), cut_degree)
    gg = cut_network(gg, nx.degree(gg), cut_degree)
    ds_dg = cut_network(ds_dg, nx.degree(ds_dg), cut_degree)
    ds_udg = cut_network(ds_udg, nx.degree(ds_udg), cut_degree)
    print 'after cut_network:'
    print 'len(G):', len(G)
    print 'len(ds_dg):', len(ds_dg)
    # The partitions are computed on freshly re-read (unpruned) undirected
    # graphs, not on the cut ones above.
    p_gg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_gg_graph.gexf')
    p_ds_udg = nx.read_gexf(str(GRAPH_PATH) + str(key) + '_ds_udg_graph.gexf')
    partition = community.best_partition(p_gg)
    ds_partition = community.best_partition(p_ds_udg)  # community-partition the direct-superior repost network
    print 'start snowball sampling'
    new_G, new_gg = SnowballSampling(G, gg, topic, network_type)
    ds_new_G, ds_new_gg = SnowballSampling(ds_dg, ds_udg, topic, ds_network_type)
    print 'sampling complicated'
    # The Local Bridge computation needs efficiency work, so it is
    # disabled for now.
    '''
    print 'get local bridge start:'
    GetLocalBridge(gg)
    GetLocalBridge(ds_udg)
    print 'local bridge complicated'
    '''
    print 'start computing quota'
    #new_G = G
    #new_gg = gg
    #ds_new_G = ds_dg
    #ds_new_gg = ds_udg
    compute_quota(new_G, new_gg, date, window_size, topic, all_uid_pr, network_type)  # compute quota
    compute_quota(ds_new_G, ds_new_gg, date, window_size, topic, ds_all_uid_pr, ds_network_type)
    print 'quota computed complicated'
    # Build the gexf documents.
    '''
    将生成gexf文件的部分作为一个函数,将相关的参数传入。以此简洁化两个不同不同网络的gexf生成过程
    '''
    gexf = make_gexf('hxq', 'Source Network', new_G, node_degree, key_users,
                     all_uid_pr, pr_data, partition, new_attribute_dict)
    ds_gexf = make_ds_gexf('hxq', 'Direct Superior Network', ds_new_G,
                           ds_node_degree, ds_key_users, ds_all_uid_pr,
                           ds_pr_data, ds_partition, ds_new_attribute_dict)
    # Superseded inline gexf construction, kept commented out for reference.
    '''
    gexf = Gexf("Yang Han", "Topic Network")
    node_id = {}
    graph = gexf.addGraph("directed", "static", "demp graph")
    graph.addNodeAttribute('name', type='string', force_id='name')
    graph.addNodeAttribute('location', type='string', force_id='location') # 添加地理位置属性
    graph.addNodeAttribute('timestamp', type='int', force_id='timestamp')
    graph.addNodeAttribute('pagerank', type='string', force_id='pagerank')
    graph.addNodeAttribute('acategory', type='string', force_id='acategory')
    graph.addNodeAttribute('text', type='string', force_id='text')
    graph.addNodeAttribute('reposts_count', type='string', force_id='reposts_count') # 新添加的属性
    graph.addNodeAttribute('comments_count', type='string', force_id='comments_count')
    graph.addNodeAttribute('attitude_count', type='string', force_id='attitude_count')
    pos = nx.spring_layout(G) # 定义一个布局 pos={node:[v...]/(v...)}
    node_counter = 0
    edge_counter = 0
    for node in G.nodes():
        x, y = pos[node] # 返回位置(x,y)
        degree = node_degree[node]
        if node not in node_id: # {node:排名}
            node_id[node] = node_counter
            node_counter += 1
        uid = node # 节点就是用户名
        if uid in key_users: # 根据是否为关键用户添加不同的节点
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='255', g='51', b='51', size=str(degree))
        else:
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='0', g='204', b='204', size=str(degree))
        cluster_id = str(partition[node])
        _node.addAttribute('acategory', cluster_id)
        #print 'al_uid_pr:', all_uid_pr
        pr = str(all_uid_pr[str(uid)])
        _node.addAttribute('pagerank', pr)
        #print 'pagarank_uid:', uid
        try:
            text_add = new_attribute_dict[uid][0][0] # 添加节点属性--text
            _node.addAttribute('text', json.dumps(text_add))
            reposts_count_add = new_attribute_dict[uid][0][1]
            _node.addAttribute('reposts_count', str(reposts_count_add)) # 添加节点属性--reposts_count
            comment_count_add = new_attribute_dict[uid][0][2]
            _node.addAttribute('comments_count', str(comment_count_add)) # 添加节点属性--comment_count
            attitude_count_add = new_attribute_dict[uid][0][3]
            if attitude_count_add == None:
                attitude_count_add = u'未知'
            _node.addAttribute('attitude_count', attitude_count_add) # 添加节点属性--attitude_count
        except KeyError:
            _node.addAttribute('text', u'未知')
            _node.addAttribute('reposts_count', u'未知')
            _node.addAttribute('comments_count', u'未知')
            _node.addAttribute('attitude_count', u'未知')
        user_info = acquire_user_by_id(uid) # 获取对应的用户信息,添加属性
        if user_info:
            _node.addAttribute('name', user_info['name'])
            _node.addAttribute('location', user_info['location'])
        else:
            _node.addAttribute('name', u'未知')
            _node.addAttribute('location', u'未知')
        #_node.addAttribute('timestamp', str(uid_ts[uid]))
    for edge in G.edges():
        start, end = edge # (repost_uid, source_uid)
        start_id = node_id[start]
        end_id = node_id[end]
        graph.addEdge(str(edge_counter), str(start_id), str(end_id))
        edge_counter += 1
    '''
    # Serialize both gexf documents to XML strings.
    return etree.tostring(gexf.getXML(), pretty_print=True, encoding='utf-8',
                          xml_declaration=True), etree.tostring(
        ds_gexf.getXML(), pretty_print=True, encoding='utf-8',
        xml_declaration=True)