def pagerank_rank(top_n, date, topic_id, window_size, topicname, real_topic_id): data = [] tmp_file, N, ds_tmp_file, ds_N = prepare_data_for_pr(topic_id, date, window_size, topicname, real_topic_id) top_n = N ds_top_n = ds_N print 'page_rank start' if not tmp_file or not ds_tmp_file: return data input_tmp_path = tmp_file.name ds_input_tmp_path = ds_tmp_file.name print input_tmp_path, ds_input_tmp_path iter_count = PAGERANK_ITER_MAX print 'pagerank_source_network' sorted_uids, all_uid_pr = pagerank(iter_count, input_tmp_path, top_n) # 排序的uid的序列 print 'pagerank_direct_superior_network' ds_sorted_uids, ds_all_uid_pr = pagerank(iter_count, ds_input_tmp_path, ds_top_n) print 'top_n:', top_n print 'len(sorted_uid):', len(ds_sorted_uids) print 'len(ds_all_uid_pr):', len(ds_all_uid_pr) print 'ds_top_n:', ds_top_n topicname = acquire_topic_name(topic_id) print 'topicname:', topicname if not topicname: return data print 'save_rank_results' data = save_rank_results(sorted_uids, 'topic', 'spark_pagerank', date, window_size, topicname, all_uid_pr) ds_data = save_ds_rank_results(ds_sorted_uids, 'topic', 'spark_pagerank', date, window_size, topicname, ds_all_uid_pr) return all_uid_pr, ds_all_uid_pr, data, ds_data
def prepare_data_for_pr(topic_id, date, window_size): tmp_file = tempfile.NamedTemporaryFile(delete=False) topic = acquire_topic_name(topic_id) if not topic: return None g = make_network(topic, date, window_size) if not g: return None N = len(g.nodes()) print 'topic network size %s' % N if not N: return None for node in g.nodes(): outlinks = g.out_edges(nbunch=[node]) outlinks = map(str, [n2 for n1, n2 in outlinks]) if not outlinks: value = 'pr_results,%s,%s' % (1.0/N, N) tmp_file.write('%s\t%s\n' % (node, value)) else: outlinks_str = ','.join(outlinks) value = 'pr_results,%s,%s,' % (1.0/N, N) value += outlinks_str tmp_file.write('%s\t%s\n' % (node, value)) tmp_file.flush() return tmp_file
def prepare_data_for_degree(topic_id, date, window_size): topic = acquire_topic_name(topic_id) if not topic: return None g = make_network(topic, date, window_size) if not g: return None N = len(g.nodes()) print 'topic network size %s' % N if not N: return None return g.degree()
def pagerank_rank(top_n, date, topic_id, window_size):
    """Rank a topic's users with PageRank and persist the top results.

    Returns the saved rank results, or an empty list when input data or
    the topic name is unavailable.
    """
    empty = []
    pr_input = prepare_data_for_pr(topic_id, date, window_size)
    if not pr_input:
        return empty
    job_id = generate_job_id(datetime2ts(date), window_size, topic_id)
    # Run the iterative PageRank job against the prepared input file.
    sorted_uids = pagerank(job_id, PAGERANK_ITER_MAX, pr_input.name, top_n)
    topicname = acquire_topic_name(topic_id)
    if not topicname:
        return empty
    return save_rank_results(sorted_uids, 'topic', 'pagerank', date, window_size, topicname)
def pagerank_rank(top_n, date, topic_id, window_size, topicname, real_topic_id): data = [] tmp_file, N, ds_tmp_file, ds_N = prepare_data_for_pr( topic_id, date, window_size, topicname, real_topic_id) top_n = N ds_top_n = ds_N print 'page_rank start' if not tmp_file or not ds_tmp_file: return data input_tmp_path = tmp_file.name ds_input_tmp_path = ds_tmp_file.name print input_tmp_path, ds_input_tmp_path iter_count = PAGERANK_ITER_MAX print 'pagerank_source_network' sorted_uids, all_uid_pr = pagerank(iter_count, input_tmp_path, top_n) # 排序的uid的序列 print 'pagerank_direct_superior_network' ds_sorted_uids, ds_all_uid_pr = pagerank(iter_count, ds_input_tmp_path, ds_top_n) print 'top_n:', top_n print 'len(sorted_uid):', len(ds_sorted_uids) print 'len(ds_all_uid_pr):', len(ds_all_uid_pr) print 'ds_top_n:', ds_top_n topicname = acquire_topic_name(topic_id) print 'topicname:', topicname if not topicname: return data print 'save_rank_results' data = save_rank_results(sorted_uids, 'topic', 'spark_pagerank', date, window_size, topicname, all_uid_pr) ds_data = save_ds_rank_results(ds_sorted_uids, 'topic', 'spark_pagerank', date, window_size, topicname, ds_all_uid_pr) return all_uid_pr, ds_all_uid_pr, data, ds_data
def degree_rank(top_n, date, topic_id, window_size):
    """Rank a topic's users by network degree and persist the top results.

    Returns the saved rank results, or an empty list when the degree data
    or the topic name is unavailable.
    """
    fallback = []
    degree = prepare_data_for_degree(topic_id, date, window_size)
    if not degree:
        return fallback
    # Highest-degree users first; take at most top_n of them.
    ranked = sorted(degree.iteritems(), key=operator.itemgetter(1), reverse=True)
    sorted_uids = [uid for uid, _ in ranked[:max(top_n, 0)]]
    topicname = acquire_topic_name(topic_id)
    if not topicname:
        return fallback
    return save_rank_results(sorted_uids, 'topic', 'degree', date, window_size, topicname)
def make_network_graph(current_date, topic_id, window_size, key_user_labeled=True):
    """Build the topic network and serialize it as a GEXF XML string.

    Key users (top 10, when key_user_labeled is True) are colored red;
    all other nodes are colored teal. Node size encodes degree.

    Returns the GEXF XML bytes, '' when the network has no nodes, or
    None when the topic name cannot be resolved.
    """
    date = current_date
    if key_user_labeled:
        key_users = read_key_users(current_date, window_size, topic_id, top_n=10)
    else:
        key_users = []
    topic = acquire_topic_name(topic_id)
    if not topic:
        return None
    # ts=True also returns a uid -> timestamp mapping alongside the graph.
    uid_ts, G = make_network(topic, date, window_size, ts=True)
    N = len(G.nodes())
    if not N:
        return ''
    node_degree = nx.degree(G)
    # Trim the graph before layout; presumably drops low-degree nodes —
    # TODO confirm against cut_network's definition.
    G = cut_network(G, node_degree)
    gexf = Gexf("Yang Han", "Topic Network")
    node_id = {}  # graph node -> sequential integer id used in the GEXF output
    graph = gexf.addGraph("directed", "static", "demp graph")
    graph.addNodeAttribute('name', type='string', force_id='name')
    graph.addNodeAttribute('location', type='string', force_id='location')
    graph.addNodeAttribute('timestamp', type='int', force_id='timestamp')
    # Force-directed layout provides the x/y coordinates written per node.
    pos = nx.spring_layout(G)
    node_counter = 0
    edge_counter = 0
    for node in G.nodes():
        x, y = pos[node]
        degree = node_degree[node]
        if node not in node_id:
            node_id[node] = node_counter
            node_counter += 1
        uid = node
        # Key users are drawn red (255,51,51); everyone else teal (0,204,204).
        if uid in key_users:
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='255', g='51', b='51', size=str(degree))
        else:
            _node = graph.addNode(node_id[node], str(node), x=str(x), y=str(y), z='0', r='0', g='204', b='204', size=str(degree))
        user_info = acquire_user_by_id('area', uid)
        if user_info:
            # Stored values are bytes; decode for the XML writer.
            _node.addAttribute('name', user_info['name'].decode('utf-8'))
            _node.addAttribute('location', user_info['location'].decode('utf-8'))
        else:
            _node.addAttribute('name', 'Unknown')
            _node.addAttribute('location', 'Unknown')
        _node.addAttribute('timestamp', str(uid_ts[uid]))
    # Emit edges using the integer ids assigned above.
    for edge in G.edges():
        start, end = edge
        start_id = node_id[start]
        end_id = node_id[end]
        graph.addEdge(str(edge_counter), str(start_id), str(end_id))
        edge_counter += 1
    return etree.tostring(gexf.getXML(), pretty_print=True, encoding='utf-8', xml_declaration=True)