Exemple #1
0
def write_tmp_file(tmp_file, ds_g, ds_N):
    """Write one ``node<TAB>value`` line per node of ``ds_g`` into ``tmp_file``.

    Every value starts with ``tr_results,<1.0 / ds_N>,<ds_N>``.  When the node
    has out-edges, the strings returned by ``compute_s2`` are appended as
    further comma-separated fields (the original note describes them as
    ``str(uid1),str(i(uid1,v)),str(uid2),str(i(uid2,v))`` -- TODO confirm
    against ``compute_s2``).  Each written pair is also echoed to stdout.

    NOTE(review): the original author flagged this function for a rewrite:
    the ``(node, outlinks)`` pairs should be built from the set of nodes
    reachable from ``node``.

    Args:
        tmp_file: writable file-like object; it is flushed but not closed.
        ds_g: directed graph exposing ``nodes()`` and
            ``out_edges(nbunch=...)`` (presumably a networkx ``DiGraph`` --
            verify against caller).
        ds_N: divisor for the ``1.0 / ds_N`` field (presumably the node
            count); must be non-zero whenever ``ds_g`` has nodes.

    Returns:
        The same ``tmp_file`` object, after flushing.

    Raises:
        ZeroDivisionError: if ``ds_N`` is 0 and ``ds_g`` has at least one node.
    """
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))

    for node in ds_g.nodes():
        # Out-edges of ``node``; per the original note these lead to the
        # node's direct superior users.
        outlinks = ds_g.out_edges(nbunch=[node])

        # Built per node on purpose: an empty graph with ds_N == 0 must not
        # trigger a division by zero.
        value = 'tr_results,%s,%s' % (1.0 / ds_N, ds_N)
        if outlinks:
            outlinks_list = compute_s2(node, outlinks, ds_new_attribute_dict)
            # The separator is added even when the list is empty, which keeps
            # the trailing comma the original format produced in that case.
            value += ',' + ','.join(outlinks_list)

        tmp_file.write('%s\t%s\n' % (node, value))
        # Single pre-formatted argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('(node,value): %s %s' % (node, value))

    tmp_file.flush()
    return tmp_file
Exemple #2
0
def write_tmp_file(tmp_file, ds_g, ds_N):
    """Write one ``node<TAB>value`` line per node of ``ds_g`` into ``tmp_file``.

    Every value starts with ``tr_results,<1.0 / ds_N>,<ds_N>``.  When the node
    has out-edges, the strings returned by ``compute_s2`` are appended as
    further comma-separated fields (the original note describes them as
    ``str(uid1),str(i(uid1,v)),str(uid2),str(i(uid2,v))`` -- TODO confirm
    against ``compute_s2``).  Each written pair is also echoed to stdout.

    NOTE(review): the original author flagged this function for a rewrite:
    the ``(node, outlinks)`` pairs should be built from the set of nodes
    reachable from ``node``.

    Args:
        tmp_file: writable file-like object; it is flushed but not closed.
        ds_g: directed graph exposing ``nodes()`` and
            ``out_edges(nbunch=...)`` (presumably a networkx ``DiGraph`` --
            verify against caller).
        ds_N: divisor for the ``1.0 / ds_N`` field (presumably the node
            count); must be non-zero whenever ``ds_g`` has nodes.

    Returns:
        The same ``tmp_file`` object, after flushing.

    Raises:
        ZeroDivisionError: if ``ds_N`` is 0 and ``ds_g`` has at least one node.
    """
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))

    for node in ds_g.nodes():
        # Out-edges of ``node``; per the original note these lead to the
        # node's direct superior users.
        outlinks = ds_g.out_edges(nbunch=[node])

        # Built per node on purpose: an empty graph with ds_N == 0 must not
        # trigger a division by zero.
        value = 'tr_results,%s,%s' % (1.0 / ds_N, ds_N)
        if outlinks:
            outlinks_list = compute_s2(node, outlinks, ds_new_attribute_dict)
            # The separator is added even when the list is empty, which keeps
            # the trailing comma the original format produced in that case.
            value += ',' + ','.join(outlinks_list)

        tmp_file.write('%s\t%s\n' % (node, value))
        # Single pre-formatted argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print('(node,value): %s %s' % (node, value))

    tmp_file.flush()
    return tmp_file
Exemple #3
0
def make_network_graph(current_date,
                       topic_id,
                       topic,
                       window_size,
                       all_uid_pr,
                       pr_data,
                       ds_all_uid_pr,
                       ds_pr_data,
                       real_topic_id,
                       key_user_labeled=True):
    """Build GEXF documents for the source and direct-superior repost networks.

    Loads the four stored graphs for ``real_topic_id`` / ``current_date`` /
    ``window_size``, removes self-loops, prunes them with ``cut_network``,
    snowball-samples them, runs ``compute_quota`` on the samples and renders
    the two sampled directed graphs through ``make_gexf`` / ``make_ds_gexf``.
    Progress messages are printed to stdout along the way.

    Args:
        current_date: date component of the graph file key; also forwarded to
            the key-user readers and ``compute_quota``.
        topic_id: not used by this function; kept for caller compatibility.
        topic: forwarded to the key-user readers, ``SnowballSampling`` and
            ``compute_quota``.
        window_size: window component of the graph file key; also forwarded
            to the key-user readers and ``compute_quota``.
        all_uid_pr: forwarded to ``compute_quota`` and ``make_gexf`` for the
            source network (presumably a uid -> PageRank mapping -- verify).
        pr_data: forwarded to ``make_gexf``.
        ds_all_uid_pr: direct-superior counterpart of ``all_uid_pr``.
        ds_pr_data: direct-superior counterpart of ``pr_data``.
        real_topic_id: topic component of the graph file key.
        key_user_labeled: when true, load the ``top_n=10`` key users of both
            networks; otherwise both key-user lists are empty.

    Returns:
        ``(source_xml, ds_xml)``: the two GEXF documents serialized by
        ``etree.tostring`` (pretty-printed, UTF-8, with XML declaration).
        NOTE(review): when either directed graph has no nodes the function
        returns the empty string ``''`` instead of a pair, so callers must
        check the result before unpacking it.
    """
    # Per the original note: key_users are the top-10 PageRank users of the
    # source repost network, ds_key_users those of the direct-superior
    # repost network.
    if key_user_labeled:
        key_users = read_key_users(current_date, window_size, topic, top_n=10)
        ds_key_users = ds_read_key_users(current_date,
                                         window_size,
                                         topic,
                                         top_n=10)
    else:
        key_users = []
        ds_key_users = []

    # Read the graph structures; the attribute dicts come from
    # read_attribute_dict (a database lookup, per the original note).
    key = str(real_topic_id) + '_' + str(current_date) + '_' + str(window_size)
    graph_prefix = str(GRAPH_PATH) + key
    G = nx.read_gexf(graph_prefix + '_g_graph.gexf')
    gg = nx.read_gexf(graph_prefix + '_gg_graph.gexf')
    ds_dg = nx.read_gexf(graph_prefix + '_ds_dg_graph.gexf')
    ds_udg = nx.read_gexf(graph_prefix + '_ds_udg_graph.gexf')

    new_attribute_dict = json.loads(read_attribute_dict('g'))
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))

    # community detection, http://perso.crans.org/aynaud/communities/,
    # undirected graph
    import community

    N = len(G)
    print('len_G_N: %s' % N)
    ds_N = len(ds_dg)
    print('len_ds_dg_N: %s' % ds_N)
    if (not N) or (not ds_N):
        return ''

    # Degrees are taken before self-loop removal and pruning; they are later
    # handed to make_gexf / make_ds_gexf.
    node_degree = nx.degree(G)
    ds_node_degree = nx.degree(ds_dg)
    G.remove_edges_from(G.selfloop_edges())
    gg.remove_edges_from(gg.selfloop_edges())
    ds_dg.remove_edges_from(ds_dg.selfloop_edges())
    ds_udg.remove_edges_from(ds_udg.selfloop_edges())

    # Prune by degree with the module-level cut_degree threshold (original
    # note: keep the nodes whose degree is >= 1).
    G = cut_network(G, nx.degree(G), cut_degree)
    gg = cut_network(gg, nx.degree(gg), cut_degree)
    ds_dg = cut_network(ds_dg, nx.degree(ds_dg), cut_degree)
    ds_udg = cut_network(ds_udg, nx.degree(ds_udg), cut_degree)

    print('after cut_network:')
    print('len(G): %s' % len(G))
    print('len(ds_dg): %s' % len(ds_dg))

    # The undirected graphs are read from disk a second time so that the
    # community partition is computed on the stored graphs, without the
    # self-loop removal and pruning applied above.
    p_gg = nx.read_gexf(graph_prefix + '_gg_graph.gexf')
    p_ds_udg = nx.read_gexf(graph_prefix + '_ds_udg_graph.gexf')
    partition = community.best_partition(p_gg)
    # Community partition of the direct-superior repost network.
    ds_partition = community.best_partition(p_ds_udg)

    print('start snowball sampling')
    new_G, new_gg = SnowballSampling(G, gg, topic, network_type)
    ds_new_G, ds_new_gg = SnowballSampling(ds_dg, ds_udg, topic,
                                           ds_network_type)
    print('sampling complicated')

    # Local-bridge detection (GetLocalBridge on gg and ds_udg) is disabled
    # here until that algorithm is made more efficient.

    print('start computing quota')
    # To bypass sampling, the unsampled graphs (G, gg, ds_dg, ds_udg) could
    # be passed to compute_quota instead of the sampled ones.
    compute_quota(new_G, new_gg, current_date, window_size, topic, all_uid_pr,
                  network_type)
    compute_quota(ds_new_G, ds_new_gg, current_date, window_size, topic,
                  ds_all_uid_pr, ds_network_type)
    print('quota computed complicated')

    # Generate the gexf documents. The generation lives in make_gexf /
    # make_ds_gexf, which take the relevant parameters, so both networks are
    # rendered the same way; the former inline Gexf builder was removed.
    gexf = make_gexf('hxq', 'Source Network', new_G, node_degree, key_users,
                     all_uid_pr, pr_data, partition, new_attribute_dict)
    ds_gexf = make_ds_gexf('hxq', 'Direct Superior Network', ds_new_G,
                           ds_node_degree, ds_key_users, ds_all_uid_pr,
                           ds_pr_data, ds_partition, ds_new_attribute_dict)

    # Serialize both documents to strings.
    source_xml = etree.tostring(gexf.getXML(),
                                pretty_print=True,
                                encoding='utf-8',
                                xml_declaration=True)
    ds_xml = etree.tostring(ds_gexf.getXML(),
                            pretty_print=True,
                            encoding='utf-8',
                            xml_declaration=True)
    return source_xml, ds_xml
Exemple #4
0
def make_network_graph(current_date,
                       topic_id,
                       topic,
                       window_size,
                       all_uid_pr,
                       pr_data,
                       ds_all_uid_pr,
                       ds_pr_data,
                       real_topic_id,
                       key_user_labeled=True):
    """Build GEXF documents for the source and direct-superior repost networks.

    Loads the four stored graphs for ``real_topic_id`` / ``current_date`` /
    ``window_size``, removes self-loops, prunes them with ``cut_network``,
    snowball-samples them, runs ``compute_quota`` on the samples and renders
    the two sampled directed graphs through ``make_gexf`` / ``make_ds_gexf``.
    Progress messages are printed to stdout along the way.

    Args:
        current_date: date component of the graph file key; also forwarded to
            the key-user readers and ``compute_quota``.
        topic_id: not used by this function; kept for caller compatibility.
        topic: forwarded to the key-user readers, ``SnowballSampling`` and
            ``compute_quota``.
        window_size: window component of the graph file key; also forwarded
            to the key-user readers and ``compute_quota``.
        all_uid_pr: forwarded to ``compute_quota`` and ``make_gexf`` for the
            source network (presumably a uid -> PageRank mapping -- verify).
        pr_data: forwarded to ``make_gexf``.
        ds_all_uid_pr: direct-superior counterpart of ``all_uid_pr``.
        ds_pr_data: direct-superior counterpart of ``pr_data``.
        real_topic_id: topic component of the graph file key.
        key_user_labeled: when true, load the ``top_n=10`` key users of both
            networks; otherwise both key-user lists are empty.

    Returns:
        ``(source_xml, ds_xml)``: the two GEXF documents serialized by
        ``etree.tostring`` (pretty-printed, UTF-8, with XML declaration).
        NOTE(review): when either directed graph has no nodes the function
        returns the empty string ``''`` instead of a pair, so callers must
        check the result before unpacking it.
    """
    # Per the original note: key_users are the top-10 PageRank users of the
    # source repost network, ds_key_users those of the direct-superior
    # repost network.
    if key_user_labeled:
        key_users = read_key_users(current_date, window_size, topic, top_n=10)
        ds_key_users = ds_read_key_users(current_date,
                                         window_size,
                                         topic,
                                         top_n=10)
    else:
        key_users = []
        ds_key_users = []

    # Read the graph structures; the attribute dicts come from
    # read_attribute_dict (a database lookup, per the original note).
    key = str(real_topic_id) + '_' + str(current_date) + '_' + str(window_size)
    graph_prefix = str(GRAPH_PATH) + key
    G = nx.read_gexf(graph_prefix + '_g_graph.gexf')
    gg = nx.read_gexf(graph_prefix + '_gg_graph.gexf')
    ds_dg = nx.read_gexf(graph_prefix + '_ds_dg_graph.gexf')
    ds_udg = nx.read_gexf(graph_prefix + '_ds_udg_graph.gexf')

    new_attribute_dict = json.loads(read_attribute_dict('g'))
    ds_new_attribute_dict = json.loads(read_attribute_dict('ds_g'))

    # community detection, http://perso.crans.org/aynaud/communities/,
    # undirected graph
    import community

    N = len(G)
    print('len_G_N: %s' % N)
    ds_N = len(ds_dg)
    print('len_ds_dg_N: %s' % ds_N)
    if (not N) or (not ds_N):
        return ''

    # Degrees are taken before self-loop removal and pruning; they are later
    # handed to make_gexf / make_ds_gexf.
    node_degree = nx.degree(G)
    ds_node_degree = nx.degree(ds_dg)
    G.remove_edges_from(G.selfloop_edges())
    gg.remove_edges_from(gg.selfloop_edges())
    ds_dg.remove_edges_from(ds_dg.selfloop_edges())
    ds_udg.remove_edges_from(ds_udg.selfloop_edges())

    # Prune by degree with the module-level cut_degree threshold (original
    # note: keep the nodes whose degree is >= 1).
    G = cut_network(G, nx.degree(G), cut_degree)
    gg = cut_network(gg, nx.degree(gg), cut_degree)
    ds_dg = cut_network(ds_dg, nx.degree(ds_dg), cut_degree)
    ds_udg = cut_network(ds_udg, nx.degree(ds_udg), cut_degree)

    print('after cut_network:')
    print('len(G): %s' % len(G))
    print('len(ds_dg): %s' % len(ds_dg))

    # The undirected graphs are read from disk a second time so that the
    # community partition is computed on the stored graphs, without the
    # self-loop removal and pruning applied above.
    p_gg = nx.read_gexf(graph_prefix + '_gg_graph.gexf')
    p_ds_udg = nx.read_gexf(graph_prefix + '_ds_udg_graph.gexf')
    partition = community.best_partition(p_gg)
    # Community partition of the direct-superior repost network.
    ds_partition = community.best_partition(p_ds_udg)

    print('start snowball sampling')
    new_G, new_gg = SnowballSampling(G, gg, topic, network_type)
    ds_new_G, ds_new_gg = SnowballSampling(ds_dg, ds_udg, topic,
                                           ds_network_type)
    print('sampling complicated')

    # Local-bridge detection (GetLocalBridge on gg and ds_udg) is disabled
    # here until that algorithm is made more efficient.

    print('start computing quota')
    # To bypass sampling, the unsampled graphs (G, gg, ds_dg, ds_udg) could
    # be passed to compute_quota instead of the sampled ones.
    compute_quota(new_G, new_gg, current_date, window_size, topic, all_uid_pr,
                  network_type)
    compute_quota(ds_new_G, ds_new_gg, current_date, window_size, topic,
                  ds_all_uid_pr, ds_network_type)
    print('quota computed complicated')

    # Generate the gexf documents. The generation lives in make_gexf /
    # make_ds_gexf, which take the relevant parameters, so both networks are
    # rendered the same way; the former inline Gexf builder was removed.
    gexf = make_gexf('hxq', 'Source Network', new_G, node_degree, key_users,
                     all_uid_pr, pr_data, partition, new_attribute_dict)
    ds_gexf = make_ds_gexf('hxq', 'Direct Superior Network', ds_new_G,
                           ds_node_degree, ds_key_users, ds_all_uid_pr,
                           ds_pr_data, ds_partition, ds_new_attribute_dict)

    # Serialize both documents to strings.
    source_xml = etree.tostring(gexf.getXML(),
                                pretty_print=True,
                                encoding='utf-8',
                                xml_declaration=True)
    ds_xml = etree.tostring(ds_gexf.getXML(),
                            pretty_print=True,
                            encoding='utf-8',
                            xml_declaration=True)
    return source_xml, ds_xml