Exemple #1
0
    # Script body (the enclosing header is outside this excerpt): loads the
    # network rooted at a twitter user, lays it out, identifies communities and
    # colors every vertex by its community.
    # Command line: <twitter handle> <distance>
    start_time = datetime.now()
    postgres_handle = PostgresHandle(smarttypes.connection_string)

    if len(sys.argv) < 3:
        raise Exception('Need a twitter handle and distance.')
    else:
        screen_name = sys.argv[1]
        distance = int(sys.argv[2])
    root_user = TwitterUser.by_screen_name(screen_name, postgres_handle)

    # a distance below 1 turns IS_PROD on and derives the distance from the
    # number of followed accounts (at most 1000 of them are counted)
    # NOTE(review): this division raises ZeroDivisionError when
    # root_user.following is empty -- confirm whether that can happen
    smarttypes.config.IS_PROD = False
    if distance < 1:
        smarttypes.config.IS_PROD = True
        distance = 10000 / len(root_user.following[:1000])

    network = TwitterUser.get_rooted_network(root_user, postgres_handle, distance=distance)
    g = get_igraph_graph(network)
    layout_list = reduce_with_linloglayout(g, root_user)

    #id_communities
    g, community_idx_list, vertex_clustering = id_communities(g, layout_list, eps=0.62, min_samples=12)

    #set color based on communities
    #community indexes are scaled onto 0..31 so that `* 8` stays within 0..248
    color_array = np.array(community_idx_list)
    max_community_idx = max(color_array)
    if max_community_idx > 0:
        #scale in floating point and multiply first: with integer division the
        #old divisor `max / 31` was 0 for fewer than 31 communities and let
        #values overshoot 255 when max was not a multiple of 31
        color_array = color_array * 31.0 / max_community_idx
    #when the largest index is 0 there is nothing to scale (and nothing to divide by)
    g.vs['color'] = ['rgb(%s, %s, %s)' % (int(x) * 8, int(x) * 8, int(x) * 8) for x in color_array]
    #the root user's vertex is painted red
    g.vs[g.vs.find(root_user.id).index]['color'] = 'rgb(255,0,0)'
    #g.vs['shape'] = ['hidden' if x == 0 else 'circle' for x in community_idx_list]

    #community_stats
    community_stats = get_community_stats(network, g, vertex_clustering, layout_list)
def reduce_and_save_communities(root_user, distance=10, return_graph_for_inspection=False):
    """Build the network graph rooted at root_user, run infohiermap on it and parse the output.

    The network returned by TwitterUser.get_rooted_network is loaded into a
    directed igraph Graph, simplified, and reduced to its largest cluster.
    That graph is written to io/<screen_name>.net in pajek format, the
    external infohiermap binary is run on the file through os.system, and
    io/<screen_name>.smap (expected to be produced by that run) is parsed
    into a dict of communities.

    Relies on a module-level ``postgres_handle``; it is not a parameter.

    Args:
        root_user: user the network is rooted at; its ``screen_name`` and
            ``following`` attributes are read here.
        distance: passed through to TwitterUser.get_rooted_network.
        return_graph_for_inspection: when true, return the graph right after
            the connectivity step, before anything is written to disk.

    Returns:
        The igraph graph when return_graph_for_inspection is true. Otherwise
        the portion of the function shown here returns nothing.
    """

    print('starting reduce_and_save_communities')
    print('root_user: %s,  following_in_our_db: %s, distance: %s' % (
        root_user.screen_name, len(root_user.following), distance))
    network = TwitterUser.get_rooted_network(root_user, postgres_handle, distance=distance)

    print('load %s users into igraph' % len(network))
    g = Graph(directed=True)
    keys_set = set(network.keys())
    g.add_vertices(network.keys())
    g.vs["id"] = network.keys() #need this for pajek format
    print('iterative load into igraph')
    edges = []
    for source in network:
        #only keep edges whose target is itself a node of the network
        for target in network[source].intersection(keys_set):
            edges.append((source, target))
    g.add_edges(edges)
    g = g.simplify()
    print('make sure graph is connected')
    #keep only the largest cluster
    connected_clusters = g.clusters()
    connected_cluster_lengths = [len(x) for x in connected_clusters]
    connected_cluster_max_idx = connected_cluster_lengths.index(max(connected_cluster_lengths))
    g = connected_clusters.subgraph(connected_cluster_max_idx)
    if g.is_connected():
        print('graph is connected')
    else:
        print('graph is not connected')

    if return_graph_for_inspection:
        return g

    print('write to pajek format')
    root_file_name = root_user.screen_name
    #the file has to be closed (and thereby flushed) before infohiermap reads it
    with open('io/%s.net' % root_file_name, 'w') as f:
        g.write(f, format='pajek')

    print('run infomap')
    #infomap_command = 'infomap_dir/infomap 345234 io/%s.net 10'
    #infomap_command = 'conf-infomap_dir/conf-infomap 344 io/%s.net 10 10 0.50'
    infomap_command = 'infohiermap_dir/infohiermap 345234 io/%s.net 30'
    os.system(infomap_command % root_file_name)

    print('read into memory')
    section_header = ''
    #community_idx -> ([edges as 'target_idx:weight'], [node ids], [node volumes])
    communities = defaultdict(lambda: ([], [], []))
    with open('io/%s.smap' % root_file_name) as f:
        for line in f:
            #a line starting with one of these markers opens a new section
            if line.startswith('*Modules'):
                section_header = 'Modules'
                continue
            if line.startswith('*Insignificants'):
                section_header = 'Insignificants'
                continue
            if line.startswith('*Nodes'):
                section_header = 'Nodes'
                continue
            if line.startswith('*Links'):
                section_header = 'Links'
                continue

            if section_header == 'Modules':
                #looks like this:
                #1 "26000689,..." 0.130147 0.0308866
                #The names under *Modules are derived from the node with the highest
                #flow volume within the module, and 0.25 0.0395432 represent, respectively,
                #the aggregated flow volume of all nodes within the module and the per
                #step exit flow from the module.
                continue

            if section_header == 'Nodes':
                #looks like this:
                #1:10 "2335431" 0.00365772
                #or w/ a semicolon instead, semicolon means not significant
                #see http://www.tp.umu.se/~rosvall/code.html
                if ';' in line:
                    continue
                community_idx = line.split(':')[0]
                node_id = line.split('"')[1]
                final_volume = float(line.split(' ')[2])
                communities[community_idx][1].append(node_id)
                communities[community_idx][2].append(final_volume)

            if section_header == 'Links':
                #community_edges
                #looks like this:
                #1 4 0.0395432
                #split() without an argument also drops the trailing newline,
                #which split(' ') used to leave attached to the edge weight
                link_fields = line.split()
                community_idx = link_fields[0]
                target_community_idx = link_fields[1]
                edge_weight = link_fields[2]
                communities[community_idx][0].append('%s:%s' % (target_community_idx, edge_weight))
Exemple #3
0
def reduce_and_save_communities(root_user,
                                distance=10,
                                return_graph_for_inspection=False):
    """Build the network graph rooted at root_user, run infohiermap on it and parse the output.

    The network returned by TwitterUser.get_rooted_network is loaded into a
    directed igraph Graph, simplified, and reduced to its largest cluster.
    That graph is written to io/<screen_name>.net in pajek format, the
    external infohiermap binary is run on the file through os.system, and
    io/<screen_name>.smap (expected to be produced by that run) is parsed
    into a dict of communities.

    Relies on a module-level ``postgres_handle``; it is not a parameter.

    Args:
        root_user: user the network is rooted at; its ``screen_name`` and
            ``following`` attributes are read here.
        distance: passed through to TwitterUser.get_rooted_network.
        return_graph_for_inspection: when true, return the graph right after
            the connectivity step, before anything is written to disk.

    Returns:
        The igraph graph when return_graph_for_inspection is true. Otherwise
        the portion of the function shown here returns nothing.
    """

    print('starting reduce_and_save_communities')
    print('root_user: %s,  following_in_our_db: %s, distance: %s' % (
        root_user.screen_name, len(root_user.following), distance))
    network = TwitterUser.get_rooted_network(root_user,
                                             postgres_handle,
                                             distance=distance)

    print('load %s users into igraph' % len(network))
    g = Graph(directed=True)
    keys_set = set(network.keys())
    g.add_vertices(network.keys())
    g.vs["id"] = network.keys()  #need this for pajek format
    print('iterative load into igraph')
    edges = []
    for source in network:
        #only keep edges whose target is itself a node of the network
        for target in network[source].intersection(keys_set):
            edges.append((source, target))
    g.add_edges(edges)
    g = g.simplify()
    print('make sure graph is connected')
    #keep only the largest cluster
    connected_clusters = g.clusters()
    connected_cluster_lengths = [len(x) for x in connected_clusters]
    connected_cluster_max_idx = connected_cluster_lengths.index(
        max(connected_cluster_lengths))
    g = connected_clusters.subgraph(connected_cluster_max_idx)
    if g.is_connected():
        print('graph is connected')
    else:
        print('graph is not connected')

    if return_graph_for_inspection:
        return g

    print('write to pajek format')
    root_file_name = root_user.screen_name
    #the file has to be closed (and thereby flushed) before infohiermap reads it
    with open('io/%s.net' % root_file_name, 'w') as f:
        g.write(f, format='pajek')

    print('run infomap')
    #infomap_command = 'infomap_dir/infomap 345234 io/%s.net 10'
    #infomap_command = 'conf-infomap_dir/conf-infomap 344 io/%s.net 10 10 0.50'
    infomap_command = 'infohiermap_dir/infohiermap 345234 io/%s.net 30'
    os.system(infomap_command % root_file_name)

    print('read into memory')
    section_header = ''
    #community_idx -> ([edges as 'target_idx:weight'], [node ids], [node volumes])
    communities = defaultdict(lambda: ([], [], []))
    with open('io/%s.smap' % root_file_name) as f:
        for line in f:
            #a line starting with one of these markers opens a new section
            if line.startswith('*Modules'):
                section_header = 'Modules'
                continue
            if line.startswith('*Insignificants'):
                section_header = 'Insignificants'
                continue
            if line.startswith('*Nodes'):
                section_header = 'Nodes'
                continue
            if line.startswith('*Links'):
                section_header = 'Links'
                continue

            if section_header == 'Modules':
                #looks like this:
                #1 "26000689,..." 0.130147 0.0308866
                #The names under *Modules are derived from the node with the highest
                #flow volume within the module, and 0.25 0.0395432 represent, respectively,
                #the aggregated flow volume of all nodes within the module and the per
                #step exit flow from the module.
                continue

            if section_header == 'Nodes':
                #looks like this:
                #1:10 "2335431" 0.00365772
                #or w/ a semicolon instead, semicolon means not significant
                #see http://www.tp.umu.se/~rosvall/code.html
                if ';' in line:
                    continue
                community_idx = line.split(':')[0]
                node_id = line.split('"')[1]
                final_volume = float(line.split(' ')[2])
                communities[community_idx][1].append(node_id)
                communities[community_idx][2].append(final_volume)

            if section_header == 'Links':
                #community_edges
                #looks like this:
                #1 4 0.0395432
                #split() without an argument also drops the trailing newline,
                #which split(' ') used to leave attached to the edge weight
                link_fields = line.split()
                community_idx = link_fields[0]
                target_community_idx = link_fields[1]
                edge_weight = link_fields[2]
                communities[community_idx][0].append(
                    '%s:%s' % (target_community_idx, edge_weight))
    # Script body (the enclosing header is outside this excerpt): loads the
    # network rooted at one twitter user, attaches a language name and a
    # location name to every vertex, and hands the graph to
    # reduce_graph.write_to_graphml_file.
    start_time = datetime.now()
    postgres_handle = PostgresHandle(smarttypes.connection_string)

    # the twitter handle is the only required command-line argument
    if len(sys.argv) <= 1:
        raise Exception('Need a twitter handle.')
    screen_name = sys.argv[1]

    # IS_PROD runs start from the current time, anything else from a fixed date
    start_here = datetime.now() if smarttypes.config.IS_PROD else datetime(2012, 8, 1)

    root_user = TwitterUser.by_screen_name(screen_name, postgres_handle)
    # at most 5000 followed accounts are counted when deriving the distance
    followed_count = len(root_user.following[:5000])
    distance = 45000 / followed_count
    #distance = 0
    network = TwitterUser.get_rooted_network(
        root_user,
        postgres_handle,
        start_here=start_here,
        distance=distance,
        go_back_this_many_weeks=15)
    print("writing %s nodes to disk" % len(network))
    g = reduce_graph.get_igraph_graph(network)

    # one user lookup per vertex; both names are encoded to ascii with
    # encoding errors ignored
    name_pairs = []
    for node_id in g.vs['name']:
        user = TwitterUser.get_by_id(node_id, postgres_handle)
        name_pairs.append((user.lang.encode('ascii', 'ignore'),
                           user.location_name.encode('ascii', 'ignore')))
    lang_names = [pair[0] for pair in name_pairs]
    loc_names = [pair[1] for pair in name_pairs]
    g.vs['lang_name'] = lang_names
    g.vs['loc_name'] = loc_names
    reduce_graph.write_to_graphml_file(root_user, g, network)
    # print "mk_user_csv took %s to execute" % (datetime.now() - start_time)