start_time = datetime.now()
postgres_handle = PostgresHandle(smarttypes.connection_string)

if len(sys.argv) < 3:
    raise Exception('Need a twitter handle and distance.')
else:
    screen_name = sys.argv[1]
    distance = int(sys.argv[2])

root_user = TwitterUser.by_screen_name(screen_name, postgres_handle)
smarttypes.config.IS_PROD = False
if distance < 1:
    smarttypes.config.IS_PROD = True
    distance = 10000 / len(root_user.following[:1000])

network = TwitterUser.get_rooted_network(root_user, postgres_handle, distance=distance)
g = get_igraph_graph(network)
layout_list = reduce_with_linloglayout(g, root_user)

#id_communities
g, community_idx_list, vertex_clustering = id_communities(g, layout_list, eps=0.62, min_samples=12)

#set color based on communities
color_array = np.array(community_idx_list)
color_array = color_array / (max(color_array) / 31)
g.vs['color'] = ['rgb(%s, %s, %s)' % (int(x) * 8, int(x) * 8, int(x) * 8) for x in color_array]
g.vs[g.vs.find(root_user.id).index]['color'] = 'rgb(255,0,0)'
#g.vs['shape'] = ['hidden' if x == 0 else 'circle' for x in community_idx_list]

#community_stats
community_stats = get_community_stats(network, g, vertex_clustering, layout_list)
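
# A minimal, self-contained sketch of the grayscale community coloring above, using
# hypothetical community indices (the real ones come from id_communities); indices are
# scaled so the largest lands near 31, then multiplied by 8 to span roughly 0-248.
import numpy as np

example_community_idx_list = [0, 1, 1, 2, 3]  # hypothetical: 5 vertices, 4 communities
example_color_array = np.array(example_community_idx_list)
example_color_array = example_color_array / (max(example_color_array) / 31.0)
example_colors = ['rgb(%s, %s, %s)' % (int(x) * 8, int(x) * 8, int(x) * 8)
                  for x in example_color_array]
print(example_colors)
# ['rgb(0, 0, 0)', 'rgb(80, 80, 80)', 'rgb(80, 80, 80)', 'rgb(160, 160, 160)', 'rgb(248, 248, 248)']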
def reduce_and_save_communities(root_user, distance=10, return_graph_for_inspection=False):
    print 'starting reduce_and_save_communities'
    print 'root_user: %s, following_in_our_db: %s, distance: %s' % (
        root_user.screen_name, len(root_user.following), distance)
    network = TwitterUser.get_rooted_network(root_user, postgres_handle, distance=distance)

    print 'load %s users into igraph' % len(network)
    g = Graph(directed=True)
    keys_set = set(network.keys())
    g.add_vertices(network.keys())
    g.vs["id"] = network.keys()  #need this for pajek format

    print 'iterative load into igraph'
    edges = []
    for source in network:
        for target in network[source].intersection(keys_set):
            edges.append((source, target))
    g.add_edges(edges)
    g = g.simplify()

    print 'make sure graph is connected'
    connected_clusters = g.clusters()
    connected_cluster_lengths = [len(x) for x in connected_clusters]
    connected_cluster_max_idx = connected_cluster_lengths.index(max(connected_cluster_lengths))
    g = connected_clusters.subgraph(connected_cluster_max_idx)
    if g.is_connected():
        print 'graph is connected'
    else:
        print 'graph is not connected'

    if return_graph_for_inspection:
        return g

    print 'write to pajek format'
    root_file_name = root_user.screen_name
    f = open('io/%s.net' % root_file_name, 'w')
    g.write(f, format='pajek')

    print 'run infomap'
    #infomap_command = 'infomap_dir/infomap 345234 io/%s.net 10'
    #infomap_command = 'conf-infomap_dir/conf-infomap 344 io/%s.net 10 10 0.50'
    infomap_command = 'infohiermap_dir/infohiermap 345234 io/%s.net 30'
    os.system(infomap_command % root_file_name)

    print 'read into memory'
    f = open('io/%s.smap' % root_file_name)
    section_header = ''
    communities = defaultdict(lambda: ([], [], []))
    for line in f:
        if line.startswith('*Modules'):
            section_header = 'Modules'
            continue
        if line.startswith('*Insignificants'):
            section_header = 'Insignificants'
            continue
        if line.startswith('*Nodes'):
            section_header = 'Nodes'
            continue
        if line.startswith('*Links'):
            section_header = 'Links'
            continue
        if section_header == 'Modules':
            #looks like this:
            #1 "26000689,..." 0.130147 0.0308866
            #The names under *Modules are derived from the node with the highest
            #flow volume within the module, and 0.25 0.0395432 represent, respectively,
            #the aggregated flow volume of all nodes within the module and the per
            #step exit flow from the module.
            continue
        if section_header == 'Nodes':
            #looks like this:
            #1:10 "2335431" 0.00365772
            #or w/ a semicolon instead, semicolon means not significant
            #see http://www.tp.umu.se/~rosvall/code.html
            if ';' in line:
                continue
            community_idx = line.split(':')[0]
            node_id = line.split('"')[1]
            final_volume = float(line.split(' ')[2])
            communities[community_idx][1].append(node_id)
            communities[community_idx][2].append(final_volume)
        if section_header == 'Links':
            #community_edges
            #looks like this:
            #1 4 0.0395432
            community_idx = line.split(' ')[0]
            target_community_idx = line.split(' ')[1]
            edge_weight = line.split(' ')[2]
            communities[community_idx][0].append('%s:%s' % (target_community_idx, edge_weight))
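
# The connectivity step above keeps only the largest cluster before writing pajek.
# A minimal sketch of that idea on a toy directed graph (the toy vertices and edges
# are made up; g.clusters() on a directed graph uses weak connectivity by default):
from igraph import Graph

toy = Graph(directed=True)
toy.add_vertices(5)
toy.add_edges([(0, 1), (1, 2), (3, 4)])  # two weakly connected components

toy_clusters = toy.clusters()
toy_sizes = [len(c) for c in toy_clusters]
largest = toy_clusters.subgraph(toy_sizes.index(max(toy_sizes)))
print(largest.vcount())  # 3 -- only vertices 0, 1, 2 survive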
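
# A self-contained sketch of the *Nodes/*Links parsing above, run against an inline,
# made-up two-module .smap snippet instead of real infohiermap output; it builds the
# same structure: community_idx -> (outgoing edge strings, node ids, flow volumes).
from collections import defaultdict

example_smap = """*Modules 2
1 "26000689,..." 0.130147 0.0308866
2 "813286,..." 0.0951231 0.0102113
*Nodes 3
1:1 "26000689" 0.0132731
1:2 "2335431" 0.00365772
2:1 "813286" 0.00921312
*Links 1
1 2 0.0395432
""".splitlines()

example_communities = defaultdict(lambda: ([], [], []))
section_header = ''
for line in example_smap:
    if line.startswith('*Modules'):
        section_header = 'Modules'
        continue
    if line.startswith('*Nodes'):
        section_header = 'Nodes'
        continue
    if line.startswith('*Links'):
        section_header = 'Links'
        continue
    if section_header == 'Nodes' and ';' not in line:
        example_communities[line.split(':')[0]][1].append(line.split('"')[1])
        example_communities[line.split(':')[0]][2].append(float(line.split(' ')[2]))
    if section_header == 'Links':
        parts = line.split(' ')
        example_communities[parts[0]][0].append('%s:%s' % (parts[1], parts[2]))

print(dict(example_communities))
# {'1': (['2:0.0395432'], ['26000689', '2335431'], [0.0132731, 0.00365772]),
#  '2': ([], ['813286'], [0.00921312])}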
start_time = datetime.now()
postgres_handle = PostgresHandle(smarttypes.connection_string)

if not len(sys.argv) > 1:
    raise Exception('Need a twitter handle.')
else:
    screen_name = sys.argv[1]

if smarttypes.config.IS_PROD:
    start_here = datetime.now()
else:
    start_here = datetime(2012, 8, 1)

root_user = TwitterUser.by_screen_name(screen_name, postgres_handle)
distance = 45000 / len(root_user.following[:5000])
#distance = 0
network = TwitterUser.get_rooted_network(root_user, postgres_handle, start_here=start_here,
                                          distance=distance, go_back_this_many_weeks=15)

print "writing %s nodes to disk" % len(network)
g = reduce_graph.get_igraph_graph(network)
lang_names = []
loc_names = []
for node_id in g.vs['name']:
    user = TwitterUser.get_by_id(node_id, postgres_handle)
    lang_names.append(user.lang.encode('ascii', 'ignore'))
    loc_names.append(user.location_name.encode('ascii', 'ignore'))
g.vs['lang_name'] = lang_names
g.vs['loc_name'] = loc_names
reduce_graph.write_to_graphml_file(root_user, g, network)

# print "mk_user_csv took %s to execute" % (datetime.now() - start_time)
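
# write_to_graphml_file lives in reduce_graph and isn't shown here; a minimal sketch of
# a GraphML export with per-vertex attributes in python-igraph, using made-up vertex
# data (the real helper's signature, attributes, and output path are assumptions):
from igraph import Graph

toy = Graph(directed=True)
toy.add_vertices(3)
toy.vs['name'] = ['1001', '1002', '1003']        # hypothetical twitter user ids
toy.vs['lang_name'] = ['en', 'en', 'pt']
toy.vs['loc_name'] = ['Austin', 'NYC', 'Sao Paulo']
toy.add_edges([(0, 1), (1, 2)])

# igraph serializes vertex and edge attributes as GraphML <key>/<data> entries
toy.write_graphml('example.graphml')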