import re


def get_links(x):
    """Expand one adjacency line into undirected (node, neighbour) pairs.

    Every run of word characters/apostrophes in ``x`` is parsed as an
    integer node id: the first id is the parent, the remaining ids are its
    children.  Each parent/child relation is emitted in both directions.

    Args:
        x: One raw text line, e.g. ``"1: 2 3"``.

    Returns:
        A list of ``(source, target)`` int tuples: every parent->child pair
        followed by every child->parent pair.  The list is empty when the
        line holds no tokens at all or only a parent id.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    tokens = re.findall(r"[\w']+", x)
    if not tokens:
        # A blank line used to raise IndexError on tokens[0] and abort the
        # whole job; it simply contributes no links.
        return []
    parent = int(tokens[0])
    children = [int(token) for token in tokens[1:]]
    parent_to_children = [(parent, child) for child in children]
    children_to_parent = [(child, parent) for child in children]
    return parent_to_children + children_to_parent


# One (node, neighbour) record per link direction, grouped by node.
all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()

# Remove non-unique links.  mapValues never touches the keys, so the
# partitioning created by groupByKey is retained for later stages.
node_then_all_links_expanded = node_then_all_links.mapValues(
    lambda neighbours: list(set(neighbours)))
network_rdd = node_then_all_links_expanded

### Run connected component code on network ###
from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged()

# Print number of connected components at the end.
# A single pre-formatted argument prints the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print('Number of unique groups: %s' % (connector.get_num_unique_groups(),))

# Get the connected component with the biggest number of nodes.
nodes_per_index = connector.get_number_of_nodes_per_index()
collected_nodes_per_index = nodes_per_index.collect()
# Second element of each collected record is used as the node count.
num_nodes = [entry[1] for entry in collected_nodes_per_index]
# max() replaces np.sort(...)[-1]: no full sort and no numpy needed.
print('Biggest group: %s' % (max(num_nodes),))
def get_links(x):
    """Expand one adjacency line into undirected (node, neighbour) pairs.

    Every run of word characters/apostrophes in ``x`` is parsed as an
    integer node id: the first id is the parent, the remaining ids are its
    children.  Each parent/child relation is emitted in both directions.

    Args:
        x: One raw text line, e.g. ``"1: 2 3"``.

    Returns:
        A list of ``(source, target)`` int tuples: every parent->child pair
        followed by every child->parent pair.  The list is empty when the
        line holds no tokens at all or only a parent id.

    Raises:
        ValueError: If a token is not a valid integer literal.
    """
    tokens = re.findall(r"[\w']+", x)
    if not tokens:
        # A blank line used to raise IndexError on tokens[0] and abort the
        # whole job; it simply contributes no links.
        return []
    parent = int(tokens[0])
    children = [int(token) for token in tokens[1:]]
    parent_to_children = [(parent, child) for child in children]
    children_to_parent = [(child, parent) for child in children]
    return parent_to_children + children_to_parent


# One (node, neighbour) record per link direction, grouped by node.
all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()

# Remove non-unique links.  mapValues never touches the keys, so the
# partitioning created by groupByKey is retained for later stages.
node_then_all_links_expanded = node_then_all_links.mapValues(
    lambda neighbours: list(set(neighbours)))
network_rdd = node_then_all_links_expanded

### Run connected component code on network ###
from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged()

# Print number of connected components at the end.
# A single pre-formatted argument prints the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print('Number of unique groups: %s' % (connector.get_num_unique_groups(),))

# Get the connected component with the biggest number of nodes.
nodes_per_index = connector.get_number_of_nodes_per_index()
collected_nodes_per_index = nodes_per_index.collect()
# Second element of each collected record is used as the node count.
num_nodes = [entry[1] for entry in collected_nodes_per_index]
# max() replaces np.sort(...)[-1]: no full sort and no numpy needed.
print('Biggest group: %s' % (max(num_nodes),))
# NOTE(review): RDD.join is an inner join, so a node missing from either input
# is dropped here -- confirm both inputs contain every node if isolated nodes
# are supposed to be counted as components.
joined_links = parent_child_links.join(child_then_parents_expanded)  # join copartitioned data!


def get_acceptable_links(x):
    '''Return links that are symmetric.

    Args:
        x: A joined record shaped ``(node, (list1, list2))``.  Presumably
            list1 holds the node's outgoing links and list2 its incoming
            ones -- verify against the code that builds the two RDDs.

    Returns:
        ``(node, links)`` where ``links`` is a list (in arbitrary order) of
        the elements present in both list1 and list2; it may be empty.
    '''
    parent = x[0]
    list1 = x[1][0]
    list2 = x[1][1]
    # Local name kept distinct from the module-level `symmetric_links` RDD.
    mutual_links = list(set(list1).intersection(list2))
    # Many nodes are unlinked to anything...so they get their own index!
    # They *are* a connected component.
    return (parent, mutual_links)


# The key is passed through unchanged, so the existing partitioning is kept.
symmetric_links = joined_links.map(get_acceptable_links, preservesPartitioning=True)
network_rdd = symmetric_links

### Analyze the links using my connected components class ###
from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged()  # This prints the number of connected components at the end

# Print number of connected components at the end.
# A single pre-formatted argument prints the same text whether `print` is
# the Python 2 statement or the Python 3 function.
print('Number of unique groups: %s' % (connector.get_num_unique_groups(),))

# Get the biggest group.
num_rdd = connector.get_number_of_nodes_per_index()
id_and_num = num_rdd.collect()
# Second element of each collected record is used as the node count.
num_nodes = [entry[1] for entry in id_and_num]
# max() replaces np.sort(...)[-1]: no full sort and no numpy needed.
print('Biggest group: %s' % (max(num_nodes),))
# NOTE(review): this physical line is a whitespace-collapsed script fragment.
# It opens with the docstring and body of a function (it reads `x` and ends
# with `return`) but carries no `def` header of its own; an earlier snippet in
# this file defines `get_acceptable_links(x)` with the same statements, so the
# header was presumably lost here -- TODO confirm against the original script.
# As written, everything after the first `#` on the line is comment text to
# the interpreter, so the line breaks must be restored before it can run.
#
# Intended steps, in order of appearance:
#   1. intersect the two lists held in x[1] and return (x[0], common items);
#   2. map get_acceptable_links over joined_links with
#      preservesPartitioning=True and alias the result as network_rdd;
#   3. build Connected_Components(sc, network_rdd) and run_until_converged();
#   4. print get_num_unique_groups();
#   5. collect get_number_of_nodes_per_index() and print the largest second
#      element among the collected records.
'''Return links that are symmetric.''' parent = x[0] list1 = x[1][0] list2 = x[1][1] symmetric_links = set(list1).intersection(list2) symmetric_links = list(symmetric_links) # Many nodes are unlinked to anything...so they get their own index! # They *are* a connected component. return (parent, symmetric_links) symmetric_links = joined_links.map(get_acceptable_links, preservesPartitioning=True) network_rdd = symmetric_links ### Analyze the links using my connected components class ### from HW1.network_commands import Connected_Components connector = Connected_Components(sc, network_rdd) connector.run_until_converged( ) # This prints the number of connected components at the end # Print number of connected components at the end print 'Number of unique groups:', connector.get_num_unique_groups() # Get the biggest group num_rdd = connector.get_number_of_nodes_per_index() id_and_num = num_rdd.collect() num_nodes = map(lambda x: x[1], id_and_num) print 'Biggest group:', np.sort(num_nodes)[-1]