import re

def get_links(x):
    '''Turn one line of link data into bidirectional (node, node) pairs.

    The first token on the line is the parent id and every following token
    is a child id.  Tokens are maximal runs of word characters (or
    apostrophes), so whitespace and other punctuation act as separators.

    Args:
        x: A text line such as "1: 2 3".

    Returns:
        A list holding (parent, child) for every child, followed by
        (child, parent) for every child.  A line without any token yields
        an empty list, so blank lines contribute nothing under flatMap.

    Raises:
        ValueError: If a token cannot be converted with int().
    '''
    tokens = re.findall(r"[\w']+", x)
    if not tokens:
        # Blank or punctuation-only line: there is no parent to link from.
        return []
    parent = int(tokens[0])
    children = [int(tok) for tok in tokens[1:]]
    parent_to_children = [(parent, child) for child in children]
    children_to_parent = [(child, parent) for child in children]
    return parent_to_children + children_to_parent

all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()
# Remove non-unique links
node_then_all_links_expanded = node_then_all_links.map(lambda x: (x[0], list(set(x[1]))))

network_rdd = node_then_all_links_expanded

### Run connected component code on network ###
from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged()
# Print number of connected components at the end
print 'Number of unique groups:' , connector.get_num_unique_groups()

# Get the connected component with the biggest number of nodes
nodes_per_index = connector.get_number_of_nodes_per_index()
collected_nodes_per_index = nodes_per_index.collect()
num_nodes = map(lambda x: x[1], collected_nodes_per_index)

print 'Biggest group:' ,  np.sort(num_nodes)[-1]
# Example no. 2
# 0
def get_links(x):
    '''Parse a "parent child child ..." line into links in both directions.

    The line is split into tokens made of word characters (or apostrophes);
    the first token is the parent id and the rest are child ids.

    Args:
        x: One line of raw link text, e.g. "1: 2 3".

    Returns:
        A list with a (parent, child) pair per child followed by a
        (child, parent) pair per child.  If the line contains no token at
        all, an empty list is returned so flatMap simply skips the line.

    Raises:
        ValueError: If any token is not an integer literal.
    '''
    split = re.findall(r"[\w']+", x)
    if not split:
        # Nothing to parse (blank line); avoid indexing an empty list.
        return []
    parent = int(split[0])
    children = [int(z) for z in split[1:]]
    parent_to_children = [(parent, z) for z in children]
    children_to_parent = [(z, parent) for z in children]
    return parent_to_children + children_to_parent


all_links = links_raw_data.flatMap(get_links)
node_then_all_links = all_links.groupByKey()
# Remove non-unique links
node_then_all_links_expanded = node_then_all_links.map(lambda x:
                                                       (x[0], list(set(x[1]))))

network_rdd = node_then_all_links_expanded

### Run connected component code on network ###
from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged()
# Print number of connected components at the end
print 'Number of unique groups:', connector.get_num_unique_groups()

# Get the connected component with the biggest number of nodes
nodes_per_index = connector.get_number_of_nodes_per_index()
collected_nodes_per_index = nodes_per_index.collect()
num_nodes = map(lambda x: x[1], collected_nodes_per_index)

print 'Biggest group:', np.sort(num_nodes)[-1]
# Join the two link collections by key, so each record pairs a key's value
# from parent_child_links with its value from child_then_parents_expanded.
# NOTE(review): the original author says both inputs are co-partitioned;
# that is not visible here -- confirm where they are built.
joined_links = parent_child_links.join(child_then_parents_expanded) # join copartitioned data!

def get_acceptable_links(x):
    '''Keep only the links that show up in both link lists of a node.

    x is indexed as (node, (first_links, second_links)).  The result is
    (node, shared), where shared is a list of the entries common to both
    link collections.  A node with nothing in common is still returned,
    paired with an empty list, rather than being dropped: an unlinked
    node is a connected component of its own.
    '''
    node = x[0]
    first_links, second_links = x[1][0], x[1][1]
    shared = set(first_links).intersection(second_links)
    return (node, list(shared))

symmetric_links = joined_links.map(get_acceptable_links, preservesPartitioning=True)

network_rdd = symmetric_links

### Analyze the links using my connected components class ###

from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged() # This prints the number of connected components at the end
# Print number of connected components at the end
print 'Number of unique groups:' , connector.get_num_unique_groups()

# Get the biggest group
num_rdd = connector.get_number_of_nodes_per_index()
id_and_num = num_rdd.collect()
num_nodes = map(lambda x: x[1], id_and_num)
print 'Biggest group:' ,  np.sort(num_nodes)[-1]
    '''Return links that are symmetric.'''
    parent = x[0]
    list1 = x[1][0]
    list2 = x[1][1]
    symmetric_links = set(list1).intersection(list2)
    symmetric_links = list(symmetric_links)
    # Many nodes are unlinked to anything...so they get their own index!
    # They *are* a connected component.
    return (parent, symmetric_links)


symmetric_links = joined_links.map(get_acceptable_links,
                                   preservesPartitioning=True)

network_rdd = symmetric_links

### Analyze the links using my connected components class ###

from HW1.network_commands import Connected_Components

connector = Connected_Components(sc, network_rdd)
connector.run_until_converged(
)  # This prints the number of connected components at the end
# Print number of connected components at the end
print 'Number of unique groups:', connector.get_num_unique_groups()

# Get the biggest group
num_rdd = connector.get_number_of_nodes_per_index()
id_and_num = num_rdd.collect()
num_nodes = map(lambda x: x[1], id_and_num)
print 'Biggest group:', np.sort(num_nodes)[-1]