Example #1
    def __init__(self,
                 graph: MultiDiGraph,
                 deep_copy: bool = True,
                 strong_components: dict = None,
                 links_to_components: dict = None):
        if deep_copy:
            self.graph = graph.copy()
        else:
            self.graph = graph
        self.strong_components = strong_components
        self.links_to_components = links_to_components
        self.visited_nodes = []
        self.nodes_threat = {}
Example #2
    def graphs_variance(previous_graph: nx.MultiDiGraph, current_graph: nx.MultiDiGraph) -> dict:
        """
        TODO: Add doc.

        :param previous_graph:
        :param current_graph:
        :return:
        """
        graph_1 = previous_graph.copy()
        graph_2 = current_graph.copy()

        graph_1_nodes = list(graph_1.nodes)
        graph_2_nodes = list(graph_2.nodes)

        diff_1_2 = my_list.diff(graph_1_nodes, graph_2_nodes)
        diff_2_1 = my_list.diff(graph_2_nodes, graph_1_nodes)

        return {
            'loss': len(diff_1_2),
            'gain': len(diff_2_1),
            'lost_nodes': diff_1_2,
            'gain_nodes': diff_2_1
        }
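This function leans on a project-local my_list.diff helper that is not shown. A minimal stand-in, assuming diff(a, b) returns the items of a that do not appear in b:

def diff(first: list, second: list) -> list:
    # Items present in `first` but absent from `second`.
    second_set = set(second)
    return [item for item in first if item not in second_set]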
Example #3
def remove_nondot_keys(graph: nx.MultiDiGraph, inplace=False) -> nx.MultiDiGraph:
    if not inplace:
        graph = graph.copy()
    allowed = set(config.graphviz_attrs)

    def clean_attr(attrs: Dict):
        for key in attrs.keys() - allowed:
            del attrs[key]

    for node in graph:
        clean_attr(graph.nodes[node])
    # noinspection PyArgumentList
    for u, v, attr in graph.edges(keys=False, data=True):
        clean_attr(attr)
    return graph
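remove_nondot_keys assumes a config module with a graphviz_attrs whitelist; the exact contents are project-specific. A hypothetical call:

# config.graphviz_attrs is assumed to list the attribute keys Graphviz
# understands, e.g. ['label', 'color', 'style', 'URL', 'target'].
cleaned = remove_nondot_keys(graph)          # works on a copy by default
remove_nondot_keys(graph, inplace=True)      # mutates the argument instead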
Example #4
def collapse_timeline(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Returns a new graph in which unneeded datetime nodes are removed.
    """
    g: nx.MultiDiGraph = graph.copy()
    timeline = sorted(node for node in g.nodes() if isinstance(node, date))
    if not timeline:
        return g  # nothing to do
    for node in timeline[1:]:
        pred = first(g.predecessors(node))
        succ = first(g.successors(node))
        if g.in_degree(node) == 1 and g.out_degree(node) == 1 \
                and isinstance(pred, date) and isinstance(succ, date):
            g.add_edge(pred, succ, **g[pred][node][0])
            g.remove_node(node)
    return g
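collapse_timeline depends on a first() helper that is not shown. Since it is called before the degree checks, it must tolerate empty iterators; a stand-in that behaves like more_itertools.first with a default would fit:

def first(iterable, default=None):
    # Return the first item of `iterable`, or `default` if it is empty.
    for item in iterable:
        return item
    return default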
Example #5
def convert_to_digraph(G_orig: nx.MultiDiGraph) -> nx.DiGraph:
    # Prevent upstream impacts
    G = G_orig.copy()

    dupes_dict = {}
    for node_id in G.nodes():
        nodes_to = []
        for fr, to in G.out_edges(node_id):
            nodes_to.append(to)
        to_collection = collections.Counter(nodes_to).items()
        dupes = [item for item, count in to_collection if count > 1]

        if len(dupes) > 0:
            dupes_dict[node_id] = {}

            for dupe in dupes:
                in_consideration = []

                # Get all the edge attributes for this node pair
                dupe_count = G.number_of_edges(node_id, dupe)
                for i in range(dupe_count):
                    e = G.edges[node_id, dupe, i]
                    in_consideration.append(e)

                # From the results, we optimistically select the fastest
                # edge value and all associated key/values from the list
                fastest_e = min(in_consideration, key=lambda x: x['length'])
                dupes_dict[node_id][dupe] = fastest_e

    # Now that we have a list of issue duplicates, we can
    # iterate through the list and remove and replace edges
    for fr in dupes_dict.keys():
        to_dict = dupes_dict[fr]
        for to in to_dict.keys():
            # Remove all the edges that exist, we are going
            # to start with a fresh slate (also, NetworkX makes
            # it really hard to control which edges you are
            # removing, otherwise)
            for i in range(G.number_of_edges(fr, to)):
                G.remove_edge(fr, to)

            # Now let's start fresh and add a new, single, edge
            G.add_edge(fr, to, **to_dict[to])

    # Now we should be safe to return a clean directed graph object
    return nx.DiGraph(G)
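A minimal sanity check of the collapse behaviour, using throwaway node names and assuming the function's imports (collections, networkx) are in scope: parallel edges between the same pair should be reduced to the single shortest one.

import networkx as nx

G = nx.MultiDiGraph()
G.add_edge('a', 'b', length=5)
G.add_edge('a', 'b', length=3)   # parallel edge; shorter, so it should win
D = convert_to_digraph(G)
assert isinstance(D, nx.DiGraph)
assert D['a']['b']['length'] == 3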
Example #6
def test_EncodeNodes_llvm_program_graph(llvm_program_graph_nx: nx.MultiDiGraph):
  """Black-box test encoding LLVM program graphs."""
  encoder = node_encoder.GraphNodeEncoder()
  g = llvm_program_graph_nx.copy()
  encoder.EncodeNodes(g)

  # This assumes that all of the test graphs have at least one statement.
  num_statements = sum(
    1 if data["type"] == programl_pb2.Node.STATEMENT else 0
    for _, data in g.nodes(data=True)
  )
  assert num_statements >= 1

  # Check for the presence of expected node attributes.
  for _, data in g.nodes(data=True):
    assert len(data["x"]) == 1
    assert len(data["y"]) == 0
    assert "preprocessed_text" in data
Example #7
def linefy_all_geom(G_original: nx.MultiDiGraph):
    '''
    Replaces each edge's geometry with a straight line between its endpoint nodes.
    :param G_original: networkx graph object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()
    for e in G.edges(data=True):
        u, v, info = e
        ax = G.nodes[u]['x']
        ay = G.nodes[u]['y']
        bx = G.nodes[v]['x']
        by = G.nodes[v]['y']
        info['geometry'] = LineString([[ax, ay], [bx, by]])
    return G
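An illustrative call, assuming shapely's LineString is in scope where linefy_all_geom is defined:

import networkx as nx

G = nx.MultiDiGraph()
G.add_node(1, x=0.0, y=0.0)
G.add_node(2, x=3.0, y=4.0)
G.add_edge(1, 2, length=5.0)
G2 = linefy_all_geom(G)
geom = list(G2.edges(data=True))[0][2]['geometry']
assert geom.length == 5.0   # straight line from (0, 0) to (3, 4)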


# import queue
# def add_direct_edges(G_original: nx.MultiDiGraph, threshold = 2000):
#     G = G_original.copy()
#     for n in G.nodes:
#         q = queue.Queue()
#         neighbors = list(G.successors(n))
#         q.put(n)
#         #neighbors = set()
#         distances = {}
#         distances[n] = 0
#         while q.qsize() > 0:
#             v = q.get()
#             for e in list(G.out_edges(v, data=True, keys=True)):
#                 _, successor, _, info = e
#                 if (successor not in distances or distances[v] + info['length'] < distances[successor])\
#                         and distances[v] + info['length'] < threshold:
#                     q.put(successor)
#                     #neighbors.add(successor)
#                     distances[successor] = distances[v] + info['length']
#
#         for successor in distances.keys():
#             if successor not in neighbors:
#                 G.add_edge(n, successor, length=distances[successor])
#
#     print("%d to %d by add edge ." % (G_original.number_of_edges(), G.number_of_edges()))
#
#     return G
Example #8
    def __init__(self, multiDiGraph: MultiDiGraph):
        # Nx MultiDiGraph used to compute threat
        self.nxGraph = multiDiGraph
        self.previousCopy = multiDiGraph.copy()
        # Threat calculator with an ability to remember threat of node
        self.threat_calc = ThreatCalculator(self.nxGraph)

        self.components_index = 0
        self.graph_threat = 0
        self.strong_components = {}
        self.links_to_strong_components = {}
        self.nodes_threat = {}
        self.strong_components_to_update = set()
        self.vulns = {}
        self.nodes = []
        self.edges = []

        self._init_graph()
Example #9
def simplify_graph_remove_boundary_nodes(G_original: nx.MultiDiGraph):
    '''
    Removes dangling roads at the boundary.
    :param G_original: networkx graph object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()
    while True:
        to_remove = []
        for n, info in list(G.nodes(data=True)):
            ins = list(G.predecessors(n))
            outs = list(G.successors(n))

            if G.in_degree(n) == 1 and G.out_degree(n) == 1 and len(ins) == 1 and len(outs) == 1 and ins[0] == outs[0]:
                to_remove.append(n)
        if len(to_remove) == 0:
            break
        G.remove_nodes_from(to_remove)

    print("Boundary Removed")
    print_graph_info(G)
    return G
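This function, like the other simplify_graph_* helpers in this set, calls a project-local print_graph_info that is not shown; a plausible stand-in:

def print_graph_info(G: nx.MultiDiGraph) -> None:
    # Assumed to just summarize the graph's size.
    print('%d nodes, %d edges' % (G.number_of_nodes(), G.number_of_edges()))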
Example #10
def r1_minus(g: nx.MultiDiGraph) -> nx.MultiDiGraph:
    g = g.copy()
    del_nodes = []
    for node, parity in g.nodes(data="parity"):
        # print(node, parity)
        if (parity == "Odd") and (node in g.neighbors(node)):
            data = g[node][node][0]
            if data["Tu"] != data["Tv"]:
                for pred_node, data in g.pred[node].items():
                    if pred_node != node:
                        u = pred_node
                        Tu = data[0]["Tu"]
                for succ_node, data in g.succ[node].items():
                    if succ_node != node:
                        v = succ_node
                        Tv = data[0]["Tv"]
                # print(u, v, Tu, Tv)
                del_nodes.append(node)
                g.add_edge(u, v, Tu=Tu, Tv=Tv)
    for node in del_nodes:
        g.remove_node(node)
    return g
Example #11
def simplify_graph(original_graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Creates a copy of the graph that contains only simple types, so it can be serialized to, e.g., GEXF
    """
    graph = original_graph.copy()

    translation = {}
    for node, attrs in graph.nodes.data():
        if isinstance(node, date):
            attrs['kind'] = 'date'
            translation[node] = node.isoformat()
        elif isinstance(node, Reference):
            attrs['label'] = node.label
            translation[node] = node.uri
            if isinstance(node, SplitReference):
                attrs['kind'] = node.side.value
            else:
                attrs['kind'] = node.__class__.__name__
        else:
            attrs['kind'] = type(node).__name__
            attrs['label'] = str(node)
            translation[node] = base_n(hash(node), 62)  # use a stable, short representation
        _simplify_attrs(attrs)

    nx.relabel_nodes(graph, translation, copy=False)

    for u, v, attrs in graph.edges(data=True):
        if 'source' in attrs and 'label' not in attrs:
            source_ = attrs['source']
            if isinstance(source_, Sequence) and not isinstance(source_, str):
                attrs['label'] = '\n'.join(f"{s.citation}: {s.detail}" if s.detail else s.citation for s in source_)
            else:
                attrs['label'] = str(source_)
        _simplify_attrs(attrs)

    # noinspection PyTypeChecker
    return graph
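simplify_graph relies on project helpers _simplify_attrs and base_n; neither is shown. A minimal sketch of the latter, a base-62 encoder matching the call base_n(hash(node), 62):

import string

ALPHABET = string.digits + string.ascii_lowercase + string.ascii_uppercase

def base_n(number: int, base: int = 62) -> str:
    # Encode a (possibly negative) integer compactly in the given base.
    number = abs(number)
    if number == 0:
        return ALPHABET[0]
    digits = []
    while number:
        number, remainder = divmod(number, base)
        digits.append(ALPHABET[remainder])
    return ''.join(reversed(digits))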
Example #12
def collapse_edges_by_source(graph: nx.MultiDiGraph) -> nx.MultiDiGraph:
    """
    Returns a new graph with all parallel edges from the same source collapsed.
    """
    result = graph.copy()
    edge_groups = defaultdict(list)
    for u, v, k, attr in result.edges(keys=True, data=True):
        if 'source' in attr:
            edge_groups[(u, v, attr['kind'], attr['source'].uri)].append(
                (u, v, k, attr))

    for (u, v, kind, source_uri), group in edge_groups.items():
        if len(group) > 1:
            logger.debug('Collapsing group %s', group)
            group_attr = dict(
                weight=sum(attr.get('weight', 1) for u, v, k, attr in group),
                kind=kind,
                collapsed=len(group),
                source=BiblSource(source_uri),
                sources=[attr['source'] for u, v, k, attr in group],
                xml=[attr['xml'] for u, v, k, attr in group])
            result.remove_edges_from(group)
            result.add_edge(u, v, **group_attr)
    return result
Example #13
def simplify_graph_remove22(G_original: nx.MultiDiGraph):
    '''
    Simplifies <=><=> shaped road to <=>.
    :param G_original: networkx graph object.
    :return: Simplified networkx graph object.
    '''
    G = G_original.copy()

    def set_geometry(e):
        u, v, _, edge_info = e
        if 'geometry' not in edge_info:
            ux = G.nodes[u]['x']
            uy = G.nodes[u]['y']
            vx = G.nodes[v]['x']
            vy = G.nodes[v]['y']
            edge_info['geometry'] = LineString([[ux, uy], [vx, vy]])

    while True:
        to_remove = []
        for n in list(G.nodes()):
            # a0 -> b0 -> c0
            # a1 <- b1 <- c1

            outs = list(G.out_edges(n, data=True, keys=True))
            ins = list(G.in_edges(n, data=True, keys=True))

            if len(ins) == 2 and len(outs) == 2:
                ins.sort(key=lambda x: x[0])
                outs.sort(key=lambda x: x[1])
                if (ins[0][0] == outs[0][1] and ins[1][0] == outs[1][1]
                        and ins[0][0] != outs[1][1]
                        and n != ins[0][0] and n != ins[1][0]):

                    a = ins[0][0]
                    c = ins[1][0]

                    b = n

                    for e in ins:
                        set_geometry(e)

                    for e in outs:
                        set_geometry(e)

                    l_ab = ins[0][3]
                    l_cb = ins[1][3]
                    l_ba = outs[0][3]
                    l_bc = outs[1][3]

                    l_ac = copy.deepcopy(l_ab)
                    l_ca = copy.deepcopy(l_cb)
                    gac = MultiLineString([l_ab['geometry'], l_bc['geometry']])
                    gac = ops.linemerge(gac)
                    gca = MultiLineString([l_cb['geometry'], l_ba['geometry']])
                    gca = ops.linemerge(gca)

                    l_ac['length'] = l_ab['length'] + l_bc['length']
                    l_ac['geometry'] = gac

                    l_ca['length'] = l_cb['length'] + l_ba['length']
                    l_ca['geometry'] = gca

                    G.remove_node(n)
                    G.add_edge(a, c, **l_ac)
                    G.add_edge(c, a, **l_ca)

                    to_remove.append(b)

        if len(to_remove) == 0:
            break

    print("== Removed")
    print_graph_info(G)
    return G
Example #14
def write_dot(graph: nx.MultiDiGraph, target: Optional[Union[PathLike, str]] = 'base_graph.dot',
              style: Optional[Dict] = None,
              highlight: Optional[Union[Node, Sequence[Node]]] = None,
              highlight_path: Optional[Tuple[Node, Node]] = None,
              record: Union[bool, str] = 'auto', edge_labels: bool = True) -> AGraph:
    """
    Writes a properly styled graphviz file for the given graph.

    Args:
        graph: the subgraph to draw
        target: dot file that should be written; may be a Path. If None, nothing is written, but the AGraph is still returned
        style (dict): rules for styling the graph
        highlight: if a node, highlight that in the graph.
        highlight_path: If a tuple of nodes, highlight the shortest path(s) from the
                   first to the second node
        record: record in the queue for `render_all`. If ``"auto"``, this depends on the graph size
        edge_labels (bool): Should we paint edge labels?

    Returns:
        the AGraph; it can be used to write the file yourself.
    """
    if style is None:
        style = config.styles
    logger.info('Writing %s ...', target)
    try:
        if record == 'auto' and config.render_node_limit >= 0:
            record = graph.number_of_nodes() < config.render_node_limit
            if not record:
                logger.info('%s is too large to be rendered automatically (%d nodes)', target, graph.number_of_nodes())
    except Exception as e:
        logger.warning('Auto render limit configuration error: %s', e)

    vis = graph.copy()
    add_timeline_edges(vis)
    for node in vis:
        if isinstance(node, Reference):
            vis.nodes[node]['URL'] = node.filename.stem
            vis.nodes[node]['target'] = '_top'

    # single node highlight
    if highlight is not None and not isinstance(highlight, Sequence):
        highlight = [highlight]

    if highlight_path is not None:
        if highlight is None:
            highlight = list(highlight_path)
        else:
            highlight = list(highlight)
            highlight.extend(highlight_path)
            if 'highlight' in style['edge']:
                try:
                    vis.edges[highlight].update(style['edge']['highlight'])
                except KeyError:
                    logger.warning('Highlight key %s not found while writing %s', highlight, target)

    if highlight is not None:
        if not isinstance(highlight, Sequence):
            highlight = [highlight]

        for node in list(highlight):
            if isinstance(node, SplitReference) and node.other:
                highlight.append(node.other)

        if 'highlight' in style['node']:
            for highlight_node in highlight:
                try:
                    vis.nodes[highlight_node].update(style['node']['highlight'])
                except KeyError:
                    logger.warning('Highlight key %s not found while writing %s', highlight, target)

    # noinspection PyTypeChecker
    simplified: MultiDiGraph = simplify_graph(vis)

    # now style by kind:
    if 'edge' in style:
        for u, v, k, attr in simplified.edges(data=True, keys=True):
            kind = attr.get('kind', None)
            if attr.get('delete', False):
                attr['URL'] = pathlink(u, v).stem
                attr['target'] = '_top'
            if kind in style['edge']:
                simplified.edges[u, v, k].update(style['edge'][kind])
            for styled_attr in attr.keys() & style['edge']:
                if attr[styled_attr]:
                    simplified.edges[u, v, k].update(style['edge'][styled_attr])
            if 'topo' in attr and 'constraint' in attr:
                del attr['constraint']

    if 'node' in style:
        for node, attr in simplified.nodes(data=True):
            kind = attr.get('kind', None)
            if kind in style['node']:
                simplified.nodes[node].update(style['node'][kind])
            for styled_attr in attr.keys() & style['node']:
                if attr[styled_attr]:
                    attr.update(style['node'][styled_attr])

    if not edge_labels:
        for u, v, k, attr in simplified.edges(data=True, keys=True):
            if 'label' in attr:
                del attr['label']

    if config.clean_gv_files:
        remove_nondot_keys(simplified, inplace=True)
    agraph: AGraph = nx.nx_agraph.to_agraph(simplified)
    agraph.edge_attr['fontname'] = 'Ubuntu derivative Faust'
    agraph.edge_attr['fontsize'] = 8
    agraph.node_attr['fontname'] = 'Ubuntu derivative Faust'
    agraph.node_attr['fontsize'] = 12
    agraph.graph_attr['rankdir'] = 'LR'
    agraph.graph_attr['stylesheet'] = '/css/webfonts.css'

    # extract the timeline
    timeline = agraph.add_subgraph([node for node in agraph.nodes() if node.attr['kind'] == 'date'],
                                   name='cluster_timeline')

    if 'timeline' in style:
        timeline_style = style['timeline']
        for t in ('graph', 'edge', 'node'):
            if t in timeline_style:
                getattr(timeline, t + '_attr', {}).update(timeline_style[t])
                logger.debug('timeline style: %s = %s', t, getattr(timeline, t + '_attr').items())  ## Doesn’t work

    if target is not None:
        target_path = Path(target)
        target_path.parent.mkdir(exist_ok=True, parents=True)
        dotfilename = str(target)
        agraph.write(dotfilename)
        if record:
            _render_queue.append(dotfilename)
        else:
            logger.warning('%s has not been queued for rendering', dotfilename)
    return agraph
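A typical invocation might look like this; the target path and the highlight node are illustrative, and the style rules come from config.styles by default:

agraph = write_dot(subgraph, target='rendered/base_graph.dot',
                   highlight=some_node, edge_labels=False)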
Example #15
def MakeAliasSetGraphs(
  g: nx.MultiDiGraph,
  bytecode: str,
  n: typing.Optional[int] = None,
  false=False,
  true=True,
) -> typing.Iterable[nx.MultiDiGraph]:
  """Produce up to `n` alias set graphs.

  Args:
    g: The unlabelled input graph.
    bytecode: The bytecode which produced the input graph.
    n: The maximum number of graphs to produce. Multiple graphs are produced by
      selecting different root pointers for alias sets. If `n` is provided,
      the number of graphs generated will be in the range
      1 <= x <= min(num_alias_sets, n), where num_alias_sets is the number of
      alias sets larger than --alias_set_min_size. If n is None, num_alias_sets
      graphs will be produced.
    false: TODO(github.com/ChrisCummins/ProGraML/issues/2): Unused. This method
      is hardcoded to use 3-class 1-hots.
    true: TODO(github.com/ChrisCummins/ProGraML/issues/2): Unused. This method
      is hardcoded to use 3-class 1-hots.

  Returns:
    A generator of annotated graphs, where each graph has 'x' and 'y' labels on
    the statement nodes, and additionally a 'data_flow_max_steps_required'
    attribute which is set to the number of pointers in the alias set.
  """
  # TODO(github.com/ChrisCummins/ProGraML/issues/2): Replace true/false args
  # with a list of class values for all graph annotator functions.
  del false
  del true

  # Build the alias sets for the given bytecode.
  alias_sets_by_function = opt_util.GetAliasSetsByFunction(bytecode)

  functions = {
    function
    for node, function in g.nodes(data="function")
    # Not all nodes have a 'function' attribute, e.g. the magic root node.
    if function
  }

  # Silently drop alias sets for functions which don't exist in the graph.
  alias_sets_to_delete = []
  for function in alias_sets_by_function:
    if function not in functions:
      alias_sets_to_delete.append(function)
  if alias_sets_to_delete:
    for function in alias_sets_to_delete:
      del alias_sets_by_function[function]
    app.Log(
      2,
      "Removed %d alias sets generated from bytecode but not found in "
      "graph: %s",
      len(alias_sets_to_delete),
      alias_sets_to_delete,
    )

  function_alias_set_pairs: typing.List[
    typing.Tuple[str, opt_util.AliasSet]
  ] = []
  # Flatten the alias set dictionary and ignore any alias sets that are smaller
  # than the threshold size.
  for function, alias_sets in alias_sets_by_function.items():
    function_alias_set_pairs += [
      (function, alias_set)
      for alias_set in alias_sets
      if len(alias_set.pointers) >= FLAGS.alias_set_min_size
    ]

  # Select `n` random alias sets to generate labelled graphs for.
  if n and len(function_alias_set_pairs) > n:
    random.shuffle(function_alias_set_pairs)
    function_alias_set_pairs = function_alias_set_pairs[:n]

  for function, alias_set in function_alias_set_pairs:
    # Translate the must/may alias property into 3-class 1-hot labels.
    if alias_set.type == "may alias":
      false = np.array([1, 0, 0], np.int64)
      true = np.array([0, 1, 0], np.int64)
    elif alias_set.type == "must alias":
      false = np.array([1, 0, 0], np.int64)
      true = np.array([0, 0, 1], np.int64)
    else:
      raise ValueError(f"Unknown alias set type `{alias_set.type}`")

    # Transform pointer name into the node names produced by the ComposeGraphs()
    # method in the graph builder. When we compose multiple graphs, we add the
    # function name as a prefix, and `_operand` suffix to identifier nodes.
    pointers = [
      f"{function}_{p.identifier}_operand" for p in alias_set.pointers
    ]

    root_pointer = random.choice(pointers)
    labelled = g.copy()
    labelled.data_flow_max_steps_required = AnnotateAliasSet(
      labelled, root_pointer, pointers, false=false, true=true
    )
    yield labelled
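A sketch of how the generator might be consumed; `g` and `bytecode` would come from the surrounding pipeline, so this only illustrates the shape of the API:

for labelled_graph in MakeAliasSetGraphs(g, bytecode, n=10):
    print(labelled_graph.data_flow_max_steps_required)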
Example #16
def simplify_graph(G_orig: nx.MultiDiGraph) -> nx.MultiDiGraph:
    # Note: This operation borrows heavily from the operation of
    #       the same name in OSMnx, as it existed in this state/commit:
    #       github.com/gboeing/osmnx/blob/
    #       c5916aab5c9b94c951c8fb1964c841899c9467f8/osmnx/simplify.py
    #       Function on line 203

    # Prevent upstream mutation, always copy
    G = G_orig.copy()

    # Used to track updates to execute
    all_nodes_to_remove = []
    all_edges_to_add = []

    # TODO: Improve this method to not produce any mixed mode path
    #       removal proposals
    # Utilize the recursive function from OSMnx that identifies paths based
    # on isolated successor nodes
    paths_to_consider = ox.simplify.get_paths_to_simplify(G)

    # Iterate through the resulting path arrays to target
    for path in paths_to_consider:
        # If the path is not all one mode of travel, skip the
        # proposed simplification
        if not _path_has_consistent_mode_type(G, path):
            continue

        # Keep track of the edges to be removed so we can
        # assemble a LineString geometry with all of them
        edge_attributes = {}

        # Work from the last edge through, "wrapped around," to the beginning
        for u, v in zip(path[:-1], path[1:]):
            # Should not be multiple edges between interstitial nodes
            only_one_edge = G.number_of_edges(u, v) == 1
            if not only_one_edge:
                log(('Multiple edges between "{}" and "{}" '
                     'found when simplifying').format(u, v))

            # We ask for the 0th edge as we assume there is only one
            edge = G.edges[u, v, 0]
            for key in edge:
                if key in edge_attributes:
                    # If key already exists in dict, append
                    edge_attributes[key].append(edge[key])
                else:
                    # Otherwise, initialize a list
                    edge_attributes[key] = [edge[key]]

        # Note: In peartree, we opt to not preserve any other elements;
        #       we only keep length, mode and - in the case of simplified
        #       geometries - the shape of the simplified route
        edge_attributes['mode'] = edge_attributes['mode'][0]
        edge_attributes['length'] = sum(edge_attributes['length'])

        # Construct the geometry from the points array
        points_array = []
        for node in path:
            p = Point((G.nodes[node]['x'], G.nodes[node]['y']))
            points_array.append(p)
        edge_attributes['geometry'] = LineString(points_array)

        # Add nodes and edges to respective lists for processing
        all_nodes_to_remove.extend(path[1:-1])
        all_edges_to_add.append({
            'origin': path[0],
            'destination': path[-1],
            'attr_dict': edge_attributes
        })

    # For each edge to add in the list we assembled, create a new edge between
    # the origin and destination
    for edge in all_edges_to_add:
        G.add_edge(edge['origin'], edge['destination'], **edge['attr_dict'])

    # Remove all the interstitial nodes between the new edges, which will also
    # knock out the related edges from the graph
    G.remove_nodes_from(set(all_nodes_to_remove))

    # TODO: This step could be significantly optimized (as well as
    # parameterized, made optional)
    # A final step that cleans out all duplicate edges (not desired in a
    # simplified network)
    mult_edges = []
    mult_edges_full = []
    for fr, to, edge in G.edges(data=True):
        if G.number_of_edges(fr, to) > 1:
            mult_edges.append((fr, to))
            mult_edges_full.append((fr, to, edge))

    # Clean out the permutations to just one of each
    mult_edges = set(mult_edges)

    # TODO: This nested for loop is sloppy; clean up (numpy scalars, perhaps)
    for fr1, to1 in mult_edges:
        subset_edges = []
        for fr2, to2, edge in mult_edges_full:
            if fr1 == fr2 and to1 == to2:
                subset_edges.append(edge)
        keep = max(subset_edges, key=lambda x: x['length'])

        # Drop all the edges
        edge_ct = len(subset_edges)
        G.remove_edges_from([(fr1, to1)] * edge_ct)

        # Then just re-add the one that we want
        G.add_edge(fr1, to1, **keep)

    return G
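This function calls a private _path_has_consistent_mode_type helper that is not shown here. A plausible stand-in, assuming it checks that every edge along the path carries the same 'mode' attribute:

def _path_has_consistent_mode_type(G: nx.MultiDiGraph, path: list) -> bool:
    # Collect the mode of the first edge between each consecutive node pair.
    modes = set()
    for u, v in zip(path[:-1], path[1:]):
        modes.add(G.edges[u, v, 0]['mode'])
    return len(modes) == 1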
Example #17
def coalesce(G_orig: nx.MultiDiGraph, resolution: float) -> nx.MultiDiGraph:
    # Make sure our resolution satisfies basic requirement
    if resolution < 1:
        raise ValueError('Resolution parameter must be >= 1')

    # Avoid upstream mutation of the graph
    G = G_orig.copy()

    # Before we continue, attempt to simplify the current network
    # such that we won't generate isolated nodes that become disconnected
    # from key coalesced nodes (because too many intermediary nodes)
    G = simplify_graph(G)

    # Extract all x, y values
    grouped = {}
    for i, node in G.nodes(data=True):
        x = (round(node['x'] / resolution) * resolution)
        y = (round(node['y'] / resolution) * resolution)

        # Build the dictionary as needed
        if x not in grouped:
            grouped[x] = {}
        if y not in grouped[x]:
            grouped[x][y] = []

        # Append each node under its approx. area grouping
        grouped[x][y].append(i)

    # Generate a series of reference dictionaries that allow us
    # to assign a new node name to each grouping of nodes
    counter = 0
    new_node_coords = {}
    lookup = {}

    # Populate the fresh reference dictionaries
    for x in grouped:
        for y in grouped[x]:
            new_node_name = '{}_{}'.format(G.name, counter)
            new_node_coords[new_node_name] = {'x': x, 'y': y}

            # Pair each newly generated name to the original node id,
            # preserved from the original groupings resulting array
            for n in grouped[x][y]:
                lookup[n] = new_node_name

            # Update the counter so each new synthetic
            # node name will be different
            counter += 1

    # Recast the lookup crosswalk as a series for convenience
    reference = pd.Series(lookup)

    # Get the average boarding cost for each node grouping
    for nni in new_node_coords:
        # Initialize an empty list
        boarding_costs = []

        # Get all original nodes that have been grouped
        g_nodes = reference.loc[reference == nni].index.values

        # Iterate through and gather costs
        for i in g_nodes:
            bc = G.nodes[i]['boarding_cost']
            boarding_costs.append(bc)

        # Calculate the mean of the boarding costs
        avg_bc = np.array(boarding_costs).mean()

        # And assign it to the new nodes objects
        new_node_coords[nni]['boarding_cost'] = avg_bc

    # First step to creating a list of replacement edges
    replacement_edges_fr = []
    replacement_edges_to = []
    replacement_edges_len = []

    for n1, n2, edge in G.edges(data=True):
        # This will be used to parse out which edges to keep
        replacement_edges_fr.append(reference[n1])
        replacement_edges_to.append(reference[n2])
        replacement_edges_len.append(edge['length'])

    # This takes the resulting matrix and converts it to a pandas DataFrame
    edges_df = pd.DataFrame({
        'fr': replacement_edges_fr,
        'to': replacement_edges_to,
        'len': replacement_edges_len
    })
    # Next we group by the edge pattern (from -> to)
    grouped = edges_df.groupby(['fr', 'to'], sort=False)
    # With the resulting groupings, we extract values
    min_edges = grouped['len'].min()

    # Second step; which uses results from edge_df grouping/parsing
    edges_to_add = []
    for n1, n2, edge in G.edges(data=True):
        rn1 = reference[n1]
        rn2 = reference[n2]

        # Make sure that this is the min edge
        min_length = min_edges.loc[rn1, rn2]

        # Skip this edge if it is not the minimum edge length
        if not edge['length'] == min_length:
            continue

        # If we pass the first check, we should also make sure that
        # the edge has not already been added by another minimum edge
        try:
            # If this works, then the edge already exists; on a
            # MultiDiGraph, key 0 holds the single re-added edge
            existing_edge = G[rn1][rn2][0]
            # Also sanity check that it is the min length value
            if not existing_edge['length'] == min_length:
                raise ValueError(
                    'Edge should have had minimum length of '
                    '{}, but instead had value of {}'.format(
                        min_length, existing_edge['length']))

        # If this happens, then this is the first time this edge
        # is being added
        except KeyError:
            edges_to_add.append((rn1, rn2, edge))

    # Add the new edges
    for n1, n2, edge in edges_to_add:
        # But avoid edges that now connect to the same node
        if not n1 == n2:
            G.add_edge(n1, n2, length=edge['length'], mode=edge['mode'])

    # Now we can remove all edges and nodes that predated the
    # coalescing operations
    for n in reference.index:
        # Note that this will also drop all edges
        G.remove_node(n)

    # Also make sure to update the new nodes with their summary
    # stats and locational data
    for i, node in new_node_coords.items():
        # Some nodes are completely dropped in this operation
        # with no replacement edges (e.g. nodes that would have
        # connected to another node that ended up getting coalesced
        # into the same single node)
        if i not in G.nodes():
            continue

        # For all other nodes, preserve them by re-populating
        for key in node:
            G.nodes[i][key] = node[key]

    return G
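The grid snap at the top of coalesce rounds each coordinate to the nearest multiple of resolution, so all nodes falling in the same cell share one new synthetic node. For example, with resolution=100:

x = round(437.2 / 100) * 100   # -> 400
y = round(163.8 / 100) * 100   # -> 200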
Example #18
class MossNet:
    def __init__(self, moss_results_dict):
        '''Create a ``MossNet`` object from a 3D dictionary of downloaded MOSS results

        Args:
            ``moss_results_dict`` (``dict``): A 3D dictionary of downloaded MOSS results

        Returns:
            ``MossNet``: A ``MossNet`` object
        '''
        if isinstance(moss_results_dict, MultiDiGraph):
            self.graph = moss_results_dict; return
        if isinstance(moss_results_dict, str):
            try:
                if moss_results_dict.lower().endswith('.gz'):
                    moss_results_dict = load(gopen(moss_results_dict))
                else:
                    moss_results_dict = load(open(moss_results_dict,'rb'))
            except:
                raise ValueError("Unable to load dictionary: %s" % moss_results_dict)
        if not isinstance(moss_results_dict, dict):
            raise TypeError("moss_results_dict must be a 3D dictionary of MOSS results")
        self.graph = MultiDiGraph()
        for u in moss_results_dict:
            u_edges = moss_results_dict[u]
            if not isinstance(u_edges, dict):
                raise TypeError("moss_results_dict must be a 3D dictionary of MOSS results")
            for v in u_edges:
                u_v_links = u_edges[v]
                if not isinstance(u_edges[v], dict):
                    raise TypeError("moss_results_dict must be a 3D dictionary of MOSS results")
                for f in u_v_links:
                    try:
                        left, right = u_v_links[f]
                    except:
                        raise TypeError("moss_results_dict must be a 3D dictionary of MOSS results")
                    self.graph.add_edge(u, v, attr_dict = {'files':f, 'left':left, 'right':right})

    def save(self, outfile):
        '''Save this ``MossNet`` object as a 3D dictionary of MOSS results

        Args:
            ``outfile`` (``str``): The desired output file's path
        '''
        out = dict()
        for u in self.graph.nodes:
            u_edges = dict(); out[u] = u_edges
            for v in self.graph.neighbors(u):
                u_v_links = dict(); u_edges[v] = u_v_links; u_v_edge_data = self.graph.get_edge_data(u,v)
                for k in u_v_edge_data:
                    edge = u_v_edge_data[k]['attr_dict']; u_v_links[edge['files']] = (edge['left'], edge['right'])
        if outfile.lower().endswith('.gz'):
            f = gopen(outfile, mode='wb', compresslevel=9)
        else:
            f = open(outfile, 'wb')
        pkldump(out, f); f.close()

    def __add__(self, o):
        if not isinstance(o, MossNet):
            raise TypeError("unsupported operand type(s) for +: 'MossNet' and '%s'" % type(o).__name__)
        g = MultiDiGraph()
        g.add_edges_from(list(self.graph.edges(data=True)) + list(o.graph.edges(data=True)))
        g.add_nodes_from(list(self.graph.nodes(data=True)) + list(o.graph.nodes(data=True)))
        return MossNet(g)

    def get_networkx(self):
        '''Return a NetworkX ``MultiDiGraph`` equivalent to this ``MossNet`` object

        Returns:
            ``MultiDiGraph``: A NetworkX ``MultiDiGraph`` equivalent to this ``MossNet`` object
        '''
        return self.graph.copy()

    def get_nodes(self):
        '''Returns a ``set`` of node labels in this ``MossNet`` object

        Returns:
            ``set``: The node labels in this ``MossNet`` object
        '''
        return set(self.graph.nodes)

    def get_pair(self, u, v, style='tuples'):
        '''Returns the links between nodes ``u`` and ``v``

        Args:
            ``u`` (``str``): A node label

            ``v`` (``str``): A node label not equal to ``u``

            ``style`` (``str``): The representation of a given link

            * ``"tuples"``: Links are ``((u_percent, u_html), (v_percent, v_html))`` tuples

            * ``"html"``: Links are HTML representation (one HTML for all links)

            * ``"htmls"``: Links are HTML representations (one HTML per link)

        Returns:
            ``dict``: The links between ``u`` and ``v`` (keys are filenames)
        '''
        if style not in {'tuples', 'html', 'htmls'}:
            raise ValueError("Invalid link style: %s" % style)
        if u == v:
            raise ValueError("u and v cannot be equal: %s" % u)
        for node in [u,v]:
            if not self.graph.has_node(node):
                raise ValueError("Nonexistant node: %s" % node)
        links = self.graph.get_edge_data(u,v)
        out = dict()
        for k in sorted(links.keys(), key=lambda x: links[x]['attr_dict']['files']):
            d = links[k]['attr_dict']
            u_fn, v_fn = d['files']
            u_percent, u_html = d['left']
            v_percent, v_html = d['right']
            if style == 'tuples':
                out[(u_fn, v_fn)] = ((u_percent, u_html), (v_percent, v_html))
            elif style in {'html', 'htmls'}:
                out[(u_fn, v_fn)] = '<html><table style="width:100%%" border="1"><tr><td colspan="2"><center><b>%s/%s --- %s/%s</b></center></td></tr><tr><td>%s (%d%%)</td><td>%s (%d%%)</td></tr><tr><td><pre>%s</pre></td><td><pre>%s</pre></td></tr></table></html>' % (u, u_fn, v, v_fn, u, u_percent, v, v_percent, u_html, v_html)
        if style == 'html':
            out = '<html>' + '<br>'.join(out[fns].replace('<html>','').replace('</html>','') for fns in sorted(out.keys())) + '</html>'
        return out

    def get_summary(self, style='html'):
        '''Returns a summary of this ``MossNet``

        Args:
            ``style`` (``str``): The representation of this ``MossNet``

        Returns:
            ``dict``: A summary of this ``MossNet``, where keys are filenames
        '''
        if style not in {'html'}:
            raise ValueError("Invalid summary style: %s" % style)
        matches = list() # list of (u_path, u_percent, v_path, v_percent) tuples
        for u,v in self.traverse_pairs(order=None):
            links = self.graph.get_edge_data(u,v)
            for k in links:
                d = links[k]['attr_dict']
                u_fn, v_fn = d['files']
                u_percent, u_html = d['left']
                v_percent, v_html = d['right']
                matches.append(('%s/%s' % (u,u_fn), u_percent, '%s/%s' % (v,v_fn), v_percent))
        matches.sort(reverse=True, key=lambda x: max(x[1],x[3]))
        return '<html><table style="width:100%%" border="1">%s</table></html>' % ''.join(('<tr><td>%s (%d%%)</td><td>%s (%d%%)</td></tr>' % tup) for tup in matches)

    def num_links(self, u, v):
        '''Returns the number of links between ``u`` and ``v``

        Args:
            ``u`` (``str``): A node label

            ``v`` (``str``): A node label not equal to ``u``

        Returns:
            ``int``: The number of links between ``u`` and ``v``
        '''
        for node in [u,v]:
            if not self.graph.has_node(node):
                raise ValueError("Nonexistant node: %s" % node)
        return len(self.graph.get_edge_data(u,v))

    def num_nodes(self):
        '''Returns the number of nodes in this ``MossNet`` object

        Returns:
            ``int``: The number of nodes in this ``MossNet`` object
        '''
        return self.graph.number_of_nodes()

    def num_edges(self):
        '''Returns the number of (undirected) edges in this ``MossNet`` object (including parallel edges)

        Returns:
            ``int``: The number of (undirected) edges in this ``MossNet`` object (including parallel edges)
        '''
        return int(self.graph.number_of_edges()/2)

    def outlier_pairs(self):
        '''Predict which student pairs are outliers (i.e., too many problem similarities).
        The distribution of the number of links between student pairs (i.e., the histogram) is modeled as y = A/(B^x),
        where x = the number of links and y = the number of student pairs with that many links

        Returns:
            ``list`` of ``tuple``: The student pairs expected to be outliers (in decreasing order of significance)
        '''
        links = dict() # key = number of links; value = set of student pairs that have that number of links
        for u,v in self.traverse_pairs():
            n = self.num_links(u,v)
            if n not in links:
                links[n] = set()
            links[n].add((u,v))
        mult = list(); min_links = min(len(s) for s in links.values()); max_links = max(len(s) for s in links.values())
        for i in range(min_links, max_links):
            if i not in links or i+1 not in links or len(links[i+1]) > len(links[i]):
                break
            mult.append(float(len(links[i]))/len(links[i+1]))
        B = sum(mult)/len(mult)
        A = len(links[min_links]) * (B**min_links)
        n_cutoff = log(A)/log(B)
        out = list()
        for n in sorted(links.keys(), reverse=True):
            if n < n_cutoff:
                break
            for u,v in links[n]:
                out.append((n,u,v))
        return out

    def traverse_pairs(self, order='descending'):
        '''Iterate over student pairs

        Args:
            ``order`` (``str``): Order to sort pairs in iteration

            * ``None`` to not sort (may be faster for large/dense graphs)

            * ``"ascending"`` to sort in ascending order of number of links

            * ``"descending"`` to sort in descending order of number of links
        '''
        if order not in {None, 'None', 'none', 'ascending', 'descending'}:
            raise ValueError("Invalid order: %s" % order)
        pairs = [(u,v) for u in self.graph.nodes for v in self.graph.neighbors(u) if u < v]
        if order == 'ascending':
            pairs.sort(key=lambda x: len(self.graph.get_edge_data(x[0],x[1])))
        elif order == 'descending':
            pairs.sort(key=lambda x: len(self.graph.get_edge_data(x[0],x[1])), reverse=True)
        for pair in pairs:
            yield pair

    def export(self, outpath, style='html', gte=0, verbose=False):
        '''Export the links in this ``MossNet`` in the specified style

        Args:
            ``outpath`` (``str``): Path to desired output folder/file

            ``style`` (``str``): Desired output style

            * ``"dot"`` to export as a GraphViz DOT file

            * ``"gexf"`` to export as a Graph Exchange XML Format (GEXF) file

            * ``"html"`` to export one HTML file per pair

            ``gte`` (``int``): The minimum number of links for an edge to be exported

            ``verbose`` (``bool``): ``True`` to show verbose messages, otherwise ``False``
        '''
        if style not in {'dot', 'gexf', 'html'}:
            raise ValueError("Invalid export style: %s" % style)
        if isdir(outpath) or isfile(outpath):
            raise ValueError("Output path exists: %s" % outpath)
        if not isinstance(gte, int):
            raise TypeError("'gte' must be an 'int', but you provided a '%s'" % type(gte).__name__)
        if gte < 0:
            raise ValueError("'gte' must be non-negative, but yours was %d" % gte)

        # export as folder of HTML files
        if style == 'html':
            summary = self.get_summary(style='html')
            pairs = list(self.traverse_pairs(order=None))
            makedirs(outpath)
            f = open('%s/summary.html' % outpath, 'w'); f.write(summary); f.close()
            for i,pair in enumerate(pairs):
                if verbose:
                    print("Exporting pair %d of %d..." % (i+1, len(pairs)), end='\r')
                u,v = pair
                if self.num_links(u,v) < gte:
                    continue
                f = open("%s/%d_%s_%s.html" % (outpath, self.num_links(u,v), u, v), 'w')
                f.write(self.get_pair(u, v, style='html'))
                f.close()
            if verbose:
                print("Successfully exported %d pairs" % len(pairs))

        # export as GraphViz DOT or a GEXF file
        elif style in {'dot', 'gexf'}:
            if verbose:
                print("Computing colors...", end='')
            max_links = max(self.num_links(u,v) for u,v in self.traverse_pairs())
            try:
                from seaborn import color_palette
            except:
                raise RuntimeError("Exporting as a DOT or GEXF file currently requires seaborn")
            pal = color_palette("Reds", max_links)
            if verbose:
                print(" done")
                print("Computing node information...", end='')
            nodes = list(self.get_nodes())
            index = {u:i for i,u in enumerate(nodes)}
            if verbose:
                print(" done")
                print("Writing output file...", end='')
            outfile = open(outpath, 'w')
            if style == 'dot':
                pal = [str(c).upper() for c in pal.as_hex()]
                outfile.write("graph G {\n")
                for u in nodes:
                    outfile.write('  node%d[label="%s"]\n' % (index[u], u))
                for u,v in self.traverse_pairs():
                    curr_num_links = self.num_links(u,v)
                    if curr_num_links < gte:
                        continue
                    outfile.write('  node%d -- node%d[color="%s"]\n' % (index[u], index[v], pal[curr_num_links-1]))
                outfile.write('}\n')
            elif style == 'gexf':
                from datetime import datetime
                pal = [(int(255*c[0]), int(255*c[1]), int(255*c[2])) for c in pal]
                outfile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
                outfile.write('<gexf xmlns="http://www.gexf.net/1.3draft" xmlns:viz="http://www.gexf.net/1.3draft/viz">\n')
                outfile.write('  <meta lastmodifieddate="%s">\n' % datetime.today().strftime('%Y-%m-%d'))
                outfile.write('    <creator>MossNet</creator>\n')
                outfile.write('    <description>A MossNet network exported to GEXF</description>\n')
                outfile.write('  </meta>\n')
                outfile.write('  <graph mode="static" defaultedgetype="undirected">\n')
                outfile.write('    <nodes>\n')
                for u in nodes:
                    outfile.write('      <node id="%d" label="%s"/>\n' % (index[u], u))
                outfile.write('    </nodes>\n')
                outfile.write('    <edges>\n')
                for i,pair in enumerate(self.traverse_pairs()):
                    u,v = pair
                    curr_num_links = self.num_links(u,v)
                    if curr_num_links == 0:
                        continue
                    color = pal[curr_num_links-1]
                    outfile.write('      <edge id="%d" source="%d" target="%d">\n' % (i, index[u], index[v]))
                    outfile.write('        <viz:color r="%d" g="%d" b="%d"/>\n' % (color[0], color[1], color[2]))
                    outfile.write('      </edge>\n')
                outfile.write('    </edges>\n')
                outfile.write('  </graph>\n')
                outfile.write('</gexf>\n')
            outfile.close()
            if verbose:
                print(" done")
Example #19
def coalesce(
        G_orig: nx.MultiDiGraph,
        resolution: float,
        edge_summary_method=lambda x: x.max(),
        boarding_cost_summary_method=lambda x: x.mean(),
) -> nx.MultiDiGraph:
    # Note: Feature is experimental. For more details, see
    #       https://github.com/kuanb/peartree/issues/126
    warnings.warn(('coalesce method is experimental - method risks '
                   'deformation of relative graph structure'))

    # Make sure our resolution satisfies basic requirement
    if resolution < 1:
        raise ValueError('Resolution parameter must be >= 1')

    # Avoid upstream mutation of the graph
    G = G_orig.copy()

    # Before we continue, attempt to simplify the current network
    # such that we won't generate isolated nodes that become disconnected
    # from key coalesced nodes (because too many intermediary nodes)
    G = simplify_graph(G)

    # Extract all x, y values
    grouped = {}
    for i, node in G.nodes(data=True):
        x = (round(node['x'] / resolution) * resolution)
        y = (round(node['y'] / resolution) * resolution)

        # Build the dictionary as needed
        if x not in grouped:
            grouped[x] = {}
        if y not in grouped[x]:
            grouped[x][y] = []

        # Append each node under its approx. area grouping
        grouped[x][y].append(i)

    # Generate a series of reference dictionaries that allow us
    # to assign a new node name to each grouping of nodes
    counter = 0
    new_node_coords = {}
    lookup = {}

    # Populate the fresh reference dictionaries
    for x in grouped:
        for y in grouped[x]:
            new_node_name = '{}_{}'.format(G.name, counter)
            new_node_coords[new_node_name] = {'x': x, 'y': y}

            # Pair each newly generated name to the original node id,
            # preserved from the original groupings resulting array
            for n in grouped[x][y]:
                lookup[n] = new_node_name

            # Update the counter so each new synthetic
            # node name will be different
            counter += 1

    # Recast the lookup crosswalk as a series for convenience
    reference = pd.Series(lookup)

    # Get the following attributes:
    #   1. average boarding cost for each node grouping
    #   2. modes associated with each node grouping
    for nni in new_node_coords:
        # Initialize an empty list
        boarding_costs = []
        all_modes_related = []

        # Get all original nodes that have been grouped
        g_nodes = reference.loc[reference == nni].index.values

        # Iterate through and gather costs
        for i in g_nodes:
            specific_node = G.nodes[i]

            bc = specific_node['boarding_cost']
            boarding_costs.append(bc)

            this_nodes_modes = specific_node['modes']
            all_modes_related.extend(this_nodes_modes)

        # Calculate the summary boarding costs
        # and assign it to the new nodes objects
        new_node_coords[nni]['boarding_cost'] = (boarding_cost_summary_method(
            np.array(boarding_costs)))

        # Get all unique modes and assign it to the new nodes objects
        sorted_set_list = sorted(list(set(all_modes_related)))
        new_node_coords[nni]['modes'] = sorted_set_list

    # First step to creating a list of replacement edges
    replacement_edges_fr = []
    replacement_edges_to = []
    replacement_edges_len = []

    for n1, n2, edge in G.edges(data=True):
        # This will be used to parse out which edges to keep
        replacement_edges_fr.append(reference[n1])
        replacement_edges_to.append(reference[n2])
        replacement_edges_len.append(edge['length'])

    # This takes the resulting matrix and converts it to a pandas DataFrame
    edges_df = pd.DataFrame({
        'fr': replacement_edges_fr,
        'to': replacement_edges_to,
        'len': replacement_edges_len
    })

    # Next we group by the edge pattern (from -> to)
    grouped = edges_df.groupby(['fr', 'to'], sort=False)

    # With the resulting groupings, we extract values
    # TODO: Also group on modes
    processed_edge_costs = edge_summary_method(grouped['len'])

    # Second step; which uses results from edge_df grouping/parsing
    edges_to_add = []
    for n1, n2, edge in G.edges(data=True):
        # Get corresponding ids of new nodes (grid corners)
        ref_n1 = reference[n1]
        ref_n2 = reference[n2]

        # Retrieve pair value from previous grouping operation
        avg_length = processed_edge_costs.loc[ref_n1, ref_n2]
        edges_to_add.append((ref_n1, ref_n2, avg_length, edge['mode']))

    # Add the new edges to graph
    for n1, n2, length, mode in edges_to_add:
        # Only add the edge if it has not been added yet
        if G.has_edge(n1, n2):
            continue

        # Also avoid edges that now connect to the same node
        if n1 == n2:
            continue

        G.add_edge(n1, n2, length=length, mode=mode)

    # Now we can remove all edges and nodes that predated the
    # coalescing operations
    for n in reference.index:
        # Note that this will also drop all edges
        G.remove_node(n)

    # Also make sure to update the new nodes with their summary
    # stats and locational data
    for i, node in new_node_coords.items():
        if G.has_node(i):
            # For all other nodes, preserve them by re-populating
            for key in node:
                G.nodes[i][key] = node[key]

    return G
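An illustrative call of this configurable variant; the lambdas receive a pandas groupby result and a numpy array respectively, matching how they are applied above:

G2 = coalesce(G, resolution=50,
              edge_summary_method=lambda x: x.median(),
              boarding_cost_summary_method=lambda x: x.max())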