def ensure_prov_networkx_graph(prov_doc):
    """Return a NetworkX graph for *prov_doc*.

    Args:
        prov_doc: Either a ``ProvDocument``, which is converted with
            ``prov_to_graph``, or an ``nx.Graph`` instance, which is returned
            unchanged.

    Returns:
        The converted graph, or *prov_doc* itself when it already is a graph.

    Raises:
        TypeError: If *prov_doc* is neither a ``ProvDocument`` nor an
            ``nx.Graph``.
    """
    if isinstance(prov_doc, ProvDocument):
        return prov_to_graph(prov_doc)

    # Validate the *argument*: the previous check inspected the not-yet-assigned
    # local ``g`` and therefore failed for every graph input.  An explicit raise
    # is used instead of ``assert`` so the check survives ``python -O``.
    if not isinstance(prov_doc, nx.Graph):
        raise TypeError(
            "Expected a ProvDocument or a NetworkX graph, got {}".format(
                type(prov_doc).__name__
            )
        )
    return prov_doc
def ensure_prov_networkx_graph(prov_doc):
    """Return a graph for *prov_doc*, converting it when it is a ProvDocument.

    A ``ProvDocument`` is converted with ``prov_to_graph``; any other value is
    handed back untouched, without being checked.
    """
    if not isinstance(prov_doc, ProvDocument):
        # Assuming we got a NetworkX graph already
        # TODO Raise an exception when this is not the case
        return prov_doc
    return prov_to_graph(prov_doc)
 def test_simple_graph_conversion(self):
     # Convert every test document to a graph and, for documents without
     # bundles, check that converting back yields an equal document.
     for label, make_doc in tests:
         original = make_doc()
         graph = prov_to_graph(original)
         if not original.has_bundles():
             # Documents containing bundles cannot be round-tripped, so only
             # bundle-free documents reach the comparison.
             restored = graph_to_prov(graph)
             failure_msg = "Round trip graph conversion for '{}' failed.".format(label)
             self.assertEqual(restored, original, failure_msg)
def build_grakel_graphs(graphs: pd.DataFrame, dataset_path: Path):
    """Add a ``grakel_graphs`` column built from the files in ``graph_file``.

    Args:
        graphs: DataFrame that either already has a ``grakel_graphs`` column
            or has a ``graph_file`` column of file names.
        dataset_path: Directory the ``graph_file`` entries are joined onto.

    Returns:
        The same DataFrame object.  It is returned unchanged when the
        ``grakel_graphs`` column already exists; otherwise the column is
        added to it in place.
    """
    if "grakel_graphs" not in graphs.columns:
        # Each file is deserialized as a PROV document, converted with
        # prov_to_graph and then with graph_from_prov_networkx_graph.
        graphs["grakel_graphs"] = [
            graph_from_prov_networkx_graph(
                prov_to_graph(ProvDocument.deserialize(dataset_path / file_name))
            )
            for file_name in graphs.graph_file
        ]
    return graphs
Ejemplo n.º 5
0
 def test_simple_graph_conversion(self):
     # Smoke test: converting each test document to a graph must not raise.
     for _label, make_doc in tests:
         document = make_doc()
         prov_to_graph(document)
def version4(prov_doc, flat=False):
    """Compute the version-4 set of network metrics for a provenance graph.

    Args:
        prov_doc: A ``ProvDocument`` (converted with ``prov_to_graph``) or any
            other object, which is used directly as the graph without being
            checked.
        flat: When true, the nested result is passed through ``flatten_v4``
            before being returned.

    Returns:
        A dict (or ``flatten_v4`` of it) with these keys:

        * ``entities`` / ``agents`` / ``activities``: node counts by exact
          node class.
        * ``nodes`` / ``edges``: graph size.
        * ``components``: connected components of the undirected view.
        * ``diameter``: diameter of the undirected view, or ``-1`` when the
          graph does not have exactly one component.
        * ``average_clustering_coefficient``: mean of the non-zero clustering
          coefficients, overall and per node type (``0`` when there are none).
        * ``degree_assortativity_coefficient``: ``-1`` when the computation
          raises ``ValueError`` or yields a non-finite value.
        * ``mfd``: for each ordered pair of node types, the longest shortest
          path between distinct nodes of those types (``0`` when none).
        * ``distributions``: lists of path lengths and node degrees.
        * ``node_degrees_powerlaw``: only present when the power-law fit has a
          non-NaN alpha and beats the exponential fit (``R > 0``, ``p < 0.05``).
    """
    results = dict()

    if isinstance(prov_doc, ProvDocument):
        g = prov_to_graph(prov_doc)
    else:
        # Assuming we got a NetworkX graph already
        g = prov_doc

    # PROV types (a Counter yields 0 for classes that do not occur)
    type_counter = Counter(map(type, g.nodes()))
    results["entities"] = type_counter[ProvEntity]
    results["agents"] = type_counter[ProvAgent]
    results["activities"] = type_counter[ProvActivity]

    # Graph size
    results["nodes"] = g.number_of_nodes()
    results["edges"] = g.size()

    ug = g.to_undirected(as_view=True)
    n_comps = nx.number_connected_components(ug)
    results["components"] = n_comps
    results["diameter"] = nx.diameter(ug) if n_comps == 1 else -1

    # Clustering coefficients for all nodes, excluding zero values
    cc = {n: e for n, e in nx.clustering(nx.Graph(ug)).items() if e}

    def cc_by_type(node_type):
        return [cc[n] for n in cc if isinstance(n, node_type)]

    def avg_or_0(values):
        return sum(values) / len(values) if values else 0

    results["average_clustering_coefficient"] = {
        "all": avg_or_0(cc.values()),
        "entity": avg_or_0(cc_by_type(ProvEntity)),
        "activity": avg_or_0(cc_by_type(ProvActivity)),
        "agent": avg_or_0(cc_by_type(ProvAgent)),
    }

    try:
        assortability = nx.degree_pearson_correlation_coefficient(g)
    except ValueError:
        assortability = -1
    results["degree_assortativity_coefficient"] = (
        assortability if np.isfinite(assortability) else -1)

    # The result is indexed as ``s_paths[source][target]`` below.  NetworkX
    # has announced (FutureWarning) that calling shortest_path without source
    # and target will yield (node, paths) pairs instead of returning a dict;
    # dict() accepts both forms.
    s_paths = dict(nx.shortest_path(g))

    def lengths(graph, t1, t2):
        return [
            len(s_paths[i][j]) - 1
            for i in node_select(graph, t1) if i in s_paths
            for j in node_select(graph, t2)
            if j in s_paths[i] and i != j
        ]

    def mfd(graph, t1, t2):
        s_distances = lengths(graph, t1, t2)
        return max(s_distances) if s_distances else 0

    node_types = (
        ("entity", ProvEntity),
        ("activity", ProvActivity),
        ("agent", ProvAgent),
    )
    results["mfd"] = {
        src_name: {
            dst_name: mfd(g, src_type, dst_type)
            for dst_name, dst_type in node_types
        }
        for src_name, src_type in node_types
    }

    def path_lengths(paths):
        # ``paths`` is indexed as paths[source][target] -> node sequence
        return [len(paths[i][j]) - 1 for i in paths for j in paths[i]]

    distributions = dict()
    # Path length distributions of derivations
    distributions["derivations"] = path_lengths(
        paths_select(g, ProvEntity, ProvEntity, ProvDerivation))

    # Path length distributions of usages
    distributions["activity_entity"] = path_lengths(
        paths_select(g, ProvActivity, ProvEntity,
                     (ProvDerivation, ProvUsage)))

    # Path length distributions of attributions
    distributions["entity_agent"] = path_lengths(
        paths_select(g, ProvEntity, ProvAgent,
                     (ProvDerivation, ProvAttribution)))

    # Node degree distribution (undirected)
    distributions["node_degrees"] = list(dict(ug.degree()).values())

    results["distributions"] = distributions

    # The power law exponent of node degrees
    power_law_fit = powerlaw.Fit(distributions["node_degrees"],
                                 discrete=True,
                                 verbose=False)

    if not math.isnan(power_law_fit.alpha):
        # Check if the distribution is likely to be following the power law
        R, p = power_law_fit.distribution_compare("power_law", "exponential")
        if R > 0 and p < 0.05:
            results["node_degrees_powerlaw"] = {
                "alpha": power_law_fit.alpha,
                "sigma": power_law_fit.sigma,
            }

    if flat:
        return flatten_v4(results)
    return results
def version1(prov_doc):
    """Compute the version-1 set of network metrics for a PROV document.

    Args:
        prov_doc: The document; it is always passed to ``prov_to_graph``.

    Returns:
        A dict with these keys:

        * ``nodes`` / ``edges``: graph size.
        * ``components``: connected components of the undirected graph.
        * ``diameter``: diameter of the undirected graph, or ``-1`` when the
          graph does not have exactly one component.
        * ``mfd``: for each ordered pair of node types, the longest shortest
          path between distinct nodes of those types (``0`` when none).
        * ``distributions``: path-length lists under ``derivations``,
          ``activity_entities`` and ``entities_agent``.
    """
    results = dict()

    g = prov_to_graph(prov_doc)
    # Graph size
    results["nodes"] = g.number_of_nodes()
    results["edges"] = g.size()

    ug = nx.Graph(g)
    n_comps = nx.number_connected_components(ug)
    results["components"] = n_comps
    # ``ug`` already is an nx.Graph, so it is used directly (no second copy)
    results["diameter"] = nx.diameter(ug) if n_comps == 1 else -1

    # The result is indexed as ``s_paths[source][target]`` below.  NetworkX
    # has announced (FutureWarning) that calling shortest_path without source
    # and target will yield (node, paths) pairs instead of returning a dict;
    # dict() accepts both forms.
    s_paths = dict(nx.shortest_path(g))

    def lengths(graph, t1, t2):
        return [
            len(s_paths[i][j]) - 1
            for i in node_select(graph, t1) if i in s_paths
            for j in node_select(graph, t2)
            if j in s_paths[i] and i != j
        ]

    def mfd(graph, t1, t2):
        s_distances = lengths(graph, t1, t2)
        return max(s_distances) if s_distances else 0

    node_types = (
        ("entity", ProvEntity),
        ("activity", ProvActivity),
        ("agent", ProvAgent),
    )
    results["mfd"] = {
        src_name: {
            dst_name: mfd(g, src_type, dst_type)
            for dst_name, dst_type in node_types
        }
        for src_name, src_type in node_types
    }

    def path_lengths(paths):
        # ``paths`` is indexed as paths[source][target] -> node sequence
        return [len(paths[i][j]) - 1 for i in paths for j in paths[i]]

    distributions = dict()
    distributions["derivations"] = path_lengths(
        paths_select(g, ProvEntity, ProvEntity, ProvDerivation))

    distributions["activity_entities"] = path_lengths(
        paths_select(g, ProvActivity, ProvEntity,
                     (ProvDerivation, ProvUsage)))

    # NOTE(review): this entity -> agent selection follows ProvGeneration
    # relations; confirm that is intended rather than an attribution relation.
    # Left unchanged so version-1 results stay reproducible.
    distributions["entities_agent"] = path_lengths(
        paths_select(g, ProvEntity, ProvAgent,
                     (ProvDerivation, ProvGeneration)))

    results["distributions"] = distributions

    return results
def version2(prov_doc):
    """Compute the version-2 set of network metrics for a provenance graph.

    Args:
        prov_doc: A ``ProvDocument`` (converted with ``prov_to_graph``) or any
            other object, which is used directly as the graph without being
            checked.

    Returns:
        A dict with these keys:

        * ``nodes`` / ``edges``: graph size.
        * ``components``: connected components of the undirected graph.
        * ``diameter``: diameter of the undirected graph, or ``-1`` when the
          graph does not have exactly one component.
        * ``mfd``: for each ordered pair of node types, the longest shortest
          path between distinct nodes of those types (``0`` when none).
        * ``distributions``: path-length lists under ``derivations``,
          ``activity_entities`` and ``entities_agent``, plus the list of
          undirected ``node_degrees``.
        * ``node_degrees_powerlaw``: only present when the power-law fit has a
          non-NaN alpha and beats the exponential fit (``R > 0``, ``p < 0.05``).
    """
    results = dict()

    if isinstance(prov_doc, ProvDocument):
        g = prov_to_graph(prov_doc)
    else:
        g = prov_doc
    # Graph size
    results["nodes"] = g.number_of_nodes()
    results["edges"] = g.size()

    ug = nx.Graph(g)
    n_comps = nx.number_connected_components(ug)
    results["components"] = n_comps
    # ``ug`` already is an nx.Graph, so it is used directly (no second copy)
    results["diameter"] = nx.diameter(ug) if n_comps == 1 else -1

    # The result is indexed as ``s_paths[source][target]`` below.  NetworkX
    # has announced (FutureWarning) that calling shortest_path without source
    # and target will yield (node, paths) pairs instead of returning a dict;
    # dict() accepts both forms.
    s_paths = dict(nx.shortest_path(g))

    def lengths(graph, t1, t2):
        return [
            len(s_paths[i][j]) - 1
            for i in node_select(graph, t1) if i in s_paths
            for j in node_select(graph, t2)
            if j in s_paths[i] and i != j
        ]

    def mfd(graph, t1, t2):
        s_distances = lengths(graph, t1, t2)
        return max(s_distances) if s_distances else 0

    node_types = (
        ("entity", ProvEntity),
        ("activity", ProvActivity),
        ("agent", ProvAgent),
    )
    results["mfd"] = {
        src_name: {
            dst_name: mfd(g, src_type, dst_type)
            for dst_name, dst_type in node_types
        }
        for src_name, src_type in node_types
    }

    def path_lengths(paths):
        # ``paths`` is indexed as paths[source][target] -> node sequence
        return [len(paths[i][j]) - 1 for i in paths for j in paths[i]]

    distributions = dict()
    # Path length distributions of derivations
    distributions["derivations"] = path_lengths(
        paths_select(g, ProvEntity, ProvEntity, ProvDerivation))

    # Path length distributions of usages
    distributions["activity_entities"] = path_lengths(
        paths_select(g, ProvActivity, ProvEntity,
                     (ProvDerivation, ProvUsage)))

    # Path length distributions of attributions
    distributions["entities_agent"] = path_lengths(
        paths_select(g, ProvEntity, ProvAgent,
                     (ProvDerivation, ProvAttribution)))

    # Node degree distribution (undirected).  Materialised as a list: on
    # Python 3 ``dict.values()`` is a view, which is neither a sequence for
    # the fitting code below nor a plain value to keep in the results.
    distributions["node_degrees"] = list(dict(ug.degree()).values())

    results["distributions"] = distributions

    # The power law exponent of node degrees
    power_law_fit = powerlaw.Fit(distributions["node_degrees"],
                                 discrete=True,
                                 verbose=False)

    if not math.isnan(power_law_fit.alpha):
        # Check if the distribution is likely to be following the power law
        R, p = power_law_fit.distribution_compare("power_law", "exponential")
        if R > 0 and p < 0.05:
            results["node_degrees_powerlaw"] = {
                "alpha": power_law_fit.alpha,
                "sigma": power_law_fit.sigma,
            }
    return results