Example #1
0
def test_threshold():
    """Thresholding a swept ranker should do no worse (conductance-wise) than a gap transform."""
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        train, _test = pg.split(list(group), training_samples=0.5)
        seeds = {node: 1 for node in train}
        measure = pg.Conductance()
        # rank-based API
        swept = pg.Threshold(pg.Sweep(pg.PageRank())).rank(graph, seeds)
        # transform-based API (try all api types)
        gapped = pg.Threshold("gap").transform(pg.PageRank().rank(graph, seeds))
        assert measure.evaluate(swept) <= measure.evaluate(gapped)
Example #2
0
def test_unsupervised_vs_auc():
    """Among unsupervised measures, LinkAUC (dot similarity) should correlate best with AUC.

    Benchmarks many filter variations, scores them with supervised and
    unsupervised measures, and checks that the unsupervised measure most
    rank-correlated with AUC is "LinkAUCdot".
    """
    def loader():
        return pg.load_datasets_multiple_communities(["graph9"])

    algorithms = pg.create_variations(pg.create_many_filters(), pg.create_many_variation_types())
    time_scores = pg.benchmark_scores(pg.benchmark(algorithms, loader(), pg.Time))
    assert sum(time_scores) > 0

    # fix: lambda parameter was misspelled "exlude" (internal name only; callers unaffected)
    measures = {"AUC": lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
                "NDCG": lambda ground_truth, exclude: pg.MultiSupervised(pg.NDCG, ground_truth, exclude),
                "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
                "Conductance": lambda graph: pg.MultiUnsupervised(pg.Conductance(autofix=True).as_unsupervised_method(), graph),
                "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity(max_positive_samples=5).as_unsupervised_method(), graph),
                "CCcos": lambda graph: pg.ClusteringCoefficient(graph, similarity="cos", max_positive_samples=5),
                "CCdot": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot", max_positive_samples=5),
                "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", max_positive_samples=5),
                "LinkAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", max_positive_samples=5),
                "HopAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", hops=2, max_positive_samples=5),
                "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2, max_positive_samples=5),
                }

    scores = {}
    for measure in measures:  # do this as a for loop, because pytest becomes a little slow above list comprehension
        scores[measure] = pg.benchmark_scores(pg.benchmark(algorithms, loader(), measures[measure]))
    supervised = {"AUC", "NDCG"}
    evaluations = dict()
    for measure in measures:
        # absolute Spearman correlation between each measure's scores and the AUC scores
        evaluations[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    assert max([evaluations[measure] for measure in measures if measure not in supervised]) == evaluations["LinkAUCdot"]
Example #3
0
def test_best_direction():
    """Conductance is minimized (-1); all other checked measures are maximized (+1)."""
    assert pg.Conductance().best_direction() == -1
    maximized = (pg.Density(), pg.Modularity(),
                 pg.AUC([1, 2, 3]), pg.Cos([1, 2, 3]), pg.Dot([1, 2, 3]),
                 pg.TPR([1, 2, 3]), pg.TNR([1, 2, 3]))
    for measure in maximized:
        assert measure.best_direction() == 1
Example #4
0
def test_edge_cases():
    """Degenerate inputs either yield well-defined fallback values or raise."""
    assert pg.pRule([0])([0]) == 0
    assert pg.Cos([0])([0]) == 0
    # each of these calls is expected to raise
    failing_calls = [
        lambda: pg.Measure()([0, 1, 0]),
        lambda: pg.AUC([0, 0, 0])([0, 1, 0]),
        lambda: pg.AUC([1, 1, 1])([0, 1, 0]),
        lambda: pg.KLDivergence([0], exclude={"A": 1})([1]),
        lambda: pg.Conductance(next(pg.load_datasets_graph(["graph5"])),
                               max_rank=0.5)([1, 1, 1, 1, 1]),
    ]
    for call in failing_calls:
        with pytest.raises(Exception):
            call()
    import networkx as nx
    for _ in supported_backends():
        empty = nx.Graph()
        assert pg.Conductance(empty)([]) == float("inf")  # this is indeed correct in python
        assert pg.Density(empty)([]) == 0
        assert pg.Modularity(empty)([]) == 0
        assert pg.KLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.MKLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.KLDivergence([0])([-1]) == 0
Example #5
0
def test_threshold():
    """Gap-thresholded sweep <= fixed 0.3 threshold <= everything-included (threshold 1)."""
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        train, _test = pg.split(list(group), training_samples=0.5)
        seeds = {node: 1 for node in train}
        base = pg.PageRank()
        measure = pg.Conductance()
        # exercise all api types: rank-based and transform-based
        cond_gap = measure.evaluate(
            pg.Threshold(pg.Sweep(base), "gap").rank(graph, seeds))
        cond_fixed = measure.evaluate(
            pg.Threshold(0.3).transform(base.rank(graph, seeds)))
        cond_all = measure.evaluate(
            pg.Threshold(1).transform(base.rank(graph, seeds)))  # should yield infinite conductance
        # TODO: find an algorithm other than gap to outperform 0.2 threshold too
        assert cond_gap <= cond_fixed <= cond_all
Example #6
0
def test_krylov_space_oversampling():
    """Seed oversampling should not degrade conductance of a Krylov-approximated heat kernel."""
    # this demonstrates a highly complicated setting
    _, graph, community = next(pg.load_datasets_one_community(["bigraph"]))
    base = pg.HeatKernel(
        t=5,  # the number of hops away HeatKernel places maximal importance on
        krylov_dims=5,
        normalization="symmetric",
        renormalize=True)
    for _ in supported_backends():
        seeds = {node: 1. for node in list(community)[:10]}
        oversampled = pg.SeedOversampling(base)
        pg.Normalize(oversampled)(graph, seeds)
        conductance = pg.Conductance()
        plain_score = conductance(pg.Normalize(base)(graph, seeds))
        oversampled_score = conductance(pg.Normalize(oversampled)(graph, seeds))
        assert plain_score >= oversampled_score - 5.E-6  # small tolerance for numerical noise
Example #7
0
def overlapping_community_detection(graph, known_members, top=None):
    """Score graph nodes by structural proximity to known community members.

    When ``top`` is given, returns that many best-scoring nodes outside the
    known members; otherwise returns every node whose score exceeds a
    conductance-optimized threshold.
    """
    # small seed sets get a fixed PageRank; larger ones get a tuned filter
    if len(known_members) < 50:
        base_filter = pg.PageRank(0.9)
    else:
        base_filter = pg.ParameterTuner().tune(graph, known_members)
    seed_signal = pg.to_signal(graph, {member: 1 for member in known_members})
    ranks = seed_signal >> pg.Sweep(base_filter) >> pg.Normalize("range")
    if top is not None:
        # set known member scores to zero
        ranks = ranks * (1 - pg.to_signal(graph, {member: 1 for member in known_members}))
        # return specific number of top predictions
        return sorted(list(graph), key=lambda node: -ranks[node])[:top]

    threshold = pg.optimize(
        max_vals=[1],
        loss=lambda params: pg.Conductance(graph)(pg.Threshold(params[0]).transform(ranks)))[0]
    members = set(known_members)
    return [node for node in graph if ranks[node] > threshold and node not in members]
Example #8
0
import pygrank as pg

# Load a graph with one known community and seed scores from its members.
_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
algorithm = pg.HeatKernel(
    t=5,  # the number of hops away HeatKernel places maximal importance on
    normalization="symmetric",
    renormalize=True)
personalization = {node: 1. for node in community}  # ignored nodes assumed to be zeroes
# Base algorithms, their sweep variations, and normalization of everything.
algorithms = {"HK5": algorithm, "HK5+Oversampling": pg.SeedOversampling(algorithm)}
algorithms = algorithms | pg.create_variations(algorithms, {"+Sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, {"": pg.Normalize})

measure = pg.Conductance()
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    print(algorithm_name, measure(scores))
import pygrank as pg

# Heat kernel base ranker plus oversampling/sweep variations, all normalized.
algorithm = pg.HeatKernel(
    t=5,  # the number of hops to place maximal importance on
    normalization="symmetric",
    renormalize=True)
algorithms = {"hk5": algorithm, "hk5+oversampling": pg.SeedOversampling(algorithm)}
algorithms = algorithms | pg.create_variations(algorithms, {"+sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, pg.Normalize)

_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
personalization = {node: 1. for node in community}  # missing scores considered zero
measure = pg.Conductance()  # smaller means tightly-knit stochastic community
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    pg.benchmark_print_line(algorithm_name, measure(scores), tabs=[20, 5])  # pretty