def test_threshold():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.5)
        cond1 = pg.Conductance().evaluate(
            pg.Threshold(pg.Sweep(pg.PageRank())).rank(graph, {v: 1 for v in training}))
        cond2 = pg.Conductance().evaluate(
            pg.Threshold("gap").transform(pg.PageRank().rank(graph, {v: 1 for v in training})))  # try all api types
        assert cond1 <= cond2
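# A minimal sketch (not part of the test above) of the same pipeline written with
# pygrank's chained ">>" signal syntax, which also appears later in this section.
# It assumes Threshold("gap") can be chained like the other postprocessors shown here.
def sketch_threshold_chaining():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    training, evaluation = pg.split(list(group), training_samples=0.5)
    # pass a personalization signal through Sweep and then a "gap" Threshold
    ranks = pg.to_signal(graph, {v: 1 for v in training}) >> pg.Sweep(pg.PageRank()) >> pg.Threshold("gap")
    return pg.Conductance().evaluate(ranks)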
def test_unsupervised_vs_auc():
    def loader():
        return pg.load_datasets_multiple_communities(["graph9"])

    algorithms = pg.create_variations(pg.create_many_filters(), pg.create_many_variation_types())
    time_scores = pg.benchmark_scores(pg.benchmark(algorithms, loader(), pg.Time))
    assert sum(time_scores) > 0
    measures = {"AUC": lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
                "NDCG": lambda ground_truth, exclude: pg.MultiSupervised(pg.NDCG, ground_truth, exclude),
                "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
                "Conductance": lambda graph: pg.MultiUnsupervised(pg.Conductance(autofix=True).as_unsupervised_method(), graph),
                "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity(max_positive_samples=5).as_unsupervised_method(), graph),
                "CCcos": lambda graph: pg.ClusteringCoefficient(graph, similarity="cos", max_positive_samples=5),
                "CCdot": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot", max_positive_samples=5),
                "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", max_positive_samples=5),
                "LinkAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", max_positive_samples=5),
                "HopAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", hops=2, max_positive_samples=5),
                "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2, max_positive_samples=5),
                }
    # run benchmarks in a plain loop; pytest becomes noticeably slow on the equivalent dict comprehension
    scores = {}
    for measure in measures:
        scores[measure] = pg.benchmark_scores(pg.benchmark(algorithms, loader(), measures[measure]))
    supervised = {"AUC", "NDCG"}
    evaluations = dict()
    for measure in measures:
        evaluations[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    # the unsupervised measure that best correlates with AUC should be LinkAUCdot
    assert max([evaluations[measure] for measure in measures if measure not in supervised]) == evaluations["LinkAUCdot"]
def test_best_direction():
    assert pg.Conductance().best_direction() == -1
    assert pg.Density().best_direction() == 1
    assert pg.Modularity().best_direction() == 1
    assert pg.AUC([1, 2, 3]).best_direction() == 1
    assert pg.Cos([1, 2, 3]).best_direction() == 1
    assert pg.Dot([1, 2, 3]).best_direction() == 1
    assert pg.TPR([1, 2, 3]).best_direction() == 1
    assert pg.TNR([1, 2, 3]).best_direction() == 1
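# Illustrative sketch: best_direction() lets callers compare scores without knowing
# whether a measure is maximized (+1) or minimized (-1). The helper name is hypothetical.
def sketch_best_direction_usage():
    def is_better(measure, score_a, score_b):
        # positive when score_a improves on score_b in the measure's preferred direction
        return (score_a - score_b) * measure.best_direction() > 0

    assert is_better(pg.Density(), 0.5, 0.2)      # higher density is better
    assert is_better(pg.Conductance(), 0.2, 0.5)  # lower conductance is better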
def test_edge_cases():
    assert pg.pRule([0])([0]) == 0
    assert pg.Cos([0])([0]) == 0
    with pytest.raises(Exception):
        pg.Measure()([0, 1, 0])
    with pytest.raises(Exception):
        pg.AUC([0, 0, 0])([0, 1, 0])
    with pytest.raises(Exception):
        pg.AUC([1, 1, 1])([0, 1, 0])
    with pytest.raises(Exception):
        pg.KLDivergence([0], exclude={"A": 1})([1])
    with pytest.raises(Exception):
        pg.Conductance(next(pg.load_datasets_graph(["graph5"])), max_rank=0.5)([1, 1, 1, 1, 1])
    import networkx as nx
    for _ in supported_backends():
        assert pg.Conductance(nx.Graph())([]) == float("inf")  # this is indeed correct in python
        assert pg.Density(nx.Graph())([]) == 0
        assert pg.Modularity(nx.Graph())([]) == 0
        assert pg.KLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.MKLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.KLDivergence([0])([-1]) == 0
def test_threshold():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.5)
        algorithm = pg.PageRank()
        cond1 = pg.Conductance().evaluate(
            pg.Threshold(pg.Sweep(algorithm), "gap").rank(graph, {v: 1 for v in training}))
        cond2 = pg.Conductance().evaluate(
            pg.Threshold(0.3).transform(algorithm.rank(graph, {v: 1 for v in training})))  # try all api types
        cond3 = pg.Conductance().evaluate(
            pg.Threshold(1).transform(algorithm.rank(graph, {v: 1 for v in training})))  # should yield infinite conductance
        # TODO: find an algorithm other than gap to outperform the 0.3 threshold too
        assert cond1 <= cond2
        assert cond2 <= cond3
def test_krylov_space_oversampling():
    # this demonstrates a highly complicated setting
    _, graph, community = next(pg.load_datasets_one_community(["bigraph"]))
    algorithm = pg.HeatKernel(
        t=5,  # the number of hops away HeatKernel places maximal importance on
        krylov_dims=5,
        normalization="symmetric", renormalize=True)
    for _ in supported_backends():
        personalization = {node: 1. for node in list(community)[:10]}
        oversampling = pg.SeedOversampling(algorithm)
        pg.Normalize(oversampling)(graph, personalization)
        measure = pg.Conductance()
        assert measure(pg.Normalize(algorithm)(graph, personalization)) \
               >= measure(pg.Normalize(oversampling)(graph, personalization)) - 5.E-6
def overlapping_community_detection(graph, known_members, top=None):
    graph_filter = pg.PageRank(0.9) if len(known_members) < 50 else pg.ParameterTuner().tune(graph, known_members)
    ranks = pg.to_signal(graph, {v: 1 for v in known_members}) >> pg.Sweep(graph_filter) >> pg.Normalize("range")
    if top is not None:
        ranks = ranks * (1 - pg.to_signal(graph, {v: 1 for v in known_members}))  # set known member scores to zero
        return sorted(list(graph), key=lambda node: -ranks[node])[:top]  # return specific number of top predictions
    threshold = pg.optimize(max_vals=[1], loss=lambda p: pg.Conductance(graph)(pg.Threshold(p[0]).transform(ranks)))[0]
    known_members = set(known_members)
    return [v for v in graph if ranks[v] > threshold and v not in known_members]
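# Hedged usage sketch for overlapping_community_detection above; the dataset loader
# mirrors the tests in this section and the 10-seed slice is an arbitrary choice.
import pygrank as pg

_, graph, community = next(pg.load_datasets_one_community(["bigraph"]))
known_members = list(community)[:10]  # pretend only a few members are known
# top=None: expand the community via a conductance-optimized score threshold
print(len(overlapping_community_detection(graph, known_members)), "expansion candidates")
# top=3: return the three highest-scoring nodes outside the known members
print(overlapping_community_detection(graph, known_members, top=3))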
import pygrank as pg

_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
algorithm = pg.HeatKernel(
    t=5,  # the number of hops away HeatKernel places maximal importance on
    normalization="symmetric", renormalize=True)
personalization = {node: 1. for node in community}  # ignored nodes are assumed to be zeroes
algorithms = {"HK5": algorithm, "HK5+Oversampling": pg.SeedOversampling(algorithm)}
algorithms = algorithms | pg.create_variations(algorithms, {"+Sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, {"": pg.Normalize})
measure = pg.Conductance()
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    print(algorithm_name, measure(scores))
import pygrank as pg

algorithm = pg.HeatKernel(
    t=5,  # the number of hops to place maximal importance on
    normalization="symmetric", renormalize=True)
algorithms = {"hk5": algorithm, "hk5+oversampling": pg.SeedOversampling(algorithm)}
algorithms = algorithms | pg.create_variations(algorithms, {"+sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, pg.Normalize)
_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
personalization = {node: 1. for node in community}  # missing scores are considered zero
measure = pg.Conductance()  # smaller means a more tightly-knit stochastic community
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    pg.benchmark_print_line(algorithm_name, measure(scores), tabs=[20, 5])  # pretty printing