def test_algorithm_selection():
    for _ in supported_backends():  # test helper that iterates over the available computational backends
        _, graph, communities = next(pg.load_datasets_multiple_communities(["bigraph"], max_group_number=3))
        train, test = pg.split(communities, 0.05)  # 5% of community members are known
        algorithms = pg.create_variations(pg.create_demo_filters(), pg.Normalize)

        supervised_algorithm = pg.AlgorithmSelection(algorithms.values(), measure=pg.AUC, tuning_backend="numpy")
        print(supervised_algorithm.cite())
        modularity_algorithm = pg.AlgorithmSelection(algorithms.values(),
                                                     fraction_of_training=1,
                                                     measure=pg.Modularity().as_supervised_method(),
                                                     tuning_backend="numpy")

        supervised_aucs = list()
        modularity_aucs = list()
        for seeds, members in zip(train.values(), test.values()):
            measure = pg.AUC(members, exclude=seeds)
            supervised_aucs.append(measure(supervised_algorithm(graph, seeds)))
            modularity_aucs.append(measure(modularity_algorithm(graph, seeds)))

        # Supervised (AUC-based) and unsupervised (modularity-based) selection should agree on average.
        assert abs(sum(supervised_aucs) / len(supervised_aucs)
                   - sum(modularity_aucs) / len(modularity_aucs)) < 0.05

def test_explicit_citations():
    assert "unknown node ranking algorithm" == pg.NodeRanking().cite()
    assert r"with parameters tuned \cite{krasanakis2021pygrank}" in pg.ParameterTuner(lambda params: pg.PageRank(params[0])).cite()
    assert "Postprocessor" in pg.Postprocessor().cite()
    assert pg.PageRank().cite() in pg.AlgorithmSelection().cite()
    assert "krasanakis2021pygrank" in pg.ParameterTuner().cite()
    assert "ortega2018graph" in pg.ParameterTuner().cite()
    assert pg.HeatKernel().cite() in pg.SeedOversampling(pg.HeatKernel()).cite()
    assert pg.AbsorbingWalks().cite() in pg.BoostedSeedOversampling(pg.AbsorbingWalks()).cite()
    assert "krasanakis2018venuerank" in pg.BiasedKernel(converge_to_eigenvectors=True).cite()
    assert "yu2021chebyshev" in pg.HeatKernel(coefficient_type="chebyshev").cite()
    assert "susnjara2015accelerated" in pg.HeatKernel(krylov_dims=5).cite()
    assert "krasanakis2021pygrank" in pg.GenericGraphFilter(optimization_dict=dict()).cite()
    assert "tautology" in pg.Tautology().cite()
    assert pg.PageRank().cite() == pg.Tautology(pg.PageRank()).cite()
    assert "mabs" in pg.MabsMaintain(pg.PageRank()).cite()
    assert "max normalization" in pg.Normalize(pg.PageRank()).cite()
    assert "[0,1] range" in pg.Normalize(pg.PageRank(), "range").cite()
    assert "ordinal" in pg.Ordinals(pg.PageRank()).cite()
    assert "exp" in pg.Transformer(pg.PageRank()).cite()
    assert "0.5" in pg.Threshold(pg.PageRank(), 0.5).cite()
    assert "andersen2007local" in pg.Sweep(pg.PageRank()).cite()
    assert pg.HeatKernel().cite() in pg.Sweep(pg.PageRank(), pg.HeatKernel()).cite()
    assert "LFPRO" in pg.AdHocFairness("O").cite()
    assert "LFPRO" in pg.AdHocFairness(pg.PageRank(), "LFPRO").cite()
    assert "multiplicative" in pg.AdHocFairness(pg.PageRank(), "B").cite()
    assert "multiplicative" in pg.AdHocFairness(pg.PageRank(), "mult").cite()
    assert "tsioutsiouliklis2020fairness" in pg.AdHocFairness().cite()
    assert "rahman2019fairwalk" in pg.FairWalk(pg.PageRank()).cite()
    assert "krasanakis2020prioredit" in pg.FairPersonalizer(pg.PageRank()).cite()

def test_all_communities_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }
    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}

    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.AUC,
                                    fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")

    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule,
                                    fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")

    # Disparate mistreatment: average TPR and TNR disparities between the sensitive group
    # and its complement (see the readable equivalent sketched after this test).
    mistreatment = lambda known_scores, sensitive_signal, exclude: \
        pg.AM([pg.Disparity([pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))]),
               pg.Disparity([pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))])])
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=mistreatment,
                                    fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")

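# A behaviorally equivalent, more readable form of the `mistreatment` lambda above.
# This is an illustrative sketch only: the helper name `mistreatment_readable` and its
# variable names are not part of the test suite; all pg.* calls are taken from the test.
def mistreatment_readable(known_scores, sensitive_signal, exclude):
    # Exclusion masks are 1 for nodes left out of evaluation; (1 - exclude.np) marks test nodes,
    # so each mask below keeps only the test nodes of one group.
    exclude_all_but_sensitive = 1 - (1 - exclude.np) * sensitive_signal.np
    exclude_all_but_non_sensitive = 1 - (1 - exclude.np) * (1 - sensitive_signal.np)
    return pg.AM([
        pg.Disparity([pg.TPR(known_scores, exclude=exclude_all_but_sensitive),
                      pg.TPR(known_scores, exclude=exclude_all_but_non_sensitive)]),
        pg.Disparity([pg.TNR(known_scores, exclude=exclude_all_but_sensitive),
                      pg.TNR(known_scores, exclude=exclude_all_but_non_sensitive)]),
    ])
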
def test_autotune_methods():
    import numpy as np
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}))
    aucs = [pg.AUC(evaluation, exclude=training)(ranker.rank(training))
            for ranker in pg.create_demo_filters().values()]
    auc2 = pg.AUC(evaluation, exclude=training)(pg.AlgorithmSelection().rank(training))
    # Algorithm selection should score within one standard deviation of the best demo filter.
    assert max(aucs) - np.std(aucs) <= auc2

def test_multigroup_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }
    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}

    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader,
                                    lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
                                    fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")

    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule,
                                    fraction_of_training=.8, seed=list(range(1))),
                       decimals=3, delimiter=" & ", end_line="\\\\")

def test_autotune_citations():
    assert pg.ParameterTuner().cite() != pg.GenericGraphFilter().cite()
    assert pg.HopTuner().cite() != pg.GenericGraphFilter().cite()
    assert pg.AlgorithmSelection().cite() != pg.GenericGraphFilter().cite()

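# The .cite() strings asserted above can be aggregated into a deduplicated reference list
# for a paper's bibliography; a minimal sketch (the helper `collect_citations` is
# illustrative, not a pygrank API):
def collect_citations(*rankers):
    return "\n".join(sorted({ranker.cite() for ranker in rankers}))

# Example: print(collect_citations(pg.ParameterTuner(), pg.HopTuner(), pg.AlgorithmSelection()))
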
import pygrank as pg

datasets = ["EUCore", "Amazon"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
}
algs = algs | pg.create_variations(algs, {"+Sweep": pg.Sweep})
loader = pg.load_datasets_one_community(datasets)
algs["tuned"] = pg.ParameterTuner(preprocessor=pre, tol=1.E-9, max_iters=1000)
algs["selected"] = pg.AlgorithmSelection(pg.create_demo_filters(preprocessor=pre, tol=1.E-9, max_iters=1000).values())
algs["tuned+Sweep"] = pg.ParameterTuner(
    ranker_generator=lambda params: pg.Sweep(
        pg.GenericGraphFilter(params, preprocessor=pre, tol=1.E-9, max_iters=1000)))

for alg in algs.values():
    print(alg.cite())  # prints a list of algorithm citations

pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5),
                   delimiter=" & ", end_line="\\\\")

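# For quick iteration, one might benchmark only a couple of the above algorithms on a
# single dataset. A sketch reusing the loader, measure, and printing conventions of the
# script above; the name `quick` and the dataset/algorithm choices are illustrative:
quick = {name: algs[name] for name in ["ppr.85", "selected"]}
pg.benchmark_print(pg.benchmark(quick, pg.load_datasets_one_community(["EUCore"]), pg.AUC, fraction_of_training=.5),
                   delimiter=" & ", end_line="\\\\")
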
def create_param_tuner(optimizer=pg.optimize):
    return pg.ParameterTuner(
        lambda params: pg.Normalize(postprocessor(
            pg.GenericGraphFilter([1] + params, preprocessor=pre, error_type="iters",
                                  max_iters=41, optimization_dict=optimization, preserve_norm=False))),
        deviation_tol=1.E-6, measure=measure, optimizer=optimizer,
        max_vals=[1] * 40, min_vals=[0] * 40)

tuned = {
    "select": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.9, measure=measure),
    "tune": create_param_tuner(),
    "tuneLBFGSB": create_param_tuner(pg.lbfgsb),
}

# Print dataset statistics as LaTeX table rows.
for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3):
    print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]]) + " \\\\")

loader = pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3)
pg.benchmark_print(
    pg.benchmark_average(
        pg.benchmark(algorithms | tuned, loader, measure,
                     fraction_of_training=[0.1, 0.2, 0.3], seed=list(range(1))),
        posthocs=True),
    decimals=3, delimiter=" & ", end_line="\\\\")

import pygrank as pg

_, graph, communities = next(pg.load_datasets_multiple_communities(["EUCore"], max_group_number=3))
train, test = pg.split(communities, 0.05)  # 5% of community members are known
algorithms = pg.create_variations(pg.create_demo_filters(), pg.Normalize)

supervised_algorithm = pg.AlgorithmSelection(algorithms.values(), measure=pg.AUC)
print(supervised_algorithm.cite())
modularity_algorithm = pg.AlgorithmSelection(algorithms.values(),
                                             fraction_of_training=1,
                                             measure=pg.Modularity().as_supervised_method())

linkauc_algorithm = None
best_evaluation = 0
linkAUC = pg.LinkAssessment(graph, similarity="cos", hops=1)  # LinkAUC, because emails systemically exhibit homophily
for algorithm in algorithms.values():
    evaluation = linkAUC.evaluate({community: algorithm(graph, seeds) for community, seeds in train.items()})
    if evaluation > best_evaluation:
        best_evaluation = evaluation
        linkauc_algorithm = algorithm

supervised_aucs = list()
modularity_aucs = list()
linkauc_aucs = list()
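
# The excerpt stops after initializing the three lists. A minimal sketch of how the
# evaluation loop could continue, mirroring test_algorithm_selection above; this
# continuation is assumed, not part of the original script:
for seeds, members in zip(train.values(), test.values()):
    measure = pg.AUC(members, exclude=seeds)
    supervised_aucs.append(measure(supervised_algorithm(graph, seeds)))
    modularity_aucs.append(measure(modularity_algorithm(graph, seeds)))
    linkauc_aucs.append(measure(linkauc_algorithm(graph, seeds)))
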
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
tol = 1.E-9
optimization = pg.SelfClearDict()
algorithms = {
    "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
    "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
    "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
    "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
}
# algorithms = algorithms | pg.benchmarks.create_variations(algorithms, {"+sweep": pg.Sweep})
tuned = {
    "selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8),
    # "tuned": pg.ParameterTuner(preprocessor=pre, fraction_of_training=0.8, tol=tol,
    #                            optimization_dict=optimization, measure=pg.AUC),
    "arnoldi": pg.HopTuner(preprocessor=pre, basis="arnoldi", measure=pg.Cos, tol=tol, optimization_dict=optimization),
    # "arnoldi2": pg.ParameterTuner(lambda params: pg.HopTuner(preprocessor=pre, basis="arnoldi",
    #                                                          num_parameters=int(params[0]), measure=pg.Cos,
    #                                                          tol=tol, optimization_dict=optimization,
    #                                                          tunable_offset=None),
    #                               max_vals=[40], min_vals=[5], divide_range=2, fraction_of_training=0.1),
}
# algorithms = pg.create_variations(algorithms, {"": pg.Tautology, "+Sweep": pg.Sweep})
# print(algorithms.keys())
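
# The excerpt ends before any benchmark call. A minimal continuation sketch under the
# loader/measure conventions of the tests above; the "bigraph" dataset and pg.AUC measure
# are assumed here, not taken from this excerpt:
loader = pg.load_datasets_all_communities(["bigraph"], min_group_size=50)
pg.benchmark_print(pg.benchmark(algorithms | tuned, loader, pg.AUC,
                                fraction_of_training=.8, seed=list(range(1))),
                   decimals=3, delimiter=" & ", end_line="\\\\")
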
postprocessor = pg.Tautology
algorithms = pg.benchmarks.create_variations(algorithms, postprocessor)
measure = pg.Time
optimization = pg.SelfClearDict()

def create_param_tuner(optimizer=pg.optimize):
    return pg.ParameterTuner(
        lambda params: pg.Normalize(postprocessor(
            pg.GenericGraphFilter([1] + params, preprocessor=pre, error_type="iters",
                                  max_iters=41, optimization_dict=optimization, preserve_norm=False))),
        deviation_tol=1.E-6, measure=measure, optimizer=optimizer,
        max_vals=[1] * 40, min_vals=[0] * 40)

tuned = {
    "select": pg.AlgorithmSelection(algorithms.values()),  # optionally: combined_prediction=False
    # "tune": create_param_tuner(),
    # "tuneLBFGSB": create_param_tuner(pg.lbfgsb),
}

# Print dataset statistics as LaTeX table rows.
for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3):
    print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]]) + " \\\\")

loader = pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3)
pg.benchmark_print(
    pg.benchmark_average(
        pg.benchmark(algorithms | tuned, loader, measure,
                     fraction_of_training=[0.1, 0.2, 0.3], seed=list(range(1))),
        posthocs=True),
    decimals=3, delimiter=" & ", end_line="\\\\")