def test_one_community_benchmarks():
    pg.load_backend("numpy")
    datasets = ["graph9", "bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=1.E-9),
        "tuned": pg.ParameterTuner(preprocessor=pre, max_iters=10000, tol=1.E-9),
    }
    # algorithms = benchmark.create_variations(algorithms, {"": pg.Tautology, "+SO": pg.SeedOversampling})
    # loader = pg.load_datasets_one_community(datasets)
    # pg.benchmark(algorithms, loader, "time", verbose=True)
    loader = pg.load_datasets_one_community(datasets)
    pg.benchmark_print(
        pg.benchmark_average(
            pg.benchmark_ranks(
                pg.benchmark(algorithms, loader, pg.AUC, fraction_of_training=.8))))
def test_multigroup_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }
    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}
    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader,
                     lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
    loader = pg.load_datasets_multiple_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule,
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
def test_all_communities_benchmarks():
    datasets = ["bigraph"]
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    tol = 1.E-9
    optimization = pg.SelfClearDict()
    algorithms = {
        "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.9": pg.PageRank(alpha=0.9, preprocessor=pre, max_iters=10000, tol=tol),
        "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=tol),
        "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk5": pg.HeatKernel(t=5, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
        "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=tol, optimization_dict=optimization),
    }
    tuned = {"selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8)}
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader, pg.AUC,
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=pg.pRule,
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
    # disparate mistreatment: average the disparity of TPR and of TNR between
    # the sensitive group and its complement, both restricted to test nodes
    mistreatment = lambda known_scores, sensitive_signal, exclude: \
        pg.AM([pg.Disparity([pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))]),
               pg.Disparity([pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                             pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))])])
    loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
    pg.benchmark_print(
        pg.benchmark(algorithms | tuned, loader, pg.Modularity, sensitive=mistreatment,
                     fraction_of_training=.8, seed=list(range(1))),
        decimals=3, delimiter=" & ", end_line="\\\\")
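
# The exclude expressions in the mistreatment lambda above are plain numpy
# mask algebra; a minimal sketch with illustrative values (not library code)
# of what 1 - (1 - exclude) * sensitive keeps: it masks out every node except
# non-excluded members of the sensitive group, so TPR/TNR are measured only
# on that group's test nodes.
import numpy as np

exclude = np.array([1, 0, 0, 0])    # 1 = withheld from evaluation (training seed)
sensitive = np.array([0, 1, 1, 0])  # 1 = member of the sensitive group
mask = 1 - (1 - exclude) * sensitive
print(mask)  # [1 0 0 1] -> only nodes 1 and 2 (test + sensitive) are evaluated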
def test_benchmark_print():
    assert pg.benchmarks.utils._fraction2str(0.1) == ".10"
    assert pg.benchmarks.utils._fraction2str(0.00001) == "0"
    assert pg.benchmarks.utils._fraction2str(1) == "1.00"
    loader = pg.load_datasets_one_community(["graph9", "bigraph"])
    console = pg.benchmark_print(pg.benchmark(pg.create_demo_filters(), loader),
                                 out=io.StringIO(""), err=None).getvalue()
    loader = pg.load_datasets_one_community(["graph9", "bigraph"])
    ret = pg.benchmark_dict(pg.benchmark(pg.create_demo_filters(), loader, sensitive=pg.MannWhitneyParity))
    assert isinstance(ret, dict)
    assert len(ret) == 3
    assert isinstance(ret["graph9"], dict)
    assert (len(str(ret)) - len(console)) < (len(str(ret)) + len(console)) / 2
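
# For context, _fraction2str appears to compactly format fractions for the
# printed benchmark tables. A hypothetical re-implementation consistent with
# the assertions above (an illustrative sketch, not the library's actual code):
def fraction2str(num: float, decimals: int = 2) -> str:
    if round(num, decimals) == 0:
        return "0"  # values that round to zero collapse to "0"
    formatted = f"{num:.{decimals}f}"
    # drop the leading zero of proper fractions, e.g. 0.10 -> ".10"
    return formatted[1:] if formatted.startswith("0.") else formatted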
algorithms = {
    "FairPers": pg.FairPersonalizer(filter, .8, pRule_weight=10, max_residual=1,
                                    error_type=pg.Mabs, error_skewing=False,
                                    parameter_buckets=1, parity_type="impact"),
    # "FFfix-C": pg.FairTradeoff(filter, .8, pRule_weight=10, error_type=pg.Mabs)
    # "FairTf": pg.FairnessTf(filter)
}
algorithms = pg.create_variations(algorithms, {"": pg.Normalize})

# import cProfile as profile
# pr = profile.Profile()
# pr.enable()
mistreatment = lambda known_scores, sensitive_signal, exclude: \
    pg.AM([pg.Disparity([pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                         pg.TPR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))]),
           pg.Disparity([pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * sensitive_signal.np),
                         pg.TNR(known_scores, exclude=1 - (1 - exclude.np) * (1 - sensitive_signal.np))])])
pg.benchmark_print(
    pg.benchmark(algorithms,
                 pg.load_datasets_multiple_communities(datasets, max_group_number=2),
                 metric=pg.AUC, sensitive=pg.pRule,
                 fraction_of_training=seed_fractions),
    delimiter=" & ", end_line="\\\\")
# pr.disable()
# pr.dump_stats('profile.pstat')
class StochasticSeedOversampling(pg.Postprocessor):
    def rank(self, graph: pg.GraphSignalGraph = None,
             personalization: pg.GraphSignalData = None, **kwargs):
        personalization = pg.to_signal(graph, personalization)
        graph = personalization.graph
        ranks = self.ranker(personalization)
        ret = 0
        # pick the smallest score threshold whose cumulative mass
        # exceeds 10% of the total rank mass
        total_sum = pg.sum(ranks)
        accum_sum = 0
        for threshold in sorted(ranks.values()):
            accum_sum += threshold
            if accum_sum > total_sum * 0.1:
                break
        # rerun the base ranker seeded from each node above the threshold
        # and accumulate the outcomes weighted by that node's score
        for i, v in enumerate(ranks):
            pg.utils.log(f"{i}/{len(ranks)}")
            if ranks[v] >= threshold:
                partial = ranks >> pg.Threshold(ranks[v], inclusive=True) >> self.ranker
                ret = partial * ranks[v] + ret
        return ret


algs = {
    "ppr": pg.PageRank(0.9),
    "ppr+so": pg.PageRank(0.9) >> pg.SeedOversampling(),
    "ppr+bso": pg.PageRank(0.9) >> pg.BoostedSeedOversampling(),
    "ppr+sso": pg.PageRank(0.9) >> StochasticSeedOversampling(),
}
loader = pg.load_datasets_one_community(["citeseer"])
pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, 3))
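
# A minimal standalone smoke test of the custom postprocessor above,
# assuming the one-community loader yields (name, graph, group) tuples:
_, graph, group = next(pg.load_datasets_one_community(["citeseer"]))
alg = pg.PageRank(0.9) >> StochasticSeedOversampling()
scores = alg(pg.to_signal(graph, group))
print(pg.sum(scores))  # total rank mass as a quick sanity check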
import pygrank as pg

datasets = ["EUCore", "Amazon"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
}
algs = algs | pg.create_variations(algs, {"+Sweep": pg.Sweep})
loader = pg.load_datasets_one_community(datasets)
algs["tuned"] = pg.ParameterTuner(preprocessor=pre, tol=1.E-9, max_iters=1000)
algs["selected"] = pg.AlgorithmSelection(
    pg.create_demo_filters(preprocessor=pre, tol=1.E-9, max_iters=1000).values())
algs["tuned+Sweep"] = pg.ParameterTuner(
    ranker_generator=lambda params: pg.Sweep(
        pg.GenericGraphFilter(params, preprocessor=pre, tol=1.E-9, max_iters=1000)))

for alg in algs.values():
    print(alg.cite())  # prints a list of algorithm citations

pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5),
                   delimiter=" & ", end_line="\\\\")
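
# To evaluate a single algorithm outside the benchmarking pipeline, pygrank's
# split helper can hold out part of the community; a minimal sketch, assuming
# the loader's (name, graph, group) yield order:
_, graph, group = next(pg.load_datasets_one_community(["EUCore"]))
train, test = pg.split(pg.to_signal(graph, group), 0.5)
scores = pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000)(train)
print(pg.AUC(test, exclude=train)(scores))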
import pygrank as pg

loader = list(pg.load_datasets_multiple_communities(["bigraph", "cora", "citeseer"]))
algorithms = pg.create_variations(pg.create_demo_filters(), pg.create_many_variation_types())
algorithms = pg.create_variations(algorithms, pg.Normalize)  # add normalization to all algorithms
print("Algorithms", len(algorithms))

measures = {
    "AUC": lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
    "NDCG": lambda ground_truth, exclude: pg.MultiSupervised(pg.NDCG, ground_truth, exclude),
    "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
    "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity, graph),
    "LinkCC": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot"),
    "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos"),
    "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2),
}
scores = {measure: pg.benchmark_scores(pg.benchmark(algorithms, loader, measures[measure]))
          for measure in measures}
evaluations_vs_auc = dict()
evaluations_vs_ndcg = dict()
for measure in measures:
    evaluations_vs_auc[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    evaluations_vs_ndcg[measure] = abs(pg.SpearmanCorrelation(scores["NDCG"])(scores[measure]))
pg.benchmark_print([("Measure", "AUC corr", "NDCG corr")]
                   + [(measure, evaluations_vs_auc[measure], evaluations_vs_ndcg[measure])
                      for measure in measures])
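
# Sanity check of the correlation step on illustrative score lists, assuming
# SpearmanCorrelation accepts plain iterables (as the lists produced by
# pg.benchmark_scores above suggest):
auc_like = [0.9, 0.7, 0.8]
density_like = [0.5, 0.1, 0.3]
print(abs(pg.SpearmanCorrelation(auc_like)(density_like)))  # 1.0: identical rankings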
import pygrank as pg

datasets = ["friendster"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")  # common preprocessor
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre),
    "hk5": pg.HeatKernel(5, preprocessor=pre),
    "tuned": pg.ParameterTuner(preprocessor=pre),
}
loader = pg.load_datasets_one_community(datasets)
pg.benchmark_print(pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5))
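
# "friendster" is a large graph, so iteration speed matters; a faster backend
# could be selected at the top of this script (a sketch, assuming the
# corresponding backend package is installed):
pg.load_backend("numpy")  # "tensorflow" or "torch" are other options when installed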
def create_param_tuner(optimizer=pg.optimize):
    return pg.ParameterTuner(
        lambda params: pg.Normalize(
            postprocessor(
                pg.GenericGraphFilter([1] + params, preprocessor=pre, error_type="iters",
                                      max_iters=41, optimization_dict=optimization,
                                      preserve_norm=False))),
        deviation_tol=1.E-6, measure=measure, optimizer=optimizer,
        max_vals=[1] * 40, min_vals=[0] * 40)


tuned = {
    "select": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.9, measure=measure),
    "tune": create_param_tuner(),
    "tuneLBFGSB": create_param_tuner(pg.lbfgsb),
}

for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=community_size,
                                                           max_group_number=3):
    print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]]) + " \\\\")

loader = pg.load_datasets_all_communities(datasets, min_group_size=community_size, max_group_number=3)
pg.benchmark_print(
    pg.benchmark_average(
        pg.benchmark(algorithms | tuned, loader, measure,
                     fraction_of_training=[0.1, 0.2, 0.3], seed=list(range(1))),
        posthocs=True),
    decimals=3, delimiter=" & ", end_line="\\\\")
tuned = {
    "selected": pg.AlgorithmSelection(algorithms.values(), fraction_of_training=0.8),
    # "tuned": pg.ParameterTuner(preprocessor=pre, fraction_of_training=0.8, tol=tol,
    #                            optimization_dict=optimization, measure=pg.AUC),
    "arnoldi": pg.HopTuner(preprocessor=pre, basis="arnoldi", measure=pg.Cos,
                           tol=tol, optimization_dict=optimization),
    # "arnoldi2": pg.ParameterTuner(lambda params: pg.HopTuner(preprocessor=pre, basis="arnoldi",
    #                                                          num_parameters=int(params[0]),
    #                                                          measure=pg.Cos, tol=tol,
    #                                                          optimization_dict=optimization,
    #                                                          tunable_offset=None),
    #                               max_vals=[40], min_vals=[5], divide_range=2, fraction_of_training=0.1),
}
# algorithms = pg.create_variations(algorithms, {"": pg.Tautology, "+Sweep": pg.Sweep})
# print(algorithms.keys())
# for name, graph, group in pg.load_datasets_all_communities(datasets, min_group_size=50):
#     print(" & ".join([str(val) for val in [name, len(graph), graph.number_of_edges(), len(group)]]) + " \\\\")
loader = pg.load_datasets_all_communities(datasets, min_group_size=50)
pg.benchmark_print(
    pg.benchmark(algorithms | tuned, loader, pg.AUC,
                 fraction_of_training=.8, seed=list(range(1))),
    decimals=3, delimiter=" & ", end_line="\\\\")