def test_autorefs():
    """
    Tests that different (base) algorithms yield different citations, that all citations
    have at least one reference to a publication, and that wrapping the same base
    algorithms yields the same citations.
    """
    pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
    algs = {
        "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
        "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
        "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
        "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
        "hk5'": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
    }
    algs = algs | pg.create_variations(
        algs,
        {
            "+Sweep": pg.Sweep,
            "+SO": pg.SeedOversampling,
            "+BSO": pg.BoostedSeedOversampling
        })
    citations = set()
    for alg in algs.values():
        citation = alg.cite()
        assert "\\cite{" in citation
        citations.add(citation)
    assert len(citations) == len(algs) - 4
def test_unsupervised_vs_auc():
    def loader():
        return pg.load_datasets_multiple_communities(["graph9"])

    algorithms = pg.create_variations(pg.create_many_filters(), pg.create_many_variation_types())
    time_scores = pg.benchmark_scores(pg.benchmark(algorithms, loader(), pg.Time))
    assert sum(time_scores) > 0

    measures = {
        "AUC": lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
        "NDCG": lambda ground_truth, exclude: pg.MultiSupervised(pg.NDCG, ground_truth, exclude),
        "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
        "Conductance": lambda graph: pg.MultiUnsupervised(pg.Conductance(autofix=True).as_unsupervised_method(), graph),
        "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity(max_positive_samples=5).as_unsupervised_method(), graph),
        "CCcos": lambda graph: pg.ClusteringCoefficient(graph, similarity="cos", max_positive_samples=5),
        "CCdot": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot", max_positive_samples=5),
        "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", max_positive_samples=5),
        "LinkAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", max_positive_samples=5),
        "HopAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos", hops=2, max_positive_samples=5),
        "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2, max_positive_samples=5),
    }

    scores = {}  # filled by a plain loop instead of a dict comprehension, which makes pytest noticeably slower
    for measure in measures:
        scores[measure] = pg.benchmark_scores(pg.benchmark(algorithms, loader(), measures[measure]))

    supervised = {"AUC", "NDCG"}
    evaluations = dict()
    for measure in measures:
        evaluations[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    # for measure in measures:
    #     print(measure, evaluations[measure])
    assert max([evaluations[measure] for measure in measures if measure not in supervised]) == evaluations["LinkAUCdot"]
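# Hedged aside (not part of the test above): pg.SpearmanCorrelation is used there as a measure that is
# constructed with one list of benchmark scores and then called on another, yielding their rank
# correlation. The toy score lists below are made up purely to illustrate that calling pattern.
import pygrank as pg

auc_scores = [0.9, 0.7, 0.8, 0.6]        # hypothetical per-algorithm AUC values
density_scores = [0.5, 0.1, 0.4, 0.2]    # hypothetical per-algorithm Density values
print(pg.SpearmanCorrelation(auc_scores)(density_scores))  # near 1 when the two lists rank algorithms alike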
def test_algorithm_selection():
    for _ in supported_backends():
        _, graph, communities = next(
            pg.load_datasets_multiple_communities(["bigraph"], max_group_number=3))
        train, test = pg.split(communities, 0.05)  # 5% of community members are known
        algorithms = pg.create_variations(pg.create_demo_filters(), pg.Normalize)

        supervised_algorithm = pg.AlgorithmSelection(algorithms.values(), measure=pg.AUC, tuning_backend="numpy")
        print(supervised_algorithm.cite())
        modularity_algorithm = pg.AlgorithmSelection(
            algorithms.values(),
            fraction_of_training=1,
            measure=pg.Modularity().as_supervised_method(),
            tuning_backend="numpy")

        supervised_aucs = list()
        modularity_aucs = list()
        for seeds, members in zip(train.values(), test.values()):
            measure = pg.AUC(members, exclude=seeds)
            supervised_aucs.append(measure(supervised_algorithm(graph, seeds)))
            modularity_aucs.append(measure(modularity_algorithm(graph, seeds)))

        assert abs(
            sum(supervised_aucs) / len(supervised_aucs)
            - sum(modularity_aucs) / len(modularity_aucs)) < 0.05
"facebook0", "facebook686", "log4j", "ant", "eucore", "citeseer", "dblp" ] seed_fractions = [0.3, 0.5] pre = pg.preprocessor(assume_immutability=True, normalization="symmetric") filters = { "ppr0.85": pg.PageRank(alpha=0.85, preprocessor=pre, max_iters=10000, tol=1.E-6), "ppr0.99": pg.PageRank(alpha=0.99, preprocessor=pre, max_iters=10000, tol=1.E-6), "hk3": pg.HeatKernel(t=3, preprocessor=pre, max_iters=10000, tol=1.E-6), "hk7": pg.HeatKernel(t=7, preprocessor=pre, max_iters=10000, tol=1.E-6), } filters = pg.create_variations(filters, {"": pg.Tautology, "+Sweep": pg.Sweep}) for name, filter in filters.items(): print("=====", name, "=====") algorithms = { "None": filter, "Mult": pg.AdHocFairness(filter, "B"), "LFPRO": pg.AdHocFairness(filter, "O"), #"FBuck-C": pg.FairPersonalizer(filter, .8, pRule_weight=10, max_residual=1, error_type=pg.Mabs, parameter_buckets=0), "FPers-C": pg.FairPersonalizer(filter, .8, pRule_weight=10,
import pygrank as pg

datasets = ["EUCore", "Amazon"]
pre = pg.preprocessor(assume_immutability=True, normalization="symmetric")
algs = {
    "ppr.85": pg.PageRank(.85, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "ppr.99": pg.PageRank(.99, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk3": pg.HeatKernel(3, preprocessor=pre, tol=1.E-9, max_iters=1000),
    "hk5": pg.HeatKernel(5, preprocessor=pre, tol=1.E-9, max_iters=1000),
}
algs = algs | pg.create_variations(algs, {"+Sweep": pg.Sweep})
loader = pg.load_datasets_one_community(datasets)
algs["tuned"] = pg.ParameterTuner(preprocessor=pre, tol=1.E-9, max_iters=1000)
algs["selected"] = pg.AlgorithmSelection(
    pg.create_demo_filters(preprocessor=pre, tol=1.E-9, max_iters=1000).values())
algs["tuned+Sweep"] = pg.ParameterTuner(
    ranker_generator=lambda params: pg.Sweep(
        pg.GenericGraphFilter(params, preprocessor=pre, tol=1.E-9, max_iters=1000)))

for alg in algs.values():
    print(alg.cite())  # prints a list of algorithm citations

pg.benchmark_print(
    pg.benchmark(algs, loader, pg.AUC, fraction_of_training=.5),
    delimiter=" & ", end_line="\\\\")
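# Hedged aside: the loader passed to pg.benchmark above is the same kind of generator that other
# listings here advance manually with next(...), yielding (dataset name, graph, seed community) tuples.
# A minimal sketch of iterating it directly, under that assumption:
import pygrank as pg

for name, graph, community in pg.load_datasets_one_community(["EUCore"]):
    print(name, len(graph), len(community))  # dataset name, node count, number of seed nodes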
import pygrank as pg

loader = list(pg.load_datasets_multiple_communities(["bigraph", "cora", "citeseer"]))
algorithms = pg.create_variations(pg.create_demo_filters(), pg.create_many_variation_types())
algorithms = pg.create_variations(algorithms, pg.Normalize)  # add normalization to all algorithms
print("Algorithms", len(algorithms))

measures = {
    "AUC": lambda ground_truth, exclude: pg.MultiSupervised(pg.AUC, ground_truth, exclude),
    "NDCG": lambda ground_truth, exclude: pg.MultiSupervised(pg.NDCG, ground_truth, exclude),
    "Density": lambda graph: pg.MultiUnsupervised(pg.Density, graph),
    "Modularity": lambda graph: pg.MultiUnsupervised(pg.Modularity, graph),
    "LinkCC": lambda graph: pg.ClusteringCoefficient(graph, similarity="dot"),
    "LinkAUCcos": lambda graph: pg.LinkAssessment(graph, similarity="cos"),
    "HopAUCdot": lambda graph: pg.LinkAssessment(graph, similarity="dot", hops=2),
}

scores = {measure: pg.benchmark_scores(pg.benchmark(algorithms, loader, measures[measure]))
          for measure in measures}

evaluations_vs_auc = dict()
evaluations_vs_ndcg = dict()
for measure in measures:
    evaluations_vs_auc[measure] = abs(pg.SpearmanCorrelation(scores["AUC"])(scores[measure]))
    evaluations_vs_ndcg[measure] = abs(pg.SpearmanCorrelation(scores["NDCG"])(scores[measure]))

pg.benchmark_print(
    [("Measure", "AUC corr", "NDCG corr")]
    + [(measure, evaluations_vs_auc[measure], evaluations_vs_ndcg[measure]) for measure in measures])
import pygrank as pg

_, graph, community = next(pg.load_datasets_one_community(["EUCore"]))
algorithm = pg.HeatKernel(
    t=5,  # the number of hops at which HeatKernel places maximal importance
    normalization="symmetric",
    renormalize=True)
personalization = {node: 1. for node in community}  # nodes not included are assumed to be zero

algorithms = {
    "HK5": algorithm,
    "HK5+Oversampling": pg.SeedOversampling(algorithm)
}
algorithms = algorithms | pg.create_variations(algorithms, {"+Sweep": pg.Sweep})
algorithms = pg.create_variations(algorithms, {"": pg.Normalize})

measure = pg.Conductance()
for algorithm_name, algorithm in algorithms.items():
    scores = algorithm(graph, personalization)  # returns a dict-like pg.GraphSignal
    print(algorithm_name, measure(scores))
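# Hedged follow-up to the loop above: since each result is a dict-like pg.GraphSignal (per the comment
# there), node scores can be read with plain indexing. The top-10 extraction below is an illustration
# under that assumption, not a prescribed API; it reuses the last computed `scores`.
top_nodes = sorted(graph, key=lambda node: scores[node], reverse=True)[:10]
print({node: scores[node] for node in top_nodes})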
import pygrank as pg

_, graph, communities = next(
    pg.load_datasets_multiple_communities(["EUCore"], max_group_number=3))
train, test = pg.split(communities, 0.05)  # 5% of community members are known
algorithms = pg.create_variations(pg.create_demo_filters(), pg.Normalize)

supervised_algorithm = pg.AlgorithmSelection(algorithms.values(), measure=pg.AUC)
print(supervised_algorithm.cite())
modularity_algorithm = pg.AlgorithmSelection(
    algorithms.values(),
    fraction_of_training=1,
    measure=pg.Modularity().as_supervised_method())

linkauc_algorithm = None
best_evaluation = 0
linkAUC = pg.LinkAssessment(
    graph, similarity="cos", hops=1)  # LinkAUC, because emails systemically exhibit homophily
for algorithm in algorithms.values():
    evaluation = linkAUC.evaluate({
        community: algorithm(graph, seeds) for community, seeds in train.items()})
    if evaluation > best_evaluation:
        best_evaluation = evaluation
        linkauc_algorithm = algorithm

supervised_aucs = list()
modularity_aucs = list()
linkauc_aucs = list()