def test_hoptuner_autoregression():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.01)
    auc1 = pg.AUC(evaluation, exclude=training)(pg.HopTuner(measure=pg.AUC).rank(training))
    auc3 = pg.AUC(evaluation, exclude=training)(pg.HopTuner(measure=pg.AUC, autoregression=5).rank(training))
    assert auc3 > auc1 * 0.9
def test_sweep_streaming():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.1)
        auc1 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            (pg.PageRank() >> pg.Sweep()).rank(graph, {v: 1 for v in training}))
        auc2 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.PageRank().rank(graph, {v: 1 for v in training}))
        auc3 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.PageRank() >> pg.Transformer(pg.log) >> pg.LinearSweep()
            | pg.to_signal(graph, {v: 1 for v in training}))
        assert auc1 > auc2
        assert abs(auc1 - auc3) < pg.epsilon()
    with pytest.raises(Exception):
        pg.Sweep() << "a"
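# The test above leans on pygrank's operator syntax: `a >> b` composes an algorithm
# with a postprocessor, and `algorithm | signal` runs the composed pipeline on a
# graph signal. Below is a minimal standalone sketch of the same idiom; the function
# name is hypothetical (not collected by pytest) and the dataset choice is an
# illustrative assumption, not part of the test suite.
def demo_operator_chaining():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    pipeline = pg.PageRank() >> pg.Sweep() >> pg.Normalize()  # filter composed with postprocessors
    scores = pipeline | pg.to_signal(graph, {v: 1 for v in group})  # run the pipeline on a seed signal
    return scores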
def test_hoptuner_explicit_algorithm():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    auc1 = pg.AUC(evaluation, exclude=training)(
        pg.HopTuner(lambda params: pg.GenericGraphFilter(params, krylov_dims=10),
                    basis="arnoldi", measure=pg.AUC).rank(training))
    auc2 = pg.AUC(evaluation, exclude=training)(
        pg.HopTuner(basis="arnoldi", krylov_dims=10, measure=pg.AUC).rank(training))
    assert abs(auc1 - auc2) < 0.005
def test_lowpass_tuning():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.1)
    auc1 = pg.AUC(evaluation, exclude=training)(
        pg.ParameterTuner(lambda params: pg.GenericGraphFilter(params)).rank(training))
    auc2 = pg.AUC(evaluation, exclude=training)(
        pg.ParameterTuner(lambda params: pg.LowPassRecursiveGraphFilter(params)).rank(training))
    assert auc2 > auc1 * 0.8
def test_autotune_methods():
    import numpy as np
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}))
    aucs = [pg.AUC(evaluation, exclude=training)(ranker.rank(training))
            for ranker in pg.create_demo_filters().values()]
    auc2 = pg.AUC(evaluation, exclude=training)(pg.AlgorithmSelection().rank(training))
    assert max(aucs) - np.std(aucs) <= auc2
def test_autotune_manual():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    auc1 = pg.AUC(evaluation, exclude=training)(pg.PageRank().rank(training))
    alg2 = pg.ParameterTuner(lambda params: pg.PageRank(params[0]),
                             max_vals=[0.99], min_vals=[0.5]).tune(training)
    auc2 = pg.AUC(evaluation, exclude=training)(alg2.rank(training))
    assert auc1 <= auc2
def test_autotune():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    auc1 = pg.AUC(evaluation, exclude=training)(pg.PageRank().rank(training))
    auc2 = pg.AUC(evaluation, exclude=training)(pg.HeatKernel().rank(training))
    auc3 = pg.AUC(evaluation, exclude=training)(pg.ParameterTuner(optimization_dict=dict()).rank(training))
    assert min(auc1, auc2) <= auc3 and max(auc1, auc2) * 0.9 <= auc3
def test_hoptuner_arnoldi_backends():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    auc1 = pg.AUC(evaluation, exclude=training)(
        pg.HopTuner(basis="arnoldi", measure=pg.AUC).rank(training))
    auc2 = pg.AUC(evaluation, exclude=training)(
        pg.HopTuner(basis="arnoldi", measure=pg.AUC, tuning_backend="pytorch").rank(training))
    auc3 = pg.AUC(evaluation, exclude=training)(
        pg.HopTuner(basis="arnoldi", measure=pg.AUC, tuning_backend="tensorflow").rank(training))
    assert auc1 == auc2
    assert auc1 == auc3
def test_chebyshev():
    # do not test with tensorflow, as it can be too slow
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    training, evaluation = pg.split(pg.to_signal(graph, {v: 1 for v in group}))
    tuned_auc = pg.AUC(evaluation, training).evaluate(pg.ParameterTuner().rank(graph, training))
    tuned_chebyshev_auc = pg.AUC(evaluation, training).evaluate(
        pg.ParameterTuner(coefficient_type="chebyshev").rank(graph, training))
    assert (tuned_auc - tuned_chebyshev_auc) < 0.1
def test_autotune_backends():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    for tuner in [pg.HopTuner, pg.AlgorithmSelection, pg.ParameterTuner]:
        auc3 = pg.AUC(evaluation, exclude=training)(
            tuner(measure=pg.KLDivergence, tuning_backend="pytorch").rank(training))
        auc2 = pg.AUC(evaluation, exclude=training)(
            tuner(measure=pg.KLDivergence, tuning_backend="tensorflow").rank(training))
        auc1 = pg.AUC(evaluation, exclude=training)(
            tuner(measure=pg.KLDivergence).rank(training))
        # TODO: maybe fix KLDivergence implementation to not be affected by backend.epsilon()
        assert abs(auc1 - auc2) < 0.005  # different results due to different backend.epsilon()
        assert abs(auc1 - auc3) < 0.005
def test_hoptuner_arnoldi():
    _, G, groups = next(pg.load_datasets_multiple_communities(["bigraph"]))
    group = groups[0]
    training, evaluation = pg.split(pg.to_signal(G, {v: 1 for v in group}), training_samples=0.5)
    auc1 = pg.AUC(evaluation, exclude=training)(pg.HopTuner(measure=pg.AUC).rank(training))
    auc2 = pg.AUC(evaluation, exclude=training)(pg.HopTuner(basis="arnoldi", measure=pg.AUC).rank(training))
    assert abs(auc1 - auc2) < 0.005
def test_auc_ndcg_compliance():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    training, test = pg.split(group, 0.5)
    for _ in supported_backends():
        scores1 = pg.PageRank()(graph, training)
        scores2 = pg.HeatKernel()(graph, training)
        AUC1 = pg.AUC(test, exclude=training)(scores1)
        AUC2 = pg.AUC(test, exclude=training)(scores2)
        NDCG1 = float(pg.NDCG(test, exclude=training)(scores1))
        NDCG2 = float(pg.NDCG(test, exclude=training)(scores2))
        assert (AUC1 < AUC2) == (NDCG1 < NDCG2)
        with pytest.raises(Exception):
            pg.AUC(test, exclude=test, k=len(graph) + 1)(scores2)
        with pytest.raises(Exception):
            pg.NDCG(test, exclude=training, k=len(graph) + 1)(scores2)
def test_sweep():
    _, graph, group = next(pg.load_datasets_one_community(["bigraph"]))
    for _ in supported_backends():
        training, evaluation = pg.split(list(group), training_samples=0.1)
        auc1 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.Sweep(pg.PageRank()).rank(graph, {v: 1 for v in training}))
        auc2 = pg.AUC({v: 1 for v in evaluation}, exclude=training).evaluate(
            pg.PageRank().rank(graph, {v: 1 for v in training}))
        assert auc1 > auc2
def evaluate(graph, algorithm):
    tprs = list()
    ppvs = list()
    f1s = list()
    aucs = list()
    for node in list(graph):
        neighbors = list(graph.neighbors(node))
        if len(neighbors) < 10:
            continue
        training = pg.to_signal(graph, {node: 1})
        test = pg.to_signal(graph, {neighbor: 1 for neighbor in neighbors})
        for neighbor in random.sample(neighbors, 1):  # hide one random edge to predict it later
            assert graph.has_edge(node, neighbor)
            graph.remove_edge(node, neighbor)
            assert not graph.has_edge(node, neighbor)
            assert not graph.has_edge(neighbor, node)
        result = (training >> algorithm) * (1 - training)
        aucs.append(pg.AUC(test, exclude=training)(result))
        top = result >> pg.Top(10) >> pg.Threshold()
        prec = pg.PPV(test, exclude=training)(top)
        rec = pg.TPR(test, exclude=training)(top)
        ppvs.append(prec)
        tprs.append(rec)
        f1s.append(pg.safe_div(2 * prec * rec, prec + rec))
        # restore hidden edges; iterate the saved neighbor list, since
        # graph.neighbors(node) no longer yields the removed neighbors
        for neighbor in neighbors:
            if not graph.has_edge(node, neighbor):
                graph.add_edge(node, neighbor)
    print(f"\r{algorithm.cite()}\t AUC {sum(aucs) / len(aucs):.3f}\t"
          f" f1 {sum(f1s) / len(f1s):.3f}\t prec {sum(ppvs) / len(ppvs):.3f}\t"
          f" rec {sum(tprs) / len(tprs):.3f}\t", end="")
    print()
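# A minimal sketch of how the evaluate() helper above could be driven. The function
# name is hypothetical (not collected by pytest), and the dataset and algorithm
# choices are illustrative assumptions rather than part of the test suite.
def demo_link_prediction_evaluation():
    _, graph, _ = next(pg.load_datasets_one_community(["bigraph"]))
    for algorithm in [pg.PageRank(), pg.HeatKernel()]:
        evaluate(graph, algorithm)  # prints per-algorithm AUC/F1/precision/recall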
def test_algorithm_selection():
    for _ in supported_backends():
        _, graph, communities = next(
            pg.load_datasets_multiple_communities(["bigraph"], max_group_number=3))
        train, test = pg.split(communities, 0.05)  # 5% of community members are known
        algorithms = pg.create_variations(pg.create_demo_filters(), pg.Normalize)
        supervised_algorithm = pg.AlgorithmSelection(
            algorithms.values(), measure=pg.AUC, tuning_backend="numpy")
        print(supervised_algorithm.cite())
        modularity_algorithm = pg.AlgorithmSelection(
            algorithms.values(), fraction_of_training=1,
            measure=pg.Modularity().as_supervised_method(), tuning_backend="numpy")
        supervised_aucs = list()
        modularity_aucs = list()
        for seeds, members in zip(train.values(), test.values()):
            measure = pg.AUC(members, exclude=seeds)
            supervised_aucs.append(measure(supervised_algorithm(graph, seeds)))
            modularity_aucs.append(measure(modularity_algorithm(graph, seeds)))
        assert abs(sum(supervised_aucs) / len(supervised_aucs)
                   - sum(modularity_aucs) / len(modularity_aucs)) < 0.05
def test_best_direction():
    assert pg.Conductance().best_direction() == -1
    assert pg.Density().best_direction() == 1
    assert pg.Modularity().best_direction() == 1
    assert pg.AUC([1, 2, 3]).best_direction() == 1
    assert pg.Cos([1, 2, 3]).best_direction() == 1
    assert pg.Dot([1, 2, 3]).best_direction() == 1
    assert pg.TPR([1, 2, 3]).best_direction() == 1
    assert pg.TNR([1, 2, 3]).best_direction() == 1
def test_aggregated():
    y1 = [1, 1, 0]
    y2 = [1, 0, 0]
    y3 = [1, 1, 0]
    for _ in supported_backends():
        # TODO: investigate why not exactly the same always (numerical precision should be lower for numpy)
        epsilon = 1.E-6
        assert abs(float(pg.GM().add(pg.AUC(y1), max_val=0.5)
                         .add(pg.AUC(y2), min_val=0.9).evaluate(y3)) - 0.45 ** 0.5) < epsilon
        assert abs(float(pg.AM().add(pg.AUC(y1), max_val=0.5)
                         .add(pg.AUC(y2), min_val=0.9).evaluate(y3)) - 0.7) < epsilon
        assert abs(float(pg.Disparity().add(pg.AUC(y1), max_val=0.5)
                         .add(pg.AUC(y2), min_val=0.9).evaluate(y3)) - 0.4) < epsilon
        assert abs(float(pg.Disparity().add(pg.AUC(y1), max_val=0.5)
                         .add(pg.AUC(y2), min_val=0.9).evaluate(y3))
                   + float(pg.Parity().add(pg.AUC(y1), max_val=0.5)
                           .add(pg.AUC(y2), min_val=0.9).evaluate(y3) - 1)) < epsilon
def test_edge_cases():
    import networkx as nx
    assert pg.pRule([0])([0]) == 0
    assert pg.Cos([0])([0]) == 0
    with pytest.raises(Exception):
        pg.Measure()([0, 1, 0])
    with pytest.raises(Exception):
        pg.AUC([0, 0, 0])([0, 1, 0])
    with pytest.raises(Exception):
        pg.AUC([1, 1, 1])([0, 1, 0])
    with pytest.raises(Exception):
        pg.KLDivergence([0], exclude={"A": 1})([1])
    with pytest.raises(Exception):
        pg.Conductance(next(pg.load_datasets_graph(["graph5"])), max_rank=0.5)([1, 1, 1, 1, 1])
    for _ in supported_backends():
        assert pg.Conductance(nx.Graph())([]) == float("inf")  # this is indeed correct in python
        assert pg.Density(nx.Graph())([]) == 0
        assert pg.Modularity(nx.Graph())([]) == 0
        assert pg.KLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.MKLDivergence([0, 1, 0])([0, 1, 0]) == 0
        assert pg.KLDivergence([0])([-1]) == 0
# (continues an example that loads `graph`, splits communities into `train`/`test`,
# builds an `algorithms` dict, and defines `supervised_algorithm` via pg.AlgorithmSelection)
modularity_algorithm = pg.AlgorithmSelection(
    algorithms.values(), fraction_of_training=1,
    measure=pg.Modularity().as_supervised_method())
linkauc_algorithm = None
best_evaluation = 0
linkAUC = pg.LinkAssessment(graph, similarity="cos", hops=1)  # LinkAUC, because emails systematically exhibit homophily
for algorithm in algorithms.values():
    evaluation = linkAUC.evaluate({community: algorithm(graph, seeds)
                                   for community, seeds in train.items()})
    if evaluation > best_evaluation:
        best_evaluation = evaluation
        linkauc_algorithm = algorithm
supervised_aucs = list()
modularity_aucs = list()
linkauc_aucs = list()
for seeds, members in zip(train.values(), test.values()):
    measure = pg.AUC(members, exclude=seeds)
    supervised_aucs.append(measure(supervised_algorithm(graph, seeds)))
    modularity_aucs.append(measure(modularity_algorithm(graph, seeds)))
    linkauc_aucs.append(measure(linkauc_algorithm(graph, seeds)))
print("Supervised", sum(supervised_aucs) / len(supervised_aucs))
print("Modularity", sum(modularity_aucs) / len(modularity_aucs))
print("LinkAUC", sum(linkauc_aucs) / len(linkauc_aucs))