def sample_trial(params):
    """Given graph parameters with cluster_size, num_clusters, p (probability
    in-cluster), and q (probability out-of-cluster), runs a sample-sparsifier
    clustering trial for each value of the constant C and plots the resulting
    hierarchical and k-means accuracy deltas

    Returns void
    """
    Cs = np.arange(0.25, 0.50, 0.25)
    filtered_Cs = []
    hier_deltas = []
    kmeans_deltas = []

    for C in Cs:
        try:
            cluster_sizes = [params["cluster_size"]] * params["num_clusters"]
            clusters = create_clusters(cluster_sizes)
            G = create_sbm(clusters, params["p"], params["q"], False)

            # threshold the sample sparsifier at the (k+1)-th smallest
            # eigenvalue of the normalized Laplacian, where k = len(clusters)
            L = nx.normalized_laplacian_matrix(G).todense()
            w, _ = np.linalg.eig(L)
            sorted_w = sorted(w)
            sample_sparsifier = SampleSparsifier(sorted_w[len(clusters)], C=C)

            delta_hier, delta_kmeans = _run_test(
                G, clusters, sample_sparsifier, C, "sample")

            filtered_Cs.append(C)
            hier_deltas.append(delta_hier)
            kmeans_deltas.append(delta_kmeans)
        except Exception:
            # skip C values where graph generation or sparsification fails
            continue

    _plot_trial_results(filtered_Cs, hier_deltas, kmeans_deltas, "sample")
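# Hedged usage sketch (added for illustration, not part of the original
# module): shows the dict shape sample_trial reads. The concrete values are
# assumptions chosen only as an example, not defaults from the codebase.
def _example_sample_trial():
    example_params = {
        "cluster_size": 10,  # assumed nodes per cluster
        "num_clusters": 4,   # assumed number of clusters
        "p": 0.75,           # in-cluster edge probability
        "q": 0.25,           # out-of-cluster edge probability
    }
    sample_trial(example_params)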
def single_contract_test(params):
    """Given graph parameters with p (probability in-cluster), q (probability
    out-of-cluster), and percent_edges (percent of total edges to be
    contracted), runs a with-contraction clustering trial on a randomly
    generated SBM (stochastic block model) graph

    Returns (1) hierarchical accuracy (float); (2) k-means accuracy (float)
    """
    cluster_size = 8
    num_clusters = 5

    cluster_sizes = [cluster_size] * num_clusters
    clusters = create_clusters(cluster_sizes)
    G = create_sbm(clusters, params["p"], params["q"], False)

    to_contract = int(len(G.edges) * params["percent_edges"])
    hier_partitions, kmeans_partitions = contract_deanonymize(
        G, k=num_clusters, to_contract=to_contract)

    hier_accuracy = calc_accuracy(clusters, hier_partitions)
    kmeans_accuracy = calc_accuracy(clusters, kmeans_partitions)

    print("hierarchical accuracy: {}".format(hier_accuracy))
    print("k-means accuracy: {}".format(kmeans_accuracy))
    return hier_accuracy, kmeans_accuracy
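# Hedged usage sketch (illustrative only; the values below are assumptions):
# single_contract_test builds its own SBM of five clusters of eight nodes
# internally, so the caller only supplies the probabilities and the fraction
# of edges to contract.
def _example_single_contract_test():
    hier_acc, kmeans_acc = single_contract_test({
        "p": 0.75,              # in-cluster edge probability
        "q": 0.25,              # out-of-cluster edge probability
        "percent_edges": 0.10,  # contract ~10% of the generated graph's edges
    })
    return hier_acc, kmeans_acc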
def conduct_tests(ps, qs, css):
    """Given lists of p probabilities, q probabilities, and lists of cluster
    sizes, runs clustering-accuracy tests (both hierarchical and k-means) and
    saves an accuracy-vs-q scatter plot for each (p, cluster sizes) pair

    Returns void
    """
    trials = 5

    for cs in css:
        clusters = create_clusters(cs)

        for p in ps:
            hier_accuracies, kmeans_accuracies = [], []
            for q in qs:
                # only consider assortative settings where q <= p
                if q > p:
                    break

                hier_trials, kmeans_trials = [], []
                for _ in range(trials):
                    sbm = create_sbm(clusters, p, q)
                    hier_partitions, kmeans_partitions = deanonymize(
                        sbm, k=len(clusters))

                    hier_accuracy = calc_accuracy(clusters, hier_partitions)
                    kmeans_accuracy = calc_accuracy(clusters, kmeans_partitions)

                    hier_trials.append(hier_accuracy)
                    kmeans_trials.append(kmeans_accuracy)

                hier_accuracies.append(np.mean(hier_trials))
                kmeans_accuracies.append(np.mean(kmeans_trials))

            print("Completed accuracy for: p={}, cs={}".format(p, cs))

            for accuracies, label in zip(
                    [hier_accuracies, kmeans_accuracies],
                    ["hierarchical", "kmeans"]):
                plt.figure()
                # only plot the q values that were actually evaluated
                plt.scatter(qs[:len(accuracies)], accuracies)
                plt.title("{} vs. q (p={}_cs={})".format(label, p, cs))
                plt.xlabel("q")
                plt.ylabel("accuracy (%_correct)")
                plt.savefig("output/accuracy/p={}_cs={}_{}.png".format(
                    p, cs, label))
                plt.close()
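# Hedged usage sketch (illustrative only; the sweep values are assumptions):
# conduct_tests averages five trials per (p, q, cluster-sizes) combination and
# writes scatter plots under output/accuracy/, so that directory is assumed
# to exist before running.
def _example_conduct_tests():
    ps = [0.7, 0.8]                         # in-cluster probabilities to sweep
    qs = [0.1, 0.2, 0.3]                    # out-of-cluster probabilities to sweep
    css = [[10, 10, 10], [8, 8, 8, 8, 8]]   # candidate cluster-size lists
    conduct_tests(ps, qs, css)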
def spectral_trial(params):
    """Given graph parameters with cluster_size, num_clusters, p (probability
    in-cluster), and q (probability out-of-cluster), runs a spectral-sparsifier
    clustering trial for each value of epsilon and plots the resulting
    hierarchical and k-means accuracy deltas

    Returns void
    """
    epsilons = np.arange(0.25, 10.0, 0.125)
    filtered_epsilons = []
    hier_deltas = []
    kmeans_deltas = []

    for epsilon in epsilons:
        try:
            cluster_sizes = [params["cluster_size"]] * params["num_clusters"]
            clusters = create_clusters(cluster_sizes)
            G = create_sbm(clusters, params["p"], params["q"], False)

            spectral_sparsifier = SpectralSparsifier(epsilon=epsilon)
            delta_hier, delta_kmeans = _run_test(
                G, clusters, spectral_sparsifier, epsilon, "spectral")

            filtered_epsilons.append(epsilon)
            hier_deltas.append(delta_hier)
            kmeans_deltas.append(delta_kmeans)
        except Exception:
            # skip epsilon values where graph generation or sparsification fails
            continue

    _plot_trial_results(filtered_epsilons, hier_deltas, kmeans_deltas, "spectral")
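# Hedged usage note (illustrative only): spectral_trial reads the same four
# keys as sample_trial above (cluster_size, num_clusters, p, q); it differs in
# sweeping the spectral sparsifier's epsilon rather than the constant C. The
# values below are assumed example values.
def _example_spectral_trial():
    spectral_trial({"cluster_size": 10, "num_clusters": 4,
                    "p": 0.75, "q": 0.25})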
def _cmd_graph(argv):
    """Parses arguments as specified by argv and returns them as a dictionary.
    Entries are parsed as specified in the help menu (visible by running
    "python3 app.py -h")

    Returns parameters dictionary
    """
    params = {
        "byte_percent":   .01,
        "cluster_size":   10,
        "pca":            False,
        "guess_clusters": False,
        "run_metis":      True,
        "run_spectral":   True,
        "num_clusters":   2,
        "run_test":       True,
        "weighted":       False,
        "p":              0.75,
        "q":              0.25,
        "cs":             None,
        "graph_coarsen":  None,
        "lib":            "matplotlib",
        "multi_run":      1,
    }

    USAGE_STRING = """eigenvalues.py
    -b <byte_percent>    [(float) percent of bytes in full data to be analyzed]
    -c <cluster_size>    [(int) size of each cluster (assumed to be same for all)]
    -d <display_bool>    [(y/n) for whether to show PCA projections]
    -g <guess_bool>      [(y/n) to guess the number of clusters vs. take it as known]
    -m <run_metis>       [(y/n) to additionally enable METIS clustering]
    -n <num_cluster>     [(int) number of clusters (distinct people)]
    -p <p_value>         [(0,1) float for in-cluster probability]
    -q <q_value>         [(0,1) float for non-cluster probability]
    -r <run_test_bool>   [(y/n) for whether to create SBM to run test or run on actual data]
    -s <run_spectral>    [(y/n) to enable spectral clustering]
    -w <weighted_graph>  [(y/n) for whether to have weights on edges (randomized)]

    --cs <cluster_sizes> [(int list) size of each cluster (comma delimited)]
    --gc <graph_coarsen> [(int) iterations of matchings found to be coarsened (default 0)]
    --lib                [('matplotlib','plotly') for plotting library]
    --mr                 [(int) indicates how many trials to be run in testing]"""

    opts, args = getopt.getopt(
        argv, "hb:c:d:g:m:n:p:q:r:s:w:", ['lib=', 'cs=', 'gc=', 'mr='])

    for opt, arg in opts:
        if opt == "-h":
            print(USAGE_STRING)
            sys.exit()
        elif opt == "-b":
            params["byte_percent"] = float(arg)
        elif opt == "-c":
            params["cluster_size"] = int(arg)
        elif opt == "-d":
            params["pca"] = (arg == "y")
        elif opt == "-g":
            params["guess_clusters"] = (arg == "y")
        elif opt == "-m":
            params["run_metis"] = (arg == "y")
        elif opt == "-n":
            params["num_clusters"] = int(arg)
        elif opt == "-r":
            params["run_test"] = (arg == "y")
        elif opt == "-s":
            params["run_spectral"] = (arg == "y")
        elif opt == "-w":
            params["weighted"] = (arg == "y")
        elif opt == "-p":
            params["p"] = float(arg)
        elif opt == "-q":
            params["q"] = float(arg)
        elif opt == "--cs":
            params["cs"] = arg
        elif opt == "--gc":
            params["graph_coarsen"] = int(arg)
        elif opt == "--lib":
            params["lib"] = arg
        elif opt == "--mr":
            params["multi_run"] = int(arg)

    if params["run_test"]:
        if params["cs"] is not None:
            params["cluster_sizes"] = [
                int(cluster) for cluster in params["cs"].split(",")]
        else:
            # fall back to uniform clusters built from the parsed size/count
            params["cluster_sizes"] = (
                [params["cluster_size"]] * params["num_clusters"])
        params["clusters"] = create_clusters(params["cluster_sizes"])

    return params
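# Hedged usage sketch (illustrative only): _cmd_graph is typically fed
# sys.argv[1:]. The flag values below are assumptions, but every flag shown is
# documented in USAGE_STRING above, e.g. (entry point named as in the
# docstring):
#
#     python3 app.py -n 4 -c 10 -p 0.75 -q 0.25 -r y --mr 3
#
def _example_cmd_graph():
    # sys and getopt are assumed to be imported at module level, as the
    # parser above already relies on them.
    return _cmd_graph(sys.argv[1:])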