Ejemplo n.º 1
0
def sample_trial(params):
    """Sweep the sample sparsifier's C parameter and plot the resulting deltas.

    For every C in the sweep, a fresh SBM graph is generated from ``params``
    (keys "cluster_size", "num_clusters", "p" and "q"), a SampleSparsifier is
    built from an eigenvalue of the graph's normalized Laplacian, and
    ``_run_test`` is executed. A C value whose trial raises is reported and
    left out of the plotted results.

    Returns None; the collected results are handed to ``_plot_trial_results``.
    """
    # NOTE(review): np.arange(0.25, .50, .25) contains the single value 0.25 --
    # confirm whether a wider sweep was intended.
    Cs = np.arange(0.25,.50,.25)

    filtered_Cs   = []
    hier_deltas   = []
    kmeans_deltas = []

    for C in Cs:
        try:
            cluster_sizes = [params["cluster_size"]] * params["num_clusters"]
            clusters = create_clusters(cluster_sizes)

            G = create_sbm(clusters, params["p"], params["q"], False)
            L = nx.normalized_laplacian_matrix(G).todense()
            # The normalized Laplacian of an undirected graph is symmetric, so
            # eigvalsh applies: it yields real eigenvalues in ascending order,
            # avoiding the complex-typed output np.linalg.eig may produce.
            sorted_w = np.linalg.eigvalsh(L)

            # Index len(clusters) selects the (k+1)-th smallest eigenvalue.
            sample_sparsifier = SampleSparsifier(sorted_w[len(clusters)], C=C)
            delta_hier, delta_kmeans = _run_test(G, clusters,
                sample_sparsifier, C, "sample")
        except Exception as err:
            # Best-effort sweep: drop this C but say why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print("Skipping C={}: {!r}".format(C, err))
            continue

        filtered_Cs.append(C)
        hier_deltas.append(delta_hier)
        kmeans_deltas.append(delta_kmeans)

    _plot_trial_results(filtered_Cs,hier_deltas,kmeans_deltas,"sample")
Ejemplo n.º 2
0
def single_contract_test(params):
    """Run one with-contraction clustering trial on a random SBM graph.

    The graph always has 5 clusters of 8 nodes each. ``params`` supplies
    "p" (in-cluster edge probability), "q" (out-of-cluster edge probability)
    and "percent_edges" (fraction of the graph's edges to contract).

    Prints both accuracies and returns them as
    (hierarchical accuracy, k-means accuracy).
    """
    # Fixed layout: five clusters, eight nodes apiece.
    clusters = create_clusters([8] * 5)

    G = create_sbm(clusters, params["p"], params["q"], False)
    edge_budget = int(len(G.edges) * params["percent_edges"])

    hier_partitions, kmeans_partitions = contract_deanonymize(
        G, k=len(clusters), to_contract=edge_budget)

    hier_accuracy = calc_accuracy(clusters, hier_partitions)
    kmeans_accuracy = calc_accuracy(clusters, kmeans_partitions)

    print("hierarchical accuracy: {}".format(hier_accuracy))
    print("k-means accuracy: {}".format(kmeans_accuracy))
    return hier_accuracy, kmeans_accuracy
Ejemplo n.º 3
0
def conduct_tests(ps, qs, css):
    """Measure clustering accuracy (hierarchical and k-means) across SBM settings.

    For every cluster-size list in ``css`` and every p in ``ps``, the q values
    in ``qs`` are tried in order until one exceeds p. Each (p, q) pair is
    evaluated over 5 freshly generated SBM graphs and the mean accuracy is
    recorded. One scatter plot per clustering method is saved under
    "output/accuracy/".

    Returns None
    """
    trials = 5

    for cs in css:
        clusters = create_clusters(cs)

        for p in ps:
            # Track the q values actually evaluated so the plot's x-axis always
            # matches the accuracy lists, whether or not the loop breaks early.
            tested_qs = []
            hier_accuracies, kmeans_accuracies = [], []
            for q in qs:
                # Stop at the first q above p; later q values are not tried.
                if q > p:
                    break

                hier_trials, kmeans_trials = [], []
                for _ in range(trials):
                    sbm = create_sbm(clusters, p, q)
                    hier_partitions, kmeans_partitions = deanonymize(
                        sbm, k=len(clusters))
                    hier_trials.append(
                        calc_accuracy(clusters, hier_partitions))
                    kmeans_trials.append(
                        calc_accuracy(clusters, kmeans_partitions))

                tested_qs.append(q)
                hier_accuracies.append(np.mean(hier_trials))
                kmeans_accuracies.append(np.mean(kmeans_trials))

            print("Completed accuracy for: p={}, cs={}".format(p, cs))
            for accuracies, label in zip([hier_accuracies, kmeans_accuracies],
                                         ["hierarchical", "kmeans"]):

                plt.figure()
                plt.scatter(tested_qs, accuracies)

                plt.title("{} vs. q (p={}_cs={})".format(label, p, cs))
                plt.xlabel("q")
                plt.ylabel("accuracy (%_correct)")

                plt.savefig("output/accuracy/p={}_cs={}_{}.png".format(
                    p, cs, label))
                plt.close()
Ejemplo n.º 4
0
def spectral_trial(params):
    """Sweep the spectral sparsifier's epsilon and plot the resulting deltas.

    For every epsilon in np.arange(0.25, 10.0, .125), a fresh SBM graph is
    generated from ``params`` (keys "cluster_size", "num_clusters", "p" and
    "q"), a SpectralSparsifier is built with that epsilon, and ``_run_test``
    is executed. An epsilon whose trial raises is reported and left out of the
    plotted results.

    Returns None; the collected results are handed to ``_plot_trial_results``.
    """
    epsilons = np.arange(0.25,10.0,.125)

    filtered_epsilons = []
    hier_deltas       = []
    kmeans_deltas     = []

    for epsilon in epsilons:
        try:
            cluster_sizes = [params["cluster_size"]] * params["num_clusters"]
            clusters = create_clusters(cluster_sizes)
            G = create_sbm(clusters, params["p"], params["q"], False)

            spectral_sparsifier = SpectralSparsifier(epsilon=epsilon)
            delta_hier, delta_kmeans = _run_test(G, clusters,
                spectral_sparsifier, epsilon, "spectral")
        except Exception as err:
            # Best-effort sweep: drop this epsilon but say why, and let
            # KeyboardInterrupt / SystemExit propagate.
            print("Skipping epsilon={}: {!r}".format(epsilon, err))
            continue

        filtered_epsilons.append(epsilon)
        hier_deltas.append(delta_hier)
        kmeans_deltas.append(delta_kmeans)

    _plot_trial_results(filtered_epsilons,hier_deltas,kmeans_deltas,"spectral")
Ejemplo n.º 5
0
def _cmd_graph(argv):
    """Parse the command-line arguments in argv into a parameters dictionary.

    Options follow the help menu in USAGE_STRING (printed by "-h", after which
    the process exits). Options that are not given keep the defaults defined
    below. When "run_test" is enabled, "cluster_sizes" is derived either from
    the comma-delimited "--cs" list or from cluster_size repeated num_clusters
    times, and "clusters" is built from it with create_clusters.

    Raises getopt.GetoptError for unrecognized options and ValueError when a
    numeric option cannot be converted.

    Returns parameters dictionary
    """
    params = {
        "byte_percent": .01,
        "cluster_size": 10,
        "pca": False,
        "guess_clusters": False,
        "run_metis": True,
        "run_spectral": True,
        "num_clusters": 2,
        "run_test": True,
        "weighted": False,
        "p": 0.75,
        "q": 0.25,
        "cs": None,
        "graph_coarsen": None,
        "lib": "matplotlib",
        "multi_run": 1
    }

    USAGE_STRING = """eigenvalues.py 
            -b <byte_percent>    [(float) percent of bytes in full data to be analyzed]
            -c <cluster_size>    [(int) size of each cluster (assumed to be same for all)]
            -d <display_bool>    [(y/n) for whether to show PCA projections]
            -g <guess_bool>      [(y/n) to guess the number of clusters vs. take it as known] 
            -m <run_metis>       [(y/n) to additionally enable METIS clustering]
            -n <num_cluster>     [(int) number of clusters (distinct people)]
            -p <p_value>         [(0,1) float for in-cluster probability]
            -q <q_value>         [(0,1) float for non-cluster probability] 
            -r <run_test_bool>   [(y/n) for whether to create SBM to run test or run on actual data]
            -s <run_spectral>    [(y/n) to enable spectral clustering]
            -w <weighted_graph>  [(y/n) for whether to have weights on edges (randomized)]
            
            --cs <cluster_sizes> [(int list) size of each cluster (comma delimited)]
            --gc <graph_coarsen> [(int) iterations of matchings found to be coarsened (default 0)]
            --lib                [('matplotlib','plotly') for plotting library]
            --mr                 [(int) indicates how many trials to be run in testing]"""

    # y/n switches: option -> params key (anything other than "y" means False).
    yes_no_options = {
        "-d": "pca",
        "-g": "guess_clusters",
        "-m": "run_metis",
        "-r": "run_test",
        "-s": "run_spectral",
        "-w": "weighted",
    }
    # Valued options: option -> (params key, converter applied to the raw text).
    valued_options = {
        "-b": ("byte_percent", float),
        "-c": ("cluster_size", int),
        "-n": ("num_clusters", int),
        "-p": ("p", float),
        "-q": ("q", float),
        "--cs": ("cs", str),
        "--gc": ("graph_coarsen", int),
        "--lib": ("lib", str),
        "--mr": ("multi_run", int),
    }

    opts, _ = getopt.getopt(argv, "hb:c:d:g:m:n:p:q:r:s:w:",
                            ['lib=', 'cs=', 'gc=', 'mr='])
    for opt, arg in opts:
        if opt == "-h":
            print(USAGE_STRING)
            sys.exit()
        elif opt in yes_no_options:
            params[yes_no_options[opt]] = (arg == "y")
        elif opt in valued_options:
            key, convert = valued_options[opt]
            params[key] = convert(arg)

    if params["run_test"]:
        if params["cs"] is not None:
            params["cluster_sizes"] = [
                int(cluster) for cluster in params["cs"].split(",")
            ]
        else:
            # Uniform clusters: take both values from the parsed parameters.
            params["cluster_sizes"] = (
                [params["cluster_size"]] * params["num_clusters"])
        params["clusters"] = create_clusters(params["cluster_sizes"])
    return params