def test_sample_network_simulation(self):
    #without sparsity and without noise, sampling should completely recover the network
    sparsity = 1
    noise_prob = 0
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob)
    assert (self.sim_full_network != sim_partial_network).nnz == 0
def test_sample_network_noise(self):
    #with complete noise, sampling should recover the exact opposite of the matrix
    #(a probabilistic test could also cover partial noise, but this suffices)
    sparsity = 1
    noise_prob = 1
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob)
    #every entry should disagree with the full network
    assert (self.sim_full_network == sim_partial_network).nnz == 0
def test_sample_network_sparse_quantity(self):
    #with sparsity, sampling should in expectation recover a large share of the network
    sparsity = 0.5
    noise_prob = 0
    sim_partial_network = sim.sample_network(self.cluster_sizes, sparsity, noise_prob,
                                             symmetric=False)
    expected_num_edges = self.network_size**2 * sparsity

    #Probabilistic test: MAY FAIL EVEN WITH CORRECT BEHAVIOR (but with low probability).
    #By a Chernoff bound, the edge count deviates from its expectation
    #by more than t*(network_size/2) with probability at most 2*e^(-t^2/2);
    #with t = 4 the test passes with probability 1 - 2*e^-8
    actual_num_edges = sim_partial_network.nnz
    print expected_num_edges, actual_num_edges
    assert abs(actual_num_edges - expected_num_edges) <= 20
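#Hedged aside (not part of the original tests): numeric check of the Chernoff-style
#bound quoted above.  With t = 4 the chance that a correct sample_network fails the
#test is at most
#   2 * exp(-t**2 / 2) = 2 * exp(-8) ~= 6.7e-4,
#i.e. roughly one spurious failure per 1500 runs.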
def clustering_pipeline(network_params, clustering_params):
    #Create network
    cluster_sizes, sparsity, noise_prob = network_params
    num_clusters = len(cluster_sizes)
    network = sim.sample_network(cluster_sizes, sparsity, noise_prob)
    rows, cols = network.nonzero()

    #Assign ground truth labels (the first "cluster_size" nodes are in cluster 0,
    #the next are in cluster 1, etc.)
    cluster_labels = list()
    for cluster_index in range(len(cluster_sizes)):
        cluster_labels += [cluster_index] * cluster_sizes[cluster_index]
    cluster_labels = np.asarray(cluster_labels)

    #Perform clustering
    cluster_sizes, method, completion_alg, completion_params, mode = clustering_params
    cluster_predictions = cluster_signed_network(network, cluster_sizes, method, \
                                                 completion_alg, completion_params, mode)
    cluster_accuracy = evaluate_cluster_accuracy(cluster_predictions, cluster_labels, \
                                                 rows, cols)
    return cluster_accuracy
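#Hedged usage sketch (not from the original code): one way clustering_pipeline might be
#called.  The placeholder values in angle brackets are for illustration only and are not
#confirmed options of cluster_signed_network.
#
#   network_params = ([100, 100], 0.5, 0)  #cluster sizes, sparsity, noise probability
#   clustering_params = (<cluster_sizes>, <method>, <completion_alg>, <completion_params>, <mode>)
#   accuracy = clustering_pipeline(network_params, clustering_params)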
def run_experiment():
    simulated = False
    real = True
    use_moi = True
    use_hoc = True
    use_svp = True
    use_sgd_sh = False
    use_sgd_sig = False
    use_als = True

    adj_matrix = None
    if simulated:
        cluster_sizes = [100, 200, 300, 400]
        sparsity_level = 0.01175
        noise_prob = 0
        print "creating adjacency matrix..."
        adj_matrix = sim.sample_network(cluster_sizes, sparsity_level, noise_prob)
    elif real:
        data_file_name = "data/Preprocessed Data/small_network.npy"
        #data_file_name = "data/Preprocessed Data/wiki_elections_csr.npy"
        try:
            adj_matrix = np.load(data_file_name).item()
        except Exception as e:
            raise ValueError("could not load adj matrix from file: ", e)

    if use_moi:
        print "performing MOI..."
        max_cycle_order_moi = 10
        discount = [0.5**i for i in range(3, max_cycle_order_moi + 1)]
        #max_cycle_order_moi = np.inf
        #discount = 0.0001
        num_folds = 5
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            moi.kfoldcv_moi(adj_matrix, discount, max_cycle_order_moi, num_folds)
        print "MOI results: "
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print

    if use_hoc:
        print "performing HOC..."
        max_cycle_order_hoc = 5
        num_folds = 10
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            hoc.hoc_learning_pipeline(adj_matrix, max_cycle_order_hoc)
        print "HOC results:"
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print

    alg = ""
    alg_params = None

    #settings if using SGD
    if use_sgd_sh or use_sgd_sig:
        #Parameters used for this experiment
        #https://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/Regular-Paterek.pdf
        learning_rate = 1000 #0.05 for square hinge
        tol = adj_matrix.nnz / 10
        max_iter = 20
        reg_param = 10 #0.5 for square hinge
        dim = 100
        num_folds_mf = 10

    #Bundle up these parameters and use this algorithm
    if use_sgd_sh:
        loss_type = "squarehinge" #"sigmoid"
        alg_params = (learning_rate, loss_type, tol, max_iter, reg_param, dim)
        alg = "sgd"
        print "performing SGD with square-hinge loss..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "SGD_SH results:"
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print

    if use_sgd_sig:
        loss_type = "sigmoid"
        alg_params = (learning_rate, loss_type, tol, max_iter, reg_param, dim)
        alg = "sgd"
        print "performing SGD with sigmoid loss..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "SGD_SIG results:"
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print

    #settings if using als
    if use_als:
        #Parameters used for this experiment
        max_iter = 2
        dim = 40

        #Bundle up these parameters and use this algorithm
        alg_params = (max_iter, dim)
        alg = "als"
        num_folds_mf = 10
        print "performing ALS..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "ALS results:"
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print

    #settings if using SVP
    if use_svp:
        #Parameters used for this experiment
        rank = 40
        tol = 100
        max_iter = 5
        step_size = 1

        #Bundle up these parameters and use this algorithm
        alg_params = (rank, tol, max_iter, step_size)
        alg = "svp"
        num_folds_mf = 10
        print "performing SVP..."
        avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time = \
            mf.kfold_CV_pipeline(adj_matrix, alg, alg_params, num_folds_mf)
        print "SVP results:"
        print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
        print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
        print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
        print
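#Hedged refactoring sketch (not in the original): every method above repeats the same
#result-printing block; a small helper like this could replace those repetitions.
def print_results(name, avg_acc, stderr_acc, avg_fpr, stderr_fpr, avg_time, stderr_time):
    print "%s results:" % name
    print("Accuracy: average %f with standard error %f" % (avg_acc, stderr_acc))
    print("False positive rate: average %f with standard error %f" % (avg_fpr, stderr_fpr))
    print("Model running time: average %f with standard error %f" % (avg_time, stderr_time))
    print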
'''
cluster_sizes = [2, 3, 4]
sparsity_level = 0.5
noise_prob = 0
adj_matrix = sim.sample_network(cluster_sizes, sparsity_level, noise_prob)
#print adj_matrix.A
signed_laplacian(adj_matrix).A
'''

adj_matrix = None
if simulated:
    cluster_sizes = [500, 500] #[100, 200, 300, 400]
    sparsity_level = 0.5 #0.01175
    noise_prob = 0
    print "creating adjacency matrix..."
    adj_matrix = sim.sample_network(cluster_sizes, sparsity_level, noise_prob)
elif real:
    #data_file_name = "Preprocessed Data/small_network.npy"
    data_file_name = "../data/Preprocessed Data/wiki_elections_csr.npy"
    try:
        adj_matrix = np.load(data_file_name).item()
    except Exception as e:
        raise ValueError("could not load adj matrix from file: ", e)

avg_acc, avg_fpr = kfoldcv(adj_matrix, num_folds=20)
print("Accuracy %f and false positive rate %f" % (avg_acc, avg_fpr))