Code Example #1
File: model_builder.py Project: zurk/lightcluster
def compute_bigclam(n_vertex, edge_list, n_clusters):

    f = open("data\\for_bigclam.txt", "w")
    for i in xrange(len(edge_list)):
        f.write(str(edge_list[i][0] + 1) + "\t" + str(edge_list[i][1] + 1) + "\n")
        f.write(str(edge_list[i][1] + 1) + "\t" + str(edge_list[i][0] + 1) + "\n")
    f.close()

    import subprocess

    cmd = (
        "lib\\bigclam.exe -c:" + str(n_clusters)
        + " -i:data\\for_bigclam.txt -o:data\\bigclam"
        + " -mc:" + str(n_clusters)
        + " -xc:" + str(n_clusters)
        + " > functions\\log.txt"
    )
    t = time.time()
    p = subprocess.Popen(cmd, shell=True)
    p.wait()  # wait for bigclam to finish before stopping the clock
    exectime = time.time() - t

    clusters = []
    labels = None

    f = open("data\\bigclamcmtyvv.txt", "r")
    s = f.readline()
    while s:
        cnt = s.count("\t")
        cluster_str = ()  # set of strings
        cluster = ()  # set of numbers
        cluster_str = s.split("\t", cnt)
        for string in cluster_str:  # transform  strings into numbers
            if string != "\n":
                vertex = int(string) - 1
                cluster = cluster + (vertex,)
        clusters.append(cluster)
        s = f.readline()

    # fall back to one cluster per vertex if bigclam produced no communities
    if len(clusters) == 0:
        labels = xrange(n_vertex)
        clusters = tf.compute_clusters_from_labels(labels)

    return [labels, clusters, exectime]
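A minimal usage sketch for compute_bigclam, assuming the lib\bigclam.exe binary and the data\ and functions\ directories are laid out as the function expects; the toy graph and the choice of two clusters are made up for illustration.

# hypothetical call: a 4-vertex graph given as 0-based vertex pairs
edge_list = [(0, 1), (0, 2), (1, 2), (2, 3)]
labels, clusters, exectime = compute_bigclam(4, edge_list, 2)
print(clusters)  # list of tuples of 0-based vertex ids, one tuple per detected community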
Code Example #2
File: model_builder.py Project: zurk/lightcluster
def compute_spectral_clustering(n_vertex, edge_list, n_clusters):

    from sklearn.cluster import SpectralClustering

    clst = SpectralClustering(n_clusters, affinity="precomputed")

    adjacency_matrix = tf.compute_adjacency_matrix(n_vertex, edge_list)

    t = time.time()
    labels = clst.fit_predict(adjacency_matrix)  # n_clusters was already set in the constructor
    exectime = time.time() - t

    labels = tf.compute_normal_labels(labels)

    clusters = tf.compute_clusters_from_labels(labels)

    return labels, clusters, exectime
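For reference, here is the same affinity="precomputed" pattern as a self-contained sketch that skips the repo's tf helpers; the tiny adjacency matrix (two triangles joined by one edge) is made up for illustration.

import numpy as np
from sklearn.cluster import SpectralClustering

# two 3-cliques joined by a single edge between vertices 2 and 3
A = np.array([[0, 1, 1, 0, 0, 0],
              [1, 0, 1, 0, 0, 0],
              [1, 1, 0, 1, 0, 0],
              [0, 0, 1, 0, 1, 1],
              [0, 0, 0, 1, 0, 1],
              [0, 0, 0, 1, 1, 0]])

clst = SpectralClustering(n_clusters=2, affinity="precomputed")
labels = clst.fit_predict(A)  # one integer label per vertex
print(labels)  # the two triangles should come out as two clusters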
Code Example #3
File: bench.py Project: zurk/lightcluster
def make_optimal_experiment(algorithms=None, datasets=None):
	
	if algorithms is None:
		raise TypeError("Algorithms are not given")
	if datasets is None:
		raise TypeError("Datasets are not given")

	available_algorithms = ['Spectral', 'SCAN', 'GreedyNewman', 'Walktrap', 'LPA', 'CFinder', 'Clauset-Newman', 'Bigclam']
	available_datasets = ['football.txt', 'polbooks.txt', 'protein_new.txt', 'amazon.txt', 'scientists_new.txt', 'karate.txt',
						  'facebook.txt', 'cliques.txt', 'nested.txt', 'stars.txt', 'cycles.txt']

	for algorithm in algorithms:
		if algorithm not in available_algorithms:
			print 'Algorithm ' + algorithm + ' is unavailable!'

	for dataset in datasets:
		if dataset not in available_datasets:
			print 'Dataset ' + dataset + ' is unavailable!'

	result = {}

	for dataset in datasets:
		if dataset not in available_datasets:
			continue
		from get_parameters import get_optimal_parameters
		parameters = get_optimal_parameters(dataset, recompute=False)
		n_clusters = parameters['n_clusters']
		n_steps = parameters['n_steps']
		clique_size = parameters['clique_size']
		neighbours_threshold = parameters['neighbours_threshold']
		similarity_threshold = parameters['similarity_threshold']

		for algorithm in algorithms:
			if algorithm not in available_algorithms:
				continue

			from load_data import download_graph
			n_vertex, edge_list = download_graph('data\\'+dataset)

			from model_builder import clustering
			#try:
			lbls, clrs, exectime = clustering(algorithm, n_vertex, edge_list, n_clusters, neighbours_threshold,  similarity_threshold, n_steps, clique_size)
			#except:
			#	continue

			from load_data import write_labels, write_clusters
			if lbls is not None:
				write_labels(algorithm, dataset, lbls)
			if clrs is not None:
				write_clusters(algorithm, dataset, clrs)

			result[algorithm, dataset, 'Time'] = exectime

			from cluster_metrics import compute_overlapping_modularity, compute_modularity, compute_ratio_cut, compute_normalized_cut
			if algorithm in ['LPA', 'Walktrap', 'GreedyNewman', 'Clauset-Newman', 'Spectral']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
				result[algorithm, dataset, 'Modularity'] = compute_modularity(lbls, edge_list)
				result[algorithm, dataset, 'RatioCut'] = compute_ratio_cut(lbls, clrs, edge_list)
				result[algorithm, dataset, 'NormCut'] = compute_normalized_cut(lbls, clrs, edge_list)

			elif algorithm in ['Bigclam', 'CFinder']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
			elif algorithm in ['SCAN']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
				result[algorithm, dataset, 'RatioCut'] = compute_ratio_cut(lbls, clrs, edge_list)
				result[algorithm, dataset, 'NormCut'] = compute_normalized_cut(lbls, clrs, edge_list)

			lbls_true = None
			clrs_true = None
			import os
			if os.path.isfile('data\\'+dataset[:-4]+'_labels.txt'):
				from load_data import download_labels
				lbls_true = download_labels('data\\'+dataset[:-4]+'_labels.txt')
				from transform_functions import compute_clusters_from_labels
				clrs_true = compute_clusters_from_labels(lbls_true)

			elif os.path.isfile('data\\'+dataset[:-4]+'_clusters.txt'):
				from load_data import download_clusters
				clrs_true = download_clusters('data\\'+dataset[:-4]+'_clusters.txt')
			
			if clrs_true is None:
				result[algorithm, dataset, 'Precision'] = None
				result[algorithm, dataset, 'Recall'] = None
				result[algorithm, dataset, 'Average F1'] = None
			else:
				from cluster_metrics import compute_recall, compute_precision, compute_avg_f1
				result[algorithm, dataset, 'Precision'] = compute_precision(clrs_true, clrs)
				result[algorithm, dataset, 'Recall'] = compute_recall(clrs_true, clrs)
				result[algorithm, dataset, 'Average F1'] = compute_avg_f1(clrs_true, clrs)

			if algorithm != 'SCAN':
				if lbls_true is None:
					result[algorithm, dataset, 'NMI'] = None
					result[algorithm, dataset, 'ARS'] = None
				elif lbls is not None:
					from cluster_metrics import compute_nmi, compute_ars
					result[algorithm, dataset, 'NMI'] = compute_nmi(lbls_true, lbls)
					result[algorithm, dataset, 'ARS'] = compute_ars(lbls_true, lbls)
	
	return result
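A hypothetical invocation; the algorithm and dataset names must come from the whitelists above, the data files and cached optimal parameters are assumed to be in place, and the returned dictionary is keyed by (algorithm, dataset, metric) tuples.

result = make_optimal_experiment(algorithms=['Spectral', 'LPA'],
                                 datasets=['karate.txt', 'football.txt'])
print(result['Spectral', 'karate.txt', 'Time'])  # wall-clock seconds
print(result['Spectral', 'karate.txt', 'Modularity'])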
Code Example #4
File: bench.py Project: zurk/lightcluster
def make_experiment(algorithms=None, datasets=None, **kwargs):
	
	if algorithms is None:
		raise TypeError("Algorithms are not given")
	if datasets is None:
		raise TypeError("Datasets are not given")

	recognized = ['n_clusters', 'neighbours_threshold', 'similarity_threshold', 'n_steps', 'clique_size']

	for key in kwargs:
		if key not in recognized:
			raise TypeError("Keyword argument '%s' is not recognized!\n"
							"Available keywords are: '%s'" % (key, "', '".join(recognized)))

	n_clusters = kwargs.get('n_clusters')
	neighbours_threshold = kwargs.get('neighbours_threshold')
	similarity_threshold = kwargs.get('similarity_threshold')
	n_steps = kwargs.get('n_steps')
	clique_size = kwargs.get('clique_size')

	available_algorithms = ['Spectral', 'SCAN', 'GreedyNewman', 'Walktrap', 'LPA', 'CFinder', 'Clauset-Newman', 'Bigclam']
	available_datasets = ['football.txt', 'polbooks.txt', 'protein_new.txt', 'amazon.txt', 'scientists_new.txt', 'karate.txt',
						  'facebook.txt', 'cliques.txt', 'nested.txt', 'stars.txt', 'cycles.txt']

	for algorithm in algorithms:
		if algorithm not in available_algorithms:
			print 'Algorithm ' + algorithm + ' is unavailable!'

	for dataset in datasets:
		if dataset not in available_datasets:
			print 'Dataset ' + dataset + ' is unavailable!'

	result = {}

	for algorithm in algorithms:
		if algorithm not in available_algorithms:
			continue

		fit, n_clusters, similarity_threshold, neighbours_threshold, n_steps, clique_size = fit_algo_params(algorithm, n_clusters, 
																										 similarity_threshold, neighbours_threshold, n_steps, clique_size)
		if not fit:
			continue

		for dataset in datasets:
			if dataset not in available_datasets:
				continue

			from load_data import download_graph
			n_vertex, edge_list = download_graph('data\\'+dataset)

			from model_builder import clustering
			#try:
			lbls, clrs, exectime = clustering(algorithm, n_vertex, edge_list, n_clusters, neighbours_threshold,  similarity_threshold, n_steps, clique_size)
			#except:
			#	continue

			from load_data import write_labels, write_clusters
			if lbls is not None:
				write_labels(algorithm, dataset, lbls)
			if clrs is not None:
				write_clusters(algorithm, dataset, clrs)

			result[algorithm, dataset, 'Time'] = exectime

			from cluster_metrics import compute_overlapping_modularity, compute_modularity, compute_ratio_cut, compute_normalized_cut
			if algorithm in ['LPA', 'Walktrap', 'GreedyNewman', 'Clauset-Newman', 'Spectral']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
				result[algorithm, dataset, 'Modularity'] = compute_modularity(lbls, edge_list)
				result[algorithm, dataset, 'RatioCut'] = compute_ratio_cut(lbls, clrs, edge_list)
				result[algorithm, dataset, 'NormCut'] = compute_normalized_cut(lbls, clrs, edge_list)

			elif algorithm in ['Bigclam', 'CFinder']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
			elif algorithm in ['SCAN']:
				result[algorithm, dataset, 'My modularity'] = compute_overlapping_modularity(clrs, n_vertex, edge_list)
				result[algorithm, dataset, 'RatioCut'] = compute_ratio_cut(lbls, clrs, edge_list)
				result[algorithm, dataset, 'NormCut'] = compute_normalized_cut(lbls, clrs, edge_list)

			lbls_true = None
			clrs_true = None
			import os
			if os.path.isfile('data\\'+dataset[:-4]+'_labels.txt'):
				from load_data import download_labels
				lbls_true = download_labels('data\\'+dataset[:-4]+'_labels.txt')
				from transform_functions import compute_clusters_from_labels
				clrs_true = compute_clusters_from_labels(lbls_true)

			elif os.path.isfile('data\\'+dataset[:-4]+'_clusters.txt'):
				from load_data import download_clusters
				clrs_true = download_clusters('data\\'+dataset[:-4]+'_clusters.txt')
			
			if clrs_true is None:
				result[algorithm, dataset, 'Precision'] = None
				result[algorithm, dataset, 'Recall'] = None
				result[algorithm, dataset, 'Average F1'] = None
			else:
				from cluster_metrics import compute_recall, compute_precision, compute_avg_f1
				result[algorithm, dataset, 'Precision'] = compute_precision(clrs_true, clrs)
				result[algorithm, dataset, 'Recall'] = compute_recall(clrs_true, clrs)
				result[algorithm, dataset, 'Average F1'] = compute_avg_f1(clrs_true, clrs)

			if algorithm != 'SCAN':
				if lbls_true is None:
					result[algorithm, dataset, 'NMI'] = None
					result[algorithm, dataset, 'ARS'] = None
				elif lbls is not None:
					from cluster_metrics import compute_nmi, compute_ars
					result[algorithm, dataset, 'NMI'] = compute_nmi(lbls_true, lbls)
					result[algorithm, dataset, 'ARS'] = compute_ars(lbls_true, lbls)
	
	return result
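make_experiment uses the same (algorithm, dataset, metric) key layout, but takes the algorithm parameters as keyword arguments instead of looking up cached optima; a hypothetical call, assuming data\karate.txt is present:

result = make_experiment(algorithms=['Bigclam'],
                         datasets=['karate.txt'],
                         n_clusters=2)
print(result['Bigclam', 'karate.txt', 'My modularity'])
print(result['Bigclam', 'karate.txt', 'Time'])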
Code Example #5
File: testing(single).py Project: zurk/lightcluster
print "Modularity = " + str(compute_modularity(lbls_pred, edge_list))
print "Overlapping modularity = " + str(compute_overlapping_modularity(clrs_pred, n_vertex, edge_list))
print "RatioCut = " + str(compute_ratio_cut(lbls_pred, clrs_pred, edge_list))
print "NormalizedCut = " + str(compute_normalized_cut(lbls_pred, clrs_pred, edge_list))


"""
If ground-truth communities are known, you can load them.
After that you can calculate performance (ground-truth) metrics, such as: average F1-score, average recall, average precision, 
																		  normalized mutual information (NMI), adjusted_rand_score (ARS)
"""
#if true labels are known
from load_data import download_labels, download_clusters
lbls_true = download_labels('data\\'+dataset[:-4]+'_labels.txt')
from transform_functions import compute_clusters_from_labels
clrs_true = compute_clusters_from_labels(lbls_true)

#if only true clusters are known
#clrs_true = download_clusters('data\\'+dataset[:-4]+'_clusters.txt')

from cluster_metrics import compute_avg_f1, compute_recall, compute_precision, compute_nmi, compute_ars
print "Average F1-score = " + str(compute_avg_f1(clrs_true, clrs_pred))
print "Average recall = " + str(compute_recall(clrs_true, clrs_pred))
print "Average precision = " + str(compute_precision(clrs_true, clrs_pred))
print "NMI = " + str(compute_nmi(lbls_true, lbls_pred))
print "ARS = " + str(compute_ars(lbls_true, lbls_pred))