def make_optimal_experiment(algorithms=None, datasets=None):
    """Run the given algorithms on the given datasets with stored parameters.

    For every supported dataset the parameter dictionary returned by
    ``get_optimal_parameters(dataset, recompute=False)`` is read, and every
    supported algorithm is then run on that dataset through ``clustering``.
    Predicted labels / clusters that are not ``None`` are written out with
    ``write_labels`` / ``write_clusters``.

    Unsupported algorithm or dataset names are reported on stdout and
    skipped; they do not abort the experiment.

    :param algorithms: iterable of algorithm names.
    :param datasets: iterable of dataset file names (looked up under ``data``).
    :returns: dict keyed by ``(algorithm, dataset, metric_name)`` tuples.
        Metric names used are 'Time', 'My modularity', 'Modularity',
        'RatioCut', 'NormCut', 'Precision', 'Recall', 'Average F1', 'NMI'
        and 'ARS'; which of them are present depends on the algorithm and
        on whether ground-truth files exist for the dataset.
    :raises TypeError: if ``algorithms`` or ``datasets`` is ``None``.
    """
    if algorithms is None:
        raise TypeError("Algorithms are not given\n")
    if datasets is None:
        raise TypeError("Datasets are not given\n")

    import os

    # Single source of truth for what this function accepts.
    known_algorithms = ('Spectral', 'SCAN', 'GreedyNewman', 'Walktrap',
                        'LPA', 'CFinder', 'Clauset-Newman', 'Bigclam')
    known_datasets = ('football.txt', 'polbooks.txt', 'protein_new.txt',
                      'amazon.txt', 'scientists_new.txt', 'karate.txt',
                      'facebook.txt', 'cliques.txt', 'nested.txt',
                      'stars.txt', 'cycles.txt')
    # Algorithm groups that determine which goodness metrics are recorded.
    partition_algorithms = ('LPA', 'Walktrap', 'GreedyNewman',
                            'Clauset-Newman', 'Spectral')
    overlapping_algorithms = ('Bigclam', 'CFinder')

    # Report everything that will be skipped before doing any work.
    for algorithm in algorithms:
        if algorithm not in known_algorithms:
            print('Algorithm ' + algorithm + ' is unavailable!\n')
    for dataset in datasets:
        if dataset not in known_datasets:
            print('Dataset ' + dataset + ' is unavailable!\n')

    result = {}
    for dataset in datasets:
        if dataset not in known_datasets:
            continue

        # Project imports stay lazy so that a call with nothing valid to run
        # never needs the heavy modules.
        from get_parameters import get_optimal_parameters
        parameters = get_optimal_parameters(dataset, recompute=False)
        n_clusters = parameters['n_clusters']
        n_steps = parameters['n_steps']
        clique_size = parameters['clique_size']
        neighbours_threshold = parameters['neighbours_threshold']
        similarity_threshold = parameters['similarity_threshold']

        # os.path.join yields 'data\\<name>' on Windows (as before) and a
        # usable path on other platforms.
        stem = os.path.splitext(dataset)[0]
        graph_path = os.path.join('data', dataset)
        labels_path = os.path.join('data', stem + '_labels.txt')
        clusters_path = os.path.join('data', stem + '_clusters.txt')

        for algorithm in algorithms:
            if algorithm not in known_algorithms:
                continue

            from load_data import download_graph
            n_vertex, edge_list = download_graph(graph_path)

            from model_builder import clustering
            lbls, clrs, exectime = clustering(
                algorithm, n_vertex, edge_list, n_clusters,
                neighbours_threshold, similarity_threshold, n_steps,
                clique_size)

            from load_data import write_labels, write_clusters
            # Identity checks: '!= None' would compare element-wise on
            # array-like results.
            if lbls is not None:
                write_labels(algorithm, dataset, lbls)
            if clrs is not None:
                write_clusters(algorithm, dataset, clrs)

            result[algorithm, dataset, 'Time'] = exectime

            # Goodness metrics (no ground truth needed).
            from cluster_metrics import (compute_overlapping_modularity,
                                         compute_modularity,
                                         compute_ratio_cut,
                                         compute_normalized_cut)
            if algorithm in partition_algorithms:
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
                result[algorithm, dataset, 'Modularity'] = \
                    compute_modularity(lbls, edge_list)
                result[algorithm, dataset, 'RatioCut'] = \
                    compute_ratio_cut(lbls, clrs, edge_list)
                result[algorithm, dataset, 'NormCut'] = \
                    compute_normalized_cut(lbls, clrs, edge_list)
            elif algorithm in overlapping_algorithms:
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
            elif algorithm == 'SCAN':
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
                result[algorithm, dataset, 'RatioCut'] = \
                    compute_ratio_cut(lbls, clrs, edge_list)
                result[algorithm, dataset, 'NormCut'] = \
                    compute_normalized_cut(lbls, clrs, edge_list)

            # Ground truth: a labels file takes precedence over a clusters
            # file; with neither, both stay None.
            lbls_true = None
            clrs_true = None
            if os.path.isfile(labels_path):
                from load_data import download_labels
                lbls_true = download_labels(labels_path)
                from transform_functions import compute_clusters_from_labels
                clrs_true = compute_clusters_from_labels(lbls_true)
            elif os.path.isfile(clusters_path):
                from load_data import download_clusters
                clrs_true = download_clusters(clusters_path)

            # Cluster-based performance metrics.
            if clrs_true is None:
                result[algorithm, dataset, 'Precision'] = None
                result[algorithm, dataset, 'Recall'] = None
                result[algorithm, dataset, 'Average F1'] = None
            else:
                from cluster_metrics import (compute_recall,
                                             compute_precision,
                                             compute_avg_f1)
                result[algorithm, dataset, 'Precision'] = \
                    compute_precision(clrs_true, clrs)
                result[algorithm, dataset, 'Recall'] = \
                    compute_recall(clrs_true, clrs)
                result[algorithm, dataset, 'Average F1'] = \
                    compute_avg_f1(clrs_true, clrs)

            # Label-based performance metrics; never recorded for SCAN.
            # When true labels exist but the algorithm produced no labels,
            # the 'NMI'/'ARS' keys are left absent (unchanged behaviour).
            if algorithm != 'SCAN':
                if lbls_true is None:
                    result[algorithm, dataset, 'NMI'] = None
                    result[algorithm, dataset, 'ARS'] = None
                elif lbls is not None:
                    from cluster_metrics import compute_nmi, compute_ars
                    result[algorithm, dataset, 'NMI'] = \
                        compute_nmi(lbls_true, lbls)
                    result[algorithm, dataset, 'ARS'] = \
                        compute_ars(lbls_true, lbls)

    return result
def make_experiment(algorithms=None, datasets=None, **kwargs):
    """Run the given algorithms on the given datasets with caller parameters.

    Recognized keyword arguments are ``n_clusters``, ``neighbours_threshold``,
    ``similarity_threshold``, ``n_steps`` and ``clique_size``; each defaults
    to ``None``. For every supported algorithm the parameters are passed
    through ``fit_algo_params``; if it reports that they do not fit, the
    algorithm is skipped. Otherwise the algorithm is run on every supported
    dataset through ``clustering``, and predicted labels / clusters that are
    not ``None`` are written out with ``write_labels`` / ``write_clusters``.

    Unsupported algorithm or dataset names are reported on stdout and
    skipped; they do not abort the experiment.

    :param algorithms: iterable of algorithm names.
    :param datasets: iterable of dataset file names (looked up under ``data``).
    :param kwargs: algorithm parameters, see the recognized names above.
    :returns: dict keyed by ``(algorithm, dataset, metric_name)`` tuples.
        Metric names used are 'Time', 'My modularity', 'Modularity',
        'RatioCut', 'NormCut', 'Precision', 'Recall', 'Average F1', 'NMI'
        and 'ARS'; which of them are present depends on the algorithm and
        on whether ground-truth files exist for the dataset.
    :raises TypeError: if ``algorithms`` or ``datasets`` is ``None``, or if
        an unrecognized keyword argument is passed.
    """
    if algorithms is None:
        raise TypeError("Algorithms are not given\n")
    if datasets is None:
        raise TypeError("Datasets are not given\n")

    import os

    recognized = ['n_clusters', 'neighbours_threshold',
                  'similarity_threshold', 'n_steps', 'clique_size']
    # Every recognized parameter starts as None and is overridden by kwargs.
    params = dict.fromkeys(recognized)
    for key, value in kwargs.items():
        if key not in recognized:
            raise TypeError(
                "Keyword argument '%s' is not recognized!\n"
                "Available keywords are:\n'" % key
                + "', '".join(recognized) + "'")
        params[key] = value
    n_clusters = params['n_clusters']
    neighbours_threshold = params['neighbours_threshold']
    similarity_threshold = params['similarity_threshold']
    n_steps = params['n_steps']
    clique_size = params['clique_size']

    # Single source of truth for what this function accepts.
    known_algorithms = ('Spectral', 'SCAN', 'GreedyNewman', 'Walktrap',
                        'LPA', 'CFinder', 'Clauset-Newman', 'Bigclam')
    known_datasets = ('football.txt', 'polbooks.txt', 'protein_new.txt',
                      'amazon.txt', 'scientists_new.txt', 'karate.txt',
                      'facebook.txt', 'cliques.txt', 'nested.txt',
                      'stars.txt', 'cycles.txt')
    # Algorithm groups that determine which goodness metrics are recorded.
    partition_algorithms = ('LPA', 'Walktrap', 'GreedyNewman',
                            'Clauset-Newman', 'Spectral')
    overlapping_algorithms = ('Bigclam', 'CFinder')

    # Report everything that will be skipped before doing any work.
    for algorithm in algorithms:
        if algorithm not in known_algorithms:
            print('Algorithm ' + algorithm + ' is unavailable!\n')
    for dataset in datasets:
        if dataset not in known_datasets:
            print('Dataset ' + dataset + ' is unavailable!\n')

    result = {}
    for algorithm in algorithms:
        if algorithm not in known_algorithms:
            continue

        # NOTE(review): the values returned here replace the caller-supplied
        # parameters and are therefore also what the NEXT algorithm's
        # fit_algo_params call receives. Kept as in the original; confirm
        # against fit_algo_params whether this carry-over is intended.
        (fit, n_clusters, similarity_threshold, neighbours_threshold,
         n_steps, clique_size) = fit_algo_params(
            algorithm, n_clusters, similarity_threshold,
            neighbours_threshold, n_steps, clique_size)
        if not fit:
            continue

        for dataset in datasets:
            if dataset not in known_datasets:
                continue

            # os.path.join yields 'data\\<name>' on Windows (as before) and
            # a usable path on other platforms.
            stem = os.path.splitext(dataset)[0]
            graph_path = os.path.join('data', dataset)
            labels_path = os.path.join('data', stem + '_labels.txt')
            clusters_path = os.path.join('data', stem + '_clusters.txt')

            # Project imports stay lazy so that a call with nothing valid to
            # run never needs the heavy modules.
            from load_data import download_graph
            n_vertex, edge_list = download_graph(graph_path)

            from model_builder import clustering
            lbls, clrs, exectime = clustering(
                algorithm, n_vertex, edge_list, n_clusters,
                neighbours_threshold, similarity_threshold, n_steps,
                clique_size)

            from load_data import write_labels, write_clusters
            # Identity checks: '!= None' would compare element-wise on
            # array-like results.
            if lbls is not None:
                write_labels(algorithm, dataset, lbls)
            if clrs is not None:
                write_clusters(algorithm, dataset, clrs)

            result[algorithm, dataset, 'Time'] = exectime

            # Goodness metrics (no ground truth needed).
            from cluster_metrics import (compute_overlapping_modularity,
                                         compute_modularity,
                                         compute_ratio_cut,
                                         compute_normalized_cut)
            if algorithm in partition_algorithms:
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
                result[algorithm, dataset, 'Modularity'] = \
                    compute_modularity(lbls, edge_list)
                result[algorithm, dataset, 'RatioCut'] = \
                    compute_ratio_cut(lbls, clrs, edge_list)
                result[algorithm, dataset, 'NormCut'] = \
                    compute_normalized_cut(lbls, clrs, edge_list)
            elif algorithm in overlapping_algorithms:
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
            elif algorithm == 'SCAN':
                result[algorithm, dataset, 'My modularity'] = \
                    compute_overlapping_modularity(clrs, n_vertex, edge_list)
                result[algorithm, dataset, 'RatioCut'] = \
                    compute_ratio_cut(lbls, clrs, edge_list)
                result[algorithm, dataset, 'NormCut'] = \
                    compute_normalized_cut(lbls, clrs, edge_list)

            # Ground truth: a labels file takes precedence over a clusters
            # file; with neither, both stay None.
            lbls_true = None
            clrs_true = None
            if os.path.isfile(labels_path):
                from load_data import download_labels
                lbls_true = download_labels(labels_path)
                from transform_functions import compute_clusters_from_labels
                clrs_true = compute_clusters_from_labels(lbls_true)
            elif os.path.isfile(clusters_path):
                from load_data import download_clusters
                clrs_true = download_clusters(clusters_path)

            # Cluster-based performance metrics.
            if clrs_true is None:
                result[algorithm, dataset, 'Precision'] = None
                result[algorithm, dataset, 'Recall'] = None
                result[algorithm, dataset, 'Average F1'] = None
            else:
                from cluster_metrics import (compute_recall,
                                             compute_precision,
                                             compute_avg_f1)
                result[algorithm, dataset, 'Precision'] = \
                    compute_precision(clrs_true, clrs)
                result[algorithm, dataset, 'Recall'] = \
                    compute_recall(clrs_true, clrs)
                result[algorithm, dataset, 'Average F1'] = \
                    compute_avg_f1(clrs_true, clrs)

            # Label-based performance metrics; never recorded for SCAN.
            # When true labels exist but the algorithm produced no labels,
            # the 'NMI'/'ARS' keys are left absent (unchanged behaviour).
            if algorithm != 'SCAN':
                if lbls_true is None:
                    result[algorithm, dataset, 'NMI'] = None
                    result[algorithm, dataset, 'ARS'] = None
                elif lbls is not None:
                    from cluster_metrics import compute_nmi, compute_ars
                    result[algorithm, dataset, 'NMI'] = \
                        compute_nmi(lbls_true, lbls)
                    result[algorithm, dataset, 'ARS'] = \
                        compute_ars(lbls_true, lbls)

    return result
# Tutorial fragment: uses clrs_pred, lbls_pred, time, algorithm, dataset,
# n_vertex and edge_list, which are expected to be defined earlier.
print(clrs_pred)
print(time)

from load_data import write_labels, write_clusters
#write communities (labels) into 'data\\answers\\'+'labels_'+algorithm+'_'+dataset
write_labels(algorithm, dataset, lbls_pred)
#write communities (clusters) into 'data\\answers\\'+'clusters_'+algorithm+'_'+dataset
# Clusters go through write_clusters; the previous write_labels call was
# handed the cluster structure by mistake.
write_clusters(algorithm, dataset, clrs_pred)

"""
You can calculate goodness (non ground-truth) metrics, such as:
modularity, overlapping modularity, ratio cut, normalized cut.
"""
from cluster_metrics import compute_modularity, compute_overlapping_modularity, compute_ratio_cut, compute_normalized_cut
print("Modularity = " + str(compute_modularity(lbls_pred, edge_list)))
print("Overlapping modularity = " + str(compute_overlapping_modularity(clrs_pred, n_vertex, edge_list)))
print("RatioCut = " + str(compute_ratio_cut(lbls_pred, clrs_pred, edge_list)))
print("NormalizedCut = " + str(compute_normalized_cut(lbls_pred, clrs_pred, edge_list)))

"""
If ground-truth communities are known, you can load them.
After that you can calculate performance (ground-truth) metrics, such as:
average F1-score, average recall, average precision,
normalized mutual information (NMI), adjusted_rand_score (ARS)
"""
#if true labels are known
from load_data import download_labels, download_clusters
lbls_true = download_labels('data\\'+dataset[:-4]+'_labels.txt')
from transform_functions import compute_clusters_from_labels
clrs_true = compute_clusters_from_labels(lbls_true)