def dbscan(edgelist=None, distance_matrix=None, threshold=None):
    """Cluster with scikit-learn's DBSCAN on a precomputed distance matrix.

    :param edgelist: optional edge list; when given it is converted with
        ``utils.edgelist_to_distance_matrix`` and the result replaces
        ``distance_matrix``.
    :param distance_matrix: precomputed distances, used when ``edgelist``
        is None.
    :param threshold: value handed to DBSCAN as ``eps``. Any falsy value
        (``None`` or ``0``) selects the default of 2.8.
    :returns: the labels returned by ``sklearn.cluster.dbscan``.
    """
    if edgelist is not None:
        distance_matrix, _ = utils.edgelist_to_distance_matrix(edgelist)

    eps = threshold if threshold else 2.8
    _core_samples, cluster_labels = sklearn.cluster.dbscan(
        distance_matrix,
        eps=eps,
        min_samples=2,
        metric='precomputed',
        algorithm='brute',
    )
    return cluster_labels
def test_edgelist_to_distance_matrix1():
    """Check names and first-column distances for a three-node edge list."""
    edgelist = numpy.array(
        [('node002', 'node001', 2.0),
         ('node003', 'node001', 4.0),
         ('node003', 'node002', 1.2)],
        # NumPy dtype strings count bytes, not bits: 'f8' is the 64-bit
        # float. 'f64' is not a recognised type code.
        dtype=[('n1', 'S7'), ('n2', 'S7'), ('d', 'f8')])

    matrix, names = utils.edgelist_to_distance_matrix(edgelist)

    expected = ['node001', 'node002', 'node003']
    # Compare the whole sequence: zip() would silently stop at the shorter
    # input and let a truncated or empty `names` pass.
    assert list(names) == expected

    assert matrix.shape == (3, 3)
    assert matrix[0][0] == 0.0
    assert matrix[1][0] == 2.0
    assert matrix[2][0] == 4.0
def spectral(edgelist=None, distance_matrix=None, n_clusters=10):
    """Cluster with scikit-learn's SpectralClustering.

    :param edgelist: optional edge list; when given it is converted with
        ``utils.edgelist_to_distance_matrix`` and the result replaces
        ``distance_matrix``.
    :param distance_matrix: matrix handed to ``fit_predict``, used when
        ``edgelist`` is None.
    :param n_clusters: number of clusters to produce; defaults to 10, the
        value that used to be hard-coded.
    :returns: the labels returned by ``SpectralClustering.fit_predict``.
    """
    if edgelist is not None:
        distance_matrix, _ = utils.edgelist_to_distance_matrix(edgelist)

    # NOTE(review): scikit-learn documents affinity='precomputed' as taking
    # a similarity matrix (larger value = more alike). The distance matrix
    # is passed through unchanged here, exactly as before -- confirm whether
    # a distance-to-similarity transform is wanted.
    model = sklearn.cluster.SpectralClustering(n_clusters=n_clusters,
                                               affinity='precomputed')
    return model.fit_predict(distance_matrix)
def test_edgelist_to_distance_matrix1():
    """Convert a small three-node edge list and verify names and distances."""
    edges = [
        ('node002', 'node001', 2.0),
        ('node003', 'node001', 4.0),
        ('node003', 'node002', 1.2),
    ]
    # The digits in a NumPy dtype string are a byte count, so a 64-bit float
    # is spelled 'f8'; 'f64' is not a valid type code.
    edgelist = numpy.array(
        edges, dtype=[('n1', 'S7'), ('n2', 'S7'), ('d', 'f8')])

    matrix, names = utils.edgelist_to_distance_matrix(edgelist)

    # Whole-list equality also checks the length; an element-wise zip()
    # comparison would pass vacuously if `names` came back short.
    assert list(names) == ['node001', 'node002', 'node003']

    assert matrix.shape == (3, 3)
    assert matrix[0][0] == 0.0
    assert matrix[1][0] == 2.0
    assert matrix[2][0] == 4.0
def hierarchical_clustering(edgelist=None, distance_matrix=None, names=None,
                            method='complete', threshold=None):
    """Create a flat clustering by cutting a hierarchical clustering tree.

    :param edgelist: optional edge list; when given it is converted with
        ``utils.edgelist_to_distance_matrix`` and the result replaces
        ``distance_matrix`` and ``names``.
    :param distance_matrix: pairwise distances, either as a square matrix
        or as an already condensed 1-D vector.
    :param names: accepted for interface compatibility; not used for the
        clustering itself.
    :param method: linkage method passed to ``sch.linkage``.
    :param threshold: distance at which the tree is cut. ``None`` selects
        0.7 times the largest merge distance; an explicit ``0`` is honoured.
    :returns: the flat cluster labels returned by ``sch.fcluster``.
    :raises ValueError: if a square ``distance_matrix`` is not a valid
        distance matrix (``squareform`` checks symmetry and a zero diagonal).
    """
    # Local imports keep this block self-contained.
    import numpy
    from scipy.spatial.distance import squareform

    if edgelist is not None:
        distance_matrix, names = utils.edgelist_to_distance_matrix(edgelist)

    distances = numpy.asarray(distance_matrix)
    # sch.linkage treats 2-D input as observation vectors, not as pairwise
    # distances, so a square distance matrix has to be condensed first.
    if distances.ndim == 2 and distances.shape[0] == distances.shape[1]:
        distances = squareform(distances)

    linkage = sch.linkage(distances, method=method)

    # `threshold or default` would discard a legitimate threshold of 0.
    if threshold is None:
        threshold = 0.7 * linkage[:, 2].max()

    return sch.fcluster(linkage, threshold, criterion='distance')
def test_edgelist_to_distance_matrix2():
    """Convert the Pentax edge list file and verify names and two rows."""
    edgelist = utils.read_edgelist_file(test_files + 'edgelist-pentax-pce.txt')
    matrix, names = utils.edgelist_to_distance_matrix(edgelist)

    # Sequences are compared as a whole: an element-wise zip() comparison
    # stops at the shorter input and would accept truncated results.
    expected = ['Pentax_OptioA40_0_30521.JPG',
                'Pentax_OptioA40_0_30522.JPG',
                'Pentax_OptioA40_0_30523.JPG',
                'Pentax_OptioA40_0_30524.JPG',
                'Pentax_OptioA40_0_30525.JPG']
    assert list(names) == expected

    assert matrix.shape == (5, 5)

    expected = [0.0, 704.17228119005165, 433.30115575430841,
                154.4174623739334, 658.55780052635578]
    assert list(matrix[0, :]) == expected

    expected = [704.17228119005165, 0.0, 476.46750462930402,
                244.28991913458535, 585.63345061046823]
    assert list(matrix[1, :]) == expected
def agglomerative_clustering(edgelist=None, distance_matrix=None,
                             num_clusters=4, method='complete',
                             metric='precomputed'):
    """Run scikit-learn's AgglomerativeClustering with prompted settings.

    The number of clusters, the linkage method and (unless the method is
    ``'ward'``) the metric are read interactively with ``input()``.

    :param edgelist: optional edge list; when given it is converted with
        ``utils.edgelist_to_distance_matrix`` and the result replaces
        ``distance_matrix``.
    :param distance_matrix: matrix passed both to ``fit`` and as the
        model's ``connectivity``.
    :param num_clusters: overwritten by the interactive answer.
    :param method: overwritten by the interactive answer.
    :param metric: overwritten by the interactive answer, or by
        ``'euclidean'`` when the chosen method is ``'ward'``.
    :returns: ``labels_`` of the fitted model.
    :raises ValueError: if the answer for the number of clusters is not an
        integer literal.
    """
    if edgelist is not None:
        distance_matrix, _ = utils.edgelist_to_distance_matrix(edgelist)

    # NOTE(review): the prompts below unconditionally replace the
    # num_clusters / method / metric arguments; kept as-is because callers
    # may rely on the interactive behaviour.
    num_clusters = int(input("Enter the number of clusters: "))

    method_options = list(_TREE_BUILDERS.keys())
    print('The list of available methods:', method_options, file=sys.stdout)
    method = input('Input the method name:').strip()

    if method == 'ward':
        # The ward method is always paired with the euclidean metric here.
        metric = 'euclidean'
    else:
        metric_options = ['precomputed', 'cosine', 'euclidean', 'cityblock']
        print('The list of available metrics:', metric_options, file=sys.stdout)
        metric = input('Input the metric name:').strip()

    # NOTE(review): the distance matrix is also passed as `connectivity`;
    # confirm that this is intended.
    model = sklearn.cluster.AgglomerativeClustering(
        linkage=method,
        affinity=metric,
        n_clusters=num_clusters,
        connectivity=distance_matrix,
        compute_full_tree='auto',
    )
    model = model.fit(distance_matrix)
    labels = model.labels_
    print(method, metric)
    return labels
def test_edgelist_to_distance_matrix2():
    """Read the Pentax edge list file and check the resulting matrix."""
    edgelist = utils.read_edgelist_file(test_files + 'edgelist-pentax-pce.txt')
    matrix, names = utils.edgelist_to_distance_matrix(edgelist)

    expected_names = [
        'Pentax_OptioA40_0_30521.JPG',
        'Pentax_OptioA40_0_30522.JPG',
        'Pentax_OptioA40_0_30523.JPG',
        'Pentax_OptioA40_0_30524.JPG',
        'Pentax_OptioA40_0_30525.JPG',
    ]
    # Comparing full lists checks the length as well; pairing items with
    # zip() would let a too-short result pass unnoticed.
    assert list(names) == expected_names

    assert matrix.shape == (5, 5)

    expected_row0 = [
        0.0,
        704.17228119005165,
        433.30115575430841,
        154.4174623739334,
        658.55780052635578,
    ]
    assert list(matrix[0, :]) == expected_row0

    expected_row1 = [
        704.17228119005165,
        0.0,
        476.46750462930402,
        244.28991913458535,
        585.63345061046823,
    ]
    assert list(matrix[1, :]) == expected_row1