def test_similarity_to_distance():
    """Check utils.similarity_to_distance on a 3x3 matrix with cutoff 20.0.

    The expected values are the ones recorded by the original test: the
    off-diagonal entries equal 20.0 divided by the similarity, and the zero
    diagonal stays zero.
    """
    # The builtin ``float`` replaces the ``numpy.float`` alias, which no
    # longer exists in current NumPy releases.
    matrix = numpy.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]], dtype=float)
    output = utils.similarity_to_distance(matrix, 20.0)
    expected = numpy.array([
        [0, 20.0, 10.0],
        [20.0, 0, 6.66666667],
        [10.0, 6.666666667, 0],
    ])
    assert output.shape == expected.shape
    # Compare the absolute difference against a small tolerance; a signed
    # difference checked against 1e4 would accept nearly any output.
    assert all(abs(output.ravel() - expected.ravel()) < 1e-4)
def hierarchical_dbscan(similarities=None, distance_cutoff=200):
    """Label the input with the Hierarchical DBSCAN (HDBSCAN) algorithm.

    The similarities are converted with ``utils.similarity_to_distance``
    (using ``distance_cutoff``) and the result is given to
    ``hdbscan.HDBSCAN`` configured with ``metric="precomputed"`` and
    ``min_samples=3``.

    Returns whatever ``fit_predict`` produces, or ``None`` when
    ``similarities`` is ``None``.
    """
    if similarities is not None:
        distances = utils.similarity_to_distance(similarities, distance_cutoff)
        model = hdbscan.HDBSCAN(metric="precomputed", min_samples=3)
        return model.fit_predict(distances)
    # Nothing to cluster.
    return None
def dbscan(similarities=None, threshold=None, distance_cutoff=200):
    """Label the input with the DBSCAN algorithm.

    The similarities are converted with ``utils.similarity_to_distance``
    (using ``distance_cutoff``) and clustered by ``sklearn.cluster.DBSCAN``
    with ``min_samples=3``, ``metric="precomputed"`` and
    ``algorithm="brute"``.

    ``threshold`` is passed as ``eps``; any falsy value (``None`` or ``0``)
    falls back to ``2.8``.

    Returns whatever ``fit_predict`` produces, or ``None`` when
    ``similarities`` is ``None``.
    """
    if similarities is None:
        return None

    distances = utils.similarity_to_distance(similarities, distance_cutoff)
    eps = threshold if threshold else 2.8
    model = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=3,
        metric="precomputed",
        algorithm="brute",
    )
    return model.fit_predict(distances)
def hierarchical_clustering(similarities=None, method='complete', threshold=None, distance_cutoff=200):
    """Create a flat clustering from hierarchical linkage and a threshold.

    The similarities are converted with ``utils.similarity_to_distance``
    (using ``distance_cutoff``), linked with ``sch.linkage`` using
    ``method``, and cut with ``sch.fcluster(..., criterion='distance')``.

    ``threshold`` is the cut distance; any falsy value (``None`` or ``0``)
    falls back to 0.7 times the largest merge distance in the linkage.

    Returns the labels from ``sch.fcluster``, or ``None`` when
    ``similarities`` is ``None``.
    """
    if similarities is None:
        return None

    # Imported locally so this block does not depend on the file's import list.
    from scipy.spatial.distance import squareform

    distance_matrix = utils.similarity_to_distance(similarities, distance_cutoff)

    # scipy's linkage() reads a 2-D array as observation vectors, not as
    # pairwise distances. A square matrix is therefore condensed first.
    # checks=False skips the symmetry/zero-diagonal validation; squareform
    # does not use those entries for the condensed form.
    shape = getattr(distance_matrix, "shape", ())
    if len(shape) == 2 and shape[0] == shape[1]:
        distance_matrix = squareform(distance_matrix, checks=False)

    linkage = sch.linkage(distance_matrix, method=method)
    # Column 2 of the linkage matrix holds the merge distances.
    threshold = threshold or 0.7 * linkage[:, 2].max()
    labels = sch.fcluster(linkage, threshold, criterion='distance')
    return labels
args = parse_arguments() edgelist = None if args.edgelist: print("edgelist filename=" + args.edgelist) edgelist = utils.read_edgelist_file(args.edgelist) matrix = None if args.matrix: print("matrix filename=" + args.matrix) matrix = utils.read_distance_matrix_file(args.matrix) if args.convert: print("convert=" + args.convert) if args.edgelist: edgelist['d'] = utils.similarity_to_distance(edgelist['d'], float(args.convert)) if args.matrix: matrix = utils.similarity_to_distance(matrix, float(args.convert)) if args.names: print("names filenname=" + args.names) print("clustering_algorithm=" + args.clustering_algorithm) if args.clustering_algorithm == 'hierarchical': clustering = hierarchical_clustering(edgelist=edgelist, distance_matrix=matrix) elif args.clustering_algorithm == 'dbscan': clustering = dbscan(edgelist=edgelist, distance_matrix=matrix) elif args.clustering_algorithm == 'spectral': clustering = spectral(edgelist=edgelist, distance_matrix=matrix) elif args.clustering_algorithm == 'agglomerative':
# Command-line driver: load the inputs, optionally convert similarities to
# distances, then dispatch to the selected clustering algorithm.
args = parse_arguments()

# Each input stays None unless its file name was given.
edgelist = None
if args.edgelist:
    print("edgelist filename=" + args.edgelist)
    edgelist = utils.read_edgelist_file(args.edgelist)

matrix = None
if args.matrix:
    print("matrix filename=" + args.matrix)
    matrix = utils.read_distance_matrix_file(args.matrix)

# args.convert is used both as a string (printed) and as the numeric cutoff
# for utils.similarity_to_distance.
if args.convert:
    print("convert=" + args.convert)
    if args.edgelist:
        # Only the 'd' entry of the edge list is converted.
        edgelist['d'] = utils.similarity_to_distance(
            edgelist['d'], float(args.convert))
    if args.matrix:
        matrix = utils.similarity_to_distance(matrix, float(args.convert))

if args.names:
    # Status message spelling corrected ("filenname" -> "filename").
    print("names filename=" + args.names)

print("clustering_algorithm=" + args.clustering_algorithm)
# NOTE(review): confirm the keyword names used below (edgelist=,
# distance_matrix=) match the signatures of the clustering functions.
# NOTE(review): in the statements visible here, `clustering` is not assigned
# when the algorithm name matches none of these branches.
if args.clustering_algorithm == 'hierarchical':
    clustering = hierarchical_clustering(edgelist=edgelist, distance_matrix=matrix)
elif args.clustering_algorithm == 'dbscan':
    clustering = dbscan(edgelist=edgelist, distance_matrix=matrix)
elif args.clustering_algorithm == 'hdbscan':
    clustering = hierarchical_dbscan(edgelist=edgelist, distance_matrix=matrix)
def test_similarity_to_distance():
    """Check utils.similarity_to_distance on a 3x3 matrix with cutoff 20.0.

    The expected values are the ones recorded by the original test: the
    off-diagonal entries equal 20.0 divided by the similarity, and the zero
    diagonal stays zero.
    """
    # The builtin ``float`` replaces the ``numpy.float`` alias, which no
    # longer exists in current NumPy releases.
    matrix = numpy.array([[0, 1, 2], [1, 0, 3], [2, 3, 0]], dtype=float)
    output = utils.similarity_to_distance(matrix, 20.0)
    expected = numpy.array([
        [0, 20.0, 10.0],
        [20.0, 0, 6.66666667],
        [10.0, 6.666666667, 0],
    ])
    assert output.shape == expected.shape
    # Compare the absolute difference against a small tolerance; a signed
    # difference checked against 1e4 would accept nearly any output.
    assert all(abs(output.ravel() - expected.ravel()) < 1e-4)