import multiprocessing as mp

from collections import defaultdict
from itertools import chain, combinations, ifilter  # Python 2 itertools

import numpy as np
import pandas as pd
import scipy.sparse
import scipy.spatial

from sklearn.cluster import AffinityPropagation, SpectralClustering
from sklearn.metrics import silhouette_samples

# The helpers used below (term_processes, progressbar, _min,
# save_results_clusters, single_silhouette_dendrogram, calc_stability)
# and the `extra` utility module are assumed to be provided by the
# surrounding package.
def inverse_index_parallel(records):
    """Compute an inverse index of records, based on their V and J genes.

    The computation is parallelised across processes.

    Parameters
    ----------
    records : list of externals.DbCore.IgRecord
        Records must expose the setV and setJ attributes.

    Returns
    -------
    reverse_index : dict
        Inverse index. For each key (a string which represents a gene),
        it holds the list of indices of records which contain that gene.
        Example: reverse_index = {'IGHV3': [0, 3, 5], ...}
    """
    def _get_v_j_padding(lock, queue, idx, nprocs, igs_arr, n):
        local_dict = defaultdict(list)
        for i in range(idx, n, nprocs):
            ig = igs_arr[i]
            for _ in ig.setV:
                local_dict[_].append(i)
            for _ in ig.setJ:
                local_dict[_].append(i)
        with lock:
            queue.append(dict(local_dict))

    n = len(records)
    manager = mp.Manager()
    lock = mp.Lock()
    nprocs = min(n, mp.cpu_count())
    procs = []
    queue = manager.list()
    try:
        for idx in range(nprocs):
            p = mp.Process(target=_get_v_j_padding,
                           args=(lock, queue, idx, nprocs, records, n))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        extra.term_processes(procs, 'Exit signal received\n')
    except BaseException as e:
        extra.term_processes(procs, 'ERROR: %s\n' % e)

    # Merge the per-process partial indices, deduplicating shared keys.
    reverse_index = {}
    for dictionary in queue:
        for k, v in dictionary.items():
            reverse_index[k] = list(set(reverse_index.get(k, []) + v))
    return reverse_index
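
# A minimal usage sketch for inverse_index_parallel. IgRecord normally
# supplies setV / setJ; the namedtuple below is a hypothetical stand-in.
def _example_inverse_index_parallel():
    from collections import namedtuple
    Rec = namedtuple('Rec', ['setV', 'setJ'])
    records = [Rec(setV={'IGHV3'}, setJ={'IGHJ4'}),
               Rec(setV={'IGHV1'}, setJ={'IGHJ4'}),
               Rec(setV={'IGHV3'}, setJ={'IGHJ6'})]
    index = inverse_index_parallel(records)
    # Expected, up to list ordering:
    # {'IGHV3': [0, 2], 'IGHV1': [1], 'IGHJ4': [0, 1], 'IGHJ6': [2]}
    return index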
def dnearest_inter_padding(l1, l2, dist_function, filt=None, func=min):
    """Compute in a parallel way a dist2nearest for two 1-d arrays.

    Use this function with different arrays; if l1 == l2, the result is
    an array of zeros.

    Parameters
    ----------
    l1, l2 : array_like
        1-dimensional arrays. Compute the nearest element of l2 to l1.
    dist_function : function
        Function to use for the distance computation.
    filt : function or None, optional
        Filter based on the result of the distance function.
    func : function, optional, default: min (built-in function)
        Function to apply for selecting the best. Use min for distances,
        max for similarities (consider numpy variants for speed).

    Returns
    -------
    dist2nearest : array_like
        1-D array
    """
    def _internal(l1, l2, n, idx, nprocs, shared_arr, dist_function):
        for i in xrange(idx, n, nprocs):
            shared_arr[i] = _min(
                ifilter(filt, (dist_function(l1[i], el2) for el2 in l2)),
                func)

    n = len(l1)
    nprocs = min(mp.cpu_count(), n)
    shared_array = mp.Array('d', [0.] * n)
    procs = []
    try:
        for idx in xrange(nprocs):
            p = mp.Process(target=_internal,
                           args=(l1, l2, n, idx, nprocs, shared_array,
                                 dist_function))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)
    return shared_array
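
# Usage sketch for dnearest_inter_padding: a toy Hamming distance on
# equal-length strings (illustrative only; callers supply their own metric).
def _example_dnearest_inter_padding():
    hamming = lambda a, b: sum(x != y for x, y in zip(a, b))
    d2n = dnearest_inter_padding(['ACGT', 'ACGA', 'TTGA'],
                                 ['ACGG', 'TTTT'], hamming)
    # d2n[i] is the distance from the i-th element of the first array
    # to its nearest element of the second one, e.g. d2n[0] == 1.0
    return list(d2n)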
def dm_dense_intra_padding(l1, dist_function, condensed=False):
    """Compute in a parallel way a distance matrix for a 1-d array.

    Parameters
    ----------
    l1 : array_like
        1-dimensional array. The distance matrix is computed for each
        pair of its elements.
    dist_function : function
        Function to use for the distance computation.
    condensed : bool, optional
        If True, return the condensed form of the distance matrix.

    Returns
    -------
    dist_matrix : array_like
        Symmetric NxN distance matrix for each input_array element.
    """
    def _internal(l1, n, idx, nprocs, shared_arr, dist_function):
        for i in xrange(idx, n, nprocs):
            if i % 2 == 0:
                progressbar(i, n)
            for j in xrange(i + 1, n):
                shared_arr[i, j] = dist_function(l1[i], l1[j])

    n = len(l1)
    nprocs = min(mp.cpu_count(), n)
    shared_array = np.frombuffer(mp.Array('d', n * n).get_obj()).reshape(
        (n, n))
    procs = []
    try:
        for idx in xrange(nprocs):
            p = mp.Process(target=_internal,
                           args=(l1, n, idx, nprocs, shared_array,
                                 dist_function))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)
    progressbar(n, n)

    # Only the upper triangle is filled above; symmetrise it here.
    dist_matrix = shared_array + shared_array.T
    if condensed:
        dist_matrix = scipy.spatial.distance.squareform(dist_matrix)
    return dist_matrix
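
# Sketch: pairwise absolute differences of scalars, condensed output.
def _example_dm_dense_intra_padding():
    values = [0., 1., 3., 6.]
    dm = dm_dense_intra_padding(values, lambda a, b: abs(a - b),
                                condensed=True)
    # Condensed order (i < j): |0-1|, |0-3|, |0-6|, |1-3|, |1-6|, |3-6|
    return dm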
def multi_cut_dendrogram(dist_matrix, Z, threshold_arr, n, mode='clusters',
                         method='single', n_jobs=-1, sample_names=None):
    """Cut a dendrogram at some heights.

    Parameters
    ----------
    dist_matrix : array-like
        Precomputed distance matrix between points.
    Z : array-like
        Linkage matrix, result of scipy.cluster.hierarchy.linkage.
    threshold_arr : array-like
        One-dimensional array which contains the thresholds where to cut
        the dendrogram.
    n : int
        Length of threshold_arr.
    mode : ('clusters', 'thresholds'), optional
        Choose what to visualise on the x-axis.

    Returns
    -------
    queue_{x, y} : array-like
        The results to be visualised on a plot.
    """
    def _internal(dist_matrix, Z, threshold_arr, idx, n_jobs, arr_length,
                  queue_x, queue_y, mode='clusters', method='single',
                  sample_names=None):
        for i in range(idx, arr_length, n_jobs):
            queue_x[i], queue_y[i] = single_silhouette_dendrogram(
                dist_matrix, Z, threshold_arr[i], mode, method, sample_names)

    if n_jobs == -1:
        n_jobs = min(mp.cpu_count(), n)
    queue_x, queue_y = mp.Array('d', [0.] * n), mp.Array('d', [0.] * n)
    ps = []
    try:
        for idx in range(n_jobs):
            p = mp.Process(target=_internal,
                           args=(dist_matrix, Z, threshold_arr, idx, n_jobs,
                                 n, queue_x, queue_y, mode, method,
                                 sample_names))
            p.start()
            ps.append(p)
        for p in ps:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        extra.term_processes(ps, 'Exit signal received\n')
    except BaseException as e:
        extra.term_processes(ps, 'ERROR: %s\n' % e)
    return queue_x, queue_y
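
# Hypothetical driver for multi_cut_dendrogram: random distances, a
# single-linkage tree, five cut heights. Relies on the module helper
# single_silhouette_dendrogram being available.
def _example_multi_cut_dendrogram():
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform
    rnd = np.random.RandomState(0)
    d = rnd.rand(8, 8)
    d = (d + d.T) / 2.
    np.fill_diagonal(d, 0.)
    Z = linkage(squareform(d), method='single')
    thresholds = np.linspace(.1, .9, 5)
    qx, qy = multi_cut_dendrogram(d, Z, thresholds, len(thresholds),
                                  sample_names=list('ABCDEFGH'))
    return list(qx), list(qy)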
def dnearest_intra_padding(l1, dist_function, filt=None, func=min):
    """Compute in a parallel way a dist2nearest for a 1-d array.

    For each element in l1, find its closest neighbour (without
    considering itself).

    Parameters
    ----------
    l1 : array_like
        1-dimensional array.
    dist_function : function
        Function to use for the distance computation.
    filt : function or None, optional
        Filter based on the result of the distance function.
    func : function, optional, default: min (built-in function)
        Function to apply for selecting the best. Use min for distances,
        max for similarities.

    Returns
    -------
    dist2nearest : array_like
        1-D array
    """
    def _internal(l1, n, idx, nprocs, shared_arr, dist_function):
        for i in xrange(idx, n, nprocs):
            shared_arr[i] = _min(
                ifilter(filt, chain(
                    (dist_function(l1[i], l1[j]) for j in xrange(0, i)),
                    (dist_function(l1[i], l1[j]) for j in xrange(i + 1, n)))),
                func)

    n = len(l1)
    nprocs = min(mp.cpu_count(), n)
    shared_array = mp.Array('d', [0.] * n)
    procs = []
    try:
        for idx in xrange(nprocs):
            p = mp.Process(target=_internal,
                           args=(l1, n, idx, nprocs, shared_array,
                                 dist_function))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)
    return shared_array
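
# Sketch: nearest-neighbour distances inside a single scalar array.
def _example_dnearest_intra_padding():
    d2n = dnearest_intra_padding([0., 1., 5.], lambda a, b: abs(a - b))
    # Each entry skips the element itself: [1.0, 1.0, 4.0]
    return list(d2n)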
def dm_dense_inter_padding(l1, l2, dist_function, condensed=False):
    """Compute in a parallel way a distance matrix for two 1-d arrays.

    Parameters
    ----------
    l1, l2 : array_like
        1-dimensional arrays. The distance matrix is computed for each
        pair of elements of l1 and l2.
    dist_function : function
        Function to use for the distance computation.
    condensed : bool, optional
        If True, return the matrix flattened to a 1-D array.

    Returns
    -------
    dist_matrix : array_like
        len(l1) x len(l2) distance matrix.
    """
    def _internal(l1, l2, n, idx, nprocs, shared_arr, dist_function):
        for i in xrange(idx, n, nprocs):
            if i % 100 == 0:
                progressbar(i, n)
            shared_arr[i] = [dist_function(l1[i], el2) for el2 in l2]

    n, m = len(l1), len(l2)
    nprocs = min(mp.cpu_count(), n)
    shared_array = np.frombuffer(mp.Array('d', n * m).get_obj()).reshape(
        (n, m))
    procs = []
    try:
        for idx in xrange(nprocs):
            p = mp.Process(target=_internal,
                           args=(l1, l2, n, idx, nprocs, shared_array,
                                 dist_function))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)
    return shared_array.flatten() if condensed else shared_array
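
# Sketch: rectangular distance matrix between two scalar arrays.
def _example_dm_dense_inter_padding():
    dm = dm_dense_inter_padding([0., 2.], [1., 4., 6.],
                                lambda a, b: abs(a - b))
    # dm is 2x3: [[1., 4., 6.], [1., 2., 4.]]
    return dm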
def indicator_to_similarity(rows, cols, records, similarity_function):
    """Given positions on a sparse matrix, compute the similarities.

    Parameters
    ----------
    rows, cols : array_like
        Positions of records for which to calculate similarities.
    records : array_like
        Records to use for the computation.
    similarity_function : function
        Function to calculate similarities.

    Returns
    -------
    data : multiprocessing.Array
        Array of length len(rows) which contains the similarities among
        records as specified by rows and cols.
    """
    def _internal(data, rows, cols, n, records, idx, nprocs,
                  similarity_function):
        for i in range(idx, n, nprocs):
            data[i] = similarity_function(records[rows[i]], records[cols[i]])

    n = len(rows)
    nprocs = min(mp.cpu_count(), n)
    data = mp.Array('d', [0.] * n)
    procs = []
    try:
        for idx in range(nprocs):
            p = mp.Process(target=_internal,
                           args=(data, rows, cols, n, records, idx, nprocs,
                                 similarity_function))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        extra.term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        extra.term_processes(procs, 'ERROR: %s\n' % msg)
    return data
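
# Sketch: filling the data vector of a COO-style sparse similarity matrix.
def _example_indicator_to_similarity():
    records = ['ACGT', 'ACGA', 'TTGA']
    sim = lambda a, b: sum(x == y for x, y in zip(a, b)) / float(len(a))
    data = indicator_to_similarity([0, 0, 1], [1, 2, 2], records, sim)
    # data[k] == sim(records[rows[k]], records[cols[k]])
    return list(data)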
def dm_sparse_intra_padding(l1, dist_function, condensed=False):
    """Compute in a parallel way a sparse distance matrix for a 1-d array.

    Parameters
    ----------
    l1 : array_like
        1-dimensional array for which to compute the distance matrix.
    dist_function : function
        Function to use for the distance computation.
    condensed : bool, optional
        If True, return the condensed form of the distance matrix.

    Returns
    -------
    dist_matrix : array_like
        Sparse symmetric NxN distance matrix for each input_array element.
    """
    def _internal(l1, n, idx, nprocs, rows, cols, data, dist_function):
        for i in xrange(idx, n, nprocs):
            if i % 100 == 0:
                progressbar(i, n)
            for j in xrange(i + 1, n):
                _res = dist_function(l1[i], l1[j])
                if _res > 0:
                    # Index of the pair (i, j) in the condensed form.
                    c_idx = (n * (n - 1) / 2 - (n - i) * (n - i - 1) / 2
                             + j - i - 1)
                    data[c_idx] = _res
                    rows[c_idx] = i
                    cols[c_idx] = j

    n = len(l1)
    nprocs = min(mp.cpu_count(), n)
    c_length = int(n * (n - 1) / 2)
    data = mp.Array('d', [0.] * c_length)
    rows = mp.Array('d', [0.] * c_length)
    cols = mp.Array('d', [0.] * c_length)
    procs = []
    try:
        for idx in xrange(nprocs):
            process = mp.Process(target=_internal,
                                 args=(l1, n, idx, nprocs, rows, cols, data,
                                       dist_function))
            process.start()
            procs.append(process)
        for process in procs:
            process.join()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)

    # Keep only the entries actually written, then build the sparse matrix.
    data = np.array(data)
    idx = data > 0
    data = data[idx]
    rows = np.array(rows, dtype=int)[idx]
    cols = np.array(cols, dtype=int)[idx]
    D = scipy.sparse.csr_matrix((data, (rows, cols)), shape=(n, n))
    dist_matrix = D + D.T
    if condensed:
        # squareform needs a dense array; densify before converting.
        dist_matrix = scipy.spatial.distance.squareform(dist_matrix.toarray())
    progressbar(n, n)
    return dist_matrix
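
# Sketch: sparse pairwise distances where most pairs are zero.
def _example_dm_sparse_intra_padding():
    values = [0., 0., 3.]
    dm = dm_sparse_intra_padding(values, lambda a, b: abs(a - b))
    # Only the (0, 2) and (1, 2) pairs are non-zero; dm is a 3x3
    # scipy.sparse matrix with dm[0, 2] == dm[2, 0] == 3.0
    return dm.toarray()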
def sm_sparse(X, metric, tol):
    """Compute in a parallel way a sparse similarity matrix for an array.

    Only the pairs of records which share at least one V gene and whose
    junction lengths differ by at most tol are compared.

    Parameters
    ----------
    X : array_like
        1-dimensional array of records. Each record must expose the setV
        and junction_length attributes.
    metric : function
        Function to use for the similarity computation.
    tol : int
        Maximum allowed difference between junction lengths.

    Returns
    -------
    data, rows, cols : array_like
        Non-zero similarities and their positions, in COO format.
    """
    def _internal(X, metric, iterator, idx, return_queue):
        data = np.empty(0, dtype=float)
        rows = np.empty(0, dtype=int)
        cols = np.empty(0, dtype=int)
        append = np.append
        for i, j in iterator:
            res = metric(X[i], X[j])
            if res > 0:
                data = append(data, res)
                rows = append(rows, i)
                cols = append(cols, j)
        return_queue.put((data, rows, cols), False)
        return_queue.put(None, True)

    def _internal_deque(X, metric, iterator, idx, return_queue):
        # Variant of _internal which accumulates in a deque and converts
        # to numpy arrays once, avoiding repeated np.append reallocations.
        from collections import deque
        deq = deque()
        appendleft = deq.appendleft
        popleft = deq.popleft
        for i, j in iterator:
            res = metric(X[i], X[j])
            if res > 0:
                appendleft((res, i, j))
        len_d = len(deq)
        data = np.empty(len_d, dtype=float)
        rows = np.empty(len_d, dtype=int)
        cols = np.empty(len_d, dtype=int)
        for i in xrange(len_d):
            res = popleft()
            data[i] = res[0]
            rows[i] = res[1]
            cols[i] = res[2]
        return_queue.put((data, rows, cols), False)
        return_queue.put(None, True)

    n = X.shape[0]
    nprocs = min(mp.cpu_count(), n)

    # Pre-filter the pairs: only records sharing a V gene and with
    # compatible junction lengths are worth comparing.
    iterator = list(
        (i, j) for i, j in combinations(xrange(n), 2)
        if len(X[i].setV & X[j].setV) > 0 and
        abs(X[i].junction_length - X[j].junction_length) <= tol)

    len_it = len(iterator)
    procs = []
    manager = mp.Manager()
    return_queue = manager.Queue()
    data = np.empty(0, dtype=float)
    rows = np.empty(0, dtype=int)
    cols = np.empty(0, dtype=int)
    try:
        for idx in xrange(nprocs):
            # Hand each worker a contiguous slice of the pair list.
            num_elem = int(len_it / nprocs) + 1
            itera = iterator[:num_elem]
            iterator = iterator[num_elem:]
            p = mp.Process(target=_internal_deque,
                           args=(X, metric, itera, idx, return_queue))
            p.start()
            procs.append(p)

        # Collect partial results; each worker ends its stream with None.
        count = 0
        while count < nprocs:
            v = return_queue.get(True)
            if v is None:
                count += 1
                continue
            data = np.hstack((data, v[0]))
            rows = np.hstack((rows, v[1]))
            cols = np.hstack((cols, v[2]))
        for p in procs:
            p.join()
        assert return_queue.empty()
    except (KeyboardInterrupt, SystemExit):
        term_processes(procs, 'Exit signal received\n')
    except BaseException as msg:
        term_processes(procs, 'ERROR: %s\n' % msg)
    return data, rows, cols
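
# Sketch for sm_sparse with stub records (IgRecord normally provides
# setV and junction_length; the namedtuple here is hypothetical).
def _example_sm_sparse():
    from collections import namedtuple
    Rec = namedtuple('Rec', ['setV', 'junction_length'])
    X = np.array([Rec({'IGHV3'}, 12), Rec({'IGHV3'}, 13),
                  Rec({'IGHV1'}, 12)], dtype=object)
    sim = lambda a, b: 1. / (1 + abs(a.junction_length - b.junction_length))
    data, rows, cols = sm_sparse(X, sim, tol=1)
    # Only the (0, 1) pair shares a V gene within tol; expect one entry.
    return data, rows, cols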
def multi_cut_ap(preferences, affinity_matrix, dist_matrix, n_jobs=-1,
                 sample_names=None):
    """Perform affinity propagation with a list of preference values.

    Parameters
    ----------
    preferences : array-like
        Contains the list of preference values to use at each step.
    affinity_matrix : array-like
        Precomputed affinity matrix.
    dist_matrix : array-like
        Precomputed distance matrix between points.

    Returns
    -------
    queue_y : array-like
        Array to be visualised on the y-axis. Contains the list of average
        silhouettes, one for each preference in preferences.
    """
    def _internal(preferences, affinity_matrix, dist_matrix, idx, n_jobs, n,
                  queue_y):
        for i in range(idx, n, n_jobs):
            ap = AffinityPropagation(preference=preferences[i],
                                     affinity='precomputed', max_iter=500)
            ap.fit(affinity_matrix)
            cluster_labels = ap.labels_.copy()
            nclusts = np.unique(cluster_labels).shape[0]
            save_results_clusters("res_ap_{:03d}_clust.csv".format(nclusts),
                                  sample_names, ap.labels_)
            if nclusts > 1:
                try:
                    silhouette_list = silhouette_samples(
                        dist_matrix, ap.labels_, metric="precomputed")
                    queue_y[i] = np.mean(silhouette_list)
                except BaseException:
                    print(dist_matrix.shape, ap.labels_.shape)

    n = len(preferences)
    if n_jobs == -1:
        n_jobs = min(mp.cpu_count(), n)
    queue_y = mp.Array('d', [0.] * n)
    ps = []
    try:
        for idx in range(n_jobs):
            p = mp.Process(target=_internal,
                           args=(preferences, affinity_matrix, dist_matrix,
                                 idx, n_jobs, n, queue_y))
            p.start()
            ps.append(p)
        for p in ps:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        extra.term_processes(ps, 'Exit signal received\n')
    except BaseException as e:
        extra.term_processes(ps, 'ERROR: %s\n' % e)
    return queue_y
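
# Hypothetical driver for multi_cut_ap on a random affinity matrix.
# Note: each fit writes a res_ap_*_clust.csv file as a side effect.
def _example_multi_cut_ap():
    rnd = np.random.RandomState(0)
    aff = rnd.rand(10, 10)
    aff = (aff + aff.T) / 2.
    dist = 1. - aff
    np.fill_diagonal(dist, 0.)
    prefs = np.median(aff) * np.array([.5, 1., 2.])
    scores = multi_cut_ap(prefs, aff, dist,
                          sample_names=['s%d' % i for i in range(10)])
    return list(scores)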
def multi_cut_spectral(cluster_list, affinity_matrix, dist_matrix, n_jobs=-1,
                       sample_names=None):
    """Perform a spectral clustering with variable cluster sizes.

    Parameters
    ----------
    cluster_list : array-like
        Contains the list of the number of clusters to use at each step.
    affinity_matrix : array-like
        Precomputed affinity matrix.
    dist_matrix : array-like
        Precomputed distance matrix between points.

    Returns
    -------
    queue_y : array-like
        Array to be visualised on the y-axis. Contains the list of average
        silhouettes, one for each number of clusters in cluster_list.
    """
    def _internal(cluster_list, affinity_matrix, dist_matrix, idx, n_jobs, n,
                  queue_y):
        for i in range(idx, n, n_jobs):
            sp = SpectralClustering(n_clusters=cluster_list[i],
                                    affinity='precomputed',
                                    norm_laplacian=True, n_init=1000)
            sp.fit(affinity_matrix)
            save_results_clusters(
                "res_spectral_{:03d}_clust.csv".format(cluster_list[i]),
                sample_names, sp.labels_)

            # Stability check: map each cluster to its sample ids, then
            # compare against hierarchical results with a similar number
            # of clusters, when those CSV files are available.
            cluster_labels = sp.labels_.copy()
            ids = np.array(sample_names)
            clusters = {}
            for _ in np.unique(cluster_labels):
                clusters[_] = ids[np.where(cluster_labels == _)]
            for _aux_i in range(cluster_list[i] - 2, cluster_list[i] + 2):
                try:
                    df = pd.read_csv(
                        'res_hierarchical_{:03d}_clust.csv'.format(_aux_i),
                        header=None)
                except IOError:
                    continue
                other_clusts = [tuple(_df[1][0])
                                for _df in df.sort_values(1).groupby(1)]
                calc_stability(clusters.copy(), other_clusts)

            silhouette_list = silhouette_samples(dist_matrix, sp.labels_,
                                                 metric="precomputed")
            queue_y[i] = np.mean(silhouette_list)

    n = len(cluster_list)
    if n_jobs == -1:
        n_jobs = min(mp.cpu_count(), n)
    queue_y = mp.Array('d', [0.] * n)
    ps = []
    try:
        for idx in range(n_jobs):
            p = mp.Process(target=_internal,
                           args=(cluster_list, affinity_matrix, dist_matrix,
                                 idx, n_jobs, n, queue_y))
            p.start()
            ps.append(p)
        for p in ps:
            p.join()
    except (KeyboardInterrupt, SystemExit):
        extra.term_processes(ps, 'Exit signal received\n')
    except BaseException as e:
        extra.term_processes(ps, 'ERROR: %s\n' % e)
    return queue_y
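
# Hypothetical driver for multi_cut_spectral on a random affinity matrix.
# Small but slow because of n_init=1000; each fit also writes a
# res_spectral_*_clust.csv file as a side effect.
def _example_multi_cut_spectral():
    rnd = np.random.RandomState(0)
    aff = rnd.rand(12, 12)
    aff = (aff + aff.T) / 2.
    dist = 1. - aff
    np.fill_diagonal(dist, 0.)
    scores = multi_cut_spectral([2, 3, 4], aff, dist,
                                sample_names=['s%d' % i for i in range(12)])
    # scores[k] is the mean silhouette obtained with cluster_list[k]
    return list(scores)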