def main(args, outs):
    np.random.seed(0)

    if args.skip:
        return

    with LogPerf('submatrix_load'):
        submatrix = cr_graphclust.load_ndarray_h5(args.submatrix, 'submatrix')

    with LogPerf('nn_idx_load'):
        balltree = cr_graphclust.load_neighbor_index(args.neighbor_index)

    with LogPerf('nn_query'):
        nn_matrix = cr_graphclust.compute_nearest_neighbors(
            submatrix, balltree, args.k_nearest, args.row_start)
        cr_graphclust.write_nearest_neighbors(nn_matrix, outs.chunked_neighbors)

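# Illustrative sketch only, not called by the pipeline: roughly what the
# build/load/query steps above could look like with a sklearn BallTree over a
# row-sliced query chunk. The real logic lives in cr_graphclust
# (build_neighbor_index / compute_nearest_neighbors) and may differ in details
# such as self-match handling and output dtype.
def _nn_query_sketch():
    import numpy as np
    import scipy.sparse as sp_sparse
    from sklearn.neighbors import BallTree

    x = np.random.RandomState(0).randn(50, 5)
    balltree = BallTree(x, leaf_size=40)

    k_nearest, row_start, row_end = 3, 10, 20
    # Query k+1 neighbors and drop the self-match in column 0
    _, ind = balltree.query(x[row_start:row_end], k=k_nearest + 1)
    ind = ind[:, 1:]

    rows = np.repeat(np.arange(row_start, row_end), k_nearest)
    cols = ind.ravel()
    nn_chunk = sp_sparse.coo_matrix(
        (np.ones(len(rows)), (rows, cols)), shape=(x.shape[0], x.shape[0]))
    assert nn_chunk.nnz == (row_end - row_start) * k_nearest
    return nn_chunk
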
def compute_snn_matrix(nn, k_nearest):
    """ Compute shared-nearest-neighbor matrix from a nearest-neighbor boolean matrix """
    with LogPerf('tocsr'):
        nn = nn.tocsr(copy=False)

    # The SNN (shared nearest neighbor) similarity is
    # the size of the nearest-neighbor intersection between two rows
    # (divided by the max number of neighbors).
    # This can be computed via the dot products of rows in the boolean NN matrix.
    with LogPerf('snn'):
        snn = (nn.dot(nn.T)) / float(k_nearest)

    # Use the SNN similarity in the modularity optimization algorithm.
    # Louvain takes a text edge-list and converts it to its own binary format.
    with LogPerf('tocoo'):
        snn = snn.tocoo(copy=False)

    return snn

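# Illustrative sketch only, not called by the pipeline: why the row dot
# products above count shared neighbors. The 0/1 matrix and k_nearest value
# below are hypothetical.
def _snn_toy_example():
    import numpy as np
    import scipy.sparse as sp_sparse

    k_nearest = 2
    # Row i has a 1 in column j if j is one of i's k nearest neighbors
    nn = sp_sparse.csr_matrix(
        np.array([[0, 1, 1, 0],
                  [1, 0, 1, 0],
                  [1, 1, 0, 0],
                  [1, 0, 1, 0]], dtype=np.float64))
    snn = (nn.dot(nn.T) / float(k_nearest)).tocsr()

    # Rows 1 and 3 share both of their neighbors (0 and 2) -> similarity 1.0
    assert snn[1, 3] == 1.0
    # Rows 0 and 1 share one neighbor (2) -> similarity 0.5
    assert snn[0, 1] == 0.5
    return snn
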
def split(args):
    np.random.seed(0)

    if args.skip:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if args.similarity_type not in SIMILARITY_TYPES:
        martian.exit("Unsupported similarity type: %s. Must be one of: %s" %
                     (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    with LogPerf('load'):
        pca_mat = SingleGenomeAnalysis.load_pca_from_h5(
            args.pca_h5).transformed_pca_matrix

    # Subselect barcodes if desired
    if args.num_bcs is None:
        use_bcs = np.arange(pca_mat.shape[0])
    else:
        use_bcs = np.random.choice(pca_mat.shape[0], args.num_bcs, replace=False)
        pca_mat = pca_mat[use_bcs, :]

    # Record indices of selected barcodes
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Subselect PCs if desired
    if args.input_pcs is not None:
        n_pcs = min(pca_mat.shape[1], args.input_pcs)
        pca_mat = pca_mat[:, np.arange(n_pcs)]

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        balltree = cr_graphclust.build_neighbor_index(
            pca_mat, args.balltree_leaf_size or DEFAULT_BALLTREE_LEAFSIZE)
        neighbor_index = martian.make_path('neighbor_index.pickle')
        cr_graphclust.save_neighbor_index(balltree, neighbor_index)

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a + given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide the PCA matrix up into rows for NN queries
    with LogPerf('chunk_pca'):
        chunks = []
        for row_start in xrange(0, pca_mat.shape[0], NN_QUERIES_PER_CHUNK):
            row_end = min(row_start + NN_QUERIES_PER_CHUNK, pca_mat.shape[0])

            # Write the pca submatrix to an h5 file
            submatrix_path = martian.make_path('%d_submatrix.h5' % row_start)
            cr_graphclust.save_ndarray_h5(pca_mat[row_start:row_end, :],
                                          submatrix_path, 'submatrix')

            chunks.append({
                'neighbor_index': neighbor_index,
                'submatrix': submatrix_path,
                'row_start': row_start,
                'total_rows': pca_mat.shape[0],
                'k_nearest': num_neighbors,
                'use_bcs': use_bcs_path,
            })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil((num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem
        # oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }

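# Illustrative sketch only, not called by the pipeline: the neighbor-count
# heuristic used in split() above, max(num_neighbors, a + b*log10(n)) clamped
# to [1, n-1]. The constants below are hypothetical stand-ins, not the values
# in analysis_constants.
def _neighbor_count_toy_example():
    import numpy as np

    n_cells = 10000
    given_num_neighbors = 15  # hypothetical stand-in
    given_neighbor_a = 5.0    # hypothetical stand-in
    given_neighbor_b = 10.0   # hypothetical stand-in

    use_neighbors = int(
        max(given_num_neighbors,
            np.round(given_neighbor_a + given_neighbor_b * np.log10(n_cells))))
    num_neighbors = max(1, min(use_neighbors, n_cells - 1))

    # 5 + 10*log10(10000) = 45 > 15, so the log-scaled term wins here
    assert num_neighbors == 45
    return num_neighbors
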
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    # Merge the neighbor matrices
    with LogPerf('merge_nn'):
        nn = cr_graphclust.merge_nearest_neighbors(
            [chunk.chunked_neighbors for chunk in chunk_outs],
            chunk_defs[0].total_rows)
    print 'nn\tnn_nodes\t%0.4f' % nn.shape[0]
    print 'nn\tnn_links\t%0.4f' % nn.nnz
    print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
    sys.stdout.flush()

    matrix_bin = martian.make_path('matrix.bin')
    matrix_weights = martian.make_path('matrix.weights')
    louvain_out = martian.make_path('louvain.out')

    if args.similarity_type == 'snn':
        snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs[0].k_nearest)

        print 'snn\tsnn_nodes\t%d' % snn.shape[0]
        print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
        print 'snn\tsnn_density\t%0.4f' % (
            (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
        sys.stdout.flush()

        with LogPerf('convert'):
            cr_graphclust.pipe_weighted_edgelist_to_convert(
                snn, matrix_bin, matrix_weights)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_weighted_clustering(
                matrix_bin, matrix_weights, louvain_out)

    else:
        with LogPerf('tocoo'):
            nn = nn.tocoo(copy=False)

        with LogPerf('convert'):
            cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_unweighted_clustering(
                matrix_bin, louvain_out)

    with LogPerf('load_bcs'):
        barcodes = SingleGenomeAnalysis.load_bcs_from_matrix_h5(args.matrix_h5)

    use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs[0].use_bcs, 'use_bcs')

    labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                louvain_out)

    labels = cr_clustering.relabel_by_size(labels)

    # Save cluster results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      labels, barcodes)

    outs.chunked_neighbors = None

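# Illustrative sketch only, not called by the pipeline: one way a symmetric
# COO similarity matrix can be flattened into a "src dst weight" text edge
# list of the kind the Louvain convert step consumes. The actual format and
# streaming are handled by cr_graphclust.pipe_weighted_edgelist_to_convert and
# may differ from this sketch.
def _weighted_edgelist_sketch():
    import scipy.sparse as sp_sparse

    snn = sp_sparse.coo_matrix(
        ([0.5, 0.5, 1.0], ([0, 1, 1], [1, 0, 2])), shape=(3, 3))
    lines = []
    for i, j, w in zip(snn.row, snn.col, snn.data):
        if i < j:  # keep each undirected edge once
            lines.append('%d\t%d\t%f' % (i, j, w))
    assert lines == ['0\t1\t0.500000', '1\t2\t1.000000']
    return lines
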
def split(args):
    np.random.seed(0)

    if args.matrix_h5 is None:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if not os.path.exists(args.reduced_data):
        raise IOError('reduced data not found at {}'.format(args.reduced_data))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    if args.similarity_type not in SIMILARITY_TYPES:
        raise ValueError(
            'Unsupported similarity type: %s. Must be one of: %s' %
            (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    reduction_summary = args.reduction_summary['h5']

    method_dict = {}
    for method in args.factorization:
        method_dict[method] = {}

    with LogPerf('load'):
        for method in args.factorization:
            if method == 'pca':
                method_dict[method][
                    'transformed_matrix'] = cr_pca.load_pca_from_h5(
                        reduction_summary[method]).transformed_pca_matrix
            if method == 'lsa':
                method_dict[method][
                    'transformed_matrix'] = cr_lsa.load_lsa_from_h5(
                        reduction_summary[method]).transformed_lsa_matrix
            if method == 'plsa':
                method_dict[method][
                    'transformed_matrix'] = cr_plsa.load_plsa_from_h5(
                        reduction_summary[method]).transformed_plsa_matrix

    # Record indices of selected barcodes. All methods must use the same barcodes.
    use_bcs = np.arange(
        method_dict[args.factorization[0]]['transformed_matrix'].shape[0])
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            # Normalize rows for lsa/plsa so that Euclidean nearest neighbors in
            # the normalized space correspond to cosine-distance nearest
            # neighbors in the original space
            if method in ['plsa', 'lsa']:
                method_mat = method_mat / np.linalg.norm(
                    method_mat, axis=1, keepdims=True)
            balltree = cr_graphclust.build_neighbor_index(
                method_mat, args.balltree_leaf_size or DEFAULT_BALLTREE_LEAFSIZE)
            method_dict[method]['neighbor_index'] = martian.make_path(
                'neighbor_index_{}.pickle'.format(method))
            cr_graphclust.save_neighbor_index(
                balltree, method_dict[method]['neighbor_index'])

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(
        max(
            given_num_neighbors,
            np.round(given_neighbor_a + given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide each transformed matrix up into rows for NN queries
    with LogPerf('chunk_matrix'):
        chunks = []
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            for row_start in xrange(0, method_mat.shape[0], NN_QUERIES_PER_CHUNK):
                row_end = min(row_start + NN_QUERIES_PER_CHUNK, method_mat.shape[0])

                # Write the submatrix to an h5 file
                submatrix_path = martian.make_path('{}_{}_submatrix.h5'.format(
                    method, row_start))
                cr_graphclust.save_ndarray_h5(method_mat[row_start:row_end, :],
                                              submatrix_path, 'submatrix')

                chunks.append({
                    'method': method,
                    'neighbor_index': method_dict[method]['neighbor_index'],
                    'submatrix': submatrix_path,
                    'row_start': row_start,
                    'total_rows': method_mat.shape[0],
                    'k_nearest': num_neighbors,
                    'use_bcs': use_bcs_path,
                })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with size of nearest-neighbor adjacency matrix
        join_mem_gb = max(
            h5_constants.MIN_MEM_GB,
            int(np.ceil((num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem
        # oversubscription on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }

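# Illustrative sketch only, not called by the pipeline: the memory scaling for
# the non-SNN join branch above. Both constants below are hypothetical
# stand-ins for h5_constants.MIN_MEM_GB and NN_ENTRIES_PER_MEM_GB.
def _join_mem_estimate_toy_example():
    import numpy as np

    min_mem_gb = 1                     # hypothetical stand-in
    nn_entries_per_mem_gb = 5000000.0  # hypothetical stand-in
    num_neighbors = 45
    num_barcodes = 200000

    join_mem_gb = max(
        min_mem_gb,
        int(np.ceil((num_neighbors * num_barcodes) / nn_entries_per_mem_gb)))
    # 9e6 adjacency entries / 5e6 entries per GB -> ceil(1.8) = 2 GB
    assert join_mem_gb == 2
    return join_mem_gb
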
def join(args, outs, chunk_defs, chunk_outs):
    if args.matrix_h5 is None:
        outs.graph_clustering_summary = {}
        return

    outs.graph_clustering_summary = {'h5': {}, 'csv': {}}

    # Merge the neighbor matrices
    for method in args.factorization:
        chunk_outs_def_method = [[chunk_out, chunk_def]
                                 for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                 if chunk_def.method == method]
        chunk_outs_method = [c[0] for c in chunk_outs_def_method]
        chunk_defs_method = [c[1] for c in chunk_outs_def_method]

        with LogPerf('merge_nn'):
            nn = cr_graphclust.merge_nearest_neighbors(
                [chunk.chunked_neighbors for chunk in chunk_outs_method],
                chunk_defs_method[0].total_rows)
        print 'nn\tnn_nodes\t%0.4f' % nn.shape[0]
        print 'nn\tnn_links\t%0.4f' % nn.nnz
        print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
        sys.stdout.flush()

        matrix_bin = martian.make_path('matrix_{}.bin'.format(method))
        matrix_weights = martian.make_path('matrix_{}.weights'.format(method))
        louvain_out = martian.make_path('louvain_{}.out'.format(method))

        if args.similarity_type == 'snn':
            snn = cr_graphclust.compute_snn_matrix(
                nn, chunk_defs_method[0].k_nearest)

            print 'snn\tsnn_nodes\t%d' % snn.shape[0]
            print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
            print 'snn\tsnn_density\t%0.4f' % (
                (snn.nnz) / float(snn.shape[0] * (snn.shape[0] - 1)))
            sys.stdout.flush()

            with LogPerf('convert'):
                cr_graphclust.pipe_weighted_edgelist_to_convert(
                    snn, matrix_bin, matrix_weights)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_weighted_clustering(
                    matrix_bin, matrix_weights, louvain_out)

        else:
            with LogPerf('tocoo'):
                nn = nn.tocoo(copy=False)

            with LogPerf('convert'):
                cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_unweighted_clustering(
                    matrix_bin, louvain_out)

        with LogPerf('load_bcs'):
            barcodes = None
            with h5.File(args.matrix_h5, 'r') as f:
                group_name = f.keys()[0]
                barcodes = cr_matrix.CountMatrix.load_bcs_from_h5_group(
                    f[group_name])

        use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs_method[0].use_bcs,
                                                'use_bcs')

        labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs,
                                                    louvain_out)

        labels = cr_clustering.relabel_by_size(labels)

        # Save cluster results
        cr_io.mkdir(outs.knn_clusters, allow_existing=True)
        method_dir = os.path.join(outs.knn_clusters, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, "clusters.h5")
        _csv = os.path.join(method_dir, "clusters_csv")
        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_graphclust.save_graphclust_h5(f, labels)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

        cr_clustering.save_clustering_csv(_csv, clustering_key, labels, barcodes)

        outs.graph_clustering_summary['h5'][method] = _h5
        outs.graph_clustering_summary['csv'][method] = _csv

    outs.chunked_neighbors = None

def main(args, outs):
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)][cr_mol_counter.
                          DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # Downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        # Draw a new read-pair count per molecule; keep only molecules that
        # retain at least one read pair
        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert (min(gem_groups) == 1 and
                np.all(np.diff(np.array(gem_groups, dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(args.gem_group_barcode_ranges.iteritems(),
                                        key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]
        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices.
        # The molecule info barcode_idx is in this space:
        # [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        # [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.
        # Convert back to an index into the original whitelist for the molecule's gem group
        mol_barcode_idx -= gg_barcode_idx_start.astype(np.uint64)[mol_gem_group]
        # Offset by the cumulative whitelist length up to a barcode's gem group
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        ones = np.ones(len(mol_barcode_idx), dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcode strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)
        print 'creating barcode strings'
        LogPerf.mem()

        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        read_summary = {}
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()

        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary),
                      f,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None

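# Illustrative sketch only, not called by the pipeline: the barcode index
# arithmetic above on a tiny hypothetical layout. Gem group 1 uses a
# 3-barcode whitelist at molecule-info indices [0, 3); gem group 2 uses a
# 5-barcode whitelist at molecule-info indices [10, 15). Matrix columns are
# the concatenated whitelists [A0, A1, A2, B0, B1, B2, B3, B4].
def _barcode_index_remap_toy_example():
    import numpy as np

    gg_barcode_idx_start = np.array([0, 0, 10], dtype=np.uint64)
    gg_barcode_idx_len = np.array([0, 3, 5], dtype=np.uint64)
    gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(np.uint64)

    mol_gem_group = np.array([1, 2, 2])
    mol_barcode_idx = np.array([1, 10, 12], dtype=np.uint64)

    # Back to a whitelist-local index, then offset by the cumulative length of
    # all whitelists for earlier gem groups
    mol_barcode_idx -= gg_barcode_idx_start[mol_gem_group]
    mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

    # A1 -> column 1, B0 -> column 3, B2 -> column 5
    assert mol_barcode_idx.tolist() == [1, 3, 5]
    return mol_barcode_idx
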