def main(args, outs):
    np.random.seed(args.random_seed)

    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)
    pca_mat = pca.transformed_pca_matrix

    # Subsample barcodes
    if args.num_bcs is not None:
        use_bcs = np.random.choice(pca_mat.shape[0], args.num_bcs, replace=False)
        matrix = matrix.select_barcodes(use_bcs)
        pca_mat = pca_mat[use_bcs, :]

    # Subset principal components
    if args.num_pcs is not None:
        pca_mat = pca_mat[:, np.arange(args.num_pcs)]

    kmeans = cr_kmeans.run_kmeans(pca_mat, args.n_clusters, random_state=args.random_seed)

    with cr_io.open_h5_for_writing(outs.kmeans_h5) as f:
        cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

    cr_clustering.save_clustering_csv(outs.kmeans_csv, clustering_key,
                                      kmeans.clusters, matrix.bcs)

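# Illustrative sketch (toy data, not pipeline code) of the subsampling
# pattern used above: np.random.choice with replace=False draws a unique
# subset of barcode indices, and the same index vector is applied to both
# the count matrix and the PCA projection so their rows stay aligned.
def _subsample_sketch():
    import numpy as np
    np.random.seed(0)
    toy_pca = np.random.randn(100, 10)              # 100 barcodes x 10 PCs
    use_bcs = np.random.choice(toy_pca.shape[0], 20, replace=False)
    toy_sub = toy_pca[use_bcs, :][:, np.arange(5)]  # 20 barcodes, first 5 PCs
    assert toy_sub.shape == (20, 5)
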
def main(args, outs):
    np.random.seed(0)

    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    matrix_bcs = cr_matrix.CountMatrix.load_bcs_from_h5_file(args.filtered_matrix)

    for method in args.factorization:
        transformed_matrix = args.transformed_matrix[method]
        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        file_head = CLUSTER_FILE_HEAD[method]
        _h5 = os.path.join(method_dir, file_head + ".h5")
        _csv = os.path.join(method_dir, file_head + "_csv")

        dr_mat = None
        if not os.path.exists(transformed_matrix):
            raise IOError('matrix does not exist')

        if method == 'pca':
            pca = cr_pca.load_pca_from_h5(transformed_matrix)
            dr_mat = pca.transformed_pca_matrix

        if method == 'lsa':
            lsa = cr_lsa.load_lsa_from_h5(transformed_matrix)
            # Small offset guards against zero rows; then L2-normalize each row
            lsa = lsa._replace(transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
            dr_mat = lsa.transformed_lsa_matrix / np.linalg.norm(
                lsa.transformed_lsa_matrix, axis=1, keepdims=True)

        if method == 'plsa':
            plsa = cr_plsa.load_plsa_from_h5(args.transformed_matrix[method])
            plsa = plsa._replace(transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
            dr_mat = plsa.transformed_plsa_matrix / np.linalg.norm(
                plsa.transformed_plsa_matrix, axis=1, keepdims=True)

        if args.num_dims is not None:
            if args.num_dims > dr_mat.shape[1]:
                raise ValueError('requested number of dimensions exceeds '
                                 'the dimensionality of the data')
            dr_mat = dr_mat[:, np.arange(args.num_dims)]

        kmeans = cr_kmeans.run_kmeans(dr_mat, args.n_clusters, random_state=args.random_seed)

        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

        cr_clustering.save_clustering_csv(_csv, clustering_key,
                                          kmeans.clusters, matrix_bcs)

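# Why the LSA/PLSA embeddings are row-normalized before k-means (sketch,
# not pipeline code): on unit vectors, squared Euclidean distance is
# monotone in cosine distance, so Lloyd's algorithm on the normalized rows
# behaves like spherical (cosine) k-means. The 1e-120 offset above simply
# keeps an all-zero row from producing a zero norm.
def _cosine_kmeans_sketch():
    import numpy as np
    x = np.random.randn(4, 3) + 1e-120
    xn = x / np.linalg.norm(x, axis=1, keepdims=True)
    a, b = xn[0], xn[1]
    # For unit vectors: ||a - b||^2 == 2 * (1 - cos(a, b))
    assert np.isclose(np.sum((a - b) ** 2), 2.0 * (1.0 - np.dot(a, b)))
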
def main(args, outs):
    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(args.clustering_h5,
                                                              args.clustering_key)

    diffexp = cr_diffexp.run_differential_expression(matrix, clustering.clusters)

    with cr_io.open_h5_for_writing(outs.diffexp_h5) as f:
        cr_diffexp.save_differential_expression_h5(f, args.clustering_key, diffexp)

    cr_diffexp.save_differential_expression_csv(args.clustering_key, diffexp,
                                                matrix, outs.diffexp_csv)

def join(args, outs, chunk_defs, chunk_outs):
    ctg_mgr = ReferenceManager(args.reference_path)
    species = ctg_mgr.list_species()
    if args.filtered_peak_bc_matrix is None or len(species) > 1:
        outs.enrichment_analysis = None
        outs.enrichment_analysis_summary = {}
        return

    peak_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(
        args.filtered_peak_bc_matrix)
    tf_matrix_features = cr_matrix.CountMatrix.load_feature_ref_from_h5_file(
        args.filtered_tf_bc_matrix) if args.filtered_tf_bc_matrix is not None else None
    outs.enrichment_analysis_summary = {'h5': {}, 'csv': {}}

    # For each method, merge the h5 files and copy the csv directories to one place
    cr_io.mkdir(outs.enrichment_analysis, allow_existing=True)
    for method in args.factorization:
        method_dir = os.path.join(outs.enrichment_analysis, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, '{}_enrichment_h5.h5'.format(method))
        outs.enrichment_analysis_summary['h5'][method] = _h5
        chunk_h5s = []

        _csv = os.path.join(method_dir, '{}_enrichment_csv'.format(method))
        outs.enrichment_analysis_summary['csv'][method] = _csv

        diffexp_prefixes = [(fr.id, fr.name) for fr in peak_matrix_features.feature_defs]
        if args.filtered_tf_bc_matrix is not None:
            diffexp_prefixes += [(fr.id, fr.name) for fr in tf_matrix_features.feature_defs]

        clustering_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(clustering_h5):
            # Collect the chunks for this clustering key, sorted by cluster index
            chunk_outs_def_method_clustering = sorted(
                [[chunk_out, chunk_def] for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                 if chunk_def.clustering_key == key],
                key=lambda x: x[1].cluster)
            chunk_outs_method_clustering = [c[0] for c in chunk_outs_def_method_clustering]

            # Load 1 vs rest tests in sorted order of chunks and combine into
            # one output per clustering
            diffexp = cr_diffexp.DIFFERENTIAL_EXPRESSION(
                np.hstack([np.loadtxt(com.tmp_diffexp, delimiter=',')[:, 0:3]
                           for com in chunk_outs_method_clustering]))

            # Write out h5
            chunk_h5 = martian.make_path('{}_enrichment_h5.h5'.format(key))
            with analysis_io.open_h5_for_writing(chunk_h5) as f:
                cr_diffexp.save_differential_expression_h5(f, key, diffexp)
            chunk_h5s += [chunk_h5]

            # Write out csv
            cr_diffexp.save_differential_expression_csv_from_features(
                key, diffexp, diffexp_prefixes, _csv)

        analysis_io.combine_h5_files(
            chunk_h5s, _h5,
            [analysis_constants.ANALYSIS_H5_DIFFERENTIAL_EXPRESSION_GROUP,
             analysis_constants.ANALYSIS_H5_MAP_DE[method]])

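# Shape of the merge performed above, on toy arrays (illustration only):
# each chunk's CSV holds the first three "1 vs rest" columns for one
# cluster, and np.hstack lays the per-cluster blocks side by side so row i
# remains feature i across every cluster.
def _diffexp_merge_sketch():
    import numpy as np
    n_features = 5
    chunk_a = np.random.rand(n_features, 3)   # e.g. cluster 1 vs rest
    chunk_b = np.random.rand(n_features, 3)   # e.g. cluster 2 vs rest
    combined = np.hstack([chunk_a, chunk_b])
    assert combined.shape == (n_features, 6)
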
def main(args, outs):
    if args.skip:
        return

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)

    # For now, only compute for gene expression features
    matrix = matrix.select_features_by_type(lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    clustering = SingleGenomeAnalysis.load_clustering_from_h5(args.clustering_h5,
                                                              args.clustering_key)

    diffexp = cr_diffexp.run_differential_expression(matrix, clustering.clusters)

    with analysis_io.open_h5_for_writing(outs.diffexp_h5) as f:
        cr_diffexp.save_differential_expression_h5(f, args.clustering_key, diffexp)

    cr_diffexp.save_differential_expression_csv(args.clustering_key, diffexp,
                                                matrix, outs.diffexp_csv)

def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    # Merge the neighbor matrices
    with LogPerf('merge_nn'):
        nn = cr_graphclust.merge_nearest_neighbors(
            [chunk.chunked_neighbors for chunk in chunk_outs],
            chunk_defs[0].total_rows)

    print 'nn\tnn_nodes\t%0.4f' % nn.shape[0]
    print 'nn\tnn_links\t%0.4f' % nn.nnz
    print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
    sys.stdout.flush()

    matrix_bin = martian.make_path('matrix.bin')
    matrix_weights = martian.make_path('matrix.weights')
    louvain_out = martian.make_path('louvain.out')

    if args.similarity_type == 'snn':
        snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs[0].k_nearest)

        print 'snn\tsnn_nodes\t%d' % snn.shape[0]
        print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
        print 'snn\tsnn_density\t%0.4f' % (snn.nnz / float(snn.shape[0] * (snn.shape[0] - 1)))
        sys.stdout.flush()

        with LogPerf('convert'):
            cr_graphclust.pipe_weighted_edgelist_to_convert(snn, matrix_bin, matrix_weights)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_weighted_clustering(matrix_bin, matrix_weights, louvain_out)

    else:
        with LogPerf('tocoo'):
            nn = nn.tocoo(copy=False)

        with LogPerf('convert'):
            cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

        with LogPerf('louvain'):
            cr_graphclust.run_louvain_unweighted_clustering(matrix_bin, louvain_out)

    with LogPerf('load_bcs'):
        barcodes = SingleGenomeAnalysis.load_bcs_from_matrix_h5(args.matrix_h5)

    use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs[0].use_bcs, 'use_bcs')

    labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs, louvain_out)

    labels = cr_clustering.relabel_by_size(labels)

    # Save cluster results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key, labels, barcodes)

    outs.chunked_neighbors = None

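# What relabel_by_size is expected to do, inferred from its name and use
# (assumption; the real implementation lives in cr_clustering): renumber
# clusters 1..K in decreasing order of size, leaving 0 (unused barcodes)
# untouched. A toy numpy version for illustration:
def _relabel_by_size_sketch(labels):
    import numpy as np
    sizes = np.bincount(labels)[1:]                # sizes of clusters 1..K
    order = np.argsort(-sizes)
    mapping = np.zeros(sizes.size + 1, dtype=int)  # mapping[0] stays 0
    mapping[1:][order] = np.arange(1, sizes.size + 1)
    return mapping[labels]
# e.g. _relabel_by_size_sketch(np.array([2, 2, 2, 1, 1, 3])) -> [1, 1, 1, 2, 2, 3]
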
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    np.random.seed(0)

    # Load the matrix
    mat = CountMatrix.load_h5_file(args.matrix_h5)
    print mat.m.shape, mat.m.nnz

    barcodes = mat.bcs

    # Load graph-based clustering from analysis H5
    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)
    clustering = SingleGenomeAnalysis.load_clustering_from_h5(args.clusters_h5, clustering_key)
    labels = clustering.clusters

    # Clusters that were 0 were unused in the clustering analysis
    # (only relevant if the cluster stage was run by itself)
    total_bcs = len(labels)
    use_bcs = np.flatnonzero(labels > 0)
    expr_mat = mat.m[:, use_bcs]

    # Make cluster labels 0-based
    labels = labels[use_bcs] - 1

    # Convert PCA coords to dataframe
    pca = SingleGenomeAnalysis.load_pca_from_h5(args.pca_h5).transformed_pca_matrix[use_bcs, :]
    pca_df = pd.DataFrame(pca)
    print pca_df.shape

    # 1) Run hierarchical clustering on cluster medoids in PCA-space
    # 2) For each pair of clusters that are sibling leaves,
    # 3)   Run a differential expression analysis
    # 4)   Merge the clusters if not enough genes are differentially expressed
    # 5) If merged, stop considering cluster-pairs and goto 1)

    # Cache already-checked cluster-pairs
    # set of (frozenset, frozenset)
    checked_cluster_pairs = set()

    while True:
        print resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        sys.stdout.flush()

        if len(np.bincount(labels)) == 1:
            # One cluster remains
            break

        # Compute medoids, perform hierarchical clustering
        pca_df['cluster'] = labels
        medoids = pca_df.groupby('cluster').apply(
            lambda x: x.median(axis=0)).as_matrix()[:, :-1]
        hc = linkage(medoids, 'complete')
        max_label = np.max(labels)

        print np.bincount(labels)
        print 'max=%d' % max_label

        any_merged = False
        for step in xrange(hc.shape[0]):
            if hc[step, 0] <= max_label and hc[step, 1] <= max_label:
                leaf0, leaf1 = hc[step, 0], hc[step, 1]

                group0 = np.flatnonzero(labels == leaf0)
                group1 = np.flatnonzero(labels == leaf1)

                # Skip this cluster pair if already checked
                set0 = frozenset(group0)
                set1 = frozenset(group1)
                cluster_pair = tuple(sorted([set0, set1]))
                if cluster_pair in checked_cluster_pairs:
                    continue
                checked_cluster_pairs.add(cluster_pair)

                print 'Comparing clusters (%d,%d)' % (1 + leaf0, 1 + leaf1)
                submat = expr_mat[:, np.concatenate((group0, group1))]

                print '\tComputing params on (%d,%d) matrix' % submat.shape
                params = compute_sseq_params(submat)

                print '\tRunning DE on %d vs %d cells' % (len(group0), len(group1))
                group0_submat = np.arange(len(group0))
                group1_submat = np.arange(len(group0), len(group0) + len(group1))
                de_result = sseq_differential_expression(
                    submat, group0_submat, group1_submat, params)

                n_de_genes = np.sum(de_result.adjusted_p_value < MERGE_CLUSTERS_DE_ADJ_P_THRESHOLD)
                if n_de_genes == 0:
                    print '\tFound %d DE genes. Merging clusters (%d,%d)' % (
                        n_de_genes, 1 + leaf0, 1 + leaf1)
                    # Relabel as the smaller-index cluster
                    labels[labels == leaf1] = leaf0
                    # Shift all labels above old label down
                    labels[labels > leaf1] = labels[labels > leaf1] - 1
                    any_merged = True
                    break

        sys.stdout.flush()

        if not any_merged:
            break

    # Convert back to one-based cluster labels
    labels += 1

    labels = cr_clustering.relabel_by_size(labels)

    # Convert back into original bc space, with 0s for unused bcs
    final_labels = np.zeros(total_bcs, dtype=int)
    final_labels[use_bcs] = labels

    # Save results
    with analysis_io.open_h5_for_writing(outs.clusters_h5) as f:
        cr_graphclust.save_graphclust_h5(f, final_labels)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

    cr_clustering.save_clustering_csv(outs.clusters_csv, clustering_key,
                                      final_labels, barcodes)

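# The merge-and-shift step above, in isolation (toy labels): reassigning
# leaf1's members to leaf0 and shifting every higher label down keeps the
# 0-based label space dense, so np.bincount and the next linkage round see
# contiguous cluster ids.
def _merge_relabel_sketch():
    import numpy as np
    labels = np.array([0, 1, 1, 2, 2, 3])
    leaf0, leaf1 = 1, 2                     # merge cluster 2 into cluster 1
    labels[labels == leaf1] = leaf0
    labels[labels > leaf1] = labels[labels > leaf1] - 1
    assert list(labels) == [0, 1, 1, 1, 1, 2]
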
def join(args, outs, chunk_defs, chunk_outs):
    if args.matrix_h5 is None:
        outs.graph_clustering_summary = {}
        return

    outs.graph_clustering_summary = {'h5': {}, 'csv': {}}

    # Merge the neighbor matrices
    for method in args.factorization:
        chunk_outs_def_method = [[chunk_out, chunk_def]
                                 for chunk_out, chunk_def in zip(chunk_outs, chunk_defs)
                                 if chunk_def.method == method]
        chunk_outs_method = [c[0] for c in chunk_outs_def_method]
        chunk_defs_method = [c[1] for c in chunk_outs_def_method]

        with LogPerf('merge_nn'):
            nn = cr_graphclust.merge_nearest_neighbors(
                [chunk.chunked_neighbors for chunk in chunk_outs_method],
                chunk_defs_method[0].total_rows)

        print 'nn\tnn_nodes\t%0.4f' % nn.shape[0]
        print 'nn\tnn_links\t%0.4f' % nn.nnz
        print 'nn\tnn_density\t%0.4f' % cr_graphclust.matrix_density(nn)
        sys.stdout.flush()

        matrix_bin = martian.make_path('matrix_{}.bin'.format(method))
        matrix_weights = martian.make_path('matrix_{}.weights'.format(method))
        louvain_out = martian.make_path('louvain_{}.out'.format(method))

        if args.similarity_type == 'snn':
            snn = cr_graphclust.compute_snn_matrix(nn, chunk_defs_method[0].k_nearest)

            print 'snn\tsnn_nodes\t%d' % snn.shape[0]
            print 'snn\tsnn_links\t%d' % (snn.nnz / 2)
            print 'snn\tsnn_density\t%0.4f' % (snn.nnz / float(snn.shape[0] * (snn.shape[0] - 1)))
            sys.stdout.flush()

            with LogPerf('convert'):
                cr_graphclust.pipe_weighted_edgelist_to_convert(snn, matrix_bin, matrix_weights)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_weighted_clustering(matrix_bin, matrix_weights, louvain_out)

        else:
            with LogPerf('tocoo'):
                nn = nn.tocoo(copy=False)

            with LogPerf('convert'):
                cr_graphclust.pipe_unweighted_edgelist_to_convert(nn, matrix_bin)

            with LogPerf('louvain'):
                cr_graphclust.run_louvain_unweighted_clustering(matrix_bin, louvain_out)

        with LogPerf('load_bcs'):
            barcodes = None
            with h5.File(args.matrix_h5, 'r') as f:
                group_name = f.keys()[0]
                barcodes = cr_matrix.CountMatrix.load_bcs_from_h5_group(f[group_name])

        use_bcs = cr_graphclust.load_ndarray_h5(chunk_defs_method[0].use_bcs, 'use_bcs')

        labels = cr_graphclust.load_louvain_results(len(barcodes), use_bcs, louvain_out)

        labels = cr_clustering.relabel_by_size(labels)

        # Save cluster results
        cr_io.mkdir(outs.knn_clusters, allow_existing=True)
        method_dir = os.path.join(outs.knn_clusters, method)
        cr_io.mkdir(method_dir, allow_existing=True)

        _h5 = os.path.join(method_dir, "clusters.h5")
        _csv = os.path.join(method_dir, "clusters_csv")

        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_graphclust.save_graphclust_h5(f, labels)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_GRAPHCLUST, 0)

        cr_clustering.save_clustering_csv(_csv, clustering_key, labels, barcodes)

        outs.graph_clustering_summary['h5'][method] = _h5
        outs.graph_clustering_summary['csv'][method] = _csv

    outs.chunked_neighbors = None

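# The density figures printed above, spelled out on a toy symmetric SNN
# matrix (illustration only): nnz counts both (i, j) and (j, i), so
# nnz / 2 is the undirected link count and nnz / (n * (n - 1)) is the
# fraction of possible ordered pairs that carry an edge.
def _snn_density_sketch():
    import numpy as np
    import scipy.sparse as sp
    n = 4
    snn = sp.csr_matrix(np.array([[0, 1, 1, 0],
                                  [1, 0, 0, 0],
                                  [1, 0, 0, 0],
                                  [0, 0, 0, 0]], dtype=float))
    assert snn.nnz / 2 == 2                              # 2 undirected links
    assert np.isclose(snn.nnz / float(n * (n - 1)), 4.0 / 12.0)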