def main(args, outs):
    np.random.seed(args.random_seed)

    if args.skip or args.is_multi_genome:
        return

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)
    pca_mat = pca.transformed_pca_matrix

    # Subsample barcodes
    if args.num_bcs is not None:
        use_bcs = np.random.choice(pca_mat.shape[0], args.num_bcs, replace=False)
        matrix = matrix.select_barcodes(use_bcs)
        pca_mat = pca_mat[use_bcs, :]

    # Subset principal components
    if args.num_pcs is not None:
        pca_mat = pca_mat[:, np.arange(args.num_pcs)]

    kmeans = cr_kmeans.run_kmeans(pca_mat, args.n_clusters, random_state=args.random_seed)

    with cr_io.open_h5_for_writing(outs.kmeans_h5) as f:
        cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

    clustering_key = cr_clustering.format_clustering_key(
        cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

    cr_clustering.save_clustering_csv(outs.kmeans_csv, clustering_key,
                                      kmeans.clusters, matrix.bcs)
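# Illustrative sketch only (assumption, not the pipeline's cr_kmeans module):
# run_kmeans is assumed to behave like scikit-learn's KMeans fit on the
# barcodes-x-PCs matrix, returning one cluster label per barcode. This helper
# is not called anywhere; it just shows the shape of the computation.
def _kmeans_sketch(pca_mat, n_clusters, random_state=0):
    from sklearn.cluster import KMeans
    km = KMeans(n_clusters=n_clusters, random_state=random_state)
    # Array of length num_barcodes with labels in 0..n_clusters-1
    return km.fit_predict(pca_mat)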
def main(args, outs):
    np.random.seed(0)

    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.clustered_data):
        cr_io.mkdir(outs.clustered_data)

    matrix_bcs = cr_matrix.CountMatrix.load_bcs_from_h5_file(args.filtered_matrix)

    for method in args.factorization:
        transformed_matrix = args.transformed_matrix[method]
        method_dir = os.path.join(outs.clustered_data, method)
        cr_io.mkdir(method_dir, allow_existing=True)
        file_head = CLUSTER_FILE_HEAD[method]
        _h5 = os.path.join(method_dir, file_head + ".h5")
        _csv = os.path.join(method_dir, file_head + "_csv")

        dr_mat = None
        if not os.path.exists(transformed_matrix):
            raise IOError('Transformed matrix not found: {}'.format(transformed_matrix))

        if method == 'pca':
            pca = cr_pca.load_pca_from_h5(transformed_matrix)
            dr_mat = pca.transformed_pca_matrix
        if method == 'lsa':
            lsa = cr_lsa.load_lsa_from_h5(transformed_matrix)
            lsa = lsa._replace(transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
            # Unit-normalize rows so that Euclidean distance in the transformed space
            # corresponds to cosine distance in the original space
            dr_mat = lsa.transformed_lsa_matrix / np.linalg.norm(
                lsa.transformed_lsa_matrix, axis=1, keepdims=True)
        if method == 'plsa':
            plsa = cr_plsa.load_plsa_from_h5(transformed_matrix)
            plsa = plsa._replace(transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
            # Unit-normalize rows so that Euclidean distance in the transformed space
            # corresponds to cosine distance in the original space
            dr_mat = plsa.transformed_plsa_matrix / np.linalg.norm(
                plsa.transformed_plsa_matrix, axis=1, keepdims=True)

        if args.num_dims is not None:
            if args.num_dims > dr_mat.shape[1]:
                raise ValueError(
                    'Requested number of dimensions exceeds the number of dimensions in the data')
            dr_mat = dr_mat[:, np.arange(args.num_dims)]

        kmeans = cr_kmeans.run_kmeans(dr_mat, args.n_clusters, random_state=args.random_seed)

        with analysis_io.open_h5_for_writing(_h5) as f:
            cr_kmeans.save_kmeans_h5(f, args.n_clusters, kmeans)

        clustering_key = cr_clustering.format_clustering_key(
            cr_clustering.CLUSTER_TYPE_KMEANS, args.n_clusters)

        cr_clustering.save_clustering_csv(_csv, clustering_key, kmeans.clusters, matrix_bcs)
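# Why the rows are L2-normalized for LSA/PLSA (illustrative sketch, not pipeline
# code): for unit-norm rows, squared Euclidean distance equals 2 * (1 - cosine
# similarity), so distance-based methods in the normalized space follow cosine
# distance in the original space. The tiny 1e-120 offset above keeps all-zero
# rows from producing a zero norm.
def _unit_norm_equivalence_sketch():
    rng = np.random.RandomState(0)
    a, b = rng.rand(50), rng.rand(50)
    a_n, b_n = a / np.linalg.norm(a), b / np.linalg.norm(b)
    cos_sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    # ||a_n - b_n||^2 == 2 * (1 - cos(a, b))
    assert np.isclose(np.sum((a_n - b_n) ** 2), 2.0 * (1.0 - cos_sim))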
def main(args, outs):
    if args.filtered_matrix is None:
        return

    if not os.path.exists(outs.tsne):
        os.mkdir(outs.tsne)

    matrix = cr_matrix.CountMatrix.load_h5_file(args.filtered_matrix)

    if args.method == 'pca':
        transformed_matrix = cr_pca.load_pca_from_h5(
            args.transformed_matrix_h5).transformed_pca_matrix
    if args.method == 'lsa':
        lsa = cr_lsa.load_lsa_from_h5(args.transformed_matrix_h5)
        lsa = lsa._replace(transformed_lsa_matrix=lsa.transformed_lsa_matrix + 1e-120)
        # Unit-normalize rows so that Euclidean distance in the transformed space
        # corresponds to cosine distance in the original space
        transformed_matrix = lsa.transformed_lsa_matrix / np.linalg.norm(
            lsa.transformed_lsa_matrix, axis=1, keepdims=True)
    if args.method == 'plsa':
        plsa = cr_plsa.load_plsa_from_h5(args.transformed_matrix_h5)
        plsa = plsa._replace(transformed_plsa_matrix=plsa.transformed_plsa_matrix + 1e-120)
        # Unit-normalize rows so that Euclidean distance in the transformed space
        # corresponds to cosine distance in the original space
        transformed_matrix = plsa.transformed_plsa_matrix / np.linalg.norm(
            plsa.transformed_plsa_matrix, axis=1, keepdims=True)

    tsne_dims = args.tsne_dims
    tsne = cr_tsne.run_tsne(transformed_matrix,
                            key=str(tsne_dims),
                            tsne_dims=tsne_dims,
                            input_pcs=args.tsne_input_pcs,
                            perplexity=args.tsne_perplexity,
                            theta=args.tsne_theta,
                            max_iter=args.tsne_max_iter,
                            stop_lying_iter=args.tsne_stop_lying_iter,
                            mom_switch_iter=args.tsne_mom_switch_iter,
                            random_state=args.random_seed)

    filters = tables.Filters(complevel=h5_constants.H5_COMPRESSION_LEVEL)
    _h5 = os.path.join(outs.tsne, args.method + '_tsne.h5')
    _csv = os.path.join(outs.tsne, args.method + '_tsne_csv')

    with tables.open_file(_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne, f)

    cr_tsne.save_tsne_csv(tsne, matrix, _csv)
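# Minimal sketch of the compressed-HDF5 write pattern used above (PyTables only;
# the node name 'example' and its contents are illustrative, not the pipeline's
# t-SNE schema).
def _compressed_h5_write_sketch(path, arr):
    filters = tables.Filters(complevel=h5_constants.H5_COMPRESSION_LEVEL)
    with tables.open_file(path, 'w', filters=filters) as f:
        # Chunked, compressed array node under the file root
        f.create_carray(f.root, 'example', obj=np.asarray(arr))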
def main(args, outs):
    if args.skip:
        return

    tsne_dims = args.tsne_dims

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)

    if args.feature_type == lib_constants.GENE_EXPRESSION_LIBRARY_TYPE:
        # Use PCA for gene expression
        pca = cr_pca.load_pca_from_h5(args.pca_h5)
        tsne_input = pca.transformed_pca_matrix
    else:
        # Use feature space for other feature types
        # Assumes other feature types are much lower dimension than gene expression
        matrix = matrix.select_features_by_type(args.feature_type)
        matrix.m.data = np.log2(1 + matrix.m.data)
        tsne_input = matrix.m.transpose().todense()

    name = get_tsne_name(args.feature_type, args.tsne_dims)
    key = get_tsne_key(args.feature_type, args.tsne_dims)

    tsne = cr_tsne.run_tsne(tsne_input,
                            name=name,
                            key=key,
                            input_pcs=args.input_pcs,
                            perplexity=args.perplexity,
                            theta=args.theta,
                            tsne_dims=tsne_dims,
                            max_iter=args.max_iter,
                            stop_lying_iter=args.stop_lying_iter,
                            mom_switch_iter=args.mom_switch_iter,
                            random_state=args.random_seed)

    filters = tables.Filters(complevel=h5_constants.H5_COMPRESSION_LEVEL)

    with tables.open_file(outs.tsne_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne, f)

    cr_tsne.save_tsne_csv(tsne, matrix, outs.tsne_csv)
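# Sketch of the feature-space t-SNE input built above for non-gene-expression
# libraries (assumes m is the scipy.sparse features-x-barcodes matrix held in
# CountMatrix.m): log-transform the nonzero counts, then hand t-SNE a dense
# barcodes-x-features array.
def _feature_space_tsne_input_sketch(m):
    m = m.copy()
    m.data = np.log2(1 + m.data)                 # log2(1 + count) on nonzero entries only
    return np.asarray(m.transpose().todense())   # barcodes x features, dense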
def main(args, outs):
    if args.skip or args.is_multi_genome:
        return

    tsne_dims = args.tsne_dims

    matrix = cr_matrix.GeneBCMatrix.load_h5(args.matrix_h5)
    pca = cr_pca.load_pca_from_h5(args.pca_h5)

    tsne = cr_tsne.run_tsne(pca.transformed_pca_matrix,
                            input_pcs=args.input_pcs,
                            perplexity=args.perplexity,
                            theta=args.theta,
                            tsne_dims=tsne_dims,
                            max_iter=args.max_iter,
                            stop_lying_iter=args.stop_lying_iter,
                            mom_switch_iter=args.mom_switch_iter,
                            random_state=args.random_seed)

    tsne_map = {tsne_dims: tsne}

    filters = tables.Filters(complevel=cr_constants.H5_COMPRESSION_LEVEL)

    with tables.open_file(outs.tsne_h5, 'w', filters=filters) as f:
        cr_tsne.save_tsne_h5(tsne_map, f)

    cr_tsne.save_tsne_csv(tsne_map, matrix, outs.tsne_csv)
def split(args):
    np.random.seed(0)

    if args.matrix_h5 is None:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    if not os.path.exists(args.reduced_data):
        raise IOError('reduced data not found at {}'.format(args.reduced_data))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    if args.similarity_type not in SIMILARITY_TYPES:
        raise ValueError('Unsupported similarity type: %s. Must be one of: %s' %
                         (args.similarity_type, ','.join(SIMILARITY_TYPES)))

    reduction_summary = args.reduction_summary['h5']

    method_dict = {}
    for method in args.factorization:
        method_dict[method] = {}

    with LogPerf('load'):
        for method in args.factorization:
            if method == 'pca':
                method_dict[method]['transformed_matrix'] = cr_pca.load_pca_from_h5(
                    reduction_summary[method]).transformed_pca_matrix
            if method == 'lsa':
                method_dict[method]['transformed_matrix'] = cr_lsa.load_lsa_from_h5(
                    reduction_summary[method]).transformed_lsa_matrix
            if method == 'plsa':
                method_dict[method]['transformed_matrix'] = cr_plsa.load_plsa_from_h5(
                    reduction_summary[method]).transformed_plsa_matrix

    # Record indices of selected barcodes. All methods must use the same barcodes.
    use_bcs = np.arange(method_dict[args.factorization[0]]['transformed_matrix'].shape[0])
    use_bcs_path = martian.make_path('use_bcs.h5')
    cr_graphclust.save_ndarray_h5(use_bcs, use_bcs_path, 'use_bcs')

    # Build the nearest neighbor query index
    with LogPerf('nn_build'):
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            # Normalize for plsa/lsa so that standard Euclidean distance in the
            # normalized space is cosine distance in the original space
            if method in ['plsa', 'lsa']:
                method_mat = method_mat / np.linalg.norm(method_mat, axis=1, keepdims=True)
            balltree = cr_graphclust.build_neighbor_index(
                method_mat, args.balltree_leaf_size or DEFAULT_BALLTREE_LEAFSIZE)
            method_dict[method]['neighbor_index'] = martian.make_path(
                'neighbor_index_{}.pickle'.format(method))
            cr_graphclust.save_neighbor_index(balltree, method_dict[method]['neighbor_index'])

    # Compute the actual number of nearest neighbors we'll use
    given_num_neighbors = args.num_neighbors if args.num_neighbors is not None else analysis_constants.GRAPHCLUST_NEIGHBORS_DEFAULT
    given_neighbor_a = args.neighbor_a if args.neighbor_a is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_A_DEFAULT
    given_neighbor_b = args.neighbor_b if args.neighbor_b is not None else analysis_constants.GRAPHCLUST_NEIGHBOR_B_DEFAULT

    # Take max of {num_neighbors, a + b*log10(n)}
    use_neighbors = int(max(given_num_neighbors,
                            np.round(given_neighbor_a + given_neighbor_b * np.log10(len(use_bcs)))))

    # Clamp to [1, n - 1]
    num_neighbors = max(1, min(use_neighbors, len(use_bcs) - 1))
    print "Using %d neighbors" % num_neighbors

    # Divide the PCA matrix up into rows for NN queries
    with LogPerf('chunk_matrix'):
        chunks = []
        for method in args.factorization:
            method_mat = method_dict[method]['transformed_matrix']
            for row_start in xrange(0, method_mat.shape[0], NN_QUERIES_PER_CHUNK):
                row_end = min(row_start + NN_QUERIES_PER_CHUNK, method_mat.shape[0])

                # Write the submatrix to an h5 file
                submatrix_path = martian.make_path('{}_{}_submatrix.h5'.format(method, row_start))
                cr_graphclust.save_ndarray_h5(method_mat[row_start:row_end, :],
                                              submatrix_path, 'submatrix')

                chunks.append({
                    'method': method,
                    'neighbor_index': method_dict[method]['neighbor_index'],
                    'submatrix': submatrix_path,
                    'row_start': row_start,
                    'total_rows': method_mat.shape[0],
                    'k_nearest': num_neighbors,
                    'use_bcs': use_bcs_path,
                })

    if args.similarity_type == SNN_SIMILARITY:
        join_mem_gb = 64
        join_threads = 4  # Overallocate
    else:
        # Scale memory with the size of the nearest-neighbor adjacency matrix
        join_mem_gb = max(h5_constants.MIN_MEM_GB,
                          int(np.ceil((num_neighbors * len(use_bcs)) / NN_ENTRIES_PER_MEM_GB)))
        # HACK: use more threads for bigger mem requests to avoid mem oversubscription
        # on clusters that don't enforce it
        join_threads = cr_io.get_thread_request_from_mem_gb(join_mem_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
            '__threads': join_threads,
        }
    }
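# Sketch of the neighbor-count heuristic computed in split() above: take the
# larger of the requested k and a + b*log10(n_barcodes), then clamp to
# [1, n_barcodes - 1]. Defaults come from analysis_constants in the pipeline;
# none are assumed here.
def _num_neighbors_sketch(n_bcs, requested_k, neighbor_a, neighbor_b):
    use_neighbors = int(max(requested_k, np.round(neighbor_a + neighbor_b * np.log10(n_bcs))))
    return max(1, min(use_neighbors, n_bcs - 1))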