def split(args):
    """Plan a single chunk whose memory request fits the analysis matrix.

    When ``args.analysis`` is set, the request is estimated from the matrix
    group stored in the analysis HDF5 file; otherwise the minimum memory
    constant is used.

    Args:
        args: object exposing an ``analysis`` attribute (path-like value
            understood by ``cr_io.h5_path``, or a falsy value).

    Returns:
        dict: ``{'chunks': [{'__mem_gb': <memory in GB>}]}``.
    """
    if not args.analysis:
        # Nothing to inspect, so fall back to the minimum reservation.
        mem_gb = cr_constants.MIN_MEM_GB
    else:
        # Estimate memory usage from the matrix stored in the analysis h5.
        with tables.open_file(cr_io.h5_path(args.analysis), 'r') as h5_file:
            matrix_group = getattr(h5_file.root,
                                   cr_constants.ANALYSIS_H5_MATRIX_GROUP)
            mem_gb = cr_matrix.GeneBCMatrix.get_mem_gb_from_group(matrix_group)

    return {'chunks': [{'__mem_gb': mem_gb}]}
def split(args):
    """Plan a single chunk sized for the analysis matrix, plus a join step.

    When ``args.analysis`` is set, the chunk's memory request is estimated
    from the ``matrix`` group of the analysis HDF5 file; otherwise the
    minimum memory constant is used. The join step always requests the
    minimum.

    Args:
        args: object exposing an ``analysis`` attribute (value understood by
            ``analysis_io.h5_path``, or a falsy value).

    Returns:
        dict: ``{'chunks': [{'__mem_gb': ...}], 'join': {'__mem_gb': ...}}``.
    """
    if not args.analysis:
        chunk_mem_gb = h5_constants.MIN_MEM_GB
    else:
        # Estimate memory usage from the matrix stored in the analysis h5.
        matrix_file = analysis_io.h5_path(args.analysis)
        with h5.File(matrix_file, 'r') as handle:
            chunk_mem_gb = cr_matrix.CountMatrix.get_mem_gb_from_group(
                handle['matrix'])

    plan = {
        'chunks': [{'__mem_gb': chunk_mem_gb}],
        'join': {'__mem_gb': h5_constants.MIN_MEM_GB},
    }
    return plan
def main(args, outs):
    """Assemble the combined analysis outputs (HDF5 file and CSV directories).

    Behaviour by case:

    * ``args.skip`` set: nothing is done.
    * ``args.is_multi_genome`` set: the provided multi-genome JSON and CSV
      trees are copied to ``outs.analysis`` / ``outs.analysis_csv`` and the
      function returns.
    * Otherwise: the first group of ``args.matrix_h5`` is copied into a new
      analysis HDF5 file under the name ``matrix``; the contents of the PCA,
      clustering, diffexp and t-SNE HDF5 files are then merged into it, and
      the matching CSV directories are copied under ``outs.analysis_csv``.

    Args:
        args: object providing ``skip``, ``is_multi_genome``,
            ``multi_genome_json``, ``multi_genome_csv``, ``matrix_h5`` and
            the ``*_h5`` / ``*_csv`` inputs for pca, clustering, diffexp
            and tsne.
        outs: object providing the ``analysis`` and ``analysis_csv`` output
            locations.

    Returns:
        None.
    """
    if args.skip:
        return

    if args.is_multi_genome:
        cr_io.copytree(args.multi_genome_json, outs.analysis)
        cr_io.copytree(args.multi_genome_csv, outs.analysis_csv)
        # The multi-genome outputs are complete here; without this return the
        # single-genome merge below would run on top of them.
        return

    analysis_h5 = analysis_io.h5_path(outs.analysis)
    cr_io.makedirs(os.path.dirname(analysis_h5), allow_existing=True)

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(args.matrix_h5, 'r') as matrix,\
         h5.File(analysis_h5, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        # list() keeps this working where keys() returns a view instead of
        # a list (h5py on Python 3).
        name = list(matrix.keys())[0]
        matrix.copy(matrix[name], out, name='matrix')

    # Append the remaining analysis results to the file created above.
    with tables.open_file(args.pca_h5, 'r') as pca,\
         tables.open_file(args.clustering_h5, 'r') as clustering,\
         tables.open_file(args.diffexp_h5, 'r') as diffexp,\
         tables.open_file(args.tsne_h5, 'r') as tsne,\
         tables.open_file(analysis_h5, 'a') as out:
        pca.copy_children(pca.root, out.root, recursive=True)
        clustering.copy_children(clustering.root, out.root, recursive=True)
        diffexp.copy_children(diffexp.root, out.root, recursive=True)
        tsne.copy_children(tsne.root, out.root, recursive=True)

    pca_dir = os.path.join(outs.analysis_csv, 'pca')
    cr_io.copytree(args.pca_csv, pca_dir)

    clustering_dir = os.path.join(outs.analysis_csv, 'clustering')
    cr_io.copytree(args.clustering_csv, clustering_dir)

    diffexp_dir = os.path.join(outs.analysis_csv, 'diffexp')
    cr_io.copytree(args.diffexp_csv, diffexp_dir)

    tsne_dir = os.path.join(outs.analysis_csv, 'tsne')
    cr_io.copytree(args.tsne_csv, tsne_dir)
def main(args, outs):
    """Merge the per-step analysis results into one HDF5 file and CSV tree.

    Nothing is done when ``args.skip`` is set. Multi-genome runs only copy
    the provided JSON and CSV trees to the outputs. Otherwise a new analysis
    HDF5 file is written containing the first group of ``args.matrix_h5``
    (renamed to ``matrix``) followed by the children of the PCA, clustering,
    diffexp and t-SNE HDF5 files, and each step's CSV directory is copied
    into a same-named subdirectory of ``outs.analysis_csv``.

    Args:
        args: object providing ``skip``, ``is_multi_genome``,
            ``multi_genome_json``, ``multi_genome_csv``, ``matrix_h5`` and
            the ``*_h5`` / ``*_csv`` inputs for pca, clustering, diffexp
            and tsne.
        outs: object providing the ``analysis`` and ``analysis_csv`` output
            locations.

    Returns:
        None.
    """
    if args.skip:
        return

    if args.is_multi_genome:
        cr_utils.copytree(args.multi_genome_json, outs.analysis)
        cr_utils.copytree(args.multi_genome_csv, outs.analysis_csv)
        return

    merged_h5 = cr_io.h5_path(outs.analysis)
    cr_utils.makedirs(os.path.dirname(merged_h5), allow_existing=True)

    with tables.open_file(args.matrix_h5, 'r') as matrix_in,\
         tables.open_file(args.pca_h5, 'r') as pca_in,\
         tables.open_file(args.clustering_h5, 'r') as clustering_in,\
         tables.open_file(args.diffexp_h5, 'r') as diffexp_in,\
         tables.open_file(args.tsne_h5, 'r') as tsne_in,\
         tables.open_file(merged_h5, 'w') as merged:
        # NOTE - genome name is replaced with 'matrix'
        first_group = list(matrix_in.root)[0]
        matrix_in.copy_node(first_group, merged.root, recursive=True,
                            newname='matrix')

        # Same copy for every remaining input, in the original order.
        for source in (pca_in, clustering_in, diffexp_in, tsne_in):
            source.copy_children(source.root, merged.root, recursive=True)

    csv_inputs = (
        ('pca', args.pca_csv),
        ('clustering', args.clustering_csv),
        ('diffexp', args.diffexp_csv),
        ('tsne', args.tsne_csv),
    )
    for subdir, csv_source in csv_inputs:
        cr_utils.copytree(csv_source, os.path.join(outs.analysis_csv, subdir))
def load_default_format(base_dir, method):
    """Load the analysis stored under ``base_dir``, if its HDF5 file exists.

    Args:
        base_dir: location passed to ``analysis_io.h5_path`` to resolve the
            analysis HDF5 file path.
        method: forwarded unchanged to ``SingleGenomeAnalysis.load_h5``.

    Returns:
        The result of ``SingleGenomeAnalysis.load_h5`` when the resolved
        file exists, otherwise ``None``.
    """
    analysis_file = analysis_io.h5_path(base_dir)
    if not os.path.exists(analysis_file):
        return None
    return SingleGenomeAnalysis.load_h5(analysis_file, method)