def split(args):
    """Compute base background in split and use it in each chunk.

    The global GC dictionary is loaded once here and re-pickled as one file
    per GC bin, so that each chunk only has to load the entry for its own bin.

    Args:
        args: stage arguments; this function reads ``reference_path``,
            ``peaks`` and ``globalGCdict``.

    Returns:
        dict: ``{'chunks': [...]}``. When there is more than one species, no
        peaks, or the reference has no motifs, a single ``{'skip': True}``
        chunk is returned. Otherwise there is one chunk per GC bin carrying
        the path of that bin's pickle under ``'GCdict'`` plus the memory
        requests.
    """
    ref_mgr = ReferenceManager(args.reference_path)
    npeaks = utils.quick_line_count(args.peaks) if args.peaks else 0

    if len(ref_mgr.list_species()) > 1 or npeaks == 0 or ref_mgr.motifs is None:
        return {'chunks': [{'skip': True}]}

    # Pickle streams are binary data: open in binary mode so the bytes are
    # never newline-translated or decoded as text.
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # presumably globalGCdict is produced by an upstream stage -- confirm it
    # is never user-supplied.
    with open(args.globalGCdict, 'rb') as f:
        GCdict = pickle.load(f)

    # Dump each GC bin's entry to its own file. The keys are indexed as
    # gc[0], gc[1] to build the file name.
    GCbins = sorted(GCdict.keys())
    GCdict_paths = {}
    for gc in GCbins:
        GCdict_paths[gc] = martian.make_path('GCdict_{}_{}'.format(gc[0], gc[1]))
        with open(GCdict_paths[gc], 'wb') as dump:
            pickle.dump(GCdict[gc], dump)

    # The resource request is the same for every chunk, so compute it once.
    mem_in_gb = 8
    vmem_in_gb = mem_in_gb + int(np.ceil(ref_mgr.get_vmem_est())) + 1
    chunk_def = [{
        '__mem_gb': mem_in_gb,
        '__vmem_gb': vmem_in_gb,
        'skip': False,
        'GCdict': GCdict_paths[gc_bin],
    } for gc_bin in GCbins]
    return {'chunks': chunk_def}
def split(args):
    """Create one chunk per (factorization method, clustering key, cluster).

    Args:
        args: stage arguments; this function reads ``reference_path``,
            ``filtered_peak_bc_matrix``, ``filtered_tf_bc_matrix``,
            ``factorization`` and ``clustering_summary``.

    Returns:
        dict: when there is no filtered peak matrix or the reference lists
        more than one species, a single chunk requesting only
        ``h5_constants.MIN_MEM_GB``. Otherwise ``'chunks'`` holds one entry
        per cluster of every clustering of every requested method, and
        ``'join'`` requests 3 GB.

    Raises:
        ValueError: if ``args.factorization`` contains a method that is not
            in ``ALLOWED_FACTORIZATIONS``.
    """
    reference = ReferenceManager(args.reference_path)
    genomes = reference.list_species()
    if args.filtered_peak_bc_matrix is None or len(genomes) > 1:
        return {'chunks': [{'__mem_gb': h5_constants.MIN_MEM_GB}]}

    # Memory estimate: the TF matrix (if present) is weighted by 1.5, then
    # the peak matrix estimate is added on top.
    estimate_gb = cr_matrix.CountMatrix.get_mem_gb_from_matrix_h5
    if args.filtered_tf_bc_matrix is None:
        required_gb = 0.
    else:
        required_gb = estimate_gb(args.filtered_tf_bc_matrix) * 1.5
    required_gb += estimate_gb(args.filtered_peak_bc_matrix)
    per_chunk_gb = int(np.ceil(max(required_gb, h5_constants.MIN_MEM_GB)))

    if not set(args.factorization).issubset(ALLOWED_FACTORIZATIONS):
        raise ValueError('Invalid factorization provided')

    # Fan out over every method x clustering x distinct cluster label.
    chunk_defs = []
    for method in args.factorization:
        summary_h5 = args.clustering_summary['h5'][method]
        for key in SingleGenomeAnalysis.load_clustering_keys_from_h5(summary_h5):
            loaded = SingleGenomeAnalysis.load_clustering_from_h5(summary_h5, key)
            chunk_defs.extend(
                {
                    'method': method,
                    'clustering_key': key,
                    'cluster': label,
                    '__mem_gb': per_chunk_gb,
                    '__vmem_gb': per_chunk_gb + int(np.ceil(reference.get_vmem_est())) + 1,
                    '__threads': 1,
                }
                for label in set(loaded.clusters)
            )

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 3}}
def split(args):
    """Request resources for the join step only; no chunks are created.

    Args:
        args: stage arguments; only ``reference_path`` is read.

    Returns:
        dict: an empty ``'chunks'`` list and a ``'join'`` entry asking for
        4 GB of memory and a virtual-memory figure of the reference
        manager's estimate (rounded up) plus 3.
    """
    reference = ReferenceManager(args.reference_path)
    join_vmem_gb = int(np.ceil(reference.get_vmem_est())) + 3
    join_request = {'__mem_gb': 4, '__vmem_gb': join_vmem_gb}
    return {'chunks': [], 'join': join_request}
def split(args):
    """We just align each chunk independently -- joining will happen in the
    join step of SORT_READS.

    Before building the chunks, the first 10 reads of each chunk's ``read1``
    FASTQ are inspected. If any is shorter than ``MIN_READ_LENGTH``, a single
    alarm is raised and no further reads are inspected.

    Args:
        args: stage arguments; this function reads ``chunks`` (each with a
            ``'read1'`` path), ``reference_path`` and ``num_threads``.

    Returns:
        dict: ``{'chunks': [...]}`` with one entry per input chunk, each
        carrying the original chunk under ``'chunk'`` plus ``__threads`` and
        ``__mem_gb`` requests.
    """
    # Pull some reads from the fastq files and warn if any is too short.
    # One alarm is enough: the message is identical for every short read, so
    # stop scanning as soon as the first one is found.
    fastq_paths = [chunk['read1'] for chunk in args.chunks]
    short_read_found = False
    for fastq_path in fastq_paths:
        with open(fastq_path) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            short_read_found = any(
                len(read) < MIN_READ_LENGTH
                for _name, read, _qual in itertools.islice(reader, 10)
            )
        if short_read_found:
            break

    if short_read_found:
        martian.alarm("BWA-MEM can't handle reads <25bp -- reads will be unmapped.")

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    ctg_mgr = ReferenceManager(args.reference_path)
    base_mem_in_gb = int(math.ceil(2 * ctg_mgr.get_vmem_est()))
    mem_in_gb = base_mem_in_gb + 4

    chunks = [
        {'chunk': chunk, '__threads': args.num_threads, '__mem_gb': mem_in_gb}
        for chunk in args.chunks
    ]
    return {'chunks': chunks}