def split(args):
    """Chunk the input BAM (~0.75 GB per chunk) and attach estimated coverage
    and the lane coordinate map to every chunk definition."""
    bam_in = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func, chunk_size_gb=0.75)

    targets_path = None if args.targets_file is None else args.targets_file
    estimated_coverage = tenkit.coverage.estimate_mean_coverage(
        targets_path, bam_in, lambda x: stringent_read_filter(x, False))
    for chunk in chunk_defs:
        chunk['estimated_coverage'] = estimated_coverage

    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    # Reopen BAM for estimating tile extents
    bam_in = tk_bam.create_bam_infile(args.input)
    lane_coord_sys.estimate_tile_extents(bam_in)
    for chunk in chunk_defs:
        chunk['lane_map'] = lane_coord_sys.to_dict()

    return {'chunks': chunk_defs}
def split(args):
    """Split the bc-sorted BAM into ~8 GB chunks.

    Emits a single empty chunk when inputs are missing or the barcode
    whitelist is too small to be worth processing.
    """
    if args.bcsorted_bam is None or args.barcode_whitelist is None:
        return {'chunks': [{'chunk_start': "0", 'chunk_end': "0"}]}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        return {'chunks': [{'chunk_start': "0", 'chunk_end': "0"}]}

    # Larger whitelists get more parallelism.
    min_chunks = 8 if len(barcode_whitelist) > 1e6 else 4

    bam_in = tk_bam.create_bam_infile(args.bcsorted_bam)
    chunks = tk_bam.chunk_bam_records(bam_in, chunk_split_func,
                                      chunk_size_gb=8.0, min_chunks=min_chunks)
    for chunk in chunks:
        chunk['__mem_gb'] = 12

    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
def split(args):
    """Chunk the input BAM at ~0.5 GB per chunk, requesting 8 GB per chunk."""
    in_bam = create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(in_bam, chunk_bound_key=None,
                                          chunk_size_gb=0.5)
    for chunk_def in chunk_defs:
        chunk_def["__mem_gb"] = 8.0
    return {'chunks': chunk_defs}
def split(args):
    """Split the BAM into at most 120 chunks, tagging each with its index
    and per-chunk resource requests."""
    in_bam = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(in_bam, chunk_bound_key=None,
                                          max_chunks=120)
    for idx, chunk_def in enumerate(chunk_defs):
        chunk_def['chunk_index'] = idx
        chunk_def['__mem_gb'] = 6
        chunk_def['__threads'] = 4
    return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}
def split(args):
    """Chunk the BAM on barcode boundaries; honor a caller-supplied memory
    request when it exceeds the minimum."""
    with tk_bam.create_bam_infile(args.input) as in_bam:
        chunk_defs = tk_bam.chunk_bam_records(
            in_bam,
            chunk_bound_key=cr_utils.barcode_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)

    if args.mem_gb is not None and args.mem_gb > cr_constants.MIN_MEM_GB:
        for chunk_def in chunk_defs:
            chunk_def['__mem_gb'] = args.mem_gb

    return {'chunks': chunk_defs}
def split(args):
    """Chunk the barcode-sorted BAM without scanning through unbarcoded reads."""
    # use a custom key that returns None for all unbarcoded reads, so that
    # chunk_bam_records never has to linearly scan over those reads to find a
    # chunk boundary (which could take a long time for large datasets)
    def bc_sort_key(read):
        return cr_utils.barcode_sort_key(read, squash_unbarcoded=True)

    with tk_bam.create_bam_infile(args.input) as in_bam:
        chunk_defs = tk_bam.chunk_bam_records(
            in_bam,
            chunk_bound_key=bc_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)

    if args.mem_gb is not None and args.mem_gb > cr_constants.MIN_MEM_GB:
        for chunk_def in chunk_defs:
            chunk_def['__mem_gb'] = args.mem_gb

    return {'chunks': chunk_defs}
def split(args):
    """Build one chunk per (BAM region x barcode cluster) from the
    barcode-clusters CSV."""
    clusters_df = pd.read_csv(args.barcode_clusters)

    # construct BAM chunks
    with tk_bam.create_bam_infile(args.possorted_bam) as in_bam:
        bam_chunks = tk_bam.chunk_bam_records(
            in_bam,
            chunk_bound_key=cr_utils.pos_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)

    # nest BAM chunks with clusters
    bc_chunks = [
        {'chunk_start': bam_chunk['chunk_start'],
         'chunk_end': bam_chunk['chunk_end'],
         'cluster_bcs': members.Barcode.tolist(),
         'cluster_id': cluster_id,
         '__mem_gb': 8}
        for cluster_id, members in clusters_df.groupby('Cluster')
        for bam_chunk in bam_chunks
    ]
    return {'chunks': bc_chunks}
def split(args):
    """Chunk the BAM keyed on read name so mates land in the same chunk;
    use more chunks for very large whitelists."""
    bam = pysam.Samfile(args.input, check_sq=False)

    min_chunks = 1
    if args.barcode_whitelist is not None:
        barcode_whitelist = bc_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        if len(barcode_whitelist) > 1e6:
            min_chunks = 4

    # Split to ensure read pairs always go together
    chunks = tk_bam.chunk_bam_records(bam, lambda x: x.qname,
                                      min_chunks=min_chunks)
    total = len(chunks)
    for chunk in chunks:
        chunk['n_chunks'] = total
        chunk['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 8}}
def split(args):
    """Chunk the input BAM (~8 GB per chunk), validating force_cells and
    sizing join memory by chunk count."""
    if args.input is None or args.barcode_whitelist is None:
        null_chunks = [{'chunk_start': "0", 'chunk_end': "0", '__mem_gb': 1}]
        return {'chunks': null_chunks, 'join': {'__mem_gb': 1}}

    ref = contig_manager.contig_manager(args.reference_path)
    species_list = ref.list_species()
    # force_cells is ambiguous when multiple species are present.
    if (args.force_cells is not None and args.force_cells > 0
            and len(species_list) > 1):
        martian.exit(
            "force_cells can only be used for single species reference.")

    min_chunks = 10
    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, chunk_split_func,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)

    # 0.03 =~ 26meg = 1M bcs * (sizeof(int64) + 18)
    join_mem_gb = int(np.ceil(0.03 * (len(chunks) + 1) + 1))
    return {'chunks': chunks, 'join': {'__mem_gb': join_mem_gb}}
def split(args):
    """Chunk the BAM grouped by barcode (~8 GB per chunk); emit a single
    empty chunk for missing inputs or tiny whitelists."""
    if args.input is None or args.barcode_whitelist is None:
        return {'chunks': [{'chunk_start': "0", 'chunk_end': "0"}]}

    # Some R&D bc sets have very small diversity -- don't run on them
    barcode_whitelist = bc_utils.load_barcode_whitelist(args.barcode_whitelist)
    if len(barcode_whitelist) < 100:
        return {'chunks': [{'chunk_start': "0", 'chunk_end': "0"}]}

    # Larger whitelists get more parallelism.
    min_chunks = 100 if len(barcode_whitelist) > 1e6 else 20

    bam_in = tk_bam.create_bam_infile(args.input)
    chunks = tk_bam.chunk_bam_records(bam_in, groupbybarcode,
                                      chunk_size_gb=8.0,
                                      min_chunks=min_chunks)
    for chunk in chunks:
        chunk['__mem_gb'] = 3

    return {'chunks': chunks, 'join': {'__mem_gb': 6}}
def split(args):
    """Chunk the BAM (~0.75 GB per chunk) and attach the lane map and the
    diffusion duplicate threshold to every chunk."""
    bam_in = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func,
                                          chunk_size_gb=0.75)
    for idx, chunk in enumerate(chunk_defs):
        chunk['chunk_index'] = idx
        chunk['__mem_gb'] = 3

    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    # Reopen BAM for estimating tile extents
    bam_in = tk_bam.create_bam_infile(args.input)
    lane_coord_sys.estimate_tile_extents(bam_in)

    with open(args.diffusion_dup_summary) as f:
        threshold = json.load(f)['diffusion']['threshold']

    for chunk in chunk_defs:
        chunk['lane_map'] = lane_coord_sys.to_dict()
        chunk['diffusion_threshold'] = threshold

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 1, '__threads': 6}}
def split(args):
    """Chunk the BAM (~0.75 GB per chunk on disk) and attach per-chunk
    resource requests plus the lane coordinate map.

    Fix: the __vmem_gb estimate (driven by whitelist_mem_gb, which reads the
    barcode whitelist) was recomputed inside the per-chunk loop even though
    it is identical for every chunk; it is now computed once.
    """
    bam_in = create_bam_infile(args.input)
    bam_chunk_size_disk = 0.75
    chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func,
                                          chunk_size_gb=bam_chunk_size_disk)

    # Loop-invariant: whitelist memory footprint plus a decompression margin
    # proportional to the on-disk chunk size.
    vmem_gb = 5 + int(
        np.ceil(2 * whitelist_mem_gb(args.barcode_whitelist)
                + bam_chunk_size_disk * 10))
    for chunk in chunk_defs:
        chunk['__mem_gb'] = 4
        chunk['__vmem_gb'] = vmem_gb

    lane_coord_sys = tk_lane.LaneCoordinateSystem()
    # Reopen BAM for estimating tile extents
    bam_in = create_bam_infile(args.input)
    lane_coord_sys.estimate_tile_extents(bam_in)
    for cnum, chunk in enumerate(chunk_defs):
        chunk['lane_map'] = lane_coord_sys.to_dict()
        chunk['chunk_num'] = cnum

    return {'chunks': chunk_defs, 'join': {'__mem_gb': 8, '__threads': 4}}
def split(args): # Chunk bam to get 1GB per chunk bam_in = tk_bam.create_bam_infile(args.input) lane_coord_sys = tk_lane.LaneCoordinateSystem() bam_in.reset() lane_coord_sys.estimate_tile_extents(bam_in) flowcell_geometry = estimate_flowcell_geometry(bam_in, lane_coord_sys) print "Flowcell Geometry: ", flowcell_geometry if flowcell_geometry is None: return { 'chunks': [{ 'seed': None, 'lane_map': None, 'flowcell_geometry': None, 'chunk_start': None, 'chunk_end': None }] } chunk_defs = tk_bam.chunk_bam_records(bam_in, chunk_bound_func, chunk_size_gb=0.75) for i, chunk in enumerate(chunk_defs): chunk['seed'] = i chunk['__mem_gb'] = 3 for chunk in chunk_defs: chunk['lane_map'] = lane_coord_sys.to_dict() chunk['flowcell_geometry'] = flowcell_geometry return {'chunks': chunk_defs, 'join': {'__mem_gb': 1}}
def split(args):
    """Split the input BAM into ~0.5 GB chunks with no boundary key."""
    in_bam = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(in_bam, chunk_bound_key=None,
                                          chunk_size_gb=0.5)
    return {'chunks': chunk_defs}
def split(args):
    """Chunk the position-sorted BAM within the standard size/count limits."""
    with tk_bam.create_bam_infile(args.input) as in_bam:
        chunk_defs = tk_bam.chunk_bam_records(
            in_bam,
            chunk_bound_key=cr_utils.pos_sort_key,
            chunk_size_gb=cr_constants.BAM_CHUNK_SIZE_GB,
            max_chunks=cr_constants.MAX_BAM_CHUNKS)
    return {'chunks': chunk_defs}
def split(args):
    """Chunk the BAM with default sizing, requesting 4 GB per chunk."""
    in_bam = tk_bam.create_bam_infile(args.input)
    chunk_defs = tk_bam.chunk_bam_records(in_bam, chunk_bound_key=None)
    for chunk_def in chunk_defs:
        chunk_def["__mem_gb"] = 4
    return {'chunks': chunk_defs}