def split(args):
    """Plan chunk and join memory requests for this stage.

    Per-chunk memory is sized to the barcode whitelist; join memory also
    covers the worst-case molecule-info sort.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Worst case: every read produces a molecule info row.
    n_rows = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                           'total_reads')

    # Sorting in MoleculeCounter.concatenate_sort needs, per row:
    # 8 bytes of sort indices, plus (8+8+8) bytes to load, concatenate,
    # and index into a 64-bit data column -> 32 bytes total.
    sort_gb = int(math.ceil((32 * n_rows) / 1e9))

    join_gb = min(MAX_MEM_GB,
                  max(cr_constants.MIN_MEM_GB, whitelist_gb + sort_gb))

    chunks = [{'chunk_input': chunk_input, '__mem_gb': per_chunk_gb}
              for chunk_input in args.inputs]
    return {'chunks': chunks, 'join': {'__mem_gb': join_gb}}
def split(args):
    """Emit a single chunk whose memory request is sized to the whitelist."""
    whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)
    return {'chunks': [{'__mem_gb': whitelist_gb}]}
def split(args):
    """Emit one chunk per (molecule file, gem group) slice.

    Each chunk carries the per-gem-group cell-calling overrides recorded in
    the molecule counter plus a memory request covering the whitelist and
    the slice of molecule info it will load.
    """
    chunks = []
    for mol_h5 in args.molecule_chunks:
        with cr_mol_counter.MoleculeCounter.open(mol_h5, 'r') as counter:
            whitelist_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
                counter.get_barcode_whitelist(), use_min=False)

            for gem_group, start, length in counter.get_chunks_by_gem_group():
                mol_gb = cr_mol_counter.MoleculeCounter.estimate_mem_gb(length)
                gg_metrics = counter.get_metric(
                    cr_mol_counter.GEM_GROUPS_METRIC)[gem_group]
                chunks.append({
                    'molecule_h5': mol_h5,
                    'gem_group': str(gem_group),
                    'recovered_cells': gg_metrics.get(
                        cr_mol_counter.GG_RECOVERED_CELLS_METRIC, None),
                    'force_cells': gg_metrics.get(
                        cr_mol_counter.GG_FORCE_CELLS_METRIC, None),
                    'chunk_start': start,
                    'chunk_len': length,
                    '__mem_gb': whitelist_gb + mol_gb,
                })
    return {'chunks': chunks}
def split(args):
    """Pair genome inputs with (optional) trimmed inputs and gem groups.

    Memory requests for both chunks and join are sized to the whitelist.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)

    # Trimmed inputs may be absent; izip_longest pads shorter lists with None.
    triples = itertools.izip_longest(args.genome_inputs,
                                     args.trimmed_inputs or [],
                                     args.gem_groups)
    chunks = [{
        'chunk_genome_input': genome_input,
        'chunk_trimmed_input': trimmed_input,
        'gem_group': gem_group,
        '__mem_gb': per_chunk_gb,
    } for genome_input, trimmed_input, gem_group in triples]

    return {'chunks': chunks, 'join': {'__mem_gb': join_gb}}
def split(args):
    """Emit one trivial chunk; only the join needs whitelist-sized memory."""
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)
    return {'chunks': [{}], 'join': {'__mem_gb': join_gb}}
def split(args):
    """Size chunks to the whitelist; size the join for reporter dicts too.

    The join request adds an estimate of the barcode and UMI diversity
    dictionaries held by the reporters during the merge.
    """
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    chunks = [{'chunk_input': chunk_input, '__mem_gb': per_chunk_gb}
              for chunk_input in args.inputs]

    base_join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Account for memory used by reporters (particularly the bc and umi
    # diversity dicts).
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if whitelist is None:
        n_barcodes = cr_utils.get_num_barcodes_from_barcode_summary(
            args.barcode_summary)
    else:
        n_barcodes = len(whitelist) * max(args.gem_groups)

    # Factor of 2: the current reporter plus the accumulating reporter in
    # the merge. Each (genome + overall) x read-type pair has its own dict.
    bytes_per_entry = (2 * cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                       (len(genomes) + 1) * len(cr_constants.READ_TYPES))
    bc_diversity_gb = (n_barcodes * bytes_per_entry) / 1e9
    umi_diversity_gb = (4**cr_chem.get_umi_length(args.chemistry_def) *
                        bytes_per_entry) / 1e9

    join_gb = min(cr_constants.COUNT_GENES_MAX_MEM_GB,
                  max(cr_constants.MIN_MEM_GB,
                      int(base_join_gb + bc_diversity_gb + umi_diversity_gb)))

    return {'chunks': chunks, 'join': {'__mem_gb': join_gb}}
def split(args):
    """Annotate the incoming chunks with memory requests and read caps."""
    gem_groups = [c['gem_group'] for c in args.chunks]
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, gem_groups)

    # Spread the (optional) global read cap evenly across all chunks.
    if args.initial_reads is None:
        reads_per_chunk = None
    else:
        reads_per_chunk = args.initial_reads / len(args.chunks)

    chunks = []
    for chunk in args.chunks:
        chunk['__mem_gb'] = per_chunk_gb
        chunk['initial_reads'] = reads_per_chunk
        chunks.append(chunk)

    return {'chunks': chunks, 'join': {'__mem_gb': join_gb}}
def split(args):
    """Persist BAM comments to a JSON file and emit one chunk per input.

    Every chunk references the same bam_comments.json file.
    """
    comments_json = martian.make_path('bam_comments.json')
    with open(comments_json, 'w') as fh:
        json.dump(args.bam_comments, fh)

    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups)

    chunks = [{
        'chunk_genome_input': genome_input,
        'gem_group': gem_group,
        'bam_comments_json': comments_json,
        '__mem_gb': per_chunk_gb,
    } for genome_input, gem_group in itertools.izip_longest(
        args.genome_inputs, args.gem_groups)]

    return {'chunks': chunks, 'join': {'__mem_gb': join_gb}}
def split(args):
    """Split the raw molecule info into fixed-size slices.

    Each chunk's memory request covers four copies of the matrix plus its
    molecule-info slice; the thread request scales with that footprint.
    """
    chunks = []
    with cr_mol_counter.MoleculeCounter.open(args.raw_molecules,
                                             'r') as counter:
        matrix_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
            counter.get_barcode_whitelist(), use_min=False)

        for start, length in counter.get_chunks(
                cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK):
            slice_gb = cr_mol_counter.MoleculeCounter.estimate_mem_gb(length)
            total_gb = 4 * (matrix_gb + slice_gb)
            chunks.append({
                'chunk_start': str(start),
                'chunk_len': str(length),
                '__mem_gb': total_gb,
                # Roughly one thread per 8 GB requested, at least one.
                '__threads': max(1, int(round(total_gb / 8.0))),
            })

    join = {'__mem_gb': max(matrix_gb, cr_constants.MIN_MEM_GB)}
    return {'chunks': chunks, 'join': join}
def split(args):
    """Plan subsampling chunks over the molecule info file.

    Builds one subsampling definition per (subsample type, target
    reads-per-cell) pair whose required rate is <= 1.0, then emits one chunk
    per (definition, fixed-size molecule-info slice) pair. Returns early with
    a single empty chunk when no cells were called.
    """
    # Get the cell count from the filtered-barcodes CSV; barcodes are pooled
    # across genomes into one set so multi-genome barcodes count once.
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)
    n_cells = len(filtered_bcs)

    # No cells: nothing to subsample. NOTE(review): this early return carries
    # no 'join' entry and the placeholder chunk has no '__mem_gb' request —
    # presumably the framework applies defaults; confirm.
    if n_cells == 0:
        return {
            'chunks': [{
                'chunk_start': 0,
                'chunk_len': 0,
                'subsample_info': {}
            }]
        }

    # Get required info from the mol info
    with MoleculeCounter.open(args.molecule_info, 'r') as mol_counter:
        n_molecule_info_entries = mol_counter.nrows()
        barcode_whitelist = mol_counter.get_barcode_whitelist()
        gem_groups = mol_counter.get_gem_groups()
        raw_reads = mol_counter.get_total_raw_reads()
        # Raw reads per cell, guarded against division by zero.
        raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
        mapped_reads = mol_counter.get_total_conf_mapped_filtered_bc_reads()
        mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

        subsamplings = list()  # track subsample info definitions

        # Calculate extra deciles to add in based on raw reads
        if raw_reads > 0:
            subsampling_deciles = [
                round(decile * raw_rpc) for decile in np.arange(0.1, 1.1, 0.1)
            ]
        else:
            subsampling_deciles = []

        # All target depths: fixed targets plus the data-derived deciles.
        target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + subsampling_deciles

        # 'Raw' targets are expressed in raw reads, so the rate is scaled by
        # the mapped fraction; 'mapped' targets use the rate directly.
        for subsample_type, rpc_multiplier in [
            (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
            (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0)
        ]:
            # Generate subsampling definitions
            for target_rpc in target_rpcs:
                target_mapped_reads = int(
                    float(target_rpc) * float(n_cells) * rpc_multiplier)

                subsample_rate = tk_stats.robust_divide(target_mapped_reads,
                                                        mapped_reads)

                # Skip targets that exceed the available depth.
                if subsample_rate > 1.0:
                    continue

                subsamplings.append({
                    'subsample_type': subsample_type,
                    'target_rpc': target_rpc,
                    'subsample_rate': subsample_rate,
                    'all_target_rpc': target_rpcs,
                })

        # Each chunk needs to store the entire gene-bc matrix and a piece of
        # the mol info h5
        matrix_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
            barcode_whitelist, gem_groups)
        chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
        chunk_mem_gb = matrix_mem_gb + MoleculeCounter.estimate_mem_gb(chunk_len)
        join_mem_gb = matrix_mem_gb

    # Split the molecule info h5 into equi-RAM chunks, one set per
    # subsampling definition.
    chunks = []
    for subsample_info in subsamplings:
        for chunk_start in xrange(0, n_molecule_info_entries, chunk_len):
            chunks.append({
                'chunk_start': str(chunk_start),
                # Final slice is truncated to the remaining rows.
                'chunk_len': str(
                    min(n_molecule_info_entries - chunk_start, chunk_len)),
                'subsample_info': subsample_info,
                '__mem_gb': chunk_mem_gb,
            })

    join = {
        '__mem_gb': join_mem_gb,
    }

    # Guarantee at least one chunk (e.g. when every rate exceeded 1.0).
    # NOTE(review): this fallback chunk has no '__mem_gb' — presumably the
    # framework default applies; confirm.
    if len(chunks) == 0:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': {},
        })

    return {'chunks': chunks, 'join': join}