def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support, restricted to one gem group.

    Args:
      umi_info_path (str) - path to umi info h5
      gem_group (int) - gem group whose UMI rows are passed to the caller

    Returns:
      (bc_support, cell_bcs, rt, ut, confidence) where
      bc_support = dict of { barcode: umi_count },
      cell_bcs = list(str) of barcodes whose umi_count >= ut,
      rt = read pair per umi threshold used,
      ut = umi threshold,
      confidence = last value returned by vdj_stats.call_vdj_cells,
                   passed through unchanged
    """
    barcode_idx_col = vdj_umi_info.get_column(umi_info_path, 'barcode_idx')
    barcode_seqs = vdj_umi_info.get_column(umi_info_path, 'barcodes')

    # Gem group of every barcode sequence, in the same order as barcode_seqs.
    gem_groups = np.array(
        [int(cr_utils.split_barcode_seq(seq)[1]) for seq in barcode_seqs])

    # Barcode-level mask expanded to one entry per UMI row by indexing it
    # with each row's barcode index.
    row_in_gem_group = (gem_groups == gem_group)[barcode_idx_col]

    reads_col = vdj_umi_info.get_column(umi_info_path, 'reads')

    call_result = vdj_stats.call_vdj_cells(
        umi_barcode_idx=barcode_idx_col[row_in_gem_group],
        umi_read_pairs=reads_col[row_in_gem_group],
        barcodes=barcode_seqs,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )
    rpu_threshold, umi_threshold, bc_support, confidence = call_result

    # A barcode is called a cell when its UMI count reaches the threshold.
    cell_bcs = []
    for barcode, umi_count in bc_support.iteritems():
        if umi_count >= umi_threshold:
            cell_bcs.append(barcode)

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support, restricted to one gem group.

    Rows of the umi info are grouped into consecutive runs sharing a barcode
    index; within each run, reads of rows with the same UMI index are summed
    so that every (barcode run, UMI) contributes a single read-pair count.

    Args:
      umi_info_path (str) - path to umi info h5
      gem_group (int) - gem group whose barcodes are considered

    Returns:
      (bc_support, cell_bcs, rt, ut, confidence) where
      bc_support = dict of { barcode: umi_count },
      cell_bcs = list(str) of barcodes whose umi_count >= ut,
      rt = read pair per umi threshold used,
      ut = umi threshold,
      confidence = last value returned by vdj_stats.call_vdj_cells,
                   passed through unchanged
    """
    barcode_seqs = vdj_umi_info.get_column(umi_info_path, 'barcodes')

    # Barcode-level mask: True where the barcode belongs to gem_group.
    gem_groups = np.array(
        [int(cr_utils.split_barcode_seq(seq)[1]) for seq in barcode_seqs])
    barcode_in_gem_group = gem_groups == gem_group

    umi_info = vdj_umi_info.read_umi_info(umi_info_path)
    rows = itertools.izip(umi_info['barcode_idx'],
                          umi_info['umi_idx'],
                          umi_info['reads'])

    kept_barcode_idx = []
    kept_read_pairs = []
    for barcode_idx, run in itertools.groupby(rows, key=lambda row: row[0]):
        # Skip whole runs belonging to barcodes from other gem groups.
        if not barcode_in_gem_group[barcode_idx]:
            continue

        # Total reads per UMI index within this barcode run.
        reads_by_umi = {}
        for _, umi_idx, reads in run:
            running_total = reads_by_umi.get(umi_idx, 0)
            reads_by_umi[umi_idx] = running_total + reads

        # One output entry per distinct UMI, each tagged with the barcode.
        kept_barcode_idx.extend(
            itertools.repeat(barcode_idx, len(reads_by_umi)))
        kept_read_pairs.extend(reads_by_umi.itervalues())

    barcode_idx_array = np.array(
        kept_barcode_idx, dtype=vdj_umi_info.get_dtype('barcode_idx'))
    read_pairs_array = np.array(
        kept_read_pairs, dtype=vdj_umi_info.get_dtype('reads'))

    call_result = vdj_stats.call_vdj_cells(
        umi_barcode_idx=barcode_idx_array,
        umi_read_pairs=read_pairs_array,
        barcodes=barcode_seqs,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )
    rpu_threshold, umi_threshold, bc_support, confidence = call_result

    # A barcode is called a cell when its UMI count reaches the threshold.
    cell_bcs = []
    for barcode, umi_count in bc_support.iteritems():
        if umi_count >= umi_threshold:
            cell_bcs.append(barcode)

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
def _make_chunk(umi_info_path, gem_group, start_row, end_row):
    """ Build one chunk definition covering umi info rows start_row up to
    (not including) end_row for a single gem group.

    The memory request is twice the rounded-up estimate reported by
    vdj_umi_info.get_mem_gb for that row range, and never less than
    cr_constants.MIN_MEM_GB.
    """
    estimated_gb = vdj_umi_info.get_mem_gb(
        umi_info_path, start_row=start_row, end_row=end_row)
    mem_gb = max(cr_constants.MIN_MEM_GB, 2 * int(np.ceil(estimated_gb)))
    return {
        'gem_group': gem_group,
        'start_row': start_row,
        'end_row': end_row,
        '__mem_gb': mem_gb,
    }


def split(args):
    """ Chunk the UMI info HDF5 file by gem group.

    Walks the 'barcode_idx' column in row order and starts a new chunk at
    every row where the gem group of the row's barcode differs from the gem
    group of the previous barcode.

    Args:
      args - stage arguments; reads args.umi_info (path to the umi info h5)
             and, only when the umi info has no rows, args.gem_groups

    Returns:
      dict {'chunks': [...]} where each chunk has the keys 'gem_group',
      'start_row', 'end_row' and '__mem_gb'. At least one chunk is always
      returned.
    """
    num_entries = vdj_umi_info.get_num_rows(args.umi_info)
    if num_entries > 1e9:
        # Parenthesised single-argument form prints the same text under the
        # Python 2 print statement and the print function.
        print('Warning: There are >1e9 entries in the umi_info - this could potentially cause an out-of-memory error.')

    # This will cause an OOM if there are >1.5e9 UMIs
    barcode_indices = vdj_umi_info.get_column(args.umi_info, 'barcode_idx')
    barcodes = vdj_umi_info.get_column(args.umi_info, 'barcodes')

    chunks = []
    start_row = 0
    prev_gem_group = None
    prev_barcode_idx = None

    for row, barcode_idx in enumerate(barcode_indices):
        # Rows of the same barcode share a gem group; only parse the barcode
        # string when the barcode index changes.
        if barcode_idx == prev_barcode_idx:
            continue

        _, gem_group = cr_utils.split_barcode_seq(barcodes[barcode_idx])

        if prev_gem_group is not None and gem_group != prev_gem_group:
            # The previous gem group ends just before this row.
            chunks.append(
                _make_chunk(args.umi_info, prev_gem_group, start_row, row))
            start_row = row

        prev_gem_group = gem_group
        prev_barcode_idx = barcode_idx

    # Handle case where umi info is empty by supplying a dummy gem group
    if prev_gem_group is None:
        prev_gem_group = args.gem_groups[0]

    # Final chunk runs to the end of the file; the row count was already
    # read above, so it is reused instead of querying the file again.
    chunks.append(
        _make_chunk(args.umi_info, prev_gem_group, start_row, num_entries))

    return {'chunks': chunks}