def split(args):
    # Estimate the total number of rows in the final molecule info. Worst case.
    total_reads = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                'total_reads')
    mol_info_rows = total_reads

    # Memory for chunk
    if len(args.inputs) > 0:
        avg_rows_per_chunk = int(total_reads / len(args.inputs))
        avg_chunk_mem_gb = int(math.ceil((32 * avg_rows_per_chunk) / 2.5e8))
        chunk_mem_gb = min(MAX_MEM_GB, max(8, avg_chunk_mem_gb))
    else:
        chunk_mem_gb = 1

    # Memory for concatenating molecule info
    # N = total number of rows
    # 8*N bytes to store the sort indices
    # (8+8+8)*N bytes to load, concatenate, and index into a 64-bit data column
    mol_info_mem_gb = int(math.ceil((32 * mol_info_rows) / 2.5e8))
    join_mem_gb = min(MAX_MEM_GB, max(4, mol_info_mem_gb))

    chunks = []
    for chunk_input in args.inputs:
        chunks.append({
            'chunk_input': chunk_input,
            '__mem_gb': chunk_mem_gb,
        })

    join = {
        '__mem_gb': join_mem_gb,
    }

    return {'chunks': chunks, 'join': join}

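# A minimal, self-contained sketch (not part of the stage code) of how the
# 32-bytes-per-row estimate above turns into a per-chunk memory request.
# The input numbers are hypothetical and MAX_MEM_GB is assumed to be 64 here.
def _example_chunk_mem_request(total_reads=4e8, n_chunks=8, max_mem_gb=64):
    import math
    avg_rows_per_chunk = int(total_reads / n_chunks)                      # 50M rows per chunk
    avg_chunk_mem_gb = int(math.ceil((32 * avg_rows_per_chunk) / 2.5e8))  # ceil(6.4) = 7
    return min(max_mem_gb, max(8, avg_chunk_mem_gb))                      # 8 GB floor applies -> 8
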
def split(args):
    chunk_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    whitelist_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Estimate the total number of rows in the final molecule info. Worst case.
    total_reads = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                'total_reads')
    mol_info_rows = total_reads

    # Memory for sorting in MoleculeCounter.concatenate_sort:
    # N = total number of rows
    # 8*N bytes to store the sort indices
    # (8+8+8)*N bytes to load, concatenate, and index into a 64-bit data column
    mol_info_mem_gb = int(math.ceil((32 * mol_info_rows) / 1e9))

    join_mem_gb = min(
        MAX_MEM_GB,
        max(cr_constants.MIN_MEM_GB, whitelist_mem_gb + mol_info_mem_gb))

    chunks = []
    for chunk_input in args.inputs:
        chunks.append({
            'chunk_input': chunk_input,
            '__mem_gb': chunk_mem_gb,
        })

    join = {
        '__mem_gb': join_mem_gb,
    }

    return {'chunks': chunks, 'join': join}

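# Illustrative sketch (hypothetical numbers, not stage code): this variant of
# the join request adds the whitelist memory to the concatenate/sort estimate
# and clamps the sum. The MIN_MEM_GB/MAX_MEM_GB values below are assumptions.
def _example_join_mem_request(total_reads=4e8, whitelist_mem_gb=6,
                              min_mem_gb=1, max_mem_gb=64):
    import math
    mol_info_mem_gb = int(math.ceil((32 * total_reads) / 1e9))  # ceil(12.8) = 13
    return min(max_mem_gb,
               max(min_mem_gb, whitelist_mem_gb + mol_info_mem_gb))  # 6 + 13 = 19 GB
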
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold, confidence = call_cell_barcodes(
            args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = map(
                    lambda kv: kv[0],
                    sorted(gg_bc_support.items(),
                           key=lambda kv: kv[1],
                           reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.iteritems():
                bc_support[bc] += count

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(
        args.extract_reads_summary, 'total_read_pairs')

    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)

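# Sketch of the --force-cells override used above (hypothetical data, not stage
# code): barcodes are ranked by read-pair support, descending, and the top N are
# kept regardless of the automatically called thresholds.
def _example_force_cells(bc_support, force_cells):
    sorted_bcs = [bc for bc, _ in sorted(bc_support.items(),
                                         key=lambda kv: kv[1],
                                         reverse=True)]
    return sorted_bcs[:min(len(sorted_bcs), force_cells)]

# _example_force_cells({'AAAC-1': 120, 'AAAG-1': 80, 'AAAT-1': 5}, 2)
# -> ['AAAC-1', 'AAAG-1']
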
def main(args, outs):
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(
            all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([
            cr_utils.format_barcode_seq(seq, gem_group)
            for seq in barcode_dist.keys()
        ])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = map(
                    lambda kv: kv[0],
                    sorted(gg_bc_support.items(),
                           key=lambda kv: kv[1],
                           reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(
        args.extract_reads_summary, 'total_read_pairs')

    # Load the assembly metrics summary to get the total assemblable reads
    assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
        args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
    assemblable_read_pairs = sum(
        assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, assemblable_read_pairs,
                                    recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)

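# Sketch (hypothetical values, not stage code) of how assemblable read pairs
# are totalled over the called cell barcodes above: barcodes missing from the
# per-barcode metric simply contribute zero.
def _example_assemblable_read_pairs(assemblable_read_pairs_by_bc, cell_barcodes):
    return sum(assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

# _example_assemblable_read_pairs({'AAAC-1': 1000, 'AAAG-1': 400}, {'AAAC-1', 'TTTT-1'})
# -> 1000
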
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(
                filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, 'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.
    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    print chain_count

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [
            ct for ct, count in chain_count.iteritems()
            if tk_stats.robust_divide(count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)

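# Sketch of the chain-type autodetection rule above (hypothetical counts, not
# stage code): a chain type is "significant" if it accounts for at least
# MIN_CHAIN_TYPE_CONTIG_FRAC of high-confidence productive cell contigs, and
# only when exactly one chain type is significant does it become the filter.
# The 0.05 threshold and the 'ALL' fallback below are placeholder assumptions.
def _example_autodetect_chain(chain_count, min_frac=0.05, all_chains='ALL'):
    n_contigs = sum(chain_count.values())
    if n_contigs == 0:
        return all_chains
    sig_chains = [ct for ct, count in chain_count.items()
                  if float(count) / n_contigs >= min_frac]
    return sig_chains[0] if len(sig_chains) == 1 else all_chains

# _example_autodetect_chain({'TR': 980, 'IG': 3}) -> 'TR'   (IG is only ~0.3%)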