def main(args, outs):
    # Handle the dummy chunk
    if args.contigs is None:
        outs.chunked_reporter = None
        return

    # Calculate metrics on assigned contigs
    reporter = vdj_report.VdjReporter()

    bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_names = set(args.contigs)
    contigs = [contig for contig in bam.header['SQ']
               if contig['SN'] in contig_names]

    for contig in contigs:
        contig_name = contig['SN']

        # Fetch indexed portion of BAM.
        read_iter = bam.fetch(str(contig_name))

        reporter.contig_mapping_frac_statistics_cb(
            read_iter, contig['LN'],
            cr_chem.get_strandedness(args.chemistry_def))

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Write compressed files
    outs.read1s += cr_constants.LZ4_SUFFIX
    outs.read2s += cr_constants.LZ4_SUFFIX

    cutadapt_out = os.path.join(os.path.dirname(outs.chunked_reporter),
                                'cutadapt_stdout')

    with open(cutadapt_out, 'w') as cut_stdout:
        status = run_cutadapt(args, outs.read1s, outs.read2s,
                              args.chemistry_def, cut_stdout)

    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(
            primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(reporter, cutadapt_out, paired_end)
        reporter.save(outs.chunked_reporter)
def get_constants_for_pipeline(pipeline):
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics, alarms, charts = (ws_vdj_constants.METRICS,
                                   ws_vdj_constants.METRIC_ALARMS,
                                   ws_vdj_constants.CHARTS)
        metric_prefixes = vdj_report.VdjReporter().get_all_prefixes()
    else:
        metrics, alarms, charts = (ws_gex_constants.METRICS,
                                   ws_gex_constants.METRIC_ALARMS,
                                   ws_gex_constants.CHARTS)
        metric_prefixes = cr_report.Reporter().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']
    bc_gg = [str(cr_utils.split_barcode_seq(bc)[1]) for bc in barcodes]

    # Compute N50 read pairs per UMI for this gem group
    umi_read_pairs = []
    total_read_pairs = {}
    chain_bad_read_pairs = {}

    for bc_idx, data_iter in itertools.groupby(
            itertools.izip(umi_info['barcode_idx'], umi_info['umi_idx'],
                           umi_info['chain_idx'], umi_info['reads']),
            key=lambda x: x[0]):
        bc_umi_read_pairs = {}

        for _, umi, chain_idx, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads

            chain = chains[chain_idx]
            total_read_pairs[chain] = total_read_pairs.get(chain, 0) + reads
            total_read_pairs[cr_constants.MULTI_REFS_PREFIX] = \
                total_read_pairs.get(cr_constants.MULTI_REFS_PREFIX, 0) + reads

            if reads < args.min_readpairs_per_umi[bc_gg[bc_idx]]:
                chain_bad_read_pairs[chain] = \
                    chain_bad_read_pairs.get(chain, 0) + reads
                chain_bad_read_pairs[cr_constants.MULTI_REFS_PREFIX] = \
                    chain_bad_read_pairs.get(cr_constants.MULTI_REFS_PREFIX, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_read_pairs.append(r)

    rppu_n50 = tk_stats.NX(umi_read_pairs, 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    # Report bad read-pairs/umi
    for chain in reporter.vdj_genes:
        bad_count = chain_bad_read_pairs.get(chain, 0)
        total_count = total_read_pairs.get(chain, 0)
        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  chain).set_value(bad_count, total_count)

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
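
# --- Illustrative sketch (not part of the pipeline) ---
# Several stages in this file lean on tk_stats.NX for their N50 metrics.
# As a reference point, here is a minimal sketch of the NX statistic
# (N50 when x = 0.5): the largest value v such that the elements >= v
# account for at least a fraction x of the total. This is an illustrative
# stand-in, not the tenkit implementation.
def nx_sketch(values, x):
    """Return the NX of `values`, or None if the input is empty."""
    if len(values) == 0:
        return None
    sorted_vals = sorted(values, reverse=True)
    total = sum(sorted_vals)
    cumsum = 0
    for v in sorted_vals:
        cumsum += v
        if cumsum >= x * total:
            return v

# Example: nx_sketch([10, 8, 4, 2, 1], 0.5) == 8, because 10 + 8 = 18
# is the first running total to cover half of the grand total (25).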
def main(args, outs):
    status = run_cutadapt(args, outs.read1s, outs.read2s)

    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(
            primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(
            reporter,
            os.path.join(os.path.dirname(outs.chunked_reporter), '..',
                         '_stdout'))
        reporter.save(outs.chunked_reporter)
def get_constants_for_pipeline(pipeline, sample_properties):
    """Get the appropriate metrics/alarms/charts for a pipeline."""
    if pipeline == shared_constants.PIPELINE_VDJ:
        metrics, alarms, charts = (ws_vdj_constants.METRICS,
                                   ws_vdj_constants.METRIC_ALARMS,
                                   ws_vdj_constants.CHARTS)
        metric_prefixes = filter_vdj_prefixes(
            vdj_report.VdjReporter().get_all_prefixes(), sample_properties)
        alarms = filter_vdj_alarms(alarms, sample_properties)
    else:
        metrics, alarms, charts = (ws_gex_constants.METRICS,
                                   ws_gex_constants.METRIC_ALARMS,
                                   ws_gex_constants.CHARTS)
        metric_prefixes = cr_report.Reporter().get_all_prefixes()

    return metrics, alarms, charts, metric_prefixes
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # Compute N50 read pairs per UMI for this gem group
    rppu_n50 = tk_stats.NX(umi_info['reads'], 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, rpu_threshold, umi_threshold, confidence = \
            call_cell_barcodes(args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds
        reporter._get_metric_attr('vdj_filter_bcs_rpu_threshold',
                                  gem_group).set_value(rpu_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_umi_threshold',
                                  gem_group).set_value(umi_threshold)
        reporter._get_metric_attr('vdj_filter_bcs_confidence',
                                  gem_group).set_value(confidence)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = map(lambda kv: kv[0],
                                 sorted(gg_bc_support.items(),
                                        key=lambda kv: kv[1], reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            # Update set of BCs called as cells
            cell_barcodes.update(set(gg_cell_bcs))

            # Sum BC support
            for bc, count in gg_bc_support.iteritems():
                bc_support[bc] += count

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                     'total_read_pairs')

    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    with open(args.contig_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    contigs.sort(key=lambda c: (c.barcode, c.get_single_chain(),
                                not c.productive, -c.umi_count,
                                -c.read_count, -len(c)))

    low_confidence_contigs = set()
    cell_contigs = set()

    for (bc, chain), group in itertools.groupby(
            contigs, key=lambda c: (c.barcode, c.get_single_chain())):
        first_cdr3 = None
        first_cdr3_umis = None
        seen_cdr3s = set()

        for contig in group:
            contig.high_confidence = True

            if contig.is_cell:
                cell_contigs.add(contig.contig_name)

            if first_cdr3 is None:
                first_cdr3 = contig.cdr3_seq
                first_cdr3_umis = contig.umi_count

            # Mark as low confidence:
            # 1) Any additional CDR3s beyond the highest-(productive, UMI, read, length)
            #    contig's CDR3, with a single UMI or low UMIs relative to the first contig, or
            extraneous_cdr3 = first_cdr3 is not None \
                and contig.cdr3_seq != first_cdr3 \
                and (contig.umi_count == 1 or
                     (float(contig.umi_count) / first_cdr3_umis) < EXTRA_CONTIG_MIN_UMI_RATIO)

            # 2) Any contigs with a repeated CDR3.
            repeat_cdr3 = contig.cdr3_seq in seen_cdr3s

            if extraneous_cdr3 or repeat_cdr3:
                contig.high_confidence = False
                low_confidence_contigs.add(contig.contig_name)

            seen_cdr3s.add(contig.cdr3_seq)

            if chain in vdj_constants.VDJ_GENES:
                reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                          chain).add(1, filter=contig.high_confidence)
            reporter._get_metric_attr('vdj_high_conf_prod_contig_frac',
                                      cr_constants.MULTI_REFS_PREFIX).add(
                                          1, filter=contig.high_confidence)

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as f:
        vdj_annot.save_annotation_list_json(f, contigs)

    # Write filtered fasta
    with open(args.contig_fasta) as in_file, \
         open(outs.filtered_contig_fasta, 'w') as out_file:
        for hdr, seq in cr_utils.get_fasta_iter(in_file):
            # Keep contigs that are high confidence & in cells
            if hdr not in low_confidence_contigs and hdr in cell_contigs:
                tk_fasta.write_read_fasta(out_file, hdr, seq)

    # Write filtered fastq
    with open(args.contig_fastq) as in_file, \
         open(outs.filtered_contig_fastq, 'w') as out_file:
        for name, seq, qual in tk_fasta.read_generator_fastq(in_file):
            if name not in low_confidence_contigs and name in cell_contigs:
                tk_fasta.write_read_fastq(out_file, name, seq, qual)

    reporter.report_summary_json(outs.summary)
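
# --- Illustrative sketch (not part of the pipeline) ---
# The filter above depends on the contig sort order: within each
# (barcode, chain) group, the "first" contig is the most trusted one,
# since `not c.productive` puts productive contigs first and the negated
# counts sort descending. A miniature demonstration, using a hypothetical
# namedtuple in place of the real AnnotatedContig:
from collections import namedtuple

Contig = namedtuple('Contig', ['productive', 'umi_count', 'read_count'])

contig_demo = [Contig(productive=False, umi_count=9, read_count=100),
               Contig(productive=True, umi_count=3, read_count=40),
               Contig(productive=True, umi_count=5, read_count=10)]
contig_demo.sort(key=lambda c: (not c.productive, -c.umi_count, -c.read_count))
# Result: the productive 5-UMI contig first, then the productive 3-UMI one,
# and the non-productive contig last despite its 9 UMIs.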
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()

    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    # Get the clonotype-barcode assignments
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    # Partition contig annotations by consensus id
    consensus_to_contigs = defaultdict(list)
    relevant_contig_ids = set()

    with open(args.chunk_annotations) as f:
        contigs = vdj_annot.load_contig_list_from_json(f, args.vdj_reference_path)

    clo_key = '%s_clonotype_id' % args.metric_prefix
    cons_key = '%s_consensus_id' % args.metric_prefix

    for contig in contigs:
        clo_id = contig.info_dict.get(clo_key)
        cons_id = contig.info_dict.get(cons_key)
        assert clo_id in chunk_clonotypes and cons_id is not None

        consensus_to_contigs[cons_id].append(contig)
        relevant_contig_ids.add(contig.contig_name)

    assert len(consensus_to_contigs) > 0

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    n_merged_bams = 0

    # For all contigs relevant to this chunk,
    # get the assembler umi data required for base qual recalculation.
    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5].lower() == 'true'
            contig_ids = set(fields[6].split(','))
            if good_umi and len(contig_ids & relevant_contig_ids) > 0:
                for c in contig_ids:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Verify that the contig annotation data are consistent with
            # the clonotype assignment data
            assert set(consensus['cell_contigs']) == \
                set(c.contig_name for c in consensus_to_contigs[consensus_id])
            sel_contigs = consensus_to_contigs[consensus_id]
            sel_contig_ids = [c.contig_name for c in sel_contigs]

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in sel_contigs:
                for anno in contig.annotations:
                    feature_annotations[anno.feature.region_type].add(
                        anno.feature.gene_name)

                # Always choose a productive over a non-productive. Between
                # contigs with the same productivity, choose the one that had more UMIs.
                if best_contig is None or \
                   (not best_contig.productive and contig.productive) or \
                   (best_contig.productive == contig.productive and
                    best_contig.umi_count < contig.umi_count):
                    best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v])
                                 for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v])
                                 for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac', args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_io.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contig_ids,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # Make sure the bam file has the right header
            # (a single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            if list(tmp_bam.references) != [consensus_id]:
                # Print some info to help us debug
                print tmp_bam.references, consensus_id
                assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we
                # have to compute the quals from scratch.
                # Use a subset of the contigs for computing quals.
                contig_ids = map(lambda c: c.contig_name,
                                 sorted(sel_contigs,
                                        key=lambda c: c.umi_count,
                                        reverse=True))
                contig_ids = contig_ids[0:MAX_CELLS_FOR_BASE_QUALS]
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta, contig_ids,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = sum([c.read_count for c in sel_contigs])
            total_umi_count = sum([c.umi_count for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contig_ids,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use the "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id, consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output,
                # but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                tk_subproc.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_io.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                           rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

            # Clean up unneeded files ASAP
            rm_files([consensus_id + '_contigs.fasta',
                      consensus_id + '_contigs.fastq'])

            # Merge N most recent BAM files to avoid filesystem overload
            if len(outs.chunked_consensus_bams) >= MERGE_BAMS_EVERY:
                assert len(outs.chunked_consensus_bams) == \
                    len(outs.chunked_concat_ref_bams)

                new_cons_bam = martian.make_path('merged-consensus-%03d.bam' %
                                                 n_merged_bams)
                concatenate_bams(new_cons_bam, outs.chunked_consensus_bams)
                rm_files(outs.chunked_consensus_bams)
                outs.chunked_consensus_bams = [new_cons_bam]

                new_ref_bam = martian.make_path('merged-ref-%03d.bam' %
                                                n_merged_bams)
                concatenate_bams(new_ref_bam, outs.chunked_concat_ref_bams)
                rm_files(outs.chunked_concat_ref_bams)
                outs.chunked_concat_ref_bams = [new_ref_bam]

                n_merged_bams += 1

    in_bam.close()
    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
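
# --- Illustrative sketch (not part of the pipeline) ---
# concatenate_bams() is called above but not shown. One plausible
# implementation, assumed here, wraps `samtools cat` via pysam, which
# appends alignment blocks from BAMs with compatible headers without
# re-sorting. The function name and approach are illustrative only.
import pysam

def concatenate_bams_sketch(out_bam, in_bams):
    """Concatenate `in_bams` (compatible headers assumed) into `out_bam`."""
    pysam.cat('-o', out_bam, *in_bams)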
def main(args, outs):
    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    cell_barcodes = set()
    bc_support = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    if args.recovered_cells:
        recovered_cells = args.recovered_cells
    else:
        recovered_cells = cr_constants.DEFAULT_TOP_BARCODE_CUTOFF * len(all_gem_groups)

    for gem_group in all_gem_groups:
        if barcode_whitelist is None:
            break

        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold',
            gem_group).set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                sorted_bcs = map(lambda kv: kv[0],
                                 sorted(gg_bc_support.items(),
                                        key=lambda kv: kv[1], reverse=True))
                gg_cell_bcs = sorted_bcs[:min(len(sorted_bcs), args.force_cells)]

            cell_barcodes.update(set(gg_cell_bcs))
            bc_support.update(gg_bc_support)

    # Load the extract_reads summary to get the total raw reads
    total_read_pairs = cr_utils.get_metric_from_json(args.extract_reads_summary,
                                                     'total_read_pairs')

    # Load the assembly metrics summary to get the total assemblable reads
    assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
        args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
    assemblable_read_pairs = sum(
        assemblable_read_pairs_by_bc.get(bc, 0) for bc in cell_barcodes)

    reporter.vdj_filter_barcodes_cb(cell_barcodes, barcodes, counts,
                                    total_read_pairs, assemblable_read_pairs,
                                    recovered_cells)

    save_cell_barcodes_json(cell_barcodes, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as f:
        f.write('barcode,count\n')
        for k, v in bc_support.iteritems():
            f.write('%s,%d\n' % (k, v))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, cell_barcodes)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    reporter = vdj_report.VdjReporter(vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)
    fq_prefix = re.sub('_1.fastq', '', args.read1_chunk)

    # The filtering code will write this bam. Then we'll read it,
    # correct the UMIs, and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    run_read_match(fq_prefix, ref_fasta, filter_bam, args.chemistry_def,
                   args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = tk_bam.create_bam_infile(filter_bam)
    bam2 = tk_bam.create_bam_infile(filter_bam)
    bc_iter1 = get_bc_grouped_pair_iter(bam1)
    bc_iter2 = get_bc_grouped_pair_iter(bam2)

    reads_per_bc = open(outs.reads_per_bc, 'w')

    if args.output_fastqs:
        out_fastq1 = open(outs.barcode_chunked_read1, 'w')
        out_fastq2 = open(outs.barcode_chunked_read2, 'w')
        out_bam = None
    else:
        out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                               None, None, template=bam1)
        out_fastq1 = None
        out_fastq2 = None

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            if is_mapped(read1, read2):
                umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        write_barcode_fastq(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand,
                            out_bam, out_fastq1, out_fastq2)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()

    if args.output_fastqs:
        out_fastq1.close()
        out_fastq2.close()
    else:
        out_bam.close()

    # Close to flush buffered per-barcode read counts
    reads_per_bc.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc,
                 open(outs.chunked_gene_umi_counts, 'w'))

    reporter.save(outs.chunked_reporter)
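
# --- Illustrative sketch (not part of the pipeline) ---
# correct_umis() is called above but not shown. A common approach, and the
# assumption made in this sketch, is single-mismatch UMI correction: each
# UMI is folded into a Hamming-distance-1 neighbor with a strictly higher
# read count, which collapses likely sequencing errors onto the true UMI.
def correct_umis_sketch(umi_counts):
    """Map each raw UMI to its corrected sequence (possibly itself)."""
    corrected = {}
    for umi, count in umi_counts.iteritems():
        best, best_count = umi, count
        # Try all single-base substitutions of this UMI
        for i in xrange(len(umi)):
            for base in 'ACGT':
                if base == umi[i]:
                    continue
                neighbor = umi[:i] + base + umi[i + 1:]
                if umi_counts.get(neighbor, 0) > best_count:
                    best, best_count = neighbor, umi_counts[neighbor]
        corrected[umi] = best
    return corrected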
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = open(args.read1_chunk)
    in_read2_fastq = open(args.read2_chunk)
    out_read1_fastq = open(outs.corrected_read1s, 'w')
    out_read2_fastq = open(outs.corrected_read2s, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))

    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)
                read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)
                read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                     processed_bc)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        tk_fasta.write_read_fastq(out_read1_fastq, read1_header.to_string(),
                                  read1[1], read1[2])
        tk_fasta.write_read_fastq(out_read2_fastq, read2_header.to_string(),
                                  read2[1], read2[2])

    in_read1_fastq.close()
    in_read2_fastq.close()
    out_read1_fastq.close()
    out_read2_fastq.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
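
# --- Illustrative sketch (not part of the pipeline) ---
# cr_stats.correct_bc_error() is used above as a black box. The usual idea,
# sketched here under that assumption, is posterior-probability barcode
# correction: consider every candidate within Hamming distance 1 of the
# (off-whitelist) observed barcode, weight it by its observed abundance
# (the prior from barcode_dist) times the probability that the differing
# base was a sequencing error (from its Phred quality), and accept the best
# candidate only if its normalized posterior clears the confidence
# threshold. Names and details are illustrative.
def correct_bc_error_sketch(confidence_threshold, raw_bc, bc_qual, barcode_dist):
    """Return the corrected barcode, or None if no confident correction."""
    likelihoods = {}
    for i, (obs_base, qual_char) in enumerate(zip(raw_bc, bc_qual)):
        p_err = 10.0 ** -((ord(qual_char) - 33) / 10.0)  # Phred+33
        for base in 'ACGT':
            if base == obs_base:
                continue
            candidate = raw_bc[:i] + base + raw_bc[i + 1:]
            if candidate in barcode_dist:
                # prior (abundance) x error probability at the mismatch
                likelihoods[candidate] = barcode_dist[candidate] * p_err
    total = sum(likelihoods.values())
    if total == 0:
        return None
    best_bc, best_lk = max(likelihoods.items(), key=lambda kv: kv[1])
    return best_bc if best_lk / total >= confidence_threshold else None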
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = collections.defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0, index_col=None, sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(
                filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig
    for annotation in iter(json.load(open(args.annotations))):
        contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0, index_col=None, sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary, header=0, index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode
    for contig_hdr, contig_seq in cr_utils.get_fasta_iter(open(args.contigs)):
        contig_name = contig_hdr.split(' ')[0]
        if filter_summary is not None and not vdj_utils.is_contig_filtered(
                filter_summary, contig_name):
            continue

        barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
        barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations, reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        total_read_pairs = cr_utils.get_metric_from_json(args.reads_summary,
                                                         'total_read_pairs')

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type.
    # Find all chains with a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.
    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    print chain_count

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [ct for ct, count in chain_count.iteritems()
                      if tk_stats.robust_divide(
                          count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
def main(args, outs):
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}
    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full-length contigs
    # with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []

            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either from the contig itself or from other
                # full-length contigs that had this CDR3), then add it to
                # the clonotype tuple.
                if cl_seq in sequences:
                    # This will rescue contigs that have a chain and CDR3
                    # assigned but aren't full length.
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)

        barcode_clonotype = tuple(
            sorted(list(set([sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {clonotype_id: clonotype_tuple
                     for clonotype_tuple, clonotype_id in clonotypes.iteritems()}

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw', cell_barcodes,
                                                 clonotype_ids, sequence_ids,
                                                 barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file, pretty=True)

    # Add clonotype assignments to contig annotations
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file, all_contigs,
                                       write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file,
                     protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file, filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode.
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)
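
# --- Illustrative sketch (not part of the pipeline) ---
# The setdefault-based interning above maps each distinct CDR3 string (and,
# by the same pattern, each distinct clonotype tuple) to a stable integer
# id in first-seen order. In miniature, with made-up sequences:
seq_ids_demo = {}
for cdr_seq in ['TRA_TGTGCT', 'TRB_TGCAGT', 'TRA_TGTGCT']:
    seq_ids_demo.setdefault(cdr_seq, len(seq_ids_demo))
# seq_ids_demo == {'TRA_TGTGCT': 0, 'TRB_TGCAGT': 1}; the repeated
# sequence reuses its existing id rather than minting a new one.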
def main(args, outs):
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

    chunk_clonotypes = set(args.chunk_clonotypes)

    reporter = vdj_report.VdjReporter()

    if not args.clonotype_assignments or not vdj_utils.bam_has_seqs(args.contig_bam):
        # always produce an empty summary
        reporter.save(outs.chunked_reporter)
        return

    with open(args.annotations) as f:
        contigs = cPickle.load(f)

    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)

    in_bam = tk_bam.create_bam_infile(args.contig_bam)

    contig_read_counts = {c.contig_name: c.read_count for c in contigs}
    contig_umi_counts = {c.contig_name: c.umi_count for c in contigs}

    # Do not attempt to read into a pandas object because it can be huge.
    contig_umis = defaultdict(set)
    with open(args.umi_summary_tsv, 'r') as umi_file:
        for line in umi_file:
            fields = line.strip().split('\t')
            umi = fields[2]
            if umi == 'umi' or len(fields) < 7:
                continue
            good_umi = fields[5] == 'True'
            contig_names = fields[6].split(',')
            if good_umi:
                for c in contig_names:
                    contig_umis[c].add(umi)

    consensus_fastq = open(outs.consensus_fastq, 'w')
    consensus_fasta = open(outs.consensus_fasta, 'w')
    ref_fasta = open(outs.concat_ref_fasta, 'w')

    consensus_contigs = []
    ref_contigs = []

    assert args.metric_prefix in reporter.vdj_clonotype_types

    # Iterate over clonotype assignments
    for clonotype_id, clonotype in clonotypes.iteritems():
        if clonotype_id not in chunk_clonotypes:
            continue

        for consensus_id, consensus in clonotype['consensuses'].iteritems():
            cdr = consensus['cdr3_seq']

            # Get the contigs that should be merged
            sel_contigs = set(consensus['cell_contigs'])

            # Keep track of the "best" contig. This will be used in case the
            # merging fails.
            best_contig = None

            # Keep track of the set of distinct annotations of the contigs to merge.
            # Will use to report rate of discrepancies.
            feature_annotations = defaultdict(set)

            for contig in contigs:
                if contig.contig_name in sel_contigs:
                    for anno in contig.annotations:
                        feature_annotations[anno.feature.region_type].add(
                            anno.feature.gene_name)

                    # Always choose a productive over a non-productive. Between
                    # contigs with the same productivity, choose the one that
                    # had more UMIs.
                    if best_contig is None or \
                       (not best_contig.productive and contig.productive) or \
                       (best_contig.productive == contig.productive and
                        len(contig_umis[best_contig.contig_name]) <
                        len(contig_umis[contig.contig_name])):
                        best_contig = contig

            assert best_contig is not None

            anno_count = np.max([len(feature_annotations[v])
                                 for v in VDJ_V_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_v_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            anno_count = np.max([len(feature_annotations[v])
                                 for v in VDJ_J_FEATURE_TYPES])
            metric = reporter._get_metric_attr(
                'vdj_clonotype_gt1_j_annotations_contig_frac', args.metric_prefix)
            metric.add(1, filter=anno_count > 1)

            # Order contigs by decreasing UMI support
            ordered_contigs = list(sorted(sel_contigs,
                                          key=lambda x: len(contig_umis[x]),
                                          reverse=True))
            ordered_contigs = ordered_contigs[0:min(MAX_CELLS_FOR_BASE_QUALS,
                                                    len(sel_contigs))]

            wrong_cdr_metric = reporter._get_metric_attr(
                'vdj_clonotype_consensus_wrong_cdr_contig_frac',
                args.metric_prefix)

            tmp_dir = martian.make_path(consensus_id + '_outs')
            cr_utils.mkdir(tmp_dir, allow_existing=True)

            res = get_consensus_seq(consensus_id, sel_contigs,
                                    best_contig.contig_name, tmp_dir, args)
            (best_seq, best_quals, consensus_seq, contig_to_cons_bam,
             contig_fastq, contig_fasta) = res

            outs.chunked_consensus_bams.append(contig_to_cons_bam)

            # Make sure the bam file has the right header
            # (a single sequence with this consensus name)
            tmp_bam = tk_bam.create_bam_infile(contig_to_cons_bam)
            assert list(tmp_bam.references) == [consensus_id]
            tmp_bam.close()

            if consensus_seq:
                # If this is not None, we actually built a consensus, so we
                # have to compute the quals from scratch.
                consensus_quals = get_consensus_quals(in_bam, consensus_id,
                                                      contig_fasta,
                                                      ordered_contigs,
                                                      contig_umis, tmp_dir)
            else:
                consensus_seq = best_seq
                consensus_quals = best_quals

            assert len(consensus_seq) == len(consensus_quals)

            total_read_count = np.sum([contig_read_counts[c] for c in sel_contigs])
            total_umi_count = np.sum([contig_umi_counts[c] for c in sel_contigs])

            contig_info_dict = {
                'cells': clonotype['barcodes'],
                'cell_contigs': sel_contigs,
                'clonotype_freq': clonotype['freq'],
                'clonotype_prop': clonotype['prop'],
            }

            contig = annotate_consensus_contig(args.vdj_reference_path,
                                               args.min_score_ratios,
                                               args.min_word_sizes,
                                               consensus_id, clonotype_id,
                                               consensus_seq, consensus_quals,
                                               read_count=total_read_count,
                                               umi_count=total_umi_count,
                                               info_dict=contig_info_dict,
                                               primers=args.primers)

            wrong_cdr_metric.add(1, filter=contig.cdr3_seq is None
                                 or contig.cdr3_seq != cdr)

            if contig.cdr3_seq is None or contig.cdr3_seq != cdr:
                # Something went wrong. Use the "best" contig as the consensus.
                consensus_seq = best_seq
                consensus_quals = best_quals
                contig = annotate_consensus_contig(args.vdj_reference_path,
                                                   args.min_score_ratios,
                                                   args.min_word_sizes,
                                                   consensus_id, clonotype_id,
                                                   consensus_seq,
                                                   consensus_quals,
                                                   read_count=total_read_count,
                                                   umi_count=total_umi_count,
                                                   info_dict=contig_info_dict,
                                                   primers=args.primers)

            assert contig.cdr3_seq is not None and contig.cdr3_seq == cdr

            consensus_contigs.append(contig)

            tk_fasta.write_read_fasta(consensus_fasta, consensus_id,
                                      consensus_seq)
            tk_fasta.write_read_fastq(consensus_fastq, consensus_id,
                                      consensus_seq, consensus_quals)
            assert len(consensus_seq) == len(consensus_quals)

            ref_seq_parts, ref_annos = contig.get_concat_reference_sequence()

            # Align the contigs and consensus to a synthetic concatenated reference
            if ref_seq_parts is not None:
                # Trim the last segment down to the annotated length
                # to avoid including the entire (500nt) C-region
                ref_seq_parts[-1] = ref_seq_parts[-1][0:ref_annos[-1].annotation_match_end]

                # Concatenate the reference VDJC segments
                ref_seq = reduce(lambda x, y: x + y, ref_seq_parts)
                ref_name = re.sub('consensus', 'concat_ref', consensus_id)

                # Reannotate the reference sequence.
                # Restrict the annotation to the already-called segments to
                # reduce the risk of discordance between the consensus and
                # concat_ref annotations.
                ref_contig = annotate_consensus_contig(
                    args.vdj_reference_path,
                    args.min_score_ratios,
                    args.min_word_sizes,
                    ref_name, clonotype_id,
                    ref_seq, 'I' * len(ref_seq),
                    use_features=set([a.feature.feature_id for a in ref_annos]),
                )
                ref_contigs.append(ref_contig)

                # Add the consensus sequence to the input FASTQ (next to the contigs)
                with open(contig_fastq, 'a') as contig_fq:
                    # Create a fake UMI and barcode
                    header = cr_fastq.AugmentedFastqHeader(consensus_id)
                    header.set_tag(PROCESSED_UMI_TAG, consensus_id)
                    header.set_tag(PROCESSED_BARCODE_TAG, consensus_id)
                    tk_fasta.write_read_fastq(contig_fq, header.to_string(),
                                              consensus_seq, consensus_quals)

                # Reuse this file (this had the assembly output,
                # but we don't need it anymore)
                ref_fasta_name = martian.make_path(consensus_id + '_contigs.fasta')
                with open(ref_fasta_name, 'w') as f:
                    tk_fasta.write_read_fasta(f, ref_name, ref_seq)

                # Also append to the final output
                tk_fasta.write_read_fasta(ref_fasta, ref_name, ref_seq)

                cmd = [
                    'vdj_asm', 'base-quals',
                    martian.make_path(consensus_id + '_contigs'),
                    tmp_dir,
                    '--single-end',
                    # use global alignment if a good seed isn't found -
                    # everything must get aligned
                    '--global',
                ]
                sys.stderr.write('Running ' + ' '.join(cmd) + '\n')

                subprocess.check_call(cmd, cwd=os.getcwd())

                # Move out of tmp dir
                rec_bam = martian.make_path(consensus_id + '_reference.bam')
                cr_utils.move(os.path.join(tmp_dir, consensus_id + '_contigs.bam'),
                              rec_bam)
                outs.chunked_concat_ref_bams.append(rec_bam)

            if os.path.isdir(tmp_dir):
                shutil.rmtree(tmp_dir)

    in_bam.close()
    consensus_fastq.close()
    consensus_fasta.close()
    ref_fasta.close()

    reporter.save(outs.chunked_reporter)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)
def main(args, outs):
    # Load barcode whitelist
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    else:
        barcode_whitelist = None

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    in_read2_fastq = cr_io.open_maybe_gzip(args.read2_chunk) \
        if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(
        tk_fasta.read_generator_fastq(in_read1_fastq),
        tk_fasta.read_generator_fastq(in_read2_fastq))

    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()
    bc_counter.close()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    reporter = vdj_report.VdjReporter(vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)
    paired_end = cr_chem.is_paired_end(args.chemistry_def)
    assert paired_end != (args.read2_chunk is None)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it,
    # correct the UMIs, and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    reads_per_bc = open(outs.reads_per_bc, 'w')

    out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                           None, None, template=bam1)

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam, paired_end)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Close to flush buffered per-barcode read counts
    reads_per_bc.close()

    # Write bc-gene-umi counts
    cPickle.dump(gene_umi_counts_per_bc,
                 open(outs.chunked_gene_umi_counts, 'w'))

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()
    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)

    # Compute initial within-barcode thresholds.
    # Assumes the fraction of noise-UMI reads is < some fraction (NX)
    barcode_nx = np.zeros(len(umi_info['barcodes']), dtype=int)

    # Assume grouped by barcode
    for bc, bc_reads in itertools.groupby(
            itertools.izip(umi_info['barcode_idx'], umi_info['reads']),
            key=lambda x: x[0]):
        bc_reads_arr = np.fromiter((reads for bc, reads in bc_reads),
                                   umi_info['reads'].dtype)
        barcode_nx[bc] = tk_stats.NX(bc_reads_arr, args.intra_barcode_nx)

    # Filter out UMIs below the within-BC threshold (in-place)
    top_in_bc = umi_info['reads'] >= barcode_nx[umi_info['barcode_idx']]
    for col in vdj_umi_info.UMI_INFO_COLS.iterkeys():
        umi_info[col] = np.compress(top_in_bc, umi_info[col])

    # Compute N50 read pairs per UMI for this gem group
    # and use it to subsample to the target N50.
    rppu_n50 = tk_stats.NX(umi_info['reads'], 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    if rppu_n50 == 0:
        subsample_rate = 1.0
    else:
        subsample_rate = min(1.0,
                             tk_stats.robust_divide(args.target_n50, rppu_n50))

    reporter._get_metric_attr('vdj_assembly_subsample_rate',
                              args.gem_group).set_value(subsample_rate, 1.0)

    # Weighted average of subsample rates where
    # weight = sum of readpairs on UMIs for each gem-group
    reporter._get_metric_attr('vdj_assembly_overall_subsample_rate').set_value(
        subsample_rate * sum(umi_info['reads']), sum(umi_info['reads']))

    # Find the global (per-chain) thresholds
    thresholds = {}
    chain_totals = {}

    # Sort the chains alphabetically for determinism in e.g.
    # multi-library vs single-library runs.
    chain_tuples = list(enumerate(umi_info['chains']))
    sorted_chain_tuples = sorted(chain_tuples, key=lambda x: x[1])

    for chain_idx, chain in sorted_chain_tuples:
        chain_reads = umi_info['reads'][umi_info['chain_idx'] == chain_idx]
        chain_totals[chain] = chain_reads.sum()

        # Record the per-chain N50 read pairs per UMI (but don't use it)
        chain_n50 = tk_stats.NX(chain_reads, 0.5)
        if chain_n50 is None:
            chain_n50 = float('NaN')

        reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                                  chain, args.gem_group).set_value(chain_n50)

        print "Computing per-chain threshold for %s" % chain
        thresholds[chain] = vdj_stats.compute_readpairs_per_umi_threshold(
            chain_reads, subsample_rate)
        print "  %d" % thresholds[chain]

        reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_threshold',
                                  chain, args.gem_group).set_value(
                                      thresholds[chain])

    # Take the min threshold among the chains that make up N90 of all reads
    chain_n90 = tk_stats.NX(chain_totals.values(), 0.9)
    use_chains = [chain for chain in thresholds.iterkeys()
                  if chain_totals[chain] >= chain_n90]
    use_thresholds = [thresholds[c] for c in use_chains]
    print "Using thresholds from " + str(use_chains) + ": " + str(use_thresholds)

    # Handle case where no chains were detected
    if len(use_chains) == 0:
        threshold = 1
    else:
        threshold = min(use_thresholds)

    outs.min_readpairs_per_umi = {args.gem_group: int(threshold)}
    outs.subsample_rate = {args.gem_group: float(subsample_rate)}

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_threshold',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(threshold)

    reporter.save(outs.chunked_reporter)
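
# --- Illustrative sketch (not part of the pipeline) ---
# A miniature of the N90 chain-selection rule above, with made-up totals:
# only chains whose read totals reach the NX(0.9) cutoff contribute their
# thresholds, and the minimum of those thresholds wins. The totals below
# are hypothetical.
import tenkit.stats as tk_stats

chain_totals_demo = {'TRA': 5000, 'TRB': 4000, 'IGH': 10}
chain_n90_demo = tk_stats.NX(chain_totals_demo.values(), 0.9)  # == 4000
use_chains_demo = [c for c, t in chain_totals_demo.items()
                   if t >= chain_n90_demo]
# -> ['TRA', 'TRB']; IGH's noise-level total is excluded, so its (likely
#    unreliable) threshold cannot drag down the global minimum.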