def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk consensus results into the stage's final outputs.

    Combines chunk reporters into one summary JSON, gathers consensus and
    concatenated-reference contig annotations from every chunk, concatenates
    the per-chunk FASTQ/FASTA/BAM files (indexing the merged ones), sorts
    contigs by clonotype frequency, and writes the merged annotation
    JSON/CSV outputs.

    Args:
        args: stage arguments; reads vdj_reference_path and
            clonotype_assignments (JSON of clonotype -> info incl. 'freq').
        outs: stage outputs object; most slots are assigned here.
        chunk_defs: per-chunk definitions (unused).
        chunk_outs: per-chunk outputs produced by main().
    """
    if len(chunk_outs) == 0:
        # No chunks ran: null out every output slot and bail.
        # Set all outputs to null
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

    # Merge the per-chunk metric reporters into a single summary JSON.
    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

    consensus_contigs = []   # consensus contig annotations across all chunks
    ref_contigs = []         # concatenated-reference contig annotations
    all_bams = []            # per-chunk consensus BAM paths
    all_ref_bams = []        # per-chunk concat-ref BAM paths

    for chunk in chunk_outs:
        # A chunk may legitimately produce no consensus annotations;
        # only harvest chunks whose JSON actually exists on disk.
        if chunk.consensus_annotations_json and os.path.isfile(
                chunk.consensus_annotations_json):
            # Collect consensus annotations
            new_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.consensus_annotations_json,
                args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_contigs:
                consensus_contigs.extend(cl.chains)

            # Collect concat_ref annotations
            # NOTE(review): concat_ref JSON is not existence-checked here —
            # presumably it is always written alongside the consensus JSON;
            # verify against main().
            new_ref_contigs = vdj_annot.load_cell_contigs_from_json(
                chunk.concat_ref_annotations_json,
                args.vdj_reference_path,
                group_key='clonotype')
            for cl in new_ref_contigs:
                ref_contigs.extend(cl.chains)

            all_bams.extend(chunk.chunked_consensus_bams)
            all_ref_bams.extend(chunk.chunked_concat_ref_bams)

    if consensus_contigs:
        # Concatenate per-chunk sequence outputs and build indexes.
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_io.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

    # Sort contigs (and clonotypes) by frequency.
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)
    # clonotype id -> observed frequency; used as the sort key below.
    clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.iteritems()}
    consensus_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype],
                           reverse=True)
    ref_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype],
                     reverse=True)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

    # Per-chunk intermediate BAM lists are not meaningful at the join level;
    # clear them so they are not carried forward.
    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []
def main(args, outs):
    """Assign clonotypes to cell barcodes from contig annotations.

    Two passes over the per-barcode contigs: the first assigns numeric ids
    to every CDR3 sequence observed on qualifying (full-length, high-conf,
    optionally productive) contigs; the second builds each barcode's
    clonotype tuple — optionally rescuing non-full-length contigs whose
    CDR3 was seen in pass one — and assigns clonotype ids. Results are
    reported, written as JSON/CSV/pickle, and summarized.

    Args:
        args: stage arguments; reads cell_barcodes, annotations,
            vdj_reference_path, use_non_productive, use_non_full_len.
        outs: stage outputs; clonotype_assignments, contig_annotations
            (json/csv/pickle), filtered_contig_annotations_csv, summary.
    """
    reporter = vdj_report.VdjReporter()

    cell_barcodes = set(vdj_utils.load_cell_barcodes_json(args.cell_barcodes))

    barcode_contigs = vdj_annot.load_cell_contigs_from_json(
        args.annotations, args.vdj_reference_path, group_key='barcode')

    # From CDR sequence to sequence id
    sequences = {}
    # From clonotype (tuple of CDR ids) to clonotype id
    clonotypes = {}
    # From barcode to clonotype id
    bc_clonotype_assignments = {}

    # First pass: Just keep track of observed CDR3s
    for contig_list in barcode_contigs:
        # This will be a tuple of sequences like "TRA_<cdr seq>"
        barcode_clonotype_tuple = contig_list.clonotype_tuple(
            require_productive=not args.use_non_productive,
            require_full_len=True,
            require_high_conf=True)

        # Give unique numerical ids to the CDR3 sequences
        if barcode_clonotype_tuple:
            for cdr_seq in barcode_clonotype_tuple:
                # setdefault(len(sequences)) assigns ids 0,1,2,... in
                # first-seen order without clobbering existing entries.
                sequences.setdefault(cdr_seq, len(sequences))

    # From sequence id to CDR sequence
    sequence_ids = {seq_id: seq for seq, seq_id in sequences.iteritems()}

    # Do a second pass to potentially use non-full length contigs with a valid CDR3.
    for contig_list in barcode_contigs:
        if args.use_non_full_len:
            barcode_clonotype_tuple = []
            for c in contig_list.contigs():
                (_, cl_seq) = c.clonotype_seq()
                # If this contig has a CDR3 and we can infer the gene type of
                # that CDR3 (either based on the contig itself or based on
                # other full-length contigs that had this CDR3, then add this
                # to the clonotype tuple).
                if cl_seq in sequences:
                    # this will rescue contigs that have a chain and CDR3 assigned
                    # but aren't full length
                    barcode_clonotype_tuple.append(cl_seq)
        else:
            barcode_clonotype_tuple = contig_list.clonotype_tuple(
                require_productive=(not args.use_non_productive),
                require_full_len=True,
                require_high_conf=True)

        # Canonical form: de-duplicated, sorted tuple of CDR3 ids, so the
        # same chain set always maps to the same clonotype key.
        barcode_clonotype = tuple(
            sorted(list(set([sequences[s] for s in barcode_clonotype_tuple]))))

        if barcode_clonotype:
            clonotype_id = clonotypes.setdefault(barcode_clonotype,
                                                 len(clonotypes))
            bc_clonotype_assignments[contig_list.name] = clonotype_id

    # From clonotype id to tuple of CDRs
    clonotype_ids = {
        clonotype_id: clonotype_tuple
        for clonotype_tuple, clonotype_id in clonotypes.iteritems()
    }

    out_clonotypes = vdj_annot.report_clonotypes(reporter, 'raw',
                                                 cell_barcodes,
                                                 clonotype_ids, sequence_ids,
                                                 barcode_contigs,
                                                 bc_clonotype_assignments)

    with open(outs.clonotype_assignments, 'w') as out_file:
        tk_safe_json.dump_numpy(tk_safe_json.json_sanitize(out_clonotypes),
                                out_file, pretty=True)

    # Add clonotype assignments to contig annotations
    # Free the grouped view before reloading the flat contig list
    # (presumably to bound peak memory — confirm with profiling).
    del barcode_contigs
    with open(args.annotations) as f:
        all_contigs = vdj_annot.load_contig_list_from_json(
            f, args.vdj_reference_path)

    vdj_annot.label_contigs_with_consensus(out_clonotypes, all_contigs, 'raw')

    # Write augmented contig annotations
    with open(outs.contig_annotations, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, all_contigs)

    with open(outs.contig_annotations_csv, 'w') as out_file:
        vdj_annot.save_contig_list_csv(out_file, all_contigs,
                                       write_inferred=False)

    with open(outs.contig_annotations_pickle, 'w') as out_file:
        cPickle.dump(all_contigs, out_file,
                     protocol=cPickle.HIGHEST_PROTOCOL)

    # Write filtered contig annotations
    with open(outs.filtered_contig_annotations_csv, 'w') as out_file:
        # Keep only high-confidence contigs from cell barcodes.
        filtered_contigs = filter(lambda x: x.high_confidence and x.is_cell,
                                  all_contigs)
        vdj_annot.save_contig_list_csv(out_file, filtered_contigs,
                                       write_inferred=False)

    # Set a default value for paired clonotype diversity so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    paired_diversity_metric = reporter._get_metric_attr(
        'vdj_paired_clonotype_diversity', MULTI_REFS_PREFIX, 'raw')
    if not paired_diversity_metric.d:
        paired_diversity_metric.add(None, 0)

    reporter.report_summary_json(outs.summary)