def join(args, outs, chunk_defs, chunk_outs):
    barcodes_csv = [chunk_out.cell_barcodes for chunk_out in chunk_outs]
    cr_utils.concatenate_files(outs.cell_barcodes, barcodes_csv)

    outs.gem_group_metrics = {cd.gem_group: co.gem_group_metrics
                              for (cd, co) in zip(chunk_defs, chunk_outs)}

def concatenate_and_index_fastas(out_fasta, fastas):
    cr_utils.concatenate_files(out_fasta, fastas)
    tk_subproc.check_call(['samtools', 'faidx', out_fasta], cwd=os.getcwd())

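# A minimal usage sketch for the helper above, assuming hypothetical chunk
# FASTA paths (not part of this pipeline's outs). samtools faidx writes the
# index next to its input as '<out_fasta>.fai', which is why callers below
# set the *_fai outs by appending '.fai':
#
#   concatenate_and_index_fastas('consensus.fasta',
#                                ['chunk0.fasta', 'chunk1.fasta'])
#   # -> writes 'consensus.fasta' and 'consensus.fasta.fai'
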
def join(args, outs, chunk_defs, chunk_outs):
    if len(chunk_outs) == 0:
        # Set all outputs to null
        for slot in outs.slots:
            setattr(outs, slot, None)
        return

    reporters = [chunk_out.chunked_reporter for chunk_out in chunk_outs]
    final_report = cr_report.merge_reporters(reporters)
    final_report.report_summary_json(outs.summary)

    consensus_contigs = []
    ref_contigs = []
    all_bams = []
    all_ref_bams = []

    for chunk in chunk_outs:
        if chunk.consensus_annotations_json and os.path.isfile(chunk.consensus_annotations_json):
            # Collect consensus annotations
            new_contigs = vdj_annot.load_cell_contigs_from_json(chunk.consensus_annotations_json,
                                                                args.vdj_reference_path,
                                                                group_key='clonotype')
            for cl in new_contigs:
                consensus_contigs.extend(cl.chains)

            # Collect concat_ref annotations
            new_ref_contigs = vdj_annot.load_cell_contigs_from_json(chunk.concat_ref_annotations_json,
                                                                    args.vdj_reference_path,
                                                                    group_key='clonotype')
            for cl in new_ref_contigs:
                ref_contigs.extend(cl.chains)

            all_bams.extend(chunk.chunked_consensus_bams)
            all_ref_bams.extend(chunk.chunked_concat_ref_bams)

    if consensus_contigs:
        all_fastqs = [chunk_out.consensus_fastq for chunk_out in chunk_outs]
        cr_utils.concatenate_files(outs.consensus_fastq, all_fastqs)

        all_fastas = [chunk_out.consensus_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.consensus_fasta, all_fastas)
        outs.consensus_fasta_fai = outs.consensus_fasta + '.fai'

        all_fastas = [chunk_out.concat_ref_fasta for chunk_out in chunk_outs]
        concatenate_and_index_fastas(outs.concat_ref_fasta, all_fastas)
        outs.concat_ref_fasta_fai = outs.concat_ref_fasta + '.fai'

        concatenate_sort_and_index_bams(outs.consensus_bam, all_bams)
        outs.consensus_bam_bai = outs.consensus_bam + '.bai'
        concatenate_sort_and_index_bams(outs.concat_ref_bam, all_ref_bams)
        outs.concat_ref_bam_bai = outs.concat_ref_bam + '.bai'

    # Sort contigs (and clonotypes) by frequency.
    with open(args.clonotype_assignments) as f:
        clonotypes = json.load(f)
    clonotype_freqs = {cid: c['freq'] for cid, c in clonotypes.iteritems()}
    consensus_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)
    ref_contigs.sort(key=lambda x: clonotype_freqs[x.clonotype], reverse=True)

    with open(outs.consensus_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, consensus_contigs)

    with open(outs.concat_ref_annotations_json, 'w') as out_file:
        vdj_annot.save_annotation_list_json(out_file, ref_contigs)

    with open(outs.consensus_annotations_csv, 'w') as out_file:
        vdj_annot.save_consensus_list_csv(out_file, consensus_contigs)

    with open(outs.clonotypes, 'w') as f:
        vdj_annot.save_clonotype_info_csv(f, consensus_contigs)

    outs.chunked_consensus_bams = []
    outs.chunked_concat_ref_bams = []

def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None

        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None

        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass

        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.umi_summary_tsv, umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_utils.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_utils.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_utils.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f, indent=4, sort_keys=True)

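# The loop above merges at most MERGE_BAMS_N BAMs per round so the number of
# simultaneously open file handles stays bounded. A minimal sketch of the same
# batching pattern, assuming a hypothetical merge_files() helper (not part of
# this codebase):
#
#   def merge_in_batches(paths, batch_size):
#       n_merged = 0
#       while len(paths) > 1:
#           batch, paths = paths[:batch_size], paths[batch_size:]
#           merged = 'merged-%04d.tmp' % n_merged
#           n_merged += 1
#           merge_files(merged, batch)  # hypothetical helper
#           paths.append(merged)        # re-queue the merged result
#       return paths[0]
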
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    summary_df_parts = []
    umi_summary_df_parts = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_df_parts.append(pd.read_csv(chunk_out.summary_tsv,
                                            header=0, index_col=None, sep='\t',
                                            dtype={'component': int,
                                                   'num_reads': int,
                                                   'num_pairs': int,
                                                   'num_umis': int}))

        umi_summary_df_parts.append(pd.read_csv(chunk_out.umi_summary_tsv,
                                                header=0, index_col=None, sep='\t',
                                                dtype={'umi_id': int,
                                                       'reads': int,
                                                       'min_umi_reads': int,
                                                       'contigs': str}))

    summary_df = pd.concat(summary_df_parts, ignore_index=True)
    umi_summary_df = pd.concat(umi_summary_df_parts, ignore_index=True)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        subprocess.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_df is not None:
        summary_df.to_csv(outs.summary_tsv, header=True, index=False, sep='\t')
    if umi_summary_df is not None:
        umi_summary_df.to_csv(outs.umi_summary_tsv, header=True, index=False, sep='\t')

    if contig_bams:
        tk_bam.merge(outs.contig_bam, contig_bams, threads=args.__threads)
        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f, indent=4, sort_keys=True)