def concatenate_sort_and_index_bams(out_bam, bams):
    """Concatenate BAMs, then sort and index the result.

    The inputs are concatenated into a temporary BAM (with the processed-UMI
    tag dropped), the temporary BAM is sorted and indexed using ``out_bam``
    minus its '.bam' extension as the output prefix, and the temporary file
    is removed afterwards.

    Args:
        out_bam: Path of the final BAM; expected to end in '.bam'.
        bams: Input BAM paths, forwarded to
            vdj_utils.concatenate_and_fix_bams.
    """
    bam_ext = '.bam'

    # Strip only a trailing '.bam'. str.replace() would rewrite every
    # occurrence in the path (e.g. a parent directory named 'x.bam_files'),
    # producing a wrong temp path and sort prefix.
    if out_bam.endswith(bam_ext):
        out_prefix = out_bam[:-len(bam_ext)]
    else:
        out_prefix = out_bam
    tmp_bam = out_prefix + '_tmp' + bam_ext

    # Drop the UMI tag since it's useless
    vdj_utils.concatenate_and_fix_bams(tmp_bam, bams,
                                       drop_tags=[PROCESSED_UMI_TAG])

    # Wrap filenames in str() to prevent pysam crash on unicode input.
    # NOTE(review): presumably sort_and_index writes '<prefix>.bam' -- confirm.
    tk_bam.sort_and_index(str(tmp_bam), str(out_prefix))

    cr_utils.remove(tmp_bam, allow_nonexisting=True)
def rm_files(filenames):
    """Remove each path in ``filenames``, tolerating paths that do not exist.

    Args:
        filenames: Iterable of paths handed one by one to cr_utils.remove.
    """
    for path in filenames:
        cr_utils.remove(path, allow_nonexisting=True)
def _create_empty_file(path):
    """Create (or truncate) ``path`` as a zero-byte file."""
    with open(path, 'w'):
        pass


def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk outputs into the final stage outputs.

    With no chunks at all, empty BAM/FASTA/FASTQ files are written and the
    remaining outputs are set to None. Otherwise the contig FASTA/FASTQ and
    the summary TSVs of every chunk that has a contig FASTA on disk are
    concatenated, the chunk BAMs are merged in batches and indexed, and the
    chunk metrics JSONs are merged into ``outs.metrics_summary_json``.

    Args:
        args: Stage arguments; only ``args.__threads`` is read (BAM merging).
        outs: Stage outputs object whose attributes are filled in / written.
        chunk_defs: Unused here.
        chunk_outs: Per-chunk output objects.
    """
    if not chunk_outs:
        # No input reads: emit empty placeholders and null out the rest.
        _create_empty_file(outs.contig_bam)
        outs.contig_bam_bai = None

        _create_empty_file(outs.contig_fasta)
        outs.contig_fasta_fai = None

        _create_empty_file(outs.contig_fastq)

        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    contigs = []
    contig_fastqs = []
    contig_bams = []
    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        # Chunks without a contig FASTA on disk contribute no files below.
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        # Pass argv as a list (no shell) so that paths containing spaces or
        # shell metacharacters are handled correctly.
        tk_subproc.check_call(['samtools', 'faidx', outs.contig_fasta])
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if summary_tsvs:
        cr_utils.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if umi_summary_tsvs:
        cr_utils.concatenate_headered_files(outs.umi_summary_tsv,
                                            umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            # Single parenthesised expression: prints the same text under
            # both the print statement and the print function.
            print("Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam))
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_utils.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_utils.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_utils.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary), f,
                  indent=4, sort_keys=True)