def run(bam_file, data, out_dir):
    """Run viral QC analysis.

    For paired (tumor/normal) samples, realigns reads that did not map to the
    host genome against the gdc-viral reference and writes per-contig aligned
    read counts.

    bam_file -- aligned sample BAM; unmapped reads (flag 4) are extracted from it.
    data -- bcbio sample dictionary (config, sample name, viral files, cores).
    out_dir -- output directory, created if missing.

    Returns a dict with key "base" pointing at the counts file, or an empty
    dict when the sample is not paired or no gdc-viral reference is available.
    """
    viral_target = "gdc-viral"
    out = {}
    # Only run for somatic paired analyses.
    if vcfutils.get_paired_phenotype(data):
        # Pick the gdc-viral FASTA out of the configured viral files.
        viral_refs = [x for x in dd.get_viral_files(data)
                      if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            # Skip everything when counts are newer than the input BAM.
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        # Unmapped reads -> FASTQ -> bwa mem vs viral genomes
                        # -> sorted, indexed BAM. NOTE: cmd.format(**locals())
                        # depends on the local variable names bam_file, cores,
                        # viral_ref, tmpfile and tx_out_file -- do not rename.
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        # Report only viral contigs with at least one aligned read.
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
def run(_, data, out_dir):
    """Calculate UMI-based duplication statistics for a sample.

    Streams the pre-consensus UMI-tagged BAM (data["umi_bam"]) grouping reads
    by (chromosome, start position) and summarizing how many raw reads collapse
    into each UMI. Writes a YAML summary keyed by sample name.

    NOTE(review): the single-pass grouping assumes the UMI BAM is coordinate
    sorted so that reads sharing (chrom, pos) are contiguous -- confirm.

    Returns the path to the YAML stats file (regenerated only when stale).
    """
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        total = 0       # all records seen in the UMI BAM
        mapped = 0      # records with a UMI tag that are mapped
        duplicates = 0  # mapped records flagged as duplicates
        umi_reductions = []  # per-position ratio of raw reads to distinct UMIs
        umi_counts = collections.defaultdict(int)  # reads-per-UMI -> frequency
        with pysam.AlignmentFile(data["umi_bam"], "rb", check_sq=False) as bam_iter:
            cur_counts = collections.defaultdict(int)  # UMI -> reads at current position
            cur_key = None
            for rec in bam_iter:
                total += 1
                umi = _get_umi_tag(rec)
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    # Position changed: fold the finished group into the totals.
                    if key != cur_key:
                        # update counts
                        if cur_counts:
                            for c in cur_counts.values():
                                umi_counts[c] += 1
                            total_seqs = sum(cur_counts.values())
                            umi_count = len(cur_counts)
                            umi_reductions.append(float(total_seqs) / umi_count)
                        # update current keys
                        cur_key = key
                        cur_counts = collections.defaultdict(int)
                    cur_counts[umi] += 1
            # Flush the final group after the loop ends.
            if cur_counts:
                for c in cur_counts.values():
                    umi_counts[c] += 1
                total_seqs = sum(cur_counts.values())
                umi_count = len(cur_counts)
                umi_reductions.append(float(total_seqs) / umi_count)
        # Aligned reads in the consensus (post-collapse) BAM, from the index.
        consensus_count = sum([x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        # NOTE(review): divides by mapped -- a BAM with no mapped UMI reads
        # would raise ZeroDivisionError here.
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        out["umi_raw_avg_cov"] = data["config"]["algorithm"].get("rawumi_avg_cov", 0)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out}, out_handle,
                           default_flow_style=False, allow_unicode=False)
    return stats_file
def _average_genome_coverage(data, bam_file):
    """Quickly estimate mean whole-genome coverage from BAM index statistics.

    Counts every aligned read, duplicates included, and multiplies by the
    median read length sampled from the first 10M alignments in the file.
    """
    genome_size = sum(contig.size
                      for contig in ref.file_contigs(dd.get_ref_file(data), data["config"]))
    n_aligned = sum(stat.aligned for stat in bam.idxstats(bam_file, data))
    with pysam.Samfile(bam_file, "rb") as in_bam:
        # Sample read lengths lazily; cap the scan at 10 million records.
        length_sample = itertools.islice((rec.query_length for rec in in_bam.fetch()),
                                         int(1e7))
        median_length = np.median(list(length_sample))
    return float(n_aligned * median_length) / genome_size
def _average_genome_coverage(data, bam_file):
    """Quickly calculate average coverage for whole genome files using indices.

    Includes all reads, with duplicates. Multiplies the aligned read count from
    the BAM index by the median read length sampled from the first 100k
    alignments, divided by the total reference size.
    """
    total = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
    read_counts = sum(x.aligned for x in bam.idxstats(bam_file, data))
    with pysam.Samfile(bam_file, "rb") as pysam_bam:
        # BUG FIX: the stop argument was the float literal 1e5; itertools.islice
        # requires None or an integer and raises ValueError on a float.
        read_size = np.median(list(itertools.islice(
            (a.query_length for a in pysam_bam.fetch()), 100000)))
    avg_cov = float(read_counts * read_size) / total
    return avg_cov
def run(_, data, out_dir):
    """Calculate UMI-based duplication statistics for a sample.

    Accumulates reads per (chromosome, position, UMI) from the pre-consensus
    UMI-tagged BAM (data["umi_bam"]) and writes a YAML summary keyed by sample
    name. Returns the path to the YAML stats file (regenerated only when
    older than the aligned consensus BAM).
    """
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        out = {}
        counts = collections.defaultdict(lambda: collections.defaultdict(int))
        total = 0
        mapped = 0
        duplicates = 0
        with pysam.AlignmentFile(data["umi_bam"], "rb", check_sq=False) as bam_iter:
            for rec in bam_iter:
                total += 1
                # FIX: was rec.get_tag("RX"), which raises KeyError for reads
                # without an RX tag. Use the shared _get_umi_tag helper that
                # the other UMI stats implementations in this file use
                # (presumably returning a falsy value when no UMI tag is
                # present -- confirm against its definition).
                umi = _get_umi_tag(rec)
                if umi and not rec.is_unmapped:
                    mapped += 1
                    if rec.is_duplicate:
                        duplicates += 1
                    chrom = bam_iter.getrname(rec.reference_id)
                    pos = rec.reference_start
                    key = (chrom, pos)
                    counts[key][umi] += 1
        # Summarize per-position UMI collapse: how many raw reads fold into
        # each UMI, and the raw-reads / distinct-UMIs reduction per position.
        umi_reductions = []
        umi_counts = collections.defaultdict(int)
        for key in sorted(counts.keys()):
            for c in counts[key].values():
                umi_counts[c] += 1
            total_seqs = sum(counts[key].values())
            umi_count = len(counts[key])
            umi_reductions.append(float(total_seqs) / umi_count)
        # Aligned reads in the consensus (post-collapse) BAM, from the index.
        consensus_count = sum(
            [x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out["umi_baseline_all"] = total
        out["umi_baseline_mapped"] = mapped
        # NOTE(review): divides by mapped -- a BAM with no mapped UMI reads
        # would raise ZeroDivisionError here, matching the sibling versions.
        out["umi_baseline_duplicate_pct"] = float(duplicates) / float(
            mapped) * 100.0
        out["umi_consensus_mapped"] = consensus_count
        out["umi_consensus_pct"] = (
            100.0 - float(consensus_count) / float(mapped) * 100.0)
        out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions)))
        out["umi_reduction_max"] = int(max(umi_reductions))
        out["umi_counts"] = dict(umi_counts)
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out}, out_handle,
                           default_flow_style=False, allow_unicode=False)
    return stats_file
def run(bam_file, data, out_dir):
    """Run viral QC analysis.

    For paired samples, maps reads that failed to align to the host genome
    against the gdc-viral reference and records per-contig aligned counts.
    Returns {"base": counts_file} on success, otherwise an empty dict.
    """
    out = {}
    viral_target = "gdc-viral"
    # Only meaningful for somatic paired analyses.
    if not vcfutils.get_paired_phenotype(data):
        return out
    candidates = [f for f in dd.get_viral_files(data)
                  if os.path.basename(f) == "%s.fa" % viral_target]
    if not candidates or not utils.file_exists(candidates[0]):
        return out
    viral_ref = candidates[0]
    ref_stem = utils.splitext_plus(os.path.basename(viral_ref))[0]
    viral_bam = os.path.join(utils.safe_makedir(out_dir),
                             "%s-%s.bam" % (dd.get_sample_name(data), ref_stem))
    count_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
    if not utils.file_uptodate(count_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_out_file:
                cores = dd.get_num_cores(data)
                tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                # Unmapped reads -> FASTQ -> bwa mem vs viral contigs -> sorted,
                # indexed BAM. The format(**locals()) call relies on the names
                # bam_file, cores, viral_ref, tmpfile and tx_out_file.
                cmd = ("samtools view -u -f 4 {bam_file} | "
                       "bamtofastq collate=0 | "
                       "bwa mem -t {cores} {viral_ref} - | "
                       "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                       "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Compare unmapped reads to viral genome")
        with file_transaction(data, count_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                # Only contigs that attracted at least one read.
                for info in bam.idxstats(viral_bam, data):
                    if info.aligned > 0:
                        out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
    out["base"] = count_file
    return out
def run(_, data, out_dir):
    """Calculate UMI-based duplication statistics for a sample.

    Streams the pre-consensus UMI-tagged BAM grouping reads that share a
    (chromosome, start) coordinate, summarizing reads-per-UMI frequencies and
    the per-position reduction from raw reads to distinct UMIs. Writes a YAML
    summary keyed by sample name and returns its path.
    """
    stats_file = os.path.join(utils.safe_makedir(out_dir),
                              "%s_umi_stats.yaml" % dd.get_sample_name(data))
    if not utils.file_uptodate(stats_file, dd.get_align_bam(data)):
        total = 0
        mapped = 0
        duplicates = 0
        umi_reductions = []
        umi_counts = collections.defaultdict(int)

        def _flush(group):
            # Fold a finished (chrom, pos) group into the running summaries.
            for n_reads in group.values():
                umi_counts[n_reads] += 1
            total_seqs = sum(group.values())
            umi_count = len(group)
            umi_reductions.append(float(total_seqs) / umi_count)

        with pysam.AlignmentFile(data["umi_bam"], "rb", check_sq=False) as bam_iter:
            group = collections.defaultdict(int)
            group_key = None
            for read in bam_iter:
                total += 1
                umi = _get_umi_tag(read)
                if not umi or read.is_unmapped:
                    continue
                mapped += 1
                if read.is_duplicate:
                    duplicates += 1
                key = (bam_iter.getrname(read.reference_id), read.reference_start)
                if key != group_key:
                    if group:
                        _flush(group)
                    group_key = key
                    group = collections.defaultdict(int)
                group[umi] += 1
            if group:
                _flush(group)
        consensus_count = sum(
            [x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)])
        out = {
            "umi_baseline_all": total,
            "umi_baseline_mapped": mapped,
            "umi_baseline_duplicate_pct": float(duplicates) / float(mapped) * 100.0,
            "umi_consensus_mapped": consensus_count,
            "umi_consensus_pct": (100.0 - float(consensus_count) / float(mapped) * 100.0),
            "umi_reduction_median": int(math.ceil(np.median(umi_reductions))),
            "umi_reduction_max": int(max(umi_reductions)),
            "umi_counts": dict(umi_counts),
            "umi_raw_avg_cov": data["config"]["algorithm"].get("rawumi_avg_cov", 0),
        }
        with open(stats_file, "w") as out_handle:
            yaml.safe_dump({dd.get_sample_name(data): out}, out_handle,
                           default_flow_style=False, allow_unicode=False)
    return stats_file
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """Run macs2 for chip and input samples avoiding errors due to samples.

    name -- sample name used for output file naming.
    chip_bam / input_bam -- aligned ChIP and input/control BAMs.
    genome_build -- genome build identifier (unused directly here).
    out_dir -- directory for macs2 output.
    method -- experiment type; "atac" selects ATAC-seq handling.
    resources -- per-sample resources dict; macs2 options read from it.
    data -- bcbio sample dictionary.

    Returns the macs2 output files; raises RuntimeWarning when the ChIP BAM
    has no reads or macs2 exits with an error.
    """
    # output file name need to have the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(
                f"{antibody} specified, but not listed as a supported antibody. Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. If you know your antibody "
                f"should be called with narrow or broad peaks, supply 'narrow' or 'broad' as the antibody."
                f"It will run 'narrow' if the antibody is not supported.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(
            f"{antibody.name} specified, using {antibody.peaktype} peak settings."
        )
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        # No placeholders, so a plain string (was a pointless f-string).
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    # Only supply -g when the user has not set it in the macs2 options.
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    chip_reads = sum([x.aligned for x in bam.idxstats(chip_bam, data)])
    if chip_reads == 0:
        logger.error(
            f"{chip_bam} has 0 reads. Please remove the sample and re-run")
        # BUG FIX: this message previously contained a raw newline inside the
        # f-string literal, which is a syntax error in Python.
        raise RuntimeWarning(
            f"macs2 terminated - no reads in {chip_bam}. "
            "Please remove the sample and re-run")
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning(
                "macs2 terminated with an error. "
                "Please, check the message and report "
                "error if it is related to bcbio. "
                "You can add specific options for the sample "
                "setting resources as explained in docs: "
                "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources"
            )
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)