def plot_model_segments(seg_files, work_dir, data):
    """Diagnostic plots of segmentation and inputs.

    Writes a copy of the reference sequence dictionary in which @SQ lines are
    kept only for autosomal and sex chromosomes, then runs PlotModeledSegments
    using that dictionary.

    Returns a dictionary with the plot file path under the "seg" key.
    """
    from bcbio.heterogeneity import chromhacks

    out_file = os.path.join(work_dir, "%s.modeled.png" % dd.get_sample_name(data))
    if utils.file_exists(out_file):
        return {"seg": out_file}

    with file_transaction(data, out_file) as tx_out_file:
        tx_dir = os.path.dirname(tx_out_file)
        dict_file = utils.splitext_plus(dd.get_ref_file(data))[0] + ".dict"
        plot_dict = os.path.join(tx_dir, os.path.basename(dict_file))
        with open(dict_file) as in_handle, open(plot_dict, "w") as out_handle:
            for line in in_handle:
                if line.startswith("@SQ"):
                    # Sequence name is carried in the tab-separated SN: field
                    sn_fields = [x for x in line.split("\t") if x.startswith("SN:")]
                    cur_chrom = sn_fields[0].split(":", 1)[1].strip()
                    if not chromhacks.is_autosomal_or_sex(cur_chrom):
                        continue
                # Non-@SQ header lines are passed through unchanged
                out_handle.write(line)
        params = ["-T", "PlotModeledSegments"]
        params += ["--denoised-copy-ratios", tz.get_in(["depth", "bins", "normalized"], data)]
        params += ["--segments", seg_files["final_seg"]]
        params += ["--allelic-counts", seg_files["tumor_hets"]]
        params += ["--sequence-dictionary", plot_dict]
        params += ["--minimum-contig-length", "10"]
        params += ["--output-prefix", dd.get_sample_name(data)]
        params += ["-O", tx_dir]
        _run_with_memory_scaling(params, tx_out_file, data)
    return {"seg": out_file}
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    Returns the path to the cleaned BAM; the work is skipped when that file
    is already up to date relative to in_bam.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))
                         if chromhacks.is_autosomal_or_sex(contig.name)]
        rg_info = novoalign.get_rg_info(data["rgnames"])
        # Restrict reads to the target contigs, rewrite the header through
        # cleanbam.fix_header and then replace the read group.
        cmd = ("samtools view -h {in_bam} {str_chroms} | "
               """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
               """cleanbam.fix_header("{comma_chroms}")' | """
               "samtools view -u - | "
               "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        cmd = cmd.format(in_bam=in_bam,
                         str_chroms=" ".join(target_chroms),
                         comma_chroms=",".join(target_chroms),
                         bcbio_py=sys.executable,
                         rg_info=rg_info,
                         tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def _target_chroms_and_header(bam_file, data):
    """Get the BAM header contigs that are standard reference chromosomes.

    Could potentially handle remapping from chr1 -> 1 but currently disabled
    due to speed issues: a remapped candidate name is computed for each BAM
    contig, but only contigs whose name is unchanged are kept.

    Returns the kept contig names in BAM header order and asserts that at
    least one was found.
    """
    special_remaps = {"chrM": "MT", "MT": "chrM"}
    target_chroms = {
        contig.name: idx
        for idx, contig in enumerate(ref.file_contigs(dd.get_ref_file(data)))
        if chromhacks.is_autosomal_or_sex(contig.name)
    }

    def _candidate(bam_contig):
        """Return the reference contig name matching bam_contig, or None."""
        if bam_contig in target_chroms:
            return bam_contig
        if bam_contig in special_remaps and special_remaps[bam_contig] in target_chroms:
            return special_remaps[bam_contig]
        if bam_contig.startswith("chr") and bam_contig.replace("chr", "") in target_chroms:
            return bam_contig.replace("chr", "")
        if "chr%s" % bam_contig in target_chroms:
            return "chr%s" % bam_contig
        return None

    out_chroms = []
    with pysam.Samfile(bam_file, "rb") as bamfile:
        for sq_record in bamfile.header["SQ"]:
            bam_contig = sq_record["SN"]
            target_chrom = _candidate(bam_contig)
            # target_chrom == bam_contig ensures we don't try chr1 -> 1 style remapping.
            # Contig order relative to the reference is not checked: it is not
            # required when dealing with SAM file header fixing.
            if target_chrom and target_chrom == bam_contig:
                out_chroms.append(target_chrom)
    assert out_chroms, ("remove_extracontigs: Did not find any chromosomes in reference file: %s %s" %
                        (bam_file, target_chroms))
    return out_chroms
def main():
    """Build a bgzipped, indexed hg38 VCF from the ESP6500 GRCh38-liftover download.

    Downloads and unpacks the per-chromosome ESP VCFs, then writes a single raw
    VCF in which each record is moved to the coordinates given by its trailing
    GRCh38_POSITION INFO entry. Records without a lifted position ("-1") or
    that lift to a contig which is not autosomal or sex are dropped. The raw
    file is then sorted, decomposed and normalized with vt, and compressed.
    """
    url = "http://evs.gs.washington.edu/evs_bulk_data/ESP6500SI-V2-SSA137.GRCh38-liftover.snps_indels.vcf.tar.gz"
    ref_file = "../seq/hg38.fa"
    subprocess.check_call("wget -c -O esp-orig.tar.gz {url}".format(url=url), shell=True)
    subprocess.check_call("tar -xzvpf esp-orig.tar.gz", shell=True)
    raw_file = "esp-raw.vcf"
    # range(1, 23) covers chr1-chr22 (the previous upper bound of 22 skipped
    # chr22); list() keeps the concatenation valid on Python 3 as well.
    chroms = list(range(1, 23)) + ["X", "Y"]
    with open(raw_file, "w") as out_handle:
        for i, chrom in enumerate(chroms):
            fnames = glob.glob("*chr%s.snps_indels.vcf" % chrom)
            assert len(fnames) == 1, (chrom, fnames)
            with open(fnames[0]) as in_handle:
                for line in in_handle:
                    if line.startswith("#"):
                        # Only the first file contributes the header, with
                        # contig lines inserted just before the #CHROM line.
                        if i == 0:
                            if line.startswith("#CHROM"):
                                _add_contigs(out_handle, ref_file)
                            out_handle.write(line)
                        continue
                    parts = line.strip().split("\t")
                    # The lifted position is the last key=value of the last column
                    key, val = parts[-1].split(";")[-1].split("=")
                    assert key == "GRCh38_POSITION"
                    if val == "-1":
                        continue
                    new_chrom, new_pos = val.split(":")
                    if chromhacks.is_autosomal_or_sex(new_chrom):
                        parts[0] = "chr%s" % new_chrom
                        parts[1] = new_pos
                        out_handle.write("\t".join(parts) + "\n")
    out_file = "ESP6500SI-V2-hg38.vcf.gz"
    subprocess.check_call(
        ("vt sort {raw_file} | vt decompose -s - | "
         "vt normalize -n -r {ref_file} - | bgzip -c > {out_file}").format(
             raw_file=raw_file, ref_file=ref_file, out_file=out_file),
        shell=True)
    vcfutils.bgzip_and_index(out_file)
def remove_extracontigs(in_bam, data):
    """Remove extra contigs (non chr1-22,X,Y) from an input BAM.

    These extra contigs can often be arranged in different ways, causing
    incompatibility issues with GATK and other tools. This also fixes the
    read group header as in fixrg.

    Returns the path to the cleaned BAM; the work is skipped when that file
    is already up to date relative to in_bam.
    """
    work_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "bamclean", dd.get_sample_name(data)))
    bam_base = utils.splitext_plus(os.path.basename(in_bam))[0]
    out_file = os.path.join(work_dir, "%s-noextras.bam" % bam_base)
    if utils.file_uptodate(out_file, in_bam):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        target_chroms = [contig.name for contig in ref.file_contigs(dd.get_ref_file(data))
                         if chromhacks.is_autosomal_or_sex(contig.name)]
        rg_info = novoalign.get_rg_info(data["rgnames"])
        # Restrict reads to the target contigs, rewrite the header through
        # cleanbam.fix_header and then replace the read group.
        cmd = ("samtools view -h {in_bam} {str_chroms} | "
               """{bcbio_py} -c 'from bcbio.pipeline import cleanbam; """
               """cleanbam.fix_header("{comma_chroms}")' | """
               "samtools view -u - | "
               "samtools addreplacerg -r '{rg_info}' -m overwrite_all -O bam -o {tx_out_file} - ")
        cmd = cmd.format(in_bam=in_bam,
                         str_chroms=" ".join(target_chroms),
                         comma_chroms=",".join(target_chroms),
                         bcbio_py=sys.executable,
                         rg_info=rg_info,
                         tx_out_file=tx_out_file)
        do.run(cmd, "bamprep, remove extra contigs: %s" % dd.get_sample_name(data))
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Runs MuTect and post-processes its VCF, then, depending on the configured
    indel caller (scalpel, pindel or SomaticIndelDetector), runs that caller
    and combines its output with the MuTect calls. When no indel caller
    applies, the MuTect output is symlinked to the final output.

    Returns the path of the final output file on every code path, including
    when an empty VCF is written because the input BAMs have no aligned reads
    in a non-list region.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner = broad.runner_from_config(config, "mutect")
    out_file_mutect = _suffixed_vcf_name(out_file, "-mutect")
    broad_runner, params = _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                             region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Hand back the path like every other exit; a bare return gave callers
        # None on the first run but the path on a re-run with the file present.
        return out_file
    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    if not file_exists(out_file_orig):
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    if not utils.file_uptodate(out_file_mutect, out_file_orig):
        out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                             out_file_mutect, is_paired)
    indelcaller = vcfutils.get_indelcaller(config).lower()
    if ("scalpel" in indelcaller and region and isinstance(region, (tuple, list))
            and chromhacks.is_autosomal_or_sex(region[0])):
        # Scalpel InDels
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        if scalpel.is_installed(config):
            if not is_paired:
                vcfutils.check_paired_problems(items)
                scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            else:
                scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            # NOTE(review): this branch and the SomaticIndelDetector branch pass
            # items[0]["sam_ref"] while the pindel branch passes ref_file -- confirm
            # the two always refer to the same reference.
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        from bcbio.structural import pindel
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        if pindel.is_installed(config):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=config,
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file


def _suffixed_vcf_name(fname, suffix):
    """Return fname with suffix placed before ".vcf", or with suffix + ".vcf" appended."""
    if "vcf" in fname:
        return fname.replace(".vcf", "%s.vcf" % suffix)
    return fname + "%s.vcf" % suffix
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    Runs MuTect and post-processes its VCF, then, depending on the configured
    indel caller (scalpel, pindel or SomaticIndelDetector), runs that caller
    and combines its output with the MuTect calls. When no indel caller
    applies, the MuTect output is symlinked to the final output.

    Returns the path of the final output file on every code path, including
    when an empty VCF is written because the input BAMs have no aligned reads
    in a non-list region.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if file_exists(out_file):
        return out_file

    broad_runner = broad.runner_from_config(config, "mutect")
    out_file_mutect = _suffixed_vcf_name(out_file, "-mutect")
    broad_runner, params = _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                                             region, out_file_mutect)
    if (not isinstance(region, (list, tuple)) and
            not all(has_aligned_reads(x, region) for x in align_bams)):
        vcfutils.write_empty_vcf(out_file)
        # Hand back the path like every other exit; a bare return gave callers
        # None on the first run but the path on a re-run with the file present.
        return out_file
    out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
    if not file_exists(out_file_orig):
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
    is_paired = "-I:normal" in params
    if not utils.file_uptodate(out_file_mutect, out_file_orig):
        out_file_mutect = _fix_mutect_output(out_file_orig, config,
                                             out_file_mutect, is_paired)
    indelcaller = vcfutils.get_indelcaller(config).lower()
    if ("scalpel" in indelcaller and region and isinstance(region, (tuple, list))
            and chromhacks.is_autosomal_or_sex(region[0])):
        # Scalpel InDels
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        if scalpel.is_installed(config):
            if not is_paired:
                vcfutils.check_paired_problems(items)
                scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            else:
                scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            # NOTE(review): this branch and the SomaticIndelDetector branch pass
            # items[0]["sam_ref"] while the pindel branch passes ref_file -- confirm
            # the two always refer to the same reference.
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif "pindel" in indelcaller:
        from bcbio.structural import pindel
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        if pindel.is_installed(config):
            pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                            region=region, out_file=out_file_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=ref_file,
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
          and "appistry" in broad_runner.get_mutect_version()):
        # SomaticIndelDetector InDels
        out_file_indels = _suffixed_vcf_name(out_file, "-somaticIndels")
        params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                       region, out_file_indels)
        with file_transaction(config, out_file_indels) as tx_out_file:
            params_indels += ["-o", tx_out_file]
            broad_runner.run_mutect(params_indels)
        out_file = vcfutils.combine_variant_files(
            orig_files=[out_file_mutect, out_file_indels],
            out_file=out_file,
            ref_file=items[0]["sam_ref"],
            config=config,
            region=region)
    else:
        utils.symlink_plus(out_file_mutect, out_file)
    return out_file


def _suffixed_vcf_name(fname, suffix):
    """Return fname with suffix placed before ".vcf", or with suffix + ".vcf" appended."""
    if "vcf" in fname:
        return fname.replace(".vcf", "%s.vcf" % suffix)
    return fname + "%s.vcf" % suffix
def _add_contigs(out_handle, ref_file):
    """Write a VCF ##contig header line for each autosomal or sex contig in ref_file.

    Lines are written to out_handle in the order the contigs are returned by
    ref.file_contigs, using each contig's name and size.
    """
    standard_contigs = (c for c in ref.file_contigs(ref_file)
                        if chromhacks.is_autosomal_or_sex(c.name))
    for c in standard_contigs:
        header_line = "##contig=<ID=%s,length=%s>\n" % (c.name, c.size)
        out_handle.write(header_line)