def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    Args:
        in_file: VCF of structural variant calls to genotype.
        full_bam: BAM file handed to svtyper with ``-B``.
        exclude_file: optional regions file; when it exists, calls inside it
            are dropped with ``bcftools view -T ^<file>``.
        data: sample dictionary; ``data["config"]`` and the reference file
            are read from it.

    Returns:
        Whatever ``vcfutils.sort_by_ref`` returns for the genotyped VCF.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through. Write to the
                # transactional path, like the svtyper branch does, so an
                # interrupted copy cannot leave a partial file at out_file.
                shutil.copy(in_file, tx_out_file)
            else:
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            # header lines come first; stop at the first record
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm,
                                  python=sys.executable,
                                  # svtyper is expected next to the running interpreter
                                  svtyper=os.path.join(os.path.dirname(sys.executable), "svtyper"),
                                  full_bam=full_bam, header_file=header_file,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    Args:
        full_bams: list of full alignment BAM paths (joined with commas for ``-B``).
        sr_bams: list of split-read BAM paths (``-S``).
        disc_bams: list of discordant-read BAM paths (``-D``).
        work_dir: directory in which the output VCF is created.
        items: list of sample dictionaries; ``items[0]`` names the output
            (from its ``align_bam``) and is passed to the transaction helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` on the lumpy VCF,
        exclusion BED returned by ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    bam_base = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (bam_base, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                # Only pass -x when an exclusion file was actually prepared
                if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                    exclude = "-x %s" % sv_exclude_bed
                else:
                    exclude = ""
                cmd = ("export PATH={curpython_dir}:$PATH && "
                       "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(
                    # use our bcbio python for runs within lumpyexpress
                    curpython_dir=os.path.dirname(sys.executable),
                    full_bams=",".join(full_bams),
                    sr_bams=",".join(sr_bams),
                    disc_bams=",".join(disc_bams),
                    exclude=exclude, tmpdir=tmpdir, tx_out_file=tx_out_file),
                    "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    Args:
        full_bams: list of full alignment BAM paths (``-B``).
        sr_bams: list of split-read BAM paths (``-S``).
        disc_bams: list of discordant-read BAM paths (``-D``).
        previous_evidence: mapping of sample name to a mapping of evidence
            type to file path; every existing file is passed as
            ``sample:file`` through ``-d``.
        work_dir: directory in which the output VCF is created.
        items: list of sample dictionaries; ``items[0]`` names the output
            and is passed to the transaction helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` on the lumpy VCF,
        exclusion BED returned by ``sshared.prepare_exclude_file``).
    """
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    bam_base = os.path.splitext(os.path.basename(items[0]["align_bam"]))[0]
    out_file = os.path.join(work_dir, "%s%s.vcf" % (bam_base, ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with tx_tmpdir(items[0]) as tmpdir:
                if sv_exclude_bed and utils.file_exists(sv_exclude_bed):
                    exclude = "-x %s" % sv_exclude_bed
                else:
                    exclude = ""
                depths = []
                for sample, ev_files in previous_evidence.items():
                    # the evidence type key is not needed, only the files
                    for ev_file in ev_files.values():
                        if utils.file_exists(ev_file):
                            depths.append("%s:%s" % (sample, ev_file))
                depth_arg = "-d %s" % ",".join(depths) if depths else ""
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(
                    # use our bcbio python for runs within lumpyexpress
                    exports=utils.local_path_export(),
                    full_bams=",".join(full_bams),
                    sr_bams=",".join(sr_bams),
                    disc_bams=",".join(disc_bams),
                    exclude=exclude, depth_arg=depth_arg,
                    tmpdir=tmpdir, tx_out_file=tx_out_file),
                    "lumpyexpress", items[0])
    return vcfutils.sort_by_ref(out_file, items[0]), sv_exclude_bed
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Builds a simplified, annotated VCF for ``caller`` (skipped when it is
    already present), optionally post-processes it with ``post_prior_fn``
    and converts passing, non-reference calls into a TSV via vawk.

    Returns:
        Tuple of (prioritized TSV path, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            # If we have a standard gene list we can skip BED based prioritization
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                plain_vcf = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, plain_vcf)
                vcfutils.bgzip_and_index(plain_vcf, data["config"], remove_orig=False)
        elif not utils.file_exists(priority_vcf):
            # otherwise prioritize based on BED and proceed
            with file_transaction(data, priority_vcf) as tx_priority:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                memory_adjust = {"direction": "increase",
                                 "maximum": "30000M",
                                 "magnitude": dd.get_cores(data)}
                jvm_opts = config_utils.adjust_opts(
                    resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]),
                    {"algorithm": {"memory_adjust": memory_adjust}})
                known_cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                             " -k {prioritize_by}")
                do.run(known_cmd.format(export=utils.local_path_export(),
                                        jvm_opts=" ".join(jvm_opts),
                                        vcf_file=vcf_file,
                                        tx_out_file=tx_priority,
                                        prioritize_by=prioritize_by),
                       "Prioritize: select in known regions of interest")

        # Resource files are looked up next to the annotation script
        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_simple:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            opts += " --gene_list %s" % (gene_list or os.path.join(data_dir, "az-cancer-panel.txt"))
            annotate_cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(opts=opts, priority_vcf=priority_vcf, tx_out_file=tx_simple),
                   "Prioritize: simplified annotation output")
    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_tsv:
            tsv_cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                       """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                       "print CALLER,SNAME,$1,$2,I$END,"
                       """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                       "I$LOF,I$SIMPLE_ANN,"
                       "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(tsv_cmd.format(export=utils.local_path_export(env_cmd="vawk"),
                                  simple_vcf=simple_vcf, sample=sample,
                                  caller=caller, tx_out_file=tx_tsv),
                   "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def to_vcf(in_tsv, data):
    """Convert seq2c output file into VCF output.

    Rows whose ``Amp_Del`` column is ``Amp`` or ``Del`` become ``<DUP>`` or
    ``<DEL>`` records with a ``1/1`` genotype; all other rows are skipped.

    Args:
        in_tsv: tab-delimited seq2c table with a header row. The columns
            read are Amp_Del, Chr, Start, End, Log2ratio, Ab_Seg and Gene.
        data: sample dictionary; supplies the sample name for the VCF header.

    Returns:
        Whatever ``vcfutils.sort_by_ref`` returns for the written VCF.
    """
    call_convert = {"Amp": "DUP", "Del": "DEL"}
    out_file = "%s.vcf" % utils.splitext_plus(in_tsv)[0]
    if not utils.file_uptodate(out_file, in_tsv):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_tsv) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(
                        VCF_HEADER +
                        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n" %
                        (dd.get_sample_name(data)))
                    # Strip line endings so the last column's name and value
                    # do not carry a trailing newline.
                    header = in_handle.readline().rstrip("\r\n").split("\t")
                    for line in in_handle:
                        line = line.rstrip("\r\n")
                        if not line:
                            # tolerate blank lines (e.g. at the end of the file)
                            continue
                        cur = dict(zip(header, line.split("\t")))
                        if cur["Amp_Del"] in call_convert:
                            svtype = call_convert[cur["Amp_Del"]]
                            info = "SVTYPE=%s;END=%s;SVLEN=%s;FOLD_CHANGE_LOG=%s;PROBES=%s;GENE=%s" % (
                                svtype, cur["End"], int(cur["End"]) - int(cur["Start"]),
                                cur["Log2ratio"], cur["Ab_Seg"], cur["Gene"])
                            out_handle.write("\t".join([
                                cur["Chr"], cur["Start"], ".", "N", "<%s>" % (svtype),
                                ".", ".", info, "GT", "1/1"]) + "\n")
    return vcfutils.sort_by_ref(out_file, data)
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Steps: select calls (gene list shortcut or ``bcbio-prioritize known``),
    annotate with ``simple_sv_annotation.py``, sort and index, apply the
    optional ``post_prior_fn`` hook, then flatten to TSV with vawk.

    Returns:
        Tuple of (prioritized TSV path, simplified VCF).
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                unzipped = priority_vcf.replace(".vcf.gz", ".vcf")
                utils.symlink_plus(vcf_file, unzipped)
                vcfutils.bgzip_and_index(unzipped, data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        elif not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_known:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                base_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                adjusted_opts = config_utils.adjust_opts(
                    base_opts,
                    {"algorithm": {"memory_adjust": {"direction": "increase",
                                                     "maximum": "30000M",
                                                     "magnitude": dd.get_cores(data)}}})
                known_params = {
                    "jvm_opts": " ".join(adjusted_opts),
                    "export": utils.local_path_export(),
                    "vcf_file": vcf_file,
                    "tx_out_file": tx_known,
                    "prioritize_by": prioritize_by,
                }
                cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                       " -k {prioritize_by}")
                do.run(cmd.format(**known_params),
                       "Prioritize: select in known regions of interest")

        script_path = os.path.realpath(utils.which("simple_sv_annotation.py"))
        data_dir = os.path.dirname(script_path)
        with file_transaction(data, simple_vcf) as tx_annotated:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opt_parts = []
            if os.path.exists(fusion_file):
                opt_parts.append(" --known_fusion_pairs %s" % fusion_file)
            if gene_list:
                opt_parts.append(" --gene_list %s" % gene_list)
            else:
                # fall back to the panel shipped alongside the script
                opt_parts.append(" --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt"))
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(opts="".join(opt_parts),
                              priority_vcf=priority_vcf,
                              tx_out_file=tx_annotated),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if utils.file_uptodate(out_file, simple_vcf):
        return out_file, simple_vcf
    with file_transaction(data, out_file) as tx_table:
        cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
               """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
               "print CALLER,SNAME,$1,$2,I$END,"
               """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
               "I$LOF,I$SIMPLE_ANN,"
               "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
        do.run(cmd.format(export=utils.local_path_export(),
                          simple_vcf=simple_vcf,
                          sample=sample,
                          caller=caller,
                          tx_out_file=tx_table),
               "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
def to_vcf(in_file, caller, header_fn, vcf_fn, data, sep="\t"):
    """Convert a caller's segment table (e.g. TitanCNA segs) into bgzipped VCF.

    Args:
        in_file: delimited text file of segments.
        caller: caller name substituted into the VCF header template.
        header_fn: callable taking the open input handle and returning
            ``(header, handle)``; ``header`` is the list of column names.
        vcf_fn: callable taking a ``{column: value}`` dict for one row and
            returning the list of VCF fields, or a falsy value to skip the row.
        data: sample dictionary; supplies the sample name.
        sep: column separator of ``in_file``.

    Returns:
        The annotated VCF from ``effects.add_to_vcf`` when it produces one,
        otherwise the bgzipped VCF path.
    """
    out_file = "%s.vcf" % utils.splitext_plus(in_file)[0]
    out_file_gz = out_file + ".gz"
    if not utils.file_exists(out_file_gz) and not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(_vcf_header.format(caller=caller))
                    out_handle.write("\t".join([
                        "#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
                        "INFO", "FORMAT", dd.get_sample_name(data)]) + "\n")
                    header, in_handle = header_fn(in_handle)
                    for line in in_handle:
                        out = vcf_fn(dict(zip(header, line.strip().split(sep))))
                        if out:
                            out_handle.write("\t".join(out) + "\n")
    # Guard post-processing on its own: a finished run is left untouched and
    # a run interrupted after writing the plain VCF can still complete.
    if not utils.file_exists(out_file_gz):
        # also does bgzip and index
        out_file_prep_vcf_gz = vcfutils.sort_by_ref(out_file, data)
        shutil.move(out_file_prep_vcf_gz, out_file_gz)
        shutil.move(out_file_prep_vcf_gz + ".tbi", out_file_gz + ".tbi")
    effects_vcf, _ = effects.add_to_vcf(out_file_gz, data, "snpeff")
    return effects_vcf or out_file_gz
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.

    Args:
        bedpe_file: lumpy BEDPE output to convert.
        sconfig_file: configuration file passed to ``bedpeToVcf`` with ``-c``.
        items: list of sample dictionaries; ``items[0]`` supplies the
            reference FASTA and the configuration.

    Returns:
        The bgzipped VCF, or None when the ``bedpeToVcf`` script cannot be
        found.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if not tovcf_script:
        # no converter available: nothing to produce
        return None
    out_file = "%s.vcf.gz" % utils.splitext_plus(bedpe_file)[0]
    out_nogzip = out_file.replace(".vcf.gz", ".vcf")
    raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
    if not utils.file_exists(out_file):
        if not utils.file_exists(raw_file):
            # Pass the sample data like the other transactions in this file do
            with file_transaction(items[0], raw_file) as tx_raw_file:
                ref_file = tz.get_in(["reference", "fasta", "base"], items[0])
                cmd = [sys.executable, tovcf_script,
                       "-c", sconfig_file,
                       "-f", ref_file,
                       "-b", bedpe_file,
                       "-o", tx_raw_file]
                do.run(cmd, "Convert lumpy bedpe output to VCF")
        prep_file = vcfutils.sort_by_ref(raw_file, items[0])
        if not utils.file_exists(out_nogzip):
            utils.symlink_plus(prep_file, out_nogzip)
        out_file = vcfutils.bgzip_and_index(out_nogzip, items[0]["config"])
    return out_file
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    When the TSV is missing: select calls with ``bcbio-prioritize known``,
    apply the optional ``post_prior_fn`` hook, annotate with
    ``simple_sv_annotation.py``, sort and index, then flatten passing
    non-reference calls to TSV with vawk.

    Returns:
        Path of the prioritized TSV.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_priority:
            known_cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
            do.run(known_cmd.format(vcf_file=vcf_file, tx_out_file=tx_priority,
                                    prioritize_by=prioritize_by),
                   "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_simple:
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            ann_opt = ""
            if transcript_file:
                # the annotation script receives the bgzipped, indexed BED
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            annotate_cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(annotate_cmd.format(ann_opt=ann_opt, priority_vcf=priority_vcf,
                                       tx_out_file=tx_simple),
                   "Prioritize: simplified annotation output")
    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])

    with file_transaction(data, out_file) as tx_tsv:
        tsv_cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        do.run(tsv_cmd.format(simple_vcf=simple_vcf, sample=sample,
                              caller=caller, tx_out_file=tx_tsv),
               "Prioritize: convert to tab delimited")
    return out_file
def _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    Args:
        full_bams: list of full alignment BAM paths (``-B``).
        sr_bams: list of split-read BAM paths (``-S``).
        disc_bams: list of discordant-read BAM paths (``-D``).
        previous_evidence: mapping of sample name to a mapping of evidence
            type to file path; existing files are supplied as
            ``sample:file`` entries to ``-d``.
        work_dir: directory in which the output VCF is created.
        items: list of sample dictionaries; ``items[0]`` names the output
            and is passed to the transaction helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` on the lumpy VCF,
        exclusion BED returned by ``sshared.prepare_exclude_file``).
    """
    first = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(
        work_dir,
        "%s%s.vcf" % (os.path.splitext(os.path.basename(first["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file:
            with tx_tmpdir(first) as tmpdir:
                have_exclude = sv_exclude_bed and utils.file_exists(sv_exclude_bed)
                exclude = "-x %s" % sv_exclude_bed if have_exclude else ""
                # Only the evidence files matter here, not their type keys
                depths = ["%s:%s" % (sample, ev_file)
                          for sample, ev_files in previous_evidence.items()
                          for ev_file in ev_files.values()
                          if utils.file_exists(ev_file)]
                depth_arg = "-d %s" % ",".join(depths) if depths else ""
                # use our bcbio python for runs within lumpyexpress
                exports = utils.local_path_export()
                cmd = ("{exports}lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} {depth_arg} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(exports=exports,
                                  full_bams=",".join(full_bams),
                                  sr_bams=",".join(sr_bams),
                                  disc_bams=",".join(disc_bams),
                                  exclude=exclude,
                                  depth_arg=depth_arg,
                                  tmpdir=tmpdir,
                                  tx_out_file=tx_out_file),
                       "lumpyexpress", first)
    return vcfutils.sort_by_ref(out_file, first), sv_exclude_bed
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Nothing is recomputed when the TSV already exists. Otherwise calls are
    selected with ``bcbio-prioritize known`` (JVM options come from the
    ``bcbio_prioritize`` resources), optionally post-processed, annotated,
    sorted and finally flattened to TSV with vawk.

    Returns:
        Path of the prioritized TSV.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_known:
                resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                default_jvm = ["-Xms1g", "-Xmx4g"]
                known_params = {
                    "jvm_opts": " ".join(resources.get("jvm_opts", default_jvm)),
                    "vcf_file": vcf_file,
                    "tx_out_file": tx_known,
                    "prioritize_by": prioritize_by,
                }
                cmd = ("bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**known_params),
                       "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_annotated:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if not transcript_file:
                    ann_opt = ""
                else:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(ann_opt=ann_opt,
                                  priority_vcf=priority_vcf,
                                  tx_out_file=tx_annotated),
                       "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data),
                                              data["config"])

        with file_transaction(data, out_file) as tx_table:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(simple_vcf=simple_vcf,
                              sample=sample,
                              caller=caller,
                              tx_out_file=tx_table),
                   "Prioritize: convert to tab delimited")
    return out_file
def _run_svtyper(in_file, full_bam, sr_bam, data):
    """Genotype structural variant calls with SVtyper.

    Args:
        in_file: gzipped VCF of structural variant calls (read with
            ``gunzip -c``).
        full_bam: BAM file handed to svtyper with ``-B``.
        sr_bam: split-read BAM handed to svtyper with ``-S``.
        data: sample dictionary passed to the transaction and sort helpers.

    Returns:
        Whatever ``vcfutils.sort_by_ref`` returns for the genotyped VCF.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through via the
                # transactional path so a partial copy never sits at out_file.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                # svtyper is expected next to the running interpreter
                svtyper = os.path.join(os.path.dirname(python), "svtyper")
                cmd = ("gunzip -c {in_file} | "
                       "{python} {svtyper} -B {full_bam} -S {sr_bam} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, python=python, svtyper=svtyper,
                                  full_bam=full_bam, sr_bam=sr_bam,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def _run_wham_coords(inputs, background_bams, coords, final_file):
    """Run WHAM on a specific set of chromosome, start, end coordinates.

    The raw region VCF is named after ``final_file`` plus a filename-safe
    form of ``coords``. It is then merged, genotyped and sorted.

    Returns:
        A one-element list holding ``[coords, prepared_vcf]``.
    """
    sample = inputs[0]
    safe_coords = region.to_safestr(coords)
    base, _ = utils.splitext_plus(final_file)
    raw_file = "%s-%s.vcf" % (base, safe_coords)
    all_bams = ",".join([item["align_bam"] for item in inputs] + background_bams)
    if not utils.file_exists(raw_file):
        with file_transaction(sample, raw_file) as tx_raw_file:
            params = {
                "cores": dd.get_cores(sample),
                "ref_file": dd.get_ref_file(sample),
                "coord_str": bamprep.region_to_gatk(coords),
                "opts": "-k -m 30",
                "all_bams": all_bams,
                "tx_raw_file": tx_raw_file,
            }
            cmd = ("WHAM-GRAPHENING {opts} -x {cores} -a {ref_file} -f {all_bams} -r {coord_str} "
                   "> {tx_raw_file}")
            do.run(cmd.format(**params), "Run WHAM: %s" % safe_coords)
    merged = _run_wham_merge(raw_file, sample)
    genotyped = _run_wham_genotype(merged, all_bams, coords, sample)
    return [[coords, vcfutils.sort_by_ref(genotyped, sample)]]
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.

    Uses the ``bedpeToVcf`` script when it can be located; the raw VCF is
    sorted, linked under the final name and bgzipped plus indexed.

    Returns:
        The bgzipped VCF, or None when ``bedpeToVcf`` is not available.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if not tovcf_script:
        return None
    sample = items[0]
    stem = utils.splitext_plus(bedpe_file)[0]
    out_file = "%s.vcf.gz" % stem
    out_nogzip = out_file.replace(".vcf.gz", ".vcf")
    raw_file = "%s-raw.vcf" % utils.splitext_plus(bedpe_file)[0]
    if utils.file_exists(out_file):
        return out_file
    if not utils.file_exists(raw_file):
        with file_transaction(sample, raw_file) as tx_raw_file:
            convert_cmd = [sys.executable, tovcf_script]
            convert_cmd += ["-c", sconfig_file]
            convert_cmd += ["-f", dd.get_ref_file(sample)]
            convert_cmd += ["-t", "LUMPY"]
            convert_cmd += ["-b", bedpe_file]
            convert_cmd += ["-o", tx_raw_file]
            do.run(convert_cmd, "Convert lumpy bedpe output to VCF")
    sorted_vcf = vcfutils.sort_by_ref(raw_file, sample)
    if not utils.file_exists(out_nogzip):
        utils.symlink_plus(sorted_vcf, out_nogzip)
    return vcfutils.bgzip_and_index(out_nogzip, sample["config"])
def to_vcf(in_tsv, data):
    """Convert seq2c output file into VCF output.

    Only rows called ``Amp`` or ``Del`` in the ``Amp_Del`` column are
    written, as ``<DUP>`` and ``<DEL>`` records with a ``1/1`` genotype.

    Args:
        in_tsv: tab-delimited seq2c table with a header row. The columns
            read are Amp_Del, Chr, Start, End, Log2ratio, Ab_Seg and Gene.
        data: sample dictionary; supplies the sample name for the VCF header.

    Returns:
        Whatever ``vcfutils.sort_by_ref`` returns for the written VCF.
    """
    call_convert = {"Amp": "DUP", "Del": "DEL"}
    out_file = "%s.vcf" % utils.splitext_plus(in_tsv)[0]
    if not utils.file_uptodate(out_file, in_tsv):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_tsv) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(VCF_HEADER + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t%s\n"
                                     % (dd.get_sample_name(data)))
                    # Drop line endings before splitting so the final column
                    # is keyed and valued without a trailing newline.
                    header = in_handle.readline().rstrip("\r\n").split("\t")
                    rows = (raw.rstrip("\r\n") for raw in in_handle)
                    # blank lines carry no call and are skipped
                    for cur in (dict(zip(header, row.split("\t"))) for row in rows if row):
                        if cur["Amp_Del"] in call_convert:
                            svtype = call_convert[cur["Amp_Del"]]
                            info = "SVTYPE=%s;END=%s;SVLEN=%s;FOLD_CHANGE_LOG=%s;PROBES=%s;GENE=%s" % (
                                svtype, cur["End"], int(cur["End"]) - int(cur["Start"]),
                                cur["Log2ratio"], cur["Ab_Seg"], cur["Gene"])
                            out_handle.write("\t".join([cur["Chr"], cur["Start"], ".", "N", "<%s>" % (svtype),
                                                        ".", ".", info, "GT", "1/1"]) + "\n")
    return vcfutils.sort_by_ref(out_file, data)
def _run_svtyper(in_file, full_bam, sr_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:
    https://github.com/hall-lab/svtyper/issues/16

    Args:
        in_file: VCF of structural variant calls to genotype.
        full_bam: BAM file handed to svtyper with ``-B``.
        sr_bam: split-read BAM handed to svtyper with ``-S``.
        exclude_file: optional regions file; when it exists, calls inside it
            are dropped with ``bcftools view -T ^<file>``.
        data: sample dictionary passed to the transaction and sort helpers.

    Returns:
        Whatever ``vcfutils.sort_by_ref`` returns for the genotyped VCF.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Nothing to genotype: pass the input through via the
                # transactional path so a partial copy never sits at out_file.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                # svtyper is expected next to the running interpreter
                svtyper = os.path.join(os.path.dirname(python), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} -M -B {full_bam} -S {sr_bam} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm,
                                  python=python, svtyper=svtyper,
                                  full_bam=full_bam, sr_bam=sr_bam,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items):
    """Run lumpy-sv, using speedseq pipeline.

    Args:
        full_bams: list of full alignment BAM paths (``-B``).
        sr_bams: list of split-read BAM paths (``-S``).
        disc_bams: list of discordant-read BAM paths (``-D``).
        work_dir: directory in which the output VCF is created.
        items: list of sample dictionaries; ``items[0]`` names the output
            (from its ``align_bam``) and is passed to the transaction helpers.

    Returns:
        Tuple of (result of ``vcfutils.sort_by_ref`` on the lumpy VCF,
        exclusion BED returned by ``sshared.prepare_exclude_file``).
    """
    first = items[0]
    batch = sshared.get_cur_batch(items)
    ext = "-%s-svs" % batch if batch else "-svs"
    out_file = os.path.join(
        work_dir,
        "%s%s.vcf" % (os.path.splitext(os.path.basename(first["align_bam"]))[0], ext))
    sv_exclude_bed = sshared.prepare_exclude_file(items, out_file)
    if not utils.file_exists(out_file):
        with file_transaction(first, out_file) as tx_out_file:
            with tx_tmpdir(first) as tmpdir:
                # Skip -x entirely when no exclusion file was prepared
                have_exclude = sv_exclude_bed and utils.file_exists(sv_exclude_bed)
                exclude = "-x %s" % sv_exclude_bed if have_exclude else ""
                # use our bcbio python for runs within lumpyexpress
                curpython_dir = os.path.dirname(sys.executable)
                cmd = ("export PATH={curpython_dir}:$PATH && "
                       "lumpyexpress -v -B {full_bams} -S {sr_bams} -D {disc_bams} "
                       "{exclude} -T {tmpdir} -o {tx_out_file}")
                do.run(cmd.format(curpython_dir=curpython_dir,
                                  full_bams=",".join(full_bams),
                                  sr_bams=",".join(sr_bams),
                                  disc_bams=",".join(disc_bams),
                                  exclude=exclude,
                                  tmpdir=tmpdir,
                                  tx_out_file=tx_out_file),
                       "lumpyexpress", first)
    return vcfutils.sort_by_ref(out_file, first), sv_exclude_bed