def _cluster(bam_file, ma_file, out_dir, reference, annotation_file=None): """ Connect to seqcluster to run cluster with python directly """ seqcluster = op.join(get_bcbio_bin(), "seqcluster") # cl = ["cluster", "-o", out_dir, "-m", ma_file, "-a", bam_file, "-r", reference] if annotation_file: annotation_file = "-g " + annotation_file else: annotation_file = "" if not file_exists(op.join(out_dir, "counts.tsv")): cmd = ( "{seqcluster} cluster -o {out_dir} -m {ma_file} -a {bam_file} -r {reference} {annotation_file}" ) do.run(cmd.format(**locals()), "Running seqcluster.") counts = op.join(out_dir, "counts.tsv") stats = op.join(out_dir, "read_stats.tsv") json = op.join(out_dir, "seqcluster.json") return { 'out_dir': out_dir, 'count_file': counts, 'stat_file': stats, 'json': json }
def remove_bcbiopath(): """Remove bcbio internal path from first element in PATH. Useful when we need to access remote programs, like Java 7 for older installations. """ to_remove = os.environ.get("BCBIOPATH", utils.get_bcbio_bin()) + ":" if os.environ["PATH"].startswith(to_remove): os.environ["PATH"] = os.environ["PATH"][len(to_remove):]
def _collapse(in_file): """ Collpase reads into unique sequences with seqcluster """ seqcluster = op.join(utils.get_bcbio_bin(), "seqcluster") out_file = "%s.fastq" % utils.splitext_plus(append_stem(in_file, "_trimmed"))[0] out_dir = os.path.dirname(in_file) if file_exists(out_file): return out_file cmd = ("{seqcluster} collapse -o {out_dir} -f {in_file} -m 1 --min_size 16") do.run(cmd.format(**locals()), "Running seqcluster collapse in %s." % in_file) return out_file
def _collapse(in_file): """ Collpase reads into unique sequences with seqcluster """ seqcluster = op.join(utils.get_bcbio_bin(), "seqcluster") out_file = "%s.fastq" % utils.splitext_plus(append_stem(in_file, "_trimmed"))[0] out_dir = os.path.dirname(in_file) if file_exists(out_file): return out_file cmd = ("{seqcluster} collapse -o {out_dir} -f {in_file} -m 1 --min_size 16") do.run(cmd.format(**locals()), "Running seqcluster collapse in %s." % in_file) return out_file
def prepend_bcbiopath(): """Prepend paths in the BCBIOPATH environment variable (if any) to PATH. Uses either a pre-sent global environmental variable (BCBIOPATH) or the local anaconda directory. """ if os.environ.get('BCBIOPATH'): os.environ['PATH'] = _prepend(os.environ.get('PATH', ''), os.environ.get('BCBIOPATH', None)) else: os.environ['PATH'] = _prepend(os.environ.get('PATH', ''), utils.get_bcbio_bin())
def _report(data, reference): """ Run report of seqcluster to get browser options for results """ seqcluster = op.join(get_bcbio_bin(), "seqcluster") work_dir = dd.get_work_dir(data) out_dir = safe_makedir(os.path.join(work_dir, "seqcluster", "report")) out_file = op.join(out_dir, "seqcluster.db") json = op.join(work_dir, "seqcluster", "cluster", "seqcluster.json") cmd = ("{seqcluster} report -o {out_dir} -r {reference} -j {json}") if not file_exists(out_file): do.run(cmd.format(**locals()), "Run report on clusters") return out_file
def run_gatk(self, params, tmp_dir=None, log_error=True, data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False): """Top level interface to running a GATK command. ld_preload injects required libraries for Java JNI calls: https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow """ needs_java7 = LooseVersion( self.get_gatk_version()) < LooseVersion("3.6") # For old Java requirements use global java 7 if needs_java7: setpath.remove_bcbiopath() with tx_tmpdir(self._config) as local_tmp_dir: if tmp_dir is None: tmp_dir = local_tmp_dir cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc) atype_index = params.index("-T") if params.count("-T") > 0 \ else params.index("--analysis_type") prog = params[atype_index + 1] cl = fix_missing_spark_user(cl, prog, params) if ld_preload: cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % ( os.path.dirname(utils.get_bcbio_bin()), cl) do.run(cl, "GATK: {0}".format(prog), data, region=region, log_error=log_error) if needs_java7: setpath.prepend_bcbiopath()
def _get_ericscript_path(self): """Retrieve PATH to the isolated eriscript anaconda environment. """ es = utils.which(os.path.join(utils.get_bcbio_bin(), self.EXECUTABLE)) return os.path.dirname(os.path.realpath(es))
def run_gatk(self, params, tmp_dir=None, log_error=True, data=None, region=None, memscale=None, parallel_gc=False, ld_preload=False): """Top level interface to running a GATK command. ld_preload injects required libraries for Java JNI calls: https://gatkforums.broadinstitute.org/gatk/discussion/8810/something-about-create-pon-workflow """ needs_java7 = LooseVersion(self.get_gatk_version()) < LooseVersion("3.6") # For old Java requirements use global java 7 if needs_java7: setpath.remove_bcbiopath() with tx_tmpdir(self._config) as local_tmp_dir: if tmp_dir is None: tmp_dir = local_tmp_dir cl = self.cl_gatk(params, tmp_dir, memscale=memscale, parallel_gc=parallel_gc) atype_index = params.index("-T") if params.count("-T") > 0 \ else params.index("--analysis_type") prog = params[atype_index + 1] cl = fix_missing_spark_user(cl, prog, params) if ld_preload: cl = "export LD_PRELOAD=%s/lib/libopenblas.so && %s" % (os.path.dirname(utils.get_bcbio_bin()), cl) do.run(cl, "GATK: {0}".format(prog), data, region=region, log_error=log_error) if needs_java7: setpath.prepend_bcbiopath()
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext( align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: target = shared.subset_variant_regions(dd.get_variant_regions( items[0]), region, out_file, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf( tx_out_file, config, samples=[ x for x in [paired.tumor_name, paired.normal_name] if x ]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) strandbias = "testsomatic.R" var2vcf = "var2vcf_paired.pl" compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts, var2vcf_opts = _vardict_options_from_config( items, config, out_file, target) fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(( "config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ( "| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " """| %s -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("%s", "%s")' """ % (sys.executable, paired.tumor_name, paired.normal_name)) freq_filter = ( "| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), 0, dd.get_aligner(paired.tumor_data))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) py_cl = os.path.join(utils.get_bcbio_bin(), "py") setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl( ref_file, tx_out_file) cmd = ( "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| {strandbias} " "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "| {contig_cl} {freq_filter} " "| bcftools filter -i 'QUAL >= 0' " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [ ] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" opts, var2vcf_opts = _vardict_options_from_config( items, config, out_file, target) vcfstreamsort = config_utils.get_program( "vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith( "gz") else "" freq = float( utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl( ref_file, tx_out_file) cmd = ( "{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -A -N {sample} -E -f {freq} {var2vcf_opts} " "| {contig_cl} | bcftools filter -i 'QUAL >= 0' " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}" ) if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace( ".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files( orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file
def _get_ericscript_path(self): """Retrieve PATH to the isolated eriscript anaconda environment. """ es = utils.which(os.path.join(utils.get_bcbio_bin(), self.EXECUTABLE)) return os.path.dirname(os.path.realpath(es))
def add_contig_to_header_cl(data): return ("""%s -x 'bcbio.variation.vcfutils.add_contig_to_header(x, "%s")' """ % (os.path.join(utils.get_bcbio_bin(), "py"), dd.get_ref_file(data)))
def _run_vardict_paired(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect variants with Vardict. This is used for paired tumor / normal samples. """ config = items[0]["config"] if out_file is None: out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions(vrs, region, out_file, items=items, do_merge=True) paired = vcfutils.get_paired_bams(align_bams, items) if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[x for x in [paired.tumor_name, paired.normal_name] if x]) else: if not paired.normal_bam: ann_file = _run_vardict_caller(align_bams, items, ref_file, assoc_files, region, out_file) return ann_file vardict = get_vardict_command(items[0]) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 # merge bed file regions as amplicon VarDict is only supported in single sample mode opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, []) for data in items): somatic_filter = "" freq_filter = "" else: var2vcf_opts += " -M " # this makes VarDict soft filter non-differential variants somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' " "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' " """| %s -c 'from bcbio.variation import freebayes; """ """freebayes.call_somatic("%s", "%s")' """ % (sys.executable, paired.tumor_name, paired.normal_name)) freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null " "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' " "| %s " "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'" % (os.path.join(os.path.dirname(sys.executable), "py"), _lowfreq_linear_filter(0, True), os.path.join(os.path.dirname(sys.executable), "py"), 0, bam.aligner_from_header(paired.tumor_bam))) jvm_opts = _get_jvm_opts(items[0], tx_out_file) py_cl = os.path.join(utils.get_bcbio_bin(), "py") setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} " "| awk 'NF>=48' | testsomatic.R " "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} " "-N \"{paired.tumor_name}|{paired.normal_name}\" " "| {contig_cl} {freq_filter} " "| bcftools filter -i 'QUAL >= 0' " "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} " "{compress_cmd} > {tx_out_file}") do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) return out_file
def _run_vardict_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None): """Detect SNPs and indels with VarDict. var2vcf_valid uses -A flag which reports all alleles and improves sensitivity: https://github.com/AstraZeneca-NGS/VarDict/issues/35#issuecomment-276738191 """ config = items[0]["config"] if out_file is None: out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: vrs = bedutils.population_variant_regions(items) target = shared.subset_variant_regions( vrs, region, out_file, items=items, do_merge=False) num_bams = len(align_bams) sample_vcf_names = [] # for individual sample names, given batch calling may be required for bamfile, item in zip(align_bams, items): # prepare commands sample = dd.get_sample_name(item) vardict = get_vardict_command(items[0]) opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target) vcfstreamsort = config_utils.get_program("vcfstreamsort", config) compress_cmd = "| bgzip -c" if tx_out_file.endswith("gz") else "" freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0 fix_ambig_ref = vcfutils.fix_ambiguous_cl() fix_ambig_alt = vcfutils.fix_ambiguous_cl(5) remove_dup = vcfutils.remove_dup_cl() py_cl = os.path.join(utils.get_bcbio_bin(), "py") jvm_opts = _get_jvm_opts(items[0], tx_out_file) setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports()) contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file) lowfreq_filter = _lowfreq_linear_filter(0, False) cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| teststrandbias.R " "| var2vcf_valid.pl -A -N {sample} -E -f {freq} {var2vcf_opts} " "| {contig_cl} | bcftools filter -i 'QUAL >= 0' | {lowfreq_filter} " "| {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} {compress_cmd}") if num_bams > 1: temp_file_prefix = out_file.replace(".gz", "").replace(".vcf", "") + item["name"][1] tmp_out = temp_file_prefix + ".temp.vcf" tmp_out += ".gz" if out_file.endswith("gz") else "" sample_vcf_names.append(tmp_out) with file_transaction(item, tmp_out) as tx_tmp_file: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_tmp_file, config, samples=[sample]) else: cmd += " > {tx_tmp_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) else: if not _is_bed_file(target): vcfutils.write_empty_vcf(tx_out_file, config, samples=[sample]) else: cmd += " > {tx_out_file}" do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {}) if num_bams > 1: # N.B. merge_variant_files wants region in 1-based end-inclusive # coordinates. Thus use bamprep.region_to_gatk vcfutils.merge_variant_files(orig_files=sample_vcf_names, out_file=tx_out_file, ref_file=ref_file, config=config, region=bamprep.region_to_gatk(region)) return out_file