def rnaseq_vardict_variant_calling(data): sample = dd.get_sample_name(data) variation_dir = os.path.join(dd.get_work_dir(data), "variation") safe_makedir(variation_dir) out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz") if file_exists(out_file): data = dd.set_vrn_file(data, out_file) return data vardict_cmd = vardict.get_vardict_command(data) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" vcfstreamsort = config_utils.get_program("vcfstreamsort", data) compress_cmd = "| bgzip -c" freq = float(dd.get_min_allele_fraction(data, 20) / 100.0) var2vcf_opts = "-v 50" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() r_setup = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd())) ref_file = dd.get_ref_file(data) bamfile = dd.get_work_bam(data) bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data)) opts = " -c 1 -S 2 -E 3 -g 4 " with file_transaction(out_file) as tx_out_file: jvm_opts = vardict._get_jvm_opts(data, tx_out_file) cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} {bed_file} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} " "> {tx_out_file}") message = "Calling RNA-seq variants with VarDict" do.run(cmd.format(**locals()), message) data = dd.set_vrn_file(data, out_file) return data
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/bcbio/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller: if "gatk-haplotype" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) if dd.get_vrn_file(data): ann_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), ["rnaedit"], data) if ann_file: data = dd.set_vrn_file(data, ann_file) ann_file = population.run_vcfanno(dd.get_vrn_file(data), data, population.do_db_build([data])) if ann_file: data = dd.set_vrn_file(data, ann_file) return [[data]]
def run_rnaseq_variant_calling(data): variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/chapmanb/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller and "gatk" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) return [[data]]
def rnaseq_vardict_variant_calling(data): sample = dd.get_sample_name(data) out_dir = utils.safe_makedir( os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict")) out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz") if file_exists(out_file): data = dd.set_vrn_file(data, out_file) return data vardict_cmd = vardict.get_vardict_command(data) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" vcfstreamsort = config_utils.get_program("vcfstreamsort", data) compress_cmd = "| bgzip -c" freq = float(dd.get_min_allele_fraction(data, 20) / 100.0) var2vcf_opts = "-v 50" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() r_setup = get_R_exports() ref_file = dd.get_ref_file(data) bamfile = dd.get_work_bam(data) data = _setup_variant_regions(data, out_dir) opts, _ = vardict._vardict_options_from_config( [data], data["config"], out_file, dd.get_variant_regions(data), is_rnaseq=True) cores = dd.get_num_cores(data) if cores and cores > 1: opts += " -th %s" % str(cores) with file_transaction(data, out_file) as tx_out_file: jvm_opts = vardict._get_jvm_opts(data, tx_out_file) cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} " "> {tx_out_file}") message = "Calling RNA-seq variants with VarDict" do.run(cmd.format(**locals()), message) out_file = vcfutils.bgzip_and_index(out_file, data["config"]) data = dd.set_vrn_file(data, out_file) return data
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/bcbio/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller: if "gatk-haplotype" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) vrn_file = dd.get_vrn_file(data) return [[data]]
def rnaseq_vardict_variant_calling(data): sample = dd.get_sample_name(data) out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq", "vardict")) out_file = os.path.join(out_dir, sample + "-vardict.vcf.gz") if file_exists(out_file): data = dd.set_vrn_file(data, out_file) return data vardict_cmd = vardict.get_vardict_command(data) strandbias = "teststrandbias.R" var2vcf = "var2vcf_valid.pl" vcfstreamsort = config_utils.get_program("vcfstreamsort", data) compress_cmd = "| bgzip -c" freq = float(dd.get_min_allele_fraction(data, 20) / 100.0) var2vcf_opts = "-v 50" fix_ambig = vcfutils.fix_ambiguous_cl() remove_dup = vcfutils.remove_dup_cl() r_setup = get_R_exports() ref_file = dd.get_ref_file(data) bamfile = dd.get_work_bam(data) data = _setup_variant_regions(data, out_dir) opts, _ = vardict._vardict_options_from_config([data], data["config"], out_file, dd.get_variant_regions(data), is_rnaseq=True) cores = dd.get_num_cores(data) if cores and cores > 1: opts += " -th %s" % str(cores) with file_transaction(data, out_file) as tx_out_file: jvm_opts = vardict._get_jvm_opts(data, tx_out_file) cmd = ("{r_setup} && {jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} " "-N {sample} -b {bamfile} {opts} " "| {strandbias}" "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} " "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} " "> {tx_out_file}") message = "Calling RNA-seq variants with VarDict" do.run(cmd.format(**locals()), message) out_file = vcfutils.bgzip_and_index(out_file, data["config"]) data = dd.set_vrn_file(data, out_file) return data
def run_rnaseq_variant_calling(data): """ run RNA-seq variant calling, variation file is stored in `vrn_file` in the datadict """ variantcaller = dd.get_variantcaller(data) if isinstance(variantcaller, list) and len(variantcaller) > 1: logger.error("Only one variantcaller can be run for RNA-seq at " "this time. Post an issue here " "(https://github.com/chapmanb/bcbio-nextgen/issues) " "if this is something you need to do.") sys.exit(1) if variantcaller and "gatk" in variantcaller: data = variation.rnaseq_gatk_variant_calling(data) if vardict.get_vardict_command(data): data = variation.rnaseq_vardict_variant_calling(data) # annotate RNA-editing events with vcfanno if dd.get_vrn_file(data): vrn_file = vcfanno.run_vcfanno(dd.get_vrn_file(data), "rnaedit", data) data = dd.set_vrn_file(data, vrn_file) return [[data]]