def call_variant_varscan(bam, orig_genome_path, bed, conf):
    """
    Call variants by piping a samtools pileup of the BAM through VarScan mpileup2cns.

    :param bam: Alignment file handed to ``samtools mpileup``
    :param orig_genome_path: Reference passed to samtools with ``-f``
    :param bed: Regions file passed to samtools with ``-l``, or None for no region restriction
    :param conf: Configuration object; 'samtools_path' and 'varscan_path' are read from section 'main'
    :return: The value returned by util.bgz_tabix for the VCF written here
    :raises subprocess.CalledProcessError: If samtools or VarScan exits with a non-zero status
    """
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed
    cmd = (
        conf.get("main", "samtools_path")
        + " mpileup "
        + " -f "
        + orig_genome_path
        + " -o "
        + pre_output
        + " "
        + bedarg
        + " "
        + bam
    )
    subprocess.check_call(cmd, shell=True)
    cmd2 = (
        "java -Xmx2g -jar "
        + conf.get("main", "varscan_path")
        + " mpileup2cns "
        + pre_output
        + " --variants --output-vcf 1 --output-file "
        + vcfoutput
    )
    output = subprocess.check_output(cmd2, shell=True)
    # check_output returns bytes on Python 3; a text-mode file would reject
    # them with TypeError, so write the captured stdout verbatim in binary mode.
    with open(vcfoutput, "wb") as fh:
        fh.write(output)
    return util.bgz_tabix(vcfoutput, conf)
def normalize_nothing(orig_vcf, conf):
    """
    Just copy the original vcf to a new, identical vcf file.

    :param orig_vcf: Path of the VCF to copy
    :param conf: Configuration object, forwarded to util.bgz_tabix
    :return: The value returned by util.bgz_tabix for the copied file
    :raises subprocess.CalledProcessError: If ``cp`` exits with a non-zero status
    """
    new_vcf = util.strip_extensions(orig_vcf, ['vcf']) + '.nonorm.vcf'
    # Pass an argument list rather than a shell string so paths containing
    # spaces or shell metacharacters are copied literally.
    subprocess.check_call(['cp', orig_vcf, new_vcf])
    return util.bgz_tabix(new_vcf, conf)
def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
    """
    Compare a test VCF against an original VCF using every normalizer/comparator pair.

    Both inputs are made absolute, then the work is done inside a util.TempDir
    whose name is derived from the test VCF. For each normalizer in
    self.normalizers both VCFs are normalized, each comparator in
    self.comparators is run on the pair, and the per-region results are stored
    keyed by variant, caller, normalizer and comparator. Everything collected is
    finally handed to self.reporter.write_output.

    :param raw_orig_vcf: Path to the VCF holding the original variants
    :param raw_test_vcf: Path to the VCF to compare against the original
    :raises ValueError: If no original variant can be found for one of the regions
    """
    raw_orig_vcf = os.path.abspath(raw_orig_vcf)
    raw_test_vcf = os.path.abspath(raw_test_vcf)
    orig_vars = list(pysam.VariantFile(raw_orig_vcf))
    tmp_dirname = util.strip_extensions(raw_test_vcf, ['gz', 'vcf']) + "-vcomp-" + util.randstr()

    with util.TempDir(dirname=tmp_dirname):
        orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
        test_vcf = util.remove_halfcalls(raw_test_vcf)
        test_vcf = util.bgz_tabix(test_vcf, self.conf)
        # The test file's name without its extensions doubles as the caller label.
        caller_name = util.strip_extensions(test_vcf, ['gz', 'vcf'])
        bed = util.vars_to_bed(orig_vars)

        var_results = defaultdict(dict)
        var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
        bamstats = defaultdict(dict)

        # .items() instead of the Python 2-only .iteritems(), so the loops run on both versions.
        for normalizer_name, normalizer in self.normalizers.items():
            logging.info("--> Running normalizer " + normalizer_name)
            normed_orig_vcf = normalizer(orig_vcf, self.conf)
            normed_caller_vcf = normalizer(test_vcf, self.conf)

            for comparator_name, comparator in self.comparators.items():
                logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                single_results = split_results(all_results, bed)

                for region, result in zip(util.read_regions(bed), single_results):
                    match_vars = util.find_matching_var(orig_vcf, region)
                    if not match_vars:
                        raise ValueError('Unable to find original variant from region ' + str(region))

                    # Genotype of the first sample of the first matching variant, e.g. "0/1".
                    genotype = "/".join(str(i) for i in match_vars[0].samples[0]['GT'])
                    result = compare_single_var(
                        result,
                        region,
                        normed_orig_vcf,
                        normed_caller_vcf,
                        comparator,
                        genotype,
                        self.conf,
                    )

                    key = var_key(match_vars)
                    if caller_name not in var_results[key]:
                        var_results[key][caller_name] = defaultdict(dict)
                    var_results[key][caller_name][normalizer_name][comparator_name] = result
                    bamstats[key] = {}

    # Iterate over all results and write to standard output. We do this here instead of within the loops above
    # because it keeps results organized by variant, which makes them easier to look at
    self.reporter.write_output(var_results, var_quals, bamstats)
def normalize_vt(orig_vcf, conf):
    """
    Use vt to normalize

    :param orig_vcf: Path of the VCF to normalize
    :param conf: configuration object with path to reference genome and vt binary
    :return: The value returned by util.bgz_tabix for the normalized VCF
    :raises subprocess.CalledProcessError: If vt exits with a non-zero status
    """
    norm_orig_vcf = orig_vcf.replace(".vcf", ".norm.vt.vcf")
    norm_orig_cmd = (
        conf.get('main', 'vt')
        + " normalize "
        + " -r "
        + conf.get('main', 'ref_genome')
        + " "
        + orig_vcf
        + " -o "
        + norm_orig_vcf
    )
    # vt's stderr is discarded. The sink must be opened for writing (a
    # read-only descriptor cannot accept the child's output) and is closed
    # again as soon as the command finishes.
    with open("/dev/null", "w") as devnull:
        subprocess.check_call(norm_orig_cmd.split(), stderr=devnull)
    return util.bgz_tabix(norm_orig_vcf, conf)
def normalize_nothing(orig_vcf, conf):
    """
    No-op normalizer: duplicate the input vcf under a '.nonorm.vcf' name.

    :param orig_vcf: Path of the VCF to copy
    :param conf: Configuration object, forwarded to util.bgz_tabix
    :return: The value returned by util.bgz_tabix for the copy
    """
    copied_vcf = orig_vcf.replace(".vcf", ".nonorm.vcf")
    # The copy is delegated to the shell's cp.
    subprocess.check_call("cp " + orig_vcf + " " + copied_vcf, shell=True)
    return util.bgz_tabix(copied_vcf, conf)
def normalize_bcftools(orig_vcf, conf):
    """
    Normalize a VCF with ``bcftools norm``.

    :param orig_vcf: Path of the VCF to normalize; its '.vcf.gz' suffix is swapped
        for a randomized '.norm.bcftools<random>.vcf' suffix to name the output
    :param conf: Configuration object; 'bcftools' and 'ref_genome' are read from section 'main'
    :return: The value returned by util.bgz_tabix for the normalized VCF
    """
    output_suffix = ".norm.bcftools" + util.randstr() + ".vcf"
    normalized_vcf = orig_vcf.replace(".vcf.gz", output_suffix)
    pieces = [
        conf.get('main', 'bcftools'),
        " norm ",
        " -c w -f ",
        conf.get('main', 'ref_genome'),
        " ",
        orig_vcf,
        " -o ",
        normalized_vcf,
    ]
    # The joined command is split on whitespace into an argument vector.
    argv = "".join(pieces).split()
    subprocess.check_call(argv)
    return util.bgz_tabix(normalized_vcf, conf)
def call_variant_mp_bcf(bam, genome, bed, conf):
    """
    Call variants with ``samtools mpileup`` followed by ``bcftools call``.

    :param bam: Alignment file handed to ``samtools mpileup``
    :param genome: Reference passed to samtools with ``-f``
    :param bed: Regions file passed to samtools with ``-l``, or None for no region restriction
    :param conf: Configuration object; 'samtools' and 'bcftools' are read from section 'main'
    :return: The value returned by util.bgz_tabix for the called VCF
    :raises subprocess.CalledProcessError: If either tool exits with a non-zero status
    """
    pre_output = "mpileup." + util.randstr() + ".vcf"
    vcfoutput = "output-mp." + util.randstr() + ".vcf"

    # Commands are built as argument lists: without shell=True, check_call
    # treats a plain string as the name of the executable, so the original
    # whole-command string could never be launched.
    mpileup_cmd = [conf.get('main', 'samtools'), 'mpileup', '-f', genome, '-uv', '-o', pre_output]
    if bed is not None:
        mpileup_cmd.extend(['-l', bed])
    mpileup_cmd.append(bam)
    subprocess.check_call(mpileup_cmd)

    call_cmd = [conf.get('main', 'bcftools'), 'call', '-mv', '-o', vcfoutput, pre_output]
    subprocess.check_call(call_cmd)
    return util.bgz_tabix(vcfoutput, conf)
def call_variant_varscan_emit_all(bam, genome, bed, conf):
    """
    Call variants with VarScan mpileup2cns using a permissive ``--p-value 0.5``.

    :param bam: Alignment file handed to ``samtools mpileup``
    :param genome: Reference passed to samtools with ``-f``
    :param bed: Regions file passed to samtools with ``-l``, or None for no region restriction
    :param conf: Configuration object; 'samtools' and 'varscan' are read from section 'main'
    :return: The value returned by util.bgz_tabix for the VCF written here
    :raises subprocess.CalledProcessError: If samtools or VarScan exits with a non-zero status
    """
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"

    # Commands are built as argument lists: without shell=True, check_call
    # treats a plain string as the name of the executable, so the original
    # whole-command strings could never be launched.
    mpileup_cmd = [conf.get('main', 'samtools'), 'mpileup', '-f', genome, '-o', pre_output]
    if bed is not None:
        mpileup_cmd.extend(['-l', bed])
    mpileup_cmd.append(bam)
    subprocess.check_call(mpileup_cmd)

    varscan_cmd = [
        'java', '-Xmx2g', '-jar', conf.get('main', 'varscan'),
        'mpileup2cns', pre_output,
        '--p-value', '0.5',
        '--variants',
        '--output-vcf', '1',
        '--output-file', vcfoutput,
    ]
    # VarScan's stdout is captured into the VCF; the handle is closed before
    # the file is compressed and indexed.
    with open(vcfoutput, 'w') as vcf_fh:
        subprocess.check_call(varscan_cmd, stdout=vcf_fh)
    return util.bgz_tabix(vcfoutput, conf)
def normalize_vap_leftalign(orig_vcf, conf):
    """
    Normalize with vcfallelicprimitives followed by GATK LeftAlignAndTrimVariants.

    :param orig_vcf: Path of the VCF to normalize; it is sorted with util.sort_vcf first
    :param conf: Configuration object; 'vcfallelicprimitives', 'gatk' and 'ref_genome' are read
        from section 'main', plus 'gatk_no_et' when it is available
    :return: The value returned by util.bgz_tabix for the left-aligned VCF
    :raises subprocess.CalledProcessError: If either tool exits with a non-zero status
    """
    orig_vcf = util.sort_vcf(orig_vcf, conf)
    tmp_vcf = orig_vcf.replace(".vcf", ".vap.tmp.vcf").replace(".gz", "")
    final_vcf = orig_vcf.replace(".vcf", ".vap.leftaligned.vcf")

    norm_orig_cmd = conf.get('main', 'vcfallelicprimitives') + " " + orig_vcf
    # open() in a with-block replaces the Python 2-only file() builtin and
    # guarantees the output is flushed and closed before GATK reads it.
    with open(tmp_vcf, 'w') as tmp_fh:
        subprocess.check_call(norm_orig_cmd, shell=True, stdout=tmp_fh)

    no_et = ""
    try:
        no_et = " -et NO_ET -K " + conf.get('main', 'gatk_no_et')
    except Exception:
        # The key is optional: when it cannot be read, run GATK without it.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit through.
        pass

    cmd = (
        "java -Djava.io.tmpdir=. -Xmx1g -jar "
        + conf.get('main', 'gatk')
        + " -T LeftAlignAndTrimVariants "
        + no_et
        + " -R "
        + conf.get('main', 'ref_genome')
        + " -U ALLOW_SEQ_DICT_INCOMPATIBILITY -V "
        + tmp_vcf
        + " -o "
        + final_vcf
    )
    subprocess.check_call(cmd, shell=True)
    return util.bgz_tabix(final_vcf, conf)
def normalize_vap_leftalign(orig_vcf, conf):
    """
    Normalize with vcfallelicprimitives followed by GATK LeftAlignAndTrimVariants.

    :param orig_vcf: Path of the VCF to normalize; it is sorted with util.sort_vcf first
    :param conf: Configuration object; 'vcfallelicprimitives_path', 'gatk_path' and 'ref_genome'
        are read from section 'main', plus 'gatk_no_et' when it is available
    :return: The value returned by util.bgz_tabix for the left-aligned VCF
    :raises subprocess.CalledProcessError: If either tool exits with a non-zero status
    """
    orig_vcf = util.sort_vcf(orig_vcf, conf)
    tmp_vcf = orig_vcf.replace(".vcf", ".vap.tmp.vcf").replace(".gz", "")
    final_vcf = orig_vcf.replace(".vcf", ".vap.leftaligned.vcf")

    norm_orig_cmd = conf.get('main', 'vcfallelicprimitives_path') + " " + orig_vcf
    tmp_output = subprocess.check_output(norm_orig_cmd, shell=True)
    # check_output returns bytes on Python 3; a text-mode file would reject
    # them with TypeError, so write the captured stdout verbatim in binary mode.
    with open(tmp_vcf, "wb") as fh:
        fh.write(tmp_output)

    no_et = ""
    try:
        no_et = " -et NO_ET -K " + conf.get('main', 'gatk_no_et')
    except Exception:
        # The key is optional: when it cannot be read, run GATK without it.
        # Catching Exception (not a bare except) lets Ctrl-C and SystemExit through.
        pass

    cmd = (
        "java -Djava.io.tmpdir=. -Xmx1g -jar "
        + conf.get('main', 'gatk_path')
        + " -T LeftAlignAndTrimVariants "
        + no_et
        + " -R "
        + conf.get('main', 'ref_genome')
        + " -V "
        + tmp_vcf
        + " -o "
        + final_vcf
    )
    # check_output (rather than check_call) keeps GATK's stdout off the console;
    # the captured text itself is not needed.
    subprocess.check_output(cmd, shell=True)
    return util.bgz_tabix(final_vcf, conf)
def call_variant_mp_bcf(bam, orig_genome_path, bed, conf):
    """
    Run ``samtools mpileup`` and feed its output to ``bcftools call``.

    :param bam: Alignment file handed to ``samtools mpileup``
    :param orig_genome_path: Reference passed to samtools with ``-f``
    :param bed: Regions file passed to samtools with ``-l``, or None for no region restriction
    :param conf: Configuration object; 'samtools_path' and 'bcftools_path' are read from section 'main'
    :return: The value returned by util.bgz_tabix for the called VCF
    """
    pileup_vcf = "mpileup." + util.randstr() + ".vcf"
    called_vcf = "output-mp." + util.randstr() + ".vcf"
    region_opt = "" if bed is None else " -l " + bed

    # Step 1: pileup of the alignments, written to an intermediate file.
    pileup_parts = [
        conf.get("main", "samtools_path"),
        " mpileup ",
        " -f ",
        orig_genome_path,
        " -uv ",
        " -o ",
        pileup_vcf,
        " ",
        region_opt,
        " ",
        bam,
    ]
    subprocess.check_call("".join(pileup_parts), shell=True)

    # Step 2: call variants from the intermediate pileup.
    call_parts = [
        conf.get("main", "bcftools_path"),
        " call ",
        " -mv ",
        " -o ",
        called_vcf,
        " ",
        pileup_vcf,
    ]
    subprocess.check_call("".join(call_parts), shell=True)

    return util.bgz_tabix(called_vcf, conf)