Example #1
0
def call_variant_varscan(bam, orig_genome_path, bed, conf):
    """
    Call variants by running samtools mpileup and then VarScan mpileup2cns.

    :param bam: path to the input BAM file
    :param orig_genome_path: path to the reference passed to samtools with -f
    :param bed: optional path passed to samtools with -l; None to omit the option
    :param conf: configuration object providing 'samtools_path' and 'varscan_path' in section 'main'
    :return: result of util.bgz_tabix on the VCF written by this function
    :raises subprocess.CalledProcessError: if either external command exits non-zero
    """
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed
    cmd = (
        conf.get("main", "samtools_path")
        + " mpileup "
        + " -f "
        + orig_genome_path
        + " -o "
        + pre_output
        + " "
        + bedarg
        + " "
        + bam
    )
    subprocess.check_call(cmd, shell=True)
    cmd2 = (
        "java -Xmx2g -jar "
        + conf.get("main", "varscan_path")
        + " mpileup2cns "
        + pre_output
        + " --variants --output-vcf 1 --output-file "
        + vcfoutput
    )
    output = subprocess.check_output(cmd2, shell=True)
    # check_output returns bytes on Python 3; a text-mode file would reject
    # them with TypeError, so the captured stdout is written in binary mode.
    with open(vcfoutput, "wb") as fh:
        fh.write(output)
    return util.bgz_tabix(vcfoutput, conf)
Example #2
0
def normalize_nothing(orig_vcf, conf):
    """
    Just copy the original vcf to a new, identical vcf file.

    :param orig_vcf: path to the vcf to copy
    :param conf: configuration object, passed through to util.bgz_tabix
    :return: result of util.bgz_tabix on the copied vcf
    :raises subprocess.CalledProcessError: if cp exits non-zero
    """
    new_vcf = util.strip_extensions(orig_vcf, ['vcf']) + '.nonorm.vcf'
    # Pass an argument list instead of a formatted shell string: the previous
    # format() call used named fields with positional arguments and an
    # undefined variable, and a list also keeps paths with spaces intact.
    subprocess.check_call(['cp', orig_vcf, new_vcf])
    return util.bgz_tabix(new_vcf, conf)
    def compare_test_vcf(self, raw_orig_vcf, raw_test_vcf):
        """
        Compare a test vcf against an original vcf with every configured
        normalizer / comparator combination and pass the results to the reporter.

        :param raw_orig_vcf: path to the vcf holding the original variants
        :param raw_test_vcf: path to the vcf to be compared against the original
        :return: None; results are handed to self.reporter.write_output
        :raises ValueError: if util.find_matching_var finds no original variant for a region
        """
        # Resolve both paths before entering the temp dir below.
        # NOTE(review): presumably util.TempDir changes the working directory - confirm in util.
        raw_orig_vcf = os.path.abspath(raw_orig_vcf)
        raw_test_vcf = os.path.abspath(raw_test_vcf)
        # All records of the original vcf are loaded up front; they are converted to a bed below
        orig_vars    = list(pysam.VariantFile(raw_orig_vcf))
        tmp_dirname  = util.strip_extensions(raw_test_vcf, ['gz','vcf']) + "-vcomp-" + util.randstr()

        with util.TempDir(dirname=tmp_dirname):
            orig_vcf = util.bgz_tabix(raw_orig_vcf, self.conf)
            test_vcf = util.remove_halfcalls(raw_test_vcf)
            test_vcf = util.bgz_tabix(test_vcf, self.conf)
            # Results are keyed by the processed test vcf name with its gz/vcf extensions stripped
            caller_name = util.strip_extensions(test_vcf, ['gz','vcf'])
            bed = util.vars_to_bed(orig_vars)
            # Layout: var_results[variant key][caller name][normalizer name][comparator name] = result
            var_results = defaultdict(dict)
            # Qualities are collected once, outside the normalizer / comparator loops
            var_quals = self.collect_var_quals({caller_name: test_vcf}, bed, orig_vcf)
            bamstats = defaultdict(dict)

            for normalizer_name, normalizer in self.normalizers.iteritems():
                logging.info("--> Running normalizer " + normalizer_name)
                # The same normalizer is applied to both the original and the test vcf
                normed_orig_vcf   = normalizer(orig_vcf, self.conf)
                normed_caller_vcf = normalizer(test_vcf, self.conf)

                for comparator_name, comparator in self.comparators.iteritems():
                    logging.info("--> Running comparator " + comparator_name + " (normalizer " + normalizer_name + ")")
                    # NOTE(review): the third argument is passed as None - its meaning depends on the
                    # comparator signature, which is not visible here; confirm.
                    all_results = comparator(normed_orig_vcf, normed_caller_vcf, None, self.conf)
                    single_results = split_results(all_results, bed)
                    # Pair each region read from the bed with the corresponding entry of single_results
                    for region, result in zip(util.read_regions(bed), single_results):
                        # The original variant is looked up in the un-normalized original vcf
                        match_vars = util.find_matching_var(orig_vcf, region)
                        if not match_vars:
                            raise ValueError('Unable to find original variant from region ' + str(region))
                        # The genotype string is the GT of the first sample of the first matching
                        # variant, joined with "/"
                        result = compare_single_var(result,
                                                    region,
                                                    normed_orig_vcf,
                                                    normed_caller_vcf,
                                                    comparator,
                                                    "/".join(str(i) for i in match_vars[0].samples[0]['GT']),
                                                    self.conf)
                        key = var_key(match_vars)
                        if caller_name not in var_results[key]:
                            var_results[key][caller_name] = defaultdict(dict)
                        var_results[key][caller_name][normalizer_name][comparator_name] = result
                        # No BAM statistics are computed in this method; an empty entry is recorded per variant
                        bamstats[key] = {}

        # Iterate over all results and write to standard output. We do this here instead of within the loops above
        # because it keeps results organized by variant, which makes them easier to look at
        self.reporter.write_output(var_results, var_quals, bamstats)
Example #4
0
def normalize_vt(orig_vcf, conf):
    """
    Use vt to normalize
    :param orig_vcf: path to the vcf to normalize; ".vcf" in it is replaced to derive the output name
    :param conf: configuration object with path to reference genome and vt binary
    :return: result of util.bgz_tabix on the normalized vcf
    :raises subprocess.CalledProcessError: if vt exits non-zero
    """
    norm_orig_vcf = orig_vcf.replace(".vcf", ".norm.vt.vcf")
    norm_orig_cmd = conf.get('main', 'vt') + " normalize " + " -r " + conf.get('main', 'ref_genome') + " " + orig_vcf + " -o " + norm_orig_vcf
    # vt's stderr is discarded. The sink has to be opened for writing (a
    # read-only handle cannot be written to by the child) and is closed as
    # soon as vt has finished instead of being leaked.
    with open("/dev/null", "w") as devnull:
        subprocess.check_call(norm_orig_cmd.split(), stderr=devnull)
    return util.bgz_tabix(norm_orig_vcf, conf)
Example #5
0
def normalize_nothing(orig_vcf, conf):
    """
    Just copy the original vcf to a new, identical vcf file.

    :param orig_vcf: path to the vcf to copy; ".vcf" in it is replaced by ".nonorm.vcf" to name the copy
    :param conf: configuration object, passed through to util.bgz_tabix
    :return: result of util.bgz_tabix on the copied vcf
    :raises subprocess.CalledProcessError: if cp exits non-zero
    """
    newvcf_name = orig_vcf.replace(".vcf", ".nonorm.vcf")
    # Argument list without a shell: paths containing spaces or shell
    # metacharacters are handed to cp verbatim instead of being re-parsed.
    subprocess.check_call(["cp", orig_vcf, newvcf_name])

    return util.bgz_tabix(newvcf_name, conf)
Example #6
0
def normalize_bcftools(orig_vcf, conf):
    """
    Normalize a vcf by running 'bcftools norm' against the configured reference.
    :param orig_vcf: path to the input vcf; ".vcf.gz" in it is replaced to derive the output name
    :param conf: configuration object providing 'bcftools' and 'ref_genome' in section 'main'
    :return: result of util.bgz_tabix on the normalized vcf
    """
    normed_path = orig_vcf.replace(".vcf.gz", ".norm.bcftools" + util.randstr() + ".vcf")
    # Assemble the command line as text and split on whitespace, so that
    # whitespace inside any configured value still yields separate arguments.
    argv = " ".join([
        conf.get('main', 'bcftools'),
        "norm",
        "-c w -f",
        conf.get('main', 'ref_genome'),
        orig_vcf,
        "-o",
        normed_path,
    ]).split()
    subprocess.check_call(argv)
    return util.bgz_tabix(normed_path, conf)
Example #7
0
def call_variant_mp_bcf(bam, genome, bed, conf):
    """
    Call variants by running samtools mpileup followed by bcftools call.

    :param bam: path to the input BAM file
    :param genome: path to the reference passed to samtools with -f
    :param bed: optional path passed to samtools with -l; None to omit the option
    :param conf: configuration object providing 'samtools' and 'bcftools' in section 'main'
    :return: result of util.bgz_tabix on the VCF written by bcftools
    :raises subprocess.CalledProcessError: if either external command exits non-zero
    """
    pre_output = "mpileup." + util.randstr() + ".vcf"
    vcfoutput = "output-mp." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed
    cmd = conf.get('main','samtools') + ' mpileup ' + ' -f ' + genome + " -uv " + " -o " + pre_output + " " + bedarg + " " + bam
    # Without shell=True a plain string is treated as the program name, so the
    # command has to be split into an argument list before it can be executed.
    subprocess.check_call(cmd.split())
    cmd2 = conf.get('main', 'bcftools') + ' call ' + ' -mv ' + ' -o ' + vcfoutput + " " + pre_output
    subprocess.check_call(cmd2.split())
    return util.bgz_tabix(vcfoutput, conf)
Example #8
0
def call_variant_varscan_emit_all(bam, genome, bed, conf):
    """
    Call variants with samtools mpileup and VarScan mpileup2cns using --p-value 0.5.

    :param bam: path to the input BAM file
    :param genome: path to the reference passed to samtools with -f
    :param bed: optional path passed to samtools with -l; None to omit the option
    :param conf: configuration object providing 'samtools' and 'varscan' in section 'main'
    :return: result of util.bgz_tabix on the VCF captured from VarScan's stdout
    :raises subprocess.CalledProcessError: if either external command exits non-zero
    """
    pre_output = "varscan." + util.randstr() + ".mpileup"
    vcfoutput = "output-vs." + util.randstr() + ".vcf"
    bedarg = ""
    if bed is not None:
        bedarg = " -l " + bed
    cmd = conf.get('main','samtools') + ' mpileup ' + ' -f ' + genome + " -o " + pre_output + " " + bedarg + " " + bam
    # Without shell=True a plain string is treated as the program name, so the
    # command has to be split into an argument list before it can be executed.
    subprocess.check_call(cmd.split())
    cmd2 = "java -Xmx2g -jar " + conf.get('main', 'varscan') + ' mpileup2cns ' + pre_output + '  --p-value 0.5 --variants --output-vcf 1 --output-file ' + vcfoutput
    # The context manager closes (and flushes) the output file before it is
    # handed to bgz_tabix; previously the handle was never closed.
    with open(vcfoutput, 'w') as vcf_fh:
        subprocess.check_call(cmd2.split(), stdout=vcf_fh)
    return util.bgz_tabix(vcfoutput, conf)
Example #9
0
def normalize_vap_leftalign(orig_vcf, conf):
    """
    Normalize a vcf by running vcfallelicprimitives and then GATK LeftAlignAndTrimVariants.

    :param orig_vcf: path to the input vcf; it is sorted with util.sort_vcf first
    :param conf: configuration object providing 'vcfallelicprimitives', 'gatk' and 'ref_genome'
                 in section 'main', and optionally 'gatk_no_et'
    :return: result of util.bgz_tabix on the left-aligned vcf
    :raises subprocess.CalledProcessError: if either external command exits non-zero
    """
    orig_vcf = util.sort_vcf(orig_vcf, conf)
    tmp_vcf = orig_vcf.replace(".vcf", ".vap.tmp.vcf").replace(".gz", "")
    final_vcf = orig_vcf.replace(".vcf", ".vap.leftaligned.vcf")
    norm_orig_cmd = conf.get('main', 'vcfallelicprimitives') + " " + orig_vcf
    # open() works on both Python 2 and 3 (file() is Python 2 only), and the
    # context manager closes the file before GATK reads it.
    with open(tmp_vcf, 'w') as tmp_fh:
        subprocess.check_call(norm_orig_cmd, shell=True, stdout=tmp_fh)

    no_et = ""
    try:
        no_et = " -et NO_ET -K " + conf.get('main', 'gatk_no_et')
    except Exception:
        # 'gatk_no_et' is optional: when it cannot be read the flags are left
        # out. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    cmd = "java -Djava.io.tmpdir=. -Xmx1g -jar " + conf.get('main', 'gatk') + " -T LeftAlignAndTrimVariants " + no_et + " -R " + conf.get('main', 'ref_genome') + " -U ALLOW_SEQ_DICT_INCOMPATIBILITY -V " + tmp_vcf + " -o " + final_vcf
    subprocess.check_call(cmd, shell=True)

    return util.bgz_tabix(final_vcf, conf)
Example #10
0
def normalize_vap_leftalign(orig_vcf, conf):
    """
    Normalize a vcf by running vcfallelicprimitives and then GATK LeftAlignAndTrimVariants.

    :param orig_vcf: path to the input vcf; it is sorted with util.sort_vcf first
    :param conf: configuration object providing 'vcfallelicprimitives_path', 'gatk_path' and
                 'ref_genome' in section 'main', and optionally 'gatk_no_et'
    :return: result of util.bgz_tabix on the left-aligned vcf
    :raises subprocess.CalledProcessError: if either external command exits non-zero
    """
    orig_vcf = util.sort_vcf(orig_vcf, conf)
    tmp_vcf = orig_vcf.replace(".vcf", ".vap.tmp.vcf").replace(".gz", "")
    final_vcf = orig_vcf.replace(".vcf", ".vap.leftaligned.vcf")
    norm_orig_cmd = conf.get('main', 'vcfallelicprimitives_path') + " " + orig_vcf
    tmp_output = subprocess.check_output(norm_orig_cmd, shell=True)
    # check_output returns bytes on Python 3; a text-mode file would reject
    # them with TypeError, so the captured stdout is written in binary mode.
    with open(tmp_vcf, "wb") as fh:
        fh.write(tmp_output)

    no_et = ""
    try:
        no_et = " -et NO_ET -K " + conf.get('main', 'gatk_no_et')
    except Exception:
        # 'gatk_no_et' is optional: when it cannot be read the flags are left
        # out. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    cmd = "java -Djava.io.tmpdir=. -Xmx1g -jar " + conf.get('main', 'gatk_path') + " -T LeftAlignAndTrimVariants " + no_et + " -R " + conf.get('main', 'ref_genome') + " -V " + tmp_vcf + " -o " + final_vcf
    # check_output is kept so GATK's stdout is captured rather than printed; the result is unused
    subprocess.check_output(cmd, shell=True)

    return util.bgz_tabix(final_vcf, conf)
Example #11
0
def call_variant_mp_bcf(bam, orig_genome_path, bed, conf):
    """
    Call variants by running samtools mpileup into an intermediate file and
    then bcftools call on that file.

    :param bam: path to the input BAM file
    :param orig_genome_path: path to the reference passed to samtools with -f
    :param bed: optional path passed to samtools with -l; None to omit the option
    :param conf: configuration object providing 'samtools_path' and 'bcftools_path' in section 'main'
    :return: result of util.bgz_tabix on the VCF written by bcftools
    """
    mpileup_vcf = "mpileup." + util.randstr() + ".vcf"
    called_vcf = "output-mp." + util.randstr() + ".vcf"
    region_opt = "" if bed is None else " -l " + bed

    # Step 1: samtools mpileup writes the intermediate file
    mpileup_cmd = "".join([
        conf.get("main", "samtools_path"),
        " mpileup  -f ",
        orig_genome_path,
        " -uv  -o ",
        mpileup_vcf,
        " ",
        region_opt,
        " ",
        bam,
    ])
    subprocess.check_call(mpileup_cmd, shell=True)

    # Step 2: bcftools call turns the intermediate file into the final VCF
    call_cmd = "".join([
        conf.get("main", "bcftools_path"),
        " call  -mv  -o ",
        called_vcf,
        " ",
        mpileup_vcf,
    ])
    subprocess.check_call(call_cmd, shell=True)
    return util.bgz_tabix(called_vcf, conf)