def add_backfill_genotypes(vcf):
    """Make sure every variant in vcf carries the BFILL_* genotype fields.

    The vcf object is modified in place.  For each variant, any of
    BFILL_REF, BFILL_ALT, BFILL_COV, BFILL_VAF that is missing from
    the variant's genotype names is appended, and every sample that
    lacks a value for one of those fields gets the placeholder ".".
    A variant is written back with vcflib.set_variant only if
    something was actually added.

    """
    from genomicode import vcflib

    # Example of the genotype layout being extended:
    # FORMAT      GT:GQ:SDP:DP:RD:AD:FREQ:PVAL:RBQ:ABQ:RDF:RDR:ADF:ADR
    # <genotype>  0/1:12:28:28:24:4:14.29%:5.5746E-2:37:36:14:10:3:1

    backfill_fields = ["BFILL_REF", "BFILL_ALT", "BFILL_COV", "BFILL_VAF"]

    for var_num in range(vcf.num_variants()):
        variant = vcflib.get_variant(vcf, var_num)
        is_modified = False

        for field in backfill_fields:
            # Declare the field for this variant.
            if field not in variant.genotype_names:
                variant.genotype_names.append(field)
                is_modified = True
            # Give each sample a placeholder for the field.
            for genotypes in variant.sample2genodict.itervalues():
                if field not in genotypes:
                    genotypes[field] = "."
                    is_modified = True

        # Avoid rewriting variants that already had everything.
        if is_modified:
            vcflib.set_variant(vcf, var_num, variant)
# Example #2
# 0
def fix_vcf_file(sample, infile, outfile):
    """Add the FORMAT and <SAMPLE> columns missing from a JointSNVMix VCF.

    JointSNVMix produces VCF files that don't have FORMAT and
    <SAMPLE> columns.  This reads infile, builds a DP:RD:AD:FREQ
    genotype string for every variant from the call reported by
    vcflib.get_call, appends a FORMAT column and a column named after
    sample, registers the matching ##FORMAT header lines, and writes
    the result to outfile.

    sample   Name to use for the new sample column.
    infile   Path of the VCF file to read.
    outfile  Path of the VCF file to write.

    Raises AssertionError if the input already has a FORMAT column,
    already lists samples, or already has a column named sample.

    """
    from genomicode import vcflib

    vcf = vcflib.read(infile)
    matrix = vcf.matrix

    genotype_names = ["DP", "RD", "AD", "FREQ"]

    # Get the calls for each variant.
    all_genotypes = []  # one for each variant
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)
        # There is no sample column yet, so no sample name is given.
        call = vcflib.get_call(var, None)
        geno_dict = {
            "DP": call.total_reads,
            "RD": call.num_ref,
            "AD": call.num_alt,
            "FREQ": call.vaf,
        }
        all_genotypes.append(
            vcflib._format_genotype(genotype_names, geno_dict))

    # Add FORMAT.  Every variant gets the same format string.
    format_string = ":".join(genotype_names)
    assert "FORMAT" not in matrix
    matrix.headers.append("FORMAT")
    matrix.headers_h.append("FORMAT")
    matrix.header2annots["FORMAT"] = [format_string] * matrix.num_annots()

    # Add the sample.
    assert not vcf.samples
    assert sample not in matrix
    matrix.headers.append(sample)
    matrix.headers_h.append(sample)
    matrix.header2annots[sample] = all_genotypes
    vcf.samples = [sample]

    # Add the proper header lines.  FREQ holds the variant allele
    # frequency, which is fractional, so it is declared as Float
    # rather than Integer.
    lines = [
        '##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Allelic depth for the ref allele in the tumor sample">',
        '##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Allelic depth for the alt allele in the tumor sample">',
        '##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">',
        '##FORMAT=<ID=FREQ,Number=1,Type=Float,Description="Variant allele frequency">',
    ]
    matrix.headerlines.extend(lines)

    vcflib.write(outfile, vcf)
def backfill_vcf(in_file, bf_file, out_file):
    """Merge read counts from a backfill VCF into a VCF of called variants.

    Only samples present in both files are considered.  For every
    variant of in_file whose (sample, chrom, pos) has read information
    in bf_file, the BFILL_REF, BFILL_ALT, BFILL_COV and BFILL_VAF
    genotype fields are set from the backfill call.  Variants that
    occur only in bf_file are appended with their filter set to
    BACKFILL.  The result is written to out_file; in_file and bf_file
    are not modified on disk.

    in_file   Path of the VCF with the variants to annotate.
    bf_file   Path of the VCF with the backfill read counts.
    out_file  Path of the VCF to write.

    Raises AssertionError if bf_file has more than one variant with
    read information for the same sample at the same chrom and pos.

    """
    import copy
    from genomicode import vcflib

    # (genotype field to set, attribute of the backfill call)
    backfill_fields = [
        ("BFILL_REF", "num_ref"),
        ("BFILL_ALT", "num_alt"),
        ("BFILL_COV", "total_reads"),
        ("BFILL_VAF", "vaf"),
    ]

    in_vcf = vcflib.read(in_file)
    bf_vcf = vcflib.read(bf_file)

    # May have multiple samples, e.g. germline and tumor.
    samples = [s for s in in_vcf.samples if s in bf_vcf.samples]

    # Parse out the read counts from the backfill vcf.
    bf_variants = {}  # (sample, chrom, pos) -> Variant, Call
    # List of ((sample, chrom, pos), backfill row number), in file
    # order.  Used to append the backfill-only variants in a
    # reproducible order and only once per row.
    bf_order = []
    for bf_num in range(bf_vcf.num_variants()):
        bf_var = vcflib.get_variant(bf_vcf, bf_num)
        for sample in samples:
            call = vcflib.get_call(bf_var, sample)
            # Skip calls that carry no read information at all.
            if call.num_ref is None and call.num_alt is None and \
               call.total_reads is None and call.vaf is None:
                continue
            key = sample, bf_var.chrom, bf_var.pos
            assert key not in bf_variants, "Duplicate: %s %s %s" % key
            bf_variants[key] = bf_var, call
            bf_order.append((key, bf_num))

    # Update the read counts in a copy of the annotated VCF file.
    out_vcf = copy.deepcopy(in_vcf)
    add_backfill_genotypes(out_vcf)

    seen = set()  # (sample, chrom, pos) found in in_file
    for var_num in range(out_vcf.num_variants()):
        var = vcflib.get_variant(out_vcf, var_num)
        changed = False
        for sample in samples:
            # Skip if there is no backfill information.
            key = sample, var.chrom, var.pos
            if key not in bf_variants:
                continue
            seen.add(key)
            # Don't worry if the variants match.  Just want a
            # rough estimate of the coverage at this location.
            bf_var, bf_call = bf_variants[key]
            genodict = var.sample2genodict[sample]
            for gt_key, call_attr in backfill_fields:
                value = getattr(bf_call, call_attr)
                if value is None:
                    continue
                if isinstance(value, list):  # arbitrarily use max
                    value = max(value)
                genodict[gt_key] = vcflib._format_vcf_value(value)
                changed = True
        if changed:
            vcflib.set_variant(out_vcf, var_num, var)

    # Add the variants that are in bf_file, but not in in_file.  A
    # backfill row is recorded once per sample, so remember which rows
    # were already added to avoid writing the same variant repeatedly.
    added_rows = set()
    for key, bf_num in bf_order:
        if key in seen or bf_num in added_rows:
            continue
        added_rows.add(bf_num)
        bf_var, bf_call = bf_variants[key]
        # VarScan sets the filter_ to "PASS" for everything.  Get rid
        # of this.
        bf_var.filter_ = ["BACKFILL"]
        vcflib.add_variant(out_vcf, bf_var)

    vcflib.write(out_file, out_vcf)
def parse_snpeff_file(vcf_filename, out_filename):
    """Write the snpEff ANN annotations of a VCF file as a tab-delimited table.

    The column names are taken from the ##INFO=<ID=ANN,...> header
    line of the VCF file.  The output has the columns
    Chrom  Pos  Ref  Alt  <snpEff-specific columns>
    and one row per annotation; variants without ANN in INFO are
    skipped.  If the VCF file has no ANN header line, the function
    returns without creating out_filename.

    vcf_filename  Path of the snpEff-annotated VCF file to read.
    out_filename  Path of the tab-delimited text file to write.

    Raises AssertionError if the file has no header lines, if it has
    several different ANN header lines, if the ANN header is not in
    the expected format, or if an annotation does not have as many
    fields as the header declares.

    """
    from genomicode import vcflib

    # Parse out the snpEff annotations.  Should have ANN in INFO.
    #
    # ##INFO=<ID=ANN,Number=.,Type=String,
    #     Description="Functional annotations: '
    #     Allele |
    #     Annotation |
    #     Annotation_Impact |
    #     Gene_Name |
    #     Gene_ID |
    #     Feature_Type |
    #     Feature_ID |
    #     Transcript_BioType |
    #     Rank |
    #     HGVS.c |
    #     HGVS.p |
    #     cDNA.pos / cDNA.length |
    #     CDS.pos / CDS.length |
    #     AA.pos / AA.length |
    #     Distance | ERRORS / WARNINGS / INFO' ">

    vcf = vcflib.read(vcf_filename)

    # Figure out the Functional annotations.
    assert vcf.matrix.headerlines, "No header lines"
    ann_lines = [x for x in vcf.matrix.headerlines if "<ID=ANN," in x]
    if not ann_lines:
        return
    # No duplicates.
    # The ANN line can end with:
    #   ERRORS / WARNINGS / INFO'">
    #   ERRORS / WARNINGS / INFO' ">
    # I encountered a VCF file that contained two ANN lines differing
    # by this spacing.  Normalize these lines and make sure there are
    # no duplicates.
    ann_lines = [
        x.replace("ERRORS / WARNINGS / INFO' \">",
                  "ERRORS / WARNINGS / INFO'\">") for x in ann_lines
    ]
    ann_lines = sorted(set(ann_lines))
    assert len(ann_lines) == 1, "Multiple ANN headers: %s" % vcf_filename

    x = ann_lines[0].strip()
    TEXT = "Functional annotations:"
    assert TEXT in x
    x = x[x.index(TEXT) + len(TEXT):]  # Get rid of "Functional annotations:"
    assert x.endswith('">')  # No ">
    x = x[:-2].strip()
    assert x.startswith("'") and x.endswith("'")  # No ''
    x = x[1:-1]
    annotations = [field.strip() for field in x.split("|")]

    columns = ["Chrom", "Pos", "Ref", "Alt"] + annotations

    # Use a context manager so the output is flushed and closed even
    # if one of the assertions below fails.
    with open(out_filename, 'w') as handle:
        handle.write("\t".join(columns) + "\n")

        for i in range(vcf.num_variants()):
            var = vcflib.get_variant(vcf, i)
            if "ANN" not in var.infodict:
                continue
            alt = ",".join(var.alt)

            # Can have multiple annotations if there are more than one
            # allele.
            # <ALLELE>|...|...|,<ALLELE>|...|...|
            # If this happens, just add them to the file.
            for annot in var.infodict["ANN"].split(","):
                values = [field.strip() for field in annot.split("|")]
                assert len(values) == len(annotations), \
                       "Mismatch annotations %d %d: %s %s %d" % (
                    len(annotations), len(values), vcf_filename,
                    var.chrom, var.pos)

                row = [var.chrom, var.pos, var.ref, alt] + values
                assert len(row) == len(columns)
                handle.write("\t".join(map(str, row)) + "\n")
# Example #5
# 0
def summarize_vcf_file(filename, filestem, header, outfilename, lock):
    """Append one tab-delimited summary row per variant and sample.

    Reads the VCF file filename and, for every sample of every
    variant, appends a row to outfilename with the caller name,
    filestem, sample, chrom, pos, ref, alt, source (DNA or RNA), read
    counts, VAF, filter, call and GQ.  Rows are written in batches
    while lock is held.

    filename     Path of the VCF file to read.
    filestem     Value written to the second column; also used to
                 recognize a sample name that was changed by
                 hashlib.hash_var.
    header       Sequence of output column names.  Only its length is
                 used, to check each row.
    outfilename  Path of the file to append to.
    lock         Object with acquire() and release(); held while
                 outfilename is written.

    Raises AssertionError if a row does not have len(header) columns.

    """
    from genomicode import hashlib
    from genomicode import vcflib

    # Number of rows to accumulate before appending to the file.
    BATCH_SIZE = 100000

    vcf = vcflib.read(filename)

    lines = []
    for i in range(vcf.num_variants()):
        var = vcflib.get_variant(vcf, i)

        caller_name = var.caller.name
        ref = var.ref
        alt = ",".join(var.alt)
        filter_str = vcf.caller.get_filter(var)

        for sample in var.samples:
            # If sample begins with an integer, there may be a
            # "X" pre-pended to it.  Try to detect this case
            # and fix it.
            clean_sample = sample
            if sample == hashlib.hash_var(filestem):
                clean_sample = filestem

            source = "DNA"
            if caller_name == "Radia":
                # DNA    <clean_sample>       196B-lung
                # RNA    <clean_sample>_RNA   196B-lung_RNA
                # Figure out whether this is RNA and fix it.
                if clean_sample.endswith("_RNA"):
                    clean_sample = clean_sample[:-4]
                    source = "RNA"

            genodict = var.sample2genodict[sample]
            call = vcflib.get_call(var, sample)

            num_ref = vcflib._format_vcf_value(call.num_ref, None_char="")
            num_alt = vcflib._format_vcf_value(call.num_alt, None_char="")
            total_reads = vcflib._format_vcf_value(call.total_reads,
                                                   None_char="")
            vaf = vcflib._format_vcf_value(call.vaf, None_char="")
            call_str = vcflib._format_vcf_value(call.call, None_char="")
            GQ = genodict.get("GQ", "")
            if GQ in [None, "."]:
                GQ = ""

            x = caller_name, filestem, clean_sample, var.chrom, var.pos, \
                ref, alt, source, \
                num_ref, num_alt, total_reads, vaf, filter_str, call_str, GQ
            assert len(x) == len(header)
            lines.append("\t".join(map(str, x)))

            if len(lines) >= BATCH_SIZE:
                _append_lines_locked(outfilename, lines, lock)
                lines = []

    _append_lines_locked(outfilename, lines, lock)


def _append_lines_locked(outfilename, lines, lock):
    """Append lines to outfilename, one per row, while holding lock.

    Does nothing if lines is empty, so that no blank row is written.
    The lock is released and the file closed even if writing fails.

    """
    if not lines:
        return
    text = "\n".join(lines) + "\n"
    lock.acquire()
    try:
        handle = open(outfilename, 'a')
        try:
            handle.write(text)
        finally:
            handle.close()
    finally:
        lock.release()