def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSI_NT", "I.EVS", "I.EVSF", "I.SGT",
                "I.RC", "I.RU", "I.IC", "I.IHP", "I.MQ", "I.MQ0",
                "S.1.DP", "S.2.DP", "S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR",
                "S.1.TOR", "S.2.TOR", "S.1.BCN50", "S.2.BCN50",
                "S.1.FDP50", "S.2.FDP50"]

    cols = ["CHROM", "POS", "REF", "ALT", "LENGTH", "INDELTYPE", "FILTER",
            "NT", "NT_REF", "EVS", "QSI_NT",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "N_BCN", "T_BCN", "N_FDP", "T_FDP", "N_AF", "T_AF",
            "SGT", "RC", "RU", "RU_LEN", "IC", "IHP", "MQ", "MQ0", "tag"]

    records = []

    vcfheaders = list(extractHeaders(vcfname))

    evs_featurenames = {}
    for l in vcfheaders:
        if '##indel_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except:
                logging.warn("Could not parse scoring feature names from Strelka output")

    if not avg_depth:
        avg_depth = {}
        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                p, _, l = l.partition("_")
                xl = str(l).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    has_warned = {}
    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]
        rec["tag"] = tag

        # fix missing features
        for q in ["I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EVS",
                  "S.1.DP", "S.2.DP", "S.1.BCN50", "S.2.BCN50",
                  "S.1.FDP50", "S.2.FDP50"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        for q in ["S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR"]:
            if q not in rec or rec[q] is None:
                rec[q] = [0, 0]
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        in_del = 0
        max_len = len(rec["REF"])
        min_len = len(rec["REF"])
        for a in rec["ALT"]:
            if len(a) > len(rec["REF"]):
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)
        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0
        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except:
                if not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warn("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # extract observed AF from strelka counts. TIR = ALT; TAR = REF
        try:
            n_af = float(rec["S.1.TIR"][0]) / (float(rec["S.1.TIR"][0]) + float(rec["S.1.TAR"][0]))
        except:
            n_af = 0
        try:
            t_af = float(rec["S.2.TIR"][0]) / (float(rec["S.2.TIR"][0]) + float(rec["S.2.TAR"][0]))
        except:
            t_af = 0

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": n_af,
            "T_AF": t_af,
            "SGT": rec["I.SGT"],
            "tag": tag
        }

        # fields with defaults
        fields = [
            {"n": "EVS", "s": "I.EVS", "def": 0, "t": float},
            {"n": "VQSR", "s": "I.VQSR", "def": 0, "t": float},
            {"n": "RC", "s": "I.RC", "def": 0, "t": int},
            {"n": "RU", "s": "I.RU", "def": ""},
            {"n": "RU_LEN", "s": "I.RU", "def": 0, "t": len},
            {"n": "IC", "s": "I.IC", "def": 0, "t": int},
            {"n": "IHP", "s": "I.IHP", "def": 0, "t": int},
            {"n": "MQ", "s": "I.MQ", "def": 0.0, "t": float},
            {"n": "MQ0", "s": "I.MQ0", "def": 0.0, "t": float},
            {"n": "N_BCN", "s": "S.1.BCN50", "def": 0.0, "t": float},
            {"n": "T_BCN", "s": "S.2.BCN50", "def": 0.0, "t": float},
            {"n": "N_FDP", "s": "S.1.FDP50", "def": 0.0, "t": float},
            {"n": "T_FDP", "s": "S.2.FDP50", "def": 0.0, "t": float},
        ]

        for fd in fields:
            try:
                res = rec[fd["s"]]
                if "t" in fd:
                    res = fd["t"](res)
            except:
                res = fd["def"]
            qrec[fd["n"]] = res

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except:
                        # failure to parse
                        pass
        except:
            pass

        for k, v in evs_featurenames.iteritems():
            if not "E." + v in qrec:
                qrec["E." + v] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
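
# Illustrative usage sketch (not part of the original module): the VCF path,
# tag value, and per-chromosome depths below are hypothetical placeholders.
def _demo_strelka_indel_features():
    depths = {"chr1": 31.0, "chr2": 30.5}  # average depths, e.g. from a BAM
    df = extractStrelkaIndelFeatures("somatic.indels.vcf.gz", "TP", avg_depth=depths)
    # N_AF / T_AF are the tier-1 TIR / (TIR + TAR) rates computed above
    return df[["CHROM", "POS", "REF", "ALT", "N_AF", "T_AF", "EVS"]]
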
def extractPiscesSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER", "I.DP", "I.EVS",
                "S.1.GT", "S.1.GQ", "S.1.AD", "S.1.DP", "S.1.VF",
                "S.1.NL", "S.1.SB", "S.1.NC", "S.1.AQ", "S.1.GQX"]

    cols = ["CHROM", "POS", "REF", "ALT", "FILTER",
            "GQX", "EVS", "T_DP", "T_DP_RATE", "T_AF", "tag"]

    vcfheaders = list(extractHeaders(vcfname))

    evs_featurenames = {}
    for l in vcfheaders:
        if '##snv_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except:
                logging.warn("Could not parse scoring feature names from Pisces output")

    records = []

    if not avg_depth:
        avg_depth = {}
        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                p, _, l = l.partition("_")
                xl = str(l).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    has_warned = {}
    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # read VQSR value, if it's not present, set to -1 (old versions of Pisces)
        try:
            rec["I.VQSR"] = float(rec["I.VQSR"])
        except:
            rec["I.VQSR"] = -1.0

        # read EVS value, if it's not present, set to -1 (old versions of Pisces)
        if "I.SomaticEVS" in rec:
            try:
                rec["I.EVS"] = float(rec["I.SomaticEVS"])
            except:
                rec["I.EVS"] = -1.0
        else:
            try:
                rec["I.EVS"] = float(rec["I.EVS"])
            except:
                rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["S.1.NC", "S.1.AQ"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        rec["tag"] = tag

        t_DP = float(rec["S.1.DP"])
        t_VF = float(rec["S.1.VF"])
        GQX = float(rec["S.1.GQX"])

        t_DP_ratio = 0
        if avg_depth:
            try:
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except:
                if not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warn("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "GQX": GQX,
            "EVS": rec["I.EVS"],
            "T_DP": t_DP,
            "T_DP_RATE": t_DP_ratio,
            "T_AF": t_VF,
            "tag": tag
        }
        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
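
# Illustrative call for the tumor-only Pisces extractor (a sketch, not part of
# the original module; the file name and tag are hypothetical placeholders).
def _demo_pisces_features():
    df = extractPiscesSNVFeatures("pisces.somatic.vcf.gz", "query", avg_depth={"chr1": 60.0})
    # T_AF is taken directly from the Pisces VF sample field, GQX from S.1.GQX
    return df[["CHROM", "POS", "FILTER", "GQX", "EVS", "T_DP_RATE", "T_AF"]]
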
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR", "I.EVS", "I.EVSF",
                "I.SGT", "I.MQ", "I.MQ0", "I.SNVSB", "I.ReadPosRankSum",
                "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP",
                "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
                "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"]

    cols = ["CHROM", "POS", "REF", "ALT",
            "NT", "NT_REF", "QSS_NT", "FILTER", "EVS", "VQSR",
            "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_AF", "T_AF",
            "MQ", "MQ0", "SNVSB", "ReadPosRankSum", "tag"]

    vcfheaders = list(extractHeaders(vcfname))

    evs_featurenames = {}
    for l in vcfheaders:
        if '##snv_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except:
                logging.warn("Could not parse scoring feature names from Strelka output")

    records = []

    if not avg_depth:
        avg_depth = {}
        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                p, _, l = l.partition("_")
                xl = str(l).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    has_warned = {}
    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # read VQSR value, if it's not present, set to -1 (old versions of Strelka)
        try:
            rec["I.VQSR"] = float(rec["I.VQSR"])
        except:
            rec["I.VQSR"] = -1.0

        # read EVS value, if it's not present, set to -1 (old versions of Strelka)
        try:
            rec["I.EVS"] = float(rec["I.EVS"])
        except:
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["I.QSS_NT", "I.MQ", "I.MQ0", "I.SNVSB", "I.ReadPosRankSum",
                  "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP",
                  "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
                  "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if not ("feat:" + q) in has_warned:
                    logging.warn("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        rec["tag"] = tag

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        try:
            MQ = float(rec["I.MQ"])
        except:
            MQ = None

        try:
            MQ_ZERO = float(rec["I.MQ0"])
        except:
            MQ_ZERO = None

        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0
        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0
        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except:
                if not rec["CHROM"] in has_warned:
                    logging.warn("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warn("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Ref and alt allele counts for tier1 and tier2
        allele_ref = rec["REF"]
        try:
            t_allele_ref_counts = map(float, rec['S.2.' + allele_ref + 'U'])
        except:
            t_allele_ref_counts = [0, 0]

        alleles_alt = rec["ALT"]
        try:
            t_allele_alt_counts = [0, 0]
            for a in alleles_alt:
                for i in range(2):
                    t_allele_alt_counts[i] += float(rec['S.2.' + a + 'U'][i])
        except:
            t_allele_alt_counts = [0, 0]

        # Compute the tier1 and tier2 alt allele rates.
        if t_allele_alt_counts[0] + t_allele_ref_counts[0] == 0:
            t_tier1_allele_rate = 0
        else:
            t_tier1_allele_rate = t_allele_alt_counts[0] / float(
                t_allele_alt_counts[0] + t_allele_ref_counts[0])

        try:
            n_allele_ref_counts = map(float, rec['S.1.' + allele_ref + 'U'])
        except:
            n_allele_ref_counts = [0, 0]

        alleles_alt = rec["ALT"]
        try:
            n_allele_alt_counts = [0, 0]
            for a in alleles_alt:
                for i in range(2):
                    n_allele_alt_counts[i] += float(rec['S.1.' + a + 'U'][i])
        except:
            n_allele_alt_counts = [0, 0]

        # Compute the tier1 and tier2 alt allele rates.
        if n_allele_alt_counts[0] + n_allele_ref_counts[0] == 0:
            n_tier1_allele_rate = 0
        else:
            n_tier1_allele_rate = n_allele_alt_counts[0] / float(
                n_allele_alt_counts[0] + n_allele_ref_counts[0])

        try:
            snvsb = rec["I.SNVSB"]
        except:
            snvsb = 0
        try:
            rprs = rec["I.ReadPosRankSum"]
        except:
            rprs = 0

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "EVS": rec["I.EVS"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": n_tier1_allele_rate,
            "T_AF": t_tier1_allele_rate,
            "MQ": MQ,
            "MQ0": MQ_ZERO,
            "SNVSB": snvsb,
            "ReadPosRankSum": rprs,
            "tag": tag
        }

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except:
                        # failure to parse
                        pass
        except:
            pass

        for k, v in evs_featurenames.iteritems():
            if not "E." + v in qrec:
                qrec["E." + v] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
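
# Sketch combining the SNV and indel extractors above into one labelled table;
# the file names and tags are hypothetical (a minimal sketch, not the tool's CLI).
def _demo_strelka_feature_table():
    snvs = extractStrelkaSNVFeatures("somatic.snvs.vcf.gz", "SNV")
    indels = extractStrelkaIndelFeatures("somatic.indels.vcf.gz", "INDEL")
    # the two frames have different columns; concat keeps the union, filling NaN
    return pandas.concat([snvs, indels], ignore_index=True)
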
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type """ features = ["CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC", "I.QSI_NT", "I.SGT", "I.RC", "I.RU", "I.IC", "I.IHP", "I.MQ", "I.MQ0", "I.H200", "I.RC_HPOL_200", "I.RC_DINUC_200", "I.RC_TRIPLET_200", "S.1.DP", "S.2.DP", "S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR", "S.1.DP50", "S.2.DP50", "S.1.FDP50", "S.2.FDP50", "S.1.SUBDP50", "S.2.SUBDP50"] records = [] if not avg_depth: avg_depth = {} for l in list(extractHeaders(vcfname)): x = str(l).lower() if '##maxdepth_' in x: xl = str(l).split('=') xchr = xl[0][11:] avg_depth[xchr] = float(xl[1]) # logging.info("Maxdepth for %s depth from VCF header is %f" % (xchr, avg_depth[xchr])) has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] rec["tag"] = tag # fix missing features for q in ["I.QSI_NT", "I.RC", "I.IC", "I.IHP", "S.1.DP", "S.2.DP", "I.H200", "I.RC_HPOL_200", "I.RC_DINUC_200", "I.RC_TRIPLET_200", "S.1.FDP50", "S.2.FDP50", "S.1.SUBDP50", "S.2.SUBDP50"]: if q not in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True for q in ["S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR"]: if q not in rec or rec[q] is None: rec[q] = [0, 0] if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True NT = rec["I.NT"] NT_is_ref = int(NT == "ref") QSI_NT = int(rec["I.QSI_NT"]) n_D_total_1 = float(rec["S.1.TIR"][0]) + float(rec["S.1.TAR"][0]) + float(rec["S.1.TOR"][0]) t_D_total_1 = float(rec["S.2.TIR"][0]) + float(rec["S.2.TAR"][0]) + float(rec["S.2.TOR"][0]) n_D_total_2 = float(rec["S.1.TIR"][1]) + float(rec["S.1.TAR"][1]) + float(rec["S.1.TOR"][1]) t_D_total_2 = float(rec["S.2.TIR"][1]) + float(rec["S.2.TAR"][1]) + float(rec["S.2.TOR"][1]) n_TOR_ratio_1 = float(rec["S.1.TOR"][0]) / n_D_total_1 if n_D_total_1 != 0 else 0 t_TOR_ratio_1 = float(rec["S.2.TOR"][0]) / t_D_total_1 if t_D_total_1 != 0 else 0 n_TOR_ratio_2 = float(rec["S.1.TOR"][1]) / n_D_total_2 if n_D_total_2 != 0 else 0 t_TOR_ratio_2 = float(rec["S.2.TOR"][1]) / t_D_total_2 if t_D_total_2 != 0 else 0 n_DP = float(rec["S.1.DP"]) t_DP = float(rec["S.2.DP"]) in_del = 0 max_len = len(rec["REF"]) min_len = len(rec["REF"]) for a in rec["ALT"]: if len(a) > len(rec["REF"]): in_del |= 1 else: in_del |= 2 min_len = min(len(a), min_len) max_len = max(len(a), max_len) ilen = max_len - min_len n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP/float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP/float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif "DPnorm" not in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True # Ref and alt allele counts for tier1 and tier2 t_allele_ref_counts = map(float, rec['S.2.TAR']) t_allele_alt_counts = map(float, rec['S.2.TIR']) # Compute the tier1 and tier2 alt allele rates. 
if t_allele_alt_counts[0] + t_allele_ref_counts[0] == 0: t_tier1_allele_rate = 0 else: t_tier1_allele_rate = t_allele_alt_counts[0] / float(t_allele_alt_counts[0] + t_allele_ref_counts[0]) if t_allele_alt_counts[1] + t_allele_ref_counts[1] == 0: t_tier2_allele_rate = 0 else: t_tier2_allele_rate = t_allele_alt_counts[1] / float(t_allele_alt_counts[1] + t_allele_ref_counts[1]) # Ref and alt allele counts for tier1 and tier2 n_allele_ref_counts = map(float, rec['S.1.TAR']) n_allele_alt_counts = map(float, rec['S.1.TIR']) # Compute the tier1 and tier2 alt allele rates. if n_allele_alt_counts[0] + n_allele_ref_counts[0] == 0: n_tier1_allele_rate = 0 else: n_tier1_allele_rate = n_allele_alt_counts[0] / float(n_allele_alt_counts[0] + n_allele_ref_counts[0]) if n_allele_alt_counts[1] + n_allele_ref_counts[1] == 0: n_tier2_allele_rate = 0 else: n_tier2_allele_rate = n_allele_alt_counts[1] / float(n_allele_alt_counts[1] + n_allele_ref_counts[1]) bcn = 0 try: bcn = rec["S.1.FDP50"] / rec["S.1.DP50"] except: pass try: bcn = max(bcn, rec["S.2.FDP50"] / rec["S.2.DP50"]) except: pass # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "LENGTH": ilen, "LENGTHGT5": 0 if ilen <= 5 else 1, "INDELTYPE": in_del, "FILTER": ",".join(rec["FILTER"]), "NT": NT, "NT_REF": NT_is_ref, "QSI_NT": QSI_NT, "N_TOR_RATE_TIER1": n_TOR_ratio_1, "N_TOR_RATE_TIER2": n_TOR_ratio_2, "T_TOR_RATE_TIER1": t_TOR_ratio_1, "T_TOR_RATE_TIER2": t_TOR_ratio_2, "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE": n_DP_ratio, "T_DP_RATE": t_DP_ratio, "T_TIER1_ALT_RATE": t_tier1_allele_rate, "T_TIER2_ALT_RATE": t_tier2_allele_rate, "N_TIER1_ALT_RATE": n_tier1_allele_rate, "N_TIER2_ALT_RATE": n_tier2_allele_rate, "SGT": rec["I.SGT"], "entropy": rec["I.H200"], "hpol": rec["I.RC_HPOL_200"], "dinuc": rec["I.RC_DINUC_200"], "triplet": rec["I.RC_TRIPLET_200"], "bcn": bcn, "tag": tag } try: qrec["RC"] = int(rec["I.RC"]) except: qrec["RC"] = 0 try: qrec["RU"] = rec["I.RU"] except: qrec["RU"] = "" try: qrec["RU_LEN"] = len(rec["I.RU"]) except: qrec["RU_LEN"] = 0 try: qrec["IC"] = int(rec["I.IC"]) except: qrec["IC"] = 0 try: qrec["IHP"] = int(rec["I.IHP"]) except: qrec["IHP"] = 0 try: qrec["S.1.FDP50"] = float(rec["S.1.FDP50"]) except: qrec["S.1.FDP50"] = 0 try: qrec["S.2.FDP50"] = float(rec["S.2.FDP50"]) except: qrec["S.2.FDP50"] = 0 try: qrec["S.1.SUBDP50"] = float(rec["S.1.SUBDP50"]) except: qrec["S.1.SUBDP50"] = 0 try: qrec["S.2.SUBDP50"] = float(rec["S.2.SUBDP50"]) except: qrec["S.2.SUBDP50"] = 0 try: qrec["MQ"] = float(rec["I.MQ"]) except: qrec["MQ"] = 0 try: qrec["MQ0"] = float(rec["I.MQ0"]) except: qrec["MQ0"] = 0 records.append(qrec) cols = ["CHROM", "POS", "REF", "ALT", "LENGTH", "LENGTHGT5", "INDELTYPE", "FILTER", "NT", "NT_REF", "QSI_NT", "N_TOR_RATE_TIER1", "T_TOR_RATE_TIER1", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE", "SGT", "RC", "RU", "RU_LEN", "IC", "IHP", "S.1.FDP50", "S.1.SUBDP50", "MQ", "MQ0", "entropy", "hpol", "dinuc", "triplet", "bcn", "tag"] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
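
# Worked example of the INDELTYPE bitmask and LENGTH values computed above
# (standalone sketch; the REF/ALT alleles are made up):
def _demo_indeltype(ref="AT", alts=("A", "ATTT")):
    in_del, max_len, min_len = 0, len(ref), len(ref)
    for a in alts:
        if len(a) > len(ref):
            in_del |= 1  # insertion bit
        else:
            in_del |= 2  # deletion bit
        min_len = min(len(a), min_len)
        max_len = max(len(a), max_len)
    return in_del, max_len - min_len  # -> (3, 3) for this mixed ins/del site
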
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type """ features = ["CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR", "I.SGT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"] records = [] if not avg_depth: avg_depth = {} for l in list(extractHeaders(vcfname)): x = str(l).lower() if '##maxdepth_' in x: xl = str(l).split('=') xchr = xl[0][11:] avg_depth[xchr] = float(xl[1]) # logging.info("Maxdepth for %s depth from VCF header is %f" % (xchr, avg_depth[xchr])) has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] # fix missing features for q in ["I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.VQSR", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"]: if q not in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True rec["tag"] = tag NT = rec["I.NT"] NT_is_ref = int(NT == "ref") QSS_NT = int(rec["I.QSS_NT"]) try: MQ = float(rec["I.MQ"]) except: MQ = None try: MQ_ZERO = float(rec["I.MQ0"]) except: MQ_ZERO = None n_FDP = float(rec["S.1.FDP"]) t_FDP = float(rec["S.2.FDP"]) n_SDP = float(rec["S.1.SDP"]) t_SDP = float(rec["S.2.SDP"]) n_DP = float(rec["S.1.DP"]) t_DP = float(rec["S.2.DP"]) n_FDP_ratio = n_FDP/n_DP if n_DP != 0 else 0 t_FDP_ratio = t_FDP/t_DP if t_DP != 0 else 0 n_SDP_ratio = n_SDP/(n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0 t_SDP_ratio = t_SDP/(t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0 n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP/float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP/float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif "DPnorm" not in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True # Ref and alt allele counts for tier1 and tier2 allele_ref = rec["REF"] t_allele_ref_counts = map(float, rec['S.2.' + allele_ref + 'U']) alleles_alt = rec["ALT"] if alleles_alt == ['.']: t_allele_alt_counts = [0, 0] else: t_allele_alt_counts = [0, 0] for a in alleles_alt: for i in range(2): t_allele_alt_counts[i] += float(rec['S.2.' + a + 'U'][i]) # Compute the tier1 and tier2 alt allele rates. if t_allele_alt_counts[0] + t_allele_ref_counts[0] == 0: t_tier1_allele_rate = 0 else: t_tier1_allele_rate = t_allele_alt_counts[0] / float(t_allele_alt_counts[0] + t_allele_ref_counts[0]) if t_allele_alt_counts[1] + t_allele_ref_counts[1] == 0: t_tier2_allele_rate = 0 else: t_tier2_allele_rate = t_allele_alt_counts[1] / float(t_allele_alt_counts[1] + t_allele_ref_counts[1]) n_allele_ref_counts = map(float, rec['S.1.' + allele_ref + 'U']) alleles_alt = rec["ALT"] if alleles_alt == ['.']: n_allele_alt_counts = [0, 0] else: n_allele_alt_counts = [0, 0] for a in alleles_alt: for i in range(2): n_allele_alt_counts[i] += float(rec['S.1.' + a + 'U'][i]) # Compute the tier1 and tier2 alt allele rates. 
if n_allele_alt_counts[0] + n_allele_ref_counts[0] == 0: n_tier1_allele_rate = 0 else: n_tier1_allele_rate = n_allele_alt_counts[0] / float(n_allele_alt_counts[0] + n_allele_ref_counts[0]) if n_allele_alt_counts[1] + n_allele_ref_counts[1] == 0: n_tier2_allele_rate = 0 else: n_tier2_allele_rate = n_allele_alt_counts[1] / float(n_allele_alt_counts[1] + n_allele_ref_counts[1]) try: pnoise = rec["I.PNOISE"] except: pnoise = 0 try: pnoise2 = rec["I.PNOISE2"] except: pnoise2 = 0 try: snvsb = rec["I.SNVSB"] except: snvsb = 0 try: rprs = rec["I.ReadPosRankSum"] except: rprs = 0 # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "FILTER": ",".join(rec["FILTER"]), "NT": NT, "NT_REF": NT_is_ref, "QSS_NT": QSS_NT, "VQSR": rec["I.VQSR"], "N_FDP_RATE": n_FDP_ratio, "T_FDP_RATE": t_FDP_ratio, "N_SDP_RATE": n_SDP_ratio, "T_SDP_RATE": t_SDP_ratio, "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE": n_DP_ratio, "T_DP_RATE": t_DP_ratio, "T_TIER1_ALT_RATE": t_tier1_allele_rate, "T_TIER2_ALT_RATE": t_tier2_allele_rate, "N_TIER1_ALT_RATE": n_tier1_allele_rate, "N_TIER2_ALT_RATE": n_tier2_allele_rate, "MQ_SCORE": MQ, "MQ_ZERO_RATE": MQ_ZERO, "PNOISE": pnoise, "PNOISE2": pnoise2, "SNVSB": snvsb, "ReadPosRankSum": rprs, "tag": tag } records.append(qrec) cols = ["CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER", "VQSR", "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE", "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB", "ReadPosRankSum", "tag"] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
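
# Worked example of the tier1/tier2 alt allele rates computed above
# (standalone sketch with made-up Strelka tier counts):
def _demo_allele_rates(ref_counts=(40.0, 42.0), alt_counts=(10.0, 12.0)):
    tier1 = alt_counts[0] / (alt_counts[0] + ref_counts[0]) if alt_counts[0] + ref_counts[0] else 0
    tier2 = alt_counts[1] / (alt_counts[1] + ref_counts[1]) if alt_counts[1] + ref_counts[1] else 0
    return tier1, tier2  # -> (0.2, 0.222...)
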
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type :param vcfname: name of the VCF file :param tag: type of variants :param avg_depth: average chromosome depths from BAM file """ features = ["CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC", "I.QSI_NT", "I.EQSI", "I.ESF", "I.SGT", "I.RC", "I.RU", "I.IC", "I.IHP", "I.MQ", "I.MQ0", "S.1.DP", "S.2.DP", "S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR", "S.1.AF", "S.2.AF", "S.1.OF", "S.2.OF", "S.1.SOR", "S.2.SOR", "S.1.FS", "S.2.FS", "S.1.BSA", "S.2.BSA", "S.1.RR", "S.2.RR", "S.1.BCN50", "S.2.BCN50", ] cols = ["CHROM", "POS", "REF", "ALT", "LENGTH", "INDELTYPE", "FILTER", "NT", "NT_REF", "VQSR", "EQSI", "QSI_NT", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_AF", "T_AF", "N_OF", "T_OF", "N_SOR", "T_SOR", "N_FS", "T_FS", "N_BSA", "T_BSA", "N_RR", "T_RR", "N_BCN", "T_BCN", "SGT", "RC", "RU", "RU_LEN", "IC", "IHP", "MQ", "MQ0", "tag"] records = [] vcfheaders = list(extractHeaders(vcfname)) vqsr_featurenames = {} for l in vcfheaders: if '##vqsr_features' in l: try: xl = str(l).split('=', 1) xl = xl[1].split(",") for x in xl: i, n = x.split(":", 1) i = int(i) vqsr_featurenames[i] = n cols.append("VQSR." + n) logging.info("VQSR feature %i : %s" % (i, n)) except: logging.warn("Could not parse VQSR feature names from Strelka output") if not avg_depth: avg_depth = {} for l in vcfheaders: x = str(l).lower() x = x.replace("meandepth_", "maxdepth_") if '##maxdepth_' in x: xl = str(l).split('=') xchr = xl[0][12:] avg_depth[xchr] = float(xl[1]) logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr])) has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] rec["tag"] = tag # fix missing features for q in ["I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EQSI", "S.1.DP", "S.2.DP", "S.1.OF", "S.2.OF", "S.1.RR", "S.2.RR", "S.1.FS", "S.2.FS", "S.1.BSA", "S.2.BSA", "S.1.BCN50", "S.2.BCN50", "S.1.AF", "S.2.AF"]: if q not in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True for q in ["S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR"]: if q not in rec or rec[q] is None: rec[q] = [0, 0] if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True NT = rec["I.NT"] NT_is_ref = int(NT == "ref") QSI_NT = int(rec["I.QSI_NT"]) n_DP = float(rec["S.1.DP"]) t_DP = float(rec["S.2.DP"]) in_del = 0 max_len = len(rec["REF"]) min_len = len(rec["REF"]) for a in rec["ALT"]: if len(a) > len(rec["REF"]): in_del |= 1 else: in_del |= 2 min_len = min(len(a), min_len) max_len = max(len(a), max_len) ilen = max_len - min_len n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif "DPnorm" not in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "LENGTH": ilen, "INDELTYPE": in_del, "FILTER": ",".join(rec["FILTER"]), "NT": NT, "NT_REF": NT_is_ref, "QSI_NT": QSI_NT, "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE": n_DP_ratio, "T_DP_RATE": 
t_DP_ratio, "SGT": rec["I.SGT"], "tag": tag } # fields with defaults fields = [ {"n": "EQSI", "s": "I.EQSI", "def": 0, "t": float}, {"n": "VQSR", "s": "I.EQSI", "def": 0, "t": float}, {"n": "RC", "s": "I.RC", "def": 0, "t": int}, {"n": "RU", "s": "I.RU", "def": ""}, {"n": "RU_LEN", "s": "I.RU", "def": 0, "t": len}, {"n": "IC", "s": "I.IC", "def": 0, "t": int}, {"n": "IHP", "s": "I.IHP", "def": 0, "t": int}, {"n": "MQ", "s": "I.MQ", "def": 0.0, "t": float}, {"n": "MQ0", "s": "I.MQ0", "def": 0.0, "t": float}, {"n": "N_AF", "s": "S.1.AF", "def": 0.0, "t": float}, {"n": "T_AF", "s": "S.2.AF", "def": 0.0, "t": float}, {"n": "N_OF", "s": "S.1.OF", "def": 0.0, "t": float}, {"n": "T_OF", "s": "S.2.OF", "def": 0.0, "t": float}, {"n": "N_SOR", "s": "S.1.SOR", "def": 0.0, "t": float}, {"n": "T_SOR", "s": "S.2.SOR", "def": 0.0, "t": float}, {"n": "N_FS", "s": "S.1.FS", "def": 0.0, "t": float}, {"n": "T_FS", "s": "S.2.FS", "def": 0.0, "t": float}, {"n": "N_BSA", "s": "S.1.BSA", "def": 0.0, "t": float}, {"n": "T_BSA", "s": "S.2.BSA", "def": 0.0, "t": float}, {"n": "N_RR", "s": "S.1.RR", "def": 0.0, "t": float}, {"n": "T_RR", "s": "S.2.RR", "def": 0.0, "t": float}, {"n": "N_BCN", "s": "S.1.BCN50", "def": 0.0, "t": float}, {"n": "T_BCN", "s": "S.2.BCN50", "def": 0.0, "t": float}, ] for fd in fields: try: res = rec[fd["s"]] if "t" in fd: res = fd["t"](res) except: res = fd["def"] qrec[fd["n"]] = res # VQSR features try: for i, v in enumerate(rec["I.ESF"]): if i in vqsr_featurenames: try: qrec["VQSR." + vqsr_featurenames[i]] = float(v) except: # failure to parse pass except: pass for k, v in vqsr_featurenames.iteritems(): if not "VQSR." + v in qrec: qrec["VQSR." + v] = 0 records.append(qrec) if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
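
# Sketch of the "index:name" format in the ##vqsr_features header parsed above;
# the header line below is a made-up example, not taken from a real VCF.
def _demo_parse_vqsr_features(line='##vqsr_features=0:QSI_NT,1:IHP,2:RC'):
    names = {}
    for x in line.split('=', 1)[1].split(","):
        i, n = x.split(":", 1)
        names[int(i)] = n
    return names  # -> {0: 'QSI_NT', 1: 'IHP', 2: 'RC'}
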
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None): """ Return a data frame with features collected from the given VCF, tagged by given type :param vcfname: name of the VCF file :param tag: type of variants :param avg_depth: average chromosome depths from BAM file """ features = [ "CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR", "I.SGT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU" ] records = [] if not avg_depth: avg_depth = {} for l in list(extractHeaders(vcfname)): x = str(l).lower() x = x.replace("meandepth_", "maxdepth_") if '##maxdepth_' in x: xl = str(l).split('=') xchr = xl[0][12:] avg_depth[xchr] = float(xl[1]) logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr])) has_warned = {} for vr in vcfExtract(vcfname, features): rec = {} for i, ff in enumerate(features): rec[ff] = vr[i] # fix missing features for q in [ "I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.VQSR", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU" ]: if q not in rec or rec[q] is None: rec[q] = 0 if not ("feat:" + q) in has_warned: logging.warn("Missing feature %s" % q) has_warned["feat:" + q] = True rec["tag"] = tag NT = rec["I.NT"] NT_is_ref = int(NT == "ref") QSS_NT = int(rec["I.QSS_NT"]) try: MQ = float(rec["I.MQ"]) except: MQ = None try: MQ_ZERO = float(rec["I.MQ0"]) except: MQ_ZERO = None n_FDP = float(rec["S.1.FDP"]) t_FDP = float(rec["S.2.FDP"]) n_SDP = float(rec["S.1.SDP"]) t_SDP = float(rec["S.2.SDP"]) n_DP = float(rec["S.1.DP"]) t_DP = float(rec["S.2.DP"]) n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0 t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0 n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0 t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0 n_DP_ratio = 0 t_DP_ratio = 0 if avg_depth: if rec["CHROM"] in avg_depth: n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]]) t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]]) elif not rec["CHROM"] in has_warned: logging.warn("Cannot normalize depths on %s" % rec["CHROM"]) has_warned[rec["CHROM"]] = True elif "DPnorm" not in has_warned: logging.warn("Cannot normalize depths.") has_warned["DPnorm"] = True # Ref and alt allele counts for tier1 and tier2 allele_ref = rec["REF"] t_allele_ref_counts = map(float, rec['S.2.' + allele_ref + 'U']) alleles_alt = rec["ALT"] if alleles_alt == ['.']: t_allele_alt_counts = [0, 0] else: t_allele_alt_counts = [0, 0] for a in alleles_alt: for i in range(2): t_allele_alt_counts[i] += float(rec['S.2.' + a + 'U'][i]) # Compute the tier1 and tier2 alt allele rates. if t_allele_alt_counts[0] + t_allele_ref_counts[0] == 0: t_tier1_allele_rate = 0 else: t_tier1_allele_rate = t_allele_alt_counts[0] / float( t_allele_alt_counts[0] + t_allele_ref_counts[0]) if t_allele_alt_counts[1] + t_allele_ref_counts[1] == 0: t_tier2_allele_rate = 0 else: t_tier2_allele_rate = t_allele_alt_counts[1] / float( t_allele_alt_counts[1] + t_allele_ref_counts[1]) n_allele_ref_counts = map(float, rec['S.1.' + allele_ref + 'U']) alleles_alt = rec["ALT"] if alleles_alt == ['.']: n_allele_alt_counts = [0, 0] else: n_allele_alt_counts = [0, 0] for a in alleles_alt: for i in range(2): n_allele_alt_counts[i] += float(rec['S.1.' 
+ a + 'U'][i]) # Compute the tier1 and tier2 alt allele rates. if n_allele_alt_counts[0] + n_allele_ref_counts[0] == 0: n_tier1_allele_rate = 0 else: n_tier1_allele_rate = n_allele_alt_counts[0] / float( n_allele_alt_counts[0] + n_allele_ref_counts[0]) if n_allele_alt_counts[1] + n_allele_ref_counts[1] == 0: n_tier2_allele_rate = 0 else: n_tier2_allele_rate = n_allele_alt_counts[1] / float( n_allele_alt_counts[1] + n_allele_ref_counts[1]) try: pnoise = rec["I.PNOISE"] except: pnoise = 0 try: pnoise2 = rec["I.PNOISE2"] except: pnoise2 = 0 try: snvsb = rec["I.SNVSB"] except: snvsb = 0 try: rprs = rec["I.ReadPosRankSum"] except: rprs = 0 # Gather the computed data into a dict qrec = { "CHROM": rec["CHROM"], "POS": int(rec["POS"]), "REF": rec["REF"], "ALT": ",".join(rec["ALT"]), "FILTER": ",".join(rec["FILTER"]), "NT": NT, "NT_REF": NT_is_ref, "QSS_NT": QSS_NT, "VQSR": rec["I.VQSR"], "N_FDP_RATE": n_FDP_ratio, "T_FDP_RATE": t_FDP_ratio, "N_SDP_RATE": n_SDP_ratio, "T_SDP_RATE": t_SDP_ratio, "N_DP": n_DP, "T_DP": t_DP, "N_DP_RATE": n_DP_ratio, "T_DP_RATE": t_DP_ratio, "T_TIER1_ALT_RATE": t_tier1_allele_rate, "T_AF": t_tier1_allele_rate, "T_TIER2_ALT_RATE": t_tier2_allele_rate, "N_TIER1_ALT_RATE": n_tier1_allele_rate, "N_TIER2_ALT_RATE": n_tier2_allele_rate, "MQ_SCORE": MQ, "MQ_ZERO_RATE": MQ_ZERO, "PNOISE": pnoise, "PNOISE2": pnoise2, "SNVSB": snvsb, "ReadPosRankSum": rprs, "tag": tag } records.append(qrec) cols = [ "CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER", "VQSR", "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE", "T_AF", "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB", "ReadPosRankSum", "tag" ] if records: df = pandas.DataFrame(records, columns=cols) else: df = pandas.DataFrame(columns=cols) return df
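
# End-to-end sketch (hypothetical file names and tags): gather features from
# several VCFs into a single table for downstream inspection or training.
def _demo_collect_snv_features(vcfs=(("strelka.snvs.tp.vcf.gz", "TP"),
                                     ("strelka.snvs.fp.vcf.gz", "FP"))):
    frames = [extractStrelkaSNVFeatures(name, tag) for name, tag in vcfs]
    merged = pandas.concat(frames, ignore_index=True)
    merged.to_csv("snv_features.csv", index=False)
    return merged
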