Beispiel #1
0
    def collectFeatures(vcfname, tag, features, processor=None):
        """ Build a data frame of processed feature values, one row per VCF record

        :param vcfname: name of the VCF file, handed to vcfExtract
        :param tag: value written to the "tag" column of every row
        :param features: list of feature names to extract; also the column order
        :param processor: callable receiving a (feature name, raw value) tuple;
                          GenericFeatures.processValue is used when this is falsy
        :return: pandas.DataFrame with the columns features + ["tag"]
        """
        convert = processor if processor else GenericFeatures.processValue

        rows = []
        for values in vcfExtract(vcfname, features):
            row = {}
            for idx, raw in enumerate(values):
                name = features[idx]
                row[name] = convert((name, raw))
            row["tag"] = tag
            rows.append(row)

        columns = features + ["tag"]
        if not rows:
            # no records: still return a frame with the expected columns
            return pandas.DataFrame(columns=columns)
        return pandas.DataFrame(rows, columns=columns)
Beispiel #2
0
    def collectFeatures(vcfname, tag, features, processor=None):
        """ Build a data frame of processed feature values, one row per VCF record

        :param vcfname: name of the VCF file, handed to vcfExtract
        :param tag: value written to the "tag" column of every row
        :param features: list of feature names to extract; also the column order
        :param processor: callable receiving a (feature name, raw value) tuple;
                          GenericFeatures.processValue is used when this is falsy
        :return: pandas.DataFrame with the columns features + ["tag"]
        """
        convert = processor if processor else GenericFeatures.processValue

        rows = []
        for values in vcfExtract(vcfname, features):
            row = {}
            for idx, raw in enumerate(values):
                name = features[idx]
                row[name] = convert((name, raw))
            row["tag"] = tag
            rows.append(row)

        columns = features + ["tag"]
        if not rows:
            # no records: still return a frame with the expected columns
            return pandas.DataFrame(columns=columns)
        return pandas.DataFrame(rows, columns=columns)
Beispiel #3
0
def _mutectSNVAlleleDepths(raw, n_alleles):
    """ Return AD values as a list of numbers holding at least n_alleles entries

    :param raw: AD value as extracted from the VCF; anything that is not a
                list or tuple is treated as having no usable entries
    :param n_alleles: minimum number of entries required (REF plus every ALT)
    :return: new list; existing entries are converted to float, missing or
             unparsable entries are replaced by 0
    """
    depths = list(raw) if isinstance(raw, (list, tuple)) else []
    for idx in range(max(n_alleles, len(depths))):
        if idx >= len(depths):
            depths.append(0)
            continue
        try:
            depths[idx] = float(depths[idx])
        except (TypeError, ValueError):
            depths[idx] = 0
    return depths


def _mutectSNVAltRate(depths, alleles_alt):
    """ Return the share of the REF + ALT depth that supports ALT alleles (0 without coverage)

    :param depths: list produced by _mutectSNVAlleleDepths, REF depth first
    :param alleles_alt: list of ALT alleles; ['.'] is treated as "no ALT allele"
    """
    ref_count = depths[0]
    alt_count = 0
    if alleles_alt != ['.']:
        for idx in range(len(alleles_alt)):
            alt_count += depths[idx + 1]

    total = alt_count + ref_count
    if total == 0:
        return 0
    return alt_count / float(total)


def extractMutectSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants; copied into the "tag" column of every row
    :param avg_depth: mapping from chromosome name to average depth, used for
                      N_DP_RATE / T_DP_RATE; both stay 0 when it is not given
                      or does not contain the record's chromosome
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    records = []

    # "feat:I.DB" is pre-seeded, so a missing I.DB value never produces a warning
    has_warned = {"feat:I.DB": 1}

    def warn_once(key, fmt, *args):
        # emit each kind of warning only for the first record that triggers it
        if key not in has_warned:
            logging.warning(fmt, *args)
            has_warned[key] = True

    if not avg_depth:
        logging.warning(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # defaults used when the sample order cannot be read from the header
    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and \
               f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except Exception:
        # best effort: whatever was detected before the failure is kept
        logging.warning(
            "Unable to detect tumour / normal sample order from VCF header")

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    gt_features = [n_sample + "GT", t_sample + "GT"]
    value_features = [
        "I.DB", n_sample + "DP", t_sample + "DP", n_sample + "AD",
        t_sample + "AD", n_sample + "BQ", t_sample + "BQ", n_sample + "FA",
        t_sample + "FA", n_sample + "SS", t_sample + "SS"
    ]
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.DB", n_sample + "GT",
        t_sample + "GT", n_sample + "DP", t_sample + "DP", n_sample + "AD",
        t_sample + "AD", n_sample + "BQ", t_sample + "BQ", n_sample + "FA",
        t_sample + "FA", n_sample + "SS", t_sample + "SS"
    ]

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # AD needs one entry for REF plus one per ALT allele
        n_alleles = 1 + len(rec["ALT"])

        for q in gt_features:
            if rec.get(q) is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s", q)

        # fix missing features and coerce the present ones
        for q in value_features:
            value = rec.get(q)
            if q.endswith("AD"):
                # AD is indexed below, so it must always end up as a list
                if value is None:
                    warn_once("feat:" + q, "Missing feature %s", q)
                elif not isinstance(value, (list, tuple)):
                    warn_once("AD_PARSE_FAIL", "Cannot parse AD: %s",
                              str(value))
                rec[q] = _mutectSNVAlleleDepths(value, n_alleles)
            elif value is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s", q)
            elif q.endswith("FA"):
                try:
                    rec[q] = float(value)
                except ValueError:
                    rec[q] = float("NaN")
            else:
                try:
                    rec[q] = int(value)
                except ValueError:
                    rec[q] = -1

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            else:
                warn_once(rec["CHROM"], "Cannot normalize depths on %s",
                          rec["CHROM"])
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        n_allele_rate = _mutectSNVAltRate(rec[n_sample + "AD"], rec["ALT"])
        t_allele_rate = _mutectSNVAltRate(rec[t_sample + "AD"], rec["ALT"])

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "DBSNP": rec["I.DB"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_BQ": rec[n_sample + "BQ"],
            "T_BQ": rec[t_sample + "BQ"],
            "N_FA": rec[n_sample + "FA"],
            "T_FA": rec[t_sample + "FA"],
            "N_SS": rec[n_sample + "SS"],
            "T_SS": rec[t_sample + "SS"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        }
        records.append(qrec)

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "DBSNP", "N_DP", "T_DP",
        "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_AD", "T_AD", "N_BQ",
        "T_BQ", "N_FA", "T_FA", "N_SS", "T_SS", "N_ALT_RATE", "T_ALT_RATE",
        "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        # no records: still return a frame with the expected columns
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #4
0
def _mutectIndelNumberList(raw, size, fill):
    """ Return a multi-valued FORMAT field as a list of numbers with at least size entries

    :param raw: value as extracted from the VCF; anything that is not a list
                or tuple is treated as having no usable entries
    :param size: minimum number of entries required
    :param fill: placeholder for missing or unparsable entries
    :return: new list; existing entries are converted to float
    """
    values = list(raw) if isinstance(raw, (list, tuple)) else []
    for idx in range(max(size, len(values))):
        if idx >= len(values):
            values.append(fill)
            continue
        try:
            values[idx] = float(values[idx])
        except (TypeError, ValueError):
            values[idx] = fill
    return values


def _mutectIndelAltRate(depths, alleles_alt):
    """ Return the share of the total AD depth that is not REF (0 without coverage)

    :param depths: list produced by _mutectIndelNumberList, REF depth first
    :param alleles_alt: list of ALT alleles; ['.'] is treated as "no ALT allele"
    """
    ref_count = depths[0]
    alt_count = 0
    if alleles_alt != ['.']:
        # every entry after the REF depth counts as ALT support
        for idx in range(1, len(depths)):
            alt_count += depths[idx]

    total = alt_count + ref_count
    if total == 0:
        return 0
    return alt_count / float(total)


def extractMutectIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants; copied into the "tag" column of every row
    :param avg_depth: mapping from chromosome name to average depth, used for
                      N_DP_RATE / T_DP_RATE; both stay 0 when it is not given
                      or does not contain the record's chromosome
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    records = []
    has_warned = {}

    def warn_once(key, fmt, *args):
        # emit each kind of warning only for the first record that triggers it
        if key not in has_warned:
            logging.warning(fmt, *args)
            has_warned[key] = True

    if not avg_depth:
        logging.warning(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # defaults used when the sample order cannot be read from the header
    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and \
               f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except Exception:
        # best effort: whatever was detected before the failure is kept
        logging.warning(
            "Unable to detect tumour / normal sample order from VCF header")

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read">
    ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads">
    ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads">
    ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads">
    ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">

    gt_features = [n_sample + "GT", t_sample + "GT"]

    # GT is deliberately not part of this list: it is a string feature and
    # pushing it through int() would overwrite it with -1
    value_features = [
        n_sample + "DP", t_sample + "DP", n_sample + "AD", t_sample + "AD",
        n_sample + "MM", t_sample + "MM", n_sample + "MQS", t_sample + "MQS",
        n_sample + "NQSBQ", t_sample + "NQSBQ", n_sample + "NQSMM",
        t_sample + "NQSMM", n_sample + "RStart", t_sample + "RStart",
        n_sample + "REnd", t_sample + "REnd", n_sample + "SC", t_sample + "SC"
    ]

    features = ["CHROM", "POS", "REF", "ALT", "FILTER"] + \
        gt_features + value_features

    pair_suffixes = ("MM", "MQS", "NQSBQ", "NQSMM", "REnd", "RStart")

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        for q in gt_features:
            if rec.get(q) is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s", q)

        # fix missing features and coerce the present ones
        for q in value_features:
            value = rec.get(q)

            if q.endswith("AD"):
                # AD feeds the allele rates below, so placeholders are 0
                size, fill = 2, 0
            elif q.endswith("SC"):
                size, fill = 4, -1
            elif q.endswith(pair_suffixes):
                size, fill = 2, -1
            else:
                size, fill = None, None

            if size is not None:
                if value is None:
                    warn_once("feat:" + q, "Missing feature %s", q)
                elif not isinstance(value, (list, tuple)):
                    warn_once(q + "_PARSE_FAIL", "Cannot parse %s: %s", q,
                              str(value))
                rec[q] = _mutectIndelNumberList(value, size, fill)
            elif value is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s", q)
            else:
                try:
                    rec[q] = int(value)
                except ValueError:
                    rec[q] = -1

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            else:
                warn_once(rec["CHROM"], "Cannot normalize depths on %s",
                          rec["CHROM"])
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        n_allele_rate = _mutectIndelAltRate(rec[n_sample + "AD"], rec["ALT"])
        t_allele_rate = _mutectIndelAltRate(rec[t_sample + "AD"], rec["ALT"])

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "N_MM": rec[n_sample + "MM"],
            "T_MM": rec[t_sample + "MM"],
            "N_MQS": rec[n_sample + "MQS"],
            "T_MQS": rec[t_sample + "MQS"],
            "N_NQSBQ": rec[n_sample + "NQSBQ"],
            "T_NQSBQ": rec[t_sample + "NQSBQ"],
            "N_NQSMM": rec[n_sample + "NQSMM"],
            "T_NQSMM": rec[t_sample + "NQSMM"],
            "N_RStart": rec[n_sample + "RStart"],
            "T_RStart": rec[t_sample + "RStart"],
            "N_REnd": rec[n_sample + "REnd"],
            "T_REnd": rec[t_sample + "REnd"],
            "N_SC": rec[n_sample + "SC"],
            "T_SC": rec[t_sample + "SC"],
            "tag": tag
        }
        records.append(qrec)

    # NOTE(review): no DBSNP value is collected in this function, so that
    # column is always empty; it is kept so the column layout stays unchanged
    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "DBSNP", "N_DP", "T_DP",
        "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_AD", "T_AD", "N_ALT_RATE",
        "T_ALT_RATE", "N_MM", "T_MM", "N_MQS", "T_MQS", "N_NQSBQ", "T_NQSBQ",
        "N_NQSMM", "T_NQSMM", "N_RStart", "T_RStart", "N_REnd", "T_REnd",
        "N_SC", "T_SC", "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        # no records: still return a frame with the expected columns
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #5
0
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type
    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None, depths are read from the ##maxdepth_ /
                      ##meandepth_ / ##depth_ lines of the VCF header instead
    :return: pandas.DataFrame with one row per record and one extra
             "E.<name>" column per scoring feature named in the VCF header
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC",
        "I.QSI_NT", "I.EVS", "I.EVSF", "I.SGT", "I.RC", "I.RU", "I.IC",
        "I.IHP", "I.MQ", "I.MQ0", "S.1.DP", "S.2.DP", "S.1.TAR", "S.2.TAR",
        "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR", "S.1.BCN50", "S.2.BCN50",
        "S.1.FDP50", "S.2.FDP50",
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "LENGTH", "INDELTYPE", "FILTER", "NT",
        "NT_REF", "EVS", "QSI_NT", "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
        "N_BCN", "T_BCN", "N_FDP", "T_FDP", "N_AF", "T_AF", "SGT", "RC", "RU",
        "RU_LEN", "IC", "IHP", "MQ", "MQ0", "tag"
    ]

    # features with a scalar default / with a two-element list default
    scalar_defaults = [
        "I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EVS", "S.1.DP", "S.2.DP",
        "S.1.BCN50", "S.2.BCN50", "S.1.FDP50", "S.2.FDP50"
    ]
    pair_defaults = [
        "S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR"
    ]

    # (output name, source feature, default, converter or None); the default
    # is used whenever reading or converting the source value fails
    derived_fields = [
        ("EVS", "I.EVS", 0, float),
        ("VQSR", "I.VQSR", 0, float),
        ("RC", "I.RC", 0, int),
        ("RU", "I.RU", "", None),
        ("RU_LEN", "I.RU", 0, len),
        ("IC", "I.IC", 0, int),
        ("IHP", "I.IHP", 0, int),
        ("MQ", "I.MQ", 0.0, float),
        ("MQ0", "I.MQ0", 0.0, float),
        ("N_BCN", "S.1.BCN50", 0.0, float),
        ("T_BCN", "S.2.BCN50", 0.0, float),
        ("N_FDP", "S.1.FDP50", 0.0, float),
        ("T_FDP", "S.2.FDP50", 0.0, float),
    ]

    records = []

    vcfheaders = list(extractHeaders(vcfname))

    # position in I.EVSF -> scoring feature name
    evs_featurenames = {}

    for l in vcfheaders:
        if '##indel_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except Exception:
                logging.warning(
                    "Could not parse scoring feature names from Strelka output"
                )

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            line = str(l)
            # match case-insensitively, but take the chromosome name from the
            # original line so its case is preserved
            x = line.lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                _, _, rest = line.partition("_")
                xl = rest.split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    has_warned = {}

    def warn_once(key, fmt, *args):
        # emit each kind of warning only for the first record that triggers it
        if key not in has_warned:
            logging.warning(fmt, *args)
            has_warned[key] = True

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]
        rec["tag"] = tag

        # fix missing features
        for q in scalar_defaults:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s", q)

        for q in pair_defaults:
            if rec.get(q) is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s", q)

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # bit 1: some ALT is longer than REF; bit 2: some ALT is not longer
        in_del = 0

        max_len = len(rec["REF"])
        min_len = len(rec["REF"])

        for a in rec["ALT"]:
            if len(a) > len(rec["REF"]):
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)

        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except Exception:
                # unknown chromosome or unusable depth: the ratios keep the
                # values assigned so far
                warn_once(rec["CHROM"], "Cannot normalize depths on %s",
                          rec["CHROM"])
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # extract observed AF from strelka counts. TIR = ALT; TAR = REF
        try:
            n_af = float(rec["S.1.TIR"][0]) / (float(rec["S.1.TIR"][0]) +
                                               float(rec["S.1.TAR"][0]))
        except Exception:
            # includes the zero-coverage case (division by zero)
            n_af = 0

        try:
            t_af = float(rec["S.2.TIR"][0]) / (float(rec["S.2.TIR"][0]) +
                                               float(rec["S.2.TAR"][0]))
        except Exception:
            t_af = 0

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": n_af,
            "T_AF": t_af,
            "SGT": rec["I.SGT"],
            "tag": tag
        }

        # fields with defaults
        for name, source, default, convert in derived_fields:
            try:
                res = rec[source]
                if convert is not None:
                    res = convert(res)
            except Exception:
                res = default

            qrec[name] = res

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except Exception:
                        # failure to parse: the default below applies
                        pass
        except Exception:
            # I.EVSF absent or not iterable: every E.* feature gets the default
            pass

        for v in evs_featurenames.values():
            if "E." + v not in qrec:
                qrec["E." + v] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        # no records: still return a frame with the expected columns
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #6
0
def _strelkaSNVAltRate(rec, sample_prefix):
    """ Return the ALT share of the first-tier REF + ALT base counts for one sample

    :param rec: feature name -> value mapping for one VCF record
    :param sample_prefix: "S.1." or "S.2."
    :return: ALT / (ALT + REF) using element 0 of the per-base count features
             (<prefix>AU, CU, GU, TU); 0 when both counts are 0
    """
    try:
        # a real list is needed here: the result is indexed below
        ref_counts = [
            float(c) for c in rec[sample_prefix + rec["REF"] + "U"]
        ]
    except Exception:
        # REF is not a single A/C/G/T base, or the counts are unusable
        ref_counts = [0, 0]

    try:
        alt_counts = [0, 0]
        for a in rec["ALT"]:
            for i in range(2):
                alt_counts[i] += float(rec[sample_prefix + a + "U"][i])
    except Exception:
        alt_counts = [0, 0]

    total = alt_counts[0] + ref_counts[0]
    if total == 0:
        return 0
    return alt_counts[0] / float(total)


def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type
    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None, depths are read from the ##maxdepth_ /
                      ##meandepth_ / ##depth_ lines of the VCF header instead
    :return: pandas.DataFrame with one row per record and one extra
             "E.<name>" column per scoring feature named in the VCF header
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC",
        "I.QSS_NT", "I.VQSR", "I.EVS", "I.EVSF", "I.SGT", "I.MQ", "I.MQ0",
        "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP",
        "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
        "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER",
        "EVS", "VQSR", "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_AF", "T_AF", "MQ", "MQ0",
        "SNVSB", "ReadPosRankSum", "tag"
    ]

    # features that default to 0 when the record has no value for them
    zero_defaults = [
        "I.QSS_NT", "I.MQ", "I.MQ0", "I.SNVSB", "I.ReadPosRankSum",
        "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP",
        "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU",
        "S.1.TU", "S.2.TU"
    ]

    vcfheaders = list(extractHeaders(vcfname))

    # position in I.EVSF -> scoring feature name
    evs_featurenames = {}
    for l in vcfheaders:
        if '##snv_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except Exception:
                logging.warning(
                    "Could not parse scoring feature names from Strelka output"
                )

    records = []

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            line = str(l)
            # match case-insensitively, but take the chromosome name from the
            # original line so its case is preserved
            x = line.lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                _, _, rest = line.partition("_")
                xl = rest.split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    has_warned = {}

    def warn_once(key, fmt, *args):
        # emit each kind of warning only for the first record that triggers it
        if key not in has_warned:
            logging.warning(fmt, *args)
            has_warned[key] = True

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # read VQSR and EVS values, if they are not present, set to -1
        # (old versions of Strelka)
        for q in ("I.VQSR", "I.EVS"):
            try:
                rec[q] = float(rec[q])
            except Exception:
                rec[q] = -1.0

        # fix missing features
        for q in zero_defaults:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s", q)

        rec["tag"] = tag

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        try:
            MQ = float(rec["I.MQ"])
        except Exception:
            MQ = None

        try:
            MQ_ZERO = float(rec["I.MQ0"])
        except Exception:
            MQ_ZERO = None

        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0

        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except Exception:
                # unknown chromosome or unusable depth: the ratios keep the
                # values assigned so far
                warn_once(rec["CHROM"], "Cannot normalize depths on %s",
                          rec["CHROM"])
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Alt allele rates from the first element of the per-base counts
        t_tier1_allele_rate = _strelkaSNVAltRate(rec, "S.2.")
        n_tier1_allele_rate = _strelkaSNVAltRate(rec, "S.1.")

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "EVS": rec["I.EVS"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": n_tier1_allele_rate,
            "T_AF": t_tier1_allele_rate,
            "MQ": MQ,
            "MQ0": MQ_ZERO,
            # both were already defaulted to 0 above when missing
            "SNVSB": rec["I.SNVSB"],
            "ReadPosRankSum": rec["I.ReadPosRankSum"],
            "tag": tag
        }

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except Exception:
                        # failure to parse: the default below applies
                        pass
        except Exception:
            # I.EVSF absent or not iterable: every E.* feature gets the default
            pass

        for v in evs_featurenames.values():
            if "E." + v not in qrec:
                qrec["E." + v] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        # no records: still return a frame with the expected columns
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #7
0
def extractPiscesSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Each name listed in a "##snv_scoring_features" header line adds an
    "E.<name>" column to the frame; this function does not fill those columns.
    When avg_depth is empty or None, depths are read from "##maxdepth_",
    "##meandepth_" or "##depth_" header lines (matched case-insensitively).

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.DP", "I.EVS", "S.1.GT",
        "S.1.GQ", "S.1.AD", "S.1.DP", "S.1.VF", "S.1.NL", "S.1.SB", "S.1.NC",
        "S.1.AQ", "S.1.GQX"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "GQX", "EVS", "T_DP",
        "T_DP_RATE", "T_AF", "tag"
    ]

    vcfheaders = list(extractHeaders(vcfname))

    for hdr in vcfheaders:
        if '##snv_scoring_features' in hdr:
            try:
                names = str(hdr).split('=', 1)[1].split(",")
            except (IndexError, ValueError):
                logging.warning(
                    "Could not parse scoring feature names from Pisces output")
                continue
            for i, n in enumerate(names):
                cols.append("E." + n)
                logging.info("Scoring feature %i : %s" % (i, n))

    records = []

    if not avg_depth:
        avg_depth = {}

        for hdr in vcfheaders:
            lowered = str(hdr).lower()
            lowered = lowered.replace("##meandepth_", "##maxdepth_")
            lowered = lowered.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in lowered:
                # the chromosome name is cut from the original line, not the
                # lower-cased copy, so its case is kept
                _, _, rest = hdr.partition("_")
                parts = str(rest).split('=')
                xchr = parts[0]
                avg_depth[xchr] = float(parts[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    # keys already warned about, so that each warning is logged only once
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # read EVS value, if it's not present, set to -1 (old versions of Pisces)
        evs_key = "I.SomaticEVS" if "I.SomaticEVS" in rec else "I.EVS"
        try:
            rec["I.EVS"] = float(rec[evs_key])
        except (KeyError, TypeError, ValueError):
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["S.1.NC", "S.1.AQ"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        t_DP = float(rec["S.1.DP"])
        t_VF = float(rec["S.1.VF"])
        GQX = float(rec["S.1.GQX"])

        t_DP_ratio = 0

        if avg_depth:
            try:
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except (KeyError, TypeError, ValueError, ZeroDivisionError):
                if rec["CHROM"] not in has_warned:
                    logging.warning("Cannot normalize depths on %s" %
                                    rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Gather the computed data into a dict
        records.append({
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "GQX": GQX,
            "EVS": rec["I.EVS"],
            "T_DP": t_DP,
            "T_DP_RATE": t_DP_ratio,
            "T_AF": t_VF,
            "tag": tag
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #8
0
def _indel_cast(value, cast, default=0):
    """ Return cast(value), or default when the value cannot be converted """
    try:
        return cast(value)
    except (TypeError, ValueError, OverflowError):
        return default


def _indel_rate(numerator, denominator):
    """ Return numerator / denominator as a float, or 0 for a zero denominator """
    if denominator == 0:
        return 0
    return numerator / float(denominator)


def _indel_fill_missing(rec, names, default, has_warned):
    """ Set absent or None entries of rec to default, warning once per feature name """
    for q in names:
        if q not in rec or rec[q] is None:
            rec[q] = default
            if ("feat:" + q) not in has_warned:
                logging.warning("Missing feature %s" % q)
                has_warned["feat:" + q] = True


def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: value stored in the "tag" column of every row
    :param avg_depth: mapping of chromosome name to the depth used to
                      normalize N_DP / T_DP; when empty or None, the
                      "##maxdepth_<chrom>=<value>" header lines are used
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSI_NT",
                "I.SGT", "I.RC", "I.RU", "I.IC", "I.IHP",
                "I.MQ", "I.MQ0",
                "I.H200", "I.RC_HPOL_200", "I.RC_DINUC_200", "I.RC_TRIPLET_200",
                "S.1.DP", "S.2.DP",
                "S.1.TAR", "S.2.TAR",
                "S.1.TIR", "S.2.TIR",
                "S.1.TOR", "S.2.TOR",
                "S.1.DP50", "S.2.DP50",
                "S.1.FDP50", "S.2.FDP50",
                "S.1.SUBDP50", "S.2.SUBDP50"]

    # features replaced by a scalar 0 when missing
    scalar_fields = ["I.QSI_NT", "I.RC", "I.IC", "I.IHP",
                     "S.1.DP", "S.2.DP", "I.H200", "I.RC_HPOL_200",
                     "I.RC_DINUC_200", "I.RC_TRIPLET_200",
                     "S.1.FDP50", "S.2.FDP50",
                     "S.1.SUBDP50", "S.2.SUBDP50"]

    # features holding a [tier1, tier2] pair, replaced by [0, 0] when missing
    tier_fields = ["S.1.TAR", "S.2.TAR",
                   "S.1.TIR", "S.2.TIR",
                   "S.1.TOR", "S.2.TOR"]

    # NOTE(review): N_TOR_RATE_TIER2, T_TOR_RATE_TIER2, S.2.FDP50 and
    # S.2.SUBDP50 are computed per record but not listed here, so they do not
    # appear in the returned frame -- confirm this is intentional.
    cols = ["CHROM",
            "POS",
            "REF",
            "ALT",
            "LENGTH",
            "LENGTHGT5",
            "INDELTYPE",
            "FILTER",
            "NT",
            "NT_REF",
            "QSI_NT",
            "N_TOR_RATE_TIER1",
            "T_TOR_RATE_TIER1",
            "N_DP",
            "T_DP",
            "N_DP_RATE",
            "T_DP_RATE",
            "T_TIER1_ALT_RATE",
            "T_TIER2_ALT_RATE",
            "N_TIER1_ALT_RATE",
            "N_TIER2_ALT_RATE",
            "SGT",
            "RC",
            "RU",
            "RU_LEN",
            "IC",
            "IHP",
            "S.1.FDP50",
            "S.1.SUBDP50",
            "MQ",
            "MQ0",
            "entropy",
            "hpol",
            "dinuc",
            "triplet",
            "bcn",
            "tag"]

    records = []

    if not avg_depth:
        avg_depth = {}

        for hdr in extractHeaders(vcfname):
            if '##maxdepth_' in str(hdr).lower():
                parts = str(hdr).split('=')
                # drop the 11 leading characters ("##maxdepth_") to get the name
                avg_depth[parts[0][11:]] = float(parts[1])

    # keys already warned about, so that each warning is logged only once
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        _indel_fill_missing(rec, scalar_fields, 0, has_warned)
        _indel_fill_missing(rec, tier_fields, [0, 0], has_warned)

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        # Per-tier counts; the S.1 values feed the N_* outputs and the S.2
        # values the T_* outputs. Built as real lists so that they can be
        # indexed on Python 3 as well, where map() returns an iterator.
        n_tar = [float(v) for v in rec["S.1.TAR"]]
        t_tar = [float(v) for v in rec["S.2.TAR"]]
        n_tir = [float(v) for v in rec["S.1.TIR"]]
        t_tir = [float(v) for v in rec["S.2.TIR"]]
        n_tor = [float(rec["S.1.TOR"][i]) for i in range(2)]
        t_tor = [float(rec["S.2.TOR"][i]) for i in range(2)]

        n_total = [n_tir[i] + n_tar[i] + n_tor[i] for i in range(2)]
        t_total = [t_tir[i] + t_tar[i] + t_tor[i] for i in range(2)]

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # INDELTYPE bit mask: 1 = some ALT longer than REF, 2 = some ALT not longer
        in_del = 0
        ref_len = len(rec["REF"])
        max_len = ref_len
        min_len = ref_len

        for a in rec["ALT"]:
            if len(a) > ref_len:
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)

        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # bcn is the larger of the two FDP50 / DP50 ratios; a sample whose
        # ratio cannot be computed is skipped. float() avoids truncating
        # integer division on Python 2.
        bcn = 0

        try:
            bcn = float(rec["S.1.FDP50"]) / float(rec["S.1.DP50"])
        except (TypeError, ValueError, ZeroDivisionError):
            pass

        try:
            bcn = max(bcn, float(rec["S.2.FDP50"]) / float(rec["S.2.DP50"]))
        except (TypeError, ValueError, ZeroDivisionError):
            pass

        # Gather the computed data into a dict
        records.append({
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "LENGTHGT5": 0 if ilen <= 5 else 1,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_TOR_RATE_TIER1": _indel_rate(n_tor[0], n_total[0]),
            "N_TOR_RATE_TIER2": _indel_rate(n_tor[1], n_total[1]),
            "T_TOR_RATE_TIER1": _indel_rate(t_tor[0], t_total[0]),
            "T_TOR_RATE_TIER2": _indel_rate(t_tor[1], t_total[1]),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": _indel_rate(t_tir[0], t_tir[0] + t_tar[0]),
            "T_TIER2_ALT_RATE": _indel_rate(t_tir[1], t_tir[1] + t_tar[1]),
            "N_TIER1_ALT_RATE": _indel_rate(n_tir[0], n_tir[0] + n_tar[0]),
            "N_TIER2_ALT_RATE": _indel_rate(n_tir[1], n_tir[1] + n_tar[1]),
            "SGT": rec["I.SGT"],
            "entropy": rec["I.H200"],
            "hpol": rec["I.RC_HPOL_200"],
            "dinuc": rec["I.RC_DINUC_200"],
            "triplet": rec["I.RC_TRIPLET_200"],
            "bcn": bcn,
            "tag": tag,
            # values that may be missing or unparseable fall back to 0
            "RC": _indel_cast(rec["I.RC"], int),
            "RU": rec["I.RU"],
            "RU_LEN": _indel_cast(rec["I.RU"], len),
            "IC": _indel_cast(rec["I.IC"], int),
            "IHP": _indel_cast(rec["I.IHP"], int),
            "S.1.FDP50": _indel_cast(rec["S.1.FDP50"], float),
            "S.2.FDP50": _indel_cast(rec["S.2.FDP50"], float),
            "S.1.SUBDP50": _indel_cast(rec["S.1.SUBDP50"], float),
            "S.2.SUBDP50": _indel_cast(rec["S.2.SUBDP50"], float),
            "MQ": _indel_cast(rec["I.MQ"], float),
            "MQ0": _indel_cast(rec["I.MQ0"], float),
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #9
0
def _snv_float_or_none(value):
    """ Return float(value), or None when the value cannot be converted """
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _snv_rate(numerator, denominator):
    """ Return numerator / denominator as a float, or 0 for a zero denominator """
    if denominator == 0:
        return 0
    return numerator / float(denominator)


def _snv_allele_counts(rec, sample, alleles):
    """ Sum the [tier1, tier2] "<sample><allele>U" counts of rec over the given alleles """
    counts = [0, 0]
    for a in alleles:
        for i in range(2):
            counts[i] += float(rec[sample + a + "U"][i])
    return counts


def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: value stored in the "tag" column of every row
    :param avg_depth: mapping of chromosome name to the depth used to
                      normalize N_DP / T_DP; when empty or None, the
                      "##maxdepth_<chrom>=<value>" header lines are used
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    :raises KeyError: when REF or an ALT allele (other than a lone ".") has no
                      "S.n.<allele>U" entry among the extracted features
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR",
                "I.SGT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2",
                "I.SNVSB", "I.ReadPosRankSum",
                "S.1.SDP", "S.2.SDP",
                "S.1.FDP", "S.2.FDP",
                "S.1.DP", "S.2.DP",
                "S.1.AU", "S.2.AU",
                "S.1.CU", "S.2.CU",
                "S.1.GU", "S.2.GU",
                "S.1.TU", "S.2.TU"]

    # features replaced by a scalar 0 when missing
    scalar_fields = ["I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.VQSR",
                     "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP",
                     "S.1.FDP", "S.2.FDP",
                     "S.1.DP", "S.2.DP"]

    # per-allele counts are indexed as [tier1, tier2] below, so a missing one
    # must be replaced by a pair rather than by a scalar
    tier_fields = ["S.1.AU", "S.2.AU",
                   "S.1.CU", "S.2.CU",
                   "S.1.GU", "S.2.GU",
                   "S.1.TU", "S.2.TU"]

    cols = ["CHROM", "POS", "REF", "ALT",
            "NT", "NT_REF", "QSS_NT", "FILTER", "VQSR",
            "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE",
            "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB",
            "ReadPosRankSum", "tag"]

    records = []

    if not avg_depth:
        avg_depth = {}

        for hdr in extractHeaders(vcfname):
            if '##maxdepth_' in str(hdr).lower():
                parts = str(hdr).split('=')
                # drop the 11 leading characters ("##maxdepth_") to get the name
                avg_depth[parts[0][11:]] = float(parts[1])

    # keys already warned about, so that each warning is logged only once
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        for names, is_pair in ((scalar_fields, False), (tier_fields, True)):
            for q in names:
                if q not in rec or rec[q] is None:
                    rec[q] = [0, 0] if is_pair else 0
                    if ("feat:" + q) not in has_warned:
                        logging.warning("Missing feature %s" % q)
                        has_warned["feat:" + q] = True

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        MQ = _snv_float_or_none(rec["I.MQ"])
        MQ_ZERO = _snv_float_or_none(rec["I.MQ0"])

        # the S.1 values feed the N_* outputs and the S.2 values the T_* outputs
        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Ref and alt allele counts for tier1 and tier2; a lone "." ALT
        # contributes no alt counts
        ref_alleles = [rec["REF"]]
        alt_alleles = [] if rec["ALT"] == ['.'] else rec["ALT"]

        t_ref = _snv_allele_counts(rec, "S.2.", ref_alleles)
        t_alt = _snv_allele_counts(rec, "S.2.", alt_alleles)
        n_ref = _snv_allele_counts(rec, "S.1.", ref_alleles)
        n_alt = _snv_allele_counts(rec, "S.1.", alt_alleles)

        # Gather the computed data into a dict
        records.append({
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "N_FDP_RATE": _snv_rate(n_FDP, n_DP),
            "T_FDP_RATE": _snv_rate(t_FDP, t_DP),
            "N_SDP_RATE": _snv_rate(n_SDP, n_DP + n_SDP),
            "T_SDP_RATE": _snv_rate(t_SDP, t_DP + t_SDP),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": _snv_rate(t_alt[0], t_alt[0] + t_ref[0]),
            "T_TIER2_ALT_RATE": _snv_rate(t_alt[1], t_alt[1] + t_ref[1]),
            "N_TIER1_ALT_RATE": _snv_rate(n_alt[0], n_alt[0] + n_ref[0]),
            "N_TIER2_ALT_RATE": _snv_rate(n_alt[1], n_alt[1] + n_ref[1]),
            "MQ_SCORE": MQ,
            "MQ_ZERO_RATE": MQ_ZERO,
            "PNOISE": rec["I.PNOISE"],
            "PNOISE2": rec["I.PNOISE2"],
            "SNVSB": rec["I.SNVSB"],
            "ReadPosRankSum": rec["I.ReadPosRankSum"],
            "tag": tag
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #10
0
def _varscan_alt_rate(alt_count, ref_count):
    """ Return alt / (alt + ref) as a float, or 0 when the two counts sum to zero """
    total = alt_count + ref_count
    if total == 0:
        return 0
    return alt_count / float(total)


def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: value stored in the "tag" column of every row
    :param avg_depth: mapping of chromosome name to the depth used to
                      normalize N_DP / T_DP; without it both rates stay 0
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    # The parsed headers are not used below; the call is kept because its side
    # effects are not visible from here.
    extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # NOTE(review): these prefixes are concatenated directly with the field
    # name (giving e.g. "NORMALGT"), unlike the "S.1." style prefixes of the
    # other extractors -- confirm that vcfExtract resolves such names.
    n_sample = "NORMAL"
    t_sample = "TUMOR"

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                tsn, t_sample))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.SSC", "I.GPV", "I.SPV",
                n_sample + "GT", t_sample + "GT",  # Genotype
                n_sample + "GQ", t_sample + "GQ",  # Genotype quality
                n_sample + "DP", t_sample + "DP",  # Read depth
                n_sample + "RD", t_sample + "RD",  # Reference depth
                n_sample + "AD", t_sample + "AD",  # Alternative depth
                n_sample + "FREQ", t_sample + "FREQ"  # Alt. frequency (FA in MuTect)
                ]

    gt_fields = [n_sample + "GT", t_sample + "GT"]

    numeric_fields = [n_sample + "GT", t_sample + "GT",
                      n_sample + "GQ", t_sample + "GQ",
                      n_sample + "DP", t_sample + "DP",
                      n_sample + "AD", t_sample + "AD",
                      n_sample + "RD", t_sample + "RD",
                      n_sample + "FREQ", t_sample + "FREQ"]

    cols = [
        "CHROM",
        "POS",
        "REF",
        "ALT",
        "FILTER",
        "SSC",
        "GPV",
        "SPV",
        "N_DP",
        "T_DP",
        "N_DP_RATE",
        "T_DP_RATE",
        "N_GT",
        "T_GT",
        "N_GQ",
        "T_GQ",
        "N_AD",
        "T_AD",
        "N_FA",
        "T_FA",
        "N_ALT_RATE",
        "T_ALT_RATE",
        "tag"]

    # keys already warned about, so that each warning is logged only once
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        for q in gt_fields:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        # fix missing features
        # NOTE(review): the GT fields are part of this list too, so every
        # genotype that int() cannot parse (including the "." set above) ends
        # up as -1 -- confirm this is intended.
        # NOTE(review): a FREQ value that float() cannot parse (for instance
        # one carrying a trailing "%") becomes NaN -- confirm the format
        # VarScan2 writes.
        for q in numeric_fields:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True
            elif q.endswith("FREQ"):
                try:
                    rec[q] = float(rec[q])
                except (TypeError, ValueError):
                    rec[q] = float("NaN")
            else:
                try:
                    rec[q] = int(rec[q])
                except (TypeError, ValueError):
                    # unparseable values, including list-valued ones, get the sentinel
                    rec[q] = -1

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # a lone "." ALT counts as zero alternative reads
        no_alt = rec["ALT"] == ['.']
        n_allele_alt_count = 0 if no_alt else rec[n_sample + "AD"]
        t_allele_alt_count = 0 if no_alt else rec[t_sample + "AD"]

        # Gather the computed data into a dict
        records.append({
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_GQ": rec[n_sample + "GQ"],
            "T_GQ": rec[t_sample + "GQ"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_FA": rec[n_sample + "FREQ"],
            "T_FA": rec[t_sample + "FREQ"],
            "N_ALT_RATE": _varscan_alt_rate(n_allele_alt_count, rec[n_sample + "RD"]),
            "T_ALT_RATE": _varscan_alt_rate(t_allele_alt_count, rec[t_sample + "RD"]),
            "tag": tag
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #11
0
def extractPiscesSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Each name listed in a "##snv_scoring_features" header line adds an
    "E.<name>" column to the frame; this function does not fill those columns.
    When avg_depth is empty or None, depths are read from "##maxdepth_",
    "##meandepth_" or "##depth_" header lines (matched case-insensitively).

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.DP",
                "I.EVS",
                "S.1.GT",
                "S.1.GQ",
                "S.1.AD",
                "S.1.DP",
                "S.1.VF",
                "S.1.NL",
                "S.1.SB",
                "S.1.NC",
                "S.1.AQ",
                "S.1.GQX"]

    cols = ["CHROM", "POS", "REF", "ALT",
            "FILTER", "GQX", "EVS",
            "T_DP", "T_DP_RATE",
            "T_AF",
            "tag"]

    vcfheaders = list(extractHeaders(vcfname))

    for hdr in vcfheaders:
        if '##snv_scoring_features' in hdr:
            try:
                names = str(hdr).split('=', 1)[1].split(",")
            except (IndexError, ValueError):
                logging.warning("Could not parse scoring feature names from Pisces output")
                continue
            for i, n in enumerate(names):
                cols.append("E." + n)
                logging.info("Scoring feature %i : %s" % (i, n))

    records = []

    if not avg_depth:
        avg_depth = {}

        for hdr in vcfheaders:
            lowered = str(hdr).lower()
            lowered = lowered.replace("##meandepth_", "##maxdepth_")
            lowered = lowered.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in lowered:
                # the chromosome name is cut from the original line, not the
                # lower-cased copy, so its case is kept
                _, _, rest = hdr.partition("_")
                parts = str(rest).split('=')
                xchr = parts[0]
                avg_depth[xchr] = float(parts[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    # keys already warned about, so that each warning is logged only once
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # read EVS value, if it's not present, set to -1 (old versions of Pisces)
        evs_key = "I.SomaticEVS" if "I.SomaticEVS" in rec else "I.EVS"
        try:
            rec["I.EVS"] = float(rec[evs_key])
        except (KeyError, TypeError, ValueError):
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["S.1.NC", "S.1.AQ"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        t_DP = float(rec["S.1.DP"])
        t_VF = float(rec["S.1.VF"])
        GQX = float(rec["S.1.GQX"])

        t_DP_ratio = 0

        if avg_depth:
            try:
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except (KeyError, TypeError, ValueError, ZeroDivisionError):
                if rec["CHROM"] not in has_warned:
                    logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Gather the computed data into a dict
        records.append({
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "GQX": GQX,
            "EVS": rec["I.EVS"],
            "T_DP": t_DP,
            "T_DP_RATE": t_DP_ratio,
            "T_AF": t_VF,
            "tag": tag
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #12
0
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Every record yielded by vcfExtract becomes one row. Feature values that
    are absent or None are replaced by zeros (a warning is logged once per
    feature), so a VCF lacking some annotations still produces a full table.

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file (dict keyed by
                      chromosome name). When empty or None, depths are read
                      from the ``##MaxDepth_<chrom>=`` / ``##MeanDepth_<chrom>=``
                      VCF header lines instead.
    :return: pandas.DataFrame with the columns listed in ``cols`` below plus
             one ``VQSR.<name>`` column per feature named in a
             ``##vqsr_features`` header line; it has no rows when the VCF
             yields no records.
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSI_NT", "I.EQSI", "I.ESF",
                "I.SGT", "I.RC", "I.RU",
                "I.IC", "I.IHP",
                "I.MQ", "I.MQ0",
                "S.1.DP", "S.2.DP",
                "S.1.TAR", "S.2.TAR",
                "S.1.TIR", "S.2.TIR",
                "S.1.TOR", "S.2.TOR",
                "S.1.AF", "S.2.AF",
                "S.1.OF", "S.2.OF",
                "S.1.SOR", "S.2.SOR",
                "S.1.FS", "S.2.FS",
                "S.1.BSA", "S.2.BSA",
                "S.1.RR", "S.2.RR",
                "S.1.BCN50", "S.2.BCN50",
                ]

    cols = ["CHROM",
            "POS",
            "REF",
            "ALT",
            "LENGTH",
            "INDELTYPE",
            "FILTER",
            "NT",
            "NT_REF",
            "VQSR",
            "EQSI",
            "QSI_NT",
            "N_DP",
            "T_DP",
            "N_DP_RATE",
            "T_DP_RATE",
            "N_AF",
            "T_AF",
            "N_OF",
            "T_OF",
            "N_SOR",
            "T_SOR",
            "N_FS",
            "T_FS",
            "N_BSA",
            "T_BSA",
            "N_RR",
            "T_RR",
            "N_BCN",
            "T_BCN",
            "SGT",
            "RC",
            "RU",
            "RU_LEN",
            "IC",
            "IHP",
            "MQ",
            "MQ0",
            "tag"]

    # Features replaced by a scalar 0 when missing.
    zero_default_features = ["I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EQSI",
                             "S.1.DP", "S.2.DP",
                             "S.1.OF", "S.2.OF",
                             "S.1.RR", "S.2.RR",
                             "S.1.FS", "S.2.FS",
                             "S.1.BSA", "S.2.BSA",
                             "S.1.BCN50", "S.2.BCN50",
                             "S.1.AF", "S.2.AF"]

    # Features replaced by a two-element zero list when missing.
    pair_default_features = ["S.1.TAR", "S.2.TAR",
                             "S.1.TIR", "S.2.TIR",
                             "S.1.TOR", "S.2.TOR"]

    # (output column, source feature, default, converter or None).
    # Built once rather than per record; a value that cannot be read or
    # converted falls back to the default.
    field_specs = [
        ("EQSI", "I.EQSI", 0, float),
        ("VQSR", "I.EQSI", 0, float),  # the VQSR column is filled from I.EQSI too
        ("RC", "I.RC", 0, int),
        ("RU", "I.RU", "", None),
        ("RU_LEN", "I.RU", 0, len),
        ("IC", "I.IC", 0, int),
        ("IHP", "I.IHP", 0, int),
        ("MQ", "I.MQ", 0.0, float),
        ("MQ0", "I.MQ0", 0.0, float),
        ("N_AF", "S.1.AF", 0.0, float),
        ("T_AF", "S.2.AF", 0.0, float),
        ("N_OF", "S.1.OF", 0.0, float),
        ("T_OF", "S.2.OF", 0.0, float),
        ("N_SOR", "S.1.SOR", 0.0, float),
        ("T_SOR", "S.2.SOR", 0.0, float),
        ("N_FS", "S.1.FS", 0.0, float),
        ("T_FS", "S.2.FS", 0.0, float),
        ("N_BSA", "S.1.BSA", 0.0, float),
        ("T_BSA", "S.2.BSA", 0.0, float),
        ("N_RR", "S.1.RR", 0.0, float),
        ("T_RR", "S.2.RR", 0.0, float),
        ("N_BCN", "S.1.BCN50", 0.0, float),
        ("T_BCN", "S.2.BCN50", 0.0, float),
    ]

    records = []

    vcfheaders = list(extractHeaders(vcfname))

    # Index within I.ESF -> feature name, taken from "##vqsr_features=0:a,1:b,...".
    vqsr_featurenames = {}

    for l in vcfheaders:
        if '##vqsr_features' in l:
            try:
                for entry in str(l).split('=', 1)[1].split(","):
                    idx, fname = entry.split(":", 1)
                    idx = int(idx)
                    vqsr_featurenames[idx] = fname
                    cols.append("VQSR." + fname)
                    logging.info("VQSR feature %i : %s" % (idx, fname))
            except Exception:
                # Best effort: a malformed header only costs the VQSR.* columns.
                logging.warning("Could not parse VQSR feature names from Strelka output")

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            line = str(l)
            lowered = line.lower().replace("meandepth_", "maxdepth_")
            if '##maxdepth_' in lowered:
                xl = line.split('=')
                # Take everything after the first "_" as the chromosome name.
                # A fixed-width slice cannot work for both spellings because
                # "##MaxDepth_" and "##MeanDepth_" differ in length.
                xchr = xl[0].partition("_")[2]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    has_warned = {}

    def warn_once(key, message):
        """Log ``message`` as a warning the first time ``key`` is seen."""
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        for q in zero_default_features:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)

        for q in pair_default_features:
            if q not in rec or rec[q] is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s" % q)

        chrom = rec["CHROM"]
        ref = rec["REF"]

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        # Sample 1 is reported as normal (N_*), sample 2 as tumour (T_*).
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # INDELTYPE is a bit mask: bit 0 is set when some ALT is longer than
        # REF, bit 1 when some ALT is not longer than REF.
        in_del = 0
        max_len = len(ref)
        min_len = len(ref)

        for a in rec["ALT"]:
            if len(a) > len(ref):
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)

        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Gather the computed data into a dict
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": ref,
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "SGT": rec["I.SGT"],
            "tag": tag
        }

        # fields with defaults
        for name, source, default, convert in field_specs:
            try:
                value = rec[source]
                if convert is not None:
                    value = convert(value)
            except Exception:
                value = default
            qrec[name] = value

        # VQSR features: I.ESF is read positionally, using the indices
        # declared in the ##vqsr_features header.
        try:
            for idx, v in enumerate(rec["I.ESF"]):
                if idx in vqsr_featurenames:
                    try:
                        qrec["VQSR." + vqsr_featurenames[idx]] = float(v)
                    except (TypeError, ValueError):
                        # failure to parse; the zero default below applies
                        pass
        except Exception:
            # I.ESF absent or not iterable: every VQSR.* column defaults to 0.
            pass

        for fname in vqsr_featurenames.values():
            qrec.setdefault("VQSR." + fname, 0)

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #13
0
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Every record yielded by vcfExtract becomes one row. Feature values that
    are absent or None are replaced by zeros (a warning is logged once per
    feature).

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file (dict keyed by
                      chromosome name). When empty or None, depths are read
                      from the ``##MaxDepth_<chrom>=`` / ``##MeanDepth_<chrom>=``
                      VCF header lines instead.
    :return: pandas.DataFrame with the columns listed in ``cols`` below; it
             has no rows when the VCF yields no records.
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC",
        "I.QSS_NT", "I.VQSR", "I.SGT", "I.MQ", "I.MQ0", "I.PNOISE",
        "I.PNOISE2", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP",
        "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU", "S.1.CU",
        "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
    ]

    # Features replaced by a scalar 0 when missing.
    scalar_features = [
        "I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.VQSR",
        "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP",
        "S.2.FDP", "S.1.DP", "S.2.DP"
    ]

    # Per-base count features are indexed as [tier1, tier2] further down, so
    # a missing one must become a two-element list: a scalar 0 would fail as
    # soon as it is iterated or indexed.
    tier_count_features = [
        "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
        "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER",
        "VQSR", "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE", "N_DP",
        "T_DP", "N_DP_RATE", "T_DP_RATE", "T_TIER1_ALT_RATE",
        "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE", "T_AF",
        "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB",
        "ReadPosRankSum", "tag"
    ]

    records = []

    if not avg_depth:
        avg_depth = {}

        for l in extractHeaders(vcfname):
            line = str(l)
            lowered = line.lower().replace("meandepth_", "maxdepth_")
            if '##maxdepth_' in lowered:
                xl = line.split('=')
                # Take everything after the first "_" as the chromosome name.
                # A fixed-width slice cannot work for both spellings because
                # "##MaxDepth_" and "##MeanDepth_" differ in length.
                xchr = xl[0].partition("_")[2]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    has_warned = {}

    def warn_once(key, message):
        """Log ``message`` as a warning the first time ``key`` is seen."""
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def to_float_or_none(value):
        """Convert ``value`` to float, or return None when that fails."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return None

    def alt_allele_rates(rec, prefix):
        """Return [tier1, tier2] alt allele rates for one sample.

        ``prefix`` is the sample prefix ("S.1." or "S.2."). A rate is 0 when
        the sample has no ref or alt counts in that tier.
        """
        # A list (not a lazy map object) because the counts are indexed below.
        ref_counts = [float(c) for c in rec[prefix + rec["REF"] + "U"]]

        alt_counts = [0, 0]
        if rec["ALT"] != ['.']:
            for a in rec["ALT"]:
                for tier in range(2):
                    alt_counts[tier] += float(rec[prefix + a + "U"][tier])

        rates = []
        for tier in range(2):
            total = alt_counts[tier] + ref_counts[tier]
            if total == 0:
                rates.append(0)
            else:
                rates.append(alt_counts[tier] / float(total))
        return rates

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        for q in scalar_features:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)

        for q in tier_count_features:
            if q not in rec or rec[q] is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s" % q)

        chrom = rec["CHROM"]

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        MQ = to_float_or_none(rec["I.MQ"])
        MQ_ZERO = to_float_or_none(rec["I.MQ0"])

        # Sample 1 is reported as normal (N_*), sample 2 as tumour (T_*).
        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0

        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Tier1 and tier2 alt allele rates, tumour first and then normal.
        t_tier1_allele_rate, t_tier2_allele_rate = alt_allele_rates(rec, "S.2.")
        n_tier1_allele_rate, n_tier2_allele_rate = alt_allele_rates(rec, "S.1.")

        # Gather the computed data into a dict
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": t_tier1_allele_rate,
            # T_AF is the same value as the tumour tier1 alt rate.
            "T_AF": t_tier1_allele_rate,
            "T_TIER2_ALT_RATE": t_tier2_allele_rate,
            "N_TIER1_ALT_RATE": n_tier1_allele_rate,
            "N_TIER2_ALT_RATE": n_tier2_allele_rate,
            "MQ_SCORE": MQ,
            "MQ_ZERO_RATE": MQ_ZERO,
            # These keys always exist here: the missing-feature pass above
            # has already replaced absent values with 0.
            "PNOISE": rec["I.PNOISE"],
            "PNOISE2": rec["I.PNOISE2"],
            "SNVSB": rec["I.SNVSB"],
            "ReadPosRankSum": rec["I.ReadPosRankSum"],
            "tag": tag
        }
        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #14
0
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Every record yielded by vcfExtract becomes one row. The first sample
    column is treated as normal (N_*) and the second as tumour (T_*).

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths (dict keyed by chromosome
                      name); when empty or None the *_DP_RATE columns stay 0
    :return: pandas.DataFrame with the columns listed in ``cols`` below; it
             has no rows when the VCF yields no records.
    """
    records = []

    if not avg_depth:
        logging.warning(
            "No average depths available, normalized depth features cannot be calculated"
        )

    # The header contents are not used (sample names are fixed below); the
    # call is kept so the headers are still read exactly as before.
    extractHeadersJSON(vcfname)

    # TODO could figure this out automatically
    nsn = "NORMAL"
    tsn = "TUMOR"
    n_sample = "S.1."
    t_sample = "S.2."

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    features = [
        "CHROM",
        "POS",
        "REF",
        "ALT",
        "FILTER",
        "I.SSC",
        "I.GPV",
        "I.SPV",
        n_sample + "GT",
        t_sample + "GT",  # Genotype
        n_sample + "GQ",
        t_sample + "GQ",  # Genotype quality
        n_sample + "DP",
        t_sample + "DP",  # Read depth
        n_sample + "RD",
        t_sample + "RD",  # Reference depth
        n_sample + "AD",
        t_sample + "AD",  # Alternative depth
        n_sample + "FREQ",
        t_sample + "FREQ"  # Alt. frequence (FA in MuTect)
    ]

    genotype_features = [n_sample + "GT", t_sample + "GT"]

    numeric_features = [
        n_sample + "GT", t_sample + "GT", n_sample + "GQ",
        t_sample + "GQ", n_sample + "DP", t_sample + "DP",
        n_sample + "AD", t_sample + "AD", n_sample + "RD",
        t_sample + "RD", n_sample + "FREQ", t_sample + "FREQ"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "SSC", "GPV", "SPV", "N_DP",
        "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_GQ", "T_GQ",
        "N_AD", "T_AD", "N_FA", "T_FA", "N_ALT_RATE", "T_ALT_RATE", "tag"
    ]

    has_warned = {}

    def warn_once(key, message):
        """Log ``message`` as a warning the first time ``key`` is seen."""
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def alt_allele_rate(rec, prefix):
        """Return AD / (AD + RD) for one sample, or 0 when the sum is 0.

        The alt count is taken as 0 when the record's ALT is ['.'].
        """
        ref_count = rec[prefix + "RD"]
        alt_count = 0 if rec["ALT"] == ['.'] else rec[prefix + "AD"]
        total = alt_count + ref_count
        if total == 0:
            return 0
        return alt_count / float(total)

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}
        chrom = rec["CHROM"]

        for q in genotype_features:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s" % q)

        # fix missing features
        # NOTE(review): the GT entries are in this list too, so a genotype
        # that int() cannot parse (including the "." default above) ends up
        # as -1. Left unchanged here; confirm whether that is intended.
        for q in numeric_features:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)
            elif q.endswith("FREQ"):
                try:
                    rec[q] = float(rec[q])
                except ValueError:
                    rec[q] = float("NaN")
            else:
                try:
                    rec[q] = int(rec[q])
                except ValueError:
                    rec[q] = -1

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        n_allele_rate = alt_allele_rate(rec, n_sample)
        t_allele_rate = alt_allele_rate(rec, t_sample)

        # Gather the computed data into a dict
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_GQ": rec[n_sample + "GQ"],
            "T_GQ": rec[t_sample + "GQ"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_FA": rec[n_sample + "FREQ"],
            "T_FA": rec[t_sample + "FREQ"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        }

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #15
0
def extractMutectSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Every record yielded by vcfExtract becomes one row. The tumour / normal
    sample columns are looked up from the ``tumor_sample_name=`` and
    ``normal_sample_name=`` options of the MuTect GATKCommandLine header;
    when that fails, sample 1 is taken as tumour and sample 2 as normal.

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths (dict keyed by chromosome
                      name); when empty or None the *_DP_RATE columns stay 0
    :return: pandas.DataFrame with the columns listed in ``cols`` below; it
             has no rows when the VCF yields no records.
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    t_sample = "S.1."
    n_sample = "S.2."

    def sample_prefix(samples, name, fallback):
        """Return "S.<k>." for the 1-based position of ``name`` in ``samples``."""
        for i, x in enumerate(samples):
            if x == name:
                return "S.%i." % (i + 1)
        return fallback

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    t_sample = sample_prefix(samples, tsn, t_sample)
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    n_sample = sample_prefix(samples, nsn, n_sample)
    except Exception:
        # Best effort: keep whatever prefixes have been determined so far.
        logging.warning("Unable to detect tumour / normal sample order from VCF header")

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                tsn, t_sample))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                n_sample + "GT", t_sample + "GT",
                n_sample + "DP", t_sample + "DP",
                n_sample + "QSS", t_sample + "QSS",
                n_sample + "AD", t_sample + "AD"]

    genotype_features = [n_sample + "GT", t_sample + "GT"]

    checked_features = ["I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                        "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                        n_sample + "GT", t_sample + "GT",
                        n_sample + "DP", t_sample + "DP",
                        n_sample + "QSS", t_sample + "QSS",
                        n_sample + "AD", t_sample + "AD"]

    cols = ["CHROM", "POS", "REF", "ALT",
            "FILTER", "TLOD", "NLOD", "DBSNP",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT",
            "N_AD", "T_AD", "N_QSS", "T_QSS",
            "N_AF", "T_AF",
            "tag"]

    has_warned = {}

    def warn_once(key, message):
        """Log ``message`` as a warning the first time ``key`` is seen."""
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def alt_allele_rate(rec, prefix):
        """Return alt / (alt + ref) from the sample's AD list, or 0 when the sum is 0.

        AD[0] is the ref count and AD[1:] holds one count per ALT allele;
        the alt count is taken as 0 when the record's ALT is ['.'].
        """
        depths = rec[prefix + "AD"]
        ref_count = depths[0]
        alt_count = 0
        if rec["ALT"] != ['.']:
            for a in range(len(rec["ALT"])):
                alt_count += float(depths[a + 1])
        total = alt_count + ref_count
        if total == 0:
            return 0
        return alt_count / float(total)

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}
        chrom = rec["CHROM"]

        # AD / QSS carry one value for REF plus one per ALT allele.
        n_alleles = 1 + len(rec["ALT"])

        for q in genotype_features:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s" % q)

        # fix missing features
        for q in checked_features:
            is_list_feature = q.endswith("AD") or q.endswith("QSS")

            if q not in rec or rec[q] is None:
                # List features are indexed per allele further down, so
                # their default has to be a zero list rather than a scalar.
                rec[q] = [0] * n_alleles if is_list_feature else 0
                warn_once("feat:" + q, "Missing feature %s" % q)
            elif is_list_feature:
                if not isinstance(rec[q], list):
                    # Warn once per feature, but reset EVERY unparsable
                    # value so the per-allele handling below is safe.
                    warn_once(q + "_PARSE_FAIL", "Cannot parse %s: %s" % (q, str(rec[q])))
                    rec[q] = [0] * n_alleles

                # Pad short lists and coerce every per-allele entry to a
                # float. This runs for well-formed lists too.
                for xx in range(n_alleles):
                    if len(rec[q]) <= xx:
                        rec[q].append(0)
                    else:
                        try:
                            rec[q][xx] = float(rec[q][xx])
                        except (TypeError, ValueError):
                            rec[q][xx] = 0
            else:
                # NOTE(review): this integer coercion also applies to the
                # GT and TLOD / NLOD entries of the list (values int() cannot
                # parse become -1). Left unchanged; confirm it is intended.
                try:
                    rec[q] = int(rec[q])
                except ValueError:
                    rec[q] = -1

        TLOD = float(rec["I.TLOD"])
        NLOD = float(rec["I.NLOD"])

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        n_allele_rate = alt_allele_rate(rec, n_sample)
        t_allele_rate = alt_allele_rate(rec, t_sample)

        # Gather the computed data into a dict.
        # ECNT / HCNT / MAX_ED / MIN_ED are collected here but are not
        # listed in ``cols``, so they do not appear in the returned frame.
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "DBSNP": rec["I.DB"],
            "TLOD": TLOD,
            "NLOD": NLOD,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_QSS": rec[n_sample + "QSS"],
            "T_QSS": rec[t_sample + "QSS"],
            "N_AF": n_allele_rate,
            "T_AF": t_allele_rate,
            "ECNT": rec["I.ECNT"],
            "HCNT": rec["I.HCNT"],
            "MAX_ED": rec["I.MAX_ED"],
            "MIN_ED": rec["I.MIN_ED"],
            "tag": tag
        }
        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
Beispiel #16
0
def _coerce_mutect_indel_list(values, min_len):
    """Return ``values`` as a list of floats with at least ``min_len`` entries.

    Entries that cannot be converted to float become -1; lists shorter than
    ``min_len`` are padded with -1.
    """
    out = []
    for x in values:
        try:
            out.append(float(x))
        except (TypeError, ValueError):
            out.append(-1)
    out.extend([-1] * (min_len - len(out)))
    return out


def _mutect_indel_alt_rate(ad, alleles_alt):
    """Return the share of the AD counts that belongs to non-reference alleles.

    ``ad`` is a list of numeric counts with the reference count first.
    Returns 0 when there is no ALT allele or when the total is not positive
    (which also covers the -1 placeholders used for unparsable values).
    """
    ref_count = ad[0]
    alt_count = 0 if alleles_alt == ['.'] else sum(ad[1:])
    total = alt_count + ref_count
    if total <= 0:
        return 0
    return alt_count / float(total)


def extractMutectIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file, passed to extractHeadersJSON and vcfExtract
    :param tag: value stored in the "tag" column of every row
    :param avg_depth: optional mapping CHROM -> average depth; when given,
                      N_DP_RATE / T_DP_RATE are DP divided by that depth,
                      otherwise they stay 0
    :return: pandas.DataFrame with one row per extracted record

    The tumour / normal sample columns default to "S.1." / "S.2." and are
    re-assigned when a MuTect "GATKCommandLine" header entry names the samples.

    Per-sample values are normalised as follows:
      * GT is passed through unchanged ("." when missing),
      * DP becomes an int (0 when missing, -1 when unparsable),
      * AD, MM, MQS, NQSBQ, NQSMM, RStart, REnd become lists of >= 2 floats and
        SC a list of >= 4 floats (zeros when missing, -1 for unparsable entries).
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    t_sample = "S.1."
    n_sample = "S.2."

    # Best effort only: any unexpected header layout falls back to the defaults above.
    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except Exception:
        logging.warning("Unable to detect tumour / normal sample order from VCF header")

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)",
                 nsn, n_sample, tsn, t_sample)

    # Keys for which a warning has already been logged (each is reported once).
    has_warned = set()

    ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read">
    ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads">
    ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads">
    ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads">
    ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">

    # Minimum number of entries expected for each list-valued FORMAT field.
    list_lengths = {"AD": 2, "MM": 2, "MQS": 2, "NQSBQ": 2, "NQSMM": 2,
                    "RStart": 2, "REnd": 2, "SC": 4}

    # (feature key, FORMAT field) pairs, normal sample first for every field.
    sample_keys = []
    for field in ["GT", "DP", "AD", "MM", "MQS", "NQSBQ", "NQSMM", "RStart", "REnd", "SC"]:
        sample_keys.append((n_sample + field, field))
        sample_keys.append((t_sample + field, field))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER"] + [key for key, _ in sample_keys]

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # fix missing features and normalise value types
        for q, field in sample_keys:
            value = rec.get(q)
            if value is None:
                if field == "GT":
                    rec[q] = "."
                elif field in list_lengths:
                    # A list default keeps the indexing below (AD[0], ...) valid.
                    rec[q] = [0] * list_lengths[field]
                else:
                    rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned.add("feat:" + q)
            elif field == "GT":
                # Genotypes such as "0/1" are not numeric; keep them as extracted.
                pass
            elif field in list_lengths:
                if isinstance(value, list):
                    rec[q] = _coerce_mutect_indel_list(value, list_lengths[field])
                else:
                    if (q + "_PARSE_FAIL") not in has_warned:
                        logging.warning("Cannot parse %s: %s" % (q, str(value)))
                        has_warned.add(q + "_PARSE_FAIL")
                    # Always substitute the placeholder, not only on the first warning.
                    rec[q] = [-1] * list_lengths[field]
            else:
                try:
                    rec[q] = int(value)
                except (TypeError, ValueError):
                    rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned.add(rec["CHROM"])
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned.add("DPnorm")

        n_allele_rate = _mutect_indel_alt_rate(rec[n_sample + "AD"], rec["ALT"])
        t_allele_rate = _mutect_indel_alt_rate(rec[t_sample + "AD"], rec["ALT"])

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag,
        }
        # Store the extracted values (not the feature key names) per sample.
        for field in ["MM", "MQS", "NQSBQ", "NQSMM", "RStart", "REnd", "SC"]:
            qrec["N_" + field] = rec[n_sample + field]
            qrec["T_" + field] = rec[t_sample + field]
        records.append(qrec)

    # NOTE: "DBSNP" is never filled in by this function (no dbSNP field is
    # extracted), so that column is empty; it is kept so the frame layout
    # callers see does not change.
    cols = [
        "CHROM",
        "POS",
        "REF",
        "ALT",
        "FILTER",
        "DBSNP",
        "N_DP",
        "T_DP",
        "N_DP_RATE",
        "T_DP_RATE",
        "N_GT",
        "T_GT",
        "N_AD",
        "T_AD",
        "N_ALT_RATE",
        "T_ALT_RATE",
        "N_MM",
        "T_MM",
        "N_MQS",
        "T_MQS",
        "N_NQSBQ",
        "T_NQSBQ",
        "N_NQSMM",
        "T_NQSMM",
        "N_RStart",
        "T_RStart",
        "N_REnd",
        "T_REnd",
        "N_SC",
        "T_SC",
        "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df