def collectFeatures(vcfname, tag, features, processor=None):
    """ Collect the requested features from a VCF into a tagged data frame

    :param vcfname: name of the VCF file
    :param tag: value stored in the "tag" column of every row
    :param features: list of feature names passed on to vcfExtract
    :param processor: callable that receives a (feature name, raw value) tuple
                      and returns the value to store; GenericFeatures.processValue
                      is used when none is given
    :return: pandas.DataFrame with one column per feature plus "tag"
    """
    convert = processor if processor else GenericFeatures.processValue

    rows = []
    for values in vcfExtract(vcfname, features):
        row = {}
        for idx, value in enumerate(values):
            name = features[idx]
            row[name] = convert((name, value))
        row["tag"] = tag
        rows.append(row)

    columns = features + ["tag"]
    if not rows:
        # no records: still return a frame that has the expected columns
        return pandas.DataFrame(columns=columns)
    return pandas.DataFrame(rows, columns=columns)
def extractMutectSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants; copied into the "tag" column of every row
    :param avg_depth: optional mapping from chromosome name to average depth.
                      When the record's chromosome is present, N_DP_RATE and
                      T_DP_RATE are the sample depths divided by that value;
                      otherwise they are 0.
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    records = []

    if not avg_depth:
        logging.warning(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # Fallback prefixes, used when the header does not identify the samples.
    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except Exception:
        # best effort: keep the default prefixes when the header cannot be read
        logging.warning(
            "Unable to detect tumour / normal sample order from VCF header")

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.DB",
        n_sample + "GT", t_sample + "GT",
        n_sample + "DP", t_sample + "DP",
        n_sample + "AD", t_sample + "AD",
        n_sample + "BQ", t_sample + "BQ",
        n_sample + "FA", t_sample + "FA",
        n_sample + "SS", t_sample + "SS"
    ]

    genotype_features = [n_sample + "GT", t_sample + "GT"]
    numeric_features = [
        "I.DB",
        n_sample + "DP", t_sample + "DP",
        n_sample + "AD", t_sample + "AD",
        n_sample + "BQ", t_sample + "BQ",
        n_sample + "FA", t_sample + "FA",
        n_sample + "SS", t_sample + "SS"
    ]

    # Keys that already triggered a warning; I.DB is pre-seeded so that a
    # missing I.DB never produces a "Missing feature" warning.
    has_warned = {"feat:I.DB": 1}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        for q in genotype_features:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        # fix missing features
        for q in numeric_features:
            if q not in rec or rec[q] is None:
                if q.endswith("AD"):
                    # AD is indexed per allele further down, so a missing
                    # value has to become a list of zero counts, not a scalar
                    rec[q] = [0] * (1 + len(rec["ALT"]))
                else:
                    rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True
            elif q.endswith("FA"):
                try:
                    rec[q] = float(rec[q])
                except (TypeError, ValueError):
                    rec[q] = float("NaN")
            elif q.endswith("AD"):
                if type(rec[q]) is not list:
                    # membership test: the key is absent until the first failure
                    if "AD_PARSE_FAIL" not in has_warned:
                        logging.warning("Cannot parse AD: %s" % str(rec[q]))
                        has_warned["AD_PARSE_FAIL"] = True
                    rec[q] = [0] * (1 + len(rec["ALT"]))

                # make sure there is one numeric count for REF and for each ALT
                for xx in range(0, 1 + len(rec["ALT"])):
                    if len(rec[q]) <= xx:
                        rec[q].append(0)
                    else:
                        try:
                            rec[q][xx] = float(rec[q][xx])
                        except (TypeError, ValueError):
                            rec[q][xx] = 0
            else:
                try:
                    rec[q] = int(rec[q])
                except (TypeError, ValueError):
                    rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Fraction of alt-supporting counts among ref + alt counts, per sample.
        alleles_alt = rec["ALT"]
        allele_rate = {}
        for prefix in (n_sample, t_sample):
            depths = rec[prefix + "AD"]
            ref_count = depths[0]
            alt_count = 0
            if alleles_alt != ['.']:
                for a in range(len(alleles_alt)):
                    alt_count += float(depths[a + 1])
            if alt_count + ref_count == 0:
                allele_rate[prefix] = 0
            else:
                allele_rate[prefix] = alt_count / float(alt_count + ref_count)

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "DBSNP": rec["I.DB"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_BQ": rec[n_sample + "BQ"],
            "T_BQ": rec[t_sample + "BQ"],
            "N_FA": rec[n_sample + "FA"],
            "T_FA": rec[t_sample + "FA"],
            "N_SS": rec[n_sample + "SS"],
            "T_SS": rec[t_sample + "SS"],
            "N_ALT_RATE": allele_rate[n_sample],
            "T_ALT_RATE": allele_rate[t_sample],
            "tag": tag
        }
        records.append(qrec)

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "DBSNP",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
        "N_GT", "T_GT", "N_AD", "T_AD", "N_BQ", "T_BQ",
        "N_FA", "T_FA", "N_SS", "T_SS",
        "N_ALT_RATE", "T_ALT_RATE", "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractMutectIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants; copied into the "tag" column of every row
    :param avg_depth: optional mapping from chromosome name to average depth.
                      When the record's chromosome is present, N_DP_RATE and
                      T_DP_RATE are the sample depths divided by that value;
                      otherwise they are 0.
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    records = []

    if not avg_depth:
        logging.warning(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # Fallback prefixes, used when the header does not identify the samples.
    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except Exception:
        # best effort: keep the default prefixes when the header cannot be read
        logging.warning(
            "Unable to detect tumour / normal sample order from VCF header")

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    # Keys that already triggered a warning, so each is logged only once.
    has_warned = {}

    ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read">
    ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads">
    ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads">
    ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads">
    ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER",
        n_sample + "GT", t_sample + "GT",
        n_sample + "DP", t_sample + "DP",
        n_sample + "AD", t_sample + "AD",
        n_sample + "MM", t_sample + "MM",
        n_sample + "MQS", t_sample + "MQS",
        n_sample + "NQSBQ", t_sample + "NQSBQ",
        n_sample + "NQSMM", t_sample + "NQSMM",
        n_sample + "RStart", t_sample + "RStart",
        n_sample + "REnd", t_sample + "REnd",
        n_sample + "SC", t_sample + "SC"
    ]

    genotype_features = [n_sample + "GT", t_sample + "GT"]
    # NOTE(review): GT is part of this list, so after the "." default above it
    # is passed through int() and ends up as -1 for any non-integer genotype.
    # extractMutectSNVFeatures does not do this - confirm whether it is intended.
    checked_features = features[5:]

    # suffixes of the FORMAT fields that are normalised to two numeric values
    pair_suffixes = ("AD", "MM", "MQS", "NQSBQ", "NQSMM", "REnd", "RStart")

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        for q in genotype_features:
            if q not in rec or rec[q] is None:
                rec[q] = "."
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        # fix missing features
        for q in checked_features:
            if q not in rec or rec[q] is None:
                # AD is indexed further down, so a missing value has to become
                # a list of zero counts rather than a scalar
                rec[q] = [0, 0] if q.endswith("AD") else 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True
            elif q.endswith(pair_suffixes):
                if type(rec[q]) is not list:
                    # membership test: the key is absent until the first failure
                    if (q + "_PARSE_FAIL") not in has_warned:
                        logging.warning("Cannot parse %s: %s" % (q, str(rec[q])))
                        has_warned[q + "_PARSE_FAIL"] = True
                    rec[q] = [-1, -1]

                for xx in range(2):
                    if len(rec[q]) <= xx:
                        rec[q].append(-1)
                    else:
                        try:
                            rec[q][xx] = float(rec[q][xx])
                        except (TypeError, ValueError):
                            rec[q][xx] = -1
            elif q.endswith("SC"):
                if type(rec[q]) is not list:
                    if (q + "_PARSE_FAIL") not in has_warned:
                        logging.warning("Cannot parse %s: %s" % (q, str(rec[q])))
                        has_warned[q + "_PARSE_FAIL"] = True
                    rec[q] = [-1, -1, -1, -1]
                else:
                    for xx in range(4):
                        if len(rec[q]) <= xx:
                            rec[q].append(-1)
                        else:
                            try:
                                rec[q][xx] = float(rec[q][xx])
                            except (TypeError, ValueError):
                                rec[q][xx] = -1
            else:
                try:
                    rec[q] = int(rec[q])
                except (TypeError, ValueError):
                    rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Fraction of alt-supporting counts among all AD counts, per sample.
        alleles_alt = rec["ALT"]
        allele_rate = {}
        for prefix in (n_sample, t_sample):
            depths = rec[prefix + "AD"]
            ref_count = depths[0]
            alt_count = 0
            if alleles_alt != ['.']:
                # every AD entry after the first counts as alt support
                for a in range(1, len(depths)):
                    alt_count += float(depths[a])
            if alt_count + ref_count == 0:
                allele_rate[prefix] = 0
            else:
                allele_rate[prefix] = alt_count / float(alt_count + ref_count)

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_ALT_RATE": allele_rate[n_sample],
            "T_ALT_RATE": allele_rate[t_sample],
            # store the extracted values, not the feature names
            "N_MM": rec[n_sample + "MM"],
            "T_MM": rec[t_sample + "MM"],
            "N_MQS": rec[n_sample + "MQS"],
            "T_MQS": rec[t_sample + "MQS"],
            "N_NQSBQ": rec[n_sample + "NQSBQ"],
            "T_NQSBQ": rec[t_sample + "NQSBQ"],
            "N_NQSMM": rec[n_sample + "NQSMM"],
            "T_NQSMM": rec[t_sample + "NQSMM"],
            "N_RStart": rec[n_sample + "RStart"],
            "T_RStart": rec[t_sample + "RStart"],
            "N_REnd": rec[n_sample + "REnd"],
            "T_REnd": rec[t_sample + "REnd"],
            "N_SC": rec[n_sample + "SC"],
            "T_SC": rec[t_sample + "SC"],
            "tag": tag
        }
        records.append(qrec)

    # NOTE(review): "DBSNP" is listed here but no value is ever stored under
    # that key in this function; the column is kept so the output layout does
    # not change for existing callers.
    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "DBSNP",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
        "N_GT", "T_GT", "N_AD", "T_AD",
        "N_ALT_RATE", "T_ALT_RATE",
        "N_MM", "T_MM", "N_MQS", "T_MQS",
        "N_NQSBQ", "T_NQSBQ", "N_NQSMM", "T_NQSMM",
        "N_RStart", "T_RStart", "N_REnd", "T_REnd",
        "N_SC", "T_SC", "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None, depths are read from the "##maxdepth_" /
                      "##meandepth_" / "##depth_" VCF header lines instead
    :return: pandas.DataFrame with one row per record yielded by vcfExtract;
             one extra "E.<name>" column is added for every name listed in the
             "##indel_scoring_features" header line
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER",
        "I.NT", "I.SOMATIC", "I.QSI_NT", "I.EVS", "I.EVSF", "I.SGT",
        "I.RC", "I.RU", "I.IC", "I.IHP", "I.MQ", "I.MQ0",
        "S.1.DP", "S.2.DP",
        "S.1.TAR", "S.2.TAR",
        "S.1.TIR", "S.2.TIR",
        "S.1.TOR", "S.2.TOR",
        "S.1.BCN50", "S.2.BCN50",
        "S.1.FDP50", "S.2.FDP50",
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "LENGTH", "INDELTYPE", "FILTER",
        "NT", "NT_REF", "EVS", "QSI_NT",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
        "N_BCN", "T_BCN", "N_FDP", "T_FDP", "N_AF", "T_AF",
        "SGT", "RC", "RU", "RU_LEN", "IC", "IHP", "MQ", "MQ0",
        "tag"
    ]

    # fields with defaults: "n" = output name, "s" = source feature,
    # "def" = value used when the source is unusable, "t" = optional converter
    fields = [
        {"n": "EVS", "s": "I.EVS", "def": 0, "t": float},
        {"n": "VQSR", "s": "I.VQSR", "def": 0, "t": float},
        {"n": "RC", "s": "I.RC", "def": 0, "t": int},
        {"n": "RU", "s": "I.RU", "def": ""},
        {"n": "RU_LEN", "s": "I.RU", "def": 0, "t": len},
        {"n": "IC", "s": "I.IC", "def": 0, "t": int},
        {"n": "IHP", "s": "I.IHP", "def": 0, "t": int},
        {"n": "MQ", "s": "I.MQ", "def": 0.0, "t": float},
        {"n": "MQ0", "s": "I.MQ0", "def": 0.0, "t": float},
        {"n": "N_BCN", "s": "S.1.BCN50", "def": 0.0, "t": float},
        {"n": "T_BCN", "s": "S.2.BCN50", "def": 0.0, "t": float},
        {"n": "N_FDP", "s": "S.1.FDP50", "def": 0.0, "t": float},
        {"n": "T_FDP", "s": "S.2.FDP50", "def": 0.0, "t": float},
    ]

    records = []

    vcfheaders = list(extractHeaders(vcfname))

    # index in I.EVSF -> scoring feature name
    evs_featurenames = {}

    for l in vcfheaders:
        if '##indel_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except Exception:
                logging.warning(
                    "Could not parse scoring feature names from Strelka output"
                )

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                # everything after the first "_" is "<chromosome>=<depth>"
                chrom_and_depth = l.partition("_")[2]
                xl = str(chrom_and_depth).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    # Keys that already triggered a warning, so each is logged only once.
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        rec["tag"] = tag

        # fix missing features
        for q in [
                "I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EVS",
                "S.1.DP", "S.2.DP",
                "S.1.BCN50", "S.2.BCN50",
                "S.1.FDP50", "S.2.FDP50"
        ]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        for q in [
                "S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR",
                "S.2.TOR"
        ]:
            if q not in rec or rec[q] is None:
                rec[q] = [0, 0]
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # INDELTYPE bit 1: some ALT is longer than REF; bit 2: some ALT is not
        in_del = 0
        max_len = len(rec["REF"])
        min_len = len(rec["REF"])

        for a in rec["ALT"]:
            if len(a) > len(rec["REF"]):
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)

        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except Exception:
                # unknown chromosome or unusable depth: leave the rates at 0
                if rec["CHROM"] not in has_warned:
                    logging.warning("Cannot normalize depths on %s" %
                                    rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # extract observed AF from strelka counts. TIR = ALT; TAR = REF
        try:
            n_af = float(rec["S.1.TIR"][0]) / (
                float(rec["S.1.TIR"][0]) + float(rec["S.1.TAR"][0]))
        except Exception:
            n_af = 0

        try:
            t_af = float(rec["S.2.TIR"][0]) / (
                float(rec["S.2.TIR"][0]) + float(rec["S.2.TAR"][0]))
        except Exception:
            t_af = 0

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": n_af,
            "T_AF": t_af,
            "SGT": rec["I.SGT"],
            "tag": tag
        }

        for fd in fields:
            try:
                res = rec[fd["s"]]
                if "t" in fd:
                    res = fd["t"](res)
            except Exception:
                res = fd["def"]
            qrec[fd["n"]] = res

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except (TypeError, ValueError):
                        # failure to parse; the default below fills it in
                        pass
        except TypeError:
            # I.EVSF is not iterable (e.g. None): all E.* fields default to 0
            pass

        # .values() works on both Python 2 and 3, unlike .iteritems()
        for name in evs_featurenames.values():
            if "E." + name not in qrec:
                qrec["E." + name] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None, depths are read from the "##maxdepth_" /
                      "##meandepth_" / "##depth_" VCF header lines instead
    :return: pandas.DataFrame with one row per record yielded by vcfExtract;
             one extra "E.<name>" column is added for every name listed in the
             "##snv_scoring_features" header line
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER",
        "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR", "I.EVS", "I.EVSF",
        "I.SGT", "I.MQ", "I.MQ0", "I.SNVSB", "I.ReadPosRankSum",
        "S.1.SDP", "S.2.SDP",
        "S.1.FDP", "S.2.FDP",
        "S.1.DP", "S.2.DP",
        "S.1.AU", "S.2.AU",
        "S.1.CU", "S.2.CU",
        "S.1.GU", "S.2.GU",
        "S.1.TU", "S.2.TU"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER",
        "EVS", "VQSR",
        "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
        "N_AF", "T_AF",
        "MQ", "MQ0", "SNVSB", "ReadPosRankSum",
        "tag"
    ]

    vcfheaders = list(extractHeaders(vcfname))

    # index in I.EVSF -> scoring feature name
    evs_featurenames = {}
    for l in vcfheaders:
        if '##snv_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except Exception:
                logging.warning(
                    "Could not parse scoring feature names from Strelka output"
                )

    records = []

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                # everything after the first "_" is "<chromosome>=<depth>"
                chrom_and_depth = l.partition("_")[2]
                xl = str(chrom_and_depth).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    # Keys that already triggered a warning, so each is logged only once.
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # read VQSR value, if it's not present, set to -1 (old versions of Strelka)
        try:
            rec["I.VQSR"] = float(rec["I.VQSR"])
        except (KeyError, TypeError, ValueError):
            rec["I.VQSR"] = -1.0

        # read EVS value, if it's not present, set to -1 (old versions of Strelka)
        try:
            rec["I.EVS"] = float(rec["I.EVS"])
        except (KeyError, TypeError, ValueError):
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in [
                "I.QSS_NT", "I.MQ", "I.MQ0", "I.SNVSB", "I.ReadPosRankSum",
                "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP",
                "S.1.DP", "S.2.DP",
                "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
                "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
        ]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        rec["tag"] = tag

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        try:
            MQ = float(rec["I.MQ"])
        except (TypeError, ValueError):
            MQ = None

        try:
            MQ_ZERO = float(rec["I.MQ0"])
        except (TypeError, ValueError):
            MQ_ZERO = None

        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0

        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            try:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except Exception:
                # unknown chromosome or unusable depth: leave the rates at 0
                if rec["CHROM"] not in has_warned:
                    logging.warning("Cannot normalize depths on %s" %
                                    rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Ref and alt allele counts for tier1 and tier2, per sample prefix
        allele_ref = rec["REF"]
        alleles_alt = rec["ALT"]
        tier1_allele_rate = {}
        for prefix in ("S.1.", "S.2."):
            try:
                # a real list is needed because the counts are indexed below;
                # on Python 3 map() would return a lazy, unsubscriptable object
                ref_counts = [float(c) for c in rec[prefix + allele_ref + 'U']]
            except Exception:
                ref_counts = [0, 0]

            try:
                alt_counts = [0, 0]
                for a in alleles_alt:
                    for i in range(2):
                        alt_counts[i] += float(rec[prefix + a + 'U'][i])
            except Exception:
                alt_counts = [0, 0]

            # Compute the tier1 alt allele rate.
            if alt_counts[0] + ref_counts[0] == 0:
                tier1_allele_rate[prefix] = 0
            else:
                tier1_allele_rate[prefix] = alt_counts[0] / float(
                    alt_counts[0] + ref_counts[0])

        # both keys are always present: they are requested in `features`
        snvsb = rec["I.SNVSB"]
        rprs = rec["I.ReadPosRankSum"]

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "EVS": rec["I.EVS"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_AF": tier1_allele_rate["S.1."],
            "T_AF": tier1_allele_rate["S.2."],
            "MQ": MQ,
            "MQ0": MQ_ZERO,
            "SNVSB": snvsb,
            "ReadPosRankSum": rprs,
            "tag": tag
        }

        # ESF features
        try:
            for i, v in enumerate(rec["I.EVSF"]):
                if i in evs_featurenames:
                    try:
                        qrec["E." + evs_featurenames[i]] = float(v)
                    except (TypeError, ValueError):
                        # failure to parse; the default below fills it in
                        pass
        except TypeError:
            # I.EVSF is not iterable (e.g. None): all E.* fields default to 0
            pass

        # .values() works on both Python 2 and 3, unlike .iteritems()
        for name in evs_featurenames.values():
            if "E." + name not in qrec:
                qrec["E." + name] = 0

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractPiscesSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None, depths are read from the "##maxdepth_" /
                      "##meandepth_" / "##depth_" VCF header lines instead
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER",
        "I.DP", "I.EVS",
        "S.1.GT", "S.1.GQ", "S.1.AD", "S.1.DP", "S.1.VF",
        "S.1.NL", "S.1.SB", "S.1.NC", "S.1.AQ", "S.1.GQX"
    ]

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER",
        "GQX", "EVS", "T_DP", "T_DP_RATE", "T_AF",
        "tag"
    ]

    vcfheaders = list(extractHeaders(vcfname))

    # NOTE(review): the scoring feature names only add "E.*" columns here; no
    # per-record value is stored for them in this function.
    evs_featurenames = {}
    for l in vcfheaders:
        if '##snv_scoring_features' in l:
            try:
                xl = str(l).split('=', 1)
                xl = xl[1].split(",")
                for i, n in enumerate(xl):
                    evs_featurenames[i] = n
                    cols.append("E." + n)
                    logging.info("Scoring feature %i : %s" % (i, n))
            except Exception:
                logging.warning(
                    "Could not parse scoring feature names from Pisces output")

    records = []

    if not avg_depth:
        avg_depth = {}

        for l in vcfheaders:
            x = str(l).lower()
            x = x.replace("##meandepth_", "##maxdepth_")
            x = x.replace("##depth_", "##maxdepth_")
            if '##maxdepth_' in x:
                # everything after the first "_" is "<chromosome>=<depth>"
                chrom_and_depth = l.partition("_")[2]
                xl = str(chrom_and_depth).split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" %
                             (xchr, avg_depth[xchr]))

    # Keys that already triggered a warning, so each is logged only once.
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        # read VQSR value, if it's not present, set to -1 (old versions of Pisces)
        # "I.VQSR" is not requested in `features`, so this normally takes the
        # KeyError path.
        try:
            rec["I.VQSR"] = float(rec["I.VQSR"])
        except (KeyError, TypeError, ValueError):
            rec["I.VQSR"] = -1.0

        # read EVS value, if it's not present, set to -1 (old versions of Pisces)
        # NOTE(review): "I.SomaticEVS" is not requested in `features`, so the
        # first branch cannot trigger as written - confirm which key is wanted.
        evs_source = "I.SomaticEVS" if "I.SomaticEVS" in rec else "I.EVS"
        try:
            rec["I.EVS"] = float(rec[evs_source])
        except (KeyError, TypeError, ValueError):
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["S.1.NC", "S.1.AQ"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        rec["tag"] = tag

        t_DP = float(rec["S.1.DP"])
        t_VF = float(rec["S.1.VF"])
        GQX = float(rec["S.1.GQX"])

        t_DP_ratio = 0

        if avg_depth:
            try:
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            except Exception:
                # unknown chromosome or unusable depth: leave the rate at 0
                if rec["CHROM"] not in has_warned:
                    logging.warning("Cannot normalize depths on %s" %
                                    rec["CHROM"])
                    has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "GQX": GQX,
            "EVS": rec["I.EVS"],
            "T_DP": t_DP,
            "T_DP_RATE": t_DP_ratio,
            "T_AF": t_VF,
            "tag": tag
        }

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants; copied into the "tag" column of every row
    :param avg_depth: optional mapping from chromosome name to depth; when
                      empty or None, depths are read from the "##maxdepth_"
                      VCF header lines instead
    :return: pandas.DataFrame with one row per record yielded by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSI_NT",
                "I.SGT", "I.RC", "I.RU",
                "I.IC", "I.IHP",
                "I.MQ", "I.MQ0",
                "I.H200", "I.RC_HPOL_200",
                "I.RC_DINUC_200", "I.RC_TRIPLET_200",
                "S.1.DP", "S.2.DP",
                "S.1.TAR", "S.2.TAR",
                "S.1.TIR", "S.2.TIR",
                "S.1.TOR", "S.2.TOR",
                "S.1.DP50", "S.2.DP50",
                "S.1.FDP50", "S.2.FDP50",
                "S.1.SUBDP50", "S.2.SUBDP50"]

    # (output name, source feature, converter, default used when unusable)
    converted_fields = [
        ("RC", "I.RC", int, 0),
        ("RU", "I.RU", None, ""),
        ("RU_LEN", "I.RU", len, 0),
        ("IC", "I.IC", int, 0),
        ("IHP", "I.IHP", int, 0),
        ("S.1.FDP50", "S.1.FDP50", float, 0),
        ("S.2.FDP50", "S.2.FDP50", float, 0),
        ("S.1.SUBDP50", "S.1.SUBDP50", float, 0),
        ("S.2.SUBDP50", "S.2.SUBDP50", float, 0),
        ("MQ", "I.MQ", float, 0),
        ("MQ0", "I.MQ0", float, 0),
    ]

    records = []

    if not avg_depth:
        avg_depth = {}

        for l in extractHeaders(vcfname):
            x = str(l).lower()
            if '##maxdepth_' in x:
                xl = str(l).split('=')
                # skip the 11-character "##maxdepth_" prefix
                xchr = xl[0][11:]
                avg_depth[xchr] = float(xl[1])

    # Keys that already triggered a warning, so each is logged only once.
    has_warned = {}

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for i, ff in enumerate(features):
            rec[ff] = vr[i]

        rec["tag"] = tag

        # fix missing features
        for q in ["I.QSI_NT", "I.RC", "I.IC", "I.IHP",
                  "S.1.DP", "S.2.DP",
                  "I.H200", "I.RC_HPOL_200", "I.RC_DINUC_200", "I.RC_TRIPLET_200",
                  "S.1.FDP50", "S.2.FDP50",
                  "S.1.SUBDP50", "S.2.SUBDP50"]:
            if q not in rec or rec[q] is None:
                rec[q] = 0
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        for q in ["S.1.TAR", "S.2.TAR",
                  "S.1.TIR", "S.2.TIR",
                  "S.1.TOR", "S.2.TOR"]:
            if q not in rec or rec[q] is None:
                rec[q] = [0, 0]
                if ("feat:" + q) not in has_warned:
                    logging.warning("Missing feature %s" % q)
                    has_warned["feat:" + q] = True

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        # Per sample prefix and tier index (0 = tier1, 1 = tier2):
        #   tor_rate = TOR / (TIR + TAR + TOR)
        #   alt_rate = TIR / (TIR + TAR)
        tor_rate = {}
        alt_rate = {}
        for prefix in ("S.1.", "S.2."):
            tor = rec[prefix + "TOR"]
            # real lists are needed because the counts are indexed below; on
            # Python 3 map() would return a lazy, unsubscriptable object
            ref_counts = [float(c) for c in rec[prefix + "TAR"]]
            alt_counts = [float(c) for c in rec[prefix + "TIR"]]
            for tier in (0, 1):
                total = alt_counts[tier] + ref_counts[tier] + float(tor[tier])
                tor_rate[prefix, tier] = float(tor[tier]) / total if total != 0 else 0

                called = alt_counts[tier] + ref_counts[tier]
                if called == 0:
                    alt_rate[prefix, tier] = 0
                else:
                    alt_rate[prefix, tier] = alt_counts[tier] / float(called)

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # INDELTYPE bit 1: some ALT is longer than REF; bit 2: some ALT is not
        in_del = 0
        max_len = len(rec["REF"])
        min_len = len(rec["REF"])

        for a in rec["ALT"]:
            if len(a) > len(rec["REF"]):
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)

        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP / float(avg_depth[rec["CHROM"]])
            elif rec["CHROM"] not in has_warned:
                logging.warning("Cannot normalize depths on %s" % rec["CHROM"])
                has_warned[rec["CHROM"]] = True
        elif "DPnorm" not in has_warned:
            logging.warning("Cannot normalize depths.")
            has_warned["DPnorm"] = True

        # bcn = largest FDP50 / DP50 ratio over both samples; float() keeps
        # this a true division even for integer inputs on Python 2
        bcn = 0
        for prefix in ("S.1.", "S.2."):
            try:
                bcn = max(bcn, float(rec[prefix + "FDP50"]) / float(rec[prefix + "DP50"]))
            except (TypeError, ValueError, ZeroDivisionError):
                # missing or zero DP50: this sample does not contribute
                pass

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "LENGTHGT5": 0 if ilen <= 5 else 1,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_TOR_RATE_TIER1": tor_rate["S.1.", 0],
            "N_TOR_RATE_TIER2": tor_rate["S.1.", 1],
            "T_TOR_RATE_TIER1": tor_rate["S.2.", 0],
            "T_TOR_RATE_TIER2": tor_rate["S.2.", 1],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": alt_rate["S.2.", 0],
            "T_TIER2_ALT_RATE": alt_rate["S.2.", 1],
            "N_TIER1_ALT_RATE": alt_rate["S.1.", 0],
            "N_TIER2_ALT_RATE": alt_rate["S.1.", 1],
            "SGT": rec["I.SGT"],
            "entropy": rec["I.H200"],
            "hpol": rec["I.RC_HPOL_200"],
            "dinuc": rec["I.RC_DINUC_200"],
            "triplet": rec["I.RC_TRIPLET_200"],
            "bcn": bcn,
            "tag": tag
        }

        for name, source, convert, default in converted_fields:
            try:
                value = rec[source]
                if convert is not None:
                    value = convert(value)
            except (KeyError, TypeError, ValueError):
                value = default
            qrec[name] = value

        records.append(qrec)

    cols = ["CHROM", "POS", "REF", "ALT",
            "LENGTH", "LENGTHGT5", "INDELTYPE",
            "FILTER",
            "NT", "NT_REF", "QSI_NT",
            "N_TOR_RATE_TIER1", "T_TOR_RATE_TIER1",
            "N_DP", "T_DP",
            "N_DP_RATE", "T_DP_RATE",
            "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE",
            "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE",
            "SGT",
            "RC", "RU", "RU_LEN", "IC", "IHP",
            "S.1.FDP50", "S.1.SUBDP50",
            "MQ", "MQ0",
            "entropy", "hpol", "dinuc", "triplet", "bcn",
            "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    S.1.* sample fields feed the N_* output columns and S.2.* sample fields
    feed the T_* output columns.

    :param vcfname: name of the VCF file
    :param tag: value written to the "tag" column of every row
    :param avg_depth: optional dict of chromosome name -> depth used to compute
                      N_DP_RATE / T_DP_RATE; when empty or None the depths are
                      read from "##maxdepth_<chrom>=<depth>" header lines
                      (matched case-insensitively)
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSS_NT", "I.VQSR", "I.SGT",
                "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.SNVSB",
                "I.ReadPosRankSum",
                "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP",
                "S.1.DP", "S.2.DP",
                "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
                "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"]

    # Scalar fields that fall back to 0 when absent from a record.
    scalar_defaults = ["I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2",
                       "I.VQSR", "I.SNVSB", "I.ReadPosRankSum",
                       "S.1.SDP", "S.2.SDP", "S.1.FDP", "S.2.FDP",
                       "S.1.DP", "S.2.DP"]
    # Per-base count fields are indexed as [tier1, tier2] below, so their
    # fallback has to be a two-element list rather than a scalar.
    tier_count_defaults = ["S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
                           "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"]

    prefix = "##maxdepth_"
    if not avg_depth:
        avg_depth = {}
        for l in extractHeaders(vcfname):
            line = str(l)
            if prefix in line.lower():
                xl = line.split('=')
                # whatever follows the prefix in the key is the chromosome name
                avg_depth[xl[0][len(prefix):]] = float(xl[1])

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def tier_alt_rates(rec, sample):
        # Return [tier1, tier2] alt allele rates for one sample prefix.
        # A list comprehension (not map) keeps the result indexable on Python 3.
        ref_counts = [float(c) for c in rec[sample + rec["REF"] + "U"]]
        alt_counts = [0, 0]
        if rec["ALT"] != ['.']:
            for a in rec["ALT"]:
                for tier in range(2):
                    alt_counts[tier] += float(rec[sample + a + "U"][tier])
        rates = []
        for tier in range(2):
            total = alt_counts[tier] + ref_counts[tier]
            rates.append(alt_counts[tier] / float(total) if total != 0 else 0)
        return rates

    records = []
    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        for q in scalar_defaults:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)
        for q in tier_count_defaults:
            if rec.get(q) is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s" % q)

        rec["tag"] = tag

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        try:
            MQ = float(rec["I.MQ"])
        except (TypeError, ValueError):
            MQ = None
        try:
            MQ_ZERO = float(rec["I.MQ0"])
        except (TypeError, ValueError):
            MQ_ZERO = None

        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0
        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Ref and alt allele counts for tier1 and tier2
        t_tier1_allele_rate, t_tier2_allele_rate = tier_alt_rates(rec, "S.2.")
        n_tier1_allele_rate, n_tier2_allele_rate = tier_alt_rates(rec, "S.1.")

        # Gather the computed data into a dict
        records.append({
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": t_tier1_allele_rate,
            "T_TIER2_ALT_RATE": t_tier2_allele_rate,
            "N_TIER1_ALT_RATE": n_tier1_allele_rate,
            "N_TIER2_ALT_RATE": n_tier2_allele_rate,
            "MQ_SCORE": MQ,
            "MQ_ZERO_RATE": MQ_ZERO,
            # these keys always exist: missing values were replaced by 0 above
            "PNOISE": rec["I.PNOISE"],
            "PNOISE2": rec["I.PNOISE2"],
            "SNVSB": rec["I.SNVSB"],
            "ReadPosRankSum": rec["I.ReadPosRankSum"],
            "tag": tag
        })

    cols = ["CHROM", "POS", "REF", "ALT",
            "NT", "NT_REF", "QSS_NT", "FILTER", "VQSR",
            "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "T_TIER1_ALT_RATE", "T_TIER2_ALT_RATE",
            "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE",
            "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB",
            "ReadPosRankSum", "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    Sample fields are requested with the "S.1." (normal, N_* columns) and
    "S.2." (tumour, T_* columns) prefixes, matching the "S.<n>.<field>" keys
    used by the other extractors in this file.

    :param vcfname: name of the VCF file
    :param tag: value written to the "tag" column of every row
    :param avg_depth: optional dict of chromosome name -> depth used for
                      N_DP_RATE / T_DP_RATE; without it both rates stay 0
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    # Parsed header; not used yet because the sample order is fixed below.
    hdrs = extractHeadersJSON(vcfname)

    nsn = "NORMAL"
    tsn = "TUMOR"
    n_sample = "S.1."
    t_sample = "S.2."

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" %
                 (nsn, n_sample, tsn, t_sample))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.SSC", "I.GPV", "I.SPV",
                n_sample + "GT", t_sample + "GT",  # Genotype
                n_sample + "GQ", t_sample + "GQ",  # Genotype quality
                n_sample + "DP", t_sample + "DP",  # Read depth
                n_sample + "RD", t_sample + "RD",  # Reference depth
                n_sample + "AD", t_sample + "AD",  # Alternative depth
                n_sample + "FREQ", t_sample + "FREQ"  # Alt. frequence (FA in MuTect)
                ]

    genotype_features = [n_sample + "GT", t_sample + "GT"]
    numeric_features = [n_sample + "GT", t_sample + "GT",
                        n_sample + "GQ", t_sample + "GQ",
                        n_sample + "DP", t_sample + "DP",
                        n_sample + "AD", t_sample + "AD",
                        n_sample + "RD", t_sample + "RD",
                        n_sample + "FREQ", t_sample + "FREQ"]

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def alt_rate(alt_count, ref_count):
        # Fraction of alt reads among alt + ref reads; 0 when there are none.
        total = alt_count + ref_count
        return alt_count / float(total) if total != 0 else 0

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        for q in genotype_features:
            if rec.get(q) is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s" % q)

        # fix missing features
        # NOTE(review): GT is also in this list, so genotype strings that are
        # not plain integers end up as -1 -- confirm this is intended.
        for q in numeric_features:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)
            elif q.endswith("FREQ"):
                try:
                    rec[q] = float(rec[q])
                except (TypeError, ValueError):
                    rec[q] = float("NaN")
            else:
                try:
                    rec[q] = int(rec[q])
                except (TypeError, ValueError):
                    # non-scalar or non-numeric value: use the same sentinel
                    rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        no_alt = rec["ALT"] == ['.']
        n_allele_rate = alt_rate(0 if no_alt else rec[n_sample + "AD"],
                                 rec[n_sample + "RD"])
        t_allele_rate = alt_rate(0 if no_alt else rec[t_sample + "AD"],
                                 rec[t_sample + "RD"])

        # Gather the computed data into a dict
        records.append({
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_GQ": rec[n_sample + "GQ"],
            "T_GQ": rec[t_sample + "GQ"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_FA": rec[n_sample + "FREQ"],
            "T_FA": rec[t_sample + "FREQ"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        })

    cols = ["CHROM", "POS", "REF", "ALT", "FILTER",
            "SSC", "GPV", "SPV",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "N_GT", "T_GT", "N_GQ", "T_GQ",
            "N_AD", "T_AD", "N_FA", "T_FA",
            "N_ALT_RATE", "T_ALT_RATE",
            "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractPiscesSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None the depths are read from "##MaxDepth_", "##MeanDepth_"
                      or "##Depth_" header lines (matched case-insensitively)
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.DP", "I.EVS",
                "S.1.GT", "S.1.GQ", "S.1.AD", "S.1.DP", "S.1.VF",
                "S.1.NL", "S.1.SB", "S.1.NC", "S.1.AQ", "S.1.GQX"]

    cols = ["CHROM", "POS", "REF", "ALT", "FILTER",
            "GQX", "EVS", "T_DP", "T_DP_RATE", "T_AF", "tag"]

    vcfheaders = list(extractHeaders(vcfname))

    # One "E.<name>" column is declared per scoring feature named in the header.
    # Nothing below fills these columns, so they stay empty in the result.
    for l in vcfheaders:
        line = str(l)
        if '##snv_scoring_features' not in line:
            continue
        _, sep, names = line.partition('=')
        if not sep:
            logging.warning("Could not parse scoring feature names from Pisces output")
            continue
        for i, n in enumerate(names.split(",")):
            cols.append("E." + n)
            logging.info("Scoring feature %i : %s" % (i, n))

    if not avg_depth:
        avg_depth = {}
        depth_markers = ("##maxdepth_", "##meandepth_", "##depth_")
        for l in vcfheaders:
            line = str(l)
            lowered = line.lower()
            if any(marker in lowered for marker in depth_markers):
                # "<key>_<chrom>=<depth>": everything after the first "_" is
                # the chromosome name (which may itself contain underscores)
                _, _, rest = line.partition("_")
                xl = rest.split('=')
                xchr = xl[0]
                avg_depth[xchr] = float(xl[1])
                logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    records = []
    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # read EVS value, if it's not present, set to -1 (old versions of Pisces)
        # "I.SomaticEVS" is not part of `features`, so it is only picked up if
        # it gets added there.
        evs_raw = rec["I.SomaticEVS"] if "I.SomaticEVS" in rec else rec["I.EVS"]
        try:
            rec["I.EVS"] = float(evs_raw)
        except (TypeError, ValueError):
            rec["I.EVS"] = -1.0

        # fix missing features
        for q in ["S.1.NC", "S.1.AQ"]:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)

        rec["tag"] = tag

        t_DP = float(rec["S.1.DP"])
        t_VF = float(rec["S.1.VF"])
        GQX = float(rec["S.1.GQX"])

        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            try:
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            except (KeyError, TypeError, ValueError, ZeroDivisionError):
                # unknown chromosome or unusable depth value
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Gather the computed data into a dict
        records.append({
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "GQX": GQX,
            "EVS": rec["I.EVS"],
            "T_DP": t_DP,
            "T_DP_RATE": t_DP_ratio,
            "T_AF": t_VF,
            "tag": tag
        })

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    S.1.* sample fields feed the N_* output columns and S.2.* sample fields
    feed the T_* output columns. One extra "VQSR.<name>" column is added for
    every feature named in a "##vqsr_features" header line; its values are
    taken from the matching position of the I.ESF field (0 when absent).

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None the depths are read from "##MaxDepth_<chrom>=" /
                      "##MeanDepth_<chrom>=" header lines (case-insensitive)
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.NT", "I.SOMATIC", "I.QSI_NT", "I.EQSI", "I.ESF", "I.SGT",
                "I.RC", "I.RU", "I.IC", "I.IHP",
                "I.MQ", "I.MQ0",
                "S.1.DP", "S.2.DP",
                "S.1.TAR", "S.2.TAR",
                "S.1.TIR", "S.2.TIR",
                "S.1.TOR", "S.2.TOR",
                "S.1.AF", "S.2.AF",
                "S.1.OF", "S.2.OF",
                "S.1.SOR", "S.2.SOR",
                "S.1.FS", "S.2.FS",
                "S.1.BSA", "S.2.BSA",
                "S.1.RR", "S.2.RR",
                "S.1.BCN50", "S.2.BCN50",
                ]

    cols = ["CHROM", "POS", "REF", "ALT", "LENGTH", "INDELTYPE", "FILTER",
            "NT", "NT_REF", "VQSR", "EQSI", "QSI_NT",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "N_AF", "T_AF", "N_OF", "T_OF", "N_SOR", "T_SOR",
            "N_FS", "T_FS", "N_BSA", "T_BSA", "N_RR", "T_RR",
            "N_BCN", "T_BCN",
            "SGT", "RC", "RU", "RU_LEN", "IC", "IHP", "MQ", "MQ0", "tag"]

    scalar_defaults = ["I.QSI_NT", "I.RC", "I.IC", "I.IHP", "I.EQSI",
                       "S.1.DP", "S.2.DP",
                       "S.1.OF", "S.2.OF",
                       "S.1.RR", "S.2.RR",
                       "S.1.FS", "S.2.FS",
                       "S.1.BSA", "S.2.BSA",
                       "S.1.BCN50", "S.2.BCN50",
                       "S.1.AF", "S.2.AF"]
    pair_defaults = ["S.1.TAR", "S.2.TAR", "S.1.TIR", "S.2.TIR", "S.1.TOR", "S.2.TOR"]

    # (output column, source field, default, converter) -- the converter is
    # applied to the raw value; any conversion failure yields the default.
    # NOTE(review): "VQSR" is filled from I.EQSI, same as "EQSI" -- presumably
    # a legacy alias, confirm.
    field_specs = [
        ("EQSI", "I.EQSI", 0, float),
        ("VQSR", "I.EQSI", 0, float),
        ("RC", "I.RC", 0, int),
        ("RU", "I.RU", "", None),
        ("RU_LEN", "I.RU", 0, len),
        ("IC", "I.IC", 0, int),
        ("IHP", "I.IHP", 0, int),
        ("MQ", "I.MQ", 0.0, float),
        ("MQ0", "I.MQ0", 0.0, float),
        ("N_AF", "S.1.AF", 0.0, float),
        ("T_AF", "S.2.AF", 0.0, float),
        ("N_OF", "S.1.OF", 0.0, float),
        ("T_OF", "S.2.OF", 0.0, float),
        ("N_SOR", "S.1.SOR", 0.0, float),
        ("T_SOR", "S.2.SOR", 0.0, float),
        ("N_FS", "S.1.FS", 0.0, float),
        ("T_FS", "S.2.FS", 0.0, float),
        ("N_BSA", "S.1.BSA", 0.0, float),
        ("T_BSA", "S.2.BSA", 0.0, float),
        ("N_RR", "S.1.RR", 0.0, float),
        ("T_RR", "S.2.RR", 0.0, float),
        ("N_BCN", "S.1.BCN50", 0.0, float),
        ("T_BCN", "S.2.BCN50", 0.0, float),
    ]

    records = []

    vcfheaders = list(extractHeaders(vcfname))

    # Header format handled here: ##vqsr_features=<index>:<name>,<index>:<name>,...
    vqsr_featurenames = {}
    for l in vcfheaders:
        if '##vqsr_features' in l:
            try:
                xl = str(l).split('=', 1)
                for x in xl[1].split(","):
                    i, n = x.split(":", 1)
                    i = int(i)
                    vqsr_featurenames[i] = n
                    cols.append("VQSR." + n)
                    logging.info("VQSR feature %i : %s" % (i, n))
            except (IndexError, ValueError):
                logging.warning("Could not parse VQSR feature names from Strelka output")

    if not avg_depth:
        avg_depth = {}
        for l in vcfheaders:
            line = str(l)
            lowered = line.lower()
            # The two accepted keys have different lengths, so strip the one
            # that actually matched instead of a fixed number of characters.
            for prefix in ("##meandepth_", "##maxdepth_"):
                if lowered.startswith(prefix):
                    xl = line.split('=')
                    xchr = xl[0][len(prefix):]
                    avg_depth[xchr] = float(xl[1])
                    logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))
                    break

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}
        rec["tag"] = tag

        # fix missing features
        for q in scalar_defaults:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)
        for q in pair_defaults:
            if rec.get(q) is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s" % q)

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSI_NT = int(rec["I.QSI_NT"])

        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        # INDELTYPE is a bit mask: bit 1 is set when some ALT is longer than
        # REF, bit 2 when some ALT is not longer than REF.
        # LENGTH is the spread between the shortest and longest allele.
        ref_len = len(rec["REF"])
        in_del = 0
        max_len = ref_len
        min_len = ref_len
        for a in rec["ALT"]:
            if len(a) > ref_len:
                in_del |= 1
            else:
                in_del |= 2
            min_len = min(len(a), min_len)
            max_len = max(len(a), max_len)
        ilen = max_len - min_len

        n_DP_ratio = 0
        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Gather the computed data into a dict
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "LENGTH": ilen,
            "INDELTYPE": in_del,
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSI_NT": QSI_NT,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "SGT": rec["I.SGT"],
            "tag": tag
        }

        # fields with defaults
        for name, source, default, convert in field_specs:
            try:
                value = rec[source]
                if convert is not None:
                    value = convert(value)
            except (KeyError, TypeError, ValueError, OverflowError):
                value = default
            qrec[name] = value

        # VQSR features
        try:
            esf_values = list(rec["I.ESF"])
        except TypeError:
            # I.ESF missing (None) or not a sequence
            esf_values = []
        for i, v in enumerate(esf_values):
            if i in vqsr_featurenames:
                try:
                    qrec["VQSR." + vqsr_featurenames[i]] = float(v)
                except (TypeError, ValueError):
                    # failure to parse: the default below applies
                    pass

        for v in vqsr_featurenames.values():
            qrec.setdefault("VQSR." + v, 0)

        records.append(qrec)

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractStrelkaSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    S.1.* sample fields feed the N_* output columns and S.2.* sample fields
    feed the T_* output columns. T_AF duplicates T_TIER1_ALT_RATE.

    :param vcfname: name of the VCF file
    :param tag: type of variants
    :param avg_depth: average chromosome depths from BAM file; when empty or
                      None the depths are read from "##MaxDepth_<chrom>=" /
                      "##MeanDepth_<chrom>=" header lines (case-insensitive)
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    features = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "I.NT", "I.SOMATIC",
        "I.QSS_NT", "I.VQSR", "I.SGT", "I.MQ", "I.MQ0", "I.PNOISE",
        "I.PNOISE2", "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP",
        "S.1.FDP", "S.2.FDP", "S.1.DP", "S.2.DP", "S.1.AU", "S.2.AU",
        "S.1.CU", "S.2.CU", "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
    ]

    # Scalar fields that fall back to 0 when absent from a record.
    scalar_defaults = [
        "I.QSS_NT", "I.MQ", "I.MQ0", "I.PNOISE", "I.PNOISE2", "I.VQSR",
        "I.SNVSB", "I.ReadPosRankSum", "S.1.SDP", "S.2.SDP", "S.1.FDP",
        "S.2.FDP", "S.1.DP", "S.2.DP"
    ]
    # Per-base count fields are indexed as [tier1, tier2] below, so their
    # fallback has to be a two-element list rather than a scalar.
    tier_count_defaults = [
        "S.1.AU", "S.2.AU", "S.1.CU", "S.2.CU",
        "S.1.GU", "S.2.GU", "S.1.TU", "S.2.TU"
    ]

    records = []

    if not avg_depth:
        avg_depth = {}
        for l in extractHeaders(vcfname):
            line = str(l)
            lowered = line.lower()
            # The two accepted keys have different lengths, so strip the one
            # that actually matched instead of a fixed number of characters.
            for prefix in ("##meandepth_", "##maxdepth_"):
                if lowered.startswith(prefix):
                    xl = line.split('=')
                    xchr = xl[0][len(prefix):]
                    avg_depth[xchr] = float(xl[1])
                    logging.info("%s depth from VCF header is %f" % (xchr, avg_depth[xchr]))
                    break

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def tier_alt_rates(rec, sample):
        # Return [tier1, tier2] alt allele rates for one sample prefix.
        # A list comprehension (not map) keeps the result indexable on Python 3.
        ref_counts = [float(c) for c in rec[sample + rec["REF"] + "U"]]
        alt_counts = [0, 0]
        if rec["ALT"] != ['.']:
            for a in rec["ALT"]:
                for tier in range(2):
                    alt_counts[tier] += float(rec[sample + a + "U"][tier])
        rates = []
        for tier in range(2):
            total = alt_counts[tier] + ref_counts[tier]
            rates.append(alt_counts[tier] / float(total) if total != 0 else 0)
        return rates

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        # fix missing features
        for q in scalar_defaults:
            if rec.get(q) is None:
                rec[q] = 0
                warn_once("feat:" + q, "Missing feature %s" % q)
        for q in tier_count_defaults:
            if rec.get(q) is None:
                rec[q] = [0, 0]
                warn_once("feat:" + q, "Missing feature %s" % q)

        rec["tag"] = tag

        NT = rec["I.NT"]
        NT_is_ref = int(NT == "ref")
        QSS_NT = int(rec["I.QSS_NT"])

        try:
            MQ = float(rec["I.MQ"])
        except (TypeError, ValueError):
            MQ = None
        try:
            MQ_ZERO = float(rec["I.MQ0"])
        except (TypeError, ValueError):
            MQ_ZERO = None

        n_FDP = float(rec["S.1.FDP"])
        t_FDP = float(rec["S.2.FDP"])
        n_SDP = float(rec["S.1.SDP"])
        t_SDP = float(rec["S.2.SDP"])
        n_DP = float(rec["S.1.DP"])
        t_DP = float(rec["S.2.DP"])

        n_FDP_ratio = n_FDP / n_DP if n_DP != 0 else 0
        t_FDP_ratio = t_FDP / t_DP if t_DP != 0 else 0
        n_SDP_ratio = n_SDP / (n_DP + n_SDP) if (n_DP + n_SDP) != 0 else 0
        t_SDP_ratio = t_SDP / (t_DP + t_SDP) if (t_DP + t_SDP) != 0 else 0

        n_DP_ratio = 0
        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        # Ref and alt allele counts for tier1 and tier2
        t_tier1_allele_rate, t_tier2_allele_rate = tier_alt_rates(rec, "S.2.")
        n_tier1_allele_rate, n_tier2_allele_rate = tier_alt_rates(rec, "S.1.")

        # Gather the computed data into a dict
        records.append({
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "NT": NT,
            "NT_REF": NT_is_ref,
            "QSS_NT": QSS_NT,
            "VQSR": rec["I.VQSR"],
            "N_FDP_RATE": n_FDP_ratio,
            "T_FDP_RATE": t_FDP_ratio,
            "N_SDP_RATE": n_SDP_ratio,
            "T_SDP_RATE": t_SDP_ratio,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "T_TIER1_ALT_RATE": t_tier1_allele_rate,
            "T_AF": t_tier1_allele_rate,
            "T_TIER2_ALT_RATE": t_tier2_allele_rate,
            "N_TIER1_ALT_RATE": n_tier1_allele_rate,
            "N_TIER2_ALT_RATE": n_tier2_allele_rate,
            "MQ_SCORE": MQ,
            "MQ_ZERO_RATE": MQ_ZERO,
            # these keys always exist: missing values were replaced by 0 above
            "PNOISE": rec["I.PNOISE"],
            "PNOISE2": rec["I.PNOISE2"],
            "SNVSB": rec["I.SNVSB"],
            "ReadPosRankSum": rec["I.ReadPosRankSum"],
            "tag": tag
        })

    cols = [
        "CHROM", "POS", "REF", "ALT", "NT", "NT_REF", "QSS_NT", "FILTER",
        "VQSR", "N_FDP_RATE", "T_FDP_RATE", "N_SDP_RATE", "T_SDP_RATE",
        "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE", "T_TIER1_ALT_RATE",
        "T_TIER2_ALT_RATE", "N_TIER1_ALT_RATE", "N_TIER2_ALT_RATE", "T_AF",
        "MQ_SCORE", "MQ_ZERO_RATE", "PNOISE", "PNOISE2", "SNVSB",
        "ReadPosRankSum", "tag"
    ]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def extractVarscan2SNVFeatures(vcfname, tag, avg_depth=None):
    """ Build a data frame of per-record features from a VCF, one row per record

    Normal-sample fields are requested with the "S.1." prefix and end up in
    the N_* columns; tumour-sample fields use "S.2." and end up in the T_*
    columns.

    :param vcfname: name of the VCF file
    :param tag: value written to the "tag" column of every row
    :param avg_depth: optional dict of chromosome name -> depth used for
                      N_DP_RATE / T_DP_RATE; without it both rates stay 0
    :return: pandas.DataFrame with the columns listed in `cols`
    """
    records = []
    if not avg_depth:
        logging.warn(
            "No average depths available, normalized depth features cannot be calculated"
        )

    hdrs = extractHeadersJSON(vcfname)

    # TODO could figure this out automatically
    nsn, tsn = "NORMAL", "TUMOR"
    n_sample, t_sample = "S.1.", "S.2."

    logging.info(
        "Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)"
        % (nsn, n_sample, tsn, t_sample))

    def both(field):
        # Normal and tumour key for one FORMAT field, normal first.
        return [n_sample + field, t_sample + field]

    features = (["CHROM", "POS", "REF", "ALT", "FILTER", "I.SSC", "I.GPV", "I.SPV"]
                + both("GT")      # Genotype
                + both("GQ")      # Genotype quality
                + both("DP")      # Read depth
                + both("RD")      # Reference depth
                + both("AD")      # Alternative depth
                + both("FREQ"))   # Alt. frequence (FA in MuTect)

    genotype_keys = both("GT")
    coerced_keys = (both("GT") + both("GQ") + both("DP")
                    + both("AD") + both("RD") + both("FREQ"))

    has_warned = {}

    def report_missing(key):
        # Warn about an absent feature the first time it is seen.
        marker = "feat:" + key
        if marker not in has_warned:
            logging.warn("Missing feature %s" % key)
            has_warned[marker] = True

    def alt_fraction(alt_count, ref_count):
        # alt / (alt + ref), or 0 when no reads were counted.
        depth = alt_count + ref_count
        if depth == 0:
            return 0
        return alt_count / float(depth)

    for vr in vcfExtract(vcfname, features):
        rec = {}
        for idx, key in enumerate(features):
            rec[key] = vr[idx]

        for key in genotype_keys:
            if rec.get(key) is None:
                rec[key] = "."
                report_missing(key)

        # fix missing features
        # NOTE(review): the genotype keys are coerced with int() here as well,
        # so non-integer genotype strings become -1 -- confirm intended.
        for key in coerced_keys:
            raw = rec.get(key)
            if raw is None:
                rec[key] = 0
                report_missing(key)
                continue
            if key.endswith("FREQ"):
                try:
                    rec[key] = float(raw)
                except ValueError:
                    rec[key] = float("NaN")
                continue
            try:
                rec[key] = int(raw)
            except ValueError:
                rec[key] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio, t_DP_ratio = 0, 0
        chrom = rec["CHROM"]
        if not avg_depth:
            if "DPnorm" not in has_warned:
                logging.warn("Cannot normalize depths.")
                has_warned["DPnorm"] = True
        elif chrom in avg_depth:
            n_DP_ratio = n_DP / float(avg_depth[chrom])
            t_DP_ratio = t_DP / float(avg_depth[chrom])
        elif chrom not in has_warned:
            logging.warn("Cannot normalize depths on %s" % chrom)
            has_warned[chrom] = True

        has_alt = rec["ALT"] != ['.']
        n_allele_rate = alt_fraction(
            rec[n_sample + "AD"] if has_alt else 0, rec[n_sample + "RD"])
        t_allele_rate = alt_fraction(
            rec[t_sample + "AD"] if has_alt else 0, rec[t_sample + "RD"])

        # Gather the computed data into a dict
        qrec = {
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "SSC": rec["I.SSC"],
            "GPV": rec["I.GPV"],
            "SPV": rec["I.SPV"],
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            "tag": tag
        }
        # Per-sample fields copied straight from the (coerced) record.
        for column, field in (("GT", "GT"), ("GQ", "GQ"), ("AD", "AD"), ("FA", "FREQ")):
            qrec["N_" + column] = rec[n_sample + field]
            qrec["T_" + column] = rec[t_sample + field]
        records.append(qrec)

    cols = [
        "CHROM", "POS", "REF", "ALT", "FILTER", "SSC", "GPV", "SPV", "N_DP",
        "T_DP", "N_DP_RATE", "T_DP_RATE", "N_GT", "T_GT", "N_GQ", "T_GQ",
        "N_AD", "T_AD", "N_FA", "T_FA", "N_ALT_RATE", "T_ALT_RATE", "tag"
    ]

    if not records:
        return pandas.DataFrame(columns=cols)
    return pandas.DataFrame(records, columns=cols)
def extractMutectSNVFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with features collected from the given VCF, tagged by given type

    The tumour / normal sample prefixes default to "S.1." / "S.2." and are
    re-assigned when a GATKCommandLine header entry with ID "mutect" names the
    samples through tumor_sample_name= / normal_sample_name=.

    :param vcfname: name of the VCF file
    :param tag: value written to the "tag" column of every row
    :param avg_depth: optional dict of chromosome name -> depth used for
                      N_DP_RATE / T_DP_RATE; without it both rates stay 0
    :return: pandas.DataFrame with one row per record returned by vcfExtract
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i + 1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i + 1)
                            break
    except (AttributeError, IndexError, KeyError, TypeError):
        # Header does not have the expected layout: keep the default prefixes.
        logging.warning("Unable to detect tumour / normal sample order from VCF header")

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" %
                 (nsn, n_sample, tsn, t_sample))

    features = ["CHROM", "POS", "REF", "ALT", "FILTER",
                "I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                n_sample + "GT", t_sample + "GT",
                n_sample + "DP", t_sample + "DP",
                n_sample + "QSS", t_sample + "QSS",
                n_sample + "AD", t_sample + "AD"]

    genotype_features = [n_sample + "GT", t_sample + "GT"]
    checked_features = ["I.DB", "I.TLOD", "I.NLOD", "I.ECNT",
                        "I.HCNT", "I.MAX_ED", "I.MIN_ED",
                        n_sample + "GT", t_sample + "GT",
                        n_sample + "DP", t_sample + "DP",
                        n_sample + "QSS", t_sample + "QSS",
                        n_sample + "AD", t_sample + "AD"]
    # Fractional scores: parsed with float() so they are neither truncated nor
    # rejected by the integer conversion used for the other scalar fields.
    float_features = ("I.TLOD", "I.NLOD")

    has_warned = {}

    def warn_once(key, message):
        # Emit each distinct warning only once per call of the extractor.
        if key not in has_warned:
            logging.warning(message)
            has_warned[key] = True

    def alt_rate(allele_depths, alleles_alt):
        # allele_depths is [ref, alt1, alt2, ...]; return alt / (alt + ref),
        # or 0 when there are no reads.
        ref_count = allele_depths[0]
        alt_count = 0
        if alleles_alt != ['.']:
            for a in range(len(alleles_alt)):
                alt_count += float(allele_depths[a + 1])
        total = alt_count + ref_count
        return alt_count / float(total) if total != 0 else 0

    for vr in vcfExtract(vcfname, features):
        rec = {ff: vr[i] for i, ff in enumerate(features)}

        for q in genotype_features:
            if rec.get(q) is None:
                rec[q] = "."
                warn_once("feat:" + q, "Missing feature %s" % q)

        # list features hold one value for REF plus one per ALT allele
        n_values = 1 + len(rec["ALT"])

        # fix missing features
        # NOTE(review): GT is also in this list, so genotype strings that are
        # not plain integers end up as -1 -- confirm this is intended.
        for q in checked_features:
            is_list_feature = q.endswith("AD") or q.endswith("QSS")
            if rec.get(q) is None:
                # list features are indexed further down, so they need a
                # zero-filled list rather than a scalar default
                rec[q] = [0] * n_values if is_list_feature else 0
                warn_once("feat:" + q, "Missing feature %s" % q)
            elif is_list_feature:
                if not isinstance(rec[q], list):
                    warn_once(q + "_PARSE_FAIL", "Cannot parse %s: %s" % (q, str(rec[q])))
                    rec[q] = [0] * n_values
                for xx in range(n_values):
                    if len(rec[q]) <= xx:
                        rec[q].append(0)
                    else:
                        try:
                            rec[q][xx] = float(rec[q][xx])
                        except (TypeError, ValueError):
                            rec[q][xx] = 0
            elif q in float_features:
                try:
                    rec[q] = float(rec[q])
                except (TypeError, ValueError):
                    rec[q] = -1
            else:
                try:
                    rec[q] = int(rec[q])
                except (TypeError, ValueError):
                    rec[q] = -1

        rec["tag"] = tag
        TLOD = float(rec["I.TLOD"])
        NLOD = float(rec["I.NLOD"])

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0
        chrom = rec["CHROM"]
        if avg_depth:
            if chrom in avg_depth:
                n_DP_ratio = n_DP / float(avg_depth[chrom])
                t_DP_ratio = t_DP / float(avg_depth[chrom])
            else:
                warn_once(chrom, "Cannot normalize depths on %s" % chrom)
        else:
            warn_once("DPnorm", "Cannot normalize depths.")

        n_allele_rate = alt_rate(rec[n_sample + "AD"], rec["ALT"])
        t_allele_rate = alt_rate(rec[t_sample + "AD"], rec["ALT"])

        # Gather the computed data into a dict
        records.append({
            "CHROM": chrom,
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "DBSNP": rec["I.DB"],
            "TLOD": TLOD,
            "NLOD": NLOD,
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_QSS": rec[n_sample + "QSS"],
            "T_QSS": rec[t_sample + "QSS"],
            "N_AF": n_allele_rate,
            "T_AF": t_allele_rate,
            "ECNT": rec["I.ECNT"],
            "HCNT": rec["I.HCNT"],
            "MAX_ED": rec["I.MAX_ED"],
            "MIN_ED": rec["I.MIN_ED"],
            "tag": tag
        })

    # ECNT, HCNT, MAX_ED and MIN_ED are collected above but not listed here,
    # so they are left out of the returned frame.
    cols = ["CHROM", "POS", "REF", "ALT", "FILTER",
            "TLOD", "NLOD", "DBSNP",
            "N_DP", "T_DP", "N_DP_RATE", "T_DP_RATE",
            "N_GT", "T_GT", "N_AD", "T_AD",
            "N_QSS", "T_QSS", "N_AF", "T_AF",
            "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df
def _indelWarnOnce(seen, key, message, *args):
    """Log `message` as a warning unless `key` was already recorded in `seen`."""
    if key not in seen:
        logging.warning(message, *args)
        seen[key] = True


def _indelCoerceFloats(values, length):
    """Convert the first `length` entries of `values` to float, in place.

    Entries that are absent are appended as -1; entries that cannot be
    converted (bad string, None) are replaced by -1.
    """
    for idx in range(length):
        if len(values) <= idx:
            values.append(-1)
        else:
            try:
                values[idx] = float(values[idx])
            except (ValueError, TypeError):
                values[idx] = -1


def _indelAltAlleleRate(allele_depths, alleles_alt):
    """Return alt / (ref + alt) read support computed from an AD list.

    The first AD entry is taken as the reference count, all following
    entries are summed as alt support. Returns 0 when AD is not a list
    (the feature was missing), when ALT is ['.'] or when there is no
    support at all.
    """
    if not isinstance(allele_depths, list) or not allele_depths:
        return 0
    ref_count = allele_depths[0]
    if alleles_alt == ['.']:
        alt_count = 0
    else:
        alt_count = sum(float(depth) for depth in allele_depths[1:])
    total = alt_count + ref_count
    if total == 0:
        return 0
    return alt_count / float(total)


def extractMutectIndelFeatures(vcfname, tag, avg_depth=None):
    """ Return a data frame with indel features collected from the given VCF,
    tagged by given type

    :param vcfname: VCF file name, handed to extractHeadersJSON and vcfExtract
    :param tag: value stored in the "tag" column of every row
    :param avg_depth: optional mapping from CHROM to average depth; when a
                      record's CHROM is present, N_DP_RATE / T_DP_RATE are
                      DP divided by that value, otherwise they stay 0
    :return: pandas.DataFrame with one row per record returned by vcfExtract
             (empty, but with the same columns, if there are no records)
    """
    records = []

    if not avg_depth:
        logging.warning("No average depths available, normalized depth features cannot be calculated")

    hdrs = extractHeadersJSON(vcfname)

    tsn = ""
    nsn = ""

    # defaults used when the sample order cannot be read from the header
    t_sample = "S.1."
    n_sample = "S.2."

    try:
        samples = hdrs["samples"]
        for f in hdrs["fields"]:
            if f["key"] == "GATKCommandLine" and f["values"]["ID"].lower() == "mutect":
                clopts = f["values"]["CommandLineOptions"]
                # ... tumor_sample_name=HCC2218_tumour ... normal_sample_name=HCC2218_normal
                m = re.search(r"tumor_sample_name=([^\s]+)", clopts)
                if m:
                    tsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == tsn:
                            t_sample = "S.%i." % (i+1)
                            break
                m = re.search(r"normal_sample_name=([^\s]+)", clopts)
                if m:
                    nsn = m.group(1)
                    for i, x in enumerate(samples):
                        if x == nsn:
                            n_sample = "S.%i." % (i+1)
                            break
    except Exception:
        # best effort: keep whatever prefixes were determined so far
        logging.warning("Unable to detect tumour / normal sample order from VCF header")

    logging.info("Normal sample name : %s (prefix %s) / tumour sample name : %s (prefix %s)" % (nsn, n_sample,
                                                                                                tsn, t_sample))

    has_warned = {}

    ##FORMAT=<ID=MM,Number=2,Type=Float,Description="Average # of mismatches per ref-/consensus indel-supporting read">
    ##FORMAT=<ID=MQS,Number=2,Type=Float,Description="Average mapping qualities of ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSBQ,Number=2,Type=Float,Description="Within NQS window: average quality of bases in ref-/consensus indel-supporting reads">
    ##FORMAT=<ID=NQSMM,Number=2,Type=Float,Description="Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads">
    ##FORMAT=<ID=REnd,Number=2,Type=Integer,Description="Median/mad of indel offsets from the ends of the reads">
    ##FORMAT=<ID=RStart,Number=2,Type=Integer,Description="Median/mad of indel offsets from the starts of the reads">
    ##FORMAT=<ID=SC,Number=4,Type=Integer,Description="Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)">

    # per-sample FORMAT keys, each requested for normal first, then tumour
    format_keys = ["GT", "DP", "AD", "MM", "MQS", "NQSBQ", "NQSMM", "RStart", "REnd", "SC"]
    sample_features = []
    for key in format_keys:
        sample_features.append(n_sample + key)
        sample_features.append(t_sample + key)

    features = ["CHROM", "POS", "REF", "ALT", "FILTER"] + sample_features

    # FORMAT keys that are parsed as two-element float lists
    pair_suffixes = ("AD", "MM", "MQS", "NQSBQ", "NQSMM", "REnd", "RStart")

    for vr in vcfExtract(vcfname, features):
        rec = dict(zip(features, vr))

        for q in (n_sample + "GT", t_sample + "GT"):
            if rec.get(q) is None:
                rec[q] = "."
                _indelWarnOnce(has_warned, "feat:" + q, "Missing feature %s", q)

        # fix missing features
        for q in sample_features:
            if rec.get(q) is None:
                rec[q] = 0
                _indelWarnOnce(has_warned, "feat:" + q, "Missing feature %s", q)
            elif q.endswith(pair_suffixes):
                if not isinstance(rec[q], list):
                    _indelWarnOnce(has_warned, q + "_PARSE_FAIL",
                                   "Cannot parse %s: %s", q, str(rec[q]))
                    rec[q] = [-1, -1]
                _indelCoerceFloats(rec[q], 2)
            elif q.endswith("SC"):
                if not isinstance(rec[q], list):
                    _indelWarnOnce(has_warned, q + "_PARSE_FAIL",
                                   "Cannot parse %s: %s", q, str(rec[q]))
                    rec[q] = [-1, -1, -1, -1]
                else:
                    _indelCoerceFloats(rec[q], 4)
            else:
                # NOTE(review): GT also lands here, so a genotype string such
                # as "0/1" ends up as -1 -- confirm this is intended.
                try:
                    rec[q] = int(rec[q])
                except (ValueError, TypeError):
                    rec[q] = -1

        rec["tag"] = tag

        n_DP = float(rec[n_sample + "DP"])
        t_DP = float(rec[t_sample + "DP"])

        n_DP_ratio = 0
        t_DP_ratio = 0

        if avg_depth:
            if rec["CHROM"] in avg_depth:
                n_DP_ratio = n_DP/float(avg_depth[rec["CHROM"]])
                t_DP_ratio = t_DP/float(avg_depth[rec["CHROM"]])
            else:
                _indelWarnOnce(has_warned, rec["CHROM"],
                               "Cannot normalize depths on %s", rec["CHROM"])
        else:
            _indelWarnOnce(has_warned, "DPnorm", "Cannot normalize depths.")

        n_allele_rate = _indelAltAlleleRate(rec[n_sample + "AD"], rec["ALT"])
        t_allele_rate = _indelAltAlleleRate(rec[t_sample + "AD"], rec["ALT"])

        # Gather the computed data into a dict
        qrec = {
            "CHROM": rec["CHROM"],
            "POS": int(rec["POS"]),
            "REF": rec["REF"],
            "ALT": ",".join(rec["ALT"]),
            "FILTER": ",".join(rec["FILTER"]),
            "N_DP": n_DP,
            "T_DP": t_DP,
            "N_DP_RATE": n_DP_ratio,
            "T_DP_RATE": t_DP_ratio,
            "N_GT": rec[n_sample + "GT"],
            "T_GT": rec[t_sample + "GT"],
            "N_AD": rec[n_sample + "AD"],
            "T_AD": rec[t_sample + "AD"],
            "N_ALT_RATE": n_allele_rate,
            "T_ALT_RATE": t_allele_rate,
            # store the parsed values, not the feature names
            "N_MM": rec[n_sample + "MM"],
            "T_MM": rec[t_sample + "MM"],
            "N_MQS": rec[n_sample + "MQS"],
            "T_MQS": rec[t_sample + "MQS"],
            "N_NQSBQ": rec[n_sample + "NQSBQ"],
            "T_NQSBQ": rec[t_sample + "NQSBQ"],
            "N_NQSMM": rec[n_sample + "NQSMM"],
            "T_NQSMM": rec[t_sample + "NQSMM"],
            "N_RStart": rec[n_sample + "RStart"],
            "T_RStart": rec[t_sample + "RStart"],
            "N_REnd": rec[n_sample + "REnd"],
            "T_REnd": rec[t_sample + "REnd"],
            "N_SC": rec[n_sample + "SC"],
            "T_SC": rec[t_sample + "SC"],
            "tag": tag
        }
        records.append(qrec)

    # "DBSNP" is never filled in for indels; the column is kept so the
    # returned frame has the same columns as before.
    cols = [
        "CHROM",
        "POS",
        "REF",
        "ALT",
        "FILTER",
        "DBSNP",
        "N_DP",
        "T_DP",
        "N_DP_RATE",
        "T_DP_RATE",
        "N_GT",
        "T_GT",
        "N_AD",
        "T_AD",
        "N_ALT_RATE",
        "T_ALT_RATE",
        "N_MM",
        "T_MM",
        "N_MQS",
        "T_MQS",
        "N_NQSBQ",
        "T_NQSBQ",
        "N_NQSMM",
        "T_NQSMM",
        "N_RStart",
        "T_RStart",
        "N_REnd",
        "T_REnd",
        "N_SC",
        "T_SC",
        "tag"]

    if records:
        df = pandas.DataFrame(records, columns=cols)
    else:
        df = pandas.DataFrame(columns=cols)

    return df