def collectCore(self, vcfname, headerKey=None):
    """
    Return a data frame with features collected from the given VCF

    If headerKey is provided, then use this header value to extract labels
    for the INFO EVSF feature tag

    Arguments:
        vcfname   -- VCF file name; handed to openMaybeGzip, whose result is
                     iterated line by line
        headerKey -- name of the "##<key>=<label>,<label>,..." header line
                     holding the feature labels, or None to collect only the
                     core columns

    Returns a pandas.DataFrame with one row per variant line. The columns are
    CHROM, POS, REF and ALT, followed by the header feature labels when
    headerKey is given.

    Raises:
        AssertionError -- the header key occurs more than once, the header
                          ends without the key having been seen, or an EVSF
                          INFO entry has no value
        ValueError     -- headerKey was given but never found, and the file
                          has no variant lines that would have triggered the
                          assertion above
        IndexError     -- an EVSF entry carries more values than there are
                          header feature labels
    """
    core_labels = ["CHROM", "POS", "REF", "ALT"]
    header_feature_labels = None
    records = []
    isHeader = True
    isHeaderKey = headerKey is not None

    for line in openMaybeGzip(vcfname):
        if isHeader:
            if line.startswith("#"):
                if isHeaderKey and line.startswith("##"):
                    # Split on the first '=' only, so a label list that itself
                    # contains '=' is not truncated (same as the INFO parsing
                    # below).
                    word = line[2:].strip().split("=", 1)
                    if word[0] == headerKey:
                        assert header_feature_labels is None
                        header_feature_labels = word[1].split(",")
                        assert len(header_feature_labels) > 0
                continue

            # First non-header line: the labels must be known from here on.
            if isHeaderKey:
                assert header_feature_labels is not None
            isHeader = False

        word = line.strip().split('\t')
        qrec = {
            "CHROM": word[VCFID.CHROM],
            "POS": int(word[VCFID.POS]),
            "REF": word[VCFID.REF],
            "ALT": word[VCFID.ALT],
        }

        if isHeaderKey:
            for ikv in word[VCFID.INFO].split(';'):
                iword = ikv.split("=", 1)
                if iword[0] != "EVSF":
                    continue
                assert len(iword) == 2
                # Skip indels: any ALT string longer than one character gets
                # no feature values. The record itself is still appended below.
                # NOTE(review): confirm that keeping the feature-less record
                # (rather than dropping it) is intended.
                if len(word[VCFID.ALT]) > 1:
                    continue
                features = [float(f) for f in iword[1].split(',')]
                # Fewer values than labels is tolerated; the remaining labels
                # are simply left unset for this record.
                for i, value in enumerate(features):
                    qrec[header_feature_labels[i]] = value

        records.append(qrec)

    if not isHeaderKey:
        return pandas.DataFrame(records, columns=core_labels)

    # Reached with labels still unset only when the file holds no variant
    # lines; fail with a clear message instead of a TypeError on list + None.
    if header_feature_labels is None:
        raise ValueError("Header key '%s' not found in VCF header" % headerKey)

    # Build a new list rather than extending core_labels in place.
    return pandas.DataFrame(records, columns=core_labels + header_feature_labels)
def collectCore(self, vcfname, headerKey=None):
    """
    Return a data frame with features collected from the given VCF

    If headerKey is provided, then use this header value to extract labels
    for the INFO EVSF feature tag

    Arguments:
        vcfname   -- VCF file name; handed to openMaybeGzip, whose result is
                     iterated line by line
        headerKey -- None to collect only the core columns, otherwise
                     "snv_scoring_features" or "indel_scoring_features": the
                     name of the "##<key>=<label>,<label>,..." header line
                     holding the feature labels. The key also selects which
                     variant type is kept; variants of the other type are
                     dropped.

    Returns a pandas.DataFrame with one row per retained variant line. The
    columns are CHROM, POS, REF and ALT, followed by the header feature
    labels when headerKey is given.

    Raises:
        Exception      -- headerKey is not one of the two known keys
        AssertionError -- the header key occurs more than once, the header
                          ends without the key having been seen, or an EVSF
                          INFO entry has no value
        ValueError     -- headerKey was given but never found, and the file
                          has no variant lines that would have triggered the
                          assertion above
        IndexError     -- an EVSF entry carries more values than there are
                          header feature labels
    """

    def isNucleotide(nucString):
        """
        Return True if nucString is a single nucleotide, False otherwise.
        """
        return nucString in ("A", "C", "G", "T", "N")

    def variantType(ref, alt):
        """
        Return 'snv' if ref and all alt alleles are single nucleotides,
        otherwise return 'indel'
        """
        if isNucleotide(ref) and all(isNucleotide(allele) for allele in alt.split(',')):
            return "snv"
        return "indel"

    def processVariant(line, keyType, header_feature_labels):
        """
        Return a record with collected features for this variant or None if
        the variant type does not match the key type.

        The variant type is only checked, and EVSF values only collected,
        when header_feature_labels is not None.
        """
        word = line.strip().split('\t')
        qrec = {
            "CHROM": word[VCFID.CHROM],
            "POS": int(word[VCFID.POS]),
            "REF": word[VCFID.REF],
            "ALT": word[VCFID.ALT],
        }

        if header_feature_labels is None:
            return qrec

        if variantType(word[VCFID.REF], word[VCFID.ALT]) != keyType:
            return None

        for ikv in word[VCFID.INFO].split(';'):
            iword = ikv.split("=", 1)
            if iword[0] != "EVSF":
                continue
            assert len(iword) == 2
            features = [float(f) for f in iword[1].split(',')]
            # Fewer values than labels is tolerated; the remaining labels are
            # simply left unset for this record.
            for i, value in enumerate(features):
                qrec[header_feature_labels[i]] = value

        return qrec

    core_labels = ["CHROM", "POS", "REF", "ALT"]
    header_feature_labels = None
    records = []
    isHeader = True
    isHeaderKey = headerKey is not None

    # The header key doubles as the selector for the variant type to keep.
    if not isHeaderKey:
        keyType = None
    elif headerKey == "snv_scoring_features":
        keyType = "snv"
    elif headerKey == "indel_scoring_features":
        keyType = "indel"
    else:
        raise Exception("Unknown header key: '%s'" % headerKey)

    for line in openMaybeGzip(vcfname):
        if isHeader:
            if line.startswith("#"):
                if isHeaderKey and line.startswith("##"):
                    # Split on the first '=' only, so a label list that itself
                    # contains '=' is not truncated (same as the INFO parsing
                    # in processVariant).
                    word = line[2:].strip().split("=", 1)
                    if word[0] == headerKey:
                        assert header_feature_labels is None
                        header_feature_labels = word[1].split(",")
                        assert len(header_feature_labels) > 0
                continue

            # First non-header line: the labels must be known from here on.
            if isHeaderKey:
                assert header_feature_labels is not None
            isHeader = False

        qrec = processVariant(line, keyType, header_feature_labels)
        if qrec is not None:
            records.append(qrec)

    if not isHeaderKey:
        return pandas.DataFrame(records, columns=core_labels)

    # Reached with labels still unset only when the file holds no variant
    # lines; fail with a clear message instead of a TypeError on list + None.
    if header_feature_labels is None:
        raise ValueError("Header key '%s' not found in VCF header" % headerKey)

    # Build a new list rather than extending core_labels in place.
    return pandas.DataFrame(records, columns=core_labels + header_feature_labels)