def __init__(self, columns, chrColumn, posColumn, idColumn=None):
    """Build a record from one row that has already been split into columns.

    columns   -- the column values of the row; stored as-is on self.columns
    chrColumn -- index into columns of the chromosome value, which is passed
                 through standardizeChromosome()
    posColumn -- index into columns of the position value, parsed with int()
    idColumn  -- optional; when it is None the id is synthesized as
                 "<chrom>_<pos>", otherwise the id is set to "."
    """
    self.columns = columns
    self.chrom = standardizeChromosome(columns[chrColumn])
    self.pos = int(columns[posColumn])
    # Identity test: None is a singleton, and "==" could be hijacked by a
    # custom __eq__ on whatever the caller passes.
    if idColumn is None:
        self.id = "%s_%i" % (self.chrom, self.pos)
    else:
        # NOTE(review): the value in columns[idColumn] is never read here;
        # confirm that the "." placeholder is intentional.
        self.id = "."
def __init__(self, line):
    """Derive a key from one line of delimited text.

    A line containing both "CHROM" and "POS" is flagged through
    self.isFirstLine and kept verbatim on self.line.  Any other line is
    split on csvKey.delimiter, and its chromosome (passed through
    standardizeChromosome) and integer position are stored instead.
    """
    hasColumnNames = "CHROM" in line and "POS" in line
    self.isFirstLine = hasColumnNames
    if hasColumnNames:
        self.line = line
        return

    fields = line.strip().split(csvKey.delimiter)
    self.chromosome = standardizeChromosome(fields[csvKey.chromColumn])
    self.position = int(fields[csvKey.posColumn])
def __init__(self, line):
    """Derive a key from one line of delimited text.

    Lines that contain both "CHROM" and "POS" set self.isFirstLine to True
    and are stored unchanged on self.line.  All other lines set
    self.isFirstLine to False and are split on csvKey.delimiter to fill
    self.chromosome (via standardizeChromosome) and self.position (int).
    """
    if "CHROM" in line and "POS" in line:
        self.isFirstLine = True
        self.line = line
        return

    self.isFirstLine = False
    pieces = line.strip().split(csvKey.delimiter)
    rawChromosome = pieces[csvKey.chromColumn]
    self.chromosome = standardizeChromosome(rawChromosome)
    rawPosition = pieces[csvKey.posColumn]
    self.position = int(rawPosition)
def __init__(self, line):
    """Classify one line of a .vcf file and record what identifies it.

    self.type is set to one of the vcfKey constants:
      EMPTY      -- the line is blank
      FIRSTLINE  -- a '##fileformat' line
      CONTIG     -- a '##contig' line with an ID; self.chromosome is the
                    standardized contig ID
      OTHER_META -- any other '##' line (including a '##contig' line that
                    carries no ID)
      HEADER     -- a line starting with a single '#'
      REGULAR    -- a data line; self.chromosome and self.position are taken
                    from a vcfLine built from the tab-separated columns

    For every '##' line the lowercased text is kept on self.line.
    """
    if len(line.strip()) == 0:
        self.type = vcfKey.EMPTY
    elif line.startswith('##'):
        original = line
        line = line.lower()
        # The text has just been lowercased, so the key must be searched for
        # as 'id='; looking for 'ID=' can never match.
        idStart = line.find('id=') if line.startswith('##contig') else -1
        if line.startswith('##fileformat'):
            self.type = vcfKey.FIRSTLINE
        elif idStart != -1:
            self.type = vcfKey.CONTIG
            # Slice the original text so the ID reaches
            # standardizeChromosome() in the same case as the chromosome
            # names on data lines.
            contigID = original[idStart + 3:]
            # The ID ends at the next attribute (',') or, when it is the
            # last attribute, at the closing '>'.
            for terminator in (',', '>'):
                end = contigID.find(terminator)
                if end != -1:
                    contigID = contigID[:end]
            self.chromosome = standardizeChromosome(contigID.strip())
        else:
            self.type = vcfKey.OTHER_META
        self.line = line
    elif line.startswith('#'):
        self.type = vcfKey.HEADER
    else:
        self.type = vcfKey.REGULAR
        temp = vcfLine(line.strip().split('\t'))
        temp.extractChrAndPos()
        self.chromosome = temp.chromosome
        self.position = temp.position
def __init__(self, line):
    """Classify one line of a .vcf file and record what identifies it.

    self.type is set to one of the vcfKey constants:
      EMPTY      -- the line is blank
      FIRSTLINE  -- a "##fileformat" line
      CONTIG     -- a "##contig" line with an ID; self.chromosome is the
                    standardized contig ID
      OTHER_META -- any other "##" line (including a "##contig" line that
                    carries no ID)
      HEADER     -- a line starting with a single "#"
      REGULAR    -- a data line; self.chromosome and self.position are taken
                    from a vcfLine built from the tab-separated columns

    For every "##" line the lowercased text is kept on self.line.
    """
    if len(line.strip()) == 0:
        self.type = vcfKey.EMPTY
    elif line.startswith("##"):
        original = line
        line = line.lower()
        # The text has just been lowercased, so the key must be searched for
        # as "id="; looking for "ID=" can never match.
        idStart = line.find("id=") if line.startswith("##contig") else -1
        if line.startswith("##fileformat"):
            self.type = vcfKey.FIRSTLINE
        elif idStart != -1:
            self.type = vcfKey.CONTIG
            # Slice the original text so the ID reaches
            # standardizeChromosome() in the same case as the chromosome
            # names on data lines.
            contigID = original[idStart + 3:]
            # The ID ends at the next attribute (",") or, when it is the
            # last attribute, at the closing ">".
            for terminator in (",", ">"):
                end = contigID.find(terminator)
                if end != -1:
                    contigID = contigID[:end]
            self.chromosome = standardizeChromosome(contigID.strip())
        else:
            self.type = vcfKey.OTHER_META
        self.line = line
    elif line.startswith("#"):
        self.type = vcfKey.HEADER
    else:
        self.type = vcfKey.REGULAR
        temp = vcfLine(line.strip().split("\t"))
        temp.extractChrAndPos()
        self.chromosome = temp.chromosome
        self.position = temp.position
def run(args):
    """Convert the .vcf file at args.infile into the tab-separated file at
    args.outfile (called a ".cvf file" in the warning text below).

    The input is read twice.  The first pass collects the chromosomes, the
    largest position seen per chromosome, and per-column statistics held in
    infoDetails objects.  The second pass writes the pragma/header lines and
    then one row per variant.

    Attributes read from args:
      infile, outfile       -- input and output paths
      separate_info_fields  -- "true" to treat comma-separated values as
                               separate values instead of one joined string
      count_separate        -- "true"/"false", forwarded to infoDetails
      ignore_fields         -- iterable of INFO IDs to mark as maxed out,
                               or None
      max_strings           -- forwarded to infoDetails

    Raises Exception when an ##INFO ID is duplicated or collides with a
    reserved column name, when an ##INFO/##FILTER/##contig line has no ID,
    or when a variant uses an INFO key that has no ##INFO pragma.
    """

    def pragmaValue(text, key):
        # Return what follows `key` (matched case-insensitively) in a pragma
        # line, up to the next ',' or the closing '>', or None if `key` is
        # not present.  The value is sliced from `text` itself so that its
        # case is preserved.
        start = text.lower().find(key)
        if start == -1:
            return None
        value = text[start + len(key):]
        for terminator in (",", ">"):
            end = value.find(terminator)
            if end != -1:
                value = value[:end]
        return value.strip()

    def pragmaID(text):
        # Same as pragmaValue(text, "id=") but a missing ID is an error.
        tag = pragmaValue(text, "id=")
        if tag is None:
            raise Exception("Missing ID in pragma line:\t%s" % text)
        return tag

    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"
    ignoreFields = set(args.ignore_fields or [])

    posLengthWarned = False
    allChrs = []
    positions = []

    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {
        "Ref/Alt": alleleColumn,
        "QUAL": qualColumn,
        "FILTER": filterColumn,
    }

    # TODO: get the numeric ranges, all valid categorical values
    # ---- pass 1: gather chromosomes, position ranges and column details ----
    with open(args.infile, 'r') as infile:
        for rawLine in infile:
            rawLine = rawLine.strip()
            if len(rawLine) <= 1:
                continue

            if rawLine.startswith("#"):
                lowered = rawLine.lower()
                if lowered.startswith("##info"):
                    newTag = pragmaID(rawLine)
                    if newTag in infoFields:
                        raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings, countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif lowered.startswith("##filter"):
                    filterColumn.addCategory(pragmaID(rawLine), 0)
                elif lowered.startswith("##contig"):
                    chrom = standardizeChromosome(pragmaID(rawLine))
                    # A contig line without a length starts at 0 and is
                    # extended by the variant positions seen below.
                    chrLength = pragmaValue(rawLine, "length=")
                    allChrs.append(chrom)
                    positions.append((0, int(chrLength) if chrLength else 0))
                else:
                    # a sneaky way of freezing the filter column; if other
                    # filters are added (without a .vcf pragma line) or we
                    # aren't separating info fields, other strings will make
                    # this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
                continue

            record = vcfLine(rawLine.split('\t'))
            record.extractChrAndPos()
            # Register chromosomes that no ##contig line announced.
            if record.chromosome not in allChrs:
                allChrs.append(record.chromosome)
                positions.append((0, 0))
            chrIndex = allChrs.index(record.chromosome)
            if record.position > positions[chrIndex][1]:
                positions[chrIndex] = (0, record.position)
                if not posLengthWarned:
                    sys.stderr.write('WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.')
                    sys.stderr.write(' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.')
                    posLengthWarned = True

            record.extractAlleles()
            alleles = record.alleles
            if not separateInfoFields:
                alleles = ",".join(alleles)
            # Pass the (possibly joined) value, mirroring the FILTER column.
            alleleColumn.addArbitraryValue(alleles)

            record.extractQual()
            qualColumn.addArbitraryValue(record.qual)

            record.extractFilters()
            filters = record.filters
            if not separateInfoFields:
                filters = ",".join(filters)
            filterColumn.addArbitraryValue(filters)

            record.extractInfo()
            for k, v in record.info.items():
                if k not in infoFields:
                    raise Exception("Missing ##INFO pragma for: %s" % k)
                # Only values that can be split are split; anything else
                # extractInfo() may produce is passed through unchanged.
                if separateInfoFields and hasattr(v, "split"):
                    v = v.split(",")
                infoFields[k].addArbitraryValue(v)

    print("Creating file...")

    # ---- pass 2: write the pragmas, the header row and the data rows ----
    fieldOrder = sorted(infoFields)
    headers = []
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" % (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" % ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated item of a pragma is used as the
                # column header.
                headers.append(p.split("\t")[1])
        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        with open(args.infile, 'r') as infile:
            for rawLine in infile:
                rawLine = rawLine.strip()
                if len(rawLine) <= 1 or rawLine.startswith("#"):
                    continue
                record = vcfLine(rawLine.split('\t'))
                record.extractChrAndPos()
                record.extractInfo()
                record.extractAlleles()
                record.info["Ref/Alt"] = record.alleles
                record.extractQual()
                record.info["QUAL"] = str(record.qual)
                record.extractFilters()
                record.info["FILTER"] = record.filters

                outfile.write("%s\t%i\t%s" % (record.chromosome, record.position, record.name))
                for f in fieldOrder:
                    # NOTE(review): this raises KeyError when a variant lacks
                    # an INFO key declared in the header; confirm whether
                    # extractInfo() fills in absent keys.
                    values = record.info[f]
                    if isinstance(values, list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")
def run(args):
    """Convert the .vcf file at args.infile into the tab-separated file at
    args.outfile (called a ".cvf file" in the warning text below).

    The input is read twice.  The first pass collects the chromosomes, the
    largest position seen per chromosome, and per-column statistics held in
    infoDetails objects.  The second pass writes the pragma/header lines and
    then one row per variant.

    Attributes read from args:
      infile, outfile       -- input and output paths
      separate_info_fields  -- "true" to treat comma-separated values as
                               separate values instead of one joined string
      count_separate        -- "true"/"false", forwarded to infoDetails
      ignore_fields         -- iterable of INFO IDs to mark as maxed out,
                               or None
      max_strings           -- forwarded to infoDetails

    Raises Exception when an ##INFO ID is duplicated or collides with a
    reserved column name, when an ##INFO/##FILTER/##contig line has no ID,
    or when a variant uses an INFO key that has no ##INFO pragma.
    """

    def pragmaValue(text, key):
        # Return what follows `key` (matched case-insensitively) in a pragma
        # line, up to the next ',' or the closing '>', or None if `key` is
        # not present.  The value is sliced from `text` itself so that its
        # case is preserved.
        start = text.lower().find(key)
        if start == -1:
            return None
        value = text[start + len(key):]
        for terminator in (",", ">"):
            end = value.find(terminator)
            if end != -1:
                value = value[:end]
        return value.strip()

    def pragmaID(text):
        # Same as pragmaValue(text, "id=") but a missing ID is an error.
        tag = pragmaValue(text, "id=")
        if tag is None:
            raise Exception("Missing ID in pragma line:\t%s" % text)
        return tag

    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"
    ignoreFields = set(args.ignore_fields or [])

    posLengthWarned = False
    allChrs = []
    positions = []

    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {
        "Ref/Alt": alleleColumn,
        "QUAL": qualColumn,
        "FILTER": filterColumn,
    }

    # TODO: get the numeric ranges, all valid categorical values
    # ---- pass 1: gather chromosomes, position ranges and column details ----
    with open(args.infile, 'r') as infile:
        for rawLine in infile:
            rawLine = rawLine.strip()
            if len(rawLine) <= 1:
                continue

            if rawLine.startswith("#"):
                lowered = rawLine.lower()
                if lowered.startswith("##info"):
                    newTag = pragmaID(rawLine)
                    if newTag in infoFields:
                        raise Exception(
                            "Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings,
                                                     countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif lowered.startswith("##filter"):
                    filterColumn.addCategory(pragmaID(rawLine), 0)
                elif lowered.startswith("##contig"):
                    chrom = standardizeChromosome(pragmaID(rawLine))
                    # A contig line without a length starts at 0 and is
                    # extended by the variant positions seen below.
                    chrLength = pragmaValue(rawLine, "length=")
                    allChrs.append(chrom)
                    positions.append((0, int(chrLength) if chrLength else 0))
                else:
                    # a sneaky way of freezing the filter column; if other
                    # filters are added (without a .vcf pragma line) or we
                    # aren't separating info fields, other strings will make
                    # this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
                continue

            record = vcfLine(rawLine.split('\t'))
            record.extractChrAndPos()
            # Register chromosomes that no ##contig line announced.
            if record.chromosome not in allChrs:
                allChrs.append(record.chromosome)
                positions.append((0, 0))
            chrIndex = allChrs.index(record.chromosome)
            if record.position > positions[chrIndex][1]:
                positions[chrIndex] = (0, record.position)
                if not posLengthWarned:
                    sys.stderr.write(
                        'WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.'
                    )
                    sys.stderr.write(
                        ' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.'
                    )
                    posLengthWarned = True

            record.extractAlleles()
            alleles = record.alleles
            if not separateInfoFields:
                alleles = ",".join(alleles)
            # Pass the (possibly joined) value, mirroring the FILTER column.
            alleleColumn.addArbitraryValue(alleles)

            record.extractQual()
            qualColumn.addArbitraryValue(record.qual)

            record.extractFilters()
            filters = record.filters
            if not separateInfoFields:
                filters = ",".join(filters)
            filterColumn.addArbitraryValue(filters)

            record.extractInfo()
            for k, v in record.info.items():
                if k not in infoFields:
                    raise Exception("Missing ##INFO pragma for: %s" % k)
                # Only values that can be split are split; anything else
                # extractInfo() may produce is passed through unchanged.
                if separateInfoFields and hasattr(v, "split"):
                    v = v.split(",")
                infoFields[k].addArbitraryValue(v)

    print("Creating file...")

    # ---- pass 2: write the pragmas, the header row and the data rows ----
    fieldOrder = sorted(infoFields)
    headers = []
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" %
                      (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" %
                      ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated item of a pragma is used as the
                # column header.
                headers.append(p.split("\t")[1])
        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        with open(args.infile, 'r') as infile:
            for rawLine in infile:
                rawLine = rawLine.strip()
                if len(rawLine) <= 1 or rawLine.startswith("#"):
                    continue
                record = vcfLine(rawLine.split('\t'))
                record.extractChrAndPos()
                record.extractInfo()
                record.extractAlleles()
                record.info["Ref/Alt"] = record.alleles
                record.extractQual()
                record.info["QUAL"] = str(record.qual)
                record.extractFilters()
                record.info["FILTER"] = record.filters

                outfile.write("%s\t%i\t%s" % (record.chromosome,
                                              record.position, record.name))
                for f in fieldOrder:
                    # NOTE(review): this raises KeyError when a variant lacks
                    # an INFO key declared in the header; confirm whether
                    # extractInfo() fills in absent keys.
                    values = record.info[f]
                    if isinstance(values, list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")