Python standardizeChromosome Examples, genome_utils.standardizeChromosome Python Examples

Example #1

0

Show file

 def __init__(self, columns, chrColumn, posColumn, idColumn=None):
     """Build a record from one already-split row of fields.

     columns   -- sequence of field values; stored unchanged on self.columns
     chrColumn -- index in columns of the chromosome field
     posColumn -- index in columns of the position field (parsed with int())
     idColumn  -- optional; only whether it is None is consulted (see below)
     """
     self.columns = columns
     self.chrom = standardizeChromosome(columns[chrColumn])
     self.pos = int(columns[posColumn])
     # None is a singleton: test identity rather than invoking __eq__.
     if idColumn is None:
         # No id column: synthesize an id from chromosome and position.
         self.id = "%s_%i" % (self.chrom, self.pos)
     else:
         # NOTE(review): the value at columns[idColumn] is never read; the id
         # is always the placeholder ".". Presumably intentional -- confirm
         # against callers before changing.
         self.id = "."

Example #2

0

Show file

File: sort.py Project: alex-r-bigelow/genepi_ngs_scripts

 def __init__(self, line):
     """Classify one line of delimited text as the header or a data row.

     A line containing both "CHROM" and "POS" is treated as the header and
     kept verbatim in self.line.  Any other line is split on
     csvKey.delimiter and its chromosome/position fields are extracted.
     """
     isHeader = "CHROM" in line and "POS" in line
     self.isFirstLine = isHeader
     if isHeader:
         self.line = line
         return

     fields = line.strip().split(csvKey.delimiter)
     rawChromosome = fields[csvKey.chromColumn]
     self.chromosome = standardizeChromosome(rawChromosome)
     self.position = int(fields[csvKey.posColumn])

Example #3

0

Show file

File: sort.py Project: alex-r-bigelow/genepi_ngs_scripts

 def __init__(self, line):
     """Parse one line of a delimited file for sorting.

     Data rows get self.chromosome (via standardizeChromosome) and
     self.position (int) from the columns named by csvKey; the header row,
     recognised by containing both "CHROM" and "POS", is stored whole in
     self.line instead.
     """
     if "CHROM" not in line or "POS" not in line:
         # Ordinary data row.
         self.isFirstLine = False
         row = line.strip().split(csvKey.delimiter)
         self.chromosome = standardizeChromosome(row[csvKey.chromColumn])
         self.position = int(row[csvKey.posColumn])
         return

     # Header row: keep the text untouched.
     self.isFirstLine = True
     self.line = line

Example #4

0

Show file

File: sort.py Project: alex-r-bigelow/genepi_ngs_scripts

 def __init__(self, line):
     """Classify one raw VCF line and record the fields it carries.

     self.type is set to one of the vcfKey constants (EMPTY, FIRSTLINE,
     CONTIG, OTHER_META, HEADER, REGULAR).  CONTIG and REGULAR lines also
     get self.chromosome, REGULAR lines additionally self.position, and
     OTHER_META lines keep their (lower-cased) text in self.line.

     Raises ValueError for a ##contig line that has no ID.
     """
     if len(line.strip()) == 0:
         self.type = vcfKey.EMPTY
     elif line.startswith('##'):
         # Match keywords case-insensitively on a lower-cased copy.
         lowered = line.lower()
         if lowered.startswith('##fileformat'):
             self.type = vcfKey.FIRSTLINE
         elif lowered.startswith('##contig'):
             self.type = vcfKey.CONTIG
             # The search must use the lower-case key: looking for 'ID=' in
             # lower-cased text can never match.  The ID itself is sliced
             # from the original line so its case is preserved, as it is
             # for the chromosome of a regular line.
             idStart = lowered.find('id=')
             if idStart == -1:
                 raise ValueError(
                     "##contig line has no ID: %s" % line.strip())
             contigID = line[idStart + 3:]
             # The ID ends at the next field (',') or at the closing '>'.
             for terminator in (',', '>'):
                 end = contigID.find(terminator)
                 if end != -1:
                     contigID = contigID[:end]
             self.chromosome = standardizeChromosome(contigID.strip())
         else:
             self.type = vcfKey.OTHER_META
             # NOTE(review): the stored text is the lower-cased line, as
             # before; confirm that consumers do not need the original case.
             self.line = lowered
     elif line.startswith('#'):
         self.type = vcfKey.HEADER
     else:
         self.type = vcfKey.REGULAR
         temp = vcfLine(line.strip().split('\t'))
         temp.extractChrAndPos()
         self.chromosome = temp.chromosome
         self.position = temp.position

Example #5

0

Show file

File: sort.py Project: alex-r-bigelow/genepi_ngs_scripts

 def __init__(self, line):
     """Classify one raw VCF line and record the fields it carries.

     self.type is set to one of the vcfKey constants (EMPTY, FIRSTLINE,
     CONTIG, OTHER_META, HEADER, REGULAR).  CONTIG and REGULAR lines also
     get self.chromosome, REGULAR lines additionally self.position, and
     OTHER_META lines keep their (lower-cased) text in self.line.

     Raises ValueError for a ##contig line that has no ID.
     """
     if len(line.strip()) == 0:
         self.type = vcfKey.EMPTY
     elif line.startswith("##"):
         # Match keywords case-insensitively on a lower-cased copy.
         lowered = line.lower()
         if lowered.startswith("##fileformat"):
             self.type = vcfKey.FIRSTLINE
         elif lowered.startswith("##contig"):
             self.type = vcfKey.CONTIG
             # The search must use the lower-case key: looking for "ID=" in
             # lower-cased text can never match.  The ID itself is sliced
             # from the original line so its case is preserved, as it is
             # for the chromosome of a regular line.
             idStart = lowered.find("id=")
             if idStart == -1:
                 raise ValueError(
                     "##contig line has no ID: %s" % line.strip())
             contigID = line[idStart + 3 :]
             # The ID ends at the next field (",") or at the closing ">".
             for terminator in (",", ">"):
                 end = contigID.find(terminator)
                 if end != -1:
                     contigID = contigID[:end]
             self.chromosome = standardizeChromosome(contigID.strip())
         else:
             self.type = vcfKey.OTHER_META
             # NOTE(review): the stored text is the lower-cased line, as
             # before; confirm that consumers do not need the original case.
             self.line = lowered
     elif line.startswith("#"):
         self.type = vcfKey.HEADER
     else:
         self.type = vcfKey.REGULAR
         temp = vcfLine(line.strip().split("\t"))
         temp.extractChrAndPos()
         self.chromosome = temp.chromosome
         self.position = temp.position

Example #6

0

Show file

File: VCFtoCVF.py Project: alex-r-bigelow/genepi_ngs_scripts

def _pragmaValue(line, key):
    """Return the value that follows ``key`` in a VCF pragma line.

    ``key`` must be lower-case and include the trailing "=" (e.g. "id=").
    It is located case-insensitively, but the value is sliced from the
    original line so its case is preserved.  The value ends at the first
    "," or ">".  Returns None when ``key`` does not occur in the line.
    """
    start = line.lower().find(key)
    if start == -1:
        return None
    value = line[start + len(key):]
    for terminator in (",", ">"):
        end = value.find(terminator)
        if end != -1:
            value = value[:end]
    return value


def _pragmaId(line):
    """Return the ID of a pragma line, raising Exception if it has none."""
    tag = _pragmaValue(line, "id=")
    if tag is None:
        raise Exception("Missing ID in pragma line:\t%s" % line)
    return tag


def run(args):
    """Convert the .vcf file args.infile into a .cvf file args.outfile.

    The input is read twice.  The first pass collects the chromosomes, the
    largest position seen per chromosome, and an infoDetails summary for
    Ref/Alt, QUAL, FILTER and every ##INFO field.  The second pass writes
    the pragma header followed by one tab-separated row per variant.

    Attributes read from args:
        infile, outfile      -- input and output paths
        separate_info_fields -- "true" (any case) to keep multi-valued
                                fields as separate values, not comma-joined
        count_separate       -- "true" (any case); passed to infoDetails
        ignore_fields        -- iterable of INFO IDs to mark maxedOut, or None
        max_strings          -- passed to infoDetails for FILTER/INFO columns

    Raises Exception for a duplicated or reserved ##INFO ID, for a pragma
    line without an ID, and for an INFO key that has no ##INFO pragma.
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"

    ignoreFields = set(args.ignore_fields) if args.ignore_fields is not None else set()

    posLengthWarned = False
    allChrs = []
    positions = []
    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {"Ref/Alt": alleleColumn, "QUAL": qualColumn, "FILTER": filterColumn}
    # TODO: get the numeric ranges, all valid categorical values

    # First pass: gather column metadata.
    with open(args.infile, 'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue

            if line.startswith("#"):
                lowered = line.lower()
                if lowered.startswith("##info"):
                    newTag = _pragmaId(line)
                    if newTag in infoFields:
                        raise Exception("Duplicate INFO ID or use of reserved ID:\t%s" % newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings, countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif lowered.startswith("##filter"):
                    filterColumn.addCategory(_pragmaId(line), 0)
                elif lowered.startswith("##contig"):
                    chrom = standardizeChromosome(_pragmaId(line))
                    # length is optional on ##contig lines; start from 0 and
                    # let the variants below extend the range if it is absent.
                    chrLength = _pragmaValue(line, "length=")
                    allChrs.append(chrom)
                    positions.append((0, int(chrLength) if chrLength is not None else 0))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
                continue

            record = vcfLine(line.split('\t'))
            record.extractChrAndPos()

            # Register chromosomes that had no ##contig pragma.
            if record.chromosome not in allChrs:
                allChrs.append(record.chromosome)
                positions.append((0, 0))
            chrIndex = allChrs.index(record.chromosome)
            if record.position > positions[chrIndex][1]:
                positions[chrIndex] = (0, record.position)
                if not posLengthWarned:
                    sys.stderr.write('WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.')
                    sys.stderr.write(' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.')
                    posLengthWarned = True

            record.extractAlleles()
            alleles = record.alleles
            if not separateInfoFields:
                alleles = ",".join(alleles)
            # Pass the (possibly joined) value, mirroring the FILTER handling.
            alleleColumn.addArbitraryValue(alleles)

            record.extractQual()
            qualColumn.addArbitraryValue(record.qual)

            record.extractFilters()
            filters = record.filters
            if not separateInfoFields:
                filters = ",".join(filters)
            filterColumn.addArbitraryValue(filters)

            record.extractInfo()
            for k, v in record.info.items():
                if k not in infoFields:
                    raise Exception("Missing ##INFO pragma for: %s" % k)
                # Only strings can be split; presumably INFO values are
                # strings here -- other types are passed through unchanged.
                if separateInfoFields and isinstance(v, str):
                    v = v.split(",")
                infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" % (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" % ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        headers = []
        fieldOrder = sorted(infoFields)
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated token of a pragma is the column header.
                headers.append(p.split("\t")[1])

        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        # Second pass: emit one row per variant.
        with open(args.infile, 'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                record = vcfLine(line.split('\t'))
                record.extractChrAndPos()
                record.extractInfo()
                record.extractAlleles()
                record.info["Ref/Alt"] = record.alleles
                record.extractQual()
                record.info["QUAL"] = str(record.qual)
                record.extractFilters()
                record.info["FILTER"] = record.filters
                outfile.write("%s\t%i\t%s" % (record.chromosome, record.position, record.name))
                for f in fieldOrder:
                    # NOTE(review): raises KeyError if a variant lacks an INFO
                    # field that was declared in the header -- confirm whether
                    # extractInfo fills in absent keys.
                    values = record.info[f]
                    if isinstance(values, list):
                        values = ("\t" if separateInfoFields else ",").join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")

Example #7

0

Show file

def _pragmaValue(line, key):
    """Return the value that follows ``key`` in a VCF pragma line.

    ``key`` must be lower-case and include the trailing "=" (e.g. "id=").
    It is located case-insensitively, but the value is sliced from the
    original line so its case is preserved.  The value ends at the first
    "," or ">".  Returns None when ``key`` does not occur in the line.
    """
    start = line.lower().find(key)
    if start == -1:
        return None
    value = line[start + len(key):]
    for terminator in (",", ">"):
        end = value.find(terminator)
        if end != -1:
            value = value[:end]
    return value


def _pragmaId(line):
    """Return the ID of a pragma line, raising Exception if it has none."""
    tag = _pragmaValue(line, "id=")
    if tag is None:
        raise Exception("Missing ID in pragma line:\t%s" % line)
    return tag


def run(args):
    """Convert the .vcf file args.infile into a .cvf file args.outfile.

    The input is read twice.  The first pass collects the chromosomes, the
    largest position seen per chromosome, and an infoDetails summary for
    Ref/Alt, QUAL, FILTER and every ##INFO field.  The second pass writes
    the pragma header followed by one tab-separated row per variant.

    Attributes read from args:
        infile, outfile      -- input and output paths
        separate_info_fields -- "true" (any case) to keep multi-valued
                                fields as separate values, not comma-joined
        count_separate       -- "true" (any case); passed to infoDetails
        ignore_fields        -- iterable of INFO IDs to mark maxedOut, or None
        max_strings          -- passed to infoDetails for FILTER/INFO columns

    Raises Exception for a duplicated or reserved ##INFO ID, for a pragma
    line without an ID, and for an INFO key that has no ##INFO pragma.
    """
    separateInfoFields = args.separate_info_fields.strip().lower() == "true"
    countSeparate = args.count_separate.strip().lower() == "true"

    if args.ignore_fields is None:
        ignoreFields = set()
    else:
        ignoreFields = set(args.ignore_fields)

    posLengthWarned = False
    allChrs = []
    positions = []
    alleleColumn = infoDetails("Ref/Alt", 1, False)
    qualColumn = infoDetails("QUAL", 1, False)
    filterColumn = infoDetails("FILTER", args.max_strings, False)
    infoFields = {
        "Ref/Alt": alleleColumn,
        "QUAL": qualColumn,
        "FILTER": filterColumn
    }
    # TODO: get the numeric ranges, all valid categorical values

    # First pass: gather column metadata.
    with open(args.infile, 'r') as infile:
        for line in infile:
            line = line.strip()
            if len(line) <= 1:
                continue

            if line.startswith("#"):
                lowered = line.lower()
                if lowered.startswith("##info"):
                    newTag = _pragmaId(line)
                    if newTag in infoFields:
                        raise Exception(
                            "Duplicate INFO ID or use of reserved ID:\t%s" %
                            newTag)
                    infoFields[newTag] = infoDetails(newTag, args.max_strings,
                                                     countSeparate)
                    if newTag in ignoreFields:
                        infoFields[newTag].maxedOut = True
                elif lowered.startswith("##filter"):
                    filterColumn.addCategory(_pragmaId(line), 0)
                elif lowered.startswith("##contig"):
                    chrom = standardizeChromosome(_pragmaId(line))
                    # length is optional on ##contig lines; start from 0 and
                    # let the variants below extend the range if it is absent.
                    chrLength = _pragmaValue(line, "length=")
                    if chrLength is None:
                        chrLength = 0
                    allChrs.append(chrom)
                    positions.append((0, int(chrLength)))
                else:
                    # a sneaky way of freezing the filter column; if other filters are added (without a .vcf pragma line) or we aren't separating info fields,
                    # other strings will make this column max out early
                    filterColumn.maxCategories = len(filterColumn.categories[0])
                continue

            record = vcfLine(line.split('\t'))
            record.extractChrAndPos()

            # Register chromosomes that had no ##contig pragma.
            if record.chromosome not in allChrs:
                allChrs.append(record.chromosome)
                positions.append((0, 0))
            chrIndex = allChrs.index(record.chromosome)
            if record.position > positions[chrIndex][1]:
                positions[chrIndex] = (0, record.position)
                if not posLengthWarned:
                    sys.stderr.write(
                        'WARNING: Either ##contig pragma lines aren\'t supplied in your .vcf file or a variant has a position beyond the length of a chromosome.'
                    )
                    sys.stderr.write(
                        ' In either case, be aware that chromosome lengths in the .cvf file may not be accurate.'
                    )
                    posLengthWarned = True

            record.extractAlleles()
            alleles = record.alleles
            if not separateInfoFields:
                alleles = ",".join(alleles)
            # Pass the (possibly joined) value, mirroring the FILTER handling.
            alleleColumn.addArbitraryValue(alleles)

            record.extractQual()
            qualColumn.addArbitraryValue(record.qual)

            record.extractFilters()
            filters = record.filters
            if not separateInfoFields:
                filters = ",".join(filters)
            filterColumn.addArbitraryValue(filters)

            record.extractInfo()
            for k, v in record.info.items():
                if k not in infoFields:
                    raise Exception("Missing ##INFO pragma for: %s" % k)
                # Only strings can be split; presumably INFO values are
                # strings here -- other types are passed through unchanged.
                if separateInfoFields and isinstance(v, str):
                    v = v.split(",")
                infoFields[k].addArbitraryValue(v)

    print("Creating file...")
    with open(args.outfile, 'w') as outfile:
        outfile.write("##\t%s created from %s on %s\n" %
                      (args.outfile, args.infile, str(datetime.datetime.now())))
        outfile.write("#\tChromosome\tCHR\t%s\n" % ("\t".join(allChrs)))
        outfile.write("#\tPosition\tPOS\t%s\n" %
                      ("\t".join(["(%i,%i)" % p for p in positions])))
        outfile.write("#\tID\tID\n")

        headers = []
        fieldOrder = sorted(infoFields)
        for f in fieldOrder:
            for p in infoFields[f].getPragmas():
                outfile.write(p + "\n")
                # The second tab-separated token of a pragma is the column header.
                headers.append(p.split("\t")[1])

        outfile.write('Chromosome\tPosition\tID\t%s\n' % ("\t".join(headers)))

        # Second pass: emit one row per variant.
        with open(args.infile, 'r') as infile:
            for line in infile:
                line = line.strip()
                if len(line) <= 1 or line.startswith("#"):
                    continue
                record = vcfLine(line.split('\t'))
                record.extractChrAndPos()
                record.extractInfo()
                record.extractAlleles()
                record.info["Ref/Alt"] = record.alleles
                record.extractQual()
                record.info["QUAL"] = str(record.qual)
                record.extractFilters()
                record.info["FILTER"] = record.filters
                outfile.write("%s\t%i\t%s" %
                              (record.chromosome, record.position, record.name))
                for f in fieldOrder:
                    # NOTE(review): raises KeyError if a variant lacks an INFO
                    # field that was declared in the header -- confirm whether
                    # extractInfo fills in absent keys.
                    values = record.info[f]
                    if isinstance(values, list):
                        if separateInfoFields:
                            values = "\t".join(values)
                        else:
                            values = ",".join(values)
                    outfile.write("\t%s" % values)
                outfile.write("\n")