def annotate(options, refGene):
    """Write the variants TSV to the output TSV, appending annotations.

    The variants file (``options.variants``) is streamed row by row.  The
    first column of each row is split on commas; the first piece is used as
    the chromosome name (prefixed with "chr") and the second as the position.
    The pair is looked up with ``search`` and, when a result is found, it is
    appended to the row as an extra column before the row is written to
    ``options.output``.

    Rows that are empty, or whose first column has fewer than four
    comma-separated pieces or a non-numeric second piece, are not written.
    NOTE(review): this file's indentation was lost, so the skip-vs-copy
    behaviour for such rows should be confirmed against the upstream source.

    Arguments:
        options: object providing the ``output`` and ``variants`` file paths.
        refGene: reference gene data, passed through unchanged to ``search``.
    """
    with open(options.output, 'w') as output:
        csvWriter = csv.writer(output, delimiter='\t', quotechar='|')
        # Stream the variants file; rows are processed one at a time rather
        # than being collected into a list first.
        with open(options.variants) as variants:
            for row in csv.reader(variants, delimiter='\t', quotechar='|'):
                if len(row) < 1:
                    continue
                coords = row[0].split(',')
                if len(coords) < 4 or not coords[1].isdigit():
                    continue
                chrName = "chr" + coords[0]
                pos = safeReadInt(coords[1])
                searchResult = search(chrName, pos, refGene)
                # Identity test: only a genuine None means "no annotation".
                if searchResult is not None:
                    csvWriter.writerow(row + [searchResult])
                else:
                    csvWriter.writerow(row)
def _exonBoundaryClass(isCodingHere, isCodingOtherEnd):
    """Pick the boundary class for one end of an exon.

    An end that lies inside the coding region is a CodingExonBoundary.  An
    end outside the coding region is a PartialCodingExonBoundary when the
    exon's other end is inside it, and a NonCodingExonBoundary otherwise.
    """
    if isCodingHere:
        return CodingExonBoundary
    if isCodingOtherEnd:
        return PartialCodingExonBoundary
    return NonCodingExonBoundary


def readRefGene(options):
    """Read the refGene table into a dict of per-chromosome feature lists.

    The file at ``options.refGene`` is read as tab-separated text.  Rows
    with fewer than 11 columns are ignored.  For every other row the
    chromosome name (column 2) keys a list to which are appended:

    * a CodingRegionStart (built with ``options.startslack``) when the
      coding region is non-empty, and
    * two exon boundary objects (built with ``options.spliceslack``) per
      exon, tagged 'start' and 'end'.

    The transcript bounds (columns 4 and 5) are not used.

    Arguments:
        options: object providing ``refGene`` (a file path), ``startslack``
            and ``spliceslack``.

    Returns:
        dict mapping chromosome name to a list of feature objects.
    """
    refGene = {}
    with open(options.refGene) as refs:
        for row in csv.reader(refs, delimiter='\t'):
            if len(row) < 11:
                continue
            features = refGene.setdefault(row[2], [])
            direction = row[3]
            # All coordinates for all fields in the refGene file
            # are 0-based start and 1-based end, including the transcript,
            # CDS and exons.
            # see: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
            # Add one to fix up the zero-based start coordinate.
            codingRegionStart = safeReadInt(row[6]) + 1
            codingRegionEnd = safeReadInt(row[7])
            # For non-coding genes (which, by definition, have a coding
            # region size of 0), cdsStart always equals cdsEnd in the
            # genePred format, so after the +1 adjustment the start is
            # greater than the end.  Only coding genes get a
            # CodingRegionStart feature.
            if codingRegionStart <= codingRegionEnd:
                features.append(
                    CodingRegionStart(direction, options.startslack,
                                      codingRegionStart, codingRegionEnd))
            # Lists (not lazy iterators) so the values survive the swap and
            # zip below on any Python version.  Starts get +1 to fix up the
            # zero-based coordinate.
            exonStarts = [safeReadInt(x) + 1
                          for x in row[9].rstrip(',').split(',')]
            exonEnds = [safeReadInt(x)
                        for x in row[10].rstrip(',').split(',')]
            # If the gene is on the reverse strand then we swap the start
            # and end coordinates.
            if direction == '-':
                exonStarts, exonEnds = exonEnds, exonStarts
            # Classify the start and end of each exon depending on whether
            # they fall inside or outside the coding region, including the
            # partial case where one end is inside and the other is outside.
            for start, end in zip(exonStarts, exonEnds):
                isStartCoding = isCoding(codingRegionStart, codingRegionEnd,
                                         start)
                isEndCoding = isCoding(codingRegionStart, codingRegionEnd,
                                       end)
                startClass = _exonBoundaryClass(isStartCoding, isEndCoding)
                endClass = _exonBoundaryClass(isEndCoding, isStartCoding)
                features.append(
                    startClass(options.spliceslack, direction, start,
                               'start'))
                features.append(
                    endClass(options.spliceslack, direction, end, 'end'))
    return refGene
except getopt.GetoptError, err: print str(err) usage() sys.exit(2) options = Options() for o, a in opts: if o == "--variants": options.variants = a elif o == "--bin": options.bin = a elif o == "--keep": options.keep = a elif o == "--log": options.log = a elif o == "--varLikeThresh": options.varLikeThresh = safeReadInt(a) elif o == "--samplesPercent": options.samplesPercent = safeReadInt(a) elif o in ('-h', '--help'): usage() sys.exit(0) if not options.check(): print('Incorrect arguments') usage() exit(2) bamFilenames = args # Read the rows of the variants TSV file into a list. with open(options.variants) as variants: variantList = list(csv.reader(variants, delimiter='\t', quotechar='|')) # compute the presence/absence of each variant in the bam files evidence = getEvidence(variantList, bamFilenames)
def main():
    """Command-line entry point: parse options, load refGene, annotate.

    Options are read from ``sys.argv`` with ``getopt``.  On a parse error
    the error and the usage text are printed and the process exits with
    status 2; ``-h``/``--help`` prints usage and exits with status 0; if
    ``options.check()`` fails, usage is printed and the process exits with
    status 2.  Otherwise the refGene table is read and the variants are
    annotated.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], shortOptionsFlags,
                                   longOptionsFlags)
    except getopt.GetoptError as err:
        # "except ... as" and print() are valid on Python 2.6+ and Python 3.
        print(str(err))
        usage()
        sys.exit(2)
    options = Options()
    for o, a in opts:
        if o == "--refGene":
            options.refGene = a
        elif o == "--variants":
            options.variants = a
        elif o == "--spliceslack":
            options.spliceslack = safeReadInt(a)
        elif o == "--startslack":
            options.startslack = safeReadInt(a)
        elif o == "--output":
            options.output = a
        elif o in ('-h', '--help'):
            usage()
            sys.exit(0)
    if not options.check():
        print('Incorrect arguments')
        usage()
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist (e.g. under "python -S").
        sys.exit(2)
    refGene = readRefGene(options)
    annotate(options, refGene)
def _exonBoundaryClass(isCodingHere, isCodingOtherEnd):
    """Pick the boundary class for one end of an exon.

    An end that lies inside the coding region is a CodingExonBoundary.  An
    end outside the coding region is a PartialCodingExonBoundary when the
    exon's other end is inside it, and a NonCodingExonBoundary otherwise.
    """
    if isCodingHere:
        return CodingExonBoundary
    if isCodingOtherEnd:
        return PartialCodingExonBoundary
    return NonCodingExonBoundary


def readRefGene(options):
    """Read the refGene table into a dict of per-chromosome feature lists.

    The file at ``options.refGene`` is read as tab-separated text.  Rows
    with fewer than 11 columns are ignored.  For every other row the
    chromosome name (column 2) keys a list to which are appended:

    * a CodingRegionStart (built with ``options.startslack``) when the
      coding region is non-empty, and
    * two exon boundary objects (built with ``options.spliceslack``) per
      exon, tagged 'start' and 'end'.

    The transcript bounds (columns 4 and 5) are not used.

    Arguments:
        options: object providing ``refGene`` (a file path), ``startslack``
            and ``spliceslack``.

    Returns:
        dict mapping chromosome name to a list of feature objects.
    """
    refGene = {}
    with open(options.refGene) as refs:
        for row in csv.reader(refs, delimiter='\t'):
            if len(row) < 11:
                continue
            features = refGene.setdefault(row[2], [])
            direction = row[3]
            # All coordinates for all fields in the refGene file
            # are 0-based start and 1-based end, including the transcript,
            # CDS and exons.
            # see: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1
            # Add one to fix up the zero-based start coordinate.
            codingRegionStart = safeReadInt(row[6]) + 1
            codingRegionEnd = safeReadInt(row[7])
            # For non-coding genes (which, by definition, have a coding
            # region size of 0), cdsStart always equals cdsEnd in the
            # genePred format, so after the +1 adjustment the start is
            # greater than the end.  Only coding genes get a
            # CodingRegionStart feature.
            if codingRegionStart <= codingRegionEnd:
                features.append(
                    CodingRegionStart(direction, options.startslack,
                                      codingRegionStart, codingRegionEnd))
            # Lists (not lazy iterators) so the values survive the swap and
            # zip below on any Python version.  Starts get +1 to fix up the
            # zero-based coordinate.
            exonStarts = [safeReadInt(x) + 1
                          for x in row[9].rstrip(',').split(',')]
            exonEnds = [safeReadInt(x)
                        for x in row[10].rstrip(',').split(',')]
            # If the gene is on the reverse strand then we swap the start
            # and end coordinates.
            if direction == '-':
                exonStarts, exonEnds = exonEnds, exonStarts
            # Classify the start and end of each exon depending on whether
            # they fall inside or outside the coding region, including the
            # partial case where one end is inside and the other is outside.
            for start, end in zip(exonStarts, exonEnds):
                isStartCoding = isCoding(codingRegionStart, codingRegionEnd,
                                         start)
                isEndCoding = isCoding(codingRegionStart, codingRegionEnd,
                                       end)
                startClass = _exonBoundaryClass(isStartCoding, isEndCoding)
                endClass = _exonBoundaryClass(isEndCoding, isStartCoding)
                features.append(
                    startClass(options.spliceslack, direction, start,
                               'start'))
                features.append(
                    endClass(options.spliceslack, direction, end, 'end'))
    return refGene