def annotate(options, refGene):
    """Copy coordinate-bearing variant rows to the output, adding annotations.

    Rows are streamed from the tab-separated file ``options.variants`` and
    written to ``options.output`` (same delimiter and quote character).  The
    first column of each row is split on commas; a row is only considered
    when that yields at least four fields and the second field consists
    solely of digits.  For such a row,
    ``search("chr" + <first field>, <position>, refGene)`` is consulted and a
    non-None result is appended to the row as one extra column.  Every other
    row (empty rows, rows whose first column is not a coordinate tuple) is
    left out of the output.

    Args:
        options: object providing the ``variants`` and ``output`` file paths.
        refGene: lookup structure passed through unchanged to ``search``.
    """
    with open(options.output, 'w') as output:
        csvWriter = csv.writer(output, delimiter='\t', quotechar='|')
        # Stream the variants TSV one row at a time.
        with open(options.variants) as variants:
            for row in csv.reader(variants, delimiter='\t', quotechar='|'):
                if not row:
                    continue
                coords = row[0].split(',')
                if len(coords) < 4 or not coords[1].isdigit():
                    continue
                chrName = "chr" + coords[0]
                pos = safeReadInt(coords[1])
                searchResult = search(chrName, pos, refGene)
                # Identity test: None is a singleton, and "!= None" would
                # defer to any custom __ne__ defined on the result object.
                if searchResult is not None:
                    row = row + [searchResult]
                csvWriter.writerow(row)
def annotate(options, refGene):
    """Copy coordinate-bearing variant rows to the output, adding annotations.

    Rows are streamed from the tab-separated file ``options.variants`` and
    written to ``options.output`` (same delimiter and quote character).  The
    first column of each row is split on commas; a row is only considered
    when that yields at least four fields and the second field consists
    solely of digits.  For such a row,
    ``search("chr" + <first field>, <position>, refGene)`` is consulted and a
    non-None result is appended to the row as one extra column.  Every other
    row (empty rows, rows whose first column is not a coordinate tuple) is
    left out of the output.

    Args:
        options: object providing the ``variants`` and ``output`` file paths.
        refGene: lookup structure passed through unchanged to ``search``.
    """
    with open(options.output, 'w') as output:
        csvWriter = csv.writer(output, delimiter='\t', quotechar='|')
        # Stream the variants TSV one row at a time.
        with open(options.variants) as variants:
            for row in csv.reader(variants, delimiter='\t', quotechar='|'):
                if not row:
                    continue
                coords = row[0].split(',')
                if len(coords) < 4 or not coords[1].isdigit():
                    continue
                chrName = "chr" + coords[0]
                pos = safeReadInt(coords[1])
                searchResult = search(chrName, pos, refGene)
                # Identity test: None is a singleton, and "!= None" would
                # defer to any custom __ne__ defined on the result object.
                if searchResult is not None:
                    row = row + [searchResult]
                csvWriter.writerow(row)
def readRefGene(options):
    """Load the refGene table into per-chromosome lists of features.

    The tab-separated file ``options.refGene`` is read row by row; rows with
    fewer than 11 columns are ignored.  For each remaining row the function
    appends, to the list for the chromosome named in column 3:

    * a ``CodingRegionStart`` (built with ``options.startslack``) when the
      coding region is non-empty, and
    * for every exon, a 'start' and an 'end' boundary object built with
      ``options.spliceslack``.  A boundary inside the coding region is a
      ``CodingExonBoundary``; one outside it is a
      ``PartialCodingExonBoundary`` if the exon's other boundary is coding,
      otherwise a ``NonCodingExonBoundary``.

    All coordinates in the refGene file are 0-based start and 1-based end
    (transcript, CDS and exons alike), so start values get +1 here.
    See: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1

    Args:
        options: object providing ``refGene`` (path), ``startslack`` and
            ``spliceslack``.

    Returns:
        dict mapping chromosome name to the list of feature objects, in
        file order.
    """
    refGene = {}
    with open(options.refGene) as refs:
        for row in csv.reader(refs, delimiter='\t'):
            if len(row) < 11:
                continue
            # 'chrom' rather than 'chr' so the builtin is not shadowed.
            chrom = row[2]
            features = refGene.setdefault(chrom, [])
            direction = row[3]
            # The transcript bounds are not used below.
            # NOTE(review): safeReadInt is defined elsewhere; the calls are
            # kept in case it rejects malformed fields -- confirm, then drop.
            safeReadInt(row[4])
            safeReadInt(row[5])
            # add one to fix up zero-based start coordinate
            codingRegionStart = safeReadInt(row[6]) + 1
            codingRegionEnd = safeReadInt(row[7])
            # For non-coding genes (which, by definition, have a coding
            # region size of 0), cdsStart always equals cdsEnd in the
            # genePred format, so after the +1 above the start lies beyond
            # the end.  Only coding genes get a CodingRegionStart.
            if codingRegionStart <= codingRegionEnd:
                features.append(
                    CodingRegionStart(direction, options.startslack,
                                      codingRegionStart, codingRegionEnd))
            # Lists (not lazy map objects) so both columns are fully parsed
            # before any boundary is built, on Python 2 and 3 alike.
            exonStarts = [safeReadInt(field) + 1
                          for field in row[9].rstrip(',').split(',')]
            exonEnds = [safeReadInt(field)
                        for field in row[10].rstrip(',').split(',')]
            # If the gene is on the reverse strand, swap the start and end
            # coordinates.
            if direction == '-':
                exonStarts, exonEnds = exonEnds, exonStarts
            for start, end in zip(exonStarts, exonEnds):
                isStartCoding = isCoding(codingRegionStart, codingRegionEnd,
                                         start)
                isEndCoding = isCoding(codingRegionStart, codingRegionEnd,
                                       end)
                # Classify each boundary from its own coding status and that
                # of the exon's opposite boundary; 'start' is appended first.
                for ownCoding, otherCoding, position, label in (
                        (isStartCoding, isEndCoding, start, 'start'),
                        (isEndCoding, isStartCoding, end, 'end')):
                    if ownCoding:
                        boundaryClass = CodingExonBoundary
                    elif otherCoding:
                        boundaryClass = PartialCodingExonBoundary
                    else:
                        boundaryClass = NonCodingExonBoundary
                    features.append(
                        boundaryClass(options.spliceslack, direction,
                                      position, label))
    return refGene
 except getopt.GetoptError, err:
     print str(err)
     usage()
     sys.exit(2)
 options = Options()
 for o, a in opts:
     if o == "--variants":
         options.variants = a
     elif o == "--bin":
         options.bin = a
     elif o == "--keep":
         options.keep = a
     elif o == "--log":
         options.log = a
     elif o == "--varLikeThresh":
         options.varLikeThresh = safeReadInt(a)
     elif o == "--samplesPercent":
         options.samplesPercent = safeReadInt(a)
     elif o in ('-h', '--help'):
         usage()
         sys.exit(0)
 if not options.check():
     print('Incorrect arguments')
     usage()
     exit(2)
 bamFilenames = args
 # Read the rows of the variants TSV file into a list.
 with open(options.variants) as variants:
     variantList = list(csv.reader(variants, delimiter='\t', quotechar='|'))
 # compute the presence/absence of each variant in the bam files
 evidence = getEvidence(variantList, bamFilenames)
def main():
    """Parse the command line, load the refGene table and annotate variants.

    Recognised long options: --refGene, --variants, --spliceslack,
    --startslack and --output; -h/--help prints usage and exits with
    status 0.  An option-parsing error, or a failed ``options.check()``,
    prints usage and exits with status 2.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], shortOptionsFlags, longOptionsFlags)
    except getopt.GetoptError as err:
        # "as" and print(...) behave the same on Python 2.6+ and Python 3.
        print(str(err))
        usage()
        sys.exit(2)
    options = Options()
    for o, a in opts:
        if o == "--refGene":
            options.refGene = a
        elif o == "--variants":
            options.variants = a
        elif o == "--spliceslack":
            options.spliceslack = safeReadInt(a)
        elif o == "--startslack":
            options.startslack = safeReadInt(a)
        elif o == "--output":
            options.output = a
        elif o in ('-h', '--help'):
            usage()
            sys.exit(0)
    if not options.check():
        print('Incorrect arguments')
        usage()
        # sys.exit, not the site-provided exit(), which is not guaranteed
        # to exist (e.g. under "python -S").
        sys.exit(2)
    refGene = readRefGene(options)
    annotate(options, refGene)
 except getopt.GetoptError, err:
     print str(err)
     usage()
     sys.exit(2)
 options = Options()
 for o, a in opts:
     if o == "--variants":
         options.variants = a
     elif o == "--bin":
         options.bin = a
     elif o == "--keep":
         options.keep = a
     elif o == "--log":
         options.log = a
     elif o == "--varLikeThresh":
         options.varLikeThresh = safeReadInt(a)
     elif o == "--samplesPercent":
         options.samplesPercent = safeReadInt(a)
     elif o in ('-h', '--help'):
         usage()
         sys.exit(0)
 if not options.check():
     print('Incorrect arguments')
     usage()
     exit(2)
 bamFilenames = args
 # Read the rows of the variants TSV file into a list.
 with open(options.variants) as variants:
     variantList = list(csv.reader(variants, delimiter='\t', quotechar='|'))
 # compute the presence/absence of each variant in the bam files
 evidence = getEvidence(variantList, bamFilenames)
def readRefGene(options):
    """Load the refGene table into per-chromosome lists of features.

    The tab-separated file ``options.refGene`` is read row by row; rows with
    fewer than 11 columns are ignored.  For each remaining row the function
    appends, to the list for the chromosome named in column 3:

    * a ``CodingRegionStart`` (built with ``options.startslack``) when the
      coding region is non-empty, and
    * for every exon, a 'start' and an 'end' boundary object built with
      ``options.spliceslack``.  A boundary inside the coding region is a
      ``CodingExonBoundary``; one outside it is a
      ``PartialCodingExonBoundary`` if the exon's other boundary is coding,
      otherwise a ``NonCodingExonBoundary``.

    All coordinates in the refGene file are 0-based start and 1-based end
    (transcript, CDS and exons alike), so start values get +1 here.
    See: http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1

    Args:
        options: object providing ``refGene`` (path), ``startslack`` and
            ``spliceslack``.

    Returns:
        dict mapping chromosome name to the list of feature objects, in
        file order.
    """
    refGene = {}
    with open(options.refGene) as refs:
        for row in csv.reader(refs, delimiter='\t'):
            if len(row) < 11:
                continue
            # 'chrom' rather than 'chr' so the builtin is not shadowed.
            chrom = row[2]
            features = refGene.setdefault(chrom, [])
            direction = row[3]
            # The transcript bounds are not used below.
            # NOTE(review): safeReadInt is defined elsewhere; the calls are
            # kept in case it rejects malformed fields -- confirm, then drop.
            safeReadInt(row[4])
            safeReadInt(row[5])
            # add one to fix up zero-based start coordinate
            codingRegionStart = safeReadInt(row[6]) + 1
            codingRegionEnd = safeReadInt(row[7])
            # For non-coding genes (which, by definition, have a coding
            # region size of 0), cdsStart always equals cdsEnd in the
            # genePred format, so after the +1 above the start lies beyond
            # the end.  Only coding genes get a CodingRegionStart.
            if codingRegionStart <= codingRegionEnd:
                features.append(
                    CodingRegionStart(direction, options.startslack,
                                      codingRegionStart, codingRegionEnd))
            # Lists (not lazy map objects) so both columns are fully parsed
            # before any boundary is built, on Python 2 and 3 alike.
            exonStarts = [safeReadInt(field) + 1
                          for field in row[9].rstrip(',').split(',')]
            exonEnds = [safeReadInt(field)
                        for field in row[10].rstrip(',').split(',')]
            # If the gene is on the reverse strand, swap the start and end
            # coordinates.
            if direction == '-':
                exonStarts, exonEnds = exonEnds, exonStarts
            for start, end in zip(exonStarts, exonEnds):
                isStartCoding = isCoding(codingRegionStart, codingRegionEnd,
                                         start)
                isEndCoding = isCoding(codingRegionStart, codingRegionEnd,
                                       end)
                # Classify each boundary from its own coding status and that
                # of the exon's opposite boundary; 'start' is appended first.
                for ownCoding, otherCoding, position, label in (
                        (isStartCoding, isEndCoding, start, 'start'),
                        (isEndCoding, isStartCoding, end, 'end')):
                    if ownCoding:
                        boundaryClass = CodingExonBoundary
                    elif otherCoding:
                        boundaryClass = PartialCodingExonBoundary
                    else:
                        boundaryClass = NonCodingExonBoundary
                    features.append(
                        boundaryClass(options.spliceslack, direction,
                                      position, label))
    return refGene
def main():
    """Parse the command line, load the refGene table and annotate variants.

    Recognised long options: --refGene, --variants, --spliceslack,
    --startslack and --output; -h/--help prints usage and exits with
    status 0.  An option-parsing error, or a failed ``options.check()``,
    prints usage and exits with status 2.
    """
    try:
        opts, args = getopt.getopt(sys.argv[1:], shortOptionsFlags,
                                   longOptionsFlags)
    except getopt.GetoptError as err:
        # "as" and print(...) behave the same on Python 2.6+ and Python 3.
        print(str(err))
        usage()
        sys.exit(2)
    options = Options()
    for o, a in opts:
        if o == "--refGene":
            options.refGene = a
        elif o == "--variants":
            options.variants = a
        elif o == "--spliceslack":
            options.spliceslack = safeReadInt(a)
        elif o == "--startslack":
            options.startslack = safeReadInt(a)
        elif o == "--output":
            options.output = a
        elif o in ('-h', '--help'):
            usage()
            sys.exit(0)
    if not options.check():
        print('Incorrect arguments')
        usage()
        # sys.exit, not the site-provided exit(), which is not guaranteed
        # to exist (e.g. under "python -S").
        sys.exit(2)
    refGene = readRefGene(options)
    annotate(options, refGene)