Example no. 1
0
             # tokens[10]/tokens[11] are comma-separated block sizes and
             # block starts -- presumably BED12 blockSizes/blockStarts
             # columns; confirm against the code that fills `tokens`.
             block_sizes = tokens[10].split(',')
             block_starts = tokens[11].split(',')
             # Handle trailing commas
             try:
                 int(block_sizes[-1])
             except ValueError:
                 block_sizes = block_sizes[:-1]
             try:
                 int(block_starts[-1])
             except ValueError:
                 block_starts = block_starts[:-1]
             block_count = len(block_sizes)
             assert block_count == len(block_starts)
             # Reconstruct the "true" sequence by concatenating each block
             # pulled from the reference; block starts are offsets relative
             # to chrom_start.
             true_seq = ''.join([reference_index.get_stretch(
                                     chrom,
                                     chrom_start + int(block_starts[i]),
                                     int(block_sizes[i])
                                 ) for i in xrange(block_count)]).upper()
             # tokens[5] looks like the BED strand column -- reverse-
             # complement the reference sequence for minus-strand reads.
             if tokens[5] == '-':
                 true_seq = true_seq[::-1].translate(
                         _reversed_complement_translation_table
                     )
             # Per-position tallies: truths[i] counts bases examined at
             # read position i; recalls[i] counts positions where the
             # observed base (seq) matches the reference base (true_seq).
             for i in xrange(args.length):
                 truths[i] += 1
                 if true_seq[i] == seq[i]:
                     recalls[i] += 1
             k += 1
 # Collapse per-position tallies into overall counts and report.
 overall_truths = sum(truths)
 overall_recalls = sum(recalls)
 print 'Read count: %d' % read_count
 print 'Overall error rate: %.12f' % (
Example no. 2
0
     # Walk consecutive exons of one transcript; each same-chromosome
     # neighbor pair implies an intervening intron.
     # NOTE(review): assumes exons_from_transcript holds (chrom, start,
     # end) tuples sorted by position -- confirm with caller.
     for i in xrange(1, len(exons_from_transcript)):
         if exons_from_transcript[i][0] == exons_from_transcript[i-1][0]:
             # Kill any introns 4 bases or smaller
             if (exons_from_transcript[i][1]
                 - exons_from_transcript[i-1][2]) < 5:
                 continue
             # Intron spans the gap between the previous exon's end and
             # this exon's start, excluding both exon boundary bases.
             intron = (exons_from_transcript[i][0],
                         exons_from_transcript[i-1][2] + 1,
                         exons_from_transcript[i][1] - 1)
             print '\t'.join((intron[0], str(intron[1]), str(intron[2])))
             introns.add(intron)
 # With a Bowtie 2 index available, tally intron lengths and the
 # dinucleotide motifs found at each intron's two ends.
 if args.bowtie2_idx is not None:
     for intron in introns:
         length = intron[2] - intron[1] + 1
         # First two and last two bases of the intron; the -1 offset
         # presumably converts the intron's 1-based coordinate to
         # get_stretch's 0-based one -- verify against reference_index.
         motif = (reference_index.get_stretch(intron[0],
                                                 intron[1] - 1,
                                                 2),
                  reference_index.get_stretch(intron[0],
                                                 intron[1]
                                                 + length - 3,
                                                 2))
         intron_lengths[length] += 1
         motif_counts[motif] += 1
     # Length histogram goes to stderr so stdout keeps the intron list.
     for length, frequency in sorted(intron_lengths.items()):
         print >>sys.stderr, '%d\t%d' % (length, frequency)
     # Splice-site motif classes: sense and antisense spellings of the
     # GT-AG, GC-AG, and AT-AC pairs, from most to least canonical.
     all_motifs = set([('GT', 'AG'), ('GC', 'AG'), ('AT', 'AC'),
                       ('CT', 'AC'), ('CT', 'GC'), ('GT', 'AT')])
     canonicals = set([('GT', 'AG'), ('CT', 'AC')])
     less_canonicals = set([('GC', 'AG'), ('CT', 'GC')])
     much_less_canonicals = set([('AT', 'AC'), ('GT', 'AT')])
     canonical = 0
Example no. 3
0
     # Recall that GTF is end-inclusive, and so is STAR's junctions.txt
     # Walk consecutive exons of one transcript; each same-chromosome
     # neighbor pair implies an intervening intron.
     # NOTE(review): assumes exons_from_transcript holds (chrom, start,
     # end) tuples sorted by position -- confirm with caller.
     for i in xrange(1, len(exons_from_transcript)):
         if exons_from_transcript[i][0] == exons_from_transcript[i - 1][0]:
             # Kill any introns 4 bases or smaller
             if (exons_from_transcript[i][1] -
                     exons_from_transcript[i - 1][2]) < 5:
                 continue
             # Intron spans the gap between the previous exon's end and
             # this exon's start, excluding both exon boundary bases.
             intron = (exons_from_transcript[i][0],
                       exons_from_transcript[i - 1][2] + 1,
                       exons_from_transcript[i][1] - 1)
             print '\t'.join((intron[0], str(intron[1]), str(intron[2])))
             introns.add(intron)
 # With a Bowtie 2 index available, tally intron lengths and the
 # dinucleotide motifs found at each intron's two ends.
 if args.bowtie2_idx is not None:
     for intron in introns:
         length = intron[2] - intron[1] + 1
         # First two and last two bases of the intron; the -1 offset
         # presumably converts the intron's 1-based coordinate to
         # get_stretch's 0-based one -- verify against reference_index.
         motif = (reference_index.get_stretch(intron[0], intron[1] - 1, 2),
                  reference_index.get_stretch(intron[0],
                                              intron[1] + length - 3, 2))
         intron_lengths[length] += 1
         motif_counts[motif] += 1
     # Length histogram goes to stderr so stdout keeps the intron list.
     for length, frequency in sorted(intron_lengths.items()):
         print >> sys.stderr, '%d\t%d' % (length, frequency)
     # Splice-site motif classes: sense and antisense spellings of the
     # GT-AG, GC-AG, and AT-AC pairs, from most to least canonical.
     all_motifs = set([('GT', 'AG'), ('GC', 'AG'), ('AT', 'AC'),
                       ('CT', 'AC'), ('CT', 'GC'), ('GT', 'AT')])
     canonicals = set([('GT', 'AG'), ('CT', 'AC')])
     less_canonicals = set([('GC', 'AG'), ('CT', 'GC')])
     much_less_canonicals = set([('AT', 'AC'), ('GT', 'AT')])
     canonical = 0
     less_canonical = 0
     much_less_canonical = 0
     one_off_other = 0
Example no. 4
0
             # tokens[10]/tokens[11] are comma-separated block sizes and
             # block starts -- presumably BED12 blockSizes/blockStarts
             # columns; confirm against the code that fills `tokens`.
             block_sizes = tokens[10].split(',')
             block_starts = tokens[11].split(',')
             # Handle trailing commas
             try:
                 int(block_sizes[-1])
             except ValueError:
                 block_sizes = block_sizes[:-1]
             try:
                 int(block_starts[-1])
             except ValueError:
                 block_starts = block_starts[:-1]
             block_count = len(block_sizes)
             assert block_count == len(block_starts)
             # Reconstruct the "true" sequence by concatenating each block
             # pulled from the reference; block starts are offsets relative
             # to chrom_start.
             true_seq = ''.join([
                 reference_index.get_stretch(
                     chrom, chrom_start + int(block_starts[i]),
                     int(block_sizes[i])) for i in xrange(block_count)
             ]).upper()
             # tokens[5] looks like the BED strand column -- reverse-
             # complement the reference sequence for minus-strand reads.
             if tokens[5] == '-':
                 true_seq = true_seq[::-1].translate(
                     _reversed_complement_translation_table)
             # Per-position tallies: truths[i] counts bases examined at
             # read position i; recalls[i] counts positions where the
             # observed base (seq) matches the reference base (true_seq).
             for i in xrange(args.length):
                 truths[i] += 1
                 if true_seq[i] == seq[i]:
                     recalls[i] += 1
             k += 1
 # Collapse per-position tallies into overall counts; error rate is
 # mismatched bases over total bases examined.
 overall_truths = sum(truths)
 overall_recalls = sum(recalls)
 print 'Read count: %d' % read_count
 print 'Overall error rate: %.12f' % (
     float(overall_truths - overall_recalls) / overall_truths)