block_sizes = tokens[10].split(',') block_starts = tokens[11].split(',') # Handle trailing commas try: int(block_sizes[-1]) except ValueError: block_sizes = block_sizes[:-1] try: int(block_starts[-1]) except ValueError: block_starts = block_starts[:-1] block_count = len(block_sizes) assert block_count == len(block_starts) true_seq = ''.join([reference_index.get_stretch( chrom, chrom_start + int(block_starts[i]), int(block_sizes[i]) ) for i in xrange(block_count)]).upper() if tokens[5] == '-': true_seq = true_seq[::-1].translate( _reversed_complement_translation_table ) for i in xrange(args.length): truths[i] += 1 if true_seq[i] == seq[i]: recalls[i] += 1 k += 1 overall_truths = sum(truths) overall_recalls = sum(recalls) print 'Read count: %d' % read_count print 'Overall error rate: %.12f' % (
# NOTE(review): this excerpt arrived with its line breaks and indentation
# collapsed, so it is laid out flat here. The exon-pair loop probably sits
# inside a per-transcript loop whose header is not visible, and everything
# from the length report down to `canonical = 0` is assumed to belong under
# the `args.bowtie2_idx` check -- confirm both when merging.

# Walk consecutive exon pairs and emit the gap between them as an intron.
# Exon records are indexed as [0] sequence name (presumably), [1] start and
# [2] end.
for i in xrange(1, len(exons_from_transcript)):
    previous_exon = exons_from_transcript[i - 1]
    current_exon = exons_from_transcript[i]
    if current_exon[0] != previous_exon[0]:
        # Only exons carrying the same first field are paired.
        continue
    # Skip the pair when the exon start is fewer than 5 positions past the
    # previous exon end, i.e. when the intron built below would be shorter
    # than 4 bases.
    # NOTE(review): the original comment said "Kill any introns 4 bases or
    # smaller", which would need `<= 5`; behavior is left as found --
    # confirm the intended cutoff.
    if current_exon[1] - previous_exon[2] < 5:
        continue
    # The intron covers every position strictly between the two exons.
    intron = (current_exon[0], previous_exon[2] + 1, current_exon[1] - 1)
    # Printed before the set insertion, so a repeated intron is printed again.
    print('\t'.join((intron[0], str(intron[1]), str(intron[2]))))
    introns.add(intron)

if args.bowtie2_idx is not None:
    # Tally the length and the two-base end motifs of every distinct intron.
    for intron in introns:
        length = intron[2] - intron[1] + 1
        # First two and last two bases of the intron; the offsets suggest
        # get_stretch takes a 0-based start and a base count -- TODO confirm.
        motif = (
            reference_index.get_stretch(intron[0], intron[1] - 1, 2),
            reference_index.get_stretch(intron[0], intron[2] - 2, 2),
        )
        intron_lengths[length] += 1
        motif_counts[motif] += 1

    # Length histogram goes to stderr, one "length<TAB>count" row per length.
    for length, frequency in sorted(intron_lengths.items()):
        print >>sys.stderr, '%d\t%d' % (length, frequency)

    # Motif classes; each class pairs a motif with its reverse-complement
    # spelling (GT..AG / CT..AC, GC..AG / CT..GC, AT..AC / GT..AT).
    canonicals = set([('GT', 'AG'), ('CT', 'AC')])
    less_canonicals = set([('GC', 'AG'), ('CT', 'GC')])
    much_less_canonicals = set([('AT', 'AC'), ('GT', 'AT')])
    all_motifs = canonicals | less_canonicals | much_less_canonicals
    canonical = 0
# NOTE(review): this excerpt arrived with its line breaks and indentation
# collapsed, so it is laid out flat here. The exon-pair loop probably sits
# inside a per-transcript loop whose header is not visible, and everything
# from the length report down to the counter reset is assumed to belong
# under the `args.bowtie2_idx` check -- confirm both when merging.

# Recall that GTF is end-inclusive, and so is STAR's junctions.txt, so the
# intron coordinates produced below are inclusive on both ends.
# Exon records are indexed as [0] sequence name (presumably), [1] start and
# [2] end.
for i in xrange(1, len(exons_from_transcript)):
    previous_exon = exons_from_transcript[i - 1]
    current_exon = exons_from_transcript[i]
    if current_exon[0] != previous_exon[0]:
        # Only exons carrying the same first field are paired.
        continue
    # Skip the pair when the exon start is fewer than 5 positions past the
    # previous exon end, i.e. when the intron built below would be shorter
    # than 4 bases.
    # NOTE(review): the original comment said "Kill any introns 4 bases or
    # smaller", which would need `<= 5`; behavior is left as found --
    # confirm the intended cutoff.
    if current_exon[1] - previous_exon[2] < 5:
        continue
    # The intron covers every position strictly between the two exons.
    intron = (current_exon[0], previous_exon[2] + 1, current_exon[1] - 1)
    # Printed before the set insertion, so a repeated intron is printed again.
    print('\t'.join((intron[0], str(intron[1]), str(intron[2]))))
    introns.add(intron)

if args.bowtie2_idx is not None:
    # Tally the length and the two-base end motifs of every distinct intron.
    for intron in introns:
        length = intron[2] - intron[1] + 1
        # First two and last two bases of the intron; the offsets suggest
        # get_stretch takes a 0-based start and a base count -- TODO confirm.
        motif = (
            reference_index.get_stretch(intron[0], intron[1] - 1, 2),
            reference_index.get_stretch(intron[0], intron[2] - 2, 2),
        )
        intron_lengths[length] += 1
        motif_counts[motif] += 1

    # Length histogram goes to stderr, one "length<TAB>count" row per length.
    for length, frequency in sorted(intron_lengths.items()):
        print >> sys.stderr, '%d\t%d' % (length, frequency)

    # Motif classes; each class pairs a motif with its reverse-complement
    # spelling (GT..AG / CT..AC, GC..AG / CT..GC, AT..AC / GT..AT).
    canonicals = set([('GT', 'AG'), ('CT', 'AC')])
    less_canonicals = set([('GC', 'AG'), ('CT', 'GC')])
    much_less_canonicals = set([('AT', 'AC'), ('GT', 'AT')])
    all_motifs = canonicals | less_canonicals | much_less_canonicals
    # Per-class counters, all starting from zero.
    canonical = less_canonical = much_less_canonical = one_off_other = 0
# NOTE(review): this excerpt arrived with its line breaks and indentation
# collapsed, so it is laid out flat here. The statements up to and including
# `k += 1` look like the body of a per-record loop whose header is not
# visible, and the summary after it looks like it runs once afterwards --
# restore the enclosing indentation (and confirm where `k += 1` belongs)
# when merging.

# Fields 11 and 12 of the record hold comma-separated block sizes and block
# start offsets (layout matches BED12 blockSizes/blockStarts -- TODO confirm).
block_sizes = tokens[10].split(',')
block_starts = tokens[11].split(',')
# A trailing comma leaves a final field that int() rejects; discard it.
try:
    int(block_sizes[-1])
except ValueError:
    block_sizes.pop()
try:
    int(block_starts[-1])
except ValueError:
    block_starts.pop()
block_count = len(block_sizes)
assert len(block_starts) == block_count

# Fetch the reference bases of every block (offsets are relative to
# chrom_start) and stitch them into one upper-case string.
true_seq = ''.join([
    reference_index.get_stretch(
        chrom,
        chrom_start + int(block_starts[i]),
        int(block_sizes[i])
    )
    for i in xrange(block_count)
]).upper()
# Records whose sixth field is '-' are reversed and mapped through the
# translation table.
if tokens[5] == '-':
    true_seq = true_seq[::-1].translate(
        _reversed_complement_translation_table
    )

# Per-position tally: truths counts every compared position, recalls only
# the positions where the record's sequence agrees with the reference.
for i in xrange(args.length):
    truths[i] += 1
    if true_seq[i] != seq[i]:
        continue
    recalls[i] += 1
k += 1

# Summary over all positions.
overall_truths = sum(truths)
overall_recalls = sum(recalls)
print('Read count: %d' % read_count)
# NOTE(review): raises ZeroDivisionError when nothing was tallied
# (overall_truths == 0).
print('Overall error rate: %.12f' % (
    float(overall_truths - overall_recalls) / overall_truths))