# Smallest 5'ss-to-3'-end distance (in bases) a split alignment must span to
# be reported; shorter gaps are skipped as likely insertion artefacts.
MIN_INTRON_SIZE = 1000

# Inputs: argv[1] is opened by pysam in 'rb' mode, argv[2] is handed to
# get_jxns, and argv[3] is read by load_genome and wrapped in Seq.
samfile = pysam.AlignmentFile(sys.argv[1], 'rb')
# ss is looked up below with (chrom, strand_str, five) keys and yields the
# 3' coordinate stored for that 5'ss.
ss = get_jxns(sys.argv[2])
seq = Seq(load_genome(open(sys.argv[3], 'r')))

for read in samfile.fetch():
    blocks = merge_blocks(read.get_blocks())
    # True when the read is read1 on the reverse strand, or not read1 on the
    # forward strand; used below as the plus-strand flag.
    strand = (read.is_read1 == read.is_reverse)
    if len(blocks) < 2:
        # A single aligned block has no gap to inspect.
        continue

    # Reference name and strand label are constant for the read, so they are
    # computed once instead of once per junction.
    strand_str = '+' if strand else '-'
    chrom = samfile.getrname(read.reference_id)
    if chrom[:3] == 'chr':
        chrom = chrom[3:]

    # Walk every pair of adjacent aligned blocks; the gap between them is a
    # candidate splice.
    for upstream, downstream in zip(blocks, blocks[1:]):
        if strand:
            five, three = upstream[1], downstream[0]
        else:
            three, five = upstream[1], downstream[0]

        # Some insertion events align to 5'ss and cause false positives
        # ... easily fixed by asserting a minimum recursive intron length.
        # The threshold is the MIN_INTRON_SIZE constant declared above; the
        # name previously used here was never defined in this script.
        if abs(five - three) < MIN_INTRON_SIZE:
            continue

        site = (chrom, strand_str, five)
        if site not in ss:
            continue
        intron_three = ss[site]

        # Keep only gaps whose 3' end falls short of the stored 3' end for
        # this 5'ss (in transcript direction). Columns 2 and 3 are
        # (five, three) on the plus flag and (three, five) otherwise.
        if strand:
            ends_early = three < intron_three
            first, second = five, three
        else:
            ends_early = three > intron_three
            first, second = three, five

        # seq.query is an external helper; presumably it tests the sequence
        # at `three` for a splice-site motif -- TODO confirm.
        if ends_early and seq.query(chrom, strand, three):
            # Single-argument print(...) is also valid as a Python 2 print
            # statement and emits the same text.
            print('\t'.join(map(str, [chrom, first, second, read.query_name,
                                      blocks, strand_str])))
strand_str = '+' if strand else '-' chrom = samfile.getrname(read.reference_id) if chrom[:3] == 'chr': chrom = chrom[3:] # The following two blocks are identical and handle + and - strands. if strand and read.pnext - read.pos > MAX_ALLOWED_INSERT: inner_left = read.get_blocks()[-1][1] # End of mate #1 inner_right = read.pnext # Start of mate #2 # [(five, three), ...] for all splices between read pair splices = jxns.search(chrom, strand_str, inner_left, inner_right) # check if consistent with any anno splice consistent = False for five, three in splices: if ALLOW_ANNO_AGGT and seq.query(chrom, strand, three): continue cassette = five + OVERHANG >= inner_left and three - OVERHANG <= inner_right if cassette and five - inner_left + inner_right - three < MAX_ALLOWED_INSERT: consistent = True if consistent: continue fives = {} for five, three in splices: if five not in fives: fives[five] = three fives[five] = max(three, fives[five]) # Only print if downstream end lies within an annotated intron # and 5'ss close to inner_left. for five, three in fives.items(): if five - read.pos < MAX_FIVE_INSERT and inner_right + OVERHANG < three: print '\t'.join(
strand_str = '+' if strand else '-' chrom = samfile.getrname(read.reference_id) if chrom[:3] == 'chr': chrom = chrom[3:] # The following two blocks are identical and handle + and - strands. if strand and read.pnext - read.pos > MAX_ALLOWED_INSERT: inner_left = read.get_blocks()[-1][1] # End of mate #1 inner_right = read.pnext # Start of mate #2 # [(five, three), ...] for all splices between read pair splices = jxns.search(chrom, strand_str, inner_left, inner_right) # check if consistent with any anno splice consistent = False for five, three in splices: if ALLOW_ANNO_AGGT and seq.query(chrom, strand, three): continue cassette = five + OVERHANG >= inner_left and three - OVERHANG <= inner_right if cassette and five - inner_left + inner_right - three < MAX_ALLOWED_INSERT: consistent = True if consistent: continue fives = {} for five, three in splices: if five not in fives: fives[five] = three fives[five] = max(three, fives[five]) # Only print if downstream end lies within an annotated intron # and 5'ss close to inner_left. for five, three in fives.items(): if five - read.pos < MAX_FIVE_INSERT and inner_right + OVERHANG < three: print '\t'.join(map(str, [chrom, inner_left, inner_right, five, sample, strand_str]))
# Examine the gap between each pair of adjacent aligned blocks of the read.
for upstream, downstream in zip(blocks, blocks[1:]):
    # The gap runs from the end of one block to the start of the next; which
    # side counts as the 5' end depends on the strand flag.
    gap_start, gap_end = upstream[1], downstream[0]
    if strand:
        five, three = gap_start, gap_end
    else:
        five, three = gap_end, gap_start

    # Some insertion events align to 5'ss and cause false positives
    # ... easily fixed by asserting a minimum recursive intron length.
    if abs(five - three) < MIN_RECURSIVE_INTRON_SIZE:
        continue

    # Reference name without a leading 'chr', plus the '+'/'-' strand label.
    chrom = samfile.getrname(read.reference_id)
    if chrom.startswith('chr'):
        chrom = chrom[3:]
    strand_str = '-' if not strand else '+'

    site = (chrom, strand_str, five)
    if site not in ss:
        continue
    intron_three = ss[site]

    # Only gaps whose 3' end stops short of the stored 3' end for this 5'ss
    # are of interest; seq.query is consulted only for those.
    if strand:
        ends_early = three < intron_three
    else:
        ends_early = three > intron_three
    if not (ends_early and seq.query(chrom, strand, three)):
        continue

    # Columns 2 and 3 are (five, three) on the plus flag, (three, five) otherwise.
    if strand:
        row = [chrom, five, three, read.query_name, blocks, strand_str]
    else:
        row = [chrom, three, five, read.query_name, blocks, strand_str]
    print('\t'.join(str(col) for col in row))