Ejemplo n.º 1
0
def main():
    """Print one 12-column BED-style line per transcript of a GTF file.

    Command line: ``[-c] <gtf file>``. By default the blocks of each line
    come from the transcript's exons; with ``-c`` they come from its CDS
    features instead. Output goes to stdout, tab-separated.
    """
    usage = "usage: %prog [options] <gtf file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="cds",
        action="store_true",
        default=False,
        help="Use CDS, not exons [Default: %default]",
    )
    options, args = parser.parse_args()

    # parser.error() reports the problem and exits the program
    if len(args) != 1:
        parser.error("Must provide gtf file")
    gtf_file = args[0]

    transcripts = gff.read_genes(gtf_file)

    for transcript_id in transcripts:
        tx = transcripts[transcript_id]

        # CDS and exon output share one column layout; only the feature
        # list they are built from differs
        blocks = tx.cds if options.cds else tx.exons
        first_start = blocks[0].start

        sizes = [str(b.end - b.start + 1) for b in blocks]
        # block starts are offsets from the first block's start
        offsets = [str(b.start - first_start) for b in blocks]

        bed_cols = [
            tx.chrom,
            # start is shifted down by one (presumably 1-based GTF to
            # 0-based BED -- confirm against gff.read_genes)
            str(first_start - 1),
            str(blocks[-1].end),
            transcript_id,
            "0",
            tx.strand,
            "0",
            "0",
            "255,0,0",
            str(len(blocks)),
            ",".join(sizes),
            ",".join(offsets),
        ]
        print("\t".join(bed_cols))
Ejemplo n.º 2
0
def main():
    """Print one 12-column BED-style line per transcript of a GTF file.

    Command line: ``[-c] <gtf file>``. Exons define the blocks unless
    ``-c`` is given, in which case the CDS features are used. Lines are
    tab-separated and written to stdout.
    """
    parser = OptionParser('usage: %prog [options] <gtf file>')
    parser.add_option('-c', dest='cds', action='store_true', default=False,
                      help='Use CDS, not exons [Default: %default]')
    options, args = parser.parse_args()

    # parser.error() reports the problem and exits the program
    if len(args) != 1:
        parser.error('Must provide gtf file')
    gtf_file = args[0]

    transcripts = gff.read_genes(gtf_file)

    for transcript_id in transcripts:
        tx = transcripts[transcript_id]

        # pick the feature list once; the output columns are built the
        # same way for CDS and exons
        if options.cds:
            feats = tx.cds
        else:
            feats = tx.exons

        origin = feats[0].start
        feat_sizes = ','.join(str(f.end - f.start + 1) for f in feats)
        # block starts are offsets from the first feature's start
        feat_offsets = ','.join(str(f.start - origin) for f in feats)

        row = [tx.chrom, str(origin - 1), str(feats[-1].end)]
        row += [transcript_id, '0', tx.strand, '0', '0', '255,0,0']
        row += [str(len(feats)), feat_sizes, feat_offsets]

        print('\t'.join(row))
Ejemplo n.º 3
0
def get_tss(gtf_file, upstream, downstream):
    """Build sorted per-chromosome intervals around each gene's start site.

    Args:
        gtf_file: path handed to gff.read_genes.
        upstream: distance the interval extends upstream of the anchor.
        downstream: distance the interval extends downstream of the anchor.

    Returns:
        dict mapping chromosome to a sorted list of
        (start, end, strand, gene id) tuples.
    """
    by_chrom = {}

    genes = gff.read_genes(gtf_file)
    for gid in genes:
        gene = genes[gid]

        if gene.strand == '+':
            # anchor on the first exon's start
            anchor = gene.exons[0].start
            interval = (anchor - upstream, anchor + downstream, gene.strand, gid)
        else:
            # any other strand: anchor on the last exon's end, with the
            # upstream/downstream extents mirrored
            anchor = gene.exons[-1].end
            interval = (anchor - downstream, anchor + upstream, gene.strand, gid)

        if gene.chrom not in by_chrom:
            by_chrom[gene.chrom] = []
        by_chrom[gene.chrom].append(interval)

    for chrom_intervals in by_chrom.values():
        chrom_intervals.sort()

    return by_chrom
Ejemplo n.º 4
0
def get_tss(gtf_file, upstream, downstream):
    """Return sorted intervals around gene start sites, keyed by chromosome.

    Args:
        gtf_file: path handed to gff.read_genes.
        upstream: distance the interval extends upstream of the anchor.
        downstream: distance the interval extends downstream of the anchor.

    Returns:
        dict mapping chromosome to a sorted list of
        (start, end, strand, gene id) tuples.
    """
    genes = gff.read_genes(gtf_file)

    per_chrom = {}
    for gid in genes:
        gene = genes[gid]
        on_plus = gene.strand == '+'

        # '+' genes anchor on the first exon's start; every other strand
        # anchors on the last exon's end with the extents swapped
        anchor = gene.exons[0].start if on_plus else gene.exons[-1].end
        left = upstream if on_plus else downstream
        right = downstream if on_plus else upstream

        entry = (anchor - left, anchor + right, gene.strand, gid)
        per_chrom.setdefault(gene.chrom, []).append(entry)

    for chrom in per_chrom:
        per_chrom[chrom].sort()

    return per_chrom
Ejemplo n.º 5
0
def main():
    """Print one 'promoter' GTF line per gene of a reference GTF.

    Command line: ``[-d N] [-u N] [-f fpkm_tracking] <ref_gtf>``.

    For every gene a single transcript supplies the TSS: with ``-f`` it is
    the transcript with the largest geometric mean of (1 + FPKM), otherwise
    the transcript whose start lies furthest upstream. The promoter spans
    ``upstream`` bp before and ``downstream`` bp after that TSS, oriented by
    strand. Genes whose strand is not '+' or '-', or whose promoter would
    begin before position 1, are skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking',
                      help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the source column is copied from the first line of the reference;
    # the context manager closes the handle instead of leaking it
    with open(ref_gtf) as ref_in:
        source = ref_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])

        # the gene's strand is taken from its first listed transcript
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # a NaN running maximum is always replaced
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss: smallest start on '+', largest end on '-'
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    more_upstream = transcript_pos < tss
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    more_upstream = transcript_pos > tss

                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # orient the promoter window by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        # print promoter from the tss
        if promoter_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
        else:
            tx = transcripts[promoter_tid]
            cols = [tx.chrom, source, 'promoter', str(promoter_start), str(promoter_end),
                    '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
            print('\t'.join(cols))
Ejemplo n.º 6
0
def get_splice_intervals(gtf_file, window):
    """Collect windows centred on the splice sites of multi-exon genes.

    Args:
        gtf_file: path handed to gff.read_genes.
        window: total window width; each interval extends window/2 on
            either side of the splice site position.

    Returns:
        (intervals_5p, intervals_3p): two dicts mapping chromosome to a
        sorted, duplicate-free list of (start, end, strand) tuples for the
        5' (donor) and 3' (acceptor) splice sites respectively.
    """
    intervals_5p = {}
    intervals_3p = {}

    # NOTE(review): under Python 3 this is true division, so an int window
    # yields float bounds -- confirm which interpreter the callers use
    half_window = window / 2

    genes = gff.read_genes(gtf_file)
    for gid in genes:
        g = genes[gid]

        # single-exon genes have no splice sites
        if len(g.exons) < 2:
            continue

        if g.strand == '+':
            # transcription runs along increasing coordinates: every exon
            # but the last ends in a donor, every exon but the first
            # begins with an acceptor
            donors = [exon.end for exon in g.exons[:-1]]
            acceptors = [exon.start for exon in g.exons[1:]]
        else:
            # transcription runs along decreasing coordinates, so the
            # roles of exon start and end swap: the donor is at exon.start
            # and the acceptor at exon.end. exons[-1] is the first exon of
            # the transcript (its .end is the TSS, not a splice site) and
            # exons[0] is the last (its .start is the transcript end).
            donors = [exon.start for exon in g.exons[1:]]
            acceptors = [exon.end for exon in g.exons[:-1]]

        chrom_5p = intervals_5p.setdefault(g.chrom, set())
        for pos in donors:
            chrom_5p.add((pos - half_window, pos + half_window, g.strand))

        chrom_3p = intervals_3p.setdefault(g.chrom, set())
        for pos in acceptors:
            chrom_3p.add((pos - half_window, pos + half_window, g.strand))

    # convert sets to sorted lists; each dict is walked by its own keys
    for chrom in intervals_5p:
        intervals_5p[chrom] = sorted(intervals_5p[chrom])
    for chrom in intervals_3p:
        intervals_3p[chrom] = sorted(intervals_3p[chrom])

    return intervals_5p, intervals_3p
Ejemplo n.º 7
0
def compute_coverage(anchor_gff, event_files, mode, anchor_is_gtf, bins):
    """Accumulate multi-map-weighted event coverage over anchor features.

    Each event file is intersected with the anchors using intersectBed and
    every overlap adds 1/NH to the covered positions of its anchor, where
    NH is the read's multi-map count (1 for GFF events).

    Args:
        anchor_gff: anchor GFF/GTF path; passed to initialize_coverage,
            gff.read_genes and intersectBed.
        event_files: iterable of event file paths. Names ending in '.bam'
            are read with pysam; anything else is intersected as GFF
            (names not ending in '.gff' only trigger a warning).
        mode: passed through to initialize_coverage and find_inc_coords.
        anchor_is_gtf: if true, anchors are keyed by transcript_id and
            transcript structures are loaded; otherwise anchors are keyed
            by a (chrom, start, end) tuple.
        bins: passed through to initialize_coverage and find_inc_coords.

    Returns:
        (coverage, events): the structure from initialize_coverage with
        the increments applied, and the event total over all files
        (BAM reads weighted by 1/NH, paired reads counted as half).
    """
    ############################################
    # initialize
    ############################################
    coverage = initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins)

    if anchor_is_gtf:
        # get transcript structures
        transcripts = gff.read_genes(anchor_gff, key_id='transcript_id')

        # compute lengths as the sum of exon lengths
        transcript_lengths = {}
        for tid in transcripts:
            tx = transcripts[tid]
            for exon in tx.exons:
                transcript_lengths[tid] = transcript_lengths.get(tid, 0) + exon.end - exon.start + 1

    else:
        transcripts = None
        transcript_lengths = None

    events = 0
    for event_file in event_files:
        sys.stderr.write('Computing coverage for %s\n' % event_file)

        is_bam = event_file.endswith('.bam')

        ############################################
        # preprocess BAM/GFF
        ############################################
        if is_bam:
            # count fragments and hash multi-mappers
            multi_maps = {}
            bam_in = pysam.Samfile(event_file, 'rb')
            try:
                for aligned_read in bam_in:
                    try:
                        nh_tag = aligned_read.opt('NH')
                    except KeyError:
                        # no NH tag: treat the read as uniquely mapped
                        nh_tag = 1.0

                    # each mate of a pair contributes half a fragment
                    if aligned_read.is_paired:
                        events += 0.5 / nh_tag
                    else:
                        events += 1.0 / nh_tag

                    if nh_tag > 1:
                        multi_maps[aligned_read.qname] = nh_tag
            finally:
                bam_in.close()

        elif event_file.endswith('.gff'):
            # every line of the file counts as one event
            with open(event_file) as gff_in:
                for line in gff_in:
                    events += 1

        else:
            # best effort: warn, then still intersect the file as GFF below
            sys.stderr.write('Unknown event file format %s\n' % event_file)

        ############################################
        # intersect BAM w/ anchors
        ############################################
        # NOTE(review): paths are interpolated into a shell command, so
        # names with spaces or shell metacharacters will break it
        if is_bam:
            cmd = 'intersectBed -split -wo -bed -abam %s -b %s' % (event_file, anchor_gff)
            acol = 12  # column where the anchor record begins in the output
        else:
            cmd = 'intersectBed -s -wo -a %s -b %s' % (event_file, anchor_gff)
            acol = 9
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

        for line in p.stdout:
            a = line.split('\t')

            if is_bam:
                rstart = int(a[1]) + 1  # convert back to 1-based gff from bed
                rend = int(a[2])
                rheader = a[3]
            else:
                rstart = int(a[3])
                rend = int(a[4])

            # because intersectBed screws up indels near endpoints
            if rstart >= rend:
                continue

            achrom = a[acol]
            astart = int(a[acol + 3])
            aend = int(a[acol + 4])
            astrand = a[acol + 6]

            if anchor_is_gtf:
                anchor_id = gff.gtf_kv(a[acol + 8])['transcript_id']
            else:
                anchor_id = (achrom, astart, aend)

            # find where to increment
            inc_start, inc_end = find_inc_coords(anchor_id, astart, aend, astrand, rstart, rend,
                                                 mode, bins, transcripts, transcript_lengths)
            if inc_start is None:
                continue

            # find multi-map number, which may require removing a suffix
            mm = 1.0
            if is_bam:
                if rheader in multi_maps:
                    mm = multi_maps[rheader]
                elif '/' in rheader:
                    # only strip a suffix that is really there; slicing at
                    # rfind() == -1 would drop the last character and could
                    # match a different read's name
                    mm = multi_maps.get(rheader.rsplit('/', 1)[0], 1.0)

            # increment!
            for i in range(inc_start, inc_end):
                coverage[anchor_id][i] += 1.0 / mm

        # wait for intersectBed to exit
        p.communicate()

    return coverage, events
Ejemplo n.º 8
0
def main():
    """Write a GTF holding the reference exons plus unspliced pre-RNAs.

    Command line: ``[-m N] <ref_gtf> <prerna_gtf>``.

    Every exon of every reference transcript is written to ``prerna_gtf``.
    In addition, each distinct (chrom, start, end, strand) transcript span
    that is not already a single-exon transcript gets one extra 'exon' line
    covering the whole span, with transcript_id 'PRERNA<n>' and
    transcript_type 'prerna'. With ``-m``, the output is then intersected
    with itself and pre-RNAs overlapping more than that many other genes
    are removed.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_genes_overlapped', default=None, type='int',
                      help='Don\'t include isoforms that overlap more than this many genes [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    else:
        ref_gtf = args[0]
        prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash, so no pre-RNA is
    # emitted for a span that already exists unspliced
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            prerna_hash.add((tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand))

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start), str(exon.end),
                        '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas, once per distinct span
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                        '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets
        p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' % (prerna_gtf, prerna_gtf),
                             shell=True, stdout=subprocess.PIPE)

        # pre-RNA transcript id -> set of other gene ids it overlaps
        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)

        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        try:
            # filter into a temp gtf; wrapping the descriptor from mkstemp
            # means it is closed together with the file object
            with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        tid = gff.gtf_kv(a[8])['transcript_id']
                        if len(tx_overlaps.get(tid, [])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_tmp_file) as tmp_in:
                with open(prerna_gtf, 'w') as prerna_out:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # remove the temp file even if filtering failed part-way
            os.remove(prerna_tmp_file)
Ejemplo n.º 9
0
def main():
    """Write a GTF holding the reference exons plus unspliced pre-RNAs.

    Command line: ``[-m N] <ref_gtf> <prerna_gtf>``.

    Every exon of every reference transcript is written to ``prerna_gtf``.
    In addition, each distinct (chrom, start, end, strand) transcript span
    that is not already a single-exon transcript gets one extra 'exon' line
    covering the whole span, with transcript_id 'PRERNA<n>' and
    transcript_type 'prerna'. With ``-m``, the output is then intersected
    with itself and pre-RNAs overlapping more than that many other genes
    are removed.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option(
        '-m',
        dest='max_genes_overlapped',
        default=None,
        type='int',
        help=
        'Don\'t include isoforms that overlap more than this many genes [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    else:
        ref_gtf = args[0]
        prerna_gtf = args[1]

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash, so no pre-RNA is
    # emitted for a span that already exists unspliced
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            tx_key = (tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand)
            prerna_hash.add(tx_key)

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start),
                        str(exon.end), '.', tx.strand, '.',
                        gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas, once per distinct span
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start),
                        str(pre_end), '.', tx.strand, '.',
                        gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets
        p = subprocess.Popen('intersectBed -wo -s -a %s -b %s' %
                             (prerna_gtf, prerna_gtf),
                             shell=True,
                             stdout=subprocess.PIPE)

        # pre-RNA transcript id -> set of other gene ids it overlaps
        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)

        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        try:
            # filter into a temp gtf; wrapping the descriptor from mkstemp
            # means it is closed together with the file object
            with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        tid = kv['transcript_id']
                        if len(tx_overlaps.get(tid, [])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_tmp_file) as tmp_in:
                with open(prerna_gtf, 'w') as prerna_out:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # remove the temp file even if filtering failed part-way
            os.remove(prerna_tmp_file)
Ejemplo n.º 10
0
def compute_coverage(anchor_gff, event_files, mode, anchor_is_gtf, bins):
    """Accumulate multi-map-weighted event coverage over anchor features.

    Each event file is intersected with the anchors using intersectBed and
    every overlap adds 1/NH to the covered positions of its anchor, where
    NH is the read's multi-map count (1 for GFF events).

    Args:
        anchor_gff: anchor GFF/GTF path; passed to initialize_coverage,
            gff.read_genes and intersectBed.
        event_files: iterable of event file paths. Names ending in '.bam'
            are read with pysam; anything else is intersected as GFF
            (names not ending in '.gff' only trigger a warning).
        mode: passed through to initialize_coverage and find_inc_coords.
        anchor_is_gtf: if true, anchors are keyed by transcript_id and
            transcript structures are loaded; otherwise anchors are keyed
            by a 'chrom:start-end' string.
        bins: passed through to initialize_coverage and find_inc_coords.

    Returns:
        (coverage, events): the structure from initialize_coverage with
        the increments applied, and the event total over all files
        (BAM reads weighted by 1/NH, paired reads counted as half).
    """
    ############################################
    # initialize
    ############################################
    coverage = initialize_coverage(anchor_gff, mode, anchor_is_gtf, bins)

    if anchor_is_gtf:
        # get transcript structures
        transcripts = gff.read_genes(anchor_gff, key_id='transcript_id')

        # compute lengths as the sum of exon lengths
        transcript_lengths = {}
        for tid in transcripts:
            tx = transcripts[tid]
            for exon in tx.exons:
                transcript_lengths[tid] = transcript_lengths.get(
                    tid, 0) + exon.end - exon.start + 1

    else:
        transcripts = None
        transcript_lengths = None

    events = 0
    for event_file in event_files:
        sys.stderr.write('Computing coverage for %s in %s\n' %
                         (event_file, anchor_gff))

        is_bam = event_file.endswith('.bam')

        ############################################
        # preprocess BAM/GFF
        ############################################
        if is_bam:
            # count fragments and hash multi-mappers
            multi_maps = {}
            bam_in = pysam.Samfile(event_file, 'rb')
            try:
                for aligned_read in bam_in:
                    try:
                        nh_tag = aligned_read.opt('NH')
                    except KeyError:
                        # no NH tag: treat the read as uniquely mapped
                        nh_tag = 1.0

                    # each mate of a pair contributes half a fragment
                    if aligned_read.is_paired:
                        events += 0.5 / nh_tag
                    else:
                        events += 1.0 / nh_tag

                    if nh_tag > 1:
                        multi_maps[aligned_read.qname] = nh_tag
            finally:
                bam_in.close()

        elif event_file.endswith('.gff'):
            # every line of the file counts as one event
            with open(event_file) as gff_in:
                for line in gff_in:
                    events += 1

        else:
            # best effort: warn, then still intersect the file as GFF below
            sys.stderr.write('Unknown event file format %s\n' % event_file)

        ############################################
        # intersect BAM w/ anchors
        ############################################
        # NOTE(review): paths are interpolated into a shell command, so
        # names with spaces or shell metacharacters will break it
        if is_bam:
            cmd = 'intersectBed -split -wo -bed -abam %s -b %s' % (
                event_file, anchor_gff)
            acol = 12  # column where the anchor record begins in the output
        else:
            cmd = 'intersectBed -s -wo -a %s -b %s' % (event_file,
                                                       anchor_gff)
            acol = 9
        p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

        for line in p.stdout:
            a = line.split('\t')

            if is_bam:
                rstart = int(a[1]) + 1  # convert back to 1-based gff from bed
                rend = int(a[2])
                rheader = a[3]
            else:
                rstart = int(a[3])
                rend = int(a[4])

            # because intersectBed screws up indels near endpoints
            if rstart >= rend:
                continue

            achrom = a[acol]
            astart = int(a[acol + 3])
            aend = int(a[acol + 4])
            astrand = a[acol + 6]

            if anchor_is_gtf:
                anchor_id = gff.gtf_kv(a[acol + 8])['transcript_id']
            else:
                anchor_id = '%s:%d-%d' % (achrom, astart, aend)

            # find where to increment
            inc_start, inc_end = find_inc_coords(anchor_id, astart, aend,
                                                 astrand, rstart, rend,
                                                 mode, bins, transcripts,
                                                 transcript_lengths)
            if inc_start is None:
                continue

            # find multi-map number, which may require removing a suffix
            mm = 1.0
            if is_bam:
                if rheader in multi_maps:
                    mm = multi_maps[rheader]
                elif '/' in rheader:
                    # only strip a suffix that is really there; slicing at
                    # rfind() == -1 would drop the last character and could
                    # match a different read's name
                    mm = multi_maps.get(rheader.rsplit('/', 1)[0], 1.0)

            # increment!
            for i in range(inc_start, inc_end):
                coverage[anchor_id][i] += 1.0 / mm

        # wait for intersectBed to exit
        p.communicate()

    return coverage, events
Ejemplo n.º 11
0
def main():
    """Print one 'promoter' GTF line per gene of a reference GTF.

    Command line: ``[-d N] [-u N] [-f fpkm_tracking] <ref_gtf>``.

    For every gene a single transcript supplies the TSS: with ``-f`` it is
    the transcript with the largest geometric mean of (1 + FPKM), otherwise
    the transcript whose start lies furthest upstream. The promoter spans
    ``upstream`` bp before and ``downstream`` bp after that TSS, oriented by
    strand. Genes whose strand is not '+' or '-', or whose promoter would
    begin before position 1, are skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # the source column is copied from the first line of the reference;
    # the context manager closes the handle instead of leaking it
    with open(ref_gtf) as ref_in:
        source = ref_in.readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])

        # the gene's strand is taken from its first listed transcript
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            sys.stderr.write(
                'WARNING: %s discluded for lack of strand\n' % gene_id)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                # a NaN running maximum is always replaced
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss: smallest start on '+', largest end on '-'
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    more_upstream = transcript_pos < tss
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    more_upstream = transcript_pos > tss

                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # orient the promoter window by strand
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        # print promoter from the tss
        if promoter_start < 1:
            sys.stderr.write(
                'WARNING: %s discluded for nearness to chromosome end\n' %
                gene_id)
        else:
            tx = transcripts[promoter_tid]
            cols = [
                tx.chrom, source, 'promoter',
                str(promoter_start),
                str(promoter_end), '.', tx.strand, '.',
                gff.kv_gtf(tx.kv)
            ]
            print('\t'.join(cols))
Ejemplo n.º 12
0
def get_splice_intervals(gtf_file, window):
    """Collect windows centred on the splice sites of multi-exon genes.

    Args:
        gtf_file: path handed to gff.read_genes.
        window: total window width; each interval extends window/2 on
            either side of the splice site position.

    Returns:
        (intervals_5p, intervals_3p): two dicts mapping chromosome to a
        sorted, duplicate-free list of (start, end, strand) tuples for the
        5' (donor) and 3' (acceptor) splice sites respectively.
    """
    intervals_5p = {}
    intervals_3p = {}

    # NOTE(review): under Python 3 this is true division, so an int window
    # yields float bounds -- confirm which interpreter the callers use
    half_window = window / 2

    genes = gff.read_genes(gtf_file)
    for gid in genes:
        g = genes[gid]

        # single-exon genes have no splice sites
        if len(g.exons) < 2:
            continue

        if g.strand == "+":
            # transcription runs along increasing coordinates: every exon
            # but the last ends in a donor, every exon but the first
            # begins with an acceptor
            donors = [exon.end for exon in g.exons[:-1]]
            acceptors = [exon.start for exon in g.exons[1:]]
        else:
            # transcription runs along decreasing coordinates, so the
            # roles of exon start and end swap: the donor is at exon.start
            # and the acceptor at exon.end. exons[-1] is the first exon of
            # the transcript (its .end is the TSS, not a splice site) and
            # exons[0] is the last (its .start is the transcript end).
            donors = [exon.start for exon in g.exons[1:]]
            acceptors = [exon.end for exon in g.exons[:-1]]

        chrom_5p = intervals_5p.setdefault(g.chrom, set())
        for pos in donors:
            chrom_5p.add((pos - half_window, pos + half_window, g.strand))

        chrom_3p = intervals_3p.setdefault(g.chrom, set())
        for pos in acceptors:
            chrom_3p.add((pos - half_window, pos + half_window, g.strand))

    # convert sets to sorted lists; each dict is walked by its own keys
    for chrom in intervals_5p:
        intervals_5p[chrom] = sorted(intervals_5p[chrom])
    for chrom in intervals_3p:
        intervals_3p[chrom] = sorted(intervals_3p[chrom])

    return intervals_5p, intervals_3p