コード例 #1
0
ファイル: gtf_span.py プロジェクト: yuzhenpeng/snakeflow
def main():
    """Print one spanning 'gene' GTF record per gene_id found in a GTF file.

    The GTF path is read from the command line. Lines are grouped by gene_id
    and, for every gene, a single record is written to stdout running from the
    smallest start to the largest end among that gene's lines. Chromosome,
    strand and gene_id come from the gene's first line; the transcript_id
    attribute is the comma-joined, sorted set of the gene's transcript ids.

    gene_id and transcript_id are read positionally (whitespace-separated
    fields 9 and 11), so they are expected to be the first two attributes on
    every line.
    """
    usage = 'usage: %prog [options] <gtf_file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # report a usage error instead of failing with an IndexError
    if not args:
        parser.error('Must provide GTF file')
    gtf_file = args[0]

    # gene_id -> raw lines of that gene, in file order
    genes = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # blank and comment lines have no attribute fields to index
            if not line.strip() or line.startswith('#'):
                continue
            # [1:-2] drops the opening quote and the closing '";'
            gene_id = line.split()[9][1:-2]
            genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        gene_lines = genes[gene_id]

        # split every line once and reuse the fields below
        records = [gene_line.split() for gene_line in gene_lines]
        start = min(int(rec[3]) for rec in records)
        end = max(int(rec[4]) for rec in records)

        a = gene_lines[0].split('\t')
        kv = gff.gtf_kv(a[8])

        # sorted so the attribute is reproducible from run to run
        transcript_ids = sorted(set(rec[11][1:-2] for rec in records))
        succinct_kv = {
            'gene_id': kv['gene_id'],
            'transcript_id': ','.join(transcript_ids),
        }

        d = [
            a[0], 'gtf', 'gene',
            str(start),
            str(end), '.', a[6], '.',
            gff.kv_gtf(succinct_kv)
        ]
        print('\t'.join(d))
コード例 #2
0
ファイル: rm2gff.py プロジェクト: BioXiao/utility
def main():
    """Convert a RepeatMasker .out file to GFF lines on stdout.

    The single command-line argument is the .out file; a name ending in 'gz'
    is opened with gzip. The first three lines are treated as header and
    skipped. Every remaining non-blank line is split on whitespace and printed
    as a 'repeat' feature: field 4 is the sequence name, fields 5 and 6 the
    start and end, field 8 the strand (anything other than '+' becomes '-'),
    and fields 9 and 10 the 'repeat' and 'family' attributes.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')

    rm_file = args[0]
    if rm_file[-2:] == 'gz':
        rm_in = gzip.open(rm_file)
    else:
        rm_in = open(rm_file)

    try:
        # read four lines: three header lines are discarded and the fourth
        # is the first record
        line = None
        for _ in range(4):
            line = rm_in.readline()

        while line:
            a = line.split()

            # a blank line has no fields; skip it instead of raising IndexError
            if a:
                strand = '+' if a[8] == '+' else '-'
                cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.', gff.kv_gtf({'repeat':a[9], 'family':a[10]}))
                print('\t'.join(cols))

            line = rm_in.readline()
    finally:
        # release the handle even if a malformed line aborts the loop
        rm_in.close()
コード例 #3
0
ファイル: gtf_span.py プロジェクト: davek44/utility
def main():
    """Print one spanning "gene" GTF record per gene_id found in a GTF file.

    The GTF path is read from the command line. Lines are grouped by gene_id
    and, for every gene, a single record is written to stdout running from the
    smallest start to the largest end among that gene's lines. Chromosome,
    strand and gene_id come from the gene's first line; the transcript_id
    attribute is the comma-joined, sorted set of the gene's transcript ids.

    gene_id and transcript_id are read positionally (whitespace-separated
    fields 9 and 11), so they are expected to be the first two attributes on
    every line.
    """
    usage = "usage: %prog [options] <gtf_file>"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    # report a usage error instead of failing with an IndexError
    if not args:
        parser.error("Must provide GTF file")
    gtf_file = args[0]

    # gene_id -> raw lines of that gene, in file order
    genes = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            # blank and comment lines have no attribute fields to index
            if not line.strip() or line.startswith("#"):
                continue
            # [1:-2] drops the opening quote and the closing '";'
            gene_id = line.split()[9][1:-2]
            genes.setdefault(gene_id, []).append(line)

    for gene_id in genes:
        gene_lines = genes[gene_id]

        # split every line once and reuse the fields below
        records = [gene_line.split() for gene_line in gene_lines]
        start = min(int(rec[3]) for rec in records)
        end = max(int(rec[4]) for rec in records)

        a = gene_lines[0].split("\t")
        kv = gff.gtf_kv(a[8])

        # sorted so the attribute is reproducible from run to run
        transcript_ids = sorted(set(rec[11][1:-2] for rec in records))
        succinct_kv = {"gene_id": kv["gene_id"], "transcript_id": ",".join(transcript_ids)}

        d = [a[0], "gtf", "gene", str(start), str(end), ".", a[6], ".", gff.kv_gtf(succinct_kv)]
        print("\t".join(d))
コード例 #4
0
def gff_line(a):
    """Format one whitespace-split RepeatMasker record as a GFF line.

    a: sequence of fields; index 4 is used as the sequence name, 5 and 6 as
       start and end, 8 as the strand, 9 and 10 as the 'repeat' and 'family'
       attribute values.

    Returns the nine GFF columns joined by tabs (no trailing newline).
    """
    # a strand code of 'C' is written as '-'; any other value passes through
    strand = '-' if a[8] == 'C' else a[8]

    attributes = gff.kv_gtf({'repeat': a[9], 'family': a[10]})
    fields = [a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.', attributes]
    return '\t'.join(fields)
コード例 #5
0
ファイル: rm2gff.py プロジェクト: yuzhenpeng/snakeflow
def main():
    """Convert a RepeatMasker .out file to GFF lines on stdout.

    The single command-line argument is the .out file; a name ending in 'gz'
    is opened with gzip. The first three lines are treated as header and
    skipped. Every remaining non-blank line is split on whitespace and printed
    as a 'repeat' feature: field 4 is the sequence name, fields 5 and 6 the
    start and end, field 8 the strand (anything other than '+' becomes '-'),
    and fields 9 and 10 the 'repeat' and 'family' attributes.
    """
    usage = 'usage: %prog [options] <rm out>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide RepeatMasker .out file')

    rm_file = args[0]
    if rm_file[-2:] == 'gz':
        rm_in = gzip.open(rm_file)
    else:
        rm_in = open(rm_file)

    try:
        # read four lines: three header lines are discarded and the fourth
        # is the first record
        line = None
        for _ in range(4):
            line = rm_in.readline()

        while line:
            a = line.split()

            # a blank line has no fields; skip it instead of raising IndexError
            if a:
                strand = '+' if a[8] == '+' else '-'
                cols = (a[4], 'RepeatMasker', 'repeat', a[5], a[6], '.', strand, '.',
                        gff.kv_gtf({
                            'repeat': a[9],
                            'family': a[10]
                        }))
                print('\t'.join(cols))

            line = rm_in.readline()
    finally:
        # release the handle even if a malformed line aborts the loop
        rm_in.close()
コード例 #6
0
ファイル: clip_peaks.py プロジェクト: hjanime/CLIP-Seq
def span_gtf(ref_gtf, out_dir):
    """Write '<out_dir>/span.gtf' with one exon record per gene region.

    ref_gtf: path of the reference GTF, read via read_genes().
    out_dir: directory in which span.gtf is created.

    Each region returned by get_gene_regions() is indexed as
    (chrom, start, end, strand) and written as a single 'exon' line whose
    gene_id and transcript_id are both the region's key.

    Returns the path of the file written.
    """
    # obtain gene regions
    transcripts = read_genes(ref_gtf, key_id='transcript_id')
    gene_regions = get_gene_regions(transcripts)

    span_ref_gtf = '%s/span.gtf' % out_dir

    # the with-block closes the file even if formatting a region fails
    with open(span_ref_gtf, 'w') as span_ref_open:
        for gid in gene_regions:
            g = gene_regions[gid]
            cols = [g[0], 'clip_peaks', 'exon', str(g[1]), str(g[2]), '.', g[3], '.', gff.kv_gtf({'gene_id':gid, 'transcript_id':gid})]
            span_ref_open.write('\t'.join(cols) + '\n')

    return span_ref_gtf
コード例 #7
0
ファイル: gtf2prom_bad.py プロジェクト: yuzhenpeng/snakeflow
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length,
                acgt_t):
    """Write promoter sequences and GFF records for one chromosome.

    chrom: chromosome name placed in the first GFF column.
    seq: chromosome sequence, sliced with prom.start as a 0-based offset.
    promoters: iterable of objects with .start, .strand and .gtf_kv.
    out_fa, out_gff: writable file objects for the FASTA and GFF output.
    promoter_length: number of bases taken starting at prom.start.
    acgt_t: a promoter is written only if acgt_pct() of its sequence is
            greater than this threshold.
    """
    for prom in promoters:
        window_end = prom.start + promoter_length

        # same window on both strands; non-'+' strands go through dna.rc()
        prom_seq = seq[prom.start:window_end]
        if prom.strand != '+':
            prom_seq = dna.rc(prom_seq)

        if not (acgt_pct(prom_seq) > acgt_t):
            continue

        fasta_entry = '>%s\n%s' % (prom.gtf_kv['transcript_id'], prom_seq)
        out_fa.write(fasta_entry + '\n')

        # GFF start is the 0-based offset plus one; the end equals the
        # exclusive end of the slice
        gff_dat = [
            chrom, '.', 'promoter',
            str(prom.start + 1),
            str(window_end), '.', prom.strand,
            '.',
            gff.kv_gtf(prom.gtf_kv)
        ]
        out_gff.write('\t'.join(gff_dat) + '\n')
コード例 #8
0
ファイル: promoters.py プロジェクト: davek44/utility
def main():
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    parser.add_option('-d', dest='downstream', type='int', default=1000, help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking', help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000, help='Upstream bp for promoters [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)
    source = open(ref_gtf).readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+','-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                upstream_tss = transcripts[promoter_tid].exons[0].start
            else:
                upstream_tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    if transcript_pos < upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    if transcript_pos > upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos

            tss = upstream_tss

        # print promoter from the tss
        if gene_strand == '+':
            if tss - options.upstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.upstream), str(tss+options.downstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))

        else:
            if tss - options.downstream < 1:
                print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
            else:
                tx = transcripts[promoter_tid]
                cols = [tx.chrom, source, 'promoter', str(tss-options.downstream), str(tss+options.upstream), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
                print('\t'.join(cols))
コード例 #9
0
ファイル: cuffify_gtf.py プロジェクト: BioXiao/utility
def main():
    """Filter a GENCODE GTF, add cuffcompare ids, and emit it through sortBed.

    Steps, each staged through a temporary file:
      1. keep only 'exon' lines whose transcript_type is not in the small RNA
         set ('##' lines are ignored);
      2. drop transcripts whose summed exon length is below -l;
      3. run cuffcompare on the result against itself and read tss_id / p_id
         from 'cuffcmp.combined.gtf', keyed by oId;
      4. add those ids to every remaining line and run sortBed on the file
         (its output goes to this process's stdout).

    cuffcompare and sortBed are invoked through the shell, and cuffcompare is
    given $HG19/sequence/hg19.fa, so both tools and $HG19 must be available.
    The temporary files and the cuffcmp.* outputs in the working directory
    are removed on exit, including after a failure.
    """
    usage = 'usage: %prog [options] <gencode_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-l', dest='min_transcript_length', default=50, type='int')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide GENCODE GTF')
    full_gtf = args[0]

    small_rnas = set(['miRNA','misc_RNA','snRNA','snoRNA','rRNA','Mt_rRNA'])

    # every path appended here is deleted in the finally clause
    cleanup = []
    try:
        ############################################################
        # remove small rna (and non-exon)
        ############################################################
        sansrna_gtf_fd, sansrna_gtf_file = tempfile.mkstemp()
        cleanup.append(sansrna_gtf_file)

        # wrap the mkstemp descriptor directly rather than opening a second
        # handle on the same path
        with os.fdopen(sansrna_gtf_fd, 'w') as sansrna_gtf_out:
            with open(full_gtf) as full_gtf_in:
                for line in full_gtf_in:
                    # ignore header
                    if line[:2] == '##':
                        continue

                    a = line.split('\t')
                    if a[2] == 'exon':
                        kv = gff.gtf_kv(a[8])
                        if kv['transcript_type'] not in small_rnas:
                            sansrna_gtf_out.write(line)

        ############################################################
        # remove tiny (unestimatable) transcripts
        ############################################################
        transcript_lengths = {}
        with open(sansrna_gtf_file) as sansrna_gtf_in:
            for line in sansrna_gtf_in:
                a = line.split('\t')
                if a[2] == 'exon':
                    transcript_id = gff.gtf_kv(a[8])['transcript_id']
                    # GTF coordinates are inclusive, hence the +1
                    transcript_lengths[transcript_id] = transcript_lengths.get(transcript_id,0) + int(a[4])-int(a[3])+1

        sanstiny_gtf_fd, sanstiny_gtf_file = tempfile.mkstemp()
        cleanup.append(sanstiny_gtf_file)

        with os.fdopen(sanstiny_gtf_fd, 'w') as sanstiny_gtf_out:
            with open(sansrna_gtf_file) as sansrna_gtf_in:
                for line in sansrna_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if transcript_lengths[kv['transcript_id']] >= options.min_transcript_length:
                        sanstiny_gtf_out.write(line)

        ############################################################
        # run cuffcompare to get id's
        ############################################################
        # registered only now so that an earlier failure cannot delete
        # cuffcmp.* files left in the directory by someone else
        cleanup.extend(['cuffcmp.tracking', 'cuffcmp.loci', 'cuffcmp.combined.gtf', 'cuffcmp.stats'])

        # shell=True is needed for $HG19 expansion; the interpolated paths
        # are mkstemp-generated names
        subprocess.call('cuffcompare -s $HG19/sequence/hg19.fa -CG -r %s %s' % (sanstiny_gtf_file, sanstiny_gtf_file), shell=True)

        # hash id's by oId
        tss_id = {}
        p_id = {}
        with open('cuffcmp.combined.gtf') as combined_in:
            for line in combined_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])

                tss_id[kv['oId']] = kv['tss_id']
                if 'p_id' in kv:
                    p_id[kv['oId']] = kv['p_id']

        ############################################################
        # add id's and print
        ############################################################
        unsorted_gtf_fd, unsorted_gtf_file = tempfile.mkstemp()
        cleanup.append(unsorted_gtf_file)

        with os.fdopen(unsorted_gtf_fd, 'w') as unsorted_gtf_out:
            with open(sanstiny_gtf_file) as sanstiny_gtf_in:
                for line in sanstiny_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])

                    kv['tss_id'] = tss_id[kv['transcript_id']]
                    if kv['transcript_id'] in p_id:
                        kv['p_id'] = p_id[kv['transcript_id']]

                    a[8] = gff.kv_gtf(kv)
                    unsorted_gtf_out.write('\t'.join(a) + '\n')

        ############################################################
        # might as well sort it!
        ############################################################
        subprocess.call('sortBed -i %s' % unsorted_gtf_file, shell=True)

    finally:
        ############################################################
        # clean
        ############################################################
        for path in cleanup:
            if os.path.exists(path):
                os.remove(path)
コード例 #10
0
ファイル: gtf2prom_bad.py プロジェクト: BioXiao/utility
def process_chr(chrom, seq, promoters, out_fa, out_gff, promoter_length, acgt_t):
    """Write promoter sequences and GFF records for one chromosome.

    chrom: chromosome name placed in the first GFF column.
    seq: chromosome sequence, sliced with prom.start as a 0-based offset.
    promoters: iterable of objects with .start, .strand and .gtf_kv.
    out_fa, out_gff: writable file objects for the FASTA and GFF output.
    promoter_length: number of bases taken starting at prom.start.
    acgt_t: a promoter is written only if acgt_pct() of its sequence is
            greater than this threshold.
    """
    for prom in promoters:
        window_end = prom.start + promoter_length

        # same window on both strands; non-'+' strands go through dna.rc()
        prom_seq = seq[prom.start:window_end]
        if prom.strand != '+':
            prom_seq = dna.rc(prom_seq)

        if not (acgt_pct(prom_seq) > acgt_t):
            continue

        fasta_entry = '>%s\n%s' % (prom.gtf_kv['transcript_id'], prom_seq)
        out_fa.write(fasta_entry + '\n')

        # GFF start is the 0-based offset plus one; the end equals the
        # exclusive end of the slice
        gff_dat = [chrom, '.', 'promoter', str(prom.start + 1), str(window_end), '.', prom.strand, '.', gff.kv_gtf(prom.gtf_kv)]
        out_gff.write('\t'.join(gff_dat) + '\n')
コード例 #11
0
ファイル: clean_gtf.py プロジェクト: yuzhenpeng/snakeflow
def main():
    """Re-derive gene ids for a GTF from transcript overlap and print it.

    1. 'NM_' transcripts that appear on more than one chromosome get their
       transcript_id suffixed with 'c' plus the chromosome name minus its
       first three characters; the revised file is staged in 'tmp.gtf'.
    2. intersectBed (-f 0.2 -r -s) is run on tmp.gtf against itself and each
       reported pair of transcripts becomes an edge in a graph.
    3. Every connected component becomes one gene, named 'G' plus the id of
       one of its transcripts; any transcript left without a 'G'-prefixed
       gene gets 'G' plus its own id.
    4. tmp.gtf is re-read and printed to stdout with the new gene_id.

    'tmp.gtf' is created in the working directory and removed on exit,
    including after a failure.
    """
    usage = 'usage: %prog [options] <gtf file>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    gtf_file = args[0]

    ############################################
    # fix multi-chromosome genes
    ############################################
    # find multi-chromosome genes
    tx_chrs = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if kv['transcript_id'].startswith('NM_'):
                tx_chrs.setdefault(kv['transcript_id'], set()).add(a[0])
    multi_genes = set(tid for tid in tx_chrs if len(tx_chrs[tid]) > 1)

    try:
        # revise gtf
        tx_gene = {}
        with open('tmp.gtf', 'w') as gtf_out:
            with open(gtf_file) as gtf_in:
                for line in gtf_in:
                    a = line.split('\t')
                    a[-1] = a[-1].rstrip()
                    kv = gff.gtf_kv(a[8])

                    # if multi-chrom gene, supplement id's
                    if kv['transcript_id'] in multi_genes:
                        kv['transcript_id'] += 'c%s' % a[0][3:]
                        a[8] = gff.kv_gtf(kv)

                    # map trans to gene (forget the actual gene id's; they don't consider "_dup")
                    tx_gene[kv['transcript_id']] = kv['transcript_id']

                    # print new line
                    gtf_out.write('\t'.join(a) + '\n')

        ############################################
        # merge transcripts into genes
        ############################################
        # intersect and build overlapping transcript graph
        G = networkx.Graph()
        p = subprocess.Popen('intersectBed -f 0.2 -r -wo -s -a tmp.gtf -b tmp.gtf',
                             shell=True,
                             stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split('\t')
            # columns 8 and 17 are the attribute fields of the two features
            tid1 = gff.gtf_kv(a[8])['transcript_id']
            tid2 = gff.gtf_kv(a[17])['transcript_id']
            G.add_edge(tid1, tid2)
        p.communicate()

        # combine connected components as genes
        for component in networkx.connected_components(G):
            # components may be sets, which cannot be indexed; materialize first
            members = list(component)
            comp_gene = 'G' + tx_gene[members[0]]
            for tid in members:
                tx_gene[tid] = comp_gene
        for tid in tx_gene:
            if not tx_gene[tid].startswith('G'):
                tx_gene[tid] = 'G' + tx_gene[tid]

        ############################################
        # output
        ############################################
        with open('tmp.gtf') as tmp_in:
            for line in tmp_in:
                a = line.split('\t')
                a[-1] = a[-1].rstrip()

                kv = gff.gtf_kv(a[8])
                kv['gene_id'] = tx_gene[kv['transcript_id']]
                a[8] = gff.kv_gtf(kv)

                print('\t'.join(a))

    finally:
        # clean
        if os.path.exists('tmp.gtf'):
            os.remove('tmp.gtf')
コード例 #12
0
ファイル: gtf_add_prerna.py プロジェクト: BioXiao/utility
def main():
    """Write a GTF holding every reference exon plus one pre-RNA span per transcript.

    Arguments are the reference GTF and the output GTF path. For each
    transcript all exons are written, followed by a single 'exon' record
    spanning first exon start to last exon end with transcript_id
    'PRERNA<n>' and transcript_type 'prerna'. A span is skipped when the same
    (chrom, start, end, strand) was already emitted or matches a single-exon
    transcript.

    With -m, intersectBed is run on the output against itself and any PRERNA
    transcript overlapping more than that many other genes is removed from
    the output file.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-m', dest='max_genes_overlapped', default=None, type='int', help='Don\'t include isoforms that overlap more than this many genes [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    ref_gtf, prerna_gtf = args

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            prerna_hash.add((tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand))

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start), str(exon.end), '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end), '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets; an argument list
        # (no shell) keeps paths with spaces or metacharacters intact
        p = subprocess.Popen(['intersectBed', '-wo', '-s', '-a', prerna_gtf, '-b', prerna_gtf], stdout=subprocess.PIPE)

        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1,set()).add(gid2)

        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        try:
            # filter into a temp gtf
            with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        tid = kv['transcript_id']
                        if len(tx_overlaps.get(tid,[])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_tmp_file) as tmp_in:
                with open(prerna_gtf, 'w') as prerna_out:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # the temp file goes away even if filtering fails
            os.remove(prerna_tmp_file)
コード例 #13
0
def main():
    """Filter a GENCODE GTF, add cuffcompare ids, and emit it through sortBed.

    Steps, each staged through a temporary file:
      1. keep only 'exon' lines whose transcript_type is not in the small RNA
         set ('##' lines are ignored);
      2. drop transcripts whose summed exon length is below -l;
      3. run cuffcompare on the result against itself and read tss_id / p_id
         from 'cuffcmp.combined.gtf', keyed by oId;
      4. add those ids to every remaining line and run sortBed on the file
         (its output goes to this process's stdout).

    cuffcompare and sortBed are invoked through the shell, and cuffcompare is
    given $HG19/sequence/hg19.fa, so both tools and $HG19 must be available.
    The temporary files and the cuffcmp.* outputs in the working directory
    are removed on exit, including after a failure.
    """
    usage = 'usage: %prog [options] <gencode_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-l',
                      dest='min_transcript_length',
                      default=50,
                      type='int')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide GENCODE GTF')
    full_gtf = args[0]

    small_rnas = set(
        ['miRNA', 'misc_RNA', 'snRNA', 'snoRNA', 'rRNA', 'Mt_rRNA'])

    # every path appended here is deleted in the finally clause
    cleanup = []
    try:
        ############################################################
        # remove small rna (and non-exon)
        ############################################################
        sansrna_gtf_fd, sansrna_gtf_file = tempfile.mkstemp()
        cleanup.append(sansrna_gtf_file)

        # wrap the mkstemp descriptor directly rather than opening a second
        # handle on the same path
        with os.fdopen(sansrna_gtf_fd, 'w') as sansrna_gtf_out:
            with open(full_gtf) as full_gtf_in:
                for line in full_gtf_in:
                    # ignore header
                    if line[:2] == '##':
                        continue

                    a = line.split('\t')
                    if a[2] == 'exon':
                        kv = gff.gtf_kv(a[8])
                        if kv['transcript_type'] not in small_rnas:
                            sansrna_gtf_out.write(line)

        ############################################################
        # remove tiny (unestimatable) transcripts
        ############################################################
        transcript_lengths = {}
        with open(sansrna_gtf_file) as sansrna_gtf_in:
            for line in sansrna_gtf_in:
                a = line.split('\t')
                if a[2] == 'exon':
                    transcript_id = gff.gtf_kv(a[8])['transcript_id']
                    # GTF coordinates are inclusive, hence the +1
                    transcript_lengths[transcript_id] = transcript_lengths.get(
                        transcript_id, 0) + int(a[4]) - int(a[3]) + 1

        sanstiny_gtf_fd, sanstiny_gtf_file = tempfile.mkstemp()
        cleanup.append(sanstiny_gtf_file)

        with os.fdopen(sanstiny_gtf_fd, 'w') as sanstiny_gtf_out:
            with open(sansrna_gtf_file) as sansrna_gtf_in:
                for line in sansrna_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if transcript_lengths[
                            kv['transcript_id']] >= options.min_transcript_length:
                        sanstiny_gtf_out.write(line)

        ############################################################
        # run cuffcompare to get id's
        ############################################################
        # registered only now so that an earlier failure cannot delete
        # cuffcmp.* files left in the directory by someone else
        cleanup.extend([
            'cuffcmp.tracking', 'cuffcmp.loci', 'cuffcmp.combined.gtf',
            'cuffcmp.stats'
        ])

        # shell=True is needed for $HG19 expansion; the interpolated paths
        # are mkstemp-generated names
        subprocess.call('cuffcompare -s $HG19/sequence/hg19.fa -CG -r %s %s' %
                        (sanstiny_gtf_file, sanstiny_gtf_file),
                        shell=True)

        # hash id's by oId
        tss_id = {}
        p_id = {}
        with open('cuffcmp.combined.gtf') as combined_in:
            for line in combined_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])

                tss_id[kv['oId']] = kv['tss_id']
                if 'p_id' in kv:
                    p_id[kv['oId']] = kv['p_id']

        ############################################################
        # add id's and print
        ############################################################
        unsorted_gtf_fd, unsorted_gtf_file = tempfile.mkstemp()
        cleanup.append(unsorted_gtf_file)

        with os.fdopen(unsorted_gtf_fd, 'w') as unsorted_gtf_out:
            with open(sanstiny_gtf_file) as sanstiny_gtf_in:
                for line in sanstiny_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])

                    kv['tss_id'] = tss_id[kv['transcript_id']]
                    if kv['transcript_id'] in p_id:
                        kv['p_id'] = p_id[kv['transcript_id']]

                    a[8] = gff.kv_gtf(kv)
                    unsorted_gtf_out.write('\t'.join(a) + '\n')

        ############################################################
        # might as well sort it!
        ############################################################
        subprocess.call('sortBed -i %s' % unsorted_gtf_file, shell=True)

    finally:
        ############################################################
        # clean
        ############################################################
        for path in cleanup:
            if os.path.exists(path):
                os.remove(path)
コード例 #14
0
def main():
    """Write a GTF holding every reference exon plus one pre-RNA span per transcript.

    Arguments are the reference GTF and the output GTF path. For each
    transcript all exons are written, followed by a single 'exon' record
    spanning first exon start to last exon end with transcript_id
    'PRERNA<n>' and transcript_type 'prerna'. A span is skipped when the same
    (chrom, start, end, strand) was already emitted or matches a single-exon
    transcript.

    With -m, intersectBed is run on the output against itself and any PRERNA
    transcript overlapping more than that many other genes is removed from
    the output file.
    """
    usage = 'usage: %prog [options] <ref_gtf> <prerna_gtf>'
    parser = OptionParser(usage)
    parser.add_option(
        '-m',
        dest='max_genes_overlapped',
        default=None,
        type='int',
        help=
        'Don\'t include isoforms that overlap more than this many genes [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide reference GTF and output prerna GTF')
    ref_gtf, prerna_gtf = args

    # read transcripts for filtering/processing
    transcripts = gff.read_genes(ref_gtf, key_id='transcript_id')

    # add unspliced single exon transcripts to hash
    prerna_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            prerna_hash.add(
                (tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand))

    # process transcripts
    prerna_index = 0
    with open(prerna_gtf, 'w') as prerna_out:
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            # print exons
            for exon in tx.exons:
                cols = (tx.chrom, 'dk', 'exon', str(exon.start),
                        str(exon.end), '.', tx.strand, '.',
                        gff.kv_gtf(tx.kv))
                prerna_out.write('\t'.join(cols) + '\n')

            # print prernas
            if pre_key not in prerna_hash:
                prerna_hash.add(pre_key)
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'PRERNA%d' % prerna_index
                pre_kv['transcript_type'] = 'prerna'
                prerna_index += 1
                cols = (tx.chrom, 'dk', 'exon', str(pre_start), str(pre_end),
                        '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                prerna_out.write('\t'.join(cols) + '\n')

    if options.max_genes_overlapped is not None:
        # intersect with self and compute overlap sets; an argument list
        # (no shell) keeps paths with spaces or metacharacters intact
        p = subprocess.Popen(
            ['intersectBed', '-wo', '-s', '-a', prerna_gtf, '-b', prerna_gtf],
            stdout=subprocess.PIPE)

        tx_overlaps = {}
        for line in p.stdout:
            a = line.split('\t')

            kv1 = gff.gtf_kv(a[8])
            tid1 = kv1['transcript_id']

            if tid1.startswith('PRERNA'):
                gid1 = kv1['gene_id']
                gid2 = gff.gtf_kv(a[17])['gene_id']

                if gid1 != gid2:
                    tx_overlaps.setdefault(tid1, set()).add(gid2)

        p.communicate()

        prerna_tmp_fd, prerna_tmp_file = tempfile.mkstemp()
        try:
            # filter into a temp gtf
            with os.fdopen(prerna_tmp_fd, 'w') as tmp_out:
                with open(prerna_gtf) as prerna_in:
                    for line in prerna_in:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        tid = kv['transcript_id']
                        if len(tx_overlaps.get(
                                tid, [])) <= options.max_genes_overlapped:
                            tmp_out.write(line)

            # rewrite temp to the final output
            with open(prerna_tmp_file) as tmp_in:
                with open(prerna_gtf, 'w') as prerna_out:
                    for line in tmp_in:
                        prerna_out.write(line)
        finally:
            # the temp file goes away even if filtering fails
            os.remove(prerna_tmp_file)
コード例 #15
0
ファイル: clean_gtf.py プロジェクト: davek44/utility
def main():
    """Re-derive gene ids for a GTF from transcript overlap and print it.

    1. "NM_" transcripts that appear on more than one chromosome get their
       transcript_id suffixed with "c" plus the chromosome name minus its
       first three characters; the revised file is staged in "tmp.gtf".
    2. intersectBed (-f 0.2 -r -s) is run on tmp.gtf against itself and each
       reported pair of transcripts becomes an edge in a graph.
    3. Every connected component becomes one gene, named "G" plus the id of
       one of its transcripts; any transcript left without a "G"-prefixed
       gene gets "G" plus its own id.
    4. tmp.gtf is re-read and printed to stdout with the new gene_id.

    "tmp.gtf" is created in the working directory and removed on exit,
    including after a failure.
    """
    usage = "usage: %prog [options] <gtf file>"
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    gtf_file = args[0]

    ############################################
    # fix multi-chromosome genes
    ############################################
    # find multi-chromosome genes
    tx_chrs = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split("\t")
            kv = gff.gtf_kv(a[8])
            if kv["transcript_id"].startswith("NM_"):
                tx_chrs.setdefault(kv["transcript_id"], set()).add(a[0])
    multi_genes = set(tid for tid in tx_chrs if len(tx_chrs[tid]) > 1)

    try:
        # revise gtf
        tx_gene = {}
        with open("tmp.gtf", "w") as gtf_out:
            with open(gtf_file) as gtf_in:
                for line in gtf_in:
                    a = line.split("\t")
                    a[-1] = a[-1].rstrip()
                    kv = gff.gtf_kv(a[8])

                    # if multi-chrom gene, supplement id's
                    if kv["transcript_id"] in multi_genes:
                        kv["transcript_id"] += "c%s" % a[0][3:]
                        a[8] = gff.kv_gtf(kv)

                    # map trans to gene (forget the actual gene id's; they don't consider "_dup")
                    tx_gene[kv["transcript_id"]] = kv["transcript_id"]

                    # print new line
                    gtf_out.write("\t".join(a) + "\n")

        ############################################
        # merge transcripts into genes
        ############################################
        # intersect and build overlapping transcript graph
        G = networkx.Graph()
        p = subprocess.Popen("intersectBed -f 0.2 -r -wo -s -a tmp.gtf -b tmp.gtf", shell=True, stdout=subprocess.PIPE)
        for line in p.stdout:
            a = line.split("\t")
            # columns 8 and 17 are the attribute fields of the two features
            tid1 = gff.gtf_kv(a[8])["transcript_id"]
            tid2 = gff.gtf_kv(a[17])["transcript_id"]
            G.add_edge(tid1, tid2)
        p.communicate()

        # combine connected components as genes
        for component in networkx.connected_components(G):
            # components may be sets, which cannot be indexed; materialize first
            members = list(component)
            comp_gene = "G" + tx_gene[members[0]]
            for tid in members:
                tx_gene[tid] = comp_gene
        for tid in tx_gene:
            if not tx_gene[tid].startswith("G"):
                tx_gene[tid] = "G" + tx_gene[tid]

        ############################################
        # output
        ############################################
        with open("tmp.gtf") as tmp_in:
            for line in tmp_in:
                a = line.split("\t")
                a[-1] = a[-1].rstrip()

                kv = gff.gtf_kv(a[8])
                kv["gene_id"] = tx_gene[kv["transcript_id"]]
                a[8] = gff.kv_gtf(kv)

                print("\t".join(a))

    finally:
        # clean
        if os.path.exists("tmp.gtf"):
            os.remove("tmp.gtf")
コード例 #16
0
ファイル: clip_peaks.py プロジェクト: hjanime/CLIP-Seq
def prerna_gtf(ref_gtf, out_dir):
    """Write '<out_dir>/prerna.gtf' with all exons plus one unspliced span per transcript.

    ref_gtf: path of the reference GTF, read via read_genes().
    out_dir: directory in which prerna.gtf is created.

    For every transcript its exons are written, followed by a single 'exon'
    record from first exon start to last exon end with transcript_id
    'UNSPLICED<n>'. The span is skipped when the same
    (chrom, start, end, strand) was already written or matches a single-exon
    transcript.

    Returns the path of the file written.
    """
    transcripts = read_genes(ref_gtf, key_id='transcript_id')
    pre_ref_gtf = '%s/prerna.gtf' % out_dir

    # add unspliced single exon transcripts to hash
    unspliced_hash = set()
    for tid in transcripts:
        tx = transcripts[tid]
        if len(tx.exons) == 1:
            unspliced_hash.add((tx.chrom, tx.exons[0].start, tx.exons[0].end, tx.strand))

    unspliced_index = 0

    # the with-block closes the output even if a transcript fails to format
    with open(pre_ref_gtf, 'w') as pre_ref_open:
        # process transcripts
        for tid in transcripts:
            tx = transcripts[tid]
            pre_start = tx.exons[0].start
            pre_end = tx.exons[-1].end
            pre_key = (tx.chrom, pre_start, pre_end, tx.strand)

            for exon in tx.exons:
                cols = (tx.chrom, 'clip_peaks', 'exon', str(exon.start), str(exon.end), '.', tx.strand, '.', gff.kv_gtf(tx.kv))
                pre_ref_open.write('\t'.join(cols) + '\n')

            if pre_key not in unspliced_hash:
                unspliced_hash.add(pre_key)
                # copy so the transcript's own attributes are left untouched
                pre_kv = copy.copy(tx.kv)
                pre_kv['transcript_id'] = 'UNSPLICED%d' % unspliced_index
                unspliced_index += 1
                cols = (tx.chrom, 'clip_peaks', 'exon', str(pre_start), str(pre_end), '.', tx.strand, '.', gff.kv_gtf(pre_kv))
                pre_ref_open.write('\t'.join(cols) + '\n')

    return pre_ref_gtf
コード例 #17
0
def main():
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        ref_gtf = args[0]
        merged_gtf = args[1]

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_t2g = gff.t2g(merged_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by tid
    ref_gid_names = {}
    for line in open(ref_gtf):
        a = line.split('\t')
        kv = gff.gtf_kv(a[8])
        if 'gene_name' in kv:
            ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    for line in open(merged_gtf):
        a = line.split('\t')
        tid = gff.gtf_kv(a[8])['transcript_id']
        merged_tid_lines.setdefault(tid,[]).append(line)

    # intialize orphan gene_id
    orphan_num = 1

    for mgene_id in merged_g2t:
        # count reference genes
        ref_genes = set()
        for tid in merged_g2t[mgene_id]:
            if tid in ref_t2g:
                ref_genes.add(ref_t2g[tid])

        # if no known genes, leave it alone
        if len(ref_genes) == 0:
            for tid in merged_g2t[mgene_id]:
                print ''.join(merged_tid_lines[tid]),

        # if known gene, set gene_id to it
        elif len(ref_genes) == 1:
            new_gene_id = list(ref_genes)[0]
            for tid in merged_g2t[mgene_id]:
                for line in merged_tid_lines[tid]:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    kv['gene_id'] = new_gene_id
                    if new_gene_id in ref_gid_names:
                        kv['gene_name'] = ref_gid_names[new_gene_id]
                    a[8] = gff.kv_gtf(kv)
                    print '\t'.join(a)

        # if two known genes were combined, fix it
        elif len(ref_genes) > 1:
            # compute transcript overlaps and build overlap graph
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines)

            # map each new transcript to the ref gene_id's overlapped
            tid_ref_genes = {}
            for (tid1,tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2,set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1,set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    print >> sys.stderr, 'Removing %s' % tid
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes
            for (tid1,tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1,tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph, orphan_num, ref_t2g)

            for tid in merged_g2t[mgene_id]:
                if tid in tid_new_gid:
                    for line in merged_tid_lines[tid]:
                        a = line.split('\t')
                        kv = gff.gtf_kv(a[8])
                        kv['gene_id'] = tid_new_gid[tid]
                        if tid_new_gid[tid] in ref_gid_names:
                            kv['gene_name'] = ref_gid_names[tid_new_gid[tid]]
                        a[8] = gff.kv_gtf(kv)
                        print '\t'.join(a)
コード例 #18
0
def main():
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    #parser.add_option()
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)
    source = open(ref_gtf).readline().split()[1]

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print >> sys.stderr, 'WARNING: %s discluded for lack of strand' % gene_id
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                upstream_tss = transcripts[promoter_tid].exons[0].start
            else:
                upstream_tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    if transcript_pos < upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos
                else:
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    if transcript_pos > upstream_tss:
                        promoter_tid = transcript_id
                        upstream_tss = transcript_pos

            tss = upstream_tss

        # print promoter from the tss
        if gene_strand == '+':
            if tss - options.upstream < 1:
                print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id
            else:
                tx = transcripts[promoter_tid]
                cols = [
                    tx.chrom, source, 'promoter',
                    str(tss - options.upstream),
                    str(tss + options.downstream), '.', tx.strand, '.',
                    gff.kv_gtf(tx.kv)
                ]
                print '\t'.join(cols)

        else:
            if tss - options.downstream < 1:
                print >> sys.stderr, 'WARNING: %s discluded for nearness to chromosome end' % gene_id
            else:
                tx = transcripts[promoter_tid]
                cols = [
                    tx.chrom, source, 'promoter',
                    str(tss - options.downstream),
                    str(tss + options.upstream), '.', tx.strand, '.',
                    gff.kv_gtf(tx.kv)
                ]
                print '\t'.join(cols)