def main():
    """Print Cufflinks expression for one lncRNA gene or transcript.

    Command line: ``[options] <gene/transcript id>``.  An id containing
    'XLOC' is treated as a gene id; any other id is treated as a
    transcript id.

    Without -t, the id is resolved to a gene id (through gff.t2g on the
    catalog gtf when a transcript id was given) and passed to
    gene_expr_print on the table loaded from genes.fpkm_tracking.

    With -t, the id is resolved to transcript ids (every catalog
    transcript whose gene_id matches when a gene id was given) and each
    one is passed to gene_expr_print on the table loaded from
    isoforms.fpkm_tracking, preceded by a '<transcript id>:' header line.

    Exits through parser.error() when the id argument is missing or when
    a transcript id is not present in the catalog mapping.
    """
    usage = 'usage: %prog [options] <gene/transcript id>'
    parser = OptionParser(usage)

    # expanduser still yields a usable default when $HOME is unset, so the
    # script no longer crashes while merely building its option defaults
    home = os.path.expanduser('~')
    parser.add_option('-c', dest='cuff_dir', default='%s/research/common/data/lncrna' % home, help='Cufflinks output directory with .fpkm_tracking files [Default: %default]')
    parser.add_option('-l', dest='lnc_gtf', default='%s/research/common/data/lncrna/lnc_catalog.gtf' % home, help='lncRNA catalog gtf file [Default: %default]')
    parser.add_option('-t', dest='transcript_expr', default=False, action='store_true', help='Return transcript expression rather than gene [Default: %default]')
    (options, args) = parser.parse_args()

    # report a usage error instead of an IndexError on args[0]
    if len(args) != 1:
        parser.error('Must provide gene/transcript id')
    query_id = args[0]
    is_gene_id = 'XLOC' in query_id

    if options.transcript_expr:
        cuff = cufflinks.fpkm_tracking('%s/isoforms.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            # collect every transcript the catalog assigns to this gene
            trans_ids = set()
            with open(options.lnc_gtf) as lnc_gtf_in:
                for line in lnc_gtf_in:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    if kv['gene_id'] == query_id:
                        trans_ids.add(kv['transcript_id'])
        else:
            trans_ids = [query_id]

        for trans_id in trans_ids:
            # single-argument print() behaves the same on Python 2 and 3
            print('%s:' % trans_id)
            cuff.gene_expr_print(trans_id)

    else:
        cuff = cufflinks.fpkm_tracking('%s/genes.fpkm_tracking' % options.cuff_dir)

        if is_gene_id:
            gene_id = query_id
        else:
            t2g = gff.t2g(options.lnc_gtf)
            if query_id not in t2g:
                parser.error('Transcript id %s not found in %s' % (query_id, options.lnc_gtf))
            gene_id = t2g[query_id]

        cuff.gene_expr_print(gene_id)
Exemple #2
0
def main():
    """Print a per-transcript conservation summary for a gff/gtf file.

    Command line: ``[options] <gff file>``.

    Every line of the gff file contributes its (chrom, start, end) span to
    an IntervalTree for that chromosome and to the total length of its
    transcript.  The conservation scores are then gathered by
    process_file(), which is defined elsewhere (presumably it appends the
    scores overlapping each interval to lnc_cons[transcript id] -- confirm
    against its definition).  With -l only the first file matching
    'lnc_catalog.*wigFix*' in the conservation directory is processed;
    otherwise every 'chr*' file there is processed.

    One row is printed per transcript with the columns: transcript id,
    gene id, total length, coverage (number of scores / length), mean
    score, median score, fraction of scores < 1, fraction of scores > 1.

    Exits through parser.error() when the gff argument is missing or when
    -l is given and no lncRNA-specific wig file can be found.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)
    # expanduser still yields a usable default when $HOME is unset
    parser.add_option('-c',
                      dest='cons_dir',
                      default='%s/research/common/data/phylop' %
                      os.path.expanduser('~'),
                      help='Conservation directory [Default: %default]')
    parser.add_option(
        '-l',
        dest='lncrna',
        action='store_true',
        default=False,
        help=
        'Use the lncRNA specific file to speed things up [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file to intersect')
    gff_file = args[0]

    t2g = gff.t2g(gff_file)

    # build interval trees
    lnc_lengths = {}  # transcript id -> summed feature length
    chr_features = {}  # chromosome -> IntervalTree of feature spans
    interval2lnc = {}  # (chrom, start, end) -> set of transcript ids
    lnc_cons = {}  # transcript id -> list of conservation scores
    with open(gff_file) as gff_in:
        for line in gff_in:
            a = line.split('\t')

            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            tid = gff.gtf_kv(a[8])['transcript_id']
            align = (chrom, start, end)

            lnc_cons[tid] = []
            lnc_lengths[tid] = lnc_lengths.get(tid, 0) + (end - start + 1)
            if align in interval2lnc:
                interval2lnc[align].add(tid)
            else:
                # first time this span is seen: register it in the tree once
                interval2lnc[align] = set([tid])
                if chrom not in chr_features:
                    chr_features[chrom] = IntervalTree()
                chr_features[chrom].insert_interval(Interval(start, end))

    # process overlapping chromosome blocks
    if options.lncrna:
        lnc_wigs = glob.glob('%s/lnc_catalog.*wigFix*' % options.cons_dir)
        if not lnc_wigs:
            parser.error('No lnc_catalog.*wigFix* file found in %s' %
                         options.cons_dir)
        process_file(chr_features, interval2lnc, lnc_cons, lnc_wigs[0])

    else:
        for cons_file in glob.glob('%s/chr*' % options.cons_dir):
            process_file(chr_features, interval2lnc, lnc_cons, cons_file)

    # print table
    for tid in lnc_lengths:
        scores = lnc_cons[tid]
        cons_len = len(scores)
        cons_cov = float(cons_len) / lnc_lengths[tid]
        if cons_len == 0:
            cons_mean = 0.0
            cons_median = 0.0
            cons_pos = 0.0
            cons_neg = 0.0
        else:
            cons_mean = stats.mean(scores)
            cons_median = stats.median(scores)
            # NOTE(review): both fractions use 1 as the cutoff (c > 1 and
            # c < 1); if the scores are signed, the lower cutoff may have
            # been meant to be -1 -- confirm before relying on cons_neg.
            cons_pos = len([c for c in scores if c > 1]) / float(cons_len)
            cons_neg = len([c for c in scores if c < 1]) / float(cons_len)

        cols = (tid, t2g[tid], lnc_lengths[tid], cons_cov, cons_mean,
                cons_median, cons_neg, cons_pos)
        # single-argument print() behaves the same on Python 2 and 3
        print('%-15s %-15s %7d %9.4f %9.4f %9.4f %9.4f %9.4f' % cols)
Exemple #3
0
def main():
    """Summarize conservation scores over the transcripts of a gff file.

    Usage: ``[options] <gff file>``.

    Steps:
      1. Read the gff file; for every line record the feature span in a
         per-chromosome IntervalTree, remember which transcript ids share
         that exact span, and add the span length to the transcript total.
      2. Hand those structures to process_file() for either the first
         'lnc_catalog.*wigFix*' file (-l) or every 'chr*' file in the
         conservation directory.  process_file() lives elsewhere in the
         file; it is assumed to fill lnc_cons with scores -- TODO confirm.
      3. Print one formatted row per transcript: transcript id, gene id,
         length, coverage, mean, median, fraction < 1, fraction > 1.
         Transcripts with no scores get 0.0 for every statistic.

    Calls parser.error() (exit status 2) if the gff argument is missing or
    if -l is set and no matching wig file exists.
    """
    usage = 'usage: %prog [options] <gff file>'
    parser = OptionParser(usage)
    # os.path.expanduser avoids a KeyError when $HOME is not set
    parser.add_option('-c', dest='cons_dir', default='%s/research/common/data/phylop' % os.path.expanduser('~'), help='Conservation directory [Default: %default]')
    parser.add_option('-l', dest='lncrna', action='store_true', default=False, help='Use the lncRNA specific file to speed things up [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide gff file to intersect')
    gff_file = args[0]

    t2g = gff.t2g(gff_file)

    # build interval trees
    lnc_lengths = {}
    chr_features = {}
    interval2lnc = {}
    lnc_cons = {}
    # the with-block guarantees the gff handle is closed after parsing
    with open(gff_file) as gff_in:
        for line in gff_in:
            a = line.split('\t')

            chrom = a[0]
            start = int(a[3])
            end = int(a[4])
            tid = gff.gtf_kv(a[8])['transcript_id']
            align = (chrom, start, end)

            lnc_cons[tid] = []
            lnc_lengths[tid] = lnc_lengths.get(tid, 0) + (end - start + 1)
            if align in interval2lnc:
                interval2lnc[align].add(tid)
            else:
                interval2lnc[align] = set([tid])
                # create the chromosome's tree only when it is first needed
                if chrom not in chr_features:
                    chr_features[chrom] = IntervalTree()
                chr_features[chrom].insert_interval(Interval(start, end))

    # process overlapping chromosome blocks
    if options.lncrna:
        lnc_wigs = glob.glob('%s/lnc_catalog.*wigFix*' % options.cons_dir)
        if not lnc_wigs:
            # previously an IndexError from indexing the empty glob result
            parser.error('No lnc_catalog.*wigFix* file found in %s' % options.cons_dir)
        process_file(chr_features, interval2lnc, lnc_cons, lnc_wigs[0])

    else:
        for cons_file in glob.glob('%s/chr*' % options.cons_dir):
            process_file(chr_features, interval2lnc, lnc_cons, cons_file)

    # print table
    for tid in lnc_lengths:
        cons_len = len(lnc_cons[tid])
        cons_cov = float(cons_len) / lnc_lengths[tid]
        if cons_len == 0:
            cons_mean = cons_median = cons_pos = cons_neg = 0.0
        else:
            cons_mean = stats.mean(lnc_cons[tid])
            cons_median = stats.median(lnc_cons[tid])
            # NOTE(review): the "negative" fraction counts scores below 1,
            # not below -1; looks asymmetric with cons_pos -- confirm intent.
            cons_pos = len([c for c in lnc_cons[tid] if c > 1]) / float(cons_len)
            cons_neg = len([c for c in lnc_cons[tid] if c < 1]) / float(cons_len)

        cols = (tid, t2g[tid], lnc_lengths[tid], cons_cov, cons_mean, cons_median, cons_neg, cons_pos)
        # single-argument print() behaves the same on Python 2 and 3
        print('%-15s %-15s %7d %9.4f %9.4f %9.4f %9.4f %9.4f' % cols)
def main():
    """Re-assign gene ids in a merged gtf using a reference gtf.

    Command line: ``[options] <ref gtf> <merged gtf>``.

    For every gene of the merged gtf, the reference genes of its
    transcripts (those whose transcript id appears in the reference
    transcript-to-gene mapping) are collected, then:

      * no reference gene: the gene's lines are written out unchanged;
      * exactly one reference gene: every line gets that gene_id, plus the
        reference gene_name when one is known;
      * several reference genes: a transcript overlap graph is built with
        make_overlap_graph(), new transcripts adjacent to more than one
        reference gene are removed (and reported on stderr), edges joining
        transcripts of different reference genes are removed, and
        map_new_gid() supplies the gene id for each remaining transcript.
        Transcripts missing from that mapping are dropped from the output.

    make_overlap_graph() and map_new_gid() are defined elsewhere; the
    graph is only assumed to offer edges(), remove_node() and
    remove_edge().

    Output goes to stdout.  Exits through parser.error() unless exactly
    two positional arguments are given.
    """
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error() already prints the expanded usage line, so pass a
        # real message rather than the raw '%prog' usage template
        parser.error('Must provide reference gtf and merged gtf')
    ref_gtf, merged_gtf = args

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by gene_id
    ref_gid_names = {}
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if 'gene_name' in kv:
                ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    with open(merged_gtf) as merged_in:
        for line in merged_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            merged_tid_lines.setdefault(tid, []).append(line)

    def print_relabelled(tid, new_gene_id):
        """Print the merged lines of tid with gene_id set to new_gene_id."""
        for line in merged_tid_lines[tid]:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            kv['gene_id'] = new_gene_id
            if new_gene_id in ref_gid_names:
                kv['gene_name'] = ref_gid_names[new_gene_id]
            a[8] = gff.kv_gtf(kv)
            # single-argument print() behaves the same on Python 2 and 3
            print('\t'.join(a))

    # initialize orphan gene_id
    orphan_num = 1

    for mgene_id in merged_g2t:
        # count reference genes
        ref_genes = set(ref_t2g[tid] for tid in merged_g2t[mgene_id] if tid in ref_t2g)

        # if no known genes, leave it alone
        if not ref_genes:
            for tid in merged_g2t[mgene_id]:
                for line in merged_tid_lines[tid]:
                    # lines keep their own newline; add one only if the
                    # file's final line lacked it, so records never fuse
                    sys.stdout.write(line if line.endswith('\n') else line + '\n')

        # if known gene, set gene_id to it
        elif len(ref_genes) == 1:
            (new_gene_id,) = ref_genes
            for tid in merged_g2t[mgene_id]:
                print_relabelled(tid, new_gene_id)

        # if two known genes were combined, fix it
        else:
            # compute transcript overlaps and build overlap graph
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines)

            # map each new transcript to the ref gene_id's overlapped
            tid_ref_genes = {}
            for (tid1, tid2) in tid_overlap_graph.edges():
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2, set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1, set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    sys.stderr.write('Removing %s\n' % tid)
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes; iterate a
            # snapshot because edges() may be a live view that must not
            # change size while it is being iterated
            for (tid1, tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1, tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph, orphan_num, ref_t2g)

            for tid in merged_g2t[mgene_id]:
                if tid in tid_new_gid:
                    print_relabelled(tid, tid_new_gid[tid])