Exemple #1
0
def map_genes(gtf_file, fpkm_file, pseudocount=0.125, all_isoforms=False, random_zeros=False):
    """Map every gene_id to the transcript_id of its most expressed isoform.

    Args:
        gtf_file: Path to the GTF annotation.
        fpkm_file: Path to the expression file. A name ending in '.diff' is
            loaded with diff_fpkm(); anything else with cuff_fpkm().
        pseudocount: Forwarded to the expression loader. log2(pseudocount) is
            also the floor an isoform must strictly exceed to be selected
            (presumably the loaders return log2(FPKM + pseudocount) -- verify).
        all_isoforms: If True, take the gene -> transcripts map straight from
            gff.g2t(). If False, build it from the GTF lines and leave out
            transcripts whose 'transcript_type' attribute is one of the
            excluded types listed below.
        random_zeros: If True, genes for which no isoform exceeded the floor
            are assigned a randomly chosen isoform instead of None.

    Returns:
        dict of gene_id -> transcript_id, or None for genes without an
        isoform above the floor (when random_zeros is False).
    """
    # get expression data
    if fpkm_file.endswith('.diff'):
        transcript_fpkm = diff_fpkm(fpkm_file, pseudocount)
    else:
        transcript_fpkm = cuff_fpkm(fpkm_file, pseudocount)

    # get genes
    if all_isoforms:
        g2t = gff.g2t(gtf_file)
    else:
        excluded_types = ('intron', 'prerna', 'nonsense_mediated_decay', 'retained_intron', 'non_stop_decay')
        g2t = {}
        # 'with' guarantees the annotation file is closed even on a parse error
        with open(gtf_file) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                kv = gff.gtf_kv(a[8])

                if kv['transcript_type'] not in excluded_types:
                    g2t.setdefault(kv['gene_id'], set()).add(kv['transcript_id'])

    # map gene_id's to max expression isoform
    gene_max_iso = {}
    min_fpkm = math.log(pseudocount, 2)
    for gid in g2t:
        max_fpkm_tid = None
        max_fpkm = min_fpkm

        for tid in g2t[gid]:
            tid_fpkm = transcript_fpkm.get(tid, min_fpkm)
            if tid_fpkm > max_fpkm:
                max_fpkm_tid = tid
                max_fpkm = tid_fpkm

        gene_max_iso[gid] = max_fpkm_tid

    # choose isoforms for None
    if random_zeros:
        for gid in g2t:
            if gene_max_iso[gid] is None:
                candidates = g2t[gid]
                # random.choice() needs an indexable sequence; sets are not.
                # Sorting makes the draw reproducible for a given random seed.
                if isinstance(candidates, (set, frozenset)):
                    candidates = sorted(candidates)
                gene_max_iso[gid] = random.choice(candidates)

    return gene_max_iso
Exemple #2
0
def main():
    """Print one 'promoter' GTF line per stranded gene of a reference GTF.

    Command line: [options] <ref_gtf>

    For every gene a transcription start site (TSS) is chosen:
      * with -f, the TSS of the isoform with the largest geometric mean of
        (1 + FPKM) over the values returned by the tracking object's
        gene_expr();
      * otherwise, the most upstream TSS among the gene's transcripts.

    The TSS is read from exons[0].start on '+' and exons[-1].end on '-'
    (assumes each transcript's exons are sorted by coordinate -- TODO confirm).
    The promoter spans -u bp upstream and -d bp downstream of the TSS, and is
    printed to stdout with the chosen transcript's attributes. Genes without a
    '+'/'-' strand, or whose promoter would start before position 1, are
    skipped with a warning on stderr.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d', dest='downstream', type='int', default=1000, help='Downstream bp for promoters [Default: %default]')
    parser.add_option('-f', dest='fpkm_tracking', help='Use cufflinks FPKM estimates to choose the most expressed isoform')
    parser.add_option('-u', dest='upstream', type='int', default=1000, help='Upstream bp for promoters [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # Take the source column from the first annotation line. Header comments
    # ('#...') and blank lines are skipped so they are not mistaken for data;
    # '.' is the fallback when the file holds no annotation line at all.
    source = '.'
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            if line.strip() and not line.startswith('#'):
                source = line.split()[1]
                break

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+','-']:
            print('WARNING: %s discluded for lack of strand' % gene_id, file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([1+fpkm for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)])
                # a NaN maximum never compares smaller, so replace it explicitly
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    # upstream means a smaller coordinate on the + strand
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    more_upstream = transcript_pos < tss
                else:
                    # upstream means a larger coordinate on the - strand
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    more_upstream = transcript_pos > tss

                if more_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # promoter interval: 'upstream' lies left of the TSS on +, right on -
        if gene_strand == '+':
            promoter_start = tss - options.upstream
            promoter_end = tss + options.downstream
        else:
            promoter_start = tss - options.downstream
            promoter_end = tss + options.upstream

        # print promoter from the tss
        if promoter_start < 1:
            print('WARNING: %s discluded for nearness to chromosome end' % gene_id, file=sys.stderr)
        else:
            tx = transcripts[promoter_tid]
            cols = [tx.chrom, source, 'promoter', str(promoter_start), str(promoter_end), '.', tx.strand, '.', gff.kv_gtf(tx.kv)]
            print('\t'.join(cols))
Exemple #3
0
def main():
    """Simulate sequencing reads from a transcriptome.

    Command line: [options] <gtf> <fasta>

    Transcript abundances come from the FPKM column (index 9) of a Cufflinks
    .fpkm_tracking file (-f) or, without it, are sampled from log-normal
    distributions per gene and per isoform. Each transcript's read count is
    Poisson-sampled around num_reads times its probability, where probability
    is proportional to copies times the number of valid read start positions.
    Sequences are fetched from the FASTA by transcript_id.

    Outputs:
        <prefix>.fastq       simulated reads (constant quality 'I')
        <prefix>_txome.gff   read coordinates on the transcripts
        <prefix>_genome.gff  produced by running tgff_cgff.py on the above
    """
    usage = 'usage: %prog [options] <gtf> <fasta>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='bam_length', help='Obtain read length via sampling a distribution from a BAM file [Default: %default]')
    parser.add_option('-e', dest='error_rate', type='float', default=0, help='Error rate (uniform on reads) [Default: %default]')
    parser.add_option('-f', dest='fpkm_file', help='Cufflinks .fpkm_tracking file to use for FPKMs [Default: %default]')
    parser.add_option('-l', dest='read_length', type='int', default=30, help='Read length [Default: %default]')
    parser.add_option('-n', dest='num_reads', type='int', default=100000, help='Number of reads [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='reads', help='Output files prefix [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide GTF file and fasta file')
    else:
        gtf_file = args[0]
        fasta_file = args[1]

    if options.bam_length:
        read_length_distribution = bam_length_distribution(options.bam_length)
    else:
        read_length_distribution = {options.read_length:1}

    # read GTF gene_id to transcript_id's mapping
    g2t = gff.g2t(gtf_file)

    # get transcript lengths as the sum of exon lengths (GTF ends are inclusive)
    transcript_lengths = {}
    with open(gtf_file) as gtf_in:
        for line in gtf_in:
            a = line.split('\t')
            if a[2] == 'exon':
                transcript_id = gff.gtf_kv(a[8])['transcript_id']
                transcript_lengths[transcript_id] = transcript_lengths.get(transcript_id,0) + int(a[4])-int(a[3])+1

    if options.fpkm_file:
        transcript_copies = {}
        with open(options.fpkm_file) as fpkm_in:
            fpkm_in.readline()  # skip the header row
            for line in fpkm_in:
                a = line.split('\t')
                transcript_copies[a[0]] = float(a[9])

        if sum(transcript_copies.values()) == 0:
            sys.stderr.write('FPKM file shows no expression. Exiting.\n')
            sys.exit(1)
    else:
        # sample gene copies
        gene_copies_raw = lognorm.rvs(1,size=len(g2t))
        gene_copies_raw_sum = sum(gene_copies_raw)
        gene_copies = dict(zip(g2t.keys(), [gcr/gene_copies_raw_sum for gcr in gene_copies_raw]))

        # sample transcript copies
        transcript_copies = {}
        for gene_id in g2t:
            relative_copies = dict(zip(g2t[gene_id], lognorm.rvs(1,size=len(g2t[gene_id]))))
            relative_sum = sum(relative_copies.values())
            for transcript_id in g2t[gene_id]:
                transcript_copies[transcript_id] = gene_copies[gene_id]*relative_copies[transcript_id]/relative_sum

    # determine transcript probabilities as a function of copy and length
    transcript_weights = {}
    for transcript_id in transcript_copies:
        tx_length = transcript_lengths[transcript_id]
        weight = 0
        for read_length in read_length_distribution:
            # a read longer than the transcript has no valid start position;
            # skipping it avoids adding a negative number of positions
            if tx_length >= read_length:
                weight += read_length_distribution[read_length]*transcript_copies[transcript_id]*(tx_length-read_length+1)

        if weight > 0:
            transcript_weights[transcript_id] = weight
    weights_sum = sum(transcript_weights.values())
    transcript_probs = dict((tid, transcript_weights[tid]/weights_sum) for tid in transcript_weights)

    txome_gff = '%s_txome.gff' % options.output_prefix

    # open fasta file
    fasta = pysam.Fastafile(fasta_file)
    try:
        # open output files
        with open('%s.fastq' % options.output_prefix, 'w') as fastq_out, open(txome_gff, 'w') as gff_out:
            # for each transcript
            read_index = 1
            for transcript_id in transcript_probs:
                expected_reads = transcript_probs[transcript_id]*options.num_reads
                if expected_reads == 0:
                    sampled_reads = 0
                else:
                    sampled_reads = poisson.rvs(expected_reads)

                for _ in range(sampled_reads):
                    read_length = sample_read_length(read_length_distribution)
                    # '>=' matches the weighting above: a transcript exactly
                    # one read long has a single valid start position (0)
                    if transcript_lengths[transcript_id] >= read_length:
                        pos = random.randint(0, transcript_lengths[transcript_id]-read_length)
                        seq = fasta.fetch(transcript_id, pos, pos+read_length).upper()
                        if seq:
                            eseq = inject_errors(seq, options.error_rate)

                            fastq_out.write('@read%d\n%s\n+\n%s\n' % (read_index,eseq,'I'*read_length))
                            gff_out.write('\t'.join([transcript_id, 'sim', 'read', str(pos+1), str(pos+read_length), '.', '+', '.', 'read%d'%read_index]) + '\n')

                            read_index += 1
                        else:
                            sys.stderr.write('Missing fasta sequence %s:%d-%d\n' % (transcript_id,pos,(pos+read_length)))
    finally:
        fasta.close()

    # map back to genome; an argument list plus stdout redirection replaces the
    # shell string so that paths with spaces or shell metacharacters are safe
    with open('%s_genome.gff' % options.output_prefix, 'w') as genome_out:
        subprocess.call(['tgff_cgff.py', '-c', gtf_file, txome_gff], stdout=genome_out)
Exemple #4
0
def main():
    """Call CLIP-seq peaks per gene with a windowed scan statistic.

    Command line: [options] <clip_bam> <ref_gtf>

    Steps, as performed below:
      1. Build a modified GTF (prerna_gtf() when a control BAM is given,
         span_gtf() otherwise) and, unless --cuff_done, run cufflinks on it
         with the control BAM or the CLIP BAM respectively.
      2. Load transcripts, set their junctions and FPKMs.
      3. Count CLIP reads overlapping <out_dir>/transcripts.gtf and compute
         the transcriptome size used for the tests.
      4. For each gene (or only the -g gene), weight read positions, compute
         window statistics, refine windows into peaks and write them to
         <out_dir>/peaks.gff. With --print_windows, window statistics go to
         <out_dir>/window_stats.txt.
    """
    usage = 'usage: %prog [options] <clip_bam> <ref_gtf>'
    parser = OptionParser(usage)

    # IO options
    parser.add_option('-c', dest='control_bam', help='Control BAM file')
    parser.add_option('-o', dest='out_dir', default='peaks', help='Output directory [Default: %default]')

    # peak calling options
    parser.add_option('-w', dest='window_size', type='int', default=50, help='Window size for scan statistic [Default: %default]')
    parser.add_option('-p', dest='p_val', type='float', default=.01, help='P-value required of window scan statistic tests [Default: %default]')

    # cufflinks options
    parser.add_option('--cuff_done', dest='cuff_done', action='store_true', default=False, help='A cufflinks run to estimate the model parameters is already done [Default: %default]')
    parser.add_option('-t', dest='threads', type='int', default=2, help='Number of threads to use [Default: %default]')

    # debug options
    parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Verbose output [Default: %default]')
    parser.add_option('-g', '--gene', dest='gene_only', help='Call peaks on the specified gene only')
    parser.add_option('--print_windows', dest='print_windows', default=False, action='store_true', help='Print statistics for all windows [Default: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        clip_bam = args[0]
        ref_gtf = args[1]

    if not os.path.isdir(options.out_dir):
        # makedirs also creates missing parent directories
        os.makedirs(options.out_dir)

    ############################################
    # parameterize
    ############################################
    if options.verbose:
        sys.stderr.write('Estimating gene abundances...\n')

    if options.control_bam:
        # make a new gtf w/ unspliced RNAs; abundances come from the control
        update_ref_gtf = prerna_gtf(ref_gtf, options.out_dir)
        cuff_bam = options.control_bam
    else:
        # make a new gtf file of only loci-spanning RNAs; abundances come
        # from the CLIP reads themselves
        update_ref_gtf = span_gtf(ref_gtf, options.out_dir)
        cuff_bam = clip_bam

    # run Cufflinks on the new gtf file; commands are passed as argument lists
    # so paths with spaces or shell metacharacters are handled correctly
    if not options.cuff_done:
        subprocess.call(['cufflinks', '-o', options.out_dir, '-p', str(options.threads), '-G', update_ref_gtf, cuff_bam])

    # store transcripts
    transcripts = read_genes(update_ref_gtf, key_id='transcript_id')
    g2t = gff.g2t(update_ref_gtf)

    # set junctions
    set_transcript_junctions(transcripts)

    # set "exon" FPKMs
    set_transcript_fpkms(transcripts, options.out_dir, options.verbose)

    if options.verbose:
        sys.stderr.write('Computing global statistics...\n')

    # count transcriptome CLIP reads (overestimates small RNA single ended reads by counting antisense)
    transcripts_bam = '%s/transcripts.bam' % options.out_dir
    with open(transcripts_bam, 'wb') as transcripts_bam_out:
        subprocess.call(['intersectBed', '-abam', clip_bam, '-b', '%s/transcripts.gtf' % options.out_dir], stdout=transcripts_bam_out)
    total_reads = count_reads(transcripts_bam)

    # compute # of tests we will perform
    txome_size = transcriptome_size(transcripts, options.window_size)

    # decide which genes to process before opening any output
    if options.gene_only:
        if options.gene_only not in g2t:
            parser.error('Gene %s not found in %s' % (options.gene_only, update_ref_gtf))
        gene_ids = [options.gene_only]
    else:
        gene_ids = list(g2t)

    ############################################
    # process genes
    ############################################
    # TODO: Can I convert to using transcripts.bam here? Does it affect performance given an indexing?
    # index
    subprocess.call(['samtools', 'index', clip_bam])

    # open clip-seq bam
    clip_in = pysam.Samfile(clip_bam, 'rb')
    peaks_out = None
    windows_out = None

    try:
        # open peak output gff
        peaks_out = open('%s/peaks.gff' % options.out_dir, 'w')
        peak_id = 1

        # open window output
        if options.print_windows:
            windows_out = open('%s/window_stats.txt' % options.out_dir, 'w')

        # for each gene
        for gene_id in gene_ids:
            if options.verbose:
                sys.stderr.write('Processing %s...\n' % gene_id)

            # make a more focused transcript hash for this gene
            gene_transcripts = {}
            for tid in g2t[gene_id]:
                gene_transcripts[tid] = transcripts[tid]

            # obtain basic gene attributes
            (gchrom, gstrand, gstart, gend) = gene_attrs(gene_transcripts)

            if options.verbose:
                sys.stderr.write('\tFetching alignments...\n')

            # choose a single event position and weight the reads
            read_pos_weights = position_reads(clip_in, gchrom, gstart, gend, gstrand)

            if options.verbose:
                sys.stderr.write('\tCounting and computing in windows...\n')

            # count reads and compute p-values in windows
            window_stats = count_windows(clip_in, options.window_size, read_pos_weights, gene_transcripts, gstart, gend, total_reads, txome_size, windows_out)

            if options.verbose:
                sys.stderr.write('\tRefining peaks...\n')

            # post-process windows to peaks
            peaks = windows2peaks(read_pos_weights, gene_transcripts, gstart, window_stats, options.window_size, options.p_val, total_reads, txome_size)

            # output peaks
            for pstart, pend, pcount, ppval in peaks:
                if ppval > 0:
                    # squash -log_1000(p) into the score range via arctan
                    peak_score = int(2000/math.pi*math.atan(-math.log(ppval,1000)))
                else:
                    # log(0) is undefined; a zero p-value gets the top score
                    peak_score = 1000
                cols = [gchrom, 'clip_peaks', 'peak', str(pstart), str(pend), str(peak_score), gstrand, '.', 'id "PEAK%d"; gene_id "%s"; count "%.1f"; p "%.2e"' % (peak_id,gene_id,pcount,ppval)]
                peaks_out.write('\t'.join(cols) + '\n')
                peak_id += 1

    finally:
        # release every handle, including the optional window output
        clip_in.close()
        if peaks_out is not None:
            peaks_out.close()
        if windows_out is not None:
            windows_out.close()
Exemple #5
0
def main():
    """Restore reference gene_id's in a merged GTF and print it to stdout.

    Command line: [options] <ref gtf> <merged gtf>

    For each gene of the merged GTF, the reference genes of its transcripts
    are collected (via the reference transcript -> gene map):
      * none: the gene's lines are printed unchanged;
      * exactly one: every line gets that reference gene_id (and gene_name
        when the reference provides one);
      * several: a transcript overlap graph is built, novel transcripts that
        overlap more than one reference gene are removed (reported on
        stderr), edges joining different reference genes are cut, and
        map_new_gid() assigns gene_id's; transcripts it leaves unmapped are
        dropped from the output.
    """
    usage = 'usage: %prog [options] <ref gtf> <merged gtf>'
    parser = OptionParser(usage)
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error(usage)
    else:
        ref_gtf = args[0]
        merged_gtf = args[1]

    # get mappings
    ref_t2g = gff.t2g(ref_gtf)
    merged_g2t = gff.g2t(merged_gtf)

    # hash gene_name's by gene_id
    ref_gid_names = {}
    with open(ref_gtf) as ref_in:
        for line in ref_in:
            a = line.split('\t')
            kv = gff.gtf_kv(a[8])
            if 'gene_name' in kv:
                ref_gid_names[kv['gene_id']] = kv['gene_name']

    # hash merged lines by tid
    merged_tid_lines = {}
    with open(merged_gtf) as merged_in:
        for line in merged_in:
            a = line.split('\t')
            tid = gff.gtf_kv(a[8])['transcript_id']
            merged_tid_lines.setdefault(tid,[]).append(line)

    # initialize orphan gene_id
    orphan_num = 1

    for mgene_id in merged_g2t:
        # count reference genes
        ref_genes = set()
        for tid in merged_g2t[mgene_id]:
            if tid in ref_t2g:
                ref_genes.add(ref_t2g[tid])

        # if no known genes, leave it alone
        if not ref_genes:
            for tid in merged_g2t[mgene_id]:
                # stored lines keep their own newlines
                sys.stdout.write(''.join(merged_tid_lines[tid]))
            continue

        if len(ref_genes) == 1:
            # if known gene, set gene_id to it
            new_gene_id = next(iter(ref_genes))
            tid_new_gid = dict((tid, new_gene_id) for tid in merged_g2t[mgene_id])

        else:
            # if two known genes were combined, fix it
            # compute transcript overlaps and build overlap graph
            tid_overlap_graph = make_overlap_graph(mgene_id, merged_g2t, merged_tid_lines)

            # map each new transcript to the ref gene_id's overlapped
            tid_ref_genes = {}
            for (tid1,tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 not in ref_t2g:
                    tid_ref_genes.setdefault(tid2,set()).add(ref_t2g[tid1])
                elif tid1 not in ref_t2g and tid2 in ref_t2g:
                    tid_ref_genes.setdefault(tid1,set()).add(ref_t2g[tid2])

            # remove new transcripts overlapping multiple ref gene_id's
            for tid in tid_ref_genes:
                if len(tid_ref_genes[tid]) > 1:
                    sys.stderr.write('Removing %s\n' % tid)
                    tid_overlap_graph.remove_node(tid)

            # remove edges connecting separate reference genes; iterate over a
            # snapshot because the graph is modified inside the loop
            for (tid1,tid2) in list(tid_overlap_graph.edges()):
                if tid1 in ref_t2g and tid2 in ref_t2g and ref_t2g[tid1] != ref_t2g[tid2]:
                    tid_overlap_graph.remove_edge(tid1,tid2)

            # map to new gene_id's; missing means eliminate transcript
            tid_new_gid, orphan_num = map_new_gid(tid_overlap_graph, orphan_num, ref_t2g)

        # rewrite the attribute column with the assigned gene_id / gene_name
        for tid in merged_g2t[mgene_id]:
            if tid in tid_new_gid:
                assigned_gid = tid_new_gid[tid]
                for line in merged_tid_lines[tid]:
                    a = line.split('\t')
                    kv = gff.gtf_kv(a[8])
                    kv['gene_id'] = assigned_gid
                    if assigned_gid in ref_gid_names:
                        kv['gene_name'] = ref_gid_names[assigned_gid]
                    a[8] = gff.kv_gtf(kv)
                    sys.stdout.write('\t'.join(a) + '\n')
Exemple #6
0
def main():
    """Emit a 'promoter' GTF record for each stranded gene in a reference GTF.

    Command line: [options] <ref_gtf>

    TSS selection per gene:
      * -f given: the isoform whose geometric mean of (1 + FPKM), taken over
        the values from the tracking object's gene_expr(), is largest;
      * otherwise: the transcript whose TSS lies furthest upstream.

    The TSS is exons[0].start on the '+' strand and exons[-1].end on the '-'
    strand (assumes exons are stored in coordinate order -- TODO confirm).
    The record covers -u bp upstream to -d bp downstream of the TSS and
    carries the chosen transcript's attributes. Genes with no '+'/'-' strand
    or with a promoter starting before position 1 are reported on stderr and
    skipped.
    """
    usage = 'usage: %prog [options] <ref_gtf>'
    parser = OptionParser(usage)
    parser.add_option('-d',
                      dest='downstream',
                      type='int',
                      default=1000,
                      help='Downstream bp for promoters [Default: %default]')
    parser.add_option(
        '-f',
        dest='fpkm_tracking',
        help='Use cufflinks FPKM estimates to choose the most expressed isoform'
    )
    parser.add_option('-u',
                      dest='upstream',
                      type='int',
                      default=1000,
                      help='Upstream bp for promoters [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide reference GTF')
    else:
        ref_gtf = args[0]

    g2t = gff.g2t(ref_gtf)
    transcripts = gff.read_genes(ref_gtf)

    # The source column is read from the first real annotation line; '#'
    # header lines and blank lines are skipped, and '.' is used when the file
    # has no annotation line. The context manager closes the handle promptly.
    source = '.'
    with open(ref_gtf) as gtf_in:
        for gtf_line in gtf_in:
            if gtf_line.startswith('#') or not gtf_line.strip():
                continue
            source = gtf_line.split()[1]
            break

    if options.fpkm_tracking:
        iso_fpkm_tracking = cufflinks.fpkm_tracking(options.fpkm_tracking)

    for gene_id in g2t:
        gene_transcripts = list(g2t[gene_id])
        gene_strand = transcripts[gene_transcripts[0]].strand
        if gene_strand not in ['+', '-']:
            print('WARNING: %s discluded for lack of strand' % gene_id,
                  file=sys.stderr)
            continue

        # choose TSS
        if options.fpkm_tracking:
            # find most expressed isoform
            promoter_tid = gene_transcripts[0]
            max_fpkm = stats.geo_mean([
                1 + fpkm for fpkm in iso_fpkm_tracking.gene_expr(promoter_tid)
            ])
            for transcript_id in gene_transcripts[1:]:
                transcript_fpkm = stats.geo_mean([
                    1 + fpkm
                    for fpkm in iso_fpkm_tracking.gene_expr(transcript_id)
                ])
                # comparisons with NaN are always False, so a NaN maximum
                # must be replaced explicitly
                if math.isnan(max_fpkm) or transcript_fpkm > max_fpkm:
                    promoter_tid = transcript_id
                    max_fpkm = transcript_fpkm

            # get isoform tss
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

        else:
            # find most upstream tss
            promoter_tid = gene_transcripts[0]
            if gene_strand == '+':
                tss = transcripts[promoter_tid].exons[0].start
            else:
                tss = transcripts[promoter_tid].exons[-1].end

            for transcript_id in gene_transcripts[1:]:
                if gene_strand == '+':
                    # smaller coordinate is further upstream on +
                    transcript_pos = transcripts[transcript_id].exons[0].start
                    is_upstream = transcript_pos < tss
                else:
                    # larger coordinate is further upstream on -
                    transcript_pos = transcripts[transcript_id].exons[-1].end
                    is_upstream = transcript_pos > tss

                if is_upstream:
                    promoter_tid = transcript_id
                    tss = transcript_pos

        # On '+' the upstream flank is to the left of the TSS; on '-' it is to
        # the right, so the two option values swap sides.
        if gene_strand == '+':
            left_flank, right_flank = options.upstream, options.downstream
        else:
            left_flank, right_flank = options.downstream, options.upstream

        # print promoter from the tss
        if tss - left_flank < 1:
            print('WARNING: %s discluded for nearness to chromosome end' %
                  gene_id,
                  file=sys.stderr)
        else:
            tx = transcripts[promoter_tid]
            cols = [
                tx.chrom, source, 'promoter',
                str(tss - left_flank),
                str(tss + right_flank), '.', tx.strand, '.',
                gff.kv_gtf(tx.kv)
            ]
            print('\t'.join(cols))