Esempio n. 1
0
def main():
    """Plot FPKM bar charts, one per gene, from an FPKM tracking file.

    Positional arguments are the FPKM tracking file followed by one or more
    gene names. For each gene, a data frame holding the per-sample FPKM and
    its lower/upper confidence bounds is handed to ``$RDIR/cuff_bar.r``
    together with the y-axis limits and the output path
    ``<out_dir>/<gene>.pdf``.
    """
    usage = 'usage: %prog [options] <fpkm_tracking> <gene1> <gene2> ...'
    parser = OptionParser(usage)
    # -l is a switch; without action='store_true' optparse would require a value
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 FPKM')
    parser.add_option('-n', dest='names', default=None, help='Sample names, comma-separated')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='cuff_bars', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='samples', default=None, help='Samples to plot, comma-separated')
    parser.add_option('-y', dest='yaxis_match', default=False, action='store_true', help='Match the y-axis of all plots [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide fpkm_tracking and genes.')
    fpkm_tracking = args[0]
    genes = args[1:]

    gene_sample_fpkm = read_fpkms(fpkm_tracking, genes, options.log, options.pseudocount)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.samples:
        samples = options.samples.split(',')
    else:
        # default to every sample recorded for the first requested gene
        samples = sorted(gene_sample_fpkm[genes[0]].keys())

    if options.names:
        names = options.names.split(',')
        # the names become a data frame column next to the per-sample values,
        # so the two lists must line up one-to-one
        if len(names) != len(samples):
            parser.error('Number of sample names (%d) does not match number of samples (%d).' % (len(names), len(samples)))
    else:
        names = samples

    ymin = 0
    if options.log:
        ymin = np.log2(options.pseudocount)

    if options.yaxis_match:
        # shared upper limit: the largest upper confidence bound over all plots
        ymax = max([gene_sample_fpkm[gene_name][sample][2] for sample in samples for gene_name in genes])
    else:
        ymax = None

    for gene_name in genes:
        sample_stats = [gene_sample_fpkm[gene_name][sample] for sample in samples]

        df = {}
        df['Sample'] = names
        df['FPKM'] = [stat[0] for stat in sample_stats]
        df['conf_lo'] = [stat[1] for stat in sample_stats]
        df['conf_hi'] = [stat[2] for stat in sample_stats]

        out_pdf = '%s/%s.pdf' % (options.out_dir, gene_name)
        ggplot.plot('%s/cuff_bar.r' % os.environ['RDIR'], df, [ymin, ymax, out_pdf])
Esempio n. 2
0
def main():
    """Plot a heatmap of log2 gene FPKMs across the experiments of a run.

    The single positional argument is an FPKM tracking file. Genes can be
    restricted to those in a GTF file (-g) and/or those returned by
    ``find_diff`` for a diff file (-d); if more than ``-s`` genes remain, a
    random subset of that size is plotted. The data frame is handed to
    ``$RDIR/cuff_heat.r`` together with the output PDF path.
    """
    usage = "usage: %prog [options] <fpkm_tracking>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="diff_file", help="Limit to significantly differentially expressed genes")
    parser.add_option("-g", dest="gtf", help="GTF file of genes to display")
    # explicit types: without them a user-supplied value arrives as a string
    # and breaks the arithmetic / comparison below
    parser.add_option("-m", dest="min_fpkm", default=0.125, type="float", help="Minimum FPKM (for logs) [Default: %default]")
    parser.add_option("-o", dest="out_pdf", default="cuff_heat.pdf", help="Output PDF [Default: %default]")
    parser.add_option("-s", dest="sample", default=1000, type="int", help="Sample genes rather than use all [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide fpkm_tracking")
    fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        with open(options.gtf) as gtf_in:
            for line in gtf_in:
                a = line.split("\t")
                all_genes.add(gff.gtf_kv(a[8])["gene_id"])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        # random.sample needs a sequence; newer Pythons reject a set
        display_genes = random.sample(sorted(all_genes), options.sample)

    # build data frame
    df = {"Gene": [], "FPKM": [], "Sample": []}

    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id)
        # NOTE(review): only the first experiment's value is tested for NaN;
        # presumably a missing gene is NaN everywhere - confirm against gene_expr
        if math.isnan(ge[0]):
            continue
        for i, experiment in enumerate(cuff.experiments):
            df["Gene"].append(gene_id)
            df["Sample"].append(experiment)
            df["FPKM"].append(math.log(ge[i] + options.min_fpkm, 2))

    # plot
    ggplot.plot("%s/cuff_heat.r" % os.environ["RDIR"], df, [options.out_pdf])
Esempio n. 3
0
def make_output(peak_cov, out_prefix, prange):
    """Dump peak coverage to a text file and plot it.

    Args:
        peak_cov: indexable sequence of coverage values; entry
            ``i + prange // 2`` is reported as the coverage at offset ``i``.
        out_prefix: prefix for the outputs ``<out_prefix>_raw.txt`` and
            ``<out_prefix>.pdf``.
        prange: integer width of the window; offsets run from
            ``-prange // 2`` through ``prange // 2``.
    """
    # floor division keeps the integer arithmetic the original relied on while
    # also yielding ints on interpreters where "/" is true division
    half = prange // 2
    # NOTE(review): for odd prange the first offset is -(half + 1), so the
    # first lookup below uses index -1 - confirm callers only pass even values
    offsets = list(range(-prange // 2, half + 1))

    # dump raw counts to file
    with open('%s_raw.txt' % out_prefix, 'w') as raw_out:
        for i in offsets:
            raw_out.write('%d\t%e\n' % (i, peak_cov[i + half]))

    r_script = '%s/peak_bam_plot.r' % os.environ['RDIR']

    df_dict = {'peak_i': offsets,
               'cov': peak_cov}

    out_pdf = '%s.pdf' % out_prefix

    ggplot.plot(r_script, df_dict, [out_pdf])
Esempio n. 4
0
def main():
    """Compare two FPKM tracking files experiment by experiment.

    For every experiment of the first file, the log2 (pseudocount-shifted)
    FPKMs of both files are collected for genes that are non-NaN in both,
    passed to ``$RDIR/fpkm_fpkm_scatter.r`` with the output path
    ``<out_dir>/<sample>_scatter.pdf``, and their Spearman correlation is
    written to ``<out_dir>/<sample>_report.txt``.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # the arguments are FPKM tracking files, as the usage line states
        parser.error('Must provide two FPKM tracking files')
    fpkm1_file = args[0]
    fpkm2_file = args[1]

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # an empty set means "no GTF filter"
    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    for sample in cuff1.experiments:
        # scatter plot fpkm
        df = {'fpkm1': [], 'fpkm2': []}
        for i, gene_id in enumerate(cuff1.genes):
            if gtf_genes and gene_id not in gtf_genes:
                continue

            # NOTE(review): cuff2 is queried with cuff1's gene index, which
            # assumes both files list genes in the same order - confirm
            fpkm1 = cuff1.gene_expr_exp(i, sample)
            fpkm2 = cuff2.gene_expr_exp(i, sample)

            if math.isnan(fpkm1) or math.isnan(fpkm2):
                continue
            df['fpkm1'].append(math.log(options.pseudocount + fpkm1, 2))
            df['fpkm2'].append(math.log(options.pseudocount + fpkm2, 2))

        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])

        report_file = '%s/%s_report.txt' % (options.out_dir, sample)
        with open(report_file, 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
Esempio n. 5
0
def main():
    """Scatter-plot and correlate the FPKMs of two tracking files.

    Each experiment of the first file yields one scatter plot request to
    ``$RDIR/fpkm_fpkm_scatter.r`` (output ``<out_dir>/<sample>_scatter.pdf``)
    and one report ``<out_dir>/<sample>_report.txt`` holding the Spearman
    correlation of the log2 (pseudocount-shifted) FPKMs. Genes whose FPKM is
    NaN in either file are left out; ``-g`` limits the genes to a GTF file.
    """
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # message now agrees with the usage line: inputs are FPKM files
        parser.error('Must provide two FPKM tracking files')
    fpkm1_file, fpkm2_file = args

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # left empty when no GTF is given, which disables the gene filter
    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    pseudocount = options.pseudocount

    for sample in cuff1.experiments:
        # scatter plot fpkm
        df = {'fpkm1': [], 'fpkm2': []}
        for i, gene_id in enumerate(cuff1.genes):
            if len(gtf_genes) == 0 or gene_id in gtf_genes:
                # NOTE(review): index i comes from cuff1 but is also used for
                # cuff2 - presumably both files share gene order; verify
                fpkm1 = cuff1.gene_expr_exp(i, sample)
                fpkm2 = cuff2.gene_expr_exp(i, sample)

                if not math.isnan(fpkm1) and not math.isnan(fpkm2):
                    df['fpkm1'].append(math.log(pseudocount + fpkm1, 2))
                    df['fpkm2'].append(math.log(pseudocount + fpkm2, 2))

        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # compute correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])

        # the context manager closes the report even if formatting fails
        with open('%s/%s_report.txt' % (options.out_dir, sample), 'w') as report_out:
            report_out.write('Spearman correlation: %f (%e)\n' % (cor, p))
Esempio n. 6
0
def cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale):
    """Hand TE and non-TE diff values to the te_diff.r plotting script.

    Args:
        te_key: triple ``(rep, fam, orient)`` used to build the class label.
        te_diffs: list of values labelled with the TE class.
        note_diffs: list of values labelled with the class prefixed by ``d``.
        out_pdf: output path forwarded to the R script.
        scale: value forwarded to the R script after ``out_pdf``.
    """
    rep, fam, orient = te_key

    # name plot: '-' and '*' families get fixed names, otherwise the family
    # (when the repeat is the '*' wildcard) or the repeat itself is used
    if fam in ('-', '*'):
        label = ('dTE-RNAs/%s' if fam == '-' else 'TE-RNAs/%s') % orient
    else:
        label = '%s-RNAs/%s' % (fam if rep == '*' else rep, orient)

    # construct data frame, non-TE values first
    df = {
        'diff': note_diffs + te_diffs,
        'class': ['d%s' % label] * len(note_diffs) + [label] * len(te_diffs),
    }

    r_script = '%s/te_diff.r' % os.environ['RDIR']
    ggplot.plot(r_script, df, [out_pdf, scale])
Esempio n. 7
0
def cdf_plot(te_or, w_te, wo_te, out_pdf):
    """Hand fold values with and without a TE to the te_diff.r script.

    Args:
        te_or: triple ``(rep, fam, orient)`` used to build the class label.
        w_te: list of values labelled with the TE class.
        wo_te: list of values labelled with the class prefixed by ``d``.
        out_pdf: output path forwarded to the R script.
    """
    repeat, family, strand = te_or

    # name plot
    if family not in ('-', '*'):
        # a '*' repeat falls back to the family name
        source = family if repeat == '*' else repeat
        label = '%s-RNAs/%s' % (source, strand)
    elif family == '-':
        label = 'dTE-RNAs/%s' % strand
    else:
        label = 'TE-RNAs/%s' % strand

    # construct data frame: values without the TE come first
    classes = ['d%s' % label] * len(wo_te)
    classes.extend([label] * len(w_te))

    df = {}
    df['fold'] = wo_te + w_te
    df['class'] = classes

    ggplot.plot('te_diff.r', df, [out_pdf])
Esempio n. 8
0
def cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale):
    """Build the TE / non-TE data frame and pass it to te_diff.r.

    Args:
        te_key: triple ``(rep, fam, orient)`` from which the label is built.
        te_diffs: list of values tagged with the label.
        note_diffs: list of values tagged with ``'d' + label``.
        out_pdf: output path, first extra argument for the R script.
        scale: second extra argument for the R script.
    """
    repeat, family, strand = te_key

    # name plot
    if family == '-':
        te_label = 'dTE-RNAs/%s' % strand
    elif family == '*':
        te_label = 'TE-RNAs/%s' % strand
    else:
        # specific family: name by family for a wildcard repeat, else by repeat
        source = family if repeat == '*' else repeat
        te_label = '%s-RNAs/%s' % (source, strand)

    # construct data frame
    values = note_diffs + te_diffs
    classes = ['d%s' % te_label] * len(note_diffs) + [te_label] * len(te_diffs)
    df = {'diff': values, 'class': classes}

    ggplot.plot('%s/te_diff.r' % os.environ['RDIR'], df, [out_pdf, scale])
Esempio n. 9
0
def plot_coverage(bam_te_coverages, dfam_te, orient, labels, out_dir):
    """Plot raw and normalized coverage of one TE/orientation across datasets.

    Args:
        bam_te_coverages: list of dicts, one per dataset, mapping
            ``(dfam_te, orient)`` to a sequence of per-position coverage.
        dfam_te: TE identifier; part of the lookup key and the output prefix.
        orient: orientation; coverage is reversed when it equals ``'rev'``.
        labels: dataset labels, parallel to ``bam_te_coverages``.
        out_dir: directory used for the output prefix.

    Nothing is plotted when no dataset has coverage for the key.
    """
    te_key = (dfam_te, orient)

    df = {'indexes': [], 'coverage': [], 'coverage_norm': [], 'data': []}
    for i, te_coverages in enumerate(bam_te_coverages):
        bam_coverage = te_coverages.get(te_key, [])
        if len(bam_coverage) == 0:
            continue

        df['indexes'] += range(len(bam_coverage))
        df['data'] += [labels[i]] * len(bam_coverage)

        cov_sum = float(sum(bam_coverage))
        if cov_sum == 0:
            # all-zero coverage cannot be normalized; report zeros instead of
            # failing on a division by zero
            bam_coverage_norm = [0.0] * len(bam_coverage)
        else:
            bam_coverage_norm = [c / cov_sum for c in bam_coverage]

        if orient == 'rev':
            df['coverage'] += bam_coverage[::-1]
            df['coverage_norm'] += bam_coverage_norm[::-1]
        else:
            df['coverage'] += bam_coverage
            df['coverage_norm'] += bam_coverage_norm

    if len(df['indexes']) > 0:
        out_pre = '%s/%s_%s_cov' % (out_dir, dfam_te, orient)
        ggplot.plot('%s/te_cov.r' % tempura.r_dir, df, [dfam_te, out_pre])
Esempio n. 10
0
def main():
    """Plot event coverage relative to a set of anchor features.

    Positional arguments are the mode (``mid`` or ``span``), the anchor
    GFF/GTF file and a comma-separated list of event BAM/GFF files. Coverage
    is computed per anchor, given a pseudocount of 1 and divided by the event
    count, then summarized into a meta-coverage data frame for
    ``$RDIR/plot_gff_cov_meta.r``. With ``-e`` a per-anchor heatmap data
    frame is also passed to ``$RDIR/plot_gff_cov_heat.r``, once per sort
    order. Control files (``-c``) are processed the same way and shown
    alongside / as a ratio.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    # -e is a switch; without action='store_true' optparse would require a value
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')

    parser.add_option('-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')

    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')

    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    mode = args[0]
    anchor_gff = args[1]
    event_files = args[2].split(',')

    # reject an unknown mode up front rather than after all coverage has
    # already been computed
    if mode not in ('mid', 'span'):
        sys.stderr.write('Unknown mode %s\n' % mode)
        sys.exit(1)

    if options.control_files:
        control_files = options.control_files.split(',')

    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # integer half-window used to center 'mid' indexes on zero
    half_window = options.window // 2

    # preprocess anchor GFF
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    try:
        coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
        if options.control_files:
            coverage_control, events_control = compute_coverage(prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)
    finally:
        # clean up the preprocessed anchors even if coverage computation fails
        os.close(prep_anchor_fd)
        os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1+coverage[anchor_id][i])/float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (1+coverage_control[anchor_id][i])/float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect anchor_id's
            anchors_sorted.append([])
            with open(sorted_gene_file) as sorted_in:
                for line in sorted_in:
                    anchor_id = line.split()[0]
                    # verify randomly selected
                    if anchor_id in coverage:
                        anchors_sorted[-1].append(anchor_id)

    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                astat = stats.mean([math.log(coverage[anchor_id][i],2) - math.log(coverage_control[anchor_id][i],2) for i in range(len(coverage[anchor_id]))])
            else:
                astat = stats.geo_mean([coverage[anchor_id][i] for i in range(len(coverage[anchor_id]))])

            stat_aid.append((astat, anchor_id))

        # sort
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        heat_dir = '%s_heat' % options.output_pre
        if len(anchors_sorted) > 1 and not os.path.isdir(heat_dir):
            os.mkdir(heat_dir)

        r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']

        for s, anchor_ids in enumerate(anchors_sorted):
            df = {'Index':[], 'Anchor':[], 'Coverage':[]}
            for anchor_id in anchor_ids:
                for i in range(len(coverage[anchor_id])):
                    if mode == 'mid':
                        df['Index'].append(i - half_window)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)

                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]

                    if options.control_files:
                        # log scale: difference of logs; linear scale: ratio
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]

                    df['Coverage'].append('%.4e' % cov)

            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

            ggplot.plot(r_script, df, [out_pdf, options.control_files is not None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_files:
        df['Type'] = []

    # mode was validated above, so it is either 'mid' or 'span' here
    if mode == 'mid':
        index_length = 2*half_window + 1
    else:
        index_length = options.bins

    for i in range(index_length):
        if mode == 'mid':
            index = i - half_window
        else:
            index = i

        df['Index'].append(index)

        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))

        if options.control_files:
            df['Type'].append('Primary')

            # second row for the same index, holding the control summary
            df['Index'].append(index)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    ggplot.plot(r_script, df, [options.output_pre])
Esempio n. 11
0
def main():
    """Compare genes overlapping CLIP peaks against RIP statistics.

    Positional arguments are a peaks GFF file and a cuffdiff ``.diff`` file.
    Genes of the reference GTF that overlap a peak on the same strand
    (``intersectBed -s -u``) are labelled "Bound", the rest "Unbound". The
    per-gene RIP values are plotted by the ggplot script, compared with a
    Mann-Whitney U test (``<prefix>_stats.txt``), tested for overlap with a
    hypergeometric test (``<prefix>_hyper.txt``) and drawn as a Venn diagram
    (``<prefix>_venn.pdf``) when both exclusive sets are non-empty.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g',
                      dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf' %
                      os.environ['GENCODE'])
    parser.add_option('--ggplot',
                      dest='ggplot_script',
                      default='%s/peaks_diff_compare.r' % os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m',
                      dest='max_stat',
                      default=10,
                      type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o',
                      dest='output_pre',
                      default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r',
                      dest='rbp',
                      default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s',
                      dest='single_gene_loci',
                      default=False,
                      action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option(
        '-t',
        dest='test_stat',
        default=False,
        action='store_true',
        help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1',
                      dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2',
                      dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff = args[0]
    diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    try:
        gtf_genes = gff.gtf_gene_set(options.ref_gtf)

        ##################################################
        # collect CLIP peak bound genes
        ##################################################
        peak_genes = set()
        # argument list instead of a shell string, so paths containing spaces
        # or shell metacharacters are passed through verbatim
        intersect_cmd = ['intersectBed', '-s', '-u',
                         '-a', options.ref_gtf, '-b', peaks_gff]
        p = subprocess.Popen(intersect_cmd,
                             stdout=subprocess.PIPE,
                             universal_newlines=True)
        for line in p.stdout:
            peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
        p.communicate()
        if p.returncode != 0:
            sys.stderr.write('Warning: intersectBed exited with status %d\n' %
                             p.returncode)

        # find expressed genes in peak calls
        silent_genes = set()
        if options.clip_fpkm_file:
            silent_genes = find_silent(options.clip_fpkm_file)

        ##################################################
        # collect RIP stats
        ##################################################
        if options.test_stat:
            rip_fold, rip_bound = ripseq.hash_rip(diff_file,
                                                  just_ok=True,
                                                  use_fold=False,
                                                  max_stat=options.max_stat,
                                                  one_rbp=True)
        else:
            # only the bound flags are taken from hash_rip here; the fold
            # values come from hash_rip_fold below
            _, rip_bound = ripseq.hash_rip(diff_file,
                                           use_fold=True,
                                           max_stat=options.max_stat,
                                           one_rbp=True)
            rip_fold = ripseq.hash_rip_fold(diff_file,
                                            min_fpkm=0.125,
                                            max_fold=10,
                                            one_rbp=True)

        ##################################################
        # plot bound and unbound distributions
        ##################################################
        # construct data frame
        df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
        for gene_id in rip_fold:
            if gene_id not in gtf_genes:
                continue
            # an empty silent set means no filtering on expression
            if len(silent_genes) > 0 and gene_id in silent_genes:
                continue

            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

        ggplot.plot(options.ggplot_script, df_dict,
                    [options.output_pre, options.rbp, options.test_stat])

        ##################################################
        # compute stats on bound and unbound distributions
        ##################################################
        bound_fold = [
            fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP'])
            if clip == 'Bound'
        ]
        unbound_fold = [
            fold for fold, clip in zip(df_dict['RIP'], df_dict['CLIP'])
            if clip == 'Unbound'
        ]

        # perform statistical test
        z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

        with open('%s_stats.txt' % options.output_pre, 'w') as stats_out:
            cols = (options.rbp, len(bound_fold), stats.mean(bound_fold),
                    len(unbound_fold), stats.mean(unbound_fold), z, p)
            stats_out.write(
                '%-10s  %5d  %6.2f  %5d  %6.2f  %6.2f  %9.2e\n' % cols)

        ##################################################
        # plot venn diagram
        ##################################################
        rip_genes = set(gene_id for gene_id in df_dict['Gene']
                        if rip_bound.get(gene_id, False))

        clip_only = len(peak_genes - rip_genes)
        rip_only = len(rip_genes - peak_genes)
        both = len(peak_genes & rip_genes)

        if options.clip_fpkm_file:
            sys.stderr.write(
                'Ignoring silent genes for hypergeometric test\n')

        # k is x
        # K is n
        # N is M
        # n is N
        # hypergeom.sf(x, M, n, N, loc=0)

        p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes),
                          len(rip_genes))
        p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes),
                          len(peak_genes))

        with open('%s_hyper.txt' % options.output_pre, 'w') as hyper_out:
            cols = (p1, p2, both, clip_only, rip_only, len(peak_genes),
                    len(rip_genes), len(gtf_genes))
            hyper_out.write(
                '%7.2e  %7.2e  %5d  %5d  %5d  %5d  %5d %5d\n' % cols)

        if clip_only > 0 and rip_only > 0:
            plt.figure()
            venn2(subsets=(clip_only, rip_only, both),
                  set_labels=['CLIP', 'fRIP'],
                  set_colors=['#e41a1c', '#A1A838'])
            plt.savefig('%s_venn.pdf' % options.output_pre)

    finally:
        ##################################################
        # clean
        ##################################################
        # remove the filtered GTF even when an earlier step fails
        if options.single_gene_loci:
            os.close(single_gtf_fd)
            os.remove(single_gtf_file)
Esempio n. 12
0
def main():
    """Plot BAM coverage in a fixed window around the middle of GFF features.

    Positional arguments are a GFF file and a comma-separated list of BAM
    files. Each feature is replaced by a window of ``-u`` bp centered on its
    midpoint, features are randomly subsampled to roughly ``-m`` entries,
    and coverage is computed, given a pseudocount of 1 and divided by the
    fragment count. A heatmap data frame per sort order is passed to
    ``$RDIR/bam_heat_heat.r`` and a meta-coverage data frame to
    ``$RDIR/bam_heat_meta.r``. Control BAMs (``-c``) are processed the same
    way.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option('-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    gff_file = args[0]
    bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    # integer half-window; the window spans mid-half_range .. mid+half_range
    half_range = options.range // 2
    # symmetric offsets, so offset + half_range is always a valid index >= 0
    # (the unparenthesized -range/2 started one position early for odd ranges)
    offsets = range(-half_range, half_range + 1)

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    with open(gff_file) as gff_in:
        feature_count = sum(1 for line in gff_in)

    if feature_count == 0:
        # avoid a division by zero on an empty input
        parser.error('GFF file %s contains no features' % gff_file)

    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    try:
        with open(gff_range_file, 'w') as gff_range_out:
            with open(gff_file) as gff_in:
                for line in gff_in:
                    a = line.split('\t')

                    start = int(a[3])
                    end = int(a[4])
                    mid = start + (end-start)//2
                    a[3] = str(mid - half_range)
                    a[4] = str(mid + half_range)
                    a[-1] = a[-1].rstrip()

                    if random.random() < sample_prob:
                        gff_range_out.write('\t'.join(a) + '\n')

        ############################################
        # compute coverage
        ############################################
        coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
        if options.control_bam_files:
            coverage_control, fragments_control = compute_coverage(gff_range_file, control_bam_files, options.gtf_key)
    finally:
        # clean up the temporary GFF even if an earlier step fails
        os.close(gff_range_fd)
        os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts); float() guards against
    # integer division truncating every value to zero
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1+coverage[feature_id][i])/float(fragments)
            if options.control_bam_files:
                coverage_control[feature_id][i] = (1+coverage_control[feature_id][i])/float(fragments_control)

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            with open(sorted_gene_file) as sorted_in:
                for line in sorted_in:
                    feature_id = line.split()[0]
                    # verify randomly selected
                    if feature_id in coverage:
                        features_sorted[-1].append(feature_id)

    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                feature_stat = stats.mean([math.log(coverage[feature_id][i],2) - math.log(coverage_control[feature_id][i],2) for i in range(len(coverage[feature_id]))])
            else:
                feature_stat = stats.geo_mean([coverage[feature_id][i] for i in range(len(coverage[feature_id]))])

            feature_id_stat.append((feature_stat,feature_id))

        # sort
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append([feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    heat_dir = '%s_heat' % options.output_pre
    if len(features_sorted) > 1 and not os.path.isdir(heat_dir):
        os.mkdir(heat_dir)

    for s, feature_ids in enumerate(features_sorted):
        df = {'Index':[], 'Feature':[], 'Coverage':[]}
        # features are identified in the data frame by their rank f
        for f, feature_id in enumerate(feature_ids):
            for i in offsets:
                df['Index'].append(i)
                df['Feature'].append(f)

                if options.log:
                    cov = math.log(coverage[feature_id][i+half_range],2)
                else:
                    cov = coverage[feature_id][i+half_range]

                if options.control_bam_files:
                    # log scale: difference of logs; linear scale: ratio
                    if options.log:
                        cov -= math.log(coverage_control[feature_id][i+half_range],2)
                    else:
                        cov = cov / coverage_control[feature_id][i+half_range]

                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files is not None], df_file='df_heat.txt')

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_bam_files:
        df['Type'] = []

    for i in offsets:
        df['Index'].append(i)

        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[feature_id][i+half_range] for feature_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[feature_id][i+half_range] for feature_id in coverage]))

        if options.control_bam_files:
            df['Type'].append('Primary')

            # second row for the same offset, holding the control summary
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[feature_id][i+half_range] for feature_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[feature_id][i+half_range] for feature_id in coverage_control]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre

    ggplot.plot(r_script, df, [out_pdf], df_file='df_meta.txt')
Esempio n. 13
0
def main():
    """Plot diff-stat quartiles of genes binned by how many TEs they carry.

    Command line: ``[options] <gtf> <diff>``.

    For every sample pair in the table returned by ``cuffdiff.hash_diff`` and
    every family in ``count_tes`` (defined outside this function), genes are
    binned by the TE count stored for them in the ``te.hash_genes_repeats_num``
    table.  The 25%, 50% and 75% quantiles of each bin are passed to the
    ``te_diff_count.r`` script through ``ggplot.plot``.

    Reads the ``MASK`` and ``RDIR`` environment variables, creates the ``-o``
    output directory if needed, and writes one ``.pdf``/``.df`` pair per plot.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')

    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')

    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Defafult: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Defafult: %default]')

    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # decide once; the same flag controls the cleanup at the end
    filter_lengths = bool(options.spread_factor or options.spread_lower or options.spread_upper)

    if filter_lengths:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly (multiplicatively) around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # Pass the lower AND the upper spread.  The lower value used to be
        # passed twice, which silently ignored -u.
        # NOTE(review): assumes length_filter takes (lower, upper) in that
        # order -- confirm against gff.length_filter.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    if options.orientation:
        orients = ['+','-']
    else:
        # single pass; the orientation is not part of the lookup key
        orients = ['+']

    r_script = '%s/te_diff_count.r' % os.environ['RDIR']

    for spair in gene_diffs:
        sample1, sample2 = spair
        pair_diffs = gene_diffs[spair]

        for fam in count_tes:
            for orient in orients:
                if options.orientation:
                    te_key = ('*',fam,orient)
                else:
                    te_key = ('*',fam)

                # hash diff values by TE count: count_diff[n] holds the stats
                # of all genes with exactly n TEs of this family
                count_diff = []
                for gene_id in pair_diffs:
                    count = gene_te_num.get(gene_id,{}).get(te_key, 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(pair_diffs[gene_id])

                df = {'TEs':[], 'stat_low':[], 'stat_mid':[], 'stat_hi':[]}
                for c, bin_stats in enumerate(count_diff):
                    if len(bin_stats) <= 12:
                        # stop at the first sparsely populated bin
                        break

                    stat_low, stat_mid, stat_hi = stats.quantile(bin_stats, [.25, .5, .75])
                    df['TEs'].append(c)
                    df['stat_low'].append(stat_low)
                    df['stat_mid'].append(stat_mid)
                    df['stat_hi'].append(stat_hi)

                # a single bin is not worth plotting
                if len(df['TEs']) > 1:
                    # drop everything up to and including the first '/'
                    fam_plot = fam[fam.find('/')+1:]

                    if options.orientation:
                        out_pre = '%s/%s-%s_%s_%s' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_pre = '%s/%s-%s_%s' % (options.out_dir, sample1, sample2, fam_plot)

                    out_pdf = '%s.pdf' % out_pre
                    out_df = '%s.df' % out_pre

                    ggplot.plot(r_script, df, [out_pdf, options.scale], df_file=out_df)

    if filter_lengths:
        os.remove(spread_gtf)
Esempio n. 14
0
def main():
    """Plot diff-stat quartiles of genes binned by how many TEs they carry.

    Command line: ``[options] <gtf> <diff>``.

    For every sample pair in the table returned by ``cuffdiff.hash_diff`` and
    every family in ``count_tes`` (defined outside this function), genes are
    binned by the TE count stored for them in the ``te.hash_genes_repeats_num``
    table.  The 25%, 50% and 75% quantiles of each bin are passed to the
    ``te_diff_count.r`` script through ``ggplot.plot``.

    Reads the ``MASK`` and ``RDIR`` environment variables, creates the ``-o``
    output directory if needed, and writes one ``.pdf``/``.df`` pair per plot.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option(
        '-o',
        dest='out_dir',
        default='te_diff_regress',
        help=
        'Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c',
                      dest='scale',
                      default=1,
                      type='float',
                      help='Plot scale [Default: %default]')

    parser.add_option('-t',
                      dest='te_gff',
                      default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-r',
                      dest='orientation',
                      default=False,
                      action='store_true',
                      help='Split TEs by orientation [Default: %default]')

    parser.add_option('-m',
                      dest='max_stat',
                      default=None,
                      type='float',
                      help='Maximum stat for plotting [Default: %default]')

    parser.add_option(
        '-s',
        dest='spread_factor',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]'
    )
    parser.add_option(
        '-l',
        dest='spread_lower',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and shortest transcripts [Defafult: %default]'
    )
    parser.add_option(
        '-u',
        dest='spread_upper',
        default=None,
        type='float',
        help=
        'Allow multiplicative factor between median and longest transcripts [Defafult: %default]'
    )

    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # evaluated once; the same flag drives the cleanup at the end
    use_length_filter = bool(options.spread_factor or options.spread_lower
                             or options.spread_upper)

    if use_length_filter:
        # filter for similar length

        if options.spread_factor:
            # split the overall factor evenly (multiplicatively) around the
            # median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # Pass the lower AND the upper spread.  The lower value used to be
        # passed twice, which silently ignored -u.
        # NOTE(review): assumes length_filter takes (lower, upper) in that
        # order -- confirm against gff.length_filter.
        gff.length_filter(ref_gtf,
                          spread_gtf,
                          options.spread_lower,
                          options.spread_upper,
                          verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf,
                                            options.te_gff,
                                            gene_key='transcript_id',
                                            add_star=True,
                                            stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file,
                                    stat='fold',
                                    max_stat=options.max_stat,
                                    sample_first='input')

    if options.orientation:
        orients = ['+', '-']
    else:
        # single pass; the orientation is not part of the lookup key
        orients = ['+']

    r_script = '%s/te_diff_count.r' % os.environ['RDIR']

    for spair in gene_diffs:
        sample1, sample2 = spair
        pair_diffs = gene_diffs[spair]

        for fam in count_tes:
            for orient in orients:
                if options.orientation:
                    te_key = ('*', fam, orient)
                else:
                    te_key = ('*', fam)

                # hash diff values by TE count: count_diff[n] holds the stats
                # of all genes with exactly n TEs of this family
                count_diff = []
                for gene_id in pair_diffs:
                    count = gene_te_num.get(gene_id, {}).get(te_key, 0)

                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(pair_diffs[gene_id])

                df = {'TEs': [], 'stat_low': [], 'stat_mid': [], 'stat_hi': []}
                for c, bin_stats in enumerate(count_diff):
                    if len(bin_stats) <= 12:
                        # stop at the first sparsely populated bin
                        break

                    stat_low, stat_mid, stat_hi = stats.quantile(
                        bin_stats, [.25, .5, .75])
                    df['TEs'].append(c)
                    df['stat_low'].append(stat_low)
                    df['stat_mid'].append(stat_mid)
                    df['stat_hi'].append(stat_hi)

                # a single bin is not worth plotting
                if len(df['TEs']) > 1:
                    # drop everything up to and including the first '/'
                    fam_plot = fam[fam.find('/') + 1:]

                    if options.orientation:
                        out_pre = '%s/%s-%s_%s_%s' % (options.out_dir,
                                                      sample1, sample2,
                                                      fam_plot, orient)
                    else:
                        out_pre = '%s/%s-%s_%s' % (options.out_dir, sample1,
                                                   sample2, fam_plot)

                    out_pdf = '%s.pdf' % out_pre
                    out_df = '%s.df' % out_pre

                    ggplot.plot(r_script,
                                df, [out_pdf, options.scale],
                                df_file=out_df)

    if use_length_filter:
        os.remove(spread_gtf)
Esempio n. 15
0
def main():
    """Tabulate the nucleotide at each consensus column of an MSA and plot it.

    Command line: ``[options] <bed_file> <msa_file>``.

    Scores are read from column 5 of the BED file and keyed by
    ``chrom:start-end``.  Consensus columns are either the positions marked
    ``x`` in the ``>Consensus`` record of the MSA (``-d``) or whatever
    ``define_consensus`` returns.  For every non-``Consensus`` MSA record, the
    score and the upper-cased character at each consensus column are appended
    to a column-oriented table, which is passed to the ``te_mut_info.r`` script
    through ``ggplot.plot``.

    Raises:
        KeyError: if an MSA header has no entry in the BED file, or if ``-d``
            is given and the MSA has no ``>Consensus`` record.
    """
    usage = 'usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option(
        '-c',
        dest='consensus_pct',
        default=0.5,
        type='float',
        help=
        'Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]'
    )
    parser.add_option(
        '-d',
        dest='dfam_consensus',
        action='store_true',
        help='Pass the option if you want to use Consensus as defined by Dfam')
    #parser.add_option('-j', dest='condense_pct', type='float', help='Required proportion of entries to be same between 2 columns for them to be merged')
    #parser.add_option('-n', dest='discretize_bins', type='int', help='The number of bins you want to discretize the scores into')
    parser.add_option('-o',
                      dest='output_pre',
                      type='string',
                      help='Prefix of the output files')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide both the BED file and MSA file. Check %s' %
                     usage)
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    seq_scores = {}
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            # key format matches the MSA headers: chrom:start-end
            seq_scores[a[0] + ':' + a[1] + '-' + a[2]] = float(a[4])

    ##################################################
    # define consensus
    ##################################################
    if options.dfam_consensus is True:
        # the MSA only has to be held in memory to fetch the Consensus record
        msa_sequences = {}
        msa_header = None
        with open(msa_fasta_file) as msa_in:
            for line in msa_in:
                if line[0] == '>':
                    # keys keep the leading '>'
                    msa_header = line.strip()
                    msa_sequences[msa_header] = ''
                else:
                    msa_sequences[msa_header] += line.strip()

        consensus_sequence = msa_sequences.pop('>Consensus')
        # consensus columns are the positions marked 'x'
        consensus_columns = [
            i for i, symbol in enumerate(consensus_sequence) if symbol == 'x'
        ]
    else:
        consensus_columns = define_consensus(msa_fasta_file,
                                             options.consensus_pct)

    #hamming_cutoff = int(sequence_length - options.condense_pct*sequence_length)
    #condensed_columns, columns_ls_remove = column_condense(msa_sequences, consensus_columns, hamming_cutoff)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # initialize the dictionary with score and position/nt features;
    # positions are 1-based
    df_mi = {'Score': []}
    for position in range(1, len(consensus_columns) + 1):
        df_mi[position] = []

    def add_sequence(seq_header, seq):
        """Append one aligned sequence's score and consensus-column characters."""
        df_mi['Score'].append(seq_scores[seq_header])
        for position, seq_i in enumerate(consensus_columns, 1):
            df_mi[position].append(seq[seq_i].upper())

    header = ''
    seq = ''
    with open(msa_fasta_file) as msa_in:
        for line in msa_in:
            if line[0] == '>':
                # a new header closes the previous record
                if header and header != 'Consensus':
                    add_sequence(header, seq)

                header = line[1:].rstrip()
                seq = ''

            else:
                seq += line.rstrip()

    # the last record is not followed by a header
    if header and header != 'Consensus':
        add_sequence(header, seq)

    ggplot.plot('%s/te_mut_info.r' % tempura.r_dir, df_mi,
                [options.output_pre])
Esempio n. 16
0
def main():
    """Plot BAM coverage around GFF feature midpoints: heatmap(s) and a meta-profile.

    Command line: ``[options] <gff> <bam1,bam2,...>``.

    Steps:
      1. Rewrite each GFF entry to span ``-u``/2 either side of its midpoint,
         randomly keeping about ``-m`` entries, into a temporary file.
      2. Call ``compute_coverage`` (defined outside this function) on that
         file for the primary BAMs and, with ``-c``, for the control BAMs.
      3. Add a pseudocount of 1 to every position and divide by the fragment
         total returned by ``compute_coverage``.
      4. Order features by the ``-s`` lists, or by decreasing mean coverage
         (mean log2 ratio to control when ``-c`` is given).
      5. Pass one heatmap table per ordering and one meta-coverage table to
         the ``bam_heat_heat.r`` / ``bam_heat_meta.r`` scripts via
         ``ggplot.plot``.

    Reads the ``RDIR`` environment variable.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c',
                      dest='control_bam_files',
                      default=None,
                      help='Control BAM files (comma separated)')
    parser.add_option('-l',
                      dest='log',
                      default=False,
                      action='store_true',
                      help='log2 coverage [Default: %default]')
    parser.add_option('-k',
                      dest='gtf_key',
                      default=None,
                      help='GTF key to hash gff entries by')
    parser.add_option(
        '-m',
        dest='max_features',
        default=2000,
        type='int',
        help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o',
                      dest='output_pre',
                      default='bam',
                      help='Output prefix [Default: %default]')
    parser.add_option(
        '-s',
        dest='sorted_gene_files',
        help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u',
                      dest='range',
                      default=2000,
                      type='int',
                      help='Range around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    # Half window, computed once with explicit floor division.  Negating the
    # half (rather than halving the negation) keeps the plotted window
    # symmetric for odd ranges, so the first index no longer maps to
    # position -1 of the coverage vector.
    half_range = options.range // 2

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    with open(gff_file) as gff_in:
        feature_count = sum(1 for _ in gff_in)

    if feature_count == 0:
        # nothing to sample from; avoid dividing by zero below
        parser.error('GFF file %s contains no features' % gff_file)

    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    try:
        # fdopen takes ownership of the descriptor returned by mkstemp, so
        # leaving the with-block closes it
        with os.fdopen(gff_range_fd, 'w') as gff_range_out:
            with open(gff_file) as gff_in:
                for line in gff_in:
                    a = line.split('\t')

                    start = int(a[3])
                    end = int(a[4])
                    mid = start + (end - start) // 2

                    range_start = mid - half_range
                    range_end = mid + half_range

                    # skip features whose window would start at or before 0
                    if range_start > 0:
                        a[3] = str(range_start)
                        a[4] = str(range_end)
                        a[-1] = a[-1].rstrip()

                        if random.random() < sample_prob:
                            gff_range_out.write('\t'.join(a) + '\n')

        ############################################
        # compute coverage
        ############################################
        coverage, fragments = compute_coverage(gff_range_file, bam_files,
                                               options.gtf_key)
        if options.control_bam_files:
            coverage_control, fragments_control = compute_coverage(
                gff_range_file, control_bam_files, options.gtf_key)
    finally:
        # clean up the temporary GFF even if coverage computation fails
        os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    # NOTE(review): relies on true division; under Python 2, confirm that
    # compute_coverage returns float counts or fragment totals.
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1 + coverage[feature_id][i]) / fragments
            if options.control_bam_files:
                coverage_control[feature_id][i] = (
                    1 + coverage_control[feature_id][i]) / fragments_control

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's, keeping only those that survived sampling
            with open(sorted_gene_file) as sorted_in:
                features_sorted.append([
                    line.split()[0] for line in sorted_in
                    if line.split()[0] in coverage
                ])

    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio of primary to control
                feature_stat = stats.mean([
                    math.log(coverage[feature_id][i], 2) -
                    math.log(coverage_control[feature_id][i], 2)
                    for i in range(len(coverage[feature_id]))
                ])
            else:
                feature_stat = stats.geo_mean([
                    coverage[feature_id][i]
                    for i in range(len(coverage[feature_id]))
                ])

            feature_id_stat.append((feature_stat, feature_id))

        # sort, highest statistic first
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append(
            [feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s, features in enumerate(features_sorted):
        df = {'Index': [], 'Feature': [], 'Coverage': []}
        for f, feature_id in enumerate(features):
            for i in range(-half_range, half_range + 1):
                # position inside the coverage vector
                offset = i + half_range

                df['Index'].append(i)
                df['Feature'].append(f)

                if options.log:
                    cov = math.log(coverage[feature_id][offset], 2)
                else:
                    cov = coverage[feature_id][offset]

                if options.control_bam_files:
                    if options.log:
                        # log ratio
                        cov -= math.log(coverage_control[feature_id][offset],
                                        2)
                    else:
                        cov = cov / coverage_control[feature_id][offset]

                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name the plot after the sorted gene file, minus dir and extension
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(
                os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)

        ggplot.plot(r_script, df,
                    [out_pdf, options.control_bam_files is not None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_bam_files:
        df['Type'] = []

    # geometric mean for log plots, arithmetic mean otherwise
    if options.log:
        average = stats.geo_mean
    else:
        average = stats.mean

    for i in range(-half_range, half_range + 1):
        offset = i + half_range

        df['Index'].append(i)
        df['Coverage'].append(
            average([coverage[feature_id][offset] for feature_id in coverage]))

        if options.control_bam_files:
            df['Type'].append('Primary')

            # second row for the same index carries the control value
            df['Index'].append(i)
            df['Type'].append('Control')
            df['Coverage'].append(
                average([
                    coverage_control[feature_id][offset]
                    for feature_id in coverage_control
                ]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre

    ggplot.plot(r_script, df, [out_pdf])
Esempio n. 17
0
def main():
    """Plot a heatmap of log2 FPKM for a (sampled) set of genes.

    Command line: ``[options] <fpkm_tracking>``.

    The gene set starts as every gene in the tracking file, or the
    ``gene_id`` values of the ``-g`` GTF.  It is then narrowed to the genes
    returned by ``find_diff`` (``-d``) or, without ``-d``, to genes with no
    NaN expression value.  ``-m`` optionally keeps only genes whose maximum
    FPKM exceeds the threshold.  At most ``-s`` genes are drawn at random.

    For each kept gene and experiment, ``log2(FPKM + pseudocount)`` (capped at
    ``-a`` when given) is stored in a table that is passed to the
    ``cuff_heat.r`` script through ``ggplot.plot``.  Reads the ``RDIR``
    environment variable.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # optparse only expands the literal token %default in help strings
    parser.add_option('-a',
                      dest='max_fpkm',
                      type='float',
                      help='Maximum log2 FPKM to plot [Default: %default]')
    parser.add_option(
        '-d',
        dest='diff_file',
        help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m',
                      dest='min_fpkm',
                      default=0,
                      type='float',
                      help='Minimum FPKM [Default: %default]')
    parser.add_option('-p',
                      dest='pseudocount',
                      default=.125,
                      type='float',
                      help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o',
                      dest='out_pdf',
                      default='cuff_heat.pdf',
                      help='Output PDF [Default: %default]')
    parser.add_option(
        '-s',
        dest='sample',
        default=1000,
        type='int',
        help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u',
                      dest='uppercase',
                      default=False,
                      action='store_true',
                      help='Uppercase sample labels [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    if options.gtf:
        all_genes = set()
        with open(options.gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                all_genes.add(gff.gtf_kv(a[8])['gene_id'])
    else:
        all_genes = set(cuff.genes)

    if options.diff_file:
        # limit to differentially expressed genes
        all_genes &= find_diff(options.diff_file)

    else:
        # at least limit to clean genes (no NaN in any experiment)
        all_genes = set(
            gene_id for gene_id in all_genes
            if not any(math.isnan(fpkm) for fpkm in cuff.gene_expr(gene_id)))

    if options.min_fpkm > 0:
        all_genes = set(
            gene_id for gene_id in all_genes
            if max(cuff.gene_expr(gene_id, not_found=0, fail=0)) >
            options.min_fpkm)

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        # random.sample needs a sequence; sets are rejected by newer Pythons
        display_genes = random.sample(list(all_genes), options.sample)

    # build data frame
    df = {'Gene': [], 'FPKM': [], 'Sample': []}

    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)

        for i, experiment in enumerate(cuff.experiments):
            df['Gene'].append(gene_id)

            if options.uppercase:
                df['Sample'].append(experiment.upper())
            else:
                df['Sample'].append(experiment)

            logfpkm = np.log2(ge[i] + options.pseudocount)
            # compare against None so that a cap of 0.0 is honoured
            if options.max_fpkm is not None:
                logfpkm = min(options.max_fpkm, logfpkm)
            df['FPKM'].append(logfpkm)

    # plot; the data frame file sits next to the PDF with a .df extension
    out_df = '%s.df' % os.path.splitext(options.out_pdf)[0]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'],
                df, [options.out_pdf],
                df_file=out_df)
Esempio n. 18
0
def main():
    """Regress sequence scores on per-position nucleotides and plot the weights.

    Command line: ``[options] <bed_file> <msa_file>``.

    Scores are read from column 5 of the BED file, keyed by column 4.  Each
    non-``Consensus`` MSA record is encoded over the columns returned by
    ``define_consensus`` with three indicator features per position (A, C, G).
    T is the all-zero reference level; ``N``, ``.`` and ``-`` contribute 0.25
    to each indicator.  An OLS model with a constant term is fitted, its
    summary is written to the ``-m`` file, and the coefficients are parsed
    back from that file.  Per position, the weights (T fixed at 0) are shifted
    so the smallest is 0 and passed to the ``te_score_plots.r`` script through
    ``ggplot.plot``.

    Raises:
        KeyError: if an MSA header has no score in the BED file, or a sequence
            holds a character outside ``A C G T N . -`` at a consensus column.
    """
    usage = 'usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-m', dest='model_output_file', default='ols_summary.txt', help='The file to write the model summary')
    parser.add_option('-p', dest='plot_output_file', default='weights_plot.pdf', help='The file to print the plot of index versus weight')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide BED file with scores and MSA fasta file')
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    seq_scores = {}
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            seq_scores[a[3]] = float(a[4])

    ##################################################
    # define consensus
    ##################################################
    consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # (A, C, G) indicators; T is the reference level
    quaternary_conversion_dict = {'A':[1,0,0], 'C':[0,1,0], 'G':[0,0,1], 'T':[0,0,0], 'N':[0.25,0.25,0.25], '.':[0.25,0.25,0.25], '-':[0.25,0.25,0.25]}

    # initialize the dictionary with score and position/nt features;
    # positions are 1-based
    features = {'Score':[]}
    for i in range(len(consensus_columns)):
        position = str(i+1)
        features[position+'_A'] = []
        features[position+'_C'] = []
        features[position+'_G'] = []

    def add_sequence(seq_header, seq):
        """Append one aligned sequence's score and encoded consensus columns."""
        features['Score'].append(seq_scores[seq_header])
        for i, seq_i in enumerate(consensus_columns):
            position = str(i+1)
            nt_conv = quaternary_conversion_dict[seq[seq_i].upper()]
            features[position+'_A'].append(nt_conv[0])
            features[position+'_C'].append(nt_conv[1])
            features[position+'_G'].append(nt_conv[2])

    header = ''
    seq = ''
    with open(msa_fasta_file) as msa_in:
        for line in msa_in:
            if line[0] == '>':
                # a new header closes the previous record
                if header and header != 'Consensus':
                    add_sequence(header, seq)

                header = line[1:].rstrip()
                seq = ''

            else:
                seq += line.rstrip()

    # the last record is not followed by a header
    if header and header != 'Consensus':
        add_sequence(header, seq)

    ##################################################
    # perform learning
    ##################################################
    # add y-intercept term
    features['Const'] = [1]*len(features['Score'])

    df = pd.DataFrame(features)
    score = df['Score']
    X = df.drop('Score', axis=1)
    sys.stderr.write('Read in all the sequences and scores. Now fitting the model\n')
    mod = sm.OLS(score, X)
    res = mod.fit()
    with open(options.model_output_file,'w') as model_output_file:
        model_output_file.write('%s\n' % res.summary())
    sys.stderr.write('Fit an OLS model and print the summary to %s\n' % options.model_output_file)

    ##################################################
    # read output
    ##################################################
    # Coefficient rows are the lines after a '--' rule and before the next
    # '==' rule of the summary.
    position_weights = collections.defaultdict(list)
    in_coefficients = False
    with open(options.model_output_file, 'r') as summary_in:
        for line in summary_in:
            if line[0:2] == '==':
                in_coefficients = False
            elif line[0:2] == '--':
                in_coefficients = True
            elif in_coefficients:
                contents = line.split()
                if not contents:
                    # tolerate blank lines inside the table
                    continue
                if contents[0] != 'Const':
                    # row label is '<position>_<nt>'; second field is the coefficient
                    position = int(contents[0].split('_')[0])
                    position_weights[position].append(float(contents[1]))

    weights_df = {'Position':[], 'Nucleotide':[], 'Weight':[]}
    nucleotide_order = ['A','C','T','G']
    for position in position_weights:
        # NOTE(review): relies on the summary listing each position's rows in
        # A, C, G order -- confirm against the DataFrame column order.
        weight_A, weight_C, weight_G = position_weights[position]
        weight_T = 0.0
        nucleotide_weights = [weight_A, weight_C, weight_T, weight_G]

        # shift so the least favourable nucleotide sits at zero
        min_weight = min(nucleotide_weights)

        for nucleotide, weight in zip(nucleotide_order, nucleotide_weights):
            weights_df['Position'].append(position)
            weights_df['Nucleotide'].append(nucleotide)
            weights_df['Weight'].append(weight - min_weight)

    sys.stderr.write('Now plotting the weights of different nucleotides along each position\n')
    ggplot.plot('%s/te_score_plots.r' % tempura.r_dir, weights_df, [options.plot_output_file])
    sys.stderr.write('All Done. Check output files\n')
Esempio n. 19
0
def main():
    """Plot event coverage around ('mid') or across ('span') a set of anchors.

    Positional arguments: the mode ('mid' or 'span'), an anchor GFF/GTF file
    and a comma-separated list of event BAM/GFF files.  Coverage per anchor is
    computed by compute_coverage, normalized to (1 + count) / events, and then
    plotted as a meta-coverage profile; with -e, per-anchor heatmaps are
    written as well.  With -c, control files are processed the same way and
    plotted against the primary coverage.

    Exits with status 1 on an unknown mode, and through parser.error on a
    wrong argument count or fewer than two --labels entries.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int',
                      help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None,
                      help='Control BAM or GFF files (comma separated)')
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true',
                      help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true',
                      help='log2 coverage [Default: %default]')
    parser.add_option('--labels', dest='labels', default='Primary,Control',
                      help='Plot labels [Default:%default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov',
                      help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files',
                      help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-p', dest='smooth_span', default=0.2, type='float',
                      help='Smoothing span parameter [Default: %default]')
    parser.add_option('-b', dest='bins', default=100, type='int',
                      help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int',
                      help='Minimum anchor length [Default: %default]')
    parser.add_option('-w', dest='window', default=2000, type='int',
                      help='Window around peak middle [Default: %default]')

    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    # reject a bad mode before any expensive coverage computation
    if mode not in ('mid', 'span'):
        print >> sys.stderr, 'Unknown mode %s' % mode
        sys.exit(1)

    if options.control_files:
        control_files = options.control_files.split(',')

    # both labels are always handed to the meta-coverage R script
    plot_labels = options.labels.split(',')
    if len(plot_labels) < 2:
        parser.error('--labels must provide two comma-separated labels')

    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # in 'mid' mode, indexes are reported relative to the window center
    half_window = options.window // 2

    # preprocess anchor GFF
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(
        anchor_gff, mode, options.max_anchors, anchor_is_gtf,
        options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    try:
        coverage, events = compute_coverage(prep_anchor_gff, event_files, mode,
                                            anchor_is_gtf, options.bins)
        if options.control_files:
            coverage_control, events_control = compute_coverage(
                prep_anchor_gff, control_files, mode, anchor_is_gtf,
                options.bins)
    finally:
        # clean up the preprocessed anchor file even if coverage fails
        os.close(prep_anchor_fd)
        os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1 +
                                      coverage[anchor_id][i]) / float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (
                    1 + coverage_control[anchor_id][i]) / float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    sorted_gene_files = []
    if options.sorted_gene_files:
        sorted_gene_files = options.sorted_gene_files.split(',')

        # for each sorted list
        for sorted_gene_file in sorted_gene_files:
            # collect anchor_id's
            anchors_sorted.append([])
            with open(sorted_gene_file) as sorted_gene_in:
                for line in sorted_gene_in:
                    fields = line.split()
                    # skip blank lines; keep only anchors that were
                    # actually selected for coverage
                    if fields and fields[0] in coverage:
                        anchors_sorted[-1].append(fields[0])

    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean log2 ratio of primary over control
                astat = stats.mean([
                    math.log(coverage[anchor_id][i], 2) -
                    math.log(coverage_control[anchor_id][i], 2)
                    for i in range(len(coverage[anchor_id]))
                ])
            else:
                astat = stats.geo_mean([
                    coverage[anchor_id][i]
                    for i in range(len(coverage[anchor_id]))
                ])

            stat_aid.append((astat, anchor_id))

        # sort, highest statistic first
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        heat_dir = '%s_heat' % options.output_pre
        if len(anchors_sorted) > 1 and not os.path.isdir(heat_dir):
            os.mkdir(heat_dir)

        for s in range(len(anchors_sorted)):
            df = {'Index': [], 'Anchor': [], 'Coverage': []}
            for anchor_id in anchors_sorted[s]:
                for i in range(len(coverage[anchor_id])):
                    if mode == 'mid':
                        df['Index'].append(i - half_window)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)

                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]

                    # with controls, plot the (log) ratio to the control
                    if options.control_files:
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]

                    df['Coverage'].append('%.4e' % cov)

            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each plot after its sorted gene list file
                sorted_gene_pre = os.path.splitext(
                    os.path.split(sorted_gene_files[s])[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,
                                              sorted_gene_pre)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            ggplot.plot(r_script, df,
                        [out_pdf, options.control_files is not None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_files:
        df['Type'] = []

    if mode == 'mid':
        index_length = 2 * half_window + 1
    else:
        index_length = options.bins

    # log plots summarize anchors by geometric mean, otherwise by mean
    if options.log:
        summarize = stats.geo_mean
    else:
        summarize = stats.mean

    for i in range(index_length):
        if mode == 'mid':
            index = i - half_window
        else:
            index = i

        df['Index'].append(index)
        df['Coverage'].append(
            summarize([coverage[anchor_id][i] for anchor_id in coverage]))

        if options.control_files:
            df['Type'].append('Primary')

            # matching control row at the same index
            df['Index'].append(index)
            df['Type'].append('Control')
            df['Coverage'].append(
                summarize([
                    coverage_control[anchor_id][i]
                    for anchor_id in coverage_control
                ]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    out_df = '%s_meta.df' % options.output_pre
    ggplot.plot(r_script,
                df, [
                    options.output_pre, options.smooth_span, plot_labels[0],
                    plot_labels[1]
                ],
                df_file=out_df)
Esempio n. 20
0
def main():
    """Compare two differential expression result files.

    Both files are loaded with hash_diff, which yields per-comparison gene
    test statistics and a per-gene boolean flag.  For every sample pair in the
    first file, a text report is written (gene counts, genes with a truthy
    flag reported as upregulated, falsy as downregulated, and the Spearman
    correlation of the shared genes' statistics) along with a scatter plot
    and an MA-style plot of the statistics.

    Raises KeyError if a sample pair of the first file is missing from the
    second.
    """
    usage = "usage: %prog [options] <diff1_file> <diff2_file>"
    parser = OptionParser(usage)
    parser.add_option("-o", dest="out_dir", default=".")
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error("Must provide two diff files")
    else:
        diff1_file = args[0]
        diff2_file = args[1]

    diff1_stats, diff1_bound = hash_diff(diff1_file)
    diff2_stats, diff2_bound = hash_diff(diff2_file)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_bound1 = diff1_bound[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_bound2 = diff2_bound[diff_key]

        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())

        # the context manager closes the report even if plotting or the
        # correlation below raises
        report_file = "%s/%s-%s_report.txt" % (options.out_dir, sample1, sample2)
        with open(report_file, "w") as report_out:
            # compare numbers of genes quantified
            print >> report_out, "Genes quantified"
            print >> report_out, "%s\t%d" % (diff1_file, len(gene_stats1))
            print >> report_out, "%s\t%d" % (diff2_file, len(gene_stats2))
            print >> report_out, "Common\t%d" % len(common_genes)
            print >> report_out, ""

            # genes whose flag is truthy
            up1 = set(gene_id for gene_id in gene_bound1 if gene_bound1[gene_id])
            up2 = set(gene_id for gene_id in gene_bound2 if gene_bound2[gene_id])

            print >> report_out, "Genes upregulated"
            print >> report_out, "%s\t%d" % (diff1_file, len(up1))
            print >> report_out, "%s\t%d" % (diff2_file, len(up2))
            print >> report_out, "Common\t%d" % len(up1 & up2)
            print >> report_out, ""

            # genes whose flag is falsy
            down1 = set(gene_id for gene_id in gene_bound1 if not gene_bound1[gene_id])
            down2 = set(gene_id for gene_id in gene_bound2 if not gene_bound2[gene_id])

            print >> report_out, "Genes downregulated"
            print >> report_out, "%s\t%d" % (diff1_file, len(down1))
            print >> report_out, "%s\t%d" % (diff2_file, len(down2))
            print >> report_out, "Common\t%d" % len(down1 & down2)
            print >> report_out, ""

            # scatter plot test stat
            df = {"diff1": [], "diff2": []}
            for gene_id in common_genes:
                df["diff1"].append(gene_stats1[gene_id])
                df["diff2"].append(gene_stats2[gene_id])

            r_script = "%s/diff_diff_scatter.r" % os.environ["RDIR"]
            out_pdf = "%s/%s-%s_scatter.pdf" % (options.out_dir, sample1, sample2)
            ggplot.plot(r_script, df, [out_pdf])

            # compute correlation
            cor, p = spearmanr(df["diff1"], df["diff2"])

            print >> report_out, "Spearman correlation: %f" % cor
            print >> report_out, ""

        # plot test_stat versus test_stat difference
        df = {"minus": [], "avg": []}
        for gene_id in common_genes:
            df["minus"].append(gene_stats1[gene_id] - gene_stats2[gene_id])
            df["avg"].append(0.5 * gene_stats1[gene_id] + 0.5 * gene_stats2[gene_id])

        r_script = "%s/diff_diff_ma.r" % os.environ["RDIR"]
        out_pdf = "%s/%s-%s_ma.pdf" % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf])
Esempio n. 21
0
def main():
    """Draw one FPKM bar plot per requested gene from an fpkm_tracking file.

    Positional arguments: the fpkm_tracking file followed by one or more gene
    names.  read_fpkms supplies, per gene and sample, a triple read here as
    (FPKM, lower confidence bound, upper confidence bound).  Each gene is
    plotted to <out_dir>/<gene>.pdf via the cuff_bar.r script.

    Exits through parser.error when too few arguments are given or when the
    -n name list does not match the number of plotted samples.
    """
    usage = 'usage: %prog [options] <fpkm_tracking> <gene1> <gene2> ...>'
    parser = OptionParser(usage)
    # -l is a flag: without store_true optparse would consume the next
    # command-line argument (the fpkm_tracking path) as its value
    parser.add_option('-l',
                      dest='log',
                      default=False,
                      action='store_true',
                      help='log2 FPKM')
    parser.add_option('-n',
                      dest='names',
                      default=None,
                      help='Sample names, comma-separated')
    parser.add_option('-p',
                      dest='pseudocount',
                      default=.125,
                      type='float',
                      help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o',
                      dest='out_dir',
                      default='cuff_bars',
                      help='Output directory [Default: %default]')
    parser.add_option('-s',
                      dest='samples',
                      default=None,
                      help='Samples to plot, comma-separated')
    parser.add_option('-y',
                      dest='yaxis_match',
                      default=False,
                      action='store_true',
                      help='Match the y-axis of all plots [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide fpkm_tracking and genes.')
    else:
        fpkm_tracking = args[0]
        genes = args[1:]

    gene_sample_fpkm = read_fpkms(fpkm_tracking, genes, options.log,
                                  options.pseudocount)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # default to every sample recorded for the first gene
    if options.samples:
        samples = options.samples.split(',')
    else:
        samples = sorted(gene_sample_fpkm[genes[0]].keys())

    if options.names:
        names = options.names.split(',')
        # names label the samples one-to-one in the plotted data frame
        if len(names) != len(samples):
            parser.error('Number of sample names (-n) must match the number of samples.')
    else:
        names = samples

    # in log space the axis floor is the log of the pseudocount
    ymin = 0
    if options.log:
        ymin = np.log2(options.pseudocount)

    # a shared y-axis tops out at the largest upper bound of any plot
    if options.yaxis_match:
        ymax = max([
            gene_sample_fpkm[gene_name][sample][2] for sample in samples
            for gene_name in genes
        ])
    else:
        ymax = None

    r_script = '%s/cuff_bar.r' % os.environ['RDIR']
    for gene_name in genes:
        sample_fpkm = gene_sample_fpkm[gene_name]

        df = {}
        df['Sample'] = names
        df['FPKM'] = [sample_fpkm[sample][0] for sample in samples]
        df['conf_lo'] = [sample_fpkm[sample][1] for sample in samples]
        df['conf_hi'] = [sample_fpkm[sample][2] for sample in samples]

        out_pdf = '%s/%s.pdf' % (options.out_dir, gene_name)
        ggplot.plot(r_script, df, [ymin, ymax, out_pdf])
Esempio n. 22
0
def main():
    """Build a score / per-position nucleotide table from a BED file and an MSA.

    Scores are read from column 5 of the BED file and keyed by
    'chrom:start-end'.  Consensus columns of the MSA come either from the
    'x' characters of its '>Consensus' entry (-d) or from define_consensus.
    For every MSA entry other than 'Consensus', the score and the upper-cased
    nucleotide at each consensus column are collected and handed to the
    te_mut_info.r script.

    Raises KeyError when an MSA header has no score in the BED file, or when
    -d is given and the MSA has no '>Consensus' entry.
    """
    usage='usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-d', dest='dfam_consensus', action='store_true', help='Pass the option if you want to use Consensus as defined by Dfam')
    parser.add_option('-o', dest='output_pre', type='string', help='Prefix of the output files')
    (options, args) = parser.parse_args()

    if len(args)!=2:
        parser.error('Must provide both the BED file and MSA file. Check %s' %usage)
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    seq_scores = {}
    with open(bed_file) as bed_in:
        for line in bed_in:
            a = line.split('\t')
            seq_scores[a[0] + ':' + a[1] + '-' + a[2]] = float(a[4])

    ##################################################
    # define consensus
    ##################################################
    if options.dfam_consensus is True:
        # the full alignment is only needed to pull out the Consensus entry
        msa_sequences = {}
        msa_header = None
        with open(msa_fasta_file) as msa_in:
            for line in msa_in:
                if line[0] == '>':
                    msa_header = line.strip()
                    msa_sequences[msa_header] = ''
                else:
                    msa_sequences[msa_header] += line.strip()

        consensus_sequence = msa_sequences.pop('>Consensus')
        # consensus columns are marked by 'x'
        consensus_columns = [i for i, nt in enumerate(consensus_sequence) if nt == 'x']
    else:
        consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # initialize the dictionary with score and position/nt features
    # (positions are 1-based)
    df_mi = {'Score':[]}
    for position in range(1, len(consensus_columns)+1):
        df_mi[position] = []

    def add_sequence(seq_header, seq_nts):
        # record one aligned sequence; the Consensus entry carries no score
        if not seq_header or seq_header == 'Consensus':
            return
        df_mi['Score'].append(seq_scores[seq_header])
        for i, seq_i in enumerate(consensus_columns):
            df_mi[i+1].append(seq_nts[seq_i].upper())

    header = ''
    seq = ''
    with open(msa_fasta_file) as msa_in:
        for line in msa_in:
            if line[0] == '>':
                # a new header completes the previous entry
                add_sequence(header, seq)
                header = line[1:].rstrip()
                seq = ''
            else:
                seq += line.rstrip()

    # process last seq
    add_sequence(header, seq)

    ggplot.plot('%s/te_mut_info.r' % tempura.r_dir, df_mi, [options.output_pre])
Esempio n. 23
0
def main():
    """Summarize how BAM reads distribute over genome annotation classes.

    Positional arguments: the genome ('hg19' or 'mm9') and a comma-separated
    list of BAM files.  For every requested annotation class the covered
    length and the number of intersecting reads are computed; 'intergenic' is
    derived as the remainder of the genome length / total read count.  A
    counts table is written to <prefix>_counts.txt, and a pie chart and a
    log2 read:length ratio plot are drawn through the R scripts.

    With -p, each BAM is first split by split_bam_xs; the resulting
    '_p.bam' / '_m.bam' files are removed afterwards, even when counting
    fails.
    """
    usage = 'usage: %prog [options] <hg19|mm9> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='annotations', default='rrna,smallrna,cds,utrs_3p,utrs_5p,pseudogene,lncrna,introns,intergenic', help='Comma-separated list of annotation classes to include [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option('-p', dest='paired_stranded', action='store_true', default=False, help='Paired end stranded reads, so split intersects by XS tag and strand [Default: %default]')
    parser.add_option('-t', dest='title', default='Title', help='Plot title [Default: %default]')
    parser.add_option('-u', dest='unstranded', action='store_true', default=False, help='Unstranded reads, so count intergenic and renormalize to lessen the impact of double counting [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        bam_files = args[1].split(',')
    else:
        parser.error(usage)

    if genome == 'hg19':
        annotation_dir = '%s/pie_unstranded' % os.environ['GENCODE']
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
    elif genome == 'mm9':
        annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
    else:
        parser.error('Genome must specify hg19 or mm9.')

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    # done before any BAM is split so a missing BED fails fast
    genome_length = count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error('Cannot find annotation BED %s' % annotation_bed)

    if 'intergenic' in annotation_classes:
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths['intergenic'] = genome_length - other_annotations_summed

    ############################################
    # annotation read counts
    ############################################
    try:
        if options.paired_stranded:
            # split bam files by strand
            for bam_file in bam_files:
                split_bam_xs(bam_file)

        genome_reads = 0
        for bam_file in bam_files:
            genome_reads += count_bam(bam_file)

        annotation_reads = {}
        for ann in annotation_classes:
            if ann != 'intergenic':
                annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
                annotation_reads[ann] = 0
                for bam_file in bam_files:
                    annotation_reads[ann] += count_intersection(bam_file, annotation_bed, options.unstranded, options.paired_stranded)

        if 'intergenic' in annotation_classes:
            other_annotations_summed = sum(annotation_reads.values())
            annotation_reads['intergenic'] = genome_reads - other_annotations_summed

            if options.unstranded:
                # NOTE(review): this direct intergenic count is computed but
                # never used below; the renormalization promised by the -u
                # help text appears to be missing -- confirm intended logic
                intergenic_reads = 0
                for bam_file in bam_files:
                    intergenic_reads += count_sans_intersection(bam_file, '%s/../gencode.v18.annotation.prerna.gtf' % annotation_dir)
    finally:
        # remove the strand-split BAMs, also when counting failed part-way
        if options.paired_stranded:
            for bam_file in bam_files:
                for suffix in ('_p.bam', '_m.bam'):
                    split_bam = bam_file[:-4] + suffix
                    if os.path.isfile(split_bam):
                        os.remove(split_bam)

    ############################################
    # table
    ############################################
    annotation_labels = {'rrna':'rRNA', 'smallrna':'smallRNA', 'cds':'CDS', 'utrs_3p':'3\'UTR', 'utrs_5p':'5\'UTR', 'pseudogene':'Pseudogene', 'lncrna':'lncRNA', 'introns':'Introns', 'intergenic':'Intergenic', 'mrna':'mRNA'}

    reads_sum = float(sum(annotation_reads.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    with open('%s_counts.txt' % options.output_prefix, 'w') as counts_out:
        for ann in annotation_classes:
            read_pct = annotation_reads[ann]/reads_sum
            length_pct = annotation_lengths[ann]/lengths_sum

            if read_pct > 0:
                annotation_ratio[ann] = math.log(read_pct/length_pct,2)
            else:
                # no reads: fall back to a pseudocounted read fraction so
                # the log stays defined
                annotation_ratio[ann] = math.log((1+annotation_reads[ann])/(1+reads_sum),2)

            cols = (annotation_labels[ann], annotation_reads[ann], read_pct, length_pct, annotation_ratio[ann])
            print >> counts_out, '%10s  %8d  %6.4f  %6.4f  %5.2f' % cols

    ############################################
    # pie chart
    ############################################
    pie_df = {'dummy':[], 'annotation':[], 'count':[]}
    for ann in annotation_classes:
        pie_df['dummy'].append('.')
        pie_df['annotation'].append(annotation_labels[ann])
        pie_df['count'].append(annotation_reads[ann])

    ggplot.plot('%s/annotation_pie_pie.r'%os.environ['RDIR'], pie_df, [options.title, '%s_pie.pdf'%options.output_prefix])

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {'annotation':[], 'ratio':[]}
    for ann in annotation_classes:
        ratio_df['annotation'].append(annotation_labels[ann])
        ratio_df['ratio'].append(annotation_ratio[ann])

    ggplot.plot('%s/annotation_pie_ratios.r'%os.environ['RDIR'], ratio_df, [options.title, '%s_ratios.pdf'%options.output_prefix])
Esempio n. 24
0
def main():
    """Summarize how GFF features distribute over genome annotation classes.

    Positional arguments: the genome ('hg19' or 'mm9') and a GFF file.  For
    every requested annotation class the covered length and the number of
    intersecting GFF features are computed; 'intergenic' is derived as the
    remainder of the genome length / total line count of the GFF.  A counts
    table is written to <prefix>_counts.txt, and a pie chart and a log2
    feature:length ratio plot are drawn through the R scripts.
    """
    usage = "usage: %prog [options] <hg19|mm9> <gff>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a",
        dest="annotations",
        default="cds,utrs_3p,utrs_5p,lncrna,introns",
        help="Comma-separated list of annotation classes to include [Default: %default]",
    )
    parser.add_option("-o", dest="output_prefix", default="annotation", help="Output file prefix [Default: %default]")
    parser.add_option("-t", dest="title", default="Title", help="Plot title [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        gff_file = args[1]
    else:
        parser.error(usage)

    if genome == "hg19":
        annotation_dir = "%s/research/common/data/genomes/hg19/annotation/gencode_v15/pie" % os.environ["HOME"]
        assembly_dir = "%s/research/common/data/genomes/hg19/assembly" % os.environ["HOME"]
    elif genome == "mm9":
        annotation_dir = "/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie"
        assembly_dir = "%s/research/common/data/genomes/mm9/assembly" % os.environ["HOME"]
    else:
        parser.error("Genome must specify hg19 or mm9.")

    annotation_classes = set(options.annotations.split(","))

    ############################################
    # annotation lengths
    ############################################
    genome_length = annotation_pie.count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != "intergenic":
            annotation_bed = "%s/%s.bed" % (annotation_dir, ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_pie.annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error("Cannot find annotation BED %s" % annotation_bed)

    if "intergenic" in annotation_classes:
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths["intergenic"] = genome_length - other_annotations_summed

    ############################################
    # annotation feature counts
    ############################################
    # total features = line count of the GFF; an argument list (no shell)
    # keeps paths with spaces or shell metacharacters safe
    genome_features = int(subprocess.check_output(["wc", "-l", gff_file]).split()[0])

    annotation_features = {}
    for ann in annotation_classes:
        if ann != "intergenic":
            annotation_bed = "%s/%s.bed" % (annotation_dir, ann)
            annotation_features[ann] = count_intersection(gff_file, annotation_bed)

    if "intergenic" in annotation_classes:
        # whatever is not assigned to another class counts as intergenic
        other_annotations_summed = sum(annotation_features.values())
        annotation_features["intergenic"] = genome_features - other_annotations_summed

    ############################################
    # table
    ###########################################
    annotation_labels = {
        "rrna": "rRNA",
        "smallrna": "smallRNA",
        "cds": "CDS",
        "utrs_3p": "3'UTR",
        "utrs_5p": "5'UTR",
        "pseudogene": "Pseudogene",
        "lncrna": "lncRNA",
        "introns": "Introns",
        "intergenic": "Intergenic",
    }

    features_sum = float(sum(annotation_features.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    with open("%s_counts.txt" % options.output_prefix, "w") as counts_out:
        for ann in annotation_classes:
            feature_pct = annotation_features[ann] / features_sum
            length_pct = annotation_lengths[ann] / lengths_sum

            if feature_pct > 0:
                annotation_ratio[ann] = math.log(feature_pct / length_pct, 2)
            else:
                # no features: fall back to a pseudocounted fraction so the
                # log stays defined
                annotation_ratio[ann] = math.log((1 + annotation_features[ann]) / (1 + features_sum), 2)

            cols = (annotation_labels[ann], annotation_features[ann], feature_pct, length_pct, annotation_ratio[ann])
            print >> counts_out, "%10s  %8d  %6.4f  %6.4f  %5.2f" % cols

    ############################################
    # pie chart
    ############################################
    pie_df = {"dummy": [], "annotation": [], "count": []}
    for ann in annotation_classes:
        pie_df["dummy"].append(".")
        pie_df["annotation"].append(annotation_labels[ann])
        pie_df["count"].append(annotation_features[ann])

    ggplot.plot(
        "%s/annotation_pie_pie.r" % os.environ["RDIR"], pie_df, [options.title, "%s_pie.pdf" % options.output_prefix]
    )

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {"annotation": [], "ratio": []}
    for ann in annotation_classes:
        ratio_df["annotation"].append(annotation_labels[ann])
        ratio_df["ratio"].append(annotation_ratio[ann])

    ggplot.plot(
        "%s/annotation_pie_ratios.r" % os.environ["RDIR"],
        ratio_df,
        [options.title, "%s_ratios.pdf" % options.output_prefix],
    )
Esempio n. 25
0
def main():
    """Correlate FPKMs between every pair of replicates in a tracking file.

    Reads a .read_group_tracking file, keeps rows whose status is 'OK'
    (optionally restricted to gene ids found in the -g GTF), and groups FPKMs
    by (condition, replicate).  For each pair of groups the Spearman
    correlation over their shared genes is printed to stdout and collected
    for the cuff_rep_cor.r heatmap script.
    """
    usage = 'usage: %prog [options] <.read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g',
                      dest='genes_gtf',
                      help='Print only genes in the given GTF file')
    parser.add_option('-o',
                      dest='out_pdf',
                      default='cor_heat.pdf',
                      help='Output heatmap pdf [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    else:
        read_group_tracking = args[0]

    # get gene_ids
    gene_set = set()
    if options.genes_gtf:
        with open(options.genes_gtf) as gtf_in:
            for line in gtf_in:
                # GTF comment/header lines have no attribute column
                if line.startswith('#'):
                    continue
                a = line.split('\t')
                gene_set.add(gff.gtf_kv(a[8])['gene_id'])

    # FPKMs keyed by (condition, replicate), then by gene id
    cond_rep_gene_fpkm = {}

    # read read group tracking file
    with open(read_group_tracking) as rgt_in:
        # skip the header line
        next(rgt_in, None)

        for line in rgt_in:
            a = line.split('\t')

            gene_id = a[0]
            cond = a[1]
            rep = int(a[2])
            fpkm = float(a[6])
            status = a[8].rstrip()

            # an empty gene set means no filtering
            if status == 'OK' and (not gene_set or gene_id in gene_set):
                cond_rep_gene_fpkm.setdefault((cond, rep), {})[gene_id] = fpkm

    df_dict = {'Sample1': [], 'Sample2': [], 'Correlation': []}

    # sorted so the pair order is the same from run to run
    cond_reps = sorted(cond_rep_gene_fpkm.keys())

    for i, (cond1, rep1) in enumerate(cond_reps):
        gene_fpkm1 = cond_rep_gene_fpkm[(cond1, rep1)]

        # each unordered pair once
        for cond2, rep2 in cond_reps[i + 1:]:
            gene_fpkm2 = cond_rep_gene_fpkm[(cond2, rep2)]

            # correlate only genes quantified in both replicates
            genes12 = set(gene_fpkm1.keys()) & set(gene_fpkm2.keys())

            fpkms1 = array([gene_fpkm1[gene_id] for gene_id in genes12])
            fpkms2 = array([gene_fpkm2[gene_id] for gene_id in genes12])

            rho, pval = spearmanr(fpkms1, fpkms2)

            cols = (cond1, rep1, cond2, rep2, rho)
            print('%-15s  %1d  %-15s  %1d  %.4f' % cols)

            df_dict['Sample1'].append('%s_%d' % (cond1, rep1))
            df_dict['Sample2'].append('%s_%d' % (cond2, rep2))
            df_dict['Correlation'].append(rho)

    # this is broken
    ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'],
                df_dict, [options.out_pdf],
                debug=True)
Esempio n. 26
0
def main():
    """Tabulate and plot how BAM reads distribute over annotation classes.

    Positional arguments are a genome name (``hg19`` or ``mm9``) and a
    comma-separated list of BAM files.  For every requested annotation class
    the class length is taken from ``annotation_length`` and the read count
    from ``count_intersection``; ``intergenic`` is derived as whatever is
    left of the genome length / total read count.  The function writes
    ``<output_prefix>_counts.txt`` and passes a pie-chart data frame and a
    log2 read:length ratio data frame to ``ggplot.plot``.
    """
    usage = 'usage: %prog [options] <hg19|mm9> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='annotations', default='rrna,smallrna,cds,utrs_3p,utrs_5p,pseudogene,lncrna,introns,intergenic', help='Comma-separated list of annotation classes to include [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option('-p', dest='paired_stranded', action='store_true', default=False, help='Paired end stranded reads, so split intersects by XS tag and strand [Default: %default]')
    parser.add_option('-t', dest='title', default='', help='Plot title [Default: %default]')
    parser.add_option('-u', dest='unstranded', action='store_true', default=False, help='Unstranded reads, so count intergenic and renormalize to lessen the impact of double counting [Default: %default]')
    (options, args) = parser.parse_args()

    # parser.error() exits, so the unpacking below only runs with two args
    if len(args) != 2:
        parser.error(usage)
    genome = args[0]
    bam_files = args[1].split(',')

    # locate the assembly and annotation directories for the genome
    if genome == 'hg19':
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
        if options.paired_stranded:
            annotation_fmt = '%s/pie_stranded'
        else:
            annotation_fmt = '%s/pie_unstranded'
        annotation_dir = annotation_fmt % os.environ['GENCODE']

    elif genome == 'mm9':
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
        if options.paired_stranded:
            print >> sys.stderr, 'Stranded annotation BEDs dont exist for mm9'
            exit(1)
        annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'

    else:
        parser.error('Genome must specify hg19 or mm9.')

    if options.paired_stranded:
        # split bam files by strand; the '_p.bam'/'_m.bam' files are
        # removed again once counting is done
        for bam_file in bam_files:
            split_bam_xs(bam_file)

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    genome_length = count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann == 'intergenic':
            continue
        annotation_bed = '%s/%s.bed' % (annotation_dir, ann)
        if not os.path.isfile(annotation_bed):
            parser.error('Cannot find annotation BED %s' % annotation_bed)
        annotation_lengths[ann] = annotation_length(annotation_bed, assembly_dir)

    if 'intergenic' in annotation_classes:
        # intergenic length is the genome remainder
        annotation_lengths['intergenic'] = genome_length - sum(annotation_lengths.values())

    ############################################
    # annotation read counts
    ############################################
    genome_reads = sum([count_bam(bam_file) for bam_file in bam_files])

    annotation_reads = {}
    for ann in annotation_classes:
        if ann == 'intergenic':
            continue
        annotation_bed = '%s/%s.bed' % (annotation_dir, ann)
        annotation_reads[ann] = sum([
            count_intersection(bam_file, annotation_bed, options.unstranded, options.paired_stranded)
            for bam_file in bam_files
        ])

    if 'intergenic' in annotation_classes:
        # intergenic reads are the remainder of all counted reads
        annotation_reads['intergenic'] = genome_reads - sum(annotation_reads.values())

        if options.unstranded:
            # NOTE(review): both values below are computed but not used
            # anywhere later in this function -- confirm whether a
            # renormalization step is missing.
            intergenic_reads_sub = annotation_reads['intergenic']
            intergenic_reads = sum([
                count_sans_intersection(bam_file, '%s/../gencode.v18.annotation.prerna.gtf' % annotation_dir)
                for bam_file in bam_files
            ])

    if options.paired_stranded:
        # clean up the per-strand BAM files
        for bam_file in bam_files:
            bam_prefix = bam_file[:-4]
            os.remove(bam_prefix + '_p.bam')
            os.remove(bam_prefix + '_m.bam')

    ############################################
    # table
    ############################################
    annotation_labels = {
        'rrna': 'rRNA', 'smallrna': 'smallRNA', 'cds': 'CDS',
        'utrs_3p': '3\'UTR', 'utrs_5p': '5\'UTR',
        'pseudogene': 'Pseudogene', 'lncrna': 'lncRNA',
        'exons': 'Exons', 'introns': 'Introns',
        'intergenic': 'Intergenic', 'mrna': 'mRNA'
    }

    # floats so the per-class fractions below use true division
    reads_sum = float(sum(annotation_reads.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    counts_out = open('%s_counts.txt' % options.output_prefix, 'w')
    for ann in annotation_classes:
        ann_reads = annotation_reads[ann]
        read_pct = ann_reads / reads_sum
        length_pct = annotation_lengths[ann] / lengths_sum

        if read_pct > 0:
            log_ratio = math.log(read_pct / length_pct, 2)
        else:
            # no reads: fall back to a +1 smoothed read fraction
            log_ratio = math.log((1 + ann_reads) / (1 + reads_sum), 2)
        annotation_ratio[ann] = log_ratio

        row = (annotation_labels[ann], ann_reads, read_pct, length_pct, log_ratio)
        print >> counts_out, '%10s  %8d  %6.4f  %6.4f  %5.2f' % row
    counts_out.close()

    ############################################
    # pie chart
    ############################################
    pie_df = {
        'dummy': ['.' for ann in annotation_classes],
        'annotation': [annotation_labels[ann] for ann in annotation_classes],
        'count': [annotation_reads[ann] for ann in annotation_classes]
    }

    out_pdf = '%s_pie.pdf' % options.output_prefix
    # NOTE(review): out_pdf[:-1] yields '<prefix>_pie.pd' as the data frame
    # file name -- confirm whether a '.df' name was intended.
    ggplot.plot('%s/annotation_pie_pie.r' % os.environ['RDIR'], pie_df, [options.title, out_pdf], df_file=out_pdf[:-1])

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {
        'annotation': [annotation_labels[ann] for ann in annotation_classes],
        'ratio': [annotation_ratio[ann] for ann in annotation_classes]
    }

    ratios_pdf = '%s_ratios.pdf' % options.output_prefix
    ggplot.plot('%s/annotation_pie_ratios.r' % os.environ['RDIR'], ratio_df, [options.title, ratios_pdf])
Esempio n. 27
0
def main():
    """Plot a heatmap of log2 FPKM values for a set of genes.

    Takes one positional argument, an fpkm_tracking file loaded through
    ``cufflinks.fpkm_tracking``.  The gene set starts as all genes in the
    file (or the ``gene_id`` values of the ``-g`` GTF), is restricted to
    differentially expressed genes (``-d``) or otherwise to genes without
    NaN expression, optionally filtered by ``-m`` minimum FPKM, and randomly
    down-sampled to at most ``-s`` genes.  The resulting
    Gene/Sample/FPKM data frame is passed to ``ggplot.plot``.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # optparse only expands '%default' in help strings, not '%d'
    parser.add_option('-a', dest='max_fpkm', type='float', help='Maximum log2 FPKM to plot [Default: %default]')
    parser.add_option('-d', dest='diff_file', help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m', dest='min_fpkm', default=0, type='float', help='Minimum FPKM [Default: %default]')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_heat.pdf', help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='sample', default=1000, type='int', help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true', help='Uppercase sample labels [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        # context manager so the GTF handle is closed even on a parse error
        with open(options.gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                all_genes.add(gff.gtf_kv(a[8])['gene_id'])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes

    else:
        # at least limit to clean genes, i.e. no NaN expression value
        clean_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id)
            if not any(math.isnan(expr) for expr in ge):
                clean_genes.add(gene_id)

        all_genes &= clean_genes

    if options.min_fpkm > 0:
        # keep genes whose maximum FPKM over experiments exceeds the cutoff
        expressed_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
            if max(ge) > options.min_fpkm:
                expressed_genes.add(gene_id)

        all_genes &= expressed_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        display_genes = random.sample(all_genes, options.sample)

    # build data frame: one row per (gene, experiment)
    df = {'Gene':[], 'FPKM':[], 'Sample':[]}

    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)

        for i, experiment in enumerate(cuff.experiments):
            df['Gene'].append(gene_id)

            if options.uppercase:
                experiment = experiment.upper()
            df['Sample'].append(experiment)

            logfpkm = np.log2(ge[i]+options.pseudocount)
            # compare against None so that a cap of 0 is honored
            if options.max_fpkm is not None:
                logfpkm = min(options.max_fpkm, logfpkm)
            df['FPKM'].append(logfpkm)

    # plot; the data frame file sits next to the PDF with a '.df' extension
    out_df = '%s.df' % os.path.splitext(options.out_pdf)[0]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'], df, [options.out_pdf], df_file=out_df)
Esempio n. 28
0
def main():
    usage = 'usage: %prog [options] <.read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='genes_gtf', help='Print only genes in the given GTF file')
    #parser.add_option('-p', dest='pseudocount', type='float', default=0.125, help='FPKM pseudocount for taking logs [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cor_heat.pdf', help='Output heatmap pdf [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    else:
        read_group_tracking = args[0]
    
    # get gene_ids
    gene_set = set()
    if options.genes_gtf:
        for line in open(options.genes_gtf):
            a = line.split('\t')
            gid = gff.gtf_kv(a[8])['gene_id']
            gene_set.add(gid)

    # initialize diff data structures
    cond_rep_gene_fpkm = {}

    # read read group tracking file
    rgt_in = open(read_group_tracking)
    headers = rgt_in.readline()
    line = rgt_in.readline()
    while line:
        a = line.split('\t')

        gene_id = a[0]
        cond = a[1]
        rep = int(a[2])
        fpkm = float(a[6])
        status = a[8].rstrip()

        if status == 'OK' and (len(gene_set) == 0 or gene_id in gene_set):
            if not (cond,rep) in cond_rep_gene_fpkm:
                cond_rep_gene_fpkm[(cond,rep)] = {}
            
            cond_rep_gene_fpkm[(cond,rep)][gene_id] = fpkm

        line = rgt_in.readline()
    rgt_in.close()

    df_dict = {'Sample1':[], 'Sample2':[], 'Correlation':[]}
    cond_reps = cond_rep_gene_fpkm.keys()

    for i in range(len(cond_reps)):
        cond1, rep1 = cond_reps[i]

        for j in range(i+1,len(cond_reps)):
            cond2, rep2 = cond_reps[j]

            genes12 = set(cond_rep_gene_fpkm[(cond1,rep1)].keys()) & set(cond_rep_gene_fpkm[(cond2,rep2)].keys())

            fpkms1 = array([cond_rep_gene_fpkm[(cond1,rep1)][gene_id] for gene_id in genes12])
            fpkms2 = array([cond_rep_gene_fpkm[(cond2,rep2)][gene_id] for gene_id in genes12])

            rho, pval = spearmanr(fpkms1, fpkms2)

            cols = (cond1,rep1,cond2,rep2,rho)
            print '%-15s  %1d  %-15s  %1d  %.4f' % cols

            df_dict['Sample1'].append('%s_%d' % (cond1,rep1))
            df_dict['Sample2'].append('%s_%d' % (cond2,rep2))
            df_dict['Correlation'].append(rho)

    # this is broken
    ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'], df_dict, [options.out_pdf], debug=True)
Esempio n. 29
0
def main():
    """Project samples into two dimensions from their gene FPKM profiles.

    Takes one positional argument, a read group tracking file.  After the
    header line, each line is split on whitespace; column 0 is the gene id,
    columns 1 and 2 form the (sample, replicate) key and column 6 is the
    FPKM.  A samples-by-genes matrix of log2(FPKM + pseudocount) is built,
    optionally scaled (``-w``), reduced to two components with the method
    chosen by ``-m`` (MDS, Isomap, FastICA, otherwise PCA) and passed to
    ``ggplot.plot`` as a D1/D2/Label/Sample data frame.
    """
    usage = 'usage: %prog [options] <read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-f', dest='min_fpkm', default=0, type='float', help='Minimum FPKM to consider [Default: %default]')
    parser.add_option('-m', dest='method', default='PCA', help='Dimension reduction method [Default: %default]')
    # type='float' is required: without it a user-supplied value stays a
    # string and the addition to the expression matrix below fails
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='FPKM pseudocount (for logs) [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_2d.pdf', help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='square', default=False, action='store_true', help='Square plot [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true', help='Uppercase sample labels [Default: %default]')
    parser.add_option('-w', dest='whiten', default=False, action='store_true', help='Whiten expression data [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide read_group_tracking')
    else:
        read_group_tracking = args[0]

    # load expression data: gene_id -> {(sample, replicate): fpkm}
    gene_fpkm = {}
    with open(read_group_tracking) as rgt_in:
        rgt_in.readline()  # skip the header line
        for line in rgt_in:
            a = line.split()
            gene_fpkm.setdefault(a[0],{})[(a[1],a[2])] = float(a[6])

    # determine genes
    if options.gtf:
        gtf_genes = set()
        with open(options.gtf) as gtf_in:
            for line in gtf_in:
                a = line.split('\t')
                gtf_genes.add(gff.gtf_kv(a[8])['gene_id'])
        # GTF genes absent from the tracking file would raise KeyError in
        # the lookups below, so only keep genes that have expression data
        compute_genes = [gene_id for gene_id in gtf_genes if gene_id in gene_fpkm]
    else:
        compute_genes = list(gene_fpkm.keys())

    # filter for fpkm: keep genes whose maximum FPKM exceeds the cutoff
    if options.min_fpkm > 0:
        compute_genes = [gene_id for gene_id in compute_genes
                         if max(gene_fpkm[gene_id].values()) > options.min_fpkm]

    if not compute_genes:
        parser.error('No genes left to analyze after filtering')

    # construct gene expression matrix (rows: samples, columns: genes);
    # samples are sorted so the row/label order is deterministic
    samples = sorted(gene_fpkm[compute_genes[0]].keys())
    X = np.array([[gene_fpkm[gene_id][sam_rep] for gene_id in compute_genes] for sam_rep in samples])
    X = np.log2(X + options.pseudocount)

    if options.whiten:
        X = preprocessing.scale(X)

    ##################################################
    # dimensionality reduction
    ##################################################
    method = options.method.lower()
    if method == 'mds':
        model = MDS(n_components=2)
    elif method in ['iso','isomap']:
        model = Isomap(n_components=2)
    elif method == 'ica':
        model = FastICA(n_components=2, max_iter=500)
    else:
        # any other value, including the default 'PCA'
        model = PCA(n_components=2)

    X_dr = model.fit_transform(X)

    ##################################################
    # plot
    ##################################################
    df = {}
    df['D1'] = X_dr[:,0]
    df['D2'] = X_dr[:,1]
    df['Label'] = ['%s_rep%s' % sam_rep for sam_rep in samples]

    if options.uppercase:
        df['Label'] = [label.upper() for label in df['Label']]
        df['Sample'] = [sam.upper() for (sam,rep) in samples]
    else:
        df['Sample'] = [sam for (sam,rep) in samples]

    ggplot.plot('%s/cuff_2d.r' % os.environ['RDIR'], df, [options.out_pdf, options.square])
Esempio n. 30
0
def main():
    """Compare two cuffdiff result files comparison by comparison.

    Takes two positional arguments, the diff files.  Per-gene statistics
    (``cuffdiff.hash_stat``) and significance calls (``cuffdiff.hash_sig``)
    are loaded for both files.  For every comparison key of the first file a
    ``<sample1>-<sample2>_report.txt`` is written to the output directory
    with gene counts and Spearman/Pearson correlations, and a scatter plot
    and a difference-versus-average plot are produced through
    ``ggplot.plot``.
    """
    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat')
    parser.add_option('-g', dest='genes_gtf', default=None)
    parser.add_option('-m', dest='min_fpkm', default=0, type='float')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    # parser.error() exits, so the unpacking only runs with two arguments
    if len(args) != 2:
        parser.error('Must provide two diff files')
    diff1_file, diff2_file = args

    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    # load statistics and significance calls, first file then second
    diff1_stats = cuffdiff.hash_stat(diff1_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)

    diff2_stats = cuffdiff.hash_stat(diff2_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    def write_counts(out, title, count1, count2, count_common):
        # one report section: title, a count per file, the shared count
        print >> out, title
        print >> out, '%s\t%d' % (diff1_file, count1)
        print >> out, '%s\t%d' % (diff2_file, count2)
        print >> out, 'Common\t%d' % count_common
        print >> out, ''

    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_sig1 = diff1_sig[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_sig2 = diff2_sig[diff_key]

        report_out = open('%s/%s-%s_report.txt' % (options.out_dir, sample1, sample2), 'w')

        # compare numbers of genes quantified
        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())
        write_counts(report_out, 'Genes quantified', len(gene_stats1), len(gene_stats2), len(common_genes))

        # genes whose significance entry is truthy
        up1 = set(gene_id for gene_id in gene_sig1 if gene_sig1[gene_id])
        up2 = set(gene_id for gene_id in gene_sig2 if gene_sig2[gene_id])
        write_counts(report_out, 'Genes upregulated', len(up1), len(up2), len(up1 & up2))

        # genes whose significance entry is falsy
        down1 = set(gene_id for gene_id in gene_sig1 if not gene_sig1[gene_id])
        down2 = set(gene_id for gene_id in gene_sig2 if not gene_sig2[gene_id])
        write_counts(report_out, 'Genes downregulated', len(down1), len(down2), len(down1 & down2))

        # scatter plot test stat
        df = {
            'diff1': [gene_stats1[gene_id] for gene_id in common_genes],
            'diff2': [gene_stats2[gene_id] for gene_id in common_genes]
        }

        scatter_pdf = '%s/%s-%s_scatter.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot('%s/diff_diff_scatter.r' % os.environ['RDIR'], df, [scatter_pdf], df_file='%s.df' % scatter_pdf[:-4])

        # compute correlation
        cor, p = spearmanr(df['diff1'], df['diff2'])
        print >> report_out, 'Spearman correlation: %f (%f)' % (cor, p)
        cor, p = pearsonr(df['diff1'], df['diff2'])
        print >> report_out, 'Pearson correlation: %f (%f)' % (cor, p)

        report_out.close()

        # plot test_stat versus test_stat difference
        df = {
            'minus': [gene_stats1[gene_id] - gene_stats2[gene_id] for gene_id in common_genes],
            'avg': [0.5 * gene_stats1[gene_id] + 0.5 * gene_stats2[gene_id] for gene_id in common_genes]
        }

        ma_pdf = '%s/%s-%s_ma.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot('%s/diff_diff_ma.r' % os.environ['RDIR'], df, [ma_pdf])
Esempio n. 31
0
def main():
    """Tabulate and plot how GFF features distribute over annotation classes.

    Positional arguments are a genome name (``hg19`` or ``mm9``) and a GFF
    file.  For every requested annotation class the class length is taken
    from ``annotation_pie.annotation_length`` and the feature count from
    ``count_intersection``; ``intergenic`` is derived as whatever is left of
    the genome length / GFF line count.  The function writes
    ``<output_prefix>_counts.txt`` and passes a pie-chart data frame and a
    log2 feature:length ratio data frame to ``ggplot.plot``.
    """
    usage = 'usage: %prog [options] <hg19|mm9> <gff>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='annotations', default='cds,utrs_3p,utrs_5p,lncrna,introns', help='Comma-separated list of annotation classes to include [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option('-t', dest='title', default='Title', help='Plot title [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        gff_file = args[1]
    else:
        parser.error(usage)

    if genome == 'hg19':
        annotation_dir = '%s/research/common/data/genomes/hg19/annotation/gencode_v15/pie' % os.environ['HOME']
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
    elif genome == 'mm9':
        annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
    else:
        parser.error('Genome must specify hg19 or mm9.')

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    genome_length = annotation_pie.count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_pie.annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error('Cannot find annotation BED %s' % annotation_bed)

    if 'intergenic' in annotation_classes:
        # intergenic length is the genome remainder
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths['intergenic'] = genome_length - other_annotations_summed

    ############################################
    # annotation feature counts
    ############################################
    # total feature count is the GFF line count; the argument list avoids
    # shell interpretation of the file name
    genome_features = int(subprocess.check_output(['wc', '-l', gff_file]).split()[0])

    annotation_features = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            annotation_features[ann] = count_intersection(gff_file, annotation_bed)

    if 'intergenic' in annotation_classes:
        # intergenic features are the remainder of all features; this used
        # to reference an undefined name and raised NameError
        other_annotations_summed = sum(annotation_features.values())
        annotation_features['intergenic'] = genome_features - other_annotations_summed

    ############################################
    # table
    ############################################
    annotation_labels = {'rrna':'rRNA', 'smallrna':'smallRNA', 'cds':'CDS', 'utrs_3p':'3\'UTR', 'utrs_5p':'5\'UTR', 'pseudogene':'Pseudogene', 'lncrna':'lncRNA', 'introns':'Introns', 'intergenic':'Intergenic'}

    # floats so the per-class fractions below use true division
    features_sum = float(sum(annotation_features.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    with open('%s_counts.txt' % options.output_prefix, 'w') as counts_out:
        for ann in annotation_classes:
            feature_pct = annotation_features[ann]/features_sum
            length_pct = annotation_lengths[ann]/lengths_sum

            if feature_pct > 0:
                annotation_ratio[ann] = math.log(feature_pct/length_pct,2)
            else:
                # no features: fall back to a +1 smoothed feature fraction
                annotation_ratio[ann] = math.log((1+annotation_features[ann])/(1+features_sum),2)

            cols = (annotation_labels[ann], annotation_features[ann], feature_pct, length_pct, annotation_ratio[ann])
            print >> counts_out, '%10s  %8d  %6.4f  %6.4f  %5.2f' % cols

    ############################################
    # pie chart
    ############################################
    pie_df = {'dummy':[], 'annotation':[], 'count':[]}
    for ann in annotation_classes:
        pie_df['dummy'].append('.')
        pie_df['annotation'].append(annotation_labels[ann])
        pie_df['count'].append(annotation_features[ann])

    ggplot.plot('%s/annotation_pie_pie.r'%os.environ['RDIR'], pie_df, [options.title, '%s_pie.pdf'%options.output_prefix])

    ############################################
    # feature:length ratio
    ############################################
    ratio_df = {'annotation':[], 'ratio':[]}
    for ann in annotation_classes:
        ratio_df['annotation'].append(annotation_labels[ann])
        ratio_df['ratio'].append(annotation_ratio[ann])

    ggplot.plot('%s/annotation_pie_ratios.r'%os.environ['RDIR'], ratio_df, [options.title, '%s_ratios.pdf'%options.output_prefix])
Esempio n. 32
0
def main():
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    # find expressed genes in peak calls
    silent_genes = set()
    if options.control_fpkm_file:
        silent_genes = find_silent(options.control_fpkm_file)

    # find peak bound genes
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf,peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()

    # process RIP
    bound_tstats = []
    unbound_tstats = []
    rip_genes = set()

    diff_in = open(diff_file)
    line = diff_in.readline()
    for line in diff_in:
        a = line.split('\t')

        gene_id = a[0]
        sample1 = a[4]
        sample2 = a[5]
        status = a[6]
        fpkm1 = float(a[7])
        fpkm2 = float(a[8])
        tstat = float(a[10])
        sig = a[13].rstrip()

        if sample2 == 'input':
            tstat *= -1

        if status == 'OK' and not math.isnan(tstat):
            if options.sample1 in [None,sample1] and options.sample2 in [None,sample2]:
                # save RIP bound
                if sig == 'yes':
                    rip_genes.add(gene_id)

                # save test_stat
                if gene_id in peak_genes:
                    bound_tstats.append(tstat)
                else:
                    if not gene_id in silent_genes:
                        unbound_tstats.append(tstat)

    print '%d silent genes' % len(silent_genes)
    print '%d bound genes' % len(bound_tstats)
    print '%d unbound genes' % len(unbound_tstats)

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_tstats, unbound_tstats)
    print z, p

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Peak':(['Yes']*len(bound_tstats) + ['No']*len(unbound_tstats)),
               'Test_stat':bound_tstats+unbound_tstats}

    r_script = '%s/peaks_diff_compare.r' % os.environ['RDIR']

    ggplot.plot(r_script, df_dict, [options.output_pre])

    ##################################################
    # plot venn diagram
    ##################################################
    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    plt.figure()
    venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'RIP'])
    plt.savefig('%s_venn.pdf' % options.output_pre)
Esempio n. 33
0
def main():
    """Relate CLIP peak binding to RIP statistics for one RBP.

    Positional arguments are a peaks GFF and a cuffdiff ``.diff`` file.
    Genes of the reference GTF (optionally filtered by ``filter_single``)
    that overlap a peak are collected with ``intersectBed``; per-gene RIP
    values come from ``ripseq.hash_rip`` / ``ripseq.hash_rip_fold``.  The
    function passes a Gene/CLIP/RIP data frame to ``ggplot.plot``, writes
    ``<output_pre>_stats.txt`` (``stats.mannwhitneyu`` result) and
    ``<output_pre>_hyper.txt`` (``hypergeom.sf`` overlap p-values), and
    saves a Venn diagram when both exclusive sets are non-empty.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file',
                      help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf',
                      default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script',
                      default='%s/peaks_diff_compare.r'%os.environ['RDIR'],
                      help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float',
                      help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='',
                      help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP',
                      help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true',
                      help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true',
                      help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1',
                      help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2',
                      help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    # parser.error() exits, so the unpacking only runs with two arguments
    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    peaks_gff, diff_file = args

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        # the filtered GTF is removed again at the end of the function
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    peak_genes = set()
    intersect_cmd = 'intersectBed -s -u -a %s -b %s' % (options.ref_gtf, peaks_gff)
    intersect_proc = subprocess.Popen(intersect_cmd, shell=True, stdout=subprocess.PIPE)
    for gtf_line in intersect_proc.stdout:
        gtf_cols = gtf_line.split('\t')
        peak_genes.add(gff.gtf_kv(gtf_cols[8])['gene_id'])
    intersect_proc.communicate()

    # genes reported by find_silent are left out of the data frame below
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok = True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        # only the bound calls are kept from hash_rip here; the per-gene
        # values are taken from hash_rip_fold instead
        _hash_rip_values, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene':[], 'CLIP':[], 'RIP':[]}
    for gene_id in rip_fold:
        if gene_id not in gtf_genes:
            continue
        if len(silent_genes) != 0 and gene_id in silent_genes:
            continue

        df_dict['Gene'].append(gene_id)
        df_dict['RIP'].append(rip_fold[gene_id])
        clip_state = 'Bound' if gene_id in peak_genes else 'Unbound'
        df_dict['CLIP'].append(clip_state)

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    rip_and_clip = zip(df_dict['RIP'], df_dict['CLIP'])
    bound_fold = [rip_val for rip_val, clip_state in rip_and_clip if clip_state == 'Bound']
    unbound_fold = [rip_val for rip_val, clip_state in rip_and_clip if clip_state == 'Unbound']

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    stats_row = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s  %5d  %6.2f  %5d  %6.2f  %6.2f  %9.2e' % stats_row
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # data frame genes with a truthy rip_bound entry
    rip_genes = set([gene_id for gene_id in df_dict['Gene'] if rip_bound.get(gene_id, False)])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # hypergeom.sf(x, M, n, N, loc=0) in terms of the usual k/K/N/n:
    # k is x, K is n, N is M, n is N
    num_gtf = len(gtf_genes)
    num_peak = len(peak_genes)
    num_rip = len(rip_genes)

    p1 = hypergeom.sf(both-1, num_gtf, num_peak, num_rip)
    p2 = hypergeom.sf(both-1, num_gtf, num_rip, num_peak)

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    hyper_row = (p1, p2, both, clip_only, rip_only, num_peak, num_rip, num_gtf)
    print >> hyper_out, '%7.2e  %7.2e  %5d  %5d  %5d  %5d  %5d %5d' % hyper_row
    hyper_out.close()

    # the diagram is only drawn when both exclusive sets are non-empty
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
Esempio n. 34
0
def main():
    """Compare the per-gene statistics of two diff files, comparison by comparison.

    Command line: ``[options] <diff1_file> <diff2_file>``.

    For every sample-pair key that ``cuffdiff.hash_stat`` returns for the
    first file, the following are written into ``options.out_dir``:

    * ``<sample1>-<sample2>_report.txt`` -- number of genes quantified in
      each file and in common, number of genes whose ``cuffdiff.hash_sig``
      flag is true ("upregulated") or false ("downregulated") in each file
      and in common, and the Spearman and Pearson correlations of the
      chosen statistic over the common genes.
    * ``<sample1>-<sample2>_scatter.pdf`` -- statistic in file 1 vs. file 2.
    * ``<sample1>-<sample2>_ma.pdf`` -- difference vs. average of the two
      statistics.

    The R plotting scripts are looked up in the directory named by the
    ``RDIR`` environment variable (``KeyError`` if it is unset).

    Comparisons that are absent from the second file are skipped with a
    warning on stderr instead of aborting the whole run.
    """
    # Local import: only needed for the warning below, and keeps this block
    # independent of the file's import header.
    import sys

    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat', help='Statistic to compare between the diff files [Default: %default]')
    parser.add_option('-g', dest='genes_gtf', default=None, help='GTF file whose genes restrict the comparison')
    parser.add_option('-m', dest='min_fpkm', default=0, type='float', help='Minimum FPKM passed to the statistic parser [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='.', help='Output directory [Default: %default]')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float', help='Pseudocount passed to the statistic parser [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # parser.error prints the usage message and exits.
        parser.error('Must provide two diff files')
    diff1_file, diff2_file = args

    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    diff1_stats = cuffdiff.hash_stat(diff1_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)

    diff2_stats = cuffdiff.hash_stat(diff2_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    # makedirs (rather than mkdir) so that a nested -o path also works.
    if not os.path.isdir(options.out_dir):
        os.makedirs(options.out_dir)

    def write_counts(report_out, title, count1, count2, common):
        # One report section: title, one count per input file, the overlap,
        # and a blank separator line.
        print >> report_out, title
        print >> report_out, '%s\t%d' % (diff1_file, count1)
        print >> report_out, '%s\t%d' % (diff2_file, count2)
        print >> report_out, 'Common\t%d' % common
        print >> report_out, ''

    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        if diff_key not in diff2_stats:
            # Nothing to compare against; previously this was a bare KeyError.
            print >> sys.stderr, 'Skipping %s-%s: comparison not found in %s' % (sample1, sample2, diff2_file)
            continue

        gene_stats1 = diff1_stats[diff_key]
        gene_stats2 = diff2_stats[diff_key]

        # NOTE(review): a comparison missing from a hash_sig table is treated
        # as "no significant genes" -- confirm against cuffdiff.hash_sig.
        gene_sig1 = diff1_sig.get(diff_key, {})
        gene_sig2 = diff2_sig.get(diff_key, {})

        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())

        # Looked up once per comparison; both plots use the same directory.
        r_dir = os.environ['RDIR']
        out_prefix = '%s/%s-%s' % (options.out_dir, sample1, sample2)

        # 'with' guarantees the report is closed even if plotting or the
        # correlation computation raises.
        with open('%s_report.txt' % out_prefix, 'w') as report_out:
            # compare numbers of genes quantified
            write_counts(report_out, 'Genes quantified', len(gene_stats1), len(gene_stats2), len(common_genes))

            # truthy significance flag is reported as upregulated
            up1 = set(gene_id for gene_id in gene_sig1 if gene_sig1[gene_id])
            up2 = set(gene_id for gene_id in gene_sig2 if gene_sig2[gene_id])
            write_counts(report_out, 'Genes upregulated', len(up1), len(up2), len(up1 & up2))

            # falsy significance flag is reported as downregulated
            down1 = set(gene_id for gene_id in gene_sig1 if not gene_sig1[gene_id])
            down2 = set(gene_id for gene_id in gene_sig2 if not gene_sig2[gene_id])
            write_counts(report_out, 'Genes downregulated', len(down1), len(down2), len(down1 & down2))

            # scatter plot test stat
            df = {'diff1':[], 'diff2':[]}
            for gene_id in common_genes:
                df['diff1'].append(gene_stats1[gene_id])
                df['diff2'].append(gene_stats2[gene_id])

            r_script = '%s/diff_diff_scatter.r' % r_dir
            out_pdf = '%s_scatter.pdf' % out_prefix
            # out_pdf[:-4] strips the '.pdf' suffix for the data-frame file.
            ggplot.plot(r_script, df, [out_pdf], df_file='%s.df'%out_pdf[:-4])

            # compute correlation
            cor, p = spearmanr(df['diff1'], df['diff2'])
            print >> report_out, 'Spearman correlation: %f (%f)' % (cor,p)
            cor, p = pearsonr(df['diff1'], df['diff2'])
            print >> report_out, 'Pearson correlation: %f (%f)' % (cor,p)

        # plot test_stat versus test_stat difference
        df = {'minus':[], 'avg':[]}
        for gene_id in common_genes:
            df['minus'].append(gene_stats1[gene_id] - gene_stats2[gene_id])
            df['avg'].append(0.5*gene_stats1[gene_id] + 0.5*gene_stats2[gene_id])

        r_script = '%s/diff_diff_ma.r' % r_dir
        out_pdf = '%s_ma.pdf' % out_prefix
        ggplot.plot(r_script, df, [out_pdf])