def main():
    """Bar-plot FPKM estimates (with confidence intervals) for each requested gene."""
    usage = 'usage: %prog [options] <fpkm_tracking> <gene1> <gene2> ...>'
    parser = OptionParser(usage)
    # BUGFIX: '-l' was declared without action='store_true', so optparse
    # treated it as an option that consumes an argument instead of a flag.
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 FPKM')
    parser.add_option('-n', dest='names', default=None, help='Sample names, comma-separated')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='cuff_bars', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='samples', default=None, help='Samples to plot, comma-separated')
    parser.add_option('-y', dest='yaxis_match', default=False, action='store_true', help='Match the y-axis of all plots [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide fpkm_tracking and genes.')
    else:
        fpkm_tracking = args[0]
        genes = args[1:]

    # gene_id -> sample -> (fpkm, conf_lo, conf_hi)  (per read_fpkms)
    gene_sample_fpkm = read_fpkms(fpkm_tracking, genes, options.log, options.pseudocount)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # determine which samples to plot, and their display names
    if options.samples:
        samples = options.samples.split(',')
    else:
        samples = sorted(gene_sample_fpkm[genes[0]].keys())

    if options.names:
        names = options.names.split(',')
    else:
        names = samples

    # shared y-axis limits across all gene plots
    ymin = 0
    if options.log:
        ymin = np.log2(options.pseudocount)

    if options.yaxis_match:
        # max upper confidence bound over all genes and samples
        ymax = max([gene_sample_fpkm[gene_name][sample][2] for sample in samples for gene_name in genes])
    else:
        ymax = None

    # one bar plot per gene
    for gene_name in genes:
        df = {}
        df['Sample'] = names
        df['FPKM'] = [gene_sample_fpkm[gene_name][sample][0] for sample in samples]
        df['conf_lo'] = [gene_sample_fpkm[gene_name][sample][1] for sample in samples]
        df['conf_hi'] = [gene_sample_fpkm[gene_name][sample][2] for sample in samples]

        out_pdf = '%s/%s.pdf' % (options.out_dir, gene_name)
        ggplot.plot('%s/cuff_bar.r'%os.environ['RDIR'], df, [ymin, ymax, out_pdf])
def main():
    """Plot a log2-FPKM heatmap of (optionally filtered/sampled) genes across samples."""
    usage = "usage: %prog [options] <fpkm_tracking>"
    parser = OptionParser(usage)
    parser.add_option("-d", dest="diff_file", help="Limit to significantly differentially expressed genes")
    parser.add_option("-g", dest="gtf", help="GTF file of genes to display")
    # BUGFIX: added type='float' / type='int'. Without them, values supplied
    # on the command line remained strings: '-m' would break the arithmetic
    # below, and '-s' made `len(all_genes) <= options.sample` always True in
    # Python 2 (ints compare less than strings), silently disabling sampling.
    parser.add_option("-m", dest="min_fpkm", default=0.125, type="float", help="Minimum FPKM (for logs) [Default: %default]")
    parser.add_option("-o", dest="out_pdf", default="cuff_heat.pdf", help="Output PDF [Default: %default]")
    parser.add_option("-s", dest="sample", default=1000, type="int", help="Sample genes rather than use all [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide fpkm_tracking")
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes: all in the tracking file, or only those in the GTF
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        for line in open(options.gtf):
            a = line.split("\t")
            all_genes.add(gff.gtf_kv(a[8])["gene_id"])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        display_genes = random.sample(all_genes, options.sample)

    # build data frame of log2(FPKM + pseudocount) per gene/sample
    df = {"Gene": [], "FPKM": [], "Sample": []}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id)
        # skip genes with missing expression estimates
        if not math.isnan(ge[0]):
            for i in range(len(cuff.experiments)):
                df["Gene"].append(gene_id)
                df["Sample"].append(cuff.experiments[i])
                df["FPKM"].append(math.log(ge[i] + options.min_fpkm, 2))

    # plot
    ggplot.plot("%s/cuff_heat.r" % os.environ["RDIR"], df, [options.out_pdf])
def make_output(peak_cov, out_prefix, prange):
    """Write the raw positional coverage counts and render the coverage plot."""
    # NOTE: -prange/2 is Python 2 floor division; kept as-is so odd prange
    # produces the same index range as before.
    half = prange/2
    positions = range(-prange/2, half+1)

    # dump raw counts alongside the plot
    raw_file = open('%s_raw.txt' % out_prefix, 'w')
    for pos in positions:
        print >> raw_file, '%d\t%e' % (pos, peak_cov[pos+half])
    raw_file.close()

    # hand the data frame to the R plotting script
    plot_df = {'peak_i': positions, 'cov': peak_cov}
    ggplot.plot('%s/peak_bam_plot.r' % os.environ['RDIR'], plot_df, ['%s.pdf' % out_prefix])
def main():
    """Scatter-plot and rank-correlate per-sample gene FPKMs between two Cufflinks runs."""
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide two diff files')
    else:
        fpkm1_file, fpkm2_file = args

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # optionally restrict to genes from a GTF
    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    for sample in cuff1.experiments:
        # collect paired log2 FPKMs for this sample
        df = {'fpkm1': [], 'fpkm2': []}
        for gi, gene_id in enumerate(cuff1.genes):
            if gtf_genes and gene_id not in gtf_genes:
                continue
            fpkm1 = cuff1.gene_expr_exp(gi, sample)
            fpkm2 = cuff2.gene_expr_exp(gi, sample)
            if math.isnan(fpkm1) or math.isnan(fpkm2):
                continue
            df['fpkm1'].append(math.log(options.pseudocount + fpkm1, 2))
            df['fpkm2'].append(math.log(options.pseudocount + fpkm2, 2))

        # scatter plot
        r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # report the rank correlation
        cor, p = spearmanr(df['fpkm1'], df['fpkm2'])
        report_out = open('%s/%s_report.txt' % (options.out_dir, sample), 'w')
        print >> report_out, 'Spearman correlation: %f (%e)' % (cor, p)
        report_out.close()
def main():
    """Compare gene FPKMs between two fpkm_tracking files, per sample."""
    usage = 'usage: %prog [options] <fpkm1_file> <fpkm2_file>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide two diff files')
    else:
        fpkm1_file = args[0]
        fpkm2_file = args[1]

    cuff1 = cufflinks.fpkm_tracking(fpkm1_file)
    cuff2 = cufflinks.fpkm_tracking(fpkm2_file)

    # restrict to a gene whitelist when a GTF is provided
    gtf_genes = set()
    if options.gtf:
        gtf_genes = gff.gtf_gene_set(options.gtf)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    r_script = '%s/fpkm_fpkm_scatter.r' % os.environ['RDIR']

    for sample in cuff1.experiments:
        # gather log2(pseudocount + FPKM) pairs, dropping NaN estimates
        logs1 = []
        logs2 = []
        for i in range(len(cuff1.genes)):
            if len(gtf_genes) == 0 or cuff1.genes[i] in gtf_genes:
                f1 = cuff1.gene_expr_exp(i, sample)
                f2 = cuff2.gene_expr_exp(i, sample)
                if not (math.isnan(f1) or math.isnan(f2)):
                    logs1.append(math.log(options.pseudocount+f1,2))
                    logs2.append(math.log(options.pseudocount+f2,2))

        # scatter plot
        df = {'fpkm1': logs1, 'fpkm2': logs2}
        out_pdf = '%s/%s_scatter.pdf' % (options.out_dir, sample)
        ggplot.plot(r_script, df, [out_pdf])

        # Spearman correlation report
        cor, p = spearmanr(logs1, logs2)
        report_out = open('%s/%s_report.txt' % (options.out_dir,sample), 'w')
        print >> report_out, 'Spearman correlation: %f (%e)' % (cor, p)
        report_out.close()
def cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale):
    """CDF plot comparing diff stats of TE-overlapping genes vs background genes."""
    rep, fam, orient = te_key

    # choose the most specific non-wildcard name for the TE class
    if fam == '-':
        base = 'dTE-RNAs'
    elif fam == '*':
        base = 'TE-RNAs'
    elif rep == '*':
        base = '%s-RNAs' % fam
    else:
        base = '%s-RNAs' % rep
    label = '%s/%s' % (base, orient)

    # background genes come first, tagged with a leading 'd'
    df = {
        'diff': note_diffs + te_diffs,
        'class': ['d%s' % label]*len(note_diffs) + [label]*len(te_diffs),
    }
    ggplot.plot('%s/te_diff.r' % os.environ['RDIR'], df, [out_pdf, scale])
def cdf_plot(te_or, w_te, wo_te, out_pdf):
    """CDF plot of fold changes for genes with vs without the given TE."""
    rep, fam, orient = te_or

    # label from the most specific non-wildcard field of (repeat, family)
    if fam == '-':
        rna = 'dTE-RNAs'
    elif fam == '*':
        rna = 'TE-RNAs'
    else:
        rna = '%s-RNAs' % (fam if rep == '*' else rep)
    label = '%s/%s' % (rna, orient)

    # genes lacking the TE come first, tagged with a leading 'd'
    df = {
        'fold': wo_te + w_te,
        'class': ['d%s' % label]*len(wo_te) + [label]*len(w_te),
    }
    ggplot.plot('te_diff.r', df, [out_pdf])
def cdf_plot(te_key, te_diffs, note_diffs, out_pdf, scale):
    """Plot CDFs of differential-expression stats for TE vs non-TE genes."""
    rep, fam, orient = te_key

    # legend label: most specific of repeat name, family, or generic TE
    if fam == '-':
        label = 'dTE-RNAs/%s' % orient
    elif fam == '*':
        label = 'TE-RNAs/%s' % orient
    elif rep == '*':
        label = '%s-RNAs/%s' % (fam, orient)
    else:
        label = '%s-RNAs/%s' % (rep, orient)

    # non-TE genes first (prefixed 'd'), then TE-overlapping genes
    classes = ['d%s' % label] * len(note_diffs)
    classes += [label] * len(te_diffs)
    df = {'diff': note_diffs + te_diffs, 'class': classes}

    ggplot.plot('%s/te_diff.r' % os.environ['RDIR'], df, [out_pdf, scale])
def plot_coverage(bam_te_coverages, dfam_te, orient, labels, out_dir):
    """Aggregate per-BAM coverage profiles over one TE/orientation and plot them."""
    df = {'indexes': [], 'coverage': [], 'coverage_norm': [], 'data': []}

    for bi, te_cov in enumerate(bam_te_coverages):
        profile = te_cov.get((dfam_te, orient), [])
        if not profile:
            continue

        df['indexes'].extend(range(len(profile)))
        df['data'].extend([labels[bi]] * len(profile))

        # normalize so each BAM's profile sums to 1
        total = float(sum(profile))
        profile_norm = [c / total for c in profile]

        # flip reverse-orientation profiles so all plots share one direction
        if orient == 'rev':
            profile = profile[::-1]
            profile_norm = profile_norm[::-1]
        df['coverage'].extend(profile)
        df['coverage_norm'].extend(profile_norm)

    # plot only if at least one BAM covered this TE
    if df['indexes']:
        out_pre = '%s/%s_%s_cov' % (out_dir, dfam_te, orient)
        ggplot.plot('%s/te_cov.r' % tempura.r_dir, df, [dfam_te, out_pre])
def main():
    """Plot heatmaps and/or meta-coverage of event coverage around anchor GFF features.

    mode 'mid' centers a fixed window on each anchor's midpoint; mode 'span'
    bins each anchor's span into a fixed number of bins.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    # BUGFIX: '-e' was declared without action='store_true', so optparse
    # treated the heatmap flag as an option that consumes an argument.
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')
    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF (sampling/filtering/windowing per options)
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage(prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean up the temporary preprocessed GFF
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages by event count (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1+coverage[anchor_id][i])/float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (1+coverage_control[anchor_id][i])/float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # one sorted list per provided gene-list file
        for sorted_gene_file in options.sorted_gene_files.split(','):
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # keep only anchors that survived random selection
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)
    else:
        # rank anchors by mean (log-ratio vs control, or geometric mean) coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                astat = stats.mean([math.log(coverage[anchor_id][i],2) - math.log(coverage_control[anchor_id][i],2) for i in range(len(coverage[anchor_id]))])
            else:
                astat = stats.geo_mean([coverage[anchor_id][i] for i in range(len(coverage[anchor_id]))])
            stat_aid.append((astat, anchor_id))

        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index':[], 'Anchor':[], 'Coverage':[]}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]
                for i in range(len(coverage[anchor_id])):
                    # 'mid' mode indexes positions relative to the midpoint
                    if mode == 'mid':
                        df['Index'].append(i - options.window/2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)

                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]
                    # express relative to control: subtract logs or divide raw
                    if options.control_files:
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]
                    df['Coverage'].append('%.4e' % cov)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name the heatmap after its gene-list file
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

            ggplot.plot(r_script, df, [out_pdf, options.control_files!=None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_files:
        df['Type'] = []

    # positions per anchor profile
    if mode == 'mid':
        index_length = 2*(options.window/2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        print >> sys.stderr, 'Unknown mode %s' % mode
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window/2)
        else:
            df['Index'].append(i)

        # geometric mean for log plots, arithmetic otherwise
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))

        if options.control_files:
            df['Type'].append('Primary')
            if mode == 'mid':
                df['Index'].append(i - options.window/2)
            else:
                df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    ggplot.plot(r_script, df, [options.output_pre])
def main():
    """Compare CLIP peak-bound genes against RIP differential-expression results.

    Produces distribution plots (bound vs unbound), a Mann-Whitney stats file,
    a hypergeometric overlap test, and a CLIP/fRIP Venn diagram.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf' % os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r' % os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true', help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    if options.single_gene_loci:
        # restrict the reference to single-gene loci (temp file cleaned below)
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    # genes whose annotation strand-specifically intersects a CLIP peak
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf, peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok=True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        # NOTE(review): this immediately overwrites the fold values from
        # hash_rip with hash_rip_fold's capped version (min_fpkm/max_fold),
        # keeping only rip_bound from the call above — confirm intended.
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    # TEMP: print bound genes
    # genes_out = open('%s_genes.txt' % options.output_pre, 'w')
    # for gene_id in rip_bound:
    #     if rip_bound[gene_id]:
    #         print >> genes_out, gene_id, rip_fold[gene_id]
    # genes_out.close()

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame; drop genes outside the GTF and silent genes
    df_dict = {'Gene': [], 'CLIP': [], 'RIP': []}
    for gene_id in rip_fold:
        if gene_id in gtf_genes and (len(silent_genes) == 0 or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    bound_fold = [ df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Bound' ]
    unbound_fold = [ df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Unbound' ]

    # perform statistical test
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e' % cols
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # genes RIP-bound according to the diff file (within the plotted frame)
    rip_genes = set([ df_dict['Gene'][i] for i in range(len(df_dict['Gene'])) if rip_bound.get(df_dict['Gene'][i], False) ])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # hypergeom.sf(x, M, n, N): x=overlap-1, M=universe, n/N=set sizes,
    # computed both ways for symmetry
    p1 = hypergeom.sf(both - 1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both - 1, len(gtf_genes), len(rip_genes), len(peak_genes))

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
    print >> hyper_out, '%7.2e %7.2e %5d %5d %5d %5d %5d %5d' % cols
    hyper_out.close()

    # venn2 cannot draw degenerate diagrams, so require both exclusive sets
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
def main():
    """Plot heatmaps and meta-coverage of BAM fragments in a window around GFF feature midpoints."""
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option('-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    # count features to set the sampling probability
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1
    sample_prob = min(1.0, options.max_features / float(feature_count))

    # rewrite each entry as a window centered on its midpoint
    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')
    for line in open(gff_file):
        a = line.split('\t')
        start = int(a[3])
        end = int(a[4])
        mid = start + (end-start)/2
        a[3] = str(mid - options.range/2)
        a[4] = str(mid + options.range/2)
        a[-1] = a[-1].rstrip()
        if random.random() < sample_prob:
            print >> gff_range_out, '\t'.join(a)
    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage(gff_range_file, control_bam_files, options.gtf_key)

    # clean up the temporary windowed GFF
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages by fragment count (and add pseudocounts)
    # BUGFIX: divide by float(...) so that Python 2 integer division cannot
    # truncate normalized coverage to zero; this matches the analogous
    # normalization in the gff_cov script, which uses float(events).
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1+coverage[feature_id][i])/float(fragments)
            if options.control_bam_files:
                coverage_control[feature_id][i] = (1+coverage_control[feature_id][i])/float(fragments_control)

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # one sorted list per provided gene-list file
        for sorted_gene_file in options.sorted_gene_files.split(','):
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # keep only features that survived random sampling
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)
    else:
        # rank features by mean (log-ratio vs control, or geometric mean) coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                feature_stat = stats.mean([math.log(coverage[feature_id][i],2) - math.log(coverage_control[feature_id][i],2) for i in range(len(coverage[feature_id]))])
            else:
                feature_stat = stats.geo_mean([coverage[feature_id][i] for i in range(len(coverage[feature_id]))])
            feature_id_stat.append((feature_stat,feature_id))

        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append([feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index':[], 'Feature':[], 'Coverage':[]}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            for i in range(-options.range/2,options.range/2+1):
                df['Index'].append(i)
                df['Feature'].append(f)

                if options.log:
                    cov = math.log(coverage[feature_id][i+options.range/2],2)
                else:
                    cov = coverage[feature_id][i+options.range/2]
                # express relative to control: subtract logs or divide raw
                if options.control_bam_files:
                    if options.log:
                        cov -= math.log(coverage_control[feature_id][i+options.range/2],2)
                    else:
                        cov = cov / coverage_control[feature_id][i+options.range/2]
                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name the heatmap after its gene-list file
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre,sorted_gene_pre)

        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files!=None], df_file='df_heat.txt')

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index':[], 'Coverage':[]}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range/2,options.range/2+1):
        df['Index'].append(i)

        # geometric mean for log plots, arithmetic otherwise
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[feature_id][i+options.range/2] for feature_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[feature_id][i+options.range/2] for feature_id in coverage]))

        if options.control_bam_files:
            df['Type'].append('Primary')
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[feature_id][i+options.range/2] for feature_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[feature_id][i+options.range/2] for feature_id in coverage_control]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre
    ggplot.plot(r_script, df, [out_pdf], df_file='df_meta.txt')
def main():
    """Plot quartiles of differential-expression stats as a function of per-gene TE count."""
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff'%os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Default: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar transcript lengths
        if options.spread_factor:
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUGFIX: the upper bound previously passed spread_lower twice,
        # so the computed spread_upper was never applied.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)

        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurrence count
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diff stats per sample pair
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    for spair in gene_diffs:
        sample1, sample2 = spair

        for fam in count_tes:
            if options.orientation:
                orients = ['+','-']
            else:
                orients = ['+']

            for orient in orients:
                # bucket diff values by TE count
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam,orient), 0)
                    else:
                        count = gene_te_num.get(gene_id,{}).get(('*',fam), 0)
                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                # quartiles per TE count; stop at the first sparse bucket
                df = {'TEs':[], 'stat_low':[], 'stat_mid':[], 'stat_hi':[]}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                # only plot when there is a trend across >1 count
                if len(df['TEs']) > 1:
                    fam_plot = fam[fam.find('/')+1:]
                    if options.orientation:
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot)

                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df, [out_pdf, options.scale], df_file=out_df)

    # clean up the temporary length-filtered GTF
    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Plot differential-expression stats binned by TE (transposable element)
    insertion count per transcript, one plot per sample pair and TE family.

    Reads a reference GTF and a cuffdiff .diff file, optionally length-filters
    transcripts, counts TE occurrences per transcript, and writes one PDF +
    data frame per (sample pair, family[, orientation]) to the output dir.
    """
    usage = 'usage: %prog [options] <gtf> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='te_diff_regress', help='Output directory to print regression summaries [Default: %default]')
    parser.add_option('-c', dest='scale', default=1, type='float', help='Plot scale [Default: %default]')
    parser.add_option('-t', dest='te_gff', default='%s/hg19.fa.out.tp.gff' % os.environ['MASK'])
    parser.add_option('-r', dest='orientation', default=False, action='store_true', help='Split TEs by orientation [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=None, type='float', help='Maximum stat for plotting [Default: %default]')
    parser.add_option('-s', dest='spread_factor', default=None, type='float', help='Allow multiplicative factor between the shortest and longest transcripts, used to filter [Default: %default]')
    parser.add_option('-l', dest='spread_lower', default=None, type='float', help='Allow multiplicative factor between median and shortest transcripts [Defafult: %default]')
    parser.add_option('-u', dest='spread_upper', default=None, type='float', help='Allow multiplicative factor between median and longest transcripts [Defafult: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide .gtf and .diff files')
    else:
        ref_gtf = args[0]
        diff_file = args[1]

    # make output directory
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    if options.spread_factor or options.spread_lower or options.spread_upper:
        # filter for similar length
        if options.spread_factor:
            # split the total factor symmetrically around the median
            options.spread_lower = math.sqrt(options.spread_factor)
            options.spread_upper = math.sqrt(options.spread_factor)

        spread_gtf = '%s/spread_factor.gtf' % options.out_dir
        # BUG FIX: the upper bound was previously passed as spread_lower twice,
        # leaving options.spread_upper unused.
        gff.length_filter(ref_gtf, spread_gtf, options.spread_lower, options.spread_upper, verbose=True)
        ref_gtf = spread_gtf

    # hash genes -> TEs -> occurence num
    gene_te_num = te.hash_genes_repeats_num(ref_gtf, options.te_gff, gene_key='transcript_id', add_star=True, stranded=options.orientation)

    # hash diffs stats
    gene_diffs = cuffdiff.hash_diff(diff_file, stat='fold', max_stat=options.max_stat, sample_first='input')

    for spair in gene_diffs:
        sample1, sample2 = spair

        # NOTE(review): count_tes is not defined in this function; presumably a
        # module-level iterable of TE families — confirm against file header.
        for fam in count_tes:
            if options.orientation:
                orients = ['+', '-']
            else:
                orients = ['+']
            for orient in orients:
                # hash diff values by TE count: count_diff[c] collects the diff
                # stats of all genes with exactly c insertions of this family
                count_diff = []
                for gene_id in gene_diffs[spair]:
                    if options.orientation:
                        count = gene_te_num.get(gene_id, {}).get(('*', fam, orient), 0)
                    else:
                        count = gene_te_num.get(gene_id, {}).get(('*', fam), 0)
                    while count >= len(count_diff):
                        count_diff.append([])
                    count_diff[count].append(gene_diffs[spair][gene_id])

                # summarize each count bin by quartiles; stop at the first bin
                # with too few genes (<= 12) for a stable estimate
                df = {'TEs': [], 'stat_low': [], 'stat_mid': [], 'stat_hi': []}
                for c in range(len(count_diff)):
                    if len(count_diff[c]) > 12:
                        stat_low, stat_mid, stat_hi = stats.quantile(count_diff[c], [.25, .5, .75])
                        df['TEs'].append(c)
                        df['stat_low'].append(stat_low)
                        df['stat_mid'].append(stat_mid)
                        df['stat_hi'].append(stat_hi)
                    else:
                        break

                # only plot if at least two usable count bins exist
                if len(df['TEs']) > 1:
                    # strip the repeat class prefix (e.g. 'LINE/L1' -> 'L1')
                    fam_plot = fam[fam.find('/') + 1:]
                    if options.orientation:
                        out_pdf = '%s/%s-%s_%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot, orient)
                        out_df = '%s/%s-%s_%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot, orient)
                    else:
                        out_pdf = '%s/%s-%s_%s.pdf' % (options.out_dir, sample1, sample2, fam_plot)
                        out_df = '%s/%s-%s_%s.df' % (options.out_dir, sample1, sample2, fam_plot)
                    ggplot.plot('%s/te_diff_count.r' % os.environ['RDIR'], df, [out_pdf, options.scale], df_file=out_df)

    # clean up the temporary length-filtered GTF
    if options.spread_factor or options.spread_lower or options.spread_upper:
        os.remove(spread_gtf)
def main():
    """Build a score + per-position-nucleotide data frame from an MSA and
    BED scores, then hand it to an R script for mutual-information plotting.

    Consensus columns come either from Dfam's 'x'-marked Consensus row (-d)
    or from define_consensus() using a minimum occupancy fraction (-c).
    """
    usage = 'usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-d', dest='dfam_consensus', action='store_true', help='Pass the option if you want to use Consensus as defined by Dfam')
    #parser.add_option('-j', dest='condense_pct', type='float', help='Required proportion of entries to be same between 2 columns for them to be merged')
    #parser.add_option('-n', dest='discretize_bins', type='int', help='The number of bins you want to discretize the scores into')
    parser.add_option('-o', dest='output_pre', type='string', help='Prefix of the output files')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide both the BED file and MSA file. Check %s' % usage)
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    # key sequences by their genomic locus 'chrom:start-end' (BED cols 0-2);
    # score is BED column 4
    seq_scores = {}
    for line in open(bed_file):
        a = line.split('\t')
        header = a[0] + ':' + a[1] + '-' + a[2]
        score = float(a[4])
        seq_scores[header] = score

    ##################################################
    # define consensus
    # define columns to condense for regression
    ##################################################
    # first pass: load the full MSA keyed by '>'-prefixed header lines
    # (assumes the file starts with a header line)
    msa_sequences = {}
    for line in open(msa_fasta_file):
        if line[0] == '>':
            header = line.strip()
            msa_sequences[header] = ''
        else:
            msa_sequences[header] += line.strip()

    if options.dfam_consensus is True:
        # Dfam marks consensus positions with 'x' in the Consensus row
        consensus_sequence = msa_sequences.pop('>Consensus')
        sequence_length = len(consensus_sequence)  # NOTE(review): unused below
        consensus_columns = []
        for i in range(0, len(consensus_sequence)):
            if consensus_sequence[i] == 'x':
                consensus_columns.append(i)
    else:
        # occupancy-based consensus; msa_sequences is unused in this branch
        consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)
        #sample_sequence = msa_sequences.pop('>Consensus')
        #sequence_length = len(sample_sequence)
        #hamming_cutoff = int(sequence_length - options.condense_pct*sequence_length)
        #condensed_columns, columns_ls_remove = column_condense(msa_sequences, consensus_columns, hamming_cutoff)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # initialize the dictionary with score and position/nt features
    # (integer keys 1..N are the consensus positions)
    df_mi = {'Score': []}
    for i in range(len(consensus_columns)):
        position = i + 1
        df_mi[position] = []

    # second pass: stream sequences; each completed sequence contributes its
    # score and the nucleotide at every consensus column
    header = ''
    for line in open(msa_fasta_file):
        if line[0] == '>':
            if header and header != 'Consensus':
                # process seq
                df_mi['Score'].append(seq_scores[header])
                for i in range(len(consensus_columns)):
                    position = i + 1
                    seq_i = consensus_columns[i]
                    nt = seq[seq_i].upper()
                    df_mi[position].append(nt)
            header = line[1:].rstrip()
            seq = ''
        else:
            seq += line.rstrip()

    if header and header != 'Consensus':
        # process last seq (loop above only flushes on seeing the next header)
        df_mi['Score'].append(seq_scores[header])
        for i in range(len(consensus_columns)):
            position = i + 1
            seq_i = consensus_columns[i]
            nt = seq[seq_i].upper()
            df_mi[position].append(nt)

    ggplot.plot('%s/te_mut_info.r' % tempura.r_dir, df_mi, [options.output_pre])
def main():
    """Plot BAM read coverage around GFF feature midpoints: per-feature
    heatmaps plus a meta-coverage profile, optionally against control BAMs.
    """
    usage = 'usage: %prog [options] <gff> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_bam_files', default=None, help='Control BAM files (comma separated)')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('-k', dest='gtf_key', default=None, help='GTF key to hash gff entries by')
    parser.add_option('-m', dest='max_features', default=2000, type='int', help='Maximum number of features to plot [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='bam', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-u', dest='range', default=2000, type='int', help='Range around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide gtf file and BAM file')
    else:
        gff_file = args[0]
        bam_files = args[1].split(',')

    if options.control_bam_files:
        control_bam_files = options.control_bam_files.split(',')

    ############################################
    # extend GFF entries to range (and sample)
    ############################################
    feature_count = 0
    for line in open(gff_file):
        feature_count += 1

    # subsample features so at most ~max_features are plotted
    sample_prob = min(1.0, options.max_features / float(feature_count))

    gff_range_fd, gff_range_file = tempfile.mkstemp()
    gff_range_out = open(gff_range_file, 'w')

    for line in open(gff_file):
        a = line.split('\t')
        start = int(a[3])
        end = int(a[4])
        mid = start + (end - start) / 2
        range_start = mid - options.range / 2
        range_end = mid + options.range / 2
        # skip features whose window would run off the chromosome start
        if range_start > 0:
            a[3] = str(mid - options.range / 2)
            a[4] = str(mid + options.range / 2)
            a[-1] = a[-1].rstrip()
            if random.random() < sample_prob:
                print >> gff_range_out, '\t'.join(a)

    gff_range_out.close()

    ############################################
    # compute coverage
    ############################################
    coverage, fragments = compute_coverage(gff_range_file, bam_files, options.gtf_key)
    if options.control_bam_files:
        coverage_control, fragments_control = compute_coverage(gff_range_file, control_bam_files, options.gtf_key)

    # clean
    os.close(gff_range_fd)
    os.remove(gff_range_file)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    # BUG FIX: force float division — under Python 2, dividing by an integer
    # fragment count truncated every normalized value to 0 (the parallel
    # gff_cov script already divides by float(events)).
    for feature_id in coverage:
        for i in range(len(coverage[feature_id])):
            coverage[feature_id][i] = (1 + coverage[feature_id][i]) / float(fragments)
            if options.control_bam_files:
                coverage_control[feature_id][i] = (1 + coverage_control[feature_id][i]) / float(fragments_control)

    ############################################
    # sorted genes
    ############################################
    features_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect feature_id's
            features_sorted.append([])
            for line in open(sorted_gene_file):
                feature_id = line.split()[0]
                # verify randomly selected
                if feature_id in coverage:
                    features_sorted[-1].append(feature_id)
    else:
        # tuple feature_id's with mean coverage
        feature_id_stat = []
        for feature_id in coverage:
            if options.control_bam_files:
                # mean log2 ratio vs control
                feature_stat = stats.mean([math.log(coverage[feature_id][i], 2) - math.log(coverage_control[feature_id][i], 2) for i in range(len(coverage[feature_id]))])
            else:
                feature_stat = stats.geo_mean([coverage[feature_id][i] for i in range(len(coverage[feature_id]))])
            feature_id_stat.append((feature_stat, feature_id))

        # sort
        feature_id_stat.sort(reverse=True)

        # store as the only sorted list
        features_sorted.append([feature_id for (feature_stat, feature_id) in feature_id_stat])

    ############################################
    # plot heatmap(s)
    ############################################
    # if multiple sorts, create a dir for the plots
    if len(features_sorted) > 1:
        if not os.path.isdir('%s_heat' % options.output_pre):
            os.mkdir('%s_heat' % options.output_pre)

    for s in range(len(features_sorted)):
        df = {'Index': [], 'Feature': [], 'Coverage': []}
        for f in range(len(features_sorted[s])):
            feature_id = features_sorted[s][f]
            for i in range(-options.range / 2, options.range / 2 + 1):
                df['Index'].append(i)
                df['Feature'].append(f)
                if options.log:
                    cov = math.log(coverage[feature_id][i + options.range / 2], 2)
                else:
                    cov = coverage[feature_id][i + options.range / 2]
                if options.control_bam_files:
                    # express relative to control (log difference / ratio)
                    if options.log:
                        cov -= math.log(coverage_control[feature_id][i + options.range / 2], 2)
                    else:
                        cov = cov / coverage_control[feature_id][i + options.range / 2]
                df['Coverage'].append('%.4e' % cov)

        r_script = '%s/bam_heat_heat.r' % os.environ['RDIR']
        if len(features_sorted) == 1:
            out_pdf = '%s_heat.pdf' % options.output_pre
        else:
            # name each heatmap after its sorted gene list file
            sorted_gene_file = options.sorted_gene_files.split(',')[s]
            sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
            out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)
        ggplot.plot(r_script, df, [out_pdf, options.control_bam_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_bam_files:
        df['Type'] = []

    for i in range(-options.range / 2, options.range / 2 + 1):
        df['Index'].append(i)
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[feature_id][i + options.range / 2] for feature_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[feature_id][i + options.range / 2] for feature_id in coverage]))
        if options.control_bam_files:
            df['Type'].append('Primary')
            df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[feature_id][i + options.range / 2] for feature_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[feature_id][i + options.range / 2] for feature_id in coverage_control]))

    r_script = '%s/bam_heat_meta.r' % os.environ['RDIR']
    out_pdf = '%s_meta.pdf' % options.output_pre
    ggplot.plot(r_script, df, [out_pdf])
def main():
    """Plot a log2-FPKM gene expression heatmap from a Cufflinks
    fpkm_tracking file, optionally restricted to a GTF gene set and/or
    significantly differential genes, subsampled to a manageable size.
    """
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # BUG FIX: help string used '%d', which optparse does not expand (it only
    # substitutes '%default'); also fixed 'Maxium' typo.
    parser.add_option('-a', dest='max_fpkm', type='float', help='Maximum log2 FPKM to plot [Default: %default]')
    parser.add_option('-d', dest='diff_file', help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m', dest='min_fpkm', default=0, type='float', help='Minimum FPKM [Default: %default]')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_heat.pdf', help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='sample', default=1000, type='int', help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true', help='Uppercase sample labels [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        for line in open(options.gtf):
            a = line.split('\t')
            all_genes.add(gff.gtf_kv(a[8])['gene_id'])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes
    else:
        # at least limit to clean genes (no NaN expression values)
        clean_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id)
            clean = True
            for i in range(len(ge)):
                if math.isnan(ge[i]):
                    clean = False
                    break
            if clean:
                clean_genes.add(gene_id)
        all_genes &= clean_genes

    # drop genes never expressed above the minimum in any sample
    if options.min_fpkm > 0:
        expressed_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
            if max(ge) > options.min_fpkm:
                expressed_genes.add(gene_id)
        all_genes &= expressed_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        display_genes = random.sample(all_genes, options.sample)

    # build data frame
    df = {'Gene': [], 'FPKM': [], 'Sample': []}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
        for i in range(len(cuff.experiments)):
            df['Gene'].append(gene_id)
            df['Sample'].append(cuff.experiments[i])
            if options.uppercase:
                df['Sample'][-1] = df['Sample'][-1].upper()
            logfpkm = np.log2(ge[i] + options.pseudocount)
            # BUG FIX: test against None explicitly so an explicit '-a 0'
            # cap is honored (truthiness treated 0.0 as unset).
            if options.max_fpkm is not None:
                logfpkm = min(options.max_fpkm, logfpkm)
            df['FPKM'].append(logfpkm)

    # plot
    out_df = '%s.df' % options.out_pdf[:-4]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'], df, [options.out_pdf], df_file=out_df)
def main():
    """Fit an OLS regression of sequence scores on one-hot(ish) encoded
    nucleotides at MSA consensus columns, then plot per-position weights.

    Pipeline: hash scores from BED col 4 (keyed by BED name, col 3) ->
    pick consensus columns -> encode each aligned sequence -> fit
    statsmodels OLS -> re-parse the written summary text for coefficients ->
    plot weights via an R script.
    """
    usage = 'usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-m', dest='model_output_file', default='ols_summary.txt', help='The file to write the model summary')
    parser.add_option('-p', dest='plot_output_file', default='weights_plot.pdf', help='The file to print the plot of index versus weight')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide BED file with scores and MSA fasta file')
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    # key by BED name column (col 3); score from col 4
    seq_scores = {}
    for line in open(bed_file):
        a = line.split('\t')
        header = a[3]
        score = float(a[4])
        seq_scores[header] = score

    ##################################################
    # define consensus
    ##################################################
    consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # 3-column encoding with T as the reference (all-zero) level;
    # ambiguous/gap characters get a uniform 0.25 split
    quaternary_conversion_dict = {'A':[1,0,0], 'C':[0,1,0], 'G':[0,0,1], 'T':[0,0,0], 'N':[0.25,0.25,0.25], '.':[0.25,0.25,0.25], '-':[0.25,0.25,0.25]}

    # initialize the dictionary with score and position/nt features
    df_dict = {'Score':[]}
    for i in range(len(consensus_columns)):
        position = str(i+1)
        df_dict[position+'_A'] = []
        df_dict[position+'_C'] = []
        df_dict[position+'_G'] = []

    # stream sequences; flush a sequence's features when the next header
    # appears (the trailing block below flushes the final sequence)
    header = ''
    for line in open(msa_fasta_file):
        if line[0] == '>':
            if header and header != 'Consensus':
                # process seq
                df_dict['Score'].append(seq_scores[header])
                for i in range(len(consensus_columns)):
                    position = str(i+1)
                    seq_i = consensus_columns[i]
                    nt = seq[seq_i].upper()
                    nt_conv = quaternary_conversion_dict[nt]
                    df_dict[position+'_A'].append(nt_conv[0])
                    df_dict[position+'_C'].append(nt_conv[1])
                    df_dict[position+'_G'].append(nt_conv[2])
            header = line[1:].rstrip()
            seq = ''
        else:
            seq += line.rstrip()

    if header and header != 'Consensus':
        # process last seq
        df_dict['Score'].append(seq_scores[header])
        for i in range(len(consensus_columns)):
            position = str(i+1)
            seq_i = consensus_columns[i]
            nt = seq[seq_i].upper()
            nt_conv = quaternary_conversion_dict[nt]
            df_dict[position+'_A'].append(nt_conv[0])
            df_dict[position+'_C'].append(nt_conv[1])
            df_dict[position+'_G'].append(nt_conv[2])

    ##################################################
    # perform learning
    ##################################################
    # add y-intercept term
    df_dict['Const'] = [1]*len(df_dict['Score'])
    df = pd.DataFrame(df_dict)
    score = df['Score']
    X = df.drop('Score', axis=1)
    print >> sys.stderr, 'Read in all the sequences and scores. Now fitting the model'
    mod = sm.OLS(score, X)
    res = mod.fit()
    model_output_file = open(options.model_output_file,'w')
    print >> model_output_file, res.summary()
    model_output_file.close()
    print >> sys.stderr, 'Fit an OLS model and print the summary to %s' %(options.model_output_file)

    ##################################################
    # read output
    ##################################################
    # NOTE(review): this re-parses the statsmodels summary text; the
    # coefficient table is assumed to sit between a '--' rule and the next
    # '==' rule, with term name then estimate — fragile across statsmodels
    # versions, verify if upgrading.
    position_weights = collections.defaultdict(list)
    flag = False
    for line in open(options.model_output_file, 'r'):
        if line[0:2] == '==':
            flag = False
        elif line[0:2] == '--':
            flag = True
        elif flag:
            contents = line.split()
            if contents[0] != 'Const':
                position = int(contents[0].split('_')[0])
                weight = float(contents[1])
                position_weights[position].append(weight)

    df_dict = {'Position':[], 'Nucleotide':[], 'Weight':[]}
    #print '\t'.join(df_dict.keys())
    for position in position_weights.keys():
        # coefficients were appended in column order A, C, G; T is the
        # reference level with implicit weight 0
        weight_A, weight_C, weight_G = position_weights[position]
        weight_T = 0.0
        nucleotide_weights = [weight_A, weight_C, weight_T, weight_G]
        nucleotide_order = ['A','C','T','G']
        # shift so the minimum weight at each position is 0
        min_weight = min(nucleotide_weights)
        for i in range(0, len(nucleotide_weights)):
            nucleotide_weights[i] = nucleotide_weights[i] - min_weight
        for i in range(0, len(nucleotide_weights)):
            df_dict['Position'].append(position)
            df_dict['Nucleotide'].append(nucleotide_order[i])
            df_dict['Weight'].append(nucleotide_weights[i])
            #print '\t'.join([str(position), nucleotide_order[i], str(nucleotide_weights[i])])

    print >> sys.stderr, 'Now plotting the weights of different nucleotides along each position'
    ggplot.plot('%s/te_score_plots.r' % tempura.r_dir, df_dict, [options.plot_output_file])
    print >> sys.stderr, 'All Done. Check output files'
def main():
    """Plot event (BAM/GFF) coverage over anchor GFF features, either
    centered on feature midpoints ('mid' mode, fixed window) or binned
    across each feature span ('span' mode); optional control track,
    per-anchor heatmaps, and a meta-coverage profile.
    """
    usage = 'usage: %prog [options] <mode=mid/span> <anchor_gff> <event_bam1,event_bam2,...|event_gff1,event_gff2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='max_anchors', default=1000, type='int', help='Maximum number of anchors to consider [Default: %default]')
    parser.add_option('-c', dest='control_files', default=None, help='Control BAM or GFF files (comma separated)')
    parser.add_option('-e', dest='plot_heat', default=False, action='store_true', help='Plot as a heatmap [Default: %default]')
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 coverage [Default: %default]')
    parser.add_option('--labels', dest='labels', default='Primary,Control', help='Plot labels [Default:%default]')
    parser.add_option('-o', dest='output_pre', default='gff_cov', help='Output prefix [Default: %default]')
    parser.add_option('-s', dest='sorted_gene_files', help='Files of sorted gene lists. Plot heatmaps in their order')
    parser.add_option('-p', dest='smooth_span', default=0.2, type='float', help='Smoothing span parameter [Default: %default]')
    parser.add_option('-b', dest='bins', default=100, type='int', help='Number of bins across the gene span [Default: %default]')
    parser.add_option('-m', dest='min_length', default=None, type='int', help='Minimum anchor length [Default: %default]')
    parser.add_option('-w', dest='window', default=2000, type='int', help='Window around peak middle [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide mode, anchor GFF, and BAM/GFF file(s)')
    else:
        mode = args[0]
        anchor_gff = args[1]
        event_files = args[2].split(',')

    if options.control_files:
        control_files = options.control_files.split(',')

    plot_labels = options.labels.split(',')

    # GTF input is detected by extension and handled differently downstream
    anchor_is_gtf = (anchor_gff[-4:] == '.gtf')

    # preprocess anchor GFF (writes a temp file; fd returned for cleanup)
    prep_anchor_fd, prep_anchor_gff = preprocess_anchors(anchor_gff, mode, options.max_anchors, anchor_is_gtf, options.min_length, options.window)

    ############################################
    # compute coverage
    ############################################
    coverage, events = compute_coverage(prep_anchor_gff, event_files, mode, anchor_is_gtf, options.bins)
    if options.control_files:
        coverage_control, events_control = compute_coverage(prep_anchor_gff, control_files, mode, anchor_is_gtf, options.bins)

    # clean
    os.close(prep_anchor_fd)
    os.remove(prep_anchor_gff)

    ############################################
    # normalize
    ############################################
    # normalize coverages (and add pseudocounts)
    for anchor_id in coverage:
        for i in range(len(coverage[anchor_id])):
            coverage[anchor_id][i] = (1 + coverage[anchor_id][i]) / float(events)
            if options.control_files:
                coverage_control[anchor_id][i] = (1 + coverage_control[anchor_id][i]) / float(events_control)

    ############################################
    # sort anchors
    ############################################
    anchors_sorted = []
    if options.sorted_gene_files:
        # for each sorted list
        for sorted_gene_file in options.sorted_gene_files.split(','):
            # collect anchor_id's
            anchors_sorted.append([])
            for line in open(sorted_gene_file):
                anchor_id = line.split()[0]
                # verify randomly selected
                if anchor_id in coverage:
                    anchors_sorted[-1].append(anchor_id)
    else:
        # tuple anchor_id's with mean coverage
        stat_aid = []
        for anchor_id in coverage:
            if options.control_files:
                # mean log2 ratio vs control
                astat = stats.mean([math.log(coverage[anchor_id][i], 2) - math.log(coverage_control[anchor_id][i], 2) for i in range(len(coverage[anchor_id]))])
            else:
                astat = stats.geo_mean([coverage[anchor_id][i] for i in range(len(coverage[anchor_id]))])
            stat_aid.append((astat, anchor_id))

        # sort
        stat_aid.sort(reverse=True)

        # store as the only sorted list
        anchors_sorted.append([anchor_id for (astat, anchor_id) in stat_aid])

    ############################################
    # plot heatmap(s)
    ############################################
    if options.plot_heat:
        # if multiple sorts, create a dir for the plots
        if len(anchors_sorted) > 1:
            if not os.path.isdir('%s_heat' % options.output_pre):
                os.mkdir('%s_heat' % options.output_pre)

        for s in range(len(anchors_sorted)):
            df = {'Index': [], 'Anchor': [], 'Coverage': []}
            for si in range(len(anchors_sorted[s])):
                anchor_id = anchors_sorted[s][si]
                for i in range(len(coverage[anchor_id])):
                    # 'mid' mode indexes relative to the feature midpoint;
                    # 'span' mode uses the bin index directly
                    if mode == 'mid':
                        df['Index'].append(i - options.window / 2)
                    else:
                        df['Index'].append(i)
                    df['Anchor'].append(anchor_id)
                    if options.log:
                        cov = math.log(coverage[anchor_id][i], 2)
                    else:
                        cov = coverage[anchor_id][i]
                    if options.control_files:
                        # express relative to control (log difference / ratio)
                        if options.log:
                            cov -= math.log(coverage_control[anchor_id][i], 2)
                        else:
                            cov = cov / coverage_control[anchor_id][i]
                    df['Coverage'].append('%.4e' % cov)

            if len(anchors_sorted) == 1:
                out_pdf = '%s_heat.pdf' % options.output_pre
            else:
                # name each heatmap after its sorted gene list file
                sorted_gene_file = options.sorted_gene_files.split(',')[s]
                sorted_gene_pre = os.path.splitext(os.path.split(sorted_gene_file)[-1])[0]
                out_pdf = '%s_heat/%s.pdf' % (options.output_pre, sorted_gene_pre)

            r_script = '%s/plot_gff_cov_heat.r' % os.environ['RDIR']
            ggplot.plot(r_script, df, [out_pdf, options.control_files != None])

    ############################################
    # plot meta-coverage
    ############################################
    df = {'Index': [], 'Coverage': []}
    if options.control_files:
        df['Type'] = []

    # number of positions depends on the mode
    if mode == 'mid':
        index_length = 2 * (options.window / 2) + 1
    elif mode == 'span':
        index_length = options.bins
    else:
        print >> sys.stderr, 'Unknown mode %s' % mode
        exit(1)

    for i in range(index_length):
        if mode == 'mid':
            df['Index'].append(i - options.window / 2)
        else:
            df['Index'].append(i)
        if options.log:
            df['Coverage'].append(stats.geo_mean([coverage[anchor_id][i] for anchor_id in coverage]))
        else:
            df['Coverage'].append(stats.mean([coverage[anchor_id][i] for anchor_id in coverage]))
        if options.control_files:
            # interleave a Control row for the same index
            df['Type'].append('Primary')
            if mode == 'mid':
                df['Index'].append(i - options.window / 2)
            else:
                df['Index'].append(i)
            df['Type'].append('Control')
            if options.log:
                df['Coverage'].append(stats.geo_mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))
            else:
                df['Coverage'].append(stats.mean([coverage_control[anchor_id][i] for anchor_id in coverage_control]))

    r_script = '%s/plot_gff_cov_meta.r' % os.environ['RDIR']
    out_df = '%s_meta.df' % options.output_pre
    ggplot.plot(r_script, df, [options.output_pre, options.smooth_span, plot_labels[0], plot_labels[1]], df_file=out_df)
def main(): usage = "usage: %prog [options] <diff1_file> <diff2_file>" parser = OptionParser(usage) parser.add_option("-o", dest="out_dir", default=".") (options, args) = parser.parse_args() if len(args) != 2: parser.error("Must provide two diff files") else: diff1_file = args[0] diff2_file = args[1] diff1_stats, diff1_bound = hash_diff(diff1_file) diff2_stats, diff2_bound = hash_diff(diff2_file) if not os.path.isdir(options.out_dir): os.mkdir(options.out_dir) for diff_key in diff1_stats: sample1, sample2 = diff_key gene_stats1 = diff1_stats[diff_key] gene_bound1 = diff1_bound[diff_key] gene_stats2 = diff2_stats[diff_key] gene_bound2 = diff2_bound[diff_key] report_out = open("%s/%s-%s_report.txt" % (options.out_dir, sample1, sample2), "w") # compare numbers of genes quantified common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys()) print >> report_out, "Genes quantified" print >> report_out, "%s\t%d" % (diff1_file, len(gene_stats1)) print >> report_out, "%s\t%d" % (diff2_file, len(gene_stats2)) print >> report_out, "Common\t%d" % len(common_genes) print >> report_out, "" up1 = set([gene_id for gene_id in gene_bound1 if gene_bound1[gene_id]]) up2 = set([gene_id for gene_id in gene_bound2 if gene_bound2[gene_id]]) print >> report_out, "Genes upregulated" print >> report_out, "%s\t%d" % (diff1_file, len(up1)) print >> report_out, "%s\t%d" % (diff2_file, len(up2)) print >> report_out, "Common\t%d" % len(up1 & up2) print >> report_out, "" down1 = set([gene_id for gene_id in gene_bound1 if not gene_bound1[gene_id]]) down2 = set([gene_id for gene_id in gene_bound2 if not gene_bound2[gene_id]]) print >> report_out, "Genes downregulated" print >> report_out, "%s\t%d" % (diff1_file, len(down1)) print >> report_out, "%s\t%d" % (diff2_file, len(down2)) print >> report_out, "Common\t%d" % len(down1 & down2) print >> report_out, "" # scatter plot test stat df = {"diff1": [], "diff2": []} for gene_id in common_genes: df["diff1"].append(gene_stats1[gene_id]) 
df["diff2"].append(gene_stats2[gene_id]) r_script = "%s/diff_diff_scatter.r" % os.environ["RDIR"] out_pdf = "%s/%s-%s_scatter.pdf" % (options.out_dir, sample1, sample2) ggplot.plot(r_script, df, [out_pdf]) # compute correlation cor, p = spearmanr(df["diff1"], df["diff2"]) print >> report_out, "Spearman correlation: %f" % cor print >> report_out, "" report_out.close() # plot test_stat versus test_stat difference df = {"minus": [], "avg": []} for gene_id in common_genes: df["minus"].append(gene_stats1[gene_id] - gene_stats2[gene_id]) df["avg"].append(0.5 * gene_stats1[gene_id] + 0.5 * gene_stats2[gene_id]) r_script = "%s/diff_diff_ma.r" % os.environ["RDIR"] out_pdf = "%s/%s-%s_ma.pdf" % (options.out_dir, sample1, sample2) ggplot.plot(r_script, df, [out_pdf])
def main():
    """Plot per-sample FPKM bar charts (with confidence bounds) for the
    requested genes from a Cufflinks fpkm_tracking file, one PDF per gene.
    """
    usage = 'usage: %prog [options] <fpkm_tracking> <gene1> <gene2> ...>'
    parser = OptionParser(usage)
    # BUG FIX: '-l' is a boolean flag but lacked action='store_true', so it
    # swallowed the next command-line argument as its value.
    parser.add_option('-l', dest='log', default=False, action='store_true', help='log2 FPKM')
    parser.add_option('-n', dest='names', default=None, help='Sample names, comma-separated')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='cuff_bars', help='Output directory [Default: %default]')
    parser.add_option('-s', dest='samples', default=None, help='Samples to plot, comma-separated')
    parser.add_option('-y', dest='yaxis_match', default=False, action='store_true', help='Match the y-axis of all plots [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) < 2:
        parser.error('Must provide fpkm_tracking and genes.')
    else:
        fpkm_tracking = args[0]
        genes = args[1:]

    # gene -> sample -> (fpkm, conf_lo, conf_hi)
    gene_sample_fpkm = read_fpkms(fpkm_tracking, genes, options.log, options.pseudocount)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # choose samples to plot and their display names
    if options.samples:
        samples = options.samples.split(',')
    else:
        samples = sorted(gene_sample_fpkm[genes[0]].keys())

    if options.names:
        names = options.names.split(',')
    else:
        names = samples

    # in log space the floor is log2 of the pseudocount, otherwise 0
    ymin = 0
    if options.log:
        ymin = np.log2(options.pseudocount)

    # shared y-axis max across all genes, from the upper confidence bounds
    if options.yaxis_match:
        ymax = max([gene_sample_fpkm[gene_name][sample][2] for sample in samples for gene_name in genes])
    else:
        ymax = None

    for gene_name in genes:
        df = {}
        df['Sample'] = names
        df['FPKM'] = [gene_sample_fpkm[gene_name][sample][0] for sample in samples]
        df['conf_lo'] = [gene_sample_fpkm[gene_name][sample][1] for sample in samples]
        df['conf_hi'] = [gene_sample_fpkm[gene_name][sample][2] for sample in samples]

        out_pdf = '%s/%s.pdf' % (options.out_dir, gene_name)
        ggplot.plot('%s/cuff_bar.r' % os.environ['RDIR'], df, [ymin, ymax, out_pdf])
def main():
    """Build a score + per-position-nucleotide data frame from an MSA and
    BED scores, then hand it to an R script for mutual-information plotting.

    Consensus columns come either from Dfam's 'x'-marked Consensus row (-d)
    or from define_consensus() using a minimum occupancy fraction (-c).
    """
    usage='usage:%prog [options] <bed_file> <msa_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='consensus_pct', default=0.5, type='float', help='Required proportion of columns with a valid nt to consider it a consensus column [Default: %default]')
    parser.add_option('-d', dest='dfam_consensus', action='store_true', help='Pass the option if you want to use Consensus as defined by Dfam')
    #parser.add_option('-j', dest='condense_pct', type='float', help='Required proportion of entries to be same between 2 columns for them to be merged')
    #parser.add_option('-n', dest='discretize_bins', type='int', help='The number of bins you want to discretize the scores into')
    parser.add_option('-o', dest='output_pre', type='string', help='Prefix of the output files')
    (options, args) = parser.parse_args()

    if len(args)!=2:
        parser.error('Must provide both the BED file and MSA file. Check %s' %usage)
    else:
        bed_file = args[0]
        msa_fasta_file = args[1]

    ##################################################
    # hash scores
    ##################################################
    # key sequences by their genomic locus 'chrom:start-end' (BED cols 0-2);
    # score is BED column 4
    seq_scores = {}
    for line in open(bed_file):
        a = line.split('\t')
        header = a[0] + ':' + a[1] + '-' + a[2]
        score = float(a[4])
        seq_scores[header] = score

    ##################################################
    # define consensus
    # define columns to condense for regression
    ##################################################
    # first pass: load the full MSA keyed by '>'-prefixed header lines
    # (assumes the file starts with a header line)
    msa_sequences = {}
    for line in open(msa_fasta_file):
        if line[0] == '>':
            header = line.strip()
            msa_sequences[header] = ''
        else:
            msa_sequences[header] += line.strip()

    if options.dfam_consensus is True:
        # Dfam marks consensus positions with 'x' in the Consensus row
        consensus_sequence = msa_sequences.pop('>Consensus')
        sequence_length = len(consensus_sequence)  # NOTE(review): unused below
        consensus_columns = []
        for i in range(0,len(consensus_sequence)):
            if consensus_sequence[i] == 'x':
                consensus_columns.append(i)
    else:
        # occupancy-based consensus; msa_sequences is unused in this branch
        consensus_columns = define_consensus(msa_fasta_file, options.consensus_pct)
        #sample_sequence = msa_sequences.pop('>Consensus')
        #sequence_length = len(sample_sequence)
        #hamming_cutoff = int(sequence_length - options.condense_pct*sequence_length)
        #condensed_columns, columns_ls_remove = column_condense(msa_sequences, consensus_columns, hamming_cutoff)

    ##################################################
    # map sequences to feature vectors
    ##################################################
    # initialize the dictionary with score and position/nt features
    # (integer keys 1..N are the consensus positions)
    df_mi = {'Score':[]}
    for i in range(len(consensus_columns)):
        position = i+1
        df_mi[position] = []

    # second pass: stream sequences; each completed sequence contributes its
    # score and the nucleotide at every consensus column
    header = ''
    for line in open(msa_fasta_file):
        if line[0] == '>':
            if header and header != 'Consensus':
                # process seq
                df_mi['Score'].append(seq_scores[header])
                for i in range(len(consensus_columns)):
                    position = i+1
                    seq_i = consensus_columns[i]
                    nt = seq[seq_i].upper()
                    df_mi[position].append(nt)
            header = line[1:].rstrip()
            seq = ''
        else:
            seq += line.rstrip()

    if header and header != 'Consensus':
        # process last seq (loop above only flushes on seeing the next header)
        df_mi['Score'].append(seq_scores[header])
        for i in range(len(consensus_columns)):
            position = i+1
            seq_i = consensus_columns[i]
            nt = seq[seq_i].upper()
            df_mi[position].append(nt)

    ggplot.plot('%s/te_mut_info.r' % tempura.r_dir, df_mi, [options.output_pre])
def main():
    """Summarize BAM read placement across genome annotation classes.

    Counts reads intersecting each annotation BED, writes a counts table, and
    plots a pie chart plus read:length enrichment ratios via R scripts.
    """
    usage = 'usage: %prog [options] <hg19|mm9> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='annotations', default='rrna,smallrna,cds,utrs_3p,utrs_5p,pseudogene,lncrna,introns,intergenic', help='Comma-separated list of annotation classes to include [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option('-p', dest='paired_stranded', action='store_true', default=False, help='Paired end stranded reads, so split intersects by XS tag and strand [Default: %default]')
    parser.add_option('-t', dest='title', default='Title', help='Plot title [Default: %default]')
    parser.add_option('-u', dest='unstranded', action='store_true', default=False, help='Unstranded reads, so count intergenic and renormalize to lessen the impact of double counting [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        bam_files = args[1].split(',')
    else:
        parser.error(usage)

    # locate annotation BEDs and assembly files for the chosen genome
    if genome == 'hg19':
        annotation_dir = '%s/pie_unstranded' % os.environ['GENCODE']
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
    elif genome == 'mm9':
        annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
    else:
        parser.error('Genome must specify hg19 or mm9.')

    if options.paired_stranded:
        # split bam files by strand
        for bam_file in bam_files:
            split_bam_xs(bam_file)

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    genome_length = count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error('Cannot find annotation BED %s' % annotation_bed)

    if 'intergenic' in annotation_classes:
        # intergenic length is whatever the other classes don't cover
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths['intergenic'] = genome_length - other_annotations_summed

    ############################################
    # annotation read counts
    ############################################
    genome_reads = 0
    for bam_file in bam_files:
        genome_reads += count_bam(bam_file)

    annotation_reads = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            annotation_reads[ann] = 0
            for bam_file in bam_files:
                annotation_reads[ann] += count_intersection(bam_file, annotation_bed, options.unstranded, options.paired_stranded)

    if 'intergenic' in annotation_classes:
        # intergenic reads are the leftover after the other classes
        other_annotations_summed = sum(annotation_reads.values())
        annotation_reads['intergenic'] = genome_reads - other_annotations_summed

    if options.unstranded:
        # NOTE(review): intergenic_reads_sub / intergenic_reads are computed here
        # but never used to adjust annotation_reads — the renormalization promised
        # by the -u help text looks unfinished; confirm intent.
        intergenic_reads_sub = annotation_reads['intergenic']
        intergenic_reads = 0
        for bam_file in bam_files:
            intergenic_reads += count_sans_intersection(bam_file, '%s/../gencode.v18.annotation.prerna.gtf' % annotation_dir)

    if options.paired_stranded:
        # clean up the temporary per-strand BAMs created by split_bam_xs
        for bam_file in bam_files:
            os.remove(bam_file[:-4] + '_p.bam')
            os.remove(bam_file[:-4] + '_m.bam')

    ############################################
    # table
    ############################################
    annotation_labels = {'rrna':'rRNA', 'smallrna':'smallRNA', 'cds':'CDS', 'utrs_3p':'3\'UTR', 'utrs_5p':'5\'UTR', 'pseudogene':'Pseudogene', 'lncrna':'lncRNA', 'introns':'Introns', 'intergenic':'Intergenic', 'mrna':'mRNA'}

    reads_sum = float(sum(annotation_reads.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    counts_out = open('%s_counts.txt' % options.output_prefix, 'w')
    for ann in annotation_classes:
        read_pct = annotation_reads[ann]/reads_sum
        length_pct = annotation_lengths[ann]/lengths_sum
        # log2 enrichment of read share over annotation-length share;
        # zero-read classes fall back to a pseudocounted ratio
        if read_pct > 0:
            annotation_ratio[ann] = math.log(read_pct/length_pct,2)
        else:
            annotation_ratio[ann] = math.log((1+annotation_reads[ann])/(1+reads_sum),2)
        cols = (annotation_labels[ann], annotation_reads[ann], read_pct, length_pct, annotation_ratio[ann])
        print >> counts_out, '%10s %8d %6.4f %6.4f %5.2f' % cols
    counts_out.close()

    ############################################
    # pie chart
    ############################################
    pie_df = {'dummy':[], 'annotation':[], 'count':[]}
    for ann in annotation_classes:
        pie_df['dummy'].append('.')
        pie_df['annotation'].append(annotation_labels[ann])
        pie_df['count'].append(annotation_reads[ann])

    ggplot.plot('%s/annotation_pie_pie.r'%os.environ['RDIR'], pie_df, [options.title, '%s_pie.pdf'%options.output_prefix])

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {'annotation':[], 'ratio':[]}
    for ann in annotation_classes:
        ratio_df['annotation'].append(annotation_labels[ann])
        ratio_df['ratio'].append(annotation_ratio[ann])

    ggplot.plot('%s/annotation_pie_ratios.r'%os.environ['RDIR'], ratio_df, [options.title, '%s_ratios.pdf'%options.output_prefix])
def main():
    """Summarize GFF feature placement across genome annotation classes.

    Counts GFF features intersecting each annotation BED, writes a counts
    table, and plots a pie chart plus feature:length enrichment ratios.
    """
    usage = "usage: %prog [options] <hg19|mm9> <gff>"
    parser = OptionParser(usage)
    parser.add_option(
        "-a",
        dest="annotations",
        default="cds,utrs_3p,utrs_5p,lncrna,introns",
        help="Comma-separated list of annotation classes to include [Default: %default]",
    )
    parser.add_option("-o", dest="output_prefix", default="annotation", help="Output file prefix [Default: %default]")
    parser.add_option("-t", dest="title", default="Title", help="Plot title [Default: %default]")
    (options, args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        gff_file = args[1]
    else:
        parser.error(usage)

    # locate annotation BEDs and assembly files for the chosen genome
    if genome == "hg19":
        annotation_dir = "%s/research/common/data/genomes/hg19/annotation/gencode_v15/pie" % os.environ["HOME"]
        assembly_dir = "%s/research/common/data/genomes/hg19/assembly" % os.environ["HOME"]
    elif genome == "mm9":
        annotation_dir = "/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie"
        assembly_dir = "%s/research/common/data/genomes/mm9/assembly" % os.environ["HOME"]
    else:
        parser.error("Genome must specify hg19 or mm9.")

    annotation_classes = set(options.annotations.split(","))

    ############################################
    # annotation lengths
    ############################################
    genome_length = annotation_pie.count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != "intergenic":
            annotation_bed = "%s/%s.bed" % (annotation_dir, ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_pie.annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error("Cannot find annotation BED %s" % annotation_bed)

    if "intergenic" in annotation_classes:
        # intergenic length is whatever the other classes don't cover
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths["intergenic"] = genome_length - other_annotations_summed

    ############################################
    # annotation feature counts
    ############################################
    # total features = number of lines in the GFF
    genome_features = int(subprocess.check_output("wc -l %s" % gff_file, shell=True).split()[0])

    annotation_features = {}
    for ann in annotation_classes:
        if ann != "intergenic":
            annotation_bed = "%s/%s.bed" % (annotation_dir, ann)
            annotation_features[ann] = count_intersection(gff_file, annotation_bed)

    if "intergenic" in annotation_classes:
        other_annotations_summed = sum(annotation_features.values())
        # BUGFIX: was 'genome_reads', which is undefined in this function
        # (NameError at runtime); the feature total is genome_features.
        annotation_features["intergenic"] = genome_features - other_annotations_summed

    ############################################
    # table
    ###########################################
    annotation_labels = {
        "rrna": "rRNA",
        "smallrna": "smallRNA",
        "cds": "CDS",
        "utrs_3p": "3'UTR",
        "utrs_5p": "5'UTR",
        "pseudogene": "Pseudogene",
        "lncrna": "lncRNA",
        "introns": "Introns",
        "intergenic": "Intergenic",
    }

    features_sum = float(sum(annotation_features.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    counts_out = open("%s_counts.txt" % options.output_prefix, "w")
    for ann in annotation_classes:
        feature_pct = annotation_features[ann] / features_sum
        length_pct = annotation_lengths[ann] / lengths_sum
        # log2 enrichment of feature share over annotation-length share;
        # zero-feature classes fall back to a pseudocounted ratio
        if feature_pct > 0:
            annotation_ratio[ann] = math.log(feature_pct / length_pct, 2)
        else:
            annotation_ratio[ann] = math.log((1 + annotation_features[ann]) / (1 + features_sum), 2)
        cols = (annotation_labels[ann], annotation_features[ann], feature_pct, length_pct, annotation_ratio[ann])
        print >> counts_out, "%10s %8d %6.4f %6.4f %5.2f" % cols
    counts_out.close()

    ############################################
    # pie chart
    ############################################
    pie_df = {"dummy": [], "annotation": [], "count": []}
    for ann in annotation_classes:
        pie_df["dummy"].append(".")
        pie_df["annotation"].append(annotation_labels[ann])
        pie_df["count"].append(annotation_features[ann])

    ggplot.plot(
        "%s/annotation_pie_pie.r" % os.environ["RDIR"], pie_df, [options.title, "%s_pie.pdf" % options.output_prefix]
    )

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {"annotation": [], "ratio": []}
    for ann in annotation_classes:
        ratio_df["annotation"].append(annotation_labels[ann])
        ratio_df["ratio"].append(annotation_ratio[ann])

    ggplot.plot(
        "%s/annotation_pie_ratios.r" % os.environ["RDIR"],
        ratio_df,
        [options.title, "%s_ratios.pdf" % options.output_prefix],
    )
def main():
    """Compute pairwise Spearman correlations between cuffdiff replicate FPKM profiles and plot a heatmap."""
    usage = 'usage: %prog [options] <.read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='genes_gtf', help='Print only genes in the given GTF file')
    #parser.add_option('-p', dest='pseudocount', type='float', default=0.125, help='FPKM pseudocount for taking logs [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cor_heat.pdf', help='Output heatmap pdf [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    else:
        read_group_tracking = args[0]

    # get gene_ids
    gene_set = set()
    if options.genes_gtf:
        for line in open(options.genes_gtf):
            a = line.split('\t')
            gid = gff.gtf_kv(a[8])['gene_id']
            gene_set.add(gid)

    # initialize diff data structures
    # (condition, replicate) -> gene_id -> FPKM
    cond_rep_gene_fpkm = {}

    # read read group tracking file
    rgt_in = open(read_group_tracking)
    headers = rgt_in.readline()
    line = rgt_in.readline()
    while line:
        a = line.split('\t')

        gene_id = a[0]
        cond = a[1]
        rep = int(a[2])
        fpkm = float(a[6])
        status = a[8].rstrip()

        # keep only cleanly quantified genes, optionally restricted to the GTF set
        if status == 'OK' and (len(gene_set) == 0 or gene_id in gene_set):
            if not (cond, rep) in cond_rep_gene_fpkm:
                cond_rep_gene_fpkm[(cond, rep)] = {}
            cond_rep_gene_fpkm[(cond, rep)][gene_id] = fpkm

        line = rgt_in.readline()
    rgt_in.close()

    df_dict = {'Sample1': [], 'Sample2': [], 'Correlation': []}

    # Spearman correlation for each replicate pair over their shared genes
    cond_reps = cond_rep_gene_fpkm.keys()
    for i in range(len(cond_reps)):
        cond1, rep1 = cond_reps[i]
        for j in range(i + 1, len(cond_reps)):
            cond2, rep2 = cond_reps[j]

            genes12 = set(cond_rep_gene_fpkm[(cond1, rep1)].keys()) & set(
                cond_rep_gene_fpkm[(cond2, rep2)].keys())

            fpkms1 = array([
                cond_rep_gene_fpkm[(cond1, rep1)][gene_id] for gene_id in genes12
            ])
            fpkms2 = array([
                cond_rep_gene_fpkm[(cond2, rep2)][gene_id] for gene_id in genes12
            ])

            rho, pval = spearmanr(fpkms1, fpkms2)

            cols = (cond1, rep1, cond2, rep2, rho)
            print '%-15s %1d %-15s %1d %.4f' % cols

            df_dict['Sample1'].append('%s_%d' % (cond1, rep1))
            df_dict['Sample2'].append('%s_%d' % (cond2, rep2))
            df_dict['Correlation'].append(rho)

    # this is broken
    # NOTE(review): author-flagged failure in the R heatmap step (debug=True left
    # on). Investigate cuff_rep_cor.r before relying on the PDF output.
    ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'], df_dict, [options.out_pdf], debug=True)
def main():
    """Summarize BAM read placement across annotation classes (variant with stranded hg19 BEDs).

    Like the unstranded annotation-pie script, but selects pie_stranded BEDs for
    hg19 when -p is set and refuses -p for mm9 (no stranded BEDs exist).
    """
    usage = 'usage: %prog [options] <hg19|mm9> <bam1,bam2,...>'
    parser = OptionParser(usage)
    parser.add_option(
        '-a',
        dest='annotations',
        default=
        'rrna,smallrna,cds,utrs_3p,utrs_5p,pseudogene,lncrna,introns,intergenic',
        help=
        'Comma-separated list of annotation classes to include [Default: %default]'
    )
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option(
        '-p',
        dest='paired_stranded',
        action='store_true',
        default=False,
        help=
        'Paired end stranded reads, so split intersects by XS tag and strand [Default: %default]'
    )
    parser.add_option('-t', dest='title', default='', help='Plot title [Default: %default]')
    parser.add_option(
        '-u',
        dest='unstranded',
        action='store_true',
        default=False,
        help=
        'Unstranded reads, so count intergenic and renormalize to lessen the impact of double counting [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        bam_files = args[1].split(',')
    else:
        parser.error(usage)

    # locate annotation BEDs and assembly files; hg19 has stranded BEDs, mm9 does not
    if genome == 'hg19':
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
        if options.paired_stranded:
            annotation_dir = '%s/pie_stranded' % os.environ['GENCODE']
        else:
            annotation_dir = '%s/pie_unstranded' % os.environ['GENCODE']
    elif genome == 'mm9':
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
        if options.paired_stranded:
            print >> sys.stderr, 'Stranded annotation BEDs dont exist for mm9'
            exit(1)
        else:
            annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'
    else:
        parser.error('Genome must specify hg19 or mm9.')

    if options.paired_stranded:
        # split bam files by strand
        for bam_file in bam_files:
            split_bam_xs(bam_file)

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    genome_length = count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir, ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error('Cannot find annotation BED %s' % annotation_bed)

    if 'intergenic' in annotation_classes:
        # intergenic length is whatever the other classes don't cover
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths['intergenic'] = genome_length - other_annotations_summed

    ############################################
    # annotation read counts
    ############################################
    genome_reads = 0
    for bam_file in bam_files:
        genome_reads += count_bam(bam_file)

    annotation_reads = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir, ann)
            annotation_reads[ann] = 0
            for bam_file in bam_files:
                annotation_reads[ann] += count_intersection(bam_file, annotation_bed, options.unstranded, options.paired_stranded)

    if 'intergenic' in annotation_classes:
        # intergenic reads are the leftover after the other classes
        other_annotations_summed = sum(annotation_reads.values())
        annotation_reads['intergenic'] = genome_reads - other_annotations_summed

    if options.unstranded:
        # NOTE(review): intergenic_reads_sub / intergenic_reads are computed here
        # but never used to adjust annotation_reads — the renormalization promised
        # by the -u help text looks unfinished; confirm intent.
        intergenic_reads_sub = annotation_reads['intergenic']
        intergenic_reads = 0
        for bam_file in bam_files:
            intergenic_reads += count_sans_intersection(bam_file, '%s/../gencode.v18.annotation.prerna.gtf' % annotation_dir)

    if options.paired_stranded:
        # clean up the temporary per-strand BAMs created by split_bam_xs
        for bam_file in bam_files:
            os.remove(bam_file[:-4] + '_p.bam')
            os.remove(bam_file[:-4] + '_m.bam')

    ############################################
    # table
    ############################################
    annotation_labels = {
        'rrna': 'rRNA',
        'smallrna': 'smallRNA',
        'cds': 'CDS',
        'utrs_3p': '3\'UTR',
        'utrs_5p': '5\'UTR',
        'pseudogene': 'Pseudogene',
        'lncrna': 'lncRNA',
        'exons': 'Exons',
        'introns': 'Introns',
        'intergenic': 'Intergenic',
        'mrna': 'mRNA'
    }

    reads_sum = float(sum(annotation_reads.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    counts_out = open('%s_counts.txt' % options.output_prefix, 'w')
    for ann in annotation_classes:
        read_pct = annotation_reads[ann] / reads_sum
        length_pct = annotation_lengths[ann] / lengths_sum
        # log2 enrichment of read share over annotation-length share;
        # zero-read classes fall back to a pseudocounted ratio
        if read_pct > 0:
            annotation_ratio[ann] = math.log(read_pct / length_pct, 2)
        else:
            annotation_ratio[ann] = math.log(
                (1 + annotation_reads[ann]) / (1 + reads_sum), 2)
        cols = (annotation_labels[ann], annotation_reads[ann], read_pct, length_pct, annotation_ratio[ann])
        print >> counts_out, '%10s %8d %6.4f %6.4f %5.2f' % cols
    counts_out.close()

    ############################################
    # pie chart
    ############################################
    pie_df = {'dummy': [], 'annotation': [], 'count': []}
    for ann in annotation_classes:
        pie_df['dummy'].append('.')
        pie_df['annotation'].append(annotation_labels[ann])
        pie_df['count'].append(annotation_reads[ann])

    out_pdf = '%s_pie.pdf' % options.output_prefix
    # NOTE(review): df_file=out_pdf[:-1] yields '..._pie.pd'; sibling scripts use
    # "'%s.df' % out_pdf[:-4]" — confirm whether this filename is intended.
    ggplot.plot('%s/annotation_pie_pie.r' % os.environ['RDIR'], pie_df,
                [options.title, out_pdf], df_file=out_pdf[:-1])

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {'annotation': [], 'ratio': []}
    for ann in annotation_classes:
        ratio_df['annotation'].append(annotation_labels[ann])
        ratio_df['ratio'].append(annotation_ratio[ann])

    ggplot.plot(
        '%s/annotation_pie_ratios.r' % os.environ['RDIR'], ratio_df,
        [options.title, '%s_ratios.pdf' % options.output_prefix])
def main():
    """Plot a log2 FPKM heatmap of cufflinks genes, optionally filtered and down-sampled."""
    usage = 'usage: %prog [options] <fpkm_tracking>'
    parser = OptionParser(usage)
    # NOTE(review): help string uses '%d' where '%default' was probably intended.
    parser.add_option('-a', dest='max_fpkm', type='float', help='Maxium log2 FPKM to plot [Default: %d]')
    parser.add_option('-d', dest='diff_file', help='Limit to significantly differentially expressed genes')
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-m', dest='min_fpkm', default=0, type='float', help='Minimum FPKM [Default: %default]')
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='Pseudocount for log FPKM [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_heat.pdf', help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='sample', default=1000, type='int', help='Sample genes rather than use all [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true', help='Uppercase sample labels [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    else:
        fpkm_tracking = args[0]

    # load expression data
    cuff = cufflinks.fpkm_tracking(fpkm_file=fpkm_tracking)

    # determine genes
    all_genes = set(cuff.genes)
    if options.gtf:
        all_genes = set()
        for line in open(options.gtf):
            a = line.split('\t')
            all_genes.add(gff.gtf_kv(a[8])['gene_id'])

    if options.diff_file:
        # limit to differentially expressed genes
        diff_genes = find_diff(options.diff_file)
        all_genes &= diff_genes
    else:
        # at least limit to clean genes (no NaN FPKM in any experiment)
        clean_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id)
            clean = True
            for i in range(len(ge)):
                if math.isnan(ge[i]):
                    clean = False
                    break
            if clean:
                clean_genes.add(gene_id)
        all_genes &= clean_genes

    if options.min_fpkm > 0:
        # require expression above the minimum in at least one experiment
        expressed_genes = set()
        for gene_id in all_genes:
            ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
            if max(ge) > options.min_fpkm:
                expressed_genes.add(gene_id)
        all_genes &= expressed_genes

    # sample genes to display
    if len(all_genes) <= options.sample:
        display_genes = all_genes
    else:
        display_genes = random.sample(all_genes, options.sample)

    # build data frame
    df = {'Gene':[], 'FPKM':[], 'Sample':[]}
    for gene_id in display_genes:
        ge = cuff.gene_expr(gene_id, not_found=0, fail=0)
        for i in range(len(cuff.experiments)):
            df['Gene'].append(gene_id)
            df['Sample'].append(cuff.experiments[i])
            if options.uppercase:
                df['Sample'][-1] = df['Sample'][-1].upper()
            logfpkm = np.log2(ge[i]+options.pseudocount)
            if options.max_fpkm:
                # clip at the ceiling for display
                logfpkm = min(options.max_fpkm, logfpkm)
            df['FPKM'].append(logfpkm)

    # plot
    out_df = '%s.df' % options.out_pdf[:-4]
    ggplot.plot('%s/cuff_heat.r' % os.environ['RDIR'], df, [options.out_pdf], df_file=out_df)
def main():
    """Compute pairwise Spearman correlations between cuffdiff replicate FPKM profiles and plot a heatmap.

    NOTE(review): near-duplicate of the other cuff_rep_cor main() in this file;
    consider consolidating.
    """
    usage = 'usage: %prog [options] <.read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='genes_gtf', help='Print only genes in the given GTF file')
    #parser.add_option('-p', dest='pseudocount', type='float', default=0.125, help='FPKM pseudocount for taking logs [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cor_heat.pdf', help='Output heatmap pdf [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error(usage)
    else:
        read_group_tracking = args[0]

    # get gene_ids
    gene_set = set()
    if options.genes_gtf:
        for line in open(options.genes_gtf):
            a = line.split('\t')
            gid = gff.gtf_kv(a[8])['gene_id']
            gene_set.add(gid)

    # initialize diff data structures
    # (condition, replicate) -> gene_id -> FPKM
    cond_rep_gene_fpkm = {}

    # read read group tracking file
    rgt_in = open(read_group_tracking)
    headers = rgt_in.readline()
    line = rgt_in.readline()
    while line:
        a = line.split('\t')

        gene_id = a[0]
        cond = a[1]
        rep = int(a[2])
        fpkm = float(a[6])
        status = a[8].rstrip()

        # keep only cleanly quantified genes, optionally restricted to the GTF set
        if status == 'OK' and (len(gene_set) == 0 or gene_id in gene_set):
            if not (cond,rep) in cond_rep_gene_fpkm:
                cond_rep_gene_fpkm[(cond,rep)] = {}
            cond_rep_gene_fpkm[(cond,rep)][gene_id] = fpkm

        line = rgt_in.readline()
    rgt_in.close()

    df_dict = {'Sample1':[], 'Sample2':[], 'Correlation':[]}

    # Spearman correlation for each replicate pair over their shared genes
    cond_reps = cond_rep_gene_fpkm.keys()
    for i in range(len(cond_reps)):
        cond1, rep1 = cond_reps[i]
        for j in range(i+1,len(cond_reps)):
            cond2, rep2 = cond_reps[j]

            genes12 = set(cond_rep_gene_fpkm[(cond1,rep1)].keys()) & set(cond_rep_gene_fpkm[(cond2,rep2)].keys())

            fpkms1 = array([cond_rep_gene_fpkm[(cond1,rep1)][gene_id] for gene_id in genes12])
            fpkms2 = array([cond_rep_gene_fpkm[(cond2,rep2)][gene_id] for gene_id in genes12])

            rho, pval = spearmanr(fpkms1, fpkms2)

            cols = (cond1,rep1,cond2,rep2,rho)
            print '%-15s %1d %-15s %1d %.4f' % cols

            df_dict['Sample1'].append('%s_%d' % (cond1,rep1))
            df_dict['Sample2'].append('%s_%d' % (cond2,rep2))
            df_dict['Correlation'].append(rho)

    # this is broken
    # NOTE(review): author-flagged failure in the R heatmap step (debug=True left
    # on). Investigate cuff_rep_cor.r before relying on the PDF output.
    ggplot.plot('%s/cuff_rep_cor.r' % os.environ['RDIR'], df_dict, [options.out_pdf], debug=True)
def main():
    """Project cuffdiff replicate FPKM profiles to 2D (PCA/MDS/Isomap/ICA) and plot.

    Builds a (sample x gene) log2 FPKM matrix from a read_group_tracking file,
    reduces it to two components with the chosen method, and renders cuff_2d.r.
    """
    usage = 'usage: %prog [options] <read_group_tracking>'
    parser = OptionParser(usage)
    parser.add_option('-g', dest='gtf', help='GTF file of genes to display')
    parser.add_option('-f', dest='min_fpkm', default=0, type='float', help='Minimum FPKM to consider [Default: %default]')
    parser.add_option('-m', dest='method', default='PCA', help='Dimension reduction method [Default: %default]')
    # BUGFIX: added type='float'; without it a command-line -p value arrived as a
    # string, so np.log2(X + options.pseudocount) raised downstream.
    parser.add_option('-p', dest='pseudocount', default=.125, type='float', help='FPKM pseudocount (for logs) [Default: %default]')
    parser.add_option('-o', dest='out_pdf', default='cuff_2d.pdf', help='Output PDF [Default: %default]')
    parser.add_option('-s', dest='square', default=False, action='store_true', help='Square plot [Default: %default]')
    parser.add_option('-u', dest='uppercase', default=False, action='store_true', help='Uppercase sample labels [Default: %default]')
    parser.add_option('-w', dest='whiten', default=False, action='store_true', help='Whiten expression data [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide fpkm_tracking')
    else:
        read_group_tracking = args[0]

    # load expression data: gene_id -> (condition, replicate) -> FPKM
    gene_fpkm = {}
    rgt_in = open(read_group_tracking)
    rgt_in.readline()
    for line in rgt_in:
        a = line.split()
        gene_fpkm.setdefault(a[0],{})[(a[1],a[2])] = float(a[6])
    rgt_in.close()

    # determine genes
    compute_genes = gene_fpkm.keys()
    if options.gtf:
        compute_genes = set()
        for line in open(options.gtf):
            a = line.split('\t')
            compute_genes.add(gff.gtf_kv(a[8])['gene_id'])
        # ROBUSTNESS: drop GTF genes absent from the tracking file, which would
        # otherwise KeyError in the filter and matrix construction below.
        compute_genes &= set(gene_fpkm.keys())
    compute_genes = list(compute_genes)

    # filter for fpkm
    if options.min_fpkm > 0:
        prefilter_genes = copy.copy(compute_genes)
        compute_genes = []
        for gene_id in prefilter_genes:
            ge = gene_fpkm[gene_id].values()
            if max(ge) > options.min_fpkm:
                compute_genes.append(gene_id)

    # construct gene expression matrix: rows are (condition, replicate) samples,
    # columns are genes, values are log2(FPKM + pseudocount)
    samples = gene_fpkm[compute_genes[0]].keys()
    X = np.array([[gene_fpkm[gene_id][sam_rep] for gene_id in compute_genes] for sam_rep in samples])
    X = np.log2(X + options.pseudocount)

    if options.whiten:
        # standardize each gene column to zero mean, unit variance
        X = preprocessing.scale(X)

    ##################################################
    # dimensionality reduction
    ##################################################
    if options.method.lower() == 'mds':
        model = MDS(n_components=2)
    elif options.method.lower() in ['iso','isomap']:
        model = Isomap(n_components=2)
    elif options.method.lower() == 'ica':
        model = FastICA(n_components=2, max_iter=500)
    else:
        model = PCA(n_components=2)

    X_dr = model.fit_transform(X)

    ##################################################
    # plot
    ##################################################
    df = {}
    df['D1'] = X_dr[:,0]
    df['D2'] = X_dr[:,1]
    df['Label'] = ['%s_rep%s' % sam_rep for sam_rep in samples]
    if options.uppercase:
        df['Label'] = [label.upper() for label in df['Label']]
        df['Sample'] = [sam.upper() for (sam,rep) in samples]
    else:
        df['Sample'] = [sam for (sam,rep) in samples]

    ggplot.plot('%s/cuff_2d.r' % os.environ['RDIR'], df, [options.out_pdf, options.square])
def main():
    """Compare two cuffdiff runs: quantified/up/down gene overlap plus stat scatter and MA plots."""
    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat')
    parser.add_option('-g', dest='genes_gtf', default=None)
    parser.add_option('-m', dest='min_fpkm', default=0, type='float')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide two diff files')
    else:
        diff1_file = args[0]
        diff2_file = args[1]

    # optionally restrict both runs to genes from a GTF
    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    # hash each run: (sample1, sample2) -> gene_id -> statistic / significance
    diff1_stats = cuffdiff.hash_stat(diff1_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)

    diff2_stats = cuffdiff.hash_stat(diff2_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # one report + plot set per sample-pair comparison present in run 1
    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_sig1 = diff1_sig[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_sig2 = diff2_sig[diff_key]

        report_out = open(
            '%s/%s-%s_report.txt' % (options.out_dir, sample1, sample2), 'w')

        # compare numbers of genes quantified
        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())
        print >> report_out, 'Genes quantified'
        print >> report_out, '%s\t%d' % (diff1_file, len(gene_stats1))
        print >> report_out, '%s\t%d' % (diff2_file, len(gene_stats2))
        print >> report_out, 'Common\t%d' % len(common_genes)
        print >> report_out, ''

        # upregulated agreement
        # (assumes hash_sig maps gene -> truthy for up, falsy for down — confirm)
        up1 = set([gene_id for gene_id in gene_sig1 if gene_sig1[gene_id]])
        up2 = set([gene_id for gene_id in gene_sig2 if gene_sig2[gene_id]])
        print >> report_out, 'Genes upregulated'
        print >> report_out, '%s\t%d' % (diff1_file, len(up1))
        print >> report_out, '%s\t%d' % (diff2_file, len(up2))
        print >> report_out, 'Common\t%d' % len(up1 & up2)
        print >> report_out, ''

        # downregulated agreement
        down1 = set(
            [gene_id for gene_id in gene_sig1 if not gene_sig1[gene_id]])
        down2 = set(
            [gene_id for gene_id in gene_sig2 if not gene_sig2[gene_id]])
        print >> report_out, 'Genes downregulated'
        print >> report_out, '%s\t%d' % (diff1_file, len(down1))
        print >> report_out, '%s\t%d' % (diff2_file, len(down2))
        print >> report_out, 'Common\t%d' % len(down1 & down2)
        print >> report_out, ''

        # scatter plot test stat
        df = {'diff1': [], 'diff2': []}
        for gene_id in common_genes:
            df['diff1'].append(gene_stats1[gene_id])
            df['diff2'].append(gene_stats2[gene_id])

        r_script = '%s/diff_diff_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_scatter.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf], df_file='%s.df' % out_pdf[:-4])

        # compute correlation
        cor, p = spearmanr(df['diff1'], df['diff2'])
        print >> report_out, 'Spearman correlation: %f (%f)' % (cor, p)
        cor, p = pearsonr(df['diff1'], df['diff2'])
        print >> report_out, 'Pearson correlation: %f (%f)' % (cor, p)

        report_out.close()

        # plot test_stat versus test_stat difference (MA-style)
        df = {'minus': [], 'avg': []}
        for gene_id in common_genes:
            df['minus'].append(gene_stats1[gene_id] - gene_stats2[gene_id])
            df['avg'].append(0.5 * gene_stats1[gene_id] + 0.5 * gene_stats2[gene_id])

        r_script = '%s/diff_diff_ma.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_ma.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf])
def main():
    """Summarize GFF feature placement across genome annotation classes.

    Counts GFF features intersecting each annotation BED, writes a counts
    table, and plots a pie chart plus feature:length enrichment ratios.
    """
    usage = 'usage: %prog [options] <hg19|mm9> <gff>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='annotations', default='cds,utrs_3p,utrs_5p,lncrna,introns', help='Comma-separated list of annotation classes to include [Default: %default]')
    parser.add_option('-o', dest='output_prefix', default='annotation', help='Output file prefix [Default: %default]')
    parser.add_option('-t', dest='title', default='Title', help='Plot title [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) == 2:
        genome = args[0]
        gff_file = args[1]
    else:
        parser.error(usage)

    # locate annotation BEDs and assembly files for the chosen genome
    if genome == 'hg19':
        annotation_dir = '%s/research/common/data/genomes/hg19/annotation/gencode_v15/pie' % os.environ['HOME']
        assembly_dir = '%s/research/common/data/genomes/hg19/assembly' % os.environ['HOME']
    elif genome == 'mm9':
        annotation_dir = '/n/rinn_data1/indexes/mouse/mm9/annotations/dk_pie'
        assembly_dir = '%s/research/common/data/genomes/mm9/assembly' % os.environ['HOME']
    else:
        parser.error('Genome must specify hg19 or mm9.')

    annotation_classes = set(options.annotations.split(','))

    ############################################
    # annotation lengths
    ############################################
    genome_length = annotation_pie.count_genome(assembly_dir)

    annotation_lengths = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            if os.path.isfile(annotation_bed):
                annotation_lengths[ann] = annotation_pie.annotation_length(annotation_bed, assembly_dir)
            else:
                parser.error('Cannot find annotation BED %s' % annotation_bed)

    if 'intergenic' in annotation_classes:
        # intergenic length is whatever the other classes don't cover
        other_annotations_summed = sum(annotation_lengths.values())
        annotation_lengths['intergenic'] = genome_length - other_annotations_summed

    ############################################
    # annotation feature counts
    ############################################
    # total features = number of lines in the GFF
    genome_features = int(subprocess.check_output('wc -l %s' % gff_file, shell=True).split()[0])

    annotation_features = {}
    for ann in annotation_classes:
        if ann != 'intergenic':
            annotation_bed = '%s/%s.bed' % (annotation_dir,ann)
            annotation_features[ann] = count_intersection(gff_file, annotation_bed)

    if 'intergenic' in annotation_classes:
        other_annotations_summed = sum(annotation_features.values())
        # BUGFIX: was 'genome_reads', which is undefined in this function
        # (NameError at runtime); the feature total is genome_features.
        annotation_features['intergenic'] = genome_features - other_annotations_summed

    ############################################
    # table
    ###########################################
    annotation_labels = {'rrna':'rRNA', 'smallrna':'smallRNA', 'cds':'CDS', 'utrs_3p':'3\'UTR', 'utrs_5p':'5\'UTR', 'pseudogene':'Pseudogene', 'lncrna':'lncRNA', 'introns':'Introns', 'intergenic':'Intergenic'}

    features_sum = float(sum(annotation_features.values()))
    lengths_sum = float(sum(annotation_lengths.values()))

    annotation_ratio = {}

    counts_out = open('%s_counts.txt' % options.output_prefix, 'w')
    for ann in annotation_classes:
        feature_pct = annotation_features[ann]/features_sum
        length_pct = annotation_lengths[ann]/lengths_sum
        # log2 enrichment of feature share over annotation-length share;
        # zero-feature classes fall back to a pseudocounted ratio
        if feature_pct > 0:
            annotation_ratio[ann] = math.log(feature_pct/length_pct,2)
        else:
            annotation_ratio[ann] = math.log((1+annotation_features[ann])/(1+features_sum),2)
        cols = (annotation_labels[ann], annotation_features[ann], feature_pct, length_pct, annotation_ratio[ann])
        print >> counts_out, '%10s %8d %6.4f %6.4f %5.2f' % cols
    counts_out.close()

    ############################################
    # pie chart
    ############################################
    pie_df = {'dummy':[], 'annotation':[], 'count':[]}
    for ann in annotation_classes:
        pie_df['dummy'].append('.')
        pie_df['annotation'].append(annotation_labels[ann])
        pie_df['count'].append(annotation_features[ann])

    ggplot.plot('%s/annotation_pie_pie.r'%os.environ['RDIR'], pie_df, [options.title, '%s_pie.pdf'%options.output_prefix])

    ############################################
    # read:length ratio
    ############################################
    ratio_df = {'annotation':[], 'ratio':[]}
    for ann in annotation_classes:
        ratio_df['annotation'].append(annotation_labels[ann])
        ratio_df['ratio'].append(annotation_ratio[ann])

    ggplot.plot('%s/annotation_pie_ratios.r'%os.environ['RDIR'], ratio_df, [options.title, '%s_ratios.pdf'%options.output_prefix])
def main():
    """Compare CLIP peak-bound genes against RIP-seq differential expression.

    Reads a peaks GFF and a cuffdiff .diff file, splits genes into
    CLIP-peak-bound vs unbound, runs a Mann-Whitney U test on their
    cuffdiff test statistics, plots the two distributions via an R
    script, and draws a CLIP/RIP venn diagram.

    Side effects: shells out to intersectBed, reads $RDIR and $GENCODE,
    writes '<output_pre>' plot output and '<output_pre>_venn.pdf'.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='control_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    # find expressed genes in peak calls
    # (genes below the expression threshold per the control FPKM file;
    #  semantics of find_silent are defined elsewhere in this file)
    silent_genes = set()
    if options.control_fpkm_file:
        silent_genes = find_silent(options.control_fpkm_file)

    # find peak bound genes: any reference gene overlapped (stranded, -s)
    # by at least one peak (-u reports each GTF line once)
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf,peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        # column 9 of the GTF line holds the key-value attributes
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()  # wait for intersectBed to finish

    # process RIP
    bound_tstats = []
    unbound_tstats = []
    rip_genes = set()

    diff_in = open(diff_file)
    line = diff_in.readline()  # skip the cuffdiff header row
    for line in diff_in:
        # fixed cuffdiff .diff column layout
        a = line.split('\t')
        gene_id = a[0]
        sample1 = a[4]
        sample2 = a[5]
        status = a[6]
        fpkm1 = float(a[7])
        fpkm2 = float(a[8])
        tstat = float(a[10])
        sig = a[13].rstrip()

        # flip sign so the statistic is oriented consistently when the
        # comparison ran with 'input' as sample_2
        if sample2 == 'input':
            tstat *= -1

        # keep only successful tests, optionally restricted to the
        # requested sample pair (None matches anything)
        if status == 'OK' and not math.isnan(tstat):
            if options.sample1 in [None,sample1] and options.sample2 in [None,sample2]:
                # save RIP bound
                if sig == 'yes':
                    rip_genes.add(gene_id)

                # save test_stat; silent genes are excluded from the
                # unbound background only
                if gene_id in peak_genes:
                    bound_tstats.append(tstat)
                else:
                    if not gene_id in silent_genes:
                        unbound_tstats.append(tstat)

    print '%d silent genes' % len(silent_genes)
    print '%d bound genes' % len(bound_tstats)
    print '%d unbound genes' % len(unbound_tstats)

    # perform statistical test
    # NOTE: rebinds p (previously the Popen handle) to the p-value
    z, p = stats.mannwhitneyu(bound_tstats, unbound_tstats)
    print z, p

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Peak':(['Yes']*len(bound_tstats) + ['No']*len(unbound_tstats)), 'Test_stat':bound_tstats+unbound_tstats}

    r_script = '%s/peaks_diff_compare.r' % os.environ['RDIR']

    ggplot.plot(r_script, df_dict, [options.output_pre])

    ##################################################
    # plot venn diagram
    ##################################################
    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    plt.figure()
    venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'RIP'])
    plt.savefig('%s_venn.pdf' % options.output_pre)
def main():
    """Compare CLIP peak-bound genes to fRIP enrichment (v2, fold/stat based).

    Like the simpler peaks-vs-diff comparison, but: fetches RIP values via
    ripseq helpers (test statistic or fold change), writes summary stats and
    hypergeometric overlap p-values to text files, and optionally restricts
    the reference annotation to single-gene loci.

    Side effects: shells out to intersectBed, reads $RDIR and $GENCODE,
    writes '<output_pre>_stats.txt', '<output_pre>_hyper.txt',
    '<output_pre>_venn.pdf', and the ggplot output; may create and remove a
    temporary filtered GTF.
    """
    usage = 'usage: %prog [options] <peaks gff> <diff>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='clip_fpkm_file', help='Control FPKM tracking file')
    parser.add_option('-g', dest='ref_gtf', default='%s/gencode.v18.annotation.gtf'%os.environ['GENCODE'])
    parser.add_option('--ggplot', dest='ggplot_script', default='%s/peaks_diff_compare.r'%os.environ['RDIR'], help='Script to make plots with [Default: %default]')
    parser.add_option('-m', dest='max_stat', default=10, type='float', help='Max cuffdiff stat [Default: %default]')
    parser.add_option('-o', dest='output_pre', default='', help='Output prefix [Default: %default]')
    parser.add_option('-r', dest='rbp', default='RBP', help='RBP name [Default: %default]')
    parser.add_option('-s', dest='single_gene_loci', default=False, action='store_true', help='Only use single gene loci [Default: %default]')
    parser.add_option('-t', dest='test_stat', default=False, action='store_true', help='Use test statistic rather than fold change [Default: %default]')
    parser.add_option('--sample1', dest='sample1', help='Sample_1 name in cuffdiff')
    parser.add_option('--sample2', dest='sample2', help='Sample_2 name in cuffdiff')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide peaks GFF and .diff file')
    else:
        peaks_gff = args[0]
        diff_file = args[1]

    ##################################################
    # process GTF
    ##################################################
    # optionally replace the reference GTF with a temp copy limited to
    # single-gene loci; fd/path kept for cleanup at the end
    if options.single_gene_loci:
        single_gtf_fd, single_gtf_file = filter_single(options.ref_gtf)
        options.ref_gtf = single_gtf_file

    gtf_genes = gff.gtf_gene_set(options.ref_gtf)

    ##################################################
    # collect CLIP peak bound genes
    ##################################################
    # stranded (-s) overlap; -u reports each GTF line at most once
    peak_genes = set()
    p = subprocess.Popen('intersectBed -s -u -a %s -b %s' % (options.ref_gtf, peaks_gff), shell=True, stdout=subprocess.PIPE)
    for line in p.stdout:
        # column 9 of the GTF line holds the key-value attributes
        peak_genes.add(gff.gtf_kv(line.split('\t')[8])['gene_id'])
    p.communicate()  # wait for intersectBed to finish

    # find expressed genes in peak calls
    silent_genes = set()
    if options.clip_fpkm_file:
        silent_genes = find_silent(options.clip_fpkm_file)

    ##################################################
    # collect RIP stats
    ##################################################
    # hash_rip returns per-gene value and per-gene bound flag; exact
    # semantics are defined in the ripseq module
    if options.test_stat:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, just_ok = True, use_fold=False, max_stat=options.max_stat, one_rbp=True)
    else:
        rip_fold, rip_bound = ripseq.hash_rip(diff_file, use_fold=True, max_stat=options.max_stat, one_rbp=True)
        # NOTE(review): rip_fold is immediately recomputed here with
        # hash_rip_fold (min_fpkm/max_fold capping), discarding the
        # hash_rip folds — presumably intentional; confirm against ripseq
        rip_fold = ripseq.hash_rip_fold(diff_file, min_fpkm=0.125, max_fold=10, one_rbp=True)

    ##################################################
    # plot bound and unbound distributions
    ##################################################
    # construct data frame
    df_dict = {'Gene':[], 'CLIP':[], 'RIP':[]}
    for gene_id in rip_fold:
        # keep genes in the reference set; drop silent genes only when a
        # silent set was actually loaded
        if gene_id in gtf_genes and (len(silent_genes) == 0 or gene_id not in silent_genes):
            df_dict['Gene'].append(gene_id)
            df_dict['RIP'].append(rip_fold[gene_id])
            if gene_id in peak_genes:
                df_dict['CLIP'].append('Bound')
            else:
                df_dict['CLIP'].append('Unbound')

    ggplot.plot(options.ggplot_script, df_dict, [options.output_pre, options.rbp, options.test_stat])

    ##################################################
    # compute stats on bound and unbound distributions
    ##################################################
    bound_fold = [df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Bound']
    unbound_fold = [df_dict['RIP'][i] for i in range(len(df_dict['RIP'])) if df_dict['CLIP'][i] == 'Unbound']

    # perform statistical test
    # NOTE: rebinds p (previously the Popen handle) to the p-value
    z, p = stats.mannwhitneyu(bound_fold, unbound_fold)

    stats_out = open('%s_stats.txt' % options.output_pre, 'w')
    cols = (options.rbp, len(bound_fold), stats.mean(bound_fold), len(unbound_fold), stats.mean(unbound_fold), z, p)
    print >> stats_out, '%-10s %5d %6.2f %5d %6.2f %6.2f %9.2e' % cols
    stats_out.close()

    ##################################################
    # plot venn diagram
    ##################################################
    # genes the RIP analysis flagged as bound, restricted to those that
    # made it into the data frame
    rip_genes = set([df_dict['Gene'][i] for i in range(len(df_dict['Gene'])) if rip_bound.get(df_dict['Gene'][i],False)])

    clip_only = len(peak_genes - rip_genes)
    rip_only = len(rip_genes - peak_genes)
    both = len(peak_genes & rip_genes)

    if options.clip_fpkm_file:
        # the hypergeometric universe below is all gtf_genes; silent-gene
        # filtering is not reflected in it
        print >> sys.stderr, 'Ignoring silent genes for hypergeometric test'

    # mapping from textbook hypergeometric notation to scipy's:
    # k is x
    # K is n
    # N is M
    # n is N
    # hypergeom.sf(x, M, n, N, loc=0)
    # both-1 because sf is P(X > x); computed both ways for symmetry
    p1 = hypergeom.sf(both-1, len(gtf_genes), len(peak_genes), len(rip_genes))
    p2 = hypergeom.sf(both-1, len(gtf_genes), len(rip_genes), len(peak_genes))

    hyper_out = open('%s_hyper.txt' % options.output_pre, 'w')
    cols = (p1, p2, both, clip_only, rip_only, len(peak_genes), len(rip_genes), len(gtf_genes))
    print >> hyper_out, '%7.2e %7.2e %5d %5d %5d %5d %5d %5d' % cols
    hyper_out.close()

    # venn2 needs non-degenerate sets on both sides
    if clip_only > 0 and rip_only > 0:
        plt.figure()
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#377eb8'])
        # venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#1ae47d'])
        venn_diag = venn2(subsets=(clip_only, rip_only, both), set_labels=['CLIP', 'fRIP'], set_colors=['#e41a1c', '#A1A838'])
        plt.savefig('%s_venn.pdf' % options.output_pre)

    ##################################################
    # clean
    ##################################################
    # remove the temp single-gene GTF created above
    if options.single_gene_loci:
        os.close(single_gtf_fd)
        os.remove(single_gtf_file)
def main():
    """Compare two cuffdiff .diff runs sample-pair by sample-pair.

    For every (sample1, sample2) comparison present in the first diff file,
    writes a text report of quantified/up/down gene counts and overlaps,
    a scatter plot of the chosen statistic with Spearman/Pearson
    correlations, and an MA-style plot of the statistic difference vs mean.

    Side effects: creates out_dir if needed; writes
    '<out_dir>/<s1>-<s2>_report.txt', '_scatter.pdf' (+ .df), '_ma.pdf';
    reads $RDIR for the R scripts.
    """
    usage = 'usage: %prog [options] <diff1_file> <diff2_file>'
    parser = OptionParser(usage)
    parser.add_option('-s', dest='stat', default='test_stat')
    parser.add_option('-g', dest='genes_gtf', default=None)
    parser.add_option('-m', dest='min_fpkm', default=0, type='float')
    parser.add_option('-o', dest='out_dir', default='.')
    parser.add_option('-p', dest='pseudocount', default=0.125, type='float')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide two diff files')
    else:
        diff1_file = args[0]
        diff2_file = args[1]

    # optional restriction to genes present in a GTF
    gtf_genes = None
    if options.genes_gtf:
        gtf_genes = gff.gtf_gene_set(options.genes_gtf)

    # per-comparison hashes: {(sample1,sample2): {gene_id: value}};
    # exact structure defined by the cuffdiff module helpers
    diff1_stats = cuffdiff.hash_stat(diff1_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff1_sig = cuffdiff.hash_sig(diff1_file, gene_set=gtf_genes)
    diff2_stats = cuffdiff.hash_stat(diff2_file, stat=options.stat, min_fpkm=options.min_fpkm, pseudocount=options.pseudocount, gene_set=gtf_genes)
    diff2_sig = cuffdiff.hash_sig(diff2_file, gene_set=gtf_genes)

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # NOTE(review): assumes every comparison key in diff1 also exists in
    # diff2; a missing key would raise KeyError below — confirm inputs
    for diff_key in diff1_stats:
        sample1, sample2 = diff_key

        gene_stats1 = diff1_stats[diff_key]
        gene_sig1 = diff1_sig[diff_key]
        gene_stats2 = diff2_stats[diff_key]
        gene_sig2 = diff2_sig[diff_key]

        report_out = open('%s/%s-%s_report.txt' % (options.out_dir,sample1,sample2), 'w')

        # compare numbers of genes quantified
        common_genes = set(gene_stats1.keys()) & set(gene_stats2.keys())
        print >> report_out, 'Genes quantified'
        print >> report_out, '%s\t%d' % (diff1_file,len(gene_stats1))
        print >> report_out, '%s\t%d' % (diff2_file,len(gene_stats2))
        print >> report_out, 'Common\t%d' % len(common_genes)
        print >> report_out, ''

        # sig hashes map gene_id -> truthy for up, falsy for down
        up1 = set([gene_id for gene_id in gene_sig1 if gene_sig1[gene_id]])
        up2 = set([gene_id for gene_id in gene_sig2 if gene_sig2[gene_id]])
        print >> report_out, 'Genes upregulated'
        print >> report_out, '%s\t%d' % (diff1_file,len(up1))
        print >> report_out, '%s\t%d' % (diff2_file,len(up2))
        print >> report_out, 'Common\t%d' % len(up1 & up2)
        print >> report_out, ''

        down1 = set([gene_id for gene_id in gene_sig1 if not gene_sig1[gene_id]])
        down2 = set([gene_id for gene_id in gene_sig2 if not gene_sig2[gene_id]])
        print >> report_out, 'Genes downregulated'
        print >> report_out, '%s\t%d' % (diff1_file,len(down1))
        print >> report_out, '%s\t%d' % (diff2_file,len(down2))
        print >> report_out, 'Common\t%d' % len(down1 & down2)
        print >> report_out, ''

        # scatter plot test stat
        df = {'diff1':[], 'diff2':[]}
        for gene_id in common_genes:
            df['diff1'].append(gene_stats1[gene_id])
            df['diff2'].append(gene_stats2[gene_id])

        r_script = '%s/diff_diff_scatter.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_scatter.pdf' % (options.out_dir, sample1, sample2)
        # out_pdf[:-4] strips '.pdf' so the data frame lands beside the plot
        ggplot.plot(r_script, df, [out_pdf], df_file='%s.df'%out_pdf[:-4])

        # compute correlation
        cor, p = spearmanr(df['diff1'], df['diff2'])
        print >> report_out, 'Spearman correlation: %f (%f)' % (cor,p)
        cor, p = pearsonr(df['diff1'], df['diff2'])
        print >> report_out, 'Pearson correlation: %f (%f)' % (cor,p)

        report_out.close()

        # plot test_stat versus test_stat difference (MA-style)
        df = {'minus':[], 'avg':[]}
        for gene_id in common_genes:
            df['minus'].append(gene_stats1[gene_id] - gene_stats2[gene_id])
            df['avg'].append(0.5*gene_stats1[gene_id] + 0.5*gene_stats2[gene_id])

        r_script = '%s/diff_diff_ma.r' % os.environ['RDIR']
        out_pdf = '%s/%s-%s_ma.pdf' % (options.out_dir, sample1, sample2)
        ggplot.plot(r_script, df, [out_pdf])