def plotASProfile(tcc, cName, directory=None, min=0, extra="0"):
    """Plot sense and antisense expression profiles of a region as two
    stacked panels in one PNG.

    :param tcc: coordinate string formatted ``chr:strand:start:end``
    :param cName: collection/track name passed to ``cgPeaks.stretch``
    :param directory: output directory; default writes to the cwd
    :param min: skip (return 0) if either strand's highest profile level
        is below this value.  NOTE: shadows the builtin ``min``; the name
        is kept for interface compatibility with existing callers.
    :param extra: prefix used in the output file name
    :returns: 0 when skipped because of the ``min`` threshold, else None
    """
    if not directory:
        fN = extra + '.' + tcc + '.png'
    else:
        fN = directory + '/' + extra + '.' + tcc + '.png'

    # Sense-strand profile.
    tccStretch = cgPeaks.stretch(tcc, cName)
    highest = tccStretch.getHighestLevel()
    if highest < min:
        return 0

    # BUGFIX: dict.keys() returns a view in Python 3 which has no .sort();
    # sorted() is correct under both Python 2 and 3.
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[X] for X in sortedX]

    # Antisense profile: flip the strand and rebuild the tcc string.
    # 'chrom' avoids shadowing the builtin chr().
    chrom, strand, start, end = tcc.strip().split(':')
    if strand == '1':
        strand = '-1'
    else:
        strand = '1'
    tcc = cg.makeTcc(chrom, strand, start, end)

    tccStretchAS = cgPeaks.stretch(tcc, cName)
    highest = tccStretchAS.getHighestLevel()
    if highest < min:
        return 0  # AS can have minimum I guess...

    sortedXAS = sorted(tccStretchAS.profile.keys())
    sortedYAS = [tccStretchAS.profile[X] for X in sortedXAS]

    # Plot both strands on a split R screen; points joined by lines.
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r('split.screen(c(2,1))')
    r('screen(1)')
    r.plot(sortedX, sortedY, xlab="Coordinates",
           ylab="(Syn) Expression Level")
    r.lines(sortedX, sortedY, type="b")
    r('screen(2)')
    r.plot(sortedXAS, sortedYAS, xlab="Coordinates",
           ylab="(Anti) Expression Level")
    r.lines(sortedXAS, sortedYAS, type="b")
    gDevice.dev_off()
def val(self):
    """ Estimate value functions with b-splines and compare

    Fits spline estimates of the value functions for two policies
    (self.policy_a = no information, self.policy_b = information) over a
    rank grid, writes three comparison PDFs next to self.out_dir, then
    re-evaluates both fits on observed 2013 ranks and prints/returns the
    change in producer surplus.

    Returns the (absolute) change in producer surplus, i.e.
    sum(fit_b) - sum(fit_a) over the 2013 rank data.
    """
    # Dense grid of ranks for smooth curves.  NOTE(review): 1..194 looks
    # like the school-rank range — confirm against lc constants.
    new_data = pd.DataFrame({'OverallRank': np.linspace(1, 194, 1000)})
    fit_a = self.spline_est(self.policy_a['value'], new_data)
    fit_b = self.spline_est(self.policy_b['value'], new_data)

    # Plot 1: both fitted value functions plus raw points.
    r.pdf(os.path.join(os.path.dirname(self.out_dir), 'value.pdf'))
    r.plot(new_data['OverallRank'], fit_a, type='l',
           xlab='Rank_M', ylab='V(Rank)')
    r.lines(new_data['OverallRank'], fit_b, col='red')
    r.points(self.policy_a['value']['OverallRank'],
             self.policy_a['value']['val'], col='black')
    r.points(self.policy_b['value']['OverallRank'],
             self.policy_b['value']['val'], col='red')
    r.legend('topright', np.array(['No Info', 'Info']),
             lty=np.array([1, 1]), col=np.array(['black', 'red']))
    r('dev.off()')

    # Plot 2: absolute difference between the two fits.
    diff = np.array(fit_b) - np.array(fit_a)
    r.pdf(os.path.join(os.path.dirname(self.out_dir), 'value_diff.pdf'))
    r.plot(new_data['OverallRank'], diff, type='l', xlab='Rank',
           ylab='V(Rank|info=1) - V(Rank|info=0)')
    r.abline(h=0, lty=2)  # zero reference line
    r('dev.off()')

    # Plot 3: relative (percent) difference.
    diff = (np.array(fit_b) - np.array(fit_a)) / np.array(fit_a)
    r.pdf(os.path.join(os.path.dirname(self.out_dir),
                       'value_percent_diff.pdf'))
    r.plot(new_data['OverallRank'], diff, type='l', xlab='Rank',
           ylab='(V(Rank|info=1) - V(Rank|info=0)) / V(Rank|info=0)')
    r.abline(h=0, lty=2)
    r('dev.off()')

    # Re-evaluate both fits on the observed 2013 rank distribution to
    # compute the producer-surplus change.
    data_path = dirname(dirname(__file__))
    data_path = join(data_path, 'data', 'lawData.csv')
    data = pd.read_csv(data_path)
    new_data = deepcopy(data.loc[data['year'] == 2013, 'OverallRank'])
    #new_data = np.concatenate((
    #    new_data, np.zeros(lc.N_SCHOOLS - len(new_data))
    #))
    new_data = pd.DataFrame({'OverallRank': np.array(new_data)})
    fit_a = self.spline_est(self.policy_a['value'], new_data)
    fit_b = self.spline_est(self.policy_b['value'], new_data)
    diff = np.sum(np.array(fit_b) - np.array(fit_a))
    pdiff = diff / np.sum(fit_a)
    print(" - Change in Producer Surplus: {0}".format(diff))
    print(" - Percent change in Producer Surplus: {0}".format(pdiff))
    return diff
def plotSmallDeg(tcc, smallCName, degCName, outDir=None,
                 description="None", nameNum="0"):
    """Plot degradome and small-RNA expression profiles of a region as
    two stacked panels in one PNG.

    :param tcc: coordinate string identifying the region
    :param smallCName: collection name for the small-RNA profile
    :param degCName: collection name for the degradome profile
    :param outDir: output directory; default writes to the cwd
    :param description: x-axis label of the small-RNA panel
    :param nameNum: prefix used in the output file name
    """
    if not outDir:
        fN = nameNum + "." + tcc + '.png'
    else:
        fN = outDir + '/' + nameNum + "." + tcc + '.png'

    # Degradome profile.
    # BUGFIX: dict.keys() returns a view in Python 3 which has no .sort();
    # sorted() is correct under both Python 2 and 3.
    tccStretch = cgPeaks.stretch(tcc, degCName)
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[X] for X in sortedX]

    # Small-RNA profile.
    tccStretchSmall = cgPeaks.stretch(tcc, smallCName)
    sortedXAS = sorted(tccStretchSmall.profile.keys())
    sortedYAS = [tccStretchSmall.profile[X] for X in sortedXAS]

    # Plot both profiles on a split R screen; points joined by lines.
    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r('split.screen(c(2,1))')
    r('screen(1)')
    r.plot(sortedX, sortedY, xlab="Coordinates",
           ylab="Degradome Expression")
    r.lines(sortedX, sortedY, type="b")
    r('screen(2)')
    r.plot(sortedXAS, sortedYAS, xlab=description,
           ylab="Small Expression")
    r.lines(sortedXAS, sortedYAS, type="b")
    gDevice.dev_off()
def plotProfile(tcc, cName, directory=None, min=0):
    """Plot the expression profile of a single region to a PNG.

    :param tcc: coordinate string identifying the region
    :param cName: collection/track name passed to ``cgPeaks.stretch``
    :param directory: output directory; default writes to the cwd
    :param min: skip (return 0) if the highest profile level is below
        this value.  NOTE: shadows the builtin ``min``; the name is kept
        for interface compatibility with existing callers.
    :returns: 0 when skipped because of the ``min`` threshold, else None
    """
    if not directory:
        fN = tcc + '.png'
    else:
        fN = directory + '/' + tcc + '.png'

    tccStretch = cgPeaks.stretch(tcc, cName)
    highest = tccStretch.getHighestLevel()
    if highest < min:
        return 0

    # BUGFIX: dict.keys() returns a view in Python 3 which has no .sort();
    # sorted() is correct under both Python 2 and 3.
    sortedX = sorted(tccStretch.profile.keys())
    sortedY = [tccStretch.profile[X] for X in sortedX]

    gDevice = importr('grDevices')
    gDevice.png(file=fN, width=1680, height=1050)
    r.plot(sortedX, sortedY, xlab="Coordinates", ylab="Expression Level")
    r.lines(sortedX, sortedY, type="b")
    gDevice.dev_off()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Builds a reads-per-bin matrix over genes/transcripts (from a BAM,
    wig pair, bed, or a pre-computed matrix), normalizes, sorts and
    compresses it, writes it as TSV, and optionally renders a heatmap
    via rpy2.  Returns 1 on unopenable BAM, 0 if rpy2 is unavailable.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--gtf-file", dest="gtf", type="string",
                      help="GTF containing gene annotations")
    parser.add_option("-s", "--sort", dest="sort", type="choice",
                      default="length", choices=sort_choices,
                      help="Property to sort rows by. Choices are %s"
                      % ", ".join(sort_choices))
    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      default=25,
                      help="Size of window over which to sum reads")
    parser.add_option("-u", "--upstream-window", dest="us_win",
                      type="int", default=500,
                      help="Amount of sequence upstream of alignment point (less introns)")
    parser.add_option("-d", "--downstream-window", dest="ds_win",
                      type="int", default=None,
                      help="Amount of sequence downstream of alignment point (default longest segment)")
    parser.add_option("-a", "--align-at", dest="align_at", type="choice",
                      default="start", choices=align_choices,
                      help="Where to align genes/transcripts at. Choices are %s"
                      % ", ".join(align_choices))
    parser.add_option("-H", "--height", dest="height", type="int",
                      default=None,
                      help="Number of rows in output matrix/heigh of plot in px")
    parser.add_option("-w", "--width", dest="width", type="int",
                      default=None,
                      help="Number of columns in output/width of plot in px"
                      "default based on bin size")
    parser.add_option("-n", "--normalize", dest="normalize",
                      type="choice", default="none", choices=norm_choices,
                      help="Row normalization to apply. Choices are: %s"
                      % ", ".join(norm_choices))
    parser.add_option("-r", "--renormalize", dest="renormalize",
                      type="choice", default="none", choices=norm_choices,
                      help="Row normalization to apply after row/column compression")
    parser.add_option("--no-plot", dest="plot", action="store_false",
                      default=True,
                      help="Do not output plot - compute matrix only")
    parser.add_option("--use-matrix", dest="use_matrix", type="string",
                      default=None,
                      help="Use existing matrix")
    parser.add_option("--annotations", dest="annotations", type="choice",
                      action="append", choices=annotation_choices,
                      help="Add annotations to the output plot")
    parser.add_option("--reverse-strand", dest="rstrand",
                      action="store_true", default=False,
                      help="Find reads on reverse strand")
    parser.add_option("-f", "--feature", dest="feature", type="choice",
                      choices=["gene", "transcript"], default="gene",
                      help="use genes or transcripts")
    parser.add_option("--quantile", dest="quantile", type="float",
                      default=0.99,
                      help="Quantile to use in quantile normalization")
    parser.add_option("-o", "--outfile-prefix", dest="outfile_pattern",
                      type="string", default=None,
                      help="base of names for output files")
    parser.add_option("-c", "--crop", dest="crop", type="string",
                      default=None,
                      help="crop view to a certain range on the xaxis. Specify like"
                      "-500:1000")
    parser.add_option("--format", dest="format", type="string",
                      default="png",
                      help="Output format, use valid R graphics device")
    parser.add_option("--plus-wig", dest="plus_wig", type="string",
                      help="Use this wig for plus strand info rather than bam file")
    parser.add_option("--minus-wig", dest="minus_wig", type="string",
                      help="Use this wig for minus strand info rather than bam file")
    parser.add_option("--bed", dest="bed", type="string",
                      help="Use this bed for signal(must be indexed)")
    parser.add_option("--norm-mat", dest="norm_mat", type="string",
                      help="Use this matrix for normalizing (e.g. RNA data")
    parser.add_option("--sort-order-file", dest="sort_file",
                      type="string", default=None,
                      help="Two column file containing gene names in the first column and a numeric value to sort on in the second")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # Default plot height if plotting and none requested.
    if options.plot and (options.height is None):
        options.height = 100

    if options.gtf:
        f = IOTools.openFile(options.gtf)
        if options.feature == "gene":
            gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(f))
        else:
            gtf_iterator = GTF.transcript_iterator(GTF.iterator(f))

        # Per-feature length tables used for sorting and annotations.
        lengths = dict()
        utr3_lengths = dict()
        utr5_lengths = dict()
        first_exon_lengths = dict()

        for transcript in gtf_iterator:
            # Total exonic length (introns excluded).
            lengths[transcript[0].transcript_id] = sum(
                [e[1] - e[0] for e in GTF.asRanges(transcript, "exon")])
            exons = GTF.asRanges(transcript, "exon")
            utrs = GTF.asRanges(transcript, "UTR")
            # Coding region = exons minus UTRs.
            coding = Intervals.truncate(exons, utrs)
            coding.sort()
            # UTRs upstream/downstream of the coding region in genomic
            # coordinates; swapped below for minus-strand features.
            utr5 = [utr for utr in utrs if utr[1] <= coding[0][0]]
            utr3 = [utr for utr in utrs if utr[0] >= coding[-1][-1]]
            if transcript[0].strand == "-":
                utr3, utr5 = utr5, utr3
            # First exon in transcription order: genomic-first on "+",
            # genomic-last on "-".
            if transcript[0].strand == "+" or len(exons) == 1:
                first_exon_lengths[transcript[0].transcript_id] = \
                    exons[0][1] - exons[0][0]
            else:
                first_exon_lengths[transcript[0].transcript_id] = \
                    exons[-1][1] - exons[-1][0]
            utr3_lengths[transcript[0].transcript_id] = sum(
                [e[1] - e[0] for e in utr3])
            utr5_lengths[transcript[0].transcript_id] = sum(
                [e[1] - e[0] for e in utr5])

        lengths = pandas.Series(lengths)
        utr3_lengths = pandas.Series(utr3_lengths)
        utr5_lengths = pandas.Series(utr5_lengths)
        first_exon_lengths = pandas.Series(first_exon_lengths)
    else:
        # Without a GTF there is nothing to sort or annotate by.
        options.sort = "none"
        options.annotations = None

    # Choose the signal source: wig pair > bed > bam (first positional
    # argument) > None (only valid together with --use-matrix).
    if options.plus_wig:
        getter = iCLIP.make_getter(plus_wig=options.plus_wig,
                                   minus_wig=options.minus_wig)
    elif options.bed:
        getter = iCLIP.make_getter(bedfile=options.bed)
    else:
        try:
            getter = iCLIP.make_getter(bamfile=args[0])
        except IOError:
            E.error("Cannot open bamfile %s" % args[0])
            return(1)
        except IndexError:
            # No positional argument given.
            getter = None

    if options.use_matrix:
        raw_matrix = pandas.read_csv(options.use_matrix,
                                     sep="\t", index_col=0)
        raw_matrix.columns = raw_matrix.columns.astype("int")
    else:
        raw_matrix = get_matrix(getter, lengths, options)

    if options.crop:
        crop_from, crop_to = map(int, options.crop.split(":"))
        raw_matrix = raw_matrix.loc[:, crop_from:crop_to]

    if options.norm_mat:
        # Divide the raw matrix by a normalisation matrix (e.g. RNA-seq)
        # after aligning columns/rows; zeros are replaced by the smallest
        # positive value to avoid division by zero.
        norm_matrix = pandas.read_csv(options.norm_mat,
                                      sep="\t", index_col=0)
        norm_matrix.columns = norm_matrix.columns.astype("int")
        if options.crop:
            norm_matrix = norm_matrix.loc[:, crop_from:crop_to]
        if all(norm_matrix.columns == raw_matrix.columns) and \
           all(raw_matrix.index.isin(norm_matrix.index.values)):
            norm_matrix = norm_matrix.loc[raw_matrix.index]
            norm_matrix = norm_matrix.replace(
                0, norm_matrix[norm_matrix > 0].min().min())
            raw_matrix = raw_matrix/norm_matrix
            norm_matrix = None
        else:
            raise ValueError("Incompatible normalisation matrix")

    normalized_matrix = normalize(raw_matrix, options.normalize,
                                  quantile=options.quantile)

    # Pick the row-sort key.
    if options.sort == "length":
        sorter = lengths
    elif options.sort == "3utr":
        sorter = utr3_lengths
    elif options.sort == "5utr":
        sorter = utr5_lengths
    elif options.sort == "first-exon":
        sorter = first_exon_lengths
    elif options.sort == "manual":
        sorter = pandas.read_csv(options.sort_file, sep="\t",
                                 index_col=0, usecols=[0, 1])
        sorter = sorter[sorter.columns[0]]
    elif options.sort == "none":
        # Preserve input order (reversed index keeps first row on top
        # after the descending sort below).
        sorter = pandas.Series(range(raw_matrix.shape[0]),
                               index=raw_matrix.index[::-1])

    sorter = sorter[sorter.index.isin(normalized_matrix.index)]
    sorter = sorter.sort_values(ascending=False)
    sorted_matrix = normalized_matrix.loc[sorter.index.values]

    # Compress to the requested plot/output dimensions.
    compress_matrix = iCLIP.compress_matrix(sorted_matrix,
                                            ncols=options.width,
                                            nrows=options.height)

    renormalized_matrix = normalize(compress_matrix, options.renormalize,
                                    quantile=options.quantile)

    # If nothing changed a pre-supplied matrix, don't re-emit it.
    if renormalized_matrix is raw_matrix and options.use_matrix is not None:
        E.info("Input and output matrices are identical, no matrix output")
    else:
        if options.outfile_pattern:
            mat_outfile = IOTools.openFile(
                options.outfile_pattern + ".matrix.tsv.gz", "w")
        else:
            mat_outfile = options.stdout
        renormalized_matrix.to_csv(mat_outfile, sep="\t")

    if options.plot:
        # rpy2 is optional: skip plotting (best-effort) if unavailable.
        try:
            from rpy2.robjects import r as R
            from rpy2 import robjects as ro
        except:
            E.info("No rpy2. Not plotting image")
            return(0)

        from rpy2.robjects.numpy2ri import numpy2ri
        ro.conversion.py2ri = numpy2ri
        ro.numpy2ri.activate()

        if options.outfile_pattern:
            plot_outfile = options.outfile_pattern + ".png"
        else:
            plot_outfile = "bam2heatmap_out.png"

        c = R["c"]
        # Open the requested R graphics device; 72px margin for axes.
        R[options.format](plot_outfile,
                          width=renormalized_matrix.shape[1] + 72,
                          height=renormalized_matrix.shape[0] + 72,
                          unit="px",
                          res=72)
        R.par(mai=c(1, 0.5, 0, 0.5))
        cols = R["colorRampPalette"](c("white", "blue"))(50)
        bases = renormalized_matrix.columns.values.astype("int")
        groups = renormalized_matrix.index.values.astype("int")
        # NOTE: as_matrix() is deprecated in newer pandas (use .values).
        mat = renormalized_matrix.as_matrix()
        # Saturate the colour scale at 1.
        mat[mat >= 1] = 1
        R.image(bases, groups, R.t(mat),
                zlim=c(0, 1), raster=True, col=cols,
                xlab="Base", yaxt="n")

        def _sort_and_compress_annotation(anno):
            # Apply the same row order and vertical compression as the
            # heatmap so annotations line up with it.
            sorted_anno = anno.loc[sorter.index]
            comp_anno = iCLIP.compress_matrix(
                sorted_anno, renormalized_matrix.shape[0])
            return comp_anno

        if options.annotations:
            ends = _sort_and_compress_annotation(lengths)
            starts = pandas.Series(0, index=renormalized_matrix.index)
            if options.align_at == "end":
                starts, ends = -1 * ends, starts
            if "start" in options.annotations:
                R.lines(starts.values, starts.index.values,
                        col="black", pch=".")
            if "end" in options.annotations:
                R.lines(ends.values, ends.index.values,
                        pch=".", col="black")
            if "5utr" in options.annotations:
                utr5s = _sort_and_compress_annotation(utr5_lengths)
                utr5s = starts + utr5s
                R.lines(utr5s.values, utr5s.index.values,
                        col="orange", pch=".")
            if "3utr" in options.annotations:
                utr3s = _sort_and_compress_annotation(utr3_lengths)
                utr3s = ends - utr3s
                R.lines(utr3s.values, utr3s.index.values,
                        col="orange", pch=".")
        R["dev.off"]()

    # write footer and output benchmark information.
    E.Stop()