Example #1
0
def _plt_distr(dat,
               col,
               title='',
               splitBy_pfill=True,
               pfill='label',
               independentpdf=False,
               fname='xdistr.pdf'):
    df = dat[dat[pfill] != 'NA']  ## remove invalid pairs
    n = len(df)
    df = {
        col: robjects.FloatVector(list(df[col])),
        pfill: robjects.StrVector(list(df[pfill]))
    }
    df = robjects.DataFrame(df)

    pp = ggplot2.ggplot(df) + \
        ggplot2.ggtitle('%s [Total = %s]' % (title, n))

    ## Plot1: counts
    if splitBy_pfill:
        p1 = pp + ggplot2.aes_string(x=col, fill=pfill)
    else:
        p1 = pp + ggplot2.aes_string(x=col)

    ## Plot2: density
    if splitBy_pfill:
        p2 = pp + ggplot2.aes_string(x=col, fill=pfill, y='..density..')
    else:
        p2 = pp + ggplot2.aes_string(x=col, y='..density..')
    p2 = p2 + ggplot2.geom_density(alpha=.5, origin=-500)

    if col == 'distance':
        p1 = p1 + \
            ggplot2.geom_histogram(binwidth=1000, alpha=.5, position='identity', origin=-500) + \
            ggplot2.xlim(-1000, 51000)

        p2 = p2 + \
            ggplot2.geom_histogram(binwidth=1000, alpha=.33, position='identity', origin=-500) + \
            ggplot2.xlim(-1000, 51000)
    else:
        p1 = p1 + \
            ggplot2.geom_histogram(alpha=.5, position='identity')

        p2 = p2 + \
            ggplot2.geom_histogram(alpha=.33, position='identity')

        if col == 'correlation':
            p1 = p1 + ggplot2.xlim(-1.1, 1.1)
            p2 = p2 + ggplot2.xlim(-1.1, 1.1)

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(file=fname)
        p1.plot()
        p2.plot()
        grdevices.dev_off()
    else:
        p1.plot()
        p2.plot()
    return
Example #2
0
dataf = DataFrame(d)



from rpy2.robjects.lib import ggplot2
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop", 
                                         y="time",
                                         colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop", 
                                          y="time",
                                          colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.labs(title = "Benchmark (running time)")


from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png',
              width = 712, height = 512)
p.plot()
grdevices.dev_off()

#base = importr("base")
stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'), data = dataf, 
                  na_action = stats.na_exclude)
def plot_collectors_curve(args, start_times, read_lengths):
	"""
	Use rpy2 to create a collectors curve of the run
	"""
	r = robjects.r
	r.library("ggplot2")
	grdevices = importr('grDevices')

	# set t_0 as the first measured time for the read.
	t_0 = start_times[0]

	# adjust times to be relative to t_0
	r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001 \
		for t in start_times])
	r_read_lengths = robjects.IntVector(read_lengths)

	# compute the cumulative based on reads or total base pairs
	if args.plot_type == 'reads':
		y_label = "Total reads"
		cumulative = \
			r.cumsum(robjects.IntVector([1] * len(start_times)))
	elif args.plot_type == 'basepairs':
		y_label = "Total base pairs"
		cumulative = r.cumsum(r_read_lengths)

	# make a data frame of the lists
	d = {'start': r_start_times, 
		'lengths': r_read_lengths,
		'cumul': cumulative}
	df = robjects.DataFrame(d)

	if args.savedf:
		robjects.r("write.table")(df, file=args.savedf, sep="\t")

	# title
	total_reads = len(read_lengths)
	total_bp = sum(read_lengths)
	plot_title = "Yield: " \
		+ str(total_reads) + " reads and " \
		+ str(total_bp) + " base pairs."

	# plot
	gp = ggplot2.ggplot(df)
	pp = gp + ggplot2.aes_string(x='start', y='cumul') \
		+ ggplot2.geom_step(size=2) \
		+ ggplot2.scale_x_continuous('Time (hours)') \
		+ ggplot2.scale_y_continuous(y_label) \
		+ ggplot2.ggtitle(plot_title)

        # extrapolation
	if args.extrapolate:
		start = robjects.ListVector({'a': 1, 'b': 1})
                pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                              formula='y~a*I((x*3600)^b)',
                                              se='FALSE', start=start) \
                        + ggplot2.xlim(0, float(args.extrapolate))

	if args.theme_bw:
		pp = pp + ggplot2.theme_bw()	

	if args.saveas is not None:
		plot_file = args.saveas
		if plot_file.endswith(".pdf"):
			grdevices.pdf(plot_file, width = 8.5, height = 8.5)
		elif plot_file.endswith(".png"):
			grdevices.png(plot_file, width = 8.5, height = 8.5, 
				units = "in", res = 300)
		else:
			logger.error("Unrecognized extension for %s!" % (plot_file))
			sys.exit()

		pp.plot()
		grdevices.dev_off()
	else:
		pp.plot()
		# keep the plot open until user hits enter
		print('Type enter to exit.')
		raw_input()
Example #4
0
d['group'] = StrVector(
    [d['code'][x] + ':' + d['sequence'][x] for x in xrange(len(d['n_loop']))])
dataf = DataFrame(d)

from rpy2.robjects.lib import ggplot2
p = ggplot2.ggplot(dataf) + \
    ggplot2.geom_line(ggplot2.aes_string(x="n_loop",
                                         y="time",
                                         colour="code")) + \
    ggplot2.geom_point(ggplot2.aes_string(x="n_loop",
                                          y="time",
                                          colour="code")) + \
    ggplot2.facet_wrap(Formula('~sequence')) + \
    ggplot2.scale_y_continuous('running time') + \
    ggplot2.scale_x_continuous('repeated n times', ) + \
    ggplot2.xlim(0, max(n_loops)) + \
    ggplot2.opts(title = "Benchmark (running time)")

from rpy2.robjects.packages import importr
grdevices = importr('grDevices')
grdevices.png('../../_static/benchmark_sum.png', width=712, height=512)
p.plot()
grdevices.dev_off()

#base = importr("base")
stats = importr('stats')
nlme = importr("nlme")
fit = nlme.lmList(Formula('time ~ n_loop | group'),
                  data=dataf,
                  na_action=stats.na_exclude)
Example #5
0
def _plt_percountr(dat, independentpdf=False, fname='xpercount.pdf'):
    def _filt_dat(dat, item, getlabel=True):
        df = pd.DataFrame(dat[item].value_counts())
        df.columns = ['count']
        if getlabel:
            df['label'] = [
                list(dat[dat[item] == i]['label'])[0] for i in df.index
            ]
        n = len(df)
        mx = max(df['count'])
        return df, n, mx

    dat = dat[dat['label'] != 'NA']

    ## NUMBER OF MIRNA PER TSS
    df, n, mx = _filt_dat(dat, 'tss', False)
    df = {'count': robjects.IntVector(df['count'])}
    df = robjects.DataFrame(df)

    pt = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.aes_string(x='count') + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    pt_den = ggplot2.ggplot(df) + \
        ggplot2.aes_string(x='count', y='..density..') + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('TSS [Total = %s]' % n) + \
        ggplot2.labs(x='Number of miRNA per TSS (max = %s)' % mx)

    ## NUMBER OF TSS PER MIRNA
    df, n, mx = _filt_dat(dat, 'mirna')
    df = {
        'count': robjects.IntVector(df['count']),
        'label': robjects.StrVector(df['label'])
    }
    df = robjects.DataFrame(df)

    _pm = ggplot2.ggplot(df) + \
        ggplot2.geom_histogram(binwidth=1, origin=-.5, alpha=.5, position="identity") + \
        ggplot2.xlim(-.5, mx+1) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    _pm_den = ggplot2.ggplot(df) + \
        ggplot2.geom_density(binwidth=1, alpha=.5, origin=-.5) + \
        ggplot2.geom_histogram(binwidth=1, alpha=.33, position='identity', origin=-.5) + \
        ggplot2.ggtitle('miRNA [Total = %s]' % n)

    ## not split by label
    pm = _pm + ggplot2.aes_string(x='count')
    pm_den = _pm_den + ggplot2.aes_string(x='count', y='..density..')

    ## split by label
    pms = _pm + ggplot2.aes_string(x='count', fill='label')
    pm_dens = _pm_den + ggplot2.aes_string(
        x='count', fill='label', y='..density..')

    ## add xlabelling (need to be added after aes_string)
    _xlab = ggplot2.labs(x='Number of TSS per miRNA (max = %s)' % mx)
    pm += _xlab
    pm_den += _xlab
    pms += _xlab
    pm_dens += _xlab

    if independentpdf:
        grdevices = importr('grDevices')
        grdevices.pdf(fname)
        pt.plot()
        pt_den.plot()
        pm.plot()
        pm_den.plot()
        pms.plot()
        pm_dens.plot()
        grdevices.dev_off()
    else:
        pt.plot()
        pt_den.plot()
        pm.plot()
        pm_den.plot()
        pms.plot()
        pm_dens.plot()
    return
Example #6
0
def plot_collectors_curve(args, start_times, read_lengths):
    """
	Use rpy2 to create a collectors curve of the run
	"""
    r = robjects.r
    r.library("ggplot2")
    grdevices = importr('grDevices')

    # set t_0 as the first measured time for the read.
    t_0 = start_times[0]

    # adjust times to be relative to t_0
    r_start_times = robjects.FloatVector([float(t - t_0) / float(3600) + 0.00000001 \
     for t in start_times])
    r_read_lengths = robjects.IntVector(read_lengths)

    # compute the cumulative based on reads or total base pairs
    if args.plot_type == 'reads':
        y_label = "Total reads"
        cumulative = \
         r.cumsum(robjects.IntVector([1] * len(start_times)))
    elif args.plot_type == 'basepairs':
        y_label = "Total base pairs"
        cumulative = r.cumsum(r_read_lengths)

    step = args.skip
    # make a data frame of the lists
    d = {
        'start':
        robjects.FloatVector(
            [r_start_times[n] for n in xrange(0, len(r_start_times), step)]),
        'lengths':
        robjects.IntVector(
            [r_read_lengths[n] for n in xrange(0, len(r_read_lengths), step)]),
        'cumul':
        robjects.IntVector(
            [cumulative[n] for n in xrange(0, len(cumulative), step)])
    }
    df = robjects.DataFrame(d)

    if args.savedf:
        robjects.r("write.table")(df, file=args.savedf, sep="\t")

    # title
    total_reads = len(read_lengths)
    total_bp = sum(read_lengths)
    plot_title = "Yield: " \
     + str(total_reads) + " reads and " \
     + str(total_bp) + " base pairs."

    # plot
    gp = ggplot2.ggplot(df)
    pp = gp + ggplot2.aes_string(x='start', y='cumul') \
     + ggplot2.geom_step(size=2) \
     + ggplot2.scale_x_continuous('Time (hours)') \
     + ggplot2.scale_y_continuous(y_label) \
     + ggplot2.ggtitle(plot_title)

    # extrapolation
    if args.extrapolate:
        start = robjects.ListVector({'a': 1, 'b': 1})
        pp = pp + ggplot2.stat_smooth(fullrange='TRUE', method='nls',
                                      formula='y~a*I((x*3600)^b)',
                                      se='FALSE', start=start) \
                + ggplot2.xlim(0, float(args.extrapolate))

    if args.theme_bw:
        pp = pp + ggplot2.theme_bw()

    if args.saveas is not None:
        plot_file = args.saveas
        if plot_file.endswith(".pdf"):
            grdevices.pdf(plot_file, width=8.5, height=8.5)
        elif plot_file.endswith(".png"):
            grdevices.png(plot_file,
                          width=8.5,
                          height=8.5,
                          units="in",
                          res=300)
        else:
            logger.error("Unrecognized extension for %s!" % (plot_file))
            sys.exit()

        pp.plot()
        grdevices.dev_off()
    else:
        pp.plot()
        # keep the plot open until user hits enter
        print('Type enter to exit.')
        raw_input()
Example #7
0
hour_speed['mph'] = 3600 * (hour_speed.trip_distance / hour_speed.trip_time_in_secs)
hour_speed
   
# Pickups by hour (aggregated across all days)
max_passenger_count = np.max(np.log1p(pickups_hr['passenger_count']))
min_passenger_count = np.log1p(1)

for i in range(0,24):
        temp = pickups_hr[(pickups_hr.hour==i)].reset_index()
        temp.passenger_count = np.log1p(temp.passenger_count)
        temp_r = pandas2ri.py2ri(temp)
        p = ggplot2.ggplot(temp_r) + \
            ggplot2.aes_string(x='rounded_lon', y='rounded_lat', color='passenger_count') + \
            ggplot2.geom_point(size=0.5) + \
            ggplot2.scale_color_gradient(low='black', high='white', limits=np.array([min_passenger_count, max_passenger_count])) + \
            ggplot2.xlim(-74.2, -73.7) + ggplot2.ylim(40.56, 40.93) + \
            ggplot2.labs(x=' ', y=' ', title='NYC taxi pickups %02i:00' % i) + \
            ggplot2.guides(color=False)
        p.save('./plots/taxi_pickups%02i.png' % i, width=4, height=4.5)
        
# Create animated .gif of pickups by hour
import imageio

file_names = sorted((fn for fn in os.listdir('./plots') if fn.startswith('taxi_pickups')))
file_names = ['plots/' + s for s in file_names]
images = []
for filename in file_names:
    images.append(imageio.imread(filename))
imageio.mimsave('./plots/pickups_movie.gif', images, duration=0.4)

# total pickups by date, color