def bar_chart(self, conn, column1, column2, table_chosen, title):
    """Print a bar chart built from two columns of one table.

    Args:
        conn: Connection object forwarded to ``dfile.double_selector``.
        column1: Column name mapped to the x aesthetic.
        column2: Column name mapped to the weight aesthetic.
        table_chosen: Table name forwarded as ``table``.
        title: Title placed on the plot.
    """
    # A bar graph only ever needs two columns, so only two are fetched.
    frame = dfile.double_selector(
        conn=conn,
        table=table_chosen,
        col1=column1,
        col2=column2,
    )
    chart = ggplot(aes(x=column1, weight=column2), data=frame)
    chart = chart + geom_bar()
    chart = chart + labs(title=title)
    print(chart)
def two_var_intr_effects(self, target, vars, nval=100, plot=True):
    """
    Loads first level interactions between ``target`` and each of ``vars``.

    When the R session has no ``null.models`` object yet, ten null models
    are generated first via ``_generate_interaction_null_models``.

    String identifiers are handed to R unchanged; any other identifier is
    incremented by one first (presumably to map 0-based Python positions
    onto R's 1-based positions -- confirm against callers).

    Args:
        target - Variable identifier (column name or number) specifying the
                 target variable
        vars - List of variable identifiers (column names or numbers)
               specifying other selected variables. Must not contain target
        nval - Number of evaluation points used for calculation.
        plot - Determines whether or not to plot results.

    Returns:
        Pandas dataframe of interaction effects with columns
        ``interact_str``, ``exp_null_int`` and ``std_null_int``, indexed by
        the identifiers in the form they were passed to R.
    """
    def _as_r_id(ident):
        # isinstance (not an exact type comparison) so that str subclasses
        # such as numpy.str_ are treated as names instead of hitting `+ 1`.
        return ident if isinstance(ident, str) else ident + 1

    # Check if null.models have already been generated
    null_models_exist = robjects.r(
        ' function(){ if(exists("null.models")){ return(T) } '
        'else { return(F) } } ')
    if not null_models_exist()[0]:
        self.logger.info(
            'Null models not generated, generating null models '
            '(n=10)')
        self._generate_interaction_null_models(10, quiet=False)

    r_twovarint = robjects.r(
        ' function(target, vars, nval){ interactions <- twovarint('
        'tvar=target, vars=vars, null.models, nval=nval, plot=F) } ')

    # Check the input type. If int, add one, if string do nothing.
    r_target = _as_r_id(target)
    r_vars = [_as_r_id(var) for var in vars]

    r_interact = r_twovarint(
        r_target, robjects.Vector(np.array(r_vars)), nval)
    interact = pd.DataFrame(
        {
            'interact_str': list(r_interact[0]),
            'exp_null_int': list(r_interact[1]),
            'std_null_int': list(r_interact[2])
        },
        index=r_vars)

    if plot:
        # Long format: one row per (variable, measure) pair for the bars.
        int_effects = interact.reset_index().rename(
            columns={'index': 'vars'})
        int_effects_m = pd.melt(
            int_effects,
            id_vars='vars',
            value_vars=['interact_str', 'exp_null_int'])
        p = gg.ggplot(gg.aes(x='vars', fill='variable', weight='value'),
                      data=int_effects_m) \
            + gg.geom_bar() \
            + gg.labs(
                title='Two-var interaction effects - {}'.format(r_target))
        print(p)

    return interact
def plot_sfs(self, pat_out):
    """Print the ``self.sfs`` table and save it as a bar chart.

    Entry 0 of ``self.sfs`` is skipped; the remaining entries are plotted
    against their positions 1..len(self.sfs)-1.

    Args:
        pat_out: Destination passed to the plot's ``save`` method.
    """
    df = pd.DataFrame({
        "freq": list(range(1, len(self.sfs))),
        "sfs": np.array(self.sfs[1:]),
    })
    # Function-call form prints the same on Python 2 and is valid Python 3.
    print(df)
    pl = ggplot(df, aes(x="freq", weight="sfs")) + geom_bar()
    pl.save(pat_out)
def plotAverageLatency(self):
    """Show a bar chart of ``averageLatency()`` for every entry of ``self.data``.

    Entries are numbered from 1 in the order they appear in ``self.data``.
    """
    latencies = []
    for entry in self.data:
        latencies.append(entry.averageLatency())

    frame = pandas.DataFrame({
        "device": range(1, len(latencies) + 1),
        "average": latencies,
    })

    chart = ggplot.ggplot(ggplot.aes(x="device", weight="average"), frame)
    chart = chart + ggplot.labs(title="Average Latency Per Device")
    chart = chart + ggplot.ylab("Average Latency (ms)")
    chart = chart + ggplot.xlab("Device Number")
    chart = chart + ggplot.geom_bar(stat="identity")
    chart.show()
def plotAverageLatency(self):
    """Display the average latency of each ``self.data`` entry as bars.

    The x axis carries the 1-based position of the entry in ``self.data``.
    """
    per_device = [item.averageLatency() for item in self.data]
    numbering = range(1, len(per_device) + 1)
    table = pandas.DataFrame({"device": numbering, "average": per_device})

    base = ggplot.ggplot(ggplot.aes(x="device", weight="average"), table)
    titled = base + ggplot.labs(title="Average Latency Per Device")
    labelled = (titled
                + ggplot.ylab("Average Latency (ms)")
                + ggplot.xlab("Device Number"))
    chart = labelled + ggplot.geom_bar(stat="identity")
    chart.show()
def _ggplot(df, out_file):
    """Plot faceted items with ggplot wrapper on top of matplotlib.

    The ``variant.type``, ``category`` and ``caller`` columns of ``df`` are
    overwritten in place with their display labels before plotting; callers
    without a label entry become ``None``.

    XXX Not yet functional
    """
    import ggplot as gg

    # (column, lookup) pairs; lambdas keep the label tables lazily resolved.
    relabel = (
        ("variant.type", lambda key: vtype_labels[key]),
        ("category", lambda key: cat_labels[key]),
        ("caller", lambda key: caller_labels.get(key, None)),
    )
    for column, lookup in relabel:
        df[column] = [lookup(raw) for raw in df[column]]

    plot = gg.ggplot(df, gg.aes(x="caller", y="value.floor"))
    plot = plot + gg.geom_bar()
    plot = plot + gg.facet_wrap("variant.type", "category")
    plot = plot + gg.theme_seaborn()
    gg.ggsave(plot, out_file)
def generate_intr_effects(self, nval=10, n=10, quiet=False, plot=True):
    """
    Loads R variable interaction effect objects

    Null models are always (re)generated first. The result is also stored
    on ``self._interaction_effects``.

    Args:
        nval - Number of evaluation points used for calculation
        n - Number of null models to generate for interaction calibration
        quiet - Determines whether to print intermediate data.
        plot - Determines whether or not to plot results.

    Returns:
        Pandas dataframe of interaction effects with columns
        ``interact_str``, ``exp_null_int`` and ``std_null_int``, indexed by
        the columns of ``self._data['x']``.
    """
    self._generate_interaction_null_models(n, quiet)

    r_interact_fn = robjects.r(
        ' function(ncols, nval){ if(exists("null.models")){ '
        'interactions <- interact(c(1:ncols), null.models, nval=nval, plot=F) '
        '} else { '
        'interactions <- interact(c(1:ncols), nval=nval, plot=F) '
        '} } ')

    x_columns = self._data['x'].columns
    ncols = len(x_columns.values)
    r_interact = r_interact_fn(ncols, nval)
    interact = pd.DataFrame(
        {
            'interact_str': list(r_interact[0]),
            'exp_null_int': list(r_interact[1]),
            'std_null_int': list(r_interact[2])
        },
        index=x_columns)
    self._interaction_effects = interact

    if plot:
        # Long format: one row per (variable, measure) pair for the bars.
        int_effects = interact.reset_index().rename(
            columns={'index': 'vars'})
        int_effects_m = pd.melt(
            int_effects,
            id_vars='vars',
            value_vars=['interact_str', 'exp_null_int'])
        # gg is a module; the title has to go through gg.labs (calling the
        # module itself raises TypeError).
        p = gg.ggplot(gg.aes(x='vars', fill='variable', weight='value'),
                      data=int_effects_m) \
            + gg.geom_bar() \
            + gg.labs(title='Interaction Effects')
        print(p)

    # Previously nothing was returned although the docstring promised the
    # dataframe; callers that ignored the None result are unaffected.
    return interact
def plotAlignmentStat(input, output):
    """Plot alignment summary using ggplot.

    Reads the CSV, keeps the rows at positions 2, 4 and 5, melts the
    per-sample columns into long form and saves a bar chart (one bar per
    sample, filled by category, coordinates flipped) to ``output``.
    Intermediate tables and the split output path are printed for debugging.

    Args:
        input: CSV location handed to ``pandas.read_csv``; ',' is treated
            as the thousands separator. Must contain a ``category`` column.
        output: Destination passed to the plot's ``save`` method.
    """
    from ggplot import aes, coord_flip, geom_bar, ggplot, ggtitle, theme_bw

    df = pd.read_csv(input, thousands=",")
    print(df.dtypes)
    print(df)

    # Get certain rows (selected by position, not by label).
    df = df.iloc[[2, 4, 5]]
    dfm = pd.melt(df,
                  id_vars=['category'],
                  var_name='sampleName',
                  value_name='Value')
    print(dfm)

    p = (ggplot(dfm, aes(x='sampleName', weight='Value', fill='category'))
         + geom_bar()
         + theme_bw()
         + ggtitle("Alignment Summary stats")
         + coord_flip())

    dirname, filename = os.path.split(output)
    print(dirname)
    print(filename)
    p.save(output)
def graph2(score_data):
    """
    Average scores for each question on most recent date;
    Creates and returns graph 2, a bar graph.

    Args:
        score_data: Row-oriented table. ``score_data[0]`` is the header row
            and the remaining rows are the records. The header row is
            cleaned in place (each name is cut at its first '.').

    Returns:
        The ggplot bar chart of mean score per question.
    """
    header = score_data[0]
    stamp_position = find_time_stamp(score_data)

    # Cut every header name at its first '.'. This mutates the caller's
    # header row, exactly as the element-by-element loop did before.
    header[:] = [name.split('.')[0] for name in header]

    # Read the date column name AFTER cleaning so it always matches the
    # DataFrame's column labels.
    date_column = header[stamp_position]
    data = DataFrame(score_data[1:], columns=header)

    # Get all columns that are numerical questions so we know what to graph
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data, id_vars=date_column, value_vars=num_questions,
                       var_name="Question", value_name="Score")

    # Convert date string into actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column],
                                           format="%m/%d/%Y")

    # Keep only the rows recorded on the most recent date. The column is
    # addressed through date_column rather than a hard-coded 'Timestamp'.
    recent_date = new_data[date_column].max()
    new_data = new_data[new_data[date_column] == recent_date]

    # Group all rows with question, and then take the average.
    new_data = new_data.groupby(['Question']).mean().reset_index()

    ret = ggplot.ggplot(ggplot.aes(x="Question", weight="Score"), new_data) +\
        ggplot.geom_bar() +\
        ggplot.ggtitle("Most Recent Average Scores")
    return ret
def plot1(data):
    """Return a bar chart with one bar per team for 'SFN' and 'LAN'.

    The previous body indexed ``data`` with ``'teamID' == 'SFN'``, which is
    the constant ``False`` (a string comparison, not a row mask), and then
    handed the results to ``aes`` where column names are expected.

    NOTE(review): this version keeps the rows whose ``teamID`` is 'SFN' or
    'LAN' and lets ``geom_bar`` count rows per team. If a specific measure
    was meant to be compared, map that column to ``weight`` -- confirm the
    intended plot with the author.

    Args:
        data: DataFrame with a ``teamID`` column.
    """
    rivals = data[data['teamID'].isin(['SFN', 'LAN'])]
    return ggplot(rivals, aes(x='teamID')) + geom_bar()
def plot_weather_data(turnstile_weather):
    '''
    Plot average subway ridership per ISO weekday, split by rain status.

    Works on a filtered copy of ``turnstile_weather``; the argument itself
    is not modified. Rows dated on a listed national holiday are dropped,
    an ISO ``weekday`` column (Monday=1 .. Sunday=7) is derived from
    ``DATEn``, and ``ENTRIESn_hourly`` is scaled so that summing it within
    a (weekday, rain) group yields the average over the distinct dates in
    that group.

    All columns of the full data set can be inspected at:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    ggplot documentation: https://pypi.python.org/pypi/ggplot/

    Args:
        turnstile_weather: DataFrame with at least the columns ``DATEn``
            ('%Y-%m-%d' strings), ``rain`` and ``ENTRIESn_hourly``.

    Returns:
        The ggplot object (it is also printed).
    '''
    # May 30 is Memorial Day, the only national holiday in our data set.
    # Hard-coded because the function receives nothing but the dataframe.
    national_holidays = ['2011-05-30']
    df = turnstile_weather[
        ~turnstile_weather.DATEn.isin(national_holidays)].copy()

    # add a column to represent the ISO day of the week for each data point.
    df[u'weekday'] = df[u'DATEn'].apply(
        lambda day: datetime.datetime.strptime(day, '%Y-%m-%d').isoweekday())

    # Scale each row by 1 / (number of distinct dates sharing its weekday
    # and rain status). E.g. with 2 rainy Fridays every rainy-Friday row is
    # halved, so summing the rows (which the `weight` aesthetic does) gives
    # the average ridership of a rainy Friday. Overwriting every row with
    # the group average, as was done before, inflated each bar by the
    # number of rows in the group.
    df[u'ENTRIESn_hourly'] = df[u'ENTRIESn_hourly'].astype(float)
    for day in df.weekday.unique():
        for rain_status in df.rain.unique():
            in_group = (df.weekday == day) & (df.rain == rain_status)
            n_dates = df.loc[in_group, u'DATEn'].nunique()
            if n_dates == 0:
                # This weekday/rain combination never occurs.
                continue
            df.loc[in_group, u'ENTRIESn_hourly'] /= float(n_dates)

    p = ggplot.ggplot(ggplot.aes(x=u'factor(weekday)',
                                 weight=u'ENTRIESn_hourly',
                                 fill=u'weekday'),
                      data=df) +\
        ggplot.geom_bar() +\
        ggplot.facet_grid(x=u'rain', y=u'weekday') +\
        ggplot.ggtitle('Average Ridership on Sunny & Rainy ISO Weekdays')
    print(p)
    return p
pd.to_datetime(re.sub(r'\scrop_small.png','', x), format='%Y-%m-%dt%H%M')) wormID = 0 data['lifeSpanHours'] = 0 data['lifeSpanDays'] = 0 data['elapsedHours'] = 0 data['elapsedDays'] = 0 data['wormID'] = '' for iWorm in data['path'].unique(): data.loc[data['path'] == iWorm, 'lifeSpanDays'] = (data.loc[data['path'] == iWorm, 'date'].max()-data.loc[data['path'] == iWorm, 'date'].min()).days data.loc[data['path'] == iWorm, 'lifeSpanHours'] = (data.loc[data['path'] == iWorm, 'date'].max()-data.loc[data['path'] == iWorm, 'date'].min()).seconds/3600 + data.loc[data['path'] == iWorm, 'lifeSpanDays'] * 24 data.loc[data['path'] == iWorm, 'elapsedDays'] = (data.loc[data['path'] == iWorm, 'date'] - data.loc[data['path'] == iWorm, 'date'].min())/(86400*1e9) data.loc[data['path'] == iWorm, 'elapsedHours'] = (data.loc[data['path'] == iWorm, 'date'] - data.loc[data['path'] == iWorm, 'date'].min())/(3600*1e9) data.loc[data['path'] == iWorm, 'wormID'] = '%d' % wormID wormID += 1 data['lifeSpanDuration'] = pd.cut(data.lifeSpanDays, 2, labels=["short", "long"]) data.to_csv(dfFile, sep=',', encoding='utf-8', header=True, index=False) data.lifeSpanDays.mean() data.lifeSpanDays.max() data.lifeSpanDays.min() #dataMidPoint = data.loc[data['elapsedDays'] ==4 , :] dataMidPoint = data.loc[data['elapsedDays'] ==1 , :] dataMidPoint.to_csv(dfMidPointFile, sep=',', encoding='utf-8', header=True, index=False) p = gg.ggplot(gg.aes(x='lifeSpanDuration'), data=dataMidPoint) p + gg.geom_bar()
#####################################################################################
# Here is an example of using Rodeo:

# We'll use the popular package called Pandas.
# Install it with pip. In the Rodeo/IPython console that is the shell
# escape below; it is kept as a comment because `!` is not valid Python
# when this file is run or imported as a plain script:
#     ! pip install pandas

# Import it as 'pd'
import pandas as pd

# Create a dataframe
df = pd.DataFrame({
    "Animal": ["dog", "dolphin", "chicken", "ant", "spider"],
    "Legs": [4, 0, 2, 6, 8],
})
df.head()

#####################################################################################
# An example of making a plot (install the package first, same as above):
#     ! pip install ggplot
from ggplot import ggplot, aes, geom_bar

ggplot(df, aes(x="Animal", weight="Legs")) + geom_bar(fill='blue')

# Find this tutorial helpful? Checkout the blue sidebar for more tutorials!

import freegames
from turtle import *
from random import randrange
from freegames import square, vector
import os
from os.path import join, splitext, exists, abspath, basename

import pandas as pd
import pdb
import ggplot as ggp

import utils

log = utils.makeLogger('titanic-data-exploration')

# Load dataframe (path is resolved relative to the current directory's parent)
DFN = abspath(join(os.curdir, os.pardir, 'datasets', 'titanic-train.csv'))
df = pd.read_csv(DFN)
log.debug("{} has columns: {}".format(basename(DFN), df.columns.tolist()))

# Plot exploratory data analysis
log.debug("Plotting exploratory slices")
pt = (
    ggp.ggplot(ggp.aes(x='Sex', fill='Survived'), data=df)
    + ggp.geom_bar()
)

# NOTE type "pt" into the debugger to show the plot
pdb.set_trace()
# With ggplot
import ggplot as gg

df_ = df.copy()
df_["cat"] = df_.index
df_melted = df_.melt(id_vars=["cat"])

cm2 = gg.ggplot(df_melted, gg.aes(x="cat", fill="variable", y="value"))
# Add every component to the plot itself, left to right. Writing
# `cm2 += xlab + ylab + ggtitle` first sums the label objects with each
# other, so only the plot-side addition of that combined object is applied.
# NOTE(review): whether labels were actually lost depends on the installed
# ggplot version -- this form is correct either way.
cm2 = (cm2
       + gg.xlab("category")
       + gg.ylab("frequency")
       + gg.ggtitle("Confusion Matrix"))
cm2 = cm2 + gg.geom_bar(stat="identity", position="stack")
cm2

#%%
# With altair
import altair as alt

chart = alt.Chart(df_melted).mark_bar().encode(
    x='cat',
    y='value',
    color='variable'
)
#!/usr/bin/env python
#================== This line is 79 spaces wide ==============================#
import ggplot

# Last 15 rows of the mtcars sample data shipped with ggplot.
test_data = ggplot.mtcars.tail(15)
print(test_data)

# Bar chart with one bar per distinct `cyl` value.
p = ggplot.ggplot(ggplot.aes(x='factor(cyl)'), data=test_data) +\
    ggplot.geom_bar()
print(p)
import ggplot as gg
import ultrasignup as us
import numpy as np

d = us.event_results(299)


def _finishing_times_plot(distance, title):
    """Build the per-gender finishing-time bar chart for one race distance.

    Only results of ``d`` with ``time_hour`` above 1.0 are included.
    """
    finishers = d[(d.distance == distance) & (d.time_hour > 1.0)]
    plot = gg.ggplot(gg.aes(x='time_hour', fill='gender'), finishers)
    plot = plot + gg.facet_grid(x='gender')
    plot = plot + gg.geom_bar(stat="bin", binwidth=.5, position="dodge",
                              colour="black")
    plot = plot + gg.xlab("Time (hours)")
    plot = plot + gg.ylab("Number of Finishers")
    plot = plot + gg.ggtitle(title)
    return plot


p1 = _finishing_times_plot('50K', "50K Finishing Times for All Years")
p2 = _finishing_times_plot('11 Miler', "11M Finishing Times for All Years")
def second(dataframe):
    """Show a bar chart over the ``Speed`` column of ``dataframe``.

    The chart is titled "Frequencies of Speeds Among Interfaces" and drawn
    with the xkcd theme.
    """
    chart = ggplot.ggplot(ggplot.aes(x='Speed'), data=dataframe)
    chart = chart + ggplot.geom_bar(color='lightblue')
    chart = chart + ggplot.ggtitle("Frequencies of Speeds Among Interfaces")
    chart = chart + ggplot.theme_xkcd()
    chart.show()
}, index=range(t * len(count_tops), t * len(count_tops) + len(count_tops))) probs_list.append(probs_t) # Calculate KL divergences kl_mle_list.append(stats.entropy(true_bins_t, mle_probs_vals)) kl_nn_list.append(stats.entropy(true_bins_t, nn_probs_t)) probs = pd.concat(probs_list) # In[44]: probs_tail = probs[probs.Tenor > 360] gg.ggplot(probs_tail, gg.aes(x='Count Top', weight='Probs True') ) + gg.facet_grid('Tenor') + gg.geom_bar() + gg.geom_step( gg.aes(y='Probs MLE', color='red')) + gg.geom_step( gg.aes(y='Probs NN', color='blue')) + gg.scale_x_continuous( limits=(0, len(count_tops))) # In[57]: # KL divergences kl_df = pd.DataFrame({ 'Tenor': range(0, t_end + 1), 'KL MLE': kl_mle_list, 'KL NN': kl_nn_list }) print kl_df.head()
the_max = np.max(model2scores[model][plot_key_name]) the_min = np.min(model2scores[model][plot_key_name]) total = len(model2scores[model][plot_key_name]) for value in model2scores[model][plot_key_name]: plot_dataset.append( [model, value, the_mean / total, the_std, the_max, the_min]) plot_dataset_pd = pd.DataFrame( plot_dataset, columns=['model', 'value', 'weight', 'std', 'max', 'min']) if 'logloss' in plot_key_name: p = ggplot.ggplot(ggplot.aes(x = 'model', fill = 'model', weight = 'weight'), data = plot_dataset_pd) +\ ggplot.geom_bar(position = 'stack', width = 4) +\ ggplot.geom_errorbar(ggplot.aes(x = 'model', y = 'value')) +\ ggplot.ylim(0 ,5.05) +\ ggplot.ggtitle(plot_key_name) #print(p) elif 'time' in plot_key_name: p = ggplot.ggplot(ggplot.aes(x = 'model', fill = 'model', weight = 'weight'), data = plot_dataset_pd) +\ ggplot.geom_bar(position = 'stack', width = 4) +\ ggplot.geom_errorbar(ggplot.aes(x = 'model', y = 'value')) +\ ggplot.ggtitle(plot_key_name) #print(p)
# For example, select the following lines
x = 7
x**2
# and remember to press COMMAND + ENTER

# You can also run code directly in the console below.

#####################################################################################
# Here is an example of using Rodeo:

# We'll use the popular package called Pandas.
# Install it with pip. In the Rodeo/IPython console that is the shell
# escape below; it is kept as a comment because `!` is not valid Python
# when this file is run or imported as a plain script:
#     ! pip install pandas

# Import it as 'pd'
import pandas as pd

# Create a dataframe
df = pd.DataFrame({
    "Animal": ["dog", "dolphin", "chicken", "ant", "spider"],
    "Legs": [4, 0, 2, 6, 8],
})
df.head()

#####################################################################################
# An example of making a plot (install the package first, same as above):
#     ! pip install ggplot
from ggplot import ggplot, aes, geom_bar

ggplot(df, aes(x="Animal", weight="Legs")) + geom_bar(fill='blue')

# Find this tutorial helpful? Checkout the blue sidebar for more tutorials!