def aggregate_by_month(coll):
    data = [x for x in coll.find()]
    index = [datetime.strptime(x['created']['timestamp'], DT_FRMT) for x in data]
    df = pd.DataFrame(dict(month=[month(x) for x in index],
                           count=[1 for x in index]), index=index)
    month_count = df.groupby('month', as_index=False).aggregate(np.count_nonzero)
    print month_count
    print ggplot(aes(x='month', y='count'), data=month_count) + \
        geom_bar(stat='identity') + labs(title='By Count') + ylab('Num Records')
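The snippet above relies on a `DT_FRMT` format string and a `month()` helper that are not defined here. A minimal sketch of what they might look like, purely as an assumption (ISO-8601-style timestamps and a 'YYYY-MM' month key):

```python
from datetime import datetime

# Hypothetical stand-ins -- neither name is defined in the snippet above.
DT_FRMT = '%Y-%m-%dT%H:%M:%S'   # assumed timestamp format

def month(dt):
    # Collapse a datetime down to a 'YYYY-MM' bucket for grouping.
    return dt.strftime('%Y-%m')
```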
def timeseriesplots(self): rawdat = importSPOD(datafolder, 1, minTime, maxTime) rawdat['timeStamp'] = pd.Series(pd.date_range(minTime, maxTime, freq='10s'), index=pd.date_range(minTime, maxTime, freq='10s')).resample('1s', fill_method = 'pad') font = {'weight' : 'bold', 'size' : 6} mpl.rcParams['axes.xmargin'] = .25 mpl.rc('font', **font) base = ggplot(aes(x='timeStamp', y='Base'), data=rawdat) +\ geom_line(color='blue') +\ ylab('Base Sensor (V)') +\ xlab('') + ylim(0,5.1) +\ scale_x_date(labels='%m/%d %H:00', breaks=date_breaks('6 hours')) # theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False) ggsave(plot = base, filename = figfolder+'Base.png', width = 8, height = 3) remote = ggplot(aes(x='timeStamp', y='Remote'), data=rawdat) +\ geom_line(color='blue') +\ ylab('Remote Sensor (V)') +\ xlab('') + ylim(0,5.1) +\ scale_x_date(labels='%m/%d %H:00', breaks=date_breaks('6 hours')) # theme_matplotlib(mpl.rc('font', **font), matplotlib_defaults=False) ggsave(plot = remote, filename = figfolder+'Remote.png', width = 8, height = 3)
def test_scale():
    meat = _build_meat_df()
    p = ggplot(aes(x='date', y='beef'), data=meat)
    print(p + geom_point() + scale_x_continuous("This is the X") +
          scale_y_continuous("Squared", limits=[0, 1500]))
    print(p + geom_point() + ylim(0, 1500))
    gg = ggplot(aes(x='date', y='beef'), data=meat) + geom_line()
    print(gg + scale_x_date(labels="%Y-%m-%d"))
def plotHist(arr, category, save_dir): def space2Highfen(string): if ' ' in string: print('{0} has space\n'.format(string)) strList = list(string) length = len(strList) for i in range(length): if strList[i] == ' ': strList[i] = '-' return ''.join(strList) return string arr = [x for x in arr if x != 0] maxi = max(arr) col1 = 'original-'+space2Highfen(category) # col2 = 'linear-'+category # col3 = 'log-'+category col4 = 'log-Scale-'+space2Highfen(category) df = pd.DataFrame(pd.Series(arr), columns = [col1]) #original # df[col2] = (maxi - df[col1])/maxi # df[col3] = (np.log(maxi) - np.log(df[col1]))/np.log(maxi) df[col4] = np.log(df[col1]) #logscale width = 6 height = 5.5 p1 = ggplot(aes(x = col1), data = df) + geom_histogram() # p2 = ggplot(aes(x = col2), data = df) + geom_histogram() # p3 = ggplot(aes(x = col3), data = df) + geom_histogram() p4 = ggplot(aes(x = col4), data = df) + geom_histogram() ggsave(plot = p1, filename = col1 + ".png", path = save_dir, width = width, height = height, dpi = 75) # reduce dpi to save compile time # ggsave(plot = p2, filename = col2 + "no0.png", path = save_dir) # ggsave(plot = p3, filename = col3 + "no0.png", path = save_dir) # ggsave(plot = p4, filename = col4 + ".png", path = save_dir, width = 5, height = 5, dpi = 100) ggsave(plot = p4, filename = col4 + ".png", path = save_dir, width = width, height = height, dpi = 75)
def lineplot_compare(hr_by_team_year_sf_la_csv):
    # ggplot(data, aes(xvar, yvar, color=category_var))
    dataframe = pandas.read_csv(hr_by_team_year_sf_la_csv)
    # Colouring by teamID separates the two teams from each other.
    gg = ggplot(dataframe, aes(x='yearID', y='HR', color='teamID')) + geom_point() + geom_line()
    return gg
def test_geom_rect(): df = pd.DataFrame({ 'xmin': [1,3,5], 'xmax': [2, 3.5, 7], 'ymin': [1, 4, 6], 'ymax': [5, 5, 9], 'fill': ['blue', 'red', 'green'], 'quality': ['good', 'bad', 'ugly'], 'alpha': [0.1, 0.5, 0.9], 'texture': ['hard', 'soft', 'medium']}) p = ggplot(df, aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax', colour='quality', fill='fill', alpha='alpha', linetype='texture')) p += geom_rect(size=5) assert_same_ggplot(p, 'geom_rect') p = ggplot(df, aes(xmin='xmin', xmax='xmin + 1', ymin='ymin', ymax='ymin + 1')) p += geom_rect() assert_same_ggplot(p, 'geom_rect_plus') p = ggplot(df, aes(x='xmin', y='ymin')) p += geom_point(size=100, colour='red', alpha=0.5) p += geom_rect(aes(fill='fill', xmin='xmin', xmax='xmin + 1', ymin=0, ymax='ymax'), alpha=0.1) assert_same_ggplot(p, 'geom_rect_with_point')
def test_stat_function(): np.random.seed(7776) dnorm = lambda x : (1.0 / np.sqrt(2 * np.pi)) * (np.e ** (-.5 * (x ** 2))) print(ggplot(DataFrame({'x':np.random.normal(size=100)}),aes(x='x')) + \ geom_density() + \ stat_function(fun=dnorm,n=200)) print(ggplot(DataFrame({'x':np.arange(10)}),aes(x='x')) + \ stat_function(fun=np.sin,color="red") + \ stat_function(fun=np.cos,color="blue")) # Test when args = list def to_the_power_of(n,p): return n ** p x = np.random.randn(100) y = x ** 3 y += np.random.randn(100) data = DataFrame({'x':x,'y':y}) print(ggplot(aes(x='x',y='y'),data) + geom_point() + \ stat_function(fun=to_the_power_of,args=[3])) # Test when args = dict def dnorm(x,mean,var): return scipy.stats.norm(mean,var).pdf(x) data = DataFrame({'x':np.arange(-5,6)}) print(ggplot(aes(x='x'),data=data) + \ stat_function(fun=dnorm,color="blue",args={'mean':0.0,'var':0.2}) + \ stat_function(fun=dnorm,color="red",args={'mean':0.0,'var':1.0}) + \ stat_function(fun=dnorm,color="yellow",args={'mean':0.0,'var':5.0}) + \ stat_function(fun=dnorm,color="green",args={'mean':-2.0,'var':0.5}))
def main(): # Set system variables root = r'/Users/DC-MBP/Desktop/final-project' temp = os.path.join(root, 'Temp') data = r'/Users/DC-MBP/Desktop/yelp-api' data_file = 'yelp_academic_dataset_business.json' # Set regression formula rf = 'stars ~ review_count + state + Caters + Attire + BYOB + Alcohol' # Create data file df_business = process_data_restaurant(data, data_file) # Create Vegas data file #create distance from town center 36.175, -115.136389 df_vegas = df_business[df_business.city == "Las Vegas"] df_vegas['distance'] = np.sqrt(np.power(df_vegas.latitude-36.175,2) + np.power(df_vegas.longitude+115.136389,2)) # Create visualizations p1 = ggplot(aes(y='stars', x='review_count'),data=df_business) print(p1 + geom_point()) p2 = ggplot(aes(y='latitude', x='longitude'), data=df_vegas) print(p2 + geom_point()) p3 = ggplot(aes(y='stars', x='distance'), data=df_vegas) print(p3 + geom_point()) print 'End'
def wrapper(name):
    global pltsize
    Xt, Yt = loadData(name, 'train')
    Xv, Yv = loadData(name, 'validate')
    w = Train(Xt, Yt, 0)
    print 'Classification Error (TR): ', classifyErr(LRPredict(w, Xt), Yt, 0.5), name
    print 'Classification Error (VAL): ', classifyErr(LRPredict(w, Xv), Yv, 0.5), name
    # Pass the prepared titles through (they were previously built but unused).
    t1 = 'Classification Error vs Decision Boundary - ' + name + ': Training'
    t2 = 'Classification Error vs Decision Boundary - ' + name + ': Validation'
    plotCEDB(w, Xt, Yt, t1)
    plotCEDB(w, Xv, Yv, t2)
    t1 = 'Logistic Regression - ' + name + ': Training'
    t2 = 'Logistic Regression - ' + name + ': Validation'
    plotDecisionBoundary(w, Xt, Yt, LRPredict, [0.5], t1)
    plotDecisionBoundary(w, Xv, Yv, LRPredict, [0.5], t2)
    l = array(linspace(0, 100, 101))
    tErr, tClass, vErr, vClass = GridL(Xt, Yt, Xv, Yv, l)
    DF1 = pd.DataFrame({'TR': pd.Series(tClass), 'VAL': pd.Series(vClass), 'Lambda': pd.Series(l)})
    DF1 = pd.melt(DF1, id_vars=['Lambda'])
    DF2 = pd.DataFrame({'TR': pd.Series(tErr), 'VAL': pd.Series(vErr), 'Lambda': pd.Series(l)})
    DF2 = pd.melt(DF2, id_vars=['Lambda'])
    title1 = 'Classification Error vs Lambda - ' + name
    title2 = 'Logistic Loss vs Lambda - ' + name
    # 'print p1 = ggplot(...)' is invalid syntax; assign the plot first, then print it.
    p1 = ggplot(DF1, aes(x='Lambda', y='value', color='variable')) + geom_line(size=4) + \
        ggtitle(title1) + ylab('Error') + theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print p1
    p2 = ggplot(DF2, aes(x='Lambda', y='value', color='variable')) + geom_line(size=4) + \
        ggtitle(title2) + ylab('Error') + theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print p2
def test_stat_vhabline_functions():
    def fn_x(x):
        return 1

    def fn_y(y):
        return 1

    def fn_xy(x, y):
        return 1

    gg = ggplot(aes(x='wt'), mtcars)
    # needs y aesthetic
    with assert_raises(GgplotError):
        print(gg + stat_abline(slope=fn_xy))
    # needs y aesthetic
    with assert_raises(GgplotError):
        print(gg + stat_abline(intercept=fn_xy))

    gg = ggplot(aes(x='wt', y='mpg'), mtcars)
    # Functions with 2 args, no problem
    print(gg + stat_abline(slope=fn_xy, intercept=fn_xy))
    # slope function should take 2 args
    with assert_raises(GgplotError):
        print(gg + stat_abline(slope=fn_x, intercept=fn_xy))
    # intercept function should take 2 args
    with assert_raises(GgplotError):
        print(gg + stat_abline(slope=fn_xy, intercept=fn_y))
    # xintercept function should take 1 arg
    with assert_raises(GgplotError):
        print(gg + stat_vline(xintercept=fn_xy))
    # yintercept function should take 1 arg
    with assert_raises(GgplotError):
        print(gg + stat_hline(yintercept=fn_xy))
def plot_sed(tmp, phot=None, fname=None, ignore=None, err=None):
    ''' make plots using ggplot '''
    wav = tmp.df.wav
    cols = list(tmp.df.columns[1:])
    if ignore is not None:
        for i in ignore:
            cols.remove(i)
    df_plot = pd.DataFrame({'log wav(um)': np.log10(wav),
                            'log flux': np.log10(tmp.df.loc[:, cols[0]]),
                            'template': [cols[0] for x in range(len(wav))]})
    for i in cols[1:]:
        df = pd.DataFrame({'log wav(um)': np.log10(wav),
                           'log flux': np.log10(tmp.df.loc[:, i]),
                           'template': [i for x in range(len(wav))]})
        df_plot = pd.concat([df_plot, df])

    if phot is None:
        plt_out = ggplot(df_plot, aes(x='log wav(um)', y='log flux', color='template')) + geom_line()
    else:
        if type(phot) != pd.Series:
            print('phot should be in pandas series')
            return None
        # Build the photometry overlay as a DataFrame (a bare dict cannot be plotted),
        # and build it for both the err-less and err-bearing branches.
        df_phot = pd.DataFrame({'log wav(um)': np.log10(np.asarray([dict_wav[x] for x in phot.index])),
                                'log flux': np.log10(phot.values.astype(float)),
                                'template': ['Data' for x in range(len(phot))]})
        if err is None:
            plt_out = ggplot(df_phot, aes(x='log wav(um)', y='log flux', color='template')) + \
                geom_point() + geom_line(df_plot)
        else:
            plt_out = ggplot(df_plot, aes(x='log wav(um)', y='log flux', color='template')) + \
                geom_line() + geom_point(data=df_phot)

    #if fname is None:
    #    fname = 'plot'
    #ggsave(plt_out, fname + '.pdf')
    # This is a plain function (no self), so return the plot instead of setting self.sed.
    return plt_out
def generateBathroomTilePlot(bl_vs_change_json): df = pd.read_json(bl_vs_change_json) summary_regions = ['ctx-lh-parsorbitalis','ctx-rh-parsorbitalis','ctx-rh-lateralorbitofrontal', 'ctx-lh-lateralorbitofrontal','ctx-rh-frontalpole','ctx-rh-parstriangularis', 'ctx-lh-frontalpole','ctx-lh-parstriangularis','ctx-lh-caudalanteriorcingulate', 'ctx-rh-rostralmiddlefrontal','ctx-lh-caudalmiddlefrontal', 'ctx-rh-caudalanteriorcingulate','ctx-rh-rostralanteriorcingulate', 'ctx-lh-rostralmiddlefrontal','ctx-rh-caudalmiddlefrontal', 'ctx-lh-superiorparietal','ctx-rh-isthmuscingulate', 'ctx-lh-rostralanteriorcingulate','ctx-rh-parsopercularis', 'ctx-rh-superiorparietal','ctx-lh-parsopercularis', 'ctx-rh-medialorbitofrontal','ctx-lh-isthmuscingulate', 'ctx-lh-supramarginal','ctx-lh-inferiorparietal','ctx-rh-supramarginal', 'ctx-lh-superiorfrontal','ctx-rh-superiorfrontal','ctx-rh-middletemporal', 'ctx-lh-middletemporal','ctx-rh-inferiorparietal','ctx-rh-superiortemporal', 'ctx-lh-posteriorcingulate','ctx-lh-precuneus','ctx-lh-medialorbitofrontal', 'ctx-lh-superiortemporal','ctx-rh-posteriorcingulate','ctx-rh-precuneus'] ordering = {x:i for i,x in enumerate(summary_regions)} rank_by = summary_regions # could take subset of cortical summary regions subjects = GROUPS['increasing_low']['N'] df = df[df['rid'].isin(subjects)] baseline_keys = ["%s_bl" % _ for _ in rank_by] change_keys = ["%s_change" % _ for _ in summary_regions] df['rank'] = df[baseline_keys].mean(axis=1) keep_keys = ['rid', 'rank'] + change_keys df = df[keep_keys] df_long = pd.melt(df,id_vars=['rank'],value_vars=change_keys) # sort change df_long['variable'] = [_.replace('_change','') for _ in df_long['variable']] df_long['variable'] = ['%s_%s' % (str(ordering[_]).zfill(2),_) for _ in df_long['variable']] print ggplot(aes(x='variable',y='rank'),data=df_long)+geom_tile(aes(fill='value'))+theme(axis_text_x=element_text(angle=270,size=8), axis_text_y=element_text(size=6))
def plot_weather_data(turnstile_weather):
    """ Plot turnstile weather data """
    # Subway ridership by time of day
    # Group by Hour, with cumulative entries summed per hour
    df_time_of_day = turnstile_weather.loc[:, ['Hour', 'ENTRIESn_hourly']].groupby(['Hour'], as_index=False).sum()
    # Create plot
    df_time_of_day_plot = ggplot(df_time_of_day, aes('Hour'))
    df_time_of_day_plot = df_time_of_day_plot + \
        geom_bar(aes(x='Hour', weight='ENTRIESn_hourly'), binwidth=1) + \
        scale_x_continuous(limits=(0, 23))

    # Subway ridership by subway station
    # Group by UNIT, with cumulative entries summed per station
    df_subway_station = turnstile_weather.loc[:, ['UNIT', 'ENTRIESn_hourly']].groupby(['UNIT'], as_index=False).sum()
    # Create plot
    df_subway_station_plot = ggplot(df_subway_station, aes(x='UNIT'))
    df_subway_station_plot = df_subway_station_plot + geom_bar(aes(x='UNIT', weight='ENTRIESn_hourly'))

    # Subway ridership, total
    # Group by DATEn, with entries summed per day
    df_total = turnstile_weather.loc[:, ['DATEn', 'ENTRIESn_hourly']].groupby(['DATEn'], as_index=False).sum()
    # Convert DATEn column to proper datetime
    df_total['DATEn'] = pandas.to_datetime(df_total['DATEn'])
    df_total['DATEn'] = [d.date() for d in df_total['DATEn']]
    # Create plot
    df_total_plot = ggplot(df_total, aes('DATEn'))
    df_total_plot = df_total_plot + geom_bar(aes(x='DATEn', weight='ENTRIESn_hourly')) + scale_x_date()

    return df_time_of_day_plot, df_subway_station_plot, df_total_plot
def plot_year_doy(df, title, palette='RdYlGn'):
    """ Plot year / doy with clear percent as color if available """
    if 'clear' in df.columns:
        pct_clear = ((df['clear'] // 20) * 20).astype(np.uint8)
        df['Percent Clear'] = [' ' * (3 - len(str(v))) + str(v) if v < 100 else str(v)
                               for v in pct_clear]
        # HACK to get all values shown (labels are padded to a width of 3)
        need = ['  0', ' 20', ' 40', ' 60', ' 80', '100']
        to_add = [v for v in need if v not in np.unique(df['Percent Clear'])]
        for v in to_add:
            df = pd.concat([df, df[:1]])
            df['year'][-1:] = np.nan
            df['doy'][-1:] = np.nan
            df['Percent Clear'][-1:] = v
        plot = ggplot(aes('year', 'doy', color='Percent Clear'), df)
        plot = plot + scale_color_brewer(type='diverging', palette=palette)
    else:
        plot = ggplot(aes('year', 'doy'), df)

    return (plot + geom_point(size=50) +
            xlim(df.year.min() - 1, df.year.max() + 1) +
            ylim(0, 366) +
            xlab('Year') + ylab('Day of Year') +
            ggtitle(title))
def entries_histogram(turnstile_weather, fog=False): ''' Before we perform any analysis, it might be useful to take a look at the data we're hoping to analyze. More specifically, lets examine the hourly entries in our NYC subway data and determine what distribution the data follows. This data is stored in a dataframe called turnstile_weather under the ['ENTRIESn_hourly'] column. Why don't you plot two histograms on the same axes, showing hourly entries when raining vs. when not raining. Here's an example on how to plot histograms with pandas and matplotlib: turnstile_weather['column_to_graph'].hist() Your histograph may look similar to the following graph: http://i.imgur.com/9TrkKal.png You can read a bit about using matplotlib and pandas to plot histograms: http://pandas.pydata.org/pandas-docs/stable/visualization.html#histograms You can look at the information contained within the turnstile weather data at the link below: https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv ''' ''' plot = (ggplot(turnstile_weather, aes(x='ENTRIESn_hourly')) + geom_histogram(data=turnstile_weather[turnstile_weather['fog']==0], position="identity") + ggtitle('Hourly ridership') + xlab('Hourly entries') + ylab('Frequency')) ''' #turnstile_weather['ENTRIESn_hourly'] = np.log(turnstile_weather['ENTRIESn_hourly'][turnstile_weather['ENTRIESn_hourly'] > 0]) turnstile_weather['ENTRIESn_hourly'] = turnstile_weather['ENTRIESn_hourly'].map(lambda y: boxcox1p(y)) if fog: plot = (ggplot(turnstile_weather[turnstile_weather['fog'] == 0], aes(x='ENTRIESn_hourly')) + geom_histogram(color='red') + ggtitle('Hourly ridership on non-foggy days') + xlab('log(Hourly entries+1)') + ylab('Frequency')) else: plot = (ggplot(turnstile_weather[turnstile_weather['fog'] == 1], aes(x='ENTRIESn_hourly')) + geom_histogram(color='blue') + ggtitle('Hourly ridership on foggy days') + xlab('log(Hourly entries+1)') + ylab('Frequency')) # What about fog option? # Can we measure how correlated fog and rain? # Could we find out the level of fog from the meantempi and dewpointi? This would # be hard to predict? # BE SIMPLE AND ENJOY THE FLOP. TAKE FOG! # STOP TRYING TO PREDICT THE FUTURE AND COVER YOUR ASS! return plot
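entries_histogram above transforms ENTRIESn_hourly with a `boxcox1p` helper that is not defined in the snippet (note that `scipy.special.boxcox1p` requires a lambda argument, so the one-argument call implies a local wrapper). A minimal sketch of such a wrapper, assuming lambda = 0, i.e. a log(1 + y) transform:

```python
import numpy as np

def boxcox1p(y, lmbda=0.0):
    """Box-Cox transform of (1 + y); with lmbda = 0 this reduces to log1p.

    Hypothetical helper -- the original definition is not shown in the snippet.
    """
    if lmbda == 0.0:
        return np.log1p(y)
    return ((1.0 + y) ** lmbda - 1.0) / lmbda
```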
def main(parameters): label = sys.argv[-1] # Sumatra appends the label to the command line subdir = os.path.join("mydata", label) #os.mkdir(subdir) res = {} an = [] ax = [] ay = [] all_df = pd.DataFrame({"i":[],"Name":[]}) final_df = pd.DataFrame({"Algorithm":[],"Task":[],"Steps":[]}) for scenario in parameters["scenarios"]: res[scenario] = {} for algorithm in parameters["algorithms"]: name = scenario+"_"+algorithm fileid = "%s_%s.txt" % (scenario, algorithm) fn = os.path.join(subdir, fileid) da, ap = average_run(parameters["AcceptableScore"],fn) for i,r in ap.iterrows(): an.append(name) ax.append(r["n"]) ay.append(r["avg"]) all_df = all_df.append(da) #print algorithm,ax,ay, ap # if len(ay) == 0: final_df = final_df.append(dict(Algorithm=algorithm,Task=scenario,Steps=ay[-1]),ignore_index=True) # showing that we have enough runs df = pd.DataFrame({"Name":an,"Runs":ax,"Avg":ay}) #print df p = ggplot(aes(x='Runs',y="Avg"), data=df) + geom_point() + geom_line()+ \ facet_wrap("Name") ggsave(p,os.path.join(subdir, "avg_runs.png")) #ploting all runs #all_df["y"] = all_df["0"] #print all_df all_plot = ggplot(aes(x='i', y='avg',colour="Name"), data=all_df) + geom_point() + geom_line() ggsave(all_plot,os.path.join(subdir, "all_runs.png")) #final comparison #do in R #print final_df final_df.to_csv(os.path.join(subdir, "final_comp.csv"),index=False) import subprocess proc = subprocess.Popen(['/usr/bin/Rscript','result.R',subdir], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = proc.communicate() print stdout,stderr proc.wait() print "done",subdir
def test_factor():
    p = ggplot(mtcars, aes(x='wt', y='mpg', colour='factor(cyl)',
                           size='mpg', linetype='factor(cyl)'))
    print(p + geom_line())
    print(p + geom_point())
    print(p + geom_line() + geom_point())
    print(p + geom_point() + geom_line(color='lightblue') +
          ggtitle("Beef: It's What's for Dinner") +
          xlab("Date") + ylab("Head of Cattle Slaughtered"))
    p = ggplot(aes(x='factor(cyl)'), data=mtcars)
    print(p + geom_bar())
def plot_cost_history(alpha, cost_history):
    cost_df = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': range(len(cost_history))
    })
    print ggplot(cost_df, aes('Iteration', 'Cost_History')) + \
        geom_point() + ggtitle('Cost History for alpha = %.3f' % alpha)
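A possible way to exercise plot_cost_history; the gradient-descent loop here is illustrative only (the snippet does not show how cost_history is produced), and it assumes `pandas` and the ggplot names are already imported as the function expects:

```python
import numpy as np

# Hypothetical data: fit y = 2x with plain gradient descent and record the cost.
x = np.arange(100, dtype=float)
y = 2.0 * x
theta, alpha = 0.0, 0.0001
cost_history = []
for _ in range(50):
    err = theta * x - y
    cost_history.append(float(np.mean(err ** 2)))   # mean squared error at this step
    theta -= alpha * 2 * np.mean(err * x)            # gradient step on theta

plot_cost_history(alpha, cost_history)
```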
def plot_weather_data(turnstile_weather):
    '''
    Use ggplot to make another data visualization focused on the MTA and weather
    data we used in assignment #3. You should make a type of visualization
    different than you did in exercise #1, and try to use the data in a different
    way (e.g., if you made a lineplot concerning ridership and time of day in
    exercise #1, maybe look at weather and try to make a histogram in exercise #2).

    You should feel free to implement something that we discussed in class
    (e.g., scatterplots, line plots, or histograms) or attempt to implement
    something more advanced if you'd like.

    Here are some suggestions for things to investigate and illustrate:
     * Ridership by time of day or day of week
     * How ridership varies based on Subway station
     * Which stations have more exits or entries at different times of day

    If you'd like to learn more about ggplot and its capabilities, take a look
    at the documentation at:
    https://pypi.python.org/pypi/ggplot/

    You can check out:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    to see all the columns and data points included in the turnstile_weather
    dataframe. However, due to the limitation of our Amazon EC2 server, we are
    giving you about 1/3 of the actual data in the turnstile_weather dataframe.
    '''
    print turnstile_weather.ix[:10]

    df = turnstile_weather[['rain', 'ENTRIESn_hourly']].groupby('rain', as_index=False).sum()

    # Map the 0/1 rain flag to readable labels on the dataframe we actually have
    # (the original referenced an undefined 'turnstile_master').
    map_dict = {1: "rain", 0: "no rain"}
    turnstile_weather["weather"] = turnstile_weather["rain"].map(map_dict)

    plot = ggplot(turnstile_weather, aes(x='ENTRIESn_hourly', color='weather')) \
        + geom_bar(aes(weight='ENTRIESn_hourly'), fill='') \
        + ggtitle('NYC Subway ridership by weather') + xlab('Entries') + ylab('Frequency')
    #print plot

    plot = ggplot(turnstile_weather, aes(x='ENTRIESn_hourly', color='rain')) \
        + geom_bar(aes(weight='ENTRIESn_hourly'), fill='green') + geom_density() \
        + ggtitle('NYC Subway ridership by day of week') + xlab('Day') + ylab('Entries')

    plot = ggplot(turnstile_weather, aes(x='ENTRIESn_hourly', color='fog')) \
        + geom_bar(aes(weight='ENTRIESn_hourly'), fill='orange') \
        + ggtitle('NYC Subway ridership by day of week') + xlab('Day') + ylab('Entries')

    plot = ggplot(df, aes(x='rain')) \
        + geom_bar(aes(weight='ENTRIESn_hourly'), fill='orange', binwidth=0.5) \
        + ggtitle('NYC Subway ridership by rain') + xlab('rain') + ylab('Entries')

    return plot
def plotCEDB(w, X, Y, title):
    global pltsize
    axis = array(linspace(0, 1, 101))
    out = zeros(len(axis))
    py = LRPredict(w, X)
    for i in range(len(axis)):
        out[i] = classifyErr(py, Y, axis[i])
    DF = pd.DataFrame({'Decision Boundary': pd.Series(axis),
                       'Classification Error': pd.Series(out)})
    print ggplot(DF, aes(x='Decision Boundary', y='Classification Error')) + \
        geom_line(size=4) + ggtitle(title) + \
        theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
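plotCEDB (and wrapper above) reads a module-level `pltsize` and hands it to theme_matplotlib as its rc argument, but `pltsize` is never defined in these snippets. A plausible definition, purely as an assumption, since theme_matplotlib treats rc as a matplotlib rcParams-style dict:

```python
# Hypothetical: an rcParams-style dict controlling figure size and font size.
pltsize = {
    'figure.figsize': [16, 10],
    'font.size': 14,
}
```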
def test_assign_colors(): """ Test how colors are assigned to different column types. """ df = pd.DataFrame({"values": np.arange(10), "int_col": np.arange(10), "num_col": np.arange(10) / 2, "bool_col": np.random.randn(10) > 0, "char_col": ["a", "b"] * 5}) color_mapping_col = 'color_mapping' # test integer column color_col = "int_col" gg_int = ggplot(df, aes(x="values", y="values", color="int_col")) gg_int += geom_point() gg_int.draw() new_data = assign_continuous_colors(df, gg_int, color_col) expected_cols = new_data[color_mapping_col] actual_cols = gg_int.data[color_mapping_col] assert_true((actual_cols == expected_cols).all()) # test numeric column color_col = "num_col" gg_num = ggplot(df, aes(x="values", y="values", color="num_col")) gg_num += geom_point() gg_num.draw() new_data = assign_continuous_colors(df, gg_int, color_col) expected_cols = new_data[color_mapping_col] actual_cols = gg_num.data[color_mapping_col] assert_true((actual_cols == expected_cols).all()) # test bool column color_col = "bool_col" gg_bool = ggplot(df, aes(x="values", y="values", color="bool_col")) gg_bool += geom_point() gg_bool.draw() new_data = assign_discrete_colors(df, gg_bool, color_col) expected_cols = new_data[color_mapping_col] actual_cols = gg_bool.data[color_mapping_col] assert_true((actual_cols == expected_cols).all()) # test char column color_col = "char_col" gg_char = ggplot(df, aes(x="values", y="values", color="char_col")) gg_char += geom_point() gg_char.draw() new_data = assign_discrete_colors(df, gg_bool, color_col) expected_cols = new_data[color_mapping_col] actual_cols = gg_char.data[color_mapping_col] assert_true((actual_cols == expected_cols).all())
def rebound_list_draw():
    # The original wrapped the body in a redundant 'if True:' block.
    global rblist2
    print rblist2
    powd = DataFrame(rblist2, columns=['index', 'open', 'year'])
    print ggplot(aes(x='index', y='open'), data=powd) + \
        geom_point(color='lightblue', size=9) + \
        ggtitle("Rebound") + \
        xlab("Date") + \
        ylab("Open")
def plot_weather_data(turnstile_weather): ''' You are passed in a dataframe called turnstile_weather. Use turnstile_weather along with ggplot to make a data visualization focused on the MTA and weather data we used in assignment #3. You should feel free to implement something that we discussed in class (e.g., scatterplots, line plots, or histograms) or attempt to implement something more advanced if you'd like. Here are some suggestions for things to investigate and illustrate: * Ridership by time of day or day of week * How ridership varies based on Subway station (UNIT) * Which stations have more exits or entries at different times of day (You can use UNIT as a proxy for subway station.) If you'd like to learn more about ggplot and its capabilities, take a look at the documentation at: https://pypi.python.org/pypi/ggplot/ You can check out: https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv To see all the columns and data points included in the turnstile_weather dataframe. However, due to the limitation of our Amazon EC2 server, we are giving you a random subset, about 1/3 of the actual data in the turnstile_weather dataframe. ''' df = turnstile_weather[['Hour', 'ENTRIESn_hourly']] q = """ SELECT Hour AS hour, sum(ENTRIESn_hourly)/count(*) AS hourlyentries FROM df GROUP BY hour """ #Execute SQL command against the pandas frame rainy_days = pandasql.sqldf(q.lower(), locals()) print ggplot(rainy_days, aes('hour', 'hourlyentries')) + \ geom_bar(fill = '#cc2127', stat='bar') + \ scale_x_continuous(name="Hour", breaks=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], labels=['12AM', '1AM', '2AM', '3AM', '4AM', '5AM', '6AM', '7AM', '8AM', '9AM', '10AM', '11AM', '12PM', '1PM', '2PM', '3PM', '4PM', '5PM', '6PM', '7PM', '8PM', '9PM', '10PM', '11PM']) + \ ggtitle("Average ENTRIESn_hourly by Hour") + \ ylab("ENTRIESn_hourly")
def test_geoms(): df = _build_testing_df() gg = ggplot(aes(x="x", color="c"), data=df) print(gg + geom_density() + xlab("x label") + ylab("y label")) gg = ggplot(aes(x="x", y="y", shape="cat2", color="cat"), data=df) print(gg + geom_histogram()) print(gg + geom_histogram() + ggtitle("My Histogram")) print(gg + geom_point()) print(gg + geom_point() + geom_vline(x=50, ymin=-10, ymax=10)) gg = ggplot(aes(x='x', ymax='y', ymin='z', color="cat2"), data=df) print(gg + geom_area())
def graph_data():
    """
    Graphs the features with event_counts on the x axis and std on the y axis
    using a ggplot2 extension for Python. You can find out more at
    ggplot.yhathq.com
    """
    data = pd.read_table('data/features.csv', sep=',')
    print ggplot(data, aes(x='event_counts', y='std', color='bot')) + \
        geom_point()
def plot_sed(self,phot = None, fname = None, ignore = None, err = None): ''' make plots using ggplot ''' wav = self.df.wav cols = list(self.df.columns[1:]) if ignore is not None: for i in ignore: cols.remove(i) df_plot = pd.DataFrame({'log wav(um)':np.log10(wav),'log flux':np.log10(self.df.loc[:,cols[0]]), 'logf_l':np.log10(self.df.loc[:,cols[0]]), 'logf_h':np.log10(self.df.loc[:,cols[0]]), 'template':[cols[0] for x in range(len(wav))]}) for i in cols[1:]: df = pd.DataFrame({'log wav(um)':np.log10(wav),'log flux':np.log10(self.df.loc[:,i]), 'logf_l':np.log10(self.df.loc[:,i]), 'logf_h':np.log10(self.df.loc[:,i]), 'template':[i for x in range(len(wav))]}) df_plot = pd.concat([df_plot,df]) if phot is None: plt_out=ggplot(df_plot,aes(x='log wav(um)',y='log flux',color='template'))+geom_line() elif err is None: print('No error bars') if type(phot) != pd.Series: print('phot should be in pandas series') else: df_phot = pd.DataFrame({'log wav(um)':np.log10(np.asarray([dict_wav[x] for x in phot.index])), 'log flux':np.log10(phot.values.astype(float)), 'template':['Data' for x in range(len(phot))]}) self.phot = df_phot plt_out=ggplot(df_phot,aes(x='log wav(um)', y='log flux',color='template'))+\ geom_point()+geom_line(df_plot)+\ ylim(min(df_phot['log flux'])-1.5,max(df_phot['log flux'])+0.5)+\ xlim(-0.7,1.5) #+geom_point(df_phot,size=40,color='red') else: if type(phot) != pd.Series or type(err) != pd.Series: print('phot and err should be in pandas series with band names as index') else: df_phot = pd.DataFrame({'log wav(um)':np.log10(np.asarray([dict_wav[x] for x in phot.index]).astype(float)), 'log flux':np.log10(phot.values.astype(float)), 'logf_l':np.log10(phot.values.astype(float)-0.95*err.values.astype(float)), 'logf_h':np.log10(phot.values.astype(float)+0.95*err.values.astype(float)), 'template':['Data' for x in range(len(phot))]}) plt_out=ggplot(df_phot,aes(x='log wav(um)',y='log flux',ymax='logf_h',ymin='logf_l',color='template'))+\ geom_point()+geom_pointrange()+geom_line(df_plot)+\ ylim(min(df_phot['log flux'])-1.5,max(df_phot['log flux'])+0.5)+\ xlim(-0.7,1.5) self.phot = df_phot #if fname is None: # fname = 'plot' #ggsave(plt_out,fname+'.pdf') self.sed = plt_out
def test_diamond(): p = ggplot(aes(x='x', y='y', colour='z'), data=diamonds.head(4)) p = p + geom_point() + scale_colour_gradient(low="white", high="red") p = p + facet_wrap("cut") print(p) p = ggplot(aes(x='x', y='y', colour='z'), data=diamonds.head(1000)) p = p + geom_point() + scale_colour_gradient(low="white", high="red") p = p + facet_grid("cut", "clarity") print(p) p = ggplot(aes(x='carat'), data=diamonds) print(p + geom_density() + facet_grid("cut", "clarity"))
def test_facet_grid(): # only use a small subset of the data to speedup tests # N=53940 -> N=7916 and only 2x2 facets _mask1 = (diamonds.cut == "Ideal") | (diamonds.cut == "Good") _mask2 = (diamonds.clarity == "SI2") | (diamonds.clarity == "VS1") _df = diamonds[_mask1 & _mask2] p = ggplot(aes(x='x', y='y', colour='z'), data=_df) p = p + geom_point() + scale_colour_gradient(low="white", high="red") p = p + facet_grid("cut", "clarity") print(p) p = ggplot(aes(x='carat'), data=_df) print(p + geom_density() + facet_grid("cut", "clarity"))
def plot_trip(trip_data_frame, details=tuple(), **kwargs):
    if check_trip_data_quality(trip_data_frame):
        plot = ggplot(trip_data_frame, aes(x='x', y='y')) + geom_point(**kwargs)
    else:
        d = trip_data_frame.copy()
        T = len(d)
        d['bad_x'] = T * [0]
        d['bad_y'] = T * [0]
        bad = ~d.check_velocity | ~d.check_angular_velocity
        d.ix[bad, ['bad_x', 'bad_y']] = d.ix[bad, ['x', 'y']]
        plot = ggplot(d, aes('x', 'y')) + geom_point(**kwargs) + \
            geom_point(aes('bad_x', 'bad_y'), color='red', size=90)
    return plot
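plot_trip depends on a `check_trip_data_quality` helper and on `check_velocity` / `check_angular_velocity` boolean columns that are not shown. A minimal sketch of what the quality check might look like under those assumptions:

```python
def check_trip_data_quality(trip_data_frame):
    """Hypothetical stand-in: a trip is 'good' when every point passes both
    per-point sanity flags assumed by plot_trip above."""
    required = ['check_velocity', 'check_angular_velocity']
    if not all(col in trip_data_frame.columns for col in required):
        return False
    return bool(trip_data_frame['check_velocity'].all() and
                trip_data_frame['check_angular_velocity'].all())
```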
def plot_dayofweek_data(filename):
    """ Scatter plot of ridership by day-of-week """
    turnstile_weather = pandas.read_csv(filename)
    print ggplot(turnstile_weather, aes("day_week", "ENTRIESn_hourly")) + \
        geom_point(size=5.0, color="red") + \
        xlab("Day of the week") + \
        ggtitle("Ridership by day of the week") + \
        scale_x_continuous(breaks=[0, 1, 2, 3, 4, 5, 6],
                           labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]) + \
        ylim(0)
    return
from __future__ import print_function
from ggplot import *

print(ggplot(meat, aes(x='date', y='beef')) +
      stat_smooth() +
      scale_x_date(labels=date_format('%Y')))

print(ggplot(meat, aes(x='date', y='beef')) +
      stat_smooth(method='ma', window=12) +
      scale_x_date(labels=date_format('%Y')))
print()
print('TN = {}'.format(TN))
print('FP = {}'.format(FP))
print('FN = {}'.format(FN))
print('TP = {}'.format(TP))
print()

FPR, TPR, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
roc_auc = auc(FPR, TPR)

# ROC Curve (using matplotlib)
plt.figure()
plt.plot(FPR, TPR, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# ROC Curve (using the ggplot port for Python; run this in an IPython notebook)
from ggplot import *
df = pd.DataFrame(dict(fpr=FPR, tpr=TPR))
ggplot(df, aes(x='fpr', y='tpr')) + geom_line() + geom_abline(linetype='dashed')
#!/usr/bin/env python
# encoding: utf-8

"""
@version: python3.7
@author: JYFelt
@license: Apache Licence
@contact: [email protected]
@site: https://blog.csdn.net/weixin_38034182
@software: PyCharm
@file: ggplot_demo.py
@time: 2019/8/15 17:16
"""
from ggplot import *

p = ggplot(mtcars, aes('mpg', 'wt', color='factor(cyl)')) + geom_point() + ggtitle('mtcars')
print(p)
import pandas as pd
from ggplot import *

workingDirectory = "/home/owen/Dropbox/graphMining/"

# read_csv replaces the deprecated DataFrame.from_csv
df = pd.read_csv(workingDirectory + "trafficData2.tsv", sep='\t', header=0, index_col=False)

# datetime.strptime only handles single strings; convert the whole column with to_datetime
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %X')

p = ggplot(aes(x='timestamp'), data=df)
print(p + geom_histogram(binwidth=30))
from ggplot import *

print ggplot(diamonds, aes('carat', 'price')) + stat_smooth(method='lm')

print ggplot(diamonds, aes('price')) + stat_density()
# <markdowncell> # We're going to be using a RFC at first, so I'm swtiching the 'days on disabled list' metric to a simple injured boolean. # <codecell> data.columns # <codecell> data['InjuredBool'] = data['Days'] >= 1 # <codecell> injury_days_chart = ggplot(aes(x='playerid', y='Days'), data=data) + geom_point() injury_days_chart # <codecell> X_cols = [col for col in data.columns if col not in ['InjuredBool', 'Days']] X = data[X_cols] y = data.InjuredBool # <codecell> objects = [] for each_col in X: if X[each_col].dtype == 'object': objects.append(each_col)
try: qual=float(qual) except: continue else: if vcf_list[6]=='PASS' and qual>= 25: try: POS=int(vcf_list[1]) except: continue if POS <= window*n: type_info+=('\t' + vcf_list[7]) heho_info+=('\t' + vcf_list[9].split(':')[0]) else: window_point=window*n - window/2 row=VariantStat(del_pattern, ins_pattern, HE_pattern, HO_pattern, type_info, heho_info, outputfilename, window_point) pos_list.append(row[0]), heho_list.append(row[1]), del_list.append(row[2]), ins_list.append(row[3]) type_info=vcf_list[7] heho_info=vcf_list[9].split(':')[0] n+=1 # Plotting using ggplot for Python plotting_data=DataFrame({'pos':pos_list, 'heho':heho_list, 'del':del_list, 'ins':ins_list}) heho_p=ggplot(aes(x = 'pos', y = 'heho'), data= plotting_data) + geom_point() + ggtitle('heterozygous / (heterozygous + homozygous)') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low=0, high=3.7e8) + scale_y_continuous('Percent (%)', breaks = [25, 50, 75, 100]) + ylim(low = 0, high = 100)# + theme(axis.title.x = element_text()) heho_p.save('heho_ratio_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE) del_p=ggplot(aes(x = 'pos', y = 'del'), data= plotting_data) + geom_point() + ggtitle('Deletion') + ylab('Count') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low = 0, high = 3.7e8) + scale_y_continuous('Count', breaks = [400, 800, 1200]) + ylim(low = 0) del_p.save('del_stat_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE) ins_p=ggplot(aes(x = 'pos', y = 'ins'), data= plotting_data) + geom_point() + ggtitle('Insertion') + ylab('Count') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low = 0, high = 3.7e8) + scale_y_continuous('Count', breaks = [400, 800, 1200]) + ylim(low = 0) ins_p.save('ins_stat_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE)
import os
import pandas as pd
from ggplot import *

lst = []
for test_d in filter(lambda x: x.startswith('test-'), os.listdir('.')):
    for fill in os.listdir(test_d):
        if not fill.startswith('train-'):
            continue
        num = int(fill.split('-')[1])
        with open(os.path.join(os.path.join(test_d, fill), 'result.filtered'), "rt") as fd:
            #fd.readline()
            error = float(fd.readline().split(':')[1].strip())
        lst.append((num, error))

with open("match.csv", "wt") as fd:
    fd.write("size,error\n")
    for size, error in sorted(lst):
        fd.write("%d, %f\n" % (size, error))

# Read back the file that was just written (the original read 'test.csv' here).
dataframe = pd.read_csv("match.csv")
gg = ggplot(aes(x='size', y='error'), data=dataframe) + \
    geom_point(color='lightblue') + \
    stat_smooth(span=.15, color='black', se=True) + \
    ggtitle("Germline data") + \
    xlab("References count") + \
    ylab("Error rate")
print(gg)
plt.figure(figsize=(12,12)) sns.jointplot(x=train_df['taxamount'].values, y=train_df['logerror'].values, size=10, color='g') plt.ylabel('Log Error', fontsize=12) plt.xlabel('Tax Amount', fontsize=12) plt.title("Tax Amount Vs Log error", fontsize=15) plt.show() from ggplot import * ggplot(aes(x='yearbuilt', y='logerror'), data=train_df) + \ geom_point(color='steelblue', size=1) + \ stat_smooth() ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df) + \ geom_point() + \ scale_color_gradient(low = 'red', high = 'blue') ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df) + \ geom_point(alpha=0.7) + \ scale_color_gradient(low = 'pink', high = 'blue') ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df) + \
#clf=svc
clf.fit(X, y)
print "Accuracy of the model -"
print clf.score(X, y)
print clf.score(X_1, y_1)

#--------------------------- ROC CURVE ---------------------------
#clf.probability=True
preds = clf.predict_proba(X_1)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_1, preds)

df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
graph = ggplot(df, aes(x='fpr', y='tpr')) + geom_line(color="blue", size=3) + geom_abline(linetype='dashed')
print graph

#------------- AUC
auc = metrics.auc(fpr, tpr)
print auc

#------------- Precision and Recall -------------
print "Calculating Precision and Recall..."
y_2 = clf.predict(X_1)
y_3 = np.array(y_1)
false_pos = 0
false_neg = 0
from ggplot import *

print ggplot(aes(x='date', y='beef'), data=meat) + \
    geom_line()

plt.show(1)
# For visualization keep 2 principal components print(pca.explained_variance_ratio_[0:2]) #variance explained by first two PCs #x = ctdf.loc[:, ~ctdf.columns.isin(['SubjectId', 'Age'])] firstTwoPCs = pd.DataFrame(data=pca.components_[:, :2], columns=['PC1', 'PC2']) pcScores = pd.DataFrame(data=np.dot(x, firstTwoPCs), columns=['PC1', 'PC2']) ages = ctdf.loc[:, ['Age']].reset_index() pcScores = pd.concat([pcScores, ages['Age']], axis=1) pcScores.head() # In[13]: from ggplot import * #from ggplot import scale_fill_brewer chart = ggplot(pcScores, aes(x='PC1', y='PC2', color='Age')) + geom_point( size=75, alpha=0.8) + ggtitle( "First and Second Principal Components colored by digit") chart # In[ ]: import time from sklearn.manifold import TSNE time_start = time.time() tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300) tsne_results = tsne.fit_transform(features.values) print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
def plot_weather_data(version=1, nbins=18): ''' plot_weather_data is passed a dataframe called turnstile_weather. Use turnstile_weather along with ggplot to make another data visualization focused on the MTA and weather data we used in Project 3. Make a type of visualization different than what you did in the previous exercise. Try to use the data in a different way (e.g., if you made a lineplot concerning ridership and time of day in exercise #1, maybe look at weather and try to make a histogram in this exercise). Or try to use multiple encodings in your graph if you didn't in the previous exercise. You should feel free to implement something that we discussed in class (e.g., scatterplots, line plots, or histograms) or attempt to implement something more advanced if you'd like. Here are some suggestions for things to investigate and illustrate: * Ridership by time-of-day or day-of-week * How ridership varies by subway station * Which stations have more exits or entries at different times of day If you'd like to learn more about ggplot and its capabilities, take a look at the documentation at: https://pypi.python.org/pypi/ggplot/ You can check out the link https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv to see all the columns and data points included in the turnstile_weather dataframe. However, due to the limitation of our Amazon EC2 server, we are giving you a random subset, about 1/3 of the actual data in the turnstile_weather dataframe. ''' turnstile_weather = read_csv_data(version) # bins entries_max = turnstile_weather.ENTRIESn_hourly.max() entries_min = turnstile_weather.ENTRIESn_hourly.min() bins = np.linspace(entries_min, entries_max, nbins) binwidth = bins[1] - bins[0] # probabilities for number of entries per hour on dry days dry = turnstile_weather.ENTRIESn_hourly[turnstile_weather.rain == 0] (hist_dry, bin_edges) = np.histogram(dry, bins) prob_dry = hist_dry / float(np.sum(hist_dry)) # probabilities for number of entries per hour on rainy days rainy = turnstile_weather.ENTRIESn_hourly[turnstile_weather.rain == 1] (hist_rain, bin_edges) = np.histogram(rainy, bins) prob_rain = hist_rain / float(np.sum(hist_rain)) # plot histograms on rainy and dry days plt.hist(np.array(dry), bins=bins, color='w', label='dry') plt.hist(np.array(rainy), bins=bins, label='rainy') plt.legend() plt.title('Histograms of number of entries per hour on rainy and dry days') plt.xlabel('Number of entries per hour') plt.xlim(0, 20000) plt.ylabel('Count') plt.ylim(-1000, 55000) plt.show() # probablities when rainy - probabilities when dry prob_diff = prob_rain - prob_dry # improvised bar plot # geom_bar with stat='identity' does not seem to work # data frame for ggplot df = DataFrame({ 'xmin': bins[:-1], 'xmax': bins[1:], 'ymin': prob_diff * ((prob_diff < 0).astype(float)), 'ymax': prob_diff * ((prob_diff > 0).astype(float)), 'sign': np.sign(prob_diff) }) # plot difference in probabilities on rainy and dry days plot = ggplot(df,aes(xmin='xmin',xmax='xmax',ymin='ymin',ymax='ymax',fill='sign')) + \ geom_rect() + \ geom_hline(yintercept=0,color='black') + \ ggtitle('Probabilty of entries/hr when rainy - probability of entries/hr when dry') + \ xlab('E = Entries/hr') + \ ylab('P(E | rainy) - P(E | dry)') return plot
verbose=False)

xs, drs, acts = [], [], []  # reset array memory

if episode_number % 20 == 0:
    check_weight_1 = np.asarray(kmodel.get_weights()[1])[0].sum()
    check_weight_2 = np.asarray(kmodel.get_weights()[3])[0].sum()
    print('Weight check 1: {}, weight check 2: {}'.format(check_weight_1, check_weight_2))

if episode_number % save_model_freq == 0:
    kmodel.save_weights("Models/" + model_name + ".h5")
    game_history_plot = pd.DataFrame(game_history, columns=["Ep", "Score"])
    game_history_plot["EMA100"] = game_history_plot["Score"].ewm(span=100).mean()
    newplot = (ggplot(aes(x="Ep", y="Score"), data=game_history_plot) +
               geom_point(color="green") +
               geom_line(aes(x="Ep", y="EMA100"), color="blue") +
               geom_hline(y=0, color="darkorange") +
               ggtitle(model_name))
    newplot.save("Plots/" + model_name + ".png")

if running_reward > running_best:
    running_best = running_reward
    kmodel.save_weights("Models/BEST_" + model_name + ".h5")

reward_sum = 0
observation = env.reset()  # reset env
prev_x = None
def test_scale_facet_wrap_visual():
    p = ggplot(aes(x="price"), data=diamonds) + geom_histogram()
    assert_same_ggplot(p + facet_wrap("cut", scales="free"), "free")
    assert_same_ggplot(p + facet_wrap("cut", scales="free_x"), "free_x")
    assert_same_ggplot(p + facet_wrap("cut", scales="free_y"), "free_y")
    assert_same_ggplot(p + facet_wrap("cut", scales=None), "none")
####### Import Packages ######### import os import numpy as np import pandas as pd from ggplot import * ####### Set Simulation Parameters ######### os.chdir("/Users/bradley/SpeedTest") np.random.seed(123) # set the seed to ensure reproducibility N = 1000 # set number of agents in economy gamma = .5 # set Cobb-Douglas relative preference for consumption tau = .2 # set tax rate ####### Draw Income Data and Optimal Consumption and Leisure ######### epsilon = np.random.normal(size=N) # draw unobserved non-labor income wage = 10+np.random.normal(size=N) # draw observed wage consump = gamma*(1-tau)*wage + gamma*epsilon # Cobb-Douglas demand for c leisure = (1.0-gamma) + ((1.0-gamma)*epsilon)/((1.0-tau)*wage) # Cobb-Douglas demand for l ####### Organize, Describe, and Export Data ######### df = pd.DataFrame() df['consump'] = consump df['leisure'] = leisure df['wage'] = wage df['epsilon'] = epsilon plot_c = ggplot(aes(x='wage',y='consump'),data=df) + stat_smooth() ggsave(plot_c,"plot_c.svg") df.to_csv("consump_leisure.csv", index=False)
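The consumption and leisure formulas above are the standard Cobb-Douglas demands from maximizing U = c^gamma * l^(1-gamma) subject to c = (1-tau)*w*(1-l) + epsilon, with the time endowment normalized to 1. A quick numeric check of that derivation, using illustrative values (not taken from the simulated data):

```python
import numpy as np

gamma, tau, w, eps = 0.5, 0.2, 10.0, 0.3
full_income = (1 - tau) * w + eps                  # value of the time endowment plus non-labor income
c = gamma * full_income                            # spend share gamma on consumption (price 1)
l = (1 - gamma) * full_income / ((1 - tau) * w)    # spend share 1-gamma on leisure (price (1-tau)*w)

# These match the closed forms used in the script above.
assert np.isclose(c, gamma * (1 - tau) * w + gamma * eps)
assert np.isclose(l, (1 - gamma) + (1 - gamma) * eps / ((1 - tau) * w))
```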
from ggplot import *

p = ggplot(mtcars, aes('cyl')) + geom_bar()
print(p)
print(p + theme_bw())
print(p + theme_xkcd())
print(p + theme_matplotlib())
plt.show(1)
# 'import ggplot' alone would leave the name 'ggplot' bound to the module, so the
# class is imported explicitly along with the other names used below.
from ggplot import ggplot, aes, meat, geom_line, stat_smooth

ggplot(aes(x='date', y='beef'), data=meat) +\
    geom_line() +\
    stat_smooth(colour='blue', span=0.2)

'''
ggplot(diamonds, aes(x='carat', y='price', color='cut')) +\
    geom_point() +\
    scale_color_brewer(type='diverging', palette=4) +\
    xlab("Carats") + ylab("Price") + ggtitle("Diamonds")

ggplot(diamonds, aes(x='price', fill='cut')) +\
    geom_density(alpha=0.25) +\
    facet_wrap("clarity")
'''
NA_Count = pd.DataFrame({'Sum of NA': df.isnull().sum()}).sort_values(by=['Sum of NA'], ascending=[0]) NA_Count['Percentage'] = NA_Count['Sum of NA'] / df.shape[1] print(sum(NA_Count['Percentage'])) from sklearn.model_selection import train_test_split train, test = train_test_split(df, test_size=0.2, random_state=42) cat = ['waterfront', 'view', 'condition', 'grade'] con = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15'] from ggplot import * lonlat = ggplot(train, aes(x='long', y='lat', color='price')) + geom_point() + scale_color_gradient(low='white', high='red') + ggtitle( 'Color Map of Price') print(lonlat) lonprice = ggplot(train, aes(x='long', y='price')) + geom_point() + ggtitle('Price VS Longitude') print(lonprice) bedroom_price = ggplot(train, aes(x='bedrooms', y='price')) + geom_point() + ggtitle('Price VS No. of bedrooms') print(bedroom_price) year_built_price = ggplot(train, aes(x='yr_built', y='price')) + geom_point() + ggtitle('Price VS year built') print(year_built_price) def centralize_long(lon):
from ggplot import *

ggplot(aes(x='date', y='beef'), data=meat) +\
    geom_line() +\
    stat_smooth(colour='blue', span=0.2)
history=model.fit({'seq_input':train['seq']},train_output_dict, validation_data=({'seq_input':val['seq']},val_output_dict), nb_epoch=200, batch_size=100, callbacks=[early_stopping,checkpoint,reduce_lr], verbose=1) with open('%s/history.pkl'%(log_dir),'wb') as f: pickle.dump([history.history],f) with open('%s/history.pkl'%(log_dir),'rb') as f: x=pickle.load(f) # Plot the learning curve: history=pd.DataFrame(x[0]) history['epoch']=(range(1,history.shape[0]+1)) history_melt=pd.melt(history,id_vars=['epoch'],value_vars=['loss','val_loss'],var_name='type',value_name='loss') p1=ggplot(history_melt,aes('epoch','loss',color='type'))+geom_line()+theme_bw() p1.save(filename='%s/learning_curve.png'%(fig_dir)) # Plot prediction vs ground truth: pred=model.predict({'seq_input':test['seq'],'reg_input':test['reg']},batch_size=100,verbose=1) plt.scatter(pred,test['expr']) plt.savefig("%s/pred_vs_obs.png"%(fig_dir)) output=np.column_stack((test['expr'], pred[:,0])) np.savetxt("%s/prediction.txt"%(out_dir), output,delimiter='\t')
def box_plot(var):
    # The original 'pt = a = ...' double assignment and 'return print(...)' were redundant.
    pt = ggplot(train, aes(x=var, y='price')) + geom_boxplot() + theme_bw() + \
        ggtitle('Boxplot of ' + var + ' and price')
    print(pt)
    return pt
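A hypothetical call, assuming the module-level `train` DataFrame from the house-price snippet earlier and one of the categorical columns listed there:

```python
# 'grade' and 'condition' come from the cat list defined in the earlier snippet.
box_plot('grade')
box_plot('condition')
```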
''' #TSNE 2d n_sne = int(size) time_start = time.time() tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=400) tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne], feat_cols].values) print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start)) df_tsne = df.loc[rndperm[:n_sne], :].copy() df_tsne['x-tsne'] = tsne_results[:, 0] df_tsne['y-tsne'] = tsne_results[:, 1] #plot t-SNE 2d chart2 = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \ + geom_point(size=20,alpha=0.7) \ + ggtitle("tSNE dimensions colored by digit") #write chart in console to visulize chart2 chart2 ''' #TSNE 3d n_sne = int(size) time_start = time.time() tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=400) tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne],feat_cols].values) print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
# Create a dataframe df=pd.DataFrame({"Animal":["dog","dolphin","chicken","ant","spider"], "Legs":[4,0,2,6,8]}) df.head() ##################################################################################### # ggplot examples pip.main(['install', 'ggplot']) #from ggplot import ggplot, aes, geom_bar, geom_line, stat_smooth from ggplot import * # bar chart ggplot(df, aes(x="Animal", weight="Legs")) + geom_bar(fill='blue') # line chart with smoothing ggplot(aes(x='date', y='beef'), data=meat) + geom_line() + stat_smooth(colour='blue', span=0.2) # scatter points ggplot(diamonds, aes(x='carat', y='price', color='cut')) +\ geom_point() +\ scale_color_brewer(type='diverging', palette=4) +\ xlab("Carats") + ylab("Price") + ggtitle("Diamonds") # density and facets ggplot(diamonds, aes(x='price', fill='cut')) +\
from pandas import *
from ggplot import *
import pprint
import csv
import itertools
import ggplot as gg
import numpy as np
import pandas as pd
from datetime import datetime, date, time

# Keep the path on one line: splitting the string with a backslash continuation
# silently embeds the next line's leading spaces in the filename.
turnstile_weather = pd.read_csv(
    "C:/move - bwlee/Data Analysis/Nano/Intro to Data Science/project/code/turnstile_data_master_with_weather.csv")

plot = ggplot(turnstile_weather, aes(x='ENTRIESn_hourly', y='EXITSn_hourly', color='Hour')) \
    + geom_point() \
    + scale_color_brewer(type='diverging', palette=4) \
    + xlab("Entries") \
    + ylab("Exits") \
    + ggtitle("Entries vs Exits by hour")
#print plot

df = DataFrame({"rain": turnstile_weather[turnstile_weather['rain'] == 1]['ENTRIESn_hourly'],
                "no_rain": turnstile_weather[turnstile_weather['rain'] == 0]['ENTRIESn_hourly']}).fillna(0)
df = melt(df)

plot = ggplot(aes(x='value', color='variable'), data=df) \
    + geom_histogram(binwidth=400) \
    + scale_y_log() \
    + ylab("Frequency") \
    + xlab("Entries Per Hour") \
    + ggtitle("Entries Per Hour vs Frequency")
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ggplot import *

print ggplot(mtcars, aes(x='mpg')) + geom_histogram() + xlab("Scrüm")
def test_scale_facet_wrap_internals(): def convertText(t): """Return a float for the text value of a matplotlib Text object.""" try: return float(t.get_text()) except: # don't mask the error, just let the assert raise the test failure return 0 def empty(t): """Return True if the Text object is an empty string.""" return len(t.get_text().strip()) == 0 p = ggplot(aes(x="price"), data=diamonds) + geom_histogram() # Only p2 has the new measures for column! p2 = p + facet_wrap("cut", scales="free") print(p2) # FIXME: n_high is the number of columns, not rows, because n_high and # n_wide are being passed backwards to plt.subplot in ggplot.py columns = p2.n_high fig = plt.gcf() # When the scales are free, every plot should have x and y labels. Don't # test the tick values because each plot is free to set its own. for ax in fig.axes: assert_true(len(ax.get_xticklabels()) > 0) assert_true(len(ax.get_yticklabels()) > 0) print(p + facet_wrap("cut", scales="free_x")) fig = plt.gcf() yticks = fig.axes[0].get_yticks() for pos, ax in enumerate(fig.axes): # When only the x-axis is free, all plots should have the same y scale assert_true(all(ax.get_yticks() == yticks)) if pos % columns == 0: # Only plots in the first column should have y labels assert_true( all(list(map(convertText, ax.get_yticklabels())) == yticks)) else: # Plots in all other columns should have no labels assert_true(all(map(empty, ax.get_yticklabels()))) # Every plot should have labels on its x-axis assert_true(len(ax.get_xticklabels()) > 0) print(p + facet_wrap("cut", scales="free_y")) fig = plt.gcf() xticks = fig.axes[0].get_xticks() subplots = len(fig.axes) for pos, ax in enumerate(fig.axes): assert_true(all(ax.get_xticks() == xticks)) if subplots - pos > columns: # Only the bottom plot of each column gets x labels. So only the # last N plots (where N = number of columns) get labels. assert_true(all(map(empty, ax.get_xticklabels()))) else: assert_true( all(list(map(convertText, ax.get_xticklabels())) == xticks)) # All plots should have y labels assert_true(len(ax.get_yticklabels()) > 0) print(p + facet_wrap("cut", scales=None)) fig = plt.gcf() xticks = fig.axes[0].get_xticks() yticks = fig.axes[0].get_yticks() for pos, ax in enumerate(fig.axes): # Every plot should have the same x and y scales assert_true(all(ax.get_xticks() == xticks)) assert_true(all(ax.get_yticks() == yticks)) # Repeat the tests for labels from both free_x and free_y if subplots - pos > columns: assert_true(all(map(empty, ax.get_xticklabels()))) else: assert_true( all(list(map(convertText, ax.get_xticklabels())) == xticks)) if pos % columns == 0: assert_true( all(list(map(convertText, ax.get_yticklabels())) == yticks)) else: assert_true(all(map(empty, ax.get_yticklabels())))
#seaborn import seaborn as sns xt1 = pd.crosstab(mtcarsDF.cyl, mtcarsDF.gear) xt1 sns.heatmap(xt1, cmap='YlGnBu', annot=True, cbar=False) xt2 = pd.crosstab(index=mtcarsDF.gear, columns=[mtcarsDF.am, mtcarsDF.vs], rownames=['Gear'] , colnames =['AM','VS']) xt2 sns.heatmap(xt2) sns.heatmap(xt2, cmap='YlGnBu', annot=True, cbar=False) #ggplot #pip install ggplot from ggplot import * ggplot(data=mtcarsDF, mapping= aes(x='wt', y='mpg')) + geom_point(colour='r') #error tslib https://github.com/yhat/ggpy/issues/662 #%% save to/from excel mtcarsDF.to_csv('mtcars.csv') #check the folder in working dir tab mtcarsDF.to_excel('mtcars.xlsx', sheet_name='mtcars1') mtcarsDF.to_clipboard() #clipboard, paste it anywhere import matplotlib.pyplot as plt #scatter plot plt.scatter(x=mtcarsDF.wt, y=mtcarsDF.mpg) plt.scatter(x='wt', y='mpg', data=mtcarsDF) plt.scatter(x='wt', y='mpg', data=mtcarsDF, label='MTCars : wt vs mpg')
def lineplot_compare():
    df = pandas.read_csv('hr_by_team_year_sf_la.csv')
    print(ggplot(df, aes(x='yearID', y='HR', color='teamID')) + geom_line())
# print(len(baby_unisex_names))  # 10221
baby_unisex_names_str = ', '.join(str(unisex_name) for unisex_name in baby_unisex_names)
print("%r is %r" % ("Unisex names: ", str(baby_unisex_names_str)))

# b) Calculate the share of unisex names, relative to all other names
# count occurrences of unisex names vs total names
list_all_baby_unisex = list(
    data_frame_baby[data_frame_baby['Name'].isin(baby_unisex_names)]['Name'])
total_baby_unisex = len(list_all_baby_unisex)
print(total_baby_unisex)
print(list_all_baby_unisex)

# data_frame.groupby(['occupation', 'gender']).size()
# "group by" involves split-apply-combine:
#   Splitting the data into groups based on some criteria.
#   Applying a function to each group independently.
#   Combining the results into a data structure.

# c) for a unisex name, plot name vs time with legend being gender
# data_frame_baby[data_frame_baby['Gender'] == 'F']
# data_frame_baby.loc[data_frame_baby["Gender"] == 'F', ["Name"]]  # 1081683 rows x 1 columns
# data_frame_baby[data_frame_baby['Gender'] == 'Unisex']

# (The next two statements had been fused onto the end of the comment above.)
x, y = np.random.random((2, num))
ggplot(aes(x='x', y='y', color='gender'), data=df) +\
    geom_point(size=50) +\
    theme_bw()