def main():
    params = load_params("./params.txt")
    test_data = load_data(datetime.date(2017, 12, 20))
    sim_data = simulate(test_data[0], params)
    dif_data = [0 for i in range(0, len(test_data[0]))]
    for i in range(0, len(dif_data) - 3):
        dif_data[i] = test_data[0][i + 3] - sim_data[i]
    df = pandas.DataFrame({'t': range(6, len(test_data[0])), 'price': test_data[0][6:]})
    df2 = pandas.DataFrame({'t': range(12, len(sim_data) - 12), 'price': sim_data[12:-12]})
    df3 = pandas.DataFrame({'t': range(12, len(sim_data) - 12), 'price': dif_data[12:-12]})
    a = ggplot.ggplot(ggplot.aes(x='t', y='price'), data=df) + ggplot.geom_line()
    b = ggplot.ggplot(ggplot.aes(x='t', y='price'), data=df2) + ggplot.geom_line(color='blue')
    c = ggplot.ggplot(ggplot.aes(x='t', y='price'), data=df3) + ggplot.geom_line(color='blue')
    a.save('hoge.png')
    b.save('hoge2.png')
    c.save('hoge3.png')
def plot(self, what='cumulative_payouts', include_ci=True):
    import ggplot as gg  # This is hacky ... need to DRY out the imports
    if what == 'cumulative_payouts':
        plt = self._plot_cumulative_payouts(include_ci=include_ci)
    elif what == 'avg_accuracy':
        plt = self._plot_avg_accuracy(include_ci=include_ci)
    elif what == 'all':
        summary = self.summary()
        p1 = self._plot_cumulative_payouts(include_ci=include_ci, summary=summary)
        p2 = self._plot_avg_accuracy(include_ci=include_ci, summary=summary)
        d1 = p1.data
        d2 = p2.data
        d1['Outcome'] = d1['AverageCumulativePayout']
        d2['Outcome'] = d2['AverageAccuracy']
        d1['Plot'] = 'Cumulative Payouts'
        d2['Plot'] = 'Average Accuracy'
        df = d1.append(d2, ignore_index=True)
        if include_ci:
            plt = gg.ggplot(gg.aes(x='Round', y='Outcome', ymin='ymin', ymax='ymax'), data=df) + \
                gg.geom_area(alpha=0.5)
        else:
            plt = gg.ggplot(gg.aes(x='Round', y='Outcome'), data=df)
        plt += gg.facet_grid('Plot', scales='free')
    else:
        raise ValueError('%s is not a valid option' % what)
    return plt + gg.geom_line()
def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates
    x_lbl = ['Observed Volume' for i in xrange(len(x))]
    xt_lbl = ['Overall Trend' for i in xrange(len(x_trend))]
    xs_lbl = ['Repeat Sending Trend' for i in xrange(len(season))]
    col3 = pd.DataFrame(x_lbl + xt_lbl + xs_lbl)
    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    p = ggplot.ggplot(ggplot.aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()
    return p
def plot_matches(df_in, date, filename_out, x_var='date_time', y_var="shorthand_search_vol"):
    """
    Plot y-var and save based on specified variables.
    Assumes that df has already been filtered using dplyr's sift mechanism.
    Also assumes that a date has been passed in.
    """
    # basic data processing for viz
    df_in['date_time'] = date + " " + df_in['time'].astype(str)
    df_in['date_time'] = pd.to_datetime(df_in['date_time'], errors="coerce", infer_datetime_format=True)

    # build layers for plot
    p = ggplot(aes(x=x_var, y=y_var, group="match_id", color="match_id"), data=df_in)
    p += geom_line(size=2)

    # informative
    p += labs(x="time (gmt)", y="search volume (scaled to 100)")
    # p += ggtitle("man. city (h) vs. chelsea (a)\naug. 8 '16, etihad stadium")
    p += scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes")

    # visual
    t = theme_gray()
    t._rcParams['font.size'] = 8
    t._rcParams['font.family'] = 'monospace'
    p += t

    # done
    p.save(filename_out, width=16, height=8)
def plot_line(X, y, title=None, labelx=None, labely=None, save=False, colors=None):
    '''
    Show on screen a line plot. Can save to a .pdf file too if specified.
    X, y - the x values and a DataFrame of y series to plot.
    '''
    df = pandas.DataFrame()
    if title is not None:
        img_title = title.replace(" ", "").replace(".", "-") + ".pdf"
    df['X'] = X
    for i in range(y.shape[1]):
        df[str(i)] = y.iloc[:, i].values
    if colors is None:
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())
    df = df.iloc[0:df.shape[0] - 1, :]
    p = ggplot(df, aes(x='X'))
    for i in range(y.shape[1]):
        if colors not in X.columns.values:
            p = p + geom_line(aes(y=str(i), color=colors[i]))
        else:
            p = p + geom_point(aes(y=str(i), color=colors))
    p = p + xlab(labelx) + ylab(labely) + ggtitle(title)
    if save:
        p.save(img_title)
    else:
        return p
def plotSetOfArrays(arrays, names, fileName):
    IDS = np.linspace(0, 1, arrays[0].shape[0])
    A = IDS.reshape(arrays[0].shape[0], 1)
    for i in range(0, len(arrays)):
        A = np.concatenate((A, arrays[i]), axis=1)
    Data = pd.DataFrame(A, columns=['noise'] + names)
    Melted = pd.melt(Data, id_vars=['noise'])
    pv = ggplot.ggplot(ggplot.aes(x='noise', y='value', colour='variable'), data=Melted) + \
        ggplot.geom_line() + ggplot.geom_point()
    ggplot.ggsave(pv, './IMG/' + fileName)

    output_file("iou_scores.html", title="correlation.py example")
    figure(tools="pan,wheel_zoom,box_zoom,reset,previewsave")
    hold()
    line(IDS, arrays[0][:, 0], color='#A6CEE3', legend=names[0])
    line(IDS, arrays[1][:, 0], color='#1F78B4', legend=names[1])
    line(IDS, arrays[2][:, 0], color='#B2DF8A', legend=names[2])
    line(IDS, arrays[3][:, 0], color='#33A02C', legend=names[3])
    line(IDS, arrays[4][:, 0], color='#fb9a99', legend=names[4])
    curplot().title = "Minimum IOU"
    grid().grid_line_alpha = 0.3
    show()
def plot_roc(self, experiment_type, to_plot):
    # turn this to string for categorical colour scheme
    to_plot.loc[:, "parameter"] = [str(par) for par in to_plot.loc[:, "parameter"]]
    p = gg.ggplot(data=to_plot, aesthetics=gg.aes(x="FPR", y="TPR", colour="parameter")) + \
        gg.geom_line(gg.aes(x="FPR", y="TPR", colour="parameter")) + \
        gg.ggtitle(experiment_type) + gg.xlab("FPR") + gg.ylab("TPR")
    gg.ggsave(filename=self.results_path + experiment_type + "_" + self.mode + ".png", plot=p)
    return
def plot_cost_history(alpha, cost_history):
    cost_df = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': range(len(cost_history))
    })
    return gp.ggplot(cost_df, gp.aes('Iteration', 'Cost_History')) + \
        gp.geom_point() + gp.geom_line() + \
        gp.ggtitle('Cost History for alpha = %.3f' % alpha)
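# A minimal usage sketch for plot_cost_history above (added for illustration; the cost values
# and output filename are made up, and the function relies on `pandas` and `gp` (ggplot)
# already being imported in its module, as in the snippet it comes from).
example_costs = [10.0, 6.2, 4.1, 3.0, 2.5]  # e.g. per-iteration gradient-descent cost
cost_plot = plot_cost_history(alpha=0.1, cost_history=example_costs)
cost_plot.save('cost_history.png')  # a yhat-ggplot object can be saved, or shown with print()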
def plot_weather_data(df):
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'
    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly'))
    plot += gp.geom_line()
    plot += gp.ggtitle('Subway Ridership by Day')
    plot += gp.xlab('Date')
    plot += gp.ylab('Exits')
    return plot
def plot_weather_data(df):
    # older version
    df.DATEn = pd.to_datetime(df.DATEn)
    grouped = df.groupby('DATEn', as_index=False).sum()
    grouped.index.name = 'DATEn'
    p_title = 'Subway Ridership by Hour vs Raining'
    p_xlab = 'Hour of the Day'
    p_ylab = 'Subway Entries'
    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly')) + \
        gp.geom_line() + gp.ggtitle(p_title) + gp.xlab(p_xlab) + gp.ylab(p_ylab)
    return plot
def lineplot_compare(filename):
    df = pd.read_csv(filename)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle("Homeruns by Year by Team") +
        gp.xlab("Year") +
        gp.ylab("Homeruns")
    )
    return gg
def lineplot(hr_year_csv):
    df = pandas.read_csv(hr_year_csv)
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR")) +
        gp.geom_point(color="red") +
        gp.geom_line(color="red") +
        gp.ggtitle("Homeruns by Year") +
        gp.xlab("Year") +
        gp.ylab("Homeruns")
    )
    return gg
def show_price(self):
    price_table = pd.DataFrame(
        {'time_step': range(len(self.price)), 'price': self.price},
        columns=['time_step', 'price'])
    p = gp.ggplot(gp.aes(x='time_step', y='price'), data=price_table) + \
        gp.geom_line() + \
        gp.xlim(0, len(self.price)) + \
        gp.ggtitle('Price trend')
    print(p)
def show_asset(self):
    asset_table = pd.DataFrame(
        {'time_step': range(len(self.asset_history)), 'asset': self.asset_history},
        columns=['time_step', 'asset'])
    p = gp.ggplot(gp.aes(x='time_step', y='asset'), data=asset_table) + \
        gp.geom_line() + \
        gp.xlim(0, len(self.asset_history)) + \
        gp.ggtitle('Asset trend')
    print(p)
def plotHistogramMeans(hist, fileName):
    num_clust = hist.shape[0]
    IDS = np.mat(range(0, num_clust))
    IDS = IDS.reshape(num_clust, 1)
    histD = np.concatenate((IDS, hist), axis=1)
    Data = pd.DataFrame(histD, columns=['ID'] + range(0, hist.shape[1]))
    Melted = pd.melt(Data, id_vars=['ID'])
    pv = ggplot.ggplot(ggplot.aes(x='variable', y='value'), data=Melted) + \
        ggplot.geom_line() + ggplot.facet_wrap("ID")
    print "Saving mean histograms"
    ggplot.ggsave(pv, './IMG/' + fileName)
def plot(self):
    dat = []
    for traj in self.trajs:
        rec = traj.to_np_array()
        rec_len = rec.shape[0]
        label = [traj.name] * rec_len
        lb_array = np.array(label)
        lb_array = np.expand_dims(lb_array, 1)
        dat.append(np.concatenate([rec, lb_array], axis=1))
    df_data = np.concatenate(dat, axis=0)
    df = pd.DataFrame(data=df_data, columns=['ep', 'value', 'type'])
    p = gp.ggplot(gp.aes(x='ep', y='value', color='type'), data=df) + \
        gp.geom_line() + gp.ggtitle(self.title)
    return p
def lineplot_compare(filename):
    # Cleaner version with string vars
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    p_xlab = "Year"
    p_ylab = "Homeruns"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle(p_title) +
        gp.xlab(p_xlab) +
        gp.ylab(p_ylab)
    )
    return gg
def line_chart(self, conn, column1, column2, table_chosen, title):
    data_df = dfile.double_selector(conn=conn, table=table_chosen, col1=column1, col2=column2)
    line_plot = ggplot(aes(y=column2, x=column1), data=data_df) + \
        geom_line() + theme_gray() + labs(title=title)
    now = datetime.datetime.now()
    b = now
    print(b)
    print(b - a)
    print(line_plot)
def _plot_cumulative_payouts(self, include_ci=True, summary=None):
    import ggplot as gg
    if summary is None:
        summary = self.summary()

    df = pd.DataFrame({'AverageCumulativePayout': summary['CumulativePayout']['Avg'],
                       'Std': summary['CumulativePayout']['Std'],
                       'Round': range(self.n_rounds)})

    if include_ci:
        df['ymin'] = df.AverageCumulativePayout - 1.96 * df.Std
        df['ymax'] = df.AverageCumulativePayout + 1.96 * df.Std
        plt = gg.ggplot(gg.aes(x='Round', y='AverageCumulativePayout', ymin='ymin', ymax='ymax'), data=df) + \
            gg.geom_area(alpha=0.5)
    else:
        plt = gg.ggplot(gg.aes(x='Round', y='AverageCumulativePayout'), data=df)

    return plt + gg.geom_line()
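# Note (added): the ymin/ymax columns above span mean ± 1.96 standard deviations, i.e. an
# approximate 95% band under a normality assumption; gg.geom_area shades that band and the
# trailing gg.geom_line draws the average cumulative payout itself.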
def googletrend_command(delta_t, threshold=0.0, inverse=False):
    """the command to run google trend algorithm.

    :param delta_t: the upper bound for original delta_t parameter
    :param threshold: upper bound for the threshold of differentiating two classes
    :param inverse: whether to inverse the classifier
    """
    ## handle filepath and title based on parameter inverse
    filename = "googletrend"
    titlename = "ROC of google trend classifier"
    if inverse:
        filename += "_inverse"
        titlename += " (inverse version)"
    filepath = "./plots/%s.jpg" % filename

    ## generate data first
    data = googletrend.preprocess()

    ## store classifier evaluation metrics into dict
    output = {}
    output['tpr'] = []
    output['fpr'] = []
    output['plot'] = []
    for thre in np.arange(0, threshold + 0.1, 0.1):
        print "==> threshold: %f, inverse: %s" % (thre, inverse)
        for i in xrange(1, int(delta_t)):
            googletrend.algorithm(data, i, thre, inverse)
            tp_rate, fp_rate = googletrend.evaluate(data)
            # print "delta_t: %d, TPR: %f, FPR: %f" % (i, tp_rate, fp_rate)
            output['tpr'].append(tp_rate)
            output['fpr'].append(fp_rate)
            output['plot'].append('thre_' + str(thre))

    ## plot ROC graph
    ## add a y=x baseline for comparison
    output['tpr'].extend([0.0, 1.0])
    output['fpr'].extend([0.0, 1.0])
    output['plot'].extend(['baseline', 'baseline'])
    df = pd.DataFrame(output)
    graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \
        gg.theme_seaborn() + \
        gg.ggtitle(titlename) + \
        gg.xlab("FPR") + \
        gg.ylab("TPR") + \
        gg.xlim(0.0, 1.0) + \
        gg.ylim(0.0, 1.0) + \
        gg.geom_point() + \
        gg.geom_line()
    gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
def plot_predictions(date_times, actual_values, predictions, match_id, feature_set_in, filename):
    """
    Plot y-var and save based on specified variables.
    Assumes that df has already been filtered using dplyr's sift mechanism.
    Also assumes that a date has been passed in.
    """
    actual_df = pd.DataFrame()
    actual_df['date_time'] = pd.to_datetime(date_times, errors="coerce", infer_datetime_format=True)
    actual_df['search_vol'] = actual_values
    actual_df['match_id'] = "actual" + match_id

    predict_df = pd.DataFrame()
    predict_df['date_time'] = pd.to_datetime(date_times, errors="coerce", infer_datetime_format=True)
    predict_df['search_vol'] = list(predictions)
    predict_df['match_id'] = "predictedby_" + str(feature_set_in) + match_id

    plotting_df = pd.concat([actual_df, predict_df], axis=0, ignore_index=True)

    # build layers for plot
    p = ggplot(aes(x='date_time', y='search_vol', group="match_id", color="match_id"), data=plotting_df)
    p += geom_line(size=2)

    # informative
    p += labs(x="time (gmt)", y="search volume (scaled to 100)")
    # p += ggtitle("man. city (h) vs. chelsea (a)\naug. 8 '16, etihad stadium")
    p += scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes")

    # visual
    t = theme_gray()
    t._rcParams['font.size'] = 8
    t._rcParams['font.family'] = 'monospace'
    p += t

    # done
    p.save(filename, width=16, height=8)
def graph1(score_data):
    """
    Average score as time goes on;
    Creates and returns graph 1, a line graph.
    """
    date_column = score_data[0][find_time_stamp(score_data)]
    data = DataFrame(score_data[1:], columns=score_data[0])

    # Get all columns that are numerical questions so we know what to graph
    num_questions = data.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    new_data = pd.melt(data, id_vars=date_column, value_vars=num_questions,
                       var_name="Question", value_name="Score")

    # Convert date string into an actual date type
    new_data[date_column] = pd.to_datetime(new_data[date_column], format="%m/%d/%Y")

    # Group all rows with same date and question, and then take the average.
    new_data = new_data.groupby([date_column, 'Question']).mean().reset_index()
    new_data['All'] = "Individual Questions"
    new_data2 = new_data.groupby(date_column).mean().reset_index()
    new_data2['Question'] = "All Questions"
    new_data2['All'] = "Average of All Questions"
    new_data = pd.concat([new_data, new_data2])
    new_data[date_column] = new_data[date_column].astype('int64')

    # Create time graph with separate lines for each question
    ret = ggplot.ggplot(ggplot.aes(x=date_column, y="Score", colour="Question"), new_data) + \
        ggplot.geom_point() + \
        ggplot.geom_line() + \
        ggplot.facet_grid("All") + \
        ggplot.scale_x_continuous(labels=[""], breaks=0) + \
        ggplot.labs(x="Time", y="Average Question Score") + \
        ggplot.ggtitle("Question Scores Over Time")
    return ret
def main(log):
    log.debug('initializing app')
    p = pyaudio.PyAudio()

    # Open audio input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=CHUNK_SIZE)
    log.debug('opened stream <{}>'.format(stream))
    log.debug('reading audio input at rate <{}>'.format(SAMPLE_RATE))
    recorded = []

    # Start mainloop
    loops = 0
    while True:
        loops += 1
        if loops % 25 == 0:
            log.debug('recorded <{}> loops'.format(loops))

        # Decode chunks of audio data from the stream
        try:
            data = stream.read(CHUNK_SIZE)
            decoded = np.fromstring(data, 'Float32')
            mx = max(decoded)
            recorded.append(mx)
        # On <C-c>, plot max of recorded data
        except KeyboardInterrupt as ee:
            log.debug('closing stream and ending PyAudio')
            stream.close()
            p.terminate()
            df = pd.DataFrame(columns=['mx', 'time'])
            df['mx'] = recorded
            df['time'] = range(len(recorded))
            plt = ggplot.ggplot(ggplot.aes(x='time', y='mx'), data=df) + \
                ggplot.geom_line()
            pdb.set_trace()
            log.debug('quitting')
            sys.exit(1)
def _plot_avg_accuracy(self, include_ci=True, summary=None):
    import ggplot as gg
    if summary is None:
        summary = self.summary()

    df = pd.DataFrame({'AverageAccuracy': summary['Accuracy']['Avg'],
                       'Round': range(self.n_rounds)})

    if include_ci:
        from scipy import stats
        succ = df.AverageAccuracy * self.n_sim
        fail = self.n_sim - succ
        interval = stats.beta(succ + 1, fail + 1).interval(0.95)
        df['ymin'] = interval[0]
        df['ymax'] = interval[1]
        plt = gg.ggplot(gg.aes(x='Round', y='AverageAccuracy', ymin='ymin', ymax='ymax'), data=df) + \
            gg.geom_area(alpha=0.5)
    else:
        plt = gg.ggplot(gg.aes(x='Round', y='AverageAccuracy'), data=df)

    return plt + gg.geom_line()
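# Note (added): scipy's stats.beta(succ + 1, fail + 1) is the posterior of a binomial
# proportion under a uniform prior, so .interval(0.95) above yields an equal-tailed 95%
# credible band for per-round accuracy across the self.n_sim simulations.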
def data_output(data, chart_title):
    print "Good News! Your data has been returned. I'm happy to show it to you."
    print "Just tell me how you want it - Table or Line Graph?"
    data_output = raw_input("Choose table or line > ")
    if data_output[0].lower() == "t":
        print "Ok, here's your data."
        print data
    elif data_output[0] == "l" or data_output[0].lower() == "g":
        import ggplot as gg
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value")
        gg.scale_x_date(breaks=gg.date_breaks('1 month'), labels=gg.date_format("%B"))
        print (plot + gg.theme_xkcd())
def lineplot(hr_year_csv):
    # A csv file will be passed in as an argument which
    # contains two columns -- 'HR' (the number of homerun hits)
    # and 'yearID' (the year in which the homeruns were hit).
    #
    # Fill out the body of this function, lineplot, to use the
    # passed-in csv file, hr_year_csv, and create a
    # chart with points connected by lines, both colored 'red',
    # showing the number of HR by year.
    #
    # You will want to first load the csv file into a pandas dataframe
    # and use the pandas dataframe along with ggplot to create your visualization
    #
    # You can check out the data in the csv file at the link below:
    # https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv
    #
    # You can read more about ggplot at the following link:
    # https://github.com/yhat/ggplot/
    df = pandas.read_csv(hr_year_csv)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR')) + gp.geom_point(color='red') + gp.geom_line(color='red')
    return gg
def lineplot(hr_year_csv):
    # Assume that we have a pandas dataframe file called hr_year,
    # which contains two columns -- yearID, and HR.
    #
    # The pandas dataframe contains the number of HR hit in the
    # Major League baseball in each year. Can you write a function,
    # lineplot, that creates a chart with points connected by lines, both
    # colored 'red', showing the number of HR by year?
    #
    # You can check out the data loaded into the dataframe at the link below:
    # https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv

    # your code here
    df = pd.read_csv('hr_year.csv')
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')
    return gg
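# A hedged usage sketch for the lineplot variants above (added for illustration; the output
# filename is made up, and 'hr_year.csv' follows the layout described in the comments above).
hr_plot = lineplot('hr_year.csv')
hr_plot.save('hr_by_year.png')  # yhat-ggplot objects expose .save(); print(hr_plot) also renders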
def lineplot_compare(hr_by_team_year_sf_la_csv):
    # Write a function, lineplot_compare, that will read a csv file
    # called hr_by_team_year_sf_la_csv and plot it using pandas and ggplot2.
    #
    # This csv file has three columns -- yearID, HR, and teamID,
    # representing the total number of HR hit each year by the SF Giants
    # and LA Dodgers. Produce a visualization comparing the total HR by
    # year of the two teams.
    #
    # You can see the data in hr_by_team_year_sf_la_csv
    # at the link below:
    # https://www.dropbox.com/s/wn43cngo2wdle2b/hr_by_team_year_sf_la.csv
    #
    # Note that to differentiate between multiple categories on the
    # same plot in ggplot, we can pass color in with the other arguments
    # to aes, rather than in our geometry functions.
    #
    # For example, ggplot(data, aes(xvar, yvar, color=category_var)). This
    # should help you.
    df = pandas.read_csv(hr_by_team_year_sf_la_csv)
    # print(df)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR', color='teamID')) + gp.geom_point() + gp.geom_line()
    return gg
def plot_toxicity_probabilities(self, chart_title=None, use_ggplot=False):
    """ Plot prior and posterior dose-toxicity curves.

    :param chart_title: optional chart title. Default is fairly verbose
    :type chart_title: str
    :param use_ggplot: True to use ggplot, else matplotlib
    :type use_ggplot: bool
    :return: plot of toxicity curves

    """
    if not chart_title:
        chart_title = "Prior (dashed) and posterior (solid) dose-toxicity curves"
        chart_title = chart_title + "\n"

    if use_ggplot:
        from ggplot import (ggplot, ggtitle, geom_line, geom_hline, aes, ylim)
        import numpy as np
        import pandas as pd
        data = pd.DataFrame({'Dose level': self.dose_levels(),
                             'Prior': self.prior,
                             'Posterior': self.prob_tox(),
                             # 'Lower': crm.get_tox_prob_quantile(0.05),
                             # 'Upper': crm.get_tox_prob_quantile(0.95)
                             })
        var_name = 'Type'
        value_name = 'Probability of toxicity'
        melted_data = pd.melt(data, id_vars='Dose level', var_name=var_name, value_name=value_name)
        # melted_data['LineType'] = np.where(melted_data.Type=='Posterior', '--', np.where(melted_data.Type=='Prior', '-', '..'))
        # melted_data['Col'] = np.where(melted_data.Type=='Posterior', 'green', np.where(melted_data.Type=='Prior', 'blue', 'yellow'))
        # np.where(melted_data.Type=='Posterior', '--', '-')

        p = ggplot(melted_data, aes(x='Dose level', y=value_name, linetype=var_name)) + geom_line() \
            + ggtitle(chart_title) + ylim(0, 1) + geom_hline(yintercept=self.target, color='black')
        # Can add confidence intervals once I work out linetype=??? in ggplot
        return p
    else:
        import matplotlib.pyplot as plt
        import numpy as np
        dl = self.dose_levels()
        prior_tox = self.prior
        post_tox = self.prob_tox()
        post_tox_lower = self.get_tox_prob_quantile(0.05)
        post_tox_upper = self.get_tox_prob_quantile(0.95)
        plt.plot(dl, prior_tox, '--', c='black')
        plt.plot(dl, post_tox, '-', c='black')
        plt.plot(dl, post_tox_lower, '-.', c='black')
        plt.plot(dl, post_tox_upper, '-.', c='black')
        plt.scatter(dl, prior_tox, marker='x', s=300, facecolors='none', edgecolors='k')
        plt.scatter(dl, post_tox, marker='o', s=300, facecolors='none', edgecolors='k')
        plt.axhline(self.target)
        plt.ylim(0, 1)
        plt.xlim(np.min(dl), np.max(dl))
        plt.xticks(dl)
        plt.ylabel('Probability of toxicity')
        plt.xlabel('Dose level')
        plt.title(chart_title)
        p = plt.gcf()
        phi = (np.sqrt(5) + 1) / 2.
        p.set_size_inches(12, 12 / phi)
# -*- coding: utf-8 -*-
from ggplot import ggplot, aes, geom_point, geom_line, ggtitle, xlab, ylab

data = []
xvar = 'X'
yvar = 'Y'

print ggplot(data, aes(x='yearID', y='HR')) + \
    geom_point(color='red') + \
    geom_line(color='red') + \
    ggtitle('Number of HR by year') + \
    xlab('Year') + \
    ylab('Number of HR')
# plot with ggplot
import ggplot as gg

# ggplot needs data to be in Pandas
data = pd.DataFrame(
    {"train": accuracies,
     "validation": accuracies_val,
     "epoch": range(len(accuracies)),
     })
data_melted = data.melt(id_vars="epoch")
p = gg.ggplot(data_melted, gg.aes(x="epoch", y="value", color="variable"))
p = p + gg.geom_point() + gg.geom_line()
p

#%%
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score

#%%
# Evaluate model (last epoch)
accuracy_FINAL = model.evaluate(x_test, y_test)[1]

#%%
# Test accuracy with sklearn
y_predicted = model.predict(x_test)
y_predicted_argmax = np.asarray(y_predicted.argmax(axis=-1))
data = []
for method in methods:
    for model in models:
        for rtol in rtols:
            print('method: {} model: {} rtol: {}'.format(method.name, model.name, rtol), end='')

            # Run
            tic = time.time()
            result = method(model, rtol)
            toc = time.time() - tic

            # Compare to gold standard
            standard = gold_standards[model.name]
            diff = result - standard.values
            max_rel_diff = np.max(diff / standard.max)

            # Append to table
            record = (method.name, model.name, rtol, max_rel_diff, toc)
            print(' err: {} toc: {}'.format(max_rel_diff, toc))
            data.append(record)

data = DataFrame(data, columns=['method', 'model', 'rtol', 'err', 'time'])

print(gg.ggplot(data, gg.aes(x='err', y='time', color='method'))
      + gg.geom_point(size=60.0)
      + gg.geom_line()
      + gg.scale_x_log()
      + gg.scale_y_log()
      + gg.xlim(1e-10, 1e-2))
def plot_transmission_results(tx_results, percentage_decline, save_path, path_names): #%% what are inputs? # transmission results # There'll be a folder called 'Runs prepared for ...' # all the folders inside that folder will have a CEPAC results folder. # tx_data is a dictionary and will have two keys, 'monthly' and 'popstats' # 'monthly' key will only have primary transmissions data tx_data = deepcopy(tx_results) t = 120 total_var = 3 total_val = 4 # percentage decline # this is also dictionary of percentage decline values for each folder # having cepac results # save_path eaxact folder where you want to save your images # path_names will have paths to transmissions and sensitivity directories #%% plot percentage decline # geberate an environment object first # lets go for line plot data_plot = pd.DataFrame( columns=['x', 'Percentage decline', 'Transmissions', 'Variable'], index=range(0, total_var * total_val)) data_in = pd.read_excel( os.path.join(path_names['transmission'], 'Input files', 'transmission_rate_multiplier_required_inputs.xlsx')) col = [ 'Incidence rate per 100 PY specific to high-risk group 1', 'HIV uninfected individuals in high-risk group 1', 'HIV infected individuals in high-risk group 1' ] col_adj = ['Incidence', 'Uninfected', 'Infected'] data_in[col[0]] = data_in[col[0]].round(1) base_val = [np.float64(0.9), 2960000, 136400] y1_values = {col[0]: [], col[1]: [], col[2]: []} for var in percentage_decline: if 'HIV+' in var: y1_values[col[2]].append(percentage_decline[var]) elif 'HIV-' in var: y1_values[col[1]].append(percentage_decline[var]) elif 'Incidence' in var: y1_values[col[0]].append(percentage_decline[var]) for i in range(len(col)): idx = data_in.loc[data_in.loc[:, col[i]] != base_val[i], col[i]].index.values[0] data_plot.loc[idx - 1:idx + 3 - 1, 'x'] = data_in.loc[idx:idx + 3, col[i]].values data_plot.loc[idx - 1:idx + 3 - 1, 'Variable'] = col_adj[i] data_plot.loc[idx - 1:idx + 3 - 1, 'Percentage decline'] = y1_values[col[i]] # plot df_float = data_plot.loc[data_plot.loc[:, 'Percentage decline'] <= 200, :] (ggplot(aes(x='x', y='Percentage decline'), df_float) + geom_line() + facet_wrap('Variable', scales='free')).save( os.path.join(save_path, 'Percentage decline')) del df_float #%% visualizing transmissions # index = range(time * number of values for each variable * number of variables) def set_abc(run, var_idx, var_name, var_value_idx): # set variable names data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1, 'Variable'] = var_name # set variable value data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1, 'Value'] = data_plot.loc[ data_plot.loc[:, 'Variable'] == var_name, 'x'].values[var_value_idx] if 'RunA' in run: data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1, 'RunA tx'] = tx_data[var]['monthly'][run][ 'transmissions'].iloc[0:t].values elif 'RunB' in run: data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1, 'RunB tx'] = tx_data[var]['monthly'][run][ 'transmissions'].iloc[0:t].values elif 'RunC' in run: data_plot_tx.loc[(var_idx - 1) * t:((var_idx - 1) * t) + t - 1, 'RunC tx'] = tx_data[var]['monthly'][run][ 'transmissions'].iloc[0:t].values data_plot_tx = pd.DataFrame( index=range(t * total_var * total_val), columns=['Variable', 'Value', 'RunA tx', 'RunB tx', 'RunC tx']) var_idx = -1 var_val_idx = [-1, -1, -1] for var in tx_data: var_idx += 1 if 'HIV+' in var: var_val_idx[2] += 1 var_name = col_adj[2] for run in tx_data[var]['monthly']: set_abc(run, var_idx, var_name, var_val_idx[2]) elif 'HIV-' in var: 
var_val_idx[1] += 1 var_name = col_adj[1] for run in tx_data[var]['monthly']: set_abc(run, var_idx, var_name, var_val_idx[1]) elif 'Incidence' in var: var_val_idx[0] += 1 var_name = col_adj[0] for run in tx_data[var]['monthly']: set_abc(run, var_idx, var_name, var_val_idx[0]) else: continue data_plot_tx['t'] = 0 t_float = -1 for row in data_plot_tx.index: if t_float == t - 1: t_float = -1 t_float += 1 data_plot_tx.loc[row, 't'] = t_float #%% plots for individual runs run_col = ['RunA tx', 'RunB tx', 'RunC tx'] inci = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == 'Incidence', :] inf = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == 'Infected', :] uninf = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == 'Uninfected', :] for i in run_col: (ggplot(aes(x='t', y=i, color='Value'), data_plot_tx) + geom_line() + facet_wrap('Variable', scales='free')).save( os.path.join( save_path, str(i + r'_transmissions for all variable all values'))) (ggplot(aes(x='t', y=i), inci) + geom_line() + facet_wrap('Variable', 'Value', scales='free')).save( os.path.join( save_path, str(i + r'_plots for individual values of incidence'))) (ggplot(aes(x='t', y=i), inf) + geom_line() + facet_wrap('Variable', 'Value', scales='free')).save( os.path.join( save_path, str(i + r'_plots for individual values of infected population'))) (ggplot(aes(x='t', y=i), uninf) + geom_line() + facet_wrap('Variable', 'Value', scales='free')).save( os.path.join( save_path, str(i + '_plots for individual values of uninfected population'))) #%% compare runs ABC data_plot_abc = {} for var in col_adj: float_df = pd.DataFrame(index=range(0, t * total_var * total_val), columns=['t', 'Value', 'Transmissions', 'Run']) insert_idx = -1 for val in data_plot.loc[data_plot.loc[:, 'Variable'] == var, 'x']: var_df = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == var, :] var_df = var_df.reset_index(drop=True) var_val_df = var_df.loc[var_df.loc[:, 'Value'] == val, :] var_val_df = var_val_df.reset_index(drop=True) for c in ['RunA tx', 'RunB tx', 'RunC tx']: insert_idx += 1 float_df.loc[insert_idx * t:(insert_idx * t) + t - 1, 'Run'] = c float_df.loc[insert_idx * t:(insert_idx * t) + t - 1, 'Transmissions'] = var_val_df.loc[:, c].values float_df.loc[insert_idx * t:(insert_idx * t) + t - 1, 'Run'] = c float_df.loc[insert_idx * t:(insert_idx * t) + t - 1, 'Value'] = val float_df.loc[insert_idx * t:(insert_idx * t) + t - 1, 't'] = np.arange(t) data_plot_abc[var] = float_df.dropna() (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) + geom_line() + facet_wrap('Value', scales='free') + ggtitle(var)).save( os.path.join( save_path, str(var + '_comparison of transmissions in runs ABC'))) #%% compare runs BC for var in data_plot_abc: float_df = data_plot_abc[var].loc[ data_plot_abc[var].loc[:, 'Run'] != 'RunA tx', :] (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) + geom_line(alpha=0.2) + facet_wrap('Value', scales='free') + stat_smooth(method='loess', se=False) + ggtitle(var)).save( os.path.join(save_path, str(var + '_comparison of transmissions in runs BC'))) return
from ggplot import aes, geom_line, ggplot, meat
import matplotlib.pyplot as plt

from bokeh import mpl
from bokeh.plotting import output_file, show

g = ggplot(aes(x='date', y='beef'), data=meat) + geom_line()
g.make()

plt.title("Line ggplot-based plot in Bokeh.")

output_file("ggplot_line.html", title="ggplot_line.py example")

show(mpl.to_bokeh())
print(kValues)
print("\nModel objects")
print(models)
print("\nS_k values")
print(WSSSEs)
print("\nResult rows with K and S_k values")
print(rowsWSSSSE)
print("\nContents of the pyspark.sql.DataFrame with the results")
WSSSEDF.show()

# We can plot $S_K$ as a function of $K$.

# In[26]:

gg.ggplot(gg.aes(x="K", y="WSSSE"), data=WSSSEDF.toPandas()) + gg.geom_line()

# We can clearly see that $K = 2$ is not the optimal value.
# Let's plot the graph for values $K > 2$.

# In[27]:

gg.ggplot(gg.aes(x="K", y="WSSSE"), data=WSSSEDF.where(WSSSEDF["K"] > 2).toPandas()) + gg.geom_line()

# With $K = 4$ there is a clear improvement, and $K = 5$ also gives a noticeable gain. In a real
# problem we would interpret the cluster centers for $K = 4$ and $K = 5$ and then pick the more
# meaningful value of $K$.

# The $K = 4$ centers
from (
    select pod_id_location, segid, med, p25, p75, p05, p95
    from cte2
    where n_passes >= 100
    union all
    select distinct 0 as pod_id_location, segid, med_mob as med, p25_mob as p25,
        p75_mob as p75, p05_mob as p05, p95_mob as p95
    from cte2
    where n_passes >= 100
)
""")

qry_job = bqclient.query(qry_str, location='EU', job_config=bigquery.QueryJobConfig())

# save result as dataframe
df = qry_job.to_dataframe()

df_long = df.melt(id_vars=['pod_id_location', 'segid', 'pod_idx'],
                  value_vars=['p05', 'p25', 'med', 'p75', 'p95'],
                  var_name='yparam', value_name='value')
df_long.to_csv(r'.\charts\subsetdistribs.csv')

# plots
# plt1.save(filename = r'.\charts\ulezpodts.png', width=None, height=None, dpi=200)
plt2 = gg.ggplot(df_long, gg.aes(x='pod_idx', y='value', color='yparam')) + \
    gg.geom_point() + gg.geom_line() + \
    gg.xlab('pod/segment') + gg.ylab('NO2 (as % of median)') + gg.theme_bw()
plt2.save(filename=r'.\charts\ulezsubsetvar.png', width=None, height=None, dpi=200)
for t_float in time:
    tp_FS, tp_PK = get_weibull(t=t_float, coverage=input_par['uptake'],
                               duration=input_par['duration'], shape=s)
    plot_prob.loc[row_idx, 'Monthly transition probability'] = tp_FS
    plot_prob.loc[row_idx + 1, 'Monthly transition probability'] = tp_PK
    plot_prob.loc[row_idx, 'time'] = t_float
    plot_prob.loc[row_idx + 1, 'time'] = t_float
    plot_prob.loc[row_idx, 'Formula'] = 'FS'
    plot_prob.loc[row_idx + 1, 'Formula'] = 'PK'
    row_idx += 2

# collect
collect_prob['FS ' + str(s)] = plot_prob.loc[
    plot_prob.loc[:, 'Formula'] == 'FS', 'Monthly transition probability'].values
collect_prob['PK ' + str(s)] = plot_prob.loc[
    plot_prob.loc[:, 'Formula'] == 'PK', 'Monthly transition probability'].values

# plot
x = ggplot(aes(x='time', y='Monthly transition probability', color='Formula'),
           data=plot_prob) + geom_line()
# name = r'Shape: ' + str(s)  # + r', Coverage/Uptake = ' + str(input_par['uptake']*100) + r', Coverage time = ' + str(input_par['duration']) + '.jpg'
x.save('Weibull' + str(plot_num))
plot_num += 1
def main(): global args, ruleset # Arguments Parser argparser, subparser = parser_setup() register_rules(subparser) args = argparser.parse_args() rulemod = sys.modules["rpgdice.rulesets.%s" % args.ruleset] rulemod.prepare(args, srand) if args.debug: print "DEBUG: args", args print results = list() pool = multiprocessing.Pool() try: for result in pool.map(rulemod.simulate_rolls, rulemod.variables): results.extend(result) pool.close() pool.join() except KeyboardInterrupt: sys.exit(130) if args.debug: print "DEBUG: results:" pprint(results) print conf = dict() conf = {"vlab": "Variables", "xlab": "Outcome", "ylab": "Probability %"} for item in conf: try: conf[item] = getattr(rulemod, item) except: pass columns = ("Graph", conf["vlab"], conf["xlab"], "Count", conf["ylab"]) data = pandas.DataFrame.from_records(results, columns=columns) # Create and save graphs for gkey in rulemod.graphs: # Graph Defaults graph_conf = conf.copy() graph_conf["file_prefix"] = "%s%02d" % (args.ruleset, gkey) graph_conf["file_suffix"] = str() # colors colors_lower = ["#ff0000", "#cc0000", "#993300", "#666600"] colors_upper = ["#006666", "#003399", "#0000cc", "#0000ff"] colors_mid = ["#000000"] color_count = len(rulemod.variables) - 1 if color_count % 2 == 0: lower_slice = (color_count / 2) * -1 upper_slice = color_count / 2 else: lower_slice = ((color_count - 1) / 2) * -1 upper_slice = (color_count + 1) / 2 graph_conf["color_list"] = colors_lower[lower_slice:] + colors_mid + colors_upper[0:upper_slice] # graph_conf from graph graph_items = ( "color_list", "file_prefix", "file_suffix", "graph_type", "limits", "x_breaks", "x_labels", "title", "vlab", "xlab", "ylab", ) for item in graph_items: try: graph_conf[item] = rulemod.graphs[gkey][item] except: try: graph_conf[item] = getattr(rulemod, item) except: if item not in graph_conf: graph_conf[item] = None if args.debug: print "DEBUG: graph_conf:" pprint(graph_conf) print # plot_data plot_data = data.copy() plot_data = plot_data[plot_data["Graph"] == gkey] plot_data.rename( columns={ conf["vlab"]: graph_conf["vlab"], conf["xlab"]: graph_conf["xlab"], conf["ylab"]: graph_conf["ylab"], }, inplace=True, ) plot_data.index = range(1, len(plot_data) + 1) if args.debug: print "DEBUG: plot_data:" pprint(plot_data) print # Create plot if args.graph: plot = ( ggplot.ggplot( ggplot.aes(x=graph_conf["xlab"], y=graph_conf["ylab"], color=graph_conf["vlab"]), data=plot_data ) + ggplot.ggtitle(graph_conf["title"]) + ggplot.theme_gray() + ggplot.scale_colour_manual(values=graph_conf["color_list"]) ) plot.rcParams["font.family"] = "monospace" if graph_conf["x_breaks"] and graph_conf["x_labels"]: plot += ggplot.scale_x_discrete(breaks=graph_conf["x_breaks"], labels=graph_conf["x_labels"]) if graph_conf["limits"]: plot += ggplot.ylim(graph_conf["limits"][0], graph_conf["limits"][1]) if graph_conf["graph_type"] == "bars": plot += ggplot.geom_line(size=20) text_data = plot_data[plot_data["Count"] > 0] text_data.index = range(0, len(text_data)) outcomes = dict(text_data[graph_conf["xlab"]]) percents = dict(text_data[graph_conf["ylab"]]) for k in outcomes: percent = "%4.1f%%" % percents[k] x = outcomes[k] y = percents[k] + 4 color = graph_conf["color_list"][k] plot += ggplot.geom_text(label=[percent], x=[x, x + 1], y=[y, y - 1], color=color) else: plot += ggplot.geom_line() plot += ggplot.geom_point(alpha=0.3, size=50) if hasattr(rulemod, "update_plot"): plot = rulemod.update_plot(gkey, graph_conf, plot, plot_data) if args.dumpsave: filename = "/dev/null" else: filename = "%s%s.png" % 
(graph_conf["file_prefix"], graph_conf["file_suffix"]) ggplot.ggsave(filename, plot, format="png", dpi=300) return 0
def plot_after_transmission_results(data, path_names): # import input data for tranmission analysis var_and_val = pd.DataFrame(columns=['x', 'Variable'], index=range(0, 12)) plot_lm = pd.DataFrame( columns=['x', 'Life Months', 'Scenario', 'Variable'], index=range(0, 24)) data_in = pd.read_excel( os.path.join(path_names['transmission'], 'Input files', 'transmission_rate_multiplier_required_inputs.xlsx')) col = [ 'Yearly incidence in MSM', 'Number of HIV uninfected individuals (HRG size)', 'Number of HIV infected individuals in primary cohort at t=0' ] col_adj = ['Incidence', 'Uninfected', 'Infected'] base_val = [0.009, 2960000, 136400] for i in range(len(col)): idx = data_in.loc[data_in.loc[:, col[i]] != base_val[i], col[i]].index.values[0] var_and_val.loc[idx - 1:idx + 3 - 1, 'x'] = data_in.loc[idx:idx + 3, col[i]].values var_and_val.loc[idx - 1:idx + 3 - 1, 'Variable'] = col_adj[i] row_idx = -2 var_idx = [-1, -1, -1] for var in data: if 'HIV+' in var: var_idx[2] += 1 plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[2], 'x'].values[var_idx[2]] plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[2], 'Variable'].values[var_idx[2]] plot_lm.loc[ row_idx:row_idx + 1, 'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values plot_lm.loc[ row_idx:row_idx + 1, 'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values elif 'HIV-' in var: var_idx[1] += 1 plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[1], 'x'].values[var_idx[1]] plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[1], 'Variable'].values[var_idx[1]] plot_lm.loc[ row_idx:row_idx + 1, 'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values plot_lm.loc[ row_idx:row_idx + 1, 'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values elif 'Incidence' in var: var_idx[0] += 1 plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[0], 'x'].values[var_idx[0]] plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[ var_and_val['Variable'] == col_adj[0], 'Variable'].values[var_idx[0]] plot_lm.loc[ row_idx:row_idx + 1, 'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values plot_lm.loc[ row_idx:row_idx + 1, 'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values row_idx += 2 # plot save_path = os.path.join(path_names['transmission'], r'Input files', r'Plots for final runs') if not os.path.exists(save_path): os.makedirs(save_path) (ggplot(aes(x='x', y='Life Months', color='Scenario'), plot_lm) + geom_line() + facet_wrap('Variable', scales='free')).save( os.path.join(save_path, 'Comparison of ')) return
def graph(y):
    data = pd.DataFrame({'iteration': list(range(len(y))), 'RMSE': y})
    p = gg.ggplot(gg.aes(x='iteration', y='RMSE'), data=data) + gg.geom_point() + gg.geom_line()
    return p
print(f.format('Profit Contribution', data['profit'].mean()))
print(f.format('Activity', (data['i'] == 'active').mean()))

print('\nErgodic Standard Deviations\n')
print(f.format('Profit Contribution', data['profit'].std()))
print(f.format('Activity', (data['i'] == 'active').std()))

# Plot Simulated and Expected Continuous State Path
data2 = data[['time', 'profit']].groupby('time').mean()
data2['time'] = data2.index
print(data2)
print(data2.columns)

ppp = ggplot(aes('time', 'profit'), data=data2) + \
    geom_line()
print(ppp)

ppp = ggplot(aes('time', 'profit', '_rep'), data=data[data._rep < 3]) + \
    geom_point() + \
    geom_line(aes('time', 'profit'), data=data2)
print(ppp)

print(demo.qplot('time', 'profit', '_rep',
                 data=data[data._rep < 3],
                 geom='line') +
      geom_line(aes('time', 'profit'), data=data2))

'''
subdata = data[data['_rep'] < 3][['time', 'profit', '_rep']]
# total-based
dftmp = df[['n_sub'] + brks[:5]].melt(id_vars=['n_sub'], value_vars=brks[:5],
                                      var_name='stat', value_name='value')
dftmp['method'] = ['(Total-Expected Total)/Expected Total'] * dftmp['n_sub'].size
df_stacked = dftmp

# enhancement-based
dftmp = df[['n_sub'] + brks[5:10]].melt(id_vars=['n_sub'], value_vars=brks[5:10],
                                        var_name='stat', value_name='value')
dftmp['method'] = ['(Enhanc-Expected Enhanc)/Expected Enhanc'] * dftmp['n_sub'].size
df_stacked = df_stacked.append(dftmp)

# enhancements + full sample background
dftmp = df[['n_sub'] + brks[10:]].melt(id_vars=['n_sub'], value_vars=brks[10:],
                                       var_name='stat', value_name='value')
dftmp['method'] = ['(Enhanc+Expected Backgr-Expected Total)/Expected Total'] * dftmp['n_sub'].size
df_stacked = df_stacked.append(dftmp)

df_stacked['percentile'] = ['{0}th%'.format(a[1:3]) for a in df_stacked['stat']]

# plots
# compare all 3
plt1 = gg.ggplot(df_stacked, gg.aes(x='n_sub', y='value', color='percentile')) + \
    gg.geom_line() + gg.xlab('N drives') + gg.ylab('Bias (%)') + gg.theme_bw() + \
    gg.scale_color_manual(values=colors) + \
    gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray") + \
    gg.geom_vline(x=[10, 15], linetype="dashed", color="gray") + \
    gg.facet_wrap('method') + gg.ggtitle('Bias comparison {0}'.format(title))
plt1.save(filename=r'..\charts\drivebias_laqn_{0}.png'.format(species),
          width=None, height=None, dpi=300)

# plot total alone for presentation
plt2 = gg.ggplot(df_stacked[df_stacked['method'] == '(Total-Expected Total)/Expected Total'],
                 gg.aes(x='n_sub', y='value', color='percentile')) + \
    gg.geom_line() + gg.xlab('N drives') + gg.ylab('Bias (%)') + gg.ylim(-100, 100) + \
    gg.scale_color_manual(values=colors) + \
    gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray") + \
    gg.geom_vline(x=[10, 15], linetype="dashed", color="gray") + \
    gg.ggtitle('Bias comparison {0}'.format(title))
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt2 = plt2 + t
plt2.save(filename=r'..\charts\drivebias_laqn_{0}_total.png'.format(species),
          width=None, height=None, dpi=300)

# plot enhancement alone for presentation
plt3 = gg.ggplot(df_stacked[df_stacked['method'] == '(Enhanc+Expected Backgr-Expected Total)/Expected Total'],
                 gg.aes(x='n_sub', y='value', color='percentile')) + \
    gg.geom_line() + gg.xlab('N drives') + gg.ylab('Bias (%)') + gg.ylim(-100, 100) + \
    gg.scale_color_manual(values=colors) + \
    gg.geom_hline(y=[-25, 25], linetype="dashed", color="gray") + \
    gg.geom_vline(x=[10, 15], linetype="dashed", color="gray") + \
    gg.ggtitle('Bias comparison {0}'.format(title))
t = gg.theme_bw()
t._rcParams['font.size'] = 16
plt3 = plt3 + t
plt3.save(filename=r'..\charts\drivebias_laqn_{0}_enhanc.png'.format(species),
          width=None, height=None, dpi=300)
print("#######################################") print("打印所挖掘的文本文件 text-movie.xls 前几行") print(df.head()) #text = df.comments.iloc[0] 单个影评情感分析实验, iloc中的index值表示第几个应用,编号从0开始 #s = SnowNLP(text) # #print(s.sentiments) def get_sentiment_cn(text): s = SnowNLP(text) return s.sentiments df["sentiment"] = df.comments.apply(get_sentiment_cn) print("#######################################") print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值") print(df) print("#######################################") print("重要信息") print("所有影评的平均值为:", df.sentiment.mean()) print("所有影评的中位数为:", df.sentiment.median()) ggplot.ggplot(ggplot.aes(x="date", y="sentiment"), data=df) + ggplot.geom_point() + ggplot.geom_line( color='blue') + ggplot.scale_x_date( labels=ggplot.date_format("%Y-%m-%d")) df.sort_values(['sentiment'])[:5]
"""Plot target variable as time series.""" import get_data from ggplot import aes, geom_line, facet_wrap, ggplot if __name__ == "__main__": df = get_data.get_all_data() p = ggplot(df, aes('datetime', 'cap', group='date')) + \ geom_line(alpha=0.2) + \ facet_wrap('name') p.save('../output/time_series.pdf')
# -*- coding:utf-8 -*-
# Prepare the data
import ggplot as gp  # I'd rather not use import *
import pandas as pd

meat = gp.meat

p = gp.ggplot(gp.aes(x='date', y='beef'), data=meat) + gp.geom_point(color='red') + gp.ggtitle(u'Scatter plot')
print(p)

p = gp.ggplot(gp.aes(x='date', y='beef'), data=meat) + gp.geom_line(color='blue') + gp.ggtitle(u'Line plot')
print(p)

p = gp.ggplot(gp.aes(x='date', y='beef'), data=meat) + gp.geom_point(color='red') + \
    gp.geom_line(color='blue') + gp.ggtitle(u'Scatter plot + line plot')
print(p)

# Gather the variables we want to show into a single column
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
# meat_lng contains date, value (a column with each variable's values) and variable (a column with each variable's name)
p = gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng) + \
    gp.geom_point() + gp.geom_line()
print(p)

meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
p = gp.ggplot(gp.aes(x='date', y='value', colour='variable'), data=meat_lng) + \
    gp.geom_point() + gp.facet_wrap('variable')
print(p)

p = gp.ggplot(gp.aes(x='beef'), data=meat) + gp.geom_histogram()
print(p)
def render_png(self, buffer): """ Render timeseries plots as PNG images. """ bucket = self.bucket import matplotlib.font_manager matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf') import matplotlib try: matplotlib.use('agg') except: pass import matplotlib.pyplot as plt df = self.dataframe #df = df.set_index(['time']) # Compute datetime range boundaries datetime_min = min(df.time) datetime_max = max(df.time) datetime_delta = datetime_max - datetime_min #xmin = pd.to_datetime('2016-05-01') #xmax = pd.to_datetime('2016-08-01') renderer = bucket.tdata.get('renderer', 'matplotlib') if renderer == 'matplotlib': # Bring DataFrame into appropriate format df = dataframe_index_and_sort(df, 'time') # Propagate non-null values forward or backward, otherwise # matplotlib would not plot the sparse data frame properly. # With time series data, using pad/ffill is extremely common so that the “last known value” is available at every time point. # http://pandas.pydata.org/pandas-docs/stable/missing_data.html#filling-missing-values-fillna df.fillna(method='pad', inplace=True) # Make plots of DataFrame using matplotlib / pylab. # http://matplotlib.org/ # http://pandas.pydata.org/pandas-docs/version/0.13.1/visualization.html # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html # https://markthegraph.blogspot.de/2015/05/plotting-time-series-dataframes-in.html if 'style' in bucket.tdata and bucket.tdata.style: try: plt.style.use(bucket.tdata.style) except Exception: error_message = u'# Unknown style "{style_name}", available styles: {available}'.format( style_name=bucket.tdata.style, available=plt.style.available) log.error(error_message) return self.request.error_response(bucket, error_message) # Basic plotting #df.plot() #plt.savefig(buffer) # Advanced plotting ax = df.plot() fig = ax.get_figure() # Figure heading title = fig.suptitle(bucket.title.human, fontsize=12) #fig.tight_layout(pad=1.5) # Axis and tick labels ax.set_xlabel('Time') ax.set_ylabel('Value') ax.tick_params(axis='x', labelsize='smaller') # Grid and legend # http://matplotlib.org/users/legend_guide.html # http://matplotlib.org/examples/pylab_examples/legend_demo3.html ax.grid(True) legend_params = dict(ncol=1, loc='center left', bbox_to_anchor=(1, 0.5), fontsize='small', shadow=True, fancybox=True) legend = ax.legend(**legend_params) # title='Origin' #ax.legend(**legend_params) # title='Origin' # Sort list of legend labels # http://stackoverflow.com/questions/22263807/how-is-order-of-items-in-matplotlib-legend-determined/27512450#27512450 # Axis formatting #ax.xaxis_date() #ax.autoscale_view() # Compute appropriate locator and formatter locator, formatter = matplotlib_locator_formatter(datetime_delta, span=1) #ax.xaxis.set_major_locator(locator) ax.xaxis.set_major_formatter(formatter) # Figure formatting fig.autofmt_xdate() # http://stackoverflow.com/questions/10101700/moving-matplotlib-legend-outside-of-the-axis-makes-it-cutoff-by-the-figure-box/10154763#10154763 fig.savefig(buffer, bbox_extra_artists=(title, legend), bbox_inches='tight') # TODO: Add annotations """ # https://stackoverflow.com/questions/11067368/annotate-time-series-plot-in-matplotlib # https://stackoverflow.com/questions/17891493/annotating-points-from-a-pandas-dataframe-in-matplotlib-plot import matplotlib.dates as mdates fig = plot.draw() ax = fig.axes[0] ax.annotate('Test', (mdates.date2num(x[1]), y[1]), xytext=(15, 15), textcoords='offset points', arrowprops=dict(arrowstyle='-|>')) """ elif renderer == 'ggplot': # 
https://yhat.github.io/ggplot/notebook.html?page=build/docs/examples/Multiple%20Line%20Plot.html # https://stackoverflow.com/questions/23541497/is-there-a-way-to-plot-a-pandas-series-in-ggplot # https://stackoverflow.com/questions/24478925/is-it-possible-to-plot-multiline-chart-on-python-ggplot/24479513#24479513 # https://github.com/yhat/ggplot/blob/master/docs/how-to/Building%20Faceted%20(or%20Trellised)%20Plots.ipynb # https://github.com/yhat/ggplot/blob/master/docs/how-to/Annotating%20Plots%20-%20Titles%20and%20Labels.ipynb # https://github.com/yhat/ggplot/blob/master/docs/how-to/How%20to%20make%20xkcd%20style%20graphs.ipynb from ggplot import ggplot, aes, qplot, geom_line, geom_text, ggtitle, stat_smooth, scale_x_date, date_format, date_breaks from ggplot import theme_538, theme_bw, theme_gray, theme_xkcd # https://stackoverflow.com/questions/24478925/is-it-possible-to-plot-multiline-chart-on-python-ggplot/24479513#24479513 # https://stackoverflow.com/questions/23541497/is-there-a-way-to-plot-a-pandas-series-in-ggplot # Convert DataFrame from wide to long format, retaining "time" as visible column df = dataframe_wide_to_long_indexed(df, 'time') dataframe_index_to_column(df, 'time') # Compute appropriate locator and formatter locator, formatter = matplotlib_locator_formatter(datetime_delta, span=2) plot = ggplot(df, aes(x='time', y='value', color='variable'))\ + geom_line()\ + scale_x_date(limits=(datetime_min, datetime_max), breaks=locator, labels=formatter)\ + ggtitle(bucket.title.human) # Axis labels plot.xlab = 'Time' plot.ylab = 'Value' # Labs #+ stat_smooth(colour='blue', span=0.2) \ #+ geom_text(aes(x='x', y='y'), label='hello world') #+ scale_x_date(limits=(xmin, xmax), breaks=date_breaks('1 hour'), labels=date_format('%Y-%m-%d\n%H:%M')) theme_name = bucket.tdata.get('theme') # TODO: Switching themes will leak some matplotlib/pyplot properties, postpone to future versions if theme_name: if isinstance(theme_name, float): theme_name = str(int(theme_name)) try: theme = eval('theme_' + theme_name) plot += theme() except Exception: error_message = u'# Unknown theme "{theme_name}"'.format( theme_name=theme_name) log.error(error_message) return self.request.error_response(bucket, error_message) plot.save(buffer) # Attempt to reset global matplotlib parameters to get rid of xkcd theme style """ import matplotlib as mpl #mpl.rcParams = mpl.rc_params() #del mpl.rcParams['path.sketch'] #del mpl.rcParams['path.effects'] #mpl.rcParams = mpl.defaultParams.copy() #mpl.rcParams.clear() #mpl.rcdefaults() #mpl.rcParams = mpl.rcParamsOrig if 'axes.prop_cycle' in mpl.rcParams: del mpl.rcParams['axes.prop_cycle'] mpl.rcParams.update({'path.sketch': None, 'path.effects': []}) mpl.rcParams.update(mpl.rc_params()) """ elif renderer == 'seaborn': # TODO: We don't do statistical plotting yet. # https://stanford.edu/~mwaskom/software/seaborn/examples/timeseries_from_dataframe.html # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.tsplot.html import seaborn as sns sns.set(style="darkgrid") #sns.tsplot(data=gammas, time="timepoint", unit="subject", condition="ROI", value="BOLD signal") #print dir(df) #df['time'] = pandas.to_datetime(df['time']) #df = df.set_index(df.time) pprint(df) sns.tsplot(data=df, time="time") #sns.tsplot(data=df) plt.savefig(buffer) else: error_message = u'# Unknown renderer "{renderer_name}"'.format( renderer_name=renderer) log.error(error_message) return self.request.error_response(bucket, error_message)
def quarterly_queries(keywords, category, cookies, session, domain, throttle, filing_date, ggplot, month_offset=[-12, 12], trends_url=DEFAULT_TRENDS_URL): """Gets interest data (quarterly) for the 12 months before and 12 months after specified date, then gets interest data for the whole period and merges this data. month_offset: [no. month back, no. months forward] to query Returns daily data over the period. """ aw_range = arrow.Arrow.range begin_period = aget(filing_date).replace(months=month_offset[0]) ended_period = aget(filing_date).replace(months=month_offset[1]) # Set up date ranges to iterate queries across start_range = aw_range('month', YYYY_MM(begin_period), YYYY_MM(ended_period)) ended_range = aw_range('month', YYYY_MM(begin_period).replace(months=3), YYYY_MM(ended_period).replace(months=3)) start_range = [r.datetime for r in start_range][::3] ended_range = [r.datetime for r in ended_range][::3] # Fix last date if incomplete quarter (offset -1 week from today) last_week = arrow.utcnow().replace(weeks=-1).datetime start_range = [d for d in start_range if d < last_week] ended_range = [d for d in ended_range if d < last_week] if len(ended_range) < len(start_range): ended_range += [last_week] # Iterate attention queries through each quarter all_data = [] missing_queries = [] # use this to scale IoT later. for start, end in zip(start_range, ended_range): if start > last_week: break print("Querying period: {s} ~ {e}".format(s=start.date(), e=end.date())) throttle_rate(throttle) response_args = {'url': trends_url.format(domain=domain), 'params': _query_parameters(start, end, keywords, category), 'cookies': cookies, 'session': session} query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if all(int(vals)==0 for date,vals in query_data): query_data = [[date, '0'] for date in arrow.Arrow.range('day', start, end)] missing_queries.append('missing') elif len(query_data[0][0]) > 10: missing_queries.append('weekly') else: missing_queries.append('daily') try: if not aligned_weekly(query_data, all_data): ## Workaround: shift filing date q1 = weekly_date(all_data[-1][-1][0]) q2 = weekly_date(query_data[0][0]) if q1 < q2: start = arrow.get(start).replace(months=-1) response_args['params'] = _query_parameters(start, end, keywords, category) ## Do a new 4month query, overlap/replace previous month. query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if all_data[:-1] != []: q2 = weekly_date(query_data[0][0], 'start') all_data[-1] = [d for d in all_data[-1] if q2 > weekly_date(d[0])] elif q1 >= q2: # if q1 > 1st date in query_data, remove the first few entries query_data = [d for d in query_data if q1 < weekly_date(d[0])] except IndexError: pass except: from IPython import embed; embed() finally: all_data.append(query_data) # Get overall long-term trend data across entire queried period s = begin_period.replace(weeks=-2).datetime e1 = arrow.get(ended_range[-1]).replace(months=+1).datetime e2 = arrow.utcnow().replace(weeks=-1).datetime e = min(e1,e2) print("\n=> Merging with overall period: {s} ~ {e}".format(s=s.date(), e=e.date())) response_args = { 'url': trends_url.format(domain=domain), 'params': _query_parameters(s, e, keywords, category), 'cookies': cookies, 'session': session } query_data = _check_data(keywords, _process_response( _get_response(**response_args))) if len(query_data) > 1: # compute changes in IoI (interest over time) per quarter # and merged quarters together after interpolating data # with daily data. 
# We cannot mix quarters as Google normalizes each query all_ioi_delta = [] qdat_interp = [] for quarter_data in all_data: if quarter_data != []: quarter_data = [x for x in quarter_data if x[1] != ''] all_ioi_delta += list(zip(*change_in_ioi(*zip(*quarter_data)))) if ggplot: qdat_interp += interpolate_ioi(*zip(*quarter_data))[1] # for plotting only qdate = [date for date, delta_ioi in all_ioi_delta] delta_ioi = [delta_ioi for date, delta_ioi in all_ioi_delta] ydate = [date[-10:] if len(date) > 10 else date for date, ioi in query_data] try: yIoI = [float(ioi) for date, ioi in query_data] except: # from IPython import embed; embed() yIoI = [float(ioi) for date, ioi in query_data[:-1]] ydate, yIoI = interpolate_ioi(ydate, yIoI) # match quarterly and yearly dates and get correct delta IoI # common_date = [x for x in ydate+qdate if x in ydate and x in qdate] common_date = sorted(set(ydate) & set(qdate)) delta_ioi = [delta_ioi for date,delta_ioi in zip(qdate, delta_ioi) if date in common_date] y_ioi = [y for x,y in zip(ydate, yIoI) if x in common_date] # calculate daily %change in IoI and adjust weekly values adj_IoI = [ioi*mult for ioi,mult in zip(y_ioi, delta_ioi)] adj_all_data = [[str(date.date()), round(ioi, 2)] for date,ioi in zip(common_date, adj_IoI)] else: adj_all_data = [[str(date.date()), int(zero)] for date, zero in zip(*interpolate_ioi(*zip(*sum(all_data,[]))))] # from IPython import embed; embed() heading = ["Date", keywords[0].title] querycounts = list(zip((d.date() for d in start_range), missing_queries)) keywords[0].querycounts = querycounts if not ggplot: return [heading] + adj_all_data ## GGplot Only else: # GGPLOT MERGED GTRENDS PLOTS: import pandas as pd from ggplot import ggplot, geom_line, ggtitle, ggsave, scale_colour_manual, ylab, xlab, aes try: ydat = pd.DataFrame(list(zip(common_date, y_ioi)), columns=["Date", 'Weekly series']) mdat = pd.DataFrame(list(zip(common_date, adj_IoI)), columns=['Date', 'Merged series']) qdat = pd.DataFrame(list(zip(common_date, qdat_interp)), columns=['Date', 'Daily series']) ddat = ydat.merge(mdat, on='Date').merge(qdat,on='Date') ddat['Date'] = list(map(pd.to_datetime, ddat['Date'])) ydat['Date'] = list(map(pd.to_datetime, ydat['Date'])) mdat['Date'] = list(map(pd.to_datetime, mdat['Date'])) qdat['Date'] = list(map(pd.to_datetime, qdat['Date'])) except UnboundLocalError as e: raise(UnboundLocalError("No Interest-over-time to plot")) # meltkeys = ['Date','Weekly series','Merged series','Daily series'] # melt = pd.melt(ddat[meltkeys], id_vars='Date') colors = [ '#77bde0', # blue '#b47bc6', # purple '#d55f5f' # red ] entity_type = keywords[0].desc g = ggplot(aes(x='Date', y='Daily series' ), data=ddat) + \ geom_line(aes(x='Date', y='Daily series'), data=qdat, alpha=0.5, color=colors[0]) + \ geom_line(aes(x='Date', y='Merged series'), data=mdat, alpha=0.9, color=colors[1]) + \ geom_line(aes(x='Date', y='Weekly series'), data=ydat, alpha=0.5, color=colors[2], size=1.5) + \ ggtitle("Interest over time for '{}' ({})".format(keywords[0].keyword, entity_type)) + \ ylab("Interest Over Time") + xlab("Date") # from IPython import embed; embed() print(g) # ggsave(BASEDIR + "/iot_{}.png".format(keywords[0].keyword), width=15, height=5) return [heading] + adj_all_data
if True:
    plot_coverage.loc[
        t_float - 1 + t_max - 1, 'Transition Probability'], plot_coverage.loc[
        t_float - 1 + t_max - 1, 'Cumulative Probability'] = 0.15, 0.15
    # get_weibull(t = t_float, coverage = input_par['uptake'], duration = 1)
    plot_coverage.loc[t_float - 1 + t_max - 1, 'Model'] = 'Static'
    plot_coverage.loc[t_float - 1 + t_max - 1, 'Simulation month'] = t_float

# plot
save_dir = os.path.dirname(os.path.abspath(__file__))
gg_trans_p = ggplot(
    aes(x='Simulation month', y='Transition Probability', color='Model'),
    data=plot_coverage) + geom_line() + ggtitle(
    'Weibull transition probabilities for PrEP uptake \n(Shape = 2, Coverage/Uptake = 15%, Target horizon for coverage/uptake = 30 months)')  # \
# geom_vline(aes(xintercept = input_par['duration']), linetype = 'dashed', color = 'gray') + scale_x_continuous(breaks = sort([min(plot_coverage['Simulation month']), max(plot_coverage['Simulation month'])], length.out=5), input_par['duration']) +\
# geom_hline(aes(yintercept = input_par['uptake']), linetype = 'dashed', color = 'gray') + scale_y_continuous(breaks = sort(c(seq(min(plot_coverage['Transition Probability']), max(plot_coverage['Transition Probability']), length.out=5), input_par['uptake']))) +\

gg_cumul_p = ggplot(aes(x='index', y='Cumulative Probability', color='Model'),
                    data=plot_coverage) + geom_line()
gg_trans_p.save(filename='Weibull transition probabilities for PrEP uptake')
gg_cumul_p.save(filename='Weibull cumulative probabilities for PrEP uptake')

#%% get variation of tx rate
x_target_cov = np.array([0.1])
y_target_time = np.array([30])
res_dict, pop = get_threshold_crossing(x_target_cov, y_target_time, input_par['sus_to_inf'])
def ggplot_img(xt):
    xt = pd.DataFrame({'n': range(len(xt)), 'xt': xt})
    p = gp.ggplot(gp.aes(x='n', y='xt'), data=xt) + gp.geom_line(color='black')
    print(p)