def plot(self, what='cumulative_payouts', include_ci=True):
    """Build one of the summary plots for the simulation.

    :param what: ``'cumulative_payouts'``, ``'avg_accuracy'`` or ``'all'``
        (both outcomes stacked in facets).
    :param include_ci: when true, a shaded ``ymin``/``ymax`` band is drawn.
    :return: the ggplot object with a line layer added.
    :raises ValueError: if ``what`` is not one of the options above.
    """
    import ggplot as gg  # This is hacky ... need to DRY out the imports

    if what == 'cumulative_payouts':
        return self._plot_cumulative_payouts(include_ci=include_ci) + gg.geom_line()
    if what == 'avg_accuracy':
        return self._plot_avg_accuracy(include_ci=include_ci) + gg.geom_line()
    if what != 'all':
        raise ValueError('%s is not a valid option' % what)

    # 'all': reuse the data frames behind the two individual plots and stack
    # them under a common 'Outcome' column so they can be faceted.
    summary = self.summary()
    payouts = self._plot_cumulative_payouts(include_ci=include_ci, summary=summary)
    accuracy = self._plot_avg_accuracy(include_ci=include_ci, summary=summary)

    frames = []
    panels = ((payouts, 'AverageCumulativePayout', 'Cumulative Payouts'),
              (accuracy, 'AverageAccuracy', 'Average Accuracy'))
    for panel, column, label in panels:
        frame = panel.data
        frame['Outcome'] = frame[column]
        frame['Plot'] = label
        frames.append(frame)
    stacked = frames[0].append(frames[1], ignore_index=True)

    mapping = {'x': 'Round', 'y': 'Outcome'}
    if include_ci:
        mapping['ymin'] = 'ymin'
        mapping['ymax'] = 'ymax'
    faceted = gg.ggplot(gg.aes(**mapping), data=stacked)
    if include_ci:
        faceted = faceted + gg.geom_area(alpha=0.5)
    faceted += gg.facet_grid('Plot', scales='free')
    return faceted + gg.geom_line()
def _post_density_plot(self, func=None, x_name='', plot_title='', include_doses=None, boot_samps=1000):
    """Plot per-dose densities of a quantity resampled from the posterior sample.

    :param func: callable ``func(x, samp)`` evaluated for each scaled dose;
        defaults to the utility ``self.metric(eff_probs, tox_probs)``.
    :param x_name: name of the plotted column (used as the x aesthetic).
    :param plot_title: title passed to ``ggtitle``.
    :param include_doses: 1-based dose indices to include; defaults to all
        doses ``1..self.num_doses``.
    :param boot_samps: number of weighted resamples drawn per dose.
    :return: ggplot density plot filled by dose.
    """
    from ggplot import aes, ggplot, geom_density, ggtitle
    import pandas as pd

    if include_doses is None:
        include_doses = range(1, self.num_doses + 1)

    def my_func(x, samp):
        tox_probs = _pi_T(x, mu=samp[:, 0], beta=samp[:, 1])
        eff_probs = _pi_E(x, mu=samp[:, 2], beta1=samp[:, 3], beta2=samp[:, 4])
        return self.metric(eff_probs, tox_probs)

    if func is None:
        func = my_func

    x_boot = []
    dose_indices = []
    samp = self.pds._samp
    # Normalise into a new array: the original ``p /= p.sum()`` rewrote
    # ``self.pds._probs`` in place, silently altering shared state.
    probs = self.pds._probs
    weights = probs / probs.sum()
    for i, dose in enumerate(self.scaled_doses()):
        dose_index = i + 1
        if dose_index in include_doses:
            values = func(dose, samp)
            x_boot.extend(np.random.choice(values, size=boot_samps, replace=True, p=weights))
            dose_indices.extend(np.repeat(dose_index, boot_samps))
    df = pd.DataFrame({x_name: x_boot, 'Dose': dose_indices})
    return ggplot(aes(x=x_name, fill='Dose'), data=df) + geom_density(alpha=0.6) + ggtitle(plot_title)
def displacement_plot(centered, limits=None, style=None):
    u"""Draws nice displacement plots using ggplot2.

    params:
        centered (pd.DataFrame): needs cX, cY, Object, Frame columns,
            probably produced by calling center() above. The frame is
            copied; the caller's data is left untouched.
        limits (real): Sets the limits of the scales to a square window
            showing ±limits on each axis.
        style (Iterable): Collection of strings. The only value currently
            acted on is 'no-terminal-dot' (which does not label the end of
            tracks which terminate early). 'theme-bw' is accepted but has
            no effect: theme_bw is always applied.

    Returns:
        g (gg.ggplot): Plot object
    """
    style = {} if style is None else style
    # Work on a copy so stringifying 'Object' does not leak to the caller.
    centered = centered.copy()
    centered['Object'] = centered['Object'].map(str)
    # DataFrame.sort() was removed from pandas; sort_values is the replacement.
    centered = centered.sort_values(['Frame', 'Object'])
    g = (gg.ggplot(centered, gg.aes(x='cX', y='cY', color='Object')) +
         gg.geom_path(size=0.3))
    g += gg.theme_bw()  # if 'theme-bw' in style else gg.theme_seaborn()
    if limits:
        g = g + gg.ylim(-limits, limits) + gg.xlim(-limits, limits)
    if 'no-terminal-dot' not in style:
        max_frame = centered['Frame'].max()
        endframe = centered.groupby('Object')['Frame'].max()
        endframe = endframe[endframe != max_frame].reset_index()
        endframe = endframe.merge(centered, on=['Object', 'Frame'])
        # we should check if endframe is empty before adding it:
        # https://github.com/yhat/ggplot/issues/425
        if not endframe.empty:
            g += gg.geom_point(data=endframe, color='black', size=1)
    return g
def scatter(x, y, filename=""):
    """Scatter-plot ``y`` against ``x``.

    With the default empty ``filename`` the plot is printed; otherwise it is
    saved as ``graphs/scatter/<filename>.png``.
    """
    points = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)})
    chart = gg.ggplot(gg.aes(x='x', y='y'), data=points)
    chart = chart + gg.geom_point()
    if filename != "":
        target = "graphs/scatter/" + filename + ".png"
        gg.ggsave(filename=target, plot=chart)
        return
    print(chart)
def plot_update_frequency(result):
    """Plot the weekly number of changes from aggregated query results.

    :param result: iterable of records, each read as
        ``res['_id']['timestamp']`` and ``res['count']``. It is consumed in
        a single pass.
    :return: ggplot object (points plus a smoother) of changes per week.
    """
    import pandas as pd
    import ggplot

    # Turn the query results into a time series of changes.
    stamps = []
    counts = []
    for res in result:
        # Timestamp.to_datetime() was removed from pandas; to_pydatetime()
        # is the equivalent conversion.
        stamps.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        counts.append(res['count'])
    ts = pd.DataFrame(counts, index=stamps, columns=['changes'])
    # resample(..., how='sum') was removed; aggregate on the resampler instead.
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    # Plot the time series of changes.
    p = ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes'])) + \
        ggplot.geom_point(color='blue') + \
        ggplot.xlab('Period') + \
        ggplot.ylab('Changes') + \
        ggplot.geom_smooth() + \
        ggplot.ylim(low=0) + \
        ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"),
                            labels=ggplot.date_format('%Y-%m')) + \
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
def plot_cost_history(alpha, cost_history):
    """Return a point-and-line plot of ``cost_history`` against iteration number.

    ``alpha`` only appears in the title, formatted to three decimals.
    """
    iterations = range(len(cost_history))
    cost_df = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': iterations
    })
    chart = gp.ggplot(cost_df, gp.aes('Iteration', 'Cost_History'))
    chart = chart + gp.geom_point()
    chart = chart + gp.geom_line()
    title = 'Cost History for alpha = %.3f' % alpha
    return chart + gp.ggtitle(title)
def plot_roc(self, experiment_type, to_plot):
    """Save an ROC plot (TPR against FPR, one coloured line per parameter).

    The ``parameter`` column of ``to_plot`` is overwritten in place with its
    string form. The image is written to
    ``self.results_path + experiment_type + "_" + self.mode + ".png"``.
    """
    # turn this to string for categorical colour scheme
    as_text = [str(value) for value in to_plot.loc[:, "parameter"]]
    to_plot.loc[:, "parameter"] = as_text

    base = gg.ggplot(data=to_plot,
                     aesthetics=gg.aes(x="FPR", y="TPR", colour="parameter"))
    chart = base + gg.geom_line(gg.aes(x="FPR", y="TPR", colour="parameter"))
    chart = chart + gg.ggtitle(experiment_type)
    chart = chart + gg.xlab("FPR")
    chart = chart + gg.ylab("TPR")

    target = self.results_path + experiment_type + "_" + self.mode + ".png"
    gg.ggsave(filename=target, plot=chart)
    return None
def _plot_cumulative_payouts(self, include_ci=True, summary=None):
    """Line plot of the average cumulative payout per round.

    :param include_ci: when true, adds a shaded band of +/- 1.96 * Std.
    :param summary: optional pre-computed ``self.summary()`` result.
    :return: ggplot object with a line layer.
    """
    import ggplot as gg

    if summary is None:
        summary = self.summary()
    payout = summary['CumulativePayout']
    df = pd.DataFrame({'AverageCumulativePayout': payout['Avg'],
                       'Std': payout['Std'],
                       'Round': range(self.n_rounds)})

    if not include_ci:
        bare = gg.ggplot(gg.aes(x='Round', y='AverageCumulativePayout'), data=df)
        return bare + gg.geom_line()

    half_width = 1.96 * df['Std']
    df['ymin'] = df['AverageCumulativePayout'] - half_width
    df['ymax'] = df['AverageCumulativePayout'] + half_width
    mapping = gg.aes(x='Round', y='AverageCumulativePayout', ymin='ymin', ymax='ymax')
    banded = gg.ggplot(mapping, data=df) + gg.geom_area(alpha=0.5)
    return banded + gg.geom_line()
def signature_data_plot(sd):
    """Scatter ``not_exp`` against ``set_exp`` on log axes, coloured by ``pearson_r``."""
    import ggplot as gg

    mapping = gg.aes(x='set_exp', y='not_exp', color='pearson_r')
    chart = gg.ggplot(mapping, data=sd)
    chart = chart + gg.geom_point(size=15)
    chart = chart + gg.scale_color_gradient(low='yellow', high='red')
    # x axis: log scale, then explicit limits
    chart = chart + gg.scale_x_log()
    chart = chart + gg.scale_x_continuous(limits=(0.5, 10000))
    # y axis: log scale, then explicit limits
    chart = chart + gg.scale_y_log()
    chart = chart + gg.scale_y_continuous(limits=(0.05, 10000))
    return chart
def plot_deg_distrib(G):
    """Print overlaid histograms of the in- and out-degree distributions of ``G``."""
    # The third element (total degree) is returned by the helper but not plotted.
    in_deg, out_deg, _ = wa.degree_distribution(G)
    df = pd.DataFrame({
        'in_deg': pd.Series(in_deg),
        'out_deg': pd.Series(out_deg)
    })
    # Long format: one row per (variable, value) so ggplot can colour by series.
    df = pd.melt(df)
    # Fixed: the original plotted the undefined name ``df2`` instead of ``df``.
    p = gg.ggplot(gg.aes(x='value', color='variable', fill='variable'), data=df) + \
        gg.geom_histogram(alpha=0.6, binwidth=1)
    print(p)
def histogram(self, dataframe, bins=100, width=None, height=None, palette=None, title='Histogram', values=None, groups=None, legend=True):
    """Return a filled-position histogram of ``values`` grouped by ``groups``.

    ``palette`` falls back to the ``'palette'`` entry of the default options
    when not given. ``width``, ``height`` and ``legend`` are accepted but not
    used by this implementation.
    """
    if palette is None:
        palette = self.__default_options__.get('palette', None)

    chart = ggplot(dataframe, aes(x=values, fill=groups, color=groups))
    chart = chart + geom_histogram(alpha=0.6, breaks=bins, position="fill")
    chart = chart + self._palette(palette)
    chart = chart + ggtitle(title)
    y_title = "Count (%s)" % values
    return chart + scale_y_continuous(name=y_title)
def plot_weather_data(turnstile_weather):
    '''
    Plot the average hourly subway entries for each day of the week.

    turnstile_weather is the MTA turnstile/weather dataframe used in
    assignment #3; this function reads its DATEn and ENTRIESn_hourly columns.
    Other views suggested by the assignment (ridership by time of day, by
    station, exits vs. entries) are sketched as commented-out options below.

    Side effects: a 'weekday' column (0 = Monday) is added to
    turnstile_weather, and pandas' chained-assignment warning is disabled
    globally.

    ggplot documentation: https://pypi.python.org/pypi/ggplot/
    Full column list of the data set:
    https://www.dropbox.com/s/meyki2wl9xfa7yk/turnstile_data_master_with_weather.csv
    (the course server only supplies about 1/3 of the actual data).

    Returns the ggplot object.
    '''
    #Ridership by day of week - Option 1 (Entries by Day of Week)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).apply(pd.datetime.weekday)
    #plot = gg.ggplot(turnstile_weather, aes('weekday','ENTRIESn_hourly')) + ggtitle('Entries by Day of Week') + xlab('Day of Week') + ylab('Number of Entries') +gg.geom_histogram(stat = "bar", position = "stack")+ scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1), labels=["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"])

    #Ridership by day of week - Option 2 (Avg number of Entries by Day of Week)
    pd.options.mode.chained_assignment = None  # default='warn'
    # Series.dt.weekday replaces ``.apply(pd.datetime.weekday)``: the
    # ``pd.datetime`` alias no longer exists in current pandas. Both give
    # Monday=0 ... Sunday=6.
    turnstile_weather['weekday'] = pd.to_datetime(turnstile_weather['DATEn']).dt.weekday
    averageentries_on_weekday = turnstile_weather.groupby(
        'weekday', as_index=False).ENTRIESn_hourly.mean()
    averageentries_on_weekday.rename(
        columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)
    plot = gg.ggplot(averageentries_on_weekday, aes('weekday', 'avg_ENTRIESn_hourly')) + \
        ggtitle('Avg number of Entries by Day of Week') + \
        xlab('Day of Week') + \
        ylab('avg number of Entries') + \
        gg.geom_histogram(stat="bar", position="stack") + \
        scale_x_discrete(limits=(-1, 7), breaks=range(0, 7, 1),
                         labels=["Monday", "Tuesday", "Wednesday", "Thursday",
                                 "Friday", "Saturday", "Sunday"])

    #Ridership by Unit(Station) - Option 3 (Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #plot = gg.ggplot(turnstile_weather, aes('UNIT','ENTRIESn_hourly')) + ggtitle('Entries by UNIT') + xlab('UNIT') + ylab('Number of Entries') +gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 100), breaks=range(0, 100, 1))

    #Ridership by Unit(Station) - Option 4 (Avg number of Entries by UNIT)
    #pd.options.mode.chained_assignment = None  # default='warn'
    #averageentries_unit = turnstile_weather.groupby('UNIT', as_index=False).ENTRIESn_hourly.mean()
    #averageentries_unit.rename(columns={'ENTRIESn_hourly': 'avg_ENTRIESn_hourly'}, inplace=True)
    #plot = gg.ggplot(averageentries_unit, aes('UNIT','avg_ENTRIESn_hourly')) + ggtitle('Avg number of Entries by UNIT') + xlab('UNIT') + ylab('avg number of Entries') + gg.geom_histogram(stat = "bar", position = "stack") + scale_x_discrete(limits=(-1, 50), breaks=range(0, 50, 1))
    return plot
def plotAverageLatency(self):
    """Show a bar chart of the average latency of each device in ``self.data``.

    Devices are numbered from 1 in the order they appear in ``self.data``.
    """
    averages = []
    for device in self.data:
        averages.append(device.averageLatency())

    dataframe = pandas.DataFrame({
        "device": range(1, len(averages) + 1),
        "average": averages
    })
    chart = ggplot.ggplot(ggplot.aes(x="device", weight="average"), dataframe)
    chart = chart + ggplot.labs(title="Average Latency Per Device")
    chart = chart + ggplot.ylab("Average Latency (ms)")
    chart = chart + ggplot.xlab("Device Number")
    chart = chart + ggplot.geom_bar(stat="identity")
    chart.show()
def plot_weather_data(df):
    """Return a line plot of summed ``EXITSn_hourly`` per ``DATEn`` value.

    The ``DATEn`` column of ``df`` is converted to datetimes in place.
    """
    df.DATEn = pd.to_datetime(df.DATEn)
    daily = df.groupby('DATEn', as_index=False).sum()
    daily.index.name = 'DATEn'

    plot = gp.ggplot(daily, gp.aes(x='DATEn', y='EXITSn_hourly'))
    decorations = (
        gp.geom_line(),
        gp.ggtitle('Subway Ridership by Day'),
        gp.xlab('Date'),
        gp.ylab('Exits'),
    )
    for component in decorations:
        plot += component
    return plot
def plot_weather_data(df):
    """Older version: line plot of summed ``EXITSn_hourly`` per ``DATEn`` value.

    The ``DATEn`` column of ``df`` is converted to datetimes in place.
    """
    # NOTE(review): the title and axis labels talk about hours and entries,
    # while the plotted columns are DATEn and EXITSn_hourly - confirm which
    # one is intended before reusing this version.
    df.DATEn = pd.to_datetime(df.DATEn)
    by_date = df.groupby('DATEn', as_index=False).sum()
    by_date.index.name = 'DATEn'

    labels = {
        'title': 'Subway Ridership by Hour vs Raining',
        'x': 'Hour of the Day',
        'y': 'Subway Entries',
    }
    chart = gp.ggplot(by_date, gp.aes(x='DATEn', y='EXITSn_hourly'))
    chart = chart + gp.geom_line()
    chart = chart + gp.ggtitle(labels['title'])
    chart = chart + gp.xlab(labels['x'])
    chart = chart + gp.ylab(labels['y'])
    return chart
def plot(self):
    """Print a scatter plot of the data coloured by class label.

    The cluster centers are appended as extra rows labelled ``"center"``;
    ``self.data`` itself is not modified (a copy is plotted).
    """
    frame = self.data.copy()
    # Append each center at the next free integer label.
    for center in range(self.num_clusters):
        next_row = frame.shape[0]
        frame.loc[next_row] = self.cluster_centers[center]

    labels = list(self.class_label)
    labels = labels + self.num_clusters * ["center"]
    frame["class_label"] = labels

    chart = gg.ggplot(frame, gg.aes(x="x1", y="x2", colour="class_label"))
    chart = chart + gg.geom_point()
    chart = chart + gg.ggtitle("EM cluster assignments")
    print(chart)
    return None
def heatmap(self, dataframe, y=None, x=None, values=None, width=None, height=None, max_color=None, min_color=None, mid_color=None, title='Heatmap'):
    """Return a tile plot of ``values`` over ``x``/``y`` with a diverging palette.

    Any colour left as ``None`` falls back to the matching entry of the
    default options. ``height`` and ``title`` are accepted but not used by
    this implementation.
    """
    if max_color is None:
        max_color = self.__default_options__.get('max_color', None)
    if min_color is None:
        min_color = self.__default_options__.get('min_color', None)
    if mid_color is None:
        mid_color = self.__default_options__.get('mid_color', None)
    if width is None:
        # Resolved for parity with the other chart methods; not used below.
        width = self.__default_options__.get('width', None)

    palette = gradient(min_color, mid_color, max_color)
    tiles = ggplot(dataframe, aes(x=x, y=y, fill=values)) + geom_tile()
    return tiles + self._palette(palette, "div")
def lineplot(hr_year_csv):
    """Return a red point-and-line plot of home runs per year.

    :param hr_year_csv: path to a CSV with ``yearID`` and ``HR`` columns.
    :return: the ggplot object.
    """
    df = pandas.read_csv(hr_year_csv)
    # Fixed: the axis labels were swapped (x is yearID, y is HR).
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR")) +
        gp.geom_point(color="red") +
        gp.geom_line(color="red") +
        gp.ggtitle("Homeruns by Year") +
        gp.xlab("Year") +
        gp.ylab("Homeruns")
    )
    return gg
def lineplot_compare(filename):
    """Return a point-and-line plot of home runs per year, coloured by team.

    :param filename: path to a CSV with ``yearID``, ``HR`` and ``teamID`` columns.
    :return: the ggplot object.
    """
    df = pd.read_csv(filename)
    # Fixed: the axis labels were swapped (x is yearID, y is HR).
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle("Homeruns by Year by Team") +
        gp.xlab("Year") +
        gp.ylab("Homeruns")
    )
    return gg
def prob231cd_recover(initialization):
    """Reload pickled k-means results, re-save the scatter plot, summarise objectives.

    Reads ``results/prob231cd<initialization>.pkl`` (a tuple of plot frame,
    k-means call objects and number of trials), writes the matching ``.png``
    and returns the mean, standard deviation and minimum of the ``obj``
    attribute over the first ``num_trials`` calls.

    :return: dict with keys ``"mean"``, ``"sd"`` and ``"min"``.
    """
    filename = "results/prob231cd" + initialization
    # NOTE: unpickling executes arbitrary code; only load files this project wrote.
    # The context manager closes the handle, which the original left open.
    with open(filename + ".pkl", "rb") as handle:
        tuple_in = pkl.load(handle)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]

    p = gg.ggplot(prob231c_plot_df, gg.aes(x="x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot=p)

    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean": np.mean(obj), "sd": np.std(obj), "min": np.min(obj)}
    return obj_stats
def _plot_avg_accuracy(self, include_ci=True, summary=None):
    """Line plot of the average accuracy per round.

    :param include_ci: when true, adds a shaded 95% interval taken from a
        Beta(successes + 1, failures + 1) distribution, where successes is
        the average accuracy times ``self.n_sim``.
    :param summary: optional pre-computed ``self.summary()`` result.
    :return: ggplot object with a line layer.
    """
    import ggplot as gg

    if summary is None:
        summary = self.summary()
    df = pd.DataFrame({'AverageAccuracy': summary['Accuracy']['Avg'],
                       'Round': range(self.n_rounds)})

    if not include_ci:
        bare = gg.ggplot(gg.aes(x='Round', y='AverageAccuracy'), data=df)
        return bare + gg.geom_line()

    from scipy import stats
    successes = df['AverageAccuracy'] * self.n_sim
    failures = self.n_sim - successes
    bounds = stats.beta(successes + 1, failures + 1).interval(0.95)
    df['ymin'] = bounds[0]
    df['ymax'] = bounds[1]
    mapping = gg.aes(x='Round', y='AverageAccuracy', ymin='ymin', ymax='ymax')
    banded = gg.ggplot(mapping, data=df) + gg.geom_area(alpha=0.5)
    return banded + gg.geom_line()
def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None, yaxis_label=None):
    """Return a scatter plot of column ``y`` against column ``x``.

    :param color: point colour; falls back to the ``'palette'`` default option.
    :param title: plot title.
    :param xaxis_label: optional name for the x axis.
    :param yaxis_label: optional name for the y axis.
    :return: the ggplot object.

    ``width`` and ``height`` are accepted but not used by this implementation.
    """
    color = self.__default_options__.get('palette', None) if color is None else color
    width = self.__default_options__.get('width', None) if width is None else width
    gg = ggplot(dataframe, aes(x, y)) + geom_point(color=color, alpha=0.6) + ggtitle(title)
    if xaxis_label:
        gg += scale_x_continuous(name=xaxis_label)
    if yaxis_label:
        # Fixed: the y axis was being named with xaxis_label.
        gg += scale_y_continuous(name=yaxis_label)
    return gg
def plotHistogramMeans(hist, fileName):
    """Save a faceted line plot with one panel per row of ``hist``.

    :param hist: 2-D array; each row is plotted as value against column index.
    :param fileName: file name of the image, written under ``./IMG/``.
    """
    num_clust = hist.shape[0]
    # Prepend a row-ID column so each row can be faceted after melting.
    ids = np.arange(num_clust).reshape(num_clust, 1)
    histD = np.concatenate((ids, hist), axis=1)
    # list(range(...)) keeps the concatenation valid on Python 3, where
    # range is no longer a list.
    Data = pd.DataFrame(histD, columns=['ID'] + list(range(hist.shape[1])))
    Melted = pd.melt(Data, id_vars=['ID'])
    pv = ggplot.ggplot(ggplot.aes(x='variable', y='value'), data=Melted) + \
        ggplot.geom_line() + ggplot.facet_wrap("ID")
    print("Saving mean histograms")
    ggplot.ggsave(pv, './IMG/' + fileName)
def _ggplot(df, out_file):
    """Plot faceted items with ggplot wrapper on top of matplotlib.

    The ``variant.type``, ``category`` and ``caller`` columns of ``df`` are
    replaced in place by their display labels before plotting; unknown
    callers become ``None``.

    XXX Not yet functional
    """
    import ggplot as gg

    df["variant.type"] = [vtype_labels[vtype] for vtype in df["variant.type"]]
    df["category"] = [cat_labels[cat] for cat in df["category"]]
    df["caller"] = [caller_labels.get(caller) for caller in df["caller"]]

    chart = gg.ggplot(df, gg.aes(x="caller", y="value.floor"))
    chart = chart + gg.geom_bar()
    chart = chart + gg.facet_wrap("variant.type", "category")
    chart = chart + gg.theme_seaborn()
    gg.ggsave(chart, out_file)
def prob231b(initialization = "regular"):
    """Run k-means for several cluster counts and save one scatter plot per run.

    Each plot is written to ``results/k=<count>_<datestring>.png``.
    """
    fitted = []
    for num_clusters in (2, 3, 5, 10, 15, 20):
        call = KmeansCall(features_only, num_clusters, initialization)
        fitted.append(call)
        call.run_kmeans(verbose=False)

        frame = call.data.copy()
        frame["class_label"] = list(call.class_label)

        chart = gg.ggplot(frame, gg.aes(x="x1", y="x2", colour="class_label"))
        chart = chart + gg.geom_point()
        chart = chart + gg.ggtitle("Synth. data, k=" + str(num_clusters))

        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename="results/" + metadata + ".png", plot=chart)
def plot_outcomes(self, chart_title=None, use_ggplot=False):
    """ Plot the outcomes of patients observed.

    :param chart_title: optional chart title. Default is fairly verbose
    :type chart_title: str
    :param use_ggplot: True to use ggplot, else matplotlib
    :type use_ggplot: bool
    :return: a plot of patient outcomes (a ggplot object or a matplotlib
        figure), or None when no patients have been observed

    """
    if not chart_title:
        chart_title = "Each point represents a patient\nA circle indicates no toxicity, a cross toxicity"
        # NOTE(review): the source indentation was lost; the trailing newline
        # is assumed to pad only the default title - confirm.
        chart_title = chart_title + "\n"

    if not self.size() > 0:
        return None

    if use_ggplot:
        from ggplot import (ggplot, ggtitle, geom_text, aes, ylim)
        import numpy as np
        import pandas as pd
        patient_number = range(1, self.size()+1)
        symbol = np.where(self.toxicities(), 'X', 'O')
        data = pd.DataFrame({'Patient number': patient_number,
                             'Dose level': self.doses(),
                             'DLT': self.toxicities(),
                             'Symbol': symbol})
        p = ggplot(data, aes(x='Patient number', y='Dose level', label='Symbol')) \
            + ggtitle(chart_title) + geom_text(aes(size=20, vjust=-0.07)) + ylim(1, 5)
        return p

    import matplotlib.pyplot as plt
    import numpy as np
    patient_number = np.arange(1, self.size()+1)
    doses_given = np.array(self.doses())
    tox_loc = np.array(self.toxicities()).astype('bool')
    # Crosses mark toxicities, circles mark their absence.
    if sum(tox_loc):
        plt.scatter(patient_number[tox_loc], doses_given[tox_loc], marker='x', s=300,
                    facecolors='none', edgecolors='k')
    if sum(~tox_loc):
        plt.scatter(patient_number[~tox_loc], doses_given[~tox_loc], marker='o', s=300,
                    facecolors='none', edgecolors='k')

    plt.title(chart_title)
    plt.ylabel('Dose level')
    plt.xlabel('Patient number')
    plt.yticks(self.dose_levels())

    p = plt.gcf()
    # Golden-ratio aspect for the figure.
    phi = (np.sqrt(5)+1)/2.
    p.set_size_inches(12, 12/phi)
    # Fixed: this branch built the figure but returned None, contradicting
    # the documented return value and the ggplot branch.
    return p
def lineplot_compare(filename):  # Cleaner version with string vars
    """Return a point-and-line plot of home runs per year, coloured by team.

    :param filename: path to a CSV with ``yearID``, ``HR`` and ``teamID`` columns.
    :return: the ggplot object.
    """
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    # Fixed: the axis labels were swapped (x is yearID, y is HR).
    p_xlab = "Year"
    p_ylab = "Homeruns"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID")) +
        gp.geom_point() +
        gp.geom_line() +
        gp.ggtitle(p_title) +
        gp.xlab(p_xlab) +
        gp.ylab(p_ylab)
    )
    return gg
def plot(self, inputs):
    """Scatter-plot ``inputs.yvar`` against ``inputs.xvar`` for ``inputs.year``.

    Returns ``None`` when the year is absent from the data or either variable
    is not a column. Point labels and a red linear fit are added when
    ``inputs.shownames`` / ``inputs.linear`` are set.
    """
    year_missing = inputs.year not in self.dat.Year.values
    if year_missing or not (inputs.xvar in self.dat and inputs.yvar in self.dat):
        return None

    selected = self.dat[self.dat.Year == inputs.year]
    chart = ggplot(selected, aes(x=inputs.xvar, y=inputs.yvar))

    layers = [geom_point()]
    if inputs.shownames:
        layers.append(geom_text(aes(label=self.ID_col), vjust=1, hjust=1))
    if inputs.linear:
        layers.append(stat_smooth(color="red", method="lm"))
    for layer in layers:
        chart = chart + layer
    return chart
def main(log):
    """Record the per-chunk peak of the audio input until interrupted.

    Opens a PyAudio input stream, appends the maximum sample of every chunk
    to a list and, on Ctrl-C, closes the stream, builds a line plot of the
    recorded peaks, drops into the debugger and exits with status 1.

    :param log: logger-like object providing ``debug``.
    """
    log.debug('initializing app')
    p = pyaudio.PyAudio()

    # Open audio input stream
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=SAMPLE_RATE,
                    input=True,
                    frames_per_buffer=CHUNK_SIZE)
    log.debug('opened stream <{}>'.format(stream))
    log.debug('reading audio input at rate <{}>'.format(SAMPLE_RATE))

    recorded = []

    # Start mainloop
    loops = 0
    while True:
        loops += 1
        if loops % 25 == 0:
            log.debug('recorded <{}> loops'.format(loops))

        # Decode chunks of audio data from the stream
        try:
            data = stream.read(CHUNK_SIZE)
            # np.fromstring on binary data is deprecated; frombuffer reads
            # the same float32 samples without copying.
            decoded = np.frombuffer(data, dtype=np.float32)
            recorded.append(decoded.max())

        # On <C-c>, plot max of recorded data
        except KeyboardInterrupt:
            log.debug('closing stream and ending PyAudio')
            stream.close()
            p.terminate()
            df = pd.DataFrame(columns=['mx', 'time'])
            df['mx'] = recorded
            df['time'] = range(len(recorded))
            plt = ggplot.ggplot(ggplot.aes(x='time', y='mx'), data=df) + \
                ggplot.geom_line()
            # Debugger stop kept from the original so `plt` and `df` can be
            # inspected interactively.
            pdb.set_trace()
            # NOTE(review): source indentation was lost; the exit is assumed
            # to belong to the interrupt handler - confirm.
            log.debug('quitting')
            sys.exit(1)
def prob231g():
    """Run EM with three clusters and save its diagnostics.

    Writes the log-likelihood trace (``results/prob231g_loglike.png``), the
    cluster-assignment scatter plot (``results/prob231g_clusters.png``) and
    the pickled EM object (``results/prob231g_a.pkl``).
    """
    filename = "results/prob231g"
    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x="x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot=p)

    # Close (and flush) the pickle file deterministically; the original
    # handed an anonymous open() handle to pkl.dump and never closed it.
    with open(filename + "_a.pkl", "wb") as handle:
        pkl.dump(obj=emcall, file=handle)
    print("Done with 231g.")
    return
# Fragment of a larger routine: `time`, `input_par`, `s`, `plot_prob`,
# `collect_prob`, `row_idx` and `plot_num` are defined outside this view.
# NOTE(review): the source indentation was lost; the "collect" and "plot"
# steps are assumed to run once after the loop over time points - confirm.

# For every time point, store the two probabilities returned by get_weibull
# in consecutive rows of plot_prob, tagged 'FS' and 'PK' respectively.
for t_float in time:
    tp_FS, tp_PK = get_weibull(t=t_float, coverage=input_par['uptake'], duration=input_par['duration'], shape=s)
    plot_prob.loc[row_idx, 'Monthly transition probability'] = tp_FS
    plot_prob.loc[row_idx + 1, 'Monthly transition probability'] = tp_PK
    plot_prob.loc[row_idx, 'time'] = t_float
    plot_prob.loc[row_idx + 1, 'time'] = t_float
    plot_prob.loc[row_idx, 'Formula'] = 'FS'
    plot_prob.loc[row_idx + 1, 'Formula'] = 'PK'
    row_idx += 2  # two rows were written per time point
# collect: keep each formula's probabilities under a key naming the shape
collect_prob['FS ' + str(s)] = plot_prob.loc[
    plot_prob.loc[:, 'Formula'] == 'FS', 'Monthly transition probability'].values
collect_prob['PK ' + str(s)] = plot_prob.loc[
    plot_prob.loc[:, 'Formula'] == 'PK', 'Monthly transition probability'].values
# plot: one line per formula, saved under a running plot number
x = ggplot(aes(
    x='time', y='Monthly transition probability', color='Formula'), data=plot_prob) + geom_line()
#name = r'Shape: ' + str(s)# + r', Coverage/Uptake = ' + str(input_par['uptake']*100) + r', Coverage time = ' + str(input_par['duration']) + '.jpg'
x.save('Weibull' + str(plot_num))
plot_num += 1
def second(dataframe):
    """Show an xkcd-themed bar chart of the ``Speed`` column of ``dataframe``."""
    chart = ggplot.ggplot(ggplot.aes(x='Speed'), data=dataframe)
    chart = chart + ggplot.geom_bar(color='lightblue')
    chart = chart + ggplot.ggtitle("Frequencies of Speeds Among Interfaces")
    chart = chart + ggplot.theme_xkcd()
    chart.show()
# Text annotations
plt.plot([1, 2, 3])
plt.text(1, 2, r'$\mu=100, \sigma=15$')
plt.show()

plt.plot([1, 2, 3])
plt.annotate('test', xy=(1, 2), xytext=(1.5, 2.2), arrowprops=dict(facecolor='black', shrink=0.05))
plt.show()

# ggplot example
import ggplot as gg
# Fixed: aes/geom_point are reached through the `gg` alias; the bare names
# are not bound by `import ggplot as gg`.
p = gg.ggplot(df2, gg.aes(x='MSales', y='Comments'))
p = p + gg.geom_point()
print(p)

# 2. Visualisation and data analysis
# Simple data analysis
import numpy as np
# Rows for the two places of interest, stacked into one frame.
df3 = df1[df1['Place'] == '广东 广州'].append(df1[df1['Place'] == '浙江 杭州'])

np.mean(df3['MSales'])  # mean of MSales
np.std(df3['MSales'])  # standard deviation of MSales
np.median(df3['MSales'])  # median of MSales
np.percentile(df3['MSales'], 25)  # 25th percentile of MSales
df3.groupby(by='Place').count()  # group by Place and count rows
df3.groupby(by='Place').mean()  # group by Place and take the mean
def plot(self, inp1, inp2, inp3=None):
    """Print a scatter plot of ``self.data``: ``inp2`` against ``inp1``, coloured by ``inp3``."""
    mapping = gg.aes(x=inp1, y=inp2, color=inp3)
    chart = gg.ggplot(mapping, data=self.data)
    chart = chart + gg.geom_point()
    print(chart)
m=1, alpha=alpha))
# (The line above is the tail of a call that starts outside this view.)

#save results to results folder, with plot and printing to screen.
# The timestamp plus test-mode flag names both the pickle and the image.
metadata = datetime.datetime.now().strftime(
    "%Y-%m-%d %H:%M:%S") + "test_mode==" + str(test_mode)
# NOTE(review): the handle is opened in text mode and never closed in the
# visible code - confirm whether a later close exists.
f = open("results/LSH_vs_KDT_%s.pkl" % metadata, mode='w')
pkl.dump(obj=results, file=f)

# Flatten the result objects into columns for plotting.
logtimes = [math.log(r.avg_time, 2) for r in results]  # base-2 log of avg_time
distances = [r.avg_distance for r in results]
methods = [r.method[0:3] for r in results]  # first three characters as label
alpha = [r.alpha for r in results]
m = [r.m for r in results]
results_df = pd.DataFrame(
    data={
        "logtimes": logtimes,
        "distances": distances,
        "methods": methods,
        "m": m,
        "alpha": alpha
    })
print results_df
# Text plot: each result is drawn as its method label.
p = gg.ggplot(data = results_df, aesthetics = gg.aes(x = "logtimes", y = "distances", label = "methods")) + \
    gg.geom_text() + \
    gg.ggtitle("LSH and KD trees: tradeoffs") + \
    gg.xlab("Log2 average query time ") + gg.ylab("Average L2 distance from query point)")
gg.ggsave(filename="results/LSH_vs_KDT_%s.png" % metadata, plot=p)
def plot_components(df, title, file_loc, experiment_number, dataset_name):
    """Save a scatter plot of ``comp-two`` against ``comp-one`` coloured by ``label``.

    The image goes to
    ``images/experiment_<experiment_number>/<dataset_name>/<file_loc>/<title>.png``.
    """
    chart = ggplot(df, aes(x='comp-one', y='comp-two', color='label'))
    chart = chart + geom_point(size=75, alpha=0.8)
    chart = chart + ggtitle(title)

    target = "/".join([
        "images/experiment_" + str(experiment_number),
        dataset_name,
        file_loc,
        title + ".png",
    ])
    chart.save(target)
# Fragment: `data`, `demo` and the ggplot names are defined outside this view.
# NOTE(review): the closing ''' at the end suggests this fragment may sit
# inside or next to a triple-quoted block - confirm before relying on it.

# Report means and standard deviations of profit and of the "active" indicator.
f = '\t{:18s} = {:5.2f}'
print('\nErgodic Means')
print(f.format('Profit Contribution', data['profit'].mean()))
print(f.format('Activity', (data['i'] == 'active').mean()))
print('\nErgodic Standard Deviations\n')
print(f.format('Profit Contribution', data['profit'].std()))
print(f.format('Activity', (data['i'] == 'active').std()))
# Plot Simulated and Expected Continuous State Path
# data2: mean profit per time value, with 'time' restored as a column.
data2 = data[['time', 'profit']].groupby('time').mean()
data2['time'] = data2.index
print(data2)
print(data2.columns)
# Mean path only.
ppp = ggplot(aes('time','profit'), data=data2) + \
    geom_line()
print(ppp)
# Rows with _rep below 3 as points, with the mean path overlaid.
ppp = ggplot(aes('time','profit','_rep'), data=data[data._rep <3]) + \
    geom_point() + \
    geom_line(aes('time','profit'), data=data2)
print(ppp)
print(
    demo.qplot('time', 'profit', '_rep', data=data[data._rep < 3], geom='line') +
    geom_line(aes('time', 'profit'), data=data2))
'''
import ggplot as gg
import ultrasignup as us
import numpy as np


def _finishing_time_chart(results, distance, title):
    """Dodged half-hour histogram of finishing times over one hour, faceted by gender."""
    finishers = results[(results.distance == distance) & (results.time_hour > 1.0)]
    chart = gg.ggplot(gg.aes(x='time_hour', fill='gender'), finishers)
    chart = chart + gg.facet_grid(x='gender')
    chart = chart + gg.geom_bar(stat="bin", binwidth=.5, position="dodge", colour="black")
    chart = chart + gg.xlab("Time (hours)")
    chart = chart + gg.ylab("Number of Finishers")
    return chart + gg.ggtitle(title)


# Results of event 299, one chart per race distance.
d = us.event_results(299)
p1 = _finishing_time_chart(d, '50K', "50K Finishing Times for All Years")
p2 = _finishing_time_chart(d, '11 Miler', "11M Finishing Times for All Years")
# Count the number of analyzer tokens in each training subject.
count_vect = CountVectorizer()
kk = count_vect.fit_transform(subjects_train)  # fitted matrix is not used below
analyze = count_vect.build_analyzer()
subjects_words_count = subjects_train.apply(lambda x: len(analyze(x)))
print(subjects_words_count.describe())

#%%
# Distribution of the per-subject word counts.
import ggplot as gg

df = pd.DataFrame(subjects_words_count, columns = ["count"])
hist = gg.ggplot(df, gg.aes(x = "count"))
# Labels are combined with each other first and then added to the plot.
hist += gg.xlab("# of words") +\
        gg.ylab("Frequency") +\
        gg.ggtitle("Frequency of words")
hist += gg.geom_vline(x = df.mean(), color="red")  # mean marker
hist += gg.geom_vline(x = df.median(), color="blue")  # median marker
hist += gg.geom_density(color="green")
hist += gg.geom_histogram(binwidth=1, color="grey")
hist  # bare expression: displays the plot when run as an interactive cell

#%%
# 1st attempt to classify subjects per tag
# Benchmark every (method, model, rtol) combination against a gold standard.
# `methods`, `models`, `rtols` and `gold_standards` are defined outside this view.
data = []
for method in methods:
    for model in models:
        for rtol in rtols:
            print('method: {} model: {} rtol: {}'.format(
                method.name, model.name, rtol), end='')
            # Run
            tic = time.time()
            result = method(model, rtol)
            toc = time.time() - tic  # elapsed wall-clock seconds
            # Compare to gold standard
            standard = gold_standards[model.name]
            diff = result - standard.values
            # NOTE(review): `standard.max` is used without a call; presumably
            # an attribute holding the scale of the reference - confirm.
            max_rel_diff = np.max(diff / standard.max)
            # Append to table
            record = (method.name, model.name, rtol, max_rel_diff, toc)
            print(' err: {} toc: {}'.format(max_rel_diff, toc))
            data.append(record)
data = DataFrame(data, columns=['method', 'model', 'rtol', 'err', 'time'])
# Error against run time on log-log axes, one colour per method.
print(
    gg.ggplot(data, gg.aes(x='err', y='time', color='method')) +
    gg.geom_point(size=60.0) + gg.geom_line() + gg.scale_x_log() +
    gg.scale_y_log() + gg.xlim(1e-10, 1e-2))
def ggplot_img(xt):
    """Print a black line plot of the sequence ``xt`` against its position index."""
    positions = range(len(xt))
    frame = pd.DataFrame({'n': positions, 'xt': xt})
    chart = gp.ggplot(gp.aes(x='n', y='xt'), data=frame)
    chart = chart + gp.geom_line(color='black')
    print(chart)
def plot_after_transmission_results(data, path_names):
    """Plot life months per scenario against each varied transmission input.

    Reads the varied input values from
    ``transmission_rate_multiplier_required_inputs.xlsx`` under
    ``path_names['transmission']``, pairs them with the ``LMs_`` and
    ``RUN_NAME_`` columns of ``data[var]['popstats']`` for every key of
    ``data`` containing 'HIV+', 'HIV-' or 'Incidence', and saves a faceted
    line plot below ``Input files/Plots for final runs``.

    :param data: mapping whose values carry a ``'popstats'`` frame.
    :param path_names: mapping with a ``'transmission'`` directory path.
    """
    # import input data for transmission analysis
    var_and_val = pd.DataFrame(columns=['x', 'Variable'], index=range(0, 12))
    plot_lm = pd.DataFrame(
        columns=['x', 'Life Months', 'Scenario', 'Variable'],
        index=range(0, 24))
    data_in = pd.read_excel(
        os.path.join(path_names['transmission'], 'Input files',
                     'transmission_rate_multiplier_required_inputs.xlsx'))
    # Spreadsheet column names, their short labels, and the value each
    # column is compared against to find the first varied row.
    col = [
        'Yearly incidence in MSM',
        'Number of HIV uninfected individuals (HRG size)',
        'Number of HIV infected individuals in primary cohort at t=0'
    ]
    col_adj = ['Incidence', 'Uninfected', 'Infected']
    base_val = [0.009, 2960000, 136400]
    for i in range(len(col)):
        # First row whose value differs from the base value.
        idx = data_in.loc[data_in.loc[:, col[i]] != base_val[i],
                          col[i]].index.values[0]
        # .loc slices are label-based and inclusive: four rows starting at
        # idx are copied into the rows starting at idx - 1.
        var_and_val.loc[idx - 1:idx + 3 - 1, 'x'] = data_in.loc[idx:idx + 3,
                                                                col[i]].values
        var_and_val.loc[idx - 1:idx + 3 - 1, 'Variable'] = col_adj[i]

    row_idx = -2
    # Per-variable counters: [Incidence, Uninfected, Infected].
    var_idx = [-1, -1, -1]
    for var in data:
        if 'HIV+' in var:
            var_idx[2] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[2], 'x'].values[var_idx[2]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[2], 'Variable'].values[var_idx[2]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values
        elif 'HIV-' in var:
            var_idx[1] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[1], 'x'].values[var_idx[1]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[1], 'Variable'].values[var_idx[1]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values
        elif 'Incidence' in var:
            var_idx[0] += 1
            plot_lm.loc[row_idx:row_idx + 1, 'x'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[0], 'x'].values[var_idx[0]]
            plot_lm.loc[row_idx:row_idx + 1, 'Variable'] = var_and_val.loc[
                var_and_val['Variable'] == col_adj[0], 'Variable'].values[var_idx[0]]
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Life Months'] = data[var]['popstats'].loc[:, 'LMs_'].values
            plot_lm.loc[
                row_idx:row_idx + 1,
                'Scenario'] = data[var]['popstats'].loc[:, 'RUN_NAME_'].values
        # NOTE(review): source indentation was lost; the increment is assumed
        # to run once per key. With row_idx starting at -2 the first key then
        # targets labels -2..-1, which are outside the 0..23 index - confirm
        # the intended placement.
        row_idx += 2

    # plot
    save_path = os.path.join(path_names['transmission'], r'Input files',
                             r'Plots for final runs')
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # NOTE(review): the output name 'Comparison of ' ends with a space and
    # looks unfinished - confirm.
    (ggplot(aes(x='x', y='Life Months', color='Scenario'), plot_lm) +
     geom_line() + facet_wrap('Variable', scales='free')).save(
         os.path.join(save_path, 'Comparison of '))

    return
def plot_transmission_results(tx_results, percentage_decline, save_path,
                              path_names):
    """Save the percentage-decline and monthly-transmission plots.

    Args:
        tx_results: mapping keyed by result-folder name. Keys are scanned for
            the markers ``'HIV+'``, ``'HIV-'`` and ``'Incidence'``; keys with
            none of them are ignored. Each matching value must provide
            ``['monthly'][run]['transmissions']`` for runs whose names
            contain ``'RunA'``, ``'RunB'`` or ``'RunC'``. The mapping is
            deep-copied before use.
        percentage_decline: mapping with the same kind of keys, holding the
            percentage-decline value of each folder.
        save_path: directory the images are written to.
        path_names: mapping with a ``'transmission'`` directory path.

    Returns:
        None. All plots are written to ``save_path``.
    """
    tx_data = deepcopy(tx_results)
    t = 120  # number of monthly points kept per run
    total_var = 3
    total_val = 4
    run_col = ['RunA tx', 'RunB tx', 'RunC tx']
    # Markers are tested in this order, exactly as the former if/elif
    # chains; the number indexes `col` / `col_adj` below.
    markers = (('HIV+', 2), ('HIV-', 1), ('Incidence', 0))

    def classify(key):
        # Index of the variable a folder name refers to, or None.
        return next((k for marker, k in markers if marker in key), None)

    #%% plot percentage decline
    data_plot = pd.DataFrame(
        columns=['x', 'Percentage decline', 'Transmissions', 'Variable'],
        index=range(0, total_var * total_val))
    data_in = pd.read_excel(
        os.path.join(path_names['transmission'], 'Input files',
                     'transmission_rate_multiplier_required_inputs.xlsx'))
    col = [
        'Incidence rate per 100 PY specific to high-risk group 1',
        'HIV uninfected individuals in high-risk group 1',
        'HIV infected individuals in high-risk group 1'
    ]
    col_adj = ['Incidence', 'Uninfected', 'Infected']
    # Rounded so the comparison with the base value below is exact.
    data_in[col[0]] = data_in[col[0]].round(1)
    base_val = [np.float64(0.9), 2960000, 136400]

    y1_values = {col[0]: [], col[1]: [], col[2]: []}
    for var in percentage_decline:
        which = classify(var)
        if which is not None:
            y1_values[col[which]].append(percentage_decline[var])

    for name, label, base in zip(col, col_adj, base_val):
        # First row where this input departs from its base value; that row
        # and the three after it are copied (.loc slices are inclusive).
        idx = data_in.loc[data_in.loc[:, name] != base,
                          name].index.values[0]
        data_plot.loc[idx - 1:idx + 3 - 1,
                      'x'] = data_in.loc[idx:idx + 3, name].values
        data_plot.loc[idx - 1:idx + 3 - 1, 'Variable'] = label
        data_plot.loc[idx - 1:idx + 3 - 1,
                      'Percentage decline'] = y1_values[name]

    # Only declines of at most 200 are drawn.
    shown = data_plot.loc[data_plot.loc[:, 'Percentage decline'] <= 200, :]
    (ggplot(aes(x='x', y='Percentage decline'), shown) + geom_line() +
     facet_wrap('Variable', scales='free')).save(
         os.path.join(save_path, 'Percentage decline'))

    #%% visualizing transmissions
    # index = range(time * number of values per variable * number of variables)
    data_plot_tx = pd.DataFrame(
        index=range(t * total_var * total_val),
        columns=['Variable', 'Value', 'RunA tx', 'RunB tx', 'RunC tx'])

    def set_abc(var, run, block_no, var_name, var_value_idx):
        # Fill the t-row block `block_no` with the variable name, the input
        # value and the first t transmissions of `run`.
        first = block_no * t
        last = first + t - 1  # .loc slices are label-based and inclusive
        data_plot_tx.loc[first:last, 'Variable'] = var_name
        data_plot_tx.loc[first:last, 'Value'] = data_plot.loc[
            data_plot.loc[:, 'Variable'] == var_name,
            'x'].values[var_value_idx]
        for tag in ('RunA', 'RunB', 'RunC'):
            if tag in run:
                data_plot_tx.loc[first:last, tag + ' tx'] = tx_data[var][
                    'monthly'][run]['transmissions'].iloc[0:t].values
                break

    # NOTE(review): blocks used to be placed at (position of key - 1) * t,
    # which only lands on row 0 when exactly one non-matching key comes first
    # in the mapping. Counting matched keys gives the same layout in that
    # case and no longer depends on dictionary order.
    block_no = 0
    var_val_idx = [-1, -1, -1]
    for var in tx_data:
        which = classify(var)
        if which is None:
            continue
        var_val_idx[which] += 1
        for run in tx_data[var]['monthly']:
            set_abc(var, run, block_no, col_adj[which], var_val_idx[which])
        block_no += 1

    # Month counter 0..t-1 repeated for every block of t rows.
    data_plot_tx['t'] = np.arange(len(data_plot_tx), dtype='int64') % t

    #%% plots for individual runs
    per_variable = [
        (data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == name, :], suffix)
        for name, suffix in (
            ('Incidence', r'_plots for individual values of incidence'),
            ('Infected',
             r'_plots for individual values of infected population'),
            ('Uninfected',
             '_plots for individual values of uninfected population'),
        )
    ]
    for i in run_col:
        (ggplot(aes(x='t', y=i, color='Value'), data_plot_tx) + geom_line() +
         facet_wrap('Variable', scales='free')).save(
             os.path.join(
                 save_path,
                 i + r'_transmissions for all variable all values'))
        for subset, suffix in per_variable:
            (ggplot(aes(x='t', y=i), subset) + geom_line() +
             facet_wrap('Variable', 'Value', scales='free')).save(
                 os.path.join(save_path, i + suffix))

    #%% compare runs ABC
    data_plot_abc = {}
    for var in col_adj:
        float_df = pd.DataFrame(index=range(0, t * total_var * total_val),
                                columns=['t', 'Value', 'Transmissions', 'Run'])
        var_df = data_plot_tx.loc[data_plot_tx.loc[:, 'Variable'] == var, :]
        var_df = var_df.reset_index(drop=True)
        insert_idx = 0
        for val in data_plot.loc[data_plot.loc[:, 'Variable'] == var, 'x']:
            var_val_df = var_df.loc[var_df.loc[:, 'Value'] == val, :]
            var_val_df = var_val_df.reset_index(drop=True)
            for c in run_col:
                first = insert_idx * t
                last = first + t - 1
                float_df.loc[first:last, 'Run'] = c
                float_df.loc[first:last,
                             'Transmissions'] = var_val_df.loc[:, c].values
                float_df.loc[first:last, 'Value'] = val
                float_df.loc[first:last, 't'] = np.arange(t)
                insert_idx += 1
        # The stored copy drops unfilled rows; the ABC plot below is drawn
        # from the unfiltered frame, as before.
        data_plot_abc[var] = float_df.dropna()
        (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) +
         geom_line() + facet_wrap('Value', scales='free') +
         ggtitle(var)).save(
             os.path.join(
                 save_path,
                 var + '_comparison of transmissions in runs ABC'))

    #%% compare runs BC
    for var in data_plot_abc:
        float_df = data_plot_abc[var].loc[
            data_plot_abc[var].loc[:, 'Run'] != 'RunA tx', :]
        (ggplot(aes(x='t', y='Transmissions', color='Run'), float_df) +
         geom_line(alpha=0.2) + facet_wrap('Value', scales='free') +
         stat_smooth(method='loess', se=False) + ggtitle(var)).save(
             os.path.join(save_path,
                          var + '_comparison of transmissions in runs BC'))
    return
"""Plot target variable as time series.""" import get_data from ggplot import aes, geom_line, facet_wrap, ggplot if __name__ == "__main__": df = get_data.get_all_data() p = ggplot(df, aes('datetime', 'cap', group='date')) + \ geom_line(alpha=0.2) + \ facet_wrap('name') p.save('../output/time_series.pdf')
from plot import plot_trace_points, plot_trace_path
from preprocess import smooth_spline, smooth_regress
from kalman import kalman_filter
import pandas as pd
from ggplot import ggplot, aes, geom_point, geom_line

if __name__ == '__main__':
    # Small hand-made trace used to compare the smoothing routines.
    df = pd.DataFrame(
        data={
            't': [1, 2, 3, 4, 5],
            'lat': [10, 12, 10, 9, 8],
            'lon': [100, 99, 98, 95, 97]
        })
    df2 = smooth_spline(df, 0.1)
    df3 = smooth_regress(df, 0.1, 4)
    #df4 = smooth_regress(df, 0.1, 3)
    df4 = kalman_filter(df, 0.1)

    # Start from an empty lat/lon canvas: raw points first, then one
    # coloured path per smoothed trace.
    empty = pd.DataFrame(columns=('lat', 'lon'), data={})
    p = ggplot(aes(x='lat', y='lon'), data=empty)
    p += plot_trace_points(df, color='black')
    for smoothed, colour in ((df2, 'red'), (df3, 'green'), (df4, 'blue')):
        p += plot_trace_path(smoothed, color=colour)
    p.save('test.png')
(vcfdf['TestBias']=='Pass') & (vcfdf['CHROM']==reference) ]['Pi'])) return testwindows # Generate new dataframe with analyses performed per window if options.graphics == True: print "Analysing by "+ str(windowsize) +"sliding windows and generating plots" windowed_df = pd.DataFrame({'window':sorted(list(set(vcfdf['window']))), 'MaxMinor':windowMax(sorted(list(set(vcfdf['window'])))), 'Pi':windowPi(sorted(list(set(vcfdf['window']))))}) # Now try and plot graph p_MaxMinor = gg.ggplot(gg.aes('window', 'MaxMinor'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Minor Variant Frequency (%)") +gg.ggtitle(vcfoutput + "\n Valid Minor Variant Sites :" + str(len(minorvar))) # Plot Nucleotide Diversity (Pi) along genome p_pi =gg.ggplot(gg.aes('window', 'Pi'),data=windowed_df) +gg.geom_point() +gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")", y="Mean nucleotide diversity (" + u"\u03c0" +")") +gg.scale_y_continuous(expand=(0,0),limits=(0, windowed_df['Pi'].max(axis=0)+0.001)) +gg.ggtitle(vcfoutput + "\n Genome-wide Mean Nucleotide Diversity (" +u"\u03c0"+ ") :" +str(round(gw_Pi,6))) #p_pi # Facetted plot (still not sorted y axes labels yet) windowed_df_melt = pd.melt(windowed_df, id_vars=['window']) p_combi = gg.ggplot(gg.aes('window', 'value',colour='variable'),data=windowed_df_melt) p_combi = p_combi + gg.geom_point(colour='variable') + gg.facet_grid('variable',scales='free_y')+gg.theme_bw() +gg.labs(x="Genome Position (bp; windowsize="+ str(windowsize) +")") # Print graphs to .png p_combi.save(vcfinput + ".MinorVar_combo.png") p_MaxMinor.save(vcfinput + ".MinorVar.png")
(176, 208)).astype(float)) plt.tight_layout() plt.show() # In[206]: n_sne = 7000 tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne], feat_cols].values) # In[207]: df_tsne = df.loc[rndperm[:n_sne], :].copy() df_tsne['x-tsne'] = tsne_results[:, 0] df_tsne['y-tsne'] = tsne_results[:, 1] view = ggplot.ggplot(df_tsne, aes( x='x-tsne', y='y-tsne', color='label')) + geom_point( size=70, alpha=0.2) + ggtitle("tSNE dimensions colored by digit") # In[208]: view # In[209]: # --logdir=/Users/glynisttheisen/Desktop/Final # # PCA # In[210]: m_PCA = PCA(n_components=10)
index=range(t * len(count_tops), t * len(count_tops) + len(count_tops))) probs_list.append(probs_t) # Calculate KL divergences kl_mle_list.append(stats.entropy(true_bins_t, mle_probs_vals)) kl_nn_list.append(stats.entropy(true_bins_t, nn_probs_t)) probs = pd.concat(probs_list) # In[44]: probs_tail = probs[probs.Tenor > 360] gg.ggplot(probs_tail, gg.aes(x='Count Top', weight='Probs True') ) + gg.facet_grid('Tenor') + gg.geom_bar() + gg.geom_step( gg.aes(y='Probs MLE', color='red')) + gg.geom_step( gg.aes(y='Probs NN', color='blue')) + gg.scale_x_continuous( limits=(0, len(count_tops))) # In[57]: # KL divergences kl_df = pd.DataFrame({ 'Tenor': range(0, t_end + 1), 'KL MLE': kl_mle_list, 'KL NN': kl_nn_list }) print kl_df.head() print kl_df.tail()
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())

# Sentiment experiment on a single review: the index given to iloc picks
# which review is used, counting from 0.
#text = df.comments.iloc[0]
#s = SnowNLP(text)
#
#print(s.sentiments)


def get_sentiment_cn(text):
    """Return the SnowNLP sentiment value of ``text``."""
    return SnowNLP(text).sentiments


df["sentiment"] = df.comments.apply(get_sentiment_cn)

print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)

print("#######################################")
print("重要信息")
print("所有影评的平均值为:", df.sentiment.mean())
print("所有影评的中位数为:", df.sentiment.median())

# Sentiment over time: points joined by a blue line, dates shown as Y-m-d.
(ggplot.ggplot(ggplot.aes(x="date", y="sentiment"), data=df)
 + ggplot.geom_point()
 + ggplot.geom_line(color='blue')
 + ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d")))

# The five rows with the lowest sentiment.
df.sort_values(['sentiment']).head(5)
#%% PC Regression
# Cross-validated linear regression on the first ten columns of X95.
lin_reg = LinearRegression()
scores = cross_val_score(lin_reg, X95[:, :10], Y)
scores.mean()

#%% Partial Least Squares
from sklearn.cross_decomposition import PLSRegression
pls = PLSRegression(n_components=10)
Xpls, Ypls = pls.fit_transform(X, Y)

#%% Visualization with labeling
import ggplot as gg
# First two columns of Xpca, coloured by 'has_cites'.
df1['x1'] = Xpca[:, 0]
df1['x2'] = Xpca[:, 1]
chart = (gg.ggplot(df1, gg.aes(x='x1', y='x2', color='has_cites'))
         + gg.geom_point(size=10, alpha=.8))
chart.show()

#%% PLS transformation
# Same chart, drawn from the first two columns of Xpls.
df1['x1'] = Xpls[:, 0]
df1['x2'] = Xpls[:, 1]
chart = (gg.ggplot(df1, gg.aes(x='x1', y='x2', color='has_cites'))
         + gg.geom_point(size=10, alpha=.8))
chart.show()

#%% Feature Selection with Elastic Net
scaler = StandardScaler()
Xscale = scaler.fit_transform(X)
from sklearn.linear_model import ElasticNet
enet_reg = ElasticNet(alpha=.1, l1_ratio=.5)
enet_reg.fit(Xscale, Y)
# Coefficients the elastic net did not shrink to exactly zero.
nonzero = enet_reg.coef_ != 0
print(nonzero.sum(),'non-zero of',len(enet_reg.coef_),'coefficients.')
# The t-SNE embedding is loaded from disk; it was produced once with:
# time_start = time.time()
# tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
# tsne_pca_results = tsne.fit_transform(pca_result)
# np.save('tsne_pca_results.npy', tsne_pca_results)
# print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
tsne_pca_results = np.load('tsne_pca_results.npy')
print(tsne_pca_results.shape)

# The t-SNE frame is copied before the outlier filter below is applied.
df_tsne = df.copy()
df_tsne['x-tsne-pca'] = tsne_pca_results[:, 0]
df_tsne['y-tsne-pca'] = tsne_pca_results[:, 1]

# Keep rows inside the window -100..100 on pca-one and -50..50 on pca-two
# (alternative considered: mean +/- one standard deviation per component).
inside_window = (
    (df['pca-one'] <= 100)
    & (df['pca-two'] <= 50)
    & (df['pca-one'] >= -100)
    & (df['pca-two'] >= -50)
)
df = df[inside_window]
print('Size of the dataframe after outlier removal: {}'.format(df.shape))

pca_chart = (
    ggplot(df.loc[:, :], aes(x='pca-one', y='pca-two', color='label'))
    + geom_point(size=75, alpha=0.8)
    + ggtitle("First and Second Principal Components colored by digit")
)
pca_chart.save('chart_pca_unet_adv_4.png', dpi=1080)

tsne_chart = (
    ggplot(df_tsne, aes(x='x-tsne-pca', y='y-tsne-pca', color='name'))
    + geom_point(size=70, alpha=0.1)
    + ggtitle("tSNE dimensions colored by Digit (PCA)")
)
tsne_chart.save('chart_tsne_unet_adv_4.png', dpi=1080)
from ggplot import aes, diamonds, geom_density, ggplot
import matplotlib.pyplot as plt
from bokeh import mpl
from bokeh.plotting import output_file, show

# Density of 'price', one colour per 'cut', from the bundled diamonds data.
mapping = aes(x='price', color='cut')
g = ggplot(diamonds, mapping) + geom_density()
g.draw()

plt.title("Density ggplot-based plot in Bokeh.")

output_file("ggplot_density.html", title="ggplot_density.py example")

# Convert the matplotlib figure and display it.
converted = mpl.to_bokeh()
show(converted)
# -*- coding: utf-8 -*-
from ggplot import ggplot, aes, geom_point, geom_line, ggtitle, xlab, ylab

# NOTE(review): `data` is an empty placeholder and `xvar`/`yvar` are not used
# by the plot below, which maps the 'yearID' and 'HR' columns -- presumably a
# DataFrame with those columns is meant to be supplied here; confirm.
data = []
xvar = 'X'
yvar = 'Y'

# Red points joined by a red line. A single parenthesised argument makes
# this print valid on both Python 2 and Python 3.
print(ggplot(data, aes(x='yearID', y='HR')) +
      geom_point(color='red') +
      geom_line(color='red') +
      ggtitle('Number of HR by year') +
      xlab('Year') +
      ylab('Number of HR'))
where ULEZ = true and no2_ppb <> -999 ) ) group by pod_id_location """) qry_job = bqclient.query(qry_str, location='EU', job_config=job_config) #save result as dataframe df = qry_job.to_dataframe() df_long = df.melt(id_vars=['pod_str', 'pod_idx'], value_vars=['p05', 'p25', 'med', 'p75', 'p95'], var_name='yparam', value_name='value') #plots #plt1 = gg.ggplot(df, gg.aes(x='date_UTC',y='no2_ppb'))+gg.geom_line()+gg.xlab('Time')+gg.ylab('NO2 (ppb)')+gg.theme_bw()+gg.facet_wrap('pod_id_location',scales='free_y') #plt1.save(filename = r'.\charts\ulezpodts.png', width=None, height=None, dpi=200) plt2 = gg.ggplot(df_long, gg.aes( x='pod_str', y='value', color='yparam')) + gg.geom_point() + gg.xlab( 'pod') + gg.ylab('NO2 (as % of median)') + gg.theme_bw() + gg.theme( figure_size=(12, 6)) + gg.scale_x_discrete() plt2.save(filename=r'.\charts\ulezpodvar.png', width=10, height=6, dpi=200) #repeat for mobile data using segid instead of podid where N = 10 and N = 40 #repeat for stationary data at mobile times qry_str = (""" with cte0 as ( --all data, ULEZ pods with 6000 hrs select date_UTC, a.pod_id_location, no2_ppb from AQMesh.NO2_scaled_hightimeres_ppb_20180901_20190630 a join AQMesh.NO2_site_metadata_v2_1_20180901_20190630 b on a.pod_id_location=b.pod_id_location where ULEZ = true and no2_ppb <> -999 and a.pod_id_location in
def graph(y):
    """Return a ggplot of the values in ``y`` against their position.

    The values are plotted as 'RMSE' against 'iteration' (0, 1, 2, ...),
    drawn as points joined by a line.
    """
    iterations = list(range(len(y)))
    frame = pd.DataFrame({'iteration': iterations, 'RMSE': y})
    chart = gg.ggplot(gg.aes(x='iteration', y='RMSE'), data=frame)
    chart = chart + gg.geom_point()
    chart = chart + gg.geom_line()
    return chart
# Noisy linear data: y follows x * slope plus random scatter.
slope = 0.3
x = randn(num) * 50. + 150.0
y = randn(num) * 5 + x * slope
plt.scatter(x, y, c='b')


# In[72]:

# plt.scatter(x[(y < 1) & (y > -1)], y[(y < 1) & (y > -1)], c='r')
# np.argsort, np.sort, complicated index slicing
dframe = pd.DataFrame({'x': x, 'y': y})
g = sns.jointplot('x', 'y', data=dframe, kind="reg")


# ## Grab Python version of ggplot http://ggplot.yhathq.com/

# In[73]:

from ggplot import ggplot, aes, geom_line, stat_smooth, geom_dotplot, geom_point


# In[74]:

# Same data as points with a blue smoother.
(ggplot(aes(x='x', y='y'), data=dframe)
 + geom_point()
 + stat_smooth(colour='blue', span=0.2))


# In[ ]:
from ggplot import aes, geom_line, ggplot, meat
import matplotlib.pyplot as plt
from bokeh import mpl
from bokeh.plotting import output_file, show

# 'beef' against 'date' from the bundled meat data.
mapping = aes(x='date', y='beef')
g = ggplot(mapping, data=meat) + geom_line()
g.make()

plt.title("Line ggplot-based plot in Bokeh.")

output_file("ggplot_line.html", title="ggplot_line.py example")

# Convert the matplotlib figure and display it.
converted = mpl.to_bokeh()
show(converted)
# create a new long-form dataframe for clean plotting purposes values_dict = { "significant": coefficients[feature]["significant"], "insignificant": coefficients[feature]["unsignificant"] } df = pd.DataFrame.from_dict(values_dict, orient='index') df = df.transpose() df = pd.melt(df) df['feature'] = feature dfs_to_concat.append(df) master_df = pd.concat(dfs_to_concat) # histogram p = ggplot(aes(x='value', fill='variable', color='variable'), data=master_df) p += geom_histogram(bins=25, alpha=0.5) p += scale_x_continuous(limits=(-25, 25)) p += ggtitle("sarimax coefficient magnitude distribution") p += facet_wrap("feature", ncol=3, scales="free") p += labs(x=" ", y=" ") # visuals t = theme_gray() t._rcParams['font.size'] = 10 t._rcParams['font.family'] = 'monospace' p += t p.save("arima_1/" + "histogram.png") # boxplot
# Quick look at the year range and at the fuel / transmission categories.
min(vehicles.year)
max(vehicles.year)
pd.value_counts(vehicles.fuelType1)
pd.value_counts(vehicles.trany)
# trany2 keeps only the first character of the transmission description.
vehicles["trany2"] = vehicles.trany.str[0]
pd.value_counts(vehicles.trany2)

#%% step 1 ~ 4 on Page 202
from ggplot import ggplot, aes, geom_point, xlab, ylab, ggtitle
grouped = vehicles.groupby("year")
# Several columns must be selected with a list; selecting with a bare tuple
# is rejected by current pandas.
averaged = grouped[['comb08', 'highway08', 'city08']].agg([np.mean])
averaged.columns = ['comb08_mean', 'highway08_mean', 'city08_mean']
averaged['year'] = averaged.index
print(ggplot(averaged, aes('year', 'comb08_mean')) +
      geom_point(color='steelblue') +
      xlab('Year') + ylab('Average MPG') + ggtitle('All cars'))

#%% step 5
# Gasoline-only, single-fuel, non-hybrid vehicles. 'Premium Gasoline' was
# misspelled 'Prenium Gasoline', so premium-fuel cars never matched.
criteria1 = vehicles.fuelType1.isin(['Regular Gasoline', 'Premium Gasoline',
                                     'Midgrade Gasoline'])
criteria2 = vehicles.fuelType2.isnull()
criteria3 = vehicles.atvType != 'Hybrid'
vehicles_non_hybrid = vehicles[criteria1 & criteria2 & criteria3]
len(vehicles_non_hybrid)

#%% step 6
grouped = vehicles_non_hybrid.groupby(['year'])
averaged = grouped['comb08'].agg([np.mean])
soup = BeautifulSoup(data.text, 'html.parser')
weather_observations = soup.find('table', {'summary': "Daily Weather Observations for Brisbane, Queensland for November 2018"})
tbody = weather_observations.find('tbody')

# The second and third <td> of every row are read as the daily minimum and
# maximum.
daily_min = []
daily_max = []
for tr in tbody.find_all('tr'):
    cells = tr.find_all('td')
    daily_min.append(tofloat(cells[1].text.strip()))
    daily_max.append(tofloat(cells[2].text.strip()))

# data = [[a,b] for a,b in zip(daily_min,daily_max)]
# convert from list to DataFrame
days = range(1, len(daily_min) + 1)
daily_temperature = pd.DataFrame(
    data=[[a, b, c] for a, b, c in zip(days, daily_min, daily_max)],
    columns=['day', 'daily min', 'daily max'])
# print(daily_temperature)

# making plots
# The y mapping must name the column exactly: it is 'daily max' (with a
# space), not 'daily_max'.
myplot = gg.ggplot(gg.aes(x='day', y='daily max'), data=daily_temperature) +\
    gg.geom_point()

# different way of making data frame and plots
# Long format: one row per (day, temperature, 'daily_min'/'daily_max' label).
labels = ['daily_min' for a in range(len(daily_min))] + ['daily_max' for a in range(len(daily_max))]
weather_data = pd.DataFrame(
    data=[[a, b, c] for a, b, c in zip(
        itertools.chain(range(1, len(daily_min) + 1),
                        range(1, len(daily_max) + 1)),
        daily_min + daily_max,
        labels)],
    columns=['day', 'temp', 'min-max'])
print(weather_data)

myplot = gg.ggplot(gg.aes(x='day', y='temp', color='min-max'), data=weather_data) +\
    gg.geom_point()
myplot.show()