def displacement_plot(centered, limits=None, style=None):
    u"""Draw a displacement plot of centered tracks with ggplot.

    params:
        centered (pd.DataFrame): needs cX, cY, Object, Frame columns,
            probably produced by calling center(). The frame passed in is
            left untouched; a converted, sorted copy is plotted.
        limits (real): when truthy, sets both scales to a square window
            showing +/-limits on each axis.
        style (Iterable): collection of strings. 'no-terminal-dot' disables
            the black dot that marks tracks ending before the last frame.
            'theme-bw' is accepted, but theme_bw is currently always
            applied, so it has no effect.

    Returns:
        g (gg.ggplot): Plot object
    """
    style = {} if style is None else style

    # assign() builds a new frame, so the caller's Object column keeps its
    # original dtype instead of being silently converted to str.
    centered = centered.assign(Object=centered['Object'].map(str))
    # DataFrame.sort() was removed from pandas; sort_values() replaces it.
    centered = centered.sort_values(['Frame', 'Object'])

    g = (gg.ggplot(centered, gg.aes(x='cX', y='cY', color='Object')) +
         gg.geom_path(size=0.3))
    g += gg.theme_bw()

    if limits:
        g = g + gg.ylim(-limits, limits) + gg.xlim(-limits, limits)

    if 'no-terminal-dot' not in style:
        max_frame = centered['Frame'].max()
        # Last frame of every object; keep only those that stop early.
        endframe = centered.groupby('Object')['Frame'].max()
        endframe = endframe[endframe != max_frame].reset_index()
        endframe = endframe.merge(centered, on=['Object', 'Frame'])
        # we should check if endframe is empty before adding it:
        # https://github.com/yhat/ggplot/issues/425
        if not endframe.empty:
            g += gg.geom_point(data=endframe, color='black', size=1)
    return g
def scatter(x, y, filename=""):
    """Scatter-plot ``y`` against ``x``, then print the plot or save it.

    Args:
        x: values for the horizontal axis (wrapped in a ``pd.Series``).
        y: values for the vertical axis (wrapped in a ``pd.Series``).
        filename: base name without extension. When it is the empty string
            the plot is printed; otherwise it is saved as
            ``graphs/scatter/<filename>.png``.
    """
    df = pd.DataFrame({'x': pd.Series(x), 'y': pd.Series(y)})
    p = gg.ggplot(gg.aes(x='x', y='y'), data=df) + gg.geom_point()
    if filename == "":
        # Parenthesised single-argument print behaves the same on Python 2
        # and is also valid Python 3.
        print(p)
    else:
        gg.ggsave(filename="graphs/scatter/" + filename + ".png", plot=p)
def plot_line(X,y,title=None,labelx=None,labely=None,save=False, colors=None):
    '''
    Build a line plot with one layer per column of ``y``; return it or save it.

    X      - values for the horizontal axis; ``X.columns`` is read, so a
             DataFrame is expected
    y      - DataFrame; every column becomes one series, named by position
    title  - plot title; also used to derive the output file name
    labelx, labely - axis labels
    save   - when true the plot is saved and nothing is returned; otherwise
             the plot object is returned
    colors - colour names; defaults to matplotlib's BASE_COLORS + CSS4_COLORS
    '''
    df = pandas.DataFrame()
    df['X'] = X
    for i in range(y.shape[1]):
        df[str(i)] = y.iloc[:,i].values
    if colors is None:
        colors = list(dict(mcolors.BASE_COLORS, **mcolors.CSS4_COLORS).keys())
    # The last row is dropped before plotting.
    # NOTE(review): the reason is not visible in this block - confirm.
    df = df.iloc[0:df.shape[0]-1, :]

    p = ggplot(df, aes(x='X'))
    for i in range(y.shape[1]):
        # NOTE(review): this tests the whole `colors` object against X's
        # column names; kept exactly as written - confirm the intent.
        if colors not in X.columns.values:
            p = p + geom_line(aes(y=str(i),color = colors[i]))
        else:
            p = p + geom_point(aes(y=str(i),color = colors))
    p = p + xlab(labelx) + ylab(labely) + ggtitle(title)

    if save:
        # The file name used to be computed only when a title was given, so
        # save=True without a title failed on an unbound name. Fall back to
        # a fixed name in that case.
        if title is not None:
            img_title = title.replace(" ","").replace(".","-") + ".pdf"
        else:
            img_title = "plot.pdf"
        p.save(img_title)
    else:
        return p
def t_sne_visualize(latent_vectors, labels, epoch):
    """Embed latent vectors in 2-D with t-SNE and save a labelled scatter plot.

    Args:
        latent_vectors: object exposing ``.shape`` and ``.data.numpy()``
            (presumably a torch tensor - confirm); its values are divided
            by 255 before embedding.
        labels: one label per row; converted to strings for colouring.
        epoch: used only to build the output file name.

    The image path is built from the module-level ``args`` object
    (``args.dataset`` and ``args.remove_label``).
    """
    print(latent_vectors.shape)
    X_sample = latent_vectors.data.numpy() / 255
    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]
    nsne = 1000  # at most this many randomly chosen rows are embedded

    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = labels
    df['label'] = df['label'].map(str)

    # A plain permutation keeps the row labels integral. The original
    # concatenated it with an always-empty range, which only turned the
    # indices into floats.
    chosen = np.random.permutation(df.shape[0])[:nsne]

    tsne = TSNE(n_components=2, verbose=1, perplexity=30)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[chosen, feat_cols].values)
    print('AFTER FITTING')

    df_tsne = df.loc[chosen, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save(
        str(args.dataset) + "tsne-vae/2d-vec-miss" + str(args.remove_label)
        + "/tsne" + str(epoch) + ".png")
    return
def plot_update_frequency(result):
    """Plot the weekly number of changes from a sequence of query results.

    Args:
        result: iterable of mappings; each needs ``res['_id']['timestamp']``
            (anything ``pd.Timestamp`` accepts) and ``res['count']``.

    Returns:
        The ggplot object: weekly sums as blue points with a smoother.
    """
    import pandas as pd

    # turn query results into a time series of changes
    dates = []
    counts = []
    for res in result:
        # Timestamp.to_datetime() was removed from pandas;
        # to_pydatetime() is the supported spelling.
        dates.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        counts.append(res['count'])
    ts = pd.DataFrame(counts, index=dates, columns=['changes'])
    # resample(..., how='sum') was removed; aggregate on the resampler.
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    import ggplot
    # plot the time series of changes
    p = ggplot.ggplot(ts, ggplot.aes(x = ts.index, y=ts['changes'])) +\
        ggplot.geom_point(color = 'blue') +\
        ggplot.xlab('Period') +\
        ggplot.ylab('Changes') +\
        ggplot.geom_smooth() +\
        ggplot.ylim(low = 0) +\
        ggplot.scale_x_date(breaks = ggplot.date_breaks("12 months"),
                            labels = ggplot.date_format('%Y-%m')) +\
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
def t_sne_visualize(generated,n_sne,epoch):
    """Embed generated samples in 2-D with t-SNE and save ``tsne<epoch>.png``.

    Args:
        generated: object exposing ``.data.numpy()`` (presumably a torch
            tensor - confirm). Its values are divided by 255 and each sample
            is flattened to 28*28*3 features.
        n_sne: currently ignored - the sample count is forced to 49 below.
        epoch: used only in the output file name.

    Labels are the digits 0..6 repeated 7 times, so the code expects 49
    samples.
    """
    n_label = 7
    X_sample = generated.data.numpy() / 255
    y_sample = list(range(n_label)) * n_label
    X_sample = X_sample.reshape(X_sample.shape[0], 28 * 28 * 3)
    feat_cols = ['pixel' + str(i) for i in range(X_sample.shape[1])]

    df = pd.DataFrame(X_sample, columns=feat_cols)
    df['label'] = y_sample
    df['label'] = df['label'].map(str)

    # NOTE(review): this overrides the caller's n_sne; kept because the
    # label layout above assumes exactly 49 samples.
    n_sne = 49
    # A plain permutation keeps the row labels integral; the original
    # concatenated it with an always-empty range, which cast it to float.
    chosen = np.random.permutation(df.shape[0])[:n_sne]

    tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
    print('INITIALIZED')
    tsne_results = tsne.fit_transform(df.loc[chosen, feat_cols].values)
    print('AFTER FITTING')

    df_tsne = df.loc[chosen, :].copy()
    df_tsne['x-tsne'] = tsne_results[:, 0]
    df_tsne['y-tsne'] = tsne_results[:, 1]

    chart = ggplot(df_tsne, aes(x='x-tsne', y='y-tsne', color='label')) \
        + geom_point(size=70, alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
    chart.save("tsne" + str(epoch) + ".png")
    return
def scatter(x, y, filename=""):
    """Draw a point plot of ``y`` versus ``x``.

    With the default empty ``filename`` the plot is printed; any other value
    saves it as ``graphs/scatter/<filename>.png``.
    """
    df = pd.DataFrame({
        'x': pd.Series(x),
        'y': pd.Series(y),
    })
    p = gg.ggplot(gg.aes(x='x', y='y'), data=df) + gg.geom_point()
    if filename == "":
        # print(p) prints the same thing on Python 2 and also parses on
        # Python 3, unlike the bare print statement.
        print(p)
    else:
        gg.ggsave(filename="graphs/scatter/"+filename+".png", plot=p)
def plotSetOfArrays(arrays, names, fileName):
    """Plot several column arrays against a shared 0..1 axis, twice.

    A ggplot line+point chart of all arrays is saved to ``./IMG/<fileName>``.
    The first column of up to five arrays is then drawn through the
    ``output_file``/``figure``/``line``/``show`` helpers into
    ``iou_scores.html`` (these look like Bokeh's early procedural API -
    confirm against the file's imports).

    Args:
        arrays: sequence of 2-D arrays sharing the same number of rows.
        names: list of column names, concatenated after ``'noise'``.
        fileName: file name for the ggplot image inside ``./IMG/``.
    """
    IDS = np.linspace(0, 1, arrays[0].shape[0])
    # One concatenate call instead of re-copying the matrix per array.
    A = np.concatenate([IDS.reshape(arrays[0].shape[0], 1)] + list(arrays),
                       axis=1)
    Data = pd.DataFrame(A, columns=['noise'] + names)
    Melted = pd.melt(Data, id_vars=['noise'])
    pv = ggplot.ggplot(ggplot.aes(x='noise', y='value', colour='variable'),
                       data=Melted) + ggplot.geom_line() + ggplot.geom_point()
    ggplot.ggsave(pv, './IMG/' + fileName)

    output_file("iou_scores.html", title="correlation.py example")
    figure(tools="pan,wheel_zoom,box_zoom,reset,previewsave")
    hold()
    # The original issued five hard-coded line() calls and failed with an
    # IndexError for fewer than five arrays. zip() draws what is available,
    # still capped at the five colours.
    palette = ['#A6CEE3', '#1F78B4', '#B2DF8A', '#33A02C', '#fb9a99']
    for series, label, colour in zip(arrays, names, palette):
        line(IDS, series[:, 0], color=colour, legend=label)
    curplot().title = "Minimum IOU"
    grid().grid_line_alpha = 0.3
    show()
def plot_cost_history(alpha, cost_history):
    """Return a point-and-line plot of the cost at every iteration.

    ``alpha`` only appears in the title, formatted with three decimals.
    """
    iterations = range(len(cost_history))
    history = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': iterations,
    })
    chart = gp.ggplot(history, gp.aes('Iteration', 'Cost_History'))
    chart = chart + gp.geom_point()
    chart = chart + gp.geom_line()
    chart = chart + gp.ggtitle('Cost History for alpha = %.3f' % alpha)
    return chart
def signature_data_plot(sd):
    """Return a scatter of ``not_exp`` against ``set_exp`` coloured by ``pearson_r``.

    Colours run from yellow to red. Both axes get a log scale followed by a
    continuous scale with fixed limits, in that order.
    """
    import ggplot as gg

    mapping = gg.aes(x='set_exp', y='not_exp', color='pearson_r')
    chart = gg.ggplot(mapping, data=sd)
    chart = chart + gg.geom_point(size=15)
    chart = chart + gg.scale_color_gradient(low='yellow', high='red')
    chart = chart + gg.scale_x_log()
    chart = chart + gg.scale_x_continuous(limits=(0.5, 10000))
    chart = chart + gg.scale_y_log()
    chart = chart + gg.scale_y_continuous(limits=(0.05, 10000))
    return chart
def lineplot(hr_year_csv):
    """Return a red point-and-line chart of home runs per year.

    Args:
        hr_year_csv: path to a CSV with ``yearID`` and ``HR`` columns.
    """
    df = pandas.read_csv(hr_year_csv)
    # x is yearID and y is HR; the axis labels used to be swapped.
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR"))
        + gp.geom_point(color="red")
        + gp.geom_line(color="red")
        + gp.ggtitle("Homeruns by Year")
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
def plot(self):
    """Print a scatter of the data plus cluster centers, coloured by class.

    A copy of ``self.data`` gets one extra row per cluster center. Those rows
    are labelled ``"center"``; the data rows take their label from
    ``self.class_label``.
    """
    prob231g_plot_df = self.data.copy()
    for k in range(self.num_clusters):
        # Append each center under the next free positional label.
        n = prob231g_plot_df.shape[0]
        prob231g_plot_df.loc[n] = self.cluster_centers[k]
    prob231g_plot_df["class_label"] = (
        list(self.class_label) + self.num_clusters * ["center"])
    p = gg.ggplot(prob231g_plot_df,
                  gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    # Parenthesised print: unchanged on Python 2, valid on Python 3.
    print(p)
    return
def lineplot_compare(filename):
    """Return a chart of home runs per year with one colour per team.

    Args:
        filename: path to a CSV with ``yearID``, ``HR`` and ``teamID`` columns.
    """
    df = pd.read_csv(filename)
    # yearID is on x and HR on y; the labels used to be the wrong way round.
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle("Homeruns by Year by Team")
        + gp.xlab("Year")
        + gp.ylab("Homeruns")
    )
    return gg
def visualize_segmentation(X, var):
    '''
    Return a ggplot scatter of column `var` over X's index, coloured by the
    'segmento' column (converted to str). The x tick labels get the colour
    [0,0,0,0].
    '''
    frame = pandas.DataFrame(index=X.index)
    frame['fecha'] = X.index.values
    frame[var] = X[var]
    frame['Segmento'] = X['segmento'].astype(str)

    chart = ggplot(aes(x="fecha", y=var, color="Segmento"), frame)
    chart = chart + geom_point()
    chart = chart + xlab("Fecha")
    chart = chart + ylab(var)
    chart = chart + ggtitle("Segmentacion de la variable \"" + var + "\"")
    chart = chart + theme(axis_text_x=element_text(color=[0, 0, 0, 0]))
    return chart
def plot_weather_data(df):
    """Return a line+point chart of summed hourly exits per day, split by rain.

    Args:
        df: DataFrame with ``DATEn``, ``rain`` and ``EXITSn_hourly`` columns.
            It is left unmodified; the date conversion happens on a copy.
    """
    # assign() returns a new frame, so the caller's DATEn column is not
    # rewritten in place.
    dated = df.assign(DATEn=pd.to_datetime(df.DATEn))
    # as_index=False keeps DATEn and rain as ordinary columns. The old
    # assignment of a list to grouped.index.name therefore had no purpose,
    # and newer pandas rejects unhashable index names.
    grouped = dated.groupby(['DATEn', 'rain'], as_index=False).sum()

    plot = gp.ggplot(grouped, gp.aes(x='DATEn', y='EXITSn_hourly', color='rain'))
    plot += gp.geom_line()
    plot += gp.geom_point()
    plot += gp.ggtitle('Subway Ridership by Day')
    plot += gp.xlab('Date')
    plot += gp.ylab('Exits')
    return plot
def scatter_vis(costs, tss, path, f):
    """Save a cost scatter with mean/max/min guide lines as a PDF.

    The chart plots ``costs["cost"]`` against ``costs["$N$"]``, is titled
    with ``f.__name__``, and is written to ``path + "scatter_vis.pdf"``.
    ``tss`` is not used.
    """
    plt.figure()
    cost_column = costs.cost
    chart = ggplot(costs, aes(x="$N$", y="cost")) + geom_point()
    # Horizontal guides, added in the order: mean, max, min.
    chart = chart + geom_hline(y=cost_column.mean(), color="grey")
    chart = chart + geom_hline(y=cost_column.max(), color="red")
    chart = chart + geom_hline(y=cost_column.min(), color="green")
    chart = chart + ggtitle(f.__name__)
    chart.save(path + scatter_vis.__name__ + ".pdf")
def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None, yaxis_label=None):
    """Return a semi-transparent scatter plot of ``dataframe[y]`` vs ``dataframe[x]``.

    Args:
        dataframe: data to plot.
        x, y: column names for the two axes.
        width, height: accepted for interface compatibility; not used when
            building the plot.
        color: point colour; defaults to the ``'palette'`` entry of
            ``self.__default_options__``.
        title: plot title.
        xaxis_label, yaxis_label: optional axis names.
    """
    color = self.__default_options__.get('palette', None) if color is None else color
    gg = ggplot(dataframe, aes(x, y)) + geom_point(color=color, alpha=0.6) + ggtitle(title)
    if xaxis_label:
        gg += scale_x_continuous(name=xaxis_label)
    if yaxis_label:
        # The y scale used to be named with xaxis_label.
        gg += scale_y_continuous(name=yaxis_label)
    return gg
def prob231cd_recover(initialization):
    """Reload saved k-means results, re-save their plot and summarise objectives.

    Reads ``results/prob231cd<initialization>.pkl``, which holds a tuple of
    (plot DataFrame, k-means call objects, number of trials), and writes the
    scatter plot next to it as a ``.png``.

    Returns:
        dict with the ``"mean"``, ``"sd"`` and ``"min"`` of the ``obj``
        attribute over the first ``num_trials`` calls.
    """
    filename = "results/prob231cd" + initialization
    # NOTE: pickle must only be used on files this project wrote itself.
    # The context manager closes the handle, which the original left open.
    with open(filename + ".pkl", "rb") as handle:
        tuple_in = pkl.load(handle)
    prob231c_plot_df = tuple_in[0]
    kmcalls = tuple_in[1]
    num_trials = tuple_in[2]

    p = gg.ggplot(prob231c_plot_df, gg.aes(x= "x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot=p)

    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean": np.mean(obj), "sd": np.std(obj), "min": np.min(obj)}
    return obj_stats
def plot_timeline(scenes):
    """Print a character-versus-scene timeline as a point plot."""
    # NB: due to limitations in Python ggplot, scene goes on the y-axis so
    # that the x ticks can be labelled by character.
    # scale_x_continuous and scale_y_continuous behave slightly differently.
    character_names = scenes['character'].cat.categories
    chart = gg.ggplot(gg.aes(y='scene', x='character_code'), data=scenes)
    chart = chart + gg.geom_point()
    chart = chart + gg.labs(x='Character', y='Scene')
    # One tick per character category, labelled with the category name.
    chart = chart + gg.scale_x_continuous(
        labels=character_names.values.tolist(),
        breaks=range(len(character_names)))
    chart = chart + gg.theme(
        axis_text_x=gg.element_text(angle=30, hjust=1, size=10))
    print(chart)
def prob231cd_recover(initialization):
    """Load pickled k-means trials, save their scatter plot, return objective stats.

    The pickle at ``results/prob231cd<initialization>.pkl`` is expected to
    contain ``(plot_df, kmcalls, num_trials)``. The plot is written to the
    same base name with a ``.png`` extension.

    Returns:
        dict mapping ``"mean"``, ``"sd"`` and ``"min"`` to statistics of
        ``kmcalls[i].obj`` for ``i < num_trials``.
    """
    filename = "results/prob231cd" + initialization
    # Close the file deterministically; it used to be left open.
    # NOTE: only unpickle files produced by this project.
    with open(filename + ".pkl", "rb") as pickled:
        prob231c_plot_df, kmcalls, num_trials = pkl.load(pickled)[:3]

    p = gg.ggplot(prob231c_plot_df, gg.aes(x= "x1", y="x2", colour="data")) + \
        gg.geom_point() + gg.ggtitle(initialization + " initialization")
    gg.ggsave(filename + ".png", plot = p)

    obj = [kmcalls[i].obj for i in range(num_trials)]
    obj_stats = {"mean":np.mean(obj), "sd":np.std(obj), "min":np.min(obj)}
    return obj_stats
def visualize_clusters(X, var, color = 'cluster'):
    '''
    Return a ggplot scatter of column `var` over X's index, coloured by the
    column named by `color`.

    X     - DataFrame; its index is plotted on the x axis as 'fecha'
    var   - name of the column plotted on the y axis
    color - name of the column holding the cluster assignment
    '''
    aux = pandas.DataFrame()
    aux['fecha'] = X.index
    aux.index = X.index
    aux[var] = X[var]
    aux['Cluster'] = X[color]
    # The x axis shows the index ('fecha'), so it is labelled "Fecha", as in
    # visualize_segmentation. It used to carry the variable name.
    return (ggplot(aes(x='fecha', y=var, color='Cluster'), aux)
            + geom_point()
            + xlab("Fecha")
            + ylab("Valor")
            + ggtitle("Clustering de la variable \"" + var + "\"")
            + theme(axis_text_x = element_text(color=[0,0,0,0])))
def plot(mydata, opts):
    """Save a scatter plot of two columns, titled with a not-detected summary.

    Args:
        mydata: DataFrame with ``ntests`` and ``exactly`` columns plus the
            two columns named by ``opts['x']`` and ``opts['y']``.
        opts: mapping with ``'title'``, ``'x'``, ``'y'`` and ``'file'``.
    """
    # number of mutants killed by exactly 0 tests
    nd = sum(mydata[mydata.ntests == 0].exactly)
    d = sum(mydata[mydata.ntests != 0].exactly)
    total = nd + d
    print("Not detected = ", nd, "/", total)
    # float() forces true division on Python 2 as well. With no rows at all
    # the percentage is undefined, so report NaN instead of dividing by zero.
    if total:
        score = (1 - float(nd) / total) * 100.0
    else:
        score = float('nan')
    title = opts['title'] + (' ND=%d/%d (Mu: %3.1f%%)' % (nd, total, score))
    p = gg.ggplot(gg.aes(x=opts['x'], y=opts['y']), data=mydata) + gg.geom_point() +\
        gg.xlab(opts['x']) + gg.ylab(opts['y']) + gg.ggtitle(title)
    p.save(opts['file'])
def prob231b(initialization = "regular"):
    """Run k-means for several cluster counts and save one scatter plot each.

    Uses the module-level ``features_only`` data and ``datestring`` tag.
    Each plot is written to ``results/k=<k>_<datestring>.png``.
    """
    runs = []
    for num_clusters in (2, 3, 5, 10, 15, 20):
        run = KmeansCall(features_only, num_clusters, initialization)
        runs.append(run)
        run.run_kmeans(verbose = False)

        df_to_plot = run.data.copy()
        df_to_plot["class_label"] = list(run.class_label)

        mapping = gg.aes(x= "x1", y="x2", colour="class_label")
        p = gg.ggplot(df_to_plot, mapping) + gg.geom_point()
        p = p + gg.ggtitle("Synth. data, k=" + str(num_clusters))

        metadata = "k=" + str(num_clusters) + "_" + datestring
        gg.ggsave(filename = "results/" + metadata +".png", plot = p)
def plot_age_speed(df):
    """Print a scatter of the speed column against the age column; return ``df``.

    Column names come from ``s.AGE_COL_NAME`` and ``s.SPEED_COL_NAME``. The
    title is built by ``_make_title`` from a fixed label and the row count.
    """
    num_rows = df.shape[0]
    title = 'age v speed'
    # Parenthesised print: unchanged on Python 2, valid on Python 3.
    print(ggplot(df, aes(s.AGE_COL_NAME, s.SPEED_COL_NAME)) +
          ggtitle(_make_title(title, num_rows)) +
          geom_point(colour='steelblue') +
          scale_x_continuous())
    return df
def prob231b(initialization="regular"):
    """Fit k-means for k in 2, 3, 5, 10, 15, 20 and save a plot per fit.

    Reads the module-level ``features_only`` and ``datestring``. Output goes
    to ``results/k=<k>_<datestring>.png``.
    """
    cluster_counts = [2, 3, 5, 10, 15, 20]
    kmcalls = [0] * len(cluster_counts)
    for slot, k in enumerate(cluster_counts):
        call = KmeansCall(features_only, k, initialization)
        kmcalls[slot] = call
        call.run_kmeans(verbose=False)

        frame = call.data.copy()
        frame["class_label"] = [label for label in call.class_label]

        title = gg.ggtitle("Synth. data, k=" + str(k))
        p = gg.ggplot(frame, gg.aes(x= "x1", y="x2", colour="class_label")) + \
            gg.geom_point() + title

        tag = "k=" + str(k) + "_" + datestring
        gg.ggsave(filename="results/" + tag + ".png", plot=p)
def models_llhd(pm_llhd):
    """
    Track the total likelihood of READS in a model (cluster).

    :param pm_llhd: data holding the read likelihood of every model/cluster;
        it is plotted with its "iteration num" column on the x axis and its
        "log value" column (sum of log likelihoods) on the y axis.
    """
    mapping = gp.aes(x="iteration num", y="log value")
    chart = gp.ggplot(mapping, data=pm_llhd)
    chart = chart + gp.geom_point(color="blue")
    chart = chart + gp.ggtitle(u"model likelihood")
    print(chart)
def plot_distance_trip_time(df):
    """Print a smoothed scatter of distance travelled vs trip duration; return ``df``.

    Column names come from ``s.TRIP_DURATION_COL`` and
    ``s.DISTANCE_TRAVELED_COL_NAME``. The title is built by ``_make_title``
    from a fixed label and the row count.
    """
    num_rows = df.shape[0]
    title = 'trip duration v distance travelled'
    # Parenthesised print: unchanged on Python 2, valid on Python 3.
    print(ggplot(df, aes(s.TRIP_DURATION_COL, s.DISTANCE_TRAVELED_COL_NAME)) +
          ggtitle(_make_title(title, num_rows)) +
          stat_smooth(colour="red") +
          geom_point(colour='steelblue') +
          scale_x_continuous())
    return df
def lineplot_compare(filename):
    """Return a per-team chart of home runs by year (string-variable version).

    Args:
        filename: path to a CSV with ``yearID``, ``HR`` and ``teamID`` columns.
    """
    # Cleaner version with string vars
    df = pd.read_csv(filename)
    p_title = "Homeruns by Year by Team"
    # yearID is mapped to x and HR to y; the two labels used to be swapped.
    p_xlab = "Year"
    p_ylab = "Homeruns"
    gg = (
        gp.ggplot(df, gp.aes(x="yearID", y="HR", color="teamID"))
        + gp.geom_point()
        + gp.geom_line()
        + gp.ggtitle(p_title)
        + gp.xlab(p_xlab)
        + gp.ylab(p_ylab)
    )
    return gg
def point_chart(self, conn, column1, column2, table_chosen, title):
    """Fetch two columns of a table and print them as a titled point chart.

    The data comes from ``dfile.double_selector``. Before the chart, the
    current time and its difference to ``a`` are printed.
    NOTE(review): ``a`` is not defined in this function; presumably a start
    timestamp set elsewhere - confirm.
    """
    frame = dfile.double_selector(
        conn=conn, table=table_chosen, col1=column1, col2=column2)
    chart = ggplot(aes(x=column1, y=column2), data=frame)
    chart = chart + geom_point()
    chart = chart + theme_gray()
    chart = chart + labs(title=title)

    finished = datetime.datetime.now()
    print(finished)
    print(finished - a)
    print(chart)
def googletrend_command(delta_t, threshold=0.0, inverse=False):
    """the command to run google trend algorithm.

    For every threshold from 0 up to ``threshold`` in steps of 0.1, and for
    every delta_t from 1 to ``int(delta_t) - 1``, the classifier is run and
    evaluated. The resulting (FPR, TPR) points, plus a y=x baseline, are
    saved as a ROC plot under ``./plots/``.

    :param delta_t: the upper bound for original delta_t parameter
    :param threshold: upper bound for the threshold of differentiating two classes
    :param inverse: whether to inverse the classifier
    """
    ## handle filepath and title based on parameter inverse
    filename = "googletrend"
    titlename = "ROC of google trend classifier"
    if inverse:
        filename += "_inverse"
        titlename += " (inverse version)"
    filepath = "./plots/%s.jpg" % filename

    ## generate data first
    data = googletrend.preprocess()

    ## store classifier evaluation metrics into dict
    output = {'tpr': [], 'fpr': [], 'plot': []}
    for thre in np.arange(0, threshold + 0.1, 0.1):
        # Single-argument print() and range() behave the same on Python 2
        # and also run on Python 3, unlike the print statement and xrange.
        print("==> threshold: %f, inverse: %s" % (thre, inverse))
        for i in range(1, int(delta_t)):
            googletrend.algorithm(data, i, thre, inverse)
            tp_rate, fp_rate = googletrend.evaluate(data)
            output['tpr'].append(tp_rate)
            output['fpr'].append(fp_rate)
            output['plot'].append('thre_' + str(thre))

    ## plot ROC graph
    ## add a y=x baseline for comparison
    output['tpr'].extend([0.0, 1.0])
    output['fpr'].extend([0.0, 1.0])
    output['plot'].extend(['baseline', 'baseline'])
    df = pd.DataFrame(output)
    graph = gg.ggplot(df, gg.aes('fpr', 'tpr', color='plot')) + \
        gg.theme_seaborn() + \
        gg.ggtitle(titlename) + \
        gg.xlab("FPR") + \
        gg.ylab("TPR") + \
        gg.xlim(0.0, 1.0) + \
        gg.ylim(0.0, 1.0) + \
        gg.geom_point() + \
        gg.geom_line()
    gg.ggsave(plot=graph, filename=filepath, width=6, height=6, dpi=100)
def plot(self, inputs):
    """Scatter-plot two variables of ``self.dat`` for a single year.

    Returns None when ``inputs.year`` is absent from the ``Year`` column or
    when either requested variable is not a column. Otherwise returns the
    plot, with optional text labels (``inputs.shownames``) and a red linear
    fit (``inputs.linear``).
    """
    table = self.dat
    year_is_known = inputs.year in table.Year.values
    if not year_is_known:
        return
    if not (inputs.xvar in table and inputs.yvar in table):
        return

    rows_for_year = table[table.Year == inputs.year]
    chart = ggplot(rows_for_year, aes(x=inputs.xvar, y=inputs.yvar)) + geom_point()
    if inputs.shownames:
        chart = chart + geom_text(aes(label=self.ID_col), vjust=1, hjust=1)
    if inputs.linear:
        chart = chart + stat_smooth(color="red", method="lm")
    return chart
def plot_cost_history(alpha, cost_history):
    """Return a point plot of the cost history, for viewing how it evolves.

    You can run it by uncommenting the plot_cost_history(alpha, cost_history)
    call in predictions. When running locally, print the returned plot.
    """
    frame_columns = {
        'Cost_History': cost_history,
        'Iteration': range(len(cost_history)),
    }
    history = pandas.DataFrame(frame_columns)
    chart = ggplot(history, aes('Iteration', 'Cost_History'))
    chart = chart + geom_point()
    chart = chart + ggtitle('Cost History for alpha = %.3f' % alpha)
    return chart
def graph1(score_data):
    """
    Average score as time goes on; creates and returns graph 1, a line graph.

    ``score_data[0]`` is the header row and the remaining rows are the data.
    The date column is located with ``find_time_stamp``.
    """
    header = score_data[0]
    date_column = header[find_time_stamp(score_data)]
    scores = DataFrame(score_data[1:], columns=header)

    # Get all columns that are numerical questions so we know what to graph
    question_columns = scores.select_dtypes(include=['int64']).columns.values

    # Melt data so that each question is in a separate row
    per_question = pd.melt(scores, id_vars=date_column,
                           value_vars=question_columns,
                           var_name="Question", value_name="Score")

    # Convert date string into an actual date type
    per_question[date_column] = pd.to_datetime(per_question[date_column],
                                               format="%m/%d/%Y")

    # Group all rows with same date and question, and then take the average.
    per_question = per_question.groupby([date_column, 'Question']).mean().reset_index()
    per_question['All'] = "Indiviual Questions"

    # Second series: the average over all questions for each date.
    overall = per_question.groupby(date_column).mean().reset_index()
    overall['Question'] = "All Questions"
    overall['All'] = "Average of All Questions"

    combined = pd.concat([per_question, overall])
    # The dates are plotted as integers on a continuous scale.
    combined[date_column] = combined[date_column].astype('int64')

    # Create time graph with separate lines for each question
    mapping = ggplot.aes(x=date_column, y="Score", colour="Question")
    ret = ggplot.ggplot(mapping, combined) +\
        ggplot.geom_point() +\
        ggplot.geom_line() +\
        ggplot.facet_grid("All") +\
        ggplot.scale_x_continuous(labels=[""], breaks=0) +\
        ggplot.labs(x="Time", y="Average Question Score") +\
        ggplot.ggtitle("Question Scores Over Time")
    return ret
def prob231g():
    """Run EM with 3 clusters and save its likelihood curve, plot and pickle.

    Uses the module-level ``features_only`` and ``labels_only``. Writes
    ``results/prob231g_loglike.png``, ``results/prob231g_clusters.png`` and
    ``results/prob231g_a.pkl``.
    """
    filename = "results/prob231g"
    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = [label for label in emcall.class_label]
    p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot = p)

    # The context manager flushes and closes the pickle file; the handle
    # used to be left open.
    with open(filename + "_a.pkl", "wb") as handle:
        pkl.dump(obj = emcall, file = handle)
    print("Done with 231g.")
    return
def prob231g():
    """Fit the 3-cluster EM model and persist its diagnostics.

    Reads the module-level ``features_only`` and ``labels_only``. Outputs,
    all under ``results/``: ``prob231g_loglike.png`` (log-likelihood
    record), ``prob231g_clusters.png`` (cluster scatter) and
    ``prob231g_a.pkl`` (the fitted ``EMCall`` object).
    """
    filename = "results/prob231g"
    num_clusters_231g = 3
    emcall = EMCall(features_only, labels_only, num_clusters_231g)
    emcall.run_em()

    plt.plot(emcall.log_likelihood_record)
    plt.title("Likelihood over EM iterations")
    plt.savefig(filename + "_loglike.png")

    prob231g_plot_df = emcall.data.copy()
    prob231g_plot_df["class_label"] = list(emcall.class_label)
    p = gg.ggplot(prob231g_plot_df, gg.aes(x= "x1", y="x2", colour="class_label")) + \
        gg.geom_point() + gg.ggtitle("EM cluster assignments")
    gg.ggsave(filename + "_clusters.png", plot=p)

    # Close the output file explicitly so the pickle is fully written.
    with open(filename + "_a.pkl", "wb") as pickled:
        pkl.dump(obj=emcall, file=pickled)
    print("Done with 231g.")
    return
def _plot_and_save_local_ancestry(df, kmer, image_filename, num_chromosomes, id_vars, x_axis, y_scale):
    """Melt per-chromosome ancestry columns into long form, plot and save them.

    The ``test_<i>`` / ``true_<i>`` entries are replaced by numeric y
    positions so each chromosome gets its own row of points; the original
    names are used as y tick labels. ``kmer`` is not used.
    """
    print('saving plot as: {}'.format(image_filename))
    var_name = 'chromosome'
    long_df = pd.melt(df, id_vars=id_vars, var_name=var_name,
                      value_name='estimated_ancestry')

    # Column name -> y position, in insertion order test_1, true_1, test_2, ...
    # NOTE(review): precedence makes these 2*i - (2*y_scale) and
    # 2*i - (1*y_scale); kept exactly as written - confirm intent.
    y_positions = {}
    for chrom in range(1, num_chromosomes + 1):
        y_positions['test_{}'.format(chrom)] = 2*chrom - 2 * y_scale
        y_positions['true_{}'.format(chrom)] = 2*chrom - 1 * y_scale

    for column_name, position in y_positions.items():
        long_df.replace(column_name, position, inplace=True)

    tick_labels = list(y_positions.keys())
    tick_breaks = list(y_positions.values())
    plot = ggplot.ggplot(ggplot.aes(x=x_axis, y=var_name, color='estimated_ancestry'), data=long_df) \
        + ggplot.geom_point() \
        + ggplot.scale_y_continuous(labels=tick_labels, breaks=tick_breaks) \
        + ggplot.scale_color_manual(values=['#FF0000', '#0000FF', '#73008C']) \
        + ggplot.theme(plot_margin={'top':0.7, 'bottom':0.3})  # TODO: this should depend on scale
    plot.save(image_filename)
def biplot(X, color='cluster'):
    '''
    Return a ggplot biplot of the first two principal components of X.

    X     - DataFrame; its numerical part (filter_numerical) feeds the PCA
    color - column used to colour the points. With the default 'cluster' the
            column is used as is. Any other column is shown as
            'Detectado como:', with "1" displayed as "Normal" and "-1" as
            "Anomalia".
    '''
    pca = PCA(n_components=2)
    res = pca.fit_transform(filter_numerical(X))
    df = pandas.DataFrame(res)
    df.columns = ["x", "y"]
    if color == 'cluster':
        df['Cluster'] = X[color].values
        color = 'Cluster'
    else:
        # Relabel a copy. Writing into X[color].values would write the new
        # labels through to the caller's frame (or fail when pandas hands
        # out a read-only array).
        c = X[color].values.copy()
        c[c=="1"] = "Normal"
        c[c=="-1"] = "Anomalia"
        df['Detectado como:'] = c
        color = 'Detectado como:'
    return ggplot(aes("x","y", color=color),df) + geom_point(aes(size=40))
def data_output(data, chart_title):
    """Ask the user how to display ``data`` and show it as a table or a line graph.

    Args:
        data: printed as is for the table option. For the graph option it
            must have 'Month, Year' and 'Value' columns.
        chart_title: title of the line graph.

    Only the first character of the answer matters: 't' prints the table,
    'l' or 'g' prints the graph (any case). Any other answer, including an
    empty one, does nothing.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    print("Good News! You're data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")
    answer = read_line("Choose table or line > ")
    # Slicing tolerates an empty answer, where indexing [0] raised
    # IndexError. Lower-casing once treats 'L' like 'l'.
    choice = answer[:1].lower()

    if choice == "t":
        print("Ok, here's your data.")
        print(data)
    elif choice in ("l", "g"):
        import ggplot as gg
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value")
        # NOTE(review): the original also built a monthly scale_x_date but
        # never added it to the plot, so it had no effect. It is left out
        # rather than applied, since the x column may not hold dates.
        print(plot + gg.theme_xkcd())
def lineplot(hr_year_csv):
    """Return a red point-and-line chart of home runs (HR) by year.

    The CSV passed in has two columns: 'HR' (the number of homerun hits) and
    'yearID' (the year in which the homeruns were hit). It is loaded into a
    pandas dataframe and plotted with ggplot.

    Sample data: https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv
    ggplot documentation: https://github.com/yhat/ggplot/
    """
    homeruns = pandas.read_csv(hr_year_csv)
    chart = gp.ggplot(homeruns, gp.aes('yearID', 'HR'))
    chart = chart + gp.geom_point(color='red')
    chart = chart + gp.geom_line(color='red')
    return chart
def plot(self, title="The amazing peixinho's plots"):
    """Return a 2-D scatter of the dataset coloured by its true label.

    Uses the cached ``self._dataset['projection']`` when it exists and has
    fewer than three columns. Otherwise a t-SNE embedding of
    ``self._dataset['feats']`` is computed and cached under that key.
    """
    import ggplot
    import sklearn.manifold
    import pandas

    projection = self._dataset.get('projection', None)
    needs_embedding = projection is None or projection.shape[1] >= 3
    if needs_embedding:
        embedder = sklearn.manifold.TSNE()
        projection = embedder.fit_transform(self._dataset['feats'])
        self._dataset['projection'] = projection

    points = pandas.DataFrame({
        'X': projection[:, 0].ravel(),
        'Y': projection[:, 1].ravel(),
        'truelabel': self._dataset['truelabel'].ravel(),
    })
    # The label column is cast to object dtype before plotting.
    points['truelabel'] = points['truelabel'].astype(object)

    mapping = ggplot.aes(x='X', y='Y', color='truelabel')
    chart = ggplot.ggplot(mapping, data=points)
    chart = chart + ggplot.geom_point()
    chart = chart + ggplot.ggtitle(title)
    return chart
def lineplot(hr_year_csv):
    """Return a red point-and-line chart of the total HR hit per year.

    Args:
        hr_year_csv: path to a CSV with two columns, ``yearID`` and ``HR``
            (the number of HR hit in Major League baseball in each year).

    Sample data: https://www.dropbox.com/s/awgdal71hc1u06d/hr_year.csv
    """
    # Read the file the caller passed in. The argument used to be ignored in
    # favour of a hard-coded 'hr_year.csv'.
    df = pd.read_csv(hr_year_csv)
    gg = gp.ggplot(df, gp.aes('yearID', 'HR'))
    gg += gp.geom_point(color='red')
    gg += gp.geom_line(color='red')
    gg += gp.ggtitle('Total HRs by Year')
    gg += gp.xlab('Year')
    gg += gp.ylab('HR')
    return gg
def lineplot_compare(hr_by_team_year_sf_la_csv):
    """Return a chart comparing total HR by year, one colour per team.

    The CSV has three columns -- yearID, HR, and teamID -- representing the
    total number of HR hit each year by the SF Giants and LA Dodgers.

    To differentiate between multiple categories on the same plot, the
    colour is passed to aes with the other arguments rather than to the
    geometry functions: ggplot(data, aes(xvar, yvar, color=category_var)).

    Sample data:
    https://www.dropbox.com/s/wn43cngo2wdle2b/hr_by_team_year_sf_la.csv
    """
    team_homeruns = pandas.read_csv(hr_by_team_year_sf_la_csv)
    mapping = gp.aes('yearID', 'HR', color='teamID')
    chart = gp.ggplot(team_homeruns, mapping)
    chart = chart + gp.geom_point()
    chart = chart + gp.geom_line()
    return chart
def scatter(self, dataframe, x=None, y=None, width=None, height=None, color=None, title='Scatter', xaxis_label=None, yaxis_label=None, label=None):
    """Return a semi-transparent scatter plot of two columns of ``dataframe``.

    Args:
        dataframe: data to plot.
        x, y: column names mapped to the axes.
        width, height, label: accepted for interface compatibility; not
            used when building the plot.
        color: point colour; falls back to the ``'palette'`` entry of
            ``self.__default_options__``.
        title: plot title.
        xaxis_label, yaxis_label: optional names for the axes.
    """
    color = self.__default_options__.get('palette', None) if color is None else color
    gg = ggplot(dataframe, aes(x, y)) + geom_point(
        color=color, alpha=0.6) + ggtitle(title)
    if xaxis_label:
        gg += scale_x_continuous(name=xaxis_label)
    if yaxis_label:
        # Name the y scale with its own label, not with xaxis_label.
        gg += scale_y_continuous(name=yaxis_label)
    return gg
try: session.run(run_script) truematch_mod.append(session.getvalue("mod_delta_m"+str(mc))) runtime_mod.append(gmodel_relaxed.Runtime) except RuntimeError: print ("unable to evaluate true matchin perf of gurobi model in "+fname) except (OSError, NameError, ValueError,RuntimeError): print "unable to process gurobi model in "+fname np.savetxt("runtime.csv", runtime_mod, delimiter=",") np.savetxt("tm.csv", runtime_mod, delimiter=",") tm_df = pd_df({ "tm": truematch_orig + truematch_mod, "runtime": runtime_orig + runtime_mod, "orig_or_mod": ["mod"]*len(runtime_mod) # ["orig"]*nmc }) nmc = len(runtime_mod) #print gg.ggplot(tm_df, aes('orig_or_mod', 'runtime')) + \ # gg.geom_line(colour='steelblue') comp_plot = gg.ggplot(data=tm_df, aesthetics=gg.aes(x='runtime', y='tm')) + gg.geom_point() + gg.scale_x_log10() gg.ggsave("graphmatch_IP_runtime_vs_tm.pdf",plot = comp_plot) print "mean of tm:"+ str(np.mean( truematch_mod))
def plotSetOfArrays(arrays,names,fileName):
    """Save a ggplot of all arrays and an HTML line chart of their first columns.

    Args:
        arrays: sequence of 2-D arrays with equal row counts. They are
            joined column-wise behind a 0..1 'noise' axis.
        names: list of column names for the joined arrays.
        fileName: name of the ggplot image written into ``./IMG/``.

    The HTML chart goes to ``iou_scores.html`` through the
    ``output_file``/``figure``/``line``/``show`` helpers (these resemble
    Bokeh's early procedural API - confirm against the file's imports). It
    shows at most five series.
    """
    row_count = arrays[0].shape[0]
    IDS = np.linspace(0,1,row_count)
    # Join everything in one call rather than growing A array by array.
    A = np.concatenate([IDS.reshape(row_count,1)] + list(arrays), axis=1)
    Data = pd.DataFrame(A,columns = ['noise']+names)
    Melted = pd.melt(Data,id_vars=['noise'])
    pv = ggplot.ggplot(ggplot.aes(x='noise', y='value', colour='variable'),
                       data=Melted) + ggplot.geom_line() + ggplot.geom_point()
    ggplot.ggsave(pv,'./IMG/'+fileName)

    output_file("iou_scores.html", title="correlation.py example")
    figure(tools="pan,wheel_zoom,box_zoom,reset,previewsave")
    hold()
    # Pair each array with its name and a colour. Exactly five arrays were
    # required before; fewer raised an IndexError.
    colours = ('#A6CEE3', '#1F78B4', '#B2DF8A', '#33A02C', '#fb9a99')
    for values, name, colour in zip(arrays, names, colours):
        line(IDS, values[:,0], color=colour, legend=name)
    curplot().title = "Minimum IOU"
    grid().grid_line_alpha=0.3
    show()
# Show the first rows of the mined review table `df`.
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())
#text = df.comments.iloc[0]  single-review sentiment experiment; the iloc index picks the n-th entry, counting from 0
#s = SnowNLP(text)
#
#print(s.sentiments)
def get_sentiment_cn(text):
    # Return the SnowNLP `sentiments` value for one piece of text.
    s = SnowNLP(text)
    return s.sentiments
# Score every comment and keep the result in a new "sentiment" column.
df["sentiment"] = df.comments.apply(get_sentiment_cn)
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)
print("#######################################")
print("重要信息")
# Mean and median of the sentiment scores over all reviews.
print("所有影评的平均值为:", df.sentiment.mean())
print("所有影评的中位数为:", df.sentiment.median())
# Sentiment over time as points joined by a blue line.
# NOTE(review): the plot expression is neither assigned nor printed, so it is
# presumably only displayed in an interactive/notebook session - confirm.
ggplot.ggplot(ggplot.aes(x="date", y="sentiment"), data=df) + ggplot.geom_point() + ggplot.geom_line(
    color='blue') + ggplot.scale_x_date(
    labels=ggplot.date_format("%Y-%m-%d"))
# The five rows with the lowest sentiment (result likewise not stored).
df.sort_values(['sentiment'])[:5]
#coding=utf-8 #!/usr/bin/python ### 资料来源:http://nbviewer.ipython.org/gist/wrobstory/1eb8cb704a52d18b9ee8/Up%20and%20Down%20PyData%202014.ipynb # 导入文件模块 import ggplot as gg from ggplot import ggplot import numpy as np import pandas as pd df = pd.read_csv('/Users/zhangbo/github/pydatasv2014/USGS_WindTurbine_201307_cleaned.csv') min_heights = df[df['Rotor Diameter'] > 10] (ggplot(gg.aes(x='Turbine MW', y='Rotor Swept Area'), data=min_heights[:500]) + gg.geom_point(color='#75b5aa', size=75) + gg.ggtitle("Rotor Swept Area vs. Power") + gg.xlab("Power (MW)") + gg.ylab("Rotor Swept Area (m^2)"))
# Signal (time domain)
signal = pd.DataFrame()
Fs = 150.0  # sampling rate
Ts = 1.0/Fs  # sampling interval
signal['t'] = np.arange(0, 1, Ts)  # time vector
ff = 5  # frequency of the signal
signal['y'] = np.sin(2*np.pi*ff * signal['t'])  # the signal
n = len(signal['y'])  # length of the signal

# Spectrum (freq domain)
spectrum = pd.DataFrame()
k = np.arange(n)
T = n/Fs
spectrum['f'] = k/T  # frequency range
spectrum['Y'] = np.fft.fft(signal['y'])/n  # fft (with 1/n normalization)
spectrum['a'] = np.abs(spectrum['Y'])  # amplitude spectrum
spectrum['p'] = np.abs(spectrum['Y'])**2  # power spectrum
# Positive half (real signal -> hermitian spectrum). Floor division keeps
# the slice bound an integer; n/2 is a float on Python 3 and cannot be used
# as a slice bound.
spectrum_h = spectrum[:n // 2]

plot_gg(gg_layer(
    gg.ggplot(signal, gg.aes(x='t', y='y')),
    gg.geom_point(),
))
plot_gg(gg_layer(
    gg.ggplot(spectrum_h, gg.aes(x='f', y='a')),
    gg.geom_point(),
))
def plot_trace_points(df, **kws):
    """Return a ggplot ``geom_point`` layer bound to ``df``.

    Any extra keyword arguments are forwarded to ``geom_point`` unchanged.
    """
    # Only geom_point is needed; the geom_line and geom_path imports were
    # never used.
    from ggplot import geom_point
    return geom_point(data = df, **kws)
import pandas as pd
import numpy as np
# from source import view_and_print_output
import ggplot as gg

# Gather the coarse lambda/dropout search results of every network
# configuration.  Frames are collected in a list and concatenated once:
# DataFrame.append is gone from current pandas and re-copied the whole
# accumulated table on every iteration.
frames = []
for num_layers, num_nodes in [(2, 50), (2, 100), (2, 150), (2, 200),
                              (4, 50), (4, 100), (4, 150), (4, 200)]:
    file_coarse = '../../data/coarse_lambda_dropout_' + str(num_layers) + '_' + str(num_nodes) + '.txt'
    newdata = pd.read_csv(file_coarse)
    # Best (lowest validation error) runs first, so 'index' below grows
    # with the validation error.
    newdata = newdata.sort_values(by='validation error', ascending=True)
    newdata['lambda'] = np.log10(newdata['lambda'])
    # Rank within this configuration, scaled to [0, 1) and cubed; used as
    # the colour of the points.
    newdata['index'] = (np.arange(len(newdata), dtype='float')/len(newdata))**3
    # Facet label built from the layer and node counts.
    newdata['config'] = str(num_layers * 100 + num_nodes) + ' ' + str(num_layers) + ' ' + str(num_nodes)
    frames.append(newdata)
df = pd.concat(frames)

# The 20 rows with the highest validation error.
print(df.sort_values(by='validation error', ascending=False).head(20))

# log10(lambda) against dropout probability, one facet per configuration.
p = gg.ggplot(gg.aes(x='lambda', y='dropout prob', color='index'), data=df) + \
    gg.geom_point() + \
    gg.xlab('lambda') + \
    gg.ylab('dropout prob') + \
    gg.scale_x_continuous(limits=(-5, 2)) + \
    gg.facet_wrap('config')
print(p)

# Conclusion: ignore dropout
clusterAss.iloc[:,0] = clusterAss.iloc[:,0].fillna(0) clusterAss.iloc[:,1] = calcDistance(dataSet, centroid) minSSE = math.inf while(len(centroids) < k): numCurrCluster = len(centroids) for i in range(numCurrCluster): pointsInCurrCluster = dataSet.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0] == i].index,:] splitClusterAss, tmpCent = kmeans(pointsInCurrCluster, 2) splitSSE = sum(splitClusterAss.iloc[:,1]) notSplitSSE = sum(clusterAss.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0] != i].index,1]) currentSSE = splitSSE + notSplitSSE if(currentSSE < minSSE): minSSE = currentSSE bestClusterToSplit = i bestNewCentroids = tmpCent.copy() bestClusterAss = splitClusterAss.copy() bestClusterAss.loc[bestClusterAss.loc[:,'cluster']==1, 'cluster'] = numCurrCluster bestClusterAss.loc[bestClusterAss.loc[:,'cluster']==0, 'cluster'] = bestClusterToSplit centroids[bestClusterToSplit] = bestNewCentroids.iloc[0, :] centroids.append(bestNewCentroids.iloc[1, :]) clusterAss.iloc[clusterAss.iloc[:,0][clusterAss.iloc[:,0]==bestClusterToSplit].index,:] = bestClusterAss print('Just finished') return(clusterAss, pd.DataFrame(centroids)) data = pd.read_table('/home/samael/learnML/kmeans/kmeans', sep='\t', header=None) data.columns = ['V1', 'V2'] clusterAss1, centroids1 = kmeans(data, 4) clusterAss2, centroids2 = biKmeans(data, 4) ggplot.ggplot(data, ggplot.aes('V1', 'V2')) + ggplot.geom_point(color=clusterAss1.cluster, size=50) + ggplot.geom_hline(y=0) + ggplot.geom_vline(x=0) ggplot.ggplot(data, ggplot.aes('V1', 'V2')) + ggplot.geom_point(color=clusterAss2.cluster, size=50) + ggplot.geom_hline(y=0) + ggplot.geom_vline(x=0)
if (reward == 1): wins_for_player_1[i] += 1.0 elif (reward == 0.5): draw_for_players[i] += 1.0 print(i, wins_for_player_1[i], draw_for_players[i]) data.append({ 'Type': 0, 'Wins': wins_for_player_1[i], 'Training': training_steps * (i - 1) }) data.append({ 'Type': 1, 'Wins': draw_for_players[i], 'Training': training_steps * (i - 1) }) learnitMC(training_steps, epsilon, alpha, n) # learnit(training_steps, epsilon, alpha) # the original learning code. # Pandas gives you the power of R learningdf = pd.DataFrame(data) # I use ggplot when I generate figures in R and would like to use it with Python, HOWEVER: # latest Pandas causes problems for ggplot so I needed these two patches: # https://stackoverflow.com/questions/50591982/importerror-cannot-import-name-timestamp/52378663 # https://github.com/yhat/ggpy/issues/612 p = gg.ggplot(gg.aes(x='Training', y='Wins', group='Type'), data=learningdf)+ gg.xlab('Learning games') + \ gg.ylab('Wins for player 1') + gg.ggtitle("n="+str(n)) + gg.geom_point() + gg.stat_smooth(method='loess') p.make() filename = "experiment_" + str(n) + ".pdf" p.save(filename)
# Time every (method, model, rtol) combination and record its error
# relative to the stored gold standard for that model.
data = []
for method in methods:
    for model in models:
        for rtol in rtols:
            # Header is printed without a newline; the result is appended
            # to the same line once the run has finished.
            print('method: {} model: {} rtol: {}'.format(method.name, model.name, rtol), end='')

            # Run, measuring wall-clock time of the call.
            started = time.time()
            result = method(model, rtol)
            elapsed = time.time() - started

            # Compare to gold standard: largest difference divided by the
            # standard's `max` attribute.
            reference = gold_standards[model.name]
            max_rel_diff = np.max((result - reference.values)/reference.max)

            # Append to table
            print(' err: {} toc: {}'.format(max_rel_diff, elapsed))
            data.append((method.name, model.name, rtol, max_rel_diff, elapsed))

data = DataFrame(data, columns=['method', 'model', 'rtol', 'err', 'time'])

# Error against run time on log-log axes, one colour per method.
print(
    gg.ggplot(data, gg.aes(x='err', y='time', color='method'))
    + gg.geom_point(size=60.0)
    + gg.geom_line()
    + gg.scale_x_log()
    + gg.scale_y_log()
    + gg.xlim(1e-10, 1e-2)
)
"orig_or_mod": ["orig"]*nmc + ["mod"]*nmc }) p_val_truematch_diff = st.ttest_rel(truematch_orig, truematch_mod) p_val_timediff = st.ttest_rel(runtime_orig, runtime_mod) p_val_truematch_diff = st.wilcoxon(truematch_orig, truematch_mod) p_val_timediff = st.wilcoxon(runtime_orig, runtime_mod) #print gg.ggplot(tm_df, aes('orig_or_mod', 'tm')) + \ # gg.geom_line(colour='steelblue') #print gg.ggplot(tm_df, aes('orig_or_mod', 'runtime')) + \ # gg.geom_line(colour='steelblue') comp_plot = gg.ggplot(data=tm_df, aesthetics=gg.aes(x='runtime', y='tm', colour='orig_or_mod')) + gg.geom_point() + gg.scale_x_log10() gg.ggsave("orig_IP_vs_modified_IP_3.pdf",plot = comp_plot)
def test_groups_2_aes(self):
    # Map two discrete aesthetics (colour and shape) on the diamonds data
    # and check how many groups the plot data is split into.
    mapping = gg.aes(x='carat', y='price', color='clarity', shape='cut')
    plot = gg.ggplot(mapping, gg.diamonds) + gg.geom_point()
    _ignored, groups = plot._construct_plot_data()
    # Expected count is 8*5 -- presumably 8 clarity levels times 5 cuts.
    expected_groups = 8*5
    self.assertEqual(len(groups), expected_groups)
import ggplot as gp
import pandas as pd
import numpy as np

# Load the crime table from the working directory.
crime = pd.read_csv('crimeRatesByState2005.csv')

# Red scatter of the 'burglary' column against the 'murder' column.
murder_vs_burglary = gp.aes(x='murder', y='burglary')
scatter_plot = gp.ggplot(murder_vs_burglary, data=crime) + gp.geom_point(color='red')
print(scatter_plot)
# optimization costs: print the cost for every recorded theta vector,
# each reshaped to an (n, 1) column matrix first.
print("optimization costs:")
for t in all_theta:
    t = np.mat(t.reshape(n, 1))
    print(decorateCost(t))

# calculate prediction efficiency
predict = np.round(mllogistic.sigmoid(X*theta))
print("Prediction accuracy = {}".format((predict == y).mean()*100))
# Predictions restricted to the entries where y is non-zero.
detect = predict[np.where(y)]
print(" Detection accuracy = {}".format(detect.sum()/detect.size*100))

if False:
    # scatter ggplot (disabled)
    mtcars.cyl = mtcars.cyl.astype(str)  # changes cyl to a discrete value
    point = gg.ggplot(mtcars, gg.aes("disp", "mpg", colour = "cyl")) + gg.geom_point(size = 35)
    # Function-call form: a print statement here is a syntax error on
    # Python 3 even though this branch never runs.
    print(point)

# scatter pyplot
# NOTE(review): assumed to sit outside the disabled branch above, since the
# following code uses only the live X_prenorm / y data -- confirm.
neg = np.where(y.A1 == 0)
pos = np.where(y.A1 == 1)
fig, ax = plt.subplots()
ax.plot(X_prenorm[neg, 1].A, X_prenorm[neg, 2].A, "ko", markerfacecolor = "b", markersize = 7, label = "cyl4")
ax.plot(X_prenorm[pos, 1].A, X_prenorm[pos, 2].A, "ko", markerfacecolor = "r", markersize = 7, label = "cyl6")
ax.set_xlabel("disp")
ax.set_ylabel("mpg")

# here is the grid range: 50 evenly spaced values per axis, extended by
# `margin` beyond the data minimum and maximum.
margin = 20
u = np.linspace(min(X_prenorm[:,1]).A1 - margin, max(X_prenorm[:,1]).A1 + margin, 50)
v = np.linspace(min(X_prenorm[:,2]).A1 - margin, max(X_prenorm[:,2]).A1 + margin, 50)