print(WEEKLY.head()) # In[7]: ## WRITE TO FILE WEEKLY.reset_index(drop=False).to_excel( 'C:/Users/pmwash/Desktop/Disposable Docs/NEWEST MERGED DATASET.xlsx', sheet_name='CHECK') # In[8]: from bokeh.charts import Histogram, output_file, show h2 = Histogram(WEEKLY, values='CasesPerSale', label='Class_Code', color='Class_Code', title='Test Histogram', plot_width=1100, plot_height=900) show(h2) # In[9]: from ggplot import * ggplot(WEEKLY, aes(x='WeekNumber', y='Cases_Received')) + geom_line() # In[10]: from bokeh.charts import BoxPlot
#%matplotlib inline #pylab.rcParams['figure.figsize'] = (10, 6) def convert_int(dataframe, colname): dataframe[colname] = pd.to_numeric(dataframe[colname], errors='coerce') limit_rows = 1000000 df = pd.read_csv("C:/Users/vasir/Desktop/ADM/train_ver2.CSV", nrows=limit_rows) convert_int(df, 'age') print df['age'].dtype df['age'] = df['age'].fillna(-1) cols = ['age'] df[cols] = df[cols].applymap(np.int64) df_frac = df.sample(frac=0.01) p_age = Histogram(df_frac, values='age', title="Age Distribution") #show(p_age) dffrac1 = df_frac.dropna(subset=['sexo'], how='any') dffrac1.head() #dffrac1['sexo']=dffrac1['sexo'].astype('category') p = Bar(dffrac1, 'sexo', title="Sex") #show(p) dffrac2 = df_frac.dropna(subset=['renta'], how='any') bar_renta = Bar(dffrac2, values='renta', label='nomprov', agg='mean', title="City Vs Renta", legend=False,
"""Show two auto-MPG histograms side by side and save them to hist.html."""
from bokeh.charts import Histogram, output_file, show
from bokeh.layouts import row
from bokeh.sampledata.autompg import autompg as df

# Choose the output document before any chart is built.
output_file('hist.html')

# Overall distribution of the 'mpg' column.
hist = Histogram(
    df,
    values='mpg',
    title="Auto MPG Histogram",
    plot_width=400,
)

# Same column, labelled and colored by cylinder count.
hist2 = Histogram(
    df,
    values='mpg',
    label='cyl',
    color='cyl',
    legend='top_right',
    title="MPG Histogram by Cylinder Count",
    plot_width=400,
)

side_by_side = row(hist, hist2)
show(side_by_side)
def histogram(histDF, values, **kwargs):
    """Build a bokeh ``Histogram`` chart from ``histDF[values]``.

    Args:
        histDF: object indexed with ``values`` to obtain the data to plot.
        values: key used to select the data out of ``histDF``.
        **kwargs: forwarded unchanged to ``bokeh.charts.Histogram``.

    Returns:
        The chart object returned by ``Histogram``.
    """
    # Imported lazily so bokeh is only needed when a chart is requested.
    from bokeh.charts import Histogram

    selected = histDF[values]
    chart = Histogram(selected, **kwargs)
    return chart
get_ipython().system( u'curl --upload-file restaraunts.html https://transfer.sh/restaruants.html' ) # In[93]: from bokeh.io import output_notebook output_notebook() # In[94]: from bokeh.charts import Histogram, output_file, show # In[96]: p1 = Histogram(samp["SCORE"]) show(p1) # In[103]: p2 = Histogram(mRests, 'SCORE', color='GRADE', title="Score Grouped by Grade", bins=15, legend='top_left') show(p2) # In[99]:
"""Gallery of the input styles accepted by ``bokeh.charts.Histogram``.

Six histograms of the auto-MPG sample data are laid out two per row and
written to ``histogram_multi.html``.
"""
from bokeh.charts import Histogram, defaults, show, output_file
from bokeh.layouts import gridplot
from bokeh.sampledata.autompg import autompg as df

# Default size shared by every chart created below.
defaults.plot_width, defaults.plot_height = 400, 350

# input options
# 1) a single column passed directly
hist = Histogram(df['mpg'], title="df['mpg']")

# 2) data frame plus a positional column name
hist2 = Histogram(df, 'displ', title="df, 'displ'")

# 3) column given through the values keyword, with density=True
hist3 = Histogram(df, values='hp', title="df, values='hp'", density=True)

# 4) colored by the 'cyl' column
hist4 = Histogram(
    df,
    values='hp',
    color='cyl',
    title="df, values='hp', color='cyl'",
    legend='top_right',
)

# 5) explicit number of bins
hist5 = Histogram(
    df,
    values='mpg',
    bins=50,
    title="df, values='mpg', bins=50",
)

# 6) explicit bin edges, with a tooltip showing the bin label
hist6 = Histogram(
    df,
    values='mpg',
    bins=[10, 15, 25, 100],
    tooltips=[('Bin', "@label")],
    title="df, values='mpg', bins=[10, 15, 25, 100]",
)

output_file("histogram_multi.html", title="histogram_multi.py example")

all_charts = (hist, hist2, hist3, hist4, hist5, hist6)
show(gridplot(*all_charts, ncols=2))
"""Plot a normal and a lognormal sample as overlaid bokeh histograms.

The chart is written to ``histograms.html`` and opened with ``show``.
"""
# OrderedDict is used below but was never imported, so the script stopped
# with a NameError before any chart was built.
from collections import OrderedDict

import numpy as np
import pandas as pd
from bokeh.charts import Histogram, show, output_file

# build some distributions and load them into a dict
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = OrderedDict(normal=normal, lognormal=lognormal)

# create a pandas data frame from the dict
df = pd.DataFrame(distributions)

# Convert back to a plain {column: values} mapping. The chart itself uses
# ``df``; this mapping only feeds the alternative inputs listed below.
distributions = df.to_dict()
for k, v in distributions.items():
    distributions[k] = v.values()

# any of the following commented are valid Histogram inputs
#df = list(distributions.values())
#df = tuple(distributions.values())
#df = tuple([tuple(x) for x in distributions.values()])
#df = np.array(list(distributions.values()))
#df = list(distributions.values())[0]

output_file("histograms.html")
hist = Histogram(df, bins=50, legend=True)
show(hist)
def plot_timeline_cu(self, width, height, dataframe, cu_id, x_max=None, y_max=None):
    """Build the timeline plot and the stall histogram for one ``cu``.

    Args:
        width: width of the timeline figure.
        height: height of the timeline figure; also used for both the
            height and the width of the histogram.
        dataframe: rows to draw; the 'start', 'length', 'color' and 'stall'
            columns are read.
        cu_id: identifier used in the ``WHERE cu=...`` filter and the title.
        x_max: upper bound of the x range; when None it is taken from
            ``self.get_max`` over ``start + length``.
        y_max: upper bound of the y range; when None it is taken from
            ``self.get_count`` over ``uid``.

    Returns:
        A ``(timeline_figure, stall_histogram)`` tuple.
    """
    # Drawn first so the order of calls into ``tm`` matches the original.
    hist_color = tm.get_random_color()

    # Fall back to limits computed for this cu when none were supplied.
    cu_filter = 'WHERE cu=' + str(cu_id)
    if x_max is None:
        x_max = int(self.get_max("inst", "start + length", cu_filter))
    if y_max is None:
        y_max = int(self.get_count("inst", "uid", cu_filter))

    # Box annotations plus the cycle counters shown in the title.
    boxannotations, info = self.get_interval_boxannotation(cu_id)
    total_cycles = info['cycle_all']
    load_cycles = info['cycle_mem_ld']
    store_cycles = info['cycle_mem_st']
    other_cycles = info['cycle_other']

    title_parts = [
        str(load_cycles) + ' mem ld',
        str(store_cycles) + ' mem st',
        str(other_cycles) + ' other',
        str(total_cycles) + ' all',
    ]
    title = 'cu-' + str(cu_id) + ': ' + ' / '.join(title_parts)

    timeline = figure(webgl=True,
                      width=width,
                      height=height,
                      x_range=(0, x_max),
                      y_range=(0, y_max),
                      title=title)

    # One horizontal segment per row, stacked by row position.
    row_positions = range(len(dataframe.index))
    segment_start = dataframe['start']
    segment_end = dataframe['start'] + dataframe['length']
    timeline.segment(x0=segment_start,
                     y0=row_positions,
                     x1=segment_end,
                     y1=row_positions,
                     line_width=1,
                     color=dataframe['color'])

    for annotation in boxannotations:
        timeline.add_layout(annotation)

    # Histogram of the 'stall' column, titled with its mean and median.
    stall = dataframe['stall']
    stall_mean = np.round(stall.mean(), 2)
    stall_median = stall.median()
    hist_title = 'stall' + ' / avg ' + str(stall_mean) + ' / mid ' + str(stall_median)
    stall_hist = Histogram(dataframe,
                           'stall',
                           bins=50,
                           height=height,
                           width=height,
                           color=hist_color,
                           title=hist_title)
    return (timeline, stall_hist)
def badass_plot(newpoint, new_df):
    """Render the fraud-case-study histograms with a new point marked on each.

    For each of four features a density histogram of ``new_df`` (grouped and
    colored by its 'label' column) is built, and a red vertical line is
    drawn at the new point's value for that feature. The four charts are
    placed in a tabbed layout.

    Args:
        newpoint: mapping (e.g. parsed JSON or a CSV row as a dict) holding
            the keys 'event_created_to_end', 'total_tickets_sold',
            'payout_type_MISSING' and 'median_ticket_cost'.
        new_df: data frame with those same columns plus 'label'.

    Returns:
        The value of ``file_html(tabs, CDN, 'plot')`` for the tabbed layout.
    """
    color_list = ['#FF8C00', '#FAFCCC', '#EA653B', '#EE993D',
                  '#EA653B', '#ED9239', '#A4A7AA', '#676767']
    output_file("Check-Graphs.html")

    # (column, chart title, number of bins, tab title) for each tab, in order.
    # The original built a throwaway figure() before every chart and then
    # overwrote it with the Histogram; those dead objects are gone.
    chart_specs = [
        ('event_created_to_end', 'Duration of \n Event Created to End', 15,
         'Event Create to End'),
        ('total_tickets_sold', 'Total Tickets Sold', 2,
         'Total Tickets Sold'),
        ('payout_type_MISSING', 'Payout Type Missing \n (Yes/No)', 2,
         'Payout Type Missing (yes/no)'),
        ('median_ticket_cost', 'Median Ticket Cost', 30,
         'Median Ticket Cost'),
    ]

    panels = []
    for column, chart_title, bins, tab_title in chart_specs:
        chart = _labelled_histogram_with_marker(
            new_df, newpoint, column, chart_title, bins, color_list)
        panels.append(Panel(child=chart, title=tab_title))

    tabs = Tabs(tabs=panels)
    return file_html(tabs, CDN, 'plot')


def _labelled_histogram_with_marker(new_df, newpoint, column, title, bins,
                                    palette):
    """Return a styled density histogram of ``column`` marked at ``newpoint[column]``."""
    chart = Histogram(new_df, values=column, label='label', color='label',
                      title=title, density=True, legend='top_right',
                      bins=bins, palette=palette)
    # Red vertical line at the new observation's value.
    vline = Span(location=newpoint[column], dimension='height',
                 line_color='red', line_width=3)

    chart.xaxis.axis_label = ''
    chart.xaxis.axis_label_text_font_style = 'normal'
    chart.yaxis.axis_label_text_font_style = 'normal'
    chart.xaxis.axis_label_text_font_size = '18pt'
    chart.yaxis.axis_label_text_font_size = '18pt'
    chart.background_fill_color = "#acaaa8"
    chart.background_fill_alpha = 0.5
    chart.title.text_font = 'helvetica'
    chart.title.text_font_size = '18pt'
    chart.renderers.extend([vline])
    return chart
# In[6]: flowers[flowers.sepal_length < 5] # In[7]: flowers.describe() # In[8]: display(flowers["species"].unique()) display(flowers.species.value_counts()) # In[9]: hist = Histogram(flowers, values='petal_length') show(hist) # In[10]: hist2 = Histogram(flowers, values='petal_length', label="species", color="species") show(hist2) # In[11]: colormap = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'} colors = [colormap[x] for x in flowers['species']] p = figure(title="Iris Morphology")
def Histograms():
    """Return a bokeh ``Histogram`` built from module-level settings.

    Reads the names ``df``, ``value`` and ``title`` from the enclosing
    scope; ``value`` is used both as the plotted column and as the color
    grouping. The legend is placed at the top right.
    """
    chart_options = dict(values=value,
                         color=value,
                         title=title,
                         legend='top_right')
    # script, div = components(plot)
    return Histogram(df, **chart_options)
def dashboard(request, counter_name, db):
    """Render the dashboard page for one counter.

    Queries the ``db`` database for the counter chosen on the previous page
    and embeds three bokeh views of its values in the template: a time
    series, a histogram and an area chart titled "CDF".

    Args:
        request: the request being served.
        counter_name: name of the counter whose values are plotted.
        db: alias of the database to query.

    Returns:
        The rendered ``counters_app/simple_bokeh.html`` response.
    """
    counters = Counter.objects.using(db)

    # Fetch every publication date in one query. The previous code indexed
    # the queryset once per row, costing one database round trip per row.
    # NOTE(review): the dates cover every counter while y_values is filtered
    # by counter_name, so the two sequences can differ in length -- confirm
    # whether the dates should be filtered the same way.
    Date = list(counters.values_list("pub_date", flat=True))
    name = counter_name
    y_values = counters.values_list(
        "counter_value", flat=True).filter(counter_name=counter_name)

    # Materialise the pairs: on Python 3 zip() returns a one-shot iterator
    # that supports neither len() nor indexing.
    points = list(zip(Date, y_values))

    #plot specifications
    TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"
    p = figure(width=1200,
               height=400,
               x_axis_type="datetime",
               tools=TOOLS,
               title=name + "'s Metrics")
    p.min_border_left = 100
    p.min_border_top = 50
    p.border_fill = 'whitesmoke'
    p.ygrid.band_fill_color = "#E6E6FA"
    p.title_text_color = "olive"
    p.title_text_font = "times"
    p.title_text_font_style = "italic"
    p.outline_line_width = 7
    p.outline_line_alpha = 0.3
    p.outline_line_color = "black"

    #HoverTool specifications
    # One rate per pair of consecutive points, computed by slope() from the
    # seconds field of each timestamp and the counter values.
    rates = [
        slope(first_date.second, first_value, next_date.second, next_value)
        for (first_date, first_value), (next_date, next_value)
        in zip(points, points[1:])
    ]
    source = ColumnDataSource(data=dict(rates=rates))
    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips = OrderedDict([
        ("Counter Name", name),
        ("Rate of count", "@rates c/us"),
    ])
    p.line(Date, y_values, line_width=1, source=source)
    p.square(Date, y_values, fill_color=None, line_color="green", size=4)
    script1, div1 = components(p, CDN)

    hist = Histogram(list(y_values), bins=50, title='Histogram')
    hist.border_fill = 'whitesmoke'
    hist.background_fill = "beige"
    script2, div2 = components(hist, CDN)

    area = Area(list(y_values), title="CDF")
    area.border_fill = "whitesmoke"
    area.background_fill = "#191970"
    script3, div3 = components(area, CDN)

    context = RequestContext(
        request, {
            "the_script1": script1,
            "the_div1": div1,
            "the_script2": script2,
            "the_div2": div2,
            "the_script3": script3,
            "the_div3": div3
        })
    return render(request, "counters_app/simple_bokeh.html", context)
# Best squad for the 3-5-2 formation.
squad_352 = [
    'GK',
    'LWB', 'CB', 'RWB',
    'LM', 'CDM', 'CAM', 'CM', 'RM',
    'LW', 'RW',
]
print ('3-5-2')
print (get_best_squad(squad_352))

# Visualisations
# Distributions of age and of overall rating.
data['Age'].plot(kind='hist', bins=20)
data['Overall'].plot(kind='hist', bins=20)

# Improved histogram, restricted to the top clubs.
top_club_names = [
    'FC Barcelona',
    'Juventus',
    'Real Madrid CF',
    'FC Bayern Munich',
    'Paris Saint-Germain',
]
melhoresClubs = data[data.Club.isin(top_club_names)]
hist = Histogram(data=melhoresClubs,
                 values="Age",
                 color="Club",
                 legend="top_right",
                 bins=12)
show(hist)

# Box plot to analyse salaries at the top clubs.
sns.set(style="whitegrid", color_codes=True)
sns.boxplot(x="Club",
            y="salario",
            hue="Club",
            data=melhoresClubs,
            palette="PRGn")
sns.despine(offset=10, trim=True)

# Players rated above 85, counted per club and drawn in descending order.
best = data[data['Overall'] > 85]
grouped = best.groupby('Club')
count_by_club = grouped['Name'].count().sort_values(ascending=False)
club_order = count_by_club.index
ax = sns.countplot(x='Club', data=best, order=club_order)
ax.set_xticklabels(labels=club_order, rotation='vertical')
ax.set_ylabel('Numero de jogadores')
ax.set_xlabel('Clube')
ax.set_title('Clubes com os melhores jogadores')
def GDP_PCA_plot(filename=None,
                 threshold=0.015,
                 lowerbound=2.0,
                 upperbound=1.0e4,
                 factor=0.08):
    """Detect blobs in a saved density image and return bokeh components.

    Loads ``'uploads/' + filename`` with ``np.load``, smooths the rescaled
    image, runs ``blob_log`` on it and builds three image figures plus a
    histogram of the detected blob sizes.

    Args:
        filename: name of the file under ``uploads/``; the archive must
            expose ``image``, ``X`` and ``Y``.
            NOTE(review): the default of None makes the path concatenation
            raise TypeError -- confirm callers always pass a name.
        threshold: forwarded to ``blob_log``.
        lowerbound: blobs whose size column is not above this are dropped.
        upperbound: blobs whose size column is not below this are dropped.
        factor: multiplied by the image's standard deviation to give the
            Gaussian filter sigma.

    Returns:
        A list holding ``head``, then ``list(components(figure))`` for each
        of the three figures, then ``list(components(histogram))``.
    """
    data = np.load('uploads/' + filename)
    image1 = data.f.image  #-np.median(data.f.image)
    # Rescale so the maximum maps to 255, stored as unsigned 8-bit.
    image2 = np.array(image1 * 255 / image1.max(), dtype='uint8')
    # NOTE(review): np.partition only reorders elements, so this mean is the
    # mean of the whole image -- presumably a low-percentile mean was
    # intended; confirm.
    image_avg = np.mean(np.partition(image2, int(len(image2) * 0.2)))
    # Raise every pixel below that level up to it.
    image2[image2 < image_avg] = int(image_avg)
    #H1=cv2.GaussianBlur(image2,(3,3),1.0*np.std(image2))
    H1 = gaussian_filter(image2, factor * np.std(image2), mode='nearest')
    image2 = H1
    blobs_log = blob_log(image2,
                         max_sigma=0.3 * np.std(image2),
                         min_sigma=0.02 * np.mean(image2),
                         num_sigma=20,
                         threshold=threshold,
                         overlap=0.6)
    # Third column scaled by sqrt(2); presumably converts the detector's
    # sigma into a radius -- confirm against the blob_log documentation.
    blobs_log[:, 2] = blobs_log[:, 2] * np.sqrt(2)
    # Keep only blobs whose size lies strictly between the two bounds.
    blobs = blobs_log[(blobs_log[:, 2] > lowerbound)
                      & (blobs_log[:, 2] < upperbound)]
    # Axis extents; the upper ends are rounded to the nearest ten.
    xx = (data.f.X.min(), np.round(data.f.X.max(), -1))
    yy = (data.f.Y.min(), np.round(data.f.Y.max(), -1))
    x_step = (xx[1] - xx[0]) / np.shape(H1)[0]
    y_step = (yy[1] - yy[0]) / np.shape(H1)[1]
    #Number of NV
    height = yy[1] - yy[0]
    width = xx[1] - xx[0]
    total = len(blobs)
    # Blob count per unit area, scaled to a 20x20 area.
    per_nv = round(len(blobs) / float(height * width) * (20 * 20), 2)
    ########################################################
    # One title per figure; the first is built by implicit string
    # concatenation followed by the file name.
    t = [
        'Original Density Plot,'
        ' Filename=' + filename, 'Gaussian Filtered Density Plot',
        'Total NVs =' + str(total) + ' , NVs per 20x20 pixel area = ' +
        str(per_nv)
    ]
    data_list = [image1, H1, H1]
    color_list = [
        Viridis256, cc.b_linear_bgy_10_95_c74, cc.b_linear_bgy_10_95_c74
    ]
    return_list = []
    # NOTE(review): head is not defined in this function; presumably a
    # module-level value -- confirm.
    return_list.append(head)
    #hover tool hack to work for image function in bokeh.
    ##http://stackoverflow.com/questions/28176949/convert-list-of-tuples-to-structured-numpy-array
    # NOTE(review): on Python 3 the `/ 2` below yields a float sample count,
    # which np.linspace may reject -- confirm the target interpreter.
    px = np.linspace(xx[0], xx[1], np.shape(H1)[0] / 2)
    py = np.linspace(yy[0], yy[1], np.shape(H1)[1] / 2)
    px = np.array(px, dtype='uint32')
    py = np.array(py, dtype='uint32')
    # Pair every px value with every py value.
    a = []
    for i in px:
        a.extend(zip(itertools.repeat(i), py))
    dt = np.dtype('int,float')
    X = np.array(a, dtype=dt)
    x1 = X['f0']
    y1 = X['f1']
    ##################################################################
    for i in range(3):
        # Log color scale running from the mean to mean + 2 standard
        # deviations of the image being drawn.
        color_mapper = LogColorMapper(palette=color_list[i], \
                                      low=np.mean(data_list[i]), \
                                      high=1.0*np.mean(data_list[i])+\
                                      2.0*np.std(data_list[i]))
        color_bar = ColorBar(color_mapper=color_mapper,\
                             label_standoff=12, \
                             border_line_color=None, \
                             location=(0,0))
        # NOTE(review): TOOLS is not defined in this function; presumably
        # module-level. y_range is set from xx and the image is anchored at
        # y=xx[0] although yy exists -- confirm that is intended.
        p1 = figure(plot_width=600, plot_height=600,title=t[i],title_text_font_size='12pt',\
                    x_range=xx,y_range=xx,tools=TOOLS,toolbar_location="below",toolbar_sticky=False,responsive=True)
        # Grid of squares drawn under the image (the hover hack noted above).
        p1.square(x1, y1, alpha=1.0)
        p1.image(image=[data_list[i]],
                 color_mapper=color_mapper,
                 dh=yy[1] - yy[0],
                 dw=xx[1] - xx[0],
                 x=xx[0],
                 y=xx[0])
        p1.add_layout(color_bar, 'right')
        # Only the third figure gets the detected blobs drawn as red circles.
        if i == 2:
            p1.circle(blobs[:, 1] * x_step + xx[0],
                      blobs[:, 0] * y_step + yy[0],
                      radius=blobs[:, 2] * 1.6,
                      radius_dimension='y',
                      line_color='red',
                      alpha=1.0,
                      line_width=3,
                      fill_color=None)
        p1.title.text_font_size = "11pt"
        p1.xaxis.axis_label_text_font_size = "13pt"
        p1.yaxis.axis_label_text_font_size = "13pt"
        #plots = {'Navy': p1, 'Blue': p2};
        tuple_plot = components(p1)
        #script2, div2 = components(p2);
        return_list.append(list(tuple_plot))
    # Histogram of the (scaled) size column of the retained blobs.
    p=Histogram(blobs[:,2],\
                plot_width=600, plot_height=600,tools=TOOLS,\
                toolbar_location="below",toolbar_sticky=False,\
                responsive=True,\
                title="Distribution of Radii of NV Centers")
    return_list.append(list(components(p)))
    return return_list
def get_histogram(t):
    """Return a toolbar-less histogram of column ``t``.

    The data comes from ``get_data`` called with the current values of the
    ``ticker1`` and ``ticker2`` widgets; only column ``t`` is passed to the
    chart.
    """
    first_ticker = ticker1.value
    second_ticker = ticker2.value
    frame = get_data(first_ticker, second_ticker)

    single_column = frame[[t]]
    chart = Histogram(single_column, values=t)
    chart.toolbar_location = None
    return chart
"""Five auto-MPG histograms, one per accepted input style.

The charts are stacked as two rows and written to ``histograms.html``.
"""
from bokeh.charts import Histogram
from bokeh.sampledata.autompg import autompg as df
from bokeh.charts import defaults, vplot, hplot, show, output_file

# Default size shared by every chart created below.
defaults.width, defaults.height = 450, 350

# input options
# a single column passed directly
hist = Histogram(df['mpg'], title="df['mpg']")

# data frame plus a positional column name
hist2 = Histogram(df, 'displ', title="df, 'displ'")

# column given through the values keyword
hist3 = Histogram(df, values='hp', title="df, values='hp'")

# colored by the 'cyl' column
hist4 = Histogram(
    df,
    values='hp',
    color='cyl',
    title="df, values='hp', color='cyl'",
    legend='top_right',
)

# explicit number of bins
hist5 = Histogram(
    df,
    values='mpg',
    bins=50,
    title="df, values='mpg', bins=50",
)

output_file("histograms.html")

top_row = hplot(hist, hist2, hist3)
bottom_row = hplot(hist4, hist5)
show(vplot(top_row, bottom_row))
As in the previous two chapters, you can interact with the figures you create in this chapter as well, and you may have to scroll down to view the lower portion of some of them. Instructions Import Histogram, output_file, and show from bokeh.charts. Make a histogram called p with the Histogram() function using the 'female_literacy' column of df. You have to first specify df and then 'female_literacy'. Give the histogram a title of 'Female Literacy'. Set the x-axis label using p.xaxis.axis_label. Set the y-axis label using p.yaxis.axis_label. Specify the name 'histogram.html' for the output file and display the histogram p. """ # Import Histogram, output_file, and show from bokeh.charts from bokeh.charts import Histogram, output_file, show # Make a Histogram: p p = Histogram(df, 'female_literacy', title='Female Literacy') # Set the x axis label p.xaxis.axis_label = 'Nb' # Set the y axis label p.xaxis.axis_label = 'Female Literacy' # Specify the name of the output_file and show the result output_file('histogram.html') show(p) """ Console Output or results see plot25.png Great work! In the next exercise, you will learn how to customize histograms by controlling the number of bins. """ """
# Write out the chart built just before this fragment.
output_file("tSNE_predicted_labels.html")
show(p)

# Scatter of the x/y columns of dftsne, colored by Machine_Label.
p = Scatter(
    dftsne,
    x='x',
    y='y',
    color='Machine_Label',
    title='HG002 CrowdVar: Crowd Sourced Labels',
    legend="bottom_left",
)
output_file("tSNE_CrowdSourced_labels.html")
show(p)

# The same size histogram rendered at two bin counts, each to its own file.
for bin_count, html_name in (
        (19, "tSNE4_INS_Histo_logsize.html"),
        (30, "tSNE4_INS_Histo_logsize.2.html"),
):
    p = Histogram(
        log_size,
        values='INS_log_size',
        title='HG002 INS: Size Distribution [5000 Samples]',
        color='LightSlateGray',
        bins=bin_count,
        xlabel="Size[log10]",
        ylabel="Frequency",
    )
    output_file(html_name)
    show(p)
"""Histogram of a normal and a lognormal sample via the chained chart API."""
import numpy as np
import pandas as pd
from bokeh.charts import Histogram

# Draw 1000 values from each distribution (same parameters for both).
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = {"normal": normal, "lognormal": lognormal}

# One data frame column per distribution.
df = pd.DataFrame(distributions)

# Build the chart, then configure and show it through the chained calls.
hist = Histogram(df, bins=50, filename="histograms.html")
(
    hist.title("Histograms")
    .ylabel("frequency")
    .legend(True)
    .width(400)
    .height(350)
    .show()
)
"""Histogram of a normal and a lognormal sample, saved to histograms.html."""
from collections import OrderedDict

import numpy as np
import pandas as pd
from bokeh.charts import Histogram

# Draw 1000 values from each distribution and keep them in insertion order.
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = OrderedDict(normal=normal, lognormal=lognormal)

# One data frame column per distribution.
df = pd.DataFrame(distributions)

# Plain {column: values} mapping; only the alternative inputs below use it.
distributions = {
    column: by_index.values()
    for column, by_index in df.to_dict().items()
}

# any of the following commented are valid Histogram inputs
#df = list(distributions.values())
#df = tuple(distributions.values())
#df = tuple([tuple(x) for x in distributions.values()])
#df = np.array(list(distributions.values()))
#df = list(distributions.values())[0]

chart_options = dict(bins=50, filename="histograms.html", legend=True)
hist = Histogram(df, **chart_options)
# hist.title("Histograms").ylabel("frequency").xlabel('distributions').legend(True)
# hist.width(400).height(350)
hist.show()
"""Histogram of the auto-MPG 'mpg' column, written to histogram.html."""
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# Pass the column itself rather than the whole data frame.
mpg_values = df['mpg']
p = Histogram(mpg_values, title="MPG Distribution")

output_file("histogram.html")
show(p)
"""Histogram of the auto-MPG 'hp' column, written to histogram.html."""
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# Data frame first, then the column name as the second positional argument.
chart_title = "HP Distribution"
p = Histogram(df, 'hp', title=chart_title)

output_file("histogram.html")
show(p)
def plot_prediction_histogram(images_path,
                              model_path,
                              weights_path,
                              img_shape,
                              required_class,
                              class_name_dict=None,
                              batch_size=16,
                              preprocessing_function=None,
                              plotting_module="matplotlib"):
    """Plot the histogram of predicted probabilities of a given class.

    Args:
        images_path (str): full path to the parent directory containing one
            sub-directory of images per class.
        model_path (str): full path to a keras model (.json file).
        weights_path (str): full path to the weights file (.hdf5 file).
        img_shape (tuple): image shape fed to the model, e.g. (224, 224, 3);
            only the first two entries are used as the target size.
        required_class (str): name of the class whose predicted
            probabilities are plotted, e.g. "dog".
        class_name_dict (dict): mapping of classes. It must be non-empty,
            but its contents are replaced by the generator's own
            ``class_indices`` before use.
        batch_size (int): batch size used for prediction (default 16).
        preprocessing_function: function applied to each image before
            prediction (default None).
        plotting_module (str): 'matplotlib' or 'bokeh' (default
            'matplotlib').

    Raises:
        ValueError: if ``class_name_dict`` is empty or None, or if
            ``plotting_module`` is not one of the supported names.
    """
    # None replaces the former mutable ``{}`` default; both mean "not given".
    if not class_name_dict:
        raise ValueError("Provide the class_name_dict")

    # loading model and weights
    # The context manager closes the file even if reading fails.
    with open(model_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights
    loaded_model.load_weights(weights_path)
    print('loaded model from disk')

    # generator for data
    test_datagen = ImageDataGenerator(
        preprocessing_function=preprocessing_function)
    test_generator = test_datagen.flow_from_directory(
        images_path,
        target_size=(img_shape[0], img_shape[1]),
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=False,  # to get ordered result
    )
    # this is an important step else there is a difference in result between
    # predict and predict_generator
    test_generator.reset()  # reset to start with sample 0
    nb_samples = test_generator.samples
    class_name_dict = test_generator.class_indices  # eg {'dog': 0, 'cat': 1}

    # predict
    # Round the step count up so the final, partial batch is predicted too;
    # floor division silently dropped up to batch_size - 1 images and gave
    # zero steps when there were fewer images than one batch.
    steps = (nb_samples + batch_size - 1) // batch_size
    predictions = loaded_model.predict_generator(test_generator,
                                                 steps=steps,
                                                 max_queue_size=10,
                                                 workers=1,
                                                 use_multiprocessing=False,
                                                 verbose=1)
    class_idx = class_name_dict[required_class]
    hist_data = predictions[:, class_idx]

    axis_label = 'Predicted probability of {}'.format(required_class)
    if plotting_module == "matplotlib":
        plt.hist(hist_data, bins=10)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel(axis_label)
        plt.ylabel('Frequency')
    elif plotting_module == "bokeh":
        df = pd.DataFrame({axis_label: hist_data})
        from bokeh.io import output_notebook
        import warnings
        # NOTE(review): this silences warnings for the whole process, not
        # only for the bokeh calls below.
        warnings.filterwarnings('ignore')
        from bokeh.charts import Histogram, show
        output_notebook()
        p = Histogram(df,
                      axis_label,
                      title="Histogram of predicted probabilities")
        show(p)
    else:
        raise ValueError("Only bokeh and matplotlib are supported")
view2 = CDSView(source=source, filters=[GroupFilter(column_name='handedness', group='L')]) p_L = figure(tools = tools, title = 'Batting Average vs Home Runs of left handed players', height = 200) p_L.xaxis.axis_label = 'Average' p_L.yaxis.axis_label = 'Home Runs' p_L.circle(x="avg", y="HR", color = 'green', hover_color="yellow", view=view2, source = source) view3 = CDSView(source=source, filters=[GroupFilter(column_name='handedness', group='B')]) p_B = figure(tools = tools, title = 'Batting Average vs Home Runs of both handed players', height = 200) p_B.xaxis.axis_label = 'Average' p_B.yaxis.axis_label = 'Home Runs' p_B.circle(x="avg", y="HR", color = 'blue', hover_color="yellow", view=view3, source = source) hist = Histogram(data, values='avg', title="Average of all", plot_height=300) hist2 = Histogram(data, values='avg', label='handedness', color='handedness', legend='top_right', title="Average of R, L and B handedness", plot_height=300) hist3 = Histogram(data, values='HR', title="Home runs of all", plot_height=300) hist4 = Histogram(data, values='HR', label='handedness', color='handedness', legend='top_right', title="Home runs of R, L and B handedness", plot_height=300) marker = {'R': 'red', 'L': 'green', 'B' : 'blue'} markers = ['0.05' if x <= 0.05 else '0.01' if 0.05 < x <= 0.10 else '0.15' if 0.10 < x < 0.15 else '0.20' if 0.15 < x < 0.20 else '0.25' if 0.25 < x < 0.30 else '0.30' for x in data['avg']] scatter1 = Scatter(data, x='weight', y='avg', color='handedness', title="Weight vs Average", xlabel="Weight", ylabel="Average", plot_height=300) scatter2 = Scatter(data, x='height', y='avg', color='handedness',
"""Navy histogram of the auto-MPG 'hp' column, saved to histogram_color.html."""
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# A single fixed color is applied to the whole chart.
chart_options = dict(values='hp', color='navy', title="HP Distribution")
p = Histogram(df, **chart_options)

output_file("histogram_color.html")
show(p)
# ch26.py
# ref:
# http://bokeh.pydata.org/en/latest/docs/user_guide/charts.html#histograms
# The plotted column is named explicitly through the values keyword.

from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

chart_title = "DISPL Distribution"
p = Histogram(df, values='displ', title=chart_title)

output_file("/tmp/ch26.html")
show(p)
def get_memory_hists(self, mode='overview'):
    """Build one latency histogram per memory access type plus a donut chart.

    For every entry returned by ``__get_memory_access_types(mode)`` the
    ``length`` column of the ``memory`` table is queried using the entry's
    value as the WHERE clause, and a 50-bin histogram of it is built.

    Args:
        mode: passed through to ``__get_memory_access_types``.

    Returns:
        A dict with two keys: 'hist', the list of histograms followed by the
        donut chart of summed lengths, and 'info', a list of data frames
        holding the mean and median of each access type.
    """
    # Mapping of access-type name -> SQL condition text (see the query below).
    access = self.__get_memory_access_types(mode)
    hists = []
    data_dfs = []
    col_cycle = []
    col_index = []
    # NOTE(review): dict.iteritems() exists only on Python 2.
    for key, value in access.iteritems():
        # NOTE(review): the condition is concatenated into the SQL text;
        # presumably it comes from trusted internal code -- confirm.
        sql_query = 'SELECT length FROM memory'
        sql_query += ' WHERE ' + value
        sql_query += ' ORDER by line'
        dataframe = pd.read_sql_query(sql_query, self.get_db())
        color = tm.get_random_color()
        # Per-access-type columns of the summary frame built further down.
        col_sched = []
        col_location = []
        col_value = []
        col_type = []
        col_color = []
        try:
            # Some statistics
            mean = np.round_(dataframe["length"].mean(), 2)
            median = dataframe["length"].median()
            sum_len = dataframe["length"].sum()
            # Recorded before the histogram is built, so an access type
            # skipped by the except below still appears in the donut chart.
            col_cycle.append(sum_len)
            col_index.append(key)
            # Plot histogram
            hist_title = key
            hist_title += ' / avg ' + str(mean)
            hist_title += ' / mid ' + str(median)
            # Zeros are replaced with NaN before being handed to the chart.
            plot_hist = Histogram(dataframe["length"].replace(0, np.nan),
                                  'length',
                                  bins=50,
                                  color=color,
                                  xlabel='Latency of instruction in cycle',
                                  ylabel='Count',
                                  title=hist_title)
        except ValueError:
            # Skip this access type: no histogram and no summary frame.
            continue
        # Ignore NaN
        # (NaN is the only value that compares unequal to itself.)
        if mean == mean and median == median:
            # Insert mean
            col_sched.append(self.get_name_instruction_scheduler())
            col_location.append(key)
            col_value.append(mean)
            col_type.append('Mean')
            col_color.append(color)
            # Insert median
            col_sched.append(self.get_name_instruction_scheduler())
            col_location.append(key)
            col_value.append(median)
            col_type.append('Median')
            col_color.append(color)
        # Two rows (mean, median) when both are valid numbers, else no rows.
        data = {
            'sched': col_sched,
            'location': col_location,
            'value': col_value,
            'type': col_type,
            'color': col_color
        }
        data_df = pd.DataFrame(data, index=col_location)
        data_dfs.append(data_df)
        hists.append(plot_hist)
    # Plot break down of cycles
    data = {'cycle': col_cycle}
    cycle_df = pd.DataFrame(data, index=col_index)
    pie_cycle = Donut(cycle_df.replace(0, np.nan), title='Break down: cycles')
    hists.append(pie_cycle)
    info = {'hist': hists, 'info': data_dfs}
    return info
# Script fragment: finish configuring plt2, then chart sale prices by product.
# NOTE(review): plt2, Range1d, data_sold, Histogram, output_file and show
# are defined earlier in the script and are not visible here.

# Hide the toolbar and pin the y axis to the 0..1 range.
plt2.toolbar_location = None
plt2.y_range = Range1d(start=0, end=1)

# Save the plot
# NOTE(review): with show(plt2) commented out, nothing visible here renders
# plt2 after this output target is set -- confirm that is intended.
output_file("./templates/plot3_new.html")
# show(plt2)

#######################################################
# Look at dependence of sale outcome on seller feedback score
#######################################################
# hist = Histogram(data_sold, values='feedbackScore', color='listingType',
#                  title="Distribution of feedback scores for sold items", legend='top_left')
#
# # Save the plot
# output_file("./templates/data_exploration.html")
# show(hist)

#######################################################
# Look at distributions of sale prices by product
#######################################################
# Histogram of the 'value' column, colored by 'productId_value'.
hist = Histogram(data_sold,
                 values='value',
                 color='productId_value',
                 title="Distribution of sale prices by product",
                 legend='top_left')

# Save the plot
output_file("./templates/sales_histogram.html")
show(hist)
"""50-bin histogram of the auto-MPG 'mpg' column, saved to histogram_bins.html."""
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# The number of bins is set explicitly.
chart_options = dict(values='mpg', bins=50, title="MPG Distribution (50 bins)")
p = Histogram(df, **chart_options)

output_file("histogram_bins.html")
show(p)
def _relativeImpact(featureValues, labels, bins=10):
    """Return, per histogram bin, the share of rows labelled 1 and 0.

    Args:
        featureValues: 1-D sequence of numeric values of a single feature.
        labels: 1-D sequence of 0/1 labels aligned with ``featureValues``.
        bins: Forwarded to ``np.histogram`` for the combined histogram.

    Returns:
        ``(midPoints, posImpact, negImpact)`` -- three arrays with one entry
        per bin: the bin centre, the fraction of the bin's rows labelled 1
        and the fraction labelled 0.  Empty bins yield 0 for both fractions.
    """
    featureValues = np.asarray(featureValues, dtype=float)
    labels = np.asarray(labels)

    totalCounts, edges = np.histogram(featureValues, bins=bins)
    posCounts, _ = np.histogram(featureValues[labels == 1], bins=edges)
    negCounts, _ = np.histogram(featureValues[labels == 0], bins=edges)

    # Guard the division: an empty bin would otherwise give 0/0 = NaN.
    emptyBins = totalCounts == 0
    safeTotals = np.where(emptyBins, 1, totalCounts).astype(float)
    posImpact = np.where(emptyBins, 0., posCounts / safeTotals)
    negImpact = np.where(emptyBins, 0., negCounts / safeTotals)

    midPoints = (edges[:-1] + edges[1:]) / 2.
    return midPoints, posImpact, negImpact


def preProcess(theFileName):
    """Run feature selection, plotting and model comparison on a CSV file.

    The CSV must contain a binary target column named ``y``.  The function
    one-hot encodes the remaining columns, standardises them, optionally
    undersamples the majority class, selects features with RFECV, ranks
    features with a random forest, writes three JPG plots and an HTML report
    below ``flask_files/static/`` and cross-validates four classifiers.

    Args:
        theFileName: Path of the CSV file (converted with ``str``).

    Returns:
        Tuple ``(introReport, tableOne, optimalNumberOfFeatures,
        classifierComparisons, theJPGs)``:
            introReport -- list of two summary strings.
            tableOne -- list of dicts with keys ``Feature``, ``Weight``,
            ``Rank`` (all values are strings), one per selected feature.
            optimalNumberOfFeatures -- ``rfecv.n_features_``.
            classifierComparisons -- list of dicts with keys ``Classifier``
            and ``Score`` (best fold F1 score of each model).
            theJPGs -- the three image paths relative to ``flask_files/``.
    """
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)

    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)

    X = dp.drop('y', axis=1)
    theFeatures = X.columns  # feature names, aligned with the columns of X

    # convert the dataframes to arrays
    X = X.values
    y = dp[['y']].values.ravel()
    # Real copy of the untouched labels; needed later for the impact plots,
    # which index the full (non-resampled) frame ``dp``.
    yOrig = y.copy()

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================
    # apply random undersampling if the minority class is below one third
    labelSkewness = 100 * np.sum(y) * 1. / np.shape(y)[0]
    if np.min([labelSkewness, 100 - labelSkewness]) < (100. / 3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_sample(X, y)

    #=======================================================================
    # select optimal number of features
    # NOTE: StratifiedKFold(y, n_folds=...) is the pre-0.18 scikit-learn
    # signature; it is kept as-is to match the rest of this file.
    thisModel = LogisticRegression(penalty='l1', C=1)
    rfecv = RFECV(estimator=thisModel,
                  step=1,
                  cv=StratifiedKFold(y, n_folds=3),
                  scoring='f1')
    Xt = rfecv.fit_transform(X, y)
    optimalNumberOfFeatures = rfecv.n_features_
    introReport = [
        'Optimal Number of Attributes: ' + str(optimalNumberOfFeatures),
        'The following attributes are the most influential to the outcome'
    ]

    #=======================================================================
    # plot number of selected features VS cross-validation scores
    fig = plt.figure(figsize=(12, 8))
    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/' + imgOne, dpi=300)
    plt.close(fig)  # release the figure; this runs inside a web app

    #=======================================================================
    # get the feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X, y)
    theImportances = list(model.feature_importances_)

    # Rank feature *indices* by importance.  Looking names up through
    # ``theImportances.index(value)`` would return the same feature for
    # every tied value; sorting indices keeps tied features distinct.
    rankedIdx = sorted(range(len(theImportances)),
                       key=theImportances.__getitem__,
                       reverse=True)
    selectedIdx = rankedIdx[:optimalNumberOfFeatures]

    # ...and list the selected features along with their weights and ranks
    tableOne = [
        dict(Feature=str(theFeatures[idx]),
             Weight=str(theImportances[idx]),
             Rank=str(rank))
        for rank, idx in enumerate(selectedIdx, start=1)
    ]

    #=======================================================================
    # plot histogram of the most important feature
    topFeature = theFeatures[rankedIdx[0]]
    topValues = dp[topFeature]
    fig = plt.figure(figsize=(12, 8))
    plt.hist(topValues, bins=10)
    plt.xlabel('Attribute: ' + topFeature, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)
    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/' + imgTwo, dpi=300)
    plt.close(fig)

    #=======================================================================
    # plot impact of the most important feature
    midPoints, posImpact, negImpact = _relativeImpact(topValues, yOrig)
    fig = plt.figure(figsize=(12, 8))
    plt.plot(midPoints, posImpact, '.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size': 20})
    plt.xlabel(topFeature, fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()
    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/' + imgThree, dpi=300)
    plt.close(fig)

    #=======================================================================
    # generate plots for report (this is saved to an "html" file)
    from bokeh.charts import Histogram, output_file, save, gridplot
    from bokeh.plotting import figure

    plotList = []
    for featureIdx in selectedIdx:
        thisFeatureIs = theFeatures[featureIdx]
        midPoints, posImpact, negImpact = _relativeImpact(
            dp[thisFeatureIs], yOrig)

        hist0 = Histogram(dp,
                          values=thisFeatureIs,
                          color='blue',
                          title="Impact of " + thisFeatureIs,
                          bins=10)

        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        plot0.circle(list(midPoints),
                     list(negImpact),
                     size=10,
                     color="red",
                     alpha=0.9,
                     legend='Negative')
        plot0.circle(list(midPoints),
                     list(posImpact),
                     size=10,
                     color="green",
                     alpha=0.9,
                     legend='Positive')
        plotList.append([hist0, plot0])

    output_file("flask_files/static/Report.html", title="Report")
    hist = gridplot(plotList)
    save(hist)

    #=======================================================================
    # specify the models to run tests with
    theModels = {
        'Logistic Regression': LogisticRegression(penalty='l1'),
        'LDA': LinearDiscriminantAnalysis(),
        'SVM': SVC(kernel='linear'),
        'Random Forest': RandomForestClassifier(n_estimators=300)
    }

    # ...then collect the results of the tests (best fold score per model)
    classifierComparisons = []
    for aModel, model in theModels.items():
        results = cross_validation.cross_val_score(
            model, Xt, y, scoring='f1', cv=StratifiedKFold(y, n_folds=3))
        classifierComparisons.append(
            dict(Classifier=aModel, Score=np.max(results)))

    #=======================================================================
    # images to display
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================
    return (introReport, tableOne, optimalNumberOfFeatures,
            classifierComparisons, theJPGs)