print(WEEKLY.head())

# In[7]:

## WRITE TO FILE
# Export WEEKLY to a spreadsheet for manual checking. reset_index(drop=False)
# turns the current index into ordinary column(s) so it is written to the sheet.
WEEKLY.reset_index(drop=False).to_excel(
    'C:/Users/pmwash/Desktop/Disposable Docs/NEWEST MERGED DATASET.xlsx',
    sheet_name='CHECK')

# In[8]:

from bokeh.charts import Histogram, output_file, show
# Histogram of the CasesPerSale column; label and color are both keyed on
# the Class_Code column.
h2 = Histogram(WEEKLY,
               values='CasesPerSale',
               label='Class_Code',
               color='Class_Code',
               title='Test Histogram',
               plot_width=1100,
               plot_height=900)
show(h2)

# In[9]:

from ggplot import *

# Line plot of Cases_Received against WeekNumber.
ggplot(WEEKLY, aes(x='WeekNumber', y='Cases_Received')) + geom_line()

# In[10]:

from bokeh.charts import BoxPlot
#%matplotlib inline
#pylab.rcParams['figure.figsize'] = (10, 6)
def convert_int(dataframe, colname):
    """Coerce ``dataframe[colname]`` to a numeric dtype, in place.

    Entries that cannot be parsed as numbers become NaN
    (``errors='coerce'``). The container is modified in place and nothing
    is returned.
    """
    coerced = pd.to_numeric(dataframe[colname], errors='coerce')
    dataframe[colname] = coerced


# Load at most `limit_rows` rows of the training CSV.
limit_rows = 1000000
df = pd.read_csv("C:/Users/vasir/Desktop/ADM/train_ver2.CSV", nrows=limit_rows)

# Coerce 'age' to numeric (unparseable entries become NaN), then report the
# resulting dtype. print() is called as a function so the script parses on
# both Python 2 and Python 3.
convert_int(df, 'age')
print(df['age'].dtype)

# Replace missing ages with the sentinel -1 so the column can be cast to int64.
df['age'] = df['age'].fillna(-1)
cols = ['age']
df[cols] = df[cols].applymap(np.int64)

# Chart a random 1% sample of the loaded rows.
df_frac = df.sample(frac=0.01)
p_age = Histogram(df_frac, values='age', title="Age Distribution")
#show(p_age)

# Drop sampled rows with no 'sexo' value before charting that column.
dffrac1 = df_frac.dropna(subset=['sexo'], how='any')
dffrac1.head()
#dffrac1['sexo']=dffrac1['sexo'].astype('category')
p = Bar(dffrac1, 'sexo', title="Sex")
#show(p)

dffrac2 = df_frac.dropna(subset=['renta'], how='any')
bar_renta = Bar(dffrac2,
                values='renta',
                label='nomprov',
                agg='mean',
                title="City Vs Renta",
                legend=False,
Esempio n. 3
0
from bokeh.charts import Histogram, output_file, show
from bokeh.layouts import row
from bokeh.sampledata.autompg import autompg as df

# Histogram of the 'mpg' column of the autompg sample data.
hist = Histogram(df, values='mpg', title="Auto MPG Histogram", plot_width=400)
# Same values, with label and color keyed on the 'cyl' column and a legend
# in the top-right corner.
hist2 = Histogram(df, values='mpg', label='cyl', color='cyl', legend='top_right',
                  title="MPG Histogram by Cylinder Count", plot_width=400)

# Send output to hist.html and show both charts laid out in a single row.
output_file('hist.html')
show(row(hist, hist2))
def histogram(histDF, values, **kwargs):
    """Return a ``bokeh.charts.Histogram`` built from ``histDF[values]``.

    Any extra keyword arguments are forwarded to ``Histogram`` unchanged.
    ``bokeh.charts`` is imported lazily, on each call.
    """
    from bokeh.charts import Histogram as _Histogram

    selected = histDF[values]
    return _Histogram(selected, **kwargs)
# Shell out from the notebook to upload an HTML file with curl.
# NOTE(review): the local file name ("restaraunts") and the remote name
# ("restaruants") are spelled differently - confirm both are intended.
get_ipython().system(
    u'curl --upload-file restaraunts.html https://transfer.sh/restaruants.html'
)

# In[93]:

from bokeh.io import output_notebook
output_notebook()

# In[94]:

from bokeh.charts import Histogram, output_file, show

# In[96]:

# Histogram of the SCORE column of `samp`, using Histogram's default settings.
p1 = Histogram(samp["SCORE"])
show(p1)

# In[103]:

# SCORE histogram over `mRests` with 15 bins, colored by the GRADE column.
p2 = Histogram(mRests,
               'SCORE',
               color='GRADE',
               title="Score Grouped by Grade",
               bins=15,
               legend='top_left')

show(p2)

# In[99]:
Esempio n. 6
0
from bokeh.charts import Histogram, defaults, show, output_file
from bokeh.layouts import gridplot
from bokeh.sampledata.autompg import autompg as df

# Default size applied to every chart created below.
defaults.plot_width = 400
defaults.plot_height = 350

# input options: a single column, a frame plus a positional column name, and
# a frame plus the `values` keyword (here with density=True).
hist = Histogram(df['mpg'], title="df['mpg']")
hist2 = Histogram(df, 'displ', title="df, 'displ'")
hist3 = Histogram(df, values='hp', title="df, values='hp'", density=True)

# 'hp' histogram colored by the 'cyl' column.
hist4 = Histogram(df,
                  values='hp',
                  color='cyl',
                  title="df, values='hp', color='cyl'",
                  legend='top_right')

# `bins` given as a count (50) and as an explicit list of edges.
hist5 = Histogram(df, values='mpg', bins=50, title="df, values='mpg', bins=50")
hist6 = Histogram(df,
                  values='mpg',
                  bins=[10, 15, 25, 100],
                  tooltips=[('Bin', "@label")],
                  title="df, values='mpg', bins=[10, 15, 25, 100]")

output_file("histogram_multi.html", title="histogram_multi.py example")

# Lay the six charts out in a two-column grid.
show(gridplot(hist, hist2, hist3, hist4, hist5, hist6, ncols=2))
Esempio n. 7
0
# OrderedDict is used below to build `distributions`; without this import the
# script fails with a NameError.
from collections import OrderedDict

import numpy as np
import pandas as pd

from bokeh.charts import Histogram, show, output_file

# build some distributions and load them into a dict
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = OrderedDict(normal=normal, lognormal=lognormal)

# create a pandas data frame from the dict
df = pd.DataFrame(distributions)
distributions = df.to_dict()

# Replace each per-column {row_index: value} mapping with just its values.
for k, v in distributions.items():
    distributions[k] = v.values()

# any of the following commented are valid Histogram inputs
#df = list(distributions.values())
#df = tuple(distributions.values())
#df = tuple([tuple(x) for x in distributions.values()])
#df = np.array(list(distributions.values()))
#df = list(distributions.values())[0]

output_file("histograms.html")

# One histogram per column of df, 50 bins each, with a legend.
hist = Histogram(df, bins=50, legend=True)

show(hist)
Esempio n. 8
0
    def plot_timeline_cu(self, width, height,
                         dataframe, cu_id,
                         x_max=None, y_max=None):
        """Build the timeline figure and the stall histogram for one CU.

        Args:
            width, height: size of the timeline figure. The histogram is
                created with both its width and its height set to ``height``.
            dataframe: frame providing the 'start', 'length', 'color' and
                'stall' columns read below.
            cu_id: identifier used in the ``WHERE cu=<cu_id>`` condition and
                passed to ``self.get_interval_boxannotation``.
            x_max: upper bound of the x range; when None it is obtained from
                ``self.get_max("inst", "start + length", condition)``.
            y_max: upper bound of the y range; when None it is obtained from
                ``self.get_count("inst", "uid", condition)``.

        Returns:
            Tuple ``(plot, plot_hist)``.
        """
        hist_color = tm.get_random_color()

        # Fall back to queried bounds when the caller did not supply them.
        cu_filter = 'WHERE cu=' + str(cu_id)
        if x_max is None:
            x_max = int(self.get_max("inst", "start + length", cu_filter))
        if y_max is None:
            y_max = int(self.get_count("inst", "uid", cu_filter))

        # Box annotations plus the cycle counters shown in the title.
        boxannotations, info = self.get_interval_boxannotation(cu_id)
        cycle_all = info['cycle_all']
        cycle_mem_ld = info['cycle_mem_ld']
        cycle_mem_st = info['cycle_mem_st']
        cycle_other = info['cycle_other']

        title = ''.join([
            'cu-', str(cu_id), ': ',
            str(cycle_mem_ld), ' mem ld / ',
            str(cycle_mem_st), ' mem st / ',
            str(cycle_other), ' other / ',
            str(cycle_all), ' all',
        ])

        plot = figure(webgl=True,
                      width=width,
                      height=height,
                      x_range=(0, x_max),
                      y_range=(0, y_max),
                      title=title)

        # One horizontal segment per row, drawn at y == row position.
        row_positions = range(len(dataframe.index))
        starts = dataframe['start']
        ends = dataframe['start'] + dataframe['length']
        plot.segment(x0=starts,
                     y0=row_positions,
                     x1=ends,
                     y1=row_positions,
                     line_width=1,
                     color=dataframe['color'])

        for annotation in boxannotations:
            plot.add_layout(annotation)

        # Histogram of the 'stall' column, titled with its mean and median.
        mean = np.round(dataframe['stall'].mean(), 2)
        median = dataframe['stall'].median()
        hist_title = ' / '.join(['stall', 'avg ' + str(mean), 'mid ' + str(median)])
        plot_hist = Histogram(dataframe,
                              'stall',
                              bins=50,
                              height=height,
                              width=height,
                              color=hist_color,
                              title=hist_title)

        return (plot, plot_hist)
Esempio n. 9
0
def _marked_histogram(new_df, newpoint, column, title, bins, palette):
    """Return a styled density Histogram of ``column`` with a red vertical
    line drawn at ``newpoint[column]``."""
    chart = Histogram(new_df,
                      values=column,
                      label='label',
                      color='label',
                      title=title,
                      density=True,
                      legend='top_right',
                      bins=bins,
                      palette=palette)

    # Vertical marker showing where the new point falls on this chart.
    vline = Span(location=newpoint[column],
                 dimension='height',
                 line_color='red',
                 line_width=3)

    chart.xaxis.axis_label = ''
    chart.xaxis.axis_label_text_font_style = 'normal'
    chart.yaxis.axis_label_text_font_style = 'normal'
    chart.xaxis.axis_label_text_font_size = '18pt'
    chart.yaxis.axis_label_text_font_size = '18pt'

    chart.background_fill_color = "#acaaa8"
    chart.background_fill_alpha = 0.5
    chart.title.text_font = 'helvetica'
    chart.title.text_font_size = '18pt'
    chart.renderers.extend([vline])
    return chart


def badass_plot(newpoint, new_df):
    """
    Used for Fraud Case Study
    Build one histogram per feature from ``new_df`` and mark the position of
    a new data point on each of them.

    Input:
        newpoint: mapping (e.g. a parsed json record or csv row) that has a
            value for each of the four plotted columns.
        new_df: frame with the four plotted columns and a 'label' column
            used for grouping and coloring.

    Return:
        The HTML produced by ``file_html`` for a tabbed view with one tab
        per histogram.
    """
    color_list = ['#FF8C00', '#FAFCCC', '#EA653B', '#EE993D',
                  '#EA653B', '#ED9239', '#A4A7AA', '#676767']
    output_file("Check-Graphs.html")

    # (column, chart title, number of bins, tab title)
    # The previous version also created a bare figure() per column and then
    # immediately rebound the same name to the Histogram; those unused
    # figures are no longer created.
    chart_specs = [
        ('event_created_to_end', 'Duration of \n Event Created to End',
         15, 'Event Create to End'),
        ('total_tickets_sold', 'Total Tickets Sold',
         2, 'Total Tickets Sold'),
        ('payout_type_MISSING', 'Payout Type Missing \n (Yes/No)',
         2, 'Payout Type Missing (yes/no)'),
        ('median_ticket_cost', 'Median Ticket Cost',
         30, 'Median Ticket Cost'),
    ]

    panels = []
    for column, chart_title, bins, tab_title in chart_specs:
        chart = _marked_histogram(new_df, newpoint, column, chart_title,
                                  bins, color_list)
        panels.append(Panel(child=chart, title=tab_title))

    tabs = Tabs(tabs=panels)
    return file_html(tabs, CDN, 'plot')
Esempio n. 10
0
# In[6]:

# Rows whose sepal_length is below 5.
flowers[flowers.sepal_length < 5]

# In[7]:

flowers.describe()

# In[8]:

# Distinct species values and the number of rows for each.
display(flowers["species"].unique())
display(flowers.species.value_counts())

# In[9]:

# Histogram of petal_length over all rows.
hist = Histogram(flowers, values='petal_length')
show(hist)

# In[10]:

# Same values, with label and color keyed on the species column.
hist2 = Histogram(flowers,
                  values='petal_length',
                  label="species",
                  color="species")
show(hist2)

# In[11]:

# Map each row's species to a color name, one entry per row of `flowers`.
colormap = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
colors = [colormap[x] for x in flowers['species']]
p = figure(title="Iris Morphology")
Esempio n. 11
0
def Histograms():
    """Return a Histogram built from the module-level ``df``, ``value`` and
    ``title`` names; ``value`` is used both as the values column and as the
    color key, and the legend is placed top-right."""
    return Histogram(df,
                     values=value,
                     color=value,
                     title=title,
                     legend='top_right')
Esempio n. 12
0
def dashboard(request, counter_name, db):
    """Render the dashboard page for one counter.

    Queries ``Counter`` on database alias ``db`` and renders three charts:
    a time series, a histogram, and an area chart titled "CDF".

    Args:
        request: the incoming request, passed through to the template context
            and to ``render``.
        counter_name: name of the counter whose ``counter_value`` entries are
            plotted.
        db: database alias handed to ``Counter.objects.using``.

    Returns:
        The response produced by rendering
        ``counters_app/simple_bokeh.html`` with the script/div pairs of the
        three charts.
    """
    counters = Counter.objects.using(db)

    # Fetch every pub_date in a single query; the previous version issued
    # one indexed query per row.
    # NOTE(review): dates come from ALL counters while y_values is filtered
    # by counter_name - confirm whether Date should be filtered the same way.
    Date = list(counters.values_list("pub_date", flat=True))
    name = counter_name
    y_values = counters.values_list(
        "counter_value", flat=True).filter(counter_name=counter_name)

    # Materialize the pairs: on Python 3 zip() is lazy and supports neither
    # len() nor indexing.
    points = list(zip(Date, y_values))

    #plot specifications
    TOOLS = "pan,wheel_zoom,box_zoom,reset,hover,save"
    p = figure(width=1200,
               height=400,
               x_axis_type="datetime",
               tools=TOOLS,
               title=name + "'s Metrics")
    p.min_border_left = 100
    p.min_border_top = 50
    p.border_fill = 'whitesmoke'
    p.ygrid.band_fill_color = "#E6E6FA"
    p.title_text_color = "olive"
    p.title_text_font = "times"
    p.title_text_font_style = "italic"

    p.outline_line_width = 7
    p.outline_line_alpha = 0.3
    p.outline_line_color = "black"

    #HoverTool specifications
    # One rate per pair of consecutive points, computed by slope() from the
    # seconds field of each timestamp and the counter values.
    rates = [
        slope(t0.second, v0, t1.second, v1)
        for (t0, v0), (t1, v1) in zip(points, points[1:])
    ]
    source = ColumnDataSource(data=dict(rates=rates))

    hover = p.select(dict(type=HoverTool))
    hover.point_policy = "follow_mouse"
    hover.tooltips = OrderedDict([
        ("Counter Name", name),
        ("Rate of count", "@rates c/us"),
    ])

    p.line(Date, y_values, line_width=1, source=source)
    p.square(Date, y_values, fill_color=None, line_color="green", size=4)
    script1, div1 = components(p, CDN)

    hist = Histogram(list(y_values), bins=50, title='Histogram')
    hist.border_fill = 'whitesmoke'
    hist.background_fill = "beige"

    script2, div2 = components(hist, CDN)

    area = Area(list(y_values), title="CDF")
    area.border_fill = "whitesmoke"
    area.background_fill = "#191970"

    script3, div3 = components(area, CDN)

    context = RequestContext(
        request, {
            "the_script1": script1,
            "the_div1": div1,
            "the_script2": script2,
            "the_div2": div2,
            "the_script3": script3,
            "the_div3": div3
        })

    return render(request, "counters_app/simple_bokeh.html", context)
Esempio n. 13
0
# for the 3-5-2 formation
squad_352 = ['GK', 'LWB', 'CB', 'RWB', 'LM', 'CDM', 'CAM', 'CM', 'RM', 'LW', 'RW']
print ('3-5-2')
print (get_best_squad(squad_352))

# Visualizations

# distributions
data.Age.plot(kind='hist', bins=20)
data.Overall.plot(kind='hist', bins=20)

# improved histogram
# among the top teams
melhoresClubs = data[(data.Club == 'FC Barcelona') | (data.Club == 'Juventus') | (data.Club == 'Real Madrid CF') | (data.Club == 'FC Bayern Munich') | (data.Club == 'Paris Saint-Germain') ]
hist = Histogram(data=melhoresClubs, values="Age", color="Club", legend="top_right", bins=12)
show(hist)

# box plot to analyze salaries at the top clubs
sns.set(style="whitegrid", color_codes=True)
sns.boxplot(x="Club", y="salario", hue="Club", data=melhoresClubs, palette="PRGn")
sns.despine(offset=10, trim=True)

# Players with Overall above 85, counted per club; the count plot is ordered
# by that per-club count, largest first.
best = data[data['Overall']> 85]
grouped = best.groupby('Club')
count_by_club = grouped.count()['Name'].sort_values(ascending = False)
ax = sns.countplot(x = 'Club', data = best, order = count_by_club.index)
ax.set_xticklabels(labels = count_by_club.index, rotation='vertical')
ax.set_ylabel('Numero de jogadores')
ax.set_xlabel('Clube')
ax.set_title('Clubes com os melhores jogadores')
Esempio n. 14
0
def GDP_PCA_plot(filename=None,
                 threshold=0.015,
                 lowerbound=2.0,
                 upperbound=1.0e4,
                 factor=0.08):
    """Detect blobs in an uploaded density image and build Bokeh components.

    Loads ``uploads/<filename>`` with ``np.load``, rescales and smooths its
    ``image`` array, runs ``blob_log`` on the result, and renders three image
    plots (original, filtered, filtered with detected blobs circled) plus a
    histogram of the blob radii.

    Args:
        filename: name of the file under ``uploads/``. Must be a string; the
            default of None fails on the path concatenation.
        threshold: forwarded to ``blob_log``.
        lowerbound, upperbound: exclusive bounds on the (sqrt(2)-scaled)
            third blob column; blobs outside them are discarded.
        factor: multiplier on ``np.std(image)`` giving the Gaussian filter
            sigma.

    Returns:
        A list starting with the module-level ``head`` value, followed by
        ``list(components(...))`` for each of the three image plots and for
        the radius histogram.
    """
    data = np.load('uploads/' + filename)

    image1 = data.f.image  #-np.median(data.f.image)
    # Rescale to 0..255 and clamp everything below the mean up to the mean.
    image2 = np.array(image1 * 255 / image1.max(), dtype='uint8')
    image_avg = np.mean(np.partition(image2, int(len(image2) * 0.2)))
    image2[image2 < image_avg] = int(image_avg)
    #H1=cv2.GaussianBlur(image2,(3,3),1.0*np.std(image2))
    H1 = gaussian_filter(image2, factor * np.std(image2), mode='nearest')
    image2 = H1
    blobs_log = blob_log(image2,
                         max_sigma=0.3 * np.std(image2),
                         min_sigma=0.02 * np.mean(image2),
                         num_sigma=20,
                         threshold=threshold,
                         overlap=0.6)
    # Scale the third column by sqrt(2), then keep blobs within the bounds.
    blobs_log[:, 2] = blobs_log[:, 2] * np.sqrt(2)
    blobs = blobs_log[(blobs_log[:, 2] > lowerbound)
                      & (blobs_log[:, 2] < upperbound)]

    xx = (data.f.X.min(), np.round(data.f.X.max(), -1))
    yy = (data.f.Y.min(), np.round(data.f.Y.max(), -1))
    x_step = (xx[1] - xx[0]) / np.shape(H1)[0]
    y_step = (yy[1] - yy[0]) / np.shape(H1)[1]

    #Number of NV
    height = yy[1] - yy[0]
    width = xx[1] - xx[0]
    total = len(blobs)
    per_nv = round(len(blobs) / float(height * width) * (20 * 20), 2)
    ########################################################
    t = [
        'Original Density Plot,'
        ' Filename=' + filename, 'Gaussian Filtered Density Plot',
        'Total NVs =' + str(total) + ' , NVs per 20x20 pixel area = ' +
        str(per_nv)
    ]

    data_list = [image1, H1, H1]
    color_list = [
        Viridis256, cc.b_linear_bgy_10_95_c74, cc.b_linear_bgy_10_95_c74
    ]
    return_list = []
    # NOTE(review): `head` is not defined in this function; it must exist at
    # module level or this line raises NameError.
    return_list.append(head)
    #hover tool hack to work for image function in bokeh.
    ##http://stackoverflow.com/questions/28176949/convert-list-of-tuples-to-structured-numpy-array
    # np.linspace needs an integer sample count; `//` keeps the Python 2
    # result and avoids passing a float on Python 3.
    px = np.linspace(xx[0], xx[1], np.shape(H1)[0] // 2)
    py = np.linspace(yy[0], yy[1], np.shape(H1)[1] // 2)
    px = np.array(px, dtype='uint32')
    py = np.array(py, dtype='uint32')
    a = []
    for i in px:
        a.extend(zip(itertools.repeat(i), py))
    dt = np.dtype('int,float')
    X = np.array(a, dtype=dt)
    x1 = X['f0']
    y1 = X['f1']
    ##################################################################
    for i in range(3):
        color_mapper = LogColorMapper(palette=color_list[i],
                                      low=np.mean(data_list[i]),
                                      high=1.0 * np.mean(data_list[i]) +
                                      2.0 * np.std(data_list[i]))

        color_bar = ColorBar(color_mapper=color_mapper,
                             label_standoff=12,
                             border_line_color=None,
                             location=(0, 0))

        # NOTE(review): y_range and the image's y origin both use xx while
        # dh uses yy - looks like a copy/paste slip; confirm before changing.
        p1 = figure(plot_width=600, plot_height=600, title=t[i], title_text_font_size='12pt',
                    x_range=xx, y_range=xx, tools=TOOLS, toolbar_location="below", toolbar_sticky=False, responsive=True)
        p1.square(x1, y1, alpha=1.0)
        p1.image(image=[data_list[i]],
                 color_mapper=color_mapper,
                 dh=yy[1] - yy[0],
                 dw=xx[1] - xx[0],
                 x=xx[0],
                 y=xx[0])
        p1.add_layout(color_bar, 'right')
        if i == 2:
            # Only the last plot gets the detected blobs drawn as circles.
            p1.circle(blobs[:, 1] * x_step + xx[0],
                      blobs[:, 0] * y_step + yy[0],
                      radius=blobs[:, 2] * 1.6,
                      radius_dimension='y',
                      line_color='red',
                      alpha=1.0,
                      line_width=3,
                      fill_color=None)

        p1.title.text_font_size = "11pt"
        p1.xaxis.axis_label_text_font_size = "13pt"
        p1.yaxis.axis_label_text_font_size = "13pt"

        #plots = {'Navy': p1, 'Blue': p2};
        tuple_plot = components(p1)
        #script2, div2 = components(p2);

        return_list.append(list(tuple_plot))

    p = Histogram(blobs[:, 2],
                  plot_width=600, plot_height=600, tools=TOOLS,
                  toolbar_location="below", toolbar_sticky=False,
                  responsive=True,
                  title="Distribution of Radii of NV Centers")

    return_list.append(list(components(p)))

    return return_list
Esempio n. 15
0
def get_histogram(t):
    """Return a toolbar-less Histogram of column ``t``.

    The data comes from ``get_data`` called with the current values of the
    module-level ``ticker1`` and ``ticker2`` widgets.
    """
    first_ticker, second_ticker = ticker1.value, ticker2.value
    frame = get_data(first_ticker, second_ticker)
    chart = Histogram(frame[[t]], values=t)
    chart.toolbar_location = None
    return chart
Esempio n. 16
0
from bokeh.charts import Histogram
from bokeh.sampledata.autompg import autompg as df
from bokeh.charts import defaults, vplot, hplot, show, output_file

# Default size applied to every chart created below.
defaults.width = 450
defaults.height = 350

# input options: a single column, a frame plus a positional column name, and
# a frame plus the `values` keyword.
hist = Histogram(df['mpg'], title="df['mpg']")
hist2 = Histogram(df, 'displ', title="df, 'displ'")
hist3 = Histogram(df, values='hp', title="df, values='hp'")

# 'hp' histogram colored by the 'cyl' column.
hist4 = Histogram(df,
                  values='hp',
                  color='cyl',
                  title="df, values='hp', color='cyl'",
                  legend='top_right')

hist5 = Histogram(df, values='mpg', bins=50, title="df, values='mpg', bins=50")

output_file("histograms.html")

# Two hplot rows (three charts, then two) combined with vplot.
show(vplot(hplot(hist, hist2, hist3), hplot(hist4, hist5)))
Esempio n. 17
0
As in the previous two chapters, you can interact with the figures you create in this chapter as well, and you may have to scroll down to view the lower portion of some of them.
Instructions

    Import Histogram, output_file, and show from bokeh.charts.
    Make a histogram called p with the Histogram() function using the 'female_literacy' column of df. You have to first specify df and then 'female_literacy'. Give the histogram a title of 'Female Literacy'.
    Set the x-axis label using p.xaxis.axis_label.
    Set the y-axis label using p.yaxis.axis_label.
    Specify the name 'histogram.html' for the output file and display the histogram p.

"""
# Import Histogram, output_file, and show from bokeh.charts
from bokeh.charts import Histogram, output_file, show

# Make a Histogram: p
p = Histogram(df, 'female_literacy', title='Female Literacy')

# Set the x axis label (the histogram's values are female literacy)
p.xaxis.axis_label = 'Female Literacy'

# Set the y axis label. This previously assigned p.xaxis a second time, which
# overwrote the x label and left the y axis without one.
p.yaxis.axis_label = 'Nb'

# Specify the name of the output_file and show the result
output_file('histogram.html')
show(p)
""" Console Output or results
see plot25.png
Great work! In the next exercise, you will learn how to customize histograms by controlling the number of bins.
"""
"""
output_file("tSNE_predicted_labels.html")
show(p)

p = Scatter(dftsne,
            x='x',
            y='y',
            color='Machine_Label',
            title='HG002 CrowdVar: Crowd Sourced Labels',
            legend="bottom_left")
output_file("tSNE_CrowdSourced_labels.html")
show(p)

p = Histogram(log_size,
              values='INS_log_size',
              title='HG002 INS: Size Distribution [5000 Samples]',
              color='LightSlateGray',
              bins=19,
              xlabel="Size[log10]",
              ylabel="Frequency")
output_file("tSNE4_INS_Histo_logsize.html")
show(p)

p = Histogram(log_size,
              values='INS_log_size',
              title='HG002 INS: Size Distribution [5000 Samples]',
              color='LightSlateGray',
              bins=30,
              xlabel="Size[log10]",
              ylabel="Frequency")
output_file("tSNE4_INS_Histo_logsize.2.html")
show(p)
Esempio n. 19
0
import numpy as np

# we build some distributions and load them into a dict
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = dict(normal=normal, lognormal=lognormal)

# then we create a pandas df from the dict
import pandas as pd
df = pd.DataFrame(distributions)

# and finally we drop the df into our Histogram chart
from bokeh.charts import Histogram
hist = Histogram(df, bins=50, filename="histograms.html")
# Title, y label, legend and size are set through chained calls that end in
# show().
hist.title("Histograms").ylabel("frequency").legend(True).width(400).height(
    350).show()
Esempio n. 20
0
from collections import OrderedDict
import numpy as np
import pandas as pd

from bokeh.charts import Histogram

# we build some distributions and load them into a dict
mu, sigma = 0, 0.5
normal = np.random.normal(mu, sigma, 1000)
lognormal = np.random.lognormal(mu, sigma, 1000)
distributions = OrderedDict(normal=normal, lognormal=lognormal)

# then we create a pandas df from the dict
df = pd.DataFrame(distributions)
distributions = df.to_dict()

# Replace each per-column {row_index: value} mapping with just its values.
for k, v in distributions.items():
    distributions[k] = v.values()

# any of the following commented are valid Histogram inputs
#df = list(distributions.values())
#df = tuple(distributions.values())
#df = tuple([tuple(x) for x in distributions.values()])
#df = np.array(list(distributions.values()))
#df = list(distributions.values())[0]

# The output file name is passed to the chart itself, and the chart is
# displayed through its own show() method.
hist = Histogram(df, bins=50, filename="histograms.html", legend=True)
# hist.title("Histograms").ylabel("frequency").xlabel('distributions').legend(True)
# hist.width(400).height(350)
hist.show()
Esempio n. 21
0
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# First example: histogram built from the 'mpg' column passed directly.
p = Histogram(df['mpg'], title="MPG Distribution")

output_file("histogram.html", )

show(p)
# Second example: the frame plus a positional column name ('hp'). It rebinds
# `p` and writes to the same histogram.html output file as the first.
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

p = Histogram(df, 'hp', title="HP Distribution")

output_file("histogram.html", )

show(p)
def plot_prediction_histogram(images_path,
                              model_path,
                              weights_path,
                              img_shape,
                              required_class,
                              class_name_dict={},
                              batch_size=16,
                              preprocessing_function=None,
                              plotting_module="matplotlib"):
    """ Function to plot the histogram of predicted probabilities of a given class

    images_path --str: full path to the parent directory containing sub-directory(classes) of images
    model_path --str: full path to a keras model (.json file) (No default)
    weights_path --str: full path to the weights file (.hdf5 file) (No default)
    img_shape --tuple: image shape to input to the model (eg : (224,224,3)) (No default)
    required_class --str: The name of the class whose predicted probabilities are plotted (example "dog")
    class_name_dict --dict: Dictionary mapping of classes (default {}). Must be
        non-empty, but note that the mapping actually used for the lookup is
        the generator's ``class_indices``.
    batch_size --int: The batch_size to use for prediction (Default 16)
    preprocessing_function --function: The preprocessing function to use before prediction (Default None)
    plotting_module --str: The plotting module to use. Either of 'matplotlib' or 'bokeh' (Default matplotlib)

    Output:
    Plots the histogram of the predicted probabilites of the required class

    Raises:
    ValueError if class_name_dict is empty or plotting_module is not
    'matplotlib' or 'bokeh'.

    """
    import math

    # The {} default is never mutated, so sharing it across calls is harmless.
    if not class_name_dict:
        raise ValueError("Provide the class_name_dict")

    # Reject an unsupported backend up front instead of after running the
    # whole prediction pass.
    if plotting_module not in ("matplotlib", "bokeh"):
        raise ValueError("Only bokeh and matplotlib are supported")

    # loading model and weights
    with open(model_path, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights
    loaded_model.load_weights(weights_path)
    print('loaded model from disk')
    # generator for data
    test_datagen = ImageDataGenerator(
        preprocessing_function=preprocessing_function)
    test_generator = test_datagen.flow_from_directory(
        images_path,
        target_size=(img_shape[0], img_shape[1]),
        class_mode='categorical',
        batch_size=batch_size,
        shuffle=False,  # to get ordered result
    )
    # this is an important step else there is a difference in result between predict and predict_generator
    test_generator.reset()  # reset to start with sample 0
    nb_samples = test_generator.samples
    class_name_dict = test_generator.class_indices  # eg {'dog': 0, 'cat': 1}
    # Round the step count UP so the final, partial batch is predicted too;
    # floor division dropped those samples (and gave 0 steps when there were
    # fewer samples than one batch).
    steps = int(math.ceil(nb_samples / float(batch_size)))
    # predict
    predictions = loaded_model.predict_generator(test_generator,
                                                 steps=steps,
                                                 max_queue_size=10,
                                                 workers=1,
                                                 use_multiprocessing=False,
                                                 verbose=1)
    class_idx = class_name_dict[required_class]
    hist_data = predictions[:, class_idx]

    if plotting_module == "matplotlib":
        plt.hist(hist_data, bins=10)
        plt.xlim(0, 1)
        plt.title('Histogram of predicted probabilities')
        plt.xlabel('Predicted probability of {}'.format(required_class))
        plt.ylabel('Frequency')

    else:  # "bokeh" - validated above
        df = pd.DataFrame(
            {"Predicted probability of {}".format(required_class): hist_data})
        from bokeh.io import output_notebook
        import warnings
        warnings.filterwarnings('ignore')
        from bokeh.charts import Histogram, show
        output_notebook()
        p = Histogram(df,
                      "Predicted probability of {}".format(required_class),
                      title="Histogram of predicted probabilities")
        show(p)

# Scatter of avg vs HR restricted to rows whose 'handedness' is 'L'.
view2 = CDSView(source=source, filters=[GroupFilter(column_name='handedness', group='L')])
p_L = figure(tools = tools, title = 'Batting Average vs Home Runs of left handed players', height = 200)
p_L.xaxis.axis_label = 'Average'
p_L.yaxis.axis_label = 'Home Runs'
p_L.circle(x="avg", y="HR", color = 'green', hover_color="yellow", view=view2, source = source)

# Same scatter restricted to rows whose 'handedness' is 'B'.
view3 = CDSView(source=source, filters=[GroupFilter(column_name='handedness', group='B')])
p_B = figure(tools = tools, title = 'Batting Average vs Home Runs of both handed players', height = 200)
p_B.xaxis.axis_label = 'Average'
p_B.yaxis.axis_label = 'Home Runs'
p_B.circle(x="avg", y="HR", color = 'blue', hover_color="yellow", view=view3, source = source)


# Histograms of 'avg': overall, then with label and color keyed on handedness.
hist = Histogram(data, values='avg', title="Average of all", plot_height=300)
hist2 = Histogram(data, values='avg', label='handedness', color='handedness', legend='top_right',
                  title="Average of R, L and B handedness", plot_height=300)

# Histograms of 'HR': overall, then with label and color keyed on handedness.
hist3 = Histogram(data, values='HR', title="Home runs of all", plot_height=300)
hist4 = Histogram(data, values='HR', label='handedness', color='handedness', legend='top_right',
                  title="Home runs of R, L and B handedness", plot_height=300)

# Handedness code -> color name, and one string label per 'avg' value.
# NOTE(review): the label thresholds look inconsistent ('0.01' for the
# 0.05-0.10 range, '0.25' only for 0.25 < x < 0.30, boundary values falling
# through to '0.30') - confirm the intended buckets. Neither name is used in
# the lines visible here.
marker = {'R': 'red', 'L': 'green', 'B' : 'blue'}
markers = ['0.05' if x <= 0.05 else '0.01' if 0.05 < x <= 0.10 else '0.15' if 0.10 < x < 0.15 else '0.20' if 0.15 < x < 0.20 else '0.25' if 0.25 < x < 0.30 else '0.30' for x in data['avg']]

# Weight vs batting average, colored by handedness.
scatter1 = Scatter(data, x='weight', y='avg', color='handedness',
                  title="Weight vs Average", xlabel="Weight",
                  ylabel="Average", plot_height=300)

scatter2 = Scatter(data, x='height', y='avg', color='handedness',
Esempio n. 25
0
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# Histogram of the 'hp' column of the autompg sample data, drawn in navy.
p = Histogram(
    df,
    values='hp',
    title="HP Distribution",
    color='navy')

# Send the chart to a standalone HTML file and display it.
output_file("histogram_color.html")
show(p)
Esempio n. 26
0
# ch26.py

# ref:
# http://bokeh.pydata.org/en/latest/docs/user_guide/charts.html#histograms

# Or explicitly as the values keyword argument:

from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# Histogram of the 'displ' column, selected via the ``values`` keyword.
p = Histogram(
    df,
    title="DISPL Distribution",
    values='displ')

# Send the chart to an HTML file under /tmp and display it.
output_file("/tmp/ch26.html")
show(p)
Esempio n. 27
0
    def get_memory_hists(self, mode='overview'):
        """Build a latency histogram per memory access type plus a cycle donut.

        For every ``(name, condition)`` pair returned by
        ``self.__get_memory_access_types(mode)``, the ``length`` column of the
        ``memory`` table is read for the rows matching ``condition`` and
        turned into a 50-bin histogram titled with the access name, mean and
        median. The per-type sums of ``length`` feed a final donut chart.

        Args:
            mode: Passed through unchanged to ``__get_memory_access_types``.

        Returns:
            dict with two entries:

            * ``'hist'`` -- the histograms that could be built, followed by
              the donut chart as last element.
            * ``'info'`` -- one two-row DataFrame (a ``'Mean'`` row and a
              ``'Median'`` row, indexed by the access name) for every access
              type whose mean and median are both non-NaN.
        """
        access = self.__get_memory_access_types(mode)

        hists = []
        data_dfs = []

        # Sum of ``length`` per access type, and the matching names; these
        # become the donut's values and index.
        col_cycle = []
        col_index = []

        # ``items()`` exists on Python 2 and 3; ``iteritems()`` is Python-2-only.
        for key, value in access.items():
            # NOTE(review): ``value`` is spliced into the SQL text verbatim;
            # presumably it is a trusted, internally generated condition --
            # confirm against __get_memory_access_types.
            sql_query = ('SELECT length FROM memory WHERE ' + value +
                         ' ORDER by line')
            dataframe = pd.read_sql_query(sql_query, self.get_db())

            color = tm.get_random_color()

            # Some statistics
            lengths = dataframe["length"]
            mean = np.round_(lengths.mean(), 2)
            median = lengths.median()
            col_cycle.append(lengths.sum())
            col_index.append(key)

            hist_title = (key + ' / avg ' + str(mean) +
                          ' / mid ' + str(median))
            try:
                # Zero lengths are turned into NaN before plotting.
                plot_hist = Histogram(lengths.replace(0, np.nan),
                                      'length',
                                      bins=50,
                                      color=color,
                                      xlabel='Latency of instruction in cycle',
                                      ylabel='Count',
                                      title=hist_title)
            except ValueError:
                # No histogram could be built for this access type: skip its
                # chart and its info rows, but keep its donut entry.
                continue

            # Ignore NaN statistics (NaN is the only value unequal to itself).
            if mean == mean and median == median:
                sched = self.get_name_instruction_scheduler()
                data = {'sched': [sched, sched],
                        'location': [key, key],
                        'value': [mean, median],
                        'type': ['Mean', 'Median'],
                        'color': [color, color]}
                data_dfs.append(pd.DataFrame(data, index=[key, key]))

            hists.append(plot_hist)

        # Plot break down of cycles
        cycle_df = pd.DataFrame({'cycle': col_cycle}, index=col_index)
        pie_cycle = Donut(cycle_df.replace(0, np.nan),
                          title='Break down: cycles')
        hists.append(pie_cycle)

        return {'hist': hists,
                'info': data_dfs}
Esempio n. 28
0
# Pin the y axis to the [0, 1] interval and hide the toolbar.
plt2.y_range = Range1d(end=1, start=0)
plt2.toolbar_location = None

# Direct bokeh's file output to the template that holds this plot.
output_file("./templates/plot3_new.html")
# show(plt2)

#######################################################
# Look at dependence of sale outcome on seller feedback score
#######################################################

# hist = Histogram(data_sold, values='feedbackScore', color='listingType',
#                  title="Distribution of feedback scores for sold items", legend='top_left')
#
# # Save the plot
# output_file("./templates/data_exploration.html")
# show(hist)

#######################################################
# Look at distributions of sale prices by product
#######################################################

# Histogram of the 'value' column of the sold items, coloured by product id.
hist = Histogram(
    data_sold,
    values='value',
    color='productId_value',
    legend='top_left',
    title="Distribution of sale prices by product")

# Write the chart into the templates folder and display it.
output_file("./templates/sales_histogram.html")
show(hist)
Esempio n. 29
0
from bokeh.charts import Histogram, output_file, show
from bokeh.sampledata.autompg import autompg as df

# Histogram of the 'mpg' column using 50 bins.
p = Histogram(
    df,
    values='mpg',
    title="MPG Distribution (50 bins)",
    bins=50)

# Send the chart to a standalone HTML file and display it.
output_file("histogram_bins.html")
show(p)
Esempio n. 30
0
def _binMidPoints(binEdges):
    """Return the centre of every histogram bin delimited by *binEdges*."""
    return [(binEdges[k] + binEdges[k - 1]) / 2.
            for k in range(1, len(binEdges))]


def _relativeImpacts(featureValues, labels, binEdges, totalCounts):
    """Return per-bin shares of positive and negative rows for one feature.

    *featureValues* is split by ``labels == 1`` / ``labels == 0``, each part
    is histogrammed on *binEdges*, and the counts are divided by
    *totalCounts*. Bins with a zero total (0/0) yield 0 instead of NaN.

    Returns:
        ``(posImpact, negImpact)`` as float arrays with one entry per bin.
    """
    totals = np.asarray(totalCounts, dtype=float)
    impacts = []
    for outcome in (1, 0):
        counts, _ = np.histogram(featureValues[labels == outcome],
                                 bins=binEdges)
        # Empty bins divide 0 by 0; silence the warning and zero the NaNs.
        with np.errstate(divide='ignore', invalid='ignore'):
            ratio = counts.astype(float) / totals
        ratio[np.isnan(ratio)] = 0
        impacts.append(ratio)
    return impacts[0], impacts[1]


def preProcess(theFileName):
    """Run feature selection and model comparison on a CSV and write reports.

    The CSV must contain a label column ``y``; it is binarised, all other
    columns are one-hot encoded and standard-scaled. If the minority class
    makes up less than a third of the rows, random undersampling is applied.
    RFECV (L1 logistic regression, 3 folds, F1) picks the number of
    attributes, a random forest ranks all attributes, and four classifiers
    are cross-validated on the RFECV-selected attributes.

    Side effects: writes ``flask_files/static/thePlot.jpg``,
    ``flask_files/static/theHist.jpg``,
    ``flask_files/static/theNegPosHist.jpg`` and
    ``flask_files/static/Report.html``.

    Args:
        theFileName: Path of the CSV file (converted with ``str``).

    Returns:
        Tuple ``(introReport, tableOne, optimalNumberOfFeatures,
        classifierComparisons, theJPGs)``:

        * ``introReport`` -- two summary strings.
        * ``tableOne`` -- list of dicts with string ``Feature``, ``Weight``
          and ``Rank`` for the top-ranked attributes.
        * ``optimalNumberOfFeatures`` -- ``rfecv.n_features_``.
        * ``classifierComparisons`` -- list of dicts with ``Classifier`` and
          ``Score``.
        * ``theJPGs`` -- the three image paths relative to ``flask_files/``.
    """
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis=1)
    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)

    # get the features
    theFeatures = dp.drop('y', axis=1).columns

    # convert the dataframes to arrays
    X = dp.drop('y', axis=1).values
    y = dp['y'].values

    # Labels of the full data set, kept for the impact plots. An explicit
    # copy is taken because slicing a NumPy array only creates a view.
    yOrig = y.copy()

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================

    # apply random undersampling if labels are imbalanced
    labelSkewness = 100 * np.sum(y) * 1. / np.shape(y)[0]
    if np.min([labelSkewness, 100 - labelSkewness]) < (100. / 3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_sample(X, y)

    #=======================================================================

    # select optimal number of features
    thisModel = LogisticRegression(penalty='l1', C=1)
    rfecv = RFECV(estimator=thisModel,
                  step=1,
                  cv=StratifiedKFold(y, n_folds=3),
                  scoring='f1')
    Xt = rfecv.fit_transform(X, y)

    optimalNumberOfFeatures = rfecv.n_features_
    introReport = [
        'Optimal Number of Attributes: ' + str(optimalNumberOfFeatures),
        'The following attributes are the most influential to the outcome'
    ]

    #=======================================================================

    # plot number of selected features VS cross-validation scores
    plt.figure(figsize=(12, 8))
    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)

    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/' + imgOne, dpi=300)
    plt.close()  # release the figure once it is on disk

    #=======================================================================

    # get the feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X, y)
    theImportances = np.asarray(model.feature_importances_)

    # Column positions ordered from most to least important. A stable sort on
    # the negated weights keeps tied attributes distinct; looking positions up
    # by value with list.index() returned the first tied attribute repeatedly.
    rankedIdx = np.argsort(-theImportances, kind='mergesort')
    topIdx = rankedIdx[:optimalNumberOfFeatures]

    # ...and collect the selected features along with their weights and ranks
    tableOne = [
        dict(Feature=str(theFeatures[idx]),
             Weight=str(theImportances[idx]),
             Rank=str(rank))
        for rank, idx in enumerate(topIdx, 1)
    ]

    #=======================================================================

    # plot histogram of the most important feature
    topFeature = theFeatures[rankedIdx[0]]
    allThoseFeatures = dp[topFeature]

    plt.figure(figsize=(12, 8))
    combinedOutcomes = plt.hist(allThoseFeatures, bins=10)
    plt.xlabel('Attribute: ' + topFeature, fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)

    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/' + imgTwo, dpi=300)
    plt.close()

    #=======================================================================

    # plot impact of the most important feature; the per-class counts are
    # computed directly instead of drawing histograms that are never saved
    posImpact, negImpact = _relativeImpacts(allThoseFeatures, yOrig,
                                            combinedOutcomes[1],
                                            combinedOutcomes[0])
    midPoints = _binMidPoints(combinedOutcomes[1])

    plt.figure(figsize=(12, 8))
    plt.plot(midPoints, posImpact, '.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size': 20})
    plt.xlabel(topFeature, fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()

    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/' + imgThree, dpi=300)
    plt.close()

    #=======================================================================

    # generate plots for report (this is saved to an "html" file)

    from bokeh.charts import Histogram, output_file, save, gridplot
    from bokeh.plotting import figure

    plotList = []

    for idx in topIdx:
        thisFeatureIs = theFeatures[idx]
        featureValues = dp[thisFeatureIs]
        totalCounts, binEdges = np.histogram(featureValues, bins=10)
        posImpact, negImpact = _relativeImpacts(featureValues, yOrig,
                                                binEdges, totalCounts)
        midPoints = _binMidPoints(binEdges)

        hist0 = Histogram(dp,
                          values=thisFeatureIs,
                          color='blue',
                          title="Impact of " + thisFeatureIs,
                          bins=10)
        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        plot0.circle(midPoints,
                     list(negImpact),
                     size=10,
                     color="red",
                     alpha=0.9,
                     legend='Negative')
        plot0.circle(midPoints,
                     list(posImpact),
                     size=10,
                     color="green",
                     alpha=0.9,
                     legend='Positive')
        # one report row per attribute: histogram on the left, impact right
        plotList.append([hist0, plot0])

    output_file("flask_files/static/Report.html", title="Report")
    reportGrid = gridplot(plotList)
    save(reportGrid)

    #=======================================================================

    # specify the models to run tests with
    theModels = {
        'Logistic Regression': LogisticRegression(penalty='l1'),
        'LDA': LinearDiscriminantAnalysis(),
        'SVM': SVC(kernel='linear'),
        'Random Forest': RandomForestClassifier(n_estimators=300)
    }

    # ...then collect the results of the tests
    # NOTE(review): the reported score is the best of the three folds, not
    # their mean -- confirm this optimistic figure is what the report wants.
    classifierComparisons = []
    for aModel, model in theModels.items():
        results = cross_validation.cross_val_score(model,
                                                   Xt,
                                                   y,
                                                   scoring='f1',
                                                   cv=StratifiedKFold(
                                                       y, n_folds=3))
        classifierComparisons.append(
            dict(Classifier=aModel, Score=np.max(results)))

    #=======================================================================

    # paths of the plots, relative to flask_files/
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================

    return introReport, tableOne, optimalNumberOfFeatures, classifierComparisons, theJPGs