Esempi in Python per hist, esempi in Python per matplotlib.hist

Esempio n. 1

0

Mostra file

File: cluster.py Progetto: smba/SPLPioneerPublic

def plot_commit_differences(project, log=False):
    """Plot a histogram of the gaps between consecutive revision values.

    Reads every ``Revision`` row for ``project`` from ``../database.db``,
    takes column 2 of each row and histograms ``xs[i - 1] - xs[i]`` for each
    pair of neighbouring rows.

    :param project: value matched against the ``project`` column.
    :param log: when true, the y axis is drawn on a logarithmic scale.
    """
    conn = sql.connect('../database.db')
    try:
        c = conn.cursor()
        # Bind the project name as a query parameter rather than splicing it
        # into the SQL text, so quotes in the name cannot alter the statement.
        # NOTE(review): '?' is the sqlite3 placeholder style -- `sql` is
        # presumably `sqlite3`; confirm against the file's imports.
        c.execute("SELECT * FROM Revision WHERE project = ?", (project,))
        # NOTE(review): column 2 presumably holds the commit time -- confirm.
        xs = [row[2] for row in c]
    finally:
        # The query only reads, so there is nothing to commit; always release
        # the connection, even if the query fails.
        conn.close()

    # Difference between each value and the one that follows it.
    zs = [earlier - later for earlier, later in zip(xs, xs[1:])]

    plt.hist(zs, alpha=0.75)
    plt.xlabel('time to next commit [ms]')
    plt.ylabel('frequency')
    plt.title("Histogram of time between two commits for '" + project + "'")

    if log:
        # NOTE(review): newer matplotlib releases spell this keyword
        # `nonpositive`; kept as written for the version this file targets.
        plt.yscale('log', nonposy='clip')
    plt.grid(True)
    plt.show()

Esempio n. 2

0

Mostra file

def getCommentLengthsDistribution(comments):
    """Show a histogram of the length of every entry in ``comments``.

    Entries are read by position (``comments[0]`` .. ``comments[len - 1]``)
    and binned with edges 0, 10, ..., 490.
    """
    lengths = [len(comments[pos]) for pos in range(0, len(comments))]
    bin_edges = np.arange(0, 500, 10)

    plt.hist(lengths, bins=bin_edges)
    plt.show()

Esempio n. 3

0

Mostra file

def main():
    """Run 100 coin-flip rounds and save a histogram of the head counts.

    Calls ``create_coins(1000)`` once, then repeatedly calls ``flip_coins``
    and records ``count_heads``. The histogram of the recorded counts is
    written to ``coins.png``.
    """
    store = []
    coins = create_coins(1000)
    for _ in range(100):
        flip_coins(coins)
        store.append(count_heads(coins))
    # Open a single figure for the final histogram. Opening one on every
    # iteration left 100 empty figures alive for no benefit.
    plt.figure()
    plt.hist(store)
    plt.savefig('coins.png')

Esempio n. 4

0

Mostra file

File: seasonal_variation.py Progetto: ChristianNHill/The-Mean-Lovers

def Histogram(filename):
    """Save a black histogram of the 'Personal Injury' attribute of a CSV.

    The values come from ``ReadInAttributeFromCSV``; the figure is written
    with ``plt.savefig('visual')`` and then closed.
    """
    injuries = ReadInAttributeFromCSV(filename, 'Personal Injury')

    plt.title('Annual Personal Injury')
    plt.hist(injuries, color='black')
    # Axis captions
    plt.xlabel('Seasons')
    plt.ylabel('Number of')
    # Persist and release the figure
    plt.savefig('visual')
    plt.close()

Esempio n. 5

0

Mostra file

def histo(column):
    """Render a histogram of one column of ``mpg.csv`` and return the image.

    :param column: name of the column to plot.
    :return: the result of ``app.send_static_file("static/histo.png")``.
    """
    mpg = pandas.read_csv("mpg.csv")
    plt.clf()
    # A DataFrame exposes its labels as `columns`; `mpg.column` would instead
    # look up a column literally named "column".
    if column in mpg.columns:
        # Histogram the column's values, not the column name string.
        plt.hist(mpg[column])
        plt.title(column)
        plt.savefig("static/histo.png")
    else:
        print("There is no such an attribute in the given data.")
    # NOTE(review): this is returned even when the column is unknown, so a
    # previously saved image (if any) is served -- confirm that is intended.
    return app.send_static_file("static/histo.png")

Esempio n. 6

0

Mostra file

def feature_summary(x_col, y_col, show_r2=False):
    """Print and plot a summary of one feature against a target column.

    Prints the feature name and selected quantiles, draws a 30-bin histogram
    of the feature and, when the two columns have different names, fits a
    linear regression of ``y_col`` on ``x_col`` and draws it over a scatter
    plot. Finally plots a 10-sample rolling mean of the feature.

    :param x_col: feature column; must expose ``name``, ``quantile`` and
        ``rolling`` (presumably a pandas Series -- confirm with callers).
    :param y_col: target column, concatenated with ``x_col`` along axis 1.
    :param show_r2: when true, also print the R² score and the result of
        ``feature_importance``.
    :return: a single-space string.
    """
    # Preparation
    x_name = x_col.name
    y_name = y_col.name
    df = pd.concat([x_col, y_col], axis=1).sort_index()
    plt.rcParams["figure.figsize"] = (10, 7)
    breaks(1)
    print("%s" % x_name)
    print('Quantile:\n', x_col.quantile([0.0, 0.1, 0.25, 0.5, 0.75, 1.0]))

    # Histogram
    plt.subplot(221)
    try:
        plt.hist(x_col, bins=30)
        plt.xlabel(x_name)
        plt.title('Histogram (CF GHP): %s' % x_name)
    except ValueError:
        print("No histogram for %s available" % x_name)

    # Correlation
    if y_name != x_name:
        df = df.sort_values(x_name)
        # Name the axis explicitly: the positional form `drop(label, 1)` is
        # rejected by newer pandas releases.
        x = df.drop(y_name, axis=1)
        # NOTE(review): the `normalize` argument is not accepted by recent
        # scikit-learn releases -- confirm the pinned version.
        reg = linear_model.LinearRegression(normalize=True)
        reg.fit(x, df[y_name])
        # Plot
        plt.subplot(222)
        plt.scatter(df[x_name], df[y_name])
        plt.plot(df[x_name], reg.predict(x), color='g')
        plt.xlabel(x_name)
        plt.xlim([df[x_name].min(), df[x_name].max()])
        plt.title('x:%s / y:%s ' % (x_name, y_name))
        plt.ylabel("Target function: %s" % y_name)
        if show_r2:
            print("R²:", r2_score(df[y_name], reg.predict(x)))
            print(feature_importance(x, reg.coef_))

    # Show plots
    plt.show()

    # Timeline: rolling mean over 10 samples, x axis fixed to 170000..175000
    x_col.rolling(window=10,
                  center=False).mean().plot(title='%s: Timeline' % x_name,
                                            figsize=(10, 2),
                                            xlim=(170000, 175000))
    plt.show()

    plt.close('all')
    return " "

Esempio n. 7

0

Mostra file

File: readdata_crops.py Progetto: BNAadministrator3/body_sound_recognizer

 def StatisticData(self):
     """Print the mean first-axis length of all samples and histogram them.

     Fetches ``sum(self.DataNum)`` samples through ``self.GetData`` in
     'non-repetitive' mode and bins their ``shape[0]`` in steps of 20.
     """
     sample_lengths = []
     total = sum(self.DataNum)
     for sample_idx in range(total):
         sample, _labels = self.GetData(sample_idx, mode='non-repetitive')
         sample_lengths.append(sample.shape[0])
     print('mean value:', np.mean(sample_lengths))
     step = 20
     # Edges run from the shortest sample up to (at least) the longest one.
     edges = np.arange(min(sample_lengths), max(sample_lengths) + step, step)
     plt.hist(sample_lengths, bins=edges)
     plt.show()

Esempio n. 8

0

Mostra file

def getCommentLengthsDistribution(comments):
    """Show an annotated histogram of the length of every comment.

    Entries are read by position and binned with edges 0, 10, ..., 490; a
    dashed red vertical line is drawn at x=200.
    """
    lengths = [len(comments[pos]) for pos in range(0, len(comments))]
    bin_edges = np.arange(0, 500, 10)

    plt.hist(lengths, bins=bin_edges)
    plt.title('Histogram of Word Counts in Comments')
    plt.xlabel('Number of Words in Comment')
    plt.ylabel('Comment Counts')
    plt.axvline(x=200, color='r', linestyle='dashed', linewidth=2)
    plt.show()

Esempio n. 9

0

Mostra file

File: Statefarm_code.py Progetto: sampathpulukurthi/Python

def histogram_chart(plt, col, Ylabel="Frequency", Xlabel=None, Title="Histogram"):
    """Draw a histogram of the non-missing values of ``col``.

    :param plt: plotting object providing ``hist``, ``xlabel``, ``ylabel``
        and ``title`` (e.g. ``matplotlib.pyplot``).
    :param col: column to plot; only needs a ``dropna()`` method. It is not
        modified.
    :param Ylabel: y-axis caption; skipped when falsy.
    :param Xlabel: x-axis caption; skipped when falsy.
    :param Title: chart title.
    """
    # Work on a filtered copy: dropping in place would silently remove rows
    # from the caller's column.
    values = col.dropna()

    plt.hist(values)

    if Ylabel:
        plt.ylabel(Ylabel)

    if Xlabel:
        plt.xlabel(Xlabel)

    plt.title(Title)

Esempio n. 10

0

Mostra file

def analyze_reads(tmp_cirseq_dir):
    """Extract and BLAST the sequence edges around each read's aligned span.

    For a ``*.fasta`` file found under ``tmp_cirseq_dir`` and its companion
    ``<file>.blast`` table, computes per-``sseqid`` minimum/maximum subject
    coordinates, cuts the 5' and 3' edges of each sequence with
    ``extract_location``, keeps rows whose both edges are longer than 100,
    runs ``blast_seq`` on both edges, writes ``<file>.blast.edges.csv`` and
    saves a histogram to ``plot.png``.

    NOTE(review): the ``return`` sits inside the ``for`` loop, so only the
    first file yielded by ``glob`` is processed, and ``None`` is returned when
    no file matches -- confirm whether that is intended.

    :param tmp_cirseq_dir: directory prefix; concatenated directly with
        ``"*.fasta"`` and ``'plot.png'``, so it needs a trailing separator.
    :return: the edge table (DataFrame) of the first processed file.
    """
    fasta_files = glob.glob(tmp_cirseq_dir + "*.fasta")
    records = {}
    for file in fasta_files:
        records = parse_fasta(file)
        # One row per record id, single column 'seq'
        fasta_df = pd.DataFrame.from_dict(records, orient='index')
        fasta_df.index.name = 'id'
        fasta_df.columns = ['seq']
        blast_file = file + ".blast"
        blast_arr_names = [
            "sseqid", "qstart", "qend", "sstart1", "send1", "sstrand",
            "length", "btop", "sstart2", "send2"
        ]
        blast_df = pd.DataFrame()

        data = pd.read_csv(blast_file,
                           sep="\t",
                           header=None,
                           names=blast_arr_names)
        # Duplicate the start/end columns so that both the min and the max of
        # each can be aggregated in a single pass below.
        data["sstart2"] = data["sstart1"]
        data["send2"] = data["send1"]
        grouped_df = data.groupby("sseqid").agg({
            'sstart1': 'min',
            'sstart2': 'max',
            'send1': 'min',
            'send2': 'max'
        })
        # Overall smallest / largest coordinate across the aggregated columns
        grouped_df['sstart'] = grouped_df.min(axis=1)
        grouped_df['send'] = grouped_df.max(axis=1)
        # NOTE(review): DataFrame.append is not available in recent pandas
        # releases -- confirm the pinned version.
        blast_df = pd.DataFrame.append(blast_df, grouped_df)
        blast_df = blast_df.join(fasta_df)
        blast_df['edge5'] = blast_df.apply(
            lambda x: extract_location(x["seq"], 0, x["sstart"]), axis=1)
        blast_df['edge3'] = blast_df.apply(
            lambda x: extract_location(x["seq"], x["send"], -1), axis=1)
        # Keep only rows where both edges are longer than 100
        blast_df['edge5_100'] = blast_df['edge5'].apply(lambda x: len(x) > 100)
        blast_df['edge3_100'] = blast_df['edge3'].apply(lambda x: len(x) > 100)
        blast_df = blast_df[blast_df.edge3_100 != False]
        blast_df = blast_df[blast_df.edge5_100 != False]
        del blast_df['edge3_100']
        del blast_df['edge5_100']
        blast_df["blast5"] = blast_df.apply(
            lambda row: blast_seq(row["edge5"]), axis=1)
        blast_df["blast3"] = blast_df.apply(
            lambda row: blast_seq(row["edge3"]), axis=1)
        blast_df.to_csv(blast_file + ".edges.csv", sep=',', encoding='utf-8')
        # NOTE(review): the edge columns hold the extracted edges themselves,
        # not their lengths -- confirm this is what should be histogrammed.
        plt.hist(blast_df["edge3"], bins=50)
        plt.hist(blast_df["edge5"], bins=50)
        plt.savefig(tmp_cirseq_dir + 'plot.png')
        return blast_df

Esempio n. 11

0

Mostra file

def graph(x,y,xLabel,yLabel,title,figname):
    """Save a histogram of ``x`` with its mean and a reference value ``y``.

    Draws the histogram of ``x``, a dashed vertical line at the mean of
    ``x``, a vertical line at ``y`` spanning heights 0..9, and adds the
    p-value of ``ttest_ind(x, [y])`` to the legend. The figure is written to
    ``figname``.
    """
    plt.clf()
    plt.hist(x, color="c", edgecolor="k", alpha=0.5)
    sample_mean = np.array(x).mean()
    plt.axvline(sample_mean, color="k", linestyle="dashed", linewidth=3,
                label="average")
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)

    # Vertical marker for y: ten identical x positions against heights 0..9
    heights = np.arange(0, 10, 1)
    marker_xs = np.array([y] * 10)
    plt.plot(marker_xs, heights, label="model accuracy")
    p_value = ttest_ind(x, [y])[1]
    # Empty, white series whose only purpose is to carry the legend entry
    plt.plot([], [], label=f"p-value: {np.round(p_value,4)}", color="w")
    plt.legend()
    plt.savefig(figname)

Esempio n. 12

0

Mostra file

 def loss_store2(self, x_train, x_gene):
     """Pickle the training artefacts and save the diagnostic figures.

     Writes ``x_train``, ``x_gene``, ``self.wdis_store`` and the four arrays
     returned by ``dwp`` as pickles under ``./result/``, and saves four
     figures: a histogram of the generated data, the Wasserstein-distance
     curve, and two dimension-wise scatter plots.

     :param x_train: training data, pickled and passed to ``dwp``.
     :param x_gene: generated data, pickled, histogrammed and passed to
         ``dwp``.
     """
     with open('./result/genefinalfig/x_train.pickle', 'wb') as fp:
         pickle.dump(x_train, fp)
     with open('./result/genefinalfig/generated.pickle', 'wb') as fp:
         pickle.dump(x_gene, fp)
     bins = 100
     plt.hist(x_gene, bins, facecolor='red', alpha=0.5)
     plt.title('Histogram of distribution of generated data')
     plt.xlabel('Generated data value')
     plt.ylabel('Frequency')
     plt.savefig(
         './result/genefinalfig/WGAN-Generated-data-distribution.jpg')
     plt.close()
     with open('./result/lossfig/wdis.pickle', 'wb') as fp:
         pickle.dump(self.wdis_store, fp)
     t = arange(len(self.wdis_store))
     plt.plot(t, self.wdis_store, 'r--')
     plt.xlabel('Iterations')
     plt.ylabel('Wasserstein distance')
     plt.savefig('./result/lossfig/WGAN-W-distance.jpg')
     plt.close()
     rv_pre, gv_pre, rv_pro, gv_pro = dwp(x_train, x_gene, self.testX,
                                          self.db)
     # Parenthesised single-argument form prints the same text on Python 2
     # and is valid syntax on Python 3.
     print('Totally ' + str(len(rv_pre)) + ' of coordinates are left')
     with open('./result/genefinalfig/rv_pre.pickle', 'wb') as fp:
         pickle.dump(rv_pre, fp)
     with open('./result/genefinalfig/gv_pre.pickle', 'wb') as fp:
         pickle.dump(gv_pre, fp)
     with open('./result/genefinalfig/rv_pro.pickle', 'wb') as fp:
         pickle.dump(rv_pro, fp)
     with open('./result/genefinalfig/gv_pro.pickle', 'wb') as fp:
         pickle.dump(gv_pro, fp)
     # The pickles above hold the raw values; noise is applied only to the
     # copies that are plotted.
     rv_pre, gv_pre, rv_pro, gv_pro = fig_add_noise(rv_pre), fig_add_noise(
         gv_pre), fig_add_noise(rv_pro), fig_add_noise(gv_pro)
     plt.scatter(rv_pre, gv_pre)
     plt.title('Dimension-wise prediction, lr')
     plt.xlabel('Real data')
     plt.ylabel('Generated data')
     plt.savefig('./result/genefinalfig/WGAN-dim-wise-prediction.jpg')
     plt.close()
     plt.scatter(rv_pro, gv_pro)
     plt.title('Dimension-wise probability, lr')
     plt.xlabel('Real data')
     plt.ylabel('Generated data')
     plt.savefig('./result/genefinalfig/WGAN-dim-wise-probability.jpg')
     plt.close()

Esempio n. 13

0

Mostra file

File: analisiDati.py Progetto: leonardopoggiani/PECSNproject

def histogram(df, nbin, name, k):
    """Show a normalised green histogram of the ``'Mean_' + name`` column.

    :param df: table indexed by column name.
    :param nbin: bins argument forwarded to ``plt.hist``.
    :param name: suffix of the column to plot; also used in the title.
    :param k: label inserted in the title (an ``'s'`` is appended to it).
    """
    column = df['Mean_' + name]
    heading = 'Histogram of ' + name + ' ' + k + 's'

    plt.figure()
    plt.hist(column, nbin, density=True, facecolor='g', alpha=0.75)
    plt.title(heading)
    plt.grid(True)
    plt.show()

Esempio n. 14

0

Mostra file

File: messenger_analysis_panels.py Progetto: Sharn-konet/messenger-analysis

def create_individual_statistics_panel(message_df, title, participants,
                                       colour_palette):
    """Print the longest message and show a histogram of message lengths.

    Only rows whose 'Type' is 'Message' are considered. The histogram covers
    the shortest 95% of the lengths, with one bin per distinct length.
    """
    is_message = message_df['Type'] == 'Message'
    all_messages = message_df.loc[is_message,
                                  'Message'].reset_index()['Message']

    lengths = [len(text) for text in all_messages]
    longest = max(lengths)
    print("The maximum length message you've ever sent is: {}".format(
        longest))
    # The index was reset above, so the list position doubles as the label.
    print("\nThe message was: \n {}".format(
        all_messages.loc[lengths.index(longest)]))

    lengths.sort()

    # From this it seems like 70 characters would be a good amount of characters to use
    cutoff = int(len(lengths) * 0.95)
    trimmed = lengths[:cutoff]
    plt.hist(trimmed, bins=len(set(trimmed)))
    plt.show()

Esempio n. 15

0

Mostra file

File: gmapper.py Progetto: SkBlaz/GeneMapper

    def display(self, data, candidates, fname, display):
        """Plot candidate frequencies over a histogram of ``data`` and save it.

        The first elements of ``candidates`` are split into two halves whose
        means and ranges feed a "one or two distributions" annotation. The
        figure is saved next to ``fname`` with its last three characters
        replaced by ``png``.

        :param data: values to histogram (unit-width bins from min to max).
        :param candidates: sequence of pairs; plotted as red dots.
        :param fname: input file name, used for the title and output name.
        :param display: when equal to ``True``, also show the figure.
        """
        finallist = [c[0] for c in candidates]
        # Floor division keeps the split point an integer on Python 3 too;
        # plain `/` yields a float there, which is not a valid slice index.
        half = len(finallist) // 2
        part1 = finallist[:half]
        part2 = finallist[half:]

        meandiff = int(np.sqrt(np.power(np.mean(part2), 2) - np.power(np.mean(part1), 2)))
        rangeA = max(part1) - min(part1)
        rangeB = max(part2) - min(part2)
        span = int((rangeA + rangeB) / 2)
        dspan = int(meandiff / span)
        # NOTE(review): with integer inputs this division truncates on
        # Python 2 but not on Python 3; theta is rendered with %d either way.
        theta = float(meandiff / (rangeA + rangeB))
        if dspan > 3 and meandiff > 20 or meandiff > 36:
            oneortwo = "Two distributions \n\n MD: %d \n Span: %d \n Dspan: %d \n theta: %d" % (meandiff, span, dspan, theta)
        else:
            oneortwo = "One distribution \n\n MD: %d \n Span: %d \n Dspan: %d \n theta: %d" % (meandiff, span, dspan, theta)

        cans = np.array(candidates)
        plt.plot(cans[:, 0], cans[:, 1], 'ro')
        # Reference lines at 25%, 50% and 75% of the maximum frequency
        plt.axhline(max(cans[:, 1]) / 4, color='r')
        plt.axhline(max(cans[:, 1] / 2), color='r')
        plt.axhline(int(max(cans[:, 1])) * 0.75, color='r')
        red_patch = mpatches.Patch(color='red', label='75%, 50% and 25% \nof maximum frequency')
        plt.legend(handles=[red_patch])
        plt.ylabel('Frequency of occurence')
        plt.xlabel('separate items')
        plt.title('Frequency distribution estimation graph: %s' % (fname))
        plt.text(max(data) * 1.1, max(cans[:, 1]) * 0.62, oneortwo, fontsize=11, color='r')
        plt.hist(data, range(int(min(data)), int(max(data)), 1))
        ofile = fname[0:-3] + "png"
        # Format inside the call: `print (...) % x` applies `%` to print's
        # return value on Python 3 and raises TypeError.
        print("Writing outfile: %s" % (ofile))
        plt.savefig(ofile, bbox_inches='tight')
        if display == True:
            plt.show()
        return

Esempio n. 16

0

Mostra file

    def plot_histogram(self):
        """Method to output a histogram of the instance variable data using
        matplotlib pyplot library.

        Args:
            None

        Returns:
            None
        """
        # Histogram the stored samples themselves; `self.mean + self.stdev`
        # is a derived value (presumably a single number), not the data set.
        plt.hist(self.data, 50, density=True, facecolor='b', alpha=0.75)

        plt.xlabel('x-axis label')
        plt.ylabel('y-axis label')
        plt.title('Histogram Title')
        # No explicit plt.axis(...) call: it expects axis limits
        # [xmin, xmax, ymin, ymax], not the data, so the limits are left to
        # matplotlib's autoscaling.
        plt.show()

Esempio n. 17

0

Mostra file

File: n_dimensionalNormal.py Progetto: tuangauss/DataScienceProjects

def get_graph(n, title):
    """Show a histogram of squared norms of n-dimensional normal samples.

    Draws ``N`` (module-level) samples from an ``n``-dimensional standard
    normal distribution, histograms their squared Euclidean norms, and marks
    the two bounds returned by ``get_2_std_estimates`` with red lines.

    :param n: dimensionality of each sample.
    :param title: figure title.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)
    # `density` is a boolean flag; the return value is not needed, and
    # unpacking it into `n` would overwrite the dimension parameter.
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()

Esempio n. 18

0

Mostra file

def plot_per_historgram(per_path:str, save_path:str=None):
    """
    Plot PER values as a 10-bin histogram over [0, 1] and save the figure.

    Args:
        per_path (str): path to a CSV file with the sample_id in the first
            column and the PER value in the second. Every row is parsed as a
            float, so the file must not contain a header row.
        save_path (str): path to save the histogram plot; defaults to
            "PER_histogram.png" when None.
    """
    import csv
    import matplotlib.pyplot as plt
    import numpy as np

    # newline='' lets the csv module handle line endings itself, as the csv
    # documentation recommends.
    with open(per_path, 'r', newline='') as fid:
        reader = csv.reader(fid, delimiter=',')
        per_list = [float(row[1]) for row in reader]

    plt.hist(per_list, bins=10, range=(0.0, 1.0))
    plt.title("histogram of 2020-10-29 model PER values")
    plt.xlabel("PER bins")
    plt.ylabel("# of records")
    plt.xticks(np.arange(0, 1.1, step=0.1))
    # Identity comparison is the correct test for the None default.
    if save_path is None:
        save_path = "PER_histogram.png"
    plt.savefig(save_path)

Esempio n. 19

0

Mostra file

File: Major1 final code.py Progetto: nikhileshsingh1603/projects

from matplotlib import pyplot as plt


# In[11]:


# Plot one histogram per column of dataset2 on a 6x5 grid
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns', fontsize=20)
column_count = dataset2.shape[1]
for position in range(column_count):
    # Subplot numbering is 1-based, column positions are 0-based
    plt.subplot(6, 5, position + 1)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    f.set_title(dataset2.columns.values[position])
    column = dataset2.iloc[:, position]
    # One bin per distinct value in the column
    vals = np.size(column.unique())
    plt.hist(column, bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])


# In[12]:


# Pie-chart plots: select the columns to chart into dataset2.
# NOTE(review): 'is_referred' is listed twice (first and last entry), so the
# selection contains that column twice -- confirm whether that is intended.
dataset2=dataset[['housing','is_referred','app_downloaded','web_user', 'app_web_user', 'ios_user',
       'android_user', 'registered_phones', 'payment_type', 'waiting_4_loan',
       'cancelled_loan', 'received_loan', 'rejected_loan', 'zodiac_sign',
       'left_for_two_month_plus', 'left_for_one_month', 'is_referred']]


# In[13]:

Esempio n. 20

0

Mostra file

# Using strings and lists to summarise total number of species in the data set
# https://stackoverflow.com/questions/997797/what-does-s-mean-in-python

species_list = list(data["Species"].unique())
print("Types of species: %s\n" % species_list)

# Create a DataFrame to structure the data correctly
# Reference: http://www.datasciencemadesimple.com/get-list-column-headers-column-name-python-pandas/

# `data` is indexed by column label above, so positional column access has to
# go through .iloc (`data[:, 0]` is not valid on a DataFrame).
# NOTE(review): positions 0-4 are assumed to be the four measurements followed
# by the species -- confirm against the loaded file's column order.
d = {
    "SepalLengthCm": data.iloc[:, 0],
    "SepalWidthCm": data.iloc[:, 1],
    "PetalLengthCm": data.iloc[:, 2],
    # The key must match the name in `columns` below; a misspelt key yields
    # an all-NaN column.
    "PetalWidthCm": data.iloc[:, 3],
    "Species": data.iloc[:, 4]
}

df = pd.DataFrame(d,
                  columns=[
                      "SepalLengthCm", "SepalWidthCm", "PetalLengthCm",
                      "PetalWidthCm", "Species"
                  ])
df

# Build on Tutorial and plot the data on a histogram
import matplotlib.pyplot as pl

# NOTE(review): `firstcol` is not defined in this snippet -- it has to be
# assigned earlier in the session.
pl.hist(firstcol)
# `show` must be called; the bare attribute reference displays nothing.
pl.show()

Esempio n. 21

0

Mostra file

File: numpy_tutorial.py Progetto: mlanier/Mlanier_Master

# The plotting functions (scatter, hist, imshow, show) live in
# matplotlib.pyplot, not in the top-level matplotlib package.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# `.values` returns the underlying array on old and new pandas alike;
# `as_matrix()` is not available in recent releases.
dat = pd.read_csv('Voters.csv').values
x = dat[:, 0]
y = dat[:, 1]
plt.scatter(x, y)
plt.show()
plt.hist(x)
plt.hist(y, bins=15)


# images
train = pd.read_csv('test.csv')
M = train.values
# First row without its first column, viewed as a 28x28 image
im = M[0, 1:]
im = im.reshape(28, 28)
plt.imshow(im)
plt.show()
plt.imshow(im, cmap="gray")


from scipy.stats import norm
norm.pdf(0)
norm.pdf(0, loc=5, scale=10)
r = np.random.randn(10)
norm.pdf(r)
norm.cdf(r)
# 10000 samples with standard deviation 10 and mean 5
r = 10 * np.random.randn(10000) + 5

Esempio n. 22

0

Mostra file

File: TP1- answers.py Progetto: sebastienmascha/data-science-ML-python-isep

import matplotlib as plt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from math import sqrt,pi,exp


# In[35]:


# Generate a random series of 1000 integers in [0, 10) and plot them with
# matplotlib

A = np.random.randint(0, 10, 1000)
figure, axe = plt.subplots(figsize=(10, 4))
# `axe` is the current axes, so drawing through it targets the same plot
axe.hist(A, bins=19, color='red')
axe.set_title("Histogram of the dataset")
axe.set_xlabel("Number")
axe.set_ylabel("Frequency")


# In[36]:


# Compute the mean/max/mode

# The mean is the sum of all the elements divided by the length of the array

total = sum(A)
mean = total / len(A)

Esempio n. 23

0

Mostra file

File: Fairlie_IMDB_HW.py Progetto: l2nguyen/Dat6-students

    data[genre] = [genre in movie.split('|') for movie in data.genres]
         
data.head()

# Drop the last 7 characters of every title
data['title'] = [t[0:-7] for t in data.title]
data.head()

data[['score', 'runtime', 'year', 'votes']].describe()

# Parenthesised form works on both Python 2 and Python 3
print(len(data[data.runtime == 0]))

# Assign through .loc: chained indexing (`data.runtime[mask] = ...`) may write
# to a temporary copy and leave `data` unchanged.
data.loc[data.runtime == 0, 'runtime'] = np.nan

data.runtime.describe()

plt.hist(data.year, bins=np.arange(1950, 2013), color='#cccccc')
plt.xlabel("Release Year")
remove_border()
# Received the following messages:
# Traceback (most recent call last):
#   File "<stdin>", line 1, in <module>
# AttributeError: 'module' object has no attribute 'hist'
# AND
# AttributeError: 'module' object has no attribute 'xlabel'
# NOTE(review): this error is what you get when `plt` is bound to the
# top-level `matplotlib` package instead of `matplotlib.pyplot` -- check the
# import at the top of the file.

plt.hist(data.score, bins=20, color='#cccccc')
plt.xlabel("IMDB rating")
remove_border()
# Again, I'm receiving AttributeError messages. Is there an issue with
# matplotlib that is not allowing me to produce a histogram?

plt.scatter(data.year, data.score, lw=0, alpha=.08, color='k')

Esempio n. 24

0

Mostra file

File: pandas_exercises.py Progetto: Padraic-Doran/python-exercises

# Write the code necessary to create a cross tabulation of the number of titles by department.
# (Hint: this will involve a combination of SQL and python/pandas code)

dept_query = """SELECT * FROM departments join dept_emp on dept_emp.dept_no = departments.dept_no"""
dept_and_dept_emp = pd.read_sql(dept_query, url)
dept_and_dept_emp
t_dept_dept_emp = pd.merge(titles, dept_and_dept_emp)
t_dept_name = t_dept_dept_emp[['title', 'dept_name']]
# Group the frame built on the previous line; `titles_dept_name` is not
# defined anywhere in this snippet.
t_dept_name.groupby('dept_name').count()

joined = employees.join(titles.set_index('emp_no'),on='emp_no')
# True for rows whose to_date does NOT contain '9999'
mask_current = joined['to_date'].apply(lambda x: '9999' not in str(x))
# Take an explicit copy so the new column is added to an independent frame
# rather than to a slice of `joined`.
changed = joined[mask_current].copy()
changed["diff"] = changed["to_date"] - changed["from_date"]
plt.hist(changed['diff'].apply(lambda x: x.days/365),bins=6)
plt.xlabel('Years')
plt.ylabel('# of Employees')
plt.title('Frequency of Job Changes')
plt.show()



# In[103]:


# Use your get_db_url function to help you explore the data from the chipotle database.
# Use the data to answer the following questions:

database_name = "chipotle"
orders_query = """SELECT * FROM orders"""

Esempio n. 25

0

Mostra file

# IPython log file

import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt

df = pd.read_csv("http://www.biostat.jhsph.edu/~rpeng/useRbook/faithful.csv")

# Scatter of the two columns
plt.plot(df["eruptions"], df["waiting"], "b.")
plt.title("eruptions vs waiting")
plt.savefig("scatter.png")
plt.clf()

# One histogram image per column, clearing the figure after each save
for column in ("eruptions", "waiting"):
    plt.hist(df[column])
    plt.savefig(column + ".png")
    plt.clf()

Esempio n. 26

0

Mostra file

File: Markov chain_Normal distribution.py Progetto: caitlinng/MCMC

        # "Else" reject, though nothing to write

        # Store iterate
        chain[i, 0] = ll
        chain[i, 1:] = param

# In[11]:
# Graphs
import matplotlib.pyplot as plt

# Histograms of the second half of the chain, one figure per parameter
# (chain column 1 next to mu, column 2 next to sd)
burn_in = int(n_iterates / 2)
for chain_column, label, value in ((1, 'mu', mu), (2, 'sd', sd)):
    plt.figure()
    plt.hist(chain[burn_in:, chain_column])
    plt.title(label + ' frequency')
    print(label + ' = ' + str(value))

plt.show()

# In[16]:
'''
Graphs alternative

import matplotlib.pyplot as plt

Esempio n. 27

0

Mostra file

import numpy as np
# xlabel/ylabel/hist/savefig/show are pyplot functions; the top-level
# matplotlib package does not provide them.
import matplotlib.pyplot as plt

# Generate, store and plot three Poisson(lambda=6) samples of size 1000
for i in range(3):
    p = np.random.poisson(lam=6, size=1000)
    np.savetxt("Data Sets/Poisson data set " + str(i), p, delimiter=",")
    plt.xlabel("x")
    plt.ylabel("P(X)=x")
    plt.hist(p, density=True)
    plt.savefig("Graphs/Poisson/Poisson " + str(i) + ".png")
    plt.show()
    # Start the next iteration from a fresh figure so that histograms do not
    # pile up on top of each other when show() does not close the window.
    plt.close()

Esempio n. 28

0

Mostra file

File: Empire_state_mini_project.py Progetto: TiandiZ/Python_General

import numpy as np
# hist/show are pyplot functions; the top-level matplotlib package does not
# provide them.
import matplotlib.pyplot as plt
np.random.seed(123)

# Simulate random walk 500 times
all_walks = []
for i in range(500):
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            # Step down, but never below 0
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            # Roll again and climb by that amount
            step = step + np.random.randint(1, 7)
        # With probability 0.001 the walk is reset to 0
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    all_walks.append(random_walk)

# Create and plot np_aw_t
np_aw_t = np.transpose(np.array(all_walks))

# Select last row from np_aw_t: ends
ends = np_aw_t[-1, :]

# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()

Esempio n. 29

0

Mostra file

File: Investigate_a_Dataset.py Progetto: xc2496/Python_Medical-Appointment-No-Shows

# <a id='eda'></a>
# ## Exploratory Data Analysis
#
# > **Tip**: Now that you've trimmed and cleaned your data, you're ready to move on to exploration. Compute statistics and create visualizations with the goal of addressing the research questions that you posed in the Introduction section. It is recommended that you be systematic with your approach. Look at one variable at a time, and then follow it up by looking at relationships between variables.
#
# ### Question 1: What is the independent variable? What is the dependent variable?

# Gender,ScheduledDay,AppointmentDay, Age, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap, SMS_received and WeekDay are independent variables.
# Whether the patient was no-show (No-show)	is the dependent variable
#

# ### Question 2: What are the no show record differences between female and male?

# In[17]:

# Histogram of the Gender column; int(180 / 50) evaluates to 3 bins.
plt.hist(df['Gender'], color='blue', edgecolor='black', bins=int(180 / 50))
plt.title('Histogram of Gender distribution')
plt.xlabel('Gender')
plt.ylabel('Frequency')

# There are around 70000 female patients and 40000 male patients in the record.

# In[18]:

# Boolean masks over the NoShow column.
# NOTE(review): NoShow == "Yes" presumably marks patients who did NOT turn
# up, which would make the two names below inverted -- confirm against the
# dataset documentation before relying on them.
attend = df.NoShow == "Yes"
not_attend = df.NoShow == "No"

# In[19]:

plt.hist(df.Gender[attend],

Esempio n. 30

0

Mostra file

File: PR1e_PCA_LDA_hist.py Progetto: carloarp/Pattern-Recognition-2

def main():
    """Evaluate nearest-neighbour retrieval on intensity histograms of faces.

    Loads the face data, splits it into train and test sets, turns every
    image (one column of the data matrix) into a histogram with bin edges
    0, 10, ..., 260, and then, for each distance metric and each PCA
    dimensionality, computes mAP, Acc@1 and Acc@10 via ``calculate_map`` on
    the PCA and the PCA-LDA projections of the test histograms. One
    performance figure per metric is saved under the figure's title.
    """

    #### LOAD FACE DATA
    face_data, face_label = load_face_data('face(1).mat')

    #### PARTITION DATA INTO TRAIN AND TEST SET
    X_train, X_test, Y_train, Y_test = partition_data(face_data,
                                                      face_label,
                                                      show='no')

    #### OBTAIN ORIGINAL AND NORMALIZED FEATURE VECTORS
    # None of these four results is used again in this function.
    original_train, norm_train = get_original_normalized_feature_vectors(
        X_train, show='no')
    original_test, norm_test = get_original_normalized_feature_vectors(
        X_test, show='no')

    #### DISTANCE DEFINITIONS
    L1_NN = NearestNeighbors(n_neighbors=200, metric='minkowski',
                             p=1)  # manhattan l1
    L2_NN = NearestNeighbors(n_neighbors=200, metric='minkowski',
                             p=2)  # euclidean l2
    Linf_NN = NearestNeighbors(n_neighbors=200,
                               metric='chebyshev')  # chessboard/chebyshev linf

    earthmover = NearestNeighbors(n_neighbors=200,
                                  metric=wasserstein_distance)  # wasserstein
    intersection = NearestNeighbors(
        n_neighbors=200, metric=histogram_intersection)  # intersection
    chisquare = NearestNeighbors(n_neighbors=200, metric=chi)
    kldiv = NearestNeighbors(n_neighbors=200, metric=kl)
    js = NearestNeighbors(n_neighbors=200, metric=distance.jensenshannon)

    #### HISTOGRAM
    # Collect the columns of each data matrix as separate vectors
    A_test = []
    for i in range(0, X_test.shape[1]):
        A_test.append(X_test[:, i])

    A_train = []
    for i in range(0, X_train.shape[1]):
        A_train.append(X_train[:, i])

    # bin_width, intensity_max and n_bins are not used again below; the bin
    # edges actually applied come from bin_list.
    bin_width = 10
    intensity_max = 255
    n_bins = math.ceil(intensity_max / bin_width)

    bin_list = np.arange(0, 270, 10).tolist()  # Create a bin list from 0-260
    # It was found empirically that test images' pixel intensities ranged from 0 to ~260
    # Assuming uniform quantisation

    print("List of bins:", '\n', bin_list, '\n')

    # plt.hist is used only for its bin counts; each figure is closed at once.
    X_hist_test = []
    for i in range(0, X_test.shape[1]):
        X_hist, bins, patches = plt.hist(A_test[i], bins=bin_list)
        X_hist_test.append(X_hist)
        plt.close()

    X_hist_train = []
    for j in range(0, X_train.shape[1]):
        X_hist, bins, patches = plt.hist(A_train[j], bins=bin_list)
        X_hist_train.append(X_hist)
        plt.close()
    plt.close()

    X_hist_test = np.asarray(X_hist_test)
    X_hist_train = np.asarray(X_hist_train)

    # `methods` and `method_name` are parallel lists, matched by position
    # through `method_count`.
    methods = [
        L2_NN, L1_NN, Linf_NN, earthmover, intersection, chisquare, kldiv, js
    ]
    method_name = [
        'L2', 'L1', 'Linf_NN', 'Earthmover', 'Intersection', 'Chi-Square',
        'K-L Divergence', 'JS'
    ]
    test_datas = [X_hist_test]
    train_datas = [X_hist_train]
    test_name = ['Histogram']
    # The first list is immediately replaced by the second assignment.
    M_pca_list = [16, 32, 64, 128, 256]
    M_pca_list = [4, 8, 12, 16, 22, 26]  # max of 26 as there are 26 bins
    data_type = [0, 1]

    recall_levels = 11
    M_lda = 10
    lda = LinearDiscriminantAnalysis(n_components=M_lda)

    method_count = 0
    for method in methods:

        #for test_data in test_datas:
        #for type in data_type:

        # Per-metric result series, one entry per PCA dimensionality
        Mpca_list = []
        mAP_pca_list = []
        mAP_lda_list = []

        acc1_pca_list = []
        acc1_lda_list = []

        acc10_pca_list = []
        acc10_lda_list = []

        for M_pca in M_pca_list:

            #pca = PCA(n_components=M_pca)
            #lda = LinearDiscriminantAnalysis(n_components=M_lda)

            #test_pca = pca.fit_transform(test_data)
            #test_lda = lda.fit_transform(test_pca, Y_test)

            pca = PCA(n_components=M_pca)
            #train_pca = pca.fit_transform(train_datas[0])
            #test_pca = pca.transform(test_datas[0])

            # PCA and LDA are fitted on the training histograms and then
            # applied to the test histograms.
            train_pca = pca.fit_transform(X_hist_train)
            test_pca = pca.transform(X_hist_test)

            train_lda = lda.fit_transform(train_pca, Y_train)
            test_lda = lda.transform(test_pca)

            # The neighbour index is built from, and queried with, the test
            # projections.
            method.fit(test_pca)
            method_nbrs_pca = np.asarray(method.kneighbors(test_pca))
            method_map_pca, method_df_pca, acc1_pca, acc10_pca = calculate_map(
                method_nbrs_pca, Y_test, recall_levels)

            method.fit(test_lda)
            method_nbrs_lda = np.asarray(method.kneighbors(test_lda))
            method_map_lda, method_df_lda, acc1_lda, acc10_lda = calculate_map(
                method_nbrs_lda, Y_test, recall_levels)

            #print(method_name[method_count],test_name[name_count],", Mpca =",M_pca,"PCA mAP:",method_map_pca)
            #print(method_name[method_count],test_name[name_count],", Mpca =",M_pca,"PCA-LDA mAP:",method_map_lda)

            print(method_name[method_count], ", Mpca =", M_pca, "PCA mAP:",
                  method_map_pca, ",Acc@1:", acc1_pca, ",Acc@10:", acc10_pca)
            print(method_name[method_count], ", Mpca =", M_pca, "PCA-LDA mAP:",
                  method_map_lda, ",Acc@1:", acc1_lda, ",Acc@10:", acc10_lda)

            Mpca_list.append(M_pca)
            mAP_pca_list.append(method_map_pca)
            mAP_lda_list.append(method_map_lda)

            acc1_pca_list.append(acc1_pca)
            acc1_lda_list.append(acc1_lda)
            acc10_pca_list.append(acc10_pca)
            acc10_lda_list.append(acc10_lda)

        # Short aliases for plotting: x1 is the PCA dimensionality axis
        x1 = Mpca_list
        y1 = mAP_pca_list
        y2 = mAP_lda_list
        y3 = acc1_pca_list
        y4 = acc1_lda_list
        y5 = acc10_pca_list
        y6 = acc10_lda_list

        plt.figure(figsize=(10, 10))

        # Colour encodes the measure, marker encodes PCA ('o') vs PCA-LDA ('x')
        plt.plot(x1, y1, color='red', label='PCA mAP', marker='o')
        plt.plot(x1, y2, color='red', label='PCA-LDA mAP', marker='x')

        plt.plot(x1, y3, color='blue', label='PCA Acc@rank1', marker='o')
        plt.plot(x1, y4, color='blue', label='PCA-LDA Acc@rank1', marker='x')

        plt.plot(x1, y5, color='green', label='PCA Acc@rank10', marker='o')
        plt.plot(x1, y6, color='green', label='PCA-LDA Acc@rank10', marker='x')

        plt.grid(color='black', linestyle='-',
                 linewidth=0.1)  # parameters for plot grid
        title_name = str(method_name[method_count] + " " + test_name[0] +
                         ' PCA and PCA-LDA Performance')
        plt.title(title_name).set_position([0.5, 1.05])
        plt.xlabel('Mpca')
        plt.ylabel('mAP, Accuracy')
        plt.legend(loc='best')
        '''
		for i, txt in enumerate(y1):
			plt.annotate(txt, (x1[i], y1[i]))
		for i, txt in enumerate(y2):
			plt.annotate(txt, (x1[i], y2[i]))	
		'''

        # The figure title doubles as the output file name
        plt.savefig(title_name)
        #plt.show()
        plt.close()

        print("		")
        method_count = method_count + 1

Esempio n. 31

0

Mostra file

File: Chapter2.py Progetto: ssyue/learningspark-1

__author__ = '49236_000'

import numpy
import os
import sys
from pyspark import SparkContext
# `import matplotlib.pyplot as hist` bound the whole module to `hist`, which
# is not callable, and left the name `matplotlib` unbound. Import the module
# by its full name and bring the `hist` function in explicitly.
import matplotlib.pyplot
from matplotlib.pyplot import hist

# Sample record: u'1|24|M|technician|85711'
sc = SparkContext("local", "Chapter2")
user_data = sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()
# Fields are pipe-separated: index 0, 2, 3 and 4 are counted below as user,
# gender, occupation and ZIP code; index 1 is parsed as the age.
user_fields = user_data.map(lambda line: line.split("|"))
num_user = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()

# Formatting inside the parentheses works on both Python 2 and Python 3
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (
    num_user, num_genders, num_occupations, num_zipcodes))

# histogram
ages = user_fields.map(lambda x: int(x[1])).collect()
# `density` is the supported spelling of the former `normed` flag
hist(ages, bins=20, color='lightblue', density=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)

Esempio n. 32

0

Mostra file

File: scrapping_news.py Progetto: fms-1988/noticias


# In[201]:


# Display the rows of df2 whose title contains "safrinha".  Comparing with
# `== True` turns any non-boolean (missing) match result into False so the
# mask can be used for indexing.
# (earlier experiment: df2.iloc[1,6])
mentions_safrinha = df2['titulo'].str.contains("safrinha") == True
df2[mentions_safrinha]


# ## Histogram: number of characters per news title

# In[8]:


# Character count of every title (titles are coerced to str first).
title_lengths = df['titulo'].astype(str).map(len)
df['titulo_len'] = title_lengths
plt.hist(df['titulo_len'], bins='auto', rwidth=0.85, alpha=0.7, color='#0504aa')
plt.grid(alpha=0.75, axis='y')
plt.xlabel('valor')
plt.ylabel('frequencia')
total_characters = df['titulo_len'].sum()
# The message reports the total so a translation cost can be estimated.
cost_note = 'traduzir 1 milhão de caracteres custa 6 usd. Total de caracteres: '
print(cost_note + str(total_characters))
# (title left disabled)
#plt.title('Histograma: numero de caracteres por titulo de noticia')


# ## Histogram: number of words per news title
#

# In[9]:


# Word count of every title: whitespace-split the stringified title.
df['titulo_word'] = df['titulo'].map(lambda title: len(str(title).split()))

Esempio n. 33

0

Mostra file

File: Chapter2.py Progetto: JuntaoZhang/learningspark

__author__ = '49236_000'

# Summarise the MovieLens 100k user file with Spark and plot a histogram of
# the user ages.

import numpy
import os
import sys
from pyspark import SparkContext
# Import pyplot under its conventional alias.  The previous alias (`hist`)
# bound the *module* to the name `hist`, so calling `hist(...)` raised
# "TypeError: 'module' object is not callable".
import matplotlib.pyplot as plt


# Sample record: u'1|24|M|technician|85711'
sc = SparkContext("local", "Chapter2")
user_data = sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()
# Split every pipe-delimited line into its list of fields.
user_fields = user_data.map(lambda line: line.split("|"))
num_user = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()

# Parenthesised single-argument form is valid on both Python 2 and Python 3.
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (
    num_user, num_genders, num_occupations, num_zipcodes))


# histogram
ages = user_fields.map(lambda x: int(x[1])).collect()
# `density=True` replaces the `normed=True` keyword, which newer matplotlib
# releases no longer accept.
plt.hist(ages, bins=20, color='lightblue', density=True)
# Use the imported alias; the bare name `matplotlib` was never bound by
# `import matplotlib.pyplot as ...`, so `matplotlib.pyplot.gcf()` raised
# NameError.
fig = plt.gcf()
fig.set_size_inches(16, 10)

Esempio n. 34

0

Mostra file

File: Script.py Progetto: Harshal131/Analyzing-Food-Wheels-data-using-Pandas-and-Matplotlib

# Mean and standard deviation of the order price for each month.
avg_order = orders.groupby('month').price.mean().reset_index()

std_order = orders.groupby('month').price.std().reset_index()

bar_heights = avg_order.price
bar_errors = std_order.price

# Discard any earlier figures *before* creating the axes.  Previously the axes
# were created first and then destroyed by close('all'), so the tick labels,
# y-label and title never appeared on the bar chart that was actually shown.
plt.close('all')
ax = plt.subplot()

# Draw the bars at positions 0..n-1 so they line up with the ticks set below.
# `bar_errors` is already the price Series; the old `bar_errors.price.values`
# raised AttributeError.
bar_positions = list(range(len(bar_heights)))
ax.bar(bar_positions,
       bar_heights.values,
       yerr=bar_errors.values,
       capsize=5)

ax.set_xticks(bar_positions)
# NOTE(review): the hard-coded labels assume exactly six months (April to
# September) in ascending order -- confirm against the `orders` data.
ax.set_xticklabels(['April', 'May', 'June', 'July', 'August', 'September'])
plt.ylabel('Average Amount')
plt.title('Amount over Time')
# Show once, after the chart has been fully labelled.
plt.show()

# Total amount spent by each customer.
customer_amount = orders.groupby('customer_id').price.sum().reset_index()

# Parenthesised single-argument form is valid on both Python 2 and Python 3.
print(customer_amount.head())

# Histogram of per-customer totals, restricted to the 0-200 range in 40 bins.
plt.hist(customer_amount.price.values, range=(0, 200), bins=40)
plt.xlabel('Total Spent')
plt.ylabel("Number of Customers")
plt.title('Customer Expenditure Over 6 Months')

plt.show()

Esempio n. 35

0

Mostra file

File: Titanic Data Analysis.py Progetto: YanhuaHe/Titanic-Data-Analysis

# Summary statistics of Age, Fare, Parch and SibSp, grouped by passenger sex.
titanic_dataframe.groupby('Sex')[['Age','Fare','Parch','SibSp']].describe()


# There are far more male passengers than female passengers in the data.

# In[7]:

# Take a closer look at the distribution of the age of passengers. It has a wide range of age span.
titanic_dataframe['Age'].describe() 


# In[70]:

# IPython line magic "pylab inline"; the cells below rely on the names it
# brings into scope (`plt`, `subplot`).
get_ipython().magic(u'pylab inline')
# Histogram of passenger ages; rows with a missing age are dropped first.
plt.hist(titanic_dataframe['Age'].dropna())
plt.xlabel("Passenger Age Range")
plt.ylabel("Count")


# In[71]:

# Side-by-side age histograms: female passengers on the left, male on the
# right.  A single figure is created; the previous duplicate plt.figure()
# call left an extra, empty figure behind.
fig = plt.figure()
# Use plt.subplot explicitly instead of the bare `subplot` name, which only
# exists when the pylab star-import has been executed.
fig2 = plt.subplot(1,2,1)
plt.xlabel("Female Age Range")
plt.ylabel("Passenger Count")
# Series.hist draws onto the currently active axes (the subplot selected above).
titanic_dataframe[titanic_dataframe['Sex']=="female"]['Age'].hist(bins=15)
fig2 = plt.subplot(1,2,2)
titanic_dataframe[titanic_dataframe['Sex']=="male"]['Age'].hist(bins=15)
plt.xlabel("Male Age Range")