コード例 #1
0
ファイル: cluster.py プロジェクト: smba/SPLPioneerPublic
def plot_commit_differences(project, log=False):
    """Plot a histogram of the gaps between consecutive commits of a project.

    Reads every ``Revision`` row for ``project`` from ``../database.db``,
    takes column 2 of each row as the commit time and plots the difference
    between each row's value and the value of the row that follows it.

    :param project: project name matched against ``Revision.project``.
    :param log: when true, draw the frequency axis on a logarithmic scale.
    """
    conn = sql.connect('../database.db')
    try:
        cursor = conn.cursor()
        # Bind the project name as a query parameter rather than splicing it
        # into the SQL text, so quotes in the name cannot alter the statement.
        # NOTE(review): assumes `sql` is a DB-API module using "?"
        # placeholders (e.g. sqlite3) -- confirm against the file's imports.
        cursor.execute("SELECT * FROM Revision WHERE project = ?",
                       (project,))
        commit_times = [row[2] for row in cursor]
    finally:
        # The query is read-only, so there is nothing to commit; just make
        # sure the connection is released even if the query fails.
        conn.close()

    # time until next commit
    gaps = [current - following
            for current, following in zip(commit_times, commit_times[1:])]

    plt.hist(gaps, alpha=0.75)
    plt.xlabel('time to next commit [ms]')
    plt.ylabel('frequency')
    plt.title("Histogram of time between two commits for '" + project + "'")

    if log:
        # NOTE(review): newer matplotlib releases renamed `nonposy` to
        # `nonpositive`; adjust if the installed version rejects it.
        plt.yscale('log', nonposy='clip')
    plt.grid(True)
    plt.show()
コード例 #2
0
def getCommentLengthsDistribution(comments):
    """Show a histogram of the length of every entry in ``comments``.

    Bin edges run from 0 to 490 in steps of 10.
    """
    lengths = [len(comments[position]) for position in range(len(comments))]
    bin_edges = np.arange(0, 500, 10)

    plt.hist(lengths, bins=bin_edges)
    plt.show()
コード例 #3
0
def main():
    """Flip a set of coins 100 times and save a histogram of the head counts.

    Calls ``create_coins(1000)`` once, then repeatedly ``flip_coins`` and
    ``count_heads``; the collected counts are written to ``coins.png``.
    """
    coins = create_coins(1000)
    head_counts = []
    for _ in range(100):
        flip_coins(coins)
        head_counts.append(count_heads(coins))

    # A single figure is enough: opening one per iteration left 100 empty
    # figures alive for no benefit.
    plt.figure()
    plt.hist(head_counts)
    plt.savefig('coins.png')
コード例 #4
0
def Histogram(filename):
    """Save a histogram of the 'Personal Injury' attribute read from a CSV.

    The attribute is loaded through ``ReadInAttributeFromCSV`` and the figure
    is written to ``visual`` before being closed.
    """
    injuries = ReadInAttributeFromCSV(filename, 'Personal Injury')

    plt.title('Annual Personal Injury')
    plt.hist(injuries, color='black')
    # Axis captions
    plt.xlabel('Seasons')
    plt.ylabel('Number of')

    plt.savefig('visual')
    plt.close()
コード例 #5
0
def histo(column):
    """Render a histogram of one column of ``mpg.csv`` and serve the image.

    :param column: name of the column of ``mpg.csv`` to plot.
    :return: the result of ``app.send_static_file`` for the histogram image.
    """
    mpg = pandas.read_csv("mpg.csv")
    plt.clf()
    # The header labels live in ``DataFrame.columns``.
    if column in mpg.columns:
        # Plot the column's values, not the column name itself.
        plt.hist(mpg[column])
        plt.title(column)
        plt.savefig("static/histo.png")
    else:
        print("There is no such an attribute in the given data.")
    # NOTE(review): for an unknown column this still serves whatever image a
    # previous request left on disk -- confirm that is the intended response.
    return app.send_static_file("static/histo.png")
コード例 #6
0
def feature_summary(x_col, y_col, show_r2=False):
    """Print and plot a summary of one feature against a target column.

    Prints the feature's quantiles, draws its histogram and -- when the
    feature and target names differ -- fits a linear regression of the target
    on the feature and draws the scatter plot with the fitted line. Finally
    plots a 10-sample rolling mean of the feature.

    :param x_col: feature column; its ``name``, ``quantile`` and ``rolling``
        are used.
    :param y_col: target column; its ``name`` and values are used.
    :param show_r2: when true, also print R² and ``feature_importance``.
    :return: the string ``" "``.
    """
    # Preparation
    x_name = x_col.name
    y_name = y_col.name
    df = pd.concat([x_col, y_col], axis=1).sort_index()
    plt.rcParams["figure.figsize"] = (10, 7)
    breaks(1)
    print("%s" % x_name)
    print('Quantile:\n', x_col.quantile([0.0, 0.1, 0.25, 0.5, 0.75, 1.0]))

    # Histogram
    plt.subplot(221)
    try:
        plt.hist(x_col, bins=30)
        plt.xlabel(x_name)
        plt.title('Histogram (CF GHP): %s' % x_name)
    except ValueError:
        print("No histogram for %s available" % x_name)

    # Correlation
    if y_name != x_name:
        df = df.sort_values(x_name)
        # `axis` must be passed by keyword: pandas 2 no longer accepts it
        # positionally in DataFrame.drop.
        x = df.drop(y_name, axis=1)
        # NOTE(review): `normalize` was removed from LinearRegression in
        # recent scikit-learn releases; left as-is to keep the fit unchanged.
        reg = linear_model.LinearRegression(normalize=True)
        reg.fit(x, df[y_name])
        predicted = reg.predict(x)
        # Plot
        plt.subplot(222)
        plt.scatter(df[x_name], df[y_name])
        plt.plot(df[x_name], predicted, color='g')
        plt.xlabel(x_name)
        plt.xlim([df[x_name].min(), df[x_name].max()])
        plt.title('x:%s / y:%s ' % (x_name, y_name))
        plt.ylabel("Target function: %s" % y_name)
        if show_r2:
            print("R²:", r2_score(df[y_name], reg.predict(x)))
            print(feature_importance(x, reg.coef_))

    # Show plots
    plt.show()

    # Timeline: rolling mean over 10 samples, x-axis fixed to 170000..175000
    x_col.rolling(window=10,
                  center=False).mean().plot(title='%s: Timeline' % x_name,
                                            figsize=(10, 2),
                                            xlim=(170000, 175000))
    plt.show()

    plt.close('all')
    return " "
コード例 #7
0
 def StatisticData(self):
     """Print the mean sample length and show a histogram of all lengths.

     The length of a sample is the first dimension of the input returned by
     ``GetData(i, mode='non-repetitive')``; bins are 20 wide.
     """
     binwidth = 20
     sample_count = sum(self.DataNum)
     lengths = []
     for index in range(sample_count):
         inputs, _labels = self.GetData(index, mode='non-repetitive')
         lengths.append(inputs.shape[0])
     print('mean value:', np.mean(lengths))
     # Edges start at the shortest length and extend past the longest one.
     edges = np.arange(min(lengths), max(lengths) + binwidth, binwidth)
     plt.hist(lengths, bins=edges)
     plt.show()
コード例 #8
0
def getCommentLengthsDistribution(comments):
    """Show a labelled histogram of the length of every entry in ``comments``.

    Bin edges run from 0 to 490 in steps of 10 and a dashed red line marks
    x = 200.
    """
    lengths = [len(comments[position]) for position in range(len(comments))]
    bin_edges = np.arange(0, 500, 10)

    plt.hist(lengths, bins=bin_edges)
    plt.xlabel('Number of Words in Comment')
    plt.ylabel('Comment Counts')
    plt.title('Histogram of Word Counts in Comments')
    plt.axvline(x=200, color='r', linestyle='dashed', linewidth=2)
    plt.show()
コード例 #9
0
def histogram_chart(plt, col, Ylabel="Frequency", Xlabel=None, Title="Histogram"):
    """Draw a histogram of ``col`` (missing values ignored) on ``plt``.

    :param plt: plotting object providing ``hist``, ``xlabel``, ``ylabel``
        and ``title``.
    :param col: data supporting ``dropna()``; it is not modified.
    :param Ylabel: y-axis caption; skipped when empty or ``None``.
    :param Xlabel: x-axis caption; skipped when empty or ``None``.
    :param Title: chart title.
    """
    # Work on a cleaned copy so the caller's data keeps its missing entries.
    values = col.dropna()

    plt.hist(values)

    if Ylabel:
        plt.ylabel(Ylabel)

    if Xlabel:
        plt.xlabel(Xlabel)

    plt.title(Title)
コード例 #10
0
def analyze_reads(tmp_cirseq_dir):
    """Extract and BLAST the read edges outside the aligned region.

    For a ``*.fasta`` file found under ``tmp_cirseq_dir`` and its companion
    ``<file>.blast`` table, computes per-sequence alignment bounds, cuts the
    5' and 3' edges with ``extract_location``, keeps reads whose two edges
    are both longer than 100, runs ``blast_seq`` on each edge, writes
    ``<file>.blast.edges.csv`` and saves ``plot.png``.

    NOTE(review): the ``return`` sits inside the ``for`` loop, so only the
    first file returned by ``glob`` is processed -- confirm this is intended.
    NOTE(review): the glob pattern is built by plain concatenation, so
    ``tmp_cirseq_dir`` presumably has to end with a path separator.

    :param tmp_cirseq_dir: directory prefix containing the fasta/blast files.
    :return: the filtered per-read DataFrame (``None`` if no file matches).
    """
    fasta_files = glob.glob(tmp_cirseq_dir + "*.fasta")
    records = {}
    for file in fasta_files:
        # Sequences keyed by read id -> single-column frame named 'seq'.
        records = parse_fasta(file)
        fasta_df = pd.DataFrame.from_dict(records, orient='index')
        fasta_df.index.name = 'id'
        fasta_df.columns = ['seq']
        blast_file = file + ".blast"
        blast_arr_names = [
            "sseqid", "qstart", "qend", "sstart1", "send1", "sstrand",
            "length", "btop", "sstart2", "send2"
        ]
        blast_df = pd.DataFrame()

        data = pd.read_csv(blast_file,
                           sep="\t",
                           header=None,
                           names=blast_arr_names)
        # Duplicate the start/end columns so min and max can be aggregated
        # side by side in one groupby.
        data["sstart2"] = data["sstart1"]
        data["send2"] = data["send1"]
        grouped_df = data.groupby("sseqid").agg({
            'sstart1': 'min',
            'sstart2': 'max',
            'send1': 'min',
            'send2': 'max'
        })
        # Overall smallest / largest coordinate across all four aggregates.
        grouped_df['sstart'] = grouped_df.min(axis=1)
        grouped_df['send'] = grouped_df.max(axis=1)
        # NOTE(review): DataFrame.append was removed in pandas 2; appending to
        # an empty frame here could be replaced by pd.concat.
        blast_df = pd.DataFrame.append(blast_df, grouped_df)
        blast_df = blast_df.join(fasta_df)
        # Sequence before the aligned region (5') and after it (3').
        blast_df['edge5'] = blast_df.apply(
            lambda x: extract_location(x["seq"], 0, x["sstart"]), axis=1)
        blast_df['edge3'] = blast_df.apply(
            lambda x: extract_location(x["seq"], x["send"], -1), axis=1)
        # Keep only reads whose two edges are both longer than 100.
        blast_df['edge5_100'] = blast_df['edge5'].apply(lambda x: len(x) > 100)
        blast_df['edge3_100'] = blast_df['edge3'].apply(lambda x: len(x) > 100)
        blast_df = blast_df[blast_df.edge3_100 != False]
        blast_df = blast_df[blast_df.edge5_100 != False]
        del blast_df['edge3_100']
        del blast_df['edge5_100']
        blast_df["blast5"] = blast_df.apply(
            lambda row: blast_seq(row["edge5"]), axis=1)
        blast_df["blast3"] = blast_df.apply(
            lambda row: blast_seq(row["edge3"]), axis=1)
        blast_df.to_csv(blast_file + ".edges.csv", sep=',', encoding='utf-8')
        # NOTE(review): the edge columns hold whatever extract_location
        # returns (len() is taken of them above); a histogram of their
        # lengths may have been intended -- confirm.
        plt.hist(blast_df["edge3"], bins=50)
        plt.hist(blast_df["edge5"], bins=50)
        plt.savefig(tmp_cirseq_dir + 'plot.png')
        return blast_df
コード例 #11
0
def graph(x,y,xLabel,yLabel,title,figname):
    """Save a histogram of ``x`` annotated with its mean and the value ``y``.

    Draws the histogram, a dashed vertical line at the mean of ``x``, a
    vertical line at ``y`` labelled "model accuracy", and a legend entry
    with the p-value of ``ttest_ind(x, [y])``. The figure is written to
    ``figname``.
    """
    plt.clf()
    plt.hist(x, color="c", edgecolor="k", alpha=0.5)
    sample_mean = np.array(x).mean()
    plt.axvline(sample_mean, color="k", linestyle="dashed", linewidth=3,
                label="average")
    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)

    # Vertical segment at x == y spanning heights 0..9.
    heights = np.arange(0, 10, 1)
    accuracy_xs = np.array([y] * 10)
    plt.plot(accuracy_xs, heights, label="model accuracy")

    # Second element of the t-test result is the p-value; the empty white
    # line only exists to put it into the legend.
    p_value = ttest_ind(x, [y])[1]
    plt.plot([], [], label=f"p-value: {np.round(p_value,4)}", color="w")
    plt.legend()
    plt.savefig(figname)
0
 def loss_store2(self, x_train, x_gene):
     """Pickle the training results and save the diagnostic figures.

     Writes ``x_train``, ``x_gene``, ``self.wdis_store`` and the four arrays
     returned by ``dwp`` as pickles under ``./result/``, and saves a
     histogram of ``x_gene``, the Wasserstein-distance curve and two
     dimension-wise scatter plots as JPEG files.

     :param x_train: training data, pickled and passed to ``dwp``.
     :param x_gene: generated data, pickled, plotted and passed to ``dwp``.
     """

     def dump(obj, path):
         # Pickle `obj` to `path`; the with-block closes the file.
         with open(path, 'wb') as fp:
             pickle.dump(obj, fp)

     def save_scatter(xs, ys, title, path):
         # One real-vs-generated scatter figure, saved and closed.
         plt.scatter(xs, ys)
         plt.title(title)
         plt.xlabel('Real data')
         plt.ylabel('Generated data')
         plt.savefig(path)
         plt.close()

     dump(x_train, './result/genefinalfig/x_train.pickle')
     dump(x_gene, './result/genefinalfig/generated.pickle')

     bins = 100
     plt.hist(x_gene, bins, facecolor='red', alpha=0.5)
     plt.title('Histogram of distribution of generated data')
     plt.xlabel('Generated data value')
     plt.ylabel('Frequency')
     plt.savefig(
         './result/genefinalfig/WGAN-Generated-data-distribution.jpg')
     plt.close()

     dump(self.wdis_store, './result/lossfig/wdis.pickle')
     t = arange(len(self.wdis_store))
     plt.plot(t, self.wdis_store, 'r--')
     plt.xlabel('Iterations')
     plt.ylabel('Wasserstein distance')
     plt.savefig('./result/lossfig/WGAN-W-distance.jpg')
     plt.close()

     rv_pre, gv_pre, rv_pro, gv_pro = dwp(x_train, x_gene, self.testX,
                                          self.db)
     # Parenthesised single-argument form prints the same text on Python 2
     # and Python 3.
     print('Totally ' + str(len(rv_pre)) + ' of coordinates are left')
     dump(rv_pre, './result/genefinalfig/rv_pre.pickle')
     dump(gv_pre, './result/genefinalfig/gv_pre.pickle')
     dump(rv_pro, './result/genefinalfig/rv_pro.pickle')
     dump(gv_pro, './result/genefinalfig/gv_pro.pickle')

     # The raw values are pickled above; noise is only added for plotting.
     # All four are processed before any plotting, in the original order.
     rv_pre = fig_add_noise(rv_pre)
     gv_pre = fig_add_noise(gv_pre)
     rv_pro = fig_add_noise(rv_pro)
     gv_pro = fig_add_noise(gv_pro)
     save_scatter(rv_pre, gv_pre, 'Dimension-wise prediction, lr',
                  './result/genefinalfig/WGAN-dim-wise-prediction.jpg')
     save_scatter(rv_pro, gv_pro, 'Dimension-wise probability, lr',
                  './result/genefinalfig/WGAN-dim-wise-probability.jpg')
コード例 #13
0
def histogram(df, nbin, name, k):
    """Show a normalised green histogram of the ``'Mean_' + name`` column.

    :param df: table indexed by column label.
    :param nbin: bin specification passed straight to ``plt.hist``.
    :param name: suffix of the column to plot; also used in the title.
    :param k: word used (pluralised with "s") at the end of the title.
    """
    plt.figure()
    plt.hist(df['Mean_' + name], nbin,
             density=True, facecolor='g', alpha=0.75)
    heading = 'Histogram of ' + name + ' ' + k + 's'
    plt.title(heading)
    plt.grid(True)
    plt.show()
コード例 #14
0
def create_individual_statistics_panel(message_df, title, participants,
                                       colour_palette):
    """Report the longest message and show a histogram of message lengths.

    Only rows whose 'Type' is 'Message' are considered. The histogram covers
    the shortest 95% of the lengths, with one bin per distinct length.
    ``title``, ``participants`` and ``colour_palette`` are not used here.
    """
    is_message = message_df['Type'] == 'Message'
    all_messages = message_df.loc[is_message,
                                  'Message'].reset_index()['Message']

    lengths = [len(text) for text in all_messages]
    print("The maximum length message you've ever sent is: {}".format(
        max(lengths)))
    longest_position = lengths.index(max(lengths))
    print("\nThe message was: \n {}".format(
        all_messages.loc[longest_position]))

    lengths.sort()

    # Drop the longest 5% so the bulk of the distribution stays readable.
    cutoff = int(len(lengths) * 0.95)
    trimmed = lengths[:cutoff]
    plt.hist(trimmed, bins=len(set(trimmed)))
    plt.show()
コード例 #15
0
ファイル: gmapper.py プロジェクト: SkBlaz/GeneMapper
    def display(self, data, candidates, fname, display):
        """Plot the candidate frequencies over a histogram of ``data``.

        Splits the candidate positions into two halves, derives a few spread
        statistics from them and annotates the figure with a "one
        distribution" / "two distributions" verdict. The figure is saved next
        to ``fname`` with the last three characters replaced by ``png``.

        :param data: values for the background histogram.
        :param candidates: sequence of ``(position, frequency)`` pairs.
        :param fname: input file name; used in the title and output name.
        :param display: when truthy, also show the figure interactively.
        """
        finallist = [c[0] for c in candidates]
        # Floor division keeps the slice index an int on Python 3 as well.
        half = len(finallist) // 2
        part1 = finallist[:half]
        part2 = finallist[half:]

        meandiff = int(np.sqrt(np.power(np.mean(part2), 2)
                               - np.power(np.mean(part1), 2)))
        rangeA = max(part1) - min(part1)
        rangeB = max(part2) - min(part2)
        span = int((rangeA + rangeB) / 2)
        dspan = int(meandiff / span)
        theta = float(meandiff / (rangeA + rangeB))
        if (dspan > 3 and meandiff > 20) or meandiff > 36:
            verdict = "Two distributions"
        else:
            verdict = "One distribution"
        # NOTE(review): theta is formatted with %d, which truncates it.
        oneortwo = (verdict + " \n\n MD: %d \n Span: %d \n Dspan: %d \n theta: %d"
                    % (meandiff, span, dspan, theta))

        cans = np.array(candidates)
        plt.plot(cans[:, 0], cans[:, 1], 'ro')
        # Guide lines at 25%, 50% and 75% of the maximum frequency.
        plt.axhline(max(cans[:, 1]) / 4, color='r')
        plt.axhline(max(cans[:, 1] / 2), color='r')
        plt.axhline(int(max(cans[:, 1])) * 0.75, color='r')
        red_patch = mpatches.Patch(
            color='red', label='75%, 50% and 25% \nof maximum frequency')
        plt.legend(handles=[red_patch])
        plt.ylabel('Frequency of occurence')
        plt.xlabel('separate items')
        plt.title('Frequency distribution estimation graph: %s' % (fname))
        plt.text(max(data) * 1.1, max(cans[:, 1]) * 0.62, oneortwo,
                 fontsize=11, color='r')
        plt.hist(data, range(int(min(data)), int(max(data)), 1))
        ofile = fname[0:-3] + "png"
        # Format first, then print: the old `print (...) % (...)` form
        # applies % to print's return value on Python 3.
        print("Writing outfile: %s" % (ofile))
        plt.savefig(ofile, bbox_inches='tight')
        if display:
            plt.show()
        return
コード例 #16
0
    def plot_histogram(self):
        """Method to output a histogram of the instance variable data using
        matplotlib pyplot library.

        Args:
            None

        Returns:
            None
        """
        # Plot the data itself: `self.mean + self.stdev` does not describe
        # the data set this method is documented to plot.
        plt.hist(self.data, 50, density=True, facecolor='b', alpha=0.75)

        plt.xlabel('x-axis label')
        plt.ylabel('y-axis label')
        plt.title('Histogram Title')
        # Axis limits are left to matplotlib; the data values are not a valid
        # [xmin, xmax, ymin, ymax] argument for plt.axis.
        plt.show()
コード例 #17
0
def get_graph(n, title):
    """Draw a density histogram of squared norms of n-dimensional normals.

    Draws ``N`` (module-level) samples from the ``n``-dimensional standard
    normal distribution, histograms their squared Euclidean norms and marks
    the two bounds returned by ``get_2_std_estimates`` with red lines.

    :param n: dimension of each sample.
    :param title: figure title.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)
    # `density` is a boolean flag; the return value is not needed, and
    # unpacking it into `n` used to overwrite the parameter.
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()
コード例 #18
0
def plot_per_historgram(per_path:str, save_path:str=None):
    """
    Plot PER values as a 10-bin histogram over [0, 1] and save the figure.

    Args:
        per_path (str): path to a comma-separated file whose second column
            holds the PER value of each row (the first column is not read)
        save_path (str): path to save the histogram plot; defaults to
            "PER_histogram.png" when omitted
    """
    import csv
    import matplotlib.pyplot as plt
    import numpy as np

    # newline='' lets the csv module do its own newline handling, as its
    # documentation recommends.
    with open(per_path, 'r', newline='') as fid:
        per_list = [float(row[1]) for row in csv.reader(fid, delimiter=',')]

    plt.hist(per_list, bins=10, range=(0.0, 1.0))
    plt.title("histogram of 2020-10-29 model PER values")
    plt.xlabel("PER bins")
    plt.ylabel("# of records")
    plt.xticks(np.arange(0, 1.1, step=0.1))
    if save_path is None:
        save_path = "PER_histogram.png"
    plt.savefig(save_path)
コード例 #19
0
from matplotlib import pyplot as plt


# In[11]:


# Draw one histogram per column of dataset2 on a 6x5 grid of subplots.
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns', fontsize=20)
for i, column_name in enumerate(dataset2.columns.values, start=1):
    column = dataset2.iloc[:, i - 1]
    plt.subplot(6, 5, i)
    f = plt.gca()
    f.get_yaxis().set_visible(False)
    f.set_title(column_name)
    # One bin per distinct value in the column.
    vals = column.unique().size
    plt.hist(column, bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])


# In[12]:


# Columns selected for the pie-chart plots.
# NOTE(review): 'is_referred' is listed twice, so the selection contains that
# column twice -- confirm whether the duplicate is intended.
dataset2 = dataset[[
    'housing',
    'is_referred',
    'app_downloaded',
    'web_user',
    'app_web_user',
    'ios_user',
    'android_user',
    'registered_phones',
    'payment_type',
    'waiting_4_loan',
    'cancelled_loan',
    'received_loan',
    'rejected_loan',
    'zodiac_sign',
    'left_for_two_month_plus',
    'left_for_one_month',
    'is_referred',
]]


# In[13]:
コード例 #20
0
# Using strings and lists to summarise total number of species in the data set
# https://stackoverflow.com/questions/997797/what-does-s-mean-in-python

species_list = list(data["Species"].unique())
print("Types of species: %s\n" % species_list)

# Create a DataFrame to structure the data correctly
# Reference: http://www.datasciencemadesimple.com/get-list-column-headers-column-name-python-pandas/
# NOTE(review): `data` is indexed by label above and by position here; confirm
# which kind of object it is at this point.

d = {
    "SepalLengthCm": data[:, 0],
    "SepalWidthCm": data[:, 1],
    "PetalLengthCm": data[:, 2],
    # The key must match the name in `columns` below, otherwise the column
    # comes out empty.
    "PetalWidthCm": data[:, 3],
    "Species": data[:, 4]
}

df = pd.DataFrame(d,
                  columns=[
                      "SepalLengthCm", "SepalWidthCm", "PetalLengthCm",
                      "PetalWidthCm", "Species"
                  ])
df

# Build on Tutorial and plot the data on a histogram
import matplotlib.pyplot as pl

pl.hist(firstcol)
# `show` has to be called; the bare attribute reference displayed nothing.
pl.show()
コード例 #21
0
# The plotting functions (scatter, hist, imshow, show) live in
# matplotlib.pyplot, not in the top-level matplotlib package.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# `.values` replaces DataFrame.as_matrix(), which newer pandas no longer has.
dat=pd.read_csv('Voters.csv').values
x=dat[:,0]
y=dat[:,1]
plt.scatter(x,y)
plt.show()
plt.hist(x)
plt.hist(y,bins=15)


#images
train=pd.read_csv('test.csv')
M=train.values
# First row without its leading column, viewed as a 28x28 image.
im=M[0,1:]
im=im.reshape(28,28)
plt.imshow(im)
plt.show()
plt.imshow(im,cmap="gray")


from scipy.stats import norm
norm.pdf(0)
norm.pdf(0,loc=5, scale=10)
r=np.random.randn(10)
norm.pdf(r)
norm.cdf(r)
r=10*np.random.randn(10000)+5
コード例 #22
0
import matplotlib as plt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from math import sqrt,pi,exp


# In[35]:


# Generate 1000 random integers in [0, 10) and plot their histogram.

A = np.random.randint(0, 10, 1000)
figure, axe = plt.subplots(figsize=(10, 4))
axe.hist(A, bins=19, color='red')
axe.set_title("Histogram of the dataset")
axe.set_xlabel("Number")
axe.set_ylabel("Frequency")


# In[36]:


# Compute the mean/max/mode

# Mean: the sum of all the elements divided by the number of elements.

mean = sum(A) / len(A)
コード例 #23
0
    data[genre] = [genre in movie.split('|') for movie in data.genres]
         
data.head()

# Strip the last seven characters from every title.
data['title'] = [t[0:-7] for t in data.title]
data.head()

data[['score', 'runtime', 'year', 'votes']].describe()

# Parenthesised form prints the same value on Python 2 and Python 3.
print(len(data[data.runtime == 0]))

# Treat a runtime of 0 as missing. Assign through .loc so the write always
# lands in `data` itself rather than in a temporary copy.
data.loc[data.runtime == 0, 'runtime'] = np.nan

data.runtime.describe()

plt.hist(data.year, bins=np.arange(1950, 2013), color='#cccccc')
plt.xlabel("Release Year")
remove_border()
# NOTE: if this raises
#   AttributeError: 'module' object has no attribute 'hist'
# (or the same for 'xlabel'), `plt` is presumably bound to the top-level
# `matplotlib` package; it has to be `matplotlib.pyplot`
# (`import matplotlib.pyplot as plt`).

plt.hist(data.score, bins=20, color='#cccccc')
plt.xlabel("IMDB rating")
remove_border()
# The same import note applies to this histogram.

plt.scatter(data.year, data.score, lw=0, alpha=.08, color='k')
コード例 #24
0
# Write the code necessary to create a cross tabulation of the number of titles by department.
# (Hint: this will involve a combination of SQL and python/pandas code)

dept_query = """SELECT * FROM departments join dept_emp on dept_emp.dept_no = departments.dept_no"""
dept_and_dept_emp = pd.read_sql(dept_query, url)
dept_and_dept_emp
t_dept_dept_emp = pd.merge(titles, dept_and_dept_emp)
t_dept_name = t_dept_dept_emp[['title', 'dept_name']]
# Count the frame built on the previous line (`titles_dept_name` was never
# defined).
t_dept_name.groupby('dept_name').count()

joined = employees.join(titles.set_index('emp_no'),on='emp_no')
# Keep the rows whose to_date does not contain '9999'.
mask_current = joined['to_date'].apply(lambda x: '9999' not in str(x))
# Copy so the new column is added to an independent frame, not to a slice
# of `joined`.
changed = joined[mask_current].copy()
changed["diff"] = changed["to_date"] - changed["from_date"]
plt.hist(changed['diff'].apply(lambda x: x.days/365),bins=6)
plt.xlabel('Years')
plt.ylabel('# of Employees')
plt.title('Frequency of Job Changes')
plt.show()



# In[103]:


# Use your get_db_url function to help you explore the data from the chipotle database.
# Use the data to answer the following questions:

database_name = "chipotle"
orders_query = """SELECT * FROM orders"""
コード例 #25
0
# IPython log file

import pandas as pd
import matplotlib as plt
import matplotlib.pyplot as plt

df = pd.read_csv("http://www.biostat.jhsph.edu/~rpeng/useRbook/faithful.csv")

# Scatter of the two columns against each other.
plt.plot(df["eruptions"], df["waiting"], "b.")
plt.title("eruptions vs waiting")
plt.savefig("scatter.png")
plt.clf()

# One histogram file per column, clearing the figure after each save.
for column in ("eruptions", "waiting"):
    plt.hist(df[column])
    plt.savefig(column + ".png")
    plt.clf()
コード例 #26
0
        # "Else" reject, though nothing to write

        # Store iterate
        chain[i, 0] = ll
        chain[i, 1:] = param

# In[11]:
'''
Graphs
'''
import matplotlib.pyplot as plt

# Histograms of chain columns 1 and 2, using only the second half of the
# iterates.
second_half = chain[int(n_iterates / 2):]

plt.figure()
plt.hist(second_half[:, 1])
plt.title('mu frequency')
print('mu = ' + str(mu))

plt.figure()
plt.hist(second_half[:, 2])
plt.title('sd frequency')
print('sd = ' + str(sd))

plt.show()

# In[16]:
'''
Graphs alternative

import matplotlib.pyplot as plt
コード例 #27
0
import numpy as np
# xlabel/ylabel/hist/savefig/show are pyplot functions; the top-level
# matplotlib package does not provide them.
import matplotlib.pyplot as plt

# Draw three Poisson(6) samples of size 1000; store each sample as text and
# save its normalised histogram.
for i in range(3):
    p = np.random.poisson(lam=6, size=1000)
    np.savetxt("Data Sets/Poisson data set " + str(i), p, delimiter=",")
    plt.xlabel("x")
    plt.ylabel("P(X)=x")
    plt.hist(p, density=True)
    plt.savefig("Graphs/Poisson/Poisson " + str(i) + ".png")
    plt.show()
コード例 #28
0
import numpy as np
# hist/show are pyplot functions; the top-level matplotlib package does not
# provide them.
import matplotlib.pyplot as plt
np.random.seed(123)

# Simulate random walk 500 times
all_walks = []
for i in range(500):
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            # Step down, but never below 0.
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            # On a 6, roll again and move up by that amount.
            step = step + np.random.randint(1, 7)
        # With probability 0.001 the walk is reset to 0.
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    all_walks.append(random_walk)

# Create and plot np_aw_t
np_aw_t = np.transpose(np.array(all_walks))

# Select last row from np_aw_t: ends
ends = np_aw_t[-1, :]

# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()
# <a id='eda'></a>
# ## Exploratory Data Analysis
#
# > **Tip**: Now that you've trimmed and cleaned your data, you're ready to move on to exploration. Compute statistics and create visualizations with the goal of addressing the research questions that you posed in the Introduction section. It is recommended that you be systematic with your approach. Look at one variable at a time, and then follow it up by looking at relationships between variables.
#
# ### Question 1: What is the independent variable? What is the dependent variable?

# Gender,ScheduledDay,AppointmentDay, Age, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap, SMS_received and WeekDay are independent variables.
# Whether the patient was a no-show (the No-show column) is the dependent variable.
#

# ### Question 2: What are the no show record differences between female and male?

# In[17]:

# Histogram of the Gender column; int(180 / 50) evaluates to 3 bins.
plt.hist(df['Gender'], color='blue', edgecolor='black', bins=int(180 / 50))
plt.title('Histogram of Gender distribution')
plt.xlabel('Gender')
plt.ylabel('Frequency')

# There are around 70000 female patients and 40000 male patients in the record.

# In[18]:

# Boolean masks over the NoShow column.
# NOTE(review): the rows where NoShow == "Yes" are bound to `attend`; if
# "Yes" means the patient did not show up, the two names are swapped --
# confirm against the data dictionary.
attend = df.NoShow == "Yes"
not_attend = df.NoShow == "No"

# In[19]:

plt.hist(df.Gender[attend],
コード例 #30
0
def main():
    """Compare distance metrics for histogram-based face retrieval.

    Loads the face data, splits it into train and test sets, turns every
    image (one column of the data matrix) into an intensity histogram, and
    then, for each distance metric and each PCA dimension, measures
    retrieval quality within the test set on the PCA and PCA-LDA
    projections. One performance figure per metric is saved under the
    figure's title.
    """
    #### LOAD FACE DATA
    face_data, face_label = load_face_data('face(1).mat')

    #### PARTITION DATA INTO TRAIN AND TEST SET
    X_train, X_test, Y_train, Y_test = partition_data(face_data,
                                                      face_label,
                                                      show='no')

    #### OBTAIN ORIGINAL AND NORMALIZED FEATURE VECTORS
    # The results are not used below; the calls are kept in case the helper
    # has effects of its own.
    original_train, norm_train = get_original_normalized_feature_vectors(
        X_train, show='no')
    original_test, norm_test = get_original_normalized_feature_vectors(
        X_test, show='no')

    #### DISTANCE DEFINITIONS
    L1_NN = NearestNeighbors(n_neighbors=200, metric='minkowski',
                             p=1)  # manhattan l1
    L2_NN = NearestNeighbors(n_neighbors=200, metric='minkowski',
                             p=2)  # euclidean l2
    Linf_NN = NearestNeighbors(n_neighbors=200,
                               metric='chebyshev')  # chessboard/chebyshev linf
    earthmover = NearestNeighbors(n_neighbors=200,
                                  metric=wasserstein_distance)  # wasserstein
    intersection = NearestNeighbors(
        n_neighbors=200, metric=histogram_intersection)  # intersection
    chisquare = NearestNeighbors(n_neighbors=200, metric=chi)
    kldiv = NearestNeighbors(n_neighbors=200, metric=kl)
    js = NearestNeighbors(n_neighbors=200, metric=distance.jensenshannon)

    #### HISTOGRAM
    bin_list = np.arange(0, 270, 10).tolist()  # Create a bin list from 0-260
    # It was found empirically that test images' pixel intensities ranged from 0 to ~260
    # Assuming uniform quantisation

    print("List of bins:", '\n', bin_list, '\n')

    def column_histograms(images):
        # One histogram per column, computed with np.histogram instead of
        # drawing (and closing) a matplotlib figure per image. Counts are
        # cast to float, which is the type plt.hist reports them in.
        return np.asarray([
            np.histogram(images[:, col], bins=bin_list)[0].astype(float)
            for col in range(images.shape[1])
        ])

    X_hist_test = column_histograms(X_test)
    X_hist_train = column_histograms(X_train)

    methods = [
        L2_NN, L1_NN, Linf_NN, earthmover, intersection, chisquare, kldiv, js
    ]
    method_name = [
        'L2', 'L1', 'Linf_NN', 'Earthmover', 'Intersection', 'Chi-Square',
        'K-L Divergence', 'JS'
    ]
    M_pca_list = [4, 8, 12, 16, 22, 26]  # max of 26 as there are 26 bins

    recall_levels = 11
    M_lda = 10
    # NOTE(review): M_lda exceeds the smallest PCA dimension (4); confirm the
    # installed scikit-learn accepts n_components above the feature count.
    lda = LinearDiscriminantAnalysis(n_components=M_lda)

    for name, method in zip(method_name, methods):

        Mpca_list = []
        mAP_pca_list = []
        mAP_lda_list = []

        acc1_pca_list = []
        acc1_lda_list = []

        acc10_pca_list = []
        acc10_lda_list = []

        for M_pca in M_pca_list:

            pca = PCA(n_components=M_pca)

            train_pca = pca.fit_transform(X_hist_train)
            test_pca = pca.transform(X_hist_test)

            # Only the fitted LDA is needed; the transformed training data
            # was never used.
            lda.fit(train_pca, Y_train)
            test_lda = lda.transform(test_pca)

            # Retrieval is evaluated within the test set: the index is built
            # on the test projections and queried with the same points.
            method.fit(test_pca)
            method_nbrs_pca = np.asarray(method.kneighbors(test_pca))
            method_map_pca, method_df_pca, acc1_pca, acc10_pca = calculate_map(
                method_nbrs_pca, Y_test, recall_levels)

            method.fit(test_lda)
            method_nbrs_lda = np.asarray(method.kneighbors(test_lda))
            method_map_lda, method_df_lda, acc1_lda, acc10_lda = calculate_map(
                method_nbrs_lda, Y_test, recall_levels)

            print(name, ", Mpca =", M_pca, "PCA mAP:",
                  method_map_pca, ",Acc@1:", acc1_pca, ",Acc@10:", acc10_pca)
            print(name, ", Mpca =", M_pca, "PCA-LDA mAP:",
                  method_map_lda, ",Acc@1:", acc1_lda, ",Acc@10:", acc10_lda)

            Mpca_list.append(M_pca)
            mAP_pca_list.append(method_map_pca)
            mAP_lda_list.append(method_map_lda)

            acc1_pca_list.append(acc1_pca)
            acc1_lda_list.append(acc1_lda)
            acc10_pca_list.append(acc10_pca)
            acc10_lda_list.append(acc10_lda)

        plt.figure(figsize=(10, 10))

        # (values, colour, legend label, marker), drawn in this order.
        curves = [
            (mAP_pca_list, 'red', 'PCA mAP', 'o'),
            (mAP_lda_list, 'red', 'PCA-LDA mAP', 'x'),
            (acc1_pca_list, 'blue', 'PCA Acc@rank1', 'o'),
            (acc1_lda_list, 'blue', 'PCA-LDA Acc@rank1', 'x'),
            (acc10_pca_list, 'green', 'PCA Acc@rank10', 'o'),
            (acc10_lda_list, 'green', 'PCA-LDA Acc@rank10', 'x'),
        ]
        for values, colour, label, marker in curves:
            plt.plot(Mpca_list, values, color=colour, label=label,
                     marker=marker)

        plt.grid(color='black', linestyle='-',
                 linewidth=0.1)  # parameters for plot grid
        title_name = name + " " + 'Histogram' + ' PCA and PCA-LDA Performance'
        plt.title(title_name).set_position([0.5, 1.05])
        plt.xlabel('Mpca')
        plt.ylabel('mAP, Accuracy')
        plt.legend(loc='best')

        # The figure is written to a file named after its title.
        plt.savefig(title_name)
        plt.close()

        print("		")
コード例 #31
0
ファイル: Chapter2.py プロジェクト: ssyue/learningspark-1
__author__ = '49236_000'

import numpy
import os
import sys
from pyspark import SparkContext
# `hist` has to be the pyplot function (the module itself is not callable),
# and `matplotlib.pyplot` must be bound for the gcf() call below.
import matplotlib.pyplot
from matplotlib.pyplot import hist

# Sample record: u'1|24|M|technician|85711'
sc = SparkContext("local", "Chapter2")
user_data = sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()
user_fields = user_data.map(lambda line: line.split("|"))
num_user = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()

# Parenthesised form prints the same text on Python 2 and Python 3.
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (
    num_user, num_genders, num_occupations, num_zipcodes))

# Histogram of the age field (second field of every record).
ages = user_fields.map(lambda x: int(x[1])).collect()
# `density` replaces the `normed` keyword that matplotlib has removed.
hist(ages, bins=20, color='lightblue', density=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
コード例 #32
0
ファイル: scrapping_news.py プロジェクト: fms-1988/noticias

# In[201]:


#df2.iloc[1,6]
# Rows of df2 whose title contains "safrinha" (missing titles compare unequal
# to True and are therefore left out).
mentions_safrinha = df2['titulo'].str.contains("safrinha") == True
df2[mentions_safrinha]


# ## Histogram: number of characters per news title

# In[8]:


df['titulo_len'] = df['titulo'].astype(str).map(len)
plt.hist(x=df['titulo_len'], bins='auto', color='#0504aa', alpha=0.7,
         rwidth=0.85)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('valor')
plt.ylabel('frequencia')
total_characters = df['titulo_len'].sum()
print('traduzir 1 milhão de caracteres custa 6 usd. Total de caracteres: '
      + str(total_characters))


# ## Histogram: number of words per news title
#

# In[9]:


def _count_words(title):
    """Return the number of whitespace-separated words in ``str(title)``."""
    return len(str(title).split())


df['titulo_word'] = df['titulo'].apply(_count_words)
コード例 #33
0
ファイル: Chapter2.py プロジェクト: JuntaoZhang/learningspark
__author__ = '49236_000'

import numpy
import os
import sys
from pyspark import SparkContext
# `hist` has to be the pyplot function (the module itself is not callable),
# and `matplotlib.pyplot` must be bound for the gcf() call below.
import matplotlib.pyplot
from matplotlib.pyplot import hist


# Sample record: u'1|24|M|technician|85711'
sc=SparkContext("local","Chapter2")
user_data=sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()
user_fields=user_data.map(lambda line: line.split("|"))
num_user=user_fields.map(lambda fields: fields[0]).count()
num_genders=user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes=user_fields.map(lambda  fields: fields[4]).distinct().count()

# Parenthesised form prints the same text on Python 2 and Python 3.
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (num_user,num_genders,num_occupations,num_zipcodes))


# Histogram of the age field (second field of every record).
ages = user_fields.map(lambda x: int(x[1])).collect()
# `density` replaces the `normed` keyword that matplotlib has removed.
hist(ages, bins=20, color='lightblue', density=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)

# Mean and standard deviation of the order price per month.
avg_order = orders.groupby('month').price.mean().reset_index()

std_order = orders.groupby('month').price.std().reset_index()

bar_heights = avg_order.price
# Already the price column: it is passed to `yerr` as-is below.
bar_errors = std_order.price

# Start from a clean slate, then create the axes that is actually drawn on.
# Closing the figures after creating the axes left the tick labels on a
# figure that was never shown.
plt.close('all')
ax = plt.subplot()
# NOTE(review): bars are placed at positions 0..n-1 so they line up with the
# tick labels below; this assumes the grouped months come out in April to
# September order -- confirm against the data.
plt.bar(range(len(bar_heights)),
        bar_heights.values,
        yerr=bar_errors.values,
        capsize=5)
ax.set_xticks(range(len(bar_heights)))
ax.set_xticklabels(['April', 'May', 'June', 'July', 'August', 'September'])
plt.ylabel('Average Amount')
plt.title('Amount over Time')
plt.show()

# Total amount spent by each customer.
customer_amount = orders.groupby('customer_id').price.sum().reset_index()

# Parenthesised form prints the same text on Python 2 and Python 3.
print(customer_amount.head())

plt.hist(customer_amount.price.values, range=(0, 200), bins=40)
plt.xlabel('Total Spent')
plt.ylabel("Number of Customers")
plt.title('Customer Expenditure Over 6 Months')

plt.show()
コード例 #35
0
titanic_dataframe.groupby('Sex')[['Age','Fare','Parch','SibSp']].describe()


# There are far more male passengers than female passengers in the data.

# In[7]:

# Take a closer look at the distribution of the age of passengers. It has a wide range of age span.
titanic_dataframe['Age'].describe()


# In[70]:

get_ipython().magic(u'pylab inline')
# Passengers without a recorded age are left out of the histogram.
plt.hist(titanic_dataframe['Age'].dropna())
plt.xlabel("Passenger Age Range")
plt.ylabel("Count")


# In[71]:

# One figure with the female and male age histograms side by side (creating
# the figure twice left an extra, empty figure behind).
fig = plt.figure()
fig2 = subplot(1,2,1)
plt.xlabel("Female Age Range")
plt.ylabel("Passenger Count")
titanic_dataframe[titanic_dataframe['Sex']=="female"]['Age'].hist(bins=15)
fig2 = subplot(1,2,2)
titanic_dataframe[titanic_dataframe['Sex']=="male"]['Age'].hist(bins=15)
plt.xlabel("Male Age Range")