def plot_commit_differences(project, log=False):
    """Plot a histogram of the time between consecutive commits of a project.

    Reads every ``Revision`` row whose ``project`` column equals ``project``
    from ``../database.db``, takes column 2 of each row as the commit time and
    histograms the difference between each row and the one that follows it.

    Args:
        project: Project name matched against ``Revision.project``.
        log: When true, the frequency axis is drawn on a log scale.
    """
    conn = sql.connect('../database.db')
    try:
        c = conn.cursor()
        # Bind the project name instead of pasting it into the SQL text so a
        # quote in the name can neither break nor alter the query.
        # NOTE(review): "?" is the sqlite3 placeholder style; this assumes
        # `sql` is the sqlite3 module - confirm against the file's imports.
        c.execute("SELECT * FROM Revision WHERE project = ?", (project,))
        xs = [row[2] for row in c]
    finally:
        # The original never released the connection.
        conn.close()

    # time until next commit: each row's value minus the following row's value
    zs = [current - following for current, following in zip(xs, xs[1:])]

    plt.hist(zs, alpha=0.75)
    plt.xlabel('time to next commit [ms]')
    plt.ylabel('frequency')
    plt.title("Histogram of time between two commits for '" + project + "'")
    if log:
        # NOTE(review): newer matplotlib releases renamed `nonposy` to
        # `nonpositive`; kept as written for the version this file targets.
        plt.yscale('log', nonposy='clip')
    plt.grid(True)
    plt.show()
def getCommentLengthsDistribution(comments):
    """Show a histogram of ``len()`` of every entry of ``comments``.

    Bin edges run from 0 in steps of 10 and stop before 500.
    """
    lengths = [len(comments[pos]) for pos in range(len(comments))]
    bin_edges = np.arange(0, 500, 10)
    plt.hist(lengths, bins=bin_edges)
    plt.show()
def main():
    """Run 100 flip/count rounds and save a histogram of the counts.

    The coins come from ``create_coins(1000)``; every round calls
    ``flip_coins`` on them and records ``count_heads``.  The histogram is
    written to ``coins.png``.
    """
    coins = create_coins(1000)

    def play_round():
        # One round: flip in place, then report the head count.
        flip_coins(coins)
        return count_heads(coins)

    store = [play_round() for _ in xrange(100)]

    plt.figure()
    plt.hist(store)
    plt.savefig('coins.png')
def Histogram(filename):
    """Save a histogram of the 'Personal Injury' attribute to ``visual``.

    The values are read with ``ReadInAttributeFromCSV(filename, ...)`` and the
    current figure is closed after saving.
    """
    injuries = ReadInAttributeFromCSV(filename, 'Personal Injury')

    plt.title('Annual Personal Injury')
    plt.hist(injuries, color='black')
    plt.xlabel('Seasons')
    plt.ylabel('Number of')

    plt.savefig('visual')
    plt.close()
def histo(column):
    """Render a histogram of one column of ``mpg.csv`` and return it.

    Args:
        column: Name of the column to plot.

    Returns:
        The result of ``app.send_static_file`` for the histogram image.
    """
    mpg = pandas.read_csv("mpg.csv")
    plt.clf()
    # `DataFrame.columns` (plural) holds the column labels; `mpg.column`
    # raised AttributeError unless the CSV happened to have a "column" field.
    if column in mpg.columns:
        # Plot the column's values, not the column name string itself.
        plt.hist(mpg[column])
        plt.title(column)
        plt.savefig("static/histo.png")
    else:
        print("There is no such an attribute in the given data.")
    # NOTE(review): when the column is unknown no new image is written, so a
    # previously saved file (if any) is what gets served.  Also, if `app` is a
    # Flask application, send_static_file expects a path relative to the
    # static folder ("histo.png") - confirm before changing.
    return app.send_static_file("static/histo.png")
def feature_summary(x_col, y_col, show_r2=False):
    """Print and plot a summary of one feature against a target.

    Prints the feature name and selected quantiles, draws a 30-bin histogram
    of the feature and, when feature and target have different names, fits a
    linear regression of the target on the feature and draws the scatter plot
    with the fitted line.  Finally plots a 10-sample rolling mean of the
    feature.

    :param x_col: named pandas Series holding the feature
    :param y_col: named pandas Series holding the target
    :param show_r2: when true, also print R² and ``feature_importance``
    :return: the string ``" "``
    """
    # Preparation
    x_name = x_col.name
    y_name = y_col.name
    df = pd.concat([x_col, y_col], axis=1).sort_index()
    plt.rcParams["figure.figsize"] = (10, 7)

    breaks(1)
    print("%s" % x_name)
    print('Quantile:\n', x_col.quantile([0.0, 0.1, 0.25, 0.5, 0.75, 1.0]))

    # Histogram
    plt.subplot(221)
    try:
        plt.hist(x_col, bins=30)
        plt.xlabel(x_name)
        plt.title('Histogram (CF GHP): %s' % x_name)
    except ValueError:
        print("No histogram for %s available" % x_name)

    # Correlation
    if y_name != x_name:
        df = df.sort_values(x_name)
        # `axis` must be passed by keyword: the positional form was removed
        # in pandas 2.0.
        x = df.drop(y_name, axis=1)
        reg = linear_model.LinearRegression(normalize=True)
        reg.fit(x, df[y_name])

        # Plot
        plt.subplot(222)
        plt.scatter(df[x_name], df[y_name])
        plt.plot(df[x_name], reg.predict(x), color='g')
        plt.xlabel(x_name)
        plt.xlim([df[x_name].min(), df[x_name].max()])
        plt.title('x:%s / y:%s ' % (x_name, y_name))
        plt.ylabel("Target function: %s" % y_name)

        # Kept inside this branch because `reg` and `x` only exist here.
        # NOTE(review): the source layout does not show whether the
        # feature_importance print was gated by show_r2 - confirm.
        if show_r2:
            print("R²:", r2_score(df[y_name], reg.predict(x)))
            print(feature_importance(x, reg.coef_))

    # Show plots
    plt.show()

    # Timeline
    # NOTE(review): the x-limits are hard-coded to 170000..175000.
    x_col.rolling(window=10, center=False).mean().plot(
        title='%s: Timeline' % x_name,
        figsize=(10, 2),
        xlim=(170000, 175000))
    plt.show()
    plt.close('all')
    return " "
def StatisticData(self):
    """Print the mean and show a histogram of every sample's first-axis size.

    Samples are fetched with ``self.GetData(i, mode='non-repetitive')`` for
    ``i`` in ``range(sum(self.DataNum))``; bins are 20 wide.
    """
    total_samples = sum(self.DataNum)
    length = []
    for sample_idx in range(total_samples):
        data_input, _labels = self.GetData(sample_idx, mode='non-repetitive')
        length.append(data_input.shape[0])

    print('mean value:', np.mean(length))

    binwidth = 20
    # Extend one bin past the maximum so the largest sample falls in a bin.
    edges = np.arange(min(length), max(length) + binwidth, binwidth)
    plt.hist(length, bins=edges)
    plt.show()
def getCommentLengthsDistribution(comments):
    """Show a labelled histogram of ``len()`` of every entry of ``comments``.

    Bin edges run from 0 in steps of 10 and stop before 500; a dashed red
    vertical line marks x = 200.
    """
    # NOTE(review): the axis label says "words"; that holds only if each
    # comment is a sequence of words rather than a plain string - confirm.
    lengths = [len(comments[pos]) for pos in range(len(comments))]
    bin_edges = np.arange(0, 500, 10)

    plt.hist(lengths, bins=bin_edges)
    plt.title('Histogram of Word Counts in Comments')
    plt.xlabel('Number of Words in Comment')
    plt.ylabel('Comment Counts')
    plt.axvline(x=200, color='r', linestyle='dashed', linewidth=2)
    plt.show()
def histogram_chart(plt, col, Ylabel="Frequency", Xlabel=None, Title="Histogram"):
    """Draw a histogram of the non-missing values of ``col``.

    Args:
        plt: Plotting interface providing ``hist``, ``xlabel``, ``ylabel``
            and ``title`` (e.g. ``matplotlib.pyplot``).
        col: Object with a ``dropna()`` method (e.g. a pandas Series).
        Ylabel: Y-axis label; skipped when falsy.
        Xlabel: X-axis label; skipped when falsy.
        Title: Chart title.
    """
    # Work on a NaN-free copy: dropping in place silently shrank the caller's
    # data and triggers chained-assignment warnings on DataFrame slices.
    plt.hist(col.dropna())
    if Ylabel:
        plt.ylabel(Ylabel)
    if Xlabel:
        plt.xlabel(Xlabel)
    plt.title(Title)
def analyze_reads(tmp_cirseq_dir):
    """Extract and BLAST the read edges flanking each BLAST hit.

    For every ``*.fasta`` file matched by ``tmp_cirseq_dir + "*.fasta"`` the
    companion ``<file>.blast`` table is grouped per ``sseqid``, joined with
    the FASTA sequences, and the sequence before the hit start (``edge5``)
    and after the hit end (``edge3``) is extracted.  Rows whose two edges are
    both longer than 100 are kept, passed to ``blast_seq`` and written to
    ``<file>.blast.edges.csv``.

    NOTE(review): the source layout does not show where the per-file loop
    ends; this version processes every file and plots/returns the table of
    the last one - confirm that is the intent.

    Args:
        tmp_cirseq_dir: Directory prefix; it is concatenated directly with
            the file pattern, so it presumably needs a trailing separator.

    Returns:
        The edge table (pandas DataFrame) of the last processed file.

    Raises:
        IOError: If no FASTA file matches.
    """
    fasta_files = glob.glob(tmp_cirseq_dir + "*.fasta")
    if not fasta_files:
        # Previously this surfaced later as an unrelated NameError.
        raise IOError("no .fasta files found under %r" % tmp_cirseq_dir)

    blast_arr_names = [
        "sseqid", "qstart", "qend", "sstart1", "send1", "sstrand", "length",
        "btop", "sstart2", "send2"
    ]

    for fasta_path in fasta_files:
        records = parse_fasta(fasta_path)
        fasta_df = pd.DataFrame.from_dict(records, orient='index')
        fasta_df.index.name = 'id'
        fasta_df.columns = ['seq']

        blast_file = fasta_path + ".blast"
        data = pd.read_csv(blast_file, sep="\t", header=None,
                           names=blast_arr_names)
        # Duplicate the coordinates so min and max can be aggregated at once.
        data["sstart2"] = data["sstart1"]
        data["send2"] = data["send1"]
        grouped_df = data.groupby("sseqid").agg({
            'sstart1': 'min',
            'sstart2': 'max',
            'send1': 'min',
            'send2': 'max'
        })
        grouped_df['sstart'] = grouped_df.min(axis=1)
        grouped_df['send'] = grouped_df.max(axis=1)

        # Join directly: appending to an empty frame added nothing and
        # DataFrame.append no longer exists in pandas 2.
        blast_df = grouped_df.join(fasta_df)
        blast_df['edge5'] = blast_df.apply(
            lambda x: extract_location(x["seq"], 0, x["sstart"]), axis=1)
        blast_df['edge3'] = blast_df.apply(
            lambda x: extract_location(x["seq"], x["send"], -1), axis=1)

        # Keep only reads whose two edges are both longer than 100.
        long_edge3 = blast_df['edge3'].apply(lambda x: len(x) > 100)
        long_edge5 = blast_df['edge5'].apply(lambda x: len(x) > 100)
        blast_df = blast_df[long_edge3 & long_edge5]

        blast_df["blast5"] = blast_df.apply(
            lambda row: blast_seq(row["edge5"]), axis=1)
        blast_df["blast3"] = blast_df.apply(
            lambda row: blast_seq(row["edge3"]), axis=1)
        blast_df.to_csv(blast_file + ".edges.csv", sep=',', encoding='utf-8')

    # NOTE(review): these histogram the edge values themselves; if the edges
    # are sequence strings, their lengths may have been intended - confirm.
    plt.hist(blast_df["edge3"], bins=50)
    plt.hist(blast_df["edge5"], bins=50)
    plt.savefig(tmp_cirseq_dir + 'plot.png')
    return blast_df
def graph(x,y,xLabel,yLabel,title,figname):
    """Save a histogram of ``x`` annotated with its mean and the value ``y``.

    A dashed line marks the mean of ``x``, a vertical segment at ``y``
    (heights 0 to 9) is labelled "model accuracy", and the legend carries the
    p-value of ``ttest_ind(x, [y])`` rounded to 4 decimals.
    """
    plt.clf()

    plt.hist(x, color="c", edgecolor="k", alpha=0.5)
    sample_mean = np.array(x).mean()
    plt.axvline(sample_mean, color="k", linestyle="dashed", linewidth=3,
                label="average")

    plt.xlabel(xLabel)
    plt.ylabel(yLabel)
    plt.title(title)

    # Ten stacked points at x == y, one per integer height 0..9.
    heights = np.arange(0, 10, 1)
    marker_x = np.array([y] * 10)
    plt.plot(marker_x, heights, label="model accuracy")

    p_value = ttest_ind(x, [y])[1]
    # An empty white series puts the p-value into the legend as text.
    plt.plot([], [], label=f"p-value: {np.round(p_value,4)}", color="w")

    plt.legend()
    plt.savefig(figname)
def loss_store2(self, x_train, x_gene):
    """Pickle the training/generated data and save the diagnostic figures.

    Writes, under ``./result/``: pickles of ``x_train``, ``x_gene``,
    ``self.wdis_store`` and the four arrays returned by ``dwp``; a histogram
    of ``x_gene``; a line plot of ``self.wdis_store``; and two scatter plots
    of the ``dwp`` results after ``fig_add_noise``.

    Args:
        x_train: Training data passed to ``dwp`` and pickled.
        x_gene: Generated data passed to ``dwp``, histogrammed and pickled.
    """
    def dump(path, obj):
        # Write one object to `path` as a pickle.
        with open(path, 'wb') as fp:
            pickle.dump(obj, fp)

    def save_scatter(real, generated, title, path):
        # Scatter `real` against `generated` and save the figure to `path`.
        plt.scatter(real, generated)
        plt.title(title)
        plt.xlabel('Real data')
        plt.ylabel('Generated data')
        plt.savefig(path)
        plt.close()

    dump('./result/genefinalfig/x_train.pickle', x_train)
    dump('./result/genefinalfig/generated.pickle', x_gene)

    bins = 100
    plt.hist(x_gene, bins, facecolor='red', alpha=0.5)
    plt.title('Histogram of distribution of generated data')
    plt.xlabel('Generated data value')
    plt.ylabel('Frequency')
    plt.savefig(
        './result/genefinalfig/WGAN-Generated-data-distribution.jpg')
    plt.close()

    dump('./result/lossfig/wdis.pickle', self.wdis_store)
    t = arange(len(self.wdis_store))
    plt.plot(t, self.wdis_store, 'r--')
    plt.xlabel('Iterations')
    plt.ylabel('Wasserstein distance')
    plt.savefig('./result/lossfig/WGAN-W-distance.jpg')
    plt.close()

    rv_pre, gv_pre, rv_pro, gv_pro = dwp(x_train, x_gene, self.testX,
                                         self.db)
    # Single-argument call form prints identically on Python 2 and 3.
    print('Totally ' + str(len(rv_pre)) + ' of coordinates are left')
    dump('./result/genefinalfig/rv_pre.pickle', rv_pre)
    dump('./result/genefinalfig/gv_pre.pickle', gv_pre)
    dump('./result/genefinalfig/rv_pro.pickle', rv_pro)
    dump('./result/genefinalfig/gv_pro.pickle', gv_pro)

    # The raw values are pickled above; fig_add_noise is applied only for
    # plotting, in the same call order as before.
    rv_pre = fig_add_noise(rv_pre)
    gv_pre = fig_add_noise(gv_pre)
    rv_pro = fig_add_noise(rv_pro)
    gv_pro = fig_add_noise(gv_pro)

    save_scatter(rv_pre, gv_pre, 'Dimension-wise prediction, lr',
                 './result/genefinalfig/WGAN-dim-wise-prediction.jpg')
    save_scatter(rv_pro, gv_pro, 'Dimension-wise probability, lr',
                 './result/genefinalfig/WGAN-dim-wise-probability.jpg')
def histogram(df, nbin, name, k):
    """Show a density-normalised histogram of the ``'Mean_' + name`` column.

    ``nbin`` is passed to ``plt.hist`` as the bins argument; ``name`` and
    ``k`` are used to build the title.
    """
    column = 'Mean_' + name
    heading = 'Histogram of ' + name + ' ' + k + 's'

    plt.figure()
    plt.hist(df[column], nbin, density=True, facecolor='g', alpha=0.75)
    plt.title(heading)
    plt.grid(True)
    plt.show()
def create_individual_statistics_panel(message_df, title, participants,
                                       colour_palette):
    """Print the longest message and show a histogram of message lengths.

    Only rows whose ``Type`` is ``'Message'`` are considered.  The histogram
    covers the shortest 95% of the lengths, with one bin per distinct length.
    ``title``, ``participants`` and ``colour_palette`` are not used here.
    """
    is_message = message_df['Type'] == 'Message'
    all_messages = message_df.loc[is_message, 'Message'].reset_index()['Message']

    lengths = [len(text) for text in all_messages]
    longest = max(lengths)
    print("The maximum length message you've ever sent is: {}".format(longest))
    # Look the message up before sorting, while positions still line up.
    longest_text = all_messages.loc[lengths.index(longest)]
    print("\nThe message was: \n {}".format(longest_text))

    lengths.sort()
    # From this it seems like 70 characters would be a good amount of characters to use
    cutoff = int(len(lengths) * 0.95)
    shortest_95 = lengths[:cutoff]
    plt.hist(shortest_95, bins=len(set(shortest_95)))
    plt.show()
def display(self, data, candidates, fname, display):
    """Plot candidate frequencies over a histogram of ``data`` and save it.

    The first elements of ``candidates`` are split into two halves; the
    halves' means and ranges decide whether the figure is annotated as
    "One distribution" or "Two distributions".  The figure is saved next to
    ``fname`` with its last three characters replaced by ``png``.

    Args:
        data: Values to histogram with unit-width bins.
        candidates: Sequence of pairs; column 0 is plotted on x, column 1
            on y.
        fname: Input file name used for the title and the output path.
        display: When true, also show the figure interactively.
    """
    finallist = [c[0] for c in candidates]
    # Floor division keeps the slice index an int on Python 3 as well.
    half = len(finallist) // 2
    part1 = finallist[:half]
    part2 = finallist[half:]

    meandiff = int(np.sqrt(np.power(np.mean(part2), 2) - np.power(np.mean(part1), 2)))
    rangeA = max(part1) - min(part1)
    rangeB = max(part2) - min(part2)
    span = int((rangeA + rangeB) / 2)
    dspan = int(meandiff / span)
    theta = float(meandiff / (rangeA + rangeB))

    if (dspan > 3 and meandiff > 20) or meandiff > 36:
        verdict = "Two distributions"
    else:
        verdict = "One distribution"
    oneortwo = verdict + " \n\n MD: %d \n Span: %d \n Dspan: %d \n theta: %d" % (
        meandiff, span, dspan, theta)

    cans = np.array(candidates)
    peak = max(cans[:, 1])
    plt.plot(cans[:, 0], cans[:, 1], 'ro')
    plt.axhline(peak / 4, color='r')
    plt.axhline(max(cans[:, 1] / 2), color='r')
    plt.axhline(int(peak) * 0.75, color='r')
    red_patch = mpatches.Patch(color='red', label='75%, 50% and 25% \nof maximum frequency')
    plt.legend(handles=[red_patch])
    plt.ylabel('Frequency of occurence')
    plt.xlabel('separate items')
    plt.title('Frequency distribution estimation graph: %s' % (fname))
    plt.text(max(data) * 1.1, peak * 0.62, oneortwo, fontsize=11, color='r')
    plt.hist(data, range(int(min(data)), int(max(data)), 1))

    ofile = fname[0:-3] + "png"
    # Format before printing: `print(...) % x` applies % to print's return
    # value (None) on Python 3.
    print("Writing outfile: %s" % ofile)
    plt.savefig(ofile, bbox_inches='tight')
    if display:
        plt.show()
    return
def plot_histogram(self):
    """Method to output a histogram of the instance variable data using
    matplotlib pyplot library.

    Args:
        None

    Returns:
        None
    """
    # Plot the data itself: the previous code histogrammed the single number
    # `self.mean + self.stdev`, misspelled `alpha` and `xlabel` (both raise),
    # and passed the data to plt.axis(), which expects axis limits.
    plt.hist(self.data, 50, density=True, facecolor='b', alpha=0.75)
    plt.xlabel('x-axis label')
    plt.ylabel('y-axis label')
    plt.title('Histogram Title')
    plt.show()
def get_graph(n, title):
    """
    Draw a distribution histogram for a sample of N data from n-dimensional
    Normal distribution.

    The plotted quantity is the squared Euclidean norm of each of the ``N``
    sampled vectors; red vertical lines mark the two bounds returned by
    ``get_2_std_estimates``.

    Args:
        n: Dimension of each sampled vector.
        title: Figure title.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)

    # `density` takes a bool; the string "true" only worked by being truthy.
    # The return value is not bound, so the parameter `n` is no longer
    # overwritten by the bin counts.
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()
def plot_per_historgram(per_path:str, save_path:str=None):
    """
    This function plots PER values as a histogram. The plot is saved to
    `save_path`.

    Args:
        per_path (str): path to per csv file with one column as the sample_id
            and the other the per value
        save_path (str): path to save the histogram plot; defaults to
            "PER_histogram.png" when None

    Raises:
        ValueError: if the second field of a row is not a number
        IndexError: if a non-empty row has fewer than two fields
    """
    import csv
    import matplotlib.pyplot as plt
    import numpy as np

    # newline='' is what the csv module requires for correct handling of
    # quoted fields that contain line breaks.
    with open(per_path, 'r', newline='') as fid:
        reader = csv.reader(fid, delimiter=',')
        # Blank lines come back as empty rows; skip them instead of crashing.
        per_list = [float(row[1]) for row in reader if row]

    plt.hist(per_list, bins=10, range=(0.0, 1.0))
    plt.title("histogram of 2020-10-29 model PER values")
    plt.xlabel("PER bins")
    plt.ylabel("# of records")
    plt.xticks(np.arange(0, 1.1, step=0.1))

    if save_path is None:
        save_path = "PER_histogram.png"
    plt.savefig(save_path)
from matplotlib import pyplot as plt

# In[11]:

# Plot one histogram per column of dataset2 on a 6x5 grid of subplots.
# NOTE(review): dataset2 is read here before it is (re)assigned below, so it
# must already exist from an earlier cell - confirm.
fig = plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns', fontsize=20)
column_count = dataset2.shape[1]
for i in range(1, column_count + 1):
    plt.subplot(6, 5, i)
    f = plt.gca()
    f.axes.get_yaxis().set_visible(False)
    f.set_title(dataset2.columns.values[i - 1])
    # One bin per distinct value of the column.
    vals = np.size(dataset2.iloc[:, i - 1].unique())
    plt.hist(dataset2.iloc[:, i - 1], bins=vals, color='#3F5D7D')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# In[12]:

# Columns selected for the pie-chart plots.
# NOTE(review): 'is_referred' appears twice in this list, which yields a
# duplicated column - confirm whether that is intended.
dataset2 = dataset[[
    'housing',
    'is_referred',
    'app_downloaded',
    'web_user',
    'app_web_user',
    'ios_user',
    'android_user',
    'registered_phones',
    'payment_type',
    'waiting_4_loan',
    'cancelled_loan',
    'received_loan',
    'rejected_loan',
    'zodiac_sign',
    'left_for_two_month_plus',
    'left_for_one_month',
    'is_referred',
]]

# In[13]:
# Using strings and lists to summarise total number of species in the data set
# https://stackoverflow.com/questions/997797/what-does-s-mean-in-python
species_list = list(data["Species"].unique())
print("Types of species: %s\n" % species_list)

# Create a DataFrame to structure the data correctly
# Reference: http://www.datasciencemadesimple.com/get-list-column-headers-column-name-python-pandas/
# `data` is indexed by column label above, so positional access has to go
# through .iloc; `data[:, 0]` is not valid on a DataFrame.
d = {
    "SepalLengthCm": data.iloc[:, 0],
    "SepalWidthCm": data.iloc[:, 1],
    "PetalLengthCm": data.iloc[:, 2],
    # Key fixed from "PetalWidtCm": it has to match the `columns` list below,
    # otherwise the PetalWidthCm column comes out empty.
    "PetalWidthCm": data.iloc[:, 3],
    "Species": data.iloc[:, 4],
}
df = pd.DataFrame(d, columns=[
    "SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm",
    "Species"
])
df

# Build on Tutorial and plot the data on a histogram
import matplotlib.pyplot as pl

# NOTE(review): `firstcol` is not defined in this excerpt - presumably set
# earlier in the file; confirm.
pl.hist(firstcol)
# Call show(); the bare attribute reference never displayed the figure.
pl.show()
# pyplot is the module that provides scatter/hist/imshow/show; the top-level
# matplotlib package has none of them.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# DataFrame.as_matrix() was removed from pandas; .values returns the same
# NumPy array on every pandas version.
dat = pd.read_csv('Voters.csv').values
x = dat[:, 0]
y = dat[:, 1]
plt.scatter(x, y)
plt.show()
plt.hist(x)
plt.hist(y, bins=15)

# images
train = pd.read_csv('test.csv')
M = train.values
# First row without its first field, viewed as a 28x28 image.
im = M[0, 1:]
im = im.reshape(28, 28)
plt.imshow(im)
plt.show()
plt.imshow(im, cmap="gray")

from scipy.stats import norm

norm.pdf(0)
norm.pdf(0, loc=5, scale=10)
r = np.random.randn(10)
norm.pdf(r)
norm.cdf(r)
r = 10*np.random.randn(10000)+5
# NOTE: the name `plt` bound on the next line is rebound to
# matplotlib.pyplot two lines further down.
import matplotlib as plt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import stats
from math import sqrt,pi,exp

# In[35]:

# Generate a random series of 1000 integers drawn from 0..9 and plot it with
# matplotlib.
A = np.random.randint(0, 10, 1000)

figure, axe = plt.subplots(figsize=(10, 4))
plt.hist(A, bins=19, color='red')
axe.set(
    title="Histogram of the dataset",
    xlabel="Number",
    ylabel="Frequency",
)

# In[36]:

# Compute the mean/max/mode.
# The mean is the sum of all the elements divided by the length of the array.
mean = sum(A)/len(A)
# NOTE(review): `genre` is not defined in this excerpt; this statement
# presumably runs inside a loop over genre names - confirm its indentation.
data[genre] = [genre in movie.split('|') for movie in data.genres]
data.head()

# Drop the last 7 characters of every title.
data['title'] = [t[0:-7] for t in data.title]
data.head()

data[['score', 'runtime', 'year', 'votes']].describe()

# Count the zero runtimes, then mark them as missing.
print(len(data[data.runtime == 0]))
# Assign through .loc on the frame itself: the chained form
# `data.runtime[mask] = ...` may write to a temporary copy.
data.loc[data.runtime == 0, 'runtime'] = np.nan
data.runtime.describe()

plt.hist(data.year, bins=np.arange(1950, 2013), color='#cccccc')
plt.xlabel("Release Year")
remove_border()

# Author's note: running the lines above produced
#   AttributeError: 'module' object has no attribute 'hist'
#   AttributeError: 'module' object has no attribute 'xlabel'
# NOTE(review): that is what happens when `plt` is bound to the top-level
# `matplotlib` package instead of `matplotlib.pyplot`; the import is outside
# this excerpt - check that it reads `import matplotlib.pyplot as plt`.
plt.hist(data.score, bins=20, color='#cccccc')
plt.xlabel("IMDB rating")
remove_border()

# Author's note: the same AttributeError appears here as well.
plt.scatter(data.year, data.score, lw=0, alpha=.08, color='k')
# Write the code necessary to create a cross tabulation of the number of titles by department.
# (Hint: this will involve a combination of SQL and python/pandas code)
dept_query = """SELECT * FROM departments join dept_emp on dept_emp.dept_no = departments.dept_no"""
dept_and_dept_emp = pd.read_sql(dept_query, url)
dept_and_dept_emp

t_dept_dept_emp = pd.merge(titles, dept_and_dept_emp)
t_dept_name = t_dept_dept_emp[['title', 'dept_name']]
# Use the frame built on the previous line; `titles_dept_name` is not
# assigned anywhere in this excerpt.
t_dept_name.groupby('dept_name').count()

joined = employees.join(titles.set_index('emp_no'), on='emp_no')
# True for rows whose to_date does not contain '9999'.
# NOTE(review): presumably '9999' marks a still-open title, so this keeps
# the ended ones despite the name `mask_current` - confirm.
mask_current = joined['to_date'].apply(lambda x: '9999' not in str(x))
# Take an explicit copy so the new column is added to an independent frame
# rather than to a slice of `joined`.
changed = joined[mask_current].copy()
changed["diff"] = changed["to_date"] - changed["from_date"]
plt.hist(changed['diff'].apply(lambda x: x.days/365), bins=6)
plt.xlabel('Years')
plt.ylabel('# of Employees')
plt.title('Frequency of Job Changes')
plt.show()

# In[103]:

# Use your get_db_url function to help you explore the data from the chipotle database.
# Use the data to answer the following questions:
database_name = "chipotle"
orders_query = """SELECT * FROM orders"""
# IPython log file
import pandas as pd
# NOTE: the name `plt` bound on the next line is immediately rebound to
# matplotlib.pyplot by the import after it.
import matplotlib as plt
import matplotlib.pyplot as plt

df = pd.read_csv("http://www.biostat.jhsph.edu/~rpeng/useRbook/faithful.csv")

# Scatter of the two columns, saved and then cleared.
plt.plot(df["eruptions"], df["waiting"], "b.")
plt.title("eruptions vs waiting")
plt.savefig("scatter.png")
plt.clf()

# One histogram per column, each saved to its own file and cleared.
for column, outfile in (("eruptions", "eruptions.png"),
                        ("waiting", "waiting.png")):
    plt.hist(df[column])
    plt.savefig(outfile)
    plt.clf()
# "Else" reject, though nothing to write # Store iterate chain[i, 0] = ll chain[i, 1:] = param # In[11]: ''' Graphs ''' import matplotlib.pyplot as plt # Histograms plt.figure() plt.hist(chain[int(n_iterates / 2):, 1]) plt.title('mu frequency') print('mu = ' + str(mu)) plt.figure() plt.hist(chain[int(n_iterates / 2):, 2]) plt.title('sd frequency') print('sd = ' + str(sd)) plt.show() # In[16]: ''' Graphs alternative import matplotlib.pyplot as plt
import numpy as np
# pyplot provides xlabel/ylabel/hist/savefig/show; the top-level matplotlib
# package does not, so the old `import matplotlib as plt` failed on first use.
import matplotlib.pyplot as plt

# Draw three Poisson(lam=6) samples of 1000 values; save each sample as text
# and its density histogram as a PNG.
# NOTE(review): the source layout does not show which statements sit inside
# the loop; one data set and one figure per iteration is assumed - confirm.
for i in range(3):
    p = np.random.poisson(lam=6, size=1000)
    np.savetxt("Data Sets/Poisson data set " + str(i), p, delimiter=",")
    plt.xlabel("x")
    plt.ylabel("P(X)=x")
    plt.hist(p, density=True)
    plt.savefig("Graphs/Poisson/Poisson " + str(i) + ".png")
    plt.show()
import numpy as np
# pyplot provides hist/show; the top-level matplotlib package does not, so
# the old `import matplotlib as plt` failed at plt.hist.
import matplotlib.pyplot as plt

np.random.seed(123)

# Simulate random walk 500 times
all_walks = []
for i in range(500):
    random_walk = [0]
    for x in range(100):
        step = random_walk[-1]
        dice = np.random.randint(1, 7)
        if dice <= 2:
            # Step down, but never below 0.
            step = max(0, step - 1)
        elif dice <= 5:
            step = step + 1
        else:
            # On a 6, advance by a second die roll.
            step = step + np.random.randint(1, 7)
        # With probability 0.001 the walk falls back to 0.
        if np.random.rand() <= 0.001:
            step = 0
        random_walk.append(step)
    all_walks.append(random_walk)

# Create and plot np_aw_t
np_aw_t = np.transpose(np.array(all_walks))

# Select last row from np_aw_t: ends
ends = np_aw_t[-1, :]

# Plot histogram of ends, display plot
plt.hist(ends)
plt.show()
# <a id='eda'></a> # ## Exploratory Data Analysis # # > **Tip**: Now that you've trimmed and cleaned your data, you're ready to move on to exploration. Compute statistics and create visualizations with the goal of addressing the research questions that you posed in the Introduction section. It is recommended that you be systematic with your approach. Look at one variable at a time, and then follow it up by looking at relationships between variables. # # ### Question 1: What is the independent variable? What is the dependent variable? # Gender,ScheduledDay,AppointmentDay, Age, Neighbourhood, Scholarship, Hipertension, Diabetes, Alcoholism, Handcap, SMS_received and WeekDay are independent variables. # Whether the patient was no-show (No-show) is the dependent variable # # ### Question 2: What are the no show record differences between female and male? # In[17]: plt.hist(df['Gender'], color='blue', edgecolor='black', bins=int(180 / 50)) plt.title('Histogram of Gender distribution') plt.xlabel('Gender') plt.ylabel('Frequency') # There are around 70000 female patients and 40000 male patients in the record. # In[18]: # name the no show records attend = df.NoShow == "Yes" not_attend = df.NoShow == "No" # In[19]: plt.hist(df.Gender[attend],
def _pixel_histograms(images, bin_edges):
    """Return one row of per-bin pixel counts for every column of ``images``."""
    # np.histogram yields the same counts plt.hist reported, without opening
    # and closing a figure per image; cast to float to keep the dtype
    # plt.hist used to return.
    return np.asarray([
        np.histogram(images[:, col], bins=bin_edges)[0].astype(float)
        for col in range(images.shape[1])
    ])


def main():
    """Evaluate histogram-based retrieval under several distance metrics.

    Loads the face data, turns every image (one column of the data matrix)
    into a pixel-intensity histogram, projects the histograms with PCA and
    PCA-LDA for several PCA sizes, and for each distance metric prints and
    plots mAP, Acc@1 and Acc@10 as returned by ``calculate_map``.  One figure
    per metric is saved under the figure's title.
    """
    #### LOAD FACE DATA
    face_data, face_label = load_face_data('face(1).mat')

    #### PARTITION DATA INTO TRAIN AND TEST SET
    X_train, X_test, Y_train, Y_test = partition_data(face_data,
                                                      face_label,
                                                      show='no')

    #### OBTAIN ORIGINAL AND NORMALIZED FEATURE VECTORS
    # The results are not used below; the calls are kept in case they have
    # side effects.
    original_train, norm_train = get_original_normalized_feature_vectors(
        X_train, show='no')
    original_test, norm_test = get_original_normalized_feature_vectors(
        X_test, show='no')

    #### DISTANCE DEFINITIONS
    methods = [
        ('L2', NearestNeighbors(n_neighbors=200, metric='minkowski', p=2)),
        ('L1', NearestNeighbors(n_neighbors=200, metric='minkowski', p=1)),
        ('Linf_NN', NearestNeighbors(n_neighbors=200, metric='chebyshev')),
        ('Earthmover',
         NearestNeighbors(n_neighbors=200, metric=wasserstein_distance)),
        ('Intersection',
         NearestNeighbors(n_neighbors=200, metric=histogram_intersection)),
        ('Chi-Square', NearestNeighbors(n_neighbors=200, metric=chi)),
        ('K-L Divergence', NearestNeighbors(n_neighbors=200, metric=kl)),
        ('JS',
         NearestNeighbors(n_neighbors=200, metric=distance.jensenshannon)),
    ]

    #### HISTOGRAM
    # Bin edges 0, 10, ..., 260 give 26 uniform bins.
    # It was found empirically that test images' pixel intensities ranged
    # from 0 to ~260.
    bin_list = np.arange(0, 270, 10).tolist()
    print("List of bins:", '\n', bin_list, '\n')

    X_hist_test = _pixel_histograms(X_test, bin_list)
    X_hist_train = _pixel_histograms(X_train, bin_list)

    test_name = ['Histogram']
    M_pca_list = [4, 8, 12, 16, 22, 26]  # max of 26 as there are 26 bins
    recall_levels = 11
    M_lda = 10
    # NOTE(review): LDA is asked for 10 components even when PCA keeps fewer
    # features (M_pca of 4 or 8); recent scikit-learn releases reject that -
    # confirm against the installed version.
    lda = LinearDiscriminantAnalysis(n_components=M_lda)

    for name, method in methods:
        Mpca_list = []
        mAP_pca_list = []
        mAP_lda_list = []
        acc1_pca_list = []
        acc1_lda_list = []
        acc10_pca_list = []
        acc10_lda_list = []

        for M_pca in M_pca_list:
            pca = PCA(n_components=M_pca)
            train_pca = pca.fit_transform(X_hist_train)
            test_pca = pca.transform(X_hist_test)
            train_lda = lda.fit_transform(train_pca, Y_train)
            test_lda = lda.transform(test_pca)

            # NOTE(review): neighbours are searched within the test set
            # itself (fit and query both use the test projections) -
            # presumably intentional for retrieval; confirm.
            method.fit(test_pca)
            method_nbrs_pca = np.asarray(method.kneighbors(test_pca))
            method_map_pca, method_df_pca, acc1_pca, acc10_pca = calculate_map(
                method_nbrs_pca, Y_test, recall_levels)

            method.fit(test_lda)
            method_nbrs_lda = np.asarray(method.kneighbors(test_lda))
            method_map_lda, method_df_lda, acc1_lda, acc10_lda = calculate_map(
                method_nbrs_lda, Y_test, recall_levels)

            print(name, ", Mpca =", M_pca, "PCA mAP:", method_map_pca,
                  ",Acc@1:", acc1_pca, ",Acc@10:", acc10_pca)
            print(name, ", Mpca =", M_pca, "PCA-LDA mAP:", method_map_lda,
                  ",Acc@1:", acc1_lda, ",Acc@10:", acc10_lda)

            Mpca_list.append(M_pca)
            mAP_pca_list.append(method_map_pca)
            mAP_lda_list.append(method_map_lda)
            acc1_pca_list.append(acc1_pca)
            acc1_lda_list.append(acc1_lda)
            acc10_pca_list.append(acc10_pca)
            acc10_lda_list.append(acc10_lda)

        # One colour per metric family; 'o' marks PCA, 'x' marks PCA-LDA.
        curves = [
            (mAP_pca_list, 'red', 'PCA mAP', 'o'),
            (mAP_lda_list, 'red', 'PCA-LDA mAP', 'x'),
            (acc1_pca_list, 'blue', 'PCA Acc@rank1', 'o'),
            (acc1_lda_list, 'blue', 'PCA-LDA Acc@rank1', 'x'),
            (acc10_pca_list, 'green', 'PCA Acc@rank10', 'o'),
            (acc10_lda_list, 'green', 'PCA-LDA Acc@rank10', 'x'),
        ]
        plt.figure(figsize=(10, 10))
        for values, colour, label, marker in curves:
            plt.plot(Mpca_list, values, color=colour, label=label,
                     marker=marker)
        plt.grid(color='black', linestyle='-',
                 linewidth=0.1)  # parameters for plot grid
        title_name = str(name + " " + test_name[0] +
                         ' PCA and PCA-LDA Performance')
        plt.title(title_name).set_position([0.5, 1.05])
        plt.xlabel('Mpca')
        plt.ylabel('mAP, Accuracy')
        plt.legend(loc='best')
        plt.savefig(title_name)
        plt.close()
        print(" ")
__author__ = '49236_000'
import numpy
import os
import sys
from pyspark import SparkContext
# Bind both names the script uses: `matplotlib` (for matplotlib.pyplot.gcf)
# and the `hist` function.  The old `import matplotlib.pyplot as hist` made
# `hist` a module, so calling it raised TypeError, and left `matplotlib`
# undefined.
import matplotlib.pyplot
from matplotlib.pyplot import hist

#u'1|24|M|technician|85711'
sc = SparkContext("local", "Chapter2")
user_data = sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()

# Records are '|'-separated; fields 0, 2, 3 and 4 are counted below.
user_fields = user_data.map(lambda line: line.split("|"))
num_user = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count()
# Single-argument call form prints identically on Python 2 and 3.
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (
    num_user, num_genders, num_occupations, num_zipcodes))

#histogram
ages = user_fields.map(lambda x: int(x[1])).collect()
# NOTE(review): newer matplotlib releases replaced `normed` with `density`;
# kept as written for the matplotlib version this script targets.
hist(ages, bins=20, color='lightblue', normed=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
# In[201]:

#df2.iloc[1,6]
# Rows of df2 whose title contains "safrinha"; comparing with True turns
# missing values into False.
contains_safrinha = df2['titulo'].str.contains("safrinha") == True
df2[contains_safrinha]

# ## Histogram: number of characters per news title

# In[8]:

titles_as_text = df['titulo'].astype(str)
df['titulo_len'] = titles_as_text.map(len)

plt.hist(
    x=df['titulo_len'],
    bins='auto',
    color='#0504aa',
    alpha=0.7,
    rwidth=0.85,
)
plt.grid(axis='y', alpha=0.75)
plt.xlabel('valor')
plt.ylabel('frequencia')

# The printed message gives a translation price per million characters
# followed by the total character count.
total_characters = df['titulo_len'].sum()
print('traduzir 1 milhão de caracteres custa 6 usd. Total de caracteres: '+str(total_characters))
#plt.title('Histograma: numero de caracteres por titulo de noticia')

# ## Histogram: number of words per news title
#

# In[9]:

df['titulo_word'] = [len(str(title).split()) for title in df['titulo']]
__author__ = '49236_000'
import numpy
import os
import sys
from pyspark import SparkContext
# Bind both names the script uses: `matplotlib` (for matplotlib.pyplot.gcf)
# and the `hist` function.  The old `import matplotlib.pyplot as hist` made
# `hist` a module, so calling it raised TypeError, and left `matplotlib`
# undefined.
import matplotlib.pyplot
from matplotlib.pyplot import hist

#u'1|24|M|technician|85711'
sc=SparkContext("local","Chapter2")
user_data=sc.textFile("file:///home/hadoop/data/ml-100k/u.user")
user_data.first()

# Records are '|'-separated; fields 0, 2, 3 and 4 are counted below.
user_fields=user_data.map(lambda line: line.split("|"))
num_user=user_fields.map(lambda fields: fields[0]).count()
num_genders=user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations=user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes=user_fields.map(lambda fields: fields[4]).distinct().count()
# Single-argument call form prints identically on Python 2 and 3.
print("User: %d,genders %d,occupations: %d,ZIP codes: %d" % (num_user,num_genders,num_occupations,num_zipcodes))

#histogram
ages = user_fields.map(lambda x: int(x[1])).collect()
# NOTE(review): newer matplotlib releases replaced `normed` with `density`;
# kept as written for the matplotlib version this script targets.
hist(ages, bins=20, color='lightblue', normed=True)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)
avg_order = orders.groupby('month').price.mean().reset_index()
std_order = orders.groupby('month').price.std().reset_index()

bar_heights = avg_order.price
bar_errors = std_order.price

# Average order price per month with the standard deviation as error bars.
# The axes are created and kept open until the labelled chart is shown; the
# old code closed every figure right after creating `ax` and called show()
# before the tick labels were set.
ax = plt.subplot()
# `bar_errors` is already the price Series (the old `.price` lookup on it
# raised AttributeError).  Bars sit at positions 0..n-1 so they line up with
# the tick positions set below.
plt.bar(range(len(bar_heights)), bar_heights, yerr=bar_errors, capsize=5)
ax.set_xticks(range(len(bar_heights)))
ax.set_xticklabels(['April', 'May', 'June', 'July', 'August', 'September'])
plt.ylabel('Average Amount')
plt.title('Amount over Time')
plt.show()

# Total spent per customer.
customer_amount = orders.groupby('customer_id').price.sum().reset_index()
# Single-argument call form prints identically on Python 2 and 3.
print(customer_amount.head())

plt.hist(customer_amount.price.values, range=(0, 200), bins=40)
plt.xlabel('Total Spent')
plt.ylabel("Number of Customers")
plt.title('Customer Expenditure Over 6 Months')
plt.show()
titanic_dataframe.groupby('Sex')[['Age','Fare','Parch','SibSp']].describe()

# There are far more male passengers than female passengers in the data.

# In[7]:

# Take a closer look at the distribution of the age of passengers. It has a wide range of age span.
titanic_dataframe['Age'].describe()

# In[70]:

get_ipython().magic(u'pylab inline')
plt.hist(titanic_dataframe['Age'].dropna())
plt.xlabel("Passenger Age Range")
plt.ylabel("Count")

# In[71]:

# A single figure holds both panels; the second plt.figure() call only left
# an empty extra figure behind.
fig = plt.figure()

# Left panel: female passengers.  subplot is called through plt so it does
# not depend on pylab's star-imported names.
fig2 = plt.subplot(1, 2, 1)
plt.xlabel("Female Age Range")
plt.ylabel("Passenger Count")
titanic_dataframe[titanic_dataframe['Sex']=="female"]['Age'].hist(bins=15)

# Right panel: male passengers.
fig2 = plt.subplot(1, 2, 2)
titanic_dataframe[titanic_dataframe['Sex']=="male"]['Age'].hist(bins=15)
plt.xlabel("Male Age Range")