def _skewed_distributions(ax):
    """Draw a standard normal and a left-skewed Gumbel density on ``ax``.

    Both densities are evaluated on ``x``, which is not a parameter and is
    therefore taken from the enclosing scope.  Each curve is drawn as a line
    and shaded underneath at 50 % opacity; the normal curve is labelled
    'Before' and the ``gumbel_l(loc=2.0)`` curve 'After'.  The x-axis is
    limited to (-5, 5) and a legend is added.

    Returns the same ``ax`` object that was passed in.
    """
    curves = (
        (scstats.norm(), 'Before'),
        (scstats.gumbel_l(loc=2.0), 'After'),
    )
    for distribution, tag in curves:
        density = distribution.pdf(x)
        ax.plot(x, density)
        ax.fill_between(x, density, alpha=0.5, label=tag)

    ax.set_xlim((-5, 5))
    ax.legend()
    return ax
def sample(self, k):
    """Draw ``k`` pairs whose marginals are Gumbel (left) and Beta(10, 2).

    The draws come from ``self.gaussian.rvs(k)``; they are mapped to the unit
    interval with the standard normal CDF and then pushed through the inverse
    CDF of each target marginal.

    k: number of draws requested from ``self.gaussian``.
    Returns a tuple ``(Y0, Y1)`` built from columns 0 and 1 respectively.
    """
    # assumes self.gaussian.rvs(k) returns a 2-D array with at least two
    # columns (columns 0 and 1 are indexed below) -- TODO confirm
    gaussian_draws = self.gaussian.rvs(k)

    # Gaussian -> uniform via the standard normal CDF.
    uniforms = stats.norm().cdf(gaussian_draws)

    # Uniform -> target marginal via the inverse CDF (ppf).
    first_marginal = stats.gumbel_l().ppf(uniforms[:, 0])
    second_marginal = stats.beta(a=10, b=2).ppf(uniforms[:, 1])
    return (first_marginal, second_marginal)
def optimize_loc(res_loc, res_scale, load_distro, conf_target, eps):
    """Return the gap between the achieved and the target confidence level.

    Auxiliary objective for scipy.optimize.bisect: its root in ``res_loc`` is
    the location parameter of the resistance distribution that matches the
    required confidence level.

    res_loc, res_scale: location and scale parameters of the resistance
        distribution (a ``gumbel_l``)
    load_distro: load distribution (frozen scipy.stats distribution)
    conf_target: confidence level target
    eps: tail probability used to bound the integration domain; the grid
        spans from the smaller ``ppf(eps)`` to the larger ``ppf(1 - eps)``
        of the two distributions
    """
    resistance = ss.gumbel_l(loc=res_loc, scale=res_scale)

    lower = min(load_distro.ppf(eps), resistance.ppf(eps))
    upper = max(load_distro.ppf(1 - eps), resistance.ppf(1 - eps))
    grid, step = np.linspace(lower, upper, 2000, retstep=True)

    failure_probability = pfail(load_distro.pdf, resistance.cdf, grid, step)
    return (1.0 - failure_probability) - conf_target
def __init__(self, mean, stdev, dtype='normal', weib_loc=0):
    """Store in ``self.dist`` a distribution with the given mean and stdev.

    mean, stdev: target mean and standard deviation.
    dtype: one of 'normal', 'gumbel_r', 'gumbel_l' or 'weibull'.
    weib_loc: passed as third argument to ``weibull``; only used when
        ``dtype == 'weibull'``.

    Raises ValueError when ``dtype`` is not one of the supported names.
    Previously this case only printed a message and left ``self.dist``
    unset, so the error surfaced later as an unrelated AttributeError.
    """
    if dtype == 'normal':
        self.dist = ss.norm(loc=mean, scale=stdev)
    elif dtype in ('gumbel_r', 'gumbel_l'):
        # Gumbel std. dev. is pi * beta / sqrt(6), solved here for beta.
        beta = stdev * sqrt(6) / pi
        if dtype == 'gumbel_r':
            # Right-skewed Gumbel: mean = mu + euler_gamma * beta.
            mu = mean - euler_gamma * beta
            self.dist = ss.gumbel_r(loc=mu, scale=beta)
        else:
            # Left-skewed Gumbel: mean = mu - euler_gamma * beta.
            mu = mean + euler_gamma * beta
            self.dist = ss.gumbel_l(loc=mu, scale=beta)
    elif dtype == 'weibull':
        self.dist = weibull(mean, stdev, weib_loc)
    else:
        raise ValueError(
            "Error dtype: %r is not one of 'normal', 'gumbel_r', "
            "'gumbel_l', 'weibull'." % (dtype,))
def optimize_loc(res_loc, res_scale, load_distro, conf_target, eps):
    """Objective whose root gives the resistance location parameter.

    Meant to be handed to scipy.optimize.bisect; the value returned is the
    confidence level obtained with ``res_loc`` minus ``conf_target``.

    res_loc, res_scale: location and scale parameters of the resistance
        distribution (a ``gumbel_l``)
    load_distro: load distribution (frozen scipy.stats distribution)
    conf_target: confidence level target
    eps: tail probability limiting the integration domain to the range
        between the ``eps`` and ``1 - eps`` quantiles of both distributions
    """
    res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)
    distros = (load_distro, res_distro)

    # Integration grid wide enough for both distributions.
    start = min(d.ppf(eps) for d in distros)
    stop = max(d.ppf(1 - eps) for d in distros)
    x, dx = np.linspace(start, stop, 2000, retstep=True)

    confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)
    return confidence - conf_target
def __init__(self, mean, stdev, dtype='normal', weib_loc=0):
    """Create ``self.dist`` from a target mean and standard deviation.

    mean, stdev: target mean and standard deviation.
    dtype: 'normal', 'gumbel_r', 'gumbel_l' or 'weibull'.
    weib_loc: third argument given to ``weibull``; ignored for the other
        distribution types.

    Raises ValueError for any other ``dtype``.  The earlier code printed
    'Error dtype.' and returned an object without a ``dist`` attribute,
    which hid the mistake until the attribute was first used.
    """
    if dtype == 'normal':
        self.dist = ss.norm(loc=mean, scale=stdev)
        return
    if dtype == 'weibull':
        self.dist = weibull(mean, stdev, weib_loc)
        return
    if dtype not in ('gumbel_r', 'gumbel_l'):
        raise ValueError(
            "Error dtype: %r is not one of 'normal', 'gumbel_r', "
            "'gumbel_l', 'weibull'." % (dtype,))

    # Both Gumbel variants share the scale: stdev = pi * beta / sqrt(6).
    beta = stdev * sqrt(6) / pi
    if dtype == 'gumbel_r':
        # mean = mu + euler_gamma * beta for the right-skewed variant
        mu = mean - euler_gamma * beta
        self.dist = ss.gumbel_r(loc=mu, scale=beta)
    else:
        # mean = mu - euler_gamma * beta for the left-skewed variant
        mu = mean + euler_gamma * beta
        self.dist = ss.gumbel_l(loc=mu, scale=beta)
(len(bad_sample) - 1) / len(bad_sample), len(bad_sample)))), '*') ## ## Using Kolmogorov-Smirnov test ## The D statistic is the absolute max distance (supremum) between the CDFs of the two samples. ## The closer this number is to 0 the more likely it is that the two samples were drawn from the ## same distribution. ## The p-value returned by the k-s test has the same interpretation as other p-values. You reject ## the null hypothesis that the two samples were drawn from the same distribution if the p-value ## is less than your significance level. You can find tables online for the conversion of the ## D statistic into a p-value if you are interested in the procedure. ## ## stats, pvalue = ss.kstest(rvs=good_sample, cdf=ss.gumbel_l(*ss.gumbel_l.fit(good_sample)).cdf) print('The maximumdistance between CDFs is %.2f.' % stats, end='') print('The sample is Gumbel distributed for a significance level of %.2f' % pvalue) stats, pvalue = ss.kstest(rvs=bad_sample, cdf=ss.gumbel_l(*ss.gumbel_l.fit(bad_sample)).cdf) print('The maximumdistance between CDFs is %.2f.' % stats, end='') print('The sample is Gumbel distributed for a significance level of %.2f' % pvalue) ## ## Using Anderson-Darling test ## The assumption regarding the distribution of the sample is rejected if the output value ## is larger than the critical values for the required significance level. ## For gumbel distributions, the critical values and significance levels are:
load_loc = 100  # location parameter for the load distribution
load_scale = 5  # scale parameter for the load distribution
res_scale = 3.5  # scale parameter for the resistance distribution
eps = 1e-8  # domain = pdf > eps, for load and resistance

# frozen load distribution
load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)

# Bisection on the resistance location parameter: the bracket runs from the
# load location up to the (1 - eps) quantile of the load, and the root is the
# location that gives the required conf_target.
_bracket_low = load_loc
_bracket_high = load_distro.ppf(1 - eps)
res_loc = sp.optimize.bisect(
    optimize_loc, _bracket_low, _bracket_high,
    args=(res_scale, load_distro, conf_target, eps))

# frozen resistance distribution
res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)

# Rebuild the domain for the final distributions (200 points here) and
# recompute the confidence level on it.
_domain_low = min(load_distro.ppf(eps), res_distro.ppf(eps))
_domain_high = max(load_distro.ppf(1 - eps), res_distro.ppf(1 - eps))
x, dx = np.linspace(_domain_low, _domain_high, 200, retstep=True)
confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)

# %% plotting
for _pdf, _label in ((load_distro.pdf, 'load pdf'),
                     (res_distro.pdf, 'resistance pdf')):
    plt.plot(x, _pdf(x), label=_label)
plt.grid()
plt.legend(loc='best')
plt.show()

print('Confidence %.3f%%' % (100 * confidence))
pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
def case3(output=True):
    """Cross-validate a naive Bayes spam filter that uses every attribute.

    In each of NUMBER_OF_ROUNDS rounds the shuffled data is split with
    ``prep.split_sets``.  For every one of the first 57 columns the per-class
    sample mean and standard deviation are computed on the training part and
    passed to ``scipy.stats.gumbel_l`` as ``loc`` and ``scale``.  Each test
    row is then assigned to the class with the larger posterior (equal
    priors), and accuracy, precision and recall are recorded.

    Fixes in this version:
      * The likelihood now covers all 57 attributes.  Before, it looped over
        only 10 terms and combined the parameters of attribute ``cur`` with
        the value of column ``CASE_2_ATTRIBUTE_INDEXES[cur]``.
      * Log-densities are summed instead of multiplying 57 densities, which
        can underflow to 0.0 for both classes.
      * Precision is correct / predicted and recall is correct / actual;
        the two formulas were swapped.
      * The data file is closed after loading.

    output: when true, print mean, standard deviation and variance of every
        metric over the rounds.
    Returns None.
    """
    # NOTE(review): the sample mean / std. dev. are used directly as the
    # Gumbel loc / scale, which are not the moment-matched parameters --
    # confirm this is the intended model.
    n_attributes = 57  # all columns but the last one

    accuracy_in_each_turn = []
    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []
    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):
        # cross-validation: every round uses a different slice of the data
        # as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, one entry per attribute
        attributes = range(n_attributes)
        means_spam = np.array(
            [nb.take_mean_spam(train_set, a, SPAM_ATTR_INDEX)
             for a in attributes], dtype=float)
        means_ham = np.array(
            [nb.take_mean_ham(train_set, a, SPAM_ATTR_INDEX)
             for a in attributes], dtype=float)
        std_devs_spam = np.sqrt(np.array(
            [nb.take_variance_spam(train_set, a, SPAM_ATTR_INDEX)
             for a in attributes], dtype=float))
        std_devs_ham = np.sqrt(np.array(
            [nb.take_variance_ham(train_set, a, SPAM_ATTR_INDEX)
             for a in attributes], dtype=float))

        # One frozen, vectorised distribution per class; element k holds the
        # parameters of attribute k.
        spam_model = stats.gumbel_l(means_spam, std_devs_spam)
        ham_model = stats.gumbel_l(means_ham, std_devs_ham)

        hits = 0.0
        misses = 0.0
        correctly_is_spam = 0.0  # spam rows predicted as spam
        is_spam = 0.0            # rows whose label is spam
        guessed_spam = 0.0       # rows predicted as spam
        correctly_is_ham = 0.0   # ham rows predicted as ham
        is_ham = 0.0             # rows whose label is not spam
        guessed_ham = 0.0        # rows predicted as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # assumes column k of a row is the attribute that the nb.take_*
            # helpers address with index k -- TODO confirm
            features = np.asarray(row[:n_attributes], dtype=float)

            # Sum of the log conditional densities of all attributes given
            # the class.  The normalisation term is not needed because we
            # only want to know which posterior is larger.
            log_posterior_spam = (
                np.log(prior_spam) + np.sum(spam_model.logpdf(features)))
            log_posterior_ham = (
                np.log(prior_ham) + np.sum(ham_model.logpdf(features)))

            # whichever is greater - that will be our prediction
            guess = 1 if log_posterior_spam > log_posterior_ham else 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # tallies used for the metrics below
            labelled_spam = row[SPAM_ATTR_INDEX] == 1
            if labelled_spam:
                is_spam += 1
            else:
                is_ham += 1
            if guess == 1:
                guessed_spam += 1
                if labelled_spam:
                    correctly_is_spam += 1
            else:
                guessed_ham += 1
                if not labelled_spam:
                    correctly_is_ham += 1

        # accuracy = correctly classified rows / all rows
        accuracy = hits / (hits + misses)

        # precision = correctly predicted as X / all predicted as X
        # recall    = correctly predicted as X / all rows labelled X
        # (0 when the denominator is empty, to avoid dividing by zero)
        precision_spam = (
            correctly_is_spam / guessed_spam if guessed_spam else 0)
        recall_spam = correctly_is_spam / is_spam if is_spam else 0
        precision_ham = correctly_is_ham / guessed_ham if guessed_ham else 0
        recall_ham = correctly_is_ham / is_ham if is_ham else 0

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    if output:
        print("\033[1;32m")
        print('=============================================')
        print('CASE 3 - ALL ATTRIBUTES - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")

        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )
        for position, (label, values) in enumerate(report):
            if position:
                print('')
            print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + label + ': '
                  + str(round(np.std(values), 5)))
            print('VARIANCE OF ' + label + ': '
                  + str(round(np.var(values), 8)))
# 1.305*np.std(data_tp[data_tp['wd'] == wd][c]) # for wd in set(data_tp['wd'])]) # MLE's mx = max([ ss.gumbel_r(*ss.gumbel_r.fit(data_tp[data_tp['wd'] == wd] [c])).ppf(0.9) for wd in set(data_tp['wd']) ]) else: # # moment estimators # mx = min([np.mean(data_tp[data_tp['wd'] == wd][c]) + # 1.305*np.std(data_tp[data_tp['wd'] == wd][c]) # for wd in set(data_tp['wd'])]) # MLE's mx = min([ ss.gumbel_l(*ss.gumbel_l.fit(data_tp[data_tp['wd'] == wd] [c])).ppf(0.1) for wd in set(data_tp['wd']) ]) rw.extend([mx]) stdev.loc[i] = rw i += 1 # adjust and sort index stdev = stdev.sort_index(by=['Hs', 'Tp']) stdev = stdev.reset_index() del stdev['index'] # %% # this works, but how to apply +/- depending on max/min stdev3 = (1.305 * data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].std() + data.groupby(['Hs', 'Tp', 'wd'])[['Tmax', 'Tmin']].mean()).max(
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    """Cross-validate a one-attribute naive Bayes spam filter.

    In each of NUMBER_OF_ROUNDS rounds the shuffled data is split with
    ``prep.split_sets``.  The per-class sample mean and standard deviation
    of column ``index`` are passed to ``scipy.stats.gumbel_l`` as ``loc``
    and ``scale``, and every test row is assigned to the class with the
    larger posterior (equal priors).

    Fixes in this version:
      * Precision is correct / predicted and recall is correct / actual;
        the two formulas were swapped.  This also changes the value
        returned for ``ret='utility'``, which uses the ham precision.
      * The data file is closed after loading.
      * The two frozen distributions are built once per round instead of
        once per test row (same results).

    index: column of the attribute used as the single feature.
    output: when true, print mean, standard deviation and variance of every
        metric over the rounds.
    ret: 'accuracy' returns the mean accuracy; 'utility' returns the mean
        accuracy times the mean ham precision.  Any other value prints
        'UNKNOWN METRIC: <ret>' and calls ``sys.exit()``.
    """
    # NOTE(review): the sample mean / std. dev. are used directly as the
    # Gumbel loc / scale, which are not the moment-matched parameters --
    # confirm this is the intended model.
    accuracy_in_each_turn = []
    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []
    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):
        # cross-validation: every round uses a different slice of the data
        # as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_spam = nb.take_variance_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(
            train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variances
        sample_std_dev_spam = sample_variance_word_spam ** (1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham ** (1 / 2.0)

        # The class-conditional densities do not depend on the test row.
        spam_model = stats.gumbel_l(sample_mean_word_spam, sample_std_dev_spam)
        ham_model = stats.gumbel_l(sample_mean_word_ham, sample_std_dev_ham)

        hits = 0.0
        misses = 0.0
        correctly_is_spam = 0.0  # spam rows predicted as spam
        is_spam = 0.0            # rows whose label is spam
        guessed_spam = 0.0       # rows predicted as spam
        correctly_is_ham = 0.0   # ham rows predicted as ham
        is_ham = 0.0             # rows whose label is not spam
        guessed_ham = 0.0        # rows predicted as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # No need to divide by the normalisation term: we only want to
            # know which posterior is larger.
            posterior_spam = prior_spam * spam_model.pdf(row[index])
            posterior_ham = prior_ham * ham_model.pdf(row[index])

            # whichever is greater - that will be our evaluation
            guess = 1 if posterior_spam > posterior_ham else 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # tallies used for the metrics below
            labelled_spam = row[SPAM_ATTR_INDEX] == 1
            if labelled_spam:
                is_spam += 1
            else:
                is_ham += 1
            if guess == 1:
                guessed_spam += 1
                if labelled_spam:
                    correctly_is_spam += 1
            else:
                guessed_ham += 1
                if not labelled_spam:
                    correctly_is_ham += 1

        # accuracy = correctly classified rows / all rows
        accuracy = hits / (hits + misses)

        # precision = correctly predicted as X / all predicted as X
        # recall    = correctly predicted as X / all rows labelled X
        # (0 when the denominator is empty, to avoid dividing by zero)
        precision_spam = (
            correctly_is_spam / guessed_spam if guessed_spam else 0)
        recall_spam = correctly_is_spam / is_spam if is_spam else 0
        precision_ham = correctly_is_ham / guessed_ham if guessed_ham else 0
        recall_ham = correctly_is_ham / is_ham if is_ham else 0

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # these two means are also needed for the return value
    mean_accuracy = np.mean(accuracy_in_each_turn)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)

    if output:
        print("\033[1;32m")
        print('=============================================')
        print('CASE 1 - ONE ATTRIBUTE - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")

        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )
        for position, (label, values) in enumerate(report):
            if position:
                print('')
            print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + label + ': '
                  + str(round(np.std(values), 5)))
            print('VARIANCE OF ' + label + ': '
                  + str(round(np.var(values), 8)))

    # we'll only use these return values to compute rankings
    # for example in script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print('UNKNOWN METRIC: ' + ret)
        sys.exit()
# Calculate the first four moments (mean, variance, skewness, kurtosis):
mean, var, skew, kurt = gumbel_l.stats(moments='mvsk')

# Display the probability density function (``pdf``) between the 1 % and
# 99 % quantiles:
x = np.linspace(gumbel_l.ppf(0.01), gumbel_l.ppf(0.99), 100)
ax.plot(x, gumbel_l.pdf(x), 'r-', lw=5, alpha=0.6, label='gumbel_l pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = gumbel_l()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = gumbel_l.ppf([0.001, 0.5, 0.999])
np.allclose([0.001, 0.5, 0.999], gumbel_l.cdf(vals))  # True

# Generate random numbers:
r = gumbel_l.rvs(size=1000)

# And compare the histogram.  ``density=True`` replaces the ``normed``
# keyword, which current Matplotlib releases no longer accept.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
print "mean = ", trace.mean() for bin in [10,20,50,100]: hist,bin_edges=np.histogram(trace,bins=bin) a=np.argmax(hist) print "maxlike = ", bin_edges[a], bin_edges[a+1], (bin_edges[a]+bin_edges[a+1])/2.0 plt.subplot(2,len(things)/2,plot_idx) if plot_idx==2: n, bins, patches = plt.hist(np.array(trace), 50, normed=1, facecolor='green', alpha=0.75) X = sp.gumbel_l.fit(np.array(trace)) print X dist = sp.gumbel_l(X[0],X[1]) x = np.array(bins) y = dist.pdf(x) print y plt.plot(x, y,'k--',linewidth=2) X = sp.norm.fit(np.array(trace)) print X dist = sp.norm(X[0],X[1]) x = np.array(bins) y = dist.pdf(x) plt.plot(x, y,'r--',linewidth=2) X = sp.genextreme.fit(np.array(trace)) print X dist = sp.genextreme(X[0],X[1],X[2])
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 5 20:20:31 2017

@author: raf
"""

import numpy as np
from scipy import stats as ss
from matplotlib import pyplot as plt
import quantilelib as ql

n = 50

# Reproducible, sorted sample from a left-skewed Gumbel located at 100.
sample = np.sort(ss.gumbel_l.rvs(size=n, loc=100, random_state=1234))

# Plotting positions for the n ordered observations, from the project helper.
y = ql.llsurvivals(n)
plt.plot(sample, y, 'o')

# Fit a left-skewed Gumbel to the sample and freeze the fitted parameters.
params = ss.gumbel_l.fit(sample)
fitted_gumbel = ss.gumbel_l(*params)

# Two points (sample extremes) are enough to draw the fitted curve, using
# the transform -log(-log(survival)).
x = (sample.min(), sample.max())
_log_survival = fitted_gumbel.logsf(x)
y = -np.log(-_log_survival)

# Overlay the fitted curve on the data points.
plt.plot(x, y)
def _fit_and_probplot(family, data, position, caption):
    """Fit ``family`` to ``data`` and draw its probability plot.

    The plot goes into subplot ``position`` of the current figure and is
    titled ``caption``.  Returns ``(loc, scale, frozen_distribution)``.
    """
    loc, scale = family.fit(data)
    frozen = family(loc=loc, scale=scale)
    plt.subplot(position)
    stats.probplot(data, dist=frozen, plot=plt)
    title(caption)
    return loc, scale, frozen


# One 2x2 figure per tail: histogram, then probability plots against the
# distribution chosen by best_fit, gumbel_l and gumbel_r.
for tail in [tailmax, tailmin]:
    plt.figure()
    title('TEST TITLE')

    plt.subplot(221)
    plt.hist(tail)

    gumbel = best_fit(tail)
    loc, scale, mygl = _fit_and_probplot(gumbel, tail, 222, 'bets fit gumbel')
    loc, scale, mygl = _fit_and_probplot(stats.gumbel_l, tail, 223, 'gumbel l')
    loc, scale, mygr = _fit_and_probplot(stats.gumbel_r, tail, 224, 'gumbel r')

#import pandas
#
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
# %% Dx = 3.0 x = np.linspace(-4, 4, 100) norm1 = scstats.norm() norm2 = scstats.norm(loc = Dx) fig, ax = plt.subplots(1, 1, figsize = (7, 5)) ax.plot(x, norm1.pdf(x), label = 'before') ax.plot(x+Dx, norm2.pdf(x+Dx), label = 'after') ax.set_xlim((-5, 5+Dx)) ax.set_ylim((-0.01, 1)) ax.legend() plt.savefig("./figures/distribution_shift.png") # %% x = np.linspace(-4, 4, 100) dist1 = scstats.norm() dist2 = scstats.gumbel_l(loc = 1.5) fig, ax = plt.subplots(1, 1, figsize = (7, 5)) ax.plot(x, dist1.pdf(x), label = 'before') ax.plot(x, dist2.pdf(x), label = 'after') ax.set_xlim((-5, 5)) ax.set_ylim((-0.01, 1)) ax.legend() plt.savefig("./figures/distribution_skew.png") # %% import matplotlib.pyplot as plt import scipy.stats as scstats def Supplementary_Figure1A(): x = np.linspace(-4, 4, 100) norm1 = scstats.norm(scale = 0.5) norm2 = scstats.norm(scale = 1.0)
def _fit_and_probplot(family, data, position, caption):
    """Fit ``family`` to ``data`` and draw its probability plot.

    The plot goes into subplot ``position`` of the current figure and is
    titled ``caption``.  Returns ``(loc, scale, frozen_distribution)``.
    """
    loc, scale = family.fit(data)
    frozen = family(loc=loc, scale=scale)
    plt.subplot(position)
    stats.probplot(data, dist=frozen, plot=plt)
    title(caption)
    return loc, scale, frozen


# One 2x2 figure per tail: histogram, then probability plots against the
# distribution chosen by best_fit, gumbel_l and gumbel_r.
for tail in [tailmax, tailmin]:
    plt.figure()
    title('TEST TITLE')

    plt.subplot(221)
    plt.hist(tail)

    gumbel = best_fit(tail)
    loc, scale, mygl = _fit_and_probplot(gumbel, tail, 222, 'bets fit gumbel')
    loc, scale, mygl = _fit_and_probplot(stats.gumbel_l, tail, 223, 'gumbel l')
    loc, scale, mygr = _fit_and_probplot(stats.gumbel_r, tail, 224, 'gumbel r')

#import pandas
#
##list with the path to various results files from repeated lowering analyses
#with open('list_results.txt', 'r') as pf:
#    list_results = pf.readlines()
hist, bin_edges = np.histogram(trace, bins=bin) a = np.argmax(hist) print("maxlike = ", bin_edges[a], bin_edges[a + 1], (bin_edges[a] + bin_edges[a + 1]) / 2.0) plt.subplot(2, len(things) / 2, plot_idx) if plot_idx == 2: n, bins, patches = plt.hist(np.array(trace), 50, normed=1, facecolor='green', alpha=0.75) X = sp.gumbel_l.fit(np.array(trace)) print(X) dist = sp.gumbel_l(X[0], X[1]) x = np.array(bins) y = dist.pdf(x) print(y) plt.plot(x, y, 'k--', linewidth=2) X = sp.norm.fit(np.array(trace)) print(X) dist = sp.norm(X[0], X[1]) x = np.array(bins) y = dist.pdf(x) plt.plot(x, y, 'r--', linewidth=2) X = sp.genextreme.fit(np.array(trace)) print(X) dist = sp.genextreme(X[0], X[1], X[2])
def case1(index=CASE_1_ATTRIBUTE_INDEX, output=True, ret='accuracy'):
    """Cross-validate a one-attribute naive Bayes spam filter.

    In each of NUMBER_OF_ROUNDS rounds the shuffled data is split with
    ``prep.split_sets``.  The per-class sample mean and standard deviation
    of column ``index`` are passed to ``scipy.stats.gumbel_l`` as ``loc``
    and ``scale``, and every test row is assigned to the class with the
    larger posterior (equal priors).

    Fixes in this version:
      * Precision is correct / predicted and recall is correct / actual;
        the two formulas were swapped.  This also changes the value
        returned for ``ret='utility'``, which uses the ham precision.
      * The data file is closed after loading.
      * The two frozen distributions are built once per round instead of
        once per test row (same results).

    index: column of the attribute used as the single feature.
    output: when true, print mean, standard deviation and variance of every
        metric over the rounds.
    ret: 'accuracy' returns the mean accuracy; 'utility' returns the mean
        accuracy times the mean ham precision.  Any other value prints
        'UNKNOWN METRIC: <ret>' and calls ``sys.exit()``.
    """
    # NOTE(review): the sample mean / std. dev. are used directly as the
    # Gumbel loc / scale, which are not the moment-matched parameters --
    # confirm this is the intended model.
    accuracy_in_each_turn = []
    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []
    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):
        # cross-validation: every round uses a different slice of the data
        # as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation
        sample_mean_word_spam = nb.take_mean_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_mean_word_ham = nb.take_mean_ham(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_spam = nb.take_variance_spam(
            train_set, index, SPAM_ATTR_INDEX)
        sample_variance_word_ham = nb.take_variance_ham(
            train_set, index, SPAM_ATTR_INDEX)

        # sample standard deviations from sample variances
        sample_std_dev_spam = sample_variance_word_spam ** (1 / 2.0)
        sample_std_dev_ham = sample_variance_word_ham ** (1 / 2.0)

        # The class-conditional densities do not depend on the test row.
        spam_model = stats.gumbel_l(sample_mean_word_spam, sample_std_dev_spam)
        ham_model = stats.gumbel_l(sample_mean_word_ham, sample_std_dev_ham)

        hits = 0.0
        misses = 0.0
        correctly_is_spam = 0.0  # spam rows predicted as spam
        is_spam = 0.0            # rows whose label is spam
        guessed_spam = 0.0       # rows predicted as spam
        correctly_is_ham = 0.0   # ham rows predicted as ham
        is_ham = 0.0             # rows whose label is not spam
        guessed_ham = 0.0        # rows predicted as ham

        # now we test the hypothesis against the test set
        for row in test_set:
            # No need to divide by the normalisation term: we only want to
            # know which posterior is larger.
            posterior_spam = prior_spam * spam_model.pdf(row[index])
            posterior_ham = prior_ham * ham_model.pdf(row[index])

            # whichever is greater - that will be our evaluation
            guess = 1 if posterior_spam > posterior_ham else 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # tallies used for the metrics below
            labelled_spam = row[SPAM_ATTR_INDEX] == 1
            if labelled_spam:
                is_spam += 1
            else:
                is_ham += 1
            if guess == 1:
                guessed_spam += 1
                if labelled_spam:
                    correctly_is_spam += 1
            else:
                guessed_ham += 1
                if not labelled_spam:
                    correctly_is_ham += 1

        # accuracy = correctly classified rows / all rows
        accuracy = hits / (hits + misses)

        # precision = correctly predicted as X / all predicted as X
        # recall    = correctly predicted as X / all rows labelled X
        # (0 when the denominator is empty, to avoid dividing by zero)
        precision_spam = (
            correctly_is_spam / guessed_spam if guessed_spam else 0)
        recall_spam = correctly_is_spam / is_spam if is_spam else 0
        precision_ham = correctly_is_ham / guessed_ham if guessed_ham else 0
        recall_ham = correctly_is_ham / is_ham if is_ham else 0

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # these two means are also needed for the return value
    mean_accuracy = np.mean(accuracy_in_each_turn)
    mean_precision_ham = np.mean(precision_in_each_turn_ham)

    if output:
        print("\033[1;32m")
        print('=============================================')
        print('CASE 1 - ONE ATTRIBUTE - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")

        report = (
            ('ACCURACY', accuracy_in_each_turn),
            ('PRECISION FOR SPAM', precision_in_each_turn_spam),
            ('RECALL FOR SPAM', recall_in_each_turn_spam),
            ('PRECISION FOR HAM', precision_in_each_turn_ham),
            ('RECALL FOR HAM', recall_in_each_turn_ham),
        )
        for position, (label, values) in enumerate(report):
            if position:
                print('')
            print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
            print('STD. DEV. OF ' + label + ': '
                  + str(round(np.std(values), 5)))
            print('VARIANCE OF ' + label + ': '
                  + str(round(np.var(values), 8)))

    # we'll only use these return values to compute rankings
    # for example in script which_attribute_case_1
    if ret == 'utility':
        return mean_accuracy * mean_precision_ham
    elif ret == 'accuracy':
        return mean_accuracy
    else:
        print('UNKNOWN METRIC: ' + ret)
        sys.exit()
conf_target = 0.9  # confidence level of non-failure
load_loc = 100  # location parameter for the load distribution
load_scale = 5  # scale parameter for the load distribution
res_scale = 3.5  # scale parameter for the resistance distribution
eps = 1e-8  # domain = pdf > eps, for load and resistance

# frozen load distribution
load_distro = ss.gumbel_r(loc=load_loc, scale=load_scale)

# Bisection on the resistance location parameter: the bracket runs from the
# load location up to the (1 - eps) quantile of the load, and the root is the
# location that gives the required conf_target.
_bracket_low = load_loc
_bracket_high = load_distro.ppf(1-eps)
res_loc = sp.optimize.bisect(
    optimize_loc, _bracket_low, _bracket_high,
    args=(res_scale, load_distro, conf_target, eps))

# frozen resistance distribution
res_distro = ss.gumbel_l(loc=res_loc, scale=res_scale)

# Rebuild the domain for the final distributions (200 points here) and
# recompute the confidence level on it.
_domain_low = min(load_distro.ppf(eps), res_distro.ppf(eps))
_domain_high = max(load_distro.ppf(1-eps), res_distro.ppf(1-eps))
x, dx = np.linspace(_domain_low, _domain_high, 200, retstep=True)
confidence = 1.0 - pfail(load_distro.pdf, res_distro.cdf, x, dx)

# %% plotting
for _pdf, _label in ((load_distro.pdf, 'load pdf'),
                     (res_distro.pdf, 'resistance pdf')):
    plt.plot(x, _pdf(x), label=_label)
plt.grid()
plt.legend(loc='best')
plt.show()

print('Confidence %.3f%%' % (100*confidence))

# Second estimate of the failure probability from the two pdfs, printed as
# a cross-check of the confidence above.
pfailure = pfail_dblchk(load_distro.pdf, res_distro.pdf, x)
print('Dbl check %.3f%%' % (100*(1-pfailure)))
def get_y(params, x, tail):
    """Return -log(-log p) for x under a frozen Gumbel distribution.

    params: positional parameters forwarded to the scipy.stats Gumbel
        constructor (unpacked with ``*params``).
    x: value(s) at which the distribution is evaluated.
    tail: ``'upper'`` uses the right-skewed Gumbel and its log-CDF; any
        other value uses the left-skewed Gumbel and its log survival
        function.
    """
    if tail == 'upper':
        log_prob = ss.gumbel_r(*params).logcdf(x)
    else:
        log_prob = ss.gumbel_l(*params).logsf(x)
    # log_prob is <= 0, so negate it before taking the outer logarithm.
    return -np.log(-log_prob)
def all_dists():
    """Return a dict mapping distribution names to frozen scipy.stats objects.

    Every entry is a continuous distribution frozen with ``loc=0.0`` and
    ``scale=1.0``; the shape parameters were taken from the examples in the
    official scipy.stats documentation.

    The dict holds 87 entries.  ``frechet_r`` and ``frechet_l`` used to be
    part of the list but are left out because they are no longer provided
    by newer SciPy releases.
    """
    # SciPy renamed ``gilbrat`` to ``gibrat`` and later dropped the old
    # spelling.  Look up whichever exists so the function works on both old
    # and new releases; the dict key stays "gilbrat" for existing callers.
    gibrat = getattr(stats, "gibrat", None)
    if gibrat is None:
        gibrat = stats.gilbrat

    return {
        "alpha": stats.alpha(a=3.57, loc=0.0, scale=1.0),
        "anglit": stats.anglit(loc=0.0, scale=1.0),
        "arcsine": stats.arcsine(loc=0.0, scale=1.0),
        "beta": stats.beta(a=2.31, b=0.627, loc=0.0, scale=1.0),
        "betaprime": stats.betaprime(a=5, b=6, loc=0.0, scale=1.0),
        "bradford": stats.bradford(c=0.299, loc=0.0, scale=1.0),
        "burr": stats.burr(c=10.5, d=4.3, loc=0.0, scale=1.0),
        "cauchy": stats.cauchy(loc=0.0, scale=1.0),
        "chi": stats.chi(df=78, loc=0.0, scale=1.0),
        "chi2": stats.chi2(df=55, loc=0.0, scale=1.0),
        "cosine": stats.cosine(loc=0.0, scale=1.0),
        "dgamma": stats.dgamma(a=1.1, loc=0.0, scale=1.0),
        "dweibull": stats.dweibull(c=2.07, loc=0.0, scale=1.0),
        "erlang": stats.erlang(a=2, loc=0.0, scale=1.0),
        "expon": stats.expon(loc=0.0, scale=1.0),
        "exponnorm": stats.exponnorm(K=1.5, loc=0.0, scale=1.0),
        "exponweib": stats.exponweib(a=2.89, c=1.95, loc=0.0, scale=1.0),
        "exponpow": stats.exponpow(b=2.7, loc=0.0, scale=1.0),
        "f": stats.f(dfn=29, dfd=18, loc=0.0, scale=1.0),
        "fatiguelife": stats.fatiguelife(c=29, loc=0.0, scale=1.0),
        "fisk": stats.fisk(c=3.09, loc=0.0, scale=1.0),
        "foldcauchy": stats.foldcauchy(c=4.72, loc=0.0, scale=1.0),
        "foldnorm": stats.foldnorm(c=1.95, loc=0.0, scale=1.0),
        "genlogistic": stats.genlogistic(c=0.412, loc=0.0, scale=1.0),
        "genpareto": stats.genpareto(c=0.1, loc=0.0, scale=1.0),
        "gennorm": stats.gennorm(beta=1.3, loc=0.0, scale=1.0),
        "genexpon": stats.genexpon(a=9.13, b=16.2, c=3.28, loc=0.0, scale=1.0),
        "genextreme": stats.genextreme(c=-0.1, loc=0.0, scale=1.0),
        "gausshyper": stats.gausshyper(a=13.8, b=3.12, c=2.51, z=5.18, loc=0.0, scale=1.0),
        "gamma": stats.gamma(a=1.99, loc=0.0, scale=1.0),
        "gengamma": stats.gengamma(a=4.42, c=-3.12, loc=0.0, scale=1.0),
        "genhalflogistic": stats.genhalflogistic(c=0.773, loc=0.0, scale=1.0),
        "gilbrat": gibrat(loc=0.0, scale=1.0),
        "gompertz": stats.gompertz(c=0.947, loc=0.0, scale=1.0),
        "gumbel_r": stats.gumbel_r(loc=0.0, scale=1.0),
        "gumbel_l": stats.gumbel_l(loc=0.0, scale=1.0),
        "halfcauchy": stats.halfcauchy(loc=0.0, scale=1.0),
        "halflogistic": stats.halflogistic(loc=0.0, scale=1.0),
        "halfnorm": stats.halfnorm(loc=0.0, scale=1.0),
        "halfgennorm": stats.halfgennorm(beta=0.675, loc=0.0, scale=1.0),
        "hypsecant": stats.hypsecant(loc=0.0, scale=1.0),
        "invgamma": stats.invgamma(a=4.07, loc=0.0, scale=1.0),
        "invgauss": stats.invgauss(mu=0.145, loc=0.0, scale=1.0),
        "invweibull": stats.invweibull(c=10.6, loc=0.0, scale=1.0),
        "johnsonsb": stats.johnsonsb(a=4.32, b=3.18, loc=0.0, scale=1.0),
        "johnsonsu": stats.johnsonsu(a=2.55, b=2.25, loc=0.0, scale=1.0),
        "ksone": stats.ksone(n=1e03, loc=0.0, scale=1.0),
        "kstwobign": stats.kstwobign(loc=0.0, scale=1.0),
        "laplace": stats.laplace(loc=0.0, scale=1.0),
        "levy": stats.levy(loc=0.0, scale=1.0),
        "levy_l": stats.levy_l(loc=0.0, scale=1.0),
        "levy_stable": stats.levy_stable(alpha=0.357, beta=-0.675, loc=0.0, scale=1.0),
        "logistic": stats.logistic(loc=0.0, scale=1.0),
        "loggamma": stats.loggamma(c=0.414, loc=0.0, scale=1.0),
        "loglaplace": stats.loglaplace(c=3.25, loc=0.0, scale=1.0),
        "lognorm": stats.lognorm(s=0.954, loc=0.0, scale=1.0),
        "lomax": stats.lomax(c=1.88, loc=0.0, scale=1.0),
        "maxwell": stats.maxwell(loc=0.0, scale=1.0),
        "mielke": stats.mielke(k=10.4, s=3.6, loc=0.0, scale=1.0),
        "nakagami": stats.nakagami(nu=4.97, loc=0.0, scale=1.0),
        "ncx2": stats.ncx2(df=21, nc=1.06, loc=0.0, scale=1.0),
        "ncf": stats.ncf(dfn=27, dfd=27, nc=0.416, loc=0.0, scale=1.0),
        "nct": stats.nct(df=14, nc=0.24, loc=0.0, scale=1.0),
        "norm": stats.norm(loc=0.0, scale=1.0),
        "pareto": stats.pareto(b=2.62, loc=0.0, scale=1.0),
        "pearson3": stats.pearson3(skew=0.1, loc=0.0, scale=1.0),
        "powerlaw": stats.powerlaw(a=1.66, loc=0.0, scale=1.0),
        "powerlognorm": stats.powerlognorm(c=2.14, s=0.446, loc=0.0, scale=1.0),
        "powernorm": stats.powernorm(c=4.45, loc=0.0, scale=1.0),
        "rdist": stats.rdist(c=0.9, loc=0.0, scale=1.0),
        "reciprocal": stats.reciprocal(a=0.00623, b=1.01, loc=0.0, scale=1.0),
        "rayleigh": stats.rayleigh(loc=0.0, scale=1.0),
        "rice": stats.rice(b=0.775, loc=0.0, scale=1.0),
        "recipinvgauss": stats.recipinvgauss(mu=0.63, loc=0.0, scale=1.0),
        "semicircular": stats.semicircular(loc=0.0, scale=1.0),
        "t": stats.t(df=2.74, loc=0.0, scale=1.0),
        "triang": stats.triang(c=0.158, loc=0.0, scale=1.0),
        "truncexpon": stats.truncexpon(b=4.69, loc=0.0, scale=1.0),
        "truncnorm": stats.truncnorm(a=0.1, b=2, loc=0.0, scale=1.0),
        "tukeylambda": stats.tukeylambda(lam=3.13, loc=0.0, scale=1.0),
        "uniform": stats.uniform(loc=0.0, scale=1.0),
        "vonmises": stats.vonmises(kappa=3.99, loc=0.0, scale=1.0),
        "vonmises_line": stats.vonmises_line(kappa=3.99, loc=0.0, scale=1.0),
        "wald": stats.wald(loc=0.0, scale=1.0),
        "weibull_min": stats.weibull_min(c=1.79, loc=0.0, scale=1.0),
        "weibull_max": stats.weibull_max(c=2.87, loc=0.0, scale=1.0),
        "wrapcauchy": stats.wrapcauchy(c=0.0311, loc=0.0, scale=1.0),
    }
## Plot the bad sample against -log(-log(p)), with p evenly spaced
## between 1/n and (n-1)/n, where n is the sample size.
n_bad = len(bad_sample)
plot_probs = linspace(1/n_bad, (n_bad-1)/n_bad, n_bad)
plt.plot(bad_sample, -log(-log(plot_probs)), '*')

##
## Using Kolmogorov-Smirnov test
## The D statistic is the absolute max distance (supremum) between the CDFs
## being compared. The closer this number is to 0 the more likely it is that
## the sample was drawn from the reference distribution.
## The p-value returned by the k-s test has the same interpretation as other
## p-values. You reject the null hypothesis that the sample was drawn from
## the reference distribution if the p-value is less than your significance
## level. You can find tables online for the conversion of the D statistic
## into a p-value if you are interested in the procedure.
##
## Each sample is tested against a left-skewed Gumbel distribution fitted to
## that same sample; the good sample is reported first, then the bad one.
##
for sample in (good_sample, bad_sample):
    fitted = ss.gumbel_l(*ss.gumbel_l.fit(sample))
    stats, pvalue = ss.kstest(rvs=sample, cdf=fitted.cdf)
    print('The maximumdistance between CDFs is %.2f.' % stats, end='')
    print('The sample is Gumbel distributed for a significance level of %.2f' % pvalue)

##
## Using Anderson-Darling test
## The assumption regarding the distribution of the sample is rejected if the output value
## is larger than the critical values for the required significance level.
## For gumbel distributions, the critical values and significance levels are:
## [0.456, 0.612, 0.728, 0.843, 0.998]
## [25.0, 10.0, 5.0, 2.5, 1.0]
## I.e, for a sample to be assumed Gumbel distributed with a significant level of 25%,
Ahora usaremos lo que aprendimos arriba para "uniformizar" las marginales. Este diagrama conjunto suele ser cómo se visualizan las cópulas. ''' norm = stats.norm() x_unif = norm.cdf(x) h = sns.jointplot(x_unif[:, 0], x_unif[:, 1], kind='hex', stat_func=None) h.set_axis_labels('Y1', 'Y2', fontsize=16) #%% ''' Ahora solo transformamos los marginales nuevamente a lo que queremos (Gumbel y Beta): ''' m1 = stats.gumbel_l() m2 = stats.beta(a=10, b=2) x1_trans = m1.ppf(x_unif[:, 0]) x2_trans = m2.ppf(x_unif[:, 1]) h = sns.jointplot(x1_trans, x2_trans, kind='kde', xlim=(-6, 2), ylim=(.6, 1.0), stat_func=None) h.set_axis_labels('Maximum river level', 'Probablity of flooding', fontsize=16) #%% '''
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0 when the denominator is zero."""
    if denominator == 0:
        return 0
    return numerator / denominator


def _gumbel_l_likelihood(row, indexes, locs, scales):
    """Return the product of left-skewed Gumbel densities over the chosen attributes of row."""
    likelihood = 1
    for position, attr_index in enumerate(indexes):
        density = stats.gumbel_l(locs[position], scales[position]).pdf(row[attr_index])
        likelihood = likelihood * density
    return likelihood


def _print_metric_stats(label, values):
    """Print the mean, standard deviation and variance of one per-round metric."""
    print('MEAN ' + label + ': ' + str(round(np.mean(values), 5)))
    print('STD. DEV. OF ' + label + ': ' + str(round(np.std(values), 5)))
    print('VARIANCE OF ' + label + ': ' + str(round(np.var(values), 8)))


def case2(indexes=CASE_2_ATTRIBUTE_INDEXES, output=True):
    """Run the multi-attribute naive Bayes spam experiment with a Gumbel-left model.

    The data set is read from ``resources/normalized_data.csv``, shuffled
    and evaluated over ``NUMBER_OF_ROUNDS`` train/test splits.  In every
    round each attribute in ``indexes`` gets one left-skewed Gumbel density
    per class, and a test row is labelled spam (1) when its spam posterior
    is strictly greater than its ham posterior, otherwise ham (0).

    indexes: column indexes of the attributes used as features.  Any
        number of attributes is accepted.
    output: when true, print the mean, standard deviation and variance of
        every metric over all rounds.

    Returns None.
    """
    accuracy_in_each_turn = []
    precision_in_each_turn_spam = []
    recall_in_each_turn_spam = []
    precision_in_each_turn_ham = []
    recall_in_each_turn_ham = []

    # Use a context manager so the data file is closed once it is parsed.
    with open("resources/normalized_data.csv", "rb") as data_file:
        m = np.loadtxt(data_file, delimiter=',')

    shuffled = np.random.permutation(m)

    valid.validate_cross_validation(NUMBER_OF_ROUNDS, TRAIN_TEST_RATIO)

    # equiprobable priors
    prior_spam = 0.5
    prior_ham = 0.5

    for i in range(NUMBER_OF_ROUNDS):

        # we're using cross-validation so each iteration we take a different
        # slice of the data to serve as test set
        train_set, test_set = prep.split_sets(shuffled, TRAIN_TEST_RATIO, i)

        # parameter estimation, one value per selected attribute
        sample_means_word_spam = []
        sample_means_word_ham = []
        sample_variances_word_spam = []
        sample_variances_word_ham = []

        for attr_index in indexes:
            sample_means_word_spam.append(
                nb.take_mean_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_means_word_ham.append(
                nb.take_mean_ham(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_spam.append(
                nb.take_variance_spam(train_set, attr_index, SPAM_ATTR_INDEX))
            sample_variances_word_ham.append(
                nb.take_variance_ham(train_set, attr_index, SPAM_ATTR_INDEX))

        # sample standard deviations from sample variances; built as lists
        # because they are indexed below
        sample_std_devs_spam = [
            variance ** (1 / 2.0) for variance in sample_variances_word_spam]
        sample_std_devs_ham = [
            variance ** (1 / 2.0) for variance in sample_variances_word_ham]

        hits = 0.0
        misses = 0.0

        # number of instances correctly evaluated as spam
        correctly_is_spam = 0.0
        # total number of spam instances
        is_spam = 0.0
        # total number of instances evaluated as spam
        guessed_spam = 0.0

        # number of instances correctly evaluated as ham
        correctly_is_ham = 0.0
        # total number of ham instances
        is_ham = 0.0
        # total number of instances evaluated as ham
        guessed_ham = 0.0

        # now we test the hypothesis against the test set
        for row in test_set:

            # i.e. the product of all the conditional probabilities of the
            # words given the class, taken over every attribute in indexes.
            # NOTE(review): the sample mean and standard deviation are
            # passed positionally to stats.gumbel_l, i.e. as loc and scale,
            # which are not the mean and standard deviation of a Gumbel
            # distribution -- confirm this estimate is intended.
            product_of_all_conditional_probs_spam = _gumbel_l_likelihood(
                row, indexes, sample_means_word_spam, sample_std_devs_spam)
            # no need to divide by the normalization term since we only
            # want to know which posterior is larger
            posterior_spam = prior_spam * product_of_all_conditional_probs_spam

            product_of_all_conditional_probs_ham = _gumbel_l_likelihood(
                row, indexes, sample_means_word_ham, sample_std_devs_ham)
            posterior_ham = prior_ham * product_of_all_conditional_probs_ham

            # whichever is greater - that will be our prediction
            if posterior_spam > posterior_ham:
                guess = 1
            else:
                guess = 0

            if row[SPAM_ATTR_INDEX] == guess:
                hits += 1
            else:
                misses += 1

            # we'll use these to calculate metrics
            if row[SPAM_ATTR_INDEX] == 1:
                is_spam += 1
                if guess == 1:
                    guessed_spam += 1
                    correctly_is_spam += 1
                else:
                    guessed_ham += 1
            else:
                is_ham += 1
                if guess == 1:
                    guessed_spam += 1
                else:
                    guessed_ham += 1
                    correctly_is_ham += 1

        # accuracy = number of correctly evaluated instances /
        #            number of instances
        accuracy = hits / (hits + misses)

        # NOTE(review): the ratios below follow this file's own definitions
        # ("precision" divides by the number of actual instances of the
        # class, "recall" by the number of instances predicted as the
        # class).  That is the reverse of the usual definitions of
        # precision and recall -- confirm before comparing with other
        # tools.  Each ratio is 0 when its denominator is 0.
        precision_spam = _safe_ratio(correctly_is_spam, is_spam)
        recall_spam = _safe_ratio(correctly_is_spam, guessed_spam)
        precision_ham = _safe_ratio(correctly_is_ham, is_ham)
        recall_ham = _safe_ratio(correctly_is_ham, guessed_ham)

        accuracy_in_each_turn.append(accuracy)
        precision_in_each_turn_spam.append(precision_spam)
        recall_in_each_turn_spam.append(recall_spam)
        precision_in_each_turn_ham.append(precision_ham)
        recall_in_each_turn_ham.append(recall_ham)

    # summary statistics of every metric over all rounds
    if output:
        print("\033[1;32m")
        print('=============================================')
        print('CASE 2 - TEN ATTRIBUTES - USING GUMBEL LEFT MODEL')
        print('=============================================')
        print("\033[00m")
        _print_metric_stats('ACCURACY', accuracy_in_each_turn)
        print('')
        _print_metric_stats('PRECISION FOR SPAM', precision_in_each_turn_spam)
        print('')
        _print_metric_stats('RECALL FOR SPAM', recall_in_each_turn_spam)
        print('')
        _print_metric_stats('PRECISION FOR HAM', precision_in_each_turn_ham)
        print('')
        _print_metric_stats('RECALL FOR HAM', recall_in_each_turn_ham)