def dPDF(pts, mu, sigma, distribuition, outlier=0, data=0, n=10, seed=None):
    import numpy as np
    from scipy.interpolate import interp1d
    from distAnalyze import dpdf, mediaMovel
    from scipy.stats import norm, lognorm

    eps = 5e-5
    ngrid = int(1e6)

    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
        if not data:
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = dpdf(x, mu, sigma, distribuition)
        else:
            # `seed` is a full RNG state tuple, as returned by np.random.get_state()
            np.random.set_state(seed)
            d = np.random.normal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            # `normed` was removed from np.histogram; `density` is its replacement
            y, x = np.histogram(d, bins='fd', density=True)
            x = np.mean(np.array([x[:-1], x[1:]]), 0)  # bin centers
            y = abs(np.diff(mediaMovel(y, n)))  # |first difference| of the smoothed histogram
            x = x[:-1] + np.diff(x)[0] / 2
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier
        if not data:
            inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=np.exp(mu))
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = dpdf(x, mu, sigma, distribuition)
        else:
            np.random.set_state(seed)
            d = np.random.lognormal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            y, x = np.histogram(d, bins='fd', density=True)
            x = np.mean(np.array([x[:-1], x[1:]]), 0)
            y = abs(np.diff(mediaMovel(y, n)))
            x = x[:-1] + np.diff(x)[0] / 2

    # Renormalize so the discrete curve integrates to 1, then build its CDF
    y = y / (np.diff(x)[0] * sum(y))
    cdf = np.cumsum(y)
    cdf = cdf / max(cdf)

    # Invert the CDF: equally spaced probabilities map to abscissas
    # concentrated where |f'| is large
    interp = interp1d(cdf, x, fill_value='extrapolate')
    Y = np.linspace(eps, 1 - eps, pts)
    X = interp(Y)
    return X, Y
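# A minimal usage sketch for dPDF (assumes the companion distAnalyze module,
# which provides dpdf and mediaMovel, is importable). The analytic branch
# (data=0) needs no RNG state; the returned abscissas cluster where |f'| of
# the normal PDF is large.
X_demo, Y_demo = dPDF(pts=50, mu=0.0, sigma=1.0, distribuition='normal')
print(len(X_demo), Y_demo[0], Y_demo[-1])  # 50 points; Y spans (eps, 1 - eps)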
def logs(sigma=1, mu=0, area=0.9999):
    from scipy.stats import lognorm
    import numpy as np
    import matplotlib.pyplot as plt
    from numpy import exp

    scale = median = exp(mu)
    shape = sigma
    mode = exp(mu - shape**2)
    mean = exp(mu + shape**2 / 2)

    a, b = lognorm.interval(area, shape, loc=0, scale=scale)
    x = np.linspace(a, b, 1000000)
    pdf = lognorm.pdf(x, shape, loc=0, scale=scale)

    plt.figure(figsize=(12, 8), dpi=200)
    plt.plot(x, pdf, label=r'PDF ($\sigma = %.2f$)' % shape)
    plt.vlines(mode, 0, pdf.max(), linestyle=':', label='Mode = %.2f' % mode)
    plt.vlines(mean, 0, lognorm.pdf(mean, shape, loc=0, scale=scale),
               color='green', linestyle='--', label='Mean = %.2f' % mean)
    plt.vlines(median, 0, lognorm.pdf(median, shape, loc=0, scale=scale),
               color='blue', label='Median = %.2f' % median)
    plt.legend(loc=1, prop={'size': 18})
    plt.xlabel('x', fontsize=45)
    plt.ylabel('Probability density', fontsize=45)
    plt.xticks(size=18)
    plt.yticks(size=18)
    plt.tight_layout()
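# Usage sketch for logs: plot the lognormal PDF with its mode, median and
# mean marked (for any sigma > 0 these satisfy mode < median < mean).
import matplotlib.pyplot as plt

logs(sigma=0.75, mu=0)
plt.show()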
def plot_prob_density(mu, la, predsData, testData, xmin, xmax):
    import math
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.stats import lognorm

    fig, axes = plt.subplots(1, 1, figsize=(5, 4), sharey=True, dpi=120)
    font = "Times New Roman"

    # Lognormal PDF; note the parentheses: the whole product
    # x * la * sqrt(2*pi) belongs in the denominator
    f3 = lambda x, mu, la: 1 / (x * la * (2 * math.pi)**0.5) * np.exp(
        -(np.log(x) - mu)**2 / (2 * la**2))

    x2 = np.linspace(0, xmax, 300)
    axes.plot(x2, f3(x2, mu, la))
    ymin, ymax = axes.get_ylim()

    # Central 95% and 68% intervals of the fitted lognormal
    x_bounds = lognorm.interval(0.95, s=la, scale=np.exp(mu))
    x_bounds_std = lognorm.interval(0.68, s=la, scale=np.exp(mu))

    axes.axvline(x=testData.sum(), color='red', linestyle=':')
    axes.axvline(x=x_bounds[0], color='blue', alpha=0.3, linestyle=':')
    axes.axvline(x=x_bounds[1], color='blue', alpha=0.3, linestyle=':')

    xfill = np.linspace(x_bounds[0], x_bounds[1], 100)
    xfill_std = np.linspace(x_bounds_std[0], x_bounds_std[1], 100)
    axes.fill_between(xfill, f3(xfill, mu, la), alpha=0.1, color='blue')
    axes.fill_between(xfill_std, f3(xfill_std, mu, la), alpha=0.1, color='blue')

    axes.text(x=testData.sum() + 1, y=.03 * ymax,
              s='Actual: ' + str(int(testData.sum())), color='red')
    axes.set_xlabel('Number of days exceeding threshold',
                    fontname=font, fontweight="heavy", fontsize=12)
    axes.set_ylabel('Probability density function (-)',
                    fontname=font, fontweight="heavy", fontsize=12)
    axes.set_ylim(0, ymax)
    axes.set_xlim(0, xmax)
    for label in axes.get_xticklabels() + axes.get_yticklabels():
        label.set_fontname(font)
    fig.show()

    print('**********************************')
    print('Expected number of days exceeding thermal comfort criteria: '
          + str(round(lognorm.mean(s=la, scale=np.exp(mu)), 1)) + ' +/- '
          + str(round(lognorm.std(s=la, scale=np.exp(mu)), 1)))
    print('Most likely number of days exceeding thermal comfort criteria: '
          + str(round(np.exp(mu - la**2))) + ' +/- '
          + str(round(lognorm.std(s=la, scale=np.exp(mu)), 1)))
    print('Predicted number of days exceeding thermal comfort criteria (deterministic): '
          + str(int(np.sum(predsData))))
    print('Actual number of days exceeding thermal comfort criteria: '
          + str(int(testData.sum())))
    print('**********************************')

    from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
    # sklearn metrics expect (y_true, y_pred), so the observed data comes first
    acc_score = accuracy_score(testData, predsData)
    prec_score = precision_score(testData, predsData)
    rec_score = recall_score(testData, predsData)
    auc_score = roc_auc_score(testData, predsData)
    print("Test Accuracy score: ", acc_score)
    print("Test Precision score: ", prec_score)
    print("Test Recall score: ", rec_score)
    print("Test ROC AUC score: ", auc_score)
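# Usage sketch for plot_prob_density with made-up inputs: binary day-by-day
# exceedance indicators for one year, and lognormal parameters (mu, la) as
# they would come from a fit elsewhere. All values are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
testData_demo = rng.integers(0, 2, 365)
predsData_demo = rng.integers(0, 2, 365)
plot_prob_density(mu=3.0, la=0.4, predsData=predsData_demo,
                  testData=testData_demo, xmin=0, xmax=60)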
def ddPDF(pts, mu, sigma, distribuition, outlier=0, data=0, n=10, seed=None):
    import numpy as np
    from scipy.interpolate import interp1d
    from distAnalyze import ddpdf, mediaMovel
    from scipy.stats import norm, lognorm
    from someFunctions import ash

    eps = 5e-5
    ngrid = int(1e6)

    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
        if not data:
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = ddpdf(x, mu, sigma, distribuition)
        else:
            # `seed` is a full RNG state tuple, as returned by np.random.get_state()
            np.random.set_state(seed)
            d = np.random.normal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            x, y = ash(d)  # averaged shifted histogram estimate
            y = abs(np.diff(y, 2))  # |second finite difference|
            # Alternative with moving-average smoothing:
            # y = abs(np.diff(mediaMovel(y, n), 2))
            x = x[:-2] + np.diff(x)[0]
            y = y / (np.diff(x)[0] * sum(y))
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier
        inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=np.exp(mu))
        # Replace the lower bound by the quantile whose probability equals
        # the PDF value at `sup`
        inf = lognorm.pdf(sup, sigma, loc=0, scale=np.exp(mu))
        inf = lognorm.ppf(inf, sigma, loc=0, scale=np.exp(mu))
        if not data:
            x = np.linspace(inf - outlier_inf, sup + outlier_sup, ngrid)
            y = ddpdf(x, mu, sigma, distribuition)
        else:
            np.random.set_state(seed)
            d = np.random.lognormal(mu, sigma, data)
            x, y = ash(d)
            y = y[x < sup]
            x = x[x < sup]
            y = abs(np.diff(y, 2))
            # y = abs(np.diff(mediaMovel(y, n), 2))
            x = x[:-2] + np.diff(x)[0]
            y = y / (np.diff(x)[0] * sum(y))

    cdf = np.cumsum(y)
    cdf = cdf / max(cdf)

    interp = interp1d(cdf, x, fill_value='extrapolate')
    Y = np.linspace(eps, 1 - eps, pts)
    X = interp(Y)
    return X, Y
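# Usage sketch for ddPDF, analogous to dPDF (assumes distAnalyze and
# someFunctions are importable): points concentrate where |f''| is large.
X_demo2, Y_demo2 = ddPDF(pts=50, mu=0.0, sigma=1.0, distribuition='normal')
print(len(X_demo2))  # 50 abscissas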
def PDF(pts, mu, sigma, distribuition, outlier=0, data=0, seed=None):
    from scipy.stats import norm, lognorm
    import numpy as np
    from scipy.interpolate import interp1d
    from someFunctions import ash

    eps = 5e-5

    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
        if not data:
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)
            # Left half: invert the PDF between the lower tail and the mode
            X1 = np.linspace(inf - outlier, mu, int(1e6))
            Y1 = norm.pdf(X1, loc=mu, scale=sigma)
            interp = interp1d(Y1, X1)
            y1 = np.linspace(Y1[0], Y1[-1], pts // 2 + 1)
            x1 = interp(y1)

            # Right half: mirror the same PDF heights past the mode
            X2 = np.linspace(mu, sup + outlier, int(1e6))
            Y2 = norm.pdf(X2, loc=mu, scale=sigma)
            interp = interp1d(Y2, X2)
            y2 = np.flip(y1, 0)
            x2 = interp(y2)
        else:
            # `seed` is a full RNG state tuple, as returned by np.random.get_state()
            np.random.set_state(seed)
            d = np.random.normal(mu, sigma, data)
            inf, sup = min(d) - outlier_inf, max(d) + outlier_sup
            xest, yest = ash(d)  # averaged shifted histogram estimate
            xest = np.mean(np.array([xest[:-1], xest[1:]]), 0)
            M = np.where(yest == max(yest))[0][0]
            m = np.where(yest == min(yest))[0][0]
            interpL = interp1d(yest[:M + 1], xest[:M + 1],
                               assume_sorted=False, fill_value='extrapolate')
            interpH = interp1d(yest[M:], xest[M:],
                               assume_sorted=False, fill_value='extrapolate')
            y1 = np.linspace(yest[m] + eps, yest[M], pts // 2 + 1)
            x1 = interpL(y1)
            y2 = np.flip(y1, 0)
            x2 = interpH(y2)
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier
        inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=np.exp(mu))
        # Replace the lower bound by the quantile whose probability equals
        # the PDF value at `sup`
        inf = lognorm.pdf(sup, sigma, loc=0, scale=np.exp(mu))
        inf = lognorm.ppf(inf, sigma, loc=0, scale=np.exp(mu))
        if not data:
            mode = np.exp(mu - sigma**2)
            X1 = np.linspace(inf - outlier_inf, mode, int(1e6))
            Y1 = lognorm.pdf(X1, sigma, loc=0, scale=np.exp(mu))
            interp = interp1d(Y1, X1)
            y1 = np.linspace(Y1[0], Y1[-1], pts // 2 + 1)
            x1 = interp(y1)

            X2 = np.linspace(mode, sup + outlier_sup, int(1e6))
            Y2 = lognorm.pdf(X2, sigma, loc=0, scale=np.exp(mu))
            interp = interp1d(Y2, X2)
            y2 = np.flip(y1, 0)
            x2 = interp(y2)
        else:
            np.random.set_state(seed)
            d = np.random.lognormal(mu, sigma, data)
            xest, yest = ash(d)
            yest = yest[xest < sup]
            xest = xest[xest < sup]
            M = np.where(yest == max(yest))[0][0]
            m = np.where(yest == min(yest))[0][0]
            interpL = interp1d(yest[:M + 1], xest[:M + 1], fill_value='extrapolate')
            interpH = interp1d(yest[M:], xest[M:])
            y1 = np.linspace(yest[m] + eps, yest[M], pts // 2 + 1)
            x1 = interpL(y1)
            y2 = np.flip(y1, 0)
            x2 = interpH(y2)

    # Join both halves, dropping the duplicated mode point
    X = np.concatenate([x1[:-1], x2])
    Y = np.concatenate([y1[:-1], y2])
    return X, Y
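# Usage sketch for PDF: with an odd pts the two half-curves share the mode
# point, so exactly pts samples come back, placed at equal PDF heights.
X_demo3, Y_demo3 = PDF(pts=51, mu=0.0, sigma=1.0, distribuition='normal')
print(len(X_demo3), len(Y_demo3))  # 51, 51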
def diffArea(nest, outlier=0, data=0, kinds='all', axis='probability', ROI=20,
             mu=0, sigma=1, weight=False, interpolator='linear',
             distribuition='normal', seed=None, plot=True):
    """
    Return the error area between the analytic PDF of a distribution and a
    discretized estimate of it.

    Parameters
    ----------
    nest: int
        The number of estimation points.
    outlier: int, optional
        Position of an outlier event, e.g. outlier = 50 puts an event at
        -50 and +50 if mu = 0. Default is 0.
    data: int, optional
        If data > 0, random samples of that size are used instead of the
        analytic curve. Default is 0.
    kinds: str or array, optional
        The kind of discretization to analyze ('Linspace', 'CDFm', 'PDFm',
        'iPDF1', 'iPDF2', 'all'). Default is 'all'.
    axis: str, optional
        The x axis to analyze ('probability', 'derivative',
        '2nd_derivative', 'X'). Default is 'probability'.
    ROI: int, optional
        The number of regions of interest. Default is 20.
    mu: int, optional
        The mean of the distribution. Default is 0.
    sigma: int, optional
        The standard deviation of the distribution. Default is 1.
    weight: bool, optional
        If True, each ROI is weighted by the number of estimation points it
        contains. Default is False.
    interpolator: str, optional
        The kind of interpolation as a string ('linear', 'nearest', 'zero',
        'slinear', 'quadratic', 'cubic', where 'zero', 'slinear', 'quadratic'
        and 'cubic' refer to a spline interpolation of zeroth, first, second
        or third order) or as an integer specifying the order of the spline
        interpolator to use. Default is 'linear'.
    distribuition: str, optional
        The distribution to analyze ('normal', 'lognormal').
        Default is 'normal'.
    seed: tuple, optional
        A NumPy RNG state, as returned by np.random.get_state(), used to
        reproduce the sampled data. Default is None.
    plot: bool, optional
        If True, the analysis is plotted. Default is True.

    Returns
    -------
    area, [probROIord, areaROIord]: ndarray and list of dict
        The total error area per kind, plus the per-ROI 'x' (probability)
        and 'y' (error area) values keyed by kind.
""" import numpy as np from scipy.stats import norm, lognorm from scipy.interpolate import interp1d from numpy import exp import matplotlib.pyplot as plt from statsmodels.distributions import ECDF from distAnalyze import pdf, dpdf, ddpdf, PDF, dPDF, ddPDF area = [] n = [] data = int(data) if distribuition == 'normal': outlier_inf = outlier_sup = outlier elif distribuition == 'lognormal': outlier_inf = 0 outlier_sup = outlier ngrid = int(1e6) truth = pdf if axis == 'probability': truth1 = pdf elif axis == 'derivative': truth1 = dpdf elif axis == '2nd_derivative': truth1 = ddpdf elif axis == 'X': truth1 = lambda x, mu, sigma, distribuition: x #else: return 'No valid axis' probROIord = {} areaROIord = {} div = {} if seed is not None: np.random.set_state(seed) if data: if distribuition == 'normal': d = np.random.normal(mu, sigma, data) elif distribuition == 'lognormal': d = np.random.lognormal(mu, sigma, data) if kinds == 'all': kinds = ['Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2'] elif type(kinds) == str: kinds = [kinds] for kind in kinds: if distribuition == 'normal': inf, sup = norm.interval(0.9999, loc=mu, scale=sigma) elif distribuition == 'lognormal': inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=exp(mu)) inf = lognorm.pdf(sup, sigma, loc=0, scale=np.exp(mu)) inf = lognorm.ppf(inf, sigma, loc=0, scale=np.exp(mu)) xgrid = np.linspace(inf, sup, ngrid) xgridROI = xgrid.reshape([ROI, ngrid // ROI]) dx = np.diff(xgrid)[0] if kind == 'Linspace': if not data: xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest) else: if distribuition == 'normal': #d = np.random.normal(loc = mu, scale = sigma, size = data) inf, sup = min(d), max(d) xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest) elif distribuition == 'lognormal': #d = np.random.lognormal(mean = mu, sigma = sigma, size = data) inf, sup = min(d), max(d) xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest) yest = pdf(xest, mu, sigma, distribuition) elif kind == 'CDFm': eps = 5e-5 yest = np.linspace(0 + eps, 1 - eps, nest) if distribuition == 'normal': if not data: xest = norm.ppf(yest, loc=mu, scale=sigma) yest = pdf(xest, mu, sigma, distribuition) else: #d = np.random.normal(loc = mu, scale = sigma, size = data) ecdf = ECDF(d) inf, sup = min(d), max(d) xest = np.linspace(inf, sup, data) yest = ecdf(xest) interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest') yest = np.linspace(eps, 1 - eps, nest) xest = interp(yest) elif distribuition == 'lognormal': if not data: xest = lognorm.ppf(yest, sigma, loc=0, scale=exp(mu)) yest = pdf(xest, mu, sigma, distribuition) else: #d = np.random.lognormal(mean = mu, sigma = sigma, size = data) ecdf = ECDF(d) inf, sup = min(d), max(d) xest = np.linspace(inf, sup, nest) yest = ecdf(xest) interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest') yest = np.linspace(eps, 1 - eps, nest) xest = interp(yest) elif kind == 'PDFm': xest, yest = PDF(nest, mu, sigma, distribuition, outlier, data, seed) elif kind == 'iPDF1': xest, yest = dPDF(nest, mu, sigma, distribuition, outlier, data, 10, seed) elif kind == 'iPDF2': xest, yest = ddPDF(nest, mu, sigma, distribuition, outlier, data, 10, seed) YY = pdf(xest, mu, sigma, distribuition) fest = interp1d(xest, YY, kind=interpolator, bounds_error=False, fill_value=(YY[0], YY[-1])) #fest = lambda x: np.concatenate([fest1(x)[fest1(x) != -1],np.ones(len(fest1(x)[fest1(x) == -1]))*fest1(x)[fest1(x) != -1][-1]]) yestGrid = [] ytruthGrid = [] ytruthGrid2 = [] divi = [] for i in range(ROI): 
            yestGrid.append([fest(xgridROI[i])])
            ytruthGrid.append([truth(xgridROI[i], mu, sigma, distribuition)])
            ytruthGrid2.append([truth1(xgridROI[i], mu, sigma, distribuition)])
            # Number of estimation points that fall inside this ROI
            divi.append(len(np.intersect1d(
                np.where(xest >= min(xgridROI[i]))[0],
                np.where(xest < max(xgridROI[i]))[0])))

        diff2 = np.concatenate(abs((np.array(yestGrid) - np.array(ytruthGrid)) * dx))
        areaROI = np.sum(diff2, 1)

        divi = np.array(divi)
        divi[divi == 0] = 1

        try:
            probROI = np.mean(np.sum(ytruthGrid2, 1), 1)
        except Exception:
            probROI = np.mean(ytruthGrid2, 1)

        probROIord[kind] = np.sort(probROI)
        index = np.argsort(probROI)
        areaROIord[kind] = areaROI[index]

        area = np.append(area, np.sum(areaROIord[kind]))
        n = np.append(n, len(probROIord[kind]))
        div[kind] = divi[index]

        if plot:
            if weight:
                plt.plot(probROIord[kind], areaROIord[kind] * div[kind], '-o',
                         label=kind, ms=3)
            else:
                plt.plot(probROIord[kind], areaROIord[kind], '-o',
                         label=kind, ms=3)
            plt.yscale('log')
            plt.xlabel(axis)
            plt.ylabel('Error')
            plt.legend()

    return area, [probROIord, areaROIord]
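# End-to-end sketch for diffArea (assumes the distAnalyze helpers above are
# all importable): compare the five discretization kinds on an analytic
# standard normal and print the total error area per kind.
import matplotlib.pyplot as plt

area_demo, (probROIord_demo, areaROIord_demo) = diffArea(
    nest=100, distribuition='normal', kinds='all', plot=True)
print(dict(zip(['Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2'], area_demo)))
plt.show()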
from Bio import SeqIO
from scipy.stats import lognorm

sigma, loc, scale = lognorm.fit(data, floc=0)
# print(sigma, loc, scale)
# print(lognorm.mean(sigma, loc=loc, scale=scale))
read_length_count = 0

# Alternative background estimate (unused):
# y_value = lognorm.pdf(data, sigma, loc, scale)
# background = np.median(y_value)

# Interval that contains the central 50% of the fitted distribution
end_point = lognorm.interval(0.5, sigma, loc, scale)
print(end_point)

# Density of a homogeneous (uniform) distribution over that interval,
# used as a comparable reference
background = 0.5 / (end_point[1] - end_point[0])
print(background)

record_dict = SeqIO.index("D:/Data/20161125/filtered_subreads_first1k.fastq", "fastq")
target_seq = []
i = 0
id_list = list(record_dict.keys())
seq_num = len(id_list)

while read_length_count <= target_read_length:
    print(read_length_count)
    if i == seq_num:
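# Sanity check of the uniform reference (made-up fit parameters, for
# illustration only): the central 50% of the mass spread evenly over the
# interval has density 0.5 / width, which can be compared against the
# fitted PDF at the interval bounds.
from scipy.stats import lognorm

sigma_demo, loc_demo, scale_demo = 0.6, 0.0, 200.0
lo, hi = lognorm.interval(0.5, sigma_demo, loc_demo, scale_demo)
print(0.5 / (hi - lo))  # flat reference density
print(lognorm.pdf([lo, hi], sigma_demo, loc_demo, scale_demo))  # PDF at the bounds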
import csv

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import lognorm

data2 = []
with open('datafile2.csv') as csvfile2:
    reader = csv.reader(csvfile2)
    for row in reader:
        data2.append(float(row[0]))

plt.figure()
_ = plt.hist(data2, bins=100)

# Parameter estimates for the data; floc=0 fixes the location so it is not
# treated as a free parameter
shape1, loc1, scale1 = lognorm.fit(data2, floc=0)
mu1 = np.log(scale1)
sigma1 = shape1
print("Estimated mu = " + str(mu1))
print("Estimated sigma = " + str(sigma1))

# 0.95 is the confidence level: the central 95% interval, which corresponds
# to about +/- 1.96 standard deviations around the mean in log-space
ci1 = lognorm.interval(0.95, s=sigma1, loc=loc1, scale=scale1)
print("Lognorm function CI = " + str(ci1))

# confidence interval left line
one_x12, one_y12 = [ci1[0], ci1[0]], [0, 20]
# confidence interval right line
two_x12, two_y12 = [ci1[1], ci1[1]], [0, 20]

plt.plot(one_x12, one_y12, two_x12, two_y12, marker='o')
plt.title("Lognormal distribution confidence interval")
plt.show()
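# Cross-check (sketch): with floc=0 the fitted model is exp(N(mu1, sigma1^2)),
# so the central 95% interval has the closed form exp(mu1 +/- 1.96*sigma1)
# and should agree with lognorm.interval up to the rounding in 1.96.
ci_manual = (np.exp(mu1 - 1.96 * sigma1), np.exp(mu1 + 1.96 * sigma1))
print("Manual CI = " + str(ci_manual))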
from typing import Tuple

import numpy as np
from scipy.stats import lognorm


def log_normal_distribution(
        radius_g: float, sigma_g: float,
        n_bins: int) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Function for returning a log-normal size distribution. See Eq. 9 in
    Ackerman & Marley (2001).

    Parameters
    ----------
    radius_g : float
        Mean geometric radius (um).
    sigma_g : float
        Geometric standard deviation (dimensionless).
    n_bins : int
        Number of logarithmically-spaced radius bins.

    Returns
    -------
    np.ndarray
        Number of grains in each radius bin, normalized to a total of 1 grain.
    np.ndarray
        Widths of the radius bins (um).
    np.ndarray
        Grain radii (um).
    """
    if sigma_g == 1.0:
        # The log-normal distribution reduces to a delta
        # function when sigma_g = 1
        radii = np.array([radius_g])
        r_width = np.array([np.nan])
        dn_grains = np.array([1.0])
    else:
        # Get the radius interval which contains 99.999%
        # of the distribution
        interval = lognorm.interval(1.0 - 1e-5, np.log(sigma_g),
                                    loc=0.0, scale=radius_g)

        # Create bin boundaries (um); n_bins+1 boundaries
        # enclose n_bins bins
        r_bins = np.logspace(np.log10(interval[0]),
                             np.log10(interval[1]), n_bins + 1)

        # Width of the radius bins (um)
        r_width = np.diff(r_bins)

        # Grain radii (um) at which the size distribution is sampled
        radii = (r_bins[1:] + r_bins[:-1]) / 2.0

        # Number of grains per radius bin width, normalized to an
        # integrated value of 1 grain, i.e. np.sum(dn_dr*r_width) = 1.
        # The log-normal distribution from Ackerman & Marley (2001)
        # matches scipy.stats.lognorm.pdf with s = log(sigma_g)
        # and scale = radius_g
        dn_dr = lognorm.pdf(radii, s=np.log(sigma_g), loc=0.0, scale=radius_g)

        # Number of grains in each radius bin
        dn_grains = dn_dr * r_width

    return dn_grains, r_width, radii
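# Usage sketch for log_normal_distribution: 30 bins for grains with a 0.5 um
# geometric-mean radius and sigma_g = 2 (made-up values). Since the bins span
# 99.999% of the distribution, the grain numbers should sum to about 1.
dn_demo, width_demo, radii_demo = log_normal_distribution(0.5, 2.0, 30)
print(np.sum(dn_demo))  # ~1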