def _plot_histogram_array(self, dataDic, xlabel, displayMiddlePercentile=100,
                          outPercentiles=[1, 5, 10, 25, 75, 90, 95, 99],
                          name=""):
    '''
    Plot one histogram per key of ``dataDic`` on a shared subplot grid and
    return summary statistics for every dataset.

    Parameters
    ==========
    dataDic : dict
        Maps a key to the data set to plot. Each key is concatenated onto
        ``name`` to build the subplot title, so keys are expected to be
        strings.
    xlabel : str
        Forwarded to ``self._return_histogram`` for each subplot.
    displayMiddlePercentile : number
        Forwarded to ``self._return_histogram``; lets the caller plot only
        the middle percent of each distribution and ignore the tails.
    outPercentiles : iterable of number
        Sample percentiles to compute for each data set. The default list
        is only read, never mutated.
    name : str
        Prefix used in every subplot title.

    Returns
    =======
    dict
        ``{key: (mean, sd, percentile_values)}`` where ``percentile_values``
        is a float ndarray holding one value per entry of
        ``outPercentiles``, in the same order.
    '''
    nPlt = len(dataDic)

    # Derive the number of grid rows by repeatedly halving the plot count
    # (odd counts are rounded down to even first). Halving continues while
    # the remainder is even and above 1, or is still above 5.
    n = nPlt
    numRows = 1
    if n % 2 != 0 and n > 2:
        n -= 1
    while (n % 2 == 0 and n > 1) or n > 5:
        numRows += 1
        if n % 2 != 0:
            n -= 1
        n >>= 1

    # Enough columns to hold every plot: ceil(nPlt / numRows).
    numCols, leftover = divmod(nPlt, numRows)
    if leftover > 0:
        numCols += 1

    figSize = (8 if numCols <= 3 else 16, 14 if numRows > 3 else 6)

    # Materialise once so a one-shot iterable can be reused for every key.
    percentiles = list(outPercentiles)

    result = dict()
    self.pyplot_memcheck(nPlt)

    # Value returned by _return_histogram is handed back in on the next call
    # (presumably the shared figure handle -- confirm against the helper).
    f = None
    for position, (key, data) in enumerate(dataDic.items(), start=1):
        f = self._return_histogram(data, name + " " + key, position, numRows,
                                   numCols, xlabel, nPlt, f,
                                   displayMiddlePercentile, figSize)
        # Builtin float replaces the np.float alias removed in NumPy 1.24;
        # a single np.percentile call evaluates all percentiles at once.
        percentile_values = np.asarray(np.percentile(data, percentiles),
                                       dtype=float)
        result[key] = (mean(data), sd(data), percentile_values)

    plt.subplots_adjust(hspace=1.0, wspace=0.4)
    self.save_all_figs()
    return result
def HybridSemiParametricGPDPDF(xs, u, ydata, shape, loc, scale):
    '''
    Evaluate a hybrid density at every point of ``xs``: a kernel-smoothed
    PDF between two thresholds and generalised Pareto (GPD) tails outside
    them.

    The thresholds are placed symmetrically around the mean of ``ydata``,
    at distance ``abs(u - mean)`` on either side.

    Params:
        xs: unsorted points at which the density is evaluated.
        u: threshold at which evaluation switches from the kernel estimate
           to the GPD tail; mirrored around the mean of ``ydata``.
        ydata: data passed to the kernel estimators and used for the mean.
        shape: GPD shape parameter.
        loc: GPD location parameter.
        scale: GPD scale parameter.

    Returns:
        A tuple ``(xs, out, srtdxs, pdf_smoother, bandwidth)`` where
        ``out`` holds the density for each element of ``xs`` (same order),
        ``srtdxs`` is the sorted evaluation grid (``xs`` plus the two
        thresholds), ``pdf_smoother`` is the kernel PDF output on that grid
        and ``bandwidth`` is the value returned by the kernel PDF helper.
    '''
    centre = mean(ydata)
    lower = centre - abs(u - centre)
    upper = centre + abs(u - centre)

    # The thresholds are added to the grid so the CDF can be looked up at
    # exactly those points when weighting the tails.
    grid = sorted(list(xs) + [lower, upper])

    cdf_values, fitted_bw = kde_statsmodels_m_cdf_output(
        ydata, grid, bandwidth=0.2)
    cdf_at = dict(zip(grid, cdf_values))

    # The bandwidth returned by the CDF helper is fed to the PDF helper.
    pdf_values, fitted_bw = kde_statsmodels_m_pdf_output(
        ydata, grid, bandwidth=fitted_bw)
    pdf_at = dict(zip(grid, pdf_values))

    def density(point):
        # Lower tail: GPD on the distance below the lower threshold,
        # weighted by the smoothed CDF at that threshold.
        if point < lower:
            return cdf_at[lower] * genpareto.pdf(
                lower - point, shape, loc=loc, scale=scale)
        # Upper tail: GPD on the distance above the upper threshold,
        # weighted by the smoothed survival mass at that threshold.
        if point >= upper:
            return (1 - cdf_at[upper]) * genpareto.pdf(
                point - upper, shape, loc=loc, scale=scale)
        # Body: kernel-smoothed PDF looked up on the grid.
        return pdf_at[point]

    densities = [density(point) for point in xs]
    return xs, densities, grid, pdf_values, fitted_bw
def _QQPlot(self, rv, name):
    '''
    Plot standardised sample quantiles against standard normal quantiles to
    check the normality of a sample visually.

    Two curves are drawn on a new 10x6 figure against the same plotting
    positions ``k / (n + 1)`` for ``k = 1..n``: the standard normal
    quantiles at those positions, and the sorted sample after subtracting
    its mean and dividing by its standard deviation.

    Parameters
    ==========
    rv : iterable of float
        Sample to be tested for normality.
    name : str
        Used as window title, figure label, legend entry and plot title
        (the title is split onto a second line after 70 characters).
    '''
    self.pyplot_memcheck()

    rvs = sorted(rv)
    scaled_rvs = (np.array(rvs) - mean(rvs)) / sd(rvs)
    n = len(rvs)
    # Plotting positions strictly inside (0, 1) so norm.ppf stays finite.
    ntiles = np.arange(1, n + 1) / (n + 1)
    normLn = norm.ppf(ntiles)

    fig = plt.figure(figsize=(10, 6))
    # FigureCanvasBase.set_window_title was deprecated in Matplotlib 3.4 and
    # later removed; the figure manager owns the window title. Canvases
    # without a manager have no window to title.
    manager = getattr(fig.canvas, "manager", None)
    if manager is not None:
        manager.set_window_title(name)
    fig.set_label(name)

    plt.subplot(1, 1, 1)
    plt.title(name[:70] + '\n' + name[70:])
    plt.plot(normLn, ntiles)
    plt.plot(scaled_rvs, ntiles)
    plt.legend(['normal', name], loc='best')
    plt.grid(True)