import numpy as np
from scipy.stats import pareto


def ParetoDiscrete(eta, x_m=1, p=0.95, N=8):
    # Discretize a Pareto(eta, scale=x_m) law on a grid spanning
    # [x_m, q_p], where q_p is the p-quantile.
    q_p = x_m / (1 - p)**(1 / eta)
    grid = np.linspace(x_m, q_p, N + 2)
    probGrid = np.empty(N + 2)
    for i, x in enumerate(grid):
        if i == 0:
            probGrid[i] = pareto.cdf((grid[i + 1] + x) / 2, b=eta, scale=x_m)
        elif i == len(grid) - 1:
            probGrid[i] = 1 - pareto.cdf((grid[i - 1] + x) / 2, b=eta, scale=x_m)
        else:
            probGrid[i] = pareto.cdf((grid[i + 1] + x) / 2, b=eta, scale=x_m) - \
                          pareto.cdf((grid[i - 1] + x) / 2, b=eta, scale=x_m)
    return [grid, probGrid]
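# A minimal usage sketch of ParetoDiscrete (relies on the imports above):
# the end bins absorb the tails below the first and above the last
# midpoint, so the discrete probabilities sum to 1 exactly.
grid, probs = ParetoDiscrete(eta=2.0, x_m=1.0, p=0.95, N=8)
print(grid)          # 10 support points from x_m up to the 95% quantile
print(probs.sum())   # 1.0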
def pareto_dcdf(x, d, b=1, scale=1):
    """
    d-th derivative of the cumulative distribution function at x of the given RV.

    :param x: array_like, quantiles
    :param d: positive integer, derivative order of the cumulative distribution function
    :param b: positive number, shape parameter (default=1)
    :param scale: positive number, scale parameter (default=1)
    :return: array_like
        If d = 0: the cumulative distribution function evaluated at x.
        If d = 1: the probability density function evaluated at x.
        If d >= 2: the (d-1)-th density derivative evaluated at x.
    """
    if scale <= 0 or b <= 0:
        raise ValueError("The scale and shape parameters must be positive numbers.")
    if d == 0:
        output = pareto.cdf(x, b, scale=scale)
    else:
        # Closed-form d-th derivative of F(x) = 1 - (x/scale)**(-b) for x >= scale;
        # b + range(d) fails in Python 3, so use np.arange for the falling product.
        output = np.where(scale <= x,
                          -(-1 / scale)**d * (x / scale)**(-b - d) * np.prod(b + np.arange(d)),
                          0)
    return output
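# Hedged sanity check for pareto_dcdf (relies on numpy and
# scipy.stats.pareto being imported as above): the d=1 branch should
# reproduce the pdf.
x = np.linspace(1.0, 5.0, 5)
print(pareto_dcdf(x, d=1, b=2.5, scale=1.0))
print(pareto.pdf(x, 2.5, scale=1.0))   # should match the line above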
def KS_MC(a, n_events, n_draws=10000):
    """
    Run MC trials of computing KS D values for data drawn from a power law
    with cumulative index a.
    """
    D = []
    for _ in range(n_draws):
        rvs = pareto.rvs(a, size=n_events)
        aML = ML_index_analytic(rvs, 1.)
        cdf = lambda x: pareto.cdf(x, aML)
        D.append(kstest(rvs, cdf)[0])
    return np.sort(D)
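# Possible use of KS_MC (a sketch; ML_index_analytic and kstest are
# assumed to come from elsewhere in this module and scipy.stats): the
# sorted D values approximate the null distribution of the KS statistic,
# so the 95th percentile is a 5%-level critical value.
D_null = KS_MC(a=1.5, n_events=50, n_draws=2000)
print(np.percentile(D_null, 95))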
import math


def kolTestPareto(P, k):
    emp_dist_x = np.sort(P)
    n = len(emp_dist_x)
    emp_dist = np.arange(1, n + 1) / n
    true_dist = pareto.cdf(emp_dist_x, b=k, loc=0)
    #plt.plot(emp_dist_x, emp_dist)
    #plt.plot(emp_dist_x, true_dist)
    #plt.show()
    D_n = np.max(np.abs(emp_dist - true_dist))
    # Stephens' finite-sample correction: (sqrt(n) + 0.12 + 0.11/sqrt(n)) * D_n
    Adjust_D_n = (math.sqrt(n) + 0.12 + 0.11 / math.sqrt(n)) * D_n
    return Adjust_D_n
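# Sketch of a call (assumes numpy, math and scipy.stats.pareto as above):
# for data that really are Pareto(k), the adjusted statistic should fall
# below Stephens' 5% critical value of about 1.358 most of the time.
sample = pareto.rvs(2.0, size=10000)
print(kolTestPareto(sample, k=2.0))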
def dispatch_cdf(data, alpha, xmin, xmax, discrete):
    if discrete:
        if np.isinf(xmax):
            cdf = genzipf.cdf(data, alpha, xmin)
        else:
            cdf = truncated_zipf.cdf(data, alpha, xmin, xmax)
    else:
        if np.isinf(xmax):
            cdf = pareto.cdf(data, alpha - 1, scale=xmin)
        else:
            cdf = truncated_pareto.cdf(data, alpha - 1, float(xmax) / xmin,
                                       scale=xmin)
    return cdf
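# Why alpha - 1: these fitting helpers parameterize the power law by its
# density exponent, p(x) ~ x**(-alpha), while scipy's pareto uses the
# survival exponent b, S(x) = (x/scale)**(-b), i.e. p(x) ~ x**(-b-1),
# hence b = alpha - 1. A quick numeric check using only scipy:
from scipy.stats import pareto
alpha, xmin, x = 2.5, 1.0, 3.0
print(pareto.cdf(x, alpha - 1, scale=xmin))
print(1 - (x / xmin)**(-(alpha - 1)))   # closed form, should match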
def _generate_KS_cube():
    """
    Generate a grid of D values for KS tests of power-law behavior.
    """
    a_grid = np.arange(0.2, 2, 0.05)
    n_grid = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20, 25, 30, 40, 50, 75,
              100, 125, 150, 200]
    m = 1000
    Dcube = np.zeros([len(a_grid), len(n_grid), m], dtype='f4')
    for i, a in enumerate(a_grid):
        for j, n in enumerate(n_grid):
            D = []
            for k in range(m):
                rvs = pareto.rvs(a, size=n)
                aML = ML_index_analytic(rvs, 1.)
                cdf = lambda x: pareto.cdf(x, aML)
                D.append(kstest(rvs, cdf)[0])
            Dcube[i, j] = np.sort(D)
    # The three arrays have different shapes, so they must be packed as an
    # object array (recent numpy raises on implicit ragged arrays).
    np.save(_path_ks_grid, np.array((a_grid, n_grid, Dcube), dtype=object))
def KS_test(data, alpha, xmin, xmax=np.inf):
    """
    Give the Kolmogorov-Smirnov distance between the theoretical
    distribution and the data.

    :param data: data samples, increasingly ordered if possible, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    """
    data = _check_data(data, sort=True)
    data, use_data = _trf_check_bounds(data, xmin, xmax)
    if xmax == np.inf:
        cdf = pareto.cdf(data[use_data], alpha - 1, scale=xmin)
    else:
        cdf = truncated_pareto.cdf(data[use_data], alpha - 1,
                                   float(xmax) / xmin, scale=xmin)
    n = np.sum(use_data.astype(int))
    emp1 = np.arange(n) / float(n)
    emp2 = np.arange(1, n + 1) / float(n)
    ks = np.maximum(np.abs(emp1 - cdf), np.abs(emp2 - cdf))
    return np.max(ks) if len(ks) else np.inf
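# Self-contained cross-check (numpy + scipy only): the emp1/emp2
# bracketing above is the usual two-sided KS construction, so on plain
# (untruncated) data it agrees with scipy.stats.kstest.
import numpy as np
from scipy.stats import pareto, kstest
rvs = np.sort(pareto.rvs(1.5, size=200))
n = len(rvs)
cdf = pareto.cdf(rvs, 1.5)
emp1 = np.arange(n) / float(n)
emp2 = np.arange(1, n + 1) / float(n)
D_manual = np.max(np.maximum(np.abs(emp1 - cdf), np.abs(emp2 - cdf)))
print(D_manual, kstest(rvs, lambda x: pareto.cdf(x, 1.5))[0])   # equal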
def KS_test(edges, counts, alpha, xmin, xmax=np.inf):
    """
    Give the Kolmogorov-Smirnov distance between the theoretical
    distribution and the binned data.

    :param edges: increasing bin boundaries, shape (n+1,)
    :param counts: counts in the bins, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    """
    edges, counts = _check_bins_counts(edges, counts, sort=True)
    lefts, widths, use_data, use_edge = _trf_check_bounds(
        edges, counts, xmin, xmax)
    if np.isinf(xmax):
        cdf = pareto.cdf(edges[1:][use_data], alpha - 1, scale=xmin)
    else:
        cdf = truncated_pareto.cdf(edges[1:][use_data], alpha - 1,
                                   float(xmax) / xmin, scale=xmin)
    emp = np.cumsum(counts[use_data]) / float(np.sum(counts[use_data]))
    ks = np.abs(emp - cdf)
    return np.max(ks) if len(ks) else np.inf
def KS_test(points, counts, alpha, xmin, xmax=np.inf, discrete=False):
    """
    Give the Kolmogorov-Smirnov distance between the theoretical
    distribution and the data.

    :param points: observed values, shape (n,)
    :param counts: number of occurrences for `points`, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    :param discrete: interpret as a discrete power-law (generalized zipf)
        distribution
    """
    points, counts = _check_points_counts(points, counts, sort=True)
    points, use_data = _trf_check_bounds(points, counts, xmin, xmax,
                                         discrete, force_discard_end=True)
    # TODO: use dispatch_cdf
    if discrete:
        if np.isinf(xmax):
            cdf = genzipf.cdf(points[use_data], alpha, xmin)
        else:
            cdf = truncated_zipf.cdf(points[use_data], alpha, xmin, xmax)
    else:
        if np.isinf(xmax):
            cdf = pareto.cdf(points[use_data], alpha - 1, scale=xmin)
        else:
            cdf = truncated_pareto.cdf(points[use_data], alpha - 1,
                                       float(xmax) / xmin, scale=xmin)
    emp = np.cumsum(counts[use_data]) / float(np.sum(counts[use_data]))
    if not discrete:
        # This correction is needed because cdf_continuous[xmin] == 0
        # while emp[xmin] has an important weight
        emp = np.concatenate(([0], emp[:-1]))
    ks = np.abs(emp - cdf)
    return np.max(ks) if len(ks) else np.inf
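# Quick illustration of the weighted empirical CDF used above (numpy
# only): with repeated observations stored as (points, counts), the ECDF
# is simply the normalized cumulative count.
import numpy as np
counts = np.array([4, 3, 2, 1])   # occurrences of 4 sorted points
print(np.cumsum(counts) / float(np.sum(counts)))   # [0.4 0.7 0.9 1. ]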
from scipy.stats import pareto

# Positional arguments are (x, b, loc, scale): shape b=3, loc=0, scale=3.
print(pareto.cdf(6, 3, 0, 3))
import numpy as np
from scipy.stats import pareto
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)
b = 2.62

# Display the probability density function (``pdf``):
x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
ax.plot(x, pareto.pdf(x, b), 'r-', lw=5, alpha=0.6, label='pareto pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:
rv = pareto(b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:
vals = pareto.ppf([0.001, 0.5, 0.999], b)
np.allclose([0.001, 0.5, 0.999], pareto.cdf(vals, b))
# True

# Generate random numbers:
r = pareto.rvs(b, size=1000)

# And compare the histogram (``density=True`` replaces the ``normed``
# keyword removed from current matplotlib):
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pareto


def data_ECDF(data):
    # Reconstructed header: the original snippet begins mid-function.
    # xList holds the sorted unique values; yList the cumulative fractions.
    data = list(data)
    dataLen = len(data)
    xList = sorted(set(data))
    yList = []
    step = 0
    for elem in xList:
        count = data.count(elem)
        step += count
        yList.append(step / dataLen)
    return xList, yList


[a, xm] = [5, 1]
N = [5, 10, 100, 1000, 10**5]
for n in N:
    print("n = ", n)
    for i in range(5):
        data = np.zeros(n)
        for iteration in range(n):
            xi = np.random.rand()
            r = xm / xi**(1 / a)   # inverse-CDF sampling from Pareto(a)
            data[iteration] = r
        print(data)
        x, y = data_ECDF(data)
        plt.step([0, *x, 1.1 * x[-1]], [0, *y, 1],
                 label="Pareto ECDF{h}".format(h=i + 1), where='post')
    xx = np.arange(0, 5, 0.1)
    plt.plot(xx, pareto.cdf(xx, a, scale=xm), color="black", label="Pareto CDF")
    plt.legend(loc='lower right', frameon=False)
    plt.show()
x, _, _ = normal(n, alpha=0.05, reps=1, plot=True)
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (norm.cdf(edges[i + 1], 0, 1) - norm.cdf(edges[i], 0, 1)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

# Pareto distribution
beta = 1
k = 2.05
print('Pareto\n', 'beta=', beta, 'k=', k)
x = pareto(beta, k=k, n=n, plot=True)   # custom sampler, not scipy's pareto
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (pareto_sc.cdf(edges[i + 1], k) - pareto_sc.cdf(edges[i], k)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

k = 2.5
print('Pareto\n', 'beta=', beta, 'k=', k)
x = pareto(beta, k=k, n=n, plot=True)   # assign so the histogram uses the new draw
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (pareto_sc.cdf(edges[i + 1], k) - pareto_sc.cdf(edges[i], k)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

k = 3
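# Hedged aside (numpy + scipy only): the expected bin counts above can be
# computed in one shot by differencing the CDF at the bin edges.
import numpy as np
from scipy.stats import pareto as pareto_sc
edges = np.array([1.0, 1.5, 2.0, 3.0, 5.0])
n, k = 1000, 2.05
print((n * np.diff(pareto_sc.cdf(edges, k))).astype(int))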
def _pdf(self, x, b, m):
    # Density of a Pareto law conditioned on x <= m (upper truncation).
    return pareto.pdf(x, b) / pareto.cdf(m, b)
def _ppf(self, q, b, m):
    # Invert the truncated CDF: q is rescaled by cdf(m, b) before applying
    # the standard Pareto quantile formula.
    return np.power(1.0 - q * pareto.cdf(m, b), -1.0 / b)
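# A minimal sketch of how these two methods plug into a scipy
# rv_continuous subclass for an upper-truncated Pareto (the class name
# and parameterization here are assumptions, not from the source; the
# pdf is only meaningful on [1, m]):
import numpy as np
from scipy.stats import rv_continuous, pareto

class truncpareto_gen(rv_continuous):
    """Pareto with shape b, truncated to [1, m]."""

    def _pdf(self, x, b, m):
        return pareto.pdf(x, b) / pareto.cdf(m, b)

    def _ppf(self, q, b, m):
        return np.power(1.0 - q * pareto.cdf(m, b), -1.0 / b)

truncpareto = truncpareto_gen(a=1.0, name='truncpareto')
print(truncpareto.rvs(2.0, 10.0, size=5))   # b=2, truncated at m=10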
# Pareto
alpha = [1.16]
loc = 0
scale = 1
for i in alpha:
    pdf = pareto.pdf(xn, i, loc)
    plt.figure('Pareto PDF')
    plt.title('Pareto PDF')
    ax = sns.lineplot(xn, pdf, color='k')
    ax.fill_between(xn, pdf, color='olivedrab', alpha=0.2)

cdf = pareto.cdf(xc, alpha[0], loc, scale)
plt.figure('Pareto CDF')
plt.title('Pareto CDF')
ax = sns.lineplot(xc, cdf, color='red')
ax.fill_between(xc, cdf, color="firebrick", alpha=0.3)

# Gamma (with a = 1 it is an exponential)
a = [1, 3, 5]
for i in a:
    pdf = gamma.pdf(xn, i, loc, scale)
    plt.figure('Gama PDF')
    plt.title('Gama PDF')
def apply_mapping(long_format_data, description_variables, code_length=3,
                  delimeter="_", description2code={}, case_sensitive=False):
    """
    Create/apply a mapping on description variables of the data to create
    the variable name for the modeling data.

    :param long_format_data: dataframe to be used to create modeling data
    :type long_format_data: pandas.DataFrame
    :param description_variables: columns in the dataframe for which mapping codes need to be generated
    :type description_variables: list of string
    :param code_length: length of the code to be generated, default value is 3
    :type code_length: int
    :param delimeter: delimiter used between description mapping codes
    :type delimeter: string
    :param description2code: mapping in dictionary format with description as key and code as value, default is an empty dictionary
    :type description2code: dictionary
    :param case_sensitive: whether the mapping to codes is case sensitive or not
    :type case_sensitive: bool
    :return: dataframe after applying the mapping code, with column name "_Variable_", plus the mapping of descriptions to codes
    :rtype: tuple of pandas.DataFrame and dictionary
    """
    df4mdl = long_format_data.copy()
    # trim and convert description columns to string
    df4mdl[description_variables] = df4mdl[description_variables].astype(str).apply(lambda x: x.str.strip())
    # Descriptions present in the supplied mapping dictionary
    new_description = np.unique(df4mdl[description_variables].values)
    # subset relevant mapping
    des2code = {}
    # correct any inconsistency in the given descriptions: the same description can't have two or more codes
    if len(description2code):
        des2code.update(update_mapping(new_description, description2code,
                                       case_sensitive=case_sensitive))
    # Generate a new mapping if a description is not present in the supplied mapping dictionary
    new_description = new_description[~np.isin(new_description, list(des2code.keys()))]
    if len(new_description):
        des2code_new = {}
        avoid_infinite_loop = 1
        # generate code
        if not case_sensitive:
            series4code_original = pd.Series(new_description, index=new_description).replace(r'\W|_', '', regex=True).str.upper()
            duplicate_description = series4code_original.index.str.upper().duplicated()
            series4code = series4code_original[~duplicate_description]
        else:
            series4code = pd.Series(new_description, index=new_description).replace(r'\W|_', '', regex=True).str.upper()
        # a Pareto distribution over letter positions is used to generate codes
        dist_fixed = pareto.cdf(range(1, series4code.str.len().max() + 2), 1)
        while True:
            if avoid_infinite_loop == 1:
                des2code_generate = (series4code * ((code_length / series4code.str.len()).apply(np.ceil)).astype(int)).str[:code_length]
            elif avoid_infinite_loop < 5:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, x, dist_fixed[1:len(x) + 1]))
            elif avoid_infinite_loop < 8:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
            else:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
            # check whether a code is duplicated or already exists in the original description-to-code mapping
            des_code_dup = des2code_generate.isin(
                des2code.values()) | des2code_generate.duplicated(
                keep=False) | des2code_generate.isin(
                des2code_new.values())
            # Finalize the mapping where no duplicate is present
            des2code_new.update(des2code_generate[~des_code_dup].to_dict())
            # break the loop if no new description is left
            if not des_code_dup.sum():
                break
            # update new descriptions if duplicate codes are present
            avoid_infinite_loop = avoid_infinite_loop + 1
            # update the series used to generate codes
            series4code = series4code[des_code_dup.values]
        if not case_sensitive:
            des2code_new.update(
                update_mapping(
                    series4code_original[duplicate_description].index,
                    des2code_new,
                    case_sensitive=case_sensitive))
        des2code.update(des2code_new)
    # Create the variable in the raw file
    df4mdl["_Variable_"] = (df4mdl[description_variables]
                            .replace(des2code)
                            .apply(lambda x: delimeter.join(x), axis=1))
    return (df4mdl, des2code)
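# Aside on the pareto.cdf call above (scipy only): with shape 1 it yields
# rapidly saturating weights over letter positions, which biases
# generate_code (a helper assumed to be defined elsewhere) toward a
# word's leading letters.
from scipy.stats import pareto
print(pareto.cdf(range(1, 7), 1))   # [0. 0.5 0.667 0.75 0.8 0.833]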