Beispiel #1
0
def ParetoDiscrete(eta, x_m = 1, p = 0.95, N = 8):
    """Discretise a Pareto(eta, x_m) distribution onto an equally spaced grid.

    The grid has ``N + 2`` points running from the lower bound ``x_m`` to the
    ``p``-quantile of the distribution.  Every grid point receives the
    probability mass lying between the midpoints to its two neighbours.  The
    first point collects everything below its upper midpoint and the last
    point everything above its lower midpoint, so the masses sum to one.

    :param eta: shape parameter of the Pareto distribution
    :param x_m: scale parameter, i.e. lower bound of the support (default 1)
    :param p: probability level whose quantile is the last grid point
    :param N: number of interior grid points
    :return: list ``[grid, probGrid]`` holding two arrays of length ``N + 2``
    """
    # p-quantile: solve 1 - (x_m / q_p)**eta = p for q_p.
    q_p = x_m/(1-p)**(1/eta)
    grid = np.linspace(x_m, q_p, N+2)
    # Evaluate the CDF once per midpoint in a single vectorised call instead
    # of twice per grid point inside a Python loop.
    midpoints = (grid[1:] + grid[:-1])/2
    cdf_mid = pareto.cdf(midpoints, b = eta, scale = x_m)
    # Padding with 0 and 1 turns consecutive differences into the cell
    # masses, including both tail cells.
    probGrid = np.diff(np.concatenate(([0.0], cdf_mid, [1.0])))
    return [grid, probGrid]
Beispiel #2
0
def pareto_dcdf(x, d, b=1, scale=1):
    """ d^th derivative of the cumulative distribution function at x of the given RV.

    :param x:  array_like
        quantiles
    :param d: positive integer
        derivative order of the cumulative distribution function
    :param b: positive number
        shape parameter (default=1)
    :param scale: positive number
        scale parameter (default=1)
    :return: array_like
     If d = 0: the cumulative distribution function evaluated at x
     If d = 1: the probability density function evaluated at x
     If d >= 2: the (d-1)-density derivative evaluated at x
    """
    if scale <= 0 or b <= 0:
        print("The scale and shape parameters must be positive numbers.")

    if d == 0:
        return pareto.cdf(x, b, scale=scale)

    # Convert once so that lists/tuples work in the comparison and power below.
    x = np.asarray(x, dtype=float)
    # b * (b + 1) * ... * (b + d - 1); np.arange is required because a plain
    # number cannot be added to a ``range`` object.
    rising_product = np.prod(b + np.arange(d))
    # np.where evaluates the formula for every x, including points below the
    # support where it may divide by zero or be undefined; those entries are
    # discarded, so silence the warnings they would raise.
    with np.errstate(divide='ignore', invalid='ignore'):
        on_support = -(-1/scale)**d*(x/scale)**(-b-d)*rising_product
    return np.where(scale <= x, on_support, 0)
Beispiel #3
0
def KS_MC(a, n_events, n_draws=10000):
    """
    Monte-Carlo the KS D statistic for samples drawn from a power law with cumulative index a.

    Each of the n_draws trials draws n_events Pareto variates with shape a,
    feeds them to ML_index_analytic(sample, 1.) and measures the KS distance
    between the sample and a Pareto CDF whose shape is that returned value.
    The collected D values are returned sorted in ascending order.
    """

    def one_trial():
        sample = pareto.rvs(a, size=n_events)
        fitted_index = ML_index_analytic(sample, 1.)

        def fitted_cdf(x):
            return pareto.cdf(x, fitted_index)

        # kstest returns (statistic, p-value); only the statistic is kept.
        return kstest(sample, fitted_cdf)[0]

    distances = [one_trial() for _ in range(n_draws)]
    return np.sort(distances)
Beispiel #4
0
def kolTestPareto(P, k):
    """Return the finite-sample adjusted Kolmogorov statistic of P against a Pareto law.

    The sample is sorted, its empirical CDF (i/n at the i-th order statistic)
    is compared with the Pareto CDF of shape ``k`` (loc 0, scale 1), and the
    largest absolute gap D_n is rescaled with Stephens' correction
    ``(sqrt(n) + 0.12 + 0.11 / sqrt(n)) * D_n``.

    :param P: one-dimensional sample of any positive length
    :param k: shape parameter of the reference Pareto distribution
    :return: adjusted Kolmogorov statistic (float)
    :raises ValueError: if ``P`` is empty
    """
    emp_dist_x = np.sort(P)
    # Use the actual sample size instead of assuming 10000 observations.
    n = len(emp_dist_x)
    if n == 0:
        raise ValueError("P must contain at least one observation")
    emp_dist = np.arange(1, n + 1) / float(n)
    true_dist = pareto.cdf(emp_dist_x, b=k, loc=0)

    # NOTE: only the upper step of the empirical CDF (i/n) is compared; the
    # lower step ((i-1)/n) is not considered, as in the original code.
    D_n = np.max(np.abs(emp_dist - true_dist))
    root_n = math.sqrt(n)
    # Stephens' correction divides 0.11 by sqrt(n); it does not multiply.
    Adjust_D_n = (root_n + 0.12 + 0.11 / root_n) * D_n
    return Adjust_D_n
Beispiel #5
0
def dispatch_cdf(data, alpha, xmin, xmax, discrete):
    """Evaluate the CDF of the power-law variant selected by `discrete` and `xmax`.

    discrete, infinite xmax   -> genzipf.cdf(data, alpha, xmin)
    discrete, finite xmax     -> truncated_zipf.cdf(data, alpha, xmin, xmax)
    continuous, infinite xmax -> pareto.cdf with shape alpha - 1 and scale xmin
    continuous, finite xmax   -> truncated_pareto.cdf with shape alpha - 1,
                                 upper ratio xmax / xmin and scale xmin
    """
    unbounded = np.isinf(xmax)
    if discrete and unbounded:
        return genzipf.cdf(data, alpha, xmin)
    if discrete:
        return truncated_zipf.cdf(data, alpha, xmin, xmax)
    if unbounded:
        return pareto.cdf(data, alpha - 1, scale=xmin)
    upper_ratio = float(xmax) / xmin
    return truncated_pareto.cdf(data, alpha - 1, upper_ratio, scale=xmin)
Beispiel #6
0
def _generate_KS_cube():
    """
    Generate a grid of D values for KS tests of power-law behavior.

    For every (index a, sample size n) pair, m Pareto samples are drawn, each
    is passed to ML_index_analytic(rvs, 1.), and the KS distance to a Pareto
    CDF with the returned shape is recorded.  The sorted distances fill
    Dcube[i, j, :].  The result is saved to _path_ks_grid as a 3-element
    object array holding (a_grid, n_grid, Dcube).
    """
    a_grid = np.arange(0.2, 2, 0.05)
    n_grid = [3, 4, 5, 6, 7, 8, 9, 10, 12, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150, 200]
    m = 1000
    Dcube = np.zeros([len(a_grid), len(n_grid), m], dtype='f4')
    for i, a in enumerate(a_grid):
        for j, n in enumerate(n_grid):
            D = []
            for k in range(m):
                rvs = pareto.rvs(a, size=n)
                aML = ML_index_analytic(rvs, 1.)
                cdf = lambda x: pareto.cdf(x, aML)
                D.append(kstest(rvs, cdf)[0])
            Dcube[i,j] = np.sort(D)
    # The three pieces have different shapes.  Recent NumPy refuses to build
    # such a ragged array implicitly, so fill an object array explicitly.
    payload = np.empty(3, dtype=object)
    payload[0] = a_grid
    payload[1] = n_grid
    payload[2] = Dcube
    np.save(_path_ks_grid, payload)
Beispiel #7
0
def KS_test(data, alpha, xmin, xmax=np.inf):
    """
    Kolmogorov-Smirnov distance between the data and the theoretical power law.

    The model CDF (Pareto, or truncated Pareto when xmax is finite) is compared
    with both the lower step i/n and the upper step (i+1)/n of the empirical
    CDF at every retained sample; the largest gap is returned.
    :param data: data samples, increasingly ordered if possible, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    :return: the KS distance, or np.inf when no sample is retained
    """
    data = _check_data(data, sort=True)
    data, use_data = _trf_check_bounds(data, xmin, xmax)
    shape = alpha - 1
    if xmax == np.inf:
        cdf = pareto.cdf(data[use_data], shape, scale=xmin)
    else:
        upper_ratio = float(xmax) / xmin
        cdf = truncated_pareto.cdf(data[use_data], shape, upper_ratio, scale=xmin)
    # use_data is cast to int and summed to count the retained samples.
    n = np.sum(use_data.astype(int))
    gap_below = np.abs(np.arange(n) / float(n) - cdf)
    gap_above = np.abs(np.arange(1, n + 1) / float(n) - cdf)
    ks = np.maximum(gap_below, gap_above)
    if not len(ks):
        return np.inf
    return np.max(ks)
Beispiel #8
0
def KS_test(edges, counts, alpha, xmin, xmax=np.inf):
    """
    Kolmogorov-Smirnov distance between the binned data and the theoretical power law.

    The model CDF (Pareto, or truncated Pareto when xmax is finite) is
    evaluated at the right edge of every retained bin and compared with the
    normalised cumulative counts of those bins.
    :param edges: increasing bin boundaries, shape (n+1,)
    :param counts: counts in the bin, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    :return: the KS distance, or np.inf when no bin is retained
    """
    edges, counts = _check_bins_counts(edges, counts, sort=True)
    # Only use_data is needed below; the other returned values are unused here.
    lefts, widths, use_data, use_edge = _trf_check_bounds(edges, counts, xmin, xmax)
    shape = alpha - 1
    if np.isinf(xmax):
        cdf = pareto.cdf(edges[1:][use_data], shape, scale=xmin)
    else:
        upper_ratio = float(xmax) / xmin
        cdf = truncated_pareto.cdf(edges[1:][use_data], shape, upper_ratio, scale=xmin)
    kept_counts = counts[use_data]
    emp = np.cumsum(kept_counts) / float(np.sum(kept_counts))
    ks = np.abs(emp - cdf)
    if not len(ks):
        return np.inf
    return np.max(ks)
Beispiel #9
0
def KS_test(points, counts, alpha, xmin, xmax=np.inf, discrete=False):
    """
    Kolmogorov-Smirnov distance between weighted observations and the theoretical power law.

    The model CDF is chosen by `discrete` and by whether xmax is infinite,
    and is compared with the normalised cumulative counts of the retained
    points.
    :param points: observed values, shape (n,)
    :param counts: number of occurrences for `points`, shape (n,)
    :param alpha: the exponent being tested, float
    :param xmin: the lower cutoff of the power-law, float
    :param xmax: the upper cutoff of the power-law, float
    :param discrete: interpret as a discrete power-law (generalized zipf) distribution
    :return: the KS distance, or np.inf when no point is retained
    """
    points, counts = _check_points_counts(points, counts, sort=True)
    points, use_data = _trf_check_bounds(
        points, counts, xmin, xmax, discrete, force_discard_end=True)
    # TODO: use dispatch_cdf
    unbounded = np.isinf(xmax)
    if discrete and unbounded:
        cdf = genzipf.cdf(points[use_data], alpha, xmin)
    elif discrete:
        cdf = truncated_zipf.cdf(points[use_data], alpha, xmin, xmax)
    elif unbounded:
        cdf = pareto.cdf(points[use_data], alpha - 1, scale=xmin)
    else:
        upper_ratio = float(xmax) / xmin
        cdf = truncated_pareto.cdf(points[use_data], alpha - 1, upper_ratio, scale=xmin)
    kept_counts = counts[use_data]
    emp = np.cumsum(kept_counts) / float(np.sum(kept_counts))
    if not discrete:
        # The continuous CDF is 0 at xmin while the empirical one already
        # carries weight there, so shift the empirical CDF one step right.
        emp = np.concatenate(([0], emp[:-1]))
    ks = np.abs(emp - cdf)
    if not len(ks):
        return np.inf
    return np.max(ks)
Beispiel #10
0
from scipy.stats import pareto

# Pareto CDF at x = 6 for shape b = 3, loc = 0 and scale = 3.
print(pareto.cdf(6, b=3, loc=0, scale=3))
Beispiel #11
0
# Display the probability density function (``pdf``):

x = np.linspace(pareto.ppf(0.01, b), pareto.ppf(0.99, b), 100)
ax.plot(x, pareto.pdf(x, b), 'r-', lw=5, alpha=0.6, label='pareto pdf')

# Alternatively, the distribution object can be called (as a function)
# to fix the shape, location and scale parameters. This returns a "frozen"
# RV object holding the given parameters fixed.

# Freeze the distribution and display the frozen ``pdf``:

rv = pareto(b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of ``cdf`` and ``ppf``:

vals = pareto.ppf([0.001, 0.5, 0.999], b)
np.allclose([0.001, 0.5, 0.999], pareto.cdf(vals, b))
# True

# Generate random numbers:

r = pareto.rvs(b, size=1000)

# And compare the histogram:

# ``density=True`` normalises the histogram to unit area so that it is
# comparable with the pdf curves.  It replaces the ``normed`` keyword, which
# current Matplotlib releases no longer accept.
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()
Beispiel #12
0
    for elem in xList:
        count = data.count(elem)
        step += count
        yList.append(step / dataLen)
    return xList, yList


# Pareto shape a and scale xm used for both sampling and the reference CDF.
a, xm = 5, 1
# Sample sizes to compare; five independent samples are drawn per size.
N = [5, 10, 100, 1000, 10**5]

for n in N:
    print("n = ", n)
    for i in range(5):
        # Inverse-transform sampling: for uniform U, xm / U**(1/a) follows a
        # Pareto(a, xm) law.  All n variates are drawn in one vectorised call
        # instead of one Python-level iteration per sample.
        data = xm / np.random.rand(n)**(1 / a)
        print(data)
        x, y = data_ECDF(data)
        # Extend the step curve to start at (0, 0) and end at height 1
        # slightly beyond the largest observation.
        plt.step([0, *x, 1.1 * x[-1]], [0, *y, 1],
                 label="Pareto ECDF{h}".format(h=i + 1),
                 where='post')
    xx = np.arange(0, 5, 0.1)
    plt.plot(xx,
             pareto.cdf(xx, a, scale=xm),
             color="black",
             label="Pareto CDF")
    plt.legend(loc='lower right', frameon=False)
    plt.show()
Beispiel #13
0
x, _, _ = normal(n, alpha=0.05, reps=1, plot=True)
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (norm.cdf(edges[i + 1], 0, 1) - norm.cdf(edges[i], 0, 1)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

# Pareto distribution
beta = 1
k = 2.05
print 'Pareto\n', 'beta=', beta, 'k=', k
x = pareto(beta, k=k, n=n, plot=True)
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (pareto_sc.cdf(edges[i + 1], k) - pareto_sc.cdf(edges[i], k)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

k = 2.5
print 'Pareto\n', 'beta=', beta, 'k=', k
pareto(beta, k=k, n=n, plot=True)
hist, edges = np.histogram(x, bins)
expected = [
    int(n * (pareto_sc.cdf(edges[i + 1], k) - pareto_sc.cdf(edges[i], k)))
    for i in range(len(edges) - 1)
]
chi2_test(x, expected, alpha, bins)

k = 3
Beispiel #14
0
 def _pdf(self, x, b, m):
     """Pareto density with shape b at x, divided by the Pareto CDF at m."""
     # pareto.cdf(m, b) is the mass of the untruncated law below m.
     mass_below_m = pareto.cdf(m, b)
     untruncated_density = pareto.pdf(x, b)
     return untruncated_density / mass_below_m
Beispiel #15
0
 def _ppf(self, q, b, m):
     """Quantile for probability q: (1 - q * pareto.cdf(m, b)) ** (-1 / b)."""
     # Rescale q by the untruncated mass below m before inverting the CDF.
     mass_below_m = pareto.cdf(m, b)
     survival = 1.0 - q * mass_below_m
     exponent = -1.0 / b
     return np.power(survival, exponent)
Beispiel #16
0
from scipy.stats import pareto

# Evaluate the Pareto CDF at 6 with shape 3, location 0 and scale 3.
cdf_at_six = pareto.cdf(6, 3, loc=0, scale=3)
print(cdf_at_six)
Beispiel #17
0
# Pareto

# Shape parameters to plot, plus the location and scale shared by the calls below.
alpha = [1.16]
loc = 0
scale = 1

for i in alpha:

    # Pass scale explicitly so the pdf uses the same parameters as the cdf below.
    pdf = pareto.pdf(xn, i, loc, scale)

    plt.figure('Pareto PDF')
    plt.title('Pareto PDF')
    # seaborn expects the data vectors as keyword arguments; recent releases
    # reject them when passed positionally.
    ax = sns.lineplot(x=xn, y=pdf, color='k')
    ax.fill_between(xn, pdf, color='olivedrab', alpha=0.2)

cdf = pareto.cdf(xc, alpha[0], loc, scale)

plt.figure('Pareto CDF')
plt.title('Pareto CDF')
ax = sns.lineplot(x=xc, y=cdf, color='red')
ax.fill_between(xc, cdf, color="firebrick", alpha=0.3)

# Gamma (with a = 1 it is an exponential distribution)
# Shape parameters to plot.
a = [1, 3, 5]

for i in a:

    # Gamma density on xn for shape i, with the module-level loc and scale.
    pdf = gamma.pdf(xn, i, loc, scale)

    # Select (or create) the figure named 'Gama PDF' and set its title.
    plt.figure('Gama PDF')
    plt.title('Gama PDF')
Beispiel #18
0
def apply_mapping(long_format_data, description_variables, code_length=3, delimeter="_", description2code=None, case_sensitive=False):
    """
    Create/apply a description-to-code mapping and build the modeling variable name.

    Every distinct value found in the description columns is assigned a short
    code: codes supplied through ``description2code`` are reused, the rest are
    generated.  The codes of one row are joined with ``delimeter`` and stored
    in a new column named "_Variable_".

    :param long_format_data: dataframe to be used to create modeling data (it is copied, not modified)
    :type long_format_data: pandas.DataFrame
    :param description_variables: columns in dataframe for which mapping codes need to be generated
    :type description_variables: list of string
    :param code_length: length of code to be generated, default value is 3
    :type code_length: int
    :param delimeter: delimiter used in between description mapping codes
    :type delimeter: string
    :param description2code: mapping in dictionary format with description as key and code as value,
        default is None, which is treated as an empty dictionary
    :type description2code: dictionary or None
    :param case_sensitive: whether mapping to code is case sensitive or not
    :type case_sensitive: bool
    :return: dataframe after applying mapping code with column name "_Variable_". Also return mapping of description and code
    :rtype: tuple of pandas.DataFrame and dictionary
    """
    # Avoid a shared mutable default argument.
    if description2code is None:
        description2code = {}

    df4mdl = long_format_data.copy()
    # trim and convert description columns to string
    df4mdl[description_variables] = df4mdl[description_variables].astype(str).apply(lambda x: x.str.strip())
    # all distinct descriptions present in the data
    new_description = np.unique(df4mdl[description_variables].values)
    # subset relevant mapping
    des2code = {}
    # correct any inconsistency in given description. The same description can't have two or more codes
    if len(description2code):
        des2code.update(update_mapping(new_description, description2code, case_sensitive=case_sensitive))

    # Generate new mapping if description is not present in supplied mapping dictionary
    new_description = new_description[~np.isin(new_description, list(des2code.keys()))]
    if len(new_description):
        des2code_new = {}
        # counts the generation attempts; later attempts use different strategies
        avoid_infinite_loop = 1
        # upper-cased descriptions stripped of non-word characters and
        # underscores, indexed by the original description
        series4code_original = pd.Series(new_description, index=new_description).replace(r'\W|_', '', regex=True).str.upper()
        if not case_sensitive:
            # descriptions differing only by case are coded once; the
            # duplicates receive their code after the loop
            duplicate_description = series4code_original.index.str.upper().duplicated()
            series4code = series4code_original[~duplicate_description]
        else:
            series4code = series4code_original
        # Pareto CDF values at 1..max_length+1, sliced per description and
        # handed to generate_code (presumably as character weights - confirm)
        dist_fixed = pareto.cdf(range(1, series4code.str.len().max() + 2), 1)
        while True:
            if avoid_infinite_loop == 1:
                # first attempt: leading characters, repeating short strings
                # until they reach code_length
                des2code_generate = (series4code * ((code_length / series4code.str.len()).apply(np.ceil)
                                                    ).astype(int)).str[:code_length]
            elif avoid_infinite_loop < 5:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, x, dist_fixed[1:len(x) + 1]))
            elif avoid_infinite_loop < 8:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
            else:
                des2code_generate = series4code.apply(lambda x: generate_code(code_length, "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"))
            # check if code is duplicate or exists in original description to code mapping
            des_code_dup = des2code_generate.isin(
                des2code.values()) | des2code_generate.duplicated(
                keep=False) | des2code_generate.isin(
                des2code_new.values())
            # Finalize mapping for the codes that are not duplicated
            des2code_new.update(des2code_generate[~des_code_dup].to_dict())
            # break loop once every description has a unique code
            if not des_code_dup.any():
                break
            # move on to the next attempt
            avoid_infinite_loop = avoid_infinite_loop + 1
            # only descriptions with clashing codes are retried
            series4code = series4code[des_code_dup.values]
        if not case_sensitive:
            des2code_new.update(
                update_mapping(
                    series4code_original[duplicate_description].index,
                    des2code_new,
                    case_sensitive=case_sensitive))
        des2code.update(des2code_new)

    # Create Variable in raw file
    df4mdl["_Variable_"] = (df4mdl[description_variables]
                            .replace(des2code)
                            .apply(lambda x: delimeter.join(x), axis=1))
    return df4mdl, des2code