コード例 #1
0
def calc_KS_1_sample_test(x_points, parameters_list, data, distribution_name, column_type, column_name):
    """Run a one-sample KS test of the DC data against a fitted discrete distribution.

    For every point in ``x_points`` the theoretical CDF is compared with the
    empirical CDF immediately to the left and to the right of that point (as
    returned by ``get_left_cdf`` / ``get_right_cdf``).  The KS statistic is
    the largest absolute difference found; it is printed together with the
    hypothesis decision against the fixed threshold 0.05.

    Args:
        x_points: Points at which the two CDFs are compared.
        parameters_list: Distribution parameters: ``[n, p]`` for
            ``'binomial'``, ``[lambda]`` for ``'poisson'``, ``[p]`` for
            ``'geometric'``.
        data: Passed unchanged to ``get_left_cdf`` / ``get_right_cdf``
            together with ``column_name`` and the column label ``'DC_eCDF'``.
        distribution_name: ``'binomial'``, ``'poisson'`` or ``'geometric'``.
        column_type: Used only in the printed message; ``'confirmed'`` is
            spelled out as ``'confirmed positive cases'``.
        column_name: Column of ``data`` holding the observations.

    Returns:
        List with the x value(s) at which the KS statistic is attained.  It
        has two entries when the left and right maxima are equal.

    Raises:
        ValueError: If ``distribution_name`` is not supported or ``x_points``
            is empty.
    """
    column_names = ['x', 'F_cap_x', 'F_cap_DC_left', 'F_cap_DC_right', 'left_diff_abs', 'right_diff_abs']

    # Resolve the theoretical CDF once; an unknown name used to leave
    # F_cap_x unbound inside the loop.
    if distribution_name == 'binomial':
        def theoretical_cdf(x):
            return binom.cdf(x, parameters_list[0], parameters_list[1])
    elif distribution_name == 'poisson':
        def theoretical_cdf(x):
            return poisson.cdf(x, parameters_list[0])
    elif distribution_name == 'geometric':
        def theoretical_cdf(x):
            return geom.cdf(x, parameters_list[0])
    else:
        raise ValueError("Unsupported distribution_name: {0!r}".format(distribution_name))

    row_list = []
    for x in x_points:
        F_cap_x = theoretical_cdf(x)
        # Empirical CDF just to the left / right of x in the sorted DC dataset
        F_cap_DC_left = get_left_cdf(data, column_name, x, 'DC_eCDF')
        F_cap_DC_right = get_right_cdf(data, column_name, x, 'DC_eCDF')
        # Absolute gaps between the theoretical and the empirical CDFs
        left_diff_abs = round(abs(F_cap_x - F_cap_DC_left), 4)
        right_diff_abs = round(abs(F_cap_x - F_cap_DC_right), 4)

        row = [x, F_cap_x, F_cap_DC_left, F_cap_DC_right, left_diff_abs, right_diff_abs]
        row_list.append(dict(zip(column_names, row)))

    if not row_list:
        raise ValueError("x_points must contain at least one point")

    # Build KS Test Table (represented as a dataframe)
    ks_table = pd.DataFrame(row_list, columns=column_names)

    # Calculate KS statistic value.  Series.idxmax takes no axis=1; it returns
    # the index label of the maximum, so the row is looked up with .loc.
    max_diff_x = []
    d_right = ks_table.loc[ks_table['right_diff_abs'].idxmax()][['x', 'right_diff_abs']]
    d_left = ks_table.loc[ks_table['left_diff_abs'].idxmax()][['x', 'left_diff_abs']]
    if d_right['right_diff_abs'] == d_left['left_diff_abs']:
        print("KS Statistic is {0} at x = {1} and {2}".format(d_right['right_diff_abs'], d_left['x'], d_right['x']))
        max_diff_x.append(d_right['x'])
        max_diff_x.append(d_left['x'])
    elif d_right['right_diff_abs'] > d_left['left_diff_abs']:
        print("KS Statistic is {0} at x = {1}".format(d_right['right_diff_abs'], d_right['x']))
        max_diff_x.append(d_right['x'])
    else:
        print("KS Statistic is {0} at x = {1}".format(d_left['left_diff_abs'], d_left['x']))
        max_diff_x.append(d_left['x'])

    # Reject/Accept Null Hypothesis based on calculated KS Statistic d and given threshold=0.05
    d = max(d_right['right_diff_abs'], d_left['left_diff_abs'])
    critical_value = 0.05
    hypothesis_type = 'confirmed positive cases' if column_type == 'confirmed' else column_type

    if d > critical_value:
        print("Rejected Null Hypothesis: We reject the hypothesis that the distribution of daily {0} in DC is {3}, as KS Statistic d = {1} exceeds threshold {2}".format(hypothesis_type, d, critical_value, distribution_name))
        print()
    else:
        # The null hypothesis of this one-sample test is about the fitted
        # distribution, so the message mirrors the rejection message above.
        print("Failed to reject Null Hypothesis: We accept the hypothesis that the distribution of daily {0} in DC is {3}, as KS Statistic d = {1} does not exceed threshold {2}".format(hypothesis_type, d, critical_value, distribution_name))
        print()

    return max_diff_x
コード例 #2
0
def ks_1sample(mme_sample_data, last_week):
    """Return one-sample KS distances of a 7-day sample to three fitted models.

    Geometric, binomial and Poisson parameters are estimated from
    ``mme_sample_data`` by the method of moments.  Each of the first seven
    entries of ``last_week`` is then compared with the empirical CDF steps
    ``i/7`` (before the point) and ``(i+1)/7`` (after the point).

    Args:
        mme_sample_data: Sample used for the method-of-moments estimates.
        last_week: Indexable with 0..6; the values under test.
            NOTE(review): the step heights assume the values are already in
            ascending order - confirm at the call site.

    Returns:
        Tuple ``(geometric, binomial, poisson)`` of maximum absolute CDF gaps.
    """
    variance = np.var(mme_sample_data, ddof=1)
    mean = np.mean(mme_sample_data)

    # Method-of-moments estimators
    binom_p = 1-(variance/mean)
    binom_n = (mean*mean)/(mean - variance)
    poisson_lambda = mean
    geom_p = 1/mean

    geom_gaps, binom_gaps, poisson_gaps = [], [], []

    for day in range(7):
        observed = last_week[day]
        step_below = day/7
        step_above = (day+1)/7

        fitted = (
            (geom_gaps, geom.cdf(observed, geom_p)),
            (poisson_gaps, poisson.cdf(observed, poisson_lambda)),
            (binom_gaps, binom.cdf(observed, binom_n, binom_p)),
        )
        for gaps, model_cdf in fitted:
            gaps.extend((abs(step_below - model_cdf), abs(step_above - model_cdf)))

    return np.max(geom_gaps), np.max(binom_gaps), np.max(poisson_gaps)
コード例 #3
0
def KS1Sample(data_, true_distro_):
    """Print the result of a one-sample KS test between two columns.

    The distribution parameters are estimated from the first column of
    ``data_`` (via ``MMEPoisson`` / ``MMEGeometric`` / ``MMEBinomial``) and the
    resulting CDF is compared with the empirical CDF of the second column at
    every distinct value of that column.  The null hypothesis is rejected when
    the maximum distance exceeds 0.05.

    Args:
        data_: Table with at least two columns; column 0 supplies the
            parameter estimates, column 1 the sample under test.
        true_distro_: ``'poisson'``, ``'geometric'`` or ``'binomial'``.

    Returns:
        None.  For an unsupported ``true_distro_`` nothing is printed.
    """
    state1_ = data_.columns[0]
    state2_ = data_.columns[1]

    if true_distro_ == 'poisson':
        MME = MMEPoisson(data_[state1_])
    elif true_distro_ == 'geometric':
        MME = MMEGeometric(data_[state1_])
    elif true_distro_ == 'binomial':
        MME = MMEBinomial(data_[state1_])
    else:
        return None

    print("The MME values : ", MME)

    uniq_val_state2, freq_val_state2 = np.unique(data_[state2_],
                                                 return_counts=True)
    cdf_state2 = np.cumsum(freq_val_state2)
    cdf_state2 = cdf_state2 / cdf_state2[-1]

    # Empirical CDF just before each distinct value (0 before the first one)
    # and at the value itself (the last entry is exactly 1).
    left_ecdf = np.concatenate(([0.0], cdf_state2[:-1]))
    right_ecdf = cdf_state2

    if true_distro_ == 'poisson':
        true_cdf = poisson.cdf(uniq_val_state2, MME[0])
    elif true_distro_ == 'geometric':
        true_cdf = geom.cdf(uniq_val_state2, MME[0])
    else:
        true_cdf = binom.cdf(uniq_val_state2, MME[0], MME[1])

    max_diff = float(
        np.max(np.maximum(np.abs(left_ecdf - true_cdf),
                          np.abs(right_ecdf - true_cdf))))

    if max_diff > 0.05:
        print(
            "Since Max distance(%f) > 0.05, we reject Null Hypothesis (%s has same distribution as %s having true distribution of %s)\n\n"
            % (max_diff, state2_, state1_, true_distro_))
    else:
        # All four placeholders need a value; passing only max_diff raised
        # "TypeError: not enough arguments for format string".
        print(
            "Since Max distance(%f) <= 0.05, we accept Null Hypothesis (%s has same distribution as %s having true distribution of %s)\n\n"
            % (max_diff, state2_, state1_, true_distro_))
コード例 #4
0
def UnbiasedEstimate(n,k,theta,Beta,theta0,Beta0,func,minT=1000,logf=None,zipfParam=1.5):
    """ This function simulates an unbiased estimate following the algorithm of Rhee and Glynn.

    Two coupled Markov chains (k1 and k2) are driven by the same random
    numbers, with k2 lagging one step behind k1.  The estimate starts at
    func evaluated on the initial state and accumulates the weighted
    differences func(k1) - func(k2) until the chains coincide or the randomly
    drawn number of steps T is exhausted.

    func is a function that accepts n,k,theta,Beta,theta0,Beta0 and outputs a number or a numpy array

    Parameters:
        n, k: 2-D arrays of the same shape; k is copied, never modified.
        theta, Beta, theta0, Beta0: model parameters forwarded to func and
            used in the gamma draws.
        minT: minimum number of steps; T = minT + Geometric(zipfParam).
        logf: precomputed array; when it has no ``size`` attribute it is
            built with GetArray(n.max(), Beta).
        zipfParam: parameter of the geometric draw for T.
            NOTE(review): the default 1.5 dates from the commented-out Zipf
            variant and is not a probability - pass a value in (0, 1].

    Returns:
        The accumulated estimate (same type as the output of func).
    """
    # Draw the length of the Markov chain from a power law
    #T = minT + np.random.zipf(a=zipfParam)
    # Draw the length of the Markov chain from a geometric distribution
    T = minT + np.random.geometric(zipfParam)
    print("The number of steps in the Markov chain is %i"%T)

    # Initialize variables
    R,I = np.shape(n)
    G1 = np.ones(R)
    G2 = np.ones(R)
    try:
        logf.size
    except AttributeError:
        logf = GetArray(n.max(),Beta)

    est = func(n,k,theta,Beta,theta0,Beta0)
    k1 = k.copy()   # This is the equivalent of k
    k2 = k.copy()   # This is the equivalent of \tilde k
    for step in range(1,T+1):
        kR1 = k1.sum(1)
        kI1 = k1.sum(0)
        kR2 = k2.sum(1)
        kI2 = k2.sum(0)
        # Resample G (both chains share the same gamma draws)
        for r in range(R):
            auxGammas = gamma(1,size=max(kR1[r],kR2[r]))
            G1[r] = gamma(theta/Beta)
            G2[r] = G1[r]
            G1[r] += sum(auxGammas[:kR1[r]])
            G2[r] += sum(auxGammas[:kR2[r]])
        # Resample D (again from shared draws)
        auxGammas = gamma(1,size=[I,max(kI1.max(),kI2.max())])
        auxGammas2 = gamma(1-Beta0,size=I)
        auxGamma = gamma(theta0+I*Beta0)
        D1 = auxGammas2 + np.array([sum(auxGammas[i,:kI1[i]-1]) for i in range(I)])
        D1 = D1/(D1.sum()+auxGamma)
        D2 = auxGammas2 + np.array([sum(auxGammas[i,:kI2[i]-1]) for i in range(I)])
        D2 = D2/(D2.sum()+auxGamma)
        # Resample k; the second chain only starts moving from step 2
        unif = np.random.uniform(size=k.shape)
        UpdateK(k1,n,I,R,G1,D1,unif,logf,Beta)
        if step>1:
            UpdateK(k2,n,I,R,G2,D2,unif,logf,Beta)
        # Terminate if coupling has merged
        if (k1==k2).all():
            break
        # Otherwise continue sum, dividing by the probability that the chain
        # is at least `step` long (1 while step <= minT)
        #denom = (1-zipf.cdf(step-minT-1,a=zipfParam)) if step>minT else 1.0
        denom = (1-geom.cdf(step-minT-1,p=zipfParam)) if step>minT else 1.0
        summand = (func(n,k1,theta,Beta,theta0,Beta0)-func(n,k2,theta,Beta,theta0,Beta0))/denom
        est += summand
        # print() call form works on both Python 2 and Python 3
        print(summand)
    print(est)
    return est
 def kolmogorov_criterion(self):
     """Print the scaled Kolmogorov distance of the sample to Geom(self.p).

     The largest absolute gap between the empirical CDF (from
     ``self.distribution_function()``) and the geometric CDF is multiplied by
     ``sqrt(len(self.sample))`` and printed after the tag ``'geometric'``.
     Nothing is returned.
     """
     sample_size = len(self.sample)
     points, empirical = self.distribution_function()
     theoretical = geom.cdf(points, self.p)
     largest_gap = np.amax(np.absolute(empirical - theoretical))
     statistic = largest_gap * np.sqrt(sample_size)
     print('geometric', statistic)
コード例 #6
0
ファイル: mete.py プロジェクト: retta95/residence-time
def get_mete_sad_geom(S, N):
    """METE's predicted RAD when the only constraint is N/S

    The abundance of each rank is found by inverting the geometric CDF
    (truncated at N) at the mid-point quantile ``(rank - 0.5) / S``.

    Keyword arguments:
    S -- the number of species
    N -- the total number of individuals

    Returns a tuple ``(abundance, p)`` where ``abundance`` is a list of S
    integers ordered from the most to the least abundant species and
    ``p = S / N`` is the parameter of the geometric distribution.
    """

    assert S > 1, "S must be greater than 1"
    assert N > 0, "N must be greater than 0"
    assert float(S) / N < 1, "N must be greater than S"

    # float() keeps true division under Python 2, where int / int truncated
    # to 0 and made both the check above and p meaningless.
    p = float(S) / N
    truncation = geom.cdf(N, p)  # loop-invariant normalising constant

    abundance = []
    # Highest rank first; range objects have no .reverse() on Python 3.
    for rank in range(int(S), 0, -1):
        quantile = (rank - 0.5) / S

        def y(x, quantile=quantile):
            return geom.cdf(x, p) / truncation - quantile

        abundance.append(int(round(bisect(y, 0, N))))
    return (abundance, p)
コード例 #7
0
def KS_1_sample_test(states_data, column_type, distribution_type):
    """Fit a distribution on the CT column and KS-test the DC column against it.

    Rows dated 2020-10-01 through 2020-12-31 are used.  The parameters of
    ``distribution_type`` are estimated from the ``'CT <column_type>'`` column
    by the method of moments, the KS statistic is computed at the distinct
    values of the ``'DC <column_type>'`` column, and the result is plotted.
    Any other ``distribution_type`` is silently ignored.

    Args:
        states_data: Table with a ``'Date'`` column and the two state columns.
        column_type: Suffix of the state columns, also used in messages.
        distribution_type: ``'poisson'``, ``'geometric'`` or ``'binomial'``.
    """
    CT_col_name = 'CT ' + column_type
    DC_col_name = 'DC ' + column_type

    # Restrict to the study window, then sort each state's column on its own
    in_window = (states_data['Date'] >= '2020-10-01') & (states_data['Date'] <= '2020-12-31')
    window_rows = states_data.loc[in_window]
    CT_sorted_df = window_rows[[CT_col_name]].sort_values(CT_col_name).reset_index(drop=True)
    DC_sorted_df = window_rows[[DC_col_name]].sort_values(DC_col_name).reset_index(drop=True)

    # Empirical CDF value for every sorted DC observation
    DC_sorted_df['DC_eCDF'] = get_cdf_list(DC_sorted_df.shape[0])

    # One row per distinct DC value, keeping the eCDF of its last occurrence
    DC_distinct_df = DC_sorted_df.drop_duplicates(subset=DC_col_name, keep="last").reset_index(drop=True)

    # Points at which the KS table is evaluated
    x_points = DC_distinct_df[DC_col_name].to_list()

    # Method-of-moments parameters and the matching scipy distribution
    if distribution_type == 'poisson':
        mean = sample_mean(CT_sorted_df, CT_col_name)
        parameters = [poisson_para(mean)]
        fitted = poisson
    elif distribution_type == 'geometric':
        mean = sample_mean(CT_sorted_df, CT_col_name)
        parameters = [geometric_para(mean)]
        fitted = geom
    elif distribution_type == 'binomial':
        mean = sample_mean(CT_sorted_df, CT_col_name)
        variance = sample_variance(CT_sorted_df, mean, CT_col_name)
        n_mme, p_mme = binomial_para(mean, variance)
        parameters = [n_mme, p_mme]
        fitted = binom
    else:
        return

    # KS statistic, then the fitted CDF at every DC observation for the plot
    max_diff_x = calc_KS_1_sample_test(x_points, parameters, DC_distinct_df, distribution_type, column_type, DC_col_name)

    DC_sorted_df['DC_eCDF_mme'] = DC_sorted_df.apply(
        lambda row: fitted.cdf(row[DC_col_name], *parameters), axis=1)
    plot_KS_1_Sample_eCDF(DC_sorted_df, DC_col_name, max_diff_x, column_type.capitalize(), distribution_type)
コード例 #8
0
ファイル: smctest02.py プロジェクト: salmatoolkit/salma
 def calc_real_prob(self):
     """Return the probability that none of the robots fails.

     The single-robot failure probability is the average, over every
     integer distance in ``experiment.destination_range`` (inclusive), of
     ``grip weight * geom.cdf(distance, drop probability)`` summed over the
     paired ``grip_probs`` / ``drop_probs`` entries.  Three diagnostic values
     rounded to 7 digits are printed along the way.
     """
     experiment = self.experiment
     first, last = experiment.destination_range[0], experiment.destination_range[1]
     uniform_weight = 1 / (last - first + 1)

     single_failure = 0
     for distance in range(first, last + 1):
         for grip_spec, drop_p in zip(experiment.grip_probs, experiment.drop_probs):
             single_failure += grip_spec[1] * geom.cdf(distance, drop_p)
     single_failure *= uniform_weight

     digits = 7
     print(round(single_failure, digits))
     real_prob = (1.0 - single_failure) ** experiment.num_robots
     print(round(real_prob, digits))
     # Same quantity computed from the already-rounded failure probability
     rounded_variant = (1.0 - round(single_failure, digits)) ** experiment.num_robots
     print(round(rounded_variant, digits))
     return real_prob
コード例 #9
0
def k_s_test_1_sample(pa, ri, dist='poisson', c=0.05):
    """Print the one-sample KS decision for ``ri`` against a model fitted on ``pa``.

    The parameters of ``dist`` are estimated from ``pa`` by the method of
    moments; the fitted CDF is evaluated at the sorted values of ``ri`` and
    compared with the empirical CDF of ``ri`` before and after each point.

    Args:
        pa: Sample used to estimate the distribution parameters.
        ri: Sample under test.
        dist: ``'poisson'``, ``'binomial'`` or ``'geometric'``.
        c: Rejection threshold for the KS statistic.

    Returns:
        None; the statistic and the decision are printed.

    Raises:
        ValueError: If ``dist`` is not one of the supported names.
    """
    if dist not in ('poisson', 'binomial', 'geometric'):
        raise ValueError("Unsupported dist: {!r}".format(dist))

    pa_mean = np.mean(pa)
    pa_var = np.var(pa)

    # Sort the data to get the CDFs
    pa, ri = np.sort(pa), np.sort(ri)

    s = min(pa[0], ri[0]) - 100
    e = max(pa[-1], ri[-1]) + 100

    # The function works on its own sorted `ri`, not on a notebook global.
    F_ri = get_eCDF(ri, s, e)

    # pa CDF at ri change points
    if dist == 'poisson':
        lam_mme = pa_mean
        F_pa = [poisson.cdf(cp, lam_mme) for cp in ri]
    elif dist == 'binomial':
        p_mme = 1 - pa_var / pa_mean
        n_mme = pa_mean / p_mme
        F_pa = [binom.cdf(cp, n_mme, p_mme) for cp in ri]
    else:
        p_mme = 1 / pa_mean
        F_pa = [geom.cdf(cp, p_mme) for cp in ri]

    # eCDF value just before / just after each change point
    F_ri_minus, F_ri_plus = F_ri[0:-1], F_ri[1:]

    ks_stat = ks_index = 0

    # NOTE(review): assumes get_eCDF returns one more value than there are
    # points in ri, so that F_ri_minus / F_ri_plus line up with F_pa - confirm.
    for i in range(len(F_pa)):
        if abs(F_pa[i] - F_ri_minus[i]) > ks_stat:
            ks_stat = abs(F_pa[i] - F_ri_minus[i])
            ks_index = i
        if abs(F_pa[i] - F_ri_plus[i]) > ks_stat:
            ks_stat = abs(F_pa[i] - F_ri_plus[i])
            ks_index = i

    print("ks_stat is {} at {}, where as c is {}".format(
        ks_stat, ri[ks_index], c))
    if ks_stat > c:
        print("d > c, So, we reject Null Hypothesis")
    else:
        print("d <= c, So, we accept Null Hypothesis")

    return
コード例 #10
0
from scipy.stats import geom
import numpy as np

# Evaluation point for the PMF/CDF and the success probability
k = 1
p = 0.3

# Moments of Geom(p); `expect` integrates numerically, `mean` is closed form
expect = geom.expect(args=(p, ), loc=0)
mean = geom.mean(p)
var = geom.var(p)
sigma = geom.std(p)
pmf = geom.pmf(k, p, loc=0)
cdf = geom.cdf(k, p, loc=0)
# Second raw moment: E[X^2] = Var(X) + (E[X])^2
ex2 = var + expect**2

print('expected = ', expect)
print('mean = ', mean)
print('variance = ', var)
print('std. dev. = ', sigma)
print('pmf = ', pmf)
print('cdf = ', cdf)
# ex2 is the second moment E[X^2], not the squared mean E[X]^2
print("E[X^2] = ", ex2)
コード例 #11
0
ファイル: KS_Eqn_data.py プロジェクト: CraigGin/PDEKoopman2
T = dt * (n_time - 1)  # End time

# Discretize x: n + 1 evenly spaced points on [-L/2, L/2], dropping the last
x = np.linspace(-L / 2, L / 2, n + 1)
x = x[:n]

# Create vectors of random values for sines

# Sampling of A and phi (Latin hypercube samples in [0, 1), phi scaled to [0, 2*pi))
X = pyDOE.lhs(2, samples=M, criterion='maximin')
A_vect = X[:, 0]
phi_vect = 2 * np.pi * X[:, 1]

# Sampling of omega from a geometric(0.25) distribution truncated to
# 1..max_omega, by inverting its renormalised CDF with uniform draws
max_omega = 10
cum_distrib = geom.cdf(np.arange(1, max_omega + 1), 0.25)
cum_distrib = cum_distrib / cum_distrib[-1]
numbs = np.random.uniform(size=M)

omega_vect = np.zeros(M)

# Count, for every draw, how many CDF steps lie above it
for k in range(max_omega):
    omega_vect = omega_vect + (numbs < cum_distrib[k])

# Convert the count into the sampled value; tied to max_omega rather than
# the literal 11 so the two cannot drift apart
omega_vect = (max_omega + 1) - omega_vect

# Create vectors of random values for square waves

# Sampling of A, c, and w
X = pyDOE.lhs(3, samples=M, criterion='maximin')
A2_vect = X[:, 0]
コード例 #12
0
ファイル: Geometric_dist.py プロジェクト: Gautam-v-ml/Math
 def cdf(self, x, p):
     """Return P(X <= x) for a geometric distribution with success probability p.

     ``geom.cdf`` is called on the distribution object, so ``self`` must not
     be forwarded: doing so shifted every argument by one position.
     """
     return geom.cdf(x, p)
コード例 #13
0
ファイル: smctest01.py プロジェクト: salmatoolkit/salma
 def calc_real_prob(self):
     """Return the probability that none of the robots fails.

     The single-robot failure probability sums, over the paired
     ``grip_probs`` / ``drop_props`` entries of the experiment,
     ``grip weight * geom.cdf(x_goal, drop probability)``.
     """
     single_failure = 0
     pairs = zip(self.experiment.grip_probs, self.experiment.drop_props)
     for grip_spec, drop_p in pairs:
         drop_before_goal = geom.cdf(self.experiment.x_goal, drop_p)
         single_failure += grip_spec[1] * drop_before_goal
     return (1.0 - single_failure) ** self.experiment.num_robots
 def theoretical_distribution(self):
     """Plot the Geom(self.p) CDF at the integers r_low..r_up as red circles."""
     support = np.array(list(range(self.r_low, self.r_up + 1)))
     cdf_values = geom.cdf(support, self.p)
     plt.plot(support, cdf_values, 'ro')
# Shrink the tick labels on both axes.  `Tick.label` was only an alias of
# `Tick.label1` and was removed in Matplotlib 3.8; `label1` works everywhere.
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(5)
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(5)
fig.suptitle("Distribución Binomial")
plt.show()

# GEOMETRIC DISTRIBUTION

from scipy.stats import geom

# NOTE: SciPy's geometric distribution starts at 1, NOT at 0!  To shift the
# support so that it starts at 0 we can pass loc = -1.

geom.pmf(0, p=0.25, loc=-1)

geom.cdf(0, p=0.25, loc=-1)

geom.cdf(4, p=0.25, loc=-1)

geom.rvs(p=0.25, size=20, loc=-1)

# Check the effect of loc: same points, with and without the shift

geom.cdf(range(5), p=0.3, loc=0)
geom.cdf(range(5), p=0.3, loc=-1)

geom.stats(p=0.25, loc=0, moments="mv")

geom.stats(p=0.25, loc=-1, moments="mv")

n, p = 10, 0.25
コード例 #16
0
plt.show()

# ### Geometric Distribution

# In[4]:

# Geometric distribution with success probability p
from scipy.stats import geom
p = 0.6
# ppf is the percent point function (inverse of the CDF).  x covers the
# values from the 1st percentile up to, but not including, the 99th.
lower_k = geom.ppf(0.01, p)
upper_k = geom.ppf(0.99, p)
x = np.arange(lower_k, upper_k)

# Evaluate the PMF and CDF once and reuse them for printing and plotting
pmf_values = geom.pmf(x, p)
CDF = geom.cdf(x, p)

print("Mean              : ", geom.stats(p, moments='m'))
print("Variance          : ", geom.stats(p, moments='v'))
print("Prob. Mass Func.  : ", pmf_values)
print("Cum. Density Func.: ", CDF)

# PMF drawn as markers on top of vertical bars in the first of a 2x2 grid
fig = plt.figure(figsize=(20, 10))
plt.subplot(221)
plt.plot(x, pmf_values, 'go', ms=8, label='PMF')
plt.vlines(x, 0, pmf_values, colors='g', lw=5, alpha=0.5)
plt.xlabel("Sample Space of Geometric Distribution", fontsize=14)
plt.ylabel("PMF", fontsize=14)
plt.title("Probability Distribution of Geometric(p=0.6) Distribution",
          fontsize=16)
plt.xticks(np.arange(0, 6, 1))
plt.yticks(np.arange(0, 0.8, 0.1))
plt.legend(loc='best', shadow=True)
コード例 #17
0
def get_eval_pvalue(n, theta=0.09241035129185671, p=0.011742364113774086):
    """Return the upper-tail probability of a zero-inflated geometric model.

    The model puts mass ``theta`` at zero and spreads the remaining
    ``1 - theta`` according to ``geom.cdf(., p, loc=0)``; the value returned
    is one minus the resulting CDF at ``n``.

    Args:
        n: Point at which the tail probability is evaluated.
        theta: Weight of the point mass at zero.
        p: Success probability of the geometric component.
    """
    if n != 0:
        geometric_part = geom.cdf(n, p, loc=0)
        return 1 - (theta + (1 - theta) * geometric_part)

    return 1 - theta
コード例 #18
0
    #'weight' : 'bold',
    'size': 13
}

matplotlib.rc('font', **font)

# Default problem parameters
num_seats_def = 150
prng = np.random.RandomState(1)  # pseudorandom number generator, fixed seed
demand_par_a_def = 60.2  # alternative: prng.randint(100, size=1).astype(float)
demand_par_b_def = 10.0  # alternative: prng.randint(5, size=1).astype(float)
demand_par_c_def = 1.0
epsilons_support_def = 10
pmin_def = 1.0

# Noise distribution on the integers -epsilons_support_def..epsilons_support_def:
# geometric mass folded symmetrically around zero, with the tail mass beyond
# the support shared equally among all 2 * support + 1 points.
p = 0.3
prob_eps_rem = (1 - geom.cdf(epsilons_support_def + 1, p)) / (
    epsilons_support_def * 2 + 1)

# Non-negative half: index 0 keeps its full mass, every other offset is
# split between its positive and negative side.
prob_eps = []
for i in range(0, epsilons_support_def + 1):
    mass = geom.pmf(i + 1, p)
    if i != 0:
        mass = mass / 2
    prob_eps.append(mass + prob_eps_rem)

# Mirror the positive offsets (everything except index 0) for the negative side
prob_eps_rev = prob_eps[:0:-1]
prob_eps_def = prob_eps_rev + prob_eps

p_r_min_def = 0.8 * pmin_def
gamma_def = 0.9
コード例 #19
0
if max_diff > 0.05:
    print("We reject NULL hypothesis, because Max value is " + str(max_diff) +
          " > c : 0.05")
else:
    print("We accept NULL hypothesis, because Max value is " + str(max_diff) +
          " <= c : 0.05")
"""##**KS 1 sample test with geometric as distribution for IN Confirmed**"""

# print(cases_state_mme_geometric)
# NOTE(review): the eCDF steps below assume cases_in is sorted ascending and
# that n is its length - confirm where these are defined.
cdf = 0
max_diff = 0
geometric_dipoint = 0
cdf_steps = []
geom_points = []
for i in cases_in:
    geom_point = geom.cdf(i, cases_state_mme_geometric)
    geom_points.append(geom_point)
    # The KS distance has to look at the eCDF on both sides of the step:
    # its value just before the observation and just after it.
    next_cdf = cdf + 1 / n
    gap = max(numpy.abs(cdf - geom_point), numpy.abs(next_cdf - geom_point))
    if max_diff < gap:
        max_diff = gap
        geometric_dipoint = geom_point
    cdf = next_cdf
    cdf_steps.append(cdf)

# Build the arrays once instead of re-allocating with numpy.append per point
cdf_y = numpy.array(cdf_steps)
geom_cdf = numpy.array(geom_points)

plt.figure('Cases Case', figsize=(18, 8))
plt.xlabel('Geometric Distribution')
plt.ylabel('Cumulative Distribution Frequency')
plt.step(cases_in, cdf_y, label="IN")
plt.plot(cases_in, geom_cdf, label="IL")
plt.show()

if max_diff > 0.05:
コード例 #20
0
ファイル: Geo.py プロジェクト: qrsforever/workspace
def testGeom():  # {{{
    """
    Geometric Distribution (discrete)

    Notes
    -----
        Repeat a Bernoulli trial; probability that the first success
        happens on trial k.

    The name "geometric" comes from the geometric sequence (each term is a
    constant multiple of the previous one).

    p: probability of success
    q: probability of failure (1-p)
    k: number of trials up to the first success (the first k-1 trials fail)
        geom.pmf(k, p) == (1-p)**(k-1)*p
    """

    def draw_pmf(ax, points, heights):
        # Markers on top of vertical bars, followed by the legend
        ax.plot(points, heights, 'bo', markersize=5, label='geom pmf')
        ax.vlines(points,
                  0,
                  heights,
                  colors='b',
                  linewidth=5,
                  alpha=0.5,
                  label='vline pmf')
        ax.legend(loc='best', frameon=False)

    # Input data: p is given.
    # X axis: trial k on which the first success occurs
    # Y axis: probability
    p = 0.4
    first_k, last_k = geom.ppf(0.01, p), geom.ppf(0.99, p)
    xs = np.arange(first_k, last_k, step=1)

    # E(X) = 1/p, D(X) = (1-p)/p**2
    mean, var, skew, kurt = geom.stats(p, moments='mvsk')
    print("mean: %.2f, var: %.2f, skew: %.2f, kurt: %.2f" %
          (mean, var, skew, kurt))

    fig, axs = plt.subplots(2, 2)
    top_left, top_right = axs[0][0], axs[0][1]
    bottom_left, bottom_right = axs[1][0], axs[1][1]

    # PMF, version 1: straight from scipy
    draw_pmf(top_left, xs, geom.pmf(xs, p))

    # PMF, version 2: the closed-form expression
    draw_pmf(top_right, xs, (1 - p)**(xs - 1) * p)
    top_right.set_title('ys = (1-p)**(xs-1)*p')

    # CDF: P(X<=x)
    ys = geom.cdf(xs, p)
    bottom_left.plot(xs, ys, 'bo', markersize=5, label='geom cdf')
    bottom_left.legend(loc='best', frameon=False)
    print(np.allclose(xs, geom.ppf(ys, p)))  # ppf: y --> x, cdf: x --> y

    # Empirical PMF of random draws (random variates)
    data = geom.rvs(p, size=1000)
    import sys
    sys.path.append("../../thinkstats")
    import Pmf
    pmf = Pmf.MakePmfFromList(data)
    xs, ys = pmf.Render()
    bottom_right.plot(xs, ys, 'bo', markersize=5, label='rvs-pmf')

    plt.show()
コード例 #21
0
 def cdf(self, k):
     """Return the geometric CDF at k for this object's success probability p."""
     success_prob = self.p
     return geom.cdf(k, success_prob)
コード例 #22
0
ファイル: mixed_model.py プロジェクト: SaberHQ/tns_merge
 def _cdf(self, x, l, k, p, w):
     wei_cdf = 1 - np.exp(-1 * np.power(x / l, k))
     return w * wei_cdf + (1 - w) * geom.cdf(x, p)
コード例 #23
0
ファイル: prob_py.py プロジェクト: Vedia-JerezDaniel/r-basic
# Finish the PMF figure: title plus the PMF drawn as vertical bars at each x.
# (ax, x, p and the moments printed below are defined earlier in the script,
# outside this excerpt.)
ax.set_title('Función de probabilidad de Geom(0.3)')
ax.vlines(x, 0, geom.pmf(x, p), colors='b', lw=4, alpha=0.5)

# "Frozen" distribution: the parameter is bound once and the resulting object
# is queried without passing p again.  Its PMF is overlaid as dashed lines.
rv = geom(p)
ax.vlines(x,
          0,
          rv.pmf(x),
          colors='k',
          linestyles='--',
          lw=1,
          label="Frozen PMF")
ax.legend(loc='best')
plt.show()

# Report the moments: mean, variance, skewness and kurtosis
print("Media %f" % mean)
print("Varianza %f" % var)
print("Sesgo %f" % skew)
print("Curtosis %f" % kurt)

# New figure: cumulative distribution function at the same x values
fig, ax = plt.subplots(1, 1)
prob = geom.cdf(x, p)
ax.plot(x, prob, 'bo', ms=8, label="Función de distribución acumulada")
plt.title('Función de distribución acumulada')
plt.show()

# New figure: histogram of 10000 random draws from the distribution
fig, ax = plt.subplots(1, 1)
r = geom.rvs(p, size=10000)
plt.hist(r)
plt.title('Histograma de la random')
plt.show()
コード例 #24
0
def multiday(
    depots,
    sample_generator,
    dist_and_time,
    route_optimizer,
    simulator,
    n_days,
    day_start,
    day_end,
    seed=None,
    replications=1,
    plot=False,
    collection_points=None,
    k=0,
    dist_threshold=20000,
    futile_count_threshold=1,
    cap=20,
    tlim=1e10,
):
    """Multiday Sim

    Paramters
    ---------
    depots : np.array
        2*n_depots array of longitudes and latitudes of the depots.
        This is set up to support multidepot problems. However, to do this properly we'll need to track which depots
        have which packages. So this isn't actually fully supported.
    sample_generator : function
        Takes no inputs, returns two lists, longitudes and latitudes of the packages.
    dist_and_time : function
        Takes longitudes and latitudes, and returns a distance matrix, a time matrix, and a array of time windows.
    route_optimizer : function
        Inputs are the depot numbers, the distance and time matrices, the time windows as a np.array,
        the current day, the day each package arrived, and the number of times each package was futile.
        Outputs are a set of vehicle routes, and a list of packages that were not scheduled.
    simulator : function
        Simulates the deliveries for a set of routes, given the routes, distances, times and time windows.
    n_days : int
        The number of days to simulate.
    day_start : int
        The time for the start of a day
    day_end : int
        The time for the end of a day
    seed : int (Optional)
        The seed to initialise the random number generator with
    replications : int (Optional)
        Defaults to 1. The number of simulations to perform on the optimized route. Only the last is used as the input to the next day.
        (Might be an idea to take the mode values if enough simulations are performed?)
    plot : bool (Optional)
        Whether to display a plot of the current routes.
    collection_points: bool (Optional)
        Whether to enable the use of collection points
    k : int (Optional)
        If we have collection points, the number of collection points
    dist_threshold : int
        The distance a customer needs to be within to have their package assigned to a collection point
    futile_count_threshold : int
        The number of times a package needs to be futile before it gets assigned to a collection point
    cap : int
        The capacity of the collection points
    tlim : int
        The time to run the simulation for. Helps us stop the simulation if we get an extreme buildup of packages
    """
    start = time.time()
    logger.debug("Start multiday sim")

    rg = Generator(PCG64(seed))

    # Pregenerate arrivals
    latitudes_per_day = []
    longitudes_per_day = []
    time_windows_per_day = []
    customers_per_day = []
    allocat_packages_to_collection = [[] for i in range(k)]  # preset package allocation
    customer_to_cp = [
        [] for i in range(k)
    ]  # initialise the customer list for each collection point

    logger.debug("Generating incoming packages")

    for day in range(n_days):
        customers, new_time_windows = sample_generator(rg)
        latitudes_per_day.append([c.lat for c in customers])
        longitudes_per_day.append([c.lon for c in customers])
        time_windows_per_day.append(new_time_windows)
        customers_per_day.append(customers)

    data = []
    n_depots = depots.shape[1]
    delivery_time_windows = np.array(
        [[day_start, day_end] for i in range(n_depots)]
    )  # These are our beliefs about the time windows, not their true value
    arrival_days = np.zeros(n_depots)
    futile_count = np.zeros(n_depots)
    customers = np.array(
        [
            Customer(depots[0, 0], depots[1, 0], 1, 1, rg=rg)
            for i in range(len(depots[0]))
        ]
    )
    packages_at_collection = []
    collection_point_removed_packages = 0
    if collection_points and k != 0:  # choose the number of collection points
        sol_fac_lat, sol_fac_lon, coord, fac_coord = opt_collection_coord(
            k, cap, depots, sample_generator, dist_and_time, seed=None
        )

        # initialise a list of dictionaries for each collection point
        packages_at_collection = [{} for i in range(k)]
        # collection_point_removed_packages = [0 for i in range(k)]
    for day in range(n_days):
        logger.debug("Start day %i" % day)
        collection_point_removed_packages = [0 for i in range(k)]
        # Generate data
        new_time_windows, new_customers = (
            time_windows_per_day[day],
            customers_per_day[day],
        )

        delivery_time_windows = np.vstack((delivery_time_windows, new_time_windows))
        arrival_days = np.append(arrival_days, [day for _ in range(len(new_customers))])
        futile_count = np.append(futile_count, np.zeros(len(new_customers)))
        customers = np.append(customers, new_customers)

        logger.debug("Number of incoming packages: %i" % len(new_customers))
        logger.debug(
            "Current number of packages: %i" % (len(customers) - 1)
        )  # -1 for depo

        logger.debug("Calculating distance and time matrix")

        cp_customers = np.array([])
        collection_dist = 0
        # Remove packages from collection points
        if collection_points and k != 0:
            for i in range(k):
                logger.debug(
                    "Number of packages in collection %i, day %i: %i",
                    i,
                    day,
                    len(packages_at_collection[i]),
                )
                if (
                    len(packages_at_collection[i]) != 0
                ):  # there's packages in the collection point
                    p = 0.6  # success probability
                    # np.random.geometric(p=0.35, size=10000)
                    # np.random.geometric(p=0.6, size=len(packages_at_collection[i]))
                    collected_package = []
                    for c in packages_at_collection[i]:
                        v = packages_at_collection[i][c]
                        # package collection distribution
                        cdf = geom.cdf(v, p)
                        # if the probablity is greater than the random number, the package is picked up
                        if cdf >= rg.random():
                            collected_package.append(c)
                            collection_point_removed_packages[i] += 1
                        else:
                            # add a day to the number of days at collection point
                            packages_at_collection[i][c] += 1
                    for collected in collected_package:
                        packages_at_collection[i].pop(collected)

            # Add customers to collections points, and add visited collection points to customers
            #  a list of undelivered packages in the simulation
            undelivered = np.ones(len(futile_count), dtype=bool)
            for i, c in enumerate(futile_count):
                # a threshold of day count of the package in the system
                if c >= futile_count_threshold and i >= n_depots:
                    cd = os.path.join(
                        os.path.dirname(os.path.abspath(__file__)), "..", "data"
                    )
                    # get the dist from the cusomter's house to the collection points
                    lat_all = sol_fac_lat[:]
                    lon_all = sol_fac_lon[:]
                    lat_all.insert(0, customers[i].lat)
                    lon_all.insert(0, customers[i].lon)
                    if len(lat_all) > 4:
                        print("error")
                    coord_filename = None
                    dist, tm = osrm_get_dist(
                        cd,
                        coord_filename,
                        lat_all,
                        lon_all,
                        source=[0],
                        save=False,
                        host="localhost:5000",
                    )
                    # choose the closest collection point
                    min_value = min(dist[0])
                    # dist_threshold = 20000  # 20km
                    # allow the package to be assigned to the closest collection point if the dist is within the threshold
                    if min_value < dist_threshold:
                        min_ind = dist[0].index(min_value)
                        # assign if its closet collection point still has spare space
                        if len(packages_at_collection[min_ind]) < cap:
                            # assign the package to its closest collection point
                            allocat_packages_to_collection[min_ind].append(
                                i
                            )  # i is the index but not the unique index for the customer?
                            undelivered[
                                i
                            ] = False  # customer to be removed from the delivery list
                            # record the customer list for each collection point
                            customer_to_cp[min_ind].append(customers[i])
                            collection_dist += min_value
                            # tw_to_cp[min_ind].append(delivery_time_windows[i])
                            # ad_to_cp[min_ind].append(arrival_days[i])
                            # fc_to_cp[min_ind].append(futile_count[i])

                            # packages_at_collection[min_ind][customers[i]] = 0
            # Remove packages sent to collection points from customers
            delivery_time_windows = delivery_time_windows[undelivered]
            arrival_days = arrival_days[undelivered]
            futile_count = futile_count[undelivered]
            customers = customers[undelivered]

            cp_customers = np.array([])
            # add collection point as a customer if there is package allocated to it
            for cp in range(k):
                if len(customer_to_cp[cp]) != 0:
                    cp_customers = np.append(
                        cp_customers,
                        Customer(sol_fac_lat[cp], sol_fac_lon[cp], 1, 1, rg=rg),
                    )
            # customers = np.append(customers, new_customers)

            # cp_customers = np.array(
            #     [
            #         Customer(sol_fac_lat[cp], sol_fac_lon[cp], 1, 1, rg=rg)
            #         for cp in range(k)
            #         if len(customer_to_cp[cp]) != 0
            #     ]
            # )
            if len(cp_customers) > 0:
                cp_time_windows = np.array(
                    [[day_start, day_end] for i in range(len(cp_customers))]
                )

                delivery_time_windows = np.vstack(
                    (delivery_time_windows, cp_time_windows)
                )
                customers = np.append(customers, cp_customers)
                futile_count = np.append(futile_count, np.zeros(len(cp_customers)))
                arrival_days = np.append(
                    arrival_days, [day for _ in range(len(cp_customers))]
                )

        # Get times and distances
        dm, tm = dist_and_time(customers)
        if dm is None:
            logger.critical("Distance computation failed. Stopping simulation.")
            # We've exceeded the map bounds. Stop here for now, but we should really handle this more gracefully.
            break
        dm = np.array(dm)
        tm = np.array(tm)

        logger.debug("Compute alternate locations")

        # Setup list of alternate locations
        alternate_locations = []
        temp = customers.tolist()
        while len(temp) > 0:
            c = temp.pop()
            location_index = []
            for a in c.alternates:
                location_index.append(customers.tolist().index(a))
                if a in temp:
                    temp.remove(a)
            alternate_locations.append(location_index)

        logger.debug("Optimise routes")

        # Calulate routes for the day
        routes, unscheduled = route_optimizer(
            [i for i in range(n_depots)],
            dm,
            tm,
            delivery_time_windows,
            day,
            arrival_days,
            futile_count,
            alternate_locations,
        )
        if plot:
            plt.clf()
            routes.plot(
                positions=[(customer.lon, customer.lat) for customer in customers],
                weight_matrix=dm,
            )
            plt.show(block=False)
            plt.pause(0.001)

        futile_count[[i for i in range(len(customers)) if i not in unscheduled]] += 1

        # logger.debug(routes)
        logger.debug("Unscheduled: %s" % unscheduled)

        logger.debug("Start simulations")

        for i in range(replications):
            logger.debug("Replication %i" % i)
            # Simulate behaviour
            distances, times, futile, delivered = simulator(
                routes, dm, tm, delivery_time_windows, customers, rg
            )
            logger.debug("Delivered: %s" % delivered)

            # Data collection to save
            data.append(
                collect_data(
                    day,
                    routes,
                    distances,
                    times,
                    futile,
                    delivered,
                    arrival_days,
                    delivery_time_windows,
                    [len(l) for l in packages_at_collection],
                    collection_point_removed_packages,
                    collection_dist,
                )
            )

        # Remove delivered packages, using just the last result
        undelivered = np.ones(len(customers), dtype=bool)
        for alternates in alternate_locations:  # Remove all alternate locations as well
            for package in delivered:
                if package in alternates:
                    undelivered[alternates] = False
        undelivered[[i for i in range(n_depots)]] = True

        # # get the undelivered list for collection point
        # cp_undelivered = undelivered[-len(cp_customers) :]
        # count_cp_undelivered = 0

        # check if collection point is visited
        for i in range(len(cp_customers)):
            if not undelivered[-i - 1]:  # collection point visited
                for cust in customer_to_cp[
                    -i - 1
                ]:  # add package into the collection point
                    packages_at_collection[-i - 1][cust] = 0
                # allocat_packages_to_collection[min_ind] = []
                customer_to_cp[
                    -i - 1
                ] = []  # reset the customer list for the visited collection point
            else:
                # count_cp_undelivered += 1
                undelivered[
                    -i - 1
                ] = False  # remove collection point in the customer list
                # Generate data
                # new_time_windows, new_arrival_days, new_futile_count, new_customers = (
                #     tw_to_cp[-i - 1],
                #     ad_to_cp[-i - 1],
                #     fc_to_cp[-i - 1],
                #     customer_to_cp[-i - 1],
                # )

        delivery_time_windows = delivery_time_windows[undelivered]
        arrival_days = arrival_days[undelivered]
        futile_count = futile_count[undelivered]
        customers = customers[undelivered]
        # if count_cp_undelivered:
        #     # stick the undelivered collection pacakges on
        #     delivery_time_windows = np.vstack((delivery_time_windows, new_time_windows))
        #     arrival_days = np.append(arrival_days, new_arrival_days)
        #     futile_count = np.append(futile_count, new_futile_count)
        #     customers = np.append(customers, new_customers)

        logger.debug(
            "Number of remaining Packages: %i" % (len(customers) - 1)
        )  # -1 for depo
        if time.time() - start > tlim:
            break

    return data
コード例 #25
0
def OneSampleKS_Geometric(data_cases, datatype, critical_value=0.05):
    """Run a one-sample KS test of Hawaii data against a geometric fit.

    Rows are selected when the character at index 5 of their ``Date`` string
    is ``'1'``; for ``YYYY-MM-DD`` dates that is the tens digit of the month,
    i.e. October, November and December.  The geometric parameter ``p`` is
    estimated by the method of moments from the GA column (``1 / mean``), and
    the KS statistic is the largest gap between that geometric CDF and the
    empirical CDF of the HI column, checked on both sides of every sample
    point.  The estimate, the statistic and the verdict are printed.

    Args:
        data_cases: DataFrame with a ``Date`` column of strings and the
            columns ``daily_GA_confirmed`` / ``daily_HI_confirmed`` or
            ``daily_GA_deaths`` / ``daily_HI_deaths``.
        datatype: ``'Confirmed cases'`` selects the confirmed columns; any
            other value selects the deaths columns.
        critical_value: Threshold the KS statistic is compared against.

    Returns:
        The KS statistic (maximum absolute CDF difference) as a float.

    Raises:
        ValueError: If no rows are selected, or if the GA data does not yield
            a valid geometric parameter (total not positive, or ``p > 1``).
    """
    if datatype == 'Confirmed cases':
        ga_column, hi_column = 'daily_GA_confirmed', 'daily_HI_confirmed'
    else:
        ga_column, hi_column = 'daily_GA_deaths', 'daily_HI_deaths'

    # Keep only the rows whose month starts with '1' (Oct, Nov, Dec).
    ga_data = []
    hi_data = []
    for date, ga_value, hi_value in zip(
        data_cases['Date'], data_cases[ga_column], data_cases[hi_column]
    ):
        if date[5] == '1':
            ga_data.append(ga_value)
            hi_data.append(hi_value)

    if not hi_data:
        raise ValueError("No rows selected: cannot run the KS test on empty data")

    # Method-of-moments estimate for the geometric distribution: p = 1 / mean.
    ga_total = float(np.sum(ga_data))
    if not ga_total > 0:  # also rejects NaN
        raise ValueError(
            "First-state data must have a positive total to estimate p"
        )
    p_mme = float(len(ga_data)) / ga_total
    if not p_mme <= 1:
        # geom.cdf would return NaN here and the test would silently "pass".
        raise ValueError(
            "Estimated p = %r is not a valid geometric parameter" % p_mme
        )

    # Empirical CDF just before (left) and at (right) each sorted sample
    # point, computed directly as k/n to avoid accumulated rounding error.
    sample = np.sort(np.asarray(hi_data, dtype=float))
    sample_size = sample.size
    ecdf_left = np.arange(sample_size) / sample_size
    ecdf_right = np.arange(1, sample_size + 1) / sample_size

    print("p_mme: ", p_mme)
    model_cdf = geom.cdf(sample, p_mme)
    maxDifference = float(
        np.max(
            np.maximum(
                np.abs(model_cdf - ecdf_left),
                np.abs(model_cdf - ecdf_right),
            )
        )
    )

    print("\nMaximum Difference: ", maxDifference)
    # Comparing maximum difference with critical value
    if maxDifference > critical_value:
        print("Null hypothesis is rejected as Oct-Dec 2020 data for the second state does not have the distribution with the obtained MME parameters for ", datatype)
    else:
        print("Null hypothesis is accepted as Oct-Dec 2020 data for the second state have the distribution with the obtained MME parameters for ", datatype)

    return maxDifference