def calc_KS_1_sample_test(x_points, parameters_list, data, distribution_name, column_type, column_name):
    """Run a one-sample Kolmogorov-Smirnov test against a fitted distribution.

    For every point in ``x_points`` the CDF of the hypothesised distribution
    is compared with the empirical CDF just to the left and just to the right
    of that point; the largest absolute gap is the KS statistic.  The verdict
    is printed, not returned.

    Args:
        x_points: Points at which both CDFs are evaluated.
        parameters_list: Distribution parameters: ``[n, p]`` for
            ``'binomial'``, ``[lambda]`` for ``'poisson'``, ``[p]`` for
            ``'geometric'``.
        data: Table handed to ``get_left_cdf`` / ``get_right_cdf`` together
            with ``column_name`` and the eCDF column name ``'DC_eCDF'``.
        distribution_name: ``'binomial'``, ``'poisson'`` or ``'geometric'``.
        column_type: Label used in the printed verdict; ``'confirmed'`` is
            spelled out as ``'confirmed positive cases'``.
        column_name: Name of the data column inside ``data``.

    Returns:
        list: The x value(s) at which the KS statistic is attained (two
        entries when the left-side and right-side maxima tie).

    Raises:
        ValueError: If ``distribution_name`` is not supported or ``x_points``
            is empty.
    """
    # Dispatch table: each entry evaluates the hypothesised CDF at one point.
    model_cdfs = {
        'binomial': lambda x: binom.cdf(x, parameters_list[0], parameters_list[1]),
        'poisson': lambda x: poisson.cdf(x, parameters_list[0]),
        'geometric': lambda x: geom.cdf(x, parameters_list[0]),
    }
    if distribution_name not in model_cdfs:
        raise ValueError(
            "Unsupported distribution_name {0!r}; expected one of {1}".format(
                distribution_name, sorted(model_cdfs)))
    model_cdf = model_cdfs[distribution_name]

    columns = ['x', 'F_cap_x', 'F_cap_DC_left', 'F_cap_DC_right', 'left_diff_abs', 'right_diff_abs']
    rows = []
    for x in x_points:
        F_cap_x = model_cdf(x)
        # Empirical CDF immediately to the left / right of x in the sorted data
        F_cap_DC_left = get_left_cdf(data, column_name, x, 'DC_eCDF')
        F_cap_DC_right = get_right_cdf(data, column_name, x, 'DC_eCDF')
        left_diff_abs = round(abs(F_cap_x - F_cap_DC_left), 4)
        right_diff_abs = round(abs(F_cap_x - F_cap_DC_right), 4)
        rows.append([x, F_cap_x, F_cap_DC_left, F_cap_DC_right, left_diff_abs, right_diff_abs])

    # KS test table, one row per evaluated point
    ks_table = pd.DataFrame(rows, columns=columns)
    if ks_table.empty:
        raise ValueError("x_points must contain at least one point")

    # Series.idxmax takes no meaningful axis argument; passing axis=1 raises
    # on current pandas, so it is omitted here.
    d_right = ks_table.loc[ks_table['right_diff_abs'].idxmax(), ['x', 'right_diff_abs']]
    d_left = ks_table.loc[ks_table['left_diff_abs'].idxmax(), ['x', 'left_diff_abs']]

    max_diff_x = []
    if d_right['right_diff_abs'] == d_left['left_diff_abs']:
        print("KS Statistic is {0} at x = {1} and {2}".format(d_right['right_diff_abs'], d_left['x'], d_right['x']))
        max_diff_x.append(d_right['x'])
        max_diff_x.append(d_left['x'])
    elif d_right['right_diff_abs'] > d_left['left_diff_abs']:
        print("KS Statistic is {0} at x = {1}".format(d_right['right_diff_abs'], d_right['x']))
        max_diff_x.append(d_right['x'])
    else:
        print("KS Statistic is {0} at x = {1}".format(d_left['left_diff_abs'], d_left['x']))
        max_diff_x.append(d_left['x'])

    # Reject / fail to reject the null hypothesis at the fixed threshold 0.05
    d = max(d_right['right_diff_abs'], d_left['left_diff_abs'])
    critical_value = 0.05
    hypothesis_type = 'confirmed positive cases' if column_type == 'confirmed' else column_type
    if d > critical_value:
        print("Rejected Null Hypothesis: We reject the hypothesis that the distribution of daily {0} in DC is {3}, as KS Statistic d = {1} exceeds threshold {2}".format(hypothesis_type, d, critical_value, distribution_name))
        print()
    else:
        # The message now states the one-sample hypothesis actually tested
        # (it previously described a two-sample CT-vs-DC comparison).
        print("Failed to reject Null Hypothesis: We accept the hypothesis that the distribution of daily {0} in DC is {3}, as KS Statistic d = {1} does not exceed threshold {2}".format(hypothesis_type, d, critical_value, distribution_name))
        print()
    return max_diff_x
def ks_1sample(mme_sample_data, last_week):
    """Compute one-sample KS distances for three method-of-moments fits.

    Geometric, binomial and Poisson parameters are estimated from
    ``mme_sample_data`` (sample mean and unbiased sample variance).  The
    fitted CDFs are then compared with a 7-step empirical CDF at
    ``last_week[0] .. last_week[6]``, taking step ``i`` to rise from ``i/7``
    to ``(i+1)/7``.

    Args:
        mme_sample_data: Observations used to estimate the parameters.
        last_week: Indexable holding the seven evaluation points.
            NOTE(review): the i/7 steps presume these are in ascending
            order; the function does not sort them -- confirm at the caller.

    Returns:
        tuple: ``(geometric, binomial, poisson)`` maximum absolute gaps.
    """
    mean = np.mean(mme_sample_data)
    variance = np.var(mme_sample_data, ddof=1)

    # Method-of-moments estimates
    p_binom = 1 - (variance / mean)
    n_binom = (mean * mean) / (mean - variance)
    lam = mean
    p_geom = 1 / mean

    geom_gaps, binom_gaps, poisson_gaps = [], [], []
    for day in range(7):
        point = last_week[day]
        below, above = day / 7, (day + 1) / 7
        fitted = (
            (geom_gaps, geom.cdf(point, p_geom)),
            (binom_gaps, binom.cdf(point, n_binom, p_binom)),
            (poisson_gaps, poisson.cdf(point, lam)),
        )
        for gaps, model_value in fitted:
            gaps.append(abs(below - model_value))
            gaps.append(abs(above - model_value))

    return np.max(geom_gaps), np.max(binom_gaps), np.max(poisson_gaps)
def KS1Sample(data_, true_distro_):
    """Print the verdict of a one-sample KS test between two data columns.

    The first column of ``data_`` is used to estimate the parameters of
    ``true_distro_`` via the matching ``MME*`` helper; the second column
    supplies the empirical CDF that is compared against the fitted CDF.
    The null hypothesis is rejected when the maximum gap exceeds 0.05.

    Args:
        data_: DataFrame whose first two columns are the reference state and
            the tested state.
        true_distro_: ``'poisson'``, ``'geometric'`` or ``'binomial'``.

    Returns:
        None.  For an unsupported ``true_distro_`` the function returns
        immediately without printing anything.
    """
    state1_ = data_.columns[0]
    state2_ = data_.columns[1]

    if true_distro_ == 'poisson':
        MME = MMEPoisson(data_[state1_])
    elif true_distro_ == 'geometric':
        MME = MMEGeometric(data_[state1_])
    elif true_distro_ == 'binomial':
        MME = MMEBinomial(data_[state1_])
    else:
        return None
    print("The MME values : ", MME)

    # Empirical CDF of the second state at its distinct values
    uniq_val_state2, freq_val_state2 = np.unique(data_[state2_], return_counts=True)
    cdf_state2 = np.cumsum(freq_val_state2)
    cdf_state2 = cdf_state2 / cdf_state2[-1]

    max_diff = -1
    for i, value_ in enumerate(uniq_val_state2):
        # eCDF just before and just after the jump at value_; the last entry
        # of cdf_state2 is exactly 1, so no special case is needed for it.
        left_ecdf = 0 if i == 0 else cdf_state2[i - 1]
        right_ecdf = cdf_state2[i]

        if true_distro_ == 'poisson':
            true_cdf = poisson.cdf(value_, MME[0])
        elif true_distro_ == 'geometric':
            true_cdf = geom.cdf(value_, MME[0])
        else:
            true_cdf = binom.cdf(value_, MME[0], MME[1])

        diff_ = max(abs(left_ecdf - true_cdf), abs(right_ecdf - true_cdf))
        if diff_ > max_diff:
            max_diff = diff_

    if max_diff > 0.05:
        print(
            "Since Max distance(%f) > 0.05, we reject Null Hypothesis (%s has same distribution as %s having true distribution of %s)\n\n"
            % (max_diff, state2_, state1_, true_distro_))
    else:
        # All four placeholders are supplied; passing only max_diff raised
        # "TypeError: not enough arguments for format string".
        print(
            "Since Max distance(%f) <= 0.05, we accept Null Hypothesis (%s has same distribution as %s having true distribution of %s)\n\n"
            % (max_diff, state2_, state1_, true_distro_))
def UnbiasedEstimate(n,k,theta,Beta,theta0,Beta0,func,minT=1000,logf=None,zipfParam=1.5):
    """Simulate an unbiased estimate following the algorithm of Rhee and Glynn.

    Two coupled chains ``k1`` and ``k2`` are advanced with shared random
    draws; the running estimate accumulates the re-weighted differences
    ``func(k1) - func(k2)`` until the chains coincide or ``T`` steps elapse.

    Args:
        n: 2-D array; its shape gives ``(R, I)``.
        k: Initial state array (copied, never modified in place).
        theta, Beta, theta0, Beta0: Model parameters passed through to
            ``func`` and used in the gamma draws.
        func: Callable accepting ``(n, k, theta, Beta, theta0, Beta0)`` and
            returning a number or a numpy array.
        minT: Minimum number of chain steps.
        logf: Precomputed array; when the argument has no ``size`` attribute
            (e.g. ``None``) it is built with ``GetArray(n.max(), Beta)``.
        zipfParam: Success probability of the geometric distribution from
            which the extra chain length is drawn.
            NOTE(review): the default 1.5 looks like a leftover from the
            commented-out Zipf variant; it is not a probability, so callers
            presumably must pass a value in (0, 1] -- confirm.

    Returns:
        The accumulated estimate (same type as ``func``'s return value).
    """
    # Draw the length of the Markov chain from a power law
    #T = minT + np.random.zipf(a=zipfParam)
    # Draw the length of the Markov chain from a geometric distribution
    T = minT + np.random.geometric(zipfParam)
    print("The number of steps in the Markov chain is %i"%T)

    # Initialize variables
    R,I = np.shape(n)
    G1 = np.ones(R)
    G2 = np.ones(R)
    try:
        logf.size
    except AttributeError:
        logf = GetArray(n.max(),Beta)
    est = func(n,k,theta,Beta,theta0,Beta0)
    k1 = k.copy() # This is the equivalent of k
    k2 = k.copy() # This is the equivalent of \tilde k

    for step in range(1,T+1):
        kR1 = k1.sum(1)
        kI1 = k1.sum(0)
        kR2 = k2.sum(1)
        kI2 = k2.sum(0)

        # Resample G: both chains share the same auxiliary gamma draws
        for r in range(R):
            auxGammas = gamma(1,size=max(kR1[r],kR2[r]))
            G1[r] = gamma(theta/Beta)
            G2[r] = G1[r]
            G1[r] += sum(auxGammas[:kR1[r]])
            G2[r] += sum(auxGammas[:kR2[r]])

        # Resample D
        auxGammas = gamma(1,size=[I,max(kI1.max(),kI2.max())])
        auxGammas2 = gamma(1-Beta0,size=I)
        auxGamma = gamma(theta0+I*Beta0)
        D1 = auxGammas2 + np.array([sum(auxGammas[i,:kI1[i]-1]) for i in range(I)])
        D1 = D1/(D1.sum()+auxGamma)
        D2 = auxGammas2 + np.array([sum(auxGammas[i,:kI2[i]-1]) for i in range(I)])
        D2 = D2/(D2.sum()+auxGamma)

        # Resample k; the second chain lags the first by one step
        unif = np.random.uniform(size=k.shape)
        UpdateK(k1,n,I,R,G1,D1,unif,logf,Beta)
        if step>1:
            UpdateK(k2,n,I,R,G2,D2,unif,logf,Beta)

        # Terminate if coupling has merged
        if (k1==k2).all():
            break

        # Otherwise continue sum
        #denom = (1-zipf.cdf(step-minT-1,a=zipfParam)) if step>minT else 1.0
        denom = (1-geom.cdf(step-minT-1,p=zipfParam)) if step>minT else 1.0
        summand = (func(n,k1,theta,Beta,theta0,Beta0)-func(n,k2,theta,Beta,theta0,Beta0))/denom
        est += summand
        # print() calls instead of Python 2 print statements, so the function
        # parses on Python 3 while printing the same text on Python 2.
        print(summand)
        print(est)
    return est
def kolmogorov_criterion(self):
    """Print the scaled Kolmogorov statistic for the geometric model.

    The largest absolute gap between the empirical CDF returned by
    ``self.distribution_function()`` and ``geom.cdf`` with parameter
    ``self.p`` is multiplied by ``sqrt(len(self.sample))`` and printed after
    the tag ``'geometric'``.  Nothing is returned.
    """
    sample_size = len(self.sample)
    points, empirical = self.distribution_function()
    theoretical = geom.cdf(points, self.p)
    largest_gap = np.amax(np.absolute(empirical - theoretical))
    print('geometric', largest_gap * np.sqrt(sample_size))
def get_mete_sad_geom(S, N):
    """METE's predicted RAD when the only constraint is N/S

    The abundance at each rank is obtained by inverting a geometric CDF with
    ``p = S / N``, truncated at ``N``, at the plotting position
    ``(rank - 0.5) / S``.

    Keyword arguments:
    S -- the number of species
    N -- the total number of individuals

    Returns a tuple ``(abundance, p)`` where ``abundance`` is a list of
    ``int(S)`` integers ordered from the highest rank value down to rank 1.

    """
    assert S > 1, "S must be greater than 1"
    assert N > 0, "N must be greater than 0"
    assert S/N < 1, "N must be greater than S"

    p = S / N
    # Normalising constant of the truncated CDF; identical for every rank.
    cdf_at_N = geom.cdf(N, p)

    abundance = []
    # range objects cannot be reversed in place on Python 3, so count down.
    for rank in range(int(S), 0, -1):
        target = (rank - 0.5) / S

        def y(x, target=target):
            # Zero where the truncated CDF crosses this rank's position.
            return geom.cdf(x, p) / cdf_at_N - target

        abundance.append(int(round(bisect(y, 0, N))))
    return (abundance, p)
def KS_1_sample_test(states_data,column_type, distribution_type):
    """Run and plot a one-sample KS test of DC data against a CT-fitted model.

    Rows dated 2020-10-01 through 2020-12-31 are selected.  The CT column
    supplies method-of-moments parameters for ``distribution_type``; the DC
    column supplies the empirical CDF.  The statistic is computed by
    ``calc_KS_1_sample_test`` and the result is drawn with
    ``plot_KS_1_Sample_eCDF``.

    Args:
        states_data: DataFrame with a ``'Date'`` column plus
            ``'CT <column_type>'`` and ``'DC <column_type>'`` columns.
        column_type: Column suffix; also used, capitalised, as a plot label.
        distribution_type: ``'poisson'``, ``'geometric'`` or ``'binomial'``.
            Any other value leaves the function without testing or plotting.
    """
    ct_column = 'CT ' + column_type
    dc_column = 'DC ' + column_type

    # Restrict to the study window, then sort each state's column on its own
    in_window = (states_data['Date'] >= '2020-10-01') & (states_data['Date'] <= '2020-12-31')
    window = states_data.loc[in_window]
    ct_sorted = window[[ct_column]].sort_values(ct_column).reset_index(drop=True)
    dc_sorted = window[[dc_column]].sort_values(dc_column).reset_index(drop=True)

    # eCDF value for every DC observation, then one row per distinct value
    dc_sorted['DC_eCDF'] = get_cdf_list(dc_sorted.shape[0])
    dc_distinct = dc_sorted.drop_duplicates(subset=dc_column, keep="last").reset_index(drop=True)
    x_points = dc_distinct[dc_column].to_list()

    # Method-of-moments fit on the CT sample
    if distribution_type == 'poisson':
        mean = sample_mean(ct_sorted, ct_column)
        lambda_mme = poisson_para(mean)
        fitted_params = [lambda_mme]
        fitted_cdf = lambda value: poisson.cdf(value, lambda_mme)
    elif distribution_type == 'geometric':
        mean = sample_mean(ct_sorted, ct_column)
        p_mme = geometric_para(mean)
        fitted_params = [p_mme]
        fitted_cdf = lambda value: geom.cdf(value, p_mme)
    elif distribution_type == 'binomial':
        mean = sample_mean(ct_sorted, ct_column)
        variance = sample_variance(ct_sorted, mean, ct_column)
        n_mme, p_mme = binomial_para(mean, variance)
        fitted_params = [n_mme, p_mme]
        fitted_cdf = lambda value: binom.cdf(value, n_mme, p_mme)
    else:
        return

    # KS statistic, fitted CDF at every DC observation, and the plot
    max_diff_x = calc_KS_1_sample_test(
        x_points, fitted_params, dc_distinct, distribution_type, column_type, dc_column)
    dc_sorted['DC_eCDF_mme'] = dc_sorted.apply(lambda row: fitted_cdf(row[dc_column]), axis=1)
    plot_KS_1_Sample_eCDF(dc_sorted, dc_column, max_diff_x, column_type.capitalize(), distribution_type)
def calc_real_prob(self):
    """Return the probability that every robot succeeds.

    The per-robot failure probability is averaged uniformly over all integer
    distances in ``self.experiment.destination_range`` (both ends included).
    For each distance it sums ``grip_prob[1] * geom.cdf(distance, drop_prob)``
    over the paired grip/drop entries.  Three rounded diagnostics are
    printed: the failure probability, the returned value, and the value
    recomputed from the rounded failure probability.
    """
    experiment = self.experiment
    nearest = experiment.destination_range[0]
    farthest = experiment.destination_range[1]
    weight_per_distance = 1 / (farthest - nearest + 1)

    fail_prob_one_robot = 0
    for distance in range(nearest, farthest + 1):
        for grip_spec, drop_prob in zip(experiment.grip_probs, experiment.drop_probs):
            fail_prob_one_robot += grip_spec[1] * geom.cdf(distance, drop_prob)
    fail_prob_one_robot *= weight_per_distance

    digits = 7
    robots = experiment.num_robots
    print(round(fail_prob_one_robot, digits))
    real_prob = (1.0 - fail_prob_one_robot) ** robots
    print(round(real_prob, digits))
    from_rounded = (1.0 - round(fail_prob_one_robot, digits)) ** robots
    print(round(from_rounded, digits))
    return real_prob
def k_s_test_1_sample(pa, ri, dist='poisson', c=0.05):
    """Print the result of a one-sample KS test of ``ri`` against a fit on ``pa``.

    Parameters of ``dist`` are estimated from ``pa`` by the method of
    moments; the fitted CDF is evaluated at the sorted ``ri`` values and
    compared with the empirical CDF of ``ri`` on both sides of each point.

    Args:
        pa: Sample used for parameter estimation.
        ri: Sample whose empirical CDF is tested.
        dist: ``'poisson'``, ``'binomial'`` or ``'geometric'``.
        c: Critical value compared with the KS statistic.

    Returns:
        None; the statistic and the verdict are printed.

    Raises:
        ValueError: If ``dist`` is not one of the supported names.
    """
    if dist not in ('poisson', 'binomial', 'geometric'):
        raise ValueError("Unsupported dist {!r}".format(dist))

    pa_mean = np.mean(pa)
    pa_var = np.var(pa)

    # Sort the data to get the CDFs
    pa, ri = np.sort(pa), np.sort(ri)
    s = min(pa[0], ri[0]) - 100
    e = max(pa[-1], ri[-1]) + 100

    # Build the eCDF from the ri argument itself; the previous code read an
    # unrelated outer name (sorted_ri_confirmed) and ignored the parameter.
    # NOTE(review): assumes get_eCDF returns one more entry than len(ri), so
    # that consecutive entries bracket each point -- confirm against get_eCDF.
    F_ri = get_eCDF(ri, s, e)

    # pa CDF at ri change points
    if dist == 'poisson':
        lam_mme = pa_mean
        F_pa = [poisson.cdf(cp, lam_mme) for cp in ri]
    elif dist == 'binomial':
        p_mme = 1 - pa_var / pa_mean
        n_mme = pa_mean / p_mme
        F_pa = [binom.cdf(cp, n_mme, p_mme) for cp in ri]
    else:
        p_mme = 1 / pa_mean
        F_pa = [geom.cdf(cp, p_mme) for cp in ri]

    F_ri_minus, F_ri_plus = F_ri[0:-1], F_ri[1:]
    ks_stat = ks_index = 0
    # Iterate over the evaluated points (previously an undefined name X).
    for i in range(len(F_pa)):
        for gap in (abs(F_pa[i] - F_ri_minus[i]), abs(F_pa[i] - F_ri_plus[i])):
            if gap > ks_stat:
                ks_stat = gap
                ks_index = i

    print("ks_stat is {} at {}, where as c is {}".format(
        ks_stat, ri[ks_index], c))
    if ks_stat > c:
        print("d > c, So, we reject Null Hypothesis")
    else:
        print("d <= c, So, we accept Null Hypothesis")
    return
from scipy.stats import geom
import numpy as np

# Summary quantities of a geometric distribution with success probability p,
# evaluated at the single support point k.
k = 1
p = 0.3

expect = geom.expect(args=(p, ), loc=0)
mean = geom.mean(p)
var = geom.var(p)
sigma = geom.std(p)
pmf = geom.pmf(k, p, loc=0)
cdf = geom.cdf(k, p, loc=0)
# Second raw moment: E[X^2] = Var(X) + E[X]^2
ex2 = var + expect**2

print('expected = ', expect)
print('mean = ', mean)
print('variance = ', var)
print('std. dev. = ', sigma)
print('pmf = ', pmf)
print('cdf = ', cdf)
# Label corrected: the value is E[X^2], not E[X]^2.
print("E[X^2] = ", ex2)
T = dt * (n_time - 1)  # End time

# Discretize x: n points on [-L/2, L/2), dropping the right endpoint
x = np.linspace(-L / 2, L / 2, n + 1)
x = x[:n]

# Create vectors of random values for sines
# Sampling of A and phi (Latin hypercube, one row per sample)
X = pyDOE.lhs(2, samples=M, criterion='maximin')
A_vect = X[:, 0]
phi_vect = 2 * np.pi * X[:, 1]

# Sampling of omega: inverse-CDF draw from a geometric(0.25) distribution
# truncated to 1..max_omega
max_omega = 10
cum_distrib = geom.cdf(np.arange(1, max_omega + 1), 0.25)
cum_distrib = cum_distrib / cum_distrib[-1]
numbs = np.random.uniform(size=M)
omega_vect = np.zeros(M)
for k in range(max_omega):
    # counts how many cumulative thresholds lie above each uniform draw
    omega_vect = omega_vect + (numbs < cum_distrib[k])
# A count of c thresholds corresponds to omega = max_omega + 1 - c; written
# in terms of max_omega so the two stay in sync (was a literal 11).
omega_vect = (max_omega + 1) - omega_vect

# Create vectors of random values for square waves
# Sampling of A, c, and w
X = pyDOE.lhs(3, samples=M, criterion='maximin')
A2_vect = X[:, 0]
def cdf(self, x, p):
    """Return ``geom.cdf(self, x, p)``.

    NOTE(review): ``self`` is forwarded as the first positional argument.
    If ``geom`` is the ``scipy.stats.geom`` instance, that slot is the
    evaluation point, with ``x`` taken as the success probability and ``p``
    as ``loc``.  Confirm what ``geom`` is bound to in this module.
    """
    return geom.cdf(self, x, p)
def calc_real_prob(self):
    """Return the probability that all robots succeed at ``x_goal``.

    The per-robot failure probability is the sum, over paired entries of
    ``self.experiment.grip_probs`` and ``self.experiment.drop_props``, of
    ``grip_prob[1] * geom.cdf(x_goal, drop_prob)``.  The result is
    ``(1 - failure) ** num_robots``.
    """
    experiment = self.experiment
    failure = 0
    for grip_spec, drop_prob in zip(experiment.grip_probs, experiment.drop_props):
        failure += grip_spec[1] * geom.cdf(experiment.x_goal, drop_prob)
    return (1.0 - failure) ** experiment.num_robots
def theoretical_distribution(self):
    """Plot the geometric CDF with parameter ``self.p`` as red dots.

    The CDF is evaluated at every integer from ``self.r_low`` to
    ``self.r_up`` inclusive and drawn on the current pyplot axes.
    """
    support = np.array(list(range(self.r_low, self.r_up + 1)))
    cumulative = geom.cdf(support, self.p)
    plt.plot(support, cumulative, 'ro')
# Shrink the major tick labels on both axes.  Tick.label was removed from
# Matplotlib; label1 is the primary tick label it used to alias.
for tick in ax.xaxis.get_major_ticks():
    tick.label1.set_fontsize(5)
for tick in ax.yaxis.get_major_ticks():
    tick.label1.set_fontsize(5)
fig.suptitle("Distribución Binomial")
plt.show()

# GEOMETRIC DISTRIBUTION
from scipy.stats import geom

# scipy's geometric distribution starts at 1, not at 0!  To shift the
# support so that it starts at 0, pass loc=-1.
geom.pmf(0, p=0.25, loc=-1)
geom.cdf(0, p=0.25, loc=-1)
geom.cdf(4, p=0.25, loc=-1)
geom.rvs(p=0.25, size=20, loc=-1)

# Check the effect of loc
geom.cdf(range(5), p=0.3, loc=0)
geom.cdf(range(5), p=0.3, loc=-1)
geom.stats(p=0.25, loc=0, moments="mv")
geom.stats(p=0.25, loc=-1, moments="mv")

n, p = 10, 0.25
plt.show()

# ### Geometric Distribution

# In[4]:

# Geometric distribution: summary statistics and the PMF panel
from scipy.stats import geom

p = 0.6
# Support points from the 1st up to (not including) the 99th percentile;
# ppf is the percent point function, i.e. the inverse of the CDF.
x = np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p))
pmf_values = geom.pmf(x, p)
CDF = geom.cdf(x, p)

print("Mean : ", geom.stats(p, moments='m'))
print("Variance : ", geom.stats(p, moments='v'))
print("Prob. Mass Func. : ", pmf_values)
print("Cum. Density Func.: ", CDF)

fig = plt.figure(figsize=(20, 10))

# Top-left panel: PMF markers with vertical stems
plt.subplot(221)
plt.plot(x, pmf_values, 'go', ms=8, label='PMF')
plt.vlines(x, 0, pmf_values, colors='g', lw=5, alpha=0.5)
plt.xlabel("Sample Space of Geometric Distribution", fontsize=14)
plt.ylabel("PMF", fontsize=14)
plt.title("Probability Distribution of Geometric(p=0.6) Distribution", fontsize=16)
plt.xticks(np.arange(0, 6, 1))
plt.yticks(np.arange(0, 0.8, 0.1))
plt.legend(loc='best', shadow=True)
def get_eval_pvalue(n, theta=0.09241035129185671, p=0.011742364113774086):
    """Return one minus the CDF at ``n`` of a zero-inflated geometric model.

    The model places mass ``theta`` at zero and spreads the remaining
    ``1 - theta`` according to ``geom.cdf(n, p, loc=0)``.  For ``n == 0``
    the result is ``1 - theta`` without consulting the geometric CDF.
    """
    mass_up_to_n = theta
    if n != 0:
        mass_up_to_n = theta + (1 - theta) * geom.cdf(n, p, loc=0)
    return 1 - mass_up_to_n
#'weight' : 'bold', 'size': 13 } matplotlib.rc('font', **font) num_seats_def = 150 prng = np.random.RandomState(1) #Pseudorandom number generator demand_par_a_def = 60.2 #prng.randint(100, size=1).astype(float) demand_par_b_def = 10.0 #prng.randint(5, size=1).astype(float) demand_par_c_def = 1.0 epsilons_support_def = 10 pmin_def = 1.0 p = 0.3 prob_eps_rem = (1 - geom.cdf(epsilons_support_def + 1, p)) / ( epsilons_support_def * 2 + 1) prob_eps = [] for i in range(0, epsilons_support_def + 1): if i == 0: prob_eps.append(geom.pmf(i + 1, p) + prob_eps_rem) else: prob_eps.append(geom.pmf(i + 1, p) / 2 + prob_eps_rem) prob_eps_rev = prob_eps.copy() prob_eps_rev.reverse() prob_eps_rev.pop() prob_eps_def = prob_eps_rev + prob_eps p_r_min_def = 0.8 * pmin_def gamma_def = 0.9
# Verdict for the max_diff computed before this excerpt
if max_diff > 0.05:
    print("We reject NULL hypothesis, because Max value is " + str(max_diff) + " > c : 0.05")
else:
    print("We accept NULL hypothesis, because Max value is " + str(max_diff) + " <= c : 0.05")
"""##**KS 1 sample test with geometric as distribution for IN Confirmed**"""

# print(cases_state_mme_geometric)
cdf_y = numpy.array([])     # empirical CDF value after each observation
cdf = 0                     # running empirical CDF
max_diff = 0                # largest gap found so far
geom_cdf = numpy.array([])  # fitted geometric CDF at each observation
geometric_dipoint = 0       # fitted CDF value where the largest gap occurred
for i in cases_in:
    geom_point = geom.cdf(i, cases_state_mme_geometric)
    geom_cdf = numpy.append(geom_cdf, geom_point)
    # NOTE(review): only the empirical CDF value before the step at this
    # observation is compared; the value after the step (cdf + 1 / n) is
    # not -- confirm that a one-sided comparison is intended.
    if max_diff < numpy.abs(cdf - geom_point):
        max_diff = numpy.abs(cdf - geom_point)
        geometric_dipoint = geom_point
    # advance the empirical CDF by one observation (n is defined elsewhere)
    cdf += 1 / n
    cdf_y = numpy.append(cdf_y, cdf)

# Step plot of the empirical CDF together with the fitted geometric CDF
plt.figure('Cases Case', figsize=(18, 8))
plt.xlabel('Geometric Distribution')
plt.ylabel('Cumulative Distribution Frequency')
plt.step(cases_in, cdf_y, label="IN")
plt.plot(cases_in, geom_cdf, label="IL")
plt.show()
# The body of this condition lies beyond the end of this excerpt.
if max_diff > 0.05:
def testGeom():  # {{{
    """
    Geometric Distribution (discrete)

    Notes
    -----
    Repeat a Bernoulli trial k times: the probability that the first
    success happens on trial k.
    Why is it called "geometric"?  Because the probabilities form a
    geometric sequence (each term is a constant multiple of the previous).
    p: probability of success
    q: probability of failure (1-p)
    k: number of trials up to and including the first success
       (the first k-1 trials are failures)
    geom.pmf(k, p) equals (1-p)**(k-1)*p
    """

    def draw_pmf_panel(axis, support, heights):
        # Markers plus vertical stems, drawn identically for both pmf panels
        axis.plot(support, heights, 'bo', markersize=5, label='geom pmf')
        axis.vlines(support, 0, heights, colors='b', linewidth=5, alpha=0.5, label='vline pmf')
        axis.legend(loc='best', frameon=False)

    # Prepare the data: p is known.
    # X axis: the trial k on which the first success occurs
    # Y axis: probability
    p = 0.4
    xs = np.arange(geom.ppf(0.01, p), geom.ppf(0.99, p), step=1)

    # E(X) = 1/p, D(X) = (1-p)/p**2
    mean, var, skew, kurt = geom.stats(p, moments='mvsk')
    print("mean: %.2f, var: %.2f, skew: %.2f, kurt: %.2f" % (mean, var, skew, kurt))

    fig, axs = plt.subplots(2, 2)

    # Panel 1: pmf from scipy
    draw_pmf_panel(axs[0][0], xs, geom.pmf(xs, p))

    # Panel 2: pmf from the closed-form expression
    draw_pmf_panel(axs[0][1], xs, (1 - p)**(xs - 1) * p)
    axs[0][1].set_title('ys = (1-p)**(xs-1)*p')

    # Panel 3: cdf P(X<=x)
    cdf_values = geom.cdf(xs, p)
    axs[1][0].plot(xs, cdf_values, 'bo', markersize=5, label='geom cdf')
    axs[1][0].legend(loc='best', frameon=False)

    # ppf maps y --> x, cdf maps x --> y, so the round trip should give xs
    print(np.allclose(xs, geom.ppf(cdf_values, p)))

    # Panel 4: empirical pmf of generated random variates
    data = geom.rvs(p, size=1000)
    import sys
    sys.path.append("../../thinkstats")
    import Pmf
    pmf = Pmf.MakePmfFromList(data)
    sample_xs, sample_ys = pmf.Render()
    axs[1][1].plot(sample_xs, sample_ys, 'bo', markersize=5, label='rvs-pmf')

    plt.show()
def cdf(self, k):
    """Return the geometric CDF at ``k`` for success probability ``self.p``."""
    success_probability = self.p
    return geom.cdf(k, success_probability)
def _cdf(self, x, l, k, p, w): wei_cdf = 1 - np.exp(-1 * np.power(x / l, k)) return w * wei_cdf + (1 - w) * geom.cdf(x, p)
# Finish the PMF figure on the axes created before this excerpt: blue stems
# from the distribution object, dashed black stems from the frozen one.
ax.set_title('Función de probabilidad de Geom(0.3)')
ax.vlines(x, 0, geom.pmf(x, p), colors='b', lw=4, alpha=0.5)
rv = geom(p)  # frozen distribution with p fixed
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='--', lw=1, label="Frozen PMF")
ax.legend(loc='best')
plt.show()

# Report the four moments (mean, var, skew, kurt are defined before this excerpt)
print("Media %f" % mean)
print("Varianza %f" % var)
print("Sesgo %f" % skew)
print("Curtosis %f" % kurt)

# Cumulative distribution function at the same support points
fig, ax = plt.subplots(1, 1)
prob = geom.cdf(x, p)
ax.plot(x, prob, 'bo', ms=8, label="Función de distribución acumulada")
plt.title('Función de distribución acumulada')
plt.show()

# Histogram of 10000 random variates
fig, ax = plt.subplots(1, 1)
r = geom.rvs(p, size=10000)
plt.hist(r)
plt.title('Histograma de la random')
plt.show()
def multiday(
    depots,
    sample_generator,
    dist_and_time,
    route_optimizer,
    simulator,
    n_days,
    day_start,
    day_end,
    seed=None,
    replications=1,
    plot=False,
    collection_points=None,
    k=0,
    dist_threshold=20000,
    futile_count_threshold=1,
    cap=20,
    tlim=1e10,
):
    """Multiday Sim

    Simulates ``n_days`` of deliveries.  Each day the pre-generated arrivals
    are appended to the outstanding packages, optionally some packages are
    re-assigned to collection points, routes are optimised and simulated,
    and delivered packages are removed before the next day.

    Parameters
    ----------
    depots : np.array
        2*n_depots array of longitudes and latitudes of the depots. This is
        set up to support multidepot problems. However, to do this properly
        we'll need to track which depots have which packages. So this isn't
        actually fully supported.
    sample_generator : function
        Called as ``sample_generator(rg)`` with the random generator; returns
        the day's customers (objects with ``lat`` / ``lon``) and their time
        windows.
    dist_and_time : function
        Called as ``dist_and_time(customers)``; returns a distance matrix and
        a time matrix.  A distance matrix of ``None`` stops the simulation.
    route_optimizer : function
        Inputs are the depot numbers, the distance and time matrices, the
        time windows as a np.array, the current day, the day each package
        arrived, the number of times each package was futile, and the list
        of alternate locations. Outputs are a set of vehicle routes, and a
        list of packages that were not scheduled.
    simulator : function
        Simulates the deliveries for a set of routes, given the routes,
        distances, times, time windows, customers and the random generator.
        Returns distances, times, futile deliveries and delivered packages.
    n_days : int
        The number of days to simulate.
    day_start : int
        The time for the start of a day
    day_end : int
        The time for the end of a day
    seed : int
        (Optional) The seed to initialise the random number generator with
    replications : int
        (Optional) Defaults to 1. The number of simulations to perform on the
        optimized route. Only the last is used as the input to the next day.
        (Might be an idea to take the mode values if enough simulations are
        performed?)
    plot : bool
        (Optional) Whether to display a plot of the current routes.
    collection_points: bool
        (Optional) Whether to enable the use of collection points
    k : int
        (Optional) If we have collection points, the number of collection
        points
    dist_threshold : int
        The distance a customer needs to be within to have their package
        assigned to a collection point
    futile_count_threshold : int
        The number of times a package needs to be futile before it gets
        assigned to a collection point
    cap : int
        The capacity of the collection points
    tlim : int
        Wall-clock limit in seconds, checked at the end of each simulated
        day. Helps us stop the simulation if we get an extreme buildup of
        packages

    Returns
    -------
    list
        One ``collect_data(...)`` record per day and replication.
    """
    start = time.time()
    logger.debug("Start multiday sim")
    rg = Generator(PCG64(seed))

    # Pregenerate arrivals
    latitudes_per_day = []
    longitudes_per_day = []
    time_windows_per_day = []
    customers_per_day = []
    allocat_packages_to_collection = [[] for i in range(k)]  # preset package allocation
    customer_to_cp = [
        [] for i in range(k)
    ]  # initialise the customer list for each collection point
    logger.debug("Generating incoming packages")
    for day in range(n_days):
        customers, new_time_windows = sample_generator(rg)
        latitudes_per_day.append([c.lat for c in customers])
        longitudes_per_day.append([c.lon for c in customers])
        time_windows_per_day.append(new_time_windows)
        customers_per_day.append(customers)

    data = []
    n_depots = depots.shape[1]
    # The per-package arrays below start with one entry per depot, so
    # package indices are offset by n_depots.
    delivery_time_windows = np.array(
        [[day_start, day_end] for i in range(n_depots)]
    )  # These are our beliefs about the time windows, not their true value
    arrival_days = np.zeros(n_depots)
    futile_count = np.zeros(n_depots)
    customers = np.array(
        [
            Customer(depots[0, 0], depots[1, 0], 1, 1, rg=rg)
            for i in range(len(depots[0]))
        ]
    )
    packages_at_collection = []
    collection_point_removed_packages = 0
    if collection_points and k != 0:
        # choose the number of collection points
        sol_fac_lat, sol_fac_lon, coord, fac_coord = opt_collection_coord(
            k, cap, depots, sample_generator, dist_and_time, seed=None
        )
        # initialise a list of dictionaries for each collection point
        # (keys are customers, values the number of days spent waiting there)
        packages_at_collection = [{} for i in range(k)]
        # collection_point_removed_packages = [0 for i in range(k)]

    for day in range(n_days):
        logger.debug("Start day %i" % day)
        collection_point_removed_packages = [0 for i in range(k)]
        # Generate data
        new_time_windows, new_customers = (
            time_windows_per_day[day],
            customers_per_day[day],
        )
        delivery_time_windows = np.vstack((delivery_time_windows, new_time_windows))
        arrival_days = np.append(arrival_days, [day for _ in range(len(new_customers))])
        futile_count = np.append(futile_count, np.zeros(len(new_customers)))
        customers = np.append(customers, new_customers)

        logger.debug("Number of incoming packages: %i" % len(new_customers))
        logger.debug(
            "Current number of packages: %i" % (len(customers) - 1)
        )  # -1 for depot
        logger.debug("Calculating distance and time matrix")

        cp_customers = np.array([])
        collection_dist = 0
        # Remove packages from collection points
        if collection_points and k != 0:
            for i in range(k):
                logger.debug(
                    "Number of packages in collection %i, day %i: %i",
                    i,
                    day,
                    len(packages_at_collection[i]),
                )
                if (
                    len(packages_at_collection[i]) != 0
                ):  # there are packages in the collection point
                    p = 0.6  # success probability
                    # np.random.geometric(p=0.35, size=10000)
                    # np.random.geometric(p=0.6, size=len(packages_at_collection[i]))
                    collected_package = []
                    for c in packages_at_collection[i]:
                        v = packages_at_collection[i][c]
                        # package collection distribution
                        cdf = geom.cdf(v, p)
                        # if the probability is greater than the random number, the package is picked up
                        if cdf >= rg.random():
                            collected_package.append(c)
                            collection_point_removed_packages[i] += 1
                        else:
                            # add a day to the number of days at collection point
                            packages_at_collection[i][c] += 1
                    # removal is deferred so the dict is not mutated while iterating
                    for collected in collected_package:
                        packages_at_collection[i].pop(collected)

            # Add customers to collections points, and add visited collection points to customers
            # a list of undelivered packages in the simulation
            undelivered = np.ones(len(futile_count), dtype=bool)
            for i, c in enumerate(futile_count):
                # a threshold of day count of the package in the system
                if c >= futile_count_threshold and i >= n_depots:
                    cd = os.path.join(
                        os.path.dirname(os.path.abspath(__file__)), "..", "data"
                    )
                    # get the dist from the customer's house to the collection points
                    lat_all = sol_fac_lat[:]
                    lon_all = sol_fac_lon[:]
                    lat_all.insert(0, customers[i].lat)
                    lon_all.insert(0, customers[i].lon)
                    if len(lat_all) > 4:
                        print("error")
                    coord_filename = None
                    dist, tm = osrm_get_dist(
                        cd,
                        coord_filename,
                        lat_all,
                        lon_all,
                        source=[0],
                        save=False,
                        host="localhost:5000",
                    )
                    # choose the closest collection point
                    min_value = min(dist[0])
                    # dist_threshold = 20000  # 20km
                    # allow the package to be assigned to the closest collection point if the dist is within the threshold
                    if min_value < dist_threshold:
                        min_ind = dist[0].index(min_value)
                        # assign if its closest collection point still has spare space
                        if len(packages_at_collection[min_ind]) < cap:
                            # assign the package to its closest collection point
                            allocat_packages_to_collection[min_ind].append(
                                i
                            )  # i is the index but not the unique index for the customer?
                            undelivered[
                                i
                            ] = False  # customer to be removed from the delivery list
                            # record the customer list for each collection point
                            customer_to_cp[min_ind].append(customers[i])
                            collection_dist += min_value
                            # tw_to_cp[min_ind].append(delivery_time_windows[i])
                            # ad_to_cp[min_ind].append(arrival_days[i])
                            # fc_to_cp[min_ind].append(futile_count[i])
                            # packages_at_collection[min_ind][customers[i]] = 0

            # Remove packages sent to collection points from customers
            delivery_time_windows = delivery_time_windows[undelivered]
            arrival_days = arrival_days[undelivered]
            futile_count = futile_count[undelivered]
            customers = customers[undelivered]

            cp_customers = np.array([])
            # add collection point as a customer if there is package allocated to it
            for cp in range(k):
                if len(customer_to_cp[cp]) != 0:
                    cp_customers = np.append(
                        cp_customers,
                        Customer(sol_fac_lat[cp], sol_fac_lon[cp], 1, 1, rg=rg),
                    )
            # customers = np.append(customers, new_customers)
            # cp_customers = np.array(
            #     [
            #         Customer(sol_fac_lat[cp], sol_fac_lon[cp], 1, 1, rg=rg)
            #         for cp in range(k)
            #         if len(customer_to_cp[cp]) != 0
            #     ]
            # )
            if len(cp_customers) > 0:
                # collection points are appended at the END of every
                # per-package array; the post-simulation check relies on this
                cp_time_windows = np.array(
                    [[day_start, day_end] for i in range(len(cp_customers))]
                )
                delivery_time_windows = np.vstack(
                    (delivery_time_windows, cp_time_windows)
                )
                customers = np.append(customers, cp_customers)
                futile_count = np.append(futile_count, np.zeros(len(cp_customers)))
                arrival_days = np.append(
                    arrival_days, [day for _ in range(len(cp_customers))]
                )

        # Get times and distances
        dm, tm = dist_and_time(customers)
        if dm is None:
            logger.critical("Distance computation failed. Stopping simulation.")
            # We've exceeded the map bounds. Stop here for now, but we should really handle this more gracefully.
            break
        dm = np.array(dm)
        tm = np.array(tm)

        logger.debug("Compute alternate locations")
        # Setup list of alternate locations
        alternate_locations = []
        temp = customers.tolist()
        while len(temp) > 0:
            c = temp.pop()
            location_index = []
            for a in c.alternates:
                location_index.append(customers.tolist().index(a))
                if a in temp:
                    temp.remove(a)
            alternate_locations.append(location_index)

        logger.debug("Optimise routes")
        # Calculate routes for the day
        routes, unscheduled = route_optimizer(
            [i for i in range(n_depots)],
            dm,
            tm,
            delivery_time_windows,
            day,
            arrival_days,
            futile_count,
            alternate_locations,
        )
        if plot:
            plt.clf()
            routes.plot(
                positions=[(customer.lon, customer.lat) for customer in customers],
                weight_matrix=dm,
            )
            plt.show(block=False)
            plt.pause(0.001)

        # every scheduled entry has its futile counter incremented here
        futile_count[[i for i in range(len(customers)) if i not in unscheduled]] += 1
        # logger.debug(routes)
        logger.debug("Unscheduled: %s" % unscheduled)

        logger.debug("Start simulations")
        for i in range(replications):
            logger.debug("Replication %i" % i)
            # Simulate behaviour
            distances, times, futile, delivered = simulator(
                routes, dm, tm, delivery_time_windows, customers, rg
            )
            logger.debug("Delivered: %s" % delivered)

            # Data collection to save
            data.append(
                collect_data(
                    day,
                    routes,
                    distances,
                    times,
                    futile,
                    delivered,
                    arrival_days,
                    delivery_time_windows,
                    [len(l) for l in packages_at_collection],
                    collection_point_removed_packages,
                    collection_dist,
                )
            )

        # Remove delivered packages, using just the last result
        undelivered = np.ones(len(customers), dtype=bool)
        for alternates in alternate_locations:
            # Remove all alternate locations as well
            for package in delivered:
                if package in alternates:
                    undelivered[alternates] = False
        # depots are never removed
        undelivered[[i for i in range(n_depots)]] = True

        # # get the undelivered list for collection point
        # cp_undelivered = undelivered[-len(cp_customers) :]
        # count_cp_undelivered = 0
        # check if collection point is visited
        # NOTE(review): cp_customers only holds collection points that had
        # customers allocated, yet customer_to_cp / packages_at_collection
        # are indexed with the same -i - 1 offset; the two appear to line up
        # only when all k collection points are active -- confirm.
        for i in range(len(cp_customers)):
            if not undelivered[-i - 1]:
                # collection point visited
                for cust in customer_to_cp[
                    -i - 1
                ]:  # add package into the collection point
                    packages_at_collection[-i - 1][cust] = 0
                # allocat_packages_to_collection[min_ind] = []
                customer_to_cp[
                    -i - 1
                ] = []  # reset the customer list for the visited collection point
            else:
                # count_cp_undelivered += 1
                undelivered[
                    -i - 1
                ] = False  # remove collection point in the customer list
                # Generate data
                # new_time_windows, new_arrival_days, new_futile_count, new_customers = (
                #     tw_to_cp[-i - 1],
                #     ad_to_cp[-i - 1],
                #     fc_to_cp[-i - 1],
                #     customer_to_cp[-i - 1],
                # )

        delivery_time_windows = delivery_time_windows[undelivered]
        arrival_days = arrival_days[undelivered]
        futile_count = futile_count[undelivered]
        customers = customers[undelivered]

        # if count_cp_undelivered:
        #     # stick the undelivered collection packages on
        #     delivery_time_windows = np.vstack((delivery_time_windows, new_time_windows))
        #     arrival_days = np.append(arrival_days, new_arrival_days)
        #     futile_count = np.append(futile_count, new_futile_count)
        #     customers = np.append(customers, new_customers)

        logger.debug(
            "Number of remaining Packages: %i" % (len(customers) - 1)
        )  # -1 for depot
        # stop early once the wall-clock budget is exhausted
        if time.time() - start > tlim:
            break

    return data
def OneSampleKS_Geometric(data_cases,datatype):
    """Print a one-sample KS test of HI data against a geometric fit on GA data.

    Rows whose ``'Date'`` string has ``'1'`` at index 5 are used.  The
    geometric parameter is the method-of-moments estimate from the GA
    column (count divided by sum); the sorted HI column provides the
    empirical CDF.  The maximum gap is compared with 0.05.

    Args:
        data_cases: DataFrame with ``'Date'`` and the ``daily_GA_*`` /
            ``daily_HI_*`` columns.
            NOTE(review): index 5 being '1' selects months 10-12 only if
            dates look like 'YYYY-MM-DD' -- confirm the date format.
        datatype: ``'Confirmed cases'`` selects the confirmed columns; any
            other value selects the deaths columns.

    Returns:
        None; p_mme, the maximum difference and the verdict are printed.
    """
    n_rows, n_cols = data_cases.shape

    if datatype == 'Confirmed cases':
        first_column, second_column = 'daily_GA_confirmed', 'daily_HI_confirmed'
    else:
        first_column, second_column = 'daily_GA_deaths', 'daily_HI_deaths'

    # Oct, Nov, Dec data for both states
    first_state, second_state = [], []
    for _, record in data_cases.iterrows():
        if record['Date'][5] != '1':
            continue
        first_state.append(record[first_column])
        second_state.append(record[second_column])

    # Geometric p by method of moments, using the first state's data
    p_mme = float(len(first_state)) / float(np.sum(first_state))

    # Empirical CDF of the second state's sorted data; the final value is
    # pinned to 1 instead of relying on the accumulated sum.
    second_state.sort()
    n_points = len(second_state)
    increment = 1 / n_points
    ecdf = []
    running = 0
    for _ in range(n_points - 1):
        running += increment
        ecdf.append(running)
    ecdf.append(1)

    largest = 0
    print("p_mme: ",p_mme)

    # Compare the fitted CDF with the eCDF on both sides of every data point
    for position, observation in enumerate(second_state):
        fitted = geom.cdf(observation, p_mme)
        ecdf_before = ecdf[position - 1] if position else 0
        ecdf_after = ecdf[position]
        gap = max(abs(fitted - ecdf_before), abs(fitted - ecdf_after))
        if largest < gap:
            largest = gap

    print("\nMaximum Difference: ", largest)
    threshold = 0.05

    # Compare the maximum difference with the critical value
    if largest > threshold:
        print("Null hypothesis is rejected as Oct-Dec 2020 data for the second state does not have the distribution with the obtained MME parameters for ", datatype)
    else:
        print("Null hypothesis is accepted as Oct-Dec 2020 data for the second state have the distribution with the obtained MME parameters for ", datatype)