def from_cdf(self): """ Obtain the maximum likelihood form of the Zipf distribution, given the mle value for the Zipf shape parameter (a). Using a, this code generates a rank-abundance distribution (RAD) from the cumulative density function (cdf) using the percent point function (ppf) also known as the quantile function. see: http://www.esapubs.org/archive/ecol/E093/155/appendix-B.htm This is an actual form of the Zipf distribution, obtained from getting the mle for the shape parameter. """ p = md.zipf_solver(self.obs) S = len(self.obs) rv = stats.zipf(a=p) rad = [] for i in range(1, S+1): print rad val = (S - i + 0.5)/S x = rv.ppf(val) rad.append(int(x)) return rad
def zipf_initialization_n(N, alphas, V_max):
    assert N == len(alphas)
    # One frozen Zipf distribution per component; T[i, j] holds the survival
    # probability P(X_i > j).
    rv = [zipf(alpha) for alpha in alphas]
    T = np.zeros((N, V_max))
    for i in range(N):
        for j in range(V_max):
            T[i, j] = 1 - rv[i].cdf(j)
    h = T2h(T)
    return rv, T, h
def test_rvs(self):
    vals = stats.zipf.rvs(1.5, size=(2, 50))
    assert numpy.all(vals >= 1)
    assert numpy.shape(vals) == (2, 50)
    assert vals.dtype.char in typecodes["AllInteger"]
    val = stats.zipf.rvs(1.5)
    assert isinstance(val, int)
    val = stats.zipf(1.5).rvs(3)
    assert isinstance(val, numpy.ndarray)
    assert val.dtype.char in typecodes["AllInteger"]
def test_rvs(self):
    vals = stats.zipf.rvs(1.5, size=(2, 50))
    assert_(numpy.all(vals >= 1))
    assert_(numpy.shape(vals) == (2, 50))
    assert_(vals.dtype.char in typecodes['AllInteger'])
    val = stats.zipf.rvs(1.5)
    assert_(isinstance(val, int))
    val = stats.zipf(1.5).rvs(3)
    assert_(isinstance(val, numpy.ndarray))
    assert_(val.dtype.char in typecodes['AllInteger'])
def gen_weights_zipf(n_weights, zipf_param=1.13):
    '''
    Generate first-choice candidate preference frequencies among voters,
    assuming that preference is Zipf distributed. Truncate at n_weights
    total candidates/frequencies.
    '''
    rv = zipf(zipf_param)
    out_weights = [rv.pmf(j) for j in range(1, n_weights + 1)]
    # Renormalise so the truncated pmf values sum to 1.
    reweight_factor = sum(out_weights)
    out_weights = [x / reweight_factor for x in out_weights]
    return out_weights
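# Hypothetical usage sketch (not part of the original source): the truncated,
# renormalised Zipf weights returned by gen_weights_zipf form a probability
# vector, so they can be passed directly to np.random.choice to sample voters'
# first choices. Assumes `import numpy as np` and the `zipf` import used above.
weights = gen_weights_zipf(5, zipf_param=1.13)
assert abs(sum(weights) - 1.0) < 1e-12                     # weights sum to one
first_choices = np.random.choice(5, size=100, p=weights)   # sampled first choices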
def simulate_zipf(alpha=1.5, n=10**4, repetitions=10, x_min=None):
    indexes = list()
    estimations_alpha = list()
    estimations_xmin = list()
    bigger_than_min = list()
    for k in range(1, repetitions + 1):
        _zipf_rv = zipf(alpha)
        discrete_sample = np.sort(_zipf_rv.rvs(size=n))
        if x_min is not None:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False,
                                             xmin=x_min)
        else:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False)
        print(fit_estimating_discrete.alpha)
        print(fit_estimating_discrete.xmin)
        indexes.append(k)
        estimations_alpha.append(fit_estimating_discrete.alpha)
        estimations_xmin.append(fit_estimating_discrete.xmin)
        if x_min:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample, x_min)))
        else:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample,
                                     fit_estimating_discrete.xmin)))
    if not x_min:
        plot_results(rep_nums=indexes, alphas=estimations_alpha,
                     xmins=estimations_xmin, resampling=bigger_than_min)
    else:
        plot_results(rep_nums=indexes, alphas=estimations_alpha,
                     xmins=None, resampling=bigger_than_min)
def from_cdf(self): """ Obtain the maximum likelihood form of the Zipf distribution, given the mle value for the Zipf shape parameter (a). Using a, this code generates a rank-abundance distribution (RAD) from the cumulative density function (cdf) using the percent point function (ppf) also known as the quantile function. see: http://www.esapubs.org/archive/ecol/E093/155/appendix-B.htm This is an actual form of the Zipf distribution, obtained from getting the mle for the shape parameter. """ p = self.zipf_solver(self.obs) S = len(self.obs) rv = stats.zipf(a=p) rad = [] for i in range(1, S + 1): val = (S - i + 0.5) / S x = rv.ppf(val) rad.append(int(x)) point = collections.namedtuple('Rad_and_p', ['x', 'y']) point_return = point(rad, y=p) return point_return
def gen_ranked_preferences_zipf(n_candidates, n_voters, zipf_param=1.1):
    '''
    Generate ranked choice candidate preference frequencies among voters
    assuming that preference rankings are zipf distributed.
    n_voters might need to be about 500 * n_candidates
    '''
    candidates = list(range(n_candidates))
    pref_ballot_samples = list()
    rv = zipf(zipf_param)
    # zipf of index 0 doesn't exist, thus add 1: ii+1
    scaler = sum(rv.pmf(ii + 1) for ii in range(n_voters))
    n_prefs = [n_voters * rv.pmf(i + 1) / scaler for i in range(n_voters)]
    # Generate random preference orderings according to zipf distributed samples
    offset = 0
    for n in n_prefs:
        m = int(round(n + offset))
        offset = n - m + offset
        tmp_candidates = candidates.copy()
        shuffle(tmp_candidates)
        pref_ballot_samples.extend([tuple(tmp_candidates)] * m)
    return tuple(pref_ballot_samples)
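# Hypothetical usage sketch (not part of the original source): the ballots
# returned by gen_ranked_preferences_zipf are tuples of candidate indices, and
# because each random ordering is repeated with Zipf-decaying multiplicity,
# counting them shows a heavy-tailed frequency profile.
from collections import Counter

ballots = gen_ranked_preferences_zipf(n_candidates=4, n_voters=2000)
print(len(ballots))                      # roughly n_voters ballots
print(Counter(ballots).most_common(5))   # a few orderings dominate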
# Tail of a method that predicts ranked abundances with a Poisson GLM
# (the enclosing def is not shown in this excerpt).
off = [np.log(sum(self.obs))] * len(self.obs)
d = pd.DataFrame({'ranks': ranks, 'off': off, 'x': self.obs})
lm = smf.glm(formula='x ~ ranks', data=d, family=sm.families.Poisson()).fit()
pred = lm.predict()
return pred


ad = [20000, 10000, 8000, 6000, 1000, 200, 200, 100, 18, 16, 14, 12, 10, 4, 4,
      2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1, 1, 1, 1]

a = md.zipf_solver(ad)
S = len(ad)
rv = stats.zipf(a)
rad = []
vals = []
for i in range(1, S + 1):
    vals.append((S - i + 0.5) / S)

# Time a single vectorised ppf call over all quantiles at once.
t = time.perf_counter()
x = rv.ppf(vals)
elapsed_t = time.perf_counter() - t
print(x, elapsed_t)
sys.exit()

ranks = range(1, len(ad) + 1)
zipf_pred = zipf(ad)
def simulate_ln_slowly(alpha=1.5, n=10**4, repetitions=10, x_min=None):
    mp.dps = 15
    ht = BaseHeavyTailedDistribution(slowly_varying_function=lambda n: log(n),
                                     alpha=1.5)
    top = 2 * 10**8
    ints = np.arange(1, top, 1)
    np.set_printoptions(precision=15)
    _constant = float(ht.get_constant())
    result = np.multiply(np.log(ints) * np.power(ints, -alpha), _constant)
    cum_fun = np.cumsum(result)
    _max = np.max(cum_fun)
    indexes = list()
    estimations_alpha = list()
    estimations_xmin = list()
    bigger_than_min = list()
    for k in range(1, repetitions + 1):
        samples = uniform.rvs(size=n)
        below = np.extract(samples <= _max, samples)
        over = np.extract(samples > _max, samples)
        discrete_sample = list()
        for i, u in enumerate(below):
            discrete_sample.append(np.searchsorted(cum_fun, u) + 1)
        if over.size:
            # Pad the tail with Zipf draws only when some uniforms exceeded the
            # truncated cdf; slicing with [-0:] would otherwise append the
            # whole sorted sample.
            _zipf_rv = zipf(alpha)
            _zipfs_data = np.sort(_zipf_rv.rvs(size=n))[-over.size:]
            discrete_sample.extend(_zipfs_data)
        if x_min is not None:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False,
                                             xmin=x_min)
        else:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False)
        print(fit_estimating_discrete.alpha)
        print(fit_estimating_discrete.xmin)
        indexes.append(k)
        estimations_alpha.append(fit_estimating_discrete.alpha)
        estimations_xmin.append(fit_estimating_discrete.xmin)
        if x_min:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample, x_min)))
        else:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample,
                                     fit_estimating_discrete.xmin)))
    if not x_min:
        plot_results(rep_nums=indexes, alphas=estimations_alpha,
                     xmins=estimations_xmin, resampling=bigger_than_min)
    else:
        plot_results(rep_nums=indexes, alphas=estimations_alpha,
                     xmins=None, resampling=bigger_than_min)
def debug_sampler_and_plot():
    sampler = Basic_Sampler('gpu')

    # gamma
    output = sampler.gamma(np.ones(1000)*4.5, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 100, 100), stats.gamma.pdf(np.linspace(0, 100, 100), 4.5, scale=5))
    plt.title('gamma(4.5, 5)')
    plt.show()

    # standard_gamma
    output = sampler.standard_gamma(np.ones(1000)*4.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 20, 100), stats.gamma.pdf(np.linspace(0, 20, 100), 4.5))
    plt.title('standard_gamma(4.5)')
    plt.show()

    # dirichlet
    output = sampler.dirichlet(np.ones(1000)*4.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # x = np.linspace(np.min(output), np.max(output), 100)
    # plt.plot(x, stats.dirichlet.pdf(x, alpha=np.ones(100)*4.5))
    plt.title('dirichlet(4.5)')
    plt.show()

    # beta
    output = sampler.beta(np.ones(1000)*0.5, 0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.beta.pdf(np.linspace(0, 1, 100), 0.5, 0.5))
    plt.title('beta(0.5, 0.5)')
    plt.show()

    # beta(2, 5)
    output = sampler.beta(np.ones(1000)*2, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.beta.pdf(np.linspace(0, 1, 100), 2, 5))
    plt.title('beta(2, 5)')
    plt.show()

    # normal
    output = sampler.normal(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-2, 13, 100), stats.norm.pdf(np.linspace(-2, 13, 100), 5, scale=2))
    plt.title('normal(5, 2)')
    plt.show()

    # standard_normal
    output = sampler.standard_normal(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-3, 3, 100), stats.norm.pdf(np.linspace(-3, 3, 100)))
    plt.title('standard_normal()')
    plt.show()

    # uniform
    output = sampler.uniform(np.ones(1000)*(-2), np.ones(1000)*5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-3, 6, 100), stats.uniform.pdf(np.linspace(-3, 6, 100), -2, 7))
    plt.title('uniform(-2, 5)')
    plt.show()

    # standard_uniform
    output = sampler.standard_uniform(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-0.3, 1.3, 100), stats.uniform.pdf(np.linspace(-0.3, 1.3, 100)))
    plt.title('standard_uniform()')
    plt.show()

    # binomial
    output = sampler.binomial(np.ones(1000)*10, np.ones(1000)*0.5)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True,
             range=(np.min(output)-0.5, np.max(output)-0.5))
    # plt.scatter(np.arange(10), stats.binom._pmf(np.arange(10), 10, 0.5), c='orange', zorder=10)
    plt.title('binomial(10, 0.5)')
    plt.show()

    # negative_binomial
    output = sampler.negative_binomial(np.ones(1000)*10, 0.5)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True,
             range=(np.min(output)-0.5, np.max(output)-0.5))
    plt.scatter(np.arange(30), stats.nbinom._pmf(np.arange(30), 10, 0.5), c='orange', zorder=10)
    plt.title('negative_binomial(10, 0.5)')
    plt.show()

    # multinomial
    output = sampler.multinomial(5, [0.8, 0.2], 1000)
    # output = sampler.multinomial([10]*4, [[0.8, 0.2]]*4, 3)
    plt.figure()
    plt.hist(output[0], bins=10, density=True)
    plt.title('multinomial(5, [0.8, 0.2])')
    plt.show()

    a = np.array([np.array([[i] * 6 for i in range(6)]).reshape(-1),
                  np.array(list(range(6)) * 6)]).T
    output = stats.multinomial(n=5, p=[0.8, 0.2]).pmf(a)
    sns.heatmap(output.reshape(6, 6), annot=True)
    plt.ylabel('number of the 1st kind (p=0.8)')
    plt.xlabel('number of the 2nd kind (p=0.2)')
    plt.title('stats.multinomial(n=5, p=[0.8, 0.2])')
    plt.show()

    # poisson
    output = sampler.poisson(np.ones(1000)*10)
    plt.figure()
    plt.hist(output, bins=22, density=True, range=(-0.5, 21.5))
    plt.scatter(np.arange(20), stats.poisson.pmf(np.arange(20), 10), c='orange', zorder=10)
    plt.title('poisson(10)')
    plt.show()

    # cauchy
    output = sampler.cauchy(np.ones(1000)*1, 0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-5, 7))
    plt.plot(np.linspace(-5, 7, 100), stats.cauchy.pdf(np.linspace(-5, 7, 100), 1, 0.5))
    plt.title('cauchy(1, 0.5)')
    plt.show()

    # standard_cauchy
    output = sampler.standard_cauchy(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-7, 7))
    plt.plot(np.linspace(-7, 7, 100), stats.cauchy.pdf(np.linspace(-7, 7, 100)))
    plt.title('standard_cauchy()')
    plt.show()

    # chisquare
    output = sampler.chisquare(np.ones(1000)*10)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 30, 100), stats.chi2.pdf(np.linspace(0, 30, 100), 10))
    plt.title('chisquare(10)')
    plt.show()

    # noncentral_chisquare
    output = sampler.noncentral_chisquare(np.ones(1000)*10, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # noncentral_chi2 = scale^2 * (chi2 + 2*loc*chi + df*loc^2)
    # E(Z) = nonc + df
    # Var(Z) = 2*(df + 2*nonc)
    plt.title('noncentral_chisquare(df=10, nonc=5)')
    plt.show()

    # exponential
    lam = 0.5
    output = sampler.exponential(np.ones(1000)*lam)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0.01, 4, 100), stats.expon.pdf(np.linspace(0.01, 4, 100), scale=0.5))
    plt.title('exponential(0.5)')
    plt.show()

    # standard_exponential
    output = sampler.standard_exponential(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0.01, 8, 100), stats.expon.pdf(np.linspace(0.01, 8, 100)))
    plt.title('standard_exponential()')
    plt.show()

    # f
    output = sampler.f(np.ones(1000)*10, 10)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 8, 100), stats.f.pdf(np.linspace(0, 8, 100), 10, 10))
    plt.title('f(10, 10)')
    plt.show()

    # noncentral_f
    output = sampler.noncentral_f(np.ones(1000)*10, 10, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # E(F) = (m+nonc)*n / (m*(n-2)), n>2.
    # Var(F) = 2*(n/m)**2 * ((m+nonc)**2 + (m+2*nonc)*(n-2)) / ((n-2)**2 * (n-4))
    plt.title('noncentral_f(dfnum=10, dfden=10, nonc=5)')
    plt.show()

    # geometric
    output = sampler.geometric(np.ones(1000)*0.1)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.scatter(np.arange(50), stats.geom.pmf(np.arange(50), p=0.1), c='orange', zorder=10)
    plt.title('geometric(0.1)')
    plt.show()

    # gumbel
    output = sampler.gumbel(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 20, 100), stats.gumbel_r.pdf(np.linspace(0, 20, 100)+0.01, 5, scale=2))
    plt.title('gumbel(5, 2)')
    plt.show()
    np.random.gumbel()

    # hypergeometric
    output = sampler.hypergeometric(np.ones(1000)*5, 10, 10)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True,
             range=(np.min(output)+0.5, np.max(output)+0.5))
    # hypergeom(M, n, N): population size, number of success states, number of draws
    plt.scatter(np.arange(10), stats.hypergeom(15, 5, 10).pmf(np.arange(10)), c='orange', zorder=10)
    plt.title('hypergeometric(5, 10, 10)')
    plt.show()

    # laplace
    output = sampler.laplace(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-10, 20, 100), stats.laplace.pdf(np.linspace(-10, 20, 100), 5, scale=2))
    plt.title('laplace(5, 2)')
    plt.show()

    # logistic
    output = sampler.logistic(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-10, 20, 100), stats.logistic.pdf(np.linspace(-10, 20, 100), 5, scale=2))
    plt.title('logistic(5, 2)')
    plt.show()

    # power
    output = sampler.power(np.ones(1000)*0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1.5, 100), stats.powerlaw.pdf(np.linspace(0, 1.5, 100), 0.5))
    plt.title('power(0.5)')
    plt.show()

    # zipf
    output = sampler.zipf(np.ones(1000)*1.1)
    counter = Counter(output)
    filter = np.array([[key, counter[key]] for key in counter.keys() if key < 50])
    plt.figure()
    plt.scatter(filter[:, 0], filter[:, 1] / 1000)
    plt.plot(np.arange(1, 50), stats.zipf(1.1).pmf(np.arange(1, 50)))
    plt.title('zipf(1.1)')
    plt.show()

    # pareto
    output = sampler.pareto(np.ones(1000) * 2, np.ones(1000) * 5)
    plt.figure()
    count, bins, _ = plt.hist(output, bins=50, density=True, range=(np.min(output), 100))
    a, m = 2., 5.  # shape and mode
    fit = a * m ** a / bins ** (a + 1)
    plt.plot(bins, max(count) * fit / max(fit), linewidth=2, color='r')
    plt.title('pareto(2, 5)')
    plt.show()

    # rayleigh
    output = sampler.rayleigh(np.ones(1000)*2.0)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 8, 100), stats.rayleigh(scale=2).pdf(np.linspace(0, 8, 100)))
    plt.title('rayleigh(2)')
    plt.show()

    # t
    output = sampler.t(np.ones(1000)*2.0)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-6, 6))
    plt.plot(np.linspace(-6, 6, 100), stats.t(2).pdf(np.linspace(-6, 6, 100)))
    plt.title('t(2)')
    plt.show()

    # triangular
    output = sampler.triangular(np.ones(1000)*0.0, 0.3, 1)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.triang.pdf(np.linspace(0, 1, 100), 0.3))
    plt.title('triangular(0, 0.3, 1)')
    plt.show()

    # weibull
    output = sampler.weibull(np.ones(1000)*4.5, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 10, 100), stats.weibull_min.pdf(np.linspace(0, 10, 100), 4.5, scale=5))
    plt.title('weibull(4.5, 5)')
    plt.show()
degreess = [{}]*clusters        # degrees per node in each cluster.
infecteds = [{}]*clusters       # infected nodes in each cluster.
recovereds = [{}]*clusters      # recovered nodes in each cluster.
controllers = [{}]*clusters     # controllers in each cluster.
TasPs_findable = [{}]*clusters  # findable nodes for TasP in each cluster.
PrEPs_findable = [{}]*clusters  # findable nodes for PrEP in each cluster.
base_findable = [{}]*clusters   # treated during baseline run.
ts = [0]*M

# Generate a bipartite degree-corrected stochastic blockmodel with assortative
# rewiring (preserving degree and block).
for cluster in range(clusters):
    # This ensures that every graph has an LCC that can sustain a
    # sufficiently-sized epidemic.
    while len(LCCs[cluster]) < study_end_quantile/100 * n:
        assortative = True  # assortative_cluster[cluster]  # assumes assortativity for all clusters.
        infectivity = ["degree", "degree"]
        concurrency = ["degree", "degree"]
        # Will require some editing if mean(degree) differs from K.
        k_female = ((average_degree[cluster]-1)*ss.uniform().rvs(n/2)+1).astype(int)*ss.zipf(2.5).rvs(n/2)
        k_male = ((average_degree[cluster]-1)*ss.uniform().rvs(n/2)+1).astype(int)*ss.zipf(2.5).rvs(n/2)
        ## k_female = ss.poisson(average_degree[cluster]).rvs(n/2)
        ## k_male = ss.poisson(average_degree[cluster]).rvs(n/2)
        counter_threshold = 30
        k = np.concatenate((k_female, k_male))  # assumes both bipartite halves have the same distribution.
        k[k > n] = n  # eliminates impossibly high values.
        g = {i: 2*C*i//n for i in range(n)}
        kappa = [np.sum([k[i] for i in range(n) if g[i] == K]) for K in range(2*C)]
        m = sum(kappa)/2
        theta = [k[i] / kappa[g[i]] for i in range(n)]
        omega_random = np.zeros((2*C, 2*C))
        omega_zeros = np.zeros((C, C))
        omega_block = np.zeros((C, C))
        for i in range(C):
def test_zipf_num_est(datasets, estimators, SAD_number, iterations, fail_threshold):
    percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    for dataset in datasets:
        signal.signal(signal.SIGALRM, gf.timeout_handler)
        if dataset == 'MGRAST':
            # fix subset l8r
            IN = mydir + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST' + dataset + '_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs_NAP.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_' + dataset + '_NSR2.txt')
        else:
            IN = mydir + dataset + '-Data' + '/' + dataset + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_' + dataset + '_NSR2.txt')

        nsr2_data_zipf_N_site = np.column_stack((nsr2_data_zipf["site"], nsr2_data_zipf["N"]))
        # Sort these arrays by N, largest first.
        nsr2_data_zipf_sorted = nsr2_data_zipf_N_site[nsr2_data_zipf_N_site[:, 1].argsort()[::-1]]
        nsr2_data_zipf_top100 = nsr2_data_zipf_sorted[:SAD_number, ]
        # Get the SAD numbers
        zipf_numbers = nsr2_data_zipf_top100[:, 0]
        zipf_numbers = zipf_numbers.astype(int)
        successful_SADs_samplings = SAD_number

        for estimator in estimators:
            OUT = open(mydir + 'SubSampled-Data' + '/' + dataset + '_zipf_' +
                       str(estimator) + '_SubSampled_Data.txt', 'w+')
            num_lines = sum(1 for line in open(IN))
            test_lines = 0
            success_lines = SAD_number
            while success_lines > 0:
                site = nsr2_data_zipf_sorted[test_lines, 0]
                for j, line in enumerate(open(IN)):
                    if (j != site):
                        continue
                    else:
                        if dataset == "HMP":
                            line = line.strip().split(',')
                            line = [x.strip(' ') for x in line]
                            line = [x.strip('[]') for x in line]
                            site_name = line[0]
                            line.pop(0)
                        else:
                            line = eval(line)
                        obs = list(map(int, line))
                        # Calculate relative abundance of each OTU
                        # Use that as weights
                        N_0 = float(sum(obs))
                        S_0 = len(obs)
                        N_max = max(obs)
                        if S_0 < 10 or N_0 <= S_0:
                            test_lines += 1
                            continue
                        line_ra = [x / N_0 for x in obs]
                        sample_sizes = [round(x * N_0) for x in percents]
                        if any(sample_size <= 10 for sample_size in sample_sizes):
                            test_lines += 1
                            continue
                        zipf_means = [N_0, S_0, N_max]
                        failed_percents = 0
                        for k, percent in enumerate(percents):
                            if failed_percents > 0:
                                continue
                            N_max_list_zipf = []
                            N_0_list_zipf = []
                            S_0_list_zipf = []
                            r2_list_zipf = []
                            gamma_list = []
                            iter_count_current = 0
                            iter_count = iterations
                            iter_failed = 0
                            while iter_count > 0 and iter_failed < fail_threshold:
                                sample_size_k = sample_sizes[0]
                                sample_k = np.random.multinomial(sample_size_k, line_ra, size=None)
                                sample_k_sorted = -np.sort(-sample_k[sample_k != 0])
                                N_0_k = sum(sample_k_sorted)
                                S_0_k = sample_k_sorted.size
                                if S_0_k < 10 or N_0_k <= S_0_k:
                                    continue
                                N_max_k = max(sample_k_sorted)
                                iter_count_current += 1
                                # Start the timer. Once 2 seconds are over, a SIGALRM signal is sent.
                                signal.alarm(2)
                                # This try/except block ensures that
                                # you'll catch TimeoutException when it's sent.
                                # start_time = time.time()
                                try:
                                    # Whatever your function that might hang
                                    zipf_class = gf.zipf(sample_k_sorted, estimator)
                                    pred_tuple = zipf_class.from_cdf()
                                    Zipf_solve_line = zipf_class.zipf_solver(sample_k_sorted)
                                    rv = stats.zipf(Zipf_solve_line)
                                    pred_zipf = pred_tuple[0]
                                    gamma = pred_tuple[1]
                                    r2_zipf = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted),
                                                                             np.log10(pred_zipf))
                                    if np.isinf(r2_zipf) or np.isnan(r2_zipf):
                                        continue
                                    else:
                                        r2_list_zipf.append(r2_zipf)
                                        gamma_list.append(gamma)
                                        N_max_list_zipf.append(N_max_k)
                                        N_0_list_zipf.append(N_0_k)
                                        S_0_list_zipf.append(S_0_k)
                                except gf.TimeoutException:
                                    print("Line " + str(j) + ": " + str(estimator) + " timed out")
                                    iter_count -= 1
                                    if iter_failed >= fail_threshold:
                                        failed_percents += 1
                                    iter_failed += 1
                                    continue  # continue the loop if the function takes more than x seconds
                                else:
                                    iter_count -= 1
                                # print("--- %s seconds ---" % (time.time() - start_time))
                                # Reset the alarm
                                signal.alarm(0)
                            if len(N_0_list_zipf) != iterations:
                                test_lines += 1
                                continue
                            N_0_zipf_mean = np.mean(N_0_list_zipf)
                            zipf_means.append(N_0_zipf_mean)
                            S_0_zipf_mean = np.mean(S_0_list_zipf)
                            zipf_means.append(S_0_zipf_mean)
                            N_max_zipf_mean = np.mean(N_max_list_zipf)
                            zipf_means.append(N_max_zipf_mean)
                            r2_zipf_mean = np.mean(r2_list_zipf)
                            zipf_means.append(r2_zipf_mean)
                            gamma_zipf_mean = np.mean(gamma_list)
                            zipf_means.append(gamma_zipf_mean)
                        '''Now we check if the lists are the right length.
                        There are 6 iterations for the percents; mete/geom
                        append four items each iteration: 4*6 = 24, plus the
                        three originals = 27. Likewise, for zipf,
                        (5*6) + 3 = 33.
                        '''
                        if len(zipf_means) == 33:
                            test_lines += 1
                            success_lines -= 1
                            zipf_means_str = ' '.join(map(str, zipf_means))
                            # OUT1.write(','.join(map(repr, geom_means_str[i])))
                            print(j, zipf_means_str, file=OUT)
                            print("Line " + str(j) + ": " + str(success_lines) + " SADs to go!")
                        else:
                            test_lines += 1
            # print(estimator)
        print(dataset)
Z.moment(0)
Z.moment(1)
Z.moment(2)
Z.moment(3)

Z.pdf(0)
Z.ppf(0.975)  # percent point function - also called the quantile function

# pmf is the probability mass function - but continuous RVs don't have a pmf
Z.pmf(0)

Z.rvs(10)
Z.stats()
Z.std()

# scipy.stats supports nearly 100 different distributions
# And they all behave EXACTLY like this :)
stats.zipf?

Z2 = stats.zipf(4)
Z2.mean()

x = np.linspace(-2, 2)

# Visualize it
plt.plot(x, Z.pdf(x))
plt.clf()
plt.plot(x, Z.pdf(x))

x = np.linspace(-4, 4)
plt.plot(x, Z.pdf(x))
plt.plot(x, Z.cdf(x))

# You might ask - what's the survival function?
plt.plot(x, Z.sf(x))

# Now let's look at the sleep data that Nick introduced last time
# Some exploratory analysis of this distribution
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# pre-calculate for fast bounded Zipf:
# x_zipf = np.arange(1, 1024 + 1)
ZIPF_a = 2.15
# weights = x_zipf ** (-ZIPF_a)
zeta_dist = stats.zipf(ZIPF_a)


def borda(m):
    return np.arange(m)


def draw_zipf(num_voters, rand=None):
    if rand is None:
        rand = np.random.RandomState()
    return zeta_dist.rvs(size=num_voters, random_state=rand)
    # return np.random.zipf(a, size=num_voters)


def draw_zipf_weights(owners, W=None, return_len_k=False, rand=None):
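# Hypothetical usage sketch (not from the original source): passing a seeded
# RandomState to draw_zipf makes the frozen zipf(ZIPF_a) draws reproducible,
# which helps when comparing voting-rule runs. Assumes numpy (np) and
# scipy.stats (stats) are imported as above.
sizes_a = draw_zipf(10, rand=np.random.RandomState(42))
sizes_b = draw_zipf(10, rand=np.random.RandomState(42))
assert (sizes_a == sizes_b).all()  # identical seeds give identical draws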
a = 6.5
mean, var, skew, kurt = zipf.stats(a, moments='mvsk')

# Display the probability mass function (``pmf``):

x = np.arange(zipf.ppf(0.01, a), zipf.ppf(0.99, a))
ax.plot(x, zipf.pmf(x, a), 'bo', ms=8, label='zipf pmf')
ax.vlines(x, 0, zipf.pmf(x, a), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:

rv = zipf(a)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)
plt.show()

# Check accuracy of ``cdf`` and ``ppf``:

prob = zipf.cdf(x, a)
np.allclose(x, zipf.ppf(prob, a))
# True
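# A short follow-on sketch (not part of the snippet above): the usual last step
# of the scipy.stats recipe is drawing random variates from the same
# distribution, here with the shape parameter a = 6.5 defined earlier.
r = zipf.rvs(a, size=1000)  # 1000 Zipf(6.5)-distributed integers >= 1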