Example #1
    def from_cdf(self):
        """ Obtain the maximum likelihood form of the Zipf distribution, given
        the mle value for the Zipf shape parameter (a). Using a, this code
        generates a rank-abundance distribution (RAD) from the cumulative
        density function (cdf) using the percent point function (ppf) also known
        as the quantile function.
        see: http://www.esapubs.org/archive/ecol/E093/155/appendix-B.htm

        This is an actual form of the Zipf distribution, obtained from getting
        the mle for the shape parameter.
        """

        p = md.zipf_solver(self.obs)
        S = len(self.obs)
        rv = stats.zipf(a=p)
        rad = []

        for i in range(1, S+1):
            val = (S - i + 0.5)/S
            x = rv.ppf(val)
            rad.append(int(x))

        return rad
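Note: md.zipf_solver is an external helper that is not shown here. A minimal
stand-in, assuming it returns the maximum-likelihood estimate of the Zipf
shape parameter (hypothetical, for illustration only):

import numpy as np
from scipy import special, optimize

def zipf_mle(obs):
    # Negative log-likelihood of the Zipf pmf x**(-a) / zeta(a).
    obs = np.asarray(obs, dtype=float)
    def nll(a):
        return len(obs) * np.log(special.zeta(a)) + a * np.log(obs).sum()
    # The shape parameter must exceed 1 for the pmf to normalize.
    res = optimize.minimize_scalar(nll, bounds=(1.001, 20.0), method='bounded')
    return res.x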
Example #2
def zipf_initialization_n(N, alphas, V_max):
    assert N == len(alphas)
    # One frozen scipy.stats.zipf variate per alpha.
    rv = [zipf(alpha) for alpha in alphas]
    T = np.zeros((N, V_max))
    for i in range(N):
        for j in range(V_max):
            # Survival probability P(X_i > j), i.e. 1 - CDF.
            T[i, j] = 1 - rv[i].cdf(j)
    h = T2h(T)
    return rv, T, h
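The double loop fills T with survival probabilities, which scipy exposes
directly, so the inner assignment is equivalent to rv[i].sf(j). A quick check
of that identity (assuming scipy.stats.zipf is the zipf in scope):

from scipy.stats import zipf

rv = zipf(1.5)
j = 4
assert abs((1 - rv.cdf(j)) - rv.sf(j)) < 1e-12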
Example #3
 def test_rvs(self):
     vals = stats.zipf.rvs(1.5, size=(2, 50))
     assert numpy.all(vals >= 1)
     assert numpy.shape(vals) == (2, 50)
     assert vals.dtype.char in typecodes["AllInteger"]
     val = stats.zipf.rvs(1.5)
     assert isinstance(val, int)
     val = stats.zipf(1.5).rvs(3)
     assert isinstance(val, numpy.ndarray)
     assert val.dtype.char in typecodes["AllInteger"]
Example #4
 def test_rvs(self):
     vals = stats.zipf.rvs(1.5, size=(2, 50))
     assert_(numpy.all(vals >= 1))
     assert_(numpy.shape(vals) == (2, 50))
     assert_(vals.dtype.char in typecodes['AllInteger'])
     val = stats.zipf.rvs(1.5)
     assert_(isinstance(val, int))
     val = stats.zipf(1.5).rvs(3)
     assert_(isinstance(val, numpy.ndarray))
     assert_(val.dtype.char in typecodes['AllInteger'])
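Examples #3 and #4 are the same scipy test written with a bare assert and with
numpy's assert_ helper. A header that would make either snippet self-contained
(a sketch; the originals live inside scipy's own test suite):

import numpy
from numpy import typecodes
from numpy.testing import assert_
from scipy import stats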
Example #5
def gen_weights_zipf(n_weights, zipf_param=1.13):
    '''
    Generate first-choice candidate preference frequencies among voters,
    assuming that preference is Zipf-distributed. Truncate at n_weights total
    candidates/frequencies.
    '''
    rv = zipf(zipf_param)
    out_weights = [rv.pmf(j) for j in range(1, n_weights + 1)]
    reweight_factor = sum(out_weights)
    out_weights = [x / reweight_factor for x in out_weights]
    return out_weights
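Usage sketch (assumes "from scipy.stats import zipf" at module level): the
weights decay with rank and, after renormalization, sum to 1.

weights = gen_weights_zipf(3)
print(weights)       # monotonically decreasing
print(sum(weights))  # 1.0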
Example #6
def simulate_zipf(alpha=1.5, n=10**4, repetitions=10, x_min=None):
    indexes = list()
    estimations_alpha = list()
    estimations_xmin = list()
    bigger_than_min = list()
    for k in range(1, repetitions + 1):
        _zipf_rv = zipf(alpha)

        discrete_sample = np.sort(_zipf_rv.rvs(size=n))

        if x_min is not None:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False,
                                             xmin=x_min)
        else:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False)
        print(fit_estimating_discrete.alpha)
        print(fit_estimating_discrete.xmin)
        indexes.append(k)
        estimations_alpha.append(fit_estimating_discrete.alpha)
        estimations_xmin.append(fit_estimating_discrete.xmin)
        if x_min:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample, x_min)))
        else:
            bigger_than_min.append(
                sum(
                    np.greater_equal(discrete_sample,
                                     fit_estimating_discrete.xmin)))

    if not x_min:
        plot_results(rep_nums=indexes,
                     alphas=estimations_alpha,
                     xmins=estimations_xmin,
                     resampling=bigger_than_min)
    else:
        plot_results(rep_nums=indexes,
                     alphas=estimations_alpha,
                     xmins=None,
                     resampling=bigger_than_min)
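A minimal driver for one repetition, without the plotting step. It assumes the
powerlaw package (imported above as pw) is installed and mirrors the attribute
access used in the function body:

import numpy as np
import powerlaw as pw
from scipy.stats import zipf

sample = np.sort(zipf(1.5).rvs(size=10_000))
fit = pw.Fit(data=sample, discrete=True, estimate_discrete=False)
print(fit.alpha, fit.xmin)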
Example #7
 def from_cdf(self):
     """ Obtain the maximum likelihood form of the Zipf distribution, given
     the mle value for the Zipf shape parameter (a). Using a, this code
     generates a rank-abundance distribution (RAD) from the cumulative
     density function (cdf) using the percent point function (ppf) also known
     as the quantile function.
     see: http://www.esapubs.org/archive/ecol/E093/155/appendix-B.htm
     This is an actual form of the Zipf distribution, obtained from getting
     the mle for the shape parameter.
     """
     p = self.zipf_solver(self.obs)
     S = len(self.obs)
     rv = stats.zipf(a=p)
     rad = []
     for i in range(1, S + 1):
         val = (S - i + 0.5) / S
         x = rv.ppf(val)
         rad.append(int(x))
     point = collections.namedtuple('Rad_and_p', ['x', 'y'])
     point_return = point(rad, y=p)
     return point_return
Example #8
def gen_ranked_preferences_zipf(n_candidates, n_voters, zipf_param=1.1):
    '''
    Generate ranked-choice candidate preference frequencies among voters,
    assuming that preference rankings are Zipf-distributed.
    n_voters might need to be about 500 * n_candidates.
    '''
    candidates = list(range(n_candidates))
    pref_ballot_samples = list()

    rv = zipf(zipf_param)
    # zipf of index 0 doesn't exist, thus add 1: ii+1
    scaler = sum(rv.pmf(ii + 1) for ii in range(n_voters))
    n_prefs = [n_voters * rv.pmf(i + 1) / scaler for i in range(n_voters)]

    # Generate random preference ordering according to zipf distributed samples
    offset = 0
    for n in n_prefs:
        m = int(round(n + offset))
        offset = n - m + offset
        tmp_candidates = candidates.copy()
        shuffle(tmp_candidates)
        pref_ballot_samples.extend([tuple(tmp_candidates)] * m)

    return tuple(pref_ballot_samples)
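The running offset diffuses the rounding error so that the ballot counts sum
to (approximately) n_voters. A quick check of that invariant (assumes
"from random import shuffle" and "from scipy.stats import zipf"):

ballots = gen_ranked_preferences_zipf(n_candidates=4, n_voters=2000)
print(len(ballots))  # close to 2000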
Example #9
        # (Fragment of a method: 'ranks' and 'self.obs' come from the
        # enclosing class; 'off' is a log-total offset for the Poisson GLM,
        # though it is never passed to the model below.)
        off = [np.log(sum(self.obs))] * len(self.obs)

        d = pd.DataFrame({'ranks': ranks, 'off': off, 'x': self.obs})

        lm = smf.glm(formula='x ~ ranks', data=d, family=sm.families.Poisson()).fit()
        pred = lm.predict()

        return pred


ad = [20000, 10000, 8000, 6000, 1000, 200, 200, 100, 18, 16, 14, 12, 10, 4, 4,
      2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

a = md.zipf_solver(ad)
S = len(ad)
rv = stats.zipf(a)

rad = []
vals = []
for i in range(1, S+1):
    vals.append((S - i + 0.5)/S)

t = time.perf_counter()  # time.clock() was removed in Python 3.8
x = rv.ppf(vals)
elapsed_t = time.perf_counter() - t
print(x, elapsed_t)
sys.exit()

ranks = range(1,len(ad)+1)

zipf_pred = zipf(ad)
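The GLM fragment at the top of this example computes a log-total offset but
never passes it to the model. A sketch of how an offset actually enters a
statsmodels Poisson GLM (standalone, with made-up data):

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

obs = [50, 20, 10, 5, 2]
d = pd.DataFrame({'ranks': np.arange(1, len(obs) + 1), 'x': obs})
off = np.log(sum(obs)) * np.ones(len(obs))
lm = smf.glm('x ~ ranks', data=d, family=sm.families.Poisson(), offset=off).fit()
print(lm.predict())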
Example #10
def simulate_ln_slowly(alpha=1.5, n=10**4, repetitions=10, x_min=None):
    mp.dps = 15
    ht = BaseHeavyTailedDistribution(slowly_varying_function=lambda n: log(n),
                                     alpha=alpha)  # pass the argument through instead of hard-coding 1.5
    top = 2 * 10**8

    ints = np.arange(1, top, 1)
    np.set_printoptions(precision=15)
    _constant = float(ht.get_constant())
    result = np.multiply(np.log(ints) * np.power(ints, -alpha), _constant)
    cum_fun = np.cumsum(result)
    _max = np.max(cum_fun)

    indexes = list()
    estimations_alpha = list()
    estimations_xmin = list()
    bigger_than_min = list()
    for k in range(1, repetitions + 1):
        samples = uniform.rvs(size=n)
        below = np.extract(samples <= _max, samples)
        over = np.extract(samples > _max, samples)
        discrete_sample = list()

        for u in below:
            discrete_sample.append(np.searchsorted(cum_fun, u) + 1)

        _zipf_rv = zipf(alpha)

        _zipfs_data = np.sort(_zipf_rv.rvs(size=n))[-over.size:]
        discrete_sample.extend(_zipfs_data)

        if x_min is not None:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False,
                                             xmin=x_min)
        else:
            fit_estimating_discrete = pw.Fit(data=discrete_sample,
                                             discrete=True,
                                             estimate_discrete=False)
        print(fit_estimating_discrete.alpha)
        print(fit_estimating_discrete.xmin)
        indexes.append(k)
        estimations_alpha.append(fit_estimating_discrete.alpha)
        estimations_xmin.append(fit_estimating_discrete.xmin)
        if x_min:
            bigger_than_min.append(
                sum(np.greater_equal(discrete_sample, x_min)))
        else:
            bigger_than_min.append(
                sum(
                    np.greater_equal(discrete_sample,
                                     fit_estimating_discrete.xmin)))

    if not x_min:
        plot_results(rep_nums=indexes,
                     alphas=estimations_alpha,
                     xmins=estimations_xmin,
                     resampling=bigger_than_min)
    else:
        plot_results(rep_nums=indexes,
                     alphas=estimations_alpha,
                     xmins=None,
                     resampling=bigger_than_min)
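The searchsorted trick above is inverse-transform sampling for a discrete
distribution: build the CDF on a grid, draw uniforms, and map each uniform to
the first grid point whose CDF exceeds it. A self-contained sketch with a
plain Zipf pmf:

import numpy as np
from scipy.stats import zipf

support = np.arange(1, 10_000)
cdf = np.cumsum(zipf(1.5).pmf(support))
u = np.random.uniform(size=5)
idx = np.searchsorted(cdf, u)
idx = np.minimum(idx, len(support) - 1)  # truncate tail mass beyond the grid
print(support[idx])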
Example #11
def debug_sampler_and_plot():

    sampler = Basic_Sampler('gpu')

    # gamma
    output = sampler.gamma(np.ones(1000)*4.5, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 100, 100), stats.gamma.pdf(np.linspace(0, 100, 100), 4.5, scale=5))
    plt.title('gamma(4.5, 5)')
    plt.show()

    # standard_gamma
    output = sampler.standard_gamma(np.ones(1000)*4.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 20, 100), stats.gamma.pdf(np.linspace(0, 20, 100), 4.5))
    plt.title('standard_gamma(4.5)')
    plt.show()

    # dirichlet
    output = sampler.dirichlet(np.ones(1000)*4.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # x = np.linspace(np.min(output), np.max(output), 100)
    # plt.plot(x, stats.dirichlet.pdf(x, alpha=np.ones(100)*4.5))
    plt.title('dirichlet(4.5)')
    plt.show()

    # beta
    output = sampler.beta(np.ones(1000)*0.5, 0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.beta.pdf(np.linspace(0, 1, 100), 0.5, 0.5))
    plt.title('beta(0.5, 0.5)')
    plt.show()

    # beta(2, 5)
    output = sampler.beta(np.ones(1000)*2, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.beta.pdf(np.linspace(0, 1, 100), 2, 5))
    plt.title('beta(2, 5)')
    plt.show()

    # normal
    output = sampler.normal(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-2, 13, 100), stats.norm.pdf(np.linspace(-2, 13, 100), 5, scale=2))
    plt.title('normal(5, 2)')
    plt.show()

    # standard_normal
    output = sampler.standard_normal(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-3, 3, 100), stats.norm.pdf(np.linspace(-3, 3, 100)))
    plt.title('standard_normal()')
    plt.show()

    # uniform
    output = sampler.uniform(np.ones(1000)*(-2), np.ones(1000)*5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-3, 6, 100), stats.uniform.pdf(np.linspace(-3, 6, 100), -2, 7))
    plt.title('uniform(-2, 5)')
    plt.show()

    # standard_uniform
    output = sampler.standard_uniform(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-0.3, 1.3, 100), stats.uniform.pdf(np.linspace(-0.3, 1.3, 100)))
    plt.title('standard_uniform()')
    plt.show()

    # binomial
    output = sampler.binomial(np.ones(1000)*10, np.ones(1000)*0.5)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True, range=(np.min(output)-0.5, np.max(output)-0.5))
    # plt.scatter(np.arange(10), stats.binom._pmf(np.arange(10), 10, 0.5), c='orange', zorder=10)
    plt.title('binomial(10, 0.5)')
    plt.show()

    # negative_binomial
    output = sampler.negative_binomial(np.ones(1000)*10, 0.5)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True, range=(np.min(output)-0.5, np.max(output)-0.5))
    plt.scatter(np.arange(30), stats.nbinom._pmf(np.arange(30), 10, 0.5), c='orange', zorder=10)
    plt.title('negative_binomial(10, 0.5)')
    plt.show()

    # multinomial
    output = sampler.multinomial(5, [0.8, 0.2], 1000)
    # output = sampler.multinomial([10]*4, [[0.8, 0.2]]*4, 3)
    plt.figure()
    plt.hist(output[0], bins=10, density=True)
    plt.title('multinomial(5, [0.8, 0.2])')
    plt.show()

    a = np.array([np.array([[i] * 6 for i in range(6)]).reshape(-1), np.array(list(range(6)) * 6)]).T
    output = stats.multinomial(n=5, p=[0.8, 0.2]).pmf(a)
    sns.heatmap(output.reshape(6, 6), annot=True)
    plt.ylabel('count of the 1st kind (p=0.8)')
    plt.xlabel('count of the 2nd kind (p=0.2)')
    plt.title('stats.multinomial(n=5, p=[0.8, 0.2])')
    plt.show()

    # poisson
    output = sampler.poisson(np.ones(1000)*10)
    plt.figure()
    plt.hist(output, bins=22, density=True, range=(-0.5, 21.5))
    plt.scatter(np.arange(20), stats.poisson.pmf(np.arange(20), 10), c='orange', zorder=10)
    plt.title('poisson(10)')
    plt.show()

    # cauchy
    output = sampler.cauchy(np.ones(1000)*1, 0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-5, 7))
    plt.plot(np.linspace(-5, 7, 100), stats.cauchy.pdf(np.linspace(-5, 7, 100), 1, 0.5))
    plt.title('cauchy(1, 0.5)')
    plt.show()

    # standard_cauchy
    output = sampler.standard_cauchy(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-7, 7))
    plt.plot(np.linspace(-7, 7, 100), stats.cauchy.pdf(np.linspace(-7, 7, 100)))
    plt.title('standard_cauchy()')
    plt.show()

    # chisquare
    output = sampler.chisquare(np.ones(1000)*10)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 30, 100), stats.chi2.pdf(np.linspace(0, 30, 100), 10))
    plt.title('chisquare(10)')
    plt.show()

    # noncentral_chisquare
    output = sampler.noncentral_chisquare(np.ones(1000)*10, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # nocentral_chi2 = scale^2 * (chi2 + 2*loc*chi + df*loc^2)
    # E(Z) = nonc + df
    # Var(Z) = 2(df+2nonc)
    plt.title('noncentral_chisquare(df=10, nonc=5)')
    plt.show()

    # exponential
    lam = 0.5
    output = sampler.exponential(np.ones(1000)*lam)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0.01, 4, 100), stats.expon.pdf(np.linspace(0.01, 4, 100), scale=0.5))
    plt.title('exponential(0.5)')
    plt.show()

    # standard_exponential
    output = sampler.standard_exponential(1000)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0.01, 8, 100), stats.expon.pdf(np.linspace(0.01, 8, 100)))
    plt.title('standard_exponential()')
    plt.show()

    # f
    output = sampler.f(np.ones(1000)*10, 10)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 8, 100), stats.f.pdf(np.linspace(0, 8, 100), 10, 10))
    plt.title('f(10, 10)')
    plt.show()

    # noncentral_f
    output = sampler.noncentral_f(np.ones(1000)*10, 10, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    # E(F) = (m+nonc)*n / (m*(n-2)), n>2.
    # Var(F) = 2*(n/m)**2 * ((m+nonc)**2 + (m+2*nonc)*(n-2)) / ((n-2)**2 * (n-4))
    plt.title('noncentral_f(dfnum=10, dfden=10, nonc=5)')
    plt.show()

    # geometric
    output = sampler.geometric(np.ones(1000)*0.1)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.scatter(np.arange(50), stats.geom.pmf(np.arange(50), p=0.1), c='orange', zorder=10)
    plt.title('geometric(0.1)')
    plt.show()

    # gumbel
    output = sampler.gumbel(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 20, 100), stats.gumbel_r.pdf(np.linspace(0, 20, 100)+0.01, 5, scale=2))
    plt.title('gumbel(5, 2)')
    plt.show()

    # hypergeometric
    output = sampler.hypergeometric(np.ones(1000)*5, 10, 10)
    plt.figure()
    plt.hist(output, bins=np.max(output)-np.min(output), density=True, range=(np.min(output)+0.5, np.max(output)+0.5))
    plt.scatter(np.arange(10), stats.hypergeom(15, 5, 10).pmf(np.arange(10)), c='orange', zorder=10)  # hypergeom(M, n, N): population size, successes in population, draws
    plt.title('hypergeometric(5, 10, 10)')
    plt.show()

    # laplace
    output = sampler.laplace(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-10, 20, 100), stats.laplace.pdf(np.linspace(-10, 20, 100), 5, scale=2))
    plt.title('laplace(5, 2)')
    plt.show()

    # logistic
    output = sampler.logistic(np.ones(1000)*5, np.ones(1000)*2)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(-10, 20, 100), stats.logistic.pdf(np.linspace(-10, 20, 100), 5, scale=2))
    plt.title('logistic(5, 2)')
    plt.show()

    # power
    output = sampler.power(np.ones(1000)*0.5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1.5, 100), stats.powerlaw.pdf(np.linspace(0, 1.5, 100), 0.5))
    plt.title('power(0.5)')
    plt.show()

    # zipf
    output = sampler.zipf(np.ones(1000)*1.1)
    counter = Counter(output)
    filter = np.array([[key, counter[key]] for key in counter.keys() if key < 50])
    plt.figure()
    plt.scatter(filter[:, 0], filter[:, 1] / 1000)
    plt.plot(np.arange(1, 50), stats.zipf(1.1).pmf(np.arange(1, 50)))
    plt.title('zipf(1.1)')
    plt.show()

    # pareto
    output = sampler.pareto(np.ones(1000) * 2, np.ones(1000) * 5)
    plt.figure()
    count, bins, _ = plt.hist(output, bins=50, density=True, range=(np.min(output), 100))
    a, m = 2., 5.  # shape and mode
    fit = a * m ** a / bins ** (a + 1)
    plt.plot(bins, max(count) * fit / max(fit), linewidth=2, color='r')
    plt.title('pareto(2, 5)')
    plt.show()

    # rayleigh
    output = sampler.rayleigh(np.ones(1000)*2.0)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 8, 100), stats.rayleigh(scale=2).pdf(np.linspace(0, 8, 100)))
    plt.title('rayleigh(2)')
    plt.show()

    # t
    output = sampler.t(np.ones(1000)*2.0)
    plt.figure()
    plt.hist(output, bins=20, density=True, range=(-6, 6))
    plt.plot(np.linspace(-6, 6, 100), stats.t(2).pdf(np.linspace(-6, 6, 100)))
    plt.title('t(2)')
    plt.show()

    # triangular
    output = sampler.triangular(np.ones(1000)*0.0, 0.3, 1)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 1, 100), stats.triang.pdf(np.linspace(0, 1, 100), 0.3))
    plt.title('triangular(0, 0.3, 1)')
    plt.show()

    # weibull
    output = sampler.weibull(np.ones(1000)*4.5, 5)
    plt.figure()
    plt.hist(output, bins=20, density=True)
    plt.plot(np.linspace(0, 10, 100), stats.weibull_min.pdf(np.linspace(0, 10, 100), 4.5, scale=5))
    plt.title('weibull(4.5, 5)')
    plt.show()
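A library-independent check of the zipf panel above, comparing empirical
frequencies from scipy's own sampler against the theoretical pmf:

import numpy as np
from scipy import stats

draws = stats.zipf(1.1).rvs(size=10_000, random_state=0)
vals, counts = np.unique(draws[draws < 50], return_counts=True)
print(np.c_[vals[:5], counts[:5] / draws.size, stats.zipf(1.1).pmf(vals[:5])])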
Example #12
# Use comprehensions so each cluster gets its own dict; [{}]*clusters would
# alias a single shared dict across all clusters.
degreess       = [{} for _ in range(clusters)] # degrees per node in each cluster.
infecteds      = [{} for _ in range(clusters)] # infected nodes in each cluster.
recovereds     = [{} for _ in range(clusters)] # recovered nodes in each cluster.
controllers    = [{} for _ in range(clusters)] # controllers in each cluster.
TasPs_findable = [{} for _ in range(clusters)] # findable nodes for TasP in each cluster.
PrEPs_findable = [{} for _ in range(clusters)] # findable nodes for PrEP in each cluster.
base_findable  = [{} for _ in range(clusters)] # treated during baseline run.
ts             = [0]*M

# Generate a bipartite degree-corrected stochastic blockmodel with assortative rewiring (preserving degree and block).
for cluster in range(clusters):
    while len(LCCs[cluster]) < study_end_quantile/100* n:  # this ensures that every graph has a LCC that can sustain a sufficiently-sized epidemic.
        assortative = True # assortative_cluster[cluster] # assumes assortativity for all clusters.
        infectivity = ["degree", "degree"]
        concurrency = ["degree","degree"]
        k_female = ((average_degree[cluster]-1)*ss.uniform().rvs(n//2)+1).astype(int)*ss.zipf(2.5).rvs(n//2) # will require some editing if mean(degree) differs from K.
        k_male   = ((average_degree[cluster]-1)*ss.uniform().rvs(n//2)+1).astype(int)*ss.zipf(2.5).rvs(n//2) # integer division: rvs sizes must be ints in Python 3.
##        k_female = ss.poisson(average_degree[cluster]).rvs(n/2)
##        k_male   = ss.poisson(average_degree[cluster]).rvs(n/2)

        counter_threshold = 30
        k = np.concatenate((k_female, k_male)) # assumes both bipartite halves have the same degree distribution.
        k[k>n] = n                             # eliminates impossibly high values.
        g = {i: 2*C*i//n for i in range(n)}
        kappa = [np.sum([k[i] for i in range(n) if g[i] == K]) for K in range(2*C)]
        m = sum(kappa)/2
        theta = [k[i] / kappa[g[i]] for i in range(n)]
        omega_random = np.zeros((2*C,2*C))
        omega_zeros  = np.zeros((C,C))
        omega_block  = np.zeros((C,C))
        for i in range(C):
Example #13
def test_zipf_num_est(datasets, estimators, SAD_number, iterations, fail_threshold):
    percents = [0.500000, 0.250000, 0.125000, 0.062500, 0.031250, 0.015625]
    for dataset in datasets:
        signal.signal(signal.SIGALRM, gf.timeout_handler)
        if dataset == 'MGRAST':
            # fix subset l8r
            IN = mydir  + dataset + '-Data' + '/MGRAST/MGRAST-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' + 'zipf_MGRAST_NSR2.txt')
        elif dataset == '95' or dataset == '97' or dataset == '99':
            IN = mydir  + dataset + '-Data/' + str(dataset) + '/MGRAST-' + str(dataset) + '-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' +'zipf_MGRAST'+dataset+'_NSR2.txt')
        elif dataset == 'HMP':
            IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs_NAP.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' +  'zipf_'+dataset+'_NSR2.txt')
        else:
            IN = mydir  + dataset + '-Data' + '/' + dataset +'-SADs.txt'
            nsr2_data_zipf = gf.import_NSR2_data(mydir + 'NSR2/' +  'zipf_'+dataset+'_NSR2.txt')

        nsr2_data_zipf_N_site = np.column_stack((nsr2_data_zipf["site"], nsr2_data_zipf["N"]))
        # Sort these arrays
        nsr2_data_zipf_sorted = nsr2_data_zipf_N_site[nsr2_data_zipf_N_site[:,1].argsort()[::-1]]
        nsr2_data_zipf_top100 = nsr2_data_zipf_sorted[:SAD_number,]
        # Get the SAD numbers
        zipf_numbers = nsr2_data_zipf_top100[:,0]
        zipf_numbers = zipf_numbers.astype(int)
        successful_SADs_samplings = SAD_number
        for estimator in estimators:
            OUT = open(mydir + 'SubSampled-Data' + '/' + dataset + '_zipf_' + \
                str(estimator) + '_SubSampled_Data.txt', 'w+')
            num_lines = sum(1 for line in open(IN))
            test_lines = 0
            success_lines = SAD_number
            while success_lines > 0:
                site = nsr2_data_zipf_sorted[test_lines,0]
                for j,line in enumerate(open(IN)):
                    if (j != site):
                        continue
                    else:
                        if dataset == "HMP":
                            line = line.strip().split(',')
                            line = [x.strip(' ') for x in line]
                            line = [x.strip('[]') for x in line]
                            site_name = line[0]
                            line.pop(0)
                        else:
                            line = eval(line)
                    obs = list(map(int, line))  # list() so sum()/len() work under Python 3
                    # Calculate relative abundance of each OTU
                    # Use that as weights
                    N_0 = float(sum(obs))
                    S_0 = len(obs)
                    N_max = max(obs)
                    if S_0 < 10 or N_0 <= S_0:
                        test_lines += 1
                        continue
                    line_ra = [x / N_0 for x in obs]
                    sample_sizes = [round(x * N_0) for x in percents]
                    if any(sample_size <= 10 for sample_size in sample_sizes):
                        test_lines += 1
                        continue
                    zipf_means = [N_0, S_0, N_max]
                    failed_percents = 0
                    for k, percent in enumerate(percents):
                        if failed_percents > 0:
                            continue
                        N_max_list_zipf = []
                        N_0_list_zipf = []
                        S_0_list_zipf = []
                        r2_list_zipf = []
                        gamma_list = []
                        iter_count_current = 0
                        iter_count = iterations
                        iter_failed = 0
                        while iter_count > 0 and iter_failed < fail_threshold:
                            sample_size_k = sample_sizes[k]  # index by k so each percent is actually used
                            sample_k = np.random.multinomial(sample_size_k, line_ra, size = None)
                            sample_k_sorted = -np.sort( -sample_k[sample_k != 0] )
                            N_0_k = sum(sample_k_sorted)
                            S_0_k = sample_k_sorted.size
                            if S_0_k < 10 or N_0_k <= S_0_k:
                                continue
                            N_max_k = max(sample_k_sorted)
                            iter_count_current += 1
                            # Start the timer. Once 2 seconds are up, a SIGALRM signal is sent.
                            signal.alarm(2)
                            # This try/except loop ensures that
                            #   you'll catch TimeoutException when it's sent.
                            #start_time = time.time()
                            try:
                                # Whatever your function that might hang
                                zipf_class = gf.zipf(sample_k_sorted, estimator)
                                pred_tuple = zipf_class.from_cdf()
                                Zipf_solve_line = zipf_class.zipf_solver(sample_k_sorted)
                                rv = stats.zipf(Zipf_solve_line)
                                pred_zipf = pred_tuple[0]
                                gamma = pred_tuple[1]
                                r2_zipf = macroecotools.obs_pred_rsquare(np.log10(sample_k_sorted), np.log10(pred_zipf))
                                # NaN never compares equal to itself, so use np.isinf/np.isnan
                                if np.isinf(r2_zipf) or np.isnan(r2_zipf):
                                    continue
                                else:
                                    r2_list_zipf.append(r2_zipf)
                                    gamma_list.append(gamma)
                                    N_max_list_zipf.append(N_max_k)
                                    N_0_list_zipf.append(N_0_k)
                                    S_0_list_zipf.append(S_0_k)

                            except gf.TimeoutException:
                                print "Line " + str(j) + ": " + str(estimator) + " timed out"
                                iter_count -= 1
                                if iter_failed >= fail_threshold:
                                    failed_percents += 1
                                iter_failed += 1
                                continue # continue the for loop if function takes more than x seconds
                            else:
                                iter_count -= 1
                                #print("--- %s seconds ---" % (time.time() - start_time))
                                # Reset the alarm
                                signal.alarm(0)


                        if len(N_0_list_zipf) != iterations:
                            test_lines += 1
                            continue
                        N_0_zipf_mean = np.mean(N_0_list_zipf)
                        zipf_means.append(N_0_zipf_mean)

                        S_0_zipf_mean = np.mean(S_0_list_zipf)
                        zipf_means.append(S_0_zipf_mean)

                        N_max_zipf_mean = np.mean(N_max_list_zipf)
                        zipf_means.append(N_max_zipf_mean)

                        r2_zipf_mean = np.mean(r2_list_zipf)
                        zipf_means.append(r2_zipf_mean)

                        gamma_zipf_mean = np.mean(gamma_list)
                        zipf_means.append(gamma_zipf_mean)

                    '''Now we check that the lists are the right length.
                    There are 6 iterations over the percentages; mete/geom
                    append four items per iteration: 4*6 = 24, plus the three
                    originals = 27. Likewise, for zipf, (5*6) + 3 = 33.'''
                    if len(zipf_means) == 33:
                        test_lines += 1
                        success_lines -= 1
                        zipf_means_str = ' '.join(map(str, zipf_means))
                        print(j, zipf_means_str, file=OUT)
                        print("Line " + str(j) + ": " + str(success_lines) + " SADs to go!")
                    else:
                        test_lines += 1
            print(dataset)
Example #14
Z.moment(0)
Z.moment(1)
Z.moment(2)
Z.moment(3)
Z.pdf(0)
Z.ppf(0.975)
# percent point function - also called the quantile function
# pmf is the probability mass function - but continuous RVs don't have a pmf
Z.pmf(0)
Z.rvs(10)
Z.stats()
Z.std()
# scipy.stats supports nearly 100 different distributions
# And they all behave EXACTLY like this :)
stats.zipf?
Z2 = stats.zipf(4)
Z2.mean()
x = np.linspace(-2, 2)
# Visualize it
plt.plot(x, Z.pdf(x))
plt.clf()
plt.plot(x, Z.pdf(x))
x = np.linspace(-4, 4)
plt.plot(x, Z.pdf(x))
plt.plot(x, Z.cdf(x))
# You might ask- what's the survival function?
plt.plot(x, Z.sf(x))


# Now let's look at the sleep data that Nick introduced last time
# Some exploratory analysis of this distribution
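The transcript never shows how Z was created. A plausible setup, assuming a
frozen continuous RV such as a standard normal (hypothetical; any continuous
scipy.stats distribution supports the same pdf/cdf/sf/ppf/rvs interface):

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

Z = stats.norm(0, 1)  # frozen RV
x = np.linspace(-4, 4)
plt.plot(x, Z.pdf(x), label='pdf')
plt.plot(x, Z.sf(x), label='sf = 1 - cdf')
plt.legend()
plt.show()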
Example #15
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


# pre-calculate for fast bounded Zipf:
# x_zipf = np.arange(1, 1024 + 1)
ZIPF_a = 2.15
# weights = x_zipf ** (-ZIPF_a)
zeta_dist = stats.zipf(ZIPF_a)


def borda(m):
    return np.arange(m)


def draw_zipf(num_voters, rand=None):
    if rand is None:
        rand = np.random.RandomState()

    return zeta_dist.rvs(size=num_voters, random_state=rand)
    # return np.random.zipf(a, size=num_voters)
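# Usage sketch (an addition, not part of the original module): a seeded
# RandomState makes draw_zipf reproducible. Draws are integer ranks >= 1
# with a heavy tail governed by ZIPF_a = 2.15.
_demo_rand = np.random.RandomState(0)
print(draw_zipf(num_voters=10, rand=_demo_rand))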


def draw_zipf_weights(owners, W=None, return_len_k=False, rand=None):
Example #16
a = 6.5
mean, var, skew, kurt = zipf.stats(a, moments='mvsk')

# Display the probability mass function (``pmf``):

x = np.arange(zipf.ppf(0.01, a), zipf.ppf(0.99, a))
ax.plot(x, zipf.pmf(x, a), 'bo', ms=8, label='zipf pmf')
ax.vlines(x, 0, zipf.pmf(x, a), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:

rv = zipf(a)
ax.vlines(x,
          0,
          rv.pmf(x),
          colors='k',
          linestyles='-',
          lw=1,
          label='frozen pmf')
ax.legend(loc='best', frameon=False)
plt.show()

# Check accuracy of ``cdf`` and ``ppf``:

prob = zipf.cdf(x, a)
np.allclose(x, zipf.ppf(prob, a))
# True
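# Random variates follow in the same style; a quick sanity check of the
# sampler against the pmf:

r = zipf.rvs(a, size=1000)
print((r == 1).mean(), zipf.pmf(1, a))  # empirical vs. theoretical mass at 1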