Python pmf Examples, scipy.stats.zipf.pmf Python Examples

Example #1

0

Show file

File: bandwidth_generate.py Project: QiliangLi/RegularTest

def bandwidth_generate(maxbandwith=125,
                       maxrate=0.99,
                       numofnode=16,
                       turn=2):  #size in MB
    """Write shuffled, Zipf-shaped bandwidth tables to two text files.

    One integer bandwidth is computed per node: the Zipf pmf at ranks
    1..numofnode, divided by the pmf at rank 1 and scaled by
    ``maxbandwith * maxrate``, truncated with ``int``. The list is printed,
    then ``turn`` lines are written to ``bandwidth_used.txt`` (the values,
    freshly shuffled for every line) and to ``bandwidth_rest.txt``
    (``maxbandwith`` minus each value, in the same order).

    Args:
        maxbandwith: upper bandwidth bound used for scaling and for the
            "rest" values.
        maxrate: fraction of ``maxbandwith`` assigned to rank 1.
        numofnode: number of values per output line.
        turn: number of lines written to each file.
    """
    a = 1.0000001  # the exponent of zipf, a > 1.

    # Dividing by the rank-1 mass makes rank 1 map to maxbandwith * maxrate.
    peak = zipf.pmf(1, a)
    listofoutput = [
        int(zipf.pmf(rank, a) / peak * maxbandwith * maxrate)
        for rank in range(1, numofnode + 1)
    ]
    print(listofoutput)

    # Context managers guarantee both files are closed even if a write fails.
    with open("bandwidth_used.txt", "w") as fw_used, \
            open("bandwidth_rest.txt", "w") as fw_rest:
        for _ in range(turn):
            shuffle(listofoutput)  # new random order for every line
            fw_used.write(
                " ".join("%d" % used for used in listofoutput) + "\n")
            fw_rest.write(
                " ".join("%d" % (maxbandwith - used)
                         for used in listofoutput) + "\n")

Example #2

0

Show file

File: generate_google_rates.py Project: yhust/lacs

def generate_google_rates(k, n):
    """Build a (k, n) matrix of Zipf-shaped rates and log it.

    Every row is a randomly permuted Zipf pmf (exponent 1.05) over ranks
    1..n, normalized to sum to 1 and then scaled: the first 7 rows by
    ``slow_rate`` and the remaining rows by ``fast_rate``. The matrix is
    passed to ``log_rates(k, rates)`` before being returned.

    Args:
        k: number of rows; cast to ``int`` before use.
        n: number of columns.

    Returns:
        The (k, n) ``numpy`` array of rates.
    """
    # Cast first: np.zeros rejects a non-integer dimension such as 10.0.
    k = int(k)
    rates = np.zeros((k, n))
    fast_rate = 1.0 / 0.071648
    slow_rate = 1.0 / 7.429076
    zipf_factor = 1.05

    # Clamp so that fewer than 7 rows no longer indexes past the matrix.
    slow_number = min(7, k)

    for index_u in range(k):
        preference = np.random.permutation(n)
        row = zipf.pmf(preference + 1, zipf_factor)
        # normalize, then scale by this row's rate
        rates[index_u, :] = row / sum(row)
        rates[index_u, :] *= slow_rate if index_u < slow_number else fast_rate

    log_rates(k, rates)
    return rates

Example #3

0

Show file

File: ECBenchmark.py Project: wushuhan/Alluxio-DataPlacement

def fileReadInZipf(fileNum, zipfFactor, times):
    """Sample file indices with Zipf-distributed popularity.

    Args:
        fileNum: number of files; indices 0..fileNum-1 can be drawn.
        zipfFactor: Zipf exponent passed to ``zipf.pmf``.
        times: number of samples to draw.

    Returns:
        Array of ``times`` file indices drawn with ``np.random.choice``.
    """
    # The Zipf distribution is supported on ranks k >= 1, so file index i
    # uses rank i + 1. Evaluating at rank 0 gives zero mass, which made
    # file 0 unreachable and broke normalization for fileNum == 1.
    p = zipf.pmf(np.arange(1, fileNum + 1), zipfFactor)
    p = p / p.sum()
    x = np.random.choice(range(fileNum), size=times, p=p)
    return x

Example #4

0

Show file

File: test_discrete_distns.py Project: Armavica/scipy

 def test_zipfian_asymptotic(self):
     # test limiting case that zipfian(a, n) -> zipf(a) as n-> oo
     a = 6.5
     N = 10000000
     k = np.arange(1, 21)
     assert_allclose(zipfian.pmf(k, a, N), zipf.pmf(k, a))
     assert_allclose(zipfian.cdf(k, a, N), zipf.cdf(k, a))
     assert_allclose(zipfian.sf(k, a, N), zipf.sf(k, a))
     assert_allclose(zipfian.stats(a, N, moments='msvk'),
                     zipf.stats(a, moments='msvk'))

Example #5

0

Show file

def generate_model_test_rates(file_number, zipf_factor):
    """Build a randomly permuted, normalized Zipf rate vector and log it.

    Entry ``f`` receives the Zipf pmf at rank ``preference[f] + 1``, where
    ``preference`` is a random permutation of ``range(file_number)``. The
    vector is normalized to sum to 1 and passed to ``log_rates``.

    Args:
        file_number: length of the rate vector.
        zipf_factor: Zipf exponent passed to ``zipf.pmf``.
    """
    preference = np.random.permutation(file_number)
    rates = np.zeros(file_number)
    for index_f in range(file_number):
        rates[index_f] = zipf.pmf(preference[index_f] + 1, zipf_factor)

    # Normalize once, after every raw mass is in place. Normalizing inside
    # the loop rescaled the earlier entries on each pass, so the final
    # ratios between entries were no longer the Zipf ratios.
    if file_number:
        rates /= sum(rates)

    log_rates(rates)

Example #6

0

Show file

File: DistributionAnalyzer.py Project: dbiir/MiDBench

 def __getZipfianDifference(self, a, sample_ss):
     """Return the squared error between a sample and a scaled Zipf pmf.

     The second item of every element of ``sample_ss`` is treated as an
     observed value. The expected value at position ``r`` (1-based) is the
     total of all observed values times ``zipf.pmf(r, a)``; the squared
     differences are summed.
     """
     total = sum(entry[1] for entry in sample_ss)
     return sum(
         pow((1.0 * total * zipf.pmf(rank, a) - entry[1]), 2)
         for rank, entry in enumerate(sample_ss, start=1))

Example #7

0

Show file

File: generate_opQ.py Project: wushuhan/Alluxio-DataPlacement

def popularity(fileNumber, zipfFactor):
    """Generate a shuffled, normalized Zipf popularity vector and save it.

    The Zipf pmf is evaluated at ranks 1..fileNumber, normalized to sum
    to 1, shuffled in place, and written one value per line to
    ``<tests_dir>/ec_test_files/popularity.txt``.

    Args:
        fileNumber: number of entries in the vector.
        zipfFactor: Zipf exponent passed to ``zipf.pmf``.

    Returns:
        The shuffled popularity values as a numpy array.
    """
    ranks = list(range(1, fileNumber + 1))
    # zipf.pmf returns an array here, so the division below is an explicit
    # array operation instead of relying on ``list /= numpy scalar``.
    weights = zipf.pmf(ranks, zipfFactor)
    weights = weights / sum(weights)
    shuffle(weights)

    # ``with`` closes the file even when a write raises.
    with open(tests_dir + "/ec_test_files/popularity.txt", "w") as fw:
        for item in weights:
            fw.write(str(item) + '\n')
    return weights

Example #8

0

Show file

File: SPTestSetUp.py Project: yhjohn163/SP-Cache

def SPTestSetUp(fileSize, zipfFactor,
                flag):  # file size in MB, flag: whether write the files
    """Create the test input files under ``~/test_files``.

    Writes ``popularity.txt`` (a shuffled, normalized Zipf popularity
    vector over 10 files), ``k.txt`` (one partition count per file, clamped
    to [1, machineNumber]) and ``test_local_file`` (a file of
    ``fileSize * 1000 * 1000`` bytes). When ``flag == 1`` it also runs
    ``./bin/alluxio runSPPrepareFile`` and prints the elapsed milliseconds.

    Args:
        fileSize: size of the local test file in MB.
        zipfFactor: Zipf exponent passed to ``zipf.pmf``.
        flag: ``1`` to run the Alluxio prepare command afterwards.
    """
    # settings
    fileNumber = 10  # 500
    machineNumber = 30
    SPFactor = 6

    # generate popularity vector
    popularity = zipf.pmf(list(range(1, fileNumber + 1)), zipfFactor)
    popularity = popularity / sum(popularity)
    shuffle(popularity)

    tests_dir = os.path.expanduser('~')  # for Linux
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print("tests dir:" + tests_dir)

    test_files_dir = tests_dir + "/test_files"
    if not os.path.exists(test_files_dir):
        os.makedirs(test_files_dir)

    # Text is written in text mode and every handle is closed by ``with``;
    # previously the popularity.txt handle was never closed.
    with open(test_files_dir + "/popularity.txt", "w") as fw:
        for item in popularity:
            fw.write("%s\n" % item)

    # calculate the partition_number, in the range of [1, machineNumber]
    kVector = [
        max(min(int(share * 100 * SPFactor), machineNumber), 1)
        for share in popularity
    ]
    with open(test_files_dir + "/k.txt", "w") as fw:
        for k in kVector:
            fw.write("%s\n" % k)

    # create the file of given size: seek to the last byte and write it
    with open(test_files_dir + "/test_local_file", "wb") as out:
        out.seek(int(fileSize * 1000 * 1000) - 1)
        out.write(b'\0')

    # write the files to Alluxio given the kvalues profile
    # remember to add the path of alluxio
    if flag == 1:
        start = int(round(time.time() * 1000))  # in millisecond
        os.system('./bin/alluxio runSPPrepareFile')
        end = int(round(time.time() * 1000))
        print('Write %s files takes %s' % (fileNumber, end - start))

Example #9

0

Show file

def make_zipf_plot(counts, tokens, title=None, savepath='./', save=False):
    """
    Plot rank-ordered normalized token counts against a Zipf pmf (log-log).

    counts are sorted in descending order, divided by their sum and drawn
    against ranks 1..len(counts), together with zipf.pmf(ranks, 1.07).
    A handful of points are annotated with the matching entry of tokens.
    With save=True the figure is written to savepath + title + '.png'
    (title falls back to 'zipf_plot'); the figure is always shown.
    """
    # adapted from here: https://finnaarupnielsen.wordpress.com/2013/10/22/zipf-plot-for-word-counts-in-brown-corpus/
    order = np.argsort(-counts)
    normalized_frequencies = counts[order] / sum(counts)
    ranks = np.arange(1, len(counts) + 1)

    # empirical curve first, then the reference Zipf curve
    # (1.07 is usually a good bet for the shape parameter)
    plt.figure(figsize=(10, 10))
    plt.loglog(ranks, normalized_frequencies, marker=".")
    plt.loglog(ranks, list(zipf.pmf(ranks, 1.07)))

    plt.xlabel("Frequency rank of token")
    plt.ylabel("Absolute frequency of token")

    axes = plt.gca()
    axes.set_aspect('equal')  # make the plot square
    plt.grid(True)
    if title is None:
        title = 'zipf_plot'  # only used as the file name when saving
        plt.title("Zipf plot")
    else:
        plt.title(title)

    # annotate ten log-spaced positions, skipping a label whenever its
    # y-value equals the previous one so the texts do not overlap
    label_positions = np.logspace(-0.5, np.log10(len(counts) - 1),
                                  10).astype(int)
    previous = None
    for pos in list(label_positions):
        current = normalized_frequencies[pos]
        if previous != current:
            plt.text(ranks[pos],
                     current,
                     " " + tokens[order[pos]],
                     verticalalignment="bottom",
                     horizontalalignment="left")
        previous = current

    if save:
        plt.savefig(savepath + title + '.png')

    plt.show()

Example #10

0

Show file

File: generate_rates.py Project: yhust/lacs

def generate_rates(n, arrival_rate, factor):
    """Build a (15, n) matrix of Zipf-shaped rates in three tiers and log it.

    Every row is a randomly permuted Zipf pmf (exponent 1.05) over ranks
    1..n, normalized to sum to 1 and scaled by the tier's arrival rate.
    Rows 0-4 use ``arrival_rate``, rows 5-9 use ``arrival_rate * factor``
    and rows 10-14 use ``arrival_rate * factor * factor``. The matrix is
    passed to ``log_rates(k, rates)`` before being returned.

    Args:
        n: number of columns.
        arrival_rate: scale applied to the first tier of rows.
        factor: multiplier applied to the rate between consecutive tiers.

    Returns:
        The (15, n) ``numpy`` array of rates.
    """
    zipf_factor = 1.05
    k = 15
    rates = np.zeros((k, n))

    # Floor division keeps the tier boundaries integral on Python 3 too;
    # with ``/`` they became floats, which range() and indexing reject.
    boundaries = [0, k // 3, 2 * k // 3, k]
    for tier in range(3):
        if tier:
            arrival_rate *= factor
        for index_u in range(boundaries[tier], boundaries[tier + 1]):
            preference = np.random.permutation(n)
            row = zipf.pmf(preference + 1, zipf_factor)
            # normalize, then scale by this tier's rate
            rates[index_u, :] = row / sum(row)
            rates[index_u, :] *= arrival_rate

    log_rates(k, rates)
    return rates

Example #11

0

Show file

File: ceph-read-bench.py Project: HydraZeng/ceph-bench

def prepare_objs(ioctx, reads_num, use_zipf, zipf_parm):
    """Build the shuffled list of ``{key, len}`` read descriptors.

    With ``use_zipf`` every listed object gets a repeat count derived from
    a normalized Zipf pmf (exponent ``zipf_parm``); the counts are shuffled
    across objects and each object is emitted that many times. Otherwise
    the objects are emitted in listing order, returning early (unshuffled)
    as soon as ``reads_num`` entries exist, and cycling over the collected
    entries until ``reads_num`` is reached.
    """
    ioctx.require_ioctx_open()
    cluster_objects = list(ioctx.list_objects())

    objs = []
    if use_zipf:
        objs_p = [
            zipf.pmf(rank, zipf_parm)
            for rank in range(1, len(cluster_objects) + 1)
        ]
        objs_p /= sum(objs_p)

        # one extra read per object until the running total reaches
        # reads_num; after that the remaining objects get zero reads
        objs_c = []
        count = 0
        for p in objs_p:
            if count < reads_num:
                c = int(p * reads_num) + 1
            else:
                c = 0
            objs_c.append(c)
            count += c

        shuffle(objs_c)

        for obj, repeats in zip(cluster_objects, objs_c):
            if repeats <= 0:
                continue
            # key and size are looked up once per object
            key = obj.key
            length = obj.stat()[0]
            for _ in range(repeats):
                objs.append(dict(key=key, len=length))

    else:
        for seen, obj in enumerate(cluster_objects, start=1):
            objs.append(dict(key=obj.key, len=obj.stat()[0]))
            if seen == reads_num:
                return objs
        obj_num = len(objs)
        # pad by cycling over the entries collected above
        while len(objs) < reads_num:
            template = objs[len(objs) % obj_num]
            objs.append(dict(key=template['key'], len=template['len']))

    shuffle(objs)
    return objs

Example #12

0

Show file

def zipf_weights(length, q=0.7) -> list:
    """
    Return Zipf pmf values for ranks 1..length, usable as choice weights.

    The Zipf shape is 1 / q (or 1 when q == 0, which avoids a division by
    zero), so it scales inversely to q.

    int, (float) -> [floats]
    """
    shape = 1 if q == 0 else 1 / q

    # Rank 0 carries no mass, so evaluation starts directly at rank 1.
    return [zipf.pmf(rank, shape) for rank in range(1, length + 1)]

Example #13

0

Show file

def SPTestSetUp(fileSize, zipfFactor):  # file size in MB
    """Create the test input files under ``~/test_files``.

    Writes ``popularity.txt`` (a shuffled, normalized Zipf popularity
    vector; currently a single file), ``k.txt`` (the fixed partition counts
    1, 2, 3, 4) and ``test_local_file<fileSize>MB`` (a file of
    ``fileSize * 1000 * 1000`` bytes).

    Args:
        fileSize: size of the local test file in MB.
        zipfFactor: Zipf exponent passed to ``zipf.pmf``.
    """
    # settings
    fileNumber = 1  # 500

    # generate popularity vector
    popularity = zipf.pmf(list(range(1, fileNumber + 1)), zipfFactor)
    popularity = popularity / sum(popularity)
    shuffle(popularity)

    tests_dir = os.path.expanduser('~')  # for Linux
    # Parenthesized single-argument print works on Python 2 and Python 3.
    print("tests dir:" + tests_dir)

    test_files_dir = tests_dir + "/test_files"
    if not os.path.exists(test_files_dir):
        os.makedirs(test_files_dir)

    # Text is written in text mode and every handle is closed by ``with``;
    # previously the popularity.txt handle was never closed.
    with open(test_files_dir + "/popularity.txt", "w") as fw:
        for item in popularity:
            fw.write("%s\n" % item)

    # fixed partition counts (not derived from the popularity vector)
    kVector = [1, 2, 3, 4]
    with open(test_files_dir + "/k.txt", "w") as fw:
        for k in kVector:
            fw.write("%d\n" % k)

    # create the file of given size: seek to the last byte and write it
    with open(test_files_dir + "/test_local_file%dMB" % fileSize,
              "wb") as out:
        out.seek(int(fileSize * 1000 * 1000) - 1)
        out.write(b'\0')

Example #14

0

Show file

    # For each candidate model below the same three steps are applied to the
    # abundance data ``ab``: solve for the parameter(s), evaluate the pmf on
    # ``x_values``, and compute a log-likelihood. The ``md`` helpers are not
    # visible here, so their exact semantics are assumed from their names.

    # Log-series model (pmf evaluated with an infinite upper bound).
    logser_p = md.logser_solver(ab)
    logser_values = md.trunc_logser.pmf(x_values,
                                        logser_p,
                                        upper_bound=float("inf"))
    lsll = md.logser_ll(ab, logser_p)

    # Lower-truncated negative binomial model.
    nb_n, nb_p = md.nbinom_lower_trunc_solver(ab)
    nb_values = md.nbinom_lower_trunc.pmf(x_values, nb_n, nb_p)
    nbll = md.nbinom_lower_trunc_ll(ab, nb_n, nb_p)

    # "pln" model, evaluated with lower_trunc=True.
    pln_mu, pln_sigma = md.pln_solver(ab)
    pln_values = md.pln.pmf(x_values, pln_mu, pln_sigma, lower_trunc=True)
    plnll = md.pln_ll(ab, pln_mu, pln_sigma)

    # Zipf model: parameter from md, pmf from scipy.stats.zipf.
    zipf_par = md.zipf_solver(ab)
    zipf_values = zipf.pmf(x_values, zipf_par)
    zll = md.zipf_ll(ab, zipf_par)

    # Empirical relative frequencies: every entry of ``ab`` adds 1/len(ab)
    # to the slot indexed by its value.
    # NOTE(review): under Python 2 without true division, 1 / len(ab) is 0
    # whenever len(ab) > 1 -- confirm which interpreter this runs on.
    ab_y = np.zeros(len(x_values) + 1)
    for j in range(len(ab)):
        ab_y[ab[j]] = ab_y[ab[j]] + 1 / len(ab)

    # Show at most the first 50 abundance values on the x axis.
    ax.set_xlim([0, min(50, max(x_values))])

    plt.ylabel('frequency')
    plt.xlabel('abundance')
    plt.title(plot_labels[i])

    # Width originally set at 12 when width was 50.
    # This should be the same proportional width
    width = 3 / min(50, max(x_values)) * 50

Example #15

0

Show file

File: rappor.py Project: azure84/dp_market

def pmf(x, distribution):
    """Return ``zipf.pmf(x, distribution)``: the Zipf mass at ``x``.

    ``distribution`` is forwarded as the Zipf shape parameter.
    """
    return zipf.pmf(x, distribution)

Example #16

0

Show file

File: zipfian_plot_example.py Project: Benjamin-Siebold/MSDS-682-Text-Analytics

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zipf

# Ranks 1..1000 shared by every plot below.
x = np.arange(1, 1001)

# The pmf for shape 1.07, first on log-log axes and then on linear axes,
# each shown in its own figure.
for draw in (plt.loglog, plt.plot):
    draw(x, zipf.pmf(x, 1.07))
    plt.show()

# Log-log comparison of three shape parameters in one labelled figure.
for shape in [1.07, 2, 3]:
    curve = zipf.pmf(x, shape)
    plt.loglog(x, curve, label=str(shape))

plt.legend()
plt.show()

Example #17

0

Show file

File: Zip.py Project: D3F3R4L/Fog4VR

# In[1]:

from scipy.stats import zipf
import numpy as np
import matplotlib.pyplot as plt

# Zipf shape parameter (a) and the rank (k) at which the pmf is evaluated.
a = 2
k = 1
# x = np.arange(zipf.ppf(0.01, a),
#               zipf.ppf(0.99, a))

# rv = zipf(a)
# prob = zipf.cdf(x, a)
# np.allclose(x, zipf.ppf(prob, a))
# Draw 10 random Zipf variates with shape a.
r = zipf.rvs(a, size=10)
# Probability mass at rank k (loc=0 leaves the distribution unshifted).
pf = zipf.pmf(k, a, loc=0)
# print(r)
#print(pf)
# pmf(k, a, loc=0)
# la = 1-pf

# In[2]:

# np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0])

from random import choices
files = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
weights = [
    0.6079271018540265, 0.04356365534955261, 0.04356365534955261,
    0.04356365534955261, 0.04356365534955261, 0.04356365534955261,
    0.04356365534955261, 0.04356365534955261, 0.04356365534955261,

Example #18

0

Show file

import matplotlib.pyplot as plt
from scipy.stats import zipf

# csv.writer is used below; import it here so the script does not depend on
# an import that is not part of this file's visible import block.
import csv

# (count, word) pairs collected from ``records``.
lst = []

with open('/Users/lilucy/Desktop/zipfdata.csv', 'w') as csvfile:
    fieldnames = ['word', 'count']
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    # NOTE(review): only the header row is written; the data rows are
    # collected in ``lst`` but never passed to the writer -- confirm intent.
    for row in records:
        wordtokens = row[0].lower()
        # Counts must be numeric: kept as strings, the sum() below raised
        # TypeError and the bar heights were plotted as text categories.
        count = int(row[1])
        lst.append((count, wordtokens))


words = [word for _, word in lst]
counts = [value for value, _ in lst]

plt.bar(words, counts, color='limegreen')
alpha = 1.37065874
total = sum(counts)
# Expected counts under a Zipf pmf with exponent alpha, scaled to the total.
expected = [zipf.pmf(rank, alpha) * total for rank in range(1, len(lst) + 1)]
plt.plot(range(len(lst)), expected, color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.show()

Example #19

0

Show file

File: paramsView.py Project: PyMarc2/virus-propagation-simulator

    def generate_graph_data(self):
        """Compute pmf plot data for the selected row and hand it to update_graph.

        The selected table row names an age group and a parameter; their
        entry in ``temporaryParametersDict`` supplies ``p1``, ``p2`` and the
        distribution type. For a recognised type, x spans the integers from
        the 1% to the 99% quantile and y is the pmf on x. Types that take
        only ``p1`` reset column 3 of the selected row to 0 when ``p2`` is
        non-zero. An unrecognised type yields empty x and y. Any exception
        raised while building the data is logged and not re-raised.
        """
        selected = self.selected_item_index
        row_values = self.tableModel.data[selected.row()]
        ageGroup = row_values[0]
        parameter = row_values[1]
        settings = self.temporaryParametersDict[ageGroup][parameter]
        p1 = settings["p1"]
        p2 = settings["p2"]
        distributionType = settings["distributionType"]

        xyDict = {"x": [], "y": []}
        try:
            # Choose the distribution, its shape arguments, and whether the
            # unused second parameter has to be cleared in the table.
            dist = None
            args = ()
            clears_p2 = False
            if distributionType == 'Binomial':
                dist, args = binom, (int(p1), p2 / 100)
            elif distributionType == 'Geometric':
                dist, args, clears_p2 = geom, (p1 / 100, ), True
            elif distributionType == 'Laplacian':
                dist, args, clears_p2 = dlaplace, (p1 / 100, ), True
            elif distributionType == 'Logarithmic':
                dist, args, clears_p2 = logser, (p1 / 100, ), True
            elif distributionType == 'Neg. binomial':
                dist, args = nbinom, (p1, p2 / 100)
            elif distributionType == 'Planck':
                dist, args, clears_p2 = planck, (p1 / 100, ), True
            elif distributionType == 'Poisson':
                dist, args, clears_p2 = poisson, (p1, ), True
            elif distributionType == 'Uniform':
                # keep the lower bound non-negative by shrinking the width
                if p1 - 0.5 * p2 < 0:
                    p2 = p1
                dist, args = randint, (p1 - 0.5 * p2, p1 + 0.5 * p2)
            elif distributionType == 'Zipf (Zeta)':
                dist, args, clears_p2 = zipf, (p1, ), True

            if dist is not None:
                xs = np.arange(dist.ppf(0.01, *args), dist.ppf(0.99, *args))
                xyDict["x"] = xs
                xyDict["y"] = dist.pmf(xs, *args)
                if clears_p2 and p2 != 0:
                    self.tableModel.setData(
                        selected.sibling(selected.row(), 3), 0, Qt.EditRole)
            self.update_graph(xyDict)
        except Exception as E:
            log.error(E)

Example #20

0

Show file

File: ECTestSetUp.py Project: wushuhan/Alluxio-DataPlacement

def ECTestSetUp(filesize,
                fileNumber):  # file size in MB
    """Create the EC test input files under ``~/ec_test_files`` and load them.

    Writes four text files with one value per line and ``fileNumber`` lines
    each: ``popularity.txt`` (shuffled, normalized Zipf popularity with
    exponent 1.5), ``fileSize.txt`` (``filesize`` MB converted to bytes),
    ``k.txt`` (always 3) and ``n.txt`` (always 1). It then runs
    ``$ALLUXIO_HOME/bin/alluxio runECPrepareFile true`` and prints the
    elapsed milliseconds.

    Args:
        filesize: size of every file in MB.
        fileNumber: number of files to describe.
    """
    zipfFactor = 1.5

    # generate popularity vector
    popularity = zipf.pmf(list(range(1, fileNumber + 1)), zipfFactor)
    popularity = popularity / sum(popularity)
    shuffle(popularity)

    tests_dir = os.path.expanduser('~')  # for Linux
    print("tests dir:" + tests_dir)

    out_dir = tests_dir + "/ec_test_files"
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Every file is written inside ``with`` so it is flushed and closed
    # before the Alluxio command below runs; previously the popularity.txt
    # handle was never explicitly closed.
    with open(out_dir + "/popularity.txt", "w") as fw:
        for item in popularity:
            fw.write(str(item) + '\n')

    size_in_bytes = filesize * 1024 * 1024
    filesizes = [size_in_bytes] * fileNumber
    with open(out_dir + "/fileSize.txt", "w") as fw:
        for size in filesizes:
            fw.write(str(int(size)) + '\n')

    # fixed partition count per file (not derived from the popularity)
    kVector = [3] * fileNumber
    with open(out_dir + "/k.txt", "w") as fw:
        for k in kVector:
            fw.write(str(k) + '\n')

    nVector = [1] * fileNumber
    with open(out_dir + "/n.txt", "w") as fw:
        for n in nVector:
            fw.write(str(n) + '\n')

    # write the files to Alluxio given the kvalues profile
    # remember to add the path of alluxio
    start = int(round(time.time() * 1000))  # in millisecond
    os.system('$ALLUXIO_HOME/bin/alluxio runECPrepareFile true')
    end = int(round(time.time() * 1000))
    print('Write %s files takes %s' % (fileNumber, end - start))

Example #21

0

Show file

File: chisquare_test.py Project: nomihadar/Thesis

def chisquare(observations, shape_file, min_prob, maxlength, dist):
    """Run a binned chi-square goodness-of-fit test for every shape value.

    For each shape returned by ``get_shape_values(shape_file, dist)`` the
    expected pmf over lengths 1..max_length is normalized, consecutive
    lengths are merged into bins until each bin holds at least ``min_prob``
    probability, the observations are binned the same way, and
    ``stats.chisquare`` compares the two.

    Args:
        observations: observed lengths (non-negative integers, as required
            by ``np.bincount``).
        shape_file: forwarded to ``get_shape_values``.
        min_prob: minimal accumulated probability of a bin.
        maxlength: largest length to keep; when falsy, the maximum of
            ``observations`` is used.
        dist: ``"zipf"`` or ``"negbinom"``.

    Returns:
        ``pandas.DataFrame`` with one row per shape value. A failed
        chi-square computation is reported as chisq = pvalue = -1.

    Raises:
        ValueError: if ``dist`` is not a supported distribution and there
            is at least one shape value to evaluate.
    """
    max_length = maxlength if maxlength else max(observations)

    # remove observations larger than the maximal length
    observed = [o for o in observations if o <= max_length]

    # get shape parameters
    shape_values = get_shape_values(shape_file, dist)

    # define results data frame
    results = {"0shape": [], "1chisq": [], "2pvalue": [],
               "3n.observations": [], "4n.bins": [],
               "5n.expected < 5": [], "6n.observed < 5": []}

    if dist == "negbinom":
        results["0shape2"] = []

    for shape in shape_values:

        # calculate expected frequencies:
        if dist == "zipf":
            expect_freq = zipf.pmf(range(1, max_length + 1), shape)
        elif dist == "negbinom":
            r, p = shape
            expect_freq = nbinom.pmf(range(1, max_length + 1), r, p)
        else:
            # Previously this fell through with expect_freq unbound.
            raise ValueError("unsupported distribution: %r" % (dist,))

        try:
            expect_freq = np.array(expect_freq) / sum(expect_freq)
        except (ZeroDivisionError, FloatingPointError):
            print("shape caused zero-division: ", shape)

        # accumulate frequencies to a minimal probability of min_prob
        acc_freqs = [0]
        bins_lengths = [0]
        for freq in expect_freq:
            if acc_freqs[-1] < min_prob:
                acc_freqs[-1] += freq
                bins_lengths[-1] += 1
            else:
                acc_freqs.append(freq)
                bins_lengths.append(1)

        acc_expected = np.array(acc_freqs) * len(observed)

        # observed counts per length, dropping the slot for length 0
        observed_hist = list(np.bincount(observed)[1:])

        # accumulate observations according to the accumulated frequencies
        start = 0
        acc_observed = []
        for length in bins_lengths:
            acc_observed.append(sum(observed_hist[start:start + length]))
            start += length

        # Best effort on purpose: a failing test is recorded as -1 instead
        # of aborting the scan. ``Exception`` (not a bare except) keeps
        # KeyboardInterrupt and SystemExit propagating.
        try:
            chisq, pval = stats.chisquare(acc_observed, acc_expected)
        except Exception:
            chisq, pval = -1, -1

        # count how many bins are below MIN_BIN in expected and observed
        less_obs = sum(value < MIN_BIN for value in acc_observed)
        less_exp = sum(value < MIN_BIN for value in acc_expected)
        if dist == "zipf":
            results["0shape"].append(shape)
        if dist == "negbinom":
            results["0shape"].append(r)
            results["0shape2"].append(p)
        results["1chisq"].append(chisq)
        results["2pvalue"].append(pval)
        results["3n.observations"].append(len(observed))
        results["4n.bins"].append(len(acc_expected))
        results["5n.expected < 5"].append(less_exp)
        results["6n.observed < 5"].append(less_obs)

    return pd.DataFrame(results)

Example #22

0

Show file

from scipy.stats import zipf
import numpy as np

fileNumber = 100
zipfFactor = 1.05

# Zipf popularity of ranks 1..fileNumber, renormalized to sum to 1.
popularity = zipf.pmf(np.arange(1, fileNumber + 1), zipfFactor)
popularity = popularity / popularity.sum()
#popularity = popularity[::-1]

# Per-file partition count, clamped to [1, 30], and the resulting size.
count = list()
size = list()
for pop in popularity:
    this_count = max(min(int(300 * pop), 30), 1)
    count.append(this_count)
    # ``//`` keeps the integer division that ``/`` performed on Python 2.
    size.append(100 // this_count)
# Parenthesized single-argument print works on Python 2 and Python 3.
print(size)

Example #23

0

Show file

File: example-plot.py Project: ethanwhite/sad-comparison

    # Integer abundance values 1 .. max(ab) + 1 on which each pmf is evaluated.
    x_values = np.array(range(max(ab) + 2)[1:])

    # For each candidate model below the same three steps are applied to the
    # abundance data ``ab``: solve for the parameter(s), evaluate the pmf on
    # ``x_values``, and compute a log-likelihood. The ``md`` helpers are not
    # visible here, so their exact semantics are assumed from their names.

    # Log-series model (pmf evaluated with an infinite upper bound).
    logser_p = md.logser_solver(ab)
    logser_values = md.trunc_logser.pmf(x_values, logser_p, upper_bound=float("inf"))
    lsll = md.logser_ll(ab, logser_p)

    # Lower-truncated negative binomial model.
    nb_n, nb_p = md.nbinom_lower_trunc_solver(ab)
    nb_values = md.nbinom_lower_trunc.pmf(x_values, nb_n, nb_p)
    nbll = md.nbinom_lower_trunc_ll(ab, nb_n, nb_p)

    # "pln" model, evaluated with lower_trunc=True.
    pln_mu, pln_sigma = md.pln_solver(ab)
    pln_values = md.pln.pmf(x_values, pln_mu, pln_sigma, lower_trunc=True)
    plnll = md.pln_ll(ab, pln_mu, pln_sigma)

    # Zipf model: parameter from md, pmf from scipy.stats.zipf.
    zipf_par = md.zipf_solver(ab)
    zipf_values = zipf.pmf(x_values, zipf_par)
    zll = md.zipf_ll(ab, zipf_par)

    # Empirical relative frequencies: every entry of ``ab`` adds 1/len(ab)
    # to the slot indexed by its value.
    # NOTE(review): under Python 2 without true division, 1/len(ab) is 0
    # whenever len(ab) > 1 -- confirm which interpreter this runs on.
    ab_y = np.zeros(len(x_values) + 1)
    for j in range(len(ab)):
        ab_y[ab[j]] = ab_y[ab[j]] + 1/len(ab)

    # Show at most the first 50 abundance values on the x axis.
    ax.set_xlim([0,min(50, max(x_values))])

    plt.ylabel('frequency')
    plt.xlabel('abundance')
    plt.title(plot_labels[i])

    # Width originally set at 12 when width was 50.
    # This should be the same proportional width
    width = 3 / min(50, max(x_values)) * 50

Example #24

0

Show file

from scipy.stats import zipf
import matplotlib.pyplot as plt

# np.arange is used below, but numpy was never imported by this script.
import numpy as np

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:

a = 6.5
mean, var, skew, kurt = zipf.stats(a, moments='mvsk')

# Display the probability mass function (``pmf``) on the integers between
# the 1% and the 99% quantile:

x = np.arange(zipf.ppf(0.01, a), zipf.ppf(0.99, a))
ax.plot(x, zipf.pmf(x, a), 'bo', ms=8, label='zipf pmf')
ax.vlines(x, 0, zipf.pmf(x, a), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:

rv = zipf(a)
ax.vlines(x,
          0,
          rv.pmf(x),
          colors='k',
          linestyles='-',
          lw=1,
          label='frozen pmf')
ax.legend(loc='best', frameon=False)