from random import shuffle

from scipy.stats import zipf


def bandwidth_generate(maxbandwidth=125,  # bandwidth cap per node, in MB
                       maxrate=0.99,
                       numofnode=16,
                       turn=2):
    a = 1.0000001  # the Zipf exponent; scipy requires a > 1

    # Generate one bandwidth value per node, scaled so the most popular
    # node uses maxrate of the available bandwidth.
    listofoutput = []
    for i in range(1, numofnode + 1):
        tmp = zipf.pmf(i, a) / zipf.pmf(1, a) * maxbandwidth * maxrate
        listofoutput.append(int(tmp))
    print(listofoutput)

    fw_used = open("bandwidth_used.txt", "w")
    fw_rest = open("bandwidth_rest.txt", "w")

    # Each turn writes one shuffled row: used bandwidth per node, and the
    # remaining headroom (maxbandwidth - used).
    for i in range(1, turn + 1):
        shuffle(listofoutput)
        for j in range(numofnode - 1):
            fw_used.write("%d " % listofoutput[j])
            fw_rest.write("%d " % (maxbandwidth - listofoutput[j]))
        fw_used.write("%d\n" % listofoutput[-1])
        fw_rest.write("%d\n" % (maxbandwidth - listofoutput[-1]))

    fw_used.close()
    fw_rest.close()
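A minimal invocation sketch; the two output filenames are fixed inside the function body above:

if __name__ == "__main__":
    # writes bandwidth_used.txt and bandwidth_rest.txt in the working directory
    bandwidth_generate(maxbandwidth=125, maxrate=0.99, numofnode=16, turn=2)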
Example #2
import numpy as np
from scipy.stats import zipf


def generate_google_rates(k, n):
    """Generate a k-by-n matrix of per-user, per-file request rates.

    The first `slow_number` users get the slow service rate, the rest the
    fast one; each user's rates follow a Zipf popularity over a random
    permutation of the n files.
    """
    k = int(k)
    rates = np.zeros((k, n))
    fast_rate = 1.0 / 0.071648
    slow_rate = 1.0 / 7.429076
    zipf_factor = 1.05

    slow_number = 7  # assumed to be <= k
    for index_u in range(slow_number):  # slow users
        preference = np.random.permutation(n)
        for index_f in range(n):
            rates[index_u, index_f] = zipf.pmf(preference[index_f] + 1, zipf_factor)
        # normalize, then scale to this user's total arrival rate
        rates[index_u, :] /= sum(rates[index_u, :])
        rates[index_u, :] *= slow_rate

    for index_u in range(slow_number, k):  # fast users
        preference = np.random.permutation(n)
        for index_f in range(n):
            rates[index_u, index_f] = zipf.pmf(preference[index_f] + 1, zipf_factor)
        rates[index_u, :] /= sum(rates[index_u, :])
        rates[index_u, :] *= fast_rate

    log_rates(k, rates)  # external helper from the surrounding project
    return rates
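A hedged usage sketch; log_rates belongs to the snippet's own project, so a stand-in stub is assumed here just to make the call runnable:

def log_rates(k, rates):  # hypothetical stub for the project's real helper
    print("generated rates for %d users" % k)

rates = generate_google_rates(k=10, n=20)
print(rates.shape)      # (10, 20)
print(rates[0].sum())   # ~= slow_rate for one of the 7 slow users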
import numpy as np
from scipy.stats import zipf


def fileReadInZipf(fileNum, zipfFactor, times):
    # Zipf ranks start at 1; using i directly would give file 0 a zero
    # probability, so shift the rank by one.
    p = [zipf.pmf(i + 1, zipfFactor) for i in range(fileNum)]
    p_sum = sum(p)
    p = [v / p_sum for v in p]  # normalize into a probability vector
    x = np.random.choice(range(fileNum), size=times, p=p)
    return x
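For example, drawing 1,000 file reads over 10 files shows the expected skew toward the low indices:

reads = fileReadInZipf(fileNum=10, zipfFactor=1.5, times=1000)
print(np.bincount(reads, minlength=10))  # counts fall off roughly as a power law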
    # Method from a test class; assumes numpy as np, assert_allclose, and
    # scipy.stats' zipf / zipfian are imported at module level.
    def test_zipfian_asymptotic(self):
        # test limiting case that zipfian(a, n) -> zipf(a) as n -> oo
        a = 6.5
        N = 10000000
        k = np.arange(1, 21)
        assert_allclose(zipfian.pmf(k, a, N), zipf.pmf(k, a))
        assert_allclose(zipfian.cdf(k, a, N), zipf.cdf(k, a))
        assert_allclose(zipfian.sf(k, a, N), zipf.sf(k, a))
        assert_allclose(zipfian.stats(a, N, moments='msvk'),
                        zipf.stats(a, moments='msvk'))
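The same limit can be checked interactively; a minimal standalone sketch (scipy.stats has shipped zipfian since version 1.6):

import numpy as np
from scipy.stats import zipf, zipfian

a, N = 6.5, 10000000
k = np.arange(1, 21)
# for large N the bounded Zipfian pmf matches the unbounded Zipf pmf
print(np.max(np.abs(zipfian.pmf(k, a, N) - zipf.pmf(k, a))))  # ~0.0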
Example #5
def generate_model_test_rates(file_number, zipf_factor):
    preference = np.random.permutation(file_number)
    rates = np.zeros(file_number)
    for index_f in range(file_number):
        rates[index_f] = zipf.pmf(preference[index_f] + 1, zipf_factor)
    # normalize once, after all rates are filled in (the original
    # normalized inside the loop, which distorts every rate but the last)
    rates /= sum(rates)

    log_rates(rates)  # external helper from the surrounding project
    # Private method from a class; sample_ss is a sequence of (item, count)
    # pairs ordered by rank.
    def __getZipfianDifference(self, a, sample_ss):
        # total mass of the sample, used to scale the Zipf pmf
        sum_sample = sum(i[1] for i in sample_ss)
        difference = 0
        count = 0
        for i in sample_ss:
            count += 1
            # squared error between the Zipf-predicted count at this rank
            # and the observed count
            difference += pow(1.0 * sum_sample * zipf.pmf(count, a) - i[1], 2)
        return difference
def popularity(fileNumber, zipfFactor):
    popularity = [zipf.pmf(i, zipfFactor) for i in range(1, fileNumber + 1)]
    # a plain list does not support /=; normalize via numpy instead
    popularity = np.array(popularity)
    popularity /= popularity.sum()
    shuffle(popularity)

    # tests_dir is a module-level global in the original project
    fw = open(tests_dir + "/ec_test_files/popularity.txt", "w")
    for item in popularity:
        fw.write(str(item) + '\n')
    fw.close()
    return popularity
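A usage sketch, assuming the module-level tests_dir that the function reads is set up first:

import os
tests_dir = os.path.expanduser('~')  # the global the function expects
os.makedirs(tests_dir + "/ec_test_files", exist_ok=True)

pop = popularity(fileNumber=10, zipfFactor=1.5)
print(pop.sum())  # ~1.0 after normalization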
Example #8
import os
import time
from random import shuffle

import numpy as np
from scipy.stats import zipf


def SPTestSetUp(fileSize, zipfFactor, flag):  # file size in MB; flag: whether to write the files
    # settings
    fileNumber = 10  # 500
    machineNumber = 30  # 30
    SPFactor = 6

    # generate popularity vector (numpy array: a plain list has no /=)
    popularity = np.array([zipf.pmf(i, zipfFactor)
                           for i in range(1, fileNumber + 1)])
    popularity /= popularity.sum()
    shuffle(popularity)

    tests_dir = os.path.expanduser('~')  # for Linux and macOS
    print("tests dir:" + tests_dir)

    if not os.path.exists(tests_dir + "/test_files"):
        os.makedirs(tests_dir + "/test_files")

    fw = open(tests_dir + "/test_files/popularity.txt", "w")
    for item in popularity:
        fw.write("%s\n" % item)
    fw.close()

    # calculate the partition number, in the range [1, machineNumber]
    kVector = [
        max(min(int(popularity[id] * 100 * SPFactor), machineNumber), 1)
        for id in range(fileNumber)
    ]
    fw = open(tests_dir + "/test_files/k.txt", "w")
    for k in kVector:
        fw.write("%s\n" % k)
    fw.close()

    # create a sparse file of the given size (the with-block closes it)
    with open(tests_dir + "/test_files/test_local_file", "wb") as out:
        out.seek((fileSize * 1000 * 1000) - 1)
        out.write(b'\0')

    # write the files to Alluxio given the k-values profile
    # (remember to add the path of alluxio)
    if flag == 1:
        start = int(round(time.time() * 1000))  # in milliseconds
        os.system('./bin/alluxio runSPPrepareFile')
        end = int(round(time.time() * 1000))
        print('Write %s files takes %s' % (fileNumber, end - start))
Example #9
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zipf


def make_zipf_plot(counts, tokens, title=None, savepath='./', save=False):
    """
    Makes a Zipfian distribution plot.

    Adapted from:
    https://finnaarupnielsen.wordpress.com/2013/10/22/zipf-plot-for-word-counts-in-brown-corpus/
    """
    # rank the tokens by descending count and normalize the frequencies
    ranks = np.arange(1, len(counts) + 1)
    indices = np.argsort(-counts)
    normalized_frequencies = counts[indices] / sum(counts)

    # make plot
    f = plt.figure(figsize=(10, 10))
    plt.loglog(ranks, normalized_frequencies, marker=".")

    # add the expected Zipfian distribution from the equation;
    # 1.07 is usually a good bet for the shape parameter
    plt.loglog(ranks, zipf.pmf(ranks, 1.07))

    # add labels for clarity
    plt.xlabel("Frequency rank of token")
    plt.ylabel("Absolute frequency of token")

    ax = plt.gca()  # get current axis
    ax.set_aspect('equal')  # make the plot square
    plt.grid(True)
    if title is not None:
        plt.title(title)
    else:
        title = 'zipf_plot'  # for saving figure
        plt.title("Zipf plot")

    # add text labels at log-spaced ranks; skip a label when the y-value
    # repeats so that words don't overlap
    last_freq = None
    for i in np.logspace(-0.5, np.log10(len(counts) - 1), 10).astype(int):
        if last_freq != normalized_frequencies[i]:
            dummy = plt.text(ranks[i],
                             normalized_frequencies[i],
                             " " + tokens[indices[i]],
                             verticalalignment="bottom",
                             horizontalalignment="left")
        last_freq = normalized_frequencies[i]

    if save:
        plt.savefig(savepath + title + '.png')

    plt.show()
Example #10
def generate_rates(n, arrival_rate, factor):
    zipf_factor = 1.05
    k = 15  # number of users, split into three speed tiers
    rates = np.zeros((k, n))

    # Use integer division: under Python 3, k / 3 is a float and breaks
    # both range() and the tier boundaries.
    for index_u in range(k // 3):  # slow tier
        preference = np.random.permutation(n)
        for index_f in range(n):
            rates[index_u, index_f] = zipf.pmf(preference[index_f] + 1,
                                               zipf_factor)
        # normalize, then scale to this tier's arrival rate
        rates[index_u, :] /= sum(rates[index_u, :])
        rates[index_u, :] *= arrival_rate
    arrival_rate *= factor

    for index_u in range(k // 3, 2 * k // 3):  # middle tier
        preference = np.random.permutation(n)
        for index_f in range(n):
            rates[index_u, index_f] = zipf.pmf(preference[index_f] + 1,
                                               zipf_factor)
        rates[index_u, :] /= sum(rates[index_u, :])
        rates[index_u, :] *= arrival_rate
    arrival_rate *= factor

    for index_u in range(2 * k // 3, k):  # fast tier
        preference = np.random.permutation(n)
        for index_f in range(n):
            rates[index_u, index_f] = zipf.pmf(preference[index_f] + 1,
                                               zipf_factor)
        rates[index_u, :] /= sum(rates[index_u, :])
        rates[index_u, :] *= arrival_rate

    log_rates(k, rates)  # external helper from the surrounding project
    return rates
from random import shuffle

from scipy.stats import zipf


def prepare_objs(ioctx, reads_num, use_zipf, zipf_parm):
    ioctx.require_ioctx_open()
    cluster_objects = list(ioctx.list_objects())

    objs = []
    count = 0
    if use_zipf:
        # Zipf weight per object, normalized into probabilities
        # (a plain list has no /=, so normalize element-wise)
        objs_p = [zipf.pmf(i, zipf_parm)
                  for i in range(1, len(cluster_objects) + 1)]
        p_sum = sum(objs_p)
        objs_p = [p / p_sum for p in objs_p]

        objs_c = []
        for p in objs_p:
            c = int(p * reads_num) + 1 if count < reads_num else 0
            objs_c.append(c)
            count += c

        shuffle(objs_c)

        for i, obj in enumerate(cluster_objects):
            key = 0
            length = 0
            for j in range(objs_c[i]):
                if j == 0:
                    key = obj.key
                    length = obj.stat()[0]
                objs.append(dict(key=key, len=length))

    else:
        for obj in cluster_objects:
            objs.append(dict(key=obj.key, len=obj.stat()[0]))
            count += 1
            if count == reads_num:
                return objs
        obj_num = count
        while count < reads_num:
            idx = count % obj_num
            objs.append(dict(key=objs[idx]['key'], len=objs[idx]['len']))
            count += 1

    shuffle(objs)
    return objs
Example #12
from scipy.stats import zipf


def zipf_weights(length, q=0.7) -> list:
    """
    Alternative to the above using the Zipf distribution.
    Note that zipf.pmf(0, shape) is always 0, so we compute one extra
    value and drop that leading zero.

    int, (float) -> [floats]
    """
    length += 1  # later we drop the first value, which is always 0

    # The Zipf shape scales inversely to the parameter of the Poisson
    # variant above, so the same q works for either distribution; guard
    # against division by zero here.
    if q == 0:
        shape = 1
    else:
        shape = 1 / q

    # probability mass function yields weights for a weighted choice
    weights = [zipf.pmf(i, shape) for i in range(length)][1:]

    return weights
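A quick sketch of feeding these weights into a weighted choice; random.choices is the standard-library consumer for such a weight list:

from random import choices

items = ['a', 'b', 'c', 'd', 'e']
weights = zipf_weights(len(items), q=0.7)
print(choices(items, weights=weights, k=10))  # heavily skewed toward 'a'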
Example #13
import os
from random import shuffle

import numpy as np
from scipy.stats import zipf


def SPTestSetUp(fileSize, zipfFactor):  # file size in MB
    # settings
    fileNumber = 1  # 500
    SPFactor = 6  # unused with the hard-coded kVector below

    # generate popularity vector (numpy array: a plain list has no /=)
    popularity = np.array([zipf.pmf(i, zipfFactor)
                           for i in range(1, fileNumber + 1)])
    popularity /= popularity.sum()
    shuffle(popularity)

    tests_dir = os.path.expanduser('~')  # for Linux and macOS
    print("tests dir:" + tests_dir)

    if not os.path.exists(tests_dir + "/test_files"):
        os.makedirs(tests_dir + "/test_files")

    fw = open(tests_dir + "/test_files/popularity.txt", "w")
    for item in popularity:
        fw.write("%s\n" % item)
    fw.close()

    # partition numbers; the popularity-based formula from Example #8 is
    # dropped here in favor of hard-coded values
    kVector = [1, 2, 3, 4]
    fw = open(tests_dir + "/test_files/k.txt", "w")
    for k in kVector:
        fw.write("%d\n" % k)
    fw.close()

    # create a sparse file of the given size (the with-block closes it)
    with open(tests_dir + "/test_files/test_local_file%dMB" % fileSize, "wb") as out:
        out.seek((fileSize * 1000 * 1000) - 1)
        out.write(b'\0')
Example #14
    # Fragment of a model-comparison plot: several abundance models are fit
    # to the abundance vector `ab` (`md` is the surrounding project's
    # distribution module; ax, plot_labels, and i come from the enclosing loop).
    x_values = np.array(range(max(ab) + 2)[1:])

    logser_p = md.logser_solver(ab)
    logser_values = md.trunc_logser.pmf(x_values,
                                        logser_p,
                                        upper_bound=float("inf"))
    lsll = md.logser_ll(ab, logser_p)

    nb_n, nb_p = md.nbinom_lower_trunc_solver(ab)
    nb_values = md.nbinom_lower_trunc.pmf(x_values, nb_n, nb_p)
    nbll = md.nbinom_lower_trunc_ll(ab, nb_n, nb_p)

    pln_mu, pln_sigma = md.pln_solver(ab)
    pln_values = md.pln.pmf(x_values, pln_mu, pln_sigma, lower_trunc=True)
    plnll = md.pln_ll(ab, pln_mu, pln_sigma)

    zipf_par = md.zipf_solver(ab)
    zipf_values = zipf.pmf(x_values, zipf_par)
    zll = md.zipf_ll(ab, zipf_par)

    # empirical abundance frequencies
    ab_y = np.zeros(len(x_values) + 1)
    for j in range(len(ab)):
        ab_y[ab[j]] = ab_y[ab[j]] + 1 / len(ab)

    ax.set_xlim([0, min(50, max(x_values))])

    plt.ylabel('frequency')
    plt.xlabel('abundance')
    plt.title(plot_labels[i])

    # Width originally set at 12 when the x-limit was 50; this keeps the
    # same proportional width.
    width = 3 / min(50, max(x_values)) * 50
Example #15
from scipy.stats import zipf


def pmf(x, distribution):
    # thin wrapper; `distribution` is the Zipf shape parameter a
    return zipf.pmf(x, distribution)
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import zipf

x = np.arange(1, 1001)

plt.loglog(x, zipf.pmf(x, 1.07))
plt.show()

plt.plot(x, zipf.pmf(x, 1.07))
plt.show()

for i in [1.07, 2, 3]:
    plt.loglog(x, zipf.pmf(x, i), label=str(i))

plt.legend()
plt.show()
Example #17
# In[1]:

from scipy.stats import zipf
import numpy as np
import matplotlib.pyplot as plt

a = 2
k = 1
# x = np.arange(zipf.ppf(0.01, a),
#               zipf.ppf(0.99, a))

# rv = zipf(a)
# prob = zipf.cdf(x, a)
# np.allclose(x, zipf.ppf(prob, a))
r = zipf.rvs(a, size=10)
pf = zipf.pmf(k, a, loc=0)
# print(r)
#print(pf)
# pmf(k, a, loc=0)
# la = 1-pf

# In[2]:

# np.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0])

from random import choices
files = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
weights = [
    0.6079271018540265, 0.04356365534955261, 0.04356365534955261,
    0.04356365534955261, 0.04356365534955261, 0.04356365534955261,
    0.04356365534955261, 0.04356365534955261, 0.04356365534955261,
    0.04356365534955261,  # assumed final weight (the source was truncated);
]                         # with it, the ten weights sum to 1.0
Example #18
import csv

import matplotlib.pyplot as plt
from scipy.stats import zipf

lst = []

# `records` is assumed to come from an earlier cell: an iterable of
# (word, count) rows.
with open('/Users/lilucy/Desktop/zipfdata.csv', 'w') as csvfile:
    fieldnames = ['word', 'count']
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    for row in records:
        wordtokens = row[0].lower()
        count = int(row[1])  # counts must be numeric to be summed below
        writer.writerow([wordtokens, count])  # presumably each row was meant to be written too
        lst.append((count, wordtokens))

plt.bar([key for val, key in lst], [val for val, key in lst], color='limegreen')
alpha = 1.37065874
total = sum(p for p, c in lst)
# overlay the fitted Zipf pmf, scaled by the total number of tokens
plt.plot(range(len(lst)),
         [zipf.pmf(p, alpha) * total for p in range(1, len(lst) + 1)],
         color='crimson', lw=3)
plt.ylabel("Frequency")
plt.xticks(rotation='vertical')
plt.tight_layout()
plt.show()

    # Assumes the scipy.stats discrete distributions used below (binom, geom,
    # dlaplace, logser, nbinom, planck, poisson, randint, zipf), numpy as np,
    # and Qt are imported at module level.
    def generate_graph_data(self):
        ageGroup = self.tableModel.data[self.selected_item_index.row()][0]
        parameter = self.tableModel.data[self.selected_item_index.row()][1]
        p1 = self.temporaryParametersDict[ageGroup][parameter]["p1"]
        p2 = self.temporaryParametersDict[ageGroup][parameter]["p2"]

        distributionType = self.temporaryParametersDict[ageGroup][parameter][
            "distributionType"]
        xyDict = {"x": [], "y": []}
        try:
            if distributionType == 'Binomial':
                xyDict["x"] = np.arange(binom.ppf(0.01, int(p1), p2 / 100),
                                        binom.ppf(0.99, int(p1), p2 / 100))
                xyDict["y"] = binom.pmf(xyDict["x"], int(p1), p2 / 100)
            elif distributionType == 'Geometric':
                xyDict["x"] = np.arange(geom.ppf(0.01, p1 / 100),
                                        geom.ppf(0.99, p1 / 100))
                xyDict["y"] = geom.pmf(xyDict["x"], p1 / 100)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            elif distributionType == 'Laplacian':
                xyDict["x"] = np.arange(dlaplace.ppf(0.01, p1 / 100),
                                        dlaplace.ppf(0.99, p1 / 100))
                xyDict["y"] = dlaplace.pmf(xyDict["x"], p1 / 100)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            elif distributionType == 'Logarithmic':
                xyDict["x"] = np.arange(logser.ppf(0.01, p1 / 100),
                                        logser.ppf(0.99, p1 / 100))
                xyDict["y"] = logser.pmf(xyDict["x"], p1 / 100)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            elif distributionType == 'Neg. binomial':
                xyDict["x"] = np.arange(nbinom.ppf(0.01, p1, p2 / 100),
                                        nbinom.ppf(0.99, p1, p2 / 100))
                xyDict["y"] = nbinom.pmf(xyDict["x"], p1, p2 / 100)
            elif distributionType == 'Planck':
                xyDict["x"] = np.arange(planck.ppf(0.01, p1 / 100),
                                        planck.ppf(0.99, p1 / 100))
                xyDict["y"] = planck.pmf(xyDict["x"], p1 / 100)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            elif distributionType == 'Poisson':
                xyDict["x"] = np.arange(poisson.ppf(0.01, p1),
                                        poisson.ppf(0.99, p1))
                xyDict["y"] = poisson.pmf(xyDict["x"], p1)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            elif distributionType == 'Uniform':
                if p1 - 0.5 * p2 < 0:
                    p2 = p1
                low = p1 - 0.5 * p2   # avoid shadowing the built-in min/max
                high = p1 + 0.5 * p2
                xyDict["x"] = np.arange(randint.ppf(0.01, low, high),
                                        randint.ppf(0.99, low, high))
                xyDict["y"] = randint.pmf(xyDict["x"], low, high)
            elif distributionType == 'Zipf (Zeta)':
                xyDict["x"] = np.arange(zipf.ppf(0.01, p1), zipf.ppf(0.99, p1))
                xyDict["y"] = zipf.pmf(xyDict["x"], p1)
                if p2 != 0:
                    self.tableModel.setData(
                        self.selected_item_index.sibling(
                            self.selected_item_index.row(), 3), 0, Qt.EditRole)
            self.update_graph(xyDict)
        except Exception as E:
            log.error(E)
import os
import time
from random import shuffle

import numpy as np
from scipy.stats import zipf


def ECTestSetUp(filesize, fileNumber):  # file size in MB
    # settings
    zipfFactor = 1.5

    # generate popularity vector (numpy array: a plain list has no /=)
    popularity = np.array([zipf.pmf(i, zipfFactor)
                           for i in range(1, fileNumber + 1)])
    popularity /= popularity.sum()
    shuffle(popularity)
    tests_dir = os.path.expanduser('~')  # for Linux and macOS
    print("tests dir:" + tests_dir)

    if not os.path.exists(tests_dir + "/ec_test_files"):
        os.makedirs(tests_dir + "/ec_test_files")

    fw = open(tests_dir + "/ec_test_files/popularity.txt", "w")
    for item in popularity:
        fw.write(str(item) + '\n')
    fw.close()

    #filesize = np.random.exponential(1.5, fileNumber)
    #filesize = filesize/min(filesize)*4
    filesize = filesize * 1024 * 1024
    filesizes = [filesize] * fileNumber
    fw = open(tests_dir + "/ec_test_files/fileSize.txt", "w")
    for size in filesizes:
        fw.write(str(int(size)) + '\n')
    fw.close()
    # calculate the partition_number, in the range of [1, machineNumber]
    # kVector = [max(min(int(popularity[id] * 100 * SPFactor), machineNumber), 1) for id in
    #            range(0, fileNumber)]
    kVector = [3] * fileNumber
    fw = open(tests_dir + "/ec_test_files/k.txt", "w")
    for k in kVector:
        fw.write(str(k) + '\n')
    fw.close()

    nVector = [1] * fileNumber
    fw = open(tests_dir + "/ec_test_files/n.txt", "w")
    for n in nVector:
        fw.write(str(n) + '\n')
    fw.close()

    # create the file of given size
    # with open(tests_dir + "/ec_test_files/test_local_file", "w") as out:
    #     out.seek((fileSize * 1000 * 1000) - 1)
    #     out.write('\0')
    # out.close()

    # write the files to Alluxio given the k-values profile
    # (remember to add the path of alluxio)
    start = int(round(time.time() * 1000))  # in millisecond
    os.system('$ALLUXIO_HOME/bin/alluxio runECPrepareFile true')
    end = int(round(time.time() * 1000))
    print('Write %s files takes %s' % (fileNumber, end - start))
Example #21
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import zipf, nbinom

MIN_BIN = 5  # assumed module-level constant: minimum expected count per bin


def chisquare(observations, shape_file, min_prob, maxlength, dist):
    max_length = maxlength if maxlength else max(observations)

    # remove observations larger than the maximal length
    observed = [o for o in observations if o <= max_length]

    # get shape parameters (external helper defined elsewhere in the project)
    shape_values = get_shape_values(shape_file, dist)

    # define results data frame
    results = {"0shape": [], "1chisq": [], "2pvalue": [],
               "3n.observations": [], "4n.bins": [],
               "5n.expected < 5": [], "6n.observed < 5": []}

    if dist == "negbinom":
        results["0shape2"] = []

    for shape in shape_values:

        # calculate expected frequencies
        if dist == "zipf":
            expect_freq = zipf.pmf(range(1, max_length + 1), shape)
        elif dist == "negbinom":
            r, p = shape
            expect_freq = nbinom.pmf(range(1, max_length + 1), r, p)

        try:
            expect_freq = np.array(expect_freq) / sum(expect_freq)
        except ZeroDivisionError:
            print("shape caused zero-division: ", shape)

        # accumulate frequencies up to a minimal probability of min_prob
        acc_freqs = [0]
        bins_lengths = [0]
        for freq in expect_freq:
            if acc_freqs[-1] < min_prob:
                acc_freqs[-1] += freq
                bins_lengths[-1] += 1
            else:
                acc_freqs.append(freq)
                bins_lengths.append(1)

        acc_expected = np.array(acc_freqs) * len(observed)

        # observed histogram (drop the zero-length bin)
        observed_hist = list(np.bincount(observed)[1:])

        # accumulate observations according to the accumulated frequencies
        i = 0
        acc_observed = []
        for length in bins_lengths:
            acc_observed.append(sum(observed_hist[i:i + length]))
            i += length

        try:
            chisq, pval = stats.chisquare(acc_observed, acc_expected)
        except ValueError:
            chisq, pval = -1, -1

        # count how many bins fall below MIN_BIN in expected and observed
        less_obs = sum(i < MIN_BIN for i in acc_observed)
        less_exp = sum(i < MIN_BIN for i in acc_expected)
        if dist == "zipf":
            results["0shape"].append(shape)
        if dist == "negbinom":
            results["0shape"].append(r)
            results["0shape2"].append(p)
        results["1chisq"].append(chisq)
        results["2pvalue"].append(pval)
        results["3n.observations"].append(len(observed))
        results["4n.bins"].append(len(acc_expected))
        results["5n.expected < 5"].append(less_exp)
        results["6n.observed < 5"].append(less_obs)

    return pd.DataFrame(results)
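A hedged usage sketch; get_shape_values is an external helper in the original project, so a stub standing in for it is assumed here:

def get_shape_values(shape_file, dist):  # hypothetical stub
    return [1.1, 1.5, 2.0]

obs = list(zipf.rvs(1.5, size=500))
df = chisquare(obs, shape_file=None, min_prob=0.05, maxlength=50, dist="zipf")
print(df[["0shape", "1chisq", "2pvalue"]])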
Example #22
from scipy.stats import zipf
import numpy as np

fileNumber = 100
zipfFactor = 1.05
popularity = np.array([zipf.pmf(i, zipfFactor)
                       for i in range(1, fileNumber + 1)])
popularity /= popularity.sum()
# popularity = popularity[::-1]

count = list()
size = list()
for pop in popularity:
    # replica count per file, clamped to [1, 30]
    this_count = max(min(int(300 * pop), 30), 1)
    count.append(this_count)
    size.append(100 / this_count)
print(sum(popularity))  # sanity check: ~1.0
print(size)
Example #24
import numpy as np
from scipy.stats import zipf
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate a few first moments:

a = 6.5
mean, var, skew, kurt = zipf.stats(a, moments='mvsk')

# Display the probability mass function (``pmf``):

x = np.arange(zipf.ppf(0.01, a), zipf.ppf(0.99, a))
ax.plot(x, zipf.pmf(x, a), 'bo', ms=8, label='zipf pmf')
ax.vlines(x, 0, zipf.pmf(x, a), colors='b', lw=5, alpha=0.5)

# Alternatively, the distribution object can be called (as a function)
# to fix the shape and location. This returns a "frozen" RV object holding
# the given parameters fixed.

# Freeze the distribution and display the frozen ``pmf``:

rv = zipf(a)
ax.vlines(x,
          0,
          rv.pmf(x),
          colors='k',
          linestyles='-',
          lw=1,
          label='frozen pmf')
ax.legend(loc='best', frameon=False)
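When run as a plain script rather than in an interactive session, one more line is needed to display the figure:

plt.show()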