Python FalsePositive Examples

Programming Language: Python

Namespace/Package Name: calcFalsePositive

Class/Type: FalsePositive

Examples at hotexamples.com: 6

Python FalsePositive - 6 examples found. These are the top-rated real-world Python examples of calcFalsePositive.FalsePositive, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

FalsePositive(1)

Frequently Used Methods

FalsePositive (1)

Example #1

Show file

File: Sampling.py Project: 654984799/study3

def rateFixed(rdd, rate):
    """Estimate the distinct-element count of ``rdd`` from a fixed-rate sample.

    A sample is drawn with ``rdd.sample(False, rate)``, collected to the
    driver, and its distinct values are counted. That count is scaled by
    ``all_size / sample_size`` to estimate the cardinality of the whole RDD,
    and the result is passed to ``FalsePositive`` to obtain the error rate
    that is printed.

    Args:
        rdd: RDD whose elements are mutually orderable (they are sorted).
        rate: Sampling fraction handed to ``rdd.sample``.

    Raises:
        ValueError: If the drawn sample is empty, so no estimate can be made.
    """
    print('----固定比例抽样----')
    print('固定比例为：%s' % rate)
    all_size = rdd.count()

    # NOTE: the sample is collected to the driver, so this only works while
    # the sample fits in memory. For larger data, sort and de-duplicate on
    # the RDD itself instead of calling collect().
    sortRes = sorted(rdd.sample(False, rate).collect())

    # rdd.sample() does not return exactly all_size * rate elements, so the
    # real sample length must drive both the scan and the scaling factor.
    size = len(sortRes)
    if size == 0:
        raise ValueError(
            'rateFixed() drew an empty sample; use a larger rate or a '
            'non-empty RDD')
    mult = all_size / size

    # In sorted order, each change between neighbours marks a new distinct value.
    cnt = 1 + sum(1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)
    estAll = math.floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)

    print('抽样中集合基数为：%s' % cnt)
    print('由样本估计总体集合基数为：%s' % estAll)
    print('误差率为： %s' % (fp * 100) + '%')

Example #2

Show file

def sequential(rdd, error_rate, size):
    """Grow a prefix sample of ``rdd`` until the estimate error is small enough.

    Each round takes the first ``size`` elements with ``rdd.take``, counts the
    distinct values among them, scales that count to the full RDD size and
    asks ``FalsePositive`` for the resulting error. The sample grows by about
    1% of the RDD per round until the error is no greater than ``error_rate``
    or the whole RDD has been taken.

    Args:
        rdd: RDD whose elements are mutually orderable (they are sorted).
        error_rate: Error threshold at or below which the loop stops.
        size: Initial number of elements to take.

    Raises:
        ValueError: If a round yields no elements (empty RDD or ``size <= 0``).
    """
    all_size = rdd.count()
    new_error = 1.0
    cnt = 0
    estAll = 0  # stays defined for the final prints even if the loop never runs

    # Step of ~1% of the data, but at least 1 so that every round makes progress.
    n = max(1, round(all_size / 100))
    while new_error > error_rate:
        sortRes = sorted(rdd.take(size))
        # take() returns fewer than `size` items once size exceeds the RDD
        # length, so use the number of elements actually obtained.
        taken = len(sortRes)
        if taken == 0:
            raise ValueError(
                'sequential() got an empty sample: rdd is empty or size <= 0')
        mult = all_size / taken

        # In sorted order, each change between neighbours marks a new distinct value.
        cnt = 1 + sum(
            1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)
        estAll = floor(cnt * mult)
        _, fp = FalsePositive(all_size, estAll)
        new_error = fp

        if taken >= all_size:
            # The whole RDD has been examined; a larger sample cannot help.
            break
        size = size + n
    print('抽样中集合基数为：%s' % cnt)
    print('Sequential由样本估计总体集合基数为：%s' %estAll)
    print('Sequential最终误差率为： %s' % (new_error * 100) + '%')

Example #3

Show file

File: HLLplusplus.py Project: 654984799/study3

 def add(value):
     """Feed one ``(item, index)`` pair into the enclosing ``hll`` sketch.

     ``value[0]`` is hashed with ``mmh3.hash64`` and the second 64-bit word is
     used. When ``value[1]`` equals ``n - 1`` the current estimate and the
     error reported by ``FalsePositive`` are printed.
     """
     hashed = mmh3.hash64(value[0], signed=False)[1]
     tail_bits = 64 - hll.b
     # The top b bits of the 64-bit hash select the bucket.
     bucket_idx = hashed >> tail_bits
     # Rank of the remaining bits once the first b bits are shifted away
     # (per the original note: the leading zeros of what is left).
     rank = hll.left_most_nbit(hashed << hll.b, tail_bits)
     hll.bucket[bucket_idx] = max(hll.bucket[bucket_idx], rank)

     if value[1] != n - 1:
         return

     # Last element seen: report the estimate and its error.
     estimate = hll.count()
     _, fp = FalsePositive(n, estimate)
     print()
     print("HLL++基数估计为：%s" % estimate)
     print("FP百分率为：%s" % (fp * 100) + '%')
Example #4

Show file

File: Hyperloglog.py Project: 654984799/study3

 def add(value):
     """Feed one ``(item, index)`` pair into the enclosing ``hll`` sketch.

     ``value[0]`` is hashed with the 32-bit ``mmh3.hash``. When ``value[1]``
     equals ``n - 1`` the current estimate and the error reported by
     ``FalsePositive`` are printed.
     """
     hashed = mmh3.hash(value[0], signed=False)
     tail_bits = 32 - hll.b
     # The top b bits of the 32-bit hash select the bucket.
     bucket_idx = hashed >> tail_bits
     # Rank of the remaining bits once the first b bits are shifted away
     # (per the original note: the leading zeros of what is left).
     rank = hll.left_most_nbit(hashed << hll.b, tail_bits)
     hll.bucket[bucket_idx] = max(hll.bucket[bucket_idx], rank)

     if value[1] != n - 1:
         return

     # Last element seen: report the estimate and its error.
     estimate = hll.count()
     _, fp = FalsePositive(n, estimate)
     print()
     print("Hyperloglog基数估计为：%s"%estimate)
     print("FP百分率为：%s"%(fp*100)+'%')

Example #5

Show file

def normal(rdd, size):
    """Estimate the distinct-element count of ``rdd`` from one random sample.

    Up to ``size`` elements are drawn without replacement via
    ``rdd.takeSample``, the distinct values among them are counted, and that
    count is scaled by ``all_size / sample_size``. The estimate is passed to
    ``FalsePositive`` and the resulting error rate is printed.

    Args:
        rdd: RDD whose elements are mutually orderable (they are sorted).
        size: Requested number of sampled elements.

    Raises:
        ValueError: If no elements were sampled (empty RDD or ``size <= 0``).
    """
    all_size = rdd.count()
    sortRes = sorted(rdd.takeSample(False, size))

    # Sampling without replacement cannot return more than all_size items,
    # so scale and scan by the number of elements actually obtained.
    taken = len(sortRes)
    if taken == 0:
        raise ValueError(
            'normal() got an empty sample: rdd is empty or size <= 0')
    mult = all_size / taken

    # In sorted order, each change between neighbours marks a new distinct value.
    cnt = 1 + sum(1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)
    estAll = floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)
    print('抽样中集合基数为：%s' % cnt)
    print('由样本估计总体集合基数为：%s' % estAll)
    print('误差率为： %s' % (fp * 100) + '%')

Example #6

Show file

    # Load the comma-separated input and flatten it into one element per token.
    rdd1 = sc.textFile(
        "file:///home/evan/PycharmProjects/BIg_Data_lab/text_3.txt")
    RDD = rdd1.flatMap(lambda x: x.split(','))

    time1 = time.time()
    m = 100000000  # length of the bit vector
    n = RDD.count()  # total number of elements
    k = math.ceil((m / n) * math.log(2))  # optimal hash count, (m / n) * ln 2
    # Build the Bloom filter.
    pb = PrimalBloom(m, k)
    strRDD = RDD

    # NOTE(review): `pb` is captured by the closure below and mutated inside
    # a Spark filter; presumably this relies on a single local executor, as
    # tasks do not share driver-side state. Confirm against the deployment.
    def exist(x):
        """Insert ``x`` if the filter has not seen it; return True only then."""
        if pb.contains(x):
            return False
        pb.insert(x)
        return True

    newRDD = strRDD.filter(exist)

    # filter() is lazy: the work happens in count(), so the end timestamp
    # must be taken after the action for the filtering to be measured.
    bfnum = newRDD.count()
    time2 = time.time()

    allnum, fp = FalsePositive(n, bfnum)
    print("OptimalBloom方法估计集合基数为：%s" % bfnum)
    print("多重集合实际基数为：%s" % allnum)
    # The ratio is shown as a percentage, hence the factor of 100.
    print("FalsePositive占比为：%s" % (fp * 100) + "%")
    print("time is :%s" % (time2 - time1) + 's')