Python FalsePositive 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: calcFalsePositive

클래스/타입: FalsePositive

hotexamples.com에서의 예제들: 6

Python FalsePositive - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 calcFalsePositive.FalsePositive에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

FalsePositive(1)

자주 사용되는 메소드들

FalsePositive (1)

예제 #1

파일 보기

파일: Sampling.py 프로젝트: 654984799/study3

def rateFixed(rdd,rate):
    """Estimate the distinct-element count of ``rdd`` from a fixed-rate sample.

    A sample is drawn with ``rdd.sample(False, rate)`` and collected, the
    distinct values in it are counted, and that count is scaled by
    (population size / sample size). The sample cardinality, the scaled
    estimate and the error value returned by ``FalsePositive`` are printed.

    Args:
        rdd: Object providing ``count()`` and ``sample(withReplacement,
            fraction)``; the sampled elements are sorted, so they must be
            mutually orderable.
        rate: Sampling fraction forwarded to ``rdd.sample``.

    Raises:
        ValueError: If the sample is empty, so no estimate can be made.
    """
    print('----固定比例抽样----')
    print('固定比例为：%s' % rate)
    all_size = rdd.count()

    # NOTE: collecting is only appropriate when the sample fits in memory.
    # For larger samples the sort has to stay inside the RDD instead, e.g.
    # sampleRDD.map(lambda x: (x, x)).sortByKey().
    sortRes = sorted(rdd.sample(False, rate).collect())
    if not sortRes:
        raise ValueError(
            'sample is empty (rate=%s, population=%s); cannot estimate '
            'cardinality' % (rate, all_size))

    # rdd.sample() yields only approximately rate * count elements, so both
    # the scan and the scale factor use the size actually obtained.
    mult = all_size / len(sortRes)

    # Equal values are adjacent after sorting: count the value changes.
    cnt = 1
    temp = sortRes[0]
    for item in sortRes[1:]:
        if item != temp:
            cnt = cnt + 1
            temp = item

    estAll = math.floor(cnt * mult)
    # Only the second element of the returned pair is reported here.
    _, fp = FalsePositive(all_size, estAll)

    print('抽样中集合基数为：%s' % cnt)
    print('由样本估计总体集合基数为：%s' % estAll)
    print('误差率为： %s' % (fp * 100) + '%')

예제 #2

파일 보기

def sequential(rdd,error_rate,size):
    """Grow a prefix sample of ``rdd`` until the estimate error is acceptable.

    Each round takes the first ``size`` elements with ``rdd.take``, counts
    the distinct values, scales that count by (population size / sample
    size) and obtains an error value from ``FalsePositive``. The sample
    grows by about 1% of the population per round until the error is no
    greater than ``error_rate`` or the whole dataset has been examined.
    The final sample cardinality, estimate and error are printed.

    Args:
        rdd: Object providing ``count()`` and ``take(num)``; the taken
            elements are sorted, so they must be mutually orderable.
        error_rate: Largest acceptable error value (same scale as the
            second element returned by ``FalsePositive``).
        size: Initial number of elements to take.

    Raises:
        ValueError: If a round yields no elements (empty data or
            non-positive ``size``).
    """
    all_size = rdd.count()
    # Grow by roughly 1% of the data per round, but always by at least one
    # element so that small inputs cannot stall the loop.
    step = max(1, round(all_size / 100))

    while True:
        sortRes = sorted(rdd.take(size))
        if not sortRes:
            raise ValueError(
                'sample is empty (size=%s, population=%s); cannot estimate '
                'cardinality' % (size, all_size))

        # take() may return fewer than ``size`` elements, so use the
        # number actually obtained for both the scan and the scaling.
        taken = len(sortRes)
        mult = all_size / taken

        # Equal values are adjacent after sorting: count the value changes.
        cnt = 1
        temp = sortRes[0]
        for item in sortRes[1:]:
            if item != temp:
                cnt = cnt + 1
                temp = item

        estAll = floor(cnt * mult)
        # Only the second element of the returned pair is used.
        _, fp = FalsePositive(all_size, estAll)
        new_error = fp

        # Stop on success, or when every element has already been seen and
        # a larger request could not change the result.
        if new_error <= error_rate or taken < size or taken >= all_size:
            break
        size = size + step

    print('抽样中集合基数为：%s' % cnt)
    print('Sequential由样本估计总体集合基数为：%s' %estAll)
    print('Sequential最终误差率为： %s' % (new_error * 100) + '%')

예제 #3

파일 보기

파일: HLLplusplus.py 프로젝트: 654984799/study3

 def add(value):
     """Fold one ``(item, index)`` pair into the enclosing ``hll`` sketch.

     ``value[0]`` is hashed and used to update one bucket; when
     ``value[1]`` equals ``n - 1`` the current estimate and the error value
     from ``FalsePositive`` are printed. ``hll``, ``n``, ``mmh3`` and
     ``FalsePositive`` come from the surrounding scope.
     """
     # Second component of the unsigned 64-bit hash pair.
     hashed = mmh3.hash64(value[0], signed=False)[1]
     tail_width = 64 - hll.b
     # The leading hll.b bits of the 64-bit hash select the bucket.
     slot = hashed >> tail_width
     # Rank of what is left once the leading hll.b bits are shifted out
     # (presumably the leading-zero run; see hll.left_most_nbit).
     rank = hll.left_most_nbit(hashed << hll.b, tail_width)
     current = hll.bucket[slot]
     hll.bucket[slot] = max(current, rank)

     # Only the element carrying the last index triggers the report.
     if value[1] != n - 1:
         return
     estimate = hll.count()
     _, error = FalsePositive(n, estimate)
     print()
     print("HLL++基数估计为：%s" % estimate)
     print("FP百分率为：%s" % (error * 100) + '%')

예제 #4

파일 보기

파일: Hyperloglog.py 프로젝트: 654984799/study3

 def add(value):
     """Fold one ``(item, index)`` pair into the enclosing ``hll`` sketch.

     ``value[0]`` is hashed and used to update one bucket; when
     ``value[1]`` equals ``n - 1`` the current estimate and the error value
     from ``FalsePositive`` are printed. ``hll``, ``n``, ``mmh3`` and
     ``FalsePositive`` come from the surrounding scope.
     """
     hashed = mmh3.hash(value[0], signed=False)
     tail_width = 32 - hll.b
     # The leading hll.b bits of the 32-bit hash select the bucket.
     slot = hashed >> tail_width
     # Rank of what is left once the leading hll.b bits are shifted out
     # (presumably the leading-zero run; see hll.left_most_nbit).
     rank = hll.left_most_nbit(hashed << hll.b, tail_width)
     current = hll.bucket[slot]
     hll.bucket[slot] = max(current, rank)

     # Only the element carrying the last index triggers the report.
     if value[1] != n - 1:
         return
     estimate = hll.count()
     _, error = FalsePositive(n, estimate)
     print()
     print("Hyperloglog基数估计为：%s" % estimate)
     print("FP百分率为：%s" % (error * 100) + '%')

예제 #5

파일 보기

def normal(rdd, size):
    """Estimate the distinct-element count of ``rdd`` from a random sample.

    Draws up to ``size`` elements without replacement via
    ``rdd.takeSample(False, size)``, counts the distinct values among them
    and scales that count by (population size / sample size). The sample
    cardinality, the scaled estimate and the error value returned by
    ``FalsePositive`` are printed.

    Args:
        rdd: Object providing ``count()`` and ``takeSample(withReplacement,
            num)``; the sampled elements are sorted, so they must be
            mutually orderable.
        size: Requested number of elements to sample.

    Raises:
        ValueError: If the sample is empty (empty data or non-positive
            ``size``).
    """
    all_size = rdd.count()
    sortRes = sorted(rdd.takeSample(False, size))
    if not sortRes:
        raise ValueError(
            'sample is empty (size=%s, population=%s); cannot estimate '
            'cardinality' % (size, all_size))

    # Sampling without replacement cannot return more elements than exist,
    # so scale by the number actually drawn rather than the number asked for.
    mult = all_size / len(sortRes)

    # Equal values are adjacent after sorting: count the value changes.
    cnt = 1
    temp = sortRes[0]
    for item in sortRes[1:]:
        if item != temp:
            cnt = cnt + 1
            temp = item

    estAll = floor(cnt * mult)
    # Only the second element of the returned pair is reported here.
    _, fp = FalsePositive(all_size, estAll)
    print('抽样中集合基数为：%s' % cnt)
    print('由样本估计总体集合基数为：%s' % estAll)
    print('误差率为： %s' % (fp * 100) + '%')

예제 #6

파일 보기

    # Estimate the number of distinct comma-separated tokens in a text file
    # by keeping only the tokens a Bloom filter has not seen before.
    rdd1 = sc.textFile(
        "file:///home/evan/PycharmProjects/BIg_Data_lab/text_3.txt")
    RDD = rdd1.flatMap(lambda x: x.split(','))

    time1 = time.time()
    m = 100000000  # length of the bit vector
    n = RDD.count()  # total number of elements
    # Optimal number of hash functions, k = (m / n) * ln 2, rounded up.
    k = math.ceil((m / n) * math.log(2))
    # Build the Bloom filter.
    pb = PrimalBloom(m, k)
    strRDD = RDD

    # To restrict the run to a range of elements instead:
    # stand = RDD.zipWithIndex().filter(lambda x: 0 <= x[1] < 100000).map(lambda x: x[0])

    def exist(x):
        """Return True and record ``x`` if the filter has not seen it yet."""
        # NOTE(review): ``pb`` is mutated inside a Spark closure; this
        # presumably relies on a single local worker sharing the filter -
        # confirm before running on a cluster.
        if pb.contains(x):
            return False
        pb.insert(x)
        return True

    newRDD = strRDD.filter(exist)

    # filter() is lazy: the Bloom-filter pass only runs inside count(), so
    # the clock is stopped after the action rather than before it.
    bfnum = newRDD.count()
    time2 = time.time()

    allnum, fp = FalsePositive(n, bfnum)
    print("OptimalBloom方法估计集合基数为：%s" % bfnum)
    print("多重集合实际基数为：%s" % allnum)
    # Scale the ratio by 100 so that it matches the trailing percent sign.
    print("FalsePositive占比为：%s" % (fp * 100) + "%")
    print("time is :%s" % (time2 - time1) + 's')