예제 #1
0
def rateFixed(rdd, rate):
    """Estimate the distinct-element count of ``rdd`` from a fixed-ratio sample.

    A sample is drawn without replacement using ``rate`` as the sampling
    fraction, the distinct values in the sample are counted, and that count
    is scaled up by ``total size / sample size``. The estimate and the error
    reported by ``FalsePositive`` are printed; nothing is returned.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        rate: Sampling fraction passed to ``rdd.sample``.

    Raises:
        ValueError: If the sample turns out empty, so no estimate is possible.
    """
    print('----固定比例抽样----')
    print('固定比例为:%s' % rate)
    all_size = rdd.count()

    # NOTE: collect() brings the whole sample to the driver, so this only
    # works while the sample fits in driver memory. For larger samples the
    # distinct count has to be computed on the cluster instead.
    sample = sorted(rdd.sample(False, rate).collect())
    if not sample:
        raise ValueError(
            'rateFixed: the sample is empty; use a larger rate or more data')

    # After sorting, equal values are adjacent, so every change between
    # neighbours marks one more distinct value.
    cnt = 1 + sum(1 for prev, cur in zip(sample, sample[1:]) if prev != cur)

    # sample() only yields *about* all_size * rate elements, so scale by the
    # number of elements actually drawn rather than by the expected size.
    mult = all_size / len(sample)
    estAll = math.floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)

    print('抽样中集合基数为:%s' % cnt)
    print('由样本估计总体集合基数为:%s' % estAll)
    print('误差率为: %s' % (fp * 100) + '%')
예제 #2
0
def sequential(rdd, error_rate, size):
    """Grow a prefix sample of ``rdd`` until the cardinality estimate is good enough.

    Each round takes the first ``size`` elements, counts the distinct values
    among them, scales the count by ``total size / sample size`` and asks
    ``FalsePositive`` for the resulting error. While the error is above
    ``error_rate`` the sample grows by about 1% of the data. The final
    figures are printed; nothing is returned.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        error_rate: Largest acceptable error, as returned by ``FalsePositive``.
        size: Initial number of elements to take.

    Raises:
        ValueError: If ``rdd`` is empty or ``size`` is not positive.
    """
    all_size = rdd.count()
    if all_size == 0 or size <= 0:
        raise ValueError(
            'sequential: needs a non-empty rdd and a positive initial size')

    # Growth step of ~1% of the data; at least 1 so small inputs still progress.
    step = max(1, round(all_size / 100))

    while True:
        sample = sorted(rdd.take(size))
        # take() returns fewer than `size` items once size exceeds the data,
        # so all arithmetic below uses the real sample length.
        cnt = 1 + sum(1 for prev, cur in zip(sample, sample[1:]) if prev != cur)
        estAll = floor(cnt * (all_size / len(sample)))
        _, new_error = FalsePositive(all_size, estAll)
        # print("当前FP率为:%s"%new_error)

        # Stop when accurate enough, or when the sample already covers the
        # whole dataset and growing it further cannot change the result.
        if new_error <= error_rate or len(sample) >= all_size:
            break
        size += step

    print('抽样中集合基数为:%s' % cnt)
    print('Sequential由样本估计总体集合基数为:%s' %estAll)
    print('Sequential最终误差率为: %s' % (new_error * 100) + '%')
예제 #3
0
 def add(value):
     """Fold one record into the 64-bit sketch and report once the last record is seen.

     ``value[0]`` is the item that gets hashed and ``value[1]`` is compared
     against ``n - 1`` to detect the final record (presumably an index from
     ``zipWithIndex`` -- confirm against the caller). ``hll``, ``n``, ``mmh3``
     and ``FalsePositive`` come from the enclosing scope.
     """
     digest = mmh3.hash64(value[0], signed=False)[1]
     prefix_bits = hll.b
     rest_bits = 64 - prefix_bits
     # The leading b bits of the 64-bit hash select the bucket.
     slot = digest >> rest_bits
     # Leading-zero rank of what remains after dropping those b bits.
     rank = hll.left_most_nbit(digest << prefix_bits, rest_bits)
     hll.bucket[slot] = max(hll.bucket[slot], rank)

     if value[1] != n - 1:
         return

     # Last record: compute the estimate and print it with its error.
     estimate = hll.count()
     _, fp = FalsePositive(n, estimate)
     print()
     print("HLL++基数估计为:%s" % estimate)
     print("FP百分率为:%s" % (fp * 100) + '%')
예제 #4
0
 def add(value):
     """Fold one record into the 32-bit sketch and report once the last record is seen.

     ``value[0]`` is the item that gets hashed and ``value[1]`` is compared
     against ``n - 1`` to detect the final record (presumably an index from
     ``zipWithIndex`` -- confirm against the caller). ``hll``, ``n``, ``mmh3``
     and ``FalsePositive`` come from the enclosing scope.
     """
     digest = mmh3.hash(value[0], signed=False)
     prefix_bits = hll.b
     rest_bits = 32 - prefix_bits
     # The leading b bits of the 32-bit hash select the bucket.
     slot = digest >> rest_bits
     # Leading-zero rank of what remains after dropping those b bits.
     rank = hll.left_most_nbit(digest << prefix_bits, rest_bits)
     hll.bucket[slot] = max(hll.bucket[slot], rank)

     if value[1] != n - 1:
         return

     # Last record: compute the estimate and print it with its error.
     estimate = hll.count()
     _, fp = FalsePositive(n, estimate)
     print()
     print("Hyperloglog基数估计为:%s" % estimate)
     print("FP百分率为:%s" % (fp * 100) + '%')
예제 #5
0
def normal(rdd, size):
    """Estimate the distinct-element count of ``rdd`` from a fixed-size random sample.

    Up to ``size`` elements are drawn without replacement, the distinct
    values among them are counted, and that count is scaled by
    ``total size / sample size``. The estimate and the error reported by
    ``FalsePositive`` are printed; nothing is returned.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        size: Requested number of sampled elements.

    Raises:
        ValueError: If the sample is empty (empty ``rdd`` or ``size`` of 0).
    """
    all_size = rdd.count()
    sample = sorted(rdd.takeSample(False, size))
    if not sample:
        raise ValueError(
            'normal: the sample is empty; need a non-empty rdd and size > 0')

    # After sorting, equal values are adjacent, so every change between
    # neighbours marks one more distinct value.
    cnt = 1 + sum(1 for prev, cur in zip(sample, sample[1:]) if prev != cur)

    # Sampling without replacement cannot return more items than the RDD
    # holds, so scale by the real sample length, not the requested size.
    mult = all_size / len(sample)
    estAll = floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)

    print('抽样中集合基数为:%s' % cnt)
    print('由样本估计总体集合基数为:%s' % estAll)
    print('误差率为: %s' % (fp * 100) + '%')
예제 #6
0
    # Load the comma-separated input and flatten it into one element per token.
    rdd1 = sc.textFile(
        "file:///home/evan/PycharmProjects/BIg_Data_lab/text_3.txt")
    RDD = rdd1.flatMap(lambda x: x.split(','))

    time1 = time.time()
    m = 100000000  # length of the Bloom filter bit vector
    n = RDD.count()  # total number of elements in the data
    # Optimal number of hash functions, k = (m / n) * ln 2, rounded up.
    k = math.ceil((m / n) * math.log(2))
    # Build the Bloom filter.
    pb = PrimalBloom(m, k)
    strRDD = RDD

    # To work on a row range only, e.g. rows 0..99999:
    # stand = RDD.zipWithIndex().filter(lambda x: 0 <= x[1] < 100000).map(lambda x: x[0])

    # NOTE(review): `pb` is captured by the closure handed to filter(); whether
    # all partitions end up sharing one filter depends on how Spark ships the
    # closure to its tasks -- confirm before trusting results on a cluster.
    def exist(x):
        """Return True if ``x`` was not yet in the filter, inserting it; else False."""
        if pb.contains(x):
            return False
        pb.insert(x)
        return True

    newRDD = strRDD.filter(exist)

    # filter() is a lazy transformation; the work only happens inside count(),
    # so the clock must be stopped after the action, not before it.
    bfnum = newRDD.count()
    time2 = time.time()

    allnum, fp = FalsePositive(n, bfnum)
    print("OptimalBloom方法估计集合基数为:%s" % bfnum)
    print("多重集合实际基数为:%s" % allnum)
    # fp is a ratio, so multiply by 100 to print it as a percentage.
    print("FalsePositive占比为:%s" % (fp * 100) + "%")
    print("time is :%s" % (time2 - time1) + 's')