def rateFixed(rdd, rate):
    """Estimate the distinct-element count of ``rdd`` from a fixed-rate sample.

    A sample is drawn with ``rdd.sample(False, rate)``, collected to the
    driver and sorted; its distinct count is scaled by
    ``total size / achieved sample size``. The sample cardinality, the
    scaled estimate and the error reported by ``FalsePositive`` are printed.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        rate: Sampling fraction handed to ``rdd.sample``.

    Raises:
        ValueError: If the drawn sample is empty, so nothing can be estimated.
    """
    print('----固定比例抽样----')
    print('固定比例为:%s' % rate)
    all_size = rdd.count()

    # NOTE: collect() pulls the whole sample onto the driver, so this path is
    # only appropriate when the sample fits in memory. For larger data the
    # original author sketched an RDD-side alternative
    # (sampleRDD.map(lambda x: (x, x)).sortByKey()) that lets Spark spill
    # to memory + disk.
    sortRes = sorted(rdd.sample(False, rate).collect())

    # rdd.sample() does not return exactly floor(all_size * rate) elements,
    # so every later step must use the size that was actually obtained.
    sample_size = len(sortRes)
    if sample_size == 0:
        raise ValueError(
            'sample is empty (count=%s, rate=%s); cannot estimate cardinality'
            % (all_size, rate))
    mult = all_size / sample_size

    # In a sorted list every change between neighbours starts a new value.
    cnt = 1 + sum(
        1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)

    estAll = math.floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)
    print('抽样中集合基数为:%s' % cnt)
    print('由样本估计总体集合基数为:%s' % estAll)
    print('误差率为: %s' % (fp * 100) + '%')
def sequential(rdd, error_rate, size):
    """Grow a sample until the cardinality estimate is accurate enough.

    Starting from ``size`` elements, the first elements of ``rdd`` are taken,
    their distinct count is scaled up to the full data set, and the error
    reported by ``FalsePositive`` is compared with ``error_rate``. While the
    error is too large the sample grows by about 1% of the data set. The
    loop also stops once the whole data set has been consumed, because a
    larger sample is then impossible. The final figures are printed.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        error_rate: Largest acceptable error (same scale as the second value
            returned by ``FalsePositive``).
        size: Initial number of elements to take.

    Raises:
        ValueError: If ``rdd`` is empty or ``size`` is not positive.
    """
    all_size = rdd.count()
    if all_size == 0:
        raise ValueError('cannot estimate the cardinality of an empty RDD')
    if size <= 0:
        raise ValueError('size must be a positive integer, got %r' % (size,))

    # Grow by ~1% of the data per round, but always by at least one element
    # so that small data sets still make progress.
    step = max(1, round(all_size / 100))

    while True:
        # NOTE: take() returns the first elements, not a random sample
        # (the original author left rdd.takeSample(False, size) as an
        # alternative). It may return fewer than `size` elements.
        sortRes = sorted(rdd.take(size))
        sample_size = len(sortRes)
        mult = all_size / sample_size

        # In a sorted list every change between neighbours starts a new value.
        cnt = 1 + sum(
            1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)

        estAll = floor(cnt * mult)
        _, new_error = FalsePositive(all_size, estAll)

        if new_error <= error_rate or sample_size >= all_size:
            break
        size = size + step

    print('抽样中集合基数为:%s' % cnt)
    print('Sequential由样本估计总体集合基数为:%s' % estAll)
    print('Sequential最终误差率为: %s' % (new_error * 100) + '%')
def add(value):
    """Fold one ``(item, index)`` record into the 64-bit HLL++ sketch.

    ``value[0]`` is hashed with ``mmh3.hash64`` (second 64-bit word, unsigned).
    The top ``hll.b`` bits pick a bucket, and the bucket keeps the maximum of
    its current value and the result of ``hll.left_most_nbit`` on the
    remaining bits. When ``value[1]`` equals ``n - 1`` the estimate from
    ``hll.count()`` and the error from ``FalsePositive`` are printed.
    """
    digest = mmh3.hash64(value[0], signed=False)[1]
    prefix_bits = hll.b
    suffix_bits = 64 - prefix_bits

    # The leading b bits of the 64-bit hash select the bucket.
    slot = digest >> suffix_bits
    # Per the original comment: leading-zero rank of what is left once the
    # first b bits are shifted out.
    rank = hll.left_most_nbit(digest << prefix_bits, suffix_bits)

    previous = hll.bucket[slot]
    hll.bucket[slot] = max(previous, rank)

    # Only the record carrying the last index triggers the report.
    if value[1] != n - 1:
        return

    estimate = hll.count()
    _, fp_rate = FalsePositive(n, estimate)
    print()
    print("HLL++基数估计为:%s" % estimate)
    print("FP百分率为:%s" % (fp_rate * 100) + '%')
def add(value):
    """Fold one ``(item, index)`` record into the 32-bit HyperLogLog sketch.

    ``value[0]`` is hashed with ``mmh3.hash`` (unsigned 32-bit). The top
    ``hll.b`` bits pick a bucket, and the bucket keeps the maximum of its
    current value and the result of ``hll.left_most_nbit`` on the remaining
    bits. When ``value[1]`` equals ``n - 1`` the estimate from
    ``hll.count()`` and the error from ``FalsePositive`` are printed.
    """
    digest = mmh3.hash(value[0], signed=False)
    prefix_bits = hll.b
    suffix_bits = 32 - prefix_bits

    # The leading b bits of the 32-bit hash select the bucket.
    slot = digest >> suffix_bits
    # Per the original comment: leading-zero rank of what is left once the
    # first b bits are shifted out.
    rank = hll.left_most_nbit(digest << prefix_bits, suffix_bits)

    previous = hll.bucket[slot]
    hll.bucket[slot] = max(previous, rank)

    # Only the record carrying the last index triggers the report.
    if value[1] != n - 1:
        return

    estimate = hll.count()
    _, fp_rate = FalsePositive(n, estimate)
    print()
    print("Hyperloglog基数估计为:%s" % estimate)
    print("FP百分率为:%s" % (fp_rate * 100) + '%')
def normal(rdd, size):
    """Estimate the distinct-element count of ``rdd`` from a random sample.

    Up to ``size`` elements are drawn without replacement via
    ``rdd.takeSample``, sorted on the driver, and their distinct count is
    scaled by ``total size / achieved sample size``. The sample cardinality,
    the scaled estimate and the error reported by ``FalsePositive`` are
    printed.

    Args:
        rdd: RDD whose elements can be sorted and compared for equality.
        size: Requested number of sampled elements.

    Raises:
        ValueError: If the drawn sample is empty, so nothing can be estimated.
    """
    all_size = rdd.count()
    sortRes = sorted(rdd.takeSample(False, size))

    # Sampling without replacement cannot return more elements than the RDD
    # holds, so work with the size actually obtained instead of `size`.
    sample_size = len(sortRes)
    if sample_size == 0:
        raise ValueError(
            'sample is empty (count=%s, size=%s); cannot estimate cardinality'
            % (all_size, size))
    mult = all_size / sample_size

    # In a sorted list every change between neighbours starts a new value.
    cnt = 1 + sum(
        1 for prev, cur in zip(sortRes, sortRes[1:]) if prev != cur)

    estAll = floor(cnt * mult)
    _, fp = FalsePositive(all_size, estAll)
    print('抽样中集合基数为:%s' % cnt)
    print('由样本估计总体集合基数为:%s' % estAll)
    print('误差率为: %s' % (fp * 100) + '%')
# Bloom-filter based cardinality estimate: every element is tested against
# the filter and only elements not seen before are kept and counted.

# Load the comma-separated input and flatten it to one element per token.
rdd1 = sc.textFile(
    "file:///home/evan/PycharmProjects/BIg_Data_lab/text_3.txt")
RDD = rdd1.flatMap(lambda x: x.split(','))

time1 = time.time()
m = 100000000  # length of the bit vector
n = RDD.count()  # total number of elements in the data set
# Optimal number of hash functions for m bits and n items: (m / n) * ln 2.
k = math.ceil((m / n) * math.log(2))

# Build the Bloom filter.
pb = PrimalBloom(m, k)
strRDD = RDD
# To run on a range of rows only, the original author noted:
# stand = RDD.zipWithIndex().filter(lambda x: 0 <= x[1] < 100000).map(lambda x: x[0])


def exist(x):
    """Return True if ``x`` was not yet in the Bloom filter, inserting it.

    Returns False (and leaves the filter untouched) when the filter already
    reports ``x`` as present.
    """
    if pb.contains(x):
        return False
    pb.insert(x)
    return True


# NOTE(review): `pb` is captured by the closure that Spark ships to its
# workers; presumably each worker mutates its own copy, so de-duplication
# may only hold within a worker - confirm against the deployment mode.
newRDD = strRDD.filter(exist)

# filter() is lazy: the Bloom-filter pass only runs inside count(), so the
# end timestamp has to be taken after it to measure the real work.
bfnum = newRDD.count()
time2 = time.time()

allnum, fp = FalsePositive(n, bfnum)
print("OptimalBloom方法估计集合基数为:%s" % bfnum)
print("多重集合实际基数为:%s" % allnum)
# fp is a fraction; scale by 100 to print a percentage, as the other
# estimators in this file do.
print("FalsePositive占比为:%s" % (fp * 100) + "%")
print("time is :%s" % (time2 - time1) + 's')