/
rappor.py
123 lines (89 loc) · 3.14 KB
/
rappor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# coding: utf-8
# In[87]:
import hashlib
import random

import numpy as np
import pandas as pd
import pymmh3 as mmh3
seed=1
def Mapping1(clientvalue,cohort,num_hashes,bfsize, allunique):
    """Return the bloom-filter bit positions for clientvalue in a cohort.

    Hashes str(cohort)+clientvalue with murmur3 under increasing seeds
    (0, 1, 2, ...) until num_hashes distinct indices in [0, bfsize) have
    been collected, so collisions never shrink the index set.
    allunique is accepted for interface compatibility but not used here.
    """
    key = str(cohort) + clientvalue
    positions = []
    attempt = 0
    while len(positions) < num_hashes:
        idx = mmh3.hash(key, attempt) % bfsize
        if idx not in positions:
            positions.append(idx)
        attempt += 1
    return positions
# In[143]:
def FakeBloomFilter(bloom,f,bfsize):
    """Apply RAPPOR permanent randomized response to a bloom filter.

    Each of the bfsize bits is independently: forced to 0 with probability
    f/2, forced to 1 with probability f/2, or copied from the true bloom
    bit with probability 1-f. Returns a new float array; bloom is untouched.
    """
    noisy = np.zeros(bfsize)
    for bit in range(bfsize):
        # One RNG draw per bit: 1 -> force 0, 2 -> force 1, 3 -> keep truth.
        draw = np.random.choice(np.array([1, 2, 3]), p=[f / 2, f / 2, 1 - f])
        noisy[bit] = 0.0 if draw == 1 else (1.0 if draw == 2 else bloom[bit])
    return noisy
# In[144]:
def ProcessEachString(word,cohort,f,bfsize,no_hashes, allunique):
    """Encode every client word into a randomized RAPPOR report.

    word and cohort are parallel sequences: word[i] is bloom-encoded for
    cohort[i] via Mapping1, then perturbed with FakeBloomFilter. Returns a
    list with one 0/1 int list (the report) per input word.
    """
    reports = []
    for w, c in zip(word, cohort):
        bloom = np.zeros(bfsize)
        for pos in Mapping1(w, c, no_hashes, bfsize, allunique):
            bloom[pos] = 1
        noisy = FakeBloomFilter(bloom, f, bfsize)
        reports.append(noisy.astype(int).tolist())
    return reports
def GetBloomBits(candidatestring,cohort,bfsize,no_hashes, allunique):
    """Return the noiseless bloom-filter vector for a candidate/cohort pair."""
    bits = np.zeros(bfsize)
    for pos in Mapping1(candidatestring, cohort, no_hashes, bfsize, allunique):
        bits[pos] = 1
    return bits
# In[145]:
def mapBloomFilter(clientvalue, icohort, nhashes, bfsize):
    """Bloom-encode clientvalue for cohort icohort as a 0/1 numpy vector.

    Same index derivation as Mapping1 (murmur3 of str(icohort)+clientvalue
    under increasing seeds until nhashes distinct indices are found), but
    returned as a dense length-bfsize vector instead of an index list.
    """
    key = str(icohort) + clientvalue
    chosen = []
    attempt = 0
    while len(chosen) < nhashes:
        pos = mmh3.hash(key, attempt) % bfsize
        if pos not in chosen:
            chosen.append(pos)
        attempt += 1
    vec = np.zeros(bfsize)
    vec[chosen] = 1
    return vec
def mapCohortsBloomFilter(clientvalue,cohort,num_hashes,bfsize):
    """MD5-based bloom index probe of clientvalue across several cohorts.

    For every cohort k, hashes clientvalue+str(cohort[k]) with MD5 and maps
    the first num_hashes hex-digest characters to positions via
    ord(char) % bfsize. Each cohort's index list is printed (as before) and
    the full list of lists is now also returned so callers can use it.

    Fixes: the original referenced hashlib without it ever being imported
    (NameError at runtime); it also returned None, discarding the result.

    NOTE(review): unlike Mapping1/mapBloomFilter this does not de-duplicate
    indices, and ord() of hex characters (48-57, 97-102) is a biased source
    of positions — looks experimental/debug; confirm before relying on it.
    """
    all_inds = []
    for k in range(len(cohort)):
        inptomd = clientvalue + str(cohort[k])
        encoded = hashlib.md5(inptomd.encode('utf-8')).hexdigest()
        inds = [ord(encoded[i]) % bfsize for i in range(num_hashes)]
        print(inds)
        all_inds.append(inds)
    return all_inds
def ProcessDataAndParameters(saminp,no_cohorts,f,bfsize,no_hash, allunique):
    """Simulate RAPPOR reports for a word-frequency table.

    saminp is a DataFrame with 'word' and 'trueFrequency' columns. Each word
    is replicated trueFrequency times, every replica gets a random cohort in
    [1, no_cohorts], and a randomized report is built for it. The numpy RNG
    is seeded with the module-level `seed` for reproducibility, and
    no_cohorts is printed (debug output kept from the original).
    Returns [report_matrix, cohort_array].
    """
    all_reports = []
    np.random.seed(seed)
    assigned = []
    print(no_cohorts)
    for row in range(len(saminp)):
        value = saminp['word'].iloc[row]
        replicas = [value] * int(saminp['trueFrequency'].iloc[row])
        # One cohort draw per replica, in the same RNG order as before.
        draws = np.array([np.random.randint(1, no_cohorts + 1) for _ in range(len(replicas))])
        assigned.extend(draws)
        all_reports.extend(ProcessEachString(replicas, draws, f, bfsize, no_hash, allunique))
    return [all_reports, np.array(assigned)]
# In[146]:
# infile = pd.read_csv('smallcorpus.csv')
# client = infile.sample(frac=0.1)
# print(client)
# print(ProcessDataAndParameters(saminp,4,0.3,32,2))