"""Smoke test: dot product of two CuckooVectors built from n-grams of random sparse vectors."""

from cuckoovec import CuckooVector
import numpy as np
import string
from ngrams import *

# Vocabulary of single-letter "words": uppercase letters followed by lowercase.
vocab = list(string.ascii_uppercase + string.ascii_lowercase)

n = 5  # number of sparse vectors per sequence
t = 2  # words drawn (with possible repeats) per sparse vector


def randsparse(t):
    """Return a dict mapping randomly chosen vocab words to random normal weights.

    ``t`` words are drawn with ``np.random.choice`` and paired with ``t``
    samples from ``np.random.randn``.  A word drawn more than once keeps only
    its last weight, so the result has at most ``t`` entries.
    """
    words = np.random.choice(vocab, t)
    weights = np.random.randn(t)
    return dict(zip(words, weights))


# Two independent sequences of n random sparse vectors each.
s1 = [randsparse(t) for _ in range(n)]
s2 = [randsparse(t) for _ in range(n)]

t1 = ngrams(s1)
print(t1)
t2 = ngrams(s2)

v1 = CuckooVector(t1)
v2 = CuckooVector(t2)

# These should be close to orthogonal.
print(v1.dot(v2))
# Compare CuckooVector arithmetic against dense NumPy vectors built from the
# same sparse data.
#
# NOTE(review): `randsparse(d, t)` and `densify(m, d)` are not defined in the
# visible part of this file; the only visible `randsparse` takes a single
# argument.  Confirm which definitions are meant to be in scope here.

d = 10000000  # high dimension
t = 500000  # number of actual features

# Sparse inputs, their dense counterparts, and their CuckooVector counterparts.
m1 = randsparse(d, t)
m2 = randsparse(d, t)

v1 = densify(m1, d)
v2 = densify(m2, d)

cv1 = CuckooVector(m1)
cv2 = CuckooVector(m2)

# Each dense norm is printed right before the matching CuckooVector norm.
print("2-norms: ")
for dense, sparse in ((v1, cv1), (v2, cv2)):
    print(np.linalg.norm(dense))
    print(sparse.norm(2))

print("dots: ")
print(np.dot(v1, v2))
print(cv1.dot(cv2))

# Sum both representations; `cv1.add` is called for its effect on cv1.
v1 = v1 + v2
cv1.add(cv2)

print("2-norm of sums: ")
print(np.linalg.norm(v1))
print(cv1.norm(2))

# Accumulate |dense - cuckoo| over every key present in either sparse input.
dist = 0
for k in set().union(m1, m2):
    dist += abs(v1[int(k)] - cv1[k])

print("1-distance between sums: ")
print(dist)