forked from andrewclegg/sketchy
/
sketchy.py
91 lines (75 loc) · 2.91 KB
/
sketchy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Tools for locality-sensitive hashing.
#
# https://github.com/andrewclegg/sketchy
#
# Tested on Jython 2.5.2, should work on cpython 2.5+ except for the
# Hamming distance methods which need bin() from 2.6. Feel free to
# add your own retro versions. Haven't tested it on Python 3 yet.
import random
import sys
from array import array # Consider some sort of bitfield instead?
# Kludge around the function decorator that Pig injects, with a dummy.
# Only defined when nothing has already put 'outputSchema' into this module's
# globals, so an injected decorator always takes precedence.
if 'outputSchema' not in globals():
    def outputSchema(x):
        """Stand-in decorator factory: ignore the schema 'x' and return a
        decorator that hands the decorated function back unchanged."""
        # A plain nested function instead of 'lambda(y): y', which is
        # Python-2-only syntax and a SyntaxError on Python 3.
        def passthrough(func):
            return func
        return passthrough
# Per-process cache of the random planes. Starts out empty (None) and is
# filled in by sparse_random_projection() the first time it is called.
planes = None
""" Create 'size' planes in a 'dim'-dimensional space, with a new random
number generator using 'seed'. """
def make_planes(size, dim, seed):
    """Create 'size' random planes in a 'dim'-dimensional space.

    Each plane is an array('b') holding 'dim' entries, every entry being
    either -1 or 1. The entries are drawn from a private random number
    generator seeded with 'seed', so the same arguments always produce the
    same planes and the state of the shared module-level 'random' generator
    is left untouched.

    Returns a list of 'size' arrays.
    """
    # A dedicated generator yields the same sequence that random.seed(seed)
    # followed by random.choice() would, without reseeding global state.
    rng = random.Random(seed)
    signs = (-1, 1)
    result = []
    for _plane in range(size):
        result.append(array('b', (rng.choice(signs) for _axis in range(dim))))
    return result
""" Calculate cosine similarity of two sparse vectors. """
def sparse_cos_sim(sv1, sv2):
    """Return the cosine similarity of the sparse vectors 'sv1' and 'sv2'.

    The result is the sparse dot product divided by the product of the two
    magnitudes. When that product is zero the function returns 0 rather
    than dividing.
    """
    norm_product = sparse_magnitude(sv1) * sparse_magnitude(sv2)
    if norm_product != 0:
        return sparse_dot_product(sv1, sv2) / norm_product
    # At least one magnitude is zero (or the product underflowed to zero).
    return 0
""" Calculate dot product of two sparse vectors. """
def sparse_dot_product(sv1, sv2):
    """Return the dot product of two sparse vectors.

    Each argument is anything dict() accepts, typically an iterable of
    (dimension, value) pairs. If a dimension is repeated within one vector,
    the last value given for it wins. Only dimensions present in both
    vectors contribute; the result is 0 when there are none.
    """
    small = dict(sv1)
    large = dict(sv2)
    # Walk the smaller mapping and probe the larger one, instead of building
    # two key lists and two sets just to intersect them.
    if len(large) < len(small):
        small, large = large, small
    return sum(small[key] * large[key] for key in small if key in large)
""" Calculate magnitude of a sparse vector. """
def sparse_magnitude(sv):
    """Return the Euclidean length of the sparse vector 'sv'.

    'sv' is iterated as (dimension, value) pairs; only the values are used.
    """
    squares = 0
    for _dimension, value in sv:
        squares += value ** 2
    return squares ** 0.5
""" Calculate dot product of a sparse vector 'sv' against a dense vector 'dv'.
The sparse vector format is described below. No bounds checking is done,
so make sure it doesn't exceed the size of 'dv'. """
def mixed_dot_product(sv, dv):
    """Return the dot product of sparse vector 'sv' and dense vector 'dv'.

    'sv' is iterated as (index, value) pairs and each value is multiplied by
    the entry of 'dv' at that index. The indices are not validated here, so
    the caller must keep them within the bounds of 'dv'.
    """
    return sum(weight * dv[position] for (position, weight) in sv)
""" Calculates the Random Projection hash for a sparse vector 'sv' against a
set of random planes defined by the other variables, using one bit for each
plane. The vector should be provided as a bag of (dimension, value) tuples.
Only numeric values are supported, so you need to map words, categories etc.
yourself first. """
# The (size, dim, seed) that the cached planes were generated with; None until
# sparse_random_projection() has generated planes itself.
_planes_params = None


@outputSchema('lsh:int') # changed signature from long to int - Pig was choking on the cast...
def sparse_random_projection(sv, size, dim, seed):
    """Return the random-projection hash of the sparse vector 'sv'.

    'sv' is a collection of (dimension, value) tuples with numeric values.
    'size', 'dim' and 'seed' are passed to make_planes() to build the random
    planes. One bit is produced per plane: bit i of the result is set when
    the dot product of 'sv' with plane i is strictly positive.

    The planes are cached in the module-level 'planes' variable and reused
    across calls. They are rebuilt when a call supplies different
    'size'/'dim'/'seed' from the ones the cache was generated with, so a
    later call can no longer be hashed against stale planes. Planes assigned
    to 'planes' from outside this function are used as they are.

    NOTE(review): the schema declares an int; a hash using more planes than
    the receiving integer type has bits will not fit -- confirm the intended
    maximum 'size' against the Pig script.
    """
    global planes, _planes_params
    requested = (size, dim, seed)
    stale = _planes_params is not None and _planes_params != requested
    # Create the planes if they don't already exist in this process, or if
    # they were generated here for a different configuration.
    if planes is None or stale:
        planes = make_planes(size, dim, seed)
        _planes_params = requested
    lsh = 0
    for bit, plane in enumerate(planes):
        if mixed_dot_product(sv, plane) > 0:
            lsh |= 1 << bit
    return lsh
# Hamming distance helpers: hammingN(a, b) counts the bit positions in which
# two N-bit values differ. Both branches export the same four names.
if 'Java' in sys.version:
    # Jython: let the JVM count the bits.
    import java.lang.Integer as Integer
    import java.lang.Long as Long

    def hamming8(i1, i2):
        """Count the differing bits among the low 8 bits of i1 and i2."""
        return Integer.bitCount((i1 ^ i2) & 0xFF)

    def hamming16(i1, i2):
        """Count the differing bits among the low 16 bits of i1 and i2."""
        return Integer.bitCount((i1 ^ i2) & 0xFFFF)

    def hamming32(i1, i2):
        """Count the differing bits of two 32-bit integers."""
        return Integer.bitCount(i1^i2)

    def hamming64(l1, l2):
        """Count the differing bits of two 64-bit integers."""
        return Long.bitCount(l1^l2)
else:
    # CPython (needs bin(), i.e. 2.6+). The XOR is masked to the named width
    # first: bin() renders a negative number as '-0b...', so without the mask
    # a negative XOR result (e.g. -1 ^ 0) would be counted as a single bit
    # rather than as its two's-complement bit pattern.
    def hamming8(i1, i2):
        """Count the differing bits among the low 8 bits of i1 and i2."""
        return bin((i1 ^ i2) & 0xFF).count('1')

    def hamming16(i1, i2):
        """Count the differing bits among the low 16 bits of i1 and i2."""
        return bin((i1 ^ i2) & 0xFFFF).count('1')

    def hamming32(i1, i2):
        """Count the differing bits among the low 32 bits of i1 and i2."""
        return bin((i1 ^ i2) & 0xFFFFFFFF).count('1')

    def hamming64(l1, l2):
        """Count the differing bits among the low 64 bits of l1 and l2."""
        return bin((l1 ^ l2) & 0xFFFFFFFFFFFFFFFF).count('1')