Example #1

import numpy as np

# Assumed imports from the surrounding package (module paths are not shown
# in the original source and are placeholders):
# from klsh_base import KLSHBase
# from hashing import Hashing
# from buckets import KlshBucket
# from storage import PickleStorage

class KLSH(KLSHBase):
    '''
    >>> from klsh import KLSH
    >>> klsh = KLSH(
    ...     b=b,    # number of hash bits
    ...     t=t,    # number of
    ... )
    '''
    def __init__(self, b, t, **kw):
        super(KLSH, self).__init__(b=b, t=t, **kw)
        #self._check_parameters()

        self.hash = Hashing(b=b, t=t)
        self.bucket = KlshBucket()
        self.storage = PickleStorage()

    def loadDataSet(self, filename, delim=','):
        self.dataMat = np.loadtxt(filename, delimiter=delim)

    def preprocessing(self):
        dataMat = self.dataMat
        numOfData = dataMat[:, 0].size  # number of samples (rows)
        self.insert_matrix(dataMat, numOfData)
        self.store_Wmat()
        self.bucket.store_buckets()

    def insert_matrix(self, matrix, num):
        KerMat = self.hash.kernelMatrix(matrix)   # kernel matrix over the data
        CenterMat = self.hash.center(KerMat)      # center the kernel matrix
        (HashTable, self.W) = self.hash.creatHashTable(CenterMat, self.b,
                                                       self.t)
        for i in xrange(num):
            self.bucket.insert_buckets(matrix[i, :], HashTable[i, :])

    def knn(self, vector, knum, stored=False):
        if not stored:
            hashed_array = self.hash.do_hashing(
                vector, self.dataMat,
                self.W)  # vector must be the first argument
            knn_vectors = self.bucket.select_knn(knum, hashed_array)
        else:
            self.W = self.load_Wmat()
            self.bucket.load_buckets()
            hashed_array = self.hash.do_hashing(vector, self.dataMat, self.W)
            knn_vectors = self.bucket.select_knn(knum, hashed_array)

        return knn_vectors

    def store_Wmat(self):
        fw = open('Wmat.data', 'wb')
        self.storage.save(self.W, fw)
        fw.close()

    def load_Wmat(self):
        fr = open('Wmat.data', 'rb')  # pickled in binary mode, so read as binary
        W = self.storage.load(fr)
        fr.close()
        return W
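
A minimal end-to-end sketch of the class above, assuming the package is importable as in its docstring; the data file name and the parameter values here are placeholders, not from the original source:

from klsh import KLSH            # import path taken from the class docstring

klsh = KLSH(b=8, t=30)           # placeholder values; b is the number of hash bits
klsh.loadDataSet('data.txt')     # placeholder file: comma-delimited, one sample per row
klsh.preprocessing()             # hash every row, persist W and the buckets

query = klsh.dataMat[0, :]
print(klsh.knn(query, knum=3))               # query against the in-memory state
print(klsh.knn(query, knum=3, stored=True))  # reload W and the buckets from disk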
Example #2
class LshBucket(object):
    """Data bucket class

    >>> from buckets import LshBucket
    >>> bucket = LshBucket()
    >>> bucket.insert(vector, hashed_array)
    >>> bucket.select(vector, hashed_array)
    """

    def __init__(self, **kw):
        self.L = 0

        for key, value in kw.iteritems():
            setattr(self, key, value)

        self.data = [{} for i in xrange(self.L)]   # L hash tables mapping hash key -> list of vectors
        self.index = [[] for i in xrange(self.L)]  # L sorted lists of the hash keys present in each table
        self.storage = PickleStorage()


    def insert(self, vector, hashed_array):
        assert len(self.data) == self.L
        assert len(self.index) == self.L
        assert len(hashed_array) == self.L
        # assert <test>, <message>: raises AssertionError if <test> is false

        L = self.L

        for i in xrange(L):
            self._putInBucket(i, vector, hashed_array[i])

    def store_buckets(self):
        fw1 = open("buckets.data", 'wb')
        self.storage.save(self.data, fw1)
        fw1.close()

        fw2 = open("buckets_index.data", 'wb')
        self.storage.save(self.index, fw2)
        fw2.close()

    def load_buckets(self):
        fr1 = open('buckets.data', 'rb')  # pickled in binary mode, so read as binary
        self.data = self.storage.load(fr1)
        fr1.close()

        fr2 = open('buckets_index.data', 'rb')
        self.index = self.storage.load(fr2)
        fr2.close()

    def select(self, query_vector, hashed_array, without_itself=False):
        assert len(self.data) == self.L
        assert len(hashed_array) == self.L

        query_vector_tuple = tuple(query_vector)
        L = self.L
        result = []
        seen = {}

        for i in xrange(L):
            hashed = "".join(map(str, hashed_array[i]))
            data = self.data[i]
            vectors = data.get(hashed, [])  # vectors stored under this hash key

            for vector in vectors:
                key = tuple(vector)
                if key in seen:
                    continue
                if key == query_vector_tuple and without_itself:  # optionally skip the query itself
                    continue

                seen[key] = True
                result.append(vector)

            if len(result) >= L * 2:
                break

        return result  # a list of candidate vectors


    def _putInBucket(self, i, vector, hashed_array):
        """
        Store vector under its hash key in table i (data[hashed] -> vectors)
        and keep that table's index of hash keys sorted.
        """
        index = self.index[i]
        hashed = "".join(map(str, hashed_array))  # bit list -> hash key string
        self.index[i] = sorted(set(index + [hashed]))
        data = self.data[i]
        if hashed not in data:
            data[hashed] = []
        data[hashed].append(vector)
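
A small usage sketch for LshBucket, following its docstring: with L hash tables, insert expects one bit list per table, and select returns the de-duplicated vectors found under the matching keys. The values below are illustrative only:

from buckets import LshBucket    # import path taken from the class docstring

bucket = LshBucket(L=2)          # two hash tables
vec = [1.0, 2.0, 3.0]
hashes = [[0, 1, 1], [1, 0, 0]]  # one bit list per table
bucket.insert(vec, hashes)
print(bucket.select(vec, hashes))                       # -> [[1.0, 2.0, 3.0]]
print(bucket.select(vec, hashes, without_itself=True))  # -> []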
class HammingHash(BaseHash):
    """Hash function class (Hamming Distance)

    >>> from LSH.hash import HammingHash
    >>> hash = HammingHash()
    >>> hashed_array = hash.do_hashing([123, 456, 789])
    """

    def __init__(self, L, k, d, **kw):
        super(HammingHash, self).__init__(L=L, k=k, d=d, **kw)
        self.storage = PickleStorage()

    def do_hashing(self, vector):
        if not isinstance(vector, (list, tuple)):
            raise TypeError("vector must be a list or tuple")

        if getattr(self, "d", None) is None:
            self.d = len(vector)

        if self.d != len(vector):
            raise ValueError("invalid dimension number")

        unary_code = self._unarize(vector)
        hash = []
        for i in xrange(self.L):    # L is the number of hash functions
            sampling_bits = self.indexes[i]
            hash.append([unary_code[bit] for bit in sampling_bits])
        return hash

    def do_hashing_index(self, vector):
        # use the sampling indexes previously pickled to disk
        fr = open('indexes.data', 'rb')
        indexes_stored = self.storage.load(fr)
        fr.close()

        unary_code = self._unarize(vector)
        hash = []
        for i in xrange(self.L):
            sampling_bits = indexes_stored[i]
            hash.append([unary_code[bit] for bit in sampling_bits])
        return hash

    @property  # read-only; built lazily and cached
    def indexes(self):
        if getattr(self, "_indexes", None) is None:
            self._indexes = self._create_indexes()

            # persist (pickle) the indexes for later reuse
            fw = open('indexes.data', 'wb')
            self.storage.save(self._indexes, fw)
            fw.close()
        return self._indexes

    def _create_indexes(self):
        indexes = []
        for i in xrange(self.L):
            # draw k * SCALE distinct bit positions from the unary code
            sampling_bits = set()
            while len(sampling_bits) < self.k * SCALE:
                sampling_bits.add(random.randint(0, self.d * SCALE - 1))
            indexes.append(sorted(sampling_bits))
        return indexes

    def _unarize(self, vector):
        # scale each coordinate into [0, SCALE] and write it in unary:
        # i ones followed by (SCALE - i) zeros
        n = float(SCALE) / max(vector)
        unary = []
        for x in vector:
            i = int(x * n)
            j = SCALE - i
            unary += [1] * i
            unary += [0] * j
        return unary
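
To see what _unarize produces, here is a standalone worked sketch of the same logic, with an assumed SCALE of 4 (the real constant is defined elsewhere in the module):

SCALE = 4                        # assumed value, for illustration only
vector = [1, 2, 4]
n = float(SCALE) / max(vector)   # 1.0
unary = []
for x in vector:
    i = int(x * n)               # number of leading ones
    unary += [1] * i + [0] * (SCALE - i)
print(unary)   # [1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1]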
class KlshBucket(object):

    def __init__(self, **kw):
        for key, value in kw.iteritems():
            setattr(self, key, value)

        self.buckets = {}
        self.index = []
        self.storage = PickleStorage()

    def insert_buckets(self, vector, hashed_array):
        index = self.index

        # hashed_array is a 1 x b matrix row; flatten it to a plain list of bits
        hashed_temp = np.array(hashed_array, dtype=np.int8, ndmin=1)
        hashed_temp = hashed_temp.tolist()
        hashed_list = hashed_temp[0]

        hashed = "".join(map(str, hashed_list))  # bit list -> hash key string
        self.index = sorted(set(index + [hashed]))
        buckets = self.buckets
        if hashed not in buckets:
            buckets[hashed] = []
        buckets[hashed].append(vector)

    def select_knn(self, k, hashed_array):
        knn_result = []  # each element: {'ham_dist': hamming_distance, 'vector': bucket_contents}

        # flatten the 1 x b matrix row to a plain list of bits
        hashed_temp = np.array(hashed_array, dtype=np.int8, ndmin=1)
        hashed_temp = hashed_temp.tolist()
        hashed_list = hashed_temp[0]

        hashed = "".join(map(str, hashed_list))
        buckets = self.buckets
        indexes = self.index

        for index_val in indexes:
            ham_dist = self._ham_dist(hashed, index_val)
            if len(knn_result) < k:
                knn_result.append({'ham_dist': ham_dist, 'vector': buckets[index_val]})
                continue

            knn_result.sort(key=lambda x: x['ham_dist'])  # sort ascending by ham_dist
            maximum_dist = knn_result[-1]['ham_dist']     # worst of the current k candidates
            if ham_dist < maximum_dist:
                knn_result[-1] = {'ham_dist': ham_dist, 'vector': buckets[index_val]}

        return knn_result

    def store_buckets(self):
        fw1 = open("buckets.data", 'wb')
        self.storage.save(self.buckets, fw1)
        fw1.close()

        fw2 = open("buckets_index.data", 'wb')
        self.storage.save(self.index, fw2)
        fw2.close()

    def load_buckets(self):
        fr1 = open('buckets.data', 'rb')  # pickled in binary mode, so read as binary
        self.buckets = self.storage.load(fr1)
        fr1.close()

        fr2 = open('buckets_index.data', 'rb')
        self.index = self.storage.load(fr2)
        fr2.close()

    def _ham_dist(self, hashval1, hashval2):
        # XOR the two bit strings and count the set bits
        xor_result = bitarray(hashval1) ^ bitarray(hashval2)
        return xor_result.count()
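
_ham_dist relies on the bitarray package: XOR-ing two equal-length bit strings and counting the set bits yields the Hamming distance. A standalone check:

from bitarray import bitarray

a = bitarray("0110")
b = bitarray("1100")
print((a ^ b).count())   # bits differ in 2 positions -> Hamming distance 2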