Example #1
    def test_random_discretized_projections(self):
        dim = 4
        vector_count = 5000
        vectors = numpy.random.randn(dim, vector_count)

        # First get recall and precision for one random hash with a single
        # projection (bin width 0.01)
        rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
        nearest = NearestFilter(10)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        exp = RecallPrecisionExperiment(10, vectors)
        result = exp.perform_experiment([engine])

        recall1 = result[0][0]
        precision1 = result[0][1]
        searchtime1 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
              (recall1, precision1, searchtime1))

        # Then get recall and precision for one random hash with two
        # projections (bin width 0.2)
        rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
        engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
        result = exp.perform_experiment([engine])

        recall2 = result[0][0]
        precision2 = result[0][1]
        searchtime2 = result[0][2]

        print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
              (recall2, precision2, searchtime2))

        # Many things are random here, but the precision should increase
        # with the number of projections
        self.assertTrue(precision2 > precision1)
Example #2
    def __init__(self, level):
        self.feature_vector = {}
        self.parentLevel = int(level)
        print(self.parentLevel)
        self.hashes = dict()

        self.rdp = RandomDiscretizedProjections('rdp', 5, 6, rand_seed=98412194)
        self.rdp.reset(self.parentLevel)
Example #3
 def __init__(self, stage, bucket):
     self.parentLevel = 5
     self.rdp = RandomDiscretizedProjections('rdp',
                                             stage,
                                             bucket,
                                             rand_seed=98412194)
     self.rdp.reset(5)
     self.hash_dict = {}
     self.data = defaultdict(list)
Example #4
def createLSH(dimensions):
    nearest = NearestFilter(5)
    bin_width = 10
    projections = 50
    rbp = RandomDiscretizedProjections('rbp', projections, bin_width)
    rbp2 = RandomDiscretizedProjections('rbp2', projections, bin_width)
    rbp3 = RandomDiscretizedProjections('rbp3', projections, bin_width)
    rbp4 = RandomDiscretizedProjections('rbp4', projections, bin_width)

    engine = Engine(dimensions, lshashes=[rbp, rbp2, rbp3, rbp4], vector_filters=[nearest])
    return engine
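A minimal usage sketch for the engine returned by createLSH; the data and labels below are hypothetical, and it assumes nearpy's standard Engine.store_vector / Engine.neighbours interface (store_vector also appears in the last example on this page):

import numpy

engine = createLSH(100)

# Index a few random 100-dim vectors under hypothetical string labels
for i, v in enumerate(numpy.random.randn(20, 100)):
    engine.store_vector(v, 'vec_%d' % i)

# Query; each result is a (vector, label, distance) tuple,
# at most 5 of them because of NearestFilter(5)
for vec, label, distance in engine.neighbours(numpy.random.randn(100)):
    print(label, distance)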
Example #5
class featureLsh():
    def __init__(self, stage, bucket):
        self.parentLevel = 5
        self.rdp = RandomDiscretizedProjections('rdp',
                                                stage,
                                                bucket,
                                                rand_seed=98412194)
        self.rdp.reset(5)
        self.hash_dict = {}
        self.data = defaultdict(list)

    def get_hash(self, vector):
        h = self.rdp.hash_vector(vector)[0]
        return h

    def set_hash(self, header):
        self.hash_dict["program"] = "program"
        for i in header:
            key_vec = i.split("_")
            vec = []
            for j in key_vec:
                vec.append(int(j))
            newkey = self.get_hash(vec)
            self.hash_dict[i] = newkey
        print("Setting hash done. Running lsh...")

    def update_dict(self, dicts):
        print("updating_dict")
        for d in dicts:  # avoid shadowing the built-in dict
            newdict = {}
            for key, value in d.items():
                newkey = self.hash_dict[key]
                if newkey == "program":
                    newdict[newkey] = value
                elif newkey not in newdict:
                    if isinstance(value, str):
                        newdict[newkey] = float(value)
                    elif isnan(value):
                        newdict[newkey] = 0
                    else:
                        newdict[newkey] = float(value)
                else:
                    if isinstance(value, str):
                        newdict[newkey] += float(value)
                    elif not isnan(value):
                        newdict[newkey] += float(value)
            for key, value in newdict.items():
                self.data[key].append(value)
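A hypothetical driver for featureLsh; the stage/bucket values and the underscore-separated integer keys below are made up but match what set_hash expects (string values keep update_dict on its str branch, so the module's isnan import is not exercised):

lsh = featureLsh(5, 10)
lsh.set_hash(['1_2_3_4_5', '2_3_4_5_6'])
lsh.update_dict([{'program': 'p1', '1_2_3_4_5': '0.5', '2_3_4_5_6': '1.0'}])
print(dict(lsh.data))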
Example #6
class TestRandomDiscretizedProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
Example #7
    def __init__(
            self,
            data: np.ndarray = None,
            labels: np.ndarray = None,
            k: int = 100,
            projections: int = 3,
            bin_width: int = 10,
            tables: int = 3,
            verbose: bool = True,
            dummy: bool = False,
    ):
        self.k = k
        self.projections = projections
        self.tables = tables

        if not dummy:
            if data is None or labels is None:
                raise ValueError('data and labels must be numpy.ndarray when not using dummy indexer')
            t0 = time.time()

            self.engine = HashEngine(
                vectors=data,
                labels=labels,
                lshashes=[RandomDiscretizedProjections(f'rbp_{i}', projections, bin_width=bin_width)
                          for i in range(tables)],
                k=k,
                verbose=verbose,
            )
            self.build_time = time.time() - t0
Example #8
 def __init__(self, hasher, number_of_tables=8, length_of_tables=32, bin_width=1.0, match_thresh=0.2):
     """
     :param hasher:
     @type hasher: Hasher
     """
     LSHIndex.__init__(self, hasher, match_thresh=match_thresh)
     self.setName(number_of_tables=number_of_tables, length_of_tables=length_of_tables,
                  match_thresh=match_thresh, bin_width=bin_width)
     self.tables = [None] * number_of_tables
     for i in range(number_of_tables):
         self.tables[i] = RandomDiscretizedProjections(str(i), length_of_tables, bin_width)
     self.engine = Engine(self.hasher.dims(), lshashes=self.tables, fetch_vector_filters=[NoVectorFilter()])
Example #9
    def test_hash_memory_storage_rdp(self):
        hash1 = RandomDiscretizedProjections('testRDPHash', 10, 0.1)
        hash1.reset(100)

        self.memory.store_hash_configuration(hash1)

        hash2 = RandomDiscretizedProjections(None, None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRDPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.bin_width, hash2.bin_width)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
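With numpy available, the element-by-element loop can be collapsed into a single call; a sketch of the equivalent check (not part of the original test):

        numpy.testing.assert_array_equal(hash1.normals, hash2.normals)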
Example #10
def build_content_sim_relation_text(network, signatures):
    def get_nid_gen(signatures):
        for nid, sig in signatures:
            yield nid

    docs = []
    for nid, e in signatures:
        docs.append(' '.join(e))

    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(docs)
    # rbp = RandomBinaryProjections('default', 1000)
    lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)
    nid_gen = get_nid_gen(signatures)
    text_engine = index_in_text_engine(nid_gen, tfidf, lsh_projections)
    nid_gen = get_nid_gen(signatures)
    create_sim_graph_text(nid_gen, network, text_engine, tfidf,
                          Relation.CONTENT_SIM)
Example #11
    def process2(self,vectors1,vectors2,num_bit,bin_width):
        
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        
        # create new key
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        
        self.rbdp = {}

        print('len of buckets1', len(buckets1))
        print('len of buckets2', len(buckets2))

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in range(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if new_key not in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))
Example #12
 def setUp(self):
     self.rbp = RandomDiscretizedProjections('testHash', 10, 0.1)
     self.rbp.reset(100)
Example #13
import sys
import hashlib
import antlr4
from antlr4 import ParseTreeWalker, ParserRuleContext

from template.Template2Lexer import Template2Lexer
from template.Template2Listener import Template2Listener
from template.Template2Parser import Template2Parser
from nearpy.hashes import RandomBinaryProjections, RandomDiscretizedProjections

MAX_PATH_LENGTH = 100

stacks = dict()
#rdp = RandomBinaryProjections('rbp', 100, rand_seed=98412194)
rdp = RandomDiscretizedProjections('rdp', 10, 1000, rand_seed=98412194)
rdp.reset(MAX_PATH_LENGTH)


def getHash(vector):
    # if len(vector) < MAX_PATH_LENGTH:
    #     vector = vector + (MAX_PATH_LENGTH-len(vector))*[0]
    h = rdp.hash_vector(vector)[0]

    return h
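
A quick sanity check of getHash; the input is hypothetical, and the vector must have MAX_PATH_LENGTH entries because rdp was reset to that dimension:

# Prints one bucket key such as '0_0_-1_...' (one discretized value per projection)
print(getHash([1] * MAX_PATH_LENGTH))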


def update(d, entry):
    if entry in d:
        d[entry] += 1
    else:
        d[entry] = 1
Example #14
class FeatureBuilder(Template2Listener):

    def __init__(self, level):
        self.feature_vector = {}
        self.parentLevel = int(level)
        print(self.parentLevel)
        self.hashes = dict()

        self.rdp = RandomDiscretizedProjections('rdp', 5, 6, rand_seed=98412194)
        self.rdp.reset(self.parentLevel)

    def getHash(self, vector):
        if len(vector) < self.parentLevel:
            vector = vector + (self.parentLevel - len(vector)) * [0]
        h = self.rdp.hash_vector(vector)[0]
        # h = '_'.join([str(x) for x in vector])

        return h

    def getParents(self, ctx):
        curLevel = 0
        curNode = ctx
        path = []
        while curNode is not None and curLevel < self.parentLevel:
            #path.append(curNode.getRuleIndex())
            nodename = curNode.__class__.__name__
            path.append(fixed_hashes[nodename])
            curLevel += 1
            curNode = curNode.parentCtx
        return path

    def update_vector(self, ctx):
        if self.parentLevel <= 1:
            name = type(ctx).__name__
            if ctx.parentCtx is not None:
                parentName = type(ctx.parentCtx).__name__
                feature_name = 't_' + parentName + '_' + name
                if feature_name not in self.feature_vector:
                    self.feature_vector[feature_name] = 0
                self.feature_vector[feature_name] += 1
        else:
            path = self.getParents(ctx)
            name = self.getHash(path)

        if name not in self.feature_vector:
            self.feature_vector[name] = 0
        self.feature_vector[name] += 1

    def enterAddop(self, ctx):
        self.update_vector(ctx)

    def enterAnd(self, ctx):
        self.update_vector(ctx)

    def enterArray(self, ctx):
        self.update_vector(ctx)

    def enterArray_access(self, ctx):
        self.update_vector(ctx)

    def enterAssign(self, ctx):
        self.update_vector(ctx)

    def enterBlock(self, ctx):
        self.update_vector(ctx)

    def enterBrackets(self, ctx):
        self.update_vector(ctx)

    def enterData(self, ctx):
        self.update_vector(ctx)

    def enterDecl(self, ctx):
        self.update_vector(ctx)

    def enterPrimitive(self, ctx):
        self.update_vector(ctx)

    def enterNumber(self, ctx):
        self.update_vector(ctx)

    def enterDtype(self, ctx):
        self.update_vector(ctx)

    def enterVector(self, ctx):
        self.update_vector(ctx)

    def enterDims(self, ctx):
        self.update_vector(ctx)

    def enterVectorDIMS(self, ctx):
        self.update_vector(ctx)

    def enterLimits(self, ctx):
        self.update_vector(ctx)

    def enterPrior(self, ctx):
        self.update_vector(ctx)

    def enterParam(self, ctx):
        self.update_vector(ctx)

    def enterParams(self, ctx):
        self.update_vector(ctx)

    def enterDistexpr(self, ctx):
        self.update_vector(ctx)

    def enterLoopcomp(self, ctx):
        self.update_vector(ctx)

    def enterFor_loop(self, ctx):
        self.update_vector(ctx)

    def enterIf_stmt(self, ctx):
        self.update_vector(ctx)

    def enterElse_blk(self, ctx):
        self.update_vector(ctx)

    def enterFunction_call(self, ctx):
        self.update_vector(ctx)

    def enterFparam(self, ctx):
        self.update_vector(ctx)

    def enterFparams(self, ctx):
        self.update_vector(ctx)

    def enterReturn_or_param_type(self, ctx):
        self.update_vector(ctx)

    def enterFunction_decl(self, ctx):
        self.update_vector(ctx)

    def enterTransformedparam(self, ctx):
        self.update_vector(ctx)

    def enterTransformeddata(self, ctx):
        self.update_vector(ctx)

    def enterGeneratedquantities(self, ctx):
        self.update_vector(ctx)

    def enterFunctions(self, ctx):
        self.update_vector(ctx)

    def enterVal(self, ctx):
        self.update_vector(ctx)

    def enterDivop(self, ctx):
        self.update_vector(ctx)

    def enterString(self, ctx):
        self.update_vector(ctx)

    def enterExponop(self, ctx):
        self.update_vector(ctx)

    def enterMinusop(self, ctx):
        self.update_vector(ctx)

    def enterLt(self, ctx):
        self.update_vector(ctx)

    def enterUnary(self, ctx):
        self.update_vector(ctx)

    def enterEq(self, ctx):
        self.update_vector(ctx)

    def enterGt(self, ctx):
        self.update_vector(ctx)

    def enterRef(self, ctx):
        self.update_vector(ctx)

    def enterGeq(self, ctx):
        self.update_vector(ctx)

    def enterMulop(self, ctx):
        self.update_vector(ctx)

    def enterFunction(self, ctx):
        self.update_vector(ctx)

    def enterVecmulop(self, ctx):
        self.update_vector(ctx)

    def enterNe(self, ctx):
        self.update_vector(ctx)

    def enterLeq(self, ctx):
        self.update_vector(ctx)

    def enterTranspose(self, ctx):
        self.update_vector(ctx)

    def enterVecdivop(self, ctx):
        self.update_vector(ctx)

    def enterTernary(self, ctx):
        self.update_vector(ctx)

    def enterSubset(self, ctx):
        self.update_vector(ctx)

    def enterObserve(self, ctx):
        self.update_vector(ctx)

    def enterStatement(self, ctx):
        self.update_vector(ctx)

    def enterQuery(self, ctx):
        self.update_vector(ctx)

    def enterTemplate(self, ctx):
        self.update_vector(ctx)
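A sketch of driving this listener with antlr4; source stands in for Template2 program text not shown here, the start rule name template is an assumption suggested by enterTemplate above, and level=1 keeps update_vector on its name-pair branch so the fixed_hashes table is not needed:

source = '...'  # Template2 program text (placeholder)
lexer = Template2Lexer(antlr4.InputStream(source))
parser = Template2Parser(antlr4.CommonTokenStream(lexer))
tree = parser.template()  # assumed start rule

builder = FeatureBuilder(level=1)
ParseTreeWalker().walk(builder, tree)
print(builder.feature_vector)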
Example #15
# We are looking for the N closest neighbours
N = 20
nearest = NearestFilter(N)

# We will fill this array with all the engines we want to test
engines = []

print('Creating engines...')

# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]
# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random discretized projection hashes, four projections each
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)
    #ub1 = UniBucket('uni')

    # Create engine with this configuration
    #engine = Engine(dimension, lshashes=[rdp1, rdp2, rdp3, rdp4],
    #                vector_filters=[unique, nearest])
    engine = Engine(dimension,
                    lshashes=[rdp1, rdp2, rdp3, rdp4],
                    vector_filters=[nearest])

    # Add engine to list of engines to evaluate
    engines.append(engine)
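From here the engines can be scored with the RecallPrecisionExperiment used in Example #1; this sketch assumes a data matrix vectors built as in that example, and unpacks the same (recall, precision, search time) triple indexed there:

exp = RecallPrecisionExperiment(N, vectors)
results = exp.perform_experiment(engines)
for bin_width, (recall, precision, search_time) in zip(bin_widths, results):
    print('bin_width %.2f: recall %f, precision %f, search time %f' %
          (bin_width, recall, precision, search_time))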
Example #16
class DoubleEngine:

    def _build_rdp_engine(self,matrix,rdp,normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create an engine that hashes with the shared projection vectors (normals)
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
            
        return engine

    def process2(self,vectors1,vectors2,num_bit,bin_width):
        
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        
        # create new key
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        
        self.rbdp = {}

        print('len of buckets1', len(buckets1))
        print('len of buckets2', len(buckets2))

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in range(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if new_key not in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))
        
    def build_permute_index(self,num_permutation,beam_size,hamming_beam_size):
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count
        
        # add permutations
        self.permutations = []
        for _ in range(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)

        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)

        # build permutation lists
        self.permuted_lists = []
        for i, p in enumerate(self.permutations):
            logging.info('Creating Permutation Index: #{}/{}'.format(i, len(self.permutations)))
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c,ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)
        

    def get_neighbour_keys(self,bucket_key,k):
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100 Still really fast

        query_key = bitarray(bucket_key)
        topk = set()
        for i in range(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist, query_key, self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key=lambda x: hamming_distance(x, query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self, key1, key2, v):
        # return [(cos_sim, (idx1, idx2))] for every pair across the two buckets
        def matrix_list(engine, key):
            # return a matrix of the bucket's vectors and the list of their integer keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for vec, label in items:
                m.append(vec)
                l.append(int(label))
            m = np.array(m)
            return m, l
        m1,l1 = matrix_list(self.engine1,key1)
        m2,l2 = matrix_list(self.engine2,key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v 
        av = np.dot(m1,v)
        av = np.repeat(av,len2).reshape(len1,len2)
        # b . v
        bv = np.dot(m2,v)
        bv = np.repeat(bv,len1).reshape(len2,len1).T
        # numerator = a.v + b.v = (a + b).v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v,2)
        # a.a
        aa = np.sum(m1*m1,axis = 1)
        aa = np.repeat(aa,len2).reshape(len1,len2)
        # b.b
        bb = np.sum(m2*m2,axis = 1)
        bb = np.repeat(bb,len1).reshape(len2,len1).T
        # a.b
        ab = np.dot(m1,m2.T)
        # denominator = |a + b| * |v|
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # dism[i, j] = cosine similarity between v and (a_i + b_j)
        dism = nomi / deno
        dist = []
        for i in range(len1):
            for j in range(len2):
                dis = dism[i,j]
                dist.append((dis,(l1[i],l2[j])))
        return dist

    def neighbours2(self,v,n):
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)

        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key,self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print(bucket_key, len(comb))
            for key1,key2 in comb:
                dist = self.n2(key1,key2,v)
                dists.extend(dist)

        dists = sorted(dists, key=lambda x: -x[0])
        return dists[:n]
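
A hypothetical end-to-end driver for DoubleEngine; all sizes and parameter values below are made up, and Permutation, hamming_distance, and bitarray are assumed to be importable in this module, as the methods above require:

import numpy as np

v1 = np.random.randn(1000, 50)
v2 = np.random.randn(1000, 50)

de = DoubleEngine()
de.process2(v1, v2, num_bit=100, bin_width=1.0)
de.build_permute_index(num_permutation=200, beam_size=100, hamming_beam_size=20)

# Top 10 pairs (a_i, b_j) whose sum is most cosine-similar to the query
print(de.neighbours2(np.random.randn(50), 10))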