Example #1
def index_user_vectors():

    #print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    #print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen,
                         lshashes=[permutations],
                         distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()
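For reference, a minimal query sketch against the index built above (not part of the original example): it assumes the same globals, engine_perm and the user_vector dict, and the helper name similar_users is ours.

# Hedged sketch: query the permuted index built by index_user_vectors().
# Engine.neighbours() returns (vector, data, distance) tuples, so the stored
# user ids come back in the data slot.
def similar_users(query_user, top_n=10):
    results = engine_perm.neighbours(user_vector[query_user])
    return [data for _, data, _ in results if data != query_user][:top_n]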
Example #2
def knn(data, k):
    assert k <= len(data) - 1, \
        'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    if (dimension < 10):
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension,
                    lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])

    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):

        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    return N, dist, ind
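A short usage sketch for the knn helper above, on synthetic data (the array and sizes here are illustrative, not from the source):

# Illustrative call: 100 random 20-dimensional points, 5 neighbours each.
# ind[i] holds the neighbour indices of point i (self-match removed),
# dist[i] the corresponding distances.
import numpy as np
data = np.random.randn(100, 20)
N, dist, ind = knn(data, 5)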
Example #3
def index_user_vectors():

    print 'Performing indexing with HashPermutations...'

    global engine_perm

    t0 = time.time()

    print k_dimen, d_dimen

    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)

    rbp_perm.reset(k_dimen)

    # Create permutations meta-hash
    permutations = HashPermutations('permut')

    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}

    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)

    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())

    for u in user_vector:

        engine_perm.store_vector(user_vector[u], data=u)

    # Then update permuted index
    permutations.build_permuted_index()

    t1 = time.time()

    print 'Indexing took %f seconds' % (t1 - t0)
Example #4
def build_environment(config):
    lsh = LSH_sumbeam()
    w2v = MyWord2Vec()
    w2v.load(config)
    lsh.w2v = w2v

    # combine top 20k noun and 20k adj into a single wordlist
    topn = config.getint('space','topn')
    words = w2v.model.vocab.keys()
    wordlist = WordList()
    wordlist.words = words
    wordlist.filter_frequency(w2v,topn)
    wordlist.build_index()

    # build a matrix
    matrix = lsh._list2matrix_w2v(wordlist,lsh.w2v)

    # build an engine
    dim = np.shape(matrix)[1]
    num_bits = 15
    rbp = RandomBinaryProjections('rbp', num_bits)
    rbp.reset(dim)    
    engine = lsh._build_rbp_permute_engine(matrix,rbp)
    num_permutation = 50
    beam_size = 50
    num_neighbour = 100
    engine.build_permute_index(num_permutation,beam_size,num_neighbour)
    
    return lsh,engine,matrix,wordlist
Example #5
    def __init__(self, dimension, n_bit, alpha):

        self.n_bit = n_bit
        self.dim = dimension
        self.alpha = alpha

        self.sample_space = 2**n_bit

        self.rbp = RandomBinaryProjections('rbp', self.n_bit)
        self.engine = Engine(dimension, lshashes=[self.rbp])
Example #6
 def build_index_sumbeam(self,num_bits):
     # hash the original vectors in matrix1 and matrix2 into engine1 and engine2
     self.dim = np.shape(self.matrix1)[1]
     rbp = RandomBinaryProjections('rbp', num_bits)
     rbp.reset(self.dim)
     self.rbp = rbp
 
     engine1 = self._build_rbp_permute_engine(self.matrix1,rbp)
     engine2 = self._build_rbp_permute_engine(self.matrix2,rbp)
     self.engine1 = engine1
     self.engine2 = engine2
Example #7
def get_hash_config(redis_storage, name):
    config = redis_storage.load_hash_configuration(name)
    if config is not None:
        # Config is existing, create hash with None parameters
        lshash = RandomBinaryProjections(None, None, rand_seed=123)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    else:
        raise RuntimeError("Hash Config not found")

    return lshash
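For completeness, a hedged sketch of the write path that get_hash_config assumes has already run; it mirrors the store_hash_configuration calls in the redis examples further down, and the function name is ours.

# Assumed counterpart: build the hash once, materialise its projections for
# the given dimensionality, and persist the configuration under `name`.
def store_hash_config(redis_storage, name, dim, projection_count=10):
    lshash = RandomBinaryProjections(name, projection_count, rand_seed=123)
    lshash.reset(dim)
    redis_storage.store_hash_configuration(lshash)
    return lshash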
Example #8
    def generate_lsh_fn(self):
        self.lsh_fn = []
        for i in range(self.L):
            rbp = RandomBinaryProjections('rbp', self.K)
            rbp.reset(self.dim)

            # def fn(x):
            #     mm = mmh3.hash(rbp.hash_vector(x)[0])
            #     return mm % self.R
            def fn(x):
                return 1
            self.lsh_fn.append(fn)
Example #9
    def __init__(self, *args):
        """
        Initialize the dictionary with reduced vectors representing each conference's members.
        :param args[0] - data: Data class representing the data from dblp.xml
        :param args[1] - dim: Output dimension for LSH.
        """
        print('Initialization recommender...')
        self.data = args[0]
        self.reduced_conferences = {}
        rbp = RandomBinaryProjections('rbp', args[1])
        rbp.reset(self.data.members_set.__len__())

        cnt = 0
Example #10
class TestRandomBinaryProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
Example #12
class RBP_hasher(object):

    def __init__(self, dimension, n_bit, alpha):

        self.n_bit = n_bit
        self.dim = dimension
        self.alpha = alpha

        self.sample_space = 2**n_bit

        self.rbp = RandomBinaryProjections('rbp', self.n_bit)
        self.engine = Engine(dimension, lshashes=[self.rbp])

    @property
    def params(self):
        return self.rbp.get_config()

    def load(self, config):
        self.rbp.apply_config(config)

    def _string2int(self, s):
        return int(s, 2)

    def __call__(self, v):
        '''
        Convert the hash string returned by rbp into an integer bucket id.
        Return a dict of bucket weights keyed by that id.
        '''
        s = self.rbp.hash_vector(v)[0]
        weights = {
            self._string2int(s): 1.0,
        }

        if not self.alpha:
            return weights

        # If alpha is non-zero, deposit weight into nearby bins

        slist = map(bool, map(int, list(s)))
        for n in range(len(s)):
            s2list = slist[:]
            s2list[n] = not slist[n]
            s2list = map(str, map(int, s2list))
            s2 = ''.join(s2list)
            idx = self._string2int(s2)
            weights[idx] = self.alpha

        return weights
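A small usage sketch for the class above (Python 2, like the original; the concrete sizes are ours): the returned dict has one bucket with weight 1.0 and, when alpha is non-zero, one extra bucket of weight alpha per flipped bit.

# Illustrative only: 64-dimensional vectors hashed into an 8-bit key space.
import numpy as np
hasher = RBP_hasher(dimension=64, n_bit=8, alpha=0.1)
weights = hasher(np.random.randn(64))
# 1 exact bucket + 8 single-bit-flip buckets -> 9 entries
print len(weights)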
Example #13
    def build_from_document_corpus(corpus, model_type, model_name,
                                   progress=False, project_events=False, include_events=False, hash_size=50,
                                   log=None, redis_port=6379, filter_chains=None):
        if log is None:
            log = get_console_logger("neighbour indexing")

        log.info("Loading model %s/%s" % (model_type, model_name))
        model = NarrativeChainModel.load_by_type(model_type, model_name)
        vector_size = model.vector_size

        db_filename = "vectors.rdb"
        # Make sure the model directory exists, so we can get the Redis server pointing there
        model_dir = model.get_model_directory(model_name)
        # If the Redis stored db already exists, remove it, so that we don't end up adding to old data
        if os.path.exists(os.path.join(model_dir, db_filename)):
            os.remove(os.path.join(model_dir, db_filename))
        log.info("Storing vectors in %s" % os.path.join(model_dir, db_filename))

        log.info("Preparing neighbour search hash")
        # Create binary hash
        binary_hash = RandomBinaryProjections("%s:%s_binary_hash" % (model_type, model_name), hash_size)

        log.info("Connecting to Redis server on port %d" % redis_port)
        # Prepare an engine for storing the vectors in
        try:
            redis = Redis(host='localhost', port=redis_port, db=0)
        except ConnectionError, e:
            raise RuntimeError("could not connect to redis server on port %s. Is it running? (%s)" % (redis_port, e))
Example #14
def load_search_engine():
    global engine

    # read in the data file
    data = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t')
    data_objects = pandas.read_csv(os.path.join('data', 'object_features.tsv'),
                                   sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i].replace('images\\\\',
                                        '').replace('images\\',
                                                    '').replace('images/', ''))

    for i in range(0, len(data_objects)):
        engine.store_vector(
            np.asarray(
                data_objects['features'][i].split(',')).astype('float64'),
            data_objects['filename'][i].replace('images\\\\', '').replace(
                'images\\', '').replace('images/', ''))

    return engine
Example #15
def load_engine(sdf_files, feature_matrix, dimension):
    """
    Function that converts the given sdf_files into instances of the sdf_class, then loads them into nearpy Engine.

    Parameters
        sdf_files: a list of sdf_files with their pathname from the current directory. Intended to be fed in from `find_sdf(root_dir)`
        feature_matrix: matrix of training data features to be loaded into engine
        dimension: dimensionality of the feature vectors used for LSH (here: number of cluster centers)

    Returns
        engine: instance of a nearpy engine with all of sdf_files loaded
    
    Sample Usage
        >>> engine = load_engine(sdf_files)
    """
    #dimension here can be altered as well
    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dimension, lshashes=[rbp])

    count = 0
    for index, file_ in enumerate(sdf_files):
        #print file_
        if count % 100 == 0:
            print 'Converted %d files' % (count)
        converted = SDF(file_)
        converted.set_feature_vector(feature_matrix[index])
        converted.add_to_nearpy_engine(engine)
        count += 1
    return engine
Example #16
    def setUp(self):
        logging.basicConfig(level=logging.WARNING)
        numpy.random.seed(11)

        # Create permutations meta-hash
        self.permutations = HashPermutations('permut')

        # Create binary hash as child hash
        rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
        rbp_conf = {
            'num_permutation': 50,
            'beam_size': 10,
            'num_neighbour': 100
        }

        # Add rbp as child hash of permutations hash
        self.permutations.add_child_hash(rbp, rbp_conf)

        # Create engine with meta hash and cosine distance
        self.engine_perm = Engine(200,
                                  lshashes=[self.permutations],
                                  distance=CosineDistance())

        # Create engine without permutation meta-hash
        self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
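Following the fixture above, a hedged sketch of the kind of assertion such a test might make (ours, not from the source): store a vector in the permuted engine, rebuild the permuted index as in the earlier examples, and expect the vector to be retrievable.

    # Illustrative continuation of the fixture (assumed test method).
    def test_retrieval(self):
        x = numpy.random.randn(200)
        self.engine_perm.store_vector(x, 'x')
        self.permutations.build_permuted_index()
        self.assertTrue(len(self.engine_perm.neighbours(x)) >= 1)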
Example #17
def build_content_sim_relation_text_lsa(network, signatures):
    def get_nid_gen(signatures):
        for nid, sig in signatures:
            yield nid

    docs = []
    for nid, e in signatures:
        docs.append(' '.join(e))

    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(docs)

    print("TF-IDF shape before LSA: " + str(tfidf.shape))
    st = time.time()
    tfidf = lsa_dimensionality_reduction(tfidf)
    et = time.time()
    print("TF-IDF shape after LSA: " + str(tfidf.shape))
    print("Time to compute LSA: {0}".format(str(et - st)))
    lsh_projections = RandomBinaryProjections('default', 10000)
    #lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)
    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    text_engine = index_in_text_engine(nid_gen,
                                       tfidf,
                                       lsh_projections,
                                       tfidf_is_dense=True)
    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    create_sim_graph_text(nid_gen,
                          network,
                          text_engine,
                          tfidf,
                          Relation.CONTENT_SIM,
                          tfidf_is_dense=True)
Example #18
        def RunAnnNearpy(q):
            totalTimer = Timer()

            # Load input dataset.
            Log.Info("Loading dataset", self.verbose)
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            train, label = SplitTrainData(self.dataset)

            with totalTimer:
                # Get all the parameters.
                try:
                    # Perform Approximate Nearest-Neighbors
                    dimension = train.shape[1]
                    rbp = RandomBinaryProjections('rbp', 10)
                    engine = Engine(dimension, lshashes=[rbp])
                    for i in range(len(train)):
                        engine.store_vector(train[i], 'data_%d' % i)
                    for i in range(len(queryData)):
                        v = engine.neighbours(queryData[i])
                except Exception as e:
                    Log.Info(e)
                    q.put(e)
                    return -1
            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #19
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    #perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
Example #20
def LSH(Layers, K):

    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]

    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index every row of the LSH-vector matrix (set its data to a unique metadata string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]

        meta_data = str(index)+',' + str(int(video_data[index, 0])) + ', ' + str(int(video_data[index, 1])) + ', ' + str(int(video_data[index, 2])) \
                    + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])

        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)

    print 'stop'
Example #21
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 fetch_vector_filters=None,
                 vector_filters=None,
                 storage=None):
        """ Keeps the configuration. """
        if lshashes is None:
            lshashes = [RandomBinaryProjections('default', 10)]
        self.lshashes = lshashes
        if distance is None: distance = EuclideanDistance()
        self.distance = distance
        if vector_filters is None: vector_filters = [NearestFilter(10)]
        self.vector_filters = vector_filters
        if fetch_vector_filters is None:
            fetch_vector_filters = [UniqueFilter()]
        self.fetch_vector_filters = fetch_vector_filters
        if storage is None: storage = MemoryStorage()
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)

        print('*** engine init done ***')
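Given the fallbacks above, a minimal construction sketch (assumed usage, not from the source): an Engine built with only a dimension gets a 10-bit RandomBinaryProjections hash, EuclideanDistance, a NearestFilter(10) and in-memory storage.

# Assumed usage of the defaults: store one random vector and query it back.
import numpy
engine = Engine(100)
engine.store_vector(numpy.random.randn(100), 'item_0')
results = engine.neighbours(numpy.random.randn(100))   # at most 10 hits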
Example #22
 def __init__(self, num_features, projection_count=30):
     self.num_features = num_features
     #self.rbp = RandomDiscretizedProjections('default', projection_count, bin_width=100)
     self.rbp = RandomBinaryProjections('default', projection_count)
     #self.rbp = RandomBinaryProjectionTree('default', projection_count, 1)
     self.text_engine = Engine(num_features,
                               lshashes=[self.rbp],
                               distance=CosineDistance())
Example #23
    def __init__(self, x):
        self.n, self.f = x.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', 10)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = x[i, :]
            self.engine.store_vector(v, i)
Example #24
def main(args):
    """ Main entry.
    """

    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the ten closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in
                    np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" %
                       (time_costs * 1000 / data.nqry))
Example #25
    def fit(self, X):
        b = self.params['b']
        self.n, self.f = X.shape
        # Use NearPy lsh for fast ann
        rbp = RandomBinaryProjections('rbp', b)

        self.engine = Engine(self.f, lshashes=[rbp])
        for i in np.arange(self.n):
            v = np.squeeze(np.copy(X[i, :]))
            self.engine.store_vector(v, i)
Example #26
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    num_cols = decade_matrix.get_shape()[1]
    print("the number of rows:" + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    # Use the column count as the engine dimension so it matches the stored row vectors
    engine = Engine(num_cols, lshashes=[rbp])
    for i in range(num_rows):
        print(i)

        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
Example #27
 def __init__(self, emb_path, feature='title'):
     self.emb_path = emb_path
     self.feature = feature
     self.data_df = None
     self.tfidf = Vectorizer(**get_tfidf_params())
     self.fasttext_embedder = None
     self.fasttext_tfidf = None
     self.dimension = 300
     rbp = RandomBinaryProjections('rbp', 2)
     self.engine = Engine(self.dimension, lshashes=[rbp])
     pass
Example #28
    def build_index(self, X):
        f = X.shape[1]
        n = X.shape[0]

        rbp = RandomBinaryProjections('rbp', 32)
        engine = Engine(f, lshashes=[rbp])

        for i in range(n):
            engine.store_vector(X[i], 'data_%d' % i)

        return engine
Example #29
    def __configure_calculator(self, point_list, point):
        # Dimension of our vector space
        self.__dimension__ = 2

        # Create a random binary hash with 10 bits
        self.__rbp__ = RandomBinaryProjections('rbp', 10)

        # Create engine with pipeline configuration
        self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
        self.set_searching_point_list(point_list)
        self.set_query_point(point)
Example #30
 def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
     self.data_points = data_points
     self.point_num = self.data_points.shape[0]
     self.dimension = self.data_points.shape[1] - 1
     # Create a random binary hash with num_vectors bits
     self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
     self.engine = Engine(
         self.dimension,
         lshashes=[self.rbp],
         vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
     for i in range(self.point_num):
         self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
Example #31
    def load_hashmap(self):
        # Create redis storage adapter
        # need to start redis service
        redis_object = Redis(host='localhost', port=6379, db=14)
        redis_storage = RedisStorage(redis_object)
        try:
            config = redis_storage.load_hash_configuration('test')
            lshash = RandomBinaryProjections(None, None)
            lshash.apply_config(config)

        except:
            # Config is not existing, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        nearest = NearestFilter(self.nn)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(self.feature_size,
                             lshashes=[lshash],
                             vector_filters=[nearest],
                             storage=redis_storage,
                             distance=CosineDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
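A hedged sketch of the elided indexing/querying step mentioned in the comment above; it assumes self.feature_size-dimensional numpy vectors and the engine configured in load_hashmap, and the method name is ours.

    # Assumed companion method: store vectors in the redis-backed buckets and
    # fetch the nearest-neighbour candidates for a query vector.
    def index_and_query(self, vectors, query):
        for i, v in enumerate(vectors):
            self.engine.store_vector(v, 'item_%d' % i)
        # (vector, data, cosine_distance) tuples, at most self.nn of them
        return self.engine.neighbours(query)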
Example #32
    def loadHashmap(self, feature_size=129, result_n=1000):  # these parameters are not used here
        '''
        feature_size: dimensionality of the hash space
        result_n: how many nearest neighbours to return
        '''
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config exists, create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)

        except:
            # Config does not exist, create hash from scratch, with 10 projections
            lshash = RandomBinaryProjections('test', 10)

        # Create engine for feature space of 100 dimensions and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        nearest = NearestFilter(result_n)
        # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
        self.engine = Engine(feature_size, lshashes=[lshash], vector_filters=[nearest], storage=redis_storage, distance=EuclideanDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
Example #33
 def fit(self, X, y=None, hash="randbinary"):
     X = np.array(X)
     assert len(X.shape) == 2, "X not 2-rank"
     dimension = X.shape[-1]
     if hash == "randbinary":
         rbp = RandomBinaryProjections('rbp', 10)
     elif hash == "pcabinary":
         rbp = PCABinaryProjections('rbp', 10, training_set=X)
     self.engine = Engine(dimension, lshashes=[rbp])
     index = 0
     for x in X:
         self.engine.store_vector(x, str(index))
         index += 1
Example #34
def data_for_layer(basic_path, layer_name, num_folds, experiment,
                   projection_count, start_pc_component, end_pc_component):
    # Read datasets
    basic_path_layer = os.path.join(basic_path, layer_name)

    dataset_files = "ALOI_train_20400.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset_aloi = hd['dataset_1']
    dataset_train_aloi, dataset_test_aloi = split_data_to_test_train(
        dataset_aloi, num_folds, experiment)
    del dataset_aloi
    transformer = TransformImagesPCA(n_components=500)
    transformer.learn_pcs(dataset_train_aloi)
    del dataset_train_aloi

    dataset_files = "Google_train_6675.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset_google = hd['dataset_1']
    dataset_train_google, dataset_test_google = split_data_to_test_train(
        dataset_google, num_folds, experiment)
    del dataset_google
    transformer.learn_pcs(dataset_train_google)
    del dataset_train_google

    dataset_files = "Nexus_train_1180.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files),
                   'r')
    dataset = hd['dataset_1']
    dataset_train, dataset_test = split_data_to_test_train(
        dataset, num_folds, experiment)
    del dataset
    transformer.learn_pcs(dataset_train)
    del dataset_train

    pc_test_nexus = transformer.transform(
        dataset_test)[:, start_pc_component:end_pc_component]
    pc_test_aloi = transformer.transform(
        dataset_test_aloi)[:, start_pc_component:end_pc_component]
    pc_test_google = transformer.transform(
        dataset_test_google)[:, start_pc_component:end_pc_component]

    # Find the LSH vectors
    rbp = RandomBinaryProjections('rbp', projection_count, rand_seed=723657345)
    engine = Engine(end_pc_component - start_pc_component, lshashes=[rbp])

    pc_test_nexus = project_LSH(pc_test_nexus, rbp)
    pc_test_aloi = project_LSH(pc_test_aloi, rbp)
    pc_test_google = project_LSH(pc_test_google, rbp)
    return pc_test_nexus, pc_test_aloi, pc_test_google
Example #35
 def __init__(self, distanceMeasure="EuclideanDistance"):
     self.res_similar = ResnetSimilarity()
     dimension = 2048
     rbp = RandomBinaryProjections('rbp', 10)
     self.engine = Engine(dimension, lshashes=[rbp])
     if distanceMeasure == "EuclideanDistance":
         self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb')
     elif distanceMeasure == "Test":
         self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb')
     else:
         self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb')
     self.engine = pickle.load(self.filehandler)
     self.filehandler.close()
     print("Hash Table Loaded")
Example #36
    def __init__(self):
        redis_object = redis.Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)

        # Get hash config from redis
        config = redis_storage.load_hash_configuration('MyHash')

        if config is None:
            # Config is not existing, create hash from scratch, with 5 projections
            self.lshash = RandomBinaryProjections('MyHash', 5)
        else:
            # Config is existing, create hash with None parameters
            self.lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            self.lshash.apply_config(config)
        # print("HERE")

        # Create engine for feature space of 100 dimensions and use our hash.
        # This will set the dimension of the lshash only the first time, not when
        # using the configuration loaded from redis. Use redis storage to store
        # buckets.
        self.engine = Engine(4, lshashes=[self.lshash], storage=redis_storage)
        redis_storage.store_hash_configuration(self.lshash)
Example #37
    def process2(self,vectors1,vectors2,num_bit,bin_width):
        
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        
        # create new key
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        
        self.rbdp = {}

        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x>=0 else '0' for x in new_key])
                if not new_key in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))
Example #38
    def test_hash_memory_storage_rbp(self):
        hash1 = RandomBinaryProjections('testRBPHash', 10)
        hash1.reset(100)

        self.memory.store_hash_configuration(hash1)

        hash2 = RandomBinaryProjections(None, None)
        hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))

        self.assertEqual(hash1.dim, hash2.dim)
        self.assertEqual(hash1.hash_name, hash2.hash_name)
        self.assertEqual(hash1.projection_count, hash2.projection_count)

        for i in range(hash1.normals.shape[0]):
            for j in range(hash1.normals.shape[1]):
                self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
Example #39
 def setUp(self):
     self.rbp = RandomBinaryProjections('testHash', 10)
     self.rbp.reset(100)
Example #40
class DoubleEngine:

    def _build_rdp_engine(self,matrix,rdp,normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create a random binary hash with 10 bits

        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp],storage = MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
            
        return engine
    
        

    def process2(self,vectors1,vectors2,num_bit,bin_width):
        
        # build engine
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp',num_bit,bin_width)
        self.rbp = RandomBinaryProjections('rbp',num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1,self.rdp,self.normals)
        self.engine2 = self._build_rdp_engine(vectors2,self.rdp,self.normals)
        
        # create new key
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        
        self.rbdp = {}

        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)

        keys_int1 = []
        keys_int2 = []

        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)

        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1,key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1,idx1,len(buckets1)))
            for idx2,key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x>=0 else '0' for x in new_key])
                if not new_key in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1,key2))
        
    def build_permute_index(self,num_permutation,beam_size,hamming_beam_size):
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count
        
        # add permutations
        self.permutations = []
        for i in xrange(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)

        # convert current buckets to an array of bitarray
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)

        # build permutation lists
        self.permuted_lists = []
        i = 0
        for p in self.permutations:
            logging.info('Creating Permutation Index: #{}/{}'.format(i,len(self.permutations)))
            i+=1
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c,ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)
        

    def get_neighbour_keys(self,bucket_key,k):
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100 Still really fast

        query_key = bitarray(bucket_key)
        topk = set()
        for i in xrange(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist,query_key,self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key = lambda x : hamming_distance(x,query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self,key1,key2,v):
        #return [(cos_dist,(idx1,idx2))]
        def matrix_list(engine,key):
            # return a matrix and a list of keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for v,key in items:
                m.append(v)
                l.append(int(key))
            m = np.array(m)    
            return m,l
        m1,l1 = matrix_list(self.engine1,key1)
        m2,l2 = matrix_list(self.engine2,key2)
        len1 = len(l1)
        len2 = len(l2)
        # a . v 
        av = np.dot(m1,v)
        av = np.repeat(av,len2).reshape(len1,len2)
        # b . v
        bv = np.dot(m2,v)
        bv = np.repeat(bv,len1).reshape(len2,len1).T
        # numerator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v,2)
        # a.a
        aa = np.sum(m1*m1,axis = 1)
        aa = np.repeat(aa,len2).reshape(len1,len2)
        # b.b
        bb = np.sum(m2*m2,axis = 1)
        bb = np.repeat(bb,len1).reshape(len2,len1).T
        # a.b
        ab = np.dot(m1,m2.T)
        # denominator 
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # distance matrix 
        dism = nomi / deno
        dist = []
        for i in xrange(len1):
            for j in xrange(len2):
                dis = dism[i,j]
                dist.append((dis,(l1[i],l2[j])))
        return dist

    def neighbours2(self,v,n):
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)

        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key,self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)
        
        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print bucket_key, len(comb)
            for key1,key2 in comb:
                dist = self.n2(key1,key2,v)
                dists.extend(dist)

        dists = sorted(dists,key = lambda x: -x[0])
        return dists[:n]
        # If there is no vector filter, just return list of candidates
        return dists