def index_user_vectors():
    #print 'Performing indexing with HashPermutations...'
    global engine_perm
    t0 = time.time()
    #print k_dimen, d_dimen
    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)
    rbp_perm.reset(k_dimen)
    # Create permutations meta-hash
    permutations = HashPermutations('permut')
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}
    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)
    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())
    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)
    # Then update permuted index
    permutations.build_permuted_index()
    t1 = time.time()
def knn(data, k):
    assert k <= len(data) - 1, 'The number of neighbors must be smaller than the data cardinality (minus one)'
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []
    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dimension, lshashes=[rbp], vector_filters=[NearestFilter(k)])
    for i in range(n):
        engine.store_vector(data[i], i)
    for i in range(n):
        N = engine.neighbours(data[i])
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])
    return N, dist, ind
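A minimal usage sketch for knn above, assuming the NearPy names it relies on (Engine, RandomBinaryProjections, NearestFilter) are in scope; the data shape and k are illustrative assumptions, not part of the original snippet.

import numpy as np

# 500 random 32-dimensional row vectors (illustrative only)
data = np.random.randn(500, 32)

# Approximate 5-nearest-neighbour lookup for every row:
# ind[i] holds the stored integer labels of the neighbours of data[i],
# dist[i] the corresponding distances (the query point itself is skipped).
_, dist, ind = knn(data, 5)
print(len(ind), len(ind[0]))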
def index_user_vectors():
    print 'Performing indexing with HashPermutations...'
    global engine_perm
    t0 = time.time()
    print k_dimen, d_dimen
    rbp_perm = RandomBinaryProjections('rbp_perm', d_dimen)
    rbp_perm.reset(k_dimen)
    # Create permutations meta-hash
    permutations = HashPermutations('permut')
    rbp_conf = {'num_permutation': 50, 'beam_size': 10, 'num_neighbour': 250}
    # Add rbp as child hash of permutations hash
    permutations.add_child_hash(rbp_perm, rbp_conf)
    # Create engine
    engine_perm = Engine(k_dimen, lshashes=[permutations], distance=CosineDistance())
    for u in user_vector:
        engine_perm.store_vector(user_vector[u], data=u)
    # Then update permuted index
    permutations.build_permuted_index()
    t1 = time.time()
    print 'Indexing took %f seconds' % (t1 - t0)
def build_environment(config):
    lsh = LSH_sumbeam()
    w2v = MyWord2Vec()
    w2v.load(config)
    lsh.w2v = w2v
    # combine top 20k noun and 20k adj into a single wordlist
    topn = config.getint('space', 'topn')
    words = w2v.model.vocab.keys()
    wordlist = WordList()
    wordlist.words = words
    wordlist.filter_frequency(w2v, topn)
    wordlist.build_index()
    # build a matrix
    matrix = lsh._list2matrix_w2v(wordlist, lsh.w2v)
    # build an engine
    dim = np.shape(matrix)[1]
    num_bits = 15
    rbp = RandomBinaryProjections('rbp', num_bits)
    rbp.reset(dim)
    engine = lsh._build_rbp_permute_engine(matrix, rbp)
    num_permutation = 50
    beam_size = 50
    num_neighbour = 100
    engine.build_permute_index(num_permutation, beam_size, num_neighbour)
    return lsh, engine, matrix, wordlist
def __init__(self, dimension, n_bit, alpha):
    self.n_bit = n_bit
    self.dim = dimension
    self.alpha = alpha
    self.sample_space = 2 ** n_bit
    self.rbp = RandomBinaryProjections('rbp', self.n_bit)
    self.engine = Engine(dimension, lshashes=[self.rbp])
def build_index_sumbeam(self, num_bits):
    # hash the original vectors in matrix1 and matrix2 into engine1 and engine2
    self.dim = np.shape(self.matrix1)[1]
    rbp = RandomBinaryProjections('rbp', num_bits)
    rbp.reset(self.dim)
    self.rbp = rbp
    engine1 = self._build_rbp_permute_engine(self.matrix1, rbp)
    engine2 = self._build_rbp_permute_engine(self.matrix2, rbp)
    self.engine1 = engine1
    self.engine2 = engine2
def get_hash_config(redis_storage, name):
    config = redis_storage.load_hash_configuration(name)
    if config is not None:
        # Config exists, create hash with None parameters
        lshash = RandomBinaryProjections(None, None, rand_seed=123)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    else:
        raise RuntimeError("Hash Config not found")
    return lshash
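A hedged sketch of the matching store step for get_hash_config above, using only calls that appear elsewhere in these examples; the Redis connection settings, the 'my_hash' name, and the 100-dimensional reset are assumptions for illustration.

from redis import Redis
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage

redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))

# Store a 10-bit hash for 100-dimensional vectors under a chosen name
lshash = RandomBinaryProjections('my_hash', 10, rand_seed=123)
lshash.reset(100)
redis_storage.store_hash_configuration(lshash)

# Later, possibly in another process, rebuild the identical hash
same_hash = get_hash_config(redis_storage, 'my_hash')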
def generate_lsh_fn(self):
    self.lsh_fn = []
    for i in range(self.L):
        rbp = RandomBinaryProjections('rbp', self.K)
        rbp.reset(self.dim)
        # def fn(x):
        #     mm = mmh3.hash(rbp.hash_vector(x)[0])
        #     return mm % self.R
        def fn(x):
            return 1
        self.lsh_fn.append(fn)
def __init__(self, *args):
    """
    Initializes the dictionary of reduced vectors which represent conference members.

    :param args[0] - data: Data class, it represents data from dblp.xml
    :param args[1] - dim: Output dimension for LSH.
    """
    print('Initialization recommender...')
    self.data = args[0]
    self.reduced_conferences = {}
    rbp = RandomBinaryProjections('rbp', args[1])
    rbp.reset(self.data.members_set.__len__())
    cnt = 0
class TestRandomBinaryProjections(unittest.TestCase):

    def setUp(self):
        self.rbp = RandomBinaryProjections('testHash', 10)
        self.rbp.reset(100)

    def test_hash_format(self):
        h = self.rbp.hash_vector(numpy.random.randn(100))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic(self):
        x = numpy.random.randn(100)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])

    def test_hash_format_sparse(self):
        h = self.rbp.hash_vector(scipy.sparse.rand(100, 1, density=0.1))
        self.assertEqual(len(h), 1)
        self.assertEqual(type(h[0]), type(''))
        self.assertEqual(len(h[0]), 10)
        for c in h[0]:
            self.assertTrue(c == '1' or c == '0')

    def test_hash_deterministic_sparse(self):
        x = scipy.sparse.rand(100, 1, density=0.1)
        first_hash = self.rbp.hash_vector(x)[0]
        for k in range(100):
            self.assertEqual(first_hash, self.rbp.hash_vector(x)[0])
class RBP_hasher(object):

    def __init__(self, dimension, n_bit, alpha):
        self.n_bit = n_bit
        self.dim = dimension
        self.alpha = alpha
        self.sample_space = 2 ** n_bit
        self.rbp = RandomBinaryProjections('rbp', self.n_bit)
        self.engine = Engine(dimension, lshashes=[self.rbp])

    @property
    def params(self):
        return self.rbp.get_config()

    def load(self, config):
        self.rbp.apply_config(config)

    def _string2int(self, s):
        return int(s, 2)

    def __call__(self, v):
        '''
        Convert the returned string into an integer.
        Return a dict based off the weights.
        '''
        s = self.rbp.hash_vector(v)[0]
        weights = {
            self._string2int(s): 1.0,
        }

        if not self.alpha:
            return weights

        # If alpha is non-zero, deposit weight into nearby bins
        slist = map(bool, map(int, list(s)))

        for n in range(len(s)):
            s2list = slist[:]
            s2list[n] = not slist[n]
            s2list = map(str, map(int, s2list))
            s2 = ''.join(s2list)
            idx = self._string2int(s2)
            weights[idx] = self.alpha

        return weights
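A short usage sketch for RBP_hasher above (Python 2, since __call__ slices the result of map); the dimension, bit count, and alpha value are illustrative assumptions.

import numpy as np

# 8-bit hash over 32-dimensional vectors; alpha deposits extra weight into
# the 8 buckets whose keys differ from the direct bucket by exactly one bit.
hasher = RBP_hasher(dimension=32, n_bit=8, alpha=0.1)
weights = hasher(np.random.randn(32))

print(len(weights))           # 1 + n_bit = 9 buckets receive weight
print(max(weights.values()))  # 1.0 for the directly-hashed bucket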
def build_from_document_corpus(corpus, model_type, model_name,
                               progress=False, project_events=False, include_events=False,
                               hash_size=50, log=None, redis_port=6379, filter_chains=None):
    if log is None:
        log = get_console_logger("neighbour indexing")

    log.info("Loading model %s/%s" % (model_type, model_name))
    model = NarrativeChainModel.load_by_type(model_type, model_name)
    vector_size = model.vector_size

    db_filename = "vectors.rdb"
    # Make sure the model directory exists, so we can get the Redis server pointing there
    model_dir = model.get_model_directory(model_name)
    # If the Redis stored db already exists, remove it, so that we don't end up adding to old data
    if os.path.exists(os.path.join(model_dir, db_filename)):
        os.remove(os.path.join(model_dir, db_filename))

    log.info("Storing vectors in %s" % os.path.join(model_dir, db_filename))
    log.info("Preparing neighbour search hash")
    # Create binary hash
    binary_hash = RandomBinaryProjections("%s:%s_binary_hash" % (model_type, model_name), hash_size)

    log.info("Connecting to Redis server on port %d" % redis_port)
    # Prepare an engine for storing the vectors in
    try:
        redis = Redis(host='localhost', port=redis_port, db=0)
    except ConnectionError, e:
        raise RuntimeError("could not connect to redis server on port %s. Is it running? (%s)" % (redis_port, e))
def load_search_engine():
    global engine

    # read in the data files
    data = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t')
    data_objects = pandas.read_csv(os.path.join('data', 'object_features.tsv'), sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp], distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i].replace('images\\\\', '').replace('images\\', '').replace('images/', ''))
    for i in range(0, len(data_objects)):
        engine.store_vector(
            np.asarray(data_objects['features'][i].split(',')).astype('float64'),
            data_objects['filename'][i].replace('images\\\\', '').replace('images\\', '').replace('images/', ''))

    return engine
def load_engine(sdf_files, feature_matrix, dimension):
    """
    Converts the given sdf_files into instances of the sdf_class, then loads
    them into a nearpy Engine.

    Parameters
        sdf_files: a list of sdf files, with pathnames relative to the current
            directory. Intended to be fed in from `find_sdf(root_dir)`
        feature_matrix: matrix of training data features to be loaded into the engine
        dimension: dimensionality of the feature vectors used for LSH
            (here: number of cluster centers)

    Returns
        engine: instance of a nearpy engine with all of sdf_files loaded

    Sample Usage
        >>> engine = load_engine(sdf_files, feature_matrix, dimension)
    """
    # dimension here can be altered as well
    rbp = RandomBinaryProjections('rbp', 10)
    engine = Engine(dimension, lshashes=[rbp])

    count = 0
    for index, file_ in enumerate(sdf_files):
        #print file_
        if count % 100 == 0:
            print 'Converted %d files' % (count)
        converted = SDF(file_)
        converted.set_feature_vector(feature_matrix[index])
        converted.add_to_nearpy_engine(engine)
        count += 1
    return engine
def setUp(self):
    logging.basicConfig(level=logging.WARNING)
    numpy.random.seed(11)

    # Create permutations meta-hash
    self.permutations = HashPermutations('permut')

    # Create binary hash as child hash
    rbp = RandomBinaryProjections('rbp1', 4, rand_seed=19)
    rbp_conf = {
        'num_permutation': 50,
        'beam_size': 10,
        'num_neighbour': 100
    }

    # Add rbp as child hash of permutations hash
    self.permutations.add_child_hash(rbp, rbp_conf)

    # Create engine with meta hash and cosine distance
    self.engine_perm = Engine(200, lshashes=[self.permutations], distance=CosineDistance())

    # Create engine without permutation meta-hash
    self.engine = Engine(200, lshashes=[rbp], distance=CosineDistance())
def build_content_sim_relation_text_lsa(network, signatures):
    def get_nid_gen(signatures):
        for nid, sig in signatures:
            yield nid

    docs = []
    for nid, e in signatures:
        docs.append(' '.join(e))

    # this may become redundant if we exploit the store characteristics
    tfidf = da.get_tfidf_docs(docs)

    print("TF-IDF shape before LSA: " + str(tfidf.shape))
    st = time.time()
    tfidf = lsa_dimensionality_reduction(tfidf)
    et = time.time()
    print("TF-IDF shape after LSA: " + str(tfidf.shape))
    print("Time to compute LSA: {0}".format(str(et - st)))

    lsh_projections = RandomBinaryProjections('default', 10000)
    #lsh_projections = RandomDiscretizedProjections('rnddiscretized', 1000, 2)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    text_engine = index_in_text_engine(nid_gen, tfidf, lsh_projections, tfidf_is_dense=True)

    nid_gen = get_nid_gen(signatures)  # to preserve the order nid -> signature
    create_sim_graph_text(nid_gen, network, text_engine, tfidf, Relation.CONTENT_SIM, tfidf_is_dense=True)
def RunAnnNearpy(q):
    totalTimer = Timer()

    # Load input dataset.
    Log.Info("Loading dataset", self.verbose)
    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
    train, label = SplitTrainData(self.dataset)

    with totalTimer:
        # Get all the parameters.
        try:
            # Perform Approximate Nearest-Neighbors
            dimension = train.shape[1]
            rbp = RandomBinaryProjections('rbp', 10)
            engine = Engine(dimension, lshashes=[rbp])
            for i in range(len(train)):
                engine.store_vector(train[i], 'data_%d' % i)
            for i in range(len(queryData)):
                v = engine.neighbours(queryData[i])
        except Exception as e:
            Log.Info(e)
            q.put(e)
            return -1

    time = totalTimer.ElapsedTime()
    q.put(time)
    return time
def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)
    X_train_normalized = []
    for i in range(len(X_train)):
        train_example = X_train[i]
        element = ((train_example / np.linalg.norm(train_example)).tolist(),
                   y_train[i].tolist())
        X_train_normalized.append(element)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # perform hashing for train examples
    for train_example in X_train:
        engine.store_vector(train_example)

    labels = []
    for test_example in X_test:
        neighbors = engine.neighbours(test_example)
        labels.append([
            train_example[1] for train_example in X_train_normalized
            if set(neighbors[0][0]) == set(train_example[0])
        ])
    return labels
def LSH(Layers, K):
    lsh_vectors = database[:, LSH_VECT_START_COL:]
    video_data = database[:, 0:5]
    num_rows, num_cols = lsh_vectors.shape
    dimension = num_cols

    rbp = list()
    for i in range(Layers):
        rbp.append(RandomBinaryProjections(str(i), K))

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=rbp)

    # Index all vectors (set their data to a unique string)
    for index in range(num_rows):
        v = lsh_vectors[index, :]
        meta_data = str(index) + ',' + str(int(video_data[index, 0])) + ', ' + str(int(video_data[index, 1])) \
            + ', ' + str(int(video_data[index, 2])) + ', ' + str(video_data[index, 3]) + ', ' + str(video_data[index, 4])
        engine.store_vector(v, meta_data)

    printOutput(engine.storage.buckets)
    print 'stop'
def __init__(self, dim, lshashes=None,
             distance=None,
             fetch_vector_filters=None,
             vector_filters=None,
             storage=None):
    """ Keeps the configuration. """
    if lshashes is None:
        lshashes = [RandomBinaryProjections('default', 10)]
    self.lshashes = lshashes
    if distance is None:
        distance = EuclideanDistance()
    self.distance = distance
    if vector_filters is None:
        vector_filters = [NearestFilter(10)]
    self.vector_filters = vector_filters
    if fetch_vector_filters is None:
        fetch_vector_filters = [UniqueFilter()]
    self.fetch_vector_filters = fetch_vector_filters
    if storage is None:
        storage = MemoryStorage()
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)

    print('*** engine init done ***')
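A minimal usage sketch of the engine constructor above with its documented defaults; the 20-dimensional random data and the 'doc_%d' labels are illustrative assumptions.

import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

# Equivalent to Engine(20) given the defaults above: a single 10-bit
# RandomBinaryProjections hash, EuclideanDistance, NearestFilter(10),
# UniqueFilter, and in-memory bucket storage.
engine = Engine(20, lshashes=[RandomBinaryProjections('default', 10)])

for i in range(1000):
    engine.store_vector(np.random.randn(20), 'doc_%d' % i)

# neighbours() returns (vector, data, distance) tuples for the closest matches
for vector, data, distance in engine.neighbours(np.random.randn(20)):
    print('%s %.4f' % (data, distance))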
def __init__(self, num_features, projection_count=30):
    self.num_features = num_features
    #self.rbp = RandomDiscretizedProjections('default', projection_count, bin_width=100)
    self.rbp = RandomBinaryProjections('default', projection_count)
    #self.rbp = RandomBinaryProjectionTree('default', projection_count, 1)
    self.text_engine = Engine(num_features, lshashes=[self.rbp], distance=CosineDistance())
def __init__(self, x):
    self.n, self.f = x.shape
    # Use NearPy lsh for fast ann
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = x[i, :]
        self.engine.store_vector(v, i)
def main(args):
    """ Main entry. """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the top-k closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit) for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes, vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, data.nbae))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def fit(self, X):
    b = self.params['b']
    self.n, self.f = X.shape
    # Use NearPy lsh for fast ann
    rbp = RandomBinaryProjections('rbp', b)
    self.engine = Engine(self.f, lshashes=[rbp])
    for i in np.arange(self.n):
        v = np.squeeze(np.copy(X[i, :]))
        self.engine.store_vector(v, i)
def k_nn_lsh_2(k, word, decade_matrix, index_dict):
    num_rows = decade_matrix.get_shape()[0]
    print("the number of rows: " + str(num_rows))
    rbp = RandomBinaryProjections('rbp', 256)
    engine = Engine(num_rows, lshashes=[rbp])
    for i in range(num_rows):
        print(i)
        engine.store_vector(decade_matrix.getrow(i), "data_%d" % i)
    return engine.neighbours(word)
def __init__(self, emb_path, feature='title'):
    self.emb_path = emb_path
    self.feature = feature
    self.data_df = None
    self.tfidf = Vectorizer(**get_tfidf_params())
    self.fasttext_embedder = None
    self.fasttext_tfidf = None
    self.dimension = 300
    rbp = RandomBinaryProjections('rbp', 2)
    self.engine = Engine(self.dimension, lshashes=[rbp])
def build_index(self, X):
    f = X.shape[1]
    n = X.shape[0]
    rbp = RandomBinaryProjections('rbp', 32)
    engine = Engine(f, lshashes=[rbp])
    for i in range(n):
        engine.store_vector(X[i], 'data_%d' % i)
    return engine
def __configure_calculator(self, point_list, point):
    # Dimension of our vector space
    self.__dimension__ = 2
    # Create a random binary hash with 10 bits
    self.__rbp__ = RandomBinaryProjections('rbp', 10)
    # Create engine with pipeline configuration
    self.__engine__ = Engine(self.__dimension__, lshashes=[self.__rbp__])
    self.set_searching_point_list(point_list)
    self.set_query_point(point)
def __init__(self, data_points, sim_threshold=0.5, num_vectors=3):
    self.data_points = data_points
    self.point_num = self.data_points.shape[0]
    self.dimension = self.data_points.shape[1] - 1
    # Create a random binary hash with num_vectors bits
    self.rbp = RandomBinaryProjections('rbp', num_vectors, rand_seed=42)
    self.engine = Engine(
        self.dimension,
        lshashes=[self.rbp],
        vector_filters=[DistanceThresholdFilter(1 - sim_threshold)])
    for i in range(self.point_num):
        self.engine.store_vector(self.data_points[i, 1:], '%d' % i)
def load_hashmap(self):
    # Create redis storage adapter
    # need to start redis service
    redis_object = Redis(host='localhost', port=6379, db=14)
    redis_storage = RedisStorage(redis_object)
    try:
        config = redis_storage.load_hash_configuration('test')
        lshash = RandomBinaryProjections(None, None)
        lshash.apply_config(config)
    except:
        # Config is not existing, create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    nearest = NearestFilter(self.nn)
    # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(self.feature_size,
                         lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage,
                         distance=CosineDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
def loadHashmap(self, feature_size=129, result_n=1000):  # these parameters are not used consistently below
    '''
    feature_size: dimensionality of the hash (feature) space
    result_n: how many nearest neighbours to return
    '''
    # Create redis storage adapter
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    try:
        # Get hash config from redis
        config = redis_storage.load_hash_configuration('test')
        # Config exists, create hash with None parameters
        lshash = RandomBinaryProjections(None, None)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    except:
        # Config is not existing, create hash from scratch
        lshash = RandomBinaryProjections('test', 0)

    # Create engine for the feature space and use our hash.
    # This will set the dimension of the lshash only the first time, not when
    # using the configuration loaded from redis. Use redis storage to store
    # buckets.
    nearest = NearestFilter(result_n)
    #self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(feature_size,
                         lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage,
                         distance=EuclideanDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
def fit(self, X, y=None, hash="randbinary"):
    X = np.array(X)
    assert len(X.shape) == 2, "X not 2-rank"
    dimension = X.shape[-1]
    if hash == "randbinary":
        rbp = RandomBinaryProjections('rbp', 10)
    elif hash == "pcabinary":
        rbp = PCABinaryProjections('rbp', 10, training_set=X)
    self.engine = Engine(dimension, lshashes=[rbp])
    index = 0
    for x in X:
        self.engine.store_vector(x, str(index))
        index += 1
def data_for_layer(basic_path, layer_name, num_folds, experiment,
                   projection_count, start_pc_component, end_pc_component):
    # Read datasets
    basic_path_layer = os.path.join(basic_path, layer_name)

    dataset_files = "ALOI_train_20400.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset_aloi = hd['dataset_1']
    dataset_train_aloi, dataset_test_aloi = split_data_to_test_train(
        dataset_aloi, num_folds, experiment)
    del dataset_aloi

    transformer = TransformImagesPCA(n_components=500)
    transformer.learn_pcs(dataset_train_aloi)
    del dataset_train_aloi

    dataset_files = "Google_train_6675.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset_google = hd['dataset_1']
    dataset_train_google, dataset_test_google = split_data_to_test_train(
        dataset_google, num_folds, experiment)
    del dataset_google

    transformer.learn_pcs(dataset_train_google)
    del dataset_train_google

    dataset_files = "Nexus_train_1180.h5"
    hd = h5py.File(os.path.join(basic_path_layer, "full_size", dataset_files), 'r')
    dataset = hd['dataset_1']
    dataset_train, dataset_test = split_data_to_test_train(
        dataset, num_folds, experiment)
    del dataset

    transformer.learn_pcs(dataset_train)
    del dataset_train

    pc_test_nexus = transformer.transform(dataset_test)[:, start_pc_component:end_pc_component]
    pc_test_aloi = transformer.transform(dataset_test_aloi)[:, start_pc_component:end_pc_component]
    pc_test_google = transformer.transform(dataset_test_google)[:, start_pc_component:end_pc_component]

    # Find the LSH vectors
    rbp = RandomBinaryProjections('rbp', projection_count, rand_seed=723657345)
    engine = Engine(end_pc_component - start_pc_component, lshashes=[rbp])

    pc_test_nexus = project_LSH(pc_test_nexus, rbp)
    pc_test_aloi = project_LSH(pc_test_aloi, rbp)
    pc_test_google = project_LSH(pc_test_google, rbp)

    return pc_test_nexus, pc_test_aloi, pc_test_google
def __init__(self, distanceMeasure="EuclideanDistance"):
    self.res_similar = ResnetSimilarity()
    dimension = 2048
    rbp = RandomBinaryProjections('rbp', 10)
    self.engine = Engine(dimension, lshashes=[rbp])

    if distanceMeasure == "EuclideanDistance":
        self.filehandler = open("hashed_objects/hashed_object_euclidean.pkl", 'rb')
    elif distanceMeasure == "Test":
        self.filehandler = open("hashed_objects/hashed_object_example.pkl", 'rb')
    else:
        self.filehandler = open("hashed_objects/hashed_object_Cosine.pkl", 'rb')

    self.engine = pickle.load(self.filehandler)
    self.filehandler.close()
    print("Hash Table Loaded")
def __init__(self):
    redis_object = redis.Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    # Get hash config from redis
    config = redis_storage.load_hash_configuration('MyHash')

    if config is None:
        # Config is not existing, create hash from scratch, with 5 projections
        self.lshash = RandomBinaryProjections('MyHash', 5)
    else:
        # Config is existing, create hash with None parameters
        self.lshash = RandomBinaryProjections(None, None)
        # Apply configuration loaded from redis
        self.lshash.apply_config(config)

    # print("HERE")
    # Create engine for the feature space and use our hash.
    # This will set the dimension of the lshash only the first time, not when
    # using the configuration loaded from redis. Use redis storage to store
    # buckets.
    self.engine = Engine(4, lshashes=[self.lshash], storage=redis_storage)
    redis_storage.store_hash_configuration(self.lshash)
def process2(self, vectors1, vectors2, num_bit, bin_width):
    # build engines
    self.dimension = np.shape(vectors1)[1]
    self.rdp = RandomDiscretizedProjections('rdp', num_bit, bin_width)
    self.rbp = RandomBinaryProjections('rbp', num_bit)
    self.rdp.reset(self.dimension)
    self.rbp.reset(self.dimension)
    self.normals = self.rdp.vectors
    self.rbp.normals = self.normals
    self.engine1 = self._build_rdp_engine(vectors1, self.rdp, self.normals)
    self.engine2 = self._build_rdp_engine(vectors2, self.rdp, self.normals)

    # create new keys
    buckets1 = self.engine1.storage.buckets['rdp']
    buckets2 = self.engine2.storage.buckets['rdp']
    self.rbdp = {}
    print 'len of buckets1', len(buckets1)
    print 'len of buckets2', len(buckets2)

    keys_int1 = []
    keys_int2 = []
    for key in buckets1:
        ks = [int(x) for x in key.split('_')]
        keys_int1.append(ks)
    for key in buckets2:
        ks = [int(x) for x in key.split('_')]
        keys_int2.append(ks)

    for idx1, key1 in enumerate(buckets1):
        if idx1 % 100 == 0:
            logging.info('{} {}/{}'.format(key1, idx1, len(buckets1)))
        for idx2, key2 in enumerate(buckets2):
            ks1 = keys_int1[idx1]
            ks2 = keys_int2[idx2]
            new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
            new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
            if new_key not in self.rbdp:
                self.rbdp[new_key] = []
            self.rbdp[new_key].append((key1, key2))
def test_hash_memory_storage_rbp(self):
    hash1 = RandomBinaryProjections('testRBPHash', 10)
    hash1.reset(100)

    self.memory.store_hash_configuration(hash1)

    hash2 = RandomBinaryProjections(None, None)
    hash2.apply_config(self.memory.load_hash_configuration('testRBPHash'))

    self.assertEqual(hash1.dim, hash2.dim)
    self.assertEqual(hash1.hash_name, hash2.hash_name)
    self.assertEqual(hash1.projection_count, hash2.projection_count)

    for i in range(hash1.normals.shape[0]):
        for j in range(hash1.normals.shape[1]):
            self.assertEqual(hash1.normals[i, j], hash2.normals[i, j])
def setUp(self):
    self.rbp = RandomBinaryProjections('testHash', 10)
    self.rbp.reset(100)
class DoubleEngine:

    def _build_rdp_engine(self, matrix, rdp, normals):
        # Dimension of our vector space
        dimension = np.shape(matrix)[1]
        n = np.shape(matrix)[0]
        # Create engine with pipeline configuration
        engine = Engine(dimension, lshashes=[rdp], storage=MemoryStorage())
        rdp.vectors = normals

        for index in range(n):
            v = matrix[index]
            engine.store_vector(v, '%d' % index)
        return engine

    def process2(self, vectors1, vectors2, num_bit, bin_width):
        # build engines
        self.dimension = np.shape(vectors1)[1]
        self.rdp = RandomDiscretizedProjections('rdp', num_bit, bin_width)
        self.rbp = RandomBinaryProjections('rbp', num_bit)
        self.rdp.reset(self.dimension)
        self.rbp.reset(self.dimension)
        self.normals = self.rdp.vectors
        self.rbp.normals = self.normals
        self.engine1 = self._build_rdp_engine(vectors1, self.rdp, self.normals)
        self.engine2 = self._build_rdp_engine(vectors2, self.rdp, self.normals)

        # create new keys
        buckets1 = self.engine1.storage.buckets['rdp']
        buckets2 = self.engine2.storage.buckets['rdp']
        self.rbdp = {}
        print 'len of buckets1', len(buckets1)
        print 'len of buckets2', len(buckets2)

        keys_int1 = []
        keys_int2 = []
        for key in buckets1:
            ks = [int(x) for x in key.split('_')]
            keys_int1.append(ks)
        for key in buckets2:
            ks = [int(x) for x in key.split('_')]
            keys_int2.append(ks)

        for idx1, key1 in enumerate(buckets1):
            if idx1 % 100 == 0:
                logging.info('{} {}/{}'.format(key1, idx1, len(buckets1)))
            for idx2, key2 in enumerate(buckets2):
                ks1 = keys_int1[idx1]
                ks2 = keys_int2[idx2]
                new_key = [ks1[i] + ks2[i] for i in xrange(len(ks1))]
                new_key = ''.join(['1' if x >= 0 else '0' for x in new_key])
                if new_key not in self.rbdp:
                    self.rbdp[new_key] = []
                self.rbdp[new_key].append((key1, key2))

    def build_permute_index(self, num_permutation, beam_size, hamming_beam_size):
        self.num_permutation = num_permutation
        self.hamming_beam_size = hamming_beam_size
        self.beam_size = beam_size
        self.projection_count = self.rbp.projection_count

        # add permutations
        self.permutations = []
        for i in xrange(self.num_permutation):
            p = Permutation(self.projection_count)
            self.permutations.append(p)

        # convert current buckets to an array of bitarrays
        buckets = self.rbdp
        original_keys = []
        for key in buckets:
            ba = bitarray(key)
            original_keys.append(ba)

        # build permutation lists
        self.permuted_lists = []
        i = 0
        for p in self.permutations:
            logging.info('Creating Permutation Index: #{}/{}'.format(i, len(self.permutations)))
            i += 1
            permuted_list = []
            for ba in original_keys:
                c = ba.copy()
                p.permute(c)
                permuted_list.append((c, ba))
            # sort the list
            permuted_list = sorted(permuted_list)
            self.permuted_lists.append(permuted_list)

    def get_neighbour_keys(self, bucket_key, k):
        # O( np*beam*log(np*beam) )
        # np = number of permutations
        # beam = self.beam_size
        # np * beam == 200 * 100, still really fast
        query_key = bitarray(bucket_key)
        topk = set()
        for i in xrange(len(self.permutations)):
            p = self.permutations[i]
            plist = self.permuted_lists[i]
            candidates = p.search_revert(plist, query_key, self.beam_size)
            topk = topk.union(set(candidates))
        topk = list(topk)
        topk = sorted(topk, key=lambda x: hamming_distance(x, query_key))
        topk_bin = [x.to01() for x in topk[:k]]
        return topk_bin

    def n2(self, key1, key2, v):
        # return [(cos_dist, (idx1, idx2))]
        def matrix_list(engine, key):
            # return a matrix and a list of keys
            items = engine.storage.buckets['rdp'][key]
            m = []
            l = []
            for v, key in items:
                m.append(v)
                l.append(int(key))
            m = np.array(m)
            return m, l

        m1, l1 = matrix_list(self.engine1, key1)
        m2, l2 = matrix_list(self.engine2, key2)
        len1 = len(l1)
        len2 = len(l2)

        # a . v
        av = np.dot(m1, v)
        av = np.repeat(av, len2).reshape(len1, len2)
        # b . v
        bv = np.dot(m2, v)
        bv = np.repeat(bv, len1).reshape(len2, len1).T
        # numerator = a.v + b.v
        nomi = av + bv
        # |v|
        nv = np.linalg.norm(v, 2)
        # a.a
        aa = np.sum(m1 * m1, axis=1)
        aa = np.repeat(aa, len2).reshape(len1, len2)
        # b.b
        bb = np.sum(m2 * m2, axis=1)
        bb = np.repeat(bb, len1).reshape(len2, len1).T
        # a.b
        ab = np.dot(m1, m2.T)
        # denominator
        deno = np.sqrt(aa + bb + 2 * ab) * nv
        # distance matrix
        dism = nomi / deno

        dist = []
        for i in xrange(len1):
            for j in xrange(len2):
                dis = dism[i, j]
                dist.append((dis, (l1[i], l2[j])))
        return dist

    def neighbours2(self, v, n):
        # one important assumption: just have one hash method
        # Collect candidates from all buckets from all hashes
        candidates = []
        direct_bucket_keys = self.rbp.hash_vector(v)

        # Get the neighbours of candidate_bucket_keys
        candidate_bucket_keys = []
        for bucket_key in direct_bucket_keys:
            neighbour_keys = self.get_neighbour_keys(bucket_key, self.hamming_beam_size)
            candidate_bucket_keys.extend(neighbour_keys)

        dists = []
        for bucket_key in candidate_bucket_keys:
            comb = self.rbdp[bucket_key]
            print bucket_key, len(comb)
            for key1, key2 in comb:
                dist = self.n2(key1, key2, v)
                dists.extend(dist)
        dists = sorted(dists, key=lambda x: -x[0])
        return dists[:n]

        # If there is no vector filter, just return list of candidates
        return dists
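The n2 method above scores a query v against every pairwise sum a + b drawn from the two buckets, using cos(a + b, v) = (a.v + b.v) / (|v| * sqrt(a.a + b.b + 2 a.b)), so the sums never have to be materialised. A small self-contained NumPy check of that identity (the vector length 16 is an arbitrary choice):

import numpy as np

rng = np.random.RandomState(0)
a, b, v = rng.randn(16), rng.randn(16), rng.randn(16)

# Direct cosine of the sum against v
direct = np.dot(a + b, v) / (np.linalg.norm(a + b) * np.linalg.norm(v))

# Expanded form used by n2: |a + b|^2 = a.a + b.b + 2 a.b
expanded = (np.dot(a, v) + np.dot(b, v)) / (
    np.sqrt(np.dot(a, a) + np.dot(b, b) + 2 * np.dot(a, b)) * np.linalg.norm(v))

assert np.isclose(direct, expanded)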