def test_nearpy(X_train, y_train, X_test, k):
    # We are looking for the k closest neighbours
    nearest = NearestFilter(k)

    engine = Engine(X_train.shape[1],
                    lshashes=[RandomBinaryProjections('default', 10)],
                    distance=CosineDistance(),
                    vector_filters=[nearest])

    # Hash the normalized training examples, attaching the label as data
    for train_example, train_label in zip(X_train, y_train):
        normalized = train_example / np.linalg.norm(train_example)
        engine.store_vector(normalized, train_label.tolist())

    labels = []
    for test_example in X_test:
        # Each neighbour is a (vector, data, distance) tuple; collect the labels
        neighbours = engine.neighbours(test_example)
        labels.append([neighbour[1] for neighbour in neighbours])
    return labels
def __init__(self, feature_file, dimension, neighbour, lsh_project_num):
    self.feature_file = feature_file
    self.dimension = dimension
    self.neighbour = neighbour
    self.face_feature = defaultdict(str)
    self.ground_truth = defaultdict(int)

    # Create permutations meta-hash
    permutations2 = HashPermutationMapper('permut2')

    tmp_feature = defaultdict(str)
    with open(feature_file, 'rb') as f:
        reader = csv.reader(f, delimiter=' ')
        for name, feature in reader:
            tmp_feature[name] = feature

    matrix = []
    label = []
    for item in tmp_feature.keys():
        v = map(float, tmp_feature[item].split(','))
        matrix.append(np.array(v))
        label.append(item)
    random.shuffle(matrix)
    print 'PCA matrix: ', len(matrix)

    rbp_perm2 = PCABinaryProjections('testPCABPHash', lsh_project_num, matrix)
    permutations2.add_child_hash(rbp_perm2)

    # Create engine
    nearest = NearestFilter(self.neighbour)
    self.engine = Engine(self.dimension, lshashes=[permutations2],
                         distance=CosineDistance(), vector_filters=[nearest])
def loadHashmap(self, feature_size=129, result_n=1000):  # these parameters are unused by callers
    '''
    feature_size: dimensionality of the hash space
    result_n: how many nearest neighbours to return
    '''
    # Create redis storage adapter
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    try:
        # Get hash config from redis
        config = redis_storage.load_hash_configuration('test')
        # Config exists: create hash with None parameters
        lshash = RandomBinaryProjections(None, None)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    except:
        # Config does not exist: create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    # Create engine for the feature space and use our hash.
    # This will set the dimension of the lshash only the first time, not when
    # using the configuration loaded from redis. Use redis storage to store
    # buckets.
    nearest = NearestFilter(result_n)
    # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(feature_size, lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage, distance=EuclideanDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
def __init__(self, dim, lshashes=None,
             distance=None,
             fetch_vector_filters=None,
             vector_filters=None,
             storage=None):
    """ Keeps the configuration. """
    if lshashes is None:
        lshashes = [RandomBinaryProjections('default', 10)]
    self.lshashes = lshashes
    if distance is None:
        distance = EuclideanDistance()
    self.distance = distance
    if vector_filters is None:
        vector_filters = [NearestFilter(10)]
    self.vector_filters = vector_filters
    if fetch_vector_filters is None:
        fetch_vector_filters = [UniqueFilter()]
    self.fetch_vector_filters = fetch_vector_filters
    if storage is None:
        storage = MemoryStorage()
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)

    print('*** engine init done ***')
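For reference, a minimal round trip with this constructor's defaults (the dimension and item count below are arbitrary): store vectors with an identifier as data, then query; each result is a (vector, data, distance) tuple.

import numpy
from nearpy import Engine

engine = Engine(100)  # defaults: RandomBinaryProjections('default', 10),
                      # EuclideanDistance, NearestFilter(10), MemoryStorage

for i in range(1000):
    engine.store_vector(numpy.random.randn(100), 'item_%d' % i)

for vector, data, distance in engine.neighbours(numpy.random.randn(100)):
    print(data, distance)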
def load_hashmap(self):
    # Create redis storage adapter
    # (the redis service needs to be running)
    redis_object = Redis(host='localhost', port=6379, db=14)
    redis_storage = RedisStorage(redis_object)
    try:
        config = redis_storage.load_hash_configuration('test')
        lshash = RandomBinaryProjections(None, None)
        lshash.apply_config(config)
    except:
        # Config does not exist: create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    nearest = NearestFilter(self.nn)
    # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(self.feature_size, lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage, distance=CosineDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
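Both Redis-backed loaders above use try/except to detect a missing configuration; since RedisStorage.load_hash_configuration returns None when nothing is stored under that name (which is what makes apply_config fail inside the try blocks), an explicit check works too. A minimal sketch of the load-or-create pattern, with an illustrative hash name and projection count:

from redis import Redis
from nearpy.hashes import RandomBinaryProjections
from nearpy.storage import RedisStorage

def load_or_create_hash(name='test', projection_count=10):
    redis_storage = RedisStorage(Redis(host='localhost', port=6379, db=0))
    config = redis_storage.load_hash_configuration(name)
    if config is not None:
        # Reuse the persisted projections so hashes stay stable across runs
        lshash = RandomBinaryProjections(None, None)
        lshash.apply_config(config)
    else:
        # First run: create fresh random projections
        lshash = RandomBinaryProjections(name, projection_count)
    return lshash, redis_storage

As in both snippets, store_hash_configuration(lshash) should be called only after the Engine has initialized the hash, so the persisted configuration includes the generated projections.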
def test_random_discretized_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # First get recall and precision for one 1-dim random hash
    rdp = RandomDiscretizedProjections('rdp', 1, 0.01)
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall1, precision1, searchtime1))

    # Then get recall and precision for one 2-dim random hash
    rdp = RandomDiscretizedProjections('rdp', 2, 0.2)
    engine = Engine(dim, lshashes=[rdp], vector_filters=[nearest])
    result = exp.perform_experiment([engine])

    recall2 = result[0][0]
    precision2 = result[0][1]
    searchtime2 = result[0][2]

    print('\nRecall RDP: %f, Precision RDP: %f, SearchTime RDP: %f\n' %
          (recall2, precision2, searchtime2))

    # Many things are random here, but the precision should increase
    # with dimension
    self.assertTrue(precision2 > precision1)
def __init__(self, metric, n_bits, hash_counts):
    self._n_bits = n_bits
    self._hash_counts = hash_counts
    self._metric = metric
    self._filter = NearestFilter(10)
    self.name = 'NearPy(n_bits=%d, hash_counts=%d)' % (self._n_bits, self._hash_counts)
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.filters import NearestFilter


def knn(data, k):
    assert k <= len(data) - 1, \
        'The number of neighbors must be smaller than the data cardinality (minus one)'
    # Request one extra neighbour, since each point is returned as its own nearest neighbour
    k = k + 1
    n, dimension = data.shape
    ind = []
    dist = []

    # Cap the number of projections at 10 for higher-dimensional data
    if dimension < 10:
        rbp = RandomBinaryProjections('rbp', dimension)
    else:
        rbp = RandomBinaryProjections('rbp', 10)

    engine = Engine(dimension, lshashes=[rbp],
                    vector_filters=[NearestFilter(k)])
    for i in range(n):
        engine.store_vector(data[i], i)

    for i in range(n):
        N = engine.neighbours(data[i])
        # Each result is (vector, data, distance); drop the query point itself
        ind.append([x[1] for x in N][1:])
        dist.append([x[2] for x in N][1:])

    # Note: N is the raw neighbour list of the last query only
    return N, dist, ind
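A hypothetical call on random data (shapes illustrative), using the imports added above:

data = np.random.randn(1000, 20)  # 1000 points in 20 dimensions
N, dist, ind = knn(data, 5)
print(ind[0])   # indices of up to 5 approximate neighbours of data[0]
print(dist[0])  # the corresponding distances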
def test_retrieval(self):
    # We want 12 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 12, 20)

    # Create engine for 100 dimensional feature space, do not forget to set
    # nearest filter to 20, because default is 10
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 200000 random vectors
    # print 'Indexing...'
    for k in range(200000):
        x = numpy.random.randn(100)
        x_data = 'data'
        self.engine.store_vector(x, x_data)

    # Now do random queries and check result set size
    # print 'Querying...'
    for k in range(10):
        x = numpy.random.randn(100)
        n = self.engine.neighbours(x)
        # print "Candidate count = %d" % self.engine.candidate_count(x)
        # print "Result size = %d" % len(n)
        self.assertEqual(len(n), 20)
def main(args):
    """ Main entry. """
    data = Dataset(args.dataset)
    num, dim = data.base.shape

    # We are looking for the top-k closest neighbours
    nearest = NearestFilter(args.topk)
    # We want unique candidates
    unique = UniqueFilter()

    # Create engines for all configurations
    for nbit, ntbl in itertools.product(args.nbits, args.ntbls):
        logging.info("Creating Engine ...")
        lshashes = [RandomBinaryProjections('rbp%d' % i, nbit)
                    for i in xrange(ntbl)]

        # Create engine with this configuration
        engine = Engine(dim, lshashes=lshashes,
                        vector_filters=[unique, nearest])
        logging.info("\tDone!")

        logging.info("Adding items ...")
        for i in xrange(num):
            engine.store_vector(data.base[i, :], i)
            if i % 100000 == 0:
                logging.info("\t%d/%d" % (i, num))
        logging.info("\tDone!")

        ids = np.zeros((data.nqry, args.topk), np.int)
        logging.info("Searching ...")
        tic()
        for i in xrange(data.nqry):
            reti = [y for x, y, z in np.array(engine.neighbours(data.query[i]))]
            ids[i, :len(reti)] = reti
            if i % 100 == 0:
                logging.info("\t%d/%d" % (i, data.nqry))
        time_costs = toc()
        logging.info("\tDone!")

        report = os.path.join(args.exp_dir, "report.txt")
        with open(report, "a") as rptf:
            rptf.write("*" * 64 + "\n")
            rptf.write("* %s\n" % time.asctime())
            rptf.write("*" * 64 + "\n")

        r_at_k = compute_stats(data.groundtruth, ids, args.topk)[-1][-1]

        with open(report, "a") as rptf:
            rptf.write("=" * 64 + "\n")
            rptf.write("index_%s-nbit_%d-ntbl_%d\n" % ("NearPy", nbit, ntbl))
            rptf.write("-" * 64 + "\n")
            rptf.write("recall@%-8d%.4f\n" % (args.topk, r_at_k))
            rptf.write("time cost (ms): %.3f\n" % (time_costs * 1000 / data.nqry))
def createLSH(dimensions):
    nearest = NearestFilter(5)
    bin_width = 10
    projections = 50
    rbp = RandomDiscretizedProjections('rbp', projections, bin_width)
    rbp2 = RandomDiscretizedProjections('rbp2', projections, bin_width)
    rbp3 = RandomDiscretizedProjections('rbp3', projections, bin_width)
    rbp4 = RandomDiscretizedProjections('rbp4', projections, bin_width)
    engine = Engine(dimensions, lshashes=[rbp, rbp2, rbp3, rbp4],
                    vector_filters=[nearest])
    return engine
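A brief usage sketch for this factory (the dimension and random data are illustrative); note that the NearestFilter(5) baked in above caps every query at five results:

import numpy as np

engine = createLSH(128)
for i in range(10000):
    engine.store_vector(np.random.randn(128), i)

neighbours = engine.neighbours(np.random.randn(128))  # at most 5 (vector, data, distance) tuples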
def _create_engine(self, k, lshashes=None):
    self.k_ = k
    self.engine_ = Engine(self.dimension_, lshashes,
                          distance=self.dist_metric_,
                          vector_filters=[NearestFilter(k)])

    for i, feature in enumerate(self.featurized_):
        if self.transpose_:
            self.engine_.store_vector(feature.T, i)
        else:
            self.engine_.store_vector(feature, i)
def build(self, data, k, cp):
    n_items, vector_length = data.shape
    # print(data.shape)

    # parameter initialization
    method_param = init_method_param("nearpy", data=data, cp=cp)
    hash_counts = method_param["hash_counts"]
    n_bits = method_param["n_bits"]

    self.filter = NearestFilter(10)

    hashes = []
    for h in range(hash_counts):
        nearpy_rbp = nearpy.hashes.RandomBinaryProjections('rbp_%d' % h, n_bits)
        hashes.append(nearpy_rbp)

    if self.metric == 'euclidean':
        dist = nearpy.distances.EuclideanDistance()
        self.index = nearpy.Engine(vector_length, lshashes=hashes,
                                   distance=dist,
                                   vector_filters=[self.filter])
    else:  # Default (angular) = Cosine distance
        self.index = nearpy.Engine(vector_length, lshashes=hashes,
                                   vector_filters=[self.filter])

    # if self.metric == 'angular':
    #     data = sklearn.preprocessing.normalize(data, axis=1, norm='l2')
    for i, x in enumerate(data):
        self.index.store_vector(x, i)

def query_train(self, data, k):
    self.filter.N = k
    # if self.metric == 'angular':
    #     data = sklearn.preprocessing.normalize([data], axis=1, norm='l2')[0]
    neighbors = np.empty((data.shape[0], k), dtype=int)
    distances = np.empty((data.shape[0], k))
    for i in range(len(data)):
        item_single = self.index.neighbours(data[i])
        dp_n = []
        dp_d = []
        for j in range(len(item_single)):
            dp_n.append(item_single[j][1])
            dp_d.append(item_single[j][2])
        neighbors[i] = np.asarray(dp_n)
        distances[i] = np.asarray(dp_d)
    return neighbors, distances
def test_experiment_with_unibucket_1(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def test_experiment_with_unibucket_3(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(5)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # In this case recall is only 0.5
    # because the engine returns 5 nearest, but
    # the experiment looks for 10 nearest.
    self.assertEqual(result[0][0], 0.5)
    self.assertEqual(result[0][1], 1.0)
def __init__(self, dim, lshashes=[RandomBinaryProjections('default', 10)],
             distance=EuclideanDistance(),
             vector_filters=[NearestFilter(10)],
             storage=MemoryStorage()):
    """ Keeps the configuration.

    Note: these default arguments are mutable objects created once at
    definition time and shared by every Engine that relies on them; the
    None-default variant of this constructor (above) avoids that.
    """
    self.lshashes = lshashes
    self.distance = distance
    self.vector_filters = vector_filters
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)
def test_experiment_with_list_2(self):
    dim = 50
    vector_count = 100
    vectors = []
    for index in range(vector_count):
        vectors.append(numpy.random.randn(dim))
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest])
    exp = RecallPrecisionExperiment(5, vectors)
    result = exp.perform_experiment([engine])

    # In this case precision is only 0.5
    # because the engine returns 10 nearest, but
    # the experiment only looks for 5 nearest.
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 0.5)
def lshSearch(dataBase_, query_, k):
    featureNum_ = len(dataBase_)
    dimension_ = len(dataBase_[0])
    rbp_ = RandomBinaryProjections('rbp', 30)
    engine_ = Engine(dimension_, lshashes=[rbp_],
                     vector_filters=[NearestFilter(k)])
    for i in range(featureNum_):
        v_ = dataBase_[i]
        engine_.store_vector(v_, '{}'.format(i))
    N_ = engine_.neighbours(query_, distance='euclidean')
    index_ = [int(x[1]) for x in N_]
    return index_
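An illustrative call, assuming dataBase_ is a list of equal-length feature vectors (and a NearPy build that accepts the distance keyword used above):

import numpy as np

dataBase = [np.random.randn(64) for _ in range(5000)]
query = np.random.randn(64)
top10 = lshSearch(dataBase, query, 10)  # indices of up to 10 approximate nearest neighbours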
def test_random_binary_projections(self):
    dim = 4
    vector_count = 5000
    vectors = numpy.random.randn(dim, vector_count)

    # Get recall and precision for a single 32-bit random binary hash
    rbp = RandomBinaryProjections('rbp', 32)
    nearest = NearestFilter(10)
    engine = Engine(dim, lshashes=[rbp], vector_filters=[nearest])
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    recall1 = result[0][0]
    precision1 = result[0][1]
    searchtime1 = result[0][2]

    print('\nRecall RBP: %f, Precision RBP: %f, SearchTime RBP: %f\n' %
          (recall1, precision1, searchtime1))
class TestVectorFilters(unittest.TestCase):

    def setUp(self):
        self.V = []
        self.V.append((numpy.array([0]), 'data1', 0.4))
        self.V.append((numpy.array([1]), 'data2', 0.9))
        self.V.append((numpy.array([2]), 'data3', 1.4))
        self.V.append((numpy.array([3]), 'data4', 2.1))
        self.V.append((numpy.array([4]), 'data5', 0.1))
        self.V.append((numpy.array([5]), 'data6', 8.7))
        self.V.append((numpy.array([6]), 'data7', 3.4))
        self.V.append((numpy.array([7]), 'data8', 2.8))

        self.threshold_filter = DistanceThresholdFilter(1.0)
        self.nearest_filter = NearestFilter(5)
        self.unique = UniqueFilter()

    def test_thresholding(self):
        result = self.threshold_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 3)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)

    def test_nearest(self):
        result = self.nearest_filter.filter_vectors(self.V)
        self.assertEqual(len(result), 5)
        self.assertIn(self.V[0], result)
        self.assertIn(self.V[1], result)
        self.assertIn(self.V[4], result)
        self.assertIn(self.V[2], result)
        self.assertIn(self.V[3], result)

    def test_unique(self):
        W = self.V
        W.append((numpy.array([7]), 'data8', 2.8))
        W.append((numpy.array([0]), 'data1', 2.8))
        W.append((numpy.array([1]), 'data2', 2.8))
        W.append((numpy.array([6]), 'data7', 2.8))
        result = self.unique.filter_vectors(W)
        self.assertEqual(len(result), 8)
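These tests also document the filter contract: filters consume and return lists of (vector, data, distance) tuples, so they can be applied directly, outside an Engine. A minimal sketch:

import numpy
from nearpy.filters import NearestFilter, DistanceThresholdFilter

rows = [(numpy.array([i]), 'data%d' % (i + 1), d)
        for i, d in enumerate([0.4, 0.9, 1.4, 2.1, 0.1])]

print(NearestFilter(2).filter_vectors(rows))              # the two smallest distances
print(DistanceThresholdFilter(1.0).filter_vectors(rows))  # only distances below 1.0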
def test_storage_redis(self):
    # We want 10 projections, 20 results at least
    rbpt = RandomBinaryProjectionTree('testHash', 10, 20)

    # Create engine for 100 dimensional feature space
    self.engine = Engine(100, lshashes=[rbpt],
                         vector_filters=[NearestFilter(20)])

    # First insert 2000 random vectors
    for k in range(2000):
        x = numpy.random.randn(100)
        x_data = 'data'
        self.engine.store_vector(x, x_data)

    self.redis_storage.store_hash_configuration(rbpt)

    rbpt2 = RandomBinaryProjectionTree(None, None, None)
    rbpt2.apply_config(self.redis_storage.load_hash_configuration('testHash'))

    self.assertEqual(rbpt.dim, rbpt2.dim)
    self.assertEqual(rbpt.hash_name, rbpt2.hash_name)
    self.assertEqual(rbpt.projection_count, rbpt2.projection_count)

    for i in range(rbpt.normals.shape[0]):
        for j in range(rbpt.normals.shape[1]):
            self.assertEqual(rbpt.normals[i, j], rbpt2.normals[i, j])

    # Now do random queries and check result set size
    for k in range(10):
        x = numpy.random.randn(100)
        keys1 = rbpt.hash_vector(x, querying=True)
        keys2 = rbpt2.hash_vector(x, querying=True)
        self.assertEqual(len(keys1), len(keys2))
        for k in range(len(keys1)):
            self.assertEqual(keys1[k], keys2[k])
def __init__(self, matrix, max_neighbours=20,
             lshashes=[RandomBinaryProjections("rbp", 10)],
             vector_filters=[UniqueFilter()], distance=Pearson()):
    if not isinstance(lshashes, list):
        raise TypeError("'lshashes' must be an instance of 'list'")
    if not isinstance(vector_filters, list):
        raise TypeError("'vector_filters' must be an instance of 'list'")
    self.underlying = Engine(len(matrix[0]), lshashes=lshashes,
                             vector_filters=vector_filters + [NearestFilter(max_neighbours)],
                             distance=distance)
    for vector in matrix:
        self.underlying.store_vector(vector)
print('MAE:', sum(err) / len(err))
print('RMSE:', sqrt(sum([num**2 for num in err]) / len(err)))
end_time = time()
print('elapsed time:', end_time - begin_time)
print('*' * 50)

# Regression via LSH search
print('LSH search scheme:')
# Three independent random binary projection hashes
rbp1 = RandomBinaryProjections('rbp1', 20)
rbp2 = RandomBinaryProjections('rbp2', 20)
rbp3 = RandomBinaryProjections('rbp3', 20)
engine1 = Engine(dimension - 1, lshashes=[rbp1, rbp2, rbp3],
                 storage=MemoryStorage(), distance=EuclideanDistance(),
                 vector_filters=[NearestFilter(100)])
engine1.store_many_vectors(dataBase, [i for i in range(featureNum)])

begin_time = time()
print('   prediction    error')
err = []
for m in range(len(queryBase)):
    query = queryBase[m]
    N = engine1.neighbours(query, distance='euclidean',
                           fetch_vector_filters=[UniqueFilter()])
    index = [int(x[1]) for x in N]
    # print(index)
    data = np.array([dataBaseInitial.iloc[index, :]])
    data = data[0]
dimensions = 300
filename = os.getenv('VECTORS_FILE', 'glove.6B/glove.6B.300d.small.txt')

print('loading vectors')
lines = open(filename).read().strip().split('\n')
word_vectors = {}
for line in lines:
    split_line = line.split()
    word = split_line[0]
    vec = array([float(thing) for thing in split_line[1:]])
    word_vectors[word] = vec

print('starting engine')
nearest = Engine(dimensions,
                 distance=ManhattanDistance(),
                 vector_filters=[NearestFilter(20)],
                 lshashes=[RandomBinaryProjections('rbp', 2, rand_seed=42)])
for word, vec in word_vectors.items():
    nearest.store_vector(vec, word)
print('ready')


def query(vec):
    return [res[1] for res in nearest.neighbours(vec)]
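A hypothetical lookup with the helper above, assuming the word appears in the loaded vectors file:

print(query(word_vectors['king']))  # up to 20 nearest words under Manhattan distance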
def train(self, config):
    train_dataset = ImageDataset(config['dataset_path'], 'seen',
                                 config['data_augmentation_suffixes'],
                                 config['allow_different_views'])
    train_dataset.prepare(config['num_train_pairs'])
    val_dataset = ImageDataset(config['dataset_path'], 'test')
    val_dataset.prepare(config['num_val_pairs'])

    train_generator = DataGenerator(train_dataset,
                                    batch_size=config['batch_size'],
                                    dim=self.config['input_shape'],
                                    shuffle=config['shuffle_training_inputs'],
                                    dataset_type=config['dataset_type'])
    val_generator = DataGenerator(val_dataset,
                                  batch_size=config['batch_size'],
                                  dim=self.config['input_shape'],
                                  shuffle=config['shuffle_training_inputs'],
                                  dataset_type=config['dataset_type'])

    model_path, _ = os.path.split(self.config['model_filename'])
    callbacks = [
        keras.callbacks.TensorBoard(log_dir=self.log_dir, histogram_freq=0,
                                    write_graph=True, write_images=False),
        keras.callbacks.ModelCheckpoint(self.checkpoint_path, verbose=0,
                                        save_weights_only=True)
    ]

    self.keras_model.compile(loss=utils.contrastive_loss,
                             optimizer=Adam(lr=config['learning_rate']),
                             metrics=[utils.accuracy, utils.auc_roc, 'acc'])

    history = self.keras_model.fit_generator(generator=train_generator,
                                             validation_data=val_generator,
                                             epochs=config['epochs'],
                                             use_multiprocessing=True,
                                             callbacks=callbacks,
                                             workers=multiprocessing.cpu_count())

    self.keras_model.save(self.config['model_filename'])

    # -------------------------------------------------------
    # Make a new dataset
    seen_dataset = ImageDataset(config['dataset_path'], 'seen')
    seen = []  # only add imgs that are returned by nearpy
    new = []
    for i in range(5):
        new += [os.path.join(x, 'view_00000{}'.format(i))
                for x in os.listdir(os.path.join(config['dataset_path'], 'test'))]

    pred_arr = []
    dimension = 9984
    engine = Engine(dimension, vector_filters=[NearestFilter(5)])

    # for i in range(0, iter):
    # seen = list(seen_dataset._class_labels)
    seen = []
    for i in range(5):
        seen += [os.path.join(x, 'view_00000{}'.format(i))
                 for x in os.listdir(os.path.join(config['dataset_path'], 'seen'))]

    # Index one feature vector per seen object, keyed by its class
    for class1 in seen:
        folder1 = os.path.join(os.path.join(config['dataset_path'], 'seen'), class1)
        for obj in os.listdir(folder1):
            im2 = os.path.join(folder1, obj)
            image = np.load(im2)
            engine.store_vector(image['arr_0'], class1)

    for img in new:
        # nearpy lookup
        folder1 = os.path.join(os.path.join(config['dataset_path'], 'test'), img)
        im = os.path.join(folder1, os.listdir(folder1)[0])
        image = np.load(im)['arr_0']
        neighbors = engine.neighbours(image)
        for n in neighbors:
            folder1 = os.path.join(os.path.join(config['dataset_path'], 'seen'), n[1])
            im = os.path.join(folder1, os.listdir(folder1)[0])  # make random
            neighbor = np.load(im)['arr_0']
            prediction = self.predict([np.array([image]), np.array([neighbor])], 1)
            pred_arr += [[img[:-12], n[1][:-12], prediction]]

    for item in pred_arr:
        val_dataset.prepare_specific(1, item[0], item[1])
        f = open("ground.txt", "a")
        if item[0] == item[1]:
            f.write('1 {} {} {} '.format(item[0], item[1], item[2]))
        else:
            f.write('0 {} {} {} '.format(item[0], item[1], item[2]))
        f.close()
def worker(A, start):
    timee = 0
    num = 0
    for j in xrange(kkkk):
        k1 = 0
        # Find the nearest cluster centre for this query vector
        lshtruple = engine.neighbours(newcomparearrt[j + start * kkkk])

        # Match the returned vector against the cluster centres CC
        for f in xrange(len(CC)):
            if lshtruple:
                if tuple(CC[f]) == tuple(lshtruple[0][0]):
                    k1 = f
                    break

        length3 = len(clusresult[k1])
        temp = clusresult[k1]

        # nearpy: index the matched cluster and search within it
        rbp1 = RandomBinaryProjections('rbp2', 10)
        DIM1 = 3
        engine1 = Engine(DIM1, lshashes=[rbp1], distance=CosineDistance(),
                         vector_filters=[NearestFilter(1)])
        for ff in xrange(length3):
            engine1.store_vector(temp[ff], ff)
        if engine1.candidate_count(newcomparearrt[j]):
            num = num + 1
        results = engine1.neighbours(newcomparearrt[j])
        del engine1

    A.append(num)
try:
    args = parser.parse_args()
except IOError, msg:
    parser.error(str(msg))

reader = codecs.getreader('utf8')
writer = codecs.getwriter('utf8')
infile = reader(args.infile)
dictionaries = [reader(d) for d in args.dictionaries]
outfile = writer(args.outfile)

# Make nn indices for each language:
# create a random binary hash
rbp = RandomBinaryProjections('rbp', args.bits)

# create engines for each language
engines = [Engine(args.dim, lshashes=[rbp], distance=CosineDistance(),
                  vector_filters=[NearestFilter(args.nbest)])
           for x in xrange(args.langs)]

# load transformation matrices
mats = np.load(args.modelfile)
invmats = {}
for name, mat in mats.items():
    invmats[name] = LA.inv(mat)

vocabs = [np.loadtxt(dictionary, dtype=str) for dictionary in dictionaries]
vocab = dd(lambda: dict())
for entry in np.vstack(vocabs):
    word = entry[0]
    lang = entry[1]
    vec = entry[2:].astype(float)
    if word not in vocab[lang]:
        vocab[lang][word] = vec
        engines[int(lang)].store_vector(vec, word)
dimension = 1000

# Create permutations meta-hash
permutations2 = HashPermutationMapper('permut2')

# Create binary hash as child hash
rbp_perm2 = RandomBinaryProjections('rbp_perm2', 14)

# Add rbp as child hash of permutations hash
permutations2.add_child_hash(rbp_perm2)

engine = Engine(dimension, lshashes=[permutations2],
                distance=CosineDistance(),
                vector_filters=[NearestFilter(5)],
                storage=MemoryStorage())

i = 0
query = numpy.zeros(dimension)

# Opening, reading from the file
f = open('features2.txt', 'r')
for next_read_line in f:
    next_read_line = next_read_line.rstrip()
    split_arr = next_read_line.split(" ")
    split_arr = split_arr[1:]
    split_arr = list(map(float, split_arr))
# Create data set from two clusters
vectors = []
center = numpy.random.randn(dimension)
for index in xrange(vector_count / 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)
center = numpy.random.randn(dimension)
for index in xrange(vector_count / 2):
    vector = center + 0.01 * numpy.random.randn(dimension)
    vectors.append(vector)

# We are looking for the N closest neighbours
N = 20
nearest = NearestFilter(N)

# We will fill this array with all the engines we want to test
engines = []

print 'Creating engines...'

# We are going to test these bin widths
bin_widths = [0.01 * x for x in range(1, 5)]

# Create engines for all configurations
for bin_width in bin_widths:
    # Use four random 1-dim discretized projections
    rdp1 = RandomDiscretizedProjections('rdp1', 4, bin_width)
    rdp2 = RandomDiscretizedProjections('rdp2', 4, bin_width)
    rdp3 = RandomDiscretizedProjections('rdp3', 4, bin_width)
    rdp4 = RandomDiscretizedProjections('rdp4', 4, bin_width)