def load_search_engine():
    global engine

    # read in the data files
    data = pandas.read_csv(os.path.join('data', 'features.tsv'), sep='\t')
    data_objects = pandas.read_csv(
        os.path.join('data', 'object_features.tsv'), sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i].replace('images\\\\', '').replace(
                'images\\', '').replace('images/', ''))

    for i in range(0, len(data_objects)):
        engine.store_vector(
            np.asarray(
                data_objects['features'][i].split(',')).astype('float64'),
            data_objects['filename'][i].replace('images\\\\', '').replace(
                'images\\', '').replace('images/', ''))

    return engine
def __init__(self, dim, lshashes=None,
             distance=None,
             fetch_vector_filters=None,
             vector_filters=None,
             storage=None):
    """ Keeps the configuration. """
    if lshashes is None:
        lshashes = [RandomBinaryProjections('default', 10)]
    self.lshashes = lshashes
    if distance is None:
        distance = EuclideanDistance()
    self.distance = distance
    if vector_filters is None:
        vector_filters = [NearestFilter(10)]
    self.vector_filters = vector_filters
    if fetch_vector_filters is None:
        fetch_vector_filters = [UniqueFilter()]
    self.fetch_vector_filters = fetch_vector_filters
    if storage is None:
        storage = MemoryStorage()
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)

    print('*** engine init done ***')
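# A minimal usage sketch for the pipeline configured above, assuming NearPy's
# public imports; the 100-dimensional space, vector count, and 'item_%d' keys
# are illustrative only.
import numpy as np
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections
from nearpy.distances import EuclideanDistance

engine = Engine(100,
                lshashes=[RandomBinaryProjections('rbp', 10)],
                distance=EuclideanDistance())

# Index random vectors under string keys.
for i in range(1000):
    engine.store_vector(np.random.randn(100), 'item_%d' % i)

# Query: each result is expected to be a (vector, data, distance) tuple, with
# distance computed by EuclideanDistance and the default NearestFilter(10)
# keeping the ten closest candidates.
for vector, data, distance in engine.neighbours(np.random.randn(100)):
    print(data, distance)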
def __init__(self, dim, lshashes=[RandomBinaryProjections('default', 10)],
             distance=EuclideanDistance(),
             vector_filters=[NearestFilter(10)],
             storage=MemoryStorage()):
    """ Keeps the configuration. """
    self.lshashes = lshashes
    self.distance = distance
    self.vector_filters = vector_filters
    self.storage = storage

    # Initialize all hashes for the data space dimension.
    for lshash in self.lshashes:
        lshash.reset(dim)
def test_experiment_with_unibucket_1(self):
    dim = 50
    vector_count = 100
    vectors = numpy.random.randn(dim, vector_count)
    unibucket = UniBucket('testHash')
    nearest = NearestFilter(10 + 1)
    engine = Engine(dim, lshashes=[unibucket],
                    vector_filters=[nearest],
                    distance=EuclideanDistance())
    exp = RecallPrecisionExperiment(10, vectors)
    result = exp.perform_experiment([engine])

    # Both recall and precision must be one in this case
    self.assertEqual(result[0][0], 1.0)
    self.assertEqual(result[0][1], 1.0)
def main():
    logging.info("this is main")
    dimension = 128

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=[rbp], distance=EuclideanDistance())

    mongo = MyMongoDB()
    # dic = {"name": "lisi", "face_incoding": jobs_encoding.tolist()}
    # mongo.insert(dic)
    mongo.dbfind({}, engine)

    while True:
        with ThreadPoolExecutor(1) as executor:
            # Pass the callable and its arguments separately so the watcher
            # runs inside the executor instead of being called immediately.
            executor.submit(watchFold, setting["path"], mongo, engine)
            executor.shutdown()
        time.sleep(1)
        logging.info("waiting.....")
def main(argv):
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    parser.add_argument('--num-tables', help='Number of tables.', type=int,
                        default=5)
    parser.add_argument('--query-index', help='Index to use for query.',
                        type=int, default=0)
    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for i in range(0, len(data)):
        engine.store_vector(
            np.asarray(data['features'][i].split(',')).astype('float64'),
            data['filename'][i])

    # query a vector q_vec
    response = engine.neighbours(
        np.asarray(
            data['features'][args.query_index].split(',')).astype('float64'))
    pprint(response)
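# A hypothetical invocation of the indexing script above (the file name
# index.py is assumed, not given in the source). Note that --hash-size and
# --num-tables are parsed but not wired into the hash construction, which
# uses the hard-coded 10-bit RandomBinaryProjections default.
#
#     python index.py data/features.tsv --hash-size 10 --query-index 0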
def loadHashmap(self, feature_size=129, result_n=1000):  # the parameters here are not used
    '''
    feature_size: dimensionality of the hash (feature) space
    result_n: number of nearest neighbours to return
    '''
    # Create redis storage adapter
    redis_object = Redis(host='localhost', port=6379, db=0)
    redis_storage = RedisStorage(redis_object)
    try:
        # Get hash config from redis
        config = redis_storage.load_hash_configuration('test')
        # Config is existing, create hash with None parameters
        lshash = RandomBinaryProjections(None, None)
        # Apply configuration loaded from redis
        lshash.apply_config(config)
    except:
        # Config is not existing, create hash from scratch, with 10 projections
        lshash = RandomBinaryProjections('test', 10)

    # Create engine for feature space of 100 dimensions and use our hash.
    # This will set the dimension of the lshash only the first time, not when
    # using the configuration loaded from redis. Use redis storage to store
    # buckets.
    nearest = NearestFilter(result_n)
    # self.engine = Engine(feature_size, lshashes=[], vector_filters=[])
    self.engine = Engine(feature_size, lshashes=[lshash],
                         vector_filters=[nearest],
                         storage=redis_storage,
                         distance=EuclideanDistance())

    # Do some stuff like indexing or querying with the engine...

    # Finally store hash configuration in redis for later use
    redis_storage.store_hash_configuration(lshash)
def __init__(self, measure="EuclideanDistance", data_path='data/classed_data/'):
    self.res = ResnetSimilarity()
    self.pbar = ProgressBar()

    # Dimension of our vector space
    self.dimension = 2048
    self.data_path = data_path

    # Create a random binary hash with 10 bits
    self.rbp = RandomBinaryProjections('rbp', 10)
    self.measure = measure
    self.msote = MemoryStorage()

    if measure == "EuclideanDistance":
        self.engine = Engine(self.dimension,
                             lshashes=[self.rbp],
                             storage=self.msote,
                             distance=EuclideanDistance())
    else:
        self.engine = Engine(self.dimension,
                             lshashes=[self.rbp],
                             storage=self.msote,
                             distance=CosineDistance())
def __init__(self, dist=EuclideanDistance(), phi=lambda x: x):
    NearestNeighbor.__init__(self, dist, phi)
    err.append(abs(queryBaseInitial.iloc[m, 0] - output[0]))

print('MAE:', sum(err) / len(err))
print('RMSE', sqrt(sum([num**2 for num in err]) / len(err)))
end_time = time()
print('Running time:', end_time - begin_time)
print('*' * 50)

# LSH-based nearest-neighbour regression
print('LSH search approach:')
rbp = RandomBinaryProjections('rbp', 20)
engine1 = Engine(dimension - 1,
                 lshashes=[rbp, rbp, rbp],
                 storage=MemoryStorage(),
                 distance=EuclideanDistance(),
                 vector_filters=[NearestFilter(100)])
engine1.store_many_vectors(dataBase, [i for i in range(featureNum)])

begin_time = time()
print('  prediction   error')
err = []
for m in range(len(queryBase)):
    query = queryBase[m]
    N = engine1.neighbours(query, distance='euclidean',
                           fetch_vector_filters=[UniqueFilter()])
    index = [int(x[1]) for x in N]
    # print(index)
    data = np.array([dataBaseInitial.iloc[index, :]])
import copy

t_sne_space = lambda x: TSNE(n_components=2, metric='cosine').fit_transform(x)

# Maximum distance for clustering
CLUSTER_THRESHOLD = 1
# Minimum entropy before a cluster is classified as spam
ENTROPY_THRESHOLD = 3.5
SPLIT_JOIN_BALANCE = 1

# TODO implement better system
# offset Finnish cluster ids to avoid id conflicts
FI_CLUSTER_ID_OFFSET = 10000000

# Locality-sensitive hashing parameters, chosen based on the paper
# 'Streaming First Story Detection with application to Twitter'
HYPERPLANE_COUNT = 15
HASH_LAYERS = 8
lsh_distance_func = EuclideanDistance()  # CosineDistance()  # 1 - cos(a)

try:
    opts, args = getopt(sys.argv[1:], 'v:e:t:i:l:',
                        ['vocab=', 'embeddings=', 'text=', 'idfs=', 'lang='])
except GetoptError as e:
    print(e, file=sys.stderr)
    # sys.exit(2)

opt_vocab = 'vocab.txt'
opt_embeddings = 'vecs.bin'
opt_text = '2014_07.ru.txt'  # 'tweet_replies_non_alpha_true-ru_lem.txt'
opt_idfs = 'tweet_idfs.json'
opt_lang = 'ru'

for o, a in opts:
    if o in ('-v', '--vocab'):
def setUp(self):
    self.euclidean = EuclideanDistance()
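# A hedged sketch of a test that could follow the setUp above; the method name
# and the expected value are illustrative, relying only on
# EuclideanDistance.distance(x, y) returning the L2 norm of x - y.
def test_euclidean_distance_3_4_5(self):
    x = numpy.array([0.0, 0.0])
    y = numpy.array([3.0, 4.0])
    self.assertAlmostEqual(self.euclidean.distance(x, y), 5.0)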