Ejemplo n.º 1
0
def load_search_engine():
    """Build the module-level ``engine`` from the feature TSV files.

    Reads ``data/features.tsv`` and ``data/object_features.tsv`` (both
    tab-separated, each with a ``features`` column of comma-separated
    numbers and a ``filename`` column).  The engine dimension is taken from
    the first row of ``features.tsv``.  Every row of both files is stored in
    the engine, labelled with its filename after the ``images`` directory
    component has been removed.

    Returns:
        The populated engine, which is also bound to the global ``engine``.
    """
    global engine

    def parse_vector(raw_features):
        # "1.0,2.0,..." -> float64 array
        return np.asarray(raw_features.split(',')).astype('float64')

    def strip_images_dir(filename):
        # Remove the 'images' path component for each separator variant:
        # doubled backslash, single backslash, then forward slash.
        for fragment in ('images\\\\', 'images\\', 'images/'):
            filename = filename.replace(fragment, '')
        return filename

    def index_rows(target, frame):
        # Store every (vector, cleaned filename) pair of the frame.
        for raw_features, filename in zip(frame['features'],
                                          frame['filename']):
            target.store_vector(parse_vector(raw_features),
                                strip_images_dir(filename))

    features_path = os.path.join('data', 'features.tsv')
    object_features_path = os.path.join('data', 'object_features.tsv')
    data = pandas.read_csv(features_path, sep='\t')
    data_objects = pandas.read_csv(object_features_path, sep='\t')

    # Random binary projection hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # The vector dimension is the number of values in the first feature row
    vector_dim = len(data['features'][0].split(','))
    engine = Engine(vector_dim, lshashes=[rbp], distance=EuclideanDistance())

    index_rows(engine, data)
    index_rows(engine, data_objects)

    return engine
Ejemplo n.º 2
0
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 fetch_vector_filters=None,
                 vector_filters=None,
                 storage=None):
        """Store the engine configuration, filling in defaults where omitted.

        Args:
            dim: Dimension passed to ``reset`` of every configured hash.
            lshashes: List of hashes; defaults to a single
                ``RandomBinaryProjections('default', 10)``.
            distance: Distance implementation; defaults to
                ``EuclideanDistance()``.
            fetch_vector_filters: Defaults to ``[UniqueFilter()]``.
            vector_filters: Defaults to ``[NearestFilter(10)]``.
            storage: Defaults to a new ``MemoryStorage()``.
        """
        self.lshashes = ([RandomBinaryProjections('default', 10)]
                         if lshashes is None else lshashes)
        self.distance = EuclideanDistance() if distance is None else distance
        self.vector_filters = ([NearestFilter(10)]
                               if vector_filters is None else vector_filters)
        self.fetch_vector_filters = ([UniqueFilter()]
                                     if fetch_vector_filters is None
                                     else fetch_vector_filters)
        self.storage = MemoryStorage() if storage is None else storage

        # Every hash has to be set up for the dimension of the data space.
        for configured_hash in self.lshashes:
            configured_hash.reset(dim)

        print('*** engine init done ***')
Ejemplo n.º 3
0
    def __init__(self,
                 dim,
                 lshashes=None,
                 distance=None,
                 vector_filters=None,
                 storage=None):
        """Keep the configuration, creating fresh defaults per instance.

        The defaults are built inside the body rather than in the signature:
        objects created in a signature are evaluated once, when the function
        is defined, and would then be shared by every instance that relies on
        them (the same storage object, and the same hash being ``reset``
        again by each new instance).

        Args:
            dim: Dimension passed to ``reset`` of every configured hash.
            lshashes: List of hashes; defaults to a single
                ``RandomBinaryProjections('default', 10)``.
            distance: Distance implementation; defaults to
                ``EuclideanDistance()``.
            vector_filters: Defaults to ``[NearestFilter(10)]``.
            storage: Defaults to a new ``MemoryStorage()``.
        """
        if lshashes is None:
            lshashes = [RandomBinaryProjections('default', 10)]
        if distance is None:
            distance = EuclideanDistance()
        if vector_filters is None:
            vector_filters = [NearestFilter(10)]
        if storage is None:
            storage = MemoryStorage()

        self.lshashes = lshashes
        self.distance = distance
        self.vector_filters = vector_filters
        self.storage = storage

        # Initialize all hashes for the data space dimension.
        for lshash in self.lshashes:
            lshash.reset(dim)
Ejemplo n.º 4
0
    def test_experiment_with_unibucket_1(self):
        """Check that a UniBucket engine scores exactly 1.0 twice.

        Runs a ``RecallPrecisionExperiment`` for 10 neighbours on random
        vectors against one engine that hashes with ``UniBucket`` and asserts
        that the first two entries of that engine's result are both 1.0.
        """
        dimension, sample_count, neighbour_count = 50, 100, 10
        samples = numpy.random.randn(dimension, sample_count)

        single_bucket_hash = UniBucket('testHash')
        # One result more than the experiment asks for -- presumably because
        # the query vector itself is returned as a hit (TODO confirm).
        nearest_filter = NearestFilter(neighbour_count + 1)
        single_bucket_engine = Engine(
            dimension,
            lshashes=[single_bucket_hash],
            vector_filters=[nearest_filter],
            distance=EuclideanDistance())

        experiment = RecallPrecisionExperiment(neighbour_count, samples)
        outcome = experiment.perform_experiment([single_bucket_engine])

        # Both recall and precision must be one in this case
        first_engine_outcome = outcome[0]
        self.assertEqual(first_engine_outcome[0], 1.0)
        self.assertEqual(first_engine_outcome[1], 1.0)
Ejemplo n.º 5
0
def main():
    """Set up the LSH engine, then run ``watchFold`` once per second forever.

    Builds a 128-dimensional engine with one 10-bit random binary projection
    hash and Euclidean distance, hands it to ``mongo.dbfind`` (presumably to
    load stored vectors into the engine -- verify against ``MyMongoDB``), and
    then repeatedly runs ``watchFold(setting["path"], mongo, engine)`` on a
    single worker thread.  This function never returns normally.
    """
    logging.info("this is main")
    dimension = 128
    # Create a random binary hash with 10 bits
    rbp = RandomBinaryProjections('rbp', 10)

    # Create engine with pipeline configuration
    engine = Engine(dimension, lshashes=[rbp], distance=EuclideanDistance())

    mongo = MyMongoDB()
    mongo.dbfind({}, engine)

    # One single-worker pool for the whole run instead of a new pool per
    # iteration; leaving the ``with`` block shuts it down.
    with ThreadPoolExecutor(1) as executor:
        while True:
            # Hand over the callable and its arguments separately.  Writing
            # ``submit(watchFold(...))`` would run watchFold right here in
            # the calling thread and submit its return value as the task.
            scan = executor.submit(watchFold, setting["path"], mongo, engine)
            # Wait for the scan and re-raise anything it raised, so failures
            # are not silently kept inside the future.
            scan.result()
            time.sleep(1)
            logging.info("waiting.....")
Ejemplo n.º 6
0
def main(argv):
    """Index the vectors of a TSV file and print the neighbours of one row.

    The source file is tab-separated with a ``features`` column holding
    comma-separated numbers and a ``filename`` column.  Every row is stored
    in the engine under its filename; the row selected by ``--query-index``
    is then used as the query and the result is pretty-printed.

    Args:
        argv: Full argument vector; ``argv[0]`` (the program name) is skipped.
    """
    parser = argparse.ArgumentParser(prog='INDEX')
    parser.add_argument('source', help='path to the source metadata file')
    parser.add_argument('--hash-size', help='Hash size.', type=int, default=10)
    # NOTE(review): --num-tables is accepted but not applied.  Building one
    # hash per table would change the results of the default invocation
    # (default 5 vs. the single hash used here), so it is left as is.
    parser.add_argument('--num-tables',
                        help='Number of tables.',
                        type=int,
                        default=5)
    parser.add_argument('--query-index',
                        help='Index to use for query.',
                        type=int,
                        default=0)

    args = parser.parse_args(argv[1:])

    # read in the data file
    data = pandas.read_csv(args.source, sep='\t')

    # Fail with a usage error instead of a KeyError deep inside pandas.
    if data.empty:
        parser.error('source file contains no rows')
    if not 0 <= args.query_index < len(data):
        parser.error('--query-index must be between 0 and %d' %
                     (len(data) - 1))

    def to_vector(raw_features):
        # "1.0,2.0,..." -> float64 array
        return np.asarray(raw_features.split(',')).astype('float64')

    # Random binary hash with --hash-size bits (the default of 10 equals the
    # value that used to be hard-coded here)
    rbp = RandomBinaryProjections('rbp', args.hash_size)

    # Create engine with pipeline configuration; the dimension is the number
    # of values in the first feature row
    engine = Engine(len(data['features'][0].split(',')),
                    lshashes=[rbp],
                    distance=EuclideanDistance())

    # indexing
    for raw_features, filename in zip(data['features'], data['filename']):
        engine.store_vector(to_vector(raw_features), filename)

    # query the vector at --query-index
    response = engine.neighbours(to_vector(data['features'][args.query_index]))

    pprint(response)
Ejemplo n.º 7
0
    def loadHashmap(self, feature_size=129, result_n=1000):
        """Create ``self.engine`` on Redis storage, reusing a saved hash config.

        Tries to load the configuration stored for the hash named ``'test'``
        and to apply it to a new hash; if that fails for any reason, a new
        10-projection hash named ``'test'`` is created instead.  After the
        engine is built, the hash configuration is written back to Redis.

        Args:
            feature_size: Dimension of the vector space given to the engine.
            result_n: Number of nearest neighbours kept by the
                ``NearestFilter``.
        """
        # Create redis storage adapter
        redis_object = Redis(host='localhost', port=6379, db=0)
        redis_storage = RedisStorage(redis_object)
        try:
            # Get hash config from redis
            config = redis_storage.load_hash_configuration('test')
            # Config is existing, create hash with None parameters
            lshash = RandomBinaryProjections(None, None)
            # Apply configuration loaded from redis
            lshash.apply_config(config)
        except Exception:
            # Best effort: any failure while loading or applying the stored
            # configuration is treated as "no configuration", so the hash is
            # created from scratch with 10 projections.  ``Exception`` rather
            # than a bare ``except`` keeps KeyboardInterrupt and SystemExit
            # from being swallowed here.
            lshash = RandomBinaryProjections('test', 10)

        # Create engine for a feature space of ``feature_size`` dimensions and
        # use our hash.  This will set the dimension of the lshash only the
        # first time, not when using the configuration loaded from redis.
        # Use redis storage to store buckets.
        nearest = NearestFilter(result_n)
        self.engine = Engine(feature_size,
                             lshashes=[lshash],
                             vector_filters=[nearest],
                             storage=redis_storage,
                             distance=EuclideanDistance())

        # Do some stuff like indexing or querying with the engine...

        # Finally store hash configuration in redis for later use
        redis_storage.store_hash_configuration(lshash)
    def __init__(self,
                 measure="EuclideanDistance",
                 data_path='data/classed_data/'):
        """Set up the helpers and the LSH engine for the chosen measure.

        Args:
            measure: ``"EuclideanDistance"`` selects ``EuclideanDistance``;
                any other value selects ``CosineDistance``.
            data_path: Stored unchanged as ``self.data_path``.
        """
        self.res = ResnetSimilarity()
        self.pbar = ProgressBar()
        # Dimension of our vector space
        self.dimension = 2048
        self.data_path = data_path

        # Random binary projection hash with 10 bits
        self.rbp = RandomBinaryProjections('rbp', 10)

        self.measure = measure
        self.msote = MemoryStorage()

        # Only the distance differs between the two engine configurations.
        if measure == "EuclideanDistance":
            metric = EuclideanDistance()
        else:
            metric = CosineDistance()

        self.engine = Engine(self.dimension,
                             lshashes=[self.rbp],
                             storage=self.msote,
                             distance=metric)
Ejemplo n.º 9
0
 def __init__(self, dist=None, phi=lambda x: x):
     """Initialise the ``NearestNeighbor`` base with a distance and ``phi``.

     Args:
         dist: Distance object forwarded to ``NearestNeighbor.__init__``.
             When omitted, a new ``EuclideanDistance()`` is created for
             this instance instead of one object being built once in the
             signature and shared by every instance.
         phi: Callable forwarded to ``NearestNeighbor.__init__``; defaults
             to the identity function.
     """
     if dist is None:
         dist = EuclideanDistance()
     NearestNeighbor.__init__(self, dist, phi)
Ejemplo n.º 10
0
    err.append(abs(queryBaseInitial.iloc[m, 0] - output[0]))

# Report the error statistics and the elapsed time of the preceding run.
print('MAE:', sum(err) / len(err))
print('RMSE', sqrt(sum(num**2 for num in err) / len(err)))
end_time = time()
print('运行时间:', end_time - begin_time)

print('*' * 50)

# LSH-search regression
print('LSH搜索方案:')
# Three distinct 20-bit projection hashes.  Listing one hash object three
# times gives three identical tables: extra work, no additional projections.
# Each hash gets its own name (presumably the name keys the storage buckets
# -- confirm against the Engine implementation).
rbp = RandomBinaryProjections('rbp', 20)
lsh_tables = [rbp,
              RandomBinaryProjections('rbp_2', 20),
              RandomBinaryProjections('rbp_3', 20)]
engine1 = Engine(dimension - 1,
                 lshashes=lsh_tables,
                 storage=MemoryStorage(),
                 distance=EuclideanDistance(),
                 vector_filters=[NearestFilter(100)])

# Each stored vector is labelled with its position 0..featureNum-1.
engine1.store_many_vectors(dataBase, list(range(featureNum)))

# Restart the timer and the error list for the LSH run.
begin_time = time()
print('        预测值      误差')
err = []
for m in range(len(queryBase)):
    query = queryBase[m]
    N = engine1.neighbours(query,
                           distance='euclidean',
                           fetch_vector_filters=[UniqueFilter()])
    index = [int(x[1]) for x in N]
    # print(index)
    data = np.array([dataBaseInitial.iloc[index, :]])
Ejemplo n.º 11
0
import copy
def t_sne_space(x):
    """Return the 2-component t-SNE embedding of *x* using the cosine metric."""
    return TSNE(n_components=2, metric='cosine').fit_transform(x)


# Maximum distance for clustering
CLUSTER_THRESHOLD = 1
# Minimum entropy before a cluster is classified as spam
ENTROPY_THRESHOLD = 3.5
SPLIT_JOIN_BALANCE = 1

# TODO implement better system
# offset Finnish cluster ids to avoid id conflicts
FI_CLUSTER_ID_OFFSET = 10000000

# Locality sensitive hashing parameters, chosen based on the paper
# 'Streaming First Story Detection with application to Twitter'
HYPERPLANE_COUNT  = 15
HASH_LAYERS       = 8
# Alternative: CosineDistance(), i.e. 1 - cos(a)
lsh_distance_func = EuclideanDistance()

try:
    opts, args = getopt(sys.argv[1:], 'v:e:t:i:l:', ['vocab=', 'embeddings=', 'text=', 'idfs=', 'lang='])
except GetoptError as e:
    print(e, file=sys.stderr)
    # sys.exit(2) stays disabled, so the script carries on after a bad
    # command line.  Define empty results so the option loop below runs with
    # the defaults instead of failing with a NameError on ``opts``.
    opts, args = [], []

# Defaults, used for every option that is not given on the command line
opt_vocab = 'vocab.txt'
opt_embeddings = 'vecs.bin'
opt_text = '2014_07.ru.txt'  # earlier value: 'tweet_replies_non_alpha_true-ru_lem.txt'
opt_idfs = 'tweet_idfs.json'
opt_lang = 'ru'

for o, a in opts:
  if o in ('-v', '--vocab'):
Ejemplo n.º 12
0
 def setUp(self):
     """Store an ``EuclideanDistance`` instance as ``self.euclidean``."""
     metric = EuclideanDistance()
     self.euclidean = metric