Example #1
0
def make_tables(dataset, num_queries=1000, num_tables=50, copy=True,
                seed=None, num_threads=0, verbose=True, already_normed=False):
    """Build a cross-polytope FALCONN index over unit-norm, centered data.

    Rows of ``dataset`` are scaled to unit L2 norm (skipped when
    ``already_normed`` is true), the column-wise mean of the result is
    subtracted, and an ``LSHIndex`` is set up on the outcome.

    Args:
        dataset: 2-D array, one point per row.
        num_queries: Accepted but not used by this function.
        num_tables: Number of hash tables (``l``).
        copy: When true, work on a copy; when false, ``dataset`` itself is
            normalized and centered in place.
        seed: LSH seed; a random value below 2**31 is drawn when ``None``.
        num_threads: Forwarded to ``num_setup_threads``.
        verbose: When true, progress messages are printed to stderr.
        already_normed: When true, the per-row normalization is skipped.

    Returns:
        ``(table, normed_mean)``: the set-up index and the mean vector that
        was subtracted from the data.
    """
    if verbose:
        log = partial(print, file=sys.stderr)
    else:
        def log(*args, **kwargs):
            return None

    if not already_normed:
        row_norms = np.linalg.norm(dataset, axis=1)[:, np.newaxis]
        if copy:
            dataset = dataset / row_norms
        else:
            dataset /= row_norms
    elif copy:
        dataset = dataset.copy()

    # Centering happens in place on whichever array we hold at this point.
    normed_mean = dataset.mean(axis=0)
    dataset -= normed_mean

    num_points, dimension = dataset.shape[0], dataset.shape[1]

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = dimension
    lsh_params.lsh_family = 'cross_polytope'
    lsh_params.distance_function = 'euclidean_squared'
    lsh_params.l = num_tables
    lsh_params.num_rotations = 1  # try 2, maybe
    if seed is None:
        lsh_params.seed = np.random.randint(2**31)
    else:
        lsh_params.seed = seed
    lsh_params.num_setup_threads = num_threads
    lsh_params.storage_hash_table = 'bit_packed_flat_hash_table'
    # Use roughly log2(#points) hash bits per table.
    hash_bits = int(np.round(np.log2(num_points)))
    falconn.compute_number_of_hash_functions(hash_bits, lsh_params)

    log('Starting building table...', end='')
    index = falconn.LSHIndex(lsh_params)
    index.setup(dataset)
    log('done')

    return index, normed_mean
Example #2
0
    def LSHtable(self, file, euclidean=True, number_of_tables=50, hash_fx=18):
        """
        Build a FALCONN cross-polytope LSH index over ``file``.

        input:                  2-D numpy array
        output:                 LSH table (a set-up falconn.LSHIndex)

        Params:
        :file:                  2-D numpy array of document vectors,
                                one vector per row
        :euclidean:             truthy  -> EuclideanSquared distance
                                falsy   -> NegativeInnerProduct distance
        :number_of_tables:      number of hash tables (default=50)
        :hash_fx:               number of hash bits per table, i.e. each
                                table has 2^hash_fx bins (default=18)

        Fixed settings (not configurable through this method):
        :num_rotations:         1
        :seed:                  5721840
        :num_setup_threads:     0
        :storage_hash_table:    BitPackedFlatHashTable

        """
        dataset = file
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(dataset[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        # Plain truthiness instead of `== True`, so any truthy flag selects
        # the Euclidean distance.
        if euclidean:
            params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        else:
            params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(hash_fx, params_cp)
        # Construct the LSH table
        table = falconn.LSHIndex(params_cp)
        table.setup(dataset)
        return table
def set_cp(data):
    """Build an LSH index over ``data`` and return its query object.

    The construction parameters come from
    ``falconn.get_default_parameters`` for the shape of ``data``; the number
    of hash functions is then recomputed for 7 hash bits, and the returned
    query object is configured to use 896 probes.

    Args:
        data: 2-D array (``data.shape`` must unpack into points, dimension).

    Returns:
        The query object constructed from the set-up ``falconn.LSHIndex``.
    """
    num_points, dim = data.shape
    params = falconn.get_default_parameters(num_points, dim)
    falconn.compute_number_of_hash_functions(7, params)

    index = falconn.LSHIndex(params)
    index.setup(data)
    query_object = index.construct_query_object()
    query_object.set_num_probes(896)
    return query_object
Example #4
0
 def retrival(self, query, dataset=None, *, k=None, threshold=None):
     """Look up the neighbour(s) of ``query`` in an LSH index.

     When ``dataset`` is given, an index for it is built on first use and
     cached in ``self.tables`` under an xxhash digest of the dataset's first
     column; otherwise the most recently used index (``self.last_table``)
     is reused. Each cached entry is a ``(mean, query_object)`` pair; the
     mean is subtracted from ``query`` before searching.

     The caller's ``query`` array is left untouched: centering is done on a
     new array rather than in place, so the same query can be reused.

     Args:
         query: Query vector.
         dataset: Optional 2-D array to search in.
         k: If given, return the ``k`` nearest neighbours.
         threshold: If given, return all neighbours within this threshold.

     Returns:
         The result of ``find_k_nearest_neighbors`` (``k`` given),
         ``find_near_neighbors`` (``threshold`` given) or
         ``find_nearest_neighbor`` (neither given).

     Raises:
         Exception: no dataset was given and no previous table exists.
         ValueError: both ``k`` and ``threshold`` were given.
     """
     if dataset is None:
         table = self.last_table
     else:
         # NOTE: only the first column is hashed to identify the dataset.
         hashint = xxhash.xxh64(dataset[:, 0].copy(), self.seed).intdigest()
         if hashint in self.tables:
             table = self.tables[hashint]
         else:
             print('find a new dataset')
             # astype() copies, so the centering below does not touch the
             # caller's dataset.
             dataset = dataset.astype(np.float32)
             mean = np.mean(dataset, axis=0)
             dataset -= mean
             params = falconn.get_default_parameters(
                 dataset.shape[0], dataset.shape[1])
             falconn.compute_number_of_hash_functions(7, params)
             lsh_index = falconn.LSHIndex(params)
             lsh_index.setup(dataset)
             qtable = lsh_index.construct_query_object()
             qtable.set_num_probes(10000)
             table = (mean, qtable)
             self.tables[hashint] = table
     if table is None:
         raise Exception("Dataset not specific")
     if k is not None and threshold is not None:
         raise ValueError("k and threshold should not pass simultaneously")
     mean, query_object = table[0], table[1]
     # Out-of-place subtraction: never modify the caller's query.
     centered_query = query - mean
     self.last_table = table
     if k is not None:
         return query_object.find_k_nearest_neighbors(centered_query, k)
     if threshold is not None:
         return query_object.find_near_neighbors(centered_query, threshold)
     return query_object.find_nearest_neighbor(centered_query)
Example #5
0
def search(query, number,
           dataset_path="/Users/liupengcheng/Downloads/final_data.npy",
           number_of_tables=50, hash_bits=18, number_of_probes=3816):
    """Return the ``number`` nearest neighbours of ``query`` in a saved dataset.

    The dataset is loaded from ``dataset_path`` and a cross-polytope LSH
    index is built from scratch on every call, so callers issuing many
    queries pay the full construction cost each time.

    Args:
        query: Query vector.
        number: How many nearest neighbours to return.
        dataset_path: ``.npy`` file holding the data, one point per row.
            Defaults to the path this function was originally hard-wired to.
        number_of_tables: Number of hash tables (``l``).
        hash_bits: Hash bits per table; each table has 2^hash_bits bins.
        number_of_probes: Number of probes used by the query object.

    Returns:
        The result of ``find_k_nearest_neighbors(query, number)``.
    """
    dataset = np.load(dataset_path)
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # the number of bins per table (2^hash_bits) should be of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(hash_bits, params_cp)
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    query_object = table.construct_query_object()
    query_object.set_num_probes(number_of_probes)
    return query_object.find_k_nearest_neighbors(query, number)
Example #6
0
def test_number_of_hash_functions():
  """Check k / last_cp_dimension chosen for several families and bit counts."""
  params = falconn._internal.LSHConstructionParameters()

  # (lsh_family, dimension, num_bits, expected k, expected last_cp_dimension)
  # last_cp_dimension is only checked for the cross-polytope family.
  cases = (
      ('hyperplane', 10, 5, 5, None),
      ('cross_polytope', 10, 5, 1, 16),
      ('hyperplane', 100, 8, 8, None),
      ('cross_polytope', 100, 8, 1, 128),
      ('cross_polytope', 100, 10, 2, 2),
  )

  for family, dimension, num_bits, want_k, want_last_cp in cases:
    params.lsh_family = family
    params.dimension = dimension
    falconn.compute_number_of_hash_functions(num_bits, params)
    assert params.k == want_k
    if want_last_cp is not None:
      assert params.last_cp_dimension == want_last_cp
Example #7
0
    def __init__(self, dataset):
        """Build a cross-polytope LSH index over ``dataset``.

        The construction parameters, the index, the data and a query object
        are kept on the instance as ``self.params_cp``, ``self.table``,
        ``self.data`` and ``self.query_object`` respectively.

        Args:
            dataset: Sequence/array of points; the dimension is taken from
                the length of the first point.
        """
        # we build only 50 tables, increasing this quantity will improve the query time
        # at a cost of slower preprocessing and larger memory footprint, feel free to
        # play with this number
        number_of_tables = 50

        params_cp = falconn.LSHConstructionParameters()

        params_cp.dimension = len(dataset[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        # we set one rotation, since the data is dense enough,
        # for sparse data set it to 2
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        self.params_cp = params_cp

        # we build 18-bit hashes so that each table has
        # 2^18 bins; this is a good choice when 2^18 is of the same
        # order of magnitude as the number of data points
        falconn.compute_number_of_hash_functions(18, params_cp)

        print('Constructing the LSH table')
        self.table = falconn.LSHIndex(params_cp)
        self.table.setup(dataset)
        self.data = dataset
        self.query_object = self.table.construct_query_object()
Example #8
0
def make_tables(dataset,
                num_queries=1000,
                num_tables=50,
                copy=True,
                seed=None,
                num_threads=0,
                verbose=True):
    """Normalize and center ``dataset``, then index it with FALCONN.

    Every row is divided by its L2 norm, the column-wise mean of the
    normalized rows is subtracted, and a cross-polytope ``LSHIndex`` is set
    up on the result.

    Args:
        dataset: 2-D array, one point per row.
        num_queries: Accepted but not used by this function.
        num_tables: Number of hash tables (``l``).
        copy: When true, a normalized copy is made; when false, ``dataset``
            is normalized and centered in place.
        seed: LSH seed; a random value below 2**31 is drawn when ``None``.
        num_threads: Forwarded to ``num_setup_threads``.
        verbose: When true, progress messages are printed to stderr.

    Returns:
        ``(table, normed_mean)``: the set-up index and the subtracted mean.
    """
    if verbose:
        report = partial(print, file=sys.stderr)
    else:
        def report(*args, **kwargs):
            return None

    inv_scale = np.linalg.norm(dataset, axis=1)[:, np.newaxis]
    if copy:
        dataset = dataset / inv_scale
    else:
        dataset /= inv_scale

    normed_mean = dataset.mean(axis=0)
    dataset -= normed_mean

    point_count, dim = dataset.shape[0], dataset.shape[1]

    cfg = falconn.LSHConstructionParameters()
    cfg.dimension = dim
    cfg.lsh_family = 'cross_polytope'
    cfg.distance_function = 'euclidean_squared'
    cfg.l = num_tables
    cfg.num_rotations = 1  # try 2, maybe
    if seed is None:
        cfg.seed = np.random.randint(2**31)
    else:
        cfg.seed = seed
    cfg.num_setup_threads = num_threads
    cfg.storage_hash_table = 'bit_packed_flat_hash_table'
    # About log2(#points) hash bits per table.
    bits = int(np.round(np.log2(point_count)))
    falconn.compute_number_of_hash_functions(bits, cfg)

    report('Starting building table...', end='')
    lsh_index = falconn.LSHIndex(cfg)
    lsh_index.setup(dataset)
    report('done')

    return lsh_index, normed_mean
Example #9
0
    def _init_falconn(
        self,
        dimension,
        number_bits,
        nb_tables,
    ):
        """Create an (empty) FALCONN cross-polytope index on the instance.

        The index is stored in ``self._falconn_table`` without any data set
        up on it; ``self._falconn_query_object`` is reset to ``None`` and the
        table count is remembered in ``self._FALCONN_NB_TABLES``.

        Args:
            dimension: Dimensionality of the points to be indexed.
            number_bits: Hash bits per table (2^number_bits bins each).
            nb_tables: Number of hash tables; asserted to be at least
                ``self._NEIGHBORS``.
        """
        import falconn

        assert nb_tables >= self._NEIGHBORS

        lsh_config = falconn.LSHConstructionParameters()
        lsh_config.dimension = dimension
        lsh_config.lsh_family = falconn.LSHFamily.CrossPolytope
        lsh_config.distance_function = falconn.DistanceFunction.EuclideanSquared
        lsh_config.l = nb_tables
        # one rotation for dense data, two for sparse data
        lsh_config.num_rotations = 2
        lsh_config.seed = 5721840
        # zero means: use every available thread during setup
        lsh_config.num_setup_threads = 0
        lsh_config.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # Rule of thumb: the number of bins per table (2^number_bits) should
        # be of the same order of magnitude as the number of data points.
        falconn.compute_number_of_hash_functions(number_bits, lsh_config)

        self._falconn_table = falconn.LSHIndex(lsh_config)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables
Example #10
0
    def __set_hierarchical_LSH_Index(self, cluster, number_of_tables, hash_bit):
        """Build the LSH query object used for hierarchical clustering.

        A cross-polytope index is set up over ``cluster`` and its query
        object is stored in ``self.hierarchical_LSH_Index``. Construction
        time is printed to stdout.

        Args:
            cluster: The set of vectors to be clustered; the dimension is
                taken from the length of the first vector.
            number_of_tables: Number of hash tables used by the index.
            hash_bit: Currently ignored: the value passed in is overwritten
                with floor(log2(len(cluster))) before it is used.
        """
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(cluster[0])
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1  # cross-polytope specific parameter
        params_cp.seed = 5721840
        # we want to use all the available threads to set up
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # Use floor(log2(#points)) hash bits so that the number of bins per
        # table is of the same order of magnitude as the number of points.
        # math.log2 is used instead of math.log(x, 2) because the latter is
        # computed as a quotient of two logs and can be slightly inexact.
        hash_bit = math.floor(math.log2(len(cluster)))
        falconn.compute_number_of_hash_functions(hash_bit, params_cp)

        print('Constructing the LSH Index For Cluster Combine Method.')
        t1 = timeit.default_timer()
        table = falconn.LSHIndex(params_cp)
        table.setup(cluster)
        t2 = timeit.default_timer()
        print('Done')
        print('Construction time: {}'.format(t2 - t1))

        self.hierarchical_LSH_Index = table.construct_query_object()
Example #11
0
def test_number_of_hash_functions():
    """Check k / last_cp_dimension chosen for several families and bit counts."""
    params = falconn._internal.LSHConstructionParameters()

    def check(family, dimension, num_bits, want_k, want_last_cp=None):
        # Configure, recompute, and compare against the expected values;
        # last_cp_dimension is only checked when an expectation is given.
        params.lsh_family = family
        params.dimension = dimension
        falconn.compute_number_of_hash_functions(num_bits, params)
        assert params.k == want_k
        if want_last_cp is not None:
            assert params.last_cp_dimension == want_last_cp

    check('hyperplane', 10, 5, 5)
    check('cross_polytope', 10, 5, 1, 16)

    check('hyperplane', 100, 8, 8)
    check('cross_polytope', 100, 8, 1, 128)

    check('cross_polytope', 100, 10, 2, 2)
Example #12
0
def falconn_table(sig_mat):
    ''' Construct a falconn table from a signature matrix and return it.

    The matrix is coerced to float32 (which copies it when the dtype
    differs), divided by its largest row norm (floored at 1e-6) and
    centered on its column-wise mean. When the input is already float32
    these steps modify the caller's array in place. Only the table is
    returned; the center that was subtracted is not.

    Keyword Argument:

    sig_mat -- A numpy ndarray, where each row is signature at a time
    window center
    '''

    # falconn is fed 32-bit floats
    if sig_mat.dtype != np.float32:
        sig_mat = sig_mat.astype(np.float32)

    # Scale every row by the same factor: the largest row norm, but never
    # less than 1e-6 so an all-zero matrix does not divide by zero.
    row_norms = np.linalg.norm(sig_mat, axis=1).reshape(-1, 1)
    largest_norm = max(row_norms)
    sig_mat /= max(1e-6, largest_norm)

    # Center the scaled rows on their mean.
    center = np.mean(sig_mat, axis=0)
    sig_mat -= center

    num_obs = sig_mat.shape[0]

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = len(sig_mat[0])
    # Cross-polytope family with squared Euclidean distance
    lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
    lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
    # Two rotations, since the signature is very likely a large sparse
    # matrix
    lsh_params.num_rotations = 2
    # Number of hash tables and the fixed construction seed
    lsh_params.l = 50
    lsh_params.seed = 5721840
    # 0 setup threads means "use all"; tables are stored bit-packed
    lsh_params.num_setup_threads = 0
    lsh_params.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)

    # Number of hash bits follows the number of rows: int(log2(rows))
    hash_bits = int(np.log2(num_obs))
    falconn.compute_number_of_hash_functions(hash_bits, lsh_params)

    # Build the table over the preprocessed matrix
    table = falconn.LSHIndex(lsh_params)
    table.setup(sig_mat)

    return table
Example #13
0
    def add_to_data(self, point):
        """Return None

        Add a new point to the dataset and rebuild the LSH table over the
        enlarged data.

        The new data and index are built in local variables and only stored
        on the instance once setup has succeeded, so a failing ``np.vstack``
        (e.g. a point of the wrong dimension) or a failing setup leaves
        ``self.data`` and ``self.table`` as they were.
        """
        falconn.compute_number_of_hash_functions(18, self.params_cp)

        print('Constructing the LSH table')
        new_data = np.vstack([self.data, point])
        new_table = falconn.LSHIndex(self.params_cp)
        new_table.setup(new_data)

        self.data = new_data
        self.table = new_table
        # NOTE(review): any query object built from the previous table would
        # keep searching the old data, so a fresh one is created here --
        # confirm that the owning class exposes it as `query_object`.
        self.query_object = new_table.construct_query_object()
Example #14
0
    def add(self, vecs):
        """Index ``vecs`` with FALCONN's default parameters.

        The column-wise mean of ``vecs`` is kept in ``self.center`` and the
        index is set up on ``vecs - self.center`` (a new array; ``vecs`` is
        not modified). Parameters, index and query object are stored in
        ``self.params_cp``, ``self.table`` and ``self.query_object``.
        """
        # Remember the mean so it can be subtracted later as well.
        self.center = np.mean(vecs, axis=0)

        num_points, dimension = vecs.shape[0], vecs.shape[1]
        self.params_cp = falconn.get_default_parameters(
            num_points=num_points,
            dimension=dimension,
            distance=falconn.DistanceFunction.EuclideanSquared,
            is_sufficiently_dense=True)
        # self.params_cp.num_setup_threads = 0  # Single thread mode

        # Roughly log2(#points) hash bits per table.
        hash_bits = int(np.round(np.log2(num_points)))
        falconn.compute_number_of_hash_functions(hash_bits, self.params_cp)

        centered = vecs - self.center
        self.table = falconn.LSHIndex(self.params_cp)
        self.table.setup(centered)
        self.query_object = self.table.construct_query_object()
Example #15
0
    def build_LSH_index(self):
        """Index ``self.vectorized_articles`` and prepare a query object.

        Builds a cross-polytope index with negative-inner-product distance,
        200 tables and 21 hash bits, stores it in ``self.table`` and stores a
        query object probing as many buckets as there are tables in
        ``self.query``. No seed is assigned here.
        """
        articles = self.vectorized_articles

        lsh_params = falconn.LSHConstructionParameters()
        lsh_params.dimension = articles.shape[1]
        lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
        lsh_params.distance_function = falconn.DistanceFunction.NegativeInnerProduct
        lsh_params.l = 200
        lsh_params.num_rotations = 1
        lsh_params.num_setup_threads = 0
        lsh_params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(21, lsh_params)

        self.table = falconn.LSHIndex(lsh_params)
        self.table.setup(self.vectorized_articles)

        # One probe per table.
        self.query = self.table.construct_query_object()
        self.query.set_num_probes(lsh_params.l)
Example #16
0
    def init_lsh(self):
        """
        Initializes locality-sensitive hashing with FALCONN to find nearest
        neighbors in training data.

        For every layer in ``self.layers`` the stored training activations
        are normalized to unit length and centered IN PLACE, an LSH table is
        built over them, and the resulting query object and center are kept
        in ``self.query_objects[layer]`` and ``self.centers[layer]``.
        """
        # contains the object that can be queried to find nearest neighbors at each layer.
        self.query_objects = {}
        # mean of training data representation per layer (that needs to be subtracted before LSH).
        self.centers = {}
        for layer in self.layers:
            assert self.nb_tables >= self.neighbors

            # Normalize all the lengths, since we care about the cosine similarity.
            self.train_activations_lsh[layer] /= np.linalg.norm(
                self.train_activations_lsh[layer], axis=1).reshape(-1, 1)

            # Center the dataset and the queries: this improves the performance of LSH quite a bit.
            center = np.mean(self.train_activations_lsh[layer], axis=0)
            self.train_activations_lsh[layer] -= center
            self.centers[layer] = center

            # LSH parameters
            params_cp = falconn.LSHConstructionParameters()
            # Take the dimension from the array shape rather than from the
            # length of row 1, which does not exist for a single-row layer.
            params_cp.dimension = self.train_activations_lsh[layer].shape[1]
            params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
            params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
            params_cp.l = self.nb_tables
            params_cp.num_rotations = 2  # for dense set it to 1; for sparse data set it to 2
            params_cp.seed = 5721840
            # we want to use all the available threads to set up
            params_cp.num_setup_threads = 0
            params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

            # we build number_bits-bit hashes so that each table has
            # 2^number_bits bins; a good choice is one where the number of
            # bins is of the same order of magnitude as the number of data points
            falconn.compute_number_of_hash_functions(self.number_bits,
                                                     params_cp)

            print('Constructing the LSH table')
            table = falconn.LSHIndex(params_cp)
            table.setup(self.train_activations_lsh[layer])

            # Keep a query object per layer, probing one bucket per table.
            query_object = table.construct_query_object()
            query_object.set_num_probes(self.nb_tables)
            self.query_objects[layer] = query_object
Example #17
0
def setup_lsh(X, num_probes=100):
    """Index the rows of ``X`` and return a configured query object.

    Uses a cross-polytope family with squared Euclidean distance, 100
    tables, one rotation, seed 1234 and 16 hash bits per table.

    Args:
        X: 2-D array of points (asserted), one point per row.
        num_probes: Number of probes the returned query object uses.

    Returns:
        The query object of the set-up ``falconn.LSHIndex``.
    """
    assert X.ndim == 2

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = X.shape[1]
    lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
    lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
    lsh_params.l = 100
    lsh_params.num_rotations = 1
    lsh_params.seed = 1234
    lsh_params.num_setup_threads = 0
    lsh_params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(16, lsh_params)

    index = falconn.LSHIndex(lsh_params)
    index.setup(X)

    searcher = index.construct_query_object()
    searcher.set_num_probes(num_probes)
    return searcher
def _create_bucket(segments):
    """Index ``segments`` for LSH similarity lookup.

    Builds a cross-polytope index (squared Euclidean distance, 25 tables,
    two rotations, 18 hash bits) over the given segments.

    Returns:
        ``(segments, table)``: the input together with its set-up index.
    """
    dimension = len(segments[0])

    lsh_params = falconn.LSHConstructionParameters()
    lsh_params.dimension = dimension
    lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
    lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
    lsh_params.l = 25
    lsh_params.num_rotations = 2
    lsh_params.seed = 5721840
    lsh_params.num_setup_threads = 0
    lsh_params.storage_hash_table = (
        falconn.StorageHashTable.BitPackedFlatHashTable)
    falconn.compute_number_of_hash_functions(18, lsh_params)

    index = falconn.LSHIndex(lsh_params)
    index.setup(segments)

    bucket = (segments, index)
    return bucket
Example #19
0
 def fit(self, X):
     """Build the FALCONN index over ``X``.

     ``X`` is converted to float32 when needed, scaled to unit row norm for
     the 'angular' metric, and centered on its column-wise mean (kept in
     ``self._center``). When ``X`` is already float32 these steps modify
     the caller's array in place. The parameters, the index and a
     zero-filled float32 buffer of the data's dimension are stored in
     ``self._params``, ``self._index`` and ``self._buf``.
     """
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     if self._metric == 'angular':
         row_norms = numpy.linalg.norm(X, axis=1).reshape(-1, 1)
         X /= row_norms

     self._center = numpy.mean(X, axis=0)
     X -= self._center

     import falconn

     dimension = X.shape[1]

     params = falconn.LSHConstructionParameters()
     self._params = params
     params.dimension = dimension
     params.distance_function = 'euclidean_squared'
     params.lsh_family = 'cross_polytope'
     falconn.compute_number_of_hash_functions(self._num_bits, params)
     params.l = self._num_tables
     params.num_rotations = 1
     params.num_setup_threads = 0
     params.storage_hash_table = 'flat_hash_table'
     params.seed = 95225714

     self._index = falconn.LSHIndex(params)
     self._index.setup(X)
     self._index.set_num_probes(self._num_probes)

     # Scratch vector with one float32 slot per data dimension.
     self._buf = numpy.zeros((dimension,), dtype=numpy.float32)
Example #20
0
    def setup_second_layer(self, number_of_tables=50):
        """Build the LSH index over ``self.X_`` and keep its query object.

        The index dimension is ``self.X.shape[1] + 1``. ``self.X_`` is
        converted with ``astype('float')`` (and stored back) before setup;
        that conversion is included in the printed construction time. The
        query object ends up in ``self.query_object``.

        Args:
            number_of_tables: Number of hash tables (``l``).
        """
        lsh_params = falconn.LSHConstructionParameters()
        lsh_params.dimension = self.X.shape[1] + 1
        lsh_params.lsh_family = falconn.LSHFamily.CrossPolytope
        lsh_params.distance_function = falconn.DistanceFunction.EuclideanSquared
        lsh_params.l = number_of_tables
        lsh_params.num_rotations = 1
        lsh_params.seed = 5721840
        lsh_params.num_setup_threads = 0
        lsh_params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(15, lsh_params)

        print('Constructing the LSH table')
        started = timeit.default_timer()
        index = falconn.LSHIndex(lsh_params)
        self.X_ = self.X_.astype('float')
        index.setup(self.X_)
        finished = timeit.default_timer()
        print('Done')
        print('Construction time: {}'.format(finished - started))

        self.query_object = index.construct_query_object()
Example #21
0
    def __falconn_fit(self):
        """
        Set up a FALCONN cross-polytope index over ``self.features``.

        The table count and hash-bit count are read from
        ``self.kwargs['nb_tables']`` and ``self.kwargs['number_bits']``.
        ``self.features`` is centered IN PLACE on its column-wise mean (kept
        in ``self.center``) before being handed to the index, which is stored
        in ``self._falconn_table``. No query object is created here:
        ``self._falconn_query_object`` is reset to ``None``.
        """

        import falconn

        dimension = self.features.shape[1]
        settings = self.kwargs
        nb_tables = settings['nb_tables']
        number_bits = settings['number_bits']

        lsh_config = falconn.LSHConstructionParameters()
        lsh_config.dimension = dimension
        lsh_config.lsh_family = falconn.LSHFamily.CrossPolytope
        lsh_config.distance_function = falconn.DistanceFunction.EuclideanSquared
        lsh_config.l = nb_tables
        # one rotation for dense data, two for sparse data
        lsh_config.num_rotations = 2
        lsh_config.seed = 5721840
        # zero means: use every available thread during setup
        lsh_config.num_setup_threads = 0
        lsh_config.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable

        # Rule of thumb: the number of bins per table (2^number_bits) should
        # be of the same order of magnitude as the number of data points.
        falconn.compute_number_of_hash_functions(number_bits, lsh_config)

        self._falconn_table = falconn.LSHIndex(lsh_config)
        self._falconn_query_object = None
        self._FALCONN_NB_TABLES = nb_tables

        # Centering data (and later the queries) improves LSH performance
        # quite a bit.
        self.center = np.mean(self.features, axis=0)
        self.features -= self.center

        # Hand the centered features to the index.
        self._falconn_table.setup(self.features)
Example #22
0
 def fit(self, X):
     """Build the FALCONN index and query object over ``X``.

     ``X`` is converted to float32 when needed. For the 'hamming' metric
     every entry below 0.5 is replaced by -1; for 'angular' and 'hamming'
     rows are scaled to unit norm. The data is then centered on its
     column-wise mean (kept in ``self._center``). When ``X`` is already
     float32 these steps modify the caller's array in place. Parameters,
     index and query object are stored in ``self._params``,
     ``self._index`` and ``self._query_object``.
     """
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)

     metric = self._metric
     if metric == 'hamming':
         # map zeroes to -1
         X[X < 0.5] = -1
     if metric in ('angular', 'hamming'):
         row_norms = numpy.linalg.norm(X, axis=1).reshape(-1, 1)
         X /= row_norms

     self._center = numpy.mean(X, axis=0)
     X -= self._center

     params = falconn.LSHConstructionParameters()
     self._params = params
     params.dimension = X.shape[1]
     params.distance_function = falconn.DistanceFunction.EuclideanSquared
     params.lsh_family = falconn.LSHFamily.CrossPolytope
     falconn.compute_number_of_hash_functions(self._num_bits, params)
     params.l = self._num_tables
     params.num_rotations = 1
     params.num_setup_threads = 0
     params.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
     params.seed = 95225714

     index = falconn.LSHIndex(params)
     self._index = index
     index.setup(X)

     query_object = index.construct_query_object()
     self._query_object = query_object
     query_object.set_num_probes(self._num_probes)
Example #23
0
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = 'cross_polytope'
    params_cp.distance_function = 'euclidean_squared'
    params_cp.l = number_of_tables
    # we set one rotation, since the data is dense enough,
    # for sparse data set it to 2
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    # we want to use all the available threads to set up
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = 'bit_packed_flat_hash_table'
    # we build 18-bit hashes so that each table has
    # 2^18 bins; this is a good choice since 2^18 is of the same
    # order of magnitude as the number of data points
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    # find the smallest number of probes to achieve accuracy 0.9
    # using the binary search
    print('Choosing number of probes')
    number_of_probes = number_of_tables
    def evaluate_number_of_probes(number_of_probes):
        table.set_num_probes(number_of_probes)
Example #24
0
def lsh_for_ccd(dataset: np.array, queries: list, methoddict: dict,
                lastIndexBefore: int):
    """Find near neighbours of each query via LSH and report clone pairs.

    A cross-polytope index (10 tables, 18 hash bits) is built over
    ``dataset``. For every query, all neighbours within ``endTheta`` are
    fetched and each (query, neighbour) pair is either written out as a
    clone of identical vectors or passed through the distance / beta checks
    below.

    The function relies on names that are not defined in this block and must
    exist at module level: ``endTheta``, ``optTheta``, ``cloneTheta``,
    ``minbeta``, ``writer``, ``getCloneTuple``, ``getCloneResult``,
    ``lineFilter``, ``betaMain`` and ``getOptDist``.

    Args:
        dataset: Vectors to index, one per row.
        queries: Vectors to look up; each must support ``tolist()``.
        methoddict: Maps the MD5 hex digest of ``str(vector.tolist())`` to a
            metadata string (layout as in the example comment in the loop).
        lastIndexBefore: Starting value of the running query counter; only
            neighbours with an index greater than the counter are compared.

    Returns:
        None. Results are emitted through ``writer`` / ``getCloneResult``.
    """
    number_of_tables = 10
    # queries = dataset

    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = len(dataset[0])
    params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = 5721840
    params_cp.num_setup_threads = 0
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    falconn.compute_number_of_hash_functions(18, params_cp)

    print('Constructing the LSH table')
    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(dataset)
    t2 = timeit.default_timer()
    print('Done')
    print('Construction time: {}'.format(t2 - t1))

    query_object = table.construct_query_object()
    # NOTE(review): methodfilterset is never read or written after this line.
    methodfilterset = set()

    currentIter = lastIndexBefore
    # NOTE(review): totalIter is only referenced by the commented-out
    # progress print at the end of the loop.
    totalIter = len(dataset)

    for query in queries:
        neighbors = query_object.find_near_neighbors(query, threshold=endTheta)
        for neighbor in neighbors:
            # Vectors are identified by the MD5 of their stringified contents;
            # identical vectors therefore share one methoddict entry.
            queryMdKey = hashlib.md5(str(query.tolist()).encode()).hexdigest()
            neighborMdKey = hashlib.md5(
                str(dataset[neighbor].tolist()).encode()).hexdigest()
            # 13ccdCodeLineSeparate112443321234ccdTokenSeparate/home/xxx/xx.java,3,15ccdFileKeySeparate/home/xxx/xx.java,31,45
            left = str(methoddict[queryMdKey])
            right = str(methoddict[neighborMdKey])
            # Split "<line>ccdCodeLineSeparate<token>ccdTokenSeparate<methods>"
            # into its three parts for both sides.
            ccdLeft = left.split("ccdCodeLineSeparate")
            ccdRight = right.split("ccdCodeLineSeparate")
            methodsLeftLine = ccdLeft[0]
            methodsRightLine = ccdRight[0]
            ccdTokenLeft = ccdLeft[1].split('ccdTokenSeparate')
            ccdTokenRight = ccdRight[1].split('ccdTokenSeparate')
            methodsLeft = ccdTokenLeft[1]
            methodsRight = ccdTokenRight[1]
            methodsLeftToken = ccdTokenLeft[0]
            methodsRightToken = ccdTokenRight[0]

            # Same digest: query and neighbour are the same vector. If the
            # entry lists several methods (joined by "ccdFileKeySeparate"),
            # write every pair of them out, then move to the next neighbour.
            if queryMdKey == neighborMdKey:
                tmpStr = methodsLeft
                if "ccdFileKeySeparate" in tmpStr:
                    tmpArr = tmpStr.split("ccdFileKeySeparate")
                    if len(tmpArr) == 2:
                        result = getCloneTuple(tmpArr[0] + "," + tmpArr[1])
                        writer.writerow(result)
                    else:
                        for i in range(0, len(tmpArr)):
                            for j in range(i + 1, len(tmpArr)):
                                result = getCloneTuple(tmpArr[i] + "," +
                                                       tmpArr[j])
                                writer.writerow(result)
                continue

            # Only neighbours with an index above the running counter are
            # compared.
            if neighbor > currentIter:
                if not lineFilter(int(methodsLeftLine), int(methodsRightLine)):
                    # Squared Euclidean distance between query and neighbour.
                    dist = np.linalg.norm(query - dataset[neighbor])
                    dist *= dist
                    if dist <= optTheta:
                        getCloneResult(methodsLeft, methodsRight)
                    else:
                        # Beyond optTheta: adjust the distance using the
                        # token-based beta and compare against cloneTheta.
                        beta = betaMain(methodsLeftToken, methodsRightToken)
                        if beta <= minbeta:
                            continue
                        dist = getOptDist(beta, dist)
                        if dist < cloneTheta:
                            getCloneResult(methodsLeft, methodsRight)
        currentIter = currentIter + 1
        # print("%d / %d \r" % (currentIter, totalIter))
    # Prints the current timestamp once all queries have been processed.
    print(time.time())
Example #25
0
    def __init__(self, dataset, params, num_bits=16):
        """Build an LSH index over ``dataset`` and keep it in ``self._table``.

        Args:
            dataset: Points to index.
            params: LSH construction parameters; their number of hash
                functions is recomputed here (``params`` is modified).
            num_bits: Hash bits per table used for that recomputation.
        """
        fa.compute_number_of_hash_functions(num_bits, params)

        index = self._table = fa.LSHIndex(params)
        index.setup(dataset)
Example #26
0
def generate_candidate_threshold(entity_embedding=None, data_ids="OpenEA", path="", threshold=0.2, output_path=False,
                                 entity_file="ent_embeds,npy", normalize=True, metric="euclidean", lsh_family="hyperplane", number_of_tables=500):
    """Generate candidate entity pairs with a FALCONN near-neighbour query.

    An LSH index is built over the left-hand entity vectors of ``data_ids``.
    Every right-hand vector is then used as a query, and each left entity
    returned by ``find_near_neighbors`` becomes a candidate pair labelled
    ``1`` when it is the true counterpart of the query and ``0`` otherwise.
    Recall and the candidates-per-entity ratio are printed.

    :param entity_embedding: 2-D array of entity vectors indexed by entity
        id. When ``None`` it is loaded with ``np.load(path + entity_file)``.
    :param data_ids: either the string ``"OpenEA"`` / ``"dbp15k"`` (the test
        links are then read from files under ``path``) or an explicit
        sequence of ``[left_id, right_id]`` pairs.
    :param path: directory prefix, concatenated verbatim with file names
        (so it normally ends with ``/``).
    :param threshold: distance threshold passed to ``find_near_neighbors``.
    :param output_path: when ``True`` the candidate pairs are written to
        ``<parent of path>/topk_<threshold>_name_ngram``.
    :param entity_file: file name of the embedding matrix.
        NOTE(review): the default ``"ent_embeds,npy"`` looks like a typo for
        ``"ent_embeds.npy"``; it is left unchanged here because it is part of
        the public signature -- confirm and fix at the call sites.
    :param normalize: L2-normalise both sides before indexing/querying.
    :param metric: ``"inner"`` -- inner product of the vectors;
        ``"euclidean"`` -- Euclidean distance (after L2 normalisation it is
        proportional to the cosine distance).
    :param lsh_family: ``"crosspolytope"`` or ``"hyperplane"``.
    :param number_of_tables: number of LSH tables; also used as the number
        of probes per query.
    :return: None
    """

    if entity_embedding is None:
        entity_file_path = path + entity_file
        entity_embedding = np.load(entity_file_path)
        print("Load [%s] successfully!" % (entity_file_path))

    # Compare by value, and only when a string was passed: ``is`` on string
    # literals depends on interning, and ``==`` on an id array would be an
    # element-wise comparison.
    dataset_name = data_ids if isinstance(data_ids, str) else None

    if dataset_name == "OpenEA":
        ent2id1, _, _ = read_ent_id(path + "kg1_ent_ids")
        ent2id2, _, _ = read_ent_id(path + "kg2_ent_ids")
        # NOTE(review): the test-links location is rebuilt from fixed
        # components of ``path``, so this only works for the directory
        # layout the original author used.
        paths = path.split('/')
        test_path = "/".join([paths[1], paths[2], paths[3], "datasets", paths[7], paths[8], paths[9]])
        test_ids = []
        with open('/' + test_path + r"/test_links", 'r', encoding='utf-8') as f:
            for line in f:
                items = line.strip().split("\t")
                test_ids.append([int(ent2id1[items[0]]), int(ent2id2[items[1]])])
        data_ids = test_ids
    elif dataset_name == "dbp15k":
        # Only consider alignments from the test set.
        data_ids = read_ids(path + "ref_ent_ids")

    data_ids = np.array(data_ids).astype(int)
    # astype() returns a copy, so the centring below never touches the
    # caller's array.
    entity_embedding = entity_embedding.astype(np.float32)
    if metric == "euclidean":
        entity_embedding -= np.mean(entity_embedding, axis=0)

    Lvec = entity_embedding[data_ids[:, 0]]
    Rvec = entity_embedding[data_ids[:, 1]]
    if os.path.exists(path + "mapping_mat.npy"):
        # Final vectors after the OpenEA model's transformation: only the
        # left side is mapped.
        mapping = np.load(path + "mapping_mat.npy")
        Lvec = np.matmul(Lvec, mapping)

    if normalize:
        Lvec = preprocessing.normalize(Lvec, norm="l2", axis=1)
        Rvec = preprocessing.normalize(Rvec, norm="l2", axis=1)

    seed = 119417657
    # Maps a row index of the LSH table back to the left entity id.
    L_True = data_ids[:, 0].tolist()
    print("shape:", entity_embedding.shape)
    params_cp = falconn.LSHConstructionParameters()
    params_cp.dimension = entity_embedding.shape[1]
    if lsh_family == "crosspolytope":
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
    elif lsh_family == "hyperplane":
        params_cp.lsh_family = falconn.LSHFamily.Hyperplane
    if metric == "euclidean":
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
    elif metric == "inner":
        params_cp.distance_function = falconn.DistanceFunction.NegativeInnerProduct

    params_cp.l = number_of_tables
    params_cp.num_rotations = 1
    params_cp.seed = seed
    # Two threads are used to set up the tables.
    params_cp.num_setup_threads = 2
    params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
    # Request 20-bit hashes, i.e. about 2^20 bins per table.
    falconn.compute_number_of_hash_functions(20, params_cp)

    t1 = timeit.default_timer()
    table = falconn.LSHIndex(params_cp)
    table.setup(Lvec)
    t2 = timeit.default_timer()

    print('Construction time: {}'.format(t2 - t1))
    query_object = table.construct_query_object()
    number_of_probes = number_of_tables
    print('Choosing number of probes: ', number_of_probes)
    query_object.set_num_probes(number_of_probes)
    t1 = timeit.default_timer()
    true_cnt = 0
    total = 0
    true_all = data_ids.shape[0]
    node_pairs = []
    print("Metric:", metric, "Threshold:", threshold)
    for pair, query in zip(data_ids, Rvec):
        ans = query_object.find_near_neighbors(query, threshold=threshold)
        for neighbor in ans:
            candidate = L_True[neighbor]
            if pair[0] == candidate:
                true_cnt += 1
                node_pairs.append((pair[0], pair[1], 1))
            else:
                node_pairs.append((candidate, pair[1], 0))
        total += len(ans)
    print('Threshold:[%f] True cnt:[%d] Generate All cnt:[%d] Total:[%d] Recall:[%f] P/E ratio:[%f] Metric:[%s]'
          % (threshold, true_cnt, total, true_all, true_cnt/true_all, total/true_all, metric))

    t2 = timeit.default_timer()
    print('Generate Candidate time: {}'.format(t2 - t1))
    # Deliberately an equality test against True: a string passed as
    # ``output_path`` has never triggered writing, and that is preserved.
    if output_path == True:  # noqa: E712
        out_file = "/".join(path.split('/')[:-1]) + '/topk_' + str(threshold) + '_name_ngram'
        print('output path:', out_file)
        with open(out_file, 'w', encoding='utf8') as f:
            # The ids are integers, so they must be formatted rather than
            # concatenated with '+', which raised TypeError.
            f.writelines('{}\t{}\t{}\n'.format(left, right, label)
                         for left, right, label in node_pairs)
Example #27
0
        # queries -= center
        # print('Done')
        #assert dataset.dtype == np.float32
        number_of_probes = [900]
        #

        params_cp_blue = falconn.LSHConstructionParameters()
        params_cp_blue.dimension = len(dataset_blue[0])
        params_cp_blue.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp_blue.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp_blue.l = number_of_tables
        params_cp_blue.num_rotations = 1
        params_cp_blue.seed = 666666
        params_cp_blue.num_setup_threads = 1
        params_cp_blue.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(20, params_cp_blue)

        print('Constructing the LSH table')
        t1 = timeit.default_timer()
        table_blue = falconn.LSHIndex(params_cp_blue)
        table_blue.setup(dataset_blue)
        t2 = timeit.default_timer()
        query_object_blue = table_blue.construct_query_object()
        print('Done')
        print('Construction time: {}'.format((t2 - t1)))

        params_cp_green = falconn.LSHConstructionParameters()
        params_cp_green.dimension = len(dataset_green[0])
        params_cp_green.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp_green.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp_green.l = number_of_tables
# In[ ]:

import falconn

# In[ ]:

# Notebook cell: configure a single-table cross-polytope LSH index and time
# both its construction and a 5-nearest-neighbour query.
# `num_dimensions` and `dataset` are not defined in this excerpt; presumably
# they are set by earlier notebook cells -- confirm.
parameters = falconn.LSHConstructionParameters()
num_tables = 1
parameters.l = num_tables
parameters.dimension = num_dimensions
parameters.distance_function = falconn.DistanceFunction.EuclideanSquared
parameters.lsh_family = falconn.LSHFamily.CrossPolytope
parameters.num_rotations = 1
parameters.num_setup_threads = 1
parameters.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
# Fills in the hash-function settings on `parameters` for 16-bit hashes.
falconn.compute_number_of_hash_functions(16, parameters)

index = falconn.LSHIndex(parameters)
# IPython `%time` magic: build the index over `dataset` and report how long
# it took.
get_ipython().run_line_magic('time', 'index.setup(dataset)')

query_object = index.construct_query_object()
num_probes = 1
query_object.set_num_probes(num_probes)

# IPython `%timeit` magic: benchmark a 5-nearest-neighbour lookup.
# NOTE(review): `query` is only assigned in the following cell of this
# excerpt, so this statement needs an earlier cell to have defined it --
# confirm the intended cell execution order.
get_ipython().run_line_magic(
    'timeit', 'query_object.find_k_nearest_neighbors(query, 5)')

# In[ ]:

# Use the dataset row at index 5000 as the query and print the indices
# returned for its 5 nearest neighbours.
query = dataset[5000]
print(query_object.find_k_nearest_neighbors(query, 5))
Example #29
0
    def set_clustering_LSH_Index(self, number_of_queries, query_accuracy, number_of_tables, hash_bit):
        """Build the clustering LSH index and store its query object.

        The last ``number_of_queries`` entries of ``self.w2v_vectors`` are
        held out as queries; the remaining entries are indexed. Both parts
        are shifted by the mean of the indexed part (mean-centring, not
        length normalisation). The subtraction is done in place on the
        slices, so if ``self.w2v_vectors`` is a NumPy array its contents
        are modified as well.

        :param number_of_queries: number of trailing vectors used as queries
            when choosing the number of probes and measuring precision.
        :param query_accuracy: target accuracy forwarded to
            ``self.probeGenerator``. Per the original author's note, a value
            of 1 degenerates the LSH index into a linear search.
        :param number_of_tables: number of hash tables used for a
            nearest-neighbour search.
        :param hash_bit: forwarded to
            ``falconn.compute_number_of_hash_functions`` to determine the
            number of hash functions.
        :return: None. The configured query object is stored on
            ``self.query_object``.
        """
        print("Setting Clustering Index")

        split_at = len(self.w2v_vectors) - number_of_queries
        queries = self.w2v_vectors[split_at:]
        w2v_vectors = self.w2v_vectors[:split_at]

        # Centre both parts on the mean of the indexed vectors (in place).
        center = np.mean(w2v_vectors, axis=0)
        w2v_vectors -= center
        queries -= center

        # Ground truth from an exact linear scan.
        answers = self.linearScan_answerGenerator(w2v_vectors, queries)

        print('Choosing number of probes')
        init_number_of_probes = 600

        # Construction parameters for the cross-polytope index.
        params_cp = falconn.LSHConstructionParameters()
        params_cp.dimension = len(w2v_vectors[0])  # = 50 for Glove6B.50d
        params_cp.lsh_family = falconn.LSHFamily.CrossPolytope
        params_cp.distance_function = falconn.DistanceFunction.EuclideanSquared
        params_cp.l = number_of_tables
        params_cp.num_rotations = 1
        params_cp.seed = 5721840
        params_cp.num_setup_threads = 0
        params_cp.storage_hash_table = falconn.StorageHashTable.BitPackedFlatHashTable
        falconn.compute_number_of_hash_functions(hash_bit, params_cp)

        # Build the index and time the construction.
        print('Constructing the LSH Index')
        build_start = timeit.default_timer()
        lsh_index = falconn.LSHIndex(params_cp)
        lsh_index.setup(w2v_vectors)
        build_seconds = timeit.default_timer() - build_start
        print('Done')
        print('Construction time: {}'.format(build_seconds))

        query_object = lsh_index.construct_query_object()
        number_of_probes = self.probeGenerator(
            query_accuracy, init_number_of_probes, query_object, answers,
            queries, number_of_tables)
        query_object.set_num_probes(number_of_probes)

        # Performance statistics: count the queries whose LSH nearest
        # neighbour matches the linear-scan answer.
        query_start = timeit.default_timer()
        score = sum(
            1 for i, query in enumerate(queries)
            if query_object.find_nearest_neighbor(query) == answers[i])
        query_seconds = timeit.default_timer() - query_start
        print('Query time: {}'.format(query_seconds / len(queries)))
        print('Precision: {}'.format(float(score) / len(queries)))

        self.query_object = query_object
        print("Vectors Successfully Hashed. Clustering LSH Index Created")