コード例 #1
0
ファイル: test_bit_utils.py プロジェクト: spongezhang/SMQTK
 def test_int_to_bit_vector_large_1(self):
     numpy.testing.assert_array_equal(
         bit_utils.int_to_bit_vector_large(1),
         [True]
     )
     numpy.testing.assert_array_equal(
         bit_utils.int_to_bit_vector_large(1, 7),
         ([False] * 6) + [True]
     )
コード例 #2
0
ファイル: test_bit_utils.py プロジェクト: spongezhang/SMQTK
 def test_int_to_bit_vector_large_0(self):
     # Need at least one bit to represent 0.
     numpy.testing.assert_array_equal(
         bit_utils.int_to_bit_vector_large(0),
         [False]
     )
     # Force 5 bits.
     numpy.testing.assert_array_equal(
         bit_utils.int_to_bit_vector_large(0, 5),
         [False, False, False, False, False]
     )
コード例 #3
0
    def test_remove_from_index_invalid_key_single(self):
        bt = SkLearnBallTreeHashIndex(random_seed=0)
        index = np.ndarray((1000, 256), bool)
        for i in range(1000):
            index[i] = int_to_bit_vector_large(i, 256)
        bt.build_index(index)
        # Copy post-build index for checking no removal occurred
        bt_data = np.copy(bt.bt.data)

        self.assertRaises(KeyError, bt.remove_from_index, [
            int_to_bit_vector_large(1001, 256),
        ])
        np.testing.assert_array_equal(bt_data, np.asarray(bt.bt.data))
コード例 #4
0
ファイル: test_bit_utils.py プロジェクト: spongezhang/SMQTK
    def test_int_to_bit_vector_large_large(self):
        # Try large integer bit vectors
        int_val = (2**256) - 1
        expected_vector = [True] * 256
        numpy.testing.assert_array_equal(
            bit_utils.int_to_bit_vector_large(int_val),
            expected_vector
        )

        int_val = (2**512)
        expected_vector = [True] + ([False] * 512)
        numpy.testing.assert_array_equal(
            bit_utils.int_to_bit_vector_large(int_val),
            expected_vector
        )
コード例 #5
0
ファイル: linear.py プロジェクト: fabgithub/SMQTK
    def nn(self, h, n=1):
        """
        Return the nearest `N` neighbors to the given hash code.

        Distances are in the range [0,1] and are the percent different each
        neighbor hash is from the query, based on the number of bits contained
        in the query.

        :param h: Hash code to compute the neighbors of. Should be the same bit
            length as indexed hash codes.
        :type h: numpy.ndarray[bool] | list[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :raises ValueError: No index to query from.

        :return: Tuple of nearest N hash codes and a tuple of the distance
            values to those neighbors.
        :rtype: (tuple[numpy.ndarray[bool], tuple[float])

        """
        super(LinearHashIndex, self).nn(h, n)

        h_int = bit_vector_to_int_large(h)
        bits = len(h)
        #: :type: list[int|long]
        near_codes = \
            heapq.nsmallest(n, self.index,
                            lambda e: hamming_distance(h_int, e)
                            )
        distances = list(map(hamming_distance, near_codes,
                             [h_int] * len(near_codes)))
        return [int_to_bit_vector_large(c, bits) for c in near_codes], \
               [d / float(bits) for d in distances]
コード例 #6
0
ファイル: linear.py プロジェクト: dhandeo/SMQTK
    def nn(self, h, n=1):
        """
        Return the nearest `N` neighbors to the given hash code.

        Distances are in the range [0,1] and are the percent different each
        neighbor hash is from the query, based on the number of bits contained
        in the query.

        :param h: Hash code to compute the neighbors of. Should be the same bit
            length as indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :raises ValueError: No index to query from.

        :return: Tuple of nearest N hash codes and a tuple of the distance
            values to those neighbors.
        :rtype: (tuple[numpy.ndarray[bool], tuple[float])

        """
        super(LinearHashIndex, self).nn(h, n)

        h_int = bit_vector_to_int_large(h)
        bits = len(h)
        #: :type: list[int|long]
        near_codes = \
            heapq.nsmallest(n, self.index,
                            lambda e: hamming_distance(h_int, e)
                            )
        distances = map(hamming_distance, near_codes,
                        [h_int] * len(near_codes))
        return [int_to_bit_vector_large(c, bits) for c in near_codes], \
               [d / float(bits) for d in distances]
コード例 #7
0
    def test_remove_from_index_invalid_key_multiple(self):
        # Test that mixed valid and invalid keys raises KeyError and does not
        # modify the index.
        bt = SkLearnBallTreeHashIndex(random_seed=0)
        index = np.ndarray((1000, 256), bool)
        for i in range(1000):
            index[i] = int_to_bit_vector_large(i, 256)
        bt.build_index(index)
        # Copy post-build index for checking no removal occurred
        bt_data = np.copy(bt.bt.data)

        self.assertRaises(KeyError, bt.remove_from_index, [
            int_to_bit_vector_large(42, 256),
            int_to_bit_vector_large(1008, 256),
        ])
        np.testing.assert_array_equal(bt_data, np.asarray(bt.bt.data))
コード例 #8
0
    def test_remove_from_index(self):
        # Test that we actually remove from the index.
        bt = SkLearnBallTreeHashIndex(random_seed=0)
        index = np.ndarray((1000, 256), bool)
        for i in range(1000):
            index[i] = int_to_bit_vector_large(i, 256)
        bt.build_index(index)
        # Copy post-build index for checking no removal occurred
        bt_data = np.copy(bt.bt.data)

        bt.remove_from_index([
            int_to_bit_vector_large(42, 256),
            int_to_bit_vector_large(998, 256),
        ])
        # Make sure expected arrays are missing from data block.
        new_data = np.asarray(bt.bt.data)
        self.assertEqual(new_data.shape, (998, 256))
        new_data_set = set(tuple(r) for r in new_data.tolist())
        self.assertNotIn(tuple(int_to_bit_vector_large(42, 256)), new_data_set)
        self.assertNotIn(tuple(int_to_bit_vector_large(998, 256)),
                         new_data_set)
コード例 #9
0
    def test_remove_from_index_last_element(self):
        """
        Test removing the final the only element / final elements from the
        index.
        """
        # Add one hash, remove one hash.
        bt = SkLearnBallTreeHashIndex(random_seed=0)
        index = np.ndarray((1, 256), bool)
        index[0] = int_to_bit_vector_large(1, 256)
        bt.build_index(index)
        self.assertEqual(bt.count(), 1)
        bt.remove_from_index(index)
        self.assertEqual(bt.count(), 0)
        self.assertIsNone(bt.bt)

        # Add many hashes, remove many hashes in batches until zero
        bt = SkLearnBallTreeHashIndex(random_seed=0)
        index = np.ndarray((1000, 256), bool)
        for i in range(1000):
            index[i] = int_to_bit_vector_large(i, 256)
        bt.build_index(index)
        # Remove first 250
        bt.remove_from_index(index[:250])
        self.assertEqual(bt.count(), 750)
        self.assertIsNotNone(bt.bt)
        # Remove second 250
        bt.remove_from_index(index[250:500])
        self.assertEqual(bt.count(), 500)
        self.assertIsNotNone(bt.bt)
        # Remove third 250
        bt.remove_from_index(index[500:750])
        self.assertEqual(bt.count(), 250)
        self.assertIsNotNone(bt.bt)
        # Remove final 250
        bt.remove_from_index(index[750:])
        self.assertEqual(bt.count(), 0)
        self.assertIsNone(bt.bt)
コード例 #10
0
    def test_remove_from_index_last_element_with_cache(self):
        """
        Test removing final element also clears the cache element.
        """
        c = DataMemoryElement()
        bt = SkLearnBallTreeHashIndex(cache_element=c, random_seed=0)
        index = np.ndarray((1, 256), bool)
        index[0] = int_to_bit_vector_large(1, 256)

        bt.build_index(index)
        self.assertEqual(bt.count(), 1)
        self.assertFalse(c.is_empty())

        bt.remove_from_index(index)
        self.assertEqual(bt.count(), 0)
        self.assertTrue(c.is_empty())
コード例 #11
0
ファイル: make_balltree.py プロジェクト: thezedwards/SMQTK
def main():
    args = cli_parser().parse_args()

    initialize_logging(logging.getLogger('smqtk'), logging.DEBUG)
    initialize_logging(logging.getLogger('__main__'), logging.DEBUG)
    log = logging.getLogger(__name__)

    hash2uuids_fp = os.path.abspath(args.hash2uuids_fp)
    bit_len = args.bit_len
    leaf_size = args.leaf_size
    rand_seed = args.rand_seed
    balltree_model_fp = os.path.abspath(args.balltree_model_fp)

    assert os.path.isfile(hash2uuids_fp), "Bad path: '%s'" % hash2uuids_fp
    assert os.path.isdir(os.path.dirname(balltree_model_fp)), \
        "Bad path: %s" % balltree_model_fp

    log.debug("hash2uuids_fp    : %s", hash2uuids_fp)
    log.debug("bit_len          : %d", bit_len)
    log.debug("leaf_size        : %d", leaf_size)
    log.debug("rand_seed        : %d", rand_seed)
    log.debug("balltree_model_fp: %s", balltree_model_fp)


    log.info("Loading hash2uuids table")
    with open(hash2uuids_fp) as f:
        hash2uuids = cPickle.load(f)

    log.info("Computing hash-code vectors")
    hash_vectors = []  #[int_to_bit_vector_large(h, bit_len) for h in hash2uuids]
    rs = [0] * 7
    for h in hash2uuids:
        hash_vectors.append( int_to_bit_vector_large(h, bit_len) )
        report_progress(log.debug, rs, 1.)

    log.info("Initializing ball tree")
    btree = SkLearnBallTreeHashIndex(balltree_model_fp, leaf_size, rand_seed)

    log.info("Building ball tree")
    btree.build_index(hash_vectors)
コード例 #12
0
ファイル: linear.py プロジェクト: Kitware/SMQTK
    def _nn(self, h, n=1):
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbor hash codes as bit-vectors to the given hash code
        bit-vector.

        Distances are in the range [0,1] and are the percent different each
        neighbor hash is from the query, based on the number of bits contained
        in the query (normalized hamming distance).

        When this internal method is called, we have already checked that our
        index is not empty.

        :param h: Hash code to compute the neighbors of. Should be the same bit
            length as indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N hash codes and a tuple of the distance
            values to those neighbors.
        :rtype: (tuple[numpy.ndarray[bool]], tuple[float])

        """
        with self._model_lock:
            h_int = bit_vector_to_int_large(h)
            bits = len(h)
            #: :type: list[int|long]
            near_codes = \
                heapq.nsmallest(n, self.index,
                                lambda e: hamming_distance(h_int, e)
                                )
            distances = map(hamming_distance, near_codes,
                            [h_int] * len(near_codes))
            return [int_to_bit_vector_large(c, bits) for c in near_codes], \
                   [d / float(bits) for d in distances]
コード例 #13
0
    def _nn(self, h, n=1):
        """
        Internal method to be implemented by sub-classes to return the nearest
        `N` neighbor hash codes as bit-vectors to the given hash code
        bit-vector.

        Distances are in the range [0,1] and are the percent different each
        neighbor hash is from the query, based on the number of bits contained
        in the query (normalized hamming distance).

        When this internal method is called, we have already checked that our
        index is not empty.

        :param h: Hash code to compute the neighbors of. Should be the same bit
            length as indexed hash codes.
        :type h: numpy.ndarray[bool]

        :param n: Number of nearest neighbors to find.
        :type n: int

        :return: Tuple of nearest N hash codes and a tuple of the distance
            values to those neighbors.
        :rtype: (tuple[numpy.ndarray[bool]], tuple[float])

        """
        with self._model_lock:
            h_int = bit_vector_to_int_large(h)
            bits = len(h)
            #: :type: list[int|long]
            near_codes = \
                heapq.nsmallest(n, self.index,
                                lambda e: hamming_distance(h_int, e)
                                )
            distances = map(hamming_distance, near_codes,
                            [h_int] * len(near_codes))
            return [int_to_bit_vector_large(c, bits) for c in near_codes], \
                   [d / float(bits) for d in distances]