def _make_ftor_itq(self, bits=32):
    """
    Build an ITQ functor together with a callable that fits it.

    :param bits: Value passed as ``bit_length`` to the ``ItqFunctor``
        constructor.
    :return: Tuple of the functor and a one-argument function that calls
        ``fit`` on that same functor with the given argument.
    """
    functor = ItqFunctor(bit_length=bits, random_seed=self.RANDOM_SEED)

    def itq_fit(D):
        # Closes over ``functor`` so fitting mutates the returned instance.
        functor.fit(D)

    return functor, itq_fit
def nearestNeighborIndex(item, user, descriptorSet):
    """
    Get the nearest neighbor index from a given item and descriptor set.

    :param item: Item to find the nn index from, usually the item that the
        user is performing the nearest neighbors search on.
    :param user: The owner of the .smqtk folder.
    :param descriptorSet: The relevant descriptor set.
    :returns: A read-only ``LSHNearestNeighborIndex``, or ``None`` when
        looking up any of the required SMQTK files raised an exception.
    """
    folder = ModelImporter.model('folder')

    # All data elements share the same API root and token.
    _GirderDataElement = functools.partial(GirderDataElement,
                                           api_root=getApiUrl(),
                                           token=getCurrentToken()['_id'])

    smqtkFolder = folder.createFolder(folder.load(item['folderId'], user=user),
                                      '.smqtk', reuseExisting=True)

    try:
        meanVecFileId = localSmqtkFileIdFromName(smqtkFolder, 'mean_vec.npy')
        rotationFileId = localSmqtkFileIdFromName(smqtkFolder, 'rotation.npy')
        hash2uuidsFileId = localSmqtkFileIdFromName(smqtkFolder,
                                                    'hash2uuids.pickle')
    except Exception:
        # Deliberate best-effort: any lookup failure is treated as "no index
        # available" rather than an error for the caller.
        logger.warning('SMQTK files didn\'t exist for performing NN on %s',
                       item['_id'])
        return None

    # TODO Should these be Girder data elements? Unnecessary HTTP requests.
    functor = ItqFunctor(mean_vec_cache=_GirderDataElement(meanVecFileId),
                         rotation_cache=_GirderDataElement(rotationFileId))

    hash2uuidsKV = MemoryKeyValueStore(_GirderDataElement(hash2uuidsFileId))

    return LSHNearestNeighborIndex(functor, descriptorSet, hash2uuidsKV,
                                   read_only=True)
def test_save_model_with_writable_caches(self):
    # With both cache elements writable, saving should store each model
    # component as the bytes ``numpy.save`` produces for it.
    def npy_bytes(array):
        # Serialize ``array`` with ``numpy.save`` into an in-memory buffer.
        buf = six.BytesIO()
        # noinspection PyTypeChecker
        numpy.save(buf, array)
        return buf.getvalue()

    mean_vec = numpy.array([1, 2, 3])
    rotation = numpy.eye(3)
    mean_vec_bytes = npy_bytes(mean_vec)
    rotation_bytes = npy_bytes(rotation)

    itq = ItqFunctor()
    itq.mean_vec = mean_vec
    itq.rotation = rotation
    itq.mean_vec_cache_elem = DataMemoryElement(readonly=False)
    itq.rotation_cache_elem = DataMemoryElement(readonly=False)

    itq.save_model()

    self.assertEqual(itq.mean_vec_cache_elem.get_bytes(), mean_vec_bytes)
    self.assertEqual(itq.rotation_cache_elem.get_bytes(), rotation_bytes)
def test_fit_short_descriptors_for_bit_length(self):
    # Fitting should error when input descriptors have fewer dimensions
    # than the bit length set for output hash codes (limitation of PCA
    # method currently used).
    expected_msg = \
        "Input descriptors have fewer features than requested bit encoding"

    descriptors = []
    for idx in range(3):
        elem = DescriptorMemoryElement(six.b('test'), idx)
        elem.set_vector([-1 + idx, -1 + idx])
        descriptors.append(elem)

    itq = ItqFunctor(bit_length=8)

    # Input given as a list.
    with self.assertRaisesRegex(ValueError, expected_msg):
        itq.fit(descriptors)
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)

    # Should behave the same when input is an iterable
    with self.assertRaisesRegex(ValueError, expected_msg):
        itq.fit(iter(descriptors))
    self.assertIsNone(itq.mean_vec)
    self.assertIsNone(itq.rotation)
def test_save_model_with_read_only_cache(self):
    # If one or both cache elements are read-only, no saving.
    itq = ItqFunctor()
    itq.mean_vec = numpy.array([1, 2, 3])
    itq.rotation = numpy.eye(3)

    # Each case is (mean-vec cache read-only?, rotation cache read-only?).
    readonly_cases = [
        (True, False),   # read-only mean-vec cache
        (False, True),   # read-only rotation cache
        (True, True),    # both read-only
    ]
    for mean_vec_ro, rotation_ro in readonly_cases:
        # Fresh, empty elements each round so earlier rounds cannot leak.
        itq.mean_vec_cache_elem = DataMemoryElement(readonly=mean_vec_ro)
        itq.rotation_cache_elem = DataMemoryElement(readonly=rotation_ro)

        itq.save_model()

        self.assertEqual(itq.mean_vec_cache_elem.get_bytes(), six.b(''))
        self.assertEqual(itq.rotation_cache_elem.get_bytes(), six.b(''))
def test_get_hash(self):
    # The model is set by hand below, so no descriptors need to be built or
    # fitted for this test.
    #
    # The following "rotation" matrix should cause any 2-feature descriptor
    # to the right of the line ``y = -x`` to be True, and to the left as
    # False. If on the line, should be True.
    itq = ItqFunctor(bit_length=1, random_seed=0)
    itq.mean_vec = numpy.array([0., 0.])
    itq.rotation = numpy.array([[1. / sqrt(2)],
                                [1. / sqrt(2)]])

    # Clearly on either side of the line.
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([1, 1])), [True])
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([-1, -1])), [False])

    # On the line, then nudged slightly to each side of it.
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([-1, 1])), [True])
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([-1.001, 1])), [False])
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([-1, 1.001])), [True])

    # Same checks around the other on-line point.
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([1, -1])), [True])
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([1, -1.001])), [False])
    numpy.testing.assert_array_equal(
        itq.get_hash(numpy.array([1.001, -1])), [True])
def test_fit_with_cache(self):
    # Fitting with cache elements supplied should set the model on the
    # functor and write the same values into both cache elements.
    descriptors = []
    for idx in range(5):
        elem = DescriptorMemoryElement(six.b('test'), idx)
        elem.set_vector([-2. + idx, -2. + idx])
        descriptors.append(elem)

    # TODO: Explanation as to why this is the expected result.
    expected_mean_vec = [0, 0]
    expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]

    itq = ItqFunctor(DataMemoryElement(), DataMemoryElement(),
                     bit_length=1, random_seed=0)
    itq.fit(descriptors)

    # In-memory model.
    numpy.testing.assert_array_almost_equal(itq.mean_vec, expected_mean_vec)
    numpy.testing.assert_array_almost_equal(itq.rotation, expected_rotation)

    # Cached model, read back from the element bytes with ``numpy.load``.
    self.assertIsNotNone(itq.mean_vec_cache_elem)
    cached_mean_vec = numpy.load(
        six.BytesIO(itq.mean_vec_cache_elem.get_bytes()))
    numpy.testing.assert_array_almost_equal(cached_mean_vec,
                                            expected_mean_vec)

    self.assertIsNotNone(itq.rotation_cache_elem)
    cached_rotation = numpy.load(
        six.BytesIO(itq.rotation_cache_elem.get_bytes()))
    numpy.testing.assert_array_almost_equal(cached_rotation,
                                            expected_rotation)
def _nearestNeighborIndex(sid, descriptor_set):
    """
    Retrieve the Nearest neighbor index for a given session.

    :param sid: ID of the session
    :param descriptor_set: The descriptor set corresponding to the session
        id, see _descriptorSetFromSessionId.
    :returns: Nearest neighbor index or None if no session exists
    :rtype: LSHNearestNeighborIndex|None
    """
    session = ModelImporter.model('item').findOne({'_id': ObjectId(sid)})
    if not session:
        return None

    # Only the folder id is needed for the file lookups below.
    smqtkFolder = {'_id': ObjectId(session['meta']['smqtk_folder_id'])}

    def _element_for(file_name):
        # Look up the named file in the SMQTK folder and wrap it as a data
        # element.
        file_id = localSmqtkFileIdFromName(smqtkFolder, file_name)
        return smqtkDataElementFromGirderFileId(file_id)

    functor = ItqFunctor(_element_for('mean_vec.npy'),
                         _element_for('rotation.npy'))
    hash2uuidsKV = MemoryKeyValueStore(_element_for('hash2uuids.pickle'))

    return LSHNearestNeighborIndex(functor, descriptor_set, hash2uuidsKV,
                                   read_only=True)
def test_norm_vector_n2(self):
    # With ``normalize=2`` the expected outputs below have unit Euclidean
    # length.
    itq = ItqFunctor(normalize=2)

    # (input vector, expected normalized vector)
    cases = [
        (numpy.array([1, 0]), [1, 0]),
        (numpy.array([1, 1]), [1. / sqrt(2), 1. / sqrt(2)]),
    ]
    for vec, expected in cases:
        numpy.testing.assert_array_almost_equal(itq._norm_vector(vec),
                                                expected)
def test_fit_has_model(self):
    # Calling fit when a mean vector and rotation are already set should
    # raise instead of refitting.
    itq = ItqFunctor()
    # Placeholder values; only their presence matters here.
    itq.mean_vec = 'sim vec'
    itq.rotation = 'sim rot'

    with self.assertRaisesRegex(RuntimeError,
                                "Model components have already been loaded."):
        itq.fit([])
def test_get_config_no_cache(self):
    # Constructor values should round-trip through get_config, and with no
    # cache elements given both cache entries should report a None type.
    itq = ItqFunctor(bit_length=1, itq_iterations=2, normalize=3,
                     random_seed=4)
    config = itq.get_config()

    expected_scalars = (
        ('bit_length', 1),
        ('itq_iterations', 2),
        ('normalize', 3),
        ('random_seed', 4),
    )
    for key, value in expected_scalars:
        NT.assert_equal(config[key], value)

    for cache_key in ('mean_vec_cache', 'rotation_cache'):
        NT.assert_is_none(config[cache_key]['type'])
def test_norm_vector_no_normalization(self):
    # With ``normalize=None`` the input should come back unchanged.
    itq = ItqFunctor(normalize=None)

    inputs = (
        numpy.array([0, 1]),
        numpy.array([[0, 1, 1, .4, .1]]),
        numpy.array([0] * 128),
    )
    for vec in inputs:
        numpy.testing.assert_array_equal(itq._norm_vector(vec), vec)
def test_save_model_no_caches(self):
    # Saving a model when no cache elements are configured should leave
    # both cache attributes as None.
    expected_mean_vec = numpy.array([1, 2, 3])
    expected_rotation = numpy.eye(3)

    itq = ItqFunctor()
    itq.mean_vec = expected_mean_vec
    itq.rotation = expected_rotation

    itq.save_model()

    # Check each cache attribute once; the rotation cache must be covered
    # as well as the mean-vector cache.
    self.assertIsNone(itq.mean_vec_cache_elem)
    self.assertIsNone(itq.rotation_cache_elem)
def test_configuration(self):
    # Build a fully specified index and check that every instance yielded
    # by the configuration helper carries the same component types and
    # settings.
    index = LSHNearestNeighborIndex(
        lsh_functor=ItqFunctor(),
        descriptor_set=MemoryDescriptorSet(),
        hash2uuids_kvstore=MemoryKeyValueStore(),
        hash_index=LinearHashIndex(),
        distance_method='euclidean',
        read_only=True,
    )
    for inst in configuration_test_helper(index):  # type: LSHNearestNeighborIndex
        # Component types.
        assert isinstance(inst.lsh_functor, LshFunctor)
        assert isinstance(inst.descriptor_set, MemoryDescriptorSet)
        assert isinstance(inst.hash_index, LinearHashIndex)
        assert isinstance(inst.hash2uuids_kvstore, MemoryKeyValueStore)
        # Plain settings.
        assert inst.distance_method == 'euclidean'
        assert inst.read_only is True
def test_has_model(self):
    itq = ItqFunctor()

    # with no vector/rotation set, should return false.
    self.assertFalse(itq.has_model())

    # If only one of the two is None, then false should be returned.
    partial_models = (
        ('mean vec', None),
        (None, 'rotation'),
    )
    for mean_vec, rotation in partial_models:
        itq.mean_vec = mean_vec
        itq.rotation = rotation
        self.assertFalse(itq.has_model())

    # If both are not None, return true.
    itq.mean_vec = 'mean vec'
    itq.rotation = 'rotation'
    self.assertTrue(itq.has_model())
def test_get_config_with_cache_elements(self):
    # Constructor values and both cache elements should be reflected in
    # the configuration returned by get_config.
    itq = ItqFunctor(bit_length=5, itq_iterations=6, normalize=7,
                     random_seed=8)
    itq.mean_vec_cache_elem = DataMemoryElement('cached vec bytes')
    itq.rotation_cache_elem = DataMemoryElement('cached rot bytes')

    c = itq.get_config()
    NT.assert_equal(c['bit_length'], 5)
    NT.assert_equal(c['itq_iterations'], 6)
    NT.assert_equal(c['normalize'], 7)
    NT.assert_equal(c['random_seed'], 8)

    # Both cache entries are checked symmetrically: type first, then bytes.
    NT.assert_equal(c['mean_vec_cache']['type'], "DataMemoryElement")
    NT.assert_equal(c['rotation_cache']['type'], "DataMemoryElement")
    NT.assert_equal(c['mean_vec_cache']['DataMemoryElement']['bytes'],
                    'cached vec bytes')
    NT.assert_equal(c['rotation_cache']['DataMemoryElement']['bytes'],
                    'cached rot bytes')
def test_fit(self):
    # Fitting without cache elements should set the model on the functor
    # and leave both cache attributes as None.
    descriptors = []
    for idx in range(5):
        elem = DescriptorMemoryElement('test', idx)
        elem.set_vector([-2. + idx, -2. + idx])
        descriptors.append(elem)

    itq = ItqFunctor(bit_length=1, random_seed=0)
    itq.fit(descriptors)

    # TODO: Explanation as to why this is the expected result.
    expected_mean_vec = [0, 0]
    expected_rotation = [[1 / sqrt(2)], [1 / sqrt(2)]]
    numpy.testing.assert_array_almost_equal(itq.mean_vec, expected_mean_vec)
    numpy.testing.assert_array_almost_equal(itq.rotation, expected_rotation)

    NT.assert_is_none(itq.mean_vec_cache_elem)
    NT.assert_is_none(itq.rotation_cache_elem)
def test_get_config_with_cache_elements(self):
    # Constructor values and both cache elements should be reflected in
    # the configuration returned by get_config.
    itq = ItqFunctor(bit_length=5, itq_iterations=6, normalize=7,
                     random_seed=8)
    itq.mean_vec_cache_elem = DataMemoryElement(b'cached vec bytes')
    itq.rotation_cache_elem = DataMemoryElement(b'cached rot bytes')

    c = itq.get_config()
    self.assertEqual(c['bit_length'], 5)
    self.assertEqual(c['itq_iterations'], 6)
    self.assertEqual(c['normalize'], 7)
    self.assertEqual(c['random_seed'], 8)

    # Both cache entries are checked symmetrically: type first, then bytes.
    self.assertEqual(c['mean_vec_cache']['type'], "DataMemoryElement")
    self.assertEqual(c['rotation_cache']['type'], "DataMemoryElement")
    # Check using string encodings of set bytes (JSON compliant).
    self.assertEqual(c['mean_vec_cache']['DataMemoryElement']['bytes'],
                     'cached vec bytes')
    self.assertEqual(c['rotation_cache']['DataMemoryElement']['bytes'],
                     'cached rot bytes')
def compute_hash_codes(task, folderId, **kwargs):
    """
    Celery task for computing hash codes on a given folder (descriptor
    index).

    The resulting mapping is pickled and uploaded as the contents of the
    'hash2uuids.pickle' file in the folder's '.smqtk' folder.

    :param task: Celery provided task object.
    :param folderId: The folder used to look up the descriptor index and
        the '.smqtk' folder.
    :param kwargs: Accepted but not used by this function.
    """
    task.job_manager.updateProgress(message='Computing Hash Codes',
                                    forceFlush=True)
    client = task.girder_client

    index = descriptorIndexFromFolderId(client, folderId)
    smqtkFolder = getCreateFolder(client, folderId, '.smqtk')

    meanVecFileId = smqtkFileIdFromName(client, smqtkFolder, 'mean_vec.npy')
    rotationFileId = smqtkFileIdFromName(client, smqtkFolder, 'rotation.npy')

    hash2uuidsItem = createOverwriteItem(client, smqtkFolder['_id'],
                                         'hash2uuids.pickle')
    hash2uuidsFile = initializeItemWithFile(client, hash2uuidsItem)

    def _cache_element(file_id):
        # Data element for ``file_id`` using this job's API URL and token.
        return GirderDataElement(
            file_id,
            api_root=task.request.apiUrl,
            token=task.request.jobInfoSpec['headers']['Girder-Token'])

    functor = ItqFunctor(mean_vec_cache=_cache_element(meanVecFileId),
                         rotation_cache=_cache_element(rotationFileId))

    pairs = compute_functions.compute_hash_codes(index.iterkeys(), index,
                                                 functor, use_mp=False)

    # Flip each pair so its second element becomes the key.
    # NOTE(review): pairs sharing the same second element overwrite one
    # another here -- confirm a single value per key is intended.
    flipped = {second: first for first, second in pairs}
    data = pickle.dumps(flipped)

    client.uploadFileContents(hash2uuidsFile['_id'], six.BytesIO(data),
                              len(data))
def itq(task, folderId, **kwargs):
    """
    Celery task for training ITQ on a given folder.

    This trains ITQ on all descriptors within the index. Since this is
    typically called after computing descriptors, it will often only contain
    what's in the folder.

    :param task: Celery provided task object.
    :param folderId: The folder to train ITQ for, note this is only used to
        infer the descriptor index.
    :param kwargs: Accepted but not used by this function.
    :raises Exception: If the descriptor index reports a falsy count.
    """
    task.job_manager.updateProgress(message='Training ITQ', forceFlush=True)
    client = task.girder_client

    index = descriptorIndexFromFolderId(client, folderId)
    if not index.count():
        # TODO SMQTK should account for this?
        raise Exception('Descriptor index is empty, cannot train ITQ.')

    smqtkFolder = getCreateFolder(client, folderId, '.smqtk')

    # Create (overwriting any existing) items for the two model files.
    meanVecItem = createOverwriteItem(client, smqtkFolder['_id'],
                                      'mean_vec.npy')
    meanVecFile = initializeItemWithFile(client, meanVecItem)
    rotationItem = createOverwriteItem(client, smqtkFolder['_id'],
                                       'rotation.npy')
    rotationFile = initializeItemWithFile(client, rotationItem)

    def _cache_element(file_id):
        # Data element for ``file_id`` using this job's API URL and token.
        return GirderDataElement(
            file_id,
            api_root=task.request.apiUrl,
            token=task.request.jobInfoSpec['headers']['Girder-Token'])

    functor = ItqFunctor(mean_vec_cache=_cache_element(meanVecFile['_id']),
                         rotation_cache=_cache_element(rotationFile['_id']))

    functor.fit(index.iterdescriptors(), use_multiprocessing=False)
def test_build_index_read_only(self):
    # An index constructed as read-only should refuse to build.
    read_only_index = LSHNearestNeighborIndex(ItqFunctor(),
                                              MemoryDescriptorIndex(),
                                              MemoryKeyValueStore(),
                                              read_only=True)
    with ntools.assert_raises(ReadOnlyError):
        read_only_index.build_index([])