Ejemplo n.º 1
0
    def set_bytes(self, b):
        """
        Upload the given bytes as the new content of this Girder file.

        Not all implementations may support setting bytes (writing). See the
        ``writable`` method.

        :param b: bytes to set.  The value is wrapped in a ``BytesIO`` stream
            for upload, so it must be a bytes object.
        :type b: bytes

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing, or the upload was rejected with an HTTP 401
            status.
        :raises girder_client.HttpError: The upload failed with any HTTP
            status other than 401.

        """
        if not self.writable():
            raise ReadOnlyError(
                'Unauthorized access to write to Girder file %s' %
                self.file_id)

        try:
            self.gc.uploadFileContents(self.file_id, six.BytesIO(b), len(b))
        except girder_client.HttpError as e:
            if e.status == 401:
                raise ReadOnlyError('Unauthorized access to write to Girder '
                                    'file %s' % self.file_id)
            # Bare ``raise`` re-raises the active exception with its original
            # traceback, which ``raise e`` does not guarantee on Python 2.
            raise
Ejemplo n.º 2
0
    def remove_many_descriptors(self, uuids):
        """
        Remove descriptors associated to given descriptor UUIDs from this index.

        :param uuids: Iterable of descriptor UUIDs to remove.  UUIDs are
            matched by their string form and duplicates are collapsed.  If
            this is empty, this method does nothing.
        :type uuids: collections.Iterable[collections.Hashable]

        :raises ReadOnlyError: This index is set to be read-only.
        :raises KeyError: A given UUID doesn't associate with a
            DescriptorElement in this index.

        """
        if self.read_only:
            raise ReadOnlyError("Cannot remove from a read-only index.")

        # Materialize first: the input may be a one-shot iterator and the
        # collected set is needed both for the query and for verification.
        str_uuid_set = set(str(uid) for uid in uuids)
        if not str_uuid_set:
            # Nothing to remove, so skip the query entirely.
            # NOTE(review): if DELETE_MANY_TMPL feeds ``uuid_tuple`` to an SQL
            # ``IN`` clause, an empty tuple would be a syntax error there --
            # confirm against the template.
            return

        q = self.DELETE_MANY_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )
        v = {'uuid_tuple': tuple(str_uuid_set)}

        def execute(c):
            c.execute(q, v)

            # Check query UUIDs against the rows reported back by the
            # statement; any requested UUID not among them was not indexed.
            deleted_uuid_set = set(r[0] for r in c.fetchall())
            for uid in str_uuid_set:
                if uid not in deleted_uuid_set:
                    raise KeyError(uid)

        # Wrapping in ``list`` consumes whatever ``_single_execute`` returns.
        list(self._single_execute(execute))
Ejemplo n.º 3
0
    def remove_descriptor(self, uuid):
        """
        Remove a descriptor from this index by the given UUID.

        :param uuid: UUID of the DescriptorElement to remove.  It is matched
            by its string form.
        :type uuid: collections.Hashable

        :raises ReadOnlyError: This index is set to be read-only.
        :raises KeyError: The given UUID doesn't associate to a
            DescriptorElement in this index.

        """
        if self.read_only:
            raise ReadOnlyError("Cannot remove from a read-only index.")

        q = self.DELETE_LIKE_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )
        # NOTE(review): the value is bound to a parameter named ``uuid_like``;
        # if the template matches with SQL ``LIKE``, a UUID containing ``%``
        # or ``_`` would act as a wildcard -- confirm against the template.
        v = {'uuid_like': str(uuid)}

        def execute(c):
            c.execute(q, v)
            # Nothing deleted if rowcount == 0
            # (otherwise 1 when deleted a thing)
            if c.rowcount == 0:
                raise KeyError(uuid)

        # Wrapping in ``list`` consumes whatever ``_single_execute`` returns.
        list(self._single_execute(execute))
Ejemplo n.º 4
0
    def add_descriptor(self, descriptor):
        """
        Add a descriptor to this index.

        Adding the same descriptor multiple times should not add multiple copies
        of the descriptor in the index (based on UUID). Added descriptors
        overwrite indexed descriptors based on UUID.

        :param descriptor: Descriptor to index.  It is stored pickled, keyed
            by the string form of its UUID.
        :type descriptor: smqtk.representation.DescriptorElement

        :raises ReadOnlyError: This index is set to be read-only.

        """
        if self.read_only:
            raise ReadOnlyError("Cannot add to a read-only index.")

        q = self.UPSERT_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
            element_col=self.element_col,
        )
        v = {
            'uuid_val': str(descriptor.uuid()),
            # The element is serialized with pickle and wrapped as a binary
            # value for the database driver.
            'element_val':
                psycopg2.Binary(pickle.dumps(descriptor, self.pickle_protocol)),
        }

        def exec_hook(cur):
            cur.execute(q, v)

        # Wrapping in ``list`` consumes whatever ``_single_execute`` returns.
        list(self._single_execute(exec_hook))
Ejemplo n.º 5
0
    def build_index(self, descriptors):
        """
        Build a new FAISS index over the given descriptor elements.

        The super-class ``build_index`` is invoked first, then the backing
        descriptor set is cleared and refilled from ``descriptors`` before the
        FAISS model is built.

        :raises ReadOnlyError: This index is set to be read-only.
        :raises ValueError: No data available in the given iterable (not
            raised explicitly in this method; presumably by the super-method
            -- confirm).

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due to "
                                "being in read-only mode.")

        # NOTE(review): ``descriptors`` is handed to both the super-method and
        # ``add_many_descriptors`` below; if the super-method iterates it, a
        # one-shot iterator would arrive below already exhausted -- confirm.
        super(FaissNearestNeighborsIndex, self).build_index(descriptors)

        log = self._log
        log.info("Building new FAISS index")

        log.debug("Clearing and adding new descriptor elements")
        descriptor_set = self._descriptor_set
        descriptor_set.clear()
        descriptor_set.add_many_descriptors(descriptors)

        log.debug('Building FAISS index')
        self._build_faiss_model()
Ejemplo n.º 6
0
    def _build_index(self, descriptors):
        """
        Rebuild the MRPT index from scratch over the given descriptors.

        While holding the model lock: the backing descriptor set is cleared
        and refilled from ``descriptors``, the trees are rebuilt and the model
        is saved.

        :raises ReadOnlyError: This index is set to be read-only.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self._read_only:
                raise ReadOnlyError(
                    "Cannot modify container attributes due to "
                    "being in read-only mode.")

            log = self._log
            log.info("Building new MRPT index")

            log.debug("Clearing and adding new descriptor elements")
            # NOTE: For some DescriptorIndex implementations this clear may
            # interfere with iteration when part of the input iterator of
            # descriptors is this index's previous descriptor-set, as is the
            # case with ``update_index``.
            descriptor_set = self._descriptor_set
            descriptor_set.clear()
            descriptor_set.add_many_descriptors(descriptors)

            log.debug('Building MRPT index')
            self._build_multiple_trees()

            # Save the freshly built model.
            self._save_mrpt_model()
Ejemplo n.º 7
0
    def _remove_from_index(self, uids):
        """
        Remove the descriptors with the given UIDs from this FAISS index.

        All UIDs are validated against the uid-to-index mapping before any
        structure is modified.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises ReadOnlyError: This index is set to be read-only.
        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify read-only index.")

        with self._model_lock:
            uid2idx = self._uid2idx_kvs

            # Collect the UIDs while checking each one is known; the first
            # unknown UID aborts before anything has been removed.
            known_uids = collections.deque()
            for uid in uids:
                if uid in uid2idx:
                    known_uids.append(uid)
                    continue
                raise KeyError(uid)

            # faiss remove_ids requires a np.ndarray of int64 type.
            faiss_ids = np.asarray([uid2idx[u] for u in known_uids],
                                   dtype=np.int64)

            # Drop the entries from every structure, then save the model.
            self._faiss_index.remove_ids(faiss_ids)
            self._descriptor_set.remove_many_descriptors(known_uids)
            uid2idx.remove_many(known_uids)
            self._idx2uid_kvs.remove_many(faiss_ids)
            self._save_faiss_model()
Ejemplo n.º 8
0
    def add_many(self, d):
        """
        Add multiple key-value pairs at a time into this store as represented in
        the provided dictionary `d`.

        :param d: Dictionary of key-value pairs to add to this store.
        :type d: dict[collections.Hashable, object]

        :raises ReadOnlyError: If this instance is marked as read-only.

        :return: Self.
        :rtype: KeyValueStore

        """
        # Custom override to take advantage of PSQL batching.
        if self.is_read_only():
            raise ReadOnlyError("Cannot add to read-only instance %s." % self)

        q = self.SqlTemplates.UPSERT_TMPL.format(
            table_name=self._table_name,
            key_col=self._key_col,
            value_col=self._value_col,
        )

        # Iterator over transformed inputs into values for statement.
        def val_iter():
            for key, val in six.iteritems(d):
                yield {
                    'key': self._py_to_bin(key),
                    'val': self._py_to_bin(val)
                }

        def cb(cur, v_batch):
            cur.executemany(q, v_batch)

        # Wrapping in ``list`` consumes whatever ``batch_execute`` returns.
        list(self._psql_helper.batch_execute(val_iter(), cb, self._batch_size))
        # Honour the documented contract so calls can be chained.
        return self
Ejemplo n.º 9
0
    def _remove_from_index(self, uids):
        """
        Remove descriptors from this index associated with the given UIDs.

        For every UID the hash of its descriptor is computed, the UID is taken
        out of the hash's UID set in the hash-to-UIDs store, and hashes left
        without any UID are also removed from the hash index when one is set.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.  The index should not be modified.
        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            uid_list = list(uids)

            # Compute the hash (as vector and as integer) of every input
            # UID's descriptor before touching any stored state.
            # - `get_many_descriptors` fails when bad UIDs are provided
            #   (KeyError).
            self._log.debug("Removing hash2uid entries for UID's descriptors")
            hashed = collections.deque()
            for descr in self.descriptor_index.get_many_descriptors(uid_list):
                vec = self.lsh_functor.get_hash(descr.vector())
                hashed.append((bit_vector_to_int_large(vec), vec))

            # If we're here, then all given UIDs mapped to an indexed
            # descriptor.  Proceed with removal from the hash2uids kvs.
            kvstore = self.hash2uuids_kvstore
            emptied_hashes = collections.deque()
            for uid, (h_int, h_vec) in zip(uid_list, hashed):
                # noinspection PyUnresolvedReferences
                remaining = kvstore.get(h_int) - {uid}
                # If the hash no longer maps to any UID, remove the hash
                # entry and remember it for the hash index; otherwise store
                # the reduced UID set back.
                if not remaining:
                    emptied_hashes.append(h_vec)
                    kvstore.remove(h_int)
                else:
                    kvstore.add(h_int, remaining)

            # call remove-from-index on hash-index if we have one and there are
            # hashes to be removed.
            if self.hash_index and emptied_hashes:
                self.hash_index.remove_from_index(emptied_hashes)

            # Remove descriptors from our set matching the given UIDs.
            self.descriptor_index.remove_many_descriptors(uid_list)
Ejemplo n.º 10
0
    def _update_index(self, descriptors):
        """
        Additively update this index with the given descriptor elements.

        The descriptors are added to the descriptor index, their hash codes
        are merged into the hash-to-UIDs store, and the hash index (if one is
        set) is updated with the new hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")
            # The input is consumed twice (indexing, then hashing), so split
            # it into two independent iterators.
            index_iter, hashing_iter = itertools.tee(descriptors, 2)

            self._log.debug("Updating descriptor index.")
            self.descriptor_index.add_many_descriptors(index_iter)

            self._log.debug("Generating hash codes for new descriptors")
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            #: :type: collections.deque[numpy.ndarray[bool]]
            new_hash_vectors = collections.deque()  # for updating hash_index
            # Pending hash-int -> UID-set entries, written in one batch after
            # all hash codes have been collected.
            pending = {}
            for descr in hashing_iter:
                vec = self.lsh_functor.get_hash(descr.vector())
                new_hash_vectors.append(vec)
                key = bit_vector_to_int_large(vec)
                # Seed from the stored UID set the first time a hash is seen.
                if key not in pending:
                    #: :type: set
                    pending[key] = self.hash2uuids_kvstore.get(key, set())
                pending[key] |= {descr.uuid()}
                reporter.increment_report()
            reporter.report()

            self._log.debug("Updating kv-store with new hash codes")
            self.hash2uuids_kvstore.add_many(pending)
            del pending

            if self.hash_index is not None:
                self._log.debug("Updating hash index structure.")
                self.hash_index.update_index(new_hash_vectors)
Ejemplo n.º 11
0
    def matrix(self, m):
        """
        Set the contained matrix.

        :param numpy.ndarray m:
            New matrix value; it is passed through ``numpy.asarray`` before
            being stored.

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing.
        """
        if self.writable():
            self._matrix = numpy.asarray(m)
            return
        raise ReadOnlyError("This %s element is read only." % self)
Ejemplo n.º 12
0
    def _build_index(self, descriptors):
        """
        Rebuild this index from scratch over the given descriptor elements.

        The descriptor index and the hash-to-UIDs store are cleared and
        refilled, and the hash index (if one is set) is rebuilt from the new
        hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError(
                    "Cannot modify container attributes due to "
                    "being in read-only mode.")

            self._log.debug("Clearing and adding new descriptor elements")
            self.descriptor_index.clear()
            self.descriptor_index.add_many_descriptors(descriptors)

            self._log.debug("Generating hash codes")
            #: :type: collections.deque[numpy.ndarray[bool]]
            hash_vectors = collections.deque()
            self.hash2uuids_kvstore.clear()
            prog_reporter = ProgressReporter(self._log.debug, 1.0).start()
            # The store was just cleared, so every entry it will hold comes
            # from this loop.  Aggregate the hash -> UID-set mapping in memory
            # and write it with a single ``add_many`` instead of one get/add
            # round trip per descriptor.
            kvstore_update = collections.defaultdict(set)
            for d in self.descriptor_index:
                h_vec = self.lsh_functor.get_hash(d.vector())
                hash_vectors.append(h_vec)

                h_int = bit_vector_to_int_large(h_vec)
                kvstore_update[h_int].add(d.uuid())

                prog_reporter.increment_report()
            prog_reporter.report()

            self._log.debug("Updating kv-store with new hash codes")
            self.hash2uuids_kvstore.add_many(kvstore_update)
            del kvstore_update

            if self.hash_index is not None:
                self._log.debug("Clearing and building hash index of type %s",
                                type(self.hash_index))
                # a build is supposed to clear previous state.
                self.hash_index.build_index(hash_vectors)
Ejemplo n.º 13
0
    def build_index(self, descriptors):
        """
        Build the index over the descriptor data elements. This in turn builds
        the configured hash index if one is set.

        Rebuilding clears the descriptor index and the hash-to-UIDs key-value
        store before refilling them, and rebuilds the hash index (if set)
        from the newly generated hash vectors.

        :raises ReadOnlyError: This index is set to be read-only.
        :raises ValueError: No data available in the given iterable (per the
            interface contract; not raised explicitly in this method).

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify container attributes due to "
                                "being in read-only mode.")

        self._log.debug("Clearing and adding new descriptor elements")
        self.descriptor_index.clear()
        self.descriptor_index.add_many_descriptors(descriptors)

        self._log.debug("Generating hash codes")
        # State list handed to ``report_progress`` on every call.
        progress_state = [0] * 7
        hash_vecs = collections.deque()
        self.hash2uuids_kvstore.clear()
        for descr in self.descriptor_index:
            vec = self.lsh_functor.get_hash(descr.vector())
            hash_vecs.append(vec)

            key = bit_vector_to_int_large(vec)

            # Get, update and reinsert hash UUID set object
            #: :type: set
            uid_set = self.hash2uuids_kvstore.get(key, set())
            uid_set.add(descr.uuid())
            self.hash2uuids_kvstore.add(key, uid_set)

            report_progress(self._log.debug, progress_state, 1.0)
        # Final report after the loop, with slot 1 stepped back by one and a
        # zero interval.
        progress_state[1] -= 1
        report_progress(self._log.debug, progress_state, 0)

        if self.hash_index is not None:
            self._log.debug("Clearing and building hash index of type %s",
                            type(self.hash_index))
            # a build is supposed to clear previous state.
            self.hash_index.build_index(hash_vecs)
Ejemplo n.º 14
0
    def _build_index(self, descriptors):
        """
        Rebuild this index from scratch over the given descriptor elements.

        The descriptor set and the hash-to-UIDs store are cleared and
        refilled, and the hash index (if one is set) is rebuilt from the new
        hash vectors.

        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        :param descriptors: Iterable of descriptor elements to build index
            over.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            self._log.debug("Clearing and adding new descriptor elements")
            self.descriptor_set.clear()
            self.descriptor_set.add_many_descriptors(descriptors)

            self._log.debug("Generating hash codes")
            new_hash_vectors: Deque[numpy.ndarray] = collections.deque()
            self.hash2uuids_kvstore.clear()
            reporter = ProgressReporter(self._log.debug, 1.0).start()
            # The store was cleared just above, so the full mapping is
            # collected here and written with one ``add_many`` after the loop.
            uids_by_hash: Dict[
                int, Set[Hashable]
            ] = collections.defaultdict(set)
            for descr in self.descriptor_set:
                vec = self.lsh_functor.get_hash(descr.vector())
                new_hash_vectors.append(vec)
                uids_by_hash[bit_vector_to_int_large(vec)].add(descr.uuid())
                reporter.increment_report()
            reporter.report()
            self.hash2uuids_kvstore.add_many(uids_by_hash)
            del uids_by_hash

            hash_index = self.hash_index
            if hash_index is not None:
                self._log.debug("Clearing and building hash index of type %s",
                                type(hash_index))
                # a build is supposed to clear previous state.
                hash_index.build_index(new_hash_vectors)
Ejemplo n.º 15
0
    def cache(self):
        """
        Write the pickled element map to the cache element, if one is set.

        Does nothing when no cache element is configured.

        :raises ReadOnlyError: The configured cache element is read-only.
        """
        element = self.cache_element
        if not element:
            return

        if element.is_read_only():
            raise ReadOnlyError("Cache element (%s) is read-only." % element)

        # Hold the element-map lock while serializing; the timer reports
        # through the debug logger.
        with self._element_map_lock, \
                SimpleTimer("Caching memory data-set table", self._log.debug):
            payload = pickle.dumps(self._element_map, self.pickle_protocol)
            element.set_bytes(payload)
Ejemplo n.º 16
0
    def clear(self):
        """
        Clear this key-value store.

        *NOTE:* **Implementing sub-classes should call this super-method. This
        super method should not be considered a critical section for thread
        safety.**

        This super-method only performs the read-only check.

        :raises ReadOnlyError: If this instance is marked as read-only.

        """
        read_only = self.is_read_only()
        if read_only:
            message = ("Cannot clear a read-only %s instance." %
                       self.__class__.__name__)
            raise ReadOnlyError(message)
Ejemplo n.º 17
0
    def set_bytes(self, b):
        """
        Reject an attempt to write bytes to this element.

        This implementation never writes: it raises unconditionally.

        :param b: bytes to set (unused).
        :type b: str

        :raises ReadOnlyError: Always.

        """
        message = "HBase elements cannot write data."
        raise ReadOnlyError(message)
Ejemplo n.º 18
0
    def clear(self):
        """
        Clear this descriptor index's entries.

        Runs the delete template with the ``uuid_like`` parameter set to
        ``'%'``.

        :raises ReadOnlyError: This index is set to be read-only.
        """
        if self.read_only:
            raise ReadOnlyError("Cannot clear a read-only index.")

        delete_sql = self.DELETE_LIKE_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )

        def run_delete(cursor):
            # NOTE(review): '%' presumably matches every UUID under SQL LIKE
            # -- confirm against the template.
            params = {'uuid_like': '%'}
            cursor.execute(delete_sql, params)

        # Wrapping in ``list`` consumes whatever ``_single_execute`` returns.
        list(self._single_execute(run_delete))
Ejemplo n.º 19
0
    def set_bytes(self, b):
        """
        Replace the bytes held by this in-memory element.

        Only the byte content is replaced; the content type is not touched.

        :param b: bytes to set.
        :type b: str

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing.

        """
        if self._readonly:
            raise ReadOnlyError("This memory element cannot be written to.")
        self._bytes = b
Ejemplo n.º 20
0
    def add_many(self, d):
        """
        Add multiple key-value pairs at a time into this store as represented
        in the provided dictionary `d`.

        This method only performs the read-only check.

        :param d: Dictionary of key-value pairs to add to this store.
        :type d: dict[collections.Hashable, object]

        :raises ReadOnlyError: If this instance is marked as read-only.

        :return: Self.
        :rtype: KeyValueStore

        """
        # Input keys must already be hashable because they're in a dictionary.
        read_only = self.is_read_only()
        if read_only:
            message = "Cannot add to read-only instance %s." % self
            raise ReadOnlyError(message)
Ejemplo n.º 21
0
    def set_bytes(self, b):
        """
        Write the given bytes to this element's file path.

        Not all implementations may support setting bytes (writing). See the
        ``writable`` method.

        :param b: bytes to set.
        :type b: str

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing.

        """
        if self._readonly:
            raise ReadOnlyError("This file element is read only.")
        safe_file_write(self._filepath, b)
Ejemplo n.º 22
0
    def _remove_from_index(self, uids):
        """
        Remove the descriptors with the given UIDs and rebuild the index.

        The UIDs are removed from the backing descriptor set, after which the
        index is rebuilt over what remains in that set.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises ReadOnlyError: This index is set to be read-only.
        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.

        """
        with self._model_lock:
            if self._read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")
            remaining = self._descriptor_set
            remaining.remove_many_descriptors(uids)
            # Rebuild over what is left in the descriptor set.
            self.build_index(remaining)
Ejemplo n.º 23
0
    def remove(self, key):
        """
        Remove a single key-value entry.

        This method only performs the read-only check.

        :param key: Key to remove.
        :type key: collections.Hashable

        :raises ReadOnlyError: If this instance is marked as read-only.
        :raises KeyError: The given key is not present in this store and no
            default value given.

        :return: Self.
        :rtype: KeyValueStore

        """
        read_only = self.is_read_only()
        if read_only:
            message = "Cannot remove from read-only instance %s." % self
            raise ReadOnlyError(message)
Ejemplo n.º 24
0
    def set_bytes(self, b):
        """
        Set bytes to this data element.

        Not all implementations may support setting bytes (check ``writable``
        method return).

        This base abstract method should be called by sub-class implementations
        first. It only checks mutability via the ``writable()`` method return.

        :param b: bytes to set.
        :type b: bytes

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing.

        """
        can_write = self.writable()
        if not can_write:
            message = "This %s element is read only." % self
            raise ReadOnlyError(message)
Ejemplo n.º 25
0
    def remove_many(self, keys):
        """
        Remove multiple keys and associated values.

        This method only performs the read-only check.

        :param keys: Iterable of keys to remove.  If this is empty this method
            does nothing.
        :type keys: collections.Iterable[collections.Hashable]

        :raises ReadOnlyError: If this instance is marked as read-only.
        :raises KeyError: The given key is not present in this store and no
            default value given.  The store is not modified if any key is
            invalid.

        :return: Self.
        :rtype: KeyValueStore

        """
        read_only = self.is_read_only()
        if read_only:
            message = "Cannot remove from read-only instance %s." % self
            raise ReadOnlyError(message)
Ejemplo n.º 26
0
    def add_many_descriptors(self, descriptors):
        """
        Add multiple descriptors at one time.

        Adding the same descriptor multiple times should not add multiple copies
        of the descriptor in the set (based on UUID). Added descriptors
        overwrite set descriptors based on UUID.

        :param descriptors: Iterable of descriptor instances to add to this
            set.  Each is stored pickled, keyed by the string form of its
            UUID.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        :raises ReadOnlyError: This set is marked as read-only.

        """
        if self.read_only:
            raise ReadOnlyError("Cannot add to a read-only set.")

        q = self.UPSERT_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
            element_col=self.element_col,
        )

        # Lazily transform input descriptors into the parameter dictionaries
        # the statement is executed with.
        def iter_elements():
            for d in descriptors:
                yield {
                    'uuid_val':
                    str(d.uuid()),
                    'element_val':
                    psycopg2.Binary(pickle.dumps(d, self.pickle_protocol))
                }

        def exec_hook(cur, batch):
            cur.executemany(q, batch)

        self._log.debug("Adding many descriptors")
        # Wrapping in ``list`` consumes whatever ``batch_execute`` returns.
        list(
            self.psql_helper.batch_execute(iter_elements(), exec_hook,
                                           self.multiquery_batch_size))
Ejemplo n.º 27
0
    def add(self, key, value):
        """
        Add a key-value pair to this store.

        *NOTE:* **Implementing sub-classes should call this super-method. This
        super method should not be considered a critical section for thread
        safety unless ``is_read_only`` is not thread-safe.**

        This super-method only performs the read-only check.

        :param key: Key for the value. Must be hashable.
        :type key: collections.Hashable

        :param value: Python object to store.
        :type value: object

        :raises ReadOnlyError: If this instance is marked as read-only.

        :return: Self.
        :rtype: KeyValueStore

        """
        read_only = self.is_read_only()
        if read_only:
            message = "Cannot add to read-only instance %s." % self
            raise ReadOnlyError(message)
Ejemplo n.º 28
0
    def _update_index(self, descriptors):
        """
        Update this index with the given descriptor elements by rebuilding.

        *NOTE:* This implementation fully rebuilds the index using the current
        index contents chained with the provided new descriptor elements.

        :raises ReadOnlyError: This index is set to be read-only.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.Iterable[smqtk.representation.DescriptorElement]

        """
        with self._model_lock:
            if self._read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")
            self._log.debug("Updating index by rebuilding with union. ")
            # Current contents first, then the new elements; the chain is
            # lazy, so the descriptor set is read while the rebuild consumes
            # it.
            merged = chain(self._descriptor_set, descriptors)
            self.build_index(merged)
Ejemplo n.º 29
0
    def _update_index(self, descriptors):
        """
        Internal method to be implemented by sub-classes to additively update
        the current index with the one or more descriptor elements given.

        If no index exists yet, a new one should be created using the given
        descriptors.

        If any descriptors have already been added, they will be not be
        re-inserted, but a warning will be raised.

        :param descriptors: Iterable of descriptor elements to add to this
            index.
        :type descriptors:
            collections.abc.Iterable[smqtk.representation.DescriptorElement]

        """
        if self.read_only:
            raise ReadOnlyError("Cannot modify read-only index.")

        if self._faiss_index is None:
            self._build_index(descriptors)
            return

        self._log.debug('Updating FAISS index')

        with self._model_lock:
            # Remove any uids which have already been indexed. This gracefully
            # handles the unusual case that the underlying FAISS index and the
            # SMQTK descriptor set have fallen out of sync due to an unexpected
            # external failure.
            desc_list = []
            for descriptor_ in descriptors:
                if descriptor_.uuid() in self._uid2idx_kvs:
                    warnings.warn(
                        "Descriptor with UID {} already present in this"
                        " index".format(descriptor_.uuid()))
                else:
                    desc_list.append(descriptor_)
            if not desc_list:
                self._log.info("No new descriptors provided not already "
                               "present in this index. No update necessary.")
                return
            data, new_uuids = self._descriptors_to_matrix(desc_list)

            n, d = data.shape

            old_ntotal = self.count()

            next_next_index = self._next_index + n
            new_ids = np.arange(self._next_index, next_next_index)
            self._next_index = next_next_index

            assert self._faiss_index.d == d, \
                "FAISS index dimension doesn't match data dimension"
            # noinspection PyArgumentList
            self._faiss_index.add_with_ids(data, new_ids)
            assert self._faiss_index.ntotal == old_ntotal + n, \
                "New FAISS index size doesn't match old + data size"
            self._log.info(
                "FAISS index has been updated with %d"
                " new vectors", n)

            self._log.debug("Adding new descriptor elements")
            self._descriptor_set.add_many_descriptors(desc_list)
            assert len(self._descriptor_set) == old_ntotal + n, \
                "New descriptor set size doesn't match old + data size"

            new_ids = new_ids.astype(object)

            self._uid2idx_kvs.add_many(dict(zip(new_uuids, new_ids)))
            assert len(self._uid2idx_kvs) == old_ntotal + n, \
                "New uid2idx kvs size doesn't match old + new data size."

            self._idx2uid_kvs.add_many(dict(zip(new_ids, new_uuids)))
            assert len(self._idx2uid_kvs) == old_ntotal + n, \
                "New idx2uid kvs size doesn't match old + new data size."

            self._save_faiss_model()
Ejemplo n.º 30
0
    def _remove_from_index(self, uids):
        """
        Remove descriptors from this index associated with the given UIDs.

        The whole operation runs while holding ``self._model_lock``. Each
        UID's descriptor is looked up and hashed first; only afterwards are
        the hash-to-UIDs key-value store, the optional hash index and the
        descriptor index updated, in that order.

        :param uids: Iterable of UIDs of descriptors to remove from this index.
        :type uids: collections.Iterable[collections.Hashable]

        :raises KeyError: One or more UIDs provided do not match any stored
            descriptors.  The index should not be modified.
        :raises ReadOnlyError: This index is set to be read-only and cannot be
            modified.

        """
        with self._model_lock:
            if self.read_only:
                raise ReadOnlyError("Cannot modify container attributes due "
                                    "to being in read-only mode.")

            # Materialize the input: it is traversed several times below and
            # may be a one-shot iterator.
            uids = list(uids)

            # Remove UIDs from our hash2uid-kvs
            # - get the hash for each input UID's descriptor, remove UID from
            #   recorded association set.
            # - `get_many_descriptors` fails when bad UIDs are provided
            #   (KeyError).
            self._log.debug("Removing hash2uid entries for UID's descriptors")
            # Hash vectors and their integer forms, collected in step with
            # the descriptors yielded for `uids`.
            # NOTE(review): the `zip` below assumes descriptors are yielded
            # in the same order as `uids` -- confirm against the descriptor
            # index implementation.
            h_vectors = collections.deque()
            h_ints = collections.deque()
            for d in self.descriptor_index.get_many_descriptors(uids):
                h_vec = self.lsh_functor.get_hash(d.vector())
                h_vectors.append(h_vec)
                h_int = bit_vector_to_int_large(h_vec)
                h_ints.append(h_int)

            # If we're here, then all given UIDs mapped to an indexed
            # descriptor.  Proceed with removal from hash2uids kvs.  If a hash
            # no longer maps anything, remove that key from the KVS.
            hashes_for_removal = collections.deque()
            # store key-value pairs to update after loop in batch call
            kvs_update = {}
            # store keys to remove after loop in batch-call
            kvs_remove = set()
            for uid, h_int, h_vec in zip(uids, h_ints, h_vectors):
                if h_int not in kvs_update:
                    # First time seeing key, cache current value
                    kvs_update[h_int] = \
                        self.hash2uuids_kvstore.get(h_int, set())
                # In-place set difference on the cached value.
                # NOTE(review): if the key-value store's `get` returns its
                # stored set object rather than a copy, this also mutates the
                # stored value directly -- confirm against the store
                # implementation.
                kvs_update[h_int] -= {uid}
                # If the resulting UID set is empty, flag the key for removal.
                if not kvs_update[h_int]:
                    del kvs_update[h_int]
                    kvs_remove.add(h_int)
                    hashes_for_removal.append(h_vec)
            self._log.debug("Updating hash2uuids: modified relations")
            self.hash2uuids_kvstore.add_many(kvs_update)
            self._log.debug("Updating hash2uuids: removing empty hash keys")
            self.hash2uuids_kvstore.remove_many(kvs_remove)
            # Drop the batch containers now that they have been applied.
            del kvs_update, kvs_remove

            # call remove-from-index on hash-index if we have one and there are
            # hashes to be removed.
            if self.hash_index and hashes_for_removal:
                self.hash_index.remove_from_index(hashes_for_removal)

            # Remove descriptors from our set matching the given UIDs.
            self.descriptor_index.remove_many_descriptors(uids)