Exemple #1
0
    def _create_psql_helper(
        cls,
        db_name: str,
        db_host: Optional[str],
        db_port: Optional[int],
        db_user: Optional[str],
        db_pass: Optional[str],
        table_name: str,
        uuid_col: str,
        type_col: str,
        binary_col: str,
        itersize: int = 1000,
        create_table: bool = True
    ) -> PsqlConnectionHelper:
        """
        Build the PSQL connection helper used by instances of this class.

        :param db_name: Name of the database to connect to.
        :param db_host: Host address of the Postgres server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :param db_port: Port the Postgres server is exposed on. If None, we
            assume the default port (5432).
        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :param db_pass: Password for the user we're connecting as. May be
            None if no password is to be used.
        :param table_name: String label of the database table to use.
        :param uuid_col: Column label for descriptor UUID storage.
        :param type_col: Column label for descriptor type string storage.
        :param binary_col: Column label for descriptor vector binary storage.
        :param itersize: Number of records fetched per network round trip when
            iterating over a named cursor. Only has an effect when a named
            cursor is used.
        :param create_table: Whether to register the table-creation SQL on the
            helper before returning it. If the configured user does not have
            sufficient permissions to create the table and it does not
            currently exist, an exception will be raised.

        :return: PsqlConnectionHelper utility.
        """
        connection_helper = PsqlConnectionHelper(
            db_name, db_host, db_port, db_user, db_pass,
            itersize=itersize,
            table_upsert_lock=PSQL_TABLE_CREATE_RLOCK,
        )

        # Without table creation there is nothing more to configure.
        if not create_table:
            return connection_helper

        create_table_sql = cls.UPSERT_TABLE_TMPL.format(
            table_name=table_name,
            uuid_col=uuid_col,
            type_col=type_col,
            binary_col=binary_col,
        )
        connection_helper.set_table_upsert_sql(create_table_sql)
        return connection_helper
Exemple #2
0
class TestPsqlConnectionHelper (unittest.TestCase):
    """
    Unit tests for ``PsqlConnectionHelper``.
    """

    def setUp(self) -> None:
        # A fresh helper, built with its default arguments, for every test.
        self.conn_helper = PsqlConnectionHelper()

    def test_batch_execute_on_empty_iterable(self) -> None:
        """
        Draining ``batch_execute`` over an empty iterable must never invoke
        the execution hook.
        """
        # noinspection PyUnusedLocal
        def failing_hook(cur: psycopg2._psycopg.cursor,
                         batch: Iterable) -> None:
            raise Exception("This line shouldn't be reached with an empty "
                            "iterable.")

        no_elements = iter(())
        results = self.conn_helper.batch_execute(no_elements, failing_hook, 1)
        # Exhaust the returned iterable so that any deferred work really runs.
        for _ in results:
            pass
class PostgresDescriptorSet(DescriptorSet):
    """
    DescriptorSet implementation that stores DescriptorElement references in
    a PostgreSQL database.

    A ``PostgresDescriptorSet`` effectively controls the entire table. Thus
    a ``clear()`` call will remove everything from the table.

    PostgreSQL version support:
        - 9.4

    Table format:
        <uuid col>      TEXT NOT NULL
        <element col>   BYTEA NOT NULL

        <uuid col> should be the primary key (we assume unique).

    We require that no column label be 'true', because the literal ``true``
    is substituted for the selected column as a value-return shortcut when
    only the existence of a row matters.

    """

    #
    # The following are SQL query templates. The string formatting using {}'s
    # is used to fill in the query before using it in an execute with instance
    # specific values. The ``%()s`` formatting is special for the execute
    # where-by psycopg2 will fill in the values appropriately as specified in a
    # second dictionary argument to ``cursor.execute(query, value_dict)``.
    #

    # Creates the backing table (UUID primary key + serialized element) when
    # it does not exist yet.
    UPSERT_TABLE_TMPL = norm_psql_cmd_string("""
        CREATE TABLE IF NOT EXISTS {table_name:s} (
          {uuid_col:s} TEXT NOT NULL,
          {element_col:s} BYTEA NOT NULL,
          PRIMARY KEY ({uuid_col:s})
        );
    """)

    # Unfiltered selection of an arbitrary column expression.
    SELECT_TMPL = norm_psql_cmd_string("""
        SELECT {col:s}
          FROM {table_name:s}
    """)

    # Selection filtered by a LIKE pattern on the UUID column.
    SELECT_LIKE_TMPL = norm_psql_cmd_string("""
        SELECT {element_col:s}
          FROM {table_name:s}
         WHERE {uuid_col:s} like %(uuid_like)s
    """)

    # So we can ensure we get back elements in specified order
    #   - reference [1]
    SELECT_MANY_ORDERED_TMPL = norm_psql_cmd_string("""
        SELECT {table_name:s}.{element_col:s}
          FROM {table_name:s}
          JOIN (
            SELECT *
            FROM unnest(%(uuid_list)s) with ordinality
          ) AS __ordering__ ({uuid_col:s}, {uuid_col:s}_order)
            ON {table_name:s}.{uuid_col:s} = __ordering__.{uuid_col:s}
          ORDER BY __ordering__.{uuid_col:s}_order
    """)

    # Update-then-insert emulation of an upsert: the INSERT only fires when
    # the UPDATE in the CTE touched no row.
    UPSERT_TMPL = norm_psql_cmd_string("""
        WITH upsert AS (
          UPDATE {table_name:s}
            SET {element_col:s} = %(element_val)s
            WHERE {uuid_col:s} = %(uuid_val)s
            RETURNING *
          )
        INSERT INTO {table_name:s}
          ({uuid_col:s}, {element_col:s})
          SELECT %(uuid_val)s, %(element_val)s
            WHERE NOT EXISTS (SELECT * FROM upsert)
    """)

    # Deletion filtered by a LIKE pattern on the UUID column.
    DELETE_LIKE_TMPL = norm_psql_cmd_string("""
        DELETE FROM {table_name:s}
              WHERE {uuid_col:s} like %(uuid_like)s
    """)

    # Deletes every row whose UUID is in the given tuple and returns the UUIDs
    # that were actually deleted. The RETURNING clause must name the
    # configured UUID column (not a hard-coded ``uid``) so the query remains
    # valid for tables that use a custom UUID column label.
    DELETE_MANY_TMPL = norm_psql_cmd_string("""
        DELETE FROM {table_name:s}
              WHERE {uuid_col:s} in %(uuid_tuple)s
          RETURNING {uuid_col:s}
    """)

    @classmethod
    def is_usable(cls) -> bool:
        """
        Report whether this implementation can be used in the current
        environment.

        :return: True when the module-level ``psycopg2`` reference is not
            None, False otherwise.
        """
        if psycopg2 is None:
            return False
        return True

    def __init__(self,
                 table_name: str = 'descriptor_set',
                 uuid_col: str = 'uid',
                 element_col: str = 'element',
                 db_name: str = 'postgres',
                 db_host: Optional[str] = None,
                 db_port: Optional[int] = None,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 multiquery_batch_size: Optional[int] = 1000,
                 pickle_protocol: int = -1,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Initialize set instance.

        :param table_name: Name of the table to use.
        :param uuid_col: Name of the column containing the UUID signatures.
        :param element_col: Name of the table column that will contain
            serialized elements.
        :param db_name: The name of the database to connect to.
        :param db_host: Host address of the Postgres server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :param db_port: Port the Postgres server is exposed on. If None, we
            assume the default port (5432).
        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :param db_pass: Password for the user we're connecting as. This may be
            None if no password is to be used.
        :param multiquery_batch_size: For queries that handle sending or
            receiving many queries at a time, batch queries based on this size.
            If this is None, then no batching occurs.

            The advantage of batching is that it reduces the memory impact for
            queries dealing with a very large number of elements (don't have to
            store the full query for all elements in RAM), but the transaction
            will be some amount slower due to splitting the query into multiple
            transactions.
        :param pickle_protocol: Pickling protocol to use. We will use -1 by
            default (latest version, probably binary). Must lie within
            ``[-1, pickle.HIGHEST_PROTOCOL]``.
        :param read_only: Only allow read actions against this set.
            Modification actions will throw a ReadOnlyError exceptions.
        :param create_table: If this instance should try to create the storing
            table before actions are performed against it when not set to be
            read-only. If the configured user does not have sufficient
            permissions to create the table and it does not currently exist, an
            exception will be raised.

        :raises AssertionError: The batch size is not positive or the pickle
            protocol is outside of the supported range.
        """
        super(PostgresDescriptorSet, self).__init__()

        self.table_name = table_name
        self.uuid_col = uuid_col
        self.element_col = element_col

        self.multiquery_batch_size = multiquery_batch_size
        self.pickle_protocol = pickle_protocol
        self.read_only = bool(read_only)
        self.create_table = create_table

        # Checking parameters where necessary
        if self.multiquery_batch_size is not None:
            self.multiquery_batch_size = int(self.multiquery_batch_size)
            assert self.multiquery_batch_size > 0, (
                "A given batch size must be greater than 0 in size "
                "(given: %d)." % self.multiquery_batch_size
            )
        # The upper bound follows the running interpreter instead of being
        # pinned to protocol 2, so that every protocol ``pickle.dumps``
        # accepts can be configured explicitly (-1 already selects the
        # highest one).
        assert -1 <= self.pickle_protocol <= pickle.HIGHEST_PROTOCOL, \
            ("Given pickle protocol is not in the known valid range. Given: %s"
             % self.pickle_protocol)

        # The batch size doubles as the helper's iteration size.
        self.psql_helper = PsqlConnectionHelper(db_name, db_host, db_port,
                                                db_user, db_pass,
                                                self.multiquery_batch_size,
                                                PSQL_TABLE_CREATE_RLOCK)
        # A read-only instance never attempts to create the table.
        if not self.read_only and self.create_table:
            self.psql_helper.set_table_upsert_sql(
                self.UPSERT_TABLE_TMPL.format(
                    table_name=self.table_name,
                    uuid_col=self.uuid_col,
                    element_col=self.element_col,
                ))

    def get_config(self) -> Dict[str, Any]:
        """
        Return the configuration of this instance.

        :return: Dictionary keyed by the ``__init__`` parameter names. The
            connection values are read back from the PSQL helper, the
            remaining ones from this instance's attributes.
        """
        config: Dict[str, Any] = dict(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
            element_col=self.element_col,
        )

        # Connection parameters live on the helper, not on this instance.
        helper = self.psql_helper
        for attr_name in ("db_name", "db_host", "db_port", "db_user",
                          "db_pass"):
            config[attr_name] = getattr(helper, attr_name)

        config["multiquery_batch_size"] = self.multiquery_batch_size
        config["pickle_protocol"] = self.pickle_protocol
        config["read_only"] = self.read_only
        config["create_table"] = self.create_table
        return config

    def count(self) -> int:
        """
        Count the rows of the backing table.

        :return: Number of descriptor elements stored in this set.
        """
        # Counting only the UUID column keeps the amount of data read small.
        count_expr = 'count(%s)' % self.uuid_col
        query = self.SELECT_TMPL.format(col=count_expr,
                                        table_name=self.table_name)

        def run_count(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(query)

        rows = list(self.psql_helper.single_execute(run_count,
                                                    yield_result_rows=True))
        # The aggregate yields exactly one row holding exactly one value.
        only_row = rows[0]
        return only_row[0]

    def clear(self) -> None:
        """
        Remove every descriptor from this set.

        :raises ReadOnlyError: This set is configured to be read-only.
        """
        if self.read_only:
            raise ReadOnlyError("Cannot clear a read-only set.")

        delete_sql = self.DELETE_LIKE_TMPL.format(
            uuid_col=self.uuid_col,
            table_name=self.table_name,
        )
        # A lone '%' LIKE pattern matches every UUID value.
        match_everything = {'uuid_like': '%'}

        def run_delete(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(delete_sql, match_everything)

        # Drain the returned iterable so the statement is actually executed.
        for _ in self.psql_helper.single_execute(run_delete):
            pass

    def has_descriptor(self, uuid: Hashable) -> bool:
        """
        Check whether a descriptor with the given UUID is stored in this set.

        :param uuid: UUID to query for. Its ``str()`` form is what is compared
            against the UUID column.

        :return: True if a row with the given UUID exists, False otherwise.
        """
        q = self.SELECT_LIKE_TMPL.format(
            # hacking return value to something simple
            element_col='true',
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )

        # The query matches with LIKE, where '%' and '_' are wildcards and
        # backslash is PostgreSQL's default escape character. Escape all three
        # so the UUID is matched literally; otherwise e.g. "a_b" would also
        # report "axb" as present.
        uuid_pattern = (str(uuid)
                        .replace('\\', '\\\\')
                        .replace('%', '\\%')
                        .replace('_', '\\_'))

        def exec_hook(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(q, {'uuid_like': uuid_pattern})

        # Should either yield one or zero rows
        return bool(
            list(
                self.psql_helper.single_execute(exec_hook,
                                                yield_result_rows=True)))

    def add_descriptor(self, descriptor: DescriptorElement) -> None:
        """
        Add a descriptor to this set.

        Adding the same descriptor multiple times should not add multiple copies
        of the descriptor in the set (based on UUID). Added descriptors
        overwrite set descriptors based on UUID.

        :param descriptor: Descriptor to set.

        :raises ReadOnlyError: This set is configured to be read-only.
        """
        if self.read_only:
            # The message previously claimed a "clear" was attempted.
            raise ReadOnlyError("Cannot add to a read-only set.")

        q = self.UPSERT_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
            element_col=self.element_col,
        )
        # The element is stored pickled, keyed by the string form of its UUID.
        v = {
            'uuid_val': str(descriptor.uuid()),
            'element_val': psycopg2.Binary(
                pickle.dumps(descriptor, self.pickle_protocol)
            ),
        }

        def exec_hook(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(q, v)

        # Consume the returned iterable so the statement is actually executed.
        list(self.psql_helper.single_execute(exec_hook))

    def add_many_descriptors(self,
                             descriptors: Iterable[DescriptorElement]) -> None:
        """
        Add multiple descriptors at one time.

        Adding the same descriptor multiple times should not add multiple copies
        of the descriptor in the set (based on UUID). Added descriptors
        overwrite set descriptors based on UUID.

        :param descriptors: Iterable of descriptor instances to add to this
            set.

        :raises ReadOnlyError: This set is configured to be read-only.
        """
        if self.read_only:
            # The message previously claimed a "clear" was attempted.
            raise ReadOnlyError("Cannot add to a read-only set.")

        q = self.UPSERT_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
            element_col=self.element_col,
        )

        # Lazily transform the input descriptors into the value dictionaries
        # expected by the upsert query, so the whole input never has to be
        # held in memory at once.
        def elements() -> Generator[Dict[str, Any], None, None]:
            for d in descriptors:
                yield {
                    'uuid_val': str(d.uuid()),
                    'element_val': psycopg2.Binary(
                        pickle.dumps(d, self.pickle_protocol)
                    ),
                }

        def exec_hook(cur: psycopg2.extensions.cursor,
                      batch: Sequence[Dict[str, Any]]) -> None:
            cur.executemany(q, batch)

        LOG.debug("Adding many descriptors")
        # Consume the returned iterable so every batch is actually executed.
        list(
            self.psql_helper.batch_execute(elements(), exec_hook,
                                           self.multiquery_batch_size))

    def get_descriptor(self, uuid: Hashable) -> DescriptorElement:
        """
        Get the descriptor in this set that is associated with the given UUID.

        :param uuid: UUID of the DescriptorElement to get.

        :raises KeyError: The given UUID doesn't associate to a
            DescriptorElement in this set.
        :raises RuntimeError: More than one row matched the given UUID.

        :return: DescriptorElement associated with the queried UUID.
        """
        q = self.SELECT_LIKE_TMPL.format(
            element_col=self.element_col,
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )
        # The query matches with LIKE, where '%' and '_' are wildcards and
        # backslash is PostgreSQL's default escape character. Escape all three
        # so the UUID is matched literally; otherwise a UUID such as "a_b"
        # could return the element stored under "axb" or trip the
        # multiple-match error below.
        uuid_pattern = (str(uuid)
                        .replace('\\', '\\\\')
                        .replace('%', '\\%')
                        .replace('_', '\\_'))
        v = {'uuid_like': uuid_pattern}

        def eh(c: psycopg2.extensions.cursor) -> None:
            c.execute(q, v)
            if c.rowcount == 0:
                raise KeyError(uuid)
            elif c.rowcount != 1:
                raise RuntimeError("Found more than one entry for the given "
                                   "uuid '%s' (got: %d)" % (uuid, c.rowcount))

        r = list(self.psql_helper.single_execute(eh, yield_result_rows=True))
        # NOTE(review): unpickling is only safe as long as the table content
        # is trusted (i.e. written by this class).
        return pickle.loads(bytes(r[0][0]))

    def get_many_descriptors(
            self, uuids: Iterable[Hashable]
    ) -> Generator[DescriptorElement, None, None]:
        """
        Yield the descriptors stored under the given UUIDs, in the order in
        which the UUIDs were given.

        :param uuids: Iterable of descriptor UUIDs to query for.

        :raises KeyError: A given UUID doesn't associate with a
            DescriptorElement in this set.

        :return: Iterator of descriptors associated to given uuid values.
        """
        query = self.SELECT_MANY_ORDERED_TMPL.format(
            uuid_col=self.uuid_col,
            element_col=self.element_col,
            table_name=self.table_name,
        )

        # Every UUID pulled from the input is remembered here, in order, so
        # that a UUID without a matching row can be detected and reported.
        requested = []

        def stringified_uuids() -> Generator[str, None, None]:
            for requested_uuid in uuids:
                requested.append(requested_uuid)
                yield str(requested_uuid)

        def run_batch(cur: psycopg2.extensions.cursor,
                      batch: Sequence[str]) -> None:
            cur.execute(query, {'uuid_list': batch})

        LOG.debug("Getting many descriptors")
        rows = self.psql_helper.batch_execute(stringified_uuids(),
                                              run_batch,
                                              self.multiquery_batch_size,
                                              yield_result_rows=True)

        # The query returns rows in the order of the queried UUIDs, so rows
        # and requested UUIDs must line up one-to-one. A mismatch means the
        # join found no row for the expected UUID.
        #   - ``rows`` has to stay the first argument of ``zip``: it is
        #     advanced first, which is presumably what drives
        #     ``stringified_uuids`` and thereby fills ``requested``.
        num_yielded = 0
        for row, expected_uuid in zip(rows, requested):
            element = pickle.loads(bytes(row[0]))
            if element.uuid() != expected_uuid:
                raise KeyError(expected_uuid)
            yield element
            num_yielded += 1

        # Fewer rows than requested UUIDs: trailing UUIDs matched nothing.
        # Only the first unmatched one is reported.
        if num_yielded != len(requested):
            raise KeyError(requested[num_yielded])

    def remove_descriptor(self, uuid: Hashable) -> None:
        """
        Remove a descriptor from this set by the given UUID.

        :param uuid: UUID of the DescriptorElement to remove.

        :raises ReadOnlyError: This set is configured to be read-only.
        :raises KeyError: The given UUID doesn't associate to a
            DescriptorElement in this set.
        """
        if self.read_only:
            raise ReadOnlyError("Cannot remove from a read-only set.")

        q = self.DELETE_LIKE_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )
        # The query matches with LIKE, where '%' and '_' are wildcards and
        # backslash is PostgreSQL's default escape character. Escape all three
        # so that only the row with exactly this UUID is deleted; otherwise
        # removing "a_b" would also delete "axb".
        uuid_pattern = (str(uuid)
                        .replace('\\', '\\\\')
                        .replace('%', '\\%')
                        .replace('_', '\\_'))
        v = {'uuid_like': uuid_pattern}

        def execute(c: psycopg2.extensions.cursor) -> None:
            c.execute(q, v)
            # Nothing deleted if rowcount == 0
            # (otherwise 1 when deleted a thing)
            if c.rowcount == 0:
                raise KeyError(uuid)

        # Consume the returned iterable so the statement is actually executed.
        list(self.psql_helper.single_execute(execute))

    def remove_many_descriptors(self, uuids: Iterable[Hashable]) -> None:
        """
        Remove descriptors associated to given descriptor UUIDs from this set.

        Passing no UUIDs at all is a no-op.

        :param uuids: Iterable of descriptor UUIDs to remove.

        :raises ReadOnlyError: This set is configured to be read-only.
        :raises KeyError: A given UUID doesn't associate with a
            DescriptorElement in this set.
        """
        if self.read_only:
            raise ReadOnlyError("Cannot remove from a read-only set.")

        # De-duplicate: each UUID can be deleted (and returned) at most once.
        str_uuid_set = set(str(uid) for uid in uuids)
        if not str_uuid_set:
            # An empty tuple would be rendered as ``IN ()``, which is not
            # valid SQL. There is nothing to remove anyway.
            return

        q = self.DELETE_MANY_TMPL.format(
            table_name=self.table_name,
            uuid_col=self.uuid_col,
        )
        v = {'uuid_tuple': tuple(str_uuid_set)}

        def execute(c: psycopg2.extensions.cursor) -> None:
            c.execute(q, v)

            # Check query UUIDs against rows that would actually be deleted.
            deleted_uuid_set = set(r[0] for r in c.fetchall())
            for uid in str_uuid_set:
                if uid not in deleted_uuid_set:
                    raise KeyError(uid)

        # Consume the returned iterable so the statement is actually executed.
        list(self.psql_helper.single_execute(execute))

    def keys(self) -> Generator[Hashable, None, None]:
        """
        Iterate over the keys of this set, i.e. the UUIDs of the stored
        descriptors.

        :return: Iterator over descriptor UUIDs.
        """
        # The UUID is read from the deserialized element rather than from the
        # UUID column: the column only holds a string form, while the element
        # carries the UUID in its true type.
        yield from (element.uuid() for element in self.descriptors())

    def descriptors(self) -> Generator[DescriptorElement, None, None]:
        """
        Iterate over all descriptor element instances stored in this set.

        :return: Iterator over the unpickled descriptor elements.
        """
        select_sql = self.SELECT_TMPL.format(table_name=self.table_name,
                                             col=self.element_col)

        def run_select(c: psycopg2.extensions.cursor) -> None:
            c.execute(select_sql)

        # ``named=True`` presumably requests a named cursor from the helper,
        # so rows are fetched incrementally instead of all at once.
        rows = self.psql_helper.single_execute(
            run_select, yield_result_rows=True, named=True)
        # Each row holds a single value: the pickled element.
        yield from (pickle.loads(bytes(row[0])) for row in rows)

    def items(
            self) -> Generator[Tuple[Hashable, DescriptorElement], None, None]:
        """
        Iterate over ``(UUID, descriptor element)`` pairs of this set.

        :return: Iterator over key and instance pairs, where the key is the
            UUID reported by the element itself.
        """
        yield from (
            (element.uuid(), element) for element in self.descriptors()
        )
    def __init__(self,
                 table_name: str = 'descriptor_set',
                 uuid_col: str = 'uid',
                 element_col: str = 'element',
                 db_name: str = 'postgres',
                 db_host: Optional[str] = None,
                 db_port: Optional[int] = None,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 multiquery_batch_size: Optional[int] = 1000,
                 pickle_protocol: int = -1,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Initialize set instance.

        :param table_name: Name of the table to use.
        :param uuid_col: Name of the column containing the UUID signatures.
        :param element_col: Name of the table column that will contain
            serialized elements.
        :param db_name: The name of the database to connect to.
        :param db_host: Host address of the Postgres server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :param db_port: Port the Postgres server is exposed on. If None, we
            assume the default port (5432).
        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :param db_pass: Password for the user we're connecting as. This may be
            None if no password is to be used.
        :param multiquery_batch_size: For queries that handle sending or
            receiving many queries at a time, batch queries based on this size.
            If this is None, then no batching occurs.

            The advantage of batching is that it reduces the memory impact for
            queries dealing with a very large number of elements (don't have to
            store the full query for all elements in RAM), but the transaction
            will be some amount slower due to splitting the query into multiple
            transactions.
        :param pickle_protocol: Pickling protocol to use. We will use -1 by
            default (latest version, probably binary). Must lie within
            ``[-1, pickle.HIGHEST_PROTOCOL]``.
        :param read_only: Only allow read actions against this set.
            Modification actions will throw a ReadOnlyError exceptions.
        :param create_table: If this instance should try to create the storing
            table before actions are performed against it when not set to be
            read-only. If the configured user does not have sufficient
            permissions to create the table and it does not currently exist, an
            exception will be raised.

        :raises AssertionError: The batch size is not positive or the pickle
            protocol is outside of the supported range.
        """
        super(PostgresDescriptorSet, self).__init__()

        self.table_name = table_name
        self.uuid_col = uuid_col
        self.element_col = element_col

        self.multiquery_batch_size = multiquery_batch_size
        self.pickle_protocol = pickle_protocol
        self.read_only = bool(read_only)
        self.create_table = create_table

        # Checking parameters where necessary
        if self.multiquery_batch_size is not None:
            self.multiquery_batch_size = int(self.multiquery_batch_size)
            assert self.multiquery_batch_size > 0, (
                "A given batch size must be greater than 0 in size "
                "(given: %d)." % self.multiquery_batch_size
            )
        # The upper bound follows the running interpreter instead of being
        # pinned to protocol 2, so that every protocol ``pickle.dumps``
        # accepts can be configured explicitly (-1 already selects the
        # highest one).
        assert -1 <= self.pickle_protocol <= pickle.HIGHEST_PROTOCOL, \
            ("Given pickle protocol is not in the known valid range. Given: %s"
             % self.pickle_protocol)

        # The batch size doubles as the helper's iteration size.
        self.psql_helper = PsqlConnectionHelper(db_name, db_host, db_port,
                                                db_user, db_pass,
                                                self.multiquery_batch_size,
                                                PSQL_TABLE_CREATE_RLOCK)
        # A read-only instance never attempts to create the table.
        if not self.read_only and self.create_table:
            self.psql_helper.set_table_upsert_sql(
                self.UPSERT_TABLE_TMPL.format(
                    table_name=self.table_name,
                    uuid_col=self.uuid_col,
                    element_col=self.element_col,
                ))
class PostgresKeyValueStore(KeyValueStore):
    """
    PostgreSQL-backed key-value storage.
    """
    class SqlTemplates:
        """
        Container for static PostgreSQL queries used by the containing class.

        ``{name:s}`` fields are filled in with ``str.format`` (table and column
        identifiers), while ``%(name)s`` placeholders are left for the database
        driver to bind at execution time.
        """

        # Create the key/value table when it does not exist yet.
        #
        # Format params: table_name, key_col, value_col
        UPSERT_TABLE_TMPL = norm_psql_cmd_string("""
            CREATE TABLE IF NOT EXISTS {table_name:s} (
              {key_col:s} BYTEA NOT NULL,
              {value_col:s} BYTEA NOT NULL,
              PRIMARY KEY ({key_col:s})
            );
        """)

        # Select an arbitrary expression over every row of the table.
        #
        # Format params: query, table_name
        SELECT_TMPL = norm_psql_cmd_string("""
            SELECT {query:s} FROM {table_name:s};
        """)

        # Select rows whose key exactly equals the bound ``key_like`` value.
        #
        # NOTE: The template and placeholder names are kept for compatibility
        # with existing callers, but the comparison is deliberately ``=``
        # rather than ``LIKE``. Keys are arbitrary serialized bytes, and LIKE
        # would interpret any ``%`` / ``_`` bytes in them as wildcards (and a
        # backslash as an escape), matching rows that belong to other keys.
        #
        # Format params: query, table_name, key_col
        # Value params: key_like
        SELECT_LIKE_TMPL = norm_psql_cmd_string("""
            SELECT {query:s}
              FROM {table_name:s}
             WHERE {key_col:s} = %(key_like)s
        """)

        # Select rows whose key is a member of the bound ``key_tuple``.
        #
        # Format params: query, table_name, key_col
        # Value params: key_tuple
        SELECT_MANY_TMPL = norm_psql_cmd_string("""
            SELECT {query:s}
              FROM {table_name:s}
             WHERE {key_col:s} IN %(key_tuple)s
        """)

        # Insert a key/value pair, overwriting the value on key conflict.
        #
        # Format params: table_name, key_col, value_col
        # Value params: key, val
        UPSERT_TMPL = norm_psql_cmd_string("""
            INSERT INTO {table_name:s} ({key_col:s}, {value_col:s})
                VALUES (%(key)s, %(val)s)
                ON CONFLICT ({key_col:s})
                    DO UPDATE
                        SET {value_col:s} = EXCLUDED.{value_col:s}
        """)

        # Delete rows whose key exactly equals the bound ``key_like`` value.
        # Exact equality is used for the same reason as in SELECT_LIKE_TMPL:
        # a pattern match could delete rows belonging to other keys.
        #
        # Format params: table_name, key_col
        # Value params: key_like
        DELETE_LIKE_TMPL = norm_psql_cmd_string("""
            DELETE FROM {table_name:s}
            WHERE {key_col:s} = %(key_like)s
        """)

        # Delete every row of the table.
        #
        # Format params: table_name
        DELETE_ALL = norm_psql_cmd_string("""
            DELETE FROM {table_name:s}
        """)

    @classmethod
    def is_usable(cls) -> bool:
        """
        Report whether this implementation can be used in the current
        environment.

        :return: False when the module-level ``psycopg2`` name is None, True
            otherwise.
        """
        if psycopg2 is None:
            return False
        return True

    def __init__(self,
                 table_name: str = "data_set",
                 key_col: str = 'key',
                 value_col: str = 'value',
                 db_name: str = 'postgres',
                 db_host: Optional[str] = None,
                 db_port: Optional[int] = None,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 batch_size: Optional[int] = 1000,
                 pickle_protocol: int = -1,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Initialize a PostgreSQL-backed key-value store instance.

        :param table_name: Name of the table to use.
        :type table_name: str

        :param key_col: Name of the table column that will contain the
            serialized keys.
        :type key_col: str

        :param value_col: Name of the table column that will contain
            serialized values.
        :type value_col: str

        :param db_name: The name of the database to connect to.
        :type db_name: str

        :param db_host: Host address of the Postgres server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :type db_host: str | None

        :param db_port: Port the Postgres server is exposed on. If None, we
            assume the default port (5432).
        :type db_port: int | None

        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :type db_user: str | None

        :param db_pass: Password for the user we're connecting as. This may be
            None if no password is to be used.
        :type db_pass: str | None

        :param batch_size: For queries that handle sending or
            receiving many queries at a time, batch queries based on this size.
            If this is None, then no batching occurs. A non-None value is
            coerced with ``int`` and must be greater than 0.

            The advantage of batching is that it reduces the memory impact for
            queries dealing with a very large number of elements (don't have to
            store the full query for all elements in RAM), but the transaction
            will be some amount slower due to splitting the query into multiple
            transactions.
        :type batch_size: int | None

        :param pickle_protocol: Pickling protocol to use. We will use -1 by
            default (latest version, probably binary). Must be in the range
            [-1, 2].
        :type pickle_protocol: int

        :param read_only: Only allow read actions against this store.
            Modification actions will throw a ReadOnlyError exceptions.
        :type read_only: bool

        :param create_table: If this instance should try to create the storing
            table before actions are performed against it when not set to be
            read-only. If the configured user does not have sufficient
            permissions to create the table and it does not currently exist, an
            exception will be raised.
        :type create_table: bool

        """
        super(PostgresKeyValueStore, self).__init__()

        self._table_name = table_name
        self._key_col = key_col
        self._value_col = value_col

        self._batch_size = batch_size
        self._pickle_protocol = pickle_protocol
        self._read_only = bool(read_only)
        self._create_table = create_table

        # Checking parameters where necessary
        if self._batch_size is not None:
            self._batch_size = int(self._batch_size)
            assert self._batch_size > 0, \
                "A given batch size must be greater than 0 in size " \
                "(given: %d)." % self._batch_size
        assert -1 <= self._pickle_protocol <= 2, \
            ("Given pickle protocol is not in the known valid range [-1, 2]. "
             "Given: %s." % self._pickle_protocol)

        # Helper structure for SQL operations. Hand it the validated,
        # int-coerced batch size rather than the raw argument so that the
        # helper and this instance always agree on the value.
        self._psql_helper = PsqlConnectionHelper(
            db_name,
            db_host,
            db_port,
            db_user,
            db_pass,
            itersize=self._batch_size,
            table_upsert_lock=PSQL_TABLE_CREATE_RLOCK,
        )

        # Only register the table-creation SQL when this instance is allowed
        # to modify the database and table creation was requested.
        if not self._read_only and self._create_table:
            self._psql_helper.set_table_upsert_sql(
                self.SqlTemplates.UPSERT_TABLE_TMPL.format(
                    table_name=self._table_name,
                    key_col=self._key_col,
                    value_col=self._value_col))

    @staticmethod
    def _py_to_bin(k: Any) -> "psycopg2.Binary":
        """
        Serialize a python object with ``pickle`` and wrap the resulting bytes
        in a ``psycopg2.Binary`` instance.

        :param k: Python object instance to serialize.

        :return: ``psycopg2.Binary`` buffer instance to use for insertion into
            or query against a table.
        """
        pickled = pickle.dumps(k)
        return psycopg2.Binary(pickled)

    @staticmethod
    def _bin_to_py(b: "psycopg2.Binary") -> Any:
        """
        Un-"translate" psycopg2.Binary value (buffer) to a python type.

        :param b: ``psycopg2.Binary`` buffer instance as retrieved from a
            PostgreSQL query.

        :return: Python object instance as loaded via pickle from the given
            ``psycopg2.Binary`` buffer.
        """
        return pickle.loads(bytes(b))

    def get_config(self) -> Dict:
        """
        Return a JSON-compliant dictionary that could be passed to this class's
        ``from_config`` method to produce an instance with identical
        configuration.

        In the common case, this involves naming the keys of the dictionary
        based on the initialization argument names as if it were to be passed
        to the constructor via dictionary expansion.

        :return: JSON type compliant configuration dictionary.
        :rtype: dict

        """
        return {
            "table_name": self._table_name,
            "key_col": self._key_col,
            "value_col": self._value_col,
            "db_name": self._psql_helper.db_name,
            "db_host": self._psql_helper.db_host,
            "db_port": self._psql_helper.db_port,
            "db_user": self._psql_helper.db_user,
            "db_pass": self._psql_helper.db_pass,
            "batch_size": self._batch_size,
            "pickle_protocol": self._pickle_protocol,
            "read_only": self._read_only,
            "create_table": self._create_table,
        }

    def __repr__(self) -> str:
        """
        :return: Representative string for this class.
        """
        return (
            f"{super().__repr__()}{{"
            f"table_name: {self._table_name}, "
            f"key_col: {self._key_col}, "
            f"value_col: {self._value_col}, "
            f"db_name: {self._psql_helper.db_name}, "
            f"db_host: {self._psql_helper.db_host}, "
            f"db_port: {self._psql_helper.db_port}, "
            f"db_pass: {'***' if self._psql_helper.db_pass is not None else 'None'}, "
            f"batch_size: {self._batch_size}, "
            f"pickle_protocol: {self._pickle_protocol}, "
            f"read_only: {self._read_only}, "
            f"create_table: {self._create_table}"
            f"}}")

    def count(self) -> int:
        """
        Count the rows of the backing table via ``count(<key column>)``.

        :return: The number of key-value relationships in this store.
        :rtype: int | long
        """
        def run_count(cur: psycopg2.extensions.cursor) -> None:
            count_sql = self.SqlTemplates.SELECT_TMPL.format(
                query='count(%s)' % self._key_col,
                table_name=self._table_name,
            )
            cur.execute(count_sql)

        rows = list(
            self._psql_helper.single_execute(run_count,
                                             yield_result_rows=True))
        # The count is the first column of the first returned row.
        first_row = rows[0]
        return first_row[0]

    def keys(self) -> Iterator[Hashable]:
        """
        Lazily iterate over every key stored in the table.

        :return: Iterator over keys in this store.
        :rtype: collections.abc.Iterator[collections.abc.Hashable]
        """
        def run_select(cur: psycopg2.extensions.cursor) -> None:
            select_sql = self.SqlTemplates.SELECT_TMPL.format(
                query=self._key_col,
                table_name=self._table_name,
            )
            cur.execute(select_sql)

        # A named cursor is requested since this is a plain select and the
        # table on the server may be large.
        rows = self._psql_helper.single_execute(run_select,
                                                yield_result_rows=True,
                                                named=True)
        for row in rows:
            # First (only) column holds the serialized key.
            yield self._bin_to_py(row[0])

    def values(self) -> Iterator[Any]:
        """
        Lazily iterate over every value stored in the table.

        :return: Iterator over values in this store. Values are not guaranteed
            to be in any particular order.
        """
        def run_select(cur: psycopg2.extensions.cursor) -> None:
            select_sql = self.SqlTemplates.SELECT_TMPL.format(
                query=self._value_col,
                table_name=self._table_name,
            )
            cur.execute(select_sql)

        rows = self._psql_helper.single_execute(run_select,
                                                yield_result_rows=True,
                                                named=True)
        for row in rows:
            # First (only) column holds the serialized value.
            yield self._bin_to_py(row[0])

    def is_read_only(self) -> bool:
        """
        :return: True if this instance is read-only and False if it is not.
        :rtype: bool
        """
        return self._read_only

    def has(self, key: Hashable) -> bool:
        """
        Check if this store has a value for the given key.

        A constant is selected for the rows matched by ``SELECT_LIKE_TMPL``
        for the serialized key; the key is considered present when at least
        one row comes back.

        :param key: Key to check for a value for.
        :type key: collections.abc.Hashable

        :return: If this store has a value for the given key.
        :rtype: bool

        """
        super(PostgresKeyValueStore, self).has(key)

        select_sql = self.SqlTemplates.SELECT_LIKE_TMPL.format(
            query='true',
            table_name=self._table_name,
            key_col=self._key_col,
        )

        def run_select(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(select_sql, {'key_like': self._py_to_bin(key)})

        matches = list(
            self._psql_helper.single_execute(run_select,
                                             yield_result_rows=True))
        # Any returned row means the key is present.
        return len(matches) > 0

    def add(self, key: Hashable, value: Any) -> "PostgresKeyValueStore":
        """
        Store a value under the given key, using the upsert template so an
        existing value for the key is overwritten.

        :param key: Key for the value. Must be hashable.
        :param value: Python object to store.

        :raises ReadOnlyError: If this instance is marked as read-only.

        :return: Self.
        """
        super(PostgresKeyValueStore, self).add(key, value)

        upsert_sql = self.SqlTemplates.UPSERT_TMPL.format(
            table_name=self._table_name,
            key_col=self._key_col,
            value_col=self._value_col,
        )
        # Both the key and the value are stored in serialized binary form.
        params = dict(key=self._py_to_bin(key), val=self._py_to_bin(value))

        def run_upsert(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(upsert_sql, params)

        # Exhaust the returned iterable so the statement actually runs.
        for _ in self._psql_helper.single_execute(run_upsert):
            pass
        return self

    def add_many(self, d: Mapping[Hashable, Any]) -> "PostgresKeyValueStore":
        """
        Add multiple key-value pairs at a time into this store as represented in
        the provided dictionary `d`.

        Pairs are serialized lazily and sent to the database in batches of the
        configured batch size. When no batch size is configured (None), each
        batch handed to the callback is sent as a single page.

        :param d: Dictionary of key-value pairs to add to this store.

        :return: Self.
        """
        super(PostgresKeyValueStore, self).add_many(d)

        q = self.SqlTemplates.UPSERT_TMPL.format(
            table_name=self._table_name,
            key_col=self._key_col,
            value_col=self._value_col,
        )

        # Iterator over transformed inputs into values for statement.
        def val_iter() -> Iterator:
            for key, val in d.items():
                yield {
                    'key': self._py_to_bin(key),
                    'val': self._py_to_bin(val)
                }

        def cb(cur: psycopg2.extensions.cursor, v_batch: Iterable) -> None:
            params = list(v_batch)
            # ``execute_batch`` requires an integer page size. A batch size of
            # None is a documented "no batching" setting for this class, so in
            # that case send the whole batch as one page instead of passing
            # None through.
            if self._batch_size is not None:
                page_size = self._batch_size
            else:
                page_size = max(len(params), 1)
            psycopg2.extras.execute_batch(cur,
                                          q,
                                          params,
                                          page_size=page_size)

        # Exhaust the returned iterable so every batch actually runs.
        list(self._psql_helper.batch_execute(val_iter(), cb, self._batch_size))
        return self

    def remove(self, key: Hashable) -> KeyValueStore:
        """
        Delete the entry stored under a single key.

        Membership is checked first so that a missing key is reported with a
        ``KeyError`` before any delete statement is issued.

        :param key: Key to remove.
        :type key: collections.abc.Hashable

        :raises ReadOnlyError: If this instance is marked as read-only.
        :raises KeyError: The given key is not present in this store.

        :return: Self.
        :rtype: KeyValueStore

        """
        super(PostgresKeyValueStore, self).remove(key)
        if not (key in self):
            raise KeyError(key)

        delete_sql = self.SqlTemplates.DELETE_LIKE_TMPL.format(
            table_name=self._table_name,
            key_col=self._key_col,
        )
        params = {'key_like': self._py_to_bin(key)}

        def run_delete(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(delete_sql, params)

        # Exhaust the returned iterable so the statement actually runs.
        for _ in self._psql_helper.single_execute(run_delete):
            pass
        return self

    def _check_contained_keys(self, keys: Set[Hashable]) -> Set[Hashable]:
        """
        Determine which of the given keys are missing from the table.

        The keys are serialized and looked up in batches with the ``IN`` based
        select template; every key returned by the database is collected and
        subtracted from the input set.

        :param set keys: Keys to check for.

        :return: A set of keys NOT present in the table.
        :rtype: set[collections.abc.Hashable]
        """
        select_sql = self.SqlTemplates.SELECT_MANY_TMPL.format(
            query=self._key_col,
            table_name=self._table_name,
            key_col=self._key_col,
        )

        # Keys that the database reported as present.
        found: Set[Hashable] = set()

        def collect(cursor: psycopg2.extensions.cursor,
                    batch: Iterable) -> None:
            cursor.execute(select_sql, {'key_tuple': tuple(batch)})
            for row in cursor:
                found.add(self._bin_to_py(row[0]))

        serialized_keys = (self._py_to_bin(k_) for k_ in keys)
        # Exhaust the returned iterable so every batch actually runs.
        for _ in self._psql_helper.batch_execute(serialized_keys, collect,
                                                 self._batch_size):
            pass

        return keys - found

    def remove_many(self, keys: Iterable[Hashable]) -> KeyValueStore:
        """
        Remove multiple keys and associated values.

        All keys are first checked for presence; nothing is deleted unless
        every requested key exists. Deletions are then sent in batches of the
        configured batch size. When no batch size is configured (None), each
        batch handed to the callback is sent as a single page.

        :param keys: Iterable of keys to remove.  If this is empty this method
            does nothing.
        :type keys: collections.abc.Iterable[collections.abc.Hashable]

        :raises ReadOnlyError: If this instance is marked as read-only.
        :raises KeyError: One or more given keys are not present in this
            store. The error carries the single missing key, or the set of
            missing keys when there are several. The store is not modified if
            any key is invalid.

        :return: Self.
        :rtype: KeyValueStore

        """
        super(PostgresKeyValueStore, self).remove_many(keys)
        keys = set(keys)

        # Check that all keys requested for removal are contained in our table
        # before attempting to remove any of them.
        key_diff = self._check_contained_keys(keys)
        # If we're trying to remove a key not in our table, appropriately raise
        # a KeyError.
        if key_diff:
            if len(key_diff) == 1:
                raise KeyError(next(iter(key_diff)))
            raise KeyError(key_diff)

        # Proceed with removal
        def key_like_iter() -> Iterator:
            """ Iterator over query value sets. """
            for k_ in keys:
                yield self._py_to_bin(k_)

        del_q = self.SqlTemplates.DELETE_LIKE_TMPL.format(
            table_name=self._table_name,
            key_col=self._key_col,
        )

        def del_cb(cursor: psycopg2.extensions.cursor,
                   v_batch: Iterable) -> None:
            # Execute the query with a list of value dicts.
            params = [{'key_like': k} for k in v_batch]
            # ``execute_batch`` requires an integer page size. A batch size of
            # None is a documented "no batching" setting for this class, so in
            # that case send the whole batch as one page instead of passing
            # None through.
            if self._batch_size is not None:
                page_size = self._batch_size
            else:
                page_size = max(len(params), 1)
            psycopg2.extras.execute_batch(cursor,
                                          del_q,
                                          params,
                                          page_size=page_size)

        # Exhaust the returned iterable so every batch actually runs.
        list(
            self._psql_helper.batch_execute(key_like_iter(), del_cb,
                                            self._batch_size))
        return self

    def get(self, key: Hashable, default: Any = NO_DEFAULT_VALUE) -> Any:
        """
        Look up and deserialize the value stored for a key.

        :param key: Key to get the value of.
        :param default: Optional default value if the given key is not present
            in this store. This may be any value except for the
            ``NO_DEFAULT_VALUE`` constant (custom anonymous class instance).

        :raises KeyError: The given key is not present in this store and no
            default value given.

        :return: Deserialized python object stored for the given key.
        """
        select_sql = self.SqlTemplates.SELECT_LIKE_TMPL.format(
            query=self._value_col,
            table_name=self._table_name,
            key_col=self._key_col,
        )
        params = dict(key_like=self._py_to_bin(key))

        def run_select(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(select_sql, params)

        rows = list(
            self._psql_helper.single_execute(run_select,
                                             yield_result_rows=True))
        if rows:
            # First column of the first row holds the serialized value.
            return self._bin_to_py(rows[0][0])
        # Nothing found: fall back to the default when one was provided.
        if default is NO_DEFAULT_VALUE:
            raise KeyError(key)
        return default

    def get_many(self,
                 keys: Iterable[Hashable],
                 default: Any = NO_DEFAULT_VALUE) -> Iterable[Any]:
        """
        Get the values for the given keys.

        *NOTE:* **Implementing sub-classes are responsible for raising a
        ``KeyError`` where appropriate.**

        :param keys: The keys for which associated values are requested.
        :param default: Optional default value if a given key is not present
            in this store. This may be any value except for the
            ``NO_DEFAULT_VALUE`` constant (custom anonymous class instance).

        :raises KeyError: A given key is not present in this store and no
            default value given.

        :return: Iterable of deserialized python objects stored for the given
            keys in the order that the corresponding keys were provided.
        """
        sql_command_string = self.SqlTemplates.SELECT_MANY_TMPL.format(
            query=', '.join((self._key_col, self._value_col)),
            table_name=self._table_name,
            key_col=self._key_col)
        keys = list(keys)

        sql_keys = tuple(self._py_to_bin(key_) for key_ in keys)
        sql_variables = {'key_tuple': sql_keys}

        def postgres_callback(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(sql_command_string, sql_variables)

        retrieved_dict = {
            self._bin_to_py(row_[0]): self._bin_to_py(row_[1])
            for row_ in self._psql_helper.single_execute(
                postgres_callback, yield_result_rows=True)
        }

        if default is NO_DEFAULT_VALUE:
            for key_ in keys:
                yield retrieved_dict[key_]
        else:
            for key_ in keys:
                yield retrieved_dict.get(key_, default)

    def clear(self) -> KeyValueStore:
        """
        Clear this key-value store by deleting every row of the backing
        table.

        The super-method is invoked first, as the other modifying methods of
        this class do, so that the parent class's checks (including the
        read-only check) run before anything is deleted.

        :raises ReadOnlyError: If this instance is marked as read-only.

        :return: Self.
        """
        super(PostgresKeyValueStore, self).clear()

        q = self.SqlTemplates.DELETE_ALL.format(table_name=self._table_name)

        def cb(cur: psycopg2.extensions.cursor) -> None:
            cur.execute(q)

        # Exhaust the returned iterable so the statement actually runs.
        list(self._psql_helper.single_execute(cb))
        return self
Exemple #6
0
class PostgresDataElement(DataElement):  # lgtm [py/missing-equals]
    """
    Data element bytes stored in PostgreSQL database.

    Storage table should have three columns for the following components:
    - data SHA1 (effective UID)
    - data content-type / MIMETYPE
    - data bytes

    Efficient connection pooling may be achieved via external utilities like
    PGBounder.

    Due to the use of the "ON CONFLICT" clause in upserting data, this
    implementation requires at least PostgreSQL version 9.5 or greater.

    """

    # SHA1 checksum of 0-length data (empty bytes)
    EMPTY_SHA = hashlib.sha1(b'').hexdigest()

    class CommandTemplates(object):
        """
        Encapsulation of command templates.

        ``{name:s}`` fields are "format params", filled in with ``str.format``
        (table and column identifiers). ``%(name)s`` placeholders are "value
        params", left in the command for binding at execution time.
        """

        # Upsert table for storage if desired
        #
        # Format params:
        # - table_name
        # - id_col
        # - sha1_col
        # - mime_col
        # - byte_col
        UPSERT_TABLE = norm_psql_cmd_string("""
            CREATE TABLE IF NOT EXISTS {table_name:s} (
              {id_col:s}   TEXT NOT NULL,
              {sha1_col:s} TEXT NOT NULL,
              {mime_col:s} TEXT NOT NULL,
              {byte_col:s} BYTEA NOT NULL,
              PRIMARY KEY ({id_col:s})
            );
        """)

        # Select ``col`` for a given entry ID.
        #
        # Query Format params:
        # - col
        # - table_name
        # - id_col
        #
        # Value params:
        # - id_val
        SELECT = norm_psql_cmd_string("""
            SELECT {col:s}
              FROM {table_name:s}
              WHERE {id_col:s} = %(id_val)s
            ;
        """)

        # Upsert content-type/data for a uid: insert a new row, or on an ID
        # conflict overwrite the SHA1, mimetype and byte columns of the
        # existing row.
        #
        # Query Format params:
        # - table_name
        # - id_col
        # - sha1_col
        # - mime_col
        # - byte_col
        #
        # Value params:
        # - id_val
        # - sha1_val
        # - mime_val
        # - byte_val
        #
        # SQL format from:
        #   https://hashrocket.com/blog/posts/upsert-records-with-postgresql-9-5
        #
        UPSERT_DATA = norm_psql_cmd_string("""
            INSERT INTO {table_name:s} ({id_col:s}, {sha1_col:s}, {mime_col:s}, {byte_col:s})
                VALUES ( %(id_val)s, %(sha1_val)s, %(mime_val)s, %(byte_val)s )
                ON CONFLICT ({id_col:s})
                    DO UPDATE
                        SET ({sha1_col:s}, {mime_col:s}, {byte_col:s})
                          = (EXCLUDED.{sha1_col:s}, EXCLUDED.{mime_col:s}, EXCLUDED.{byte_col:s})
            ;
        """)

        # Same as ``UPSERT_DATA`` but does not set the mimetype on an update.
        # This is meant to atomically update the byte data without changing the
        # existing mimetype.
        #
        # Takes the same format and value params as ``UPSERT_DATA``; the
        # ``mime_val`` value is only used when a new row is inserted.
        UPSERT_DATA_NO_MIME = norm_psql_cmd_string("""
            INSERT INTO {table_name:s} ({id_col:s}, {sha1_col:s}, {mime_col:s}, {byte_col:s})
                VALUES ( %(id_val)s, %(sha1_val)s, %(mime_val)s, %(byte_val)s )
                ON CONFLICT ({id_col:s})
                    DO UPDATE
                        SET ({sha1_col:s}, {byte_col:s})
                          = (EXCLUDED.{sha1_col:s}, EXCLUDED.{byte_col:s})
            ;
        """)

    @classmethod
    def is_usable(cls) -> bool:
        """
        Report whether this implementation can be used in the current
        environment, logging a warning when it cannot.

        :return: False when the module-level ``psycopg2`` name is None, True
            otherwise.
        """
        usable = psycopg2 is not None
        if not usable:
            LOG.warning("Not usable. Requires the psycopg2 module.")
        return usable

    def __init__(self,
                 element_id: str,
                 content_type: Optional[str] = None,
                 table_name: str = "psql_data_elements",
                 id_col: str = "id",
                 sha1_col: str = "sha1",
                 mime_col: str = "mime",
                 byte_col: str = "bytes",
                 db_name: str = "postgres",
                 db_host: Optional[str] = "/tmp",
                 db_port: Optional[int] = 5433,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Create a new PostgreSQL-based data element.

        If the table mapped to the provided ``table_name`` already exists, we
        expect the provided columns to match the following types:
        - ``id_col`` is expected to be TEXT
        - ``sha1_col`` is expected to be TEXT
        - ``mime_col`` is expected to be TEXT
        - ``byte_col`` is expected to be BYTEA

        Default database connection parameters are assuming the use of a
        non-default, non-postgres-user cluster where the current user's name is
        equivalent to a valid role in the database.

        :param element_id: ID to reference a specific data element row in the
            table.  This is required in the same way that a path is required to
            point to a file on a filesystem.
        :type element_id: str

        :param content_type: Expected mime-type of byte data set to this
            element.  This only affects setting the mime-type field when setting
            new bytes.  ``content_type()`` will always reflect what is stored in
            the backend, or lack there-of.

            If this mime-type differs from an existing stored value,
            this mime-type will overwrite the stored value on the next call to
            ``set_bytes``.  If this is None and there is no mime-type already
            set in the database, no mime-type will be set on the next
            ``set_bytes`` call.
        :type content_type: str | None

        :param table_name: String label of the table in the database to interact
            with.
        :type table_name: str

        :param id_col: Name of the element ID column in ``table_name``.
        :type id_col: str

        :param sha1_col: Name of the SHA1 column in ``table_name``.
        :type sha1_col: str

        :param mime_col: Name of the MIMETYPE column in ``table_name``.
        :type mime_col: str

        :param byte_col: Name of the column storing byte data in ``table_name``.
        :type byte_col: str

        :param db_host: Host address of the PostgreSQL server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :type db_host: str | None

        :param db_port: Port the Postgres server is exposed on. If None, we
            assume a default port (5433).
        :type db_port: int | None

        :param db_name: The name of the database to connect to.
        :type db_name: str

        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :type db_user: str | None

        :param db_pass: Password for the user we're connecting as. This may be
            None if no password is to be used.
        :type db_pass: str | None

        :param read_only: Only allow reading of this data.  Modification actions
            will throw a ReadOnlyError exceptions.
        :type read_only: bool

        :param create_table: If this instance should try to create the storing
            table before actions are performed against it. If the configured
            user does not have sufficient permissions to create the table and it
            does not currently exist, an exception will be raised. Table
            creation is never attempted for a read-only instance.
        :type create_table: bool

        :raises ValueError: If ``element_id`` is not a string.

        """
        super(PostgresDataElement, self).__init__()

        if not isinstance(element_id, str):
            raise ValueError("Element ID should be a string type for this "
                             "implementation. Database storage is typed.")

        self._element_id = element_id
        self._content_type = content_type
        self._table_name = table_name

        self._id_col = id_col
        self._sha1_col = sha1_col
        self._mime_col = mime_col
        self._byte_col = byte_col

        self._read_only = read_only
        self._create_table = create_table

        # itersize is hard-coded because a single-element perspective should
        # only be retrieving one row at a time.
        self._psql_helper = PsqlConnectionHelper(
            db_name, db_host, db_port, db_user, db_pass, 10,
            GLOBAL_PSQL_TABLE_CREATE_RLOCK)

        # Set table creation SQL in helper, but only when this instance is
        # allowed to modify the database AND table creation was requested.
        # Previously the ``create_table`` flag was stored but never consulted.
        if not self._read_only and self._create_table:
            self._psql_helper.set_table_upsert_sql(
                self.CommandTemplates.UPSERT_TABLE.format(
                    table_name=self._table_name,
                    id_col=self._id_col,
                    sha1_col=self._sha1_col,
                    mime_col=self._mime_col,
                    byte_col=self._byte_col,
                ))

    def __repr__(self) -> str:
        return "{:s}[id=\"{:s}\"]" \
            .format(self.__class__.__name__, self._element_id)

    def get_config(self) -> Dict:
        """
        Return a JSON-compliant dictionary that could be passed to this class's
        ``from_config`` method to produce an instance with identical
        configuration.

        :return: JSON type compliant configuration dictionary.
        :rtype: dict

        """
        return {
            "element_id": self._element_id,
            "table_name": self._table_name,
            "id_col": self._id_col,
            "sha1_col": self._sha1_col,
            "mime_col": self._mime_col,
            "byte_col": self._byte_col,
            "db_name": self._psql_helper.db_name,
            "db_host": self._psql_helper.db_host,
            "db_port": self._psql_helper.db_port,
            "db_user": self._psql_helper.db_user,
            "db_pass": self._psql_helper.db_pass,
            "read_only": self._read_only,
            "create_table": self._create_table,
        }

    def content_type(self) -> Optional[str]:
        """
        Query the mimetype column of the row stored for this element's ID.

        :raises RuntimeError: More than one row was returned for this
            element's ID.

        :return: Standard type/subtype string for this data element, or None if
            the content type is unknown (no row exists for this element ID).
        :rtype: str or None
        """
        q = self.CommandTemplates.SELECT.format(
            col=self._mime_col,
            table_name=self._table_name,
            id_col=self._id_col,
        )
        v = dict(id_val=self._element_id)

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            """
            :type cursor: psycopg2.extensions.cursor
            """
            cursor.execute(q, v)

        r = list(self._psql_helper.single_execute(cb, yield_result_rows=True))
        if not r:
            return None
        elif len(r) > 1:
            # Trailing space matters here: the two literals are concatenated.
            raise RuntimeError("Somehow found multiple entries for the same "
                               "element ID (there should only be one).")
        return r[0][0]

    def is_empty(self) -> bool:
        """
        Check if this element contains no bytes.

        The intent of this method is to quickly check if there is any data
        behind this element, ideally without having to read all/any of the
        underlying data. Only the ``octet_length`` of the byte column is
        selected, not the bytes themselves.

        :return: If this element contains 0 bytes. This is True when there is
            no row for this element ID, when the stored byte value is NULL,
            or when its length is zero.
        :rtype: bool

        :raises RuntimeError: More than one row was returned for this
            element's ID.

        """
        q = self.CommandTemplates.SELECT.format(
            col="octet_length(%s)" % self._byte_col,
            table_name=self._table_name,
            id_col=self._id_col,
        )
        v = dict(id_val=self._element_id)

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            """
            :type cursor: psycopg2.extensions.cursor
            """
            cursor.execute(q, v)

        r = list(self._psql_helper.single_execute(cb, yield_result_rows=True))
        if not r:
            # No rows returned, meaning no entry for our element ID and no
            # bytes stored.
            return True
        if len(r) > 1:
            raise RuntimeError("Somehow found multiple entries for the same "
                               "element ID (there should only be one).")

        num_bytes = r[0][0]
        if num_bytes is None:
            # A NULL byte value yields a NULL length; treat it as "no bytes"
            # instead of failing in ``int(None)``.
            return True
        return int(num_bytes) == 0

    def sha1(self) -> str:
        """
        Fetch the value stored in the SHA1 column for this element's ID.

        :return: SHA1 hex checksum of the data content. When no row exists for
            this element ID, the class's ``EMPTY_SHA`` value is returned.
        :rtype: str
        """
        query = self.CommandTemplates.SELECT.format(
            table_name=self._table_name,
            id_col=self._id_col,
            col=self._sha1_col,
        )
        params = {"id_val": self._element_id}

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            """
            :type cursor: psycopg2.extensions.cursor
            """
            cursor.execute(query, params)

        rows = list(
            self._psql_helper.single_execute(cb, yield_result_rows=True)
        )
        if rows:
            return rows[0][0]
        # No row for this element ID means no bytes are stored.
        return self.EMPTY_SHA

    def get_bytes(self) -> bytes:
        """
        Fetch the content of the byte column for this element's ID.

        :return: Get the bytes for this data element. An empty ``bytes``
            object is returned when there is no row for this element ID or
            when the stored value is NULL or empty.
        :rtype: bytes
        """
        q = self.CommandTemplates.SELECT.format(
            col=self._byte_col,
            table_name=self._table_name,
            id_col=self._id_col,
        )
        v = dict(id_val=self._element_id)

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            """
            :type cursor: psycopg2.extensions.cursor
            """
            cursor.execute(q, v)

        r = list(self._psql_helper.single_execute(cb, yield_result_rows=True))
        if not r:
            # No returned rows for element ID.
            return bytes()
        stored = r[0][0]
        if stored is None:
            # NULL byte value: nothing stored. Guarding here avoids calling
            # ``len``/``bytes`` on None.
            return bytes()
        # Copy the returned buffer into an immutable ``bytes`` object; an
        # empty buffer naturally yields ``b""``.
        return bytes(stored)

    def writable(self) -> bool:
        """
        :return: Whether this instance supports setting bytes, i.e. it was not
            flagged as read-only.
        :rtype: bool
        """
        if self._read_only:
            return False
        return True

    def set_bytes(self, b: bytes) -> None:
        """
        Store the given bytes, together with their SHA1 checksum, under this
        element's ID.

        The parent class ``set_bytes`` is invoked first; per the parent's
        contract it checks mutability based on the ``writable()`` method
        return.

        When a content type was given at construction, the ``UPSERT_DATA``
        command template is used and that content type is sent as the mime
        value. Otherwise the ``UPSERT_DATA_NO_MIME`` template is used with an
        empty-string mime value.

        :param b: bytes to set.
        :type b: bytes

        :raises ReadOnlyError: This data element can only be read from / does
            not support writing.

        """
        super(PostgresDataElement, self).set_bytes(b)

        checksum = hashlib.sha1(b).hexdigest()

        # TODO: Fallback to ``content_type()`` return if none provided in self.
        mimetype = self._content_type
        if mimetype:
            # Content/mime type override specified at element construction.
            template = self.CommandTemplates.UPSERT_DATA
        else:
            # Leave the mimetype alone or set an empty mimetype (none specified
            # at construction).
            mimetype = ""
            template = self.CommandTemplates.UPSERT_DATA_NO_MIME

        query = template.format(
            table_name=self._table_name,
            id_col=self._id_col,
            sha1_col=self._sha1_col,
            mime_col=self._mime_col,
            byte_col=self._byte_col,
        )
        params = {
            "id_val": self._element_id,
            "sha1_val": checksum,
            "mime_val": mimetype,
            "byte_val": psycopg2.Binary(b),
        }

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            """
            :type cursor: psycopg2.extensions.cursor
            """
            # TODO: Could be smart here and only update if content-type/byte
            #       data differs while keeping a row-lock between queries.
            cursor.execute(query, params)

        # Exhaust the returned iterable so the command is actually executed.
        for _ in self._psql_helper.single_execute(cb):
            pass
Exemple #7
0
    def __init__(self,
                 element_id: str,
                 content_type: Optional[str] = None,
                 table_name: str = "psql_data_elements",
                 id_col: str = "id",
                 sha1_col: str = "sha1",
                 mime_col: str = "mime",
                 byte_col: str = "bytes",
                 db_name: str = "postgres",
                 db_host: Optional[str] = "/tmp",
                 db_port: Optional[int] = 5433,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Create a new PostgreSQL-based data element.

        If the table mapped to the provided ``table_name`` already exists, we
        expect the provided columns to match the following types:
        - ``id_col`` is expected to be TEXT
        - ``sha1_col`` is expected to be TEXT
        - ``mime_col`` is expected to be TEXT
        - ``byte_col`` is expected to be BYTEA

        Default database connection parameters are assuming the use of a
        non-default, non-postgres-user cluster where the current user's name is
        equivalent to a valid role in the database.

        :param element_id: ID to reference a specific data element row in the
            table.  This is required in the same way that a path is required to
            point to a file on a filesystem.
        :type element_id: str

        :param content_type: Expected mime-type of byte data set to this
            element.  This only affects setting the mime-type field when setting
            new bytes.  ``content_type()`` will always reflect what is stored in
            the backend, or lack there-of.

            If this mime-type differs from an existing stored value,
            this mime-type will overwrite the stored value on the next call to
            ``set_bytes``.  If this is None and there is no mime-type already
            set in the database, no mime-type will be set on the next
            ``set_bytes`` call.
        :type content_type: str | None

        :param table_name: String label of the table in the database to interact
            with.
        :type table_name: str

        :param id_col: Name of the element ID column in ``table_name``.
        :type id_col: str

        :param sha1_col: Name of the SHA1 column in ``table_name``.
        :type sha1_col: str

        :param mime_col: Name of the MIMETYPE column in ``table_name``.
        :type mime_col: str

        :param byte_col: Name of the column storing byte data in ``table_name``.
        :type byte_col: str

        :param db_name: The name of the database to connect to.
        :type db_name: str

        :param db_host: Host address of the PostgreSQL server. If None, we
            assume the server is on the local machine and use the UNIX socket.
            This might be a required field on Windows machines (not tested yet).
        :type db_host: str | None

        :param db_port: Port the Postgres server is exposed on (5433 unless
            overridden). The value, including None, is handed to the
            connection helper as-is.
        :type db_port: int | None

        :param db_user: Postgres user to connect as. If None, postgres
            defaults to using the current accessing user account name on the
            operating system.
        :type db_user: str | None

        :param db_pass: Password for the user we're connecting as. This may be
            None if no password is to be used.
        :type db_pass: str | None

        :param read_only: Only allow reading of this data.  Modification actions
            will throw a ReadOnlyError exceptions.
        :type read_only: bool

        :param create_table: If this instance should try to create the storing
            table before actions are performed against it. If the configured
            user does not have sufficient permissions to create the table and it
            does not currently exist, an exception will be raised. Table
            creation is never attempted for ``read_only`` instances.
        :type create_table: bool

        :raises ValueError: ``element_id`` is not a string.

        """
        super(PostgresDataElement, self).__init__()

        if not isinstance(element_id, str):
            raise ValueError("Element ID should be a string type for this "
                             "implementation. Database storage is typed.")

        self._element_id = element_id
        self._content_type = content_type
        self._table_name = table_name

        self._id_col = id_col
        self._sha1_col = sha1_col
        self._mime_col = mime_col
        self._byte_col = byte_col

        self._read_only = read_only
        self._create_table = create_table

        # itersize is hard-coded because a single-element perspective should
        # only be retrieving one row at a time.
        self._psql_helper = PsqlConnectionHelper(
            db_name, db_host, db_port, db_user, db_pass, 10,
            GLOBAL_PSQL_TABLE_CREATE_RLOCK)

        # Register the table creation SQL with the helper only when this
        # instance may write AND table creation was requested. Previously
        # ``create_table`` was stored but never consulted here.
        if self._create_table and not self._read_only:
            self._psql_helper.set_table_upsert_sql(
                self.CommandTemplates.UPSERT_TABLE.format(
                    table_name=self._table_name,
                    id_col=self._id_col,
                    sha1_col=self._sha1_col,
                    mime_col=self._mime_col,
                    byte_col=self._byte_col,
                ))
Exemple #8
0
 def setUp(self) -> None:
     """
     Build a default-constructed ``PsqlConnectionHelper`` and store it on
     ``self.conn_helper``.
     """
     helper = PsqlConnectionHelper()
     self.conn_helper = helper
Exemple #9
0
    def __init__(self,
                 table_name: str = "psql_data_elements",
                 id_col: str = "id",
                 sha1_col: str = "sha1",
                 mime_col: str = "mime",
                 byte_col: str = "bytes",
                 db_name: str = "postgres",
                 db_host: Optional[str] = "/tmp",
                 db_port: Optional[int] = 5432,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 itersize: int = 1000,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Create a PostgreSQL-based data set instance.

        If the table mapped to the provided ``table_name`` already exists, we
        expect the provided columns to match the following types:
        - ``id_col`` is expected to be TEXT
        - ``sha1_col`` is expected to be TEXT
        - ``mime_col`` is expected to be TEXT
        - ``byte_col`` is expected to be BYTEA

        Default database connection parameters are assuming the use of a
        non-default, non-postgres-user cluster where the current user's name is
        equivalent to a valid role in the database.

        :param str table_name:
            String label of the table in the database to interact with.
        :param str id_col:
            Name of the element ID column in ``table_name``.
        :param str sha1_col:
            Name of the SHA1 column in ``table_name``.
        :param str mime_col:
            Name of the MIMETYPE column in ``table_name``.
        :param str byte_col:
            Name of the column storing byte data in ``table_name``.
        :param str db_name:
            The name of the database to connect to.
        :param str|None db_host:
            Host address of the PostgreSQL server. If None, we assume the
            server is on the local machine and use the UNIX socket. This might
            be a required field on Windows machines (not tested yet).
        :param int|None db_port:
            Port the Postgres server is exposed on (5432 unless overridden).
            The value, including None, is handed to the connection helper
            as-is.
        :param str|None db_user:
            Postgres user to connect as. If None, postgres defaults to using
            the current accessing user account name on the operating system.
        :param str|None db_pass:
            Password for the user we're connecting as. This may be None if no
            password is to be used.
        :param int itersize:
            Number of records fetched per network round trip when iterating
            over a named cursor. Coerced with ``int()``.
        :param bool read_only:
            Only allow reading of this data.  Modification actions will throw a
            ReadOnlyError exceptions.
        :param bool create_table:
            If this instance should try to create the storing table before
            actions are performed against it. If the configured user does not
            have sufficient permissions to create the table and it does not
            currently exist, an exception will be raised. Table creation is
            never attempted for ``read_only`` instances.

        :raises ValueError: ``itersize`` is not greater than 0.

        """
        super(PostgresNativeDataSet, self).__init__()

        itersize = int(itersize)
        if itersize <= 0:
            raise ValueError("Itersize must be greater than 0.")

        self._table_name = table_name
        self._id_col = id_col
        self._sha1_col = sha1_col
        self._mime_col = mime_col
        self._byte_col = byte_col
        self._read_only = read_only
        self._create_table = create_table

        self._psql_helper = PsqlConnectionHelper(db_name, db_host, db_port,
                                                 db_user, db_pass, itersize)

        # Register the table creation SQL with the helper only when this
        # instance may write AND table creation was requested. Previously
        # ``create_table`` was stored but never consulted here.
        if self._create_table and not self._read_only:
            self._psql_helper.set_table_upsert_sql(
                PostgresDataElement.CommandTemplates.UPSERT_TABLE.format(
                    table_name=self._table_name,
                    id_col=self._id_col,
                    sha1_col=self._sha1_col,
                    mime_col=self._mime_col,
                    byte_col=self._byte_col,
                ))
Exemple #10
0
class PostgresNativeDataSet(DataSet):
    """
    Dataset that stores data elements natively in a PostgreSQL database.

    Elements stored in this data set implementation will be copied as
    PostgresDataElements, which are stored based on this data set's
    configuration.

    Data elements retrieved from this data set will be of the
    PostgresDataElement class type.

    Data elements are stored using the string conversion of their UUID in the
    database. Currently that is OK since data element UUID is a checksum which
    is returned in a standard way as a string. If this changes in the future
    then this implementation will either be limited in what it may take in to
    store or will require revision to handle such a later standard.
    """
    @classmethod
    def is_usable(cls) -> bool:
        """
        Report whether this implementation can be used.

        :return: False, after emitting a warning, when the module-level
            ``psycopg2`` reference is None; True otherwise.
        :rtype: bool
        """
        if psycopg2 is not None:
            return True
        warnings.warn("PostgresNativeDataSet not usable due to psycopg2 "
                      "package not being importable.")
        return False

    def __init__(self,
                 table_name: str = "psql_data_elements",
                 id_col: str = "id",
                 sha1_col: str = "sha1",
                 mime_col: str = "mime",
                 byte_col: str = "bytes",
                 db_name: str = "postgres",
                 db_host: Optional[str] = "/tmp",
                 db_port: Optional[int] = 5432,
                 db_user: Optional[str] = None,
                 db_pass: Optional[str] = None,
                 itersize: int = 1000,
                 read_only: bool = False,
                 create_table: bool = True):
        """
        Create a PostgreSQL-based data set instance.

        If the table mapped to the provided ``table_name`` already exists, we
        expect the provided columns to match the following types:
        - ``id_col`` is expected to be TEXT
        - ``sha1_col`` is expected to be TEXT
        - ``mime_col`` is expected to be TEXT
        - ``byte_col`` is expected to be BYTEA

        Default database connection parameters are assuming the use of a
        non-default, non-postgres-user cluster where the current user's name is
        equivalent to a valid role in the database.

        :param str table_name:
            String label of the table in the database to interact with.
        :param str id_col:
            Name of the element ID column in ``table_name``.
        :param str sha1_col:
            Name of the SHA1 column in ``table_name``.
        :param str mime_col:
            Name of the MIMETYPE column in ``table_name``.
        :param str byte_col:
            Name of the column storing byte data in ``table_name``.
        :param str db_name:
            The name of the database to connect to.
        :param str|None db_host:
            Host address of the PostgreSQL server. If None, we assume the
            server is on the local machine and use the UNIX socket. This might
            be a required field on Windows machines (not tested yet).
        :param int|None db_port:
            Port the Postgres server is exposed on (5432 unless overridden).
            The value, including None, is handed to the connection helper
            as-is.
        :param str|None db_user:
            Postgres user to connect as. If None, postgres defaults to using
            the current accessing user account name on the operating system.
        :param str|None db_pass:
            Password for the user we're connecting as. This may be None if no
            password is to be used.
        :param int itersize:
            Number of records fetched per network round trip when iterating
            over a named cursor. Coerced with ``int()``.
        :param bool read_only:
            Only allow reading of this data.  Modification actions will throw a
            ReadOnlyError exceptions.
        :param bool create_table:
            If this instance should try to create the storing table before
            actions are performed against it. If the configured user does not
            have sufficient permissions to create the table and it does not
            currently exist, an exception will be raised. Table creation is
            never attempted for ``read_only`` instances.

        :raises ValueError: ``itersize`` is not greater than 0.

        """
        super(PostgresNativeDataSet, self).__init__()

        itersize = int(itersize)
        if itersize <= 0:
            raise ValueError("Itersize must be greater than 0.")

        self._table_name = table_name
        self._id_col = id_col
        self._sha1_col = sha1_col
        self._mime_col = mime_col
        self._byte_col = byte_col
        self._read_only = read_only
        self._create_table = create_table

        self._psql_helper = PsqlConnectionHelper(db_name, db_host, db_port,
                                                 db_user, db_pass, itersize)

        # Register the table creation SQL with the helper only when this
        # instance may write AND table creation was requested. Previously
        # ``create_table`` was stored but never consulted here.
        if self._create_table and not self._read_only:
            self._psql_helper.set_table_upsert_sql(
                PostgresDataElement.CommandTemplates.UPSERT_TABLE.format(
                    table_name=self._table_name,
                    id_col=self._id_col,
                    sha1_col=self._sha1_col,
                    mime_col=self._mime_col,
                    byte_col=self._byte_col,
                ))

    def get_config(self) -> Dict:
        """
        Describe this data set's configuration as a dictionary keyed by the
        constructor's parameter names.

        Table and column settings are read from this instance; the database
        connection settings and ``itersize`` are read back from the PSQL
        connection helper.

        :return: Configuration dictionary.
        :rtype: dict
        """
        helper = self._psql_helper
        return dict(
            table_name=self._table_name,
            id_col=self._id_col,
            sha1_col=self._sha1_col,
            mime_col=self._mime_col,
            byte_col=self._byte_col,
            db_name=helper.db_name,
            db_host=helper.db_host,
            db_port=helper.db_port,
            db_user=helper.db_user,
            db_pass=helper.db_pass,
            itersize=helper.itersize,
            read_only=self._read_only,
            create_table=self._create_table,
        )

    def _gen_psql_element(
            self,
            uid: str,
            content_type: Optional[str] = None) -> PostgresDataElement:
        """
        Internal method to build a ``PostgresDataElement`` that uses this data
        set's table/column settings and shares its PSQL connection helper.

        :param str uid: UUID of data element.
        :param None|str content_type: Content type / MIME type of the element.
        :returns: Generated data element instance.
        """
        element_kwargs = dict(
            content_type=content_type,
            table_name=self._table_name,
            id_col=self._id_col,
            sha1_col=self._sha1_col,
            mime_col=self._mime_col,
            byte_col=self._byte_col,
            read_only=self._read_only,
            create_table=self._create_table,
        )
        element = PostgresDataElement(uid, **element_kwargs)
        # Swap the element's own helper for ours so that both share one PSQL
        # helper instance.
        element._psql_helper = self._psql_helper
        return element

    def __iter__(self) -> Iterator[DataElement]:
        """
        Select the ID and mime-type columns of every row in the table and
        yield one PSQL data element per returned row.

        :return: Generator over the DataElements contained in this set in no
            particular order.
        """
        query = "SELECT {id_col:s}, {mime_col:s} FROM {table_name:s};".format(
            table_name=self._table_name,
            id_col=self._id_col,
            mime_col=self._mime_col,
        )

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(query)

        rows = self._psql_helper.single_execute(cb, yield_result_rows=True)
        for elem_uuid, elem_mimetype in rows:
            yield self._gen_psql_element(elem_uuid, elem_mimetype)

    def count(self) -> int:
        """
        Count the rows of the table through a ``count()`` over the ID column.

        :return: The number of data elements in this set.
        :rtype: int
        """
        # Query count of rows in table (select on id col only)
        count_query = "SELECT count({id_col:s}) FROM {table_name:s};".format(
            id_col=self._id_col,
            table_name=self._table_name,
        )

        # Annotated with the public ``psycopg2.extensions.cursor`` name, as in
        # the rest of this file, rather than the private ``_psycopg`` module.
        def cb(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(count_query)

        r = list(self._psql_helper.single_execute(cb, yield_result_rows=True))
        if not r:
            # No rows were returned for the count query.
            return 0
        return int(r[0][0])

    def uuids(self) -> Set:
        """
        Collect the ID column value of every row in the table.

        :return: A new set of uuids represented in this data set.
        :rtype: set
        """
        # TODO: UPDATE TO ITERATOR INSTEAD OF SET RETURN TYPE
        q = "SELECT {id_col:s} FROM {table_name:s};".format(
            id_col=self._id_col,
            table_name=self._table_name,
        )

        # Annotated with the public ``psycopg2.extensions.cursor`` name, as in
        # the rest of this file, rather than the private ``_psycopg`` module.
        def cb(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(q)

        return {
            r[0]
            for r in self._psql_helper.single_execute(cb,
                                                      yield_result_rows=True)
        }

    def has_uuid(self, uuid: Hashable) -> bool:
        """
        Test if the given uuid refers to an element in this data set.

        The uuid is converted with ``str()`` and passed as a bound query
        parameter, not formatted into the SQL text.

        :param uuid: Unique ID to test for inclusion. This should match the
            type that the set implementation expects or cares about.
        :type uuid: collections.abc.Hashable

        :return: True if the given uuid matches an element in this set, or
            False if it does not.
        :rtype: bool

        """
        # Query for table id col values
        q = "SELECT {id_col:s} FROM {table_name:s} "\
            "WHERE {id_col:s} = %(id_val)s;"\
            .format(id_col=self._id_col,
                    table_name=self._table_name)

        # Annotated with the public ``psycopg2.extensions.cursor`` name, as in
        # the rest of this file, rather than the private ``_psycopg`` module.
        def cb(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(q, {'id_val': str(uuid)})

        # Any returned row means the ID is present.
        return bool(
            list(self._psql_helper.single_execute(cb, yield_result_rows=True)))

    def add_data(self, *elems: DataElement) -> None:
        """
        Add the given data element(s) instance to this data set.

        For each input element a PSQL element is generated from the string
        form of its UUID and its content type, and the input's bytes are then
        set on that PSQL element.

        *NOTE: Implementing methods should check that input elements are in
        fact DataElement instances.*

        :param elems: Data element(s) to add
        :type elems: smqtk.representation.DataElement

        """
        # TODO: Optimize for batch insertion using custom query.
        for source in elems:
            # UUID return from data element is currently a checksum as defined
            # by the interface as a string.
            uid = str(source.uuid())
            mimetype = source.content_type()
            target = self._gen_psql_element(uid, mimetype)
            payload = source.get_bytes()
            target.set_bytes(payload)

    def get_data(self, uuid: Hashable) -> DataElement:
        """
        Get the data element the given uuid references, or raise an
        exception if the uuid does not reference any element in this set.

        The stored mime-type of the matching row is used as the content type
        of the returned element. A NULL mime-type is passed on as None rather
        than as the string ``"None"``.

        :raises KeyError: If the given uuid does not refer to an element in
            this data set.

        :param uuid: The uuid of the element to retrieve.
        :type uuid: collections.abc.Hashable

        :return: The data element instance for the given uuid.
        :rtype: smqtk.representation.DataElement

        """
        # Query for content type recorded in our table to use for PSQL element
        # construction.
        q = "SELECT {ct_col:s} FROM {table_name:s} " \
            "WHERE {id_col:s} = %(id_val)s;" \
            .format(ct_col=self._mime_col,
                    table_name=self._table_name,
                    id_col=self._id_col)

        def cb(cursor: psycopg2.extensions.cursor) -> None:
            cursor.execute(q, {'id_val': str(uuid)})

        r = list(self._psql_helper.single_execute(cb, yield_result_rows=True))
        if not r:
            # No rows matching the input uuid were found.
            raise KeyError(uuid)

        # Keep a missing (None) mime-type as None: ``str(None)`` would produce
        # the truthy string "None", which the element would then write back as
        # its mime-type on the next ``set_bytes`` call.
        ct = r[0][0]
        if ct is not None:
            ct = str(ct)

        # Create and return the PSQL element.
        return self._gen_psql_element(str(uuid), content_type=ct)