def _get_psql_helper(self): """ Internal method to create on demand the PSQL connection helper class. :return: PsqlConnectionHelper utility. :rtype: PsqlConnectionHelper """ # `hasattr` check used for backwards compatibility when interacting with # databases containing elements serialized before the inclusion of this # helper class. if self._psql_helper is None: # Only using a transport iteration size of 1 since this element is # only meant to refer to a single entry in the associated table. self._psql_helper = PsqlConnectionHelper( self.db_name, self.db_host, self.db_port, self.db_user, self.db_pass, itersize=1, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK) # Register table upsert command if self.create_table: self._psql_helper.set_table_upsert_sql( self.UPSERT_TABLE_TMPL.format( table_name=self.table_name, type_col=self.type_col, uuid_col=self.uuid_col, binary_col=self.binary_col, )) return self._psql_helper
class TestPsqlConnectionHelper(unittest.TestCase): def setUp(self): self.conn_helper = PsqlConnectionHelper() def test_batch_execute_on_empty_iterable(self): # noinspection PyUnusedLocal def exec_hook(cur, batch): raise Exception('This line shouldn\'t be reached with an empty ' 'iterable.') list(self.conn_helper.batch_execute(iter(()), exec_hook, 1))
def __init__(self, table_name="data_set", key_col='key', value_col='value', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize a PostgreSQL-backed data set instance. :param table_name: Name of the table to use. :type table_name: str :param key_col: Name of the column containing the UUID signatures. :type key_col: str :param value_col: Name of the table column that will contain serialized elements. :type value_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this index. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresKeyValueStore, self).__init__() self._table_name = table_name self._key_col = key_col self._value_col = value_col self._batch_size = batch_size self._pickle_protocol = pickle_protocol self._read_only = bool(read_only) self._create_table = create_table # Checking parameters where necessary if self._batch_size is not None: self._batch_size = int(self._batch_size) assert self._batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self._batch_size assert -1 <= self._pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range [-1, 2]. " "Given: %s." % self._pickle_protocol) # helper structure for SQL operations. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, itersize=batch_size, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK, ) # Only set table upsert if not read-only. if not self._read_only and self._create_table: # NOT read-only, so allow table upsert. self._psql_helper.set_table_upsert_sql( self.SqlTemplates.UPSERT_TABLE_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col))
class PostgresKeyValueStore(KeyValueStore): """ PostgreSQL-backed key-value storage. """ class SqlTemplates(object): """ Container for static PostgreSQL queries used by the containing class. """ UPSERT_TABLE_TMPL = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {key_col:s} BYTEA NOT NULL, {value_col:s} BYTEA NOT NULL, PRIMARY KEY ({key_col:s}) ); """) SELECT_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s}; """) SELECT_LIKE_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s} WHERE {key_col:s} LIKE %(key_like)s """) SELECT_MANY_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s} WHERE {key_col:s} IN %(key_tuple)s """) UPSERT_TMPL = norm_psql_cmd_string(""" WITH upsert AS ( UPDATE {table_name:s} SET {value_col:s} = %(val)s WHERE {key_col:s} = %(key)s RETURNING * ) INSERT INTO {table_name:s} ({key_col:s}, {value_col:s}) SELECT %(key)s, %(val)s WHERE NOT EXISTS (SELECT * FROM upsert) """) DELETE_LIKE_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {key_col:s} LIKE %(key_like)s """) DELETE_ALL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} """) @classmethod def is_usable(cls): return psycopg2 is not None def __init__(self, table_name="data_set", key_col='key', value_col='value', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize a PostgreSQL-backed data set instance. :param table_name: Name of the table to use. :type table_name: str :param key_col: Name of the column containing the UUID signatures. :type key_col: str :param value_col: Name of the table column that will contain serialized elements. :type value_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this index. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresKeyValueStore, self).__init__() self._table_name = table_name self._key_col = key_col self._value_col = value_col self._batch_size = batch_size self._pickle_protocol = pickle_protocol self._read_only = bool(read_only) self._create_table = create_table # Checking parameters where necessary if self._batch_size is not None: self._batch_size = int(self._batch_size) assert self._batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self._batch_size assert -1 <= self._pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range [-1, 2]. " "Given: %s." % self._pickle_protocol) # helper structure for SQL operations. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, itersize=batch_size, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK, ) # Only set table upsert if not read-only. if not self._read_only and self._create_table: # NOT read-only, so allow table upsert. self._psql_helper.set_table_upsert_sql( self.SqlTemplates.UPSERT_TABLE_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col)) @staticmethod def _py_to_bin(k): """ Convert a python hashable value into psycopg2.Binary via pickle. :param k: Python object instance to be converted into a ``psycopg2.Binary`` instance via ``pickle`` serialization. :type k: object :return: ``psycopg2.Binary`` buffer instance to use for insertion into or query against a table. :rtype: psycopg2.Binary """ return psycopg2.Binary(pickle.dumps(k)) @staticmethod def _bin_to_py(b): """ Un-"translate" psycopg2.Binary value (buffer) to a python type. :param b: ``psycopg2.Binary`` buffer instance as retrieved from a PostgreSQL query. :type b: psycopg2.Binary :return: Python object instance as loaded via pickle from the given ``psycopg2.Binary`` buffer. :rtype: object """ return pickle.loads(bytes(b)) def get_config(self): """ Return a JSON-compliant dictionary that could be passed to this class's ``from_config`` method to produce an instance with identical configuration. In the common case, this involves naming the keys of the dictionary based on the initialization argument names as if it were to be passed to the constructor via dictionary expansion. :return: JSON type compliant configuration dictionary. :rtype: dict """ return { "table_name": self._table_name, "key_col": self._key_col, "value_col": self._value_col, "db_name": self._psql_helper.db_name, "db_host": self._psql_helper.db_host, "db_port": self._psql_helper.db_port, "db_user": self._psql_helper.db_user, "db_pass": self._psql_helper.db_pass, "batch_size": self._batch_size, "pickle_protocol": self._pickle_protocol, "read_only": self._read_only, "create_table": self._create_table, } def __repr__(self): """ Return representative string for this class. :return: Representative string for this class. :rtype: str """ return super(PostgresKeyValueStore, self).__repr__() \ % ("table_name: %s, key_col: %s, value_col: %s, " "db_name: %s, db_host: %s, db_port: %s, db_user: %s, " "db_pass: %s, batch_size: %d, pickle_protocol: %d, " "read_only: %s, create_table: %s" % (self._table_name, self._key_col, self._value_col, self._psql_helper.db_name, self._psql_helper.db_host, self._psql_helper.db_port, self._psql_helper.db_user, self._psql_helper.db_pass, self._batch_size, self._pickle_protocol, self._read_only, self._create_table)) def count(self): """ :return: The number of key-value relationships in this store. :rtype: int | long """ def cb(cur): cur.execute( self.SqlTemplates.SELECT_TMPL.format( query='count(%s)' % self._key_col, table_name=self._table_name, )) return list( self._psql_helper.single_execute(cb, yield_result_rows=True))[0][0] def keys(self): """ :return: Iterator over keys in this store. :rtype: collections.Iterator[collections.Hashable] """ def cb(cur): cur.execute( self.SqlTemplates.SELECT_TMPL.format( query=self._key_col, table_name=self._table_name, )) # We can use a named cursor because this is a select statement as well # as server table size may be large. for r in self._psql_helper.single_execute(cb, yield_result_rows=True, named=True): # Convert from buffer -> string -> python yield self._bin_to_py(r[0]) def values(self): """ :return: Iterator over values in this store. Values are not guaranteed to be in any particular order. :rtype: collections.Iterator[object] """ def cb(cur): cur.execute( self.SqlTemplates.SELECT_TMPL.format( query=self._value_col, table_name=self._table_name, )) for r in self._psql_helper.single_execute(cb, yield_result_rows=True, named=True): # Convert from buffer -> string -> python yield self._bin_to_py(r[0]) def is_read_only(self): """ :return: True if this instance is read-only and False if it is not. :rtype: bool """ return self._read_only def has(self, key): """ Check if this store has a value for the given key. :param key: Key to check for a value for. :type key: collections.Hashable :return: If this store has a value for the given key. :rtype: bool """ super(PostgresKeyValueStore, self).has(key) # Try to select based on given key value. If any rows are returned, # there is clearly a key that matches. q = self.SqlTemplates.SELECT_LIKE_TMPL.format( query='true', table_name=self._table_name, key_col=self._key_col, ) def cb(cur): cur.execute(q, {'key_like': self._py_to_bin(key)}) return bool( list(self._psql_helper.single_execute(cb, yield_result_rows=True))) def add(self, key, value): """ Add a key-value pair to this store. :param key: Key for the value. Must be hashable. :type key: collections.Hashable :param value: Python object to store. :type value: object :raises ReadOnlyError: If this instance is marked as read-only. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).add(key, value) q = self.SqlTemplates.UPSERT_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col, ) v = { 'key': self._py_to_bin(key), 'val': self._py_to_bin(value), } def cb(cur): cur.execute(q, v) list(self._psql_helper.single_execute(cb)) return self def add_many(self, d): """ Add multiple key-value pairs at a time into this store as represented in the provided dictionary `d`. :param d: Dictionary of key-value pairs to add to this store. :type d: dict[collections.Hashable, object] :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).add_many(d) q = self.SqlTemplates.UPSERT_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col, ) # Iterator over transformed inputs into values for statement. def val_iter(): for key, val in six.iteritems(d): yield { 'key': self._py_to_bin(key), 'val': self._py_to_bin(val) } def cb(cur, v_batch): psycopg2.extras.execute_batch(cur, q, v_batch, page_size=self._batch_size) list(self._psql_helper.batch_execute(val_iter(), cb, self._batch_size)) return self def remove(self, key): """ Remove a single key-value entry. :param key: Key to remove. :type key: collections.Hashable :raises ReadOnlyError: If this instance is marked as read-only. :raises KeyError: The given key is not present in this store and no default value given. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).remove(key) if key not in self: raise KeyError(key) q = self.SqlTemplates.DELETE_LIKE_TMPL.format( table_name=self._table_name, key_col=self._key_col, ) v = dict(key_like=self._py_to_bin(key)) def cb(cursor): cursor.execute(q, v) list(self._psql_helper.single_execute(cb)) return self def _check_contained_keys(self, keys): """ Check if the table contains the following keys. :param set keys: Keys to check for. :return: An set of keys NOT present in the table. :rtype: set[collections.Hashable] """ def key_like_iter(): for k_ in keys: yield self._py_to_bin(k_) has_many_q = self.SqlTemplates.SELECT_MANY_TMPL.format( query=self._key_col, table_name=self._table_name, key_col=self._key_col, ) # Keys found in table matched_keys = set() def cb(cursor, batch): cursor.execute(has_many_q, {'key_tuple': tuple(batch)}) matched_keys.update(self._bin_to_py(r[0]) for r in cursor) list( self._psql_helper.batch_execute(key_like_iter(), cb, self._batch_size)) return keys - matched_keys def remove_many(self, keys): """ Remove multiple keys and associated values. :param keys: Iterable of keys to remove. If this is empty this method does nothing. :type keys: collections.Iterable[collections.Hashable] :raises ReadOnlyError: If this instance is marked as read-only. :raises KeyError: The given key is not present in this store and no default value given. The store is not modified if any key is invalid. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).remove_many(keys) keys = set(keys) # Check that all keys requested for removal are contained in our table # before attempting to remove any of them. key_diff = self._check_contained_keys(keys) # If we're trying to remove a key not in our table, appropriately raise # a KeyError. if key_diff: if len(key_diff) == 1: raise KeyError(list(key_diff)[0]) else: raise KeyError(key_diff) # Proceed with removal def key_like_iter(): """ Iterator over query value sets. """ for k_ in keys: yield self._py_to_bin(k_) del_q = self.SqlTemplates.DELETE_LIKE_TMPL.format( table_name=self._table_name, key_col=self._key_col, ) def del_cb(cursor, v_batch): # Execute the query with a list of value dicts. psycopg2.extras.execute_batch(cursor, del_q, [{ 'key_like': k } for k in v_batch], page_size=self._batch_size) list( self._psql_helper.batch_execute(key_like_iter(), del_cb, self._batch_size)) return self def get(self, key, default=NO_DEFAULT_VALUE): """ Get the value for the given key. *NOTE:* **Implementing sub-classes are responsible for raising a ``KeyError`` where appropriate.** :param key: Key to get the value of. :type key: collections.Hashable :param default: Optional default value if the given key is not present in this store. This may be any value except for the ``NO_DEFAULT_VALUE`` constant (custom anonymous class instance). :type default: object :raises KeyError: The given key is not present in this store and no default value given. :return: Deserialized python object stored for the given key. :rtype: object """ q = self.SqlTemplates.SELECT_LIKE_TMPL.format( query=self._value_col, table_name=self._table_name, key_col=self._key_col, ) v = {'key_like': self._py_to_bin(key)} def cb(cur): cur.execute(q, v) rows = list( self._psql_helper.single_execute(cb, yield_result_rows=True)) # If no rows and no default, raise KeyError. if len(rows) == 0: if default is NO_DEFAULT_VALUE: raise KeyError(key) else: return default return self._bin_to_py(rows[0][0]) def get_many(self, keys, default=NO_DEFAULT_VALUE): """ Get the values for the given keys. *NOTE:* **Implementing sub-classes are responsible for raising a ``KeyError`` where appropriate.** :param keys: The keys for which associated values are requested. :type keys: collections.Iterable[collections.Hashable] :param default: Optional default value if a given key is not present in this store. This may be any value except for the ``NO_DEFAULT_VALUE`` constant (custom anonymous class instance). :type default: object :raises KeyError: A given key is not present in this store and no default value given. :return: Iterable of deserialized python objects stored for the given keys in the order that the corresponding keys were provided. :rtype: collections.Iterable """ sql_command_string = self.SqlTemplates.SELECT_MANY_TMPL.format( query=', '.join((self._key_col, self._value_col)), table_name=self._table_name, key_col=self._key_col) keys = list(keys) sql_keys = tuple(self._py_to_bin(key_) for key_ in keys) sql_variables = {'key_tuple': sql_keys} def postgres_callback(cursor): cursor.execute(sql_command_string, sql_variables) retrieved_dict = { self._bin_to_py(row_[0]): self._bin_to_py(row_[1]) for row_ in self._psql_helper.single_execute( postgres_callback, yield_result_rows=True) } if default is NO_DEFAULT_VALUE: for key_ in keys: yield retrieved_dict[key_] else: for key_ in keys: yield retrieved_dict.get(key_, default) def clear(self): """ Clear this key-value store. *NOTE:* **Implementing sub-classes should call this super-method. This super method should not be considered a critical section for thread safety.** :raises ReadOnlyError: If this instance is marked as read-only. """ q = self.SqlTemplates.DELETE_ALL.format(table_name=self._table_name) def cb(cur): cur.execute(q) list(self._psql_helper.single_execute(cb))
def __init__(self, table_name="data_set", key_col='key', value_col='value', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize a PostgreSQL-backed data set instance. :param table_name: Name of the table to use. :type table_name: str :param key_col: Name of the column containing the UUID signatures. :type key_col: str :param value_col: Name of the table column that will contain serialized elements. :type value_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this index. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresKeyValueStore, self).__init__() self._table_name = table_name self._key_col = key_col self._value_col = value_col self._batch_size = batch_size self._pickle_protocol = pickle_protocol self._read_only = bool(read_only) self._create_table = create_table # Checking parameters where necessary if self._batch_size is not None: self._batch_size = int(self._batch_size) assert self._batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self._batch_size assert -1 <= self._pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range [-1, 2]. " "Given: %s." % self._pickle_protocol) # helper structure for SQL operations. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, itersize=batch_size, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK, ) # Only set table upsert if not read-only. if not self._read_only and self._create_table: # NOT read-only, so allow table upsert. self._psql_helper.set_table_upsert_sql( self.SqlTemplates.UPSERT_TABLE_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col ) )
class PostgresKeyValueStore (KeyValueStore): """ PostgreSQL-backed key-value storage. """ class SqlTemplates (object): """ Container for static PostgreSQL queries used by the containing class. """ UPSERT_TABLE_TMPL = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {key_col:s} BYTEA NOT NULL, {value_col:s} BYTEA NOT NULL, PRIMARY KEY ({key_col:s}) ); """) SELECT_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s}; """) SELECT_LIKE_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s} WHERE {key_col:s} LIKE %(key_like)s """) SELECT_MANY_TMPL = norm_psql_cmd_string(""" SELECT {query:s} FROM {table_name:s} WHERE {key_col:s} IN %(key_tuple)s """) UPSERT_TMPL = norm_psql_cmd_string(""" WITH upsert AS ( UPDATE {table_name:s} SET {value_col:s} = %(val)s WHERE {key_col:s} = %(key)s RETURNING * ) INSERT INTO {table_name:s} ({key_col:s}, {value_col:s}) SELECT %(key)s, %(val)s WHERE NOT EXISTS (SELECT * FROM upsert) """) DELETE_LIKE_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {key_col:s} LIKE %(key_like)s """) DELETE_ALL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} """) @classmethod def is_usable(cls): return psycopg2 is not None def __init__(self, table_name="data_set", key_col='key', value_col='value', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize a PostgreSQL-backed data set instance. :param table_name: Name of the table to use. :type table_name: str :param key_col: Name of the column containing the UUID signatures. :type key_col: str :param value_col: Name of the table column that will contain serialized elements. :type value_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this index. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresKeyValueStore, self).__init__() self._table_name = table_name self._key_col = key_col self._value_col = value_col self._batch_size = batch_size self._pickle_protocol = pickle_protocol self._read_only = bool(read_only) self._create_table = create_table # Checking parameters where necessary if self._batch_size is not None: self._batch_size = int(self._batch_size) assert self._batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self._batch_size assert -1 <= self._pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range [-1, 2]. " "Given: %s." % self._pickle_protocol) # helper structure for SQL operations. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, itersize=batch_size, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK, ) # Only set table upsert if not read-only. if not self._read_only and self._create_table: # NOT read-only, so allow table upsert. self._psql_helper.set_table_upsert_sql( self.SqlTemplates.UPSERT_TABLE_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col ) ) @staticmethod def _py_to_bin(k): """ Convert a python hashable value into psycopg2.Binary via pickle. :param k: Python object instance to be converted into a ``psycopg2.Binary`` instance via ``pickle`` serialization. :type k: object :return: ``psycopg2.Binary`` buffer instance to use for insertion into or query against a table. :rtype: psycopg2.Binary """ return psycopg2.Binary(pickle.dumps(k)) @staticmethod def _bin_to_py(b): """ Un-"translate" psycopg2.Binary value (buffer) to a python type. :param b: ``psycopg2.Binary`` buffer instance as retrieved from a PostgreSQL query. :type b: psycopg2.Binary :return: Python object instance as loaded via pickle from the given ``psycopg2.Binary`` buffer. :rtype: object """ return pickle.loads(bytes(b)) def get_config(self): """ Return a JSON-compliant dictionary that could be passed to this class's ``from_config`` method to produce an instance with identical configuration. In the common case, this involves naming the keys of the dictionary based on the initialization argument names as if it were to be passed to the constructor via dictionary expansion. :return: JSON type compliant configuration dictionary. :rtype: dict """ return { "table_name": self._table_name, "key_col": self._key_col, "value_col": self._value_col, "db_name": self._psql_helper.db_name, "db_host": self._psql_helper.db_host, "db_port": self._psql_helper.db_port, "db_user": self._psql_helper.db_user, "db_pass": self._psql_helper.db_pass, "batch_size": self._batch_size, "pickle_protocol": self._pickle_protocol, "read_only": self._read_only, "create_table": self._create_table, } def __repr__(self): """ Return representative string for this class. :return: Representative string for this class. :rtype: str """ return super(PostgresKeyValueStore, self).__repr__() \ % ("table_name: %s, key_col: %s, value_col: %s, " "db_name: %s, db_host: %s, db_port: %s, db_user: %s, " "db_pass: %s, batch_size: %d, pickle_protocol: %d, " "read_only: %s, create_table: %s" % (self._table_name, self._key_col, self._value_col, self._psql_helper.db_name, self._psql_helper.db_host, self._psql_helper.db_port, self._psql_helper.db_user, self._psql_helper.db_pass, self._batch_size, self._pickle_protocol, self._read_only, self._create_table)) def count(self): """ :return: The number of key-value relationships in this store. :rtype: int | long """ def cb(cur): cur.execute(self.SqlTemplates.SELECT_TMPL.format( query='count(%s)' % self._key_col, table_name=self._table_name, )) return list(self._psql_helper.single_execute( cb, yield_result_rows=True ))[0][0] def keys(self): """ :return: Iterator over keys in this store. :rtype: collections.Iterator[collections.Hashable] """ def cb(cur): cur.execute(self.SqlTemplates.SELECT_TMPL.format( query=self._key_col, table_name=self._table_name, )) # We can use a named cursor because this is a select statement as well # as server table size may be large. for r in self._psql_helper.single_execute(cb, yield_result_rows=True, named=True): # Convert from buffer -> string -> python yield self._bin_to_py(r[0]) def values(self): """ :return: Iterator over values in this store. Values are not guaranteed to be in any particular order. :rtype: collections.Iterator[object] """ def cb(cur): cur.execute(self.SqlTemplates.SELECT_TMPL.format( query=self._value_col, table_name=self._table_name, )) for r in self._psql_helper.single_execute(cb, yield_result_rows=True, named=True): # Convert from buffer -> string -> python yield self._bin_to_py(r[0]) def is_read_only(self): """ :return: True if this instance is read-only and False if it is not. :rtype: bool """ return self._read_only def has(self, key): """ Check if this store has a value for the given key. :param key: Key to check for a value for. :type key: collections.Hashable :return: If this store has a value for the given key. :rtype: bool """ super(PostgresKeyValueStore, self).has(key) # Try to select based on given key value. If any rows are returned, # there is clearly a key that matches. q = self.SqlTemplates.SELECT_LIKE_TMPL.format( query='true', table_name=self._table_name, key_col=self._key_col, ) def cb(cur): cur.execute(q, {'key_like': self._py_to_bin(key)}) return bool(list(self._psql_helper.single_execute( cb, yield_result_rows=True ))) def add(self, key, value): """ Add a key-value pair to this store. :param key: Key for the value. Must be hashable. :type key: collections.Hashable :param value: Python object to store. :type value: object :raises ReadOnlyError: If this instance is marked as read-only. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).add(key, value) q = self.SqlTemplates.UPSERT_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col, ) v = { 'key': self._py_to_bin(key), 'val': self._py_to_bin(value), } def cb(cur): cur.execute(q, v) list(self._psql_helper.single_execute(cb)) return self def add_many(self, d): """ Add multiple key-value pairs at a time into this store as represented in the provided dictionary `d`. :param d: Dictionary of key-value pairs to add to this store. :type d: dict[collections.Hashable, object] :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).add_many(d) q = self.SqlTemplates.UPSERT_TMPL.format( table_name=self._table_name, key_col=self._key_col, value_col=self._value_col, ) # Iterator over transformed inputs into values for statement. def val_iter(): for key, val in six.iteritems(d): yield { 'key': self._py_to_bin(key), 'val': self._py_to_bin(val) } def cb(cur, v_batch): psycopg2.extras.execute_batch(cur, q, v_batch, page_size=self._batch_size) list(self._psql_helper.batch_execute(val_iter(), cb, self._batch_size)) return self def remove(self, key): """ Remove a single key-value entry. :param key: Key to remove. :type key: collections.Hashable :raises ReadOnlyError: If this instance is marked as read-only. :raises KeyError: The given key is not present in this store and no default value given. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).remove(key) if key not in self: raise KeyError(key) q = self.SqlTemplates.DELETE_LIKE_TMPL.format( table_name=self._table_name, key_col=self._key_col, ) v = dict( key_like=self._py_to_bin(key) ) def cb(cursor): cursor.execute(q, v) list(self._psql_helper.single_execute(cb)) return self def _check_contained_keys(self, keys): """ Check if the table contains the following keys. :param set keys: Keys to check for. :return: An set of keys NOT present in the table. :rtype: set[collections.Hashable] """ def key_like_iter(): for k_ in keys: yield self._py_to_bin(k_) has_many_q = self.SqlTemplates.SELECT_MANY_TMPL.format( query=self._key_col, table_name=self._table_name, key_col=self._key_col, ) # Keys found in table matched_keys = set() def cb(cursor, batch): cursor.execute(has_many_q, {'key_tuple': tuple(batch)}) matched_keys.update(self._bin_to_py(r[0]) for r in cursor) list(self._psql_helper.batch_execute(key_like_iter(), cb, self._batch_size)) return keys - matched_keys def remove_many(self, keys): """ Remove multiple keys and associated values. :param keys: Iterable of keys to remove. If this is empty this method does nothing. :type keys: collections.Iterable[collections.Hashable] :raises ReadOnlyError: If this instance is marked as read-only. :raises KeyError: The given key is not present in this store and no default value given. The store is not modified if any key is invalid. :return: Self. :rtype: KeyValueStore """ super(PostgresKeyValueStore, self).remove_many(keys) keys = set(keys) # Check that all keys requested for removal are contained in our table # before attempting to remove any of them. key_diff = self._check_contained_keys(keys) # If we're trying to remove a key not in our table, appropriately raise # a KeyError. if key_diff: if len(key_diff) == 1: raise KeyError(list(key_diff)[0]) else: raise KeyError(key_diff) # Proceed with removal def key_like_iter(): """ Iterator over query value sets. """ for k_ in keys: yield self._py_to_bin(k_) del_q = self.SqlTemplates.DELETE_LIKE_TMPL.format( table_name=self._table_name, key_col=self._key_col, ) def del_cb(cursor, v_batch): # Execute the query with a list of value dicts. psycopg2.extras.execute_batch(cursor, del_q, [{'key_like': k} for k in v_batch], page_size=self._batch_size) list(self._psql_helper.batch_execute(key_like_iter(), del_cb, self._batch_size)) return self def get(self, key, default=NO_DEFAULT_VALUE): """ Get the value for the given key. *NOTE:* **Implementing sub-classes are responsible for raising a ``KeyError`` where appropriate.** :param key: Key to get the value of. :type key: collections.Hashable :param default: Optional default value if the given key is not present in this store. This may be any value except for the ``NO_DEFAULT_VALUE`` constant (custom anonymous class instance). :type default: object :raises KeyError: The given key is not present in this store and no default value given. :return: Deserialized python object stored for the given key. :rtype: object """ q = self.SqlTemplates.SELECT_LIKE_TMPL.format( query=self._value_col, table_name=self._table_name, key_col=self._key_col, ) v = {'key_like': self._py_to_bin(key)} def cb(cur): cur.execute(q, v) rows = list(self._psql_helper.single_execute( cb, yield_result_rows=True )) # If no rows and no default, raise KeyError. if len(rows) == 0: if default is NO_DEFAULT_VALUE: raise KeyError(key) else: return default return self._bin_to_py(rows[0][0]) def clear(self): """ Clear this key-value store. *NOTE:* **Implementing sub-classes should call this super-method. This super method should not be considered a critical section for thread safety.** :raises ReadOnlyError: If this instance is marked as read-only. """ q = self.SqlTemplates.DELETE_ALL.format(table_name=self._table_name) def cb(cur): cur.execute(q) list(self._psql_helper.single_execute(cb))
class PostgresDescriptorIndex (DescriptorIndex): """ DescriptorIndex implementation that stored DescriptorElement references in a PostgreSQL database. A ``PostgresDescriptorIndex`` effectively controls the entire table. Thus a ``clear()`` call will remove everything from the table. PostgreSQL version support: - 9.4 Table format: <uuid col> TEXT NOT NULL <element col> BYTEA NOT NULL <uuid_col> should be the primary key (we assume unique). We require that the no column labels not be 'true' for the use of a value return shortcut. """ # # The following are SQL query templates. The string formatting using {}'s # is used to fill in the query before using it in an execute with instance # specific values. The ``%()s`` formatting is special for the execute where- # by psycopg2 will fill in the values appropriately as specified in a second # dictionary argument to ``cursor.execute(query, value_dict)``. # UPSERT_TABLE_TMPL = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {uuid_col:s} TEXT NOT NULL, {element_col:s} BYTEA NOT NULL, PRIMARY KEY ({uuid_col:s}) ); """) SELECT_TMPL = norm_psql_cmd_string(""" SELECT {col:s} FROM {table_name:s} """) SELECT_LIKE_TMPL = norm_psql_cmd_string(""" SELECT {element_col:s} FROM {table_name:s} WHERE {uuid_col:s} like %(uuid_like)s """) # So we can ensure we get back elements in specified order # - reference [1] SELECT_MANY_ORDERED_TMPL = norm_psql_cmd_string(""" SELECT {table_name:s}.{element_col:s} FROM {table_name:s} JOIN ( SELECT * FROM unnest(%(uuid_list)s) with ordinality ) AS __ordering__ ({uuid_col:s}, {uuid_col:s}_order) ON {table_name:s}.{uuid_col:s} = __ordering__.{uuid_col:s} ORDER BY __ordering__.{uuid_col:s}_order """) UPSERT_TMPL = norm_psql_cmd_string(""" WITH upsert AS ( UPDATE {table_name:s} SET {element_col:s} = %(element_val)s WHERE {uuid_col:s} = %(uuid_val)s RETURNING * ) INSERT INTO {table_name:s} ({uuid_col:s}, {element_col:s}) SELECT %(uuid_val)s, %(element_val)s WHERE NOT EXISTS (SELECT * FROM upsert) """) DELETE_LIKE_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {uuid_col:s} like %(uuid_like)s """) DELETE_MANY_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {uuid_col:s} in %(uuid_tuple)s RETURNING uid """) @classmethod def is_usable(cls): return psycopg2 is not None def __init__(self, table_name='descriptor_index', uuid_col='uid', element_col='element', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, multiquery_batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize index instance. :param table_name: Name of the table to use. :type table_name: str :param uuid_col: Name of the column containing the UUID signatures. :type uuid_col: str :param element_col: Name of the table column that will contain serialized elements. :type element_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param multiquery_batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type multiquery_batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this index. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresDescriptorIndex, self).__init__() self.table_name = table_name self.uuid_col = uuid_col self.element_col = element_col self.multiquery_batch_size = multiquery_batch_size self.pickle_protocol = pickle_protocol self.read_only = bool(read_only) self.create_table = create_table # Checking parameters where necessary if self.multiquery_batch_size is not None: self.multiquery_batch_size = int(self.multiquery_batch_size) assert self.multiquery_batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self.multiquery_batch_size assert -1 <= self.pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range. Given: %s" % self.pickle_protocol) self.psql_helper = PsqlConnectionHelper(db_name, db_host, db_port, db_user, db_pass, self.multiquery_batch_size, PSQL_TABLE_CREATE_RLOCK) if not self.read_only and self.create_table: self.psql_helper.set_table_upsert_sql( self.UPSERT_TABLE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, ) ) def get_config(self): return { "table_name": self.table_name, "uuid_col": self.uuid_col, "element_col": self.element_col, "db_name": self.psql_helper.db_name, "db_host": self.psql_helper.db_host, "db_port": self.psql_helper.db_port, "db_user": self.psql_helper.db_user, "db_pass": self.psql_helper.db_pass, "multiquery_batch_size": self.multiquery_batch_size, "pickle_protocol": self.pickle_protocol, "read_only": self.read_only, "create_table": self.create_table, } def count(self): """ :return: Number of descriptor elements stored in this index. :rtype: int | long """ # Just count UUID column to limit data read. q = self.SELECT_TMPL.format( col='count(%s)' % self.uuid_col, table_name=self.table_name, ) def exec_hook(cur): cur.execute(q) # There's only going to be one row returned with one element in it. return list(self.psql_helper.single_execute( exec_hook, yield_result_rows=True ))[0][0] def clear(self): """ Clear this descriptor index's entries. """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only index.") q = self.DELETE_LIKE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) def exec_hook(cur): cur.execute(q, {'uuid_like': '%'}) list(self.psql_helper.single_execute(exec_hook)) def has_descriptor(self, uuid): """ Check if a DescriptorElement with the given UUID exists in this index. :param uuid: UUID to query for :type uuid: collections.Hashable :return: True if a DescriptorElement with the given UUID exists in this index, or False if not. :rtype: bool """ q = self.SELECT_LIKE_TMPL.format( # hacking return value to something simple element_col='true', table_name=self.table_name, uuid_col=self.uuid_col, ) def exec_hook(cur): cur.execute(q, {'uuid_like': str(uuid)}) # Should either yield one or zero rows return bool(list(self.psql_helper.single_execute( exec_hook, yield_result_rows=True ))) def add_descriptor(self, descriptor): """ Add a descriptor to this index. Adding the same descriptor multiple times should not add multiple copies of the descriptor in the index (based on UUID). Added descriptors overwrite indexed descriptors based on UUID. :param descriptor: Descriptor to index. :type descriptor: smqtk.representation.DescriptorElement """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only index.") q = self.UPSERT_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, ) v = { 'uuid_val': str(descriptor.uuid()), 'element_val': psycopg2.Binary( pickle.dumps(descriptor, self.pickle_protocol) ) } def exec_hook(cur): cur.execute(q, v) list(self.psql_helper.single_execute(exec_hook)) def add_many_descriptors(self, descriptors): """ Add multiple descriptors at one time. Adding the same descriptor multiple times should not add multiple copies of the descriptor in the index (based on UUID). Added descriptors overwrite indexed descriptors based on UUID. :param descriptors: Iterable of descriptor instances to add to this index. :type descriptors: collections.Iterable[smqtk.representation.DescriptorElement] """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only index.") q = self.UPSERT_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, ) # Transform input into def iter_elements(): for d in descriptors: yield { 'uuid_val': str(d.uuid()), 'element_val': psycopg2.Binary( pickle.dumps(d, self.pickle_protocol) ) } def exec_hook(cur, batch): cur.executemany(q, batch) self._log.debug("Adding many descriptors") list(self.psql_helper.batch_execute(iter_elements(), exec_hook, self.multiquery_batch_size)) def get_descriptor(self, uuid): """ Get the descriptor in this index that is associated with the given UUID. :param uuid: UUID of the DescriptorElement to get. :type uuid: collections.Hashable :raises KeyError: The given UUID doesn't associate to a DescriptorElement in this index. :return: DescriptorElement associated with the queried UUID. :rtype: smqtk.representation.DescriptorElement """ q = self.SELECT_LIKE_TMPL.format( element_col=self.element_col, table_name=self.table_name, uuid_col=self.uuid_col, ) v = {'uuid_like': str(uuid)} def eh(c): c.execute(q, v) if c.rowcount == 0: raise KeyError(uuid) elif c.rowcount != 1: raise RuntimeError("Found more than one entry for the given " "uuid '%s' (got: %d)" % (uuid, c.rowcount)) r = list(self.psql_helper.single_execute(eh, yield_result_rows=True)) return pickle.loads(str(r[0][0])) def get_many_descriptors(self, uuids): """ Get an iterator over descriptors associated to given descriptor UUIDs. :param uuids: Iterable of descriptor UUIDs to query for. :type uuids: collections.Iterable[collections.Hashable] :raises KeyError: A given UUID doesn't associate with a DescriptorElement in this index. :return: Iterator of descriptors associated to given uuid values. :rtype: __generator[smqtk.representation.DescriptorElement] """ q = self.SELECT_MANY_ORDERED_TMPL.format( table_name=self.table_name, element_col=self.element_col, uuid_col=self.uuid_col, ) # Cache UUIDs received in order so we can check when we miss one in # order to raise a KeyError. uuid_order = [] def iterelems(): for uid in uuids: uuid_order.append(uid) yield str(uid) def exec_hook(cur, batch): v = {'uuid_list': batch} # self._log.debug('query: %s', cur.mogrify(q, v)) cur.execute(q, v) self._log.debug("Getting many descriptors") # The SELECT_MANY_ORDERED_TMPL query ensures that elements returned are # in the UUID order given to this method. Thus, if the iterated UUIDs # and iterated return rows do not exactly line up, the query join # failed to match a query UUID to something in the database. # - We also check that the number of rows we got back is the same # as elements yielded, else there were trailing UUIDs that did not # match anything in the database. g = self.psql_helper.batch_execute(iterelems(), exec_hook, self.multiquery_batch_size, yield_result_rows=True) i = 0 for r, expected_uuid in zip(g, uuid_order): d = pickle.loads(str(r[0])) if d.uuid() != expected_uuid: raise KeyError(expected_uuid) yield d i += 1 if len(uuid_order) != i: # just report the first one that's bad raise KeyError(uuid_order[i]) def remove_descriptor(self, uuid): """ Remove a descriptor from this index by the given UUID. :param uuid: UUID of the DescriptorElement to remove. :type uuid: collections.Hashable :raises KeyError: The given UUID doesn't associate to a DescriptorElement in this index. """ if self.read_only: raise ReadOnlyError("Cannot remove from a read-only index.") q = self.DELETE_LIKE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) v = {'uuid_like': str(uuid)} def execute(c): c.execute(q, v) # Nothing deleted if rowcount == 0 # (otherwise 1 when deleted a thing) if c.rowcount == 0: raise KeyError(uuid) list(self.psql_helper.single_execute(execute)) def remove_many_descriptors(self, uuids): """ Remove descriptors associated to given descriptor UUIDs from this index. :param uuids: Iterable of descriptor UUIDs to remove. :type uuids: collections.Iterable[collections.Hashable] :raises KeyError: A given UUID doesn't associate with a DescriptorElement in this index. """ if self.read_only: raise ReadOnlyError("Cannot remove from a read-only index.") q = self.DELETE_MANY_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) str_uuid_set = set(str(uid) for uid in uuids) v = {'uuid_tuple': tuple(str_uuid_set)} def execute(c): c.execute(q, v) # Check query UUIDs against rows that would actually be deleted. deleted_uuid_set = set(r[0] for r in c.fetchall()) for uid in str_uuid_set: if uid not in deleted_uuid_set: raise KeyError(uid) list(self.psql_helper.single_execute(execute)) def iterkeys(self): """ Return an iterator over indexed descriptor keys, which are their UUIDs. :rtype: collections.Iterator[collections.Hashable] """ # Getting UUID through the element because the UUID might not be a # string type, and the true type is encoded with the DescriptorElement # instance. for d in self.iterdescriptors(): yield d.uuid() def iterdescriptors(self): """ Return an iterator over indexed descriptor element instances. :rtype: collections.Iterator[smqtk.representation.DescriptorElement] """ def execute(c): c.execute(self.SELECT_TMPL.format( col=self.element_col, table_name=self.table_name )) #: :type: __generator execution_results = self.psql_helper.single_execute( execute, yield_result_rows=True, named=True ) for r in execution_results: d = pickle.loads(str(r[0])) yield d def iteritems(self): """ Return an iterator over indexed descriptor key and instance pairs. :rtype: collections.Iterator[(collections.Hashable, smqtk.representation.DescriptorElement)] """ for d in self.iterdescriptors(): yield d.uuid(), d
def __init__(self, table_name="psql_data_elements", id_col="id", sha1_col="sha1", mime_col="mime", byte_col="bytes", db_name="postgres", db_host="/tmp", db_port=5432, db_user=None, db_pass=None, itersize=1000, read_only=False, create_table=True): """ Create a PostgreSQL-based data set instance. If the tabled mapped to the provided ``table_name`` already exists, we expect the provided columns to match the following types: - ``id_col`` is expected to be TEXT - ``sha1_col`` is expected to be TEXT - ``type_col`` is expected to be TEXT - ``byte_col`` is expected to be BYTEA Default database connection parameters are assuming the use of a non-default, non-postgres-user cluster where the current user's name is equivalent to a valid role in the database. :param str table_name: String label of the table in the database to interact with. :param str id_col: Name of the element ID column in ``table_name``. :param str sha1_col: Name of the SHA1 column in ``table_name``. :param str mime_col: Name of the MIMETYPE column in ``table_name``. :param str byte_col: Name of the column storing byte data in ``table_name``. :param str|None db_host: Host address of the PostgreSQL server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :param str|None db_port: Port the Postgres server is exposed on. If None, we assume a default port (5433). :param str db_name: The name of the database to connect to. :param str|None db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :param str|None db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :param int itersize: Number of records fetched per network round trip when iterating over a named cursor. :param bool read_only: Only allow reading of this data. Modification actions will throw a ReadOnlyError exceptions. :param bool create_table: If this instance should try to create the storing table before actions are performed against it. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. """ super(PostgresNativeDataSet, self).__init__() itersize = int(itersize) if itersize <= 0: raise ValueError("Itersize must be greater than 0.") self._table_name = table_name self._id_col = id_col self._sha1_col = sha1_col self._mime_col = mime_col self._byte_col = byte_col self._read_only = read_only self._create_table = create_table self._psql_helper = PsqlConnectionHelper(db_name, db_host, db_port, db_user, db_pass, itersize) # Set table creation SQL in helper if not self._read_only: self._psql_helper.set_table_upsert_sql( PostgresDataElement.CommandTemplates.UPSERT_TABLE.format( table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=byte_col, ) )
class PostgresNativeDataSet (DataSet): """ Dataset that stores data elements natively in a PostgreSQL database. Elements stored in this data set implementation will be copied as PostgresDataElements, which are stored based on this data set's configuration. Data elements retrieved from this data set will be of the PostgresDataElement class type. """ @classmethod def is_usable(cls): if psycopg2 is None: warnings.warn("PostgresNativeDataSet not usable due to psycopg2 " "package not being importable.") return False return True def __init__(self, table_name="psql_data_elements", id_col="id", sha1_col="sha1", mime_col="mime", byte_col="bytes", db_name="postgres", db_host="/tmp", db_port=5432, db_user=None, db_pass=None, itersize=1000, read_only=False, create_table=True): """ Create a PostgreSQL-based data set instance. If the tabled mapped to the provided ``table_name`` already exists, we expect the provided columns to match the following types: - ``id_col`` is expected to be TEXT - ``sha1_col`` is expected to be TEXT - ``type_col`` is expected to be TEXT - ``byte_col`` is expected to be BYTEA Default database connection parameters are assuming the use of a non-default, non-postgres-user cluster where the current user's name is equivalent to a valid role in the database. :param str table_name: String label of the table in the database to interact with. :param str id_col: Name of the element ID column in ``table_name``. :param str sha1_col: Name of the SHA1 column in ``table_name``. :param str mime_col: Name of the MIMETYPE column in ``table_name``. :param str byte_col: Name of the column storing byte data in ``table_name``. :param str|None db_host: Host address of the PostgreSQL server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :param str|None db_port: Port the Postgres server is exposed on. If None, we assume a default port (5433). :param str db_name: The name of the database to connect to. :param str|None db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :param str|None db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :param int itersize: Number of records fetched per network round trip when iterating over a named cursor. :param bool read_only: Only allow reading of this data. Modification actions will throw a ReadOnlyError exceptions. :param bool create_table: If this instance should try to create the storing table before actions are performed against it. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. """ super(PostgresNativeDataSet, self).__init__() itersize = int(itersize) if itersize <= 0: raise ValueError("Itersize must be greater than 0.") self._table_name = table_name self._id_col = id_col self._sha1_col = sha1_col self._mime_col = mime_col self._byte_col = byte_col self._read_only = read_only self._create_table = create_table self._psql_helper = PsqlConnectionHelper(db_name, db_host, db_port, db_user, db_pass, itersize) # Set table creation SQL in helper if not self._read_only: self._psql_helper.set_table_upsert_sql( PostgresDataElement.CommandTemplates.UPSERT_TABLE.format( table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=byte_col, ) ) def get_config(self): return { "table_name": self._table_name, "id_col": self._id_col, "sha1_col": self._sha1_col, "mime_col": self._mime_col, "byte_col": self._byte_col, "db_name": self._psql_helper.db_name, "db_host": self._psql_helper.db_host, "db_port": self._psql_helper.db_port, "db_user": self._psql_helper.db_user, "db_pass": self._psql_helper.db_pass, "itersize": self._psql_helper.itersize, "read_only": self._read_only, "create_table": self._create_table, } def _gen_psql_element(self, uid, content_type=None): """ Internal method to generate a psql data element with appropriate psql parameters. :param collections.abc.Hashable uid: UUID of data element. :param None|str content_type: Content type / MIME type of the element. """ e = PostgresDataElement( uid, content_type=content_type, table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=self._byte_col, read_only=self._read_only, create_table=self._create_table ) # Share PSQL helper instance. e._psql_helper = self._psql_helper return e def __iter__(self): """ :return: Generator over the DataElements contained in this set in no particular order. """ # Select all UUIDs and content type, yielding constructed psql # data elements. q = "SELECT {id_col:s}, {mime_col:s} FROM {table_name:s};".format( id_col=self._id_col, mime_col=self._mime_col, table_name=self._table_name, ) def cb(cursor): cursor.execute(q) for r in self._psql_helper.single_execute(cb, yield_result_rows=True): e_uuid, e_mimetype = r yield self._gen_psql_element(e_uuid, e_mimetype) def count(self): """ :return: The number of data elements in this set. :rtype: int """ # Query count of rows in table (select on id col only) count_query = "SELECT count({id_col:s}) FROM {table_name:s};".format( id_col=self._id_col, table_name=self._table_name, ) def cb(cursor): cursor.execute(count_query) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r: # No rows in table return 0 return int(r[0][0]) def uuids(self): """ :return: A new set of uuids represented in this data set. :rtype: set """ # TODO: UPDATE TO ITERATOR INSTEAD OF SET RETURN TYPE q = "SELECT {id_col:s} FROM {table_name:s};".format( id_col=self._id_col, table_name=self._table_name, ) def cb(cursor): cursor.execute(q) return {r[0] for r in self._psql_helper.single_execute(cb, yield_result_rows=True)} def has_uuid(self, uuid): """ Test if the given uuid refers to an element in this data set. :param uuid: Unique ID to test for inclusion. This should match the type that the set implementation expects or cares about. :type uuid: collections.abc.Hashable :return: True if the given uuid matches an element in this set, or False if it does not. :rtype: bool """ # Query for table id col values q = "SELECT {id_col:s} FROM {table_name:s} "\ "WHERE {id_col:s} = %(id_val)s;"\ .format(id_col=self._id_col, table_name=self._table_name) def cb(cursor): cursor.execute(q, {'id_val': str(uuid)}) return bool(list( self._psql_helper.single_execute(cb, yield_result_rows=True) )) def add_data(self, *elems): """ Add the given data element(s) instance to this data set. *NOTE: Implementing methods should check that input elements are in fact DataElement instances.* :param elems: Data element(s) to add :type elems: smqtk.representation.DataElement """ # TODO: Optimize for batch insertion using custom query. for e in elems: pe = self._gen_psql_element(e.uuid(), e.content_type()) pe.set_bytes(e.get_bytes()) def get_data(self, uuid): """ Get the data element the given uuid references, or raise an exception if the uuid does not reference any element in this set. :raises KeyError: If the given uuid does not refer to an element in this data set. :param uuid: The uuid of the element to retrieve. :type uuid: collections.abc.Hashable :return: The data element instance for the given uuid. :rtype: smqtk.representation.DataElement """ # Query for content type recorded in our table to use for PSQL element # construction. q = "SELECT {ct_col:s} FROM {table_name:s} " \ "WHERE {id_col:s} = %(id_val)s;" \ .format(ct_col=self._mime_col, table_name=self._table_name, id_col=self._id_col) def cb(cursor): cursor.execute(q, {'id_val': str(uuid)}) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r: # No rows matching the input uuid were found. raise KeyError(uuid) # Create and return the PSQL element. ct = str(r[0][0]) return self._gen_psql_element(uuid, content_type=ct)
def setUp(self): self.conn_helper = PsqlConnectionHelper()
class PostgresDataElement(DataElement): # lgtm [py/missing-equals] """ Data element bytes stored in PostgreSQL database. Storage table should have three columns for the following components: - data SHA1 (effective UID) - data content-type / MIMETYPE - data bytes Efficient connection pooling may be achieved via external utilities like PGBounder. Due to the use of the "ON CONFLICT" clause in upserting data, this implementation requires at least PostgreSQL version 9.5 or greater. """ # SHA1 checksum of 0-length data (empty bytes) EMPTY_SHA = hashlib.sha1(b'').hexdigest() class CommandTemplates(object): """ Encapsulation of command templates. """ # Upsert table for storage if desired # # Format params: # - table_name # - id_col # - sha1_col # - mime_col # - byte_col UPSERT_TABLE = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {id_col:s} TEXT NOT NULL, {sha1_col:s} TEXT NOT NULL, {mime_col:s} TEXT NOT NULL, {byte_col:s} BYTEA NOT NULL, PRIMARY KEY ({id_col:s}) ); """) # Select ``col`` for a given entry ID. # # Query Format params: # - col # - table_name # - id_col # # Value params: # - id_val SELECT = norm_psql_cmd_string(""" SELECT {col:s} FROM {table_name:s} WHERE {id_col:s} = %(id_val)s ; """) # Upsert content-type/data for a uid # # Query Format params: # - table_name # - id_col # - sha1_col # - mime_col # - byte_col # # Value params: # - id_val # - sha1_val # - mime_val # - byte_val # # SQL format from: # https://hashrocket.com/blog/posts/upsert-records-with-postgresql-9-5 # UPSERT_DATA = norm_psql_cmd_string(""" INSERT INTO {table_name:s} ({id_col:s}, {sha1_col:s}, {mime_col:s}, {byte_col:s}) VALUES ( %(id_val)s, %(sha1_val)s, %(mime_val)s, %(byte_val)s ) ON CONFLICT ({id_col:s}) DO UPDATE SET ({sha1_col:s}, {mime_col:s}, {byte_col:s}) = (EXCLUDED.{sha1_col:s}, EXCLUDED.{mime_col:s}, EXCLUDED.{byte_col:s}) ; """) # Same as ``UPSERT_DATA`` but does not set the mimetype on an update. # This is meant to atomically update the byte data without changing the # existing mimetype. UPSERT_DATA_NO_MIME = norm_psql_cmd_string(""" INSERT INTO {table_name:s} ({id_col:s}, {sha1_col:s}, {mime_col:s}, {byte_col:s}) VALUES ( %(id_val)s, %(sha1_val)s, %(mime_val)s, %(byte_val)s ) ON CONFLICT ({id_col:s}) DO UPDATE SET ({sha1_col:s}, {byte_col:s}) = (EXCLUDED.{sha1_col:s}, EXCLUDED.{byte_col:s}) ; """) @classmethod def is_usable(cls): if psycopg2 is None: cls.get_logger().warning("Not usable. " "Requires the psycopg2 module.") return False return True def __init__(self, element_id, content_type=None, table_name="psql_data_elements", id_col="id", sha1_col="sha1", mime_col="mime", byte_col="bytes", db_name="postgres", db_host="/tmp", db_port=5433, db_user=None, db_pass=None, read_only=False, create_table=True): """ Create a new PostgreSQL-based data element. If the tabled mapped to the provided ``table_name`` already exists, we expect the provided columns to match the following types: - ``id_col`` is expected to be TEXT - ``sha1_col`` is expected to be TEXT - ``type_col`` is expected to be TEXT - ``byte_col`` is expected to be BYTEA Default database connection parameters are assuming the use of a non-default, non-postgres-user cluster where the current user's name is equivalent to a valid role in the database. :param element_id: ID to reference a specific data element row in the table. This is required in the same way that a path is required to point to a file on a filesystem. :type element_id: str :param content_type: Expected mime-type of byte data set to this element. This only affects setting the mime-type field when setting new bytes. ``content_type()`` will always reflect what is stored in the backend, or lack there-of. If this mime-type differs from an existing stored value, this mime-type will overwrite the stored value on the next call to ``set_bytes``. If this is None and there is no mime-type already set in the database, no mime-type will be set on the next ``set_bytes`` call. :type content_type: str | None :param table_name: String label of the table in the database to interact with. :type table_name: str :param id_col: Name of the element ID column in ``table_name``. :type id_col: str :param sha1_col: Name of the SHA1 column in ``table_name``. :type sha1_col: str :param mime_col: Name of the MIMETYPE column in ``table_name``. :type mime_col: str :param byte_col: Name of the column storing byte data in ``table_name``. :type byte_col: str :param db_host: Host address of the PostgreSQL server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume a default port (5433). :type db_port: int | None :param db_name: The name of the database to connect to. :type db_name: str :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param read_only: Only allow reading of this data. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresDataElement, self).__init__() if not isinstance(element_id, six.string_types): raise ValueError("Element ID should be a string type.") self._element_id = element_id self._content_type = content_type self._table_name = table_name self._id_col = id_col self._sha1_col = sha1_col self._mime_col = mime_col self._byte_col = byte_col self._read_only = read_only self._create_table = create_table # itersize is hard-coded because a single-element perspective should # only be retrieving one row at a time. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, 10, GLOBAL_PSQL_TABLE_CREATE_RLOCK) # Set table creation SQL in helper if not self._read_only: self._psql_helper.set_table_upsert_sql( self.CommandTemplates.UPSERT_TABLE.format( table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=byte_col, )) def __repr__(self): return "{:s}[id=\"{:s}\"]" \ .format(self.__class__.__name__, self._element_id) def get_config(self): """ Return a JSON-compliant dictionary that could be passed to this class's ``from_config`` method to produce an instance with identical configuration. :return: JSON type compliant configuration dictionary. :rtype: dict """ return { "element_id": self._element_id, "table_name": self._table_name, "id_col": self._id_col, "sha1_col": self._sha1_col, "mime_col": self._mime_col, "byte_col": self._byte_col, "db_name": self._psql_helper.db_name, "db_host": self._psql_helper.db_host, "db_port": self._psql_helper.db_port, "db_user": self._psql_helper.db_user, "db_pass": self._psql_helper.db_pass, "read_only": self._read_only, "create_table": self._create_table, } def content_type(self): """ :return: Standard type/subtype string for this data element, or None if the content type is unknown. :rtype: str or None """ q = self.CommandTemplates.SELECT.format( col=self._mime_col, table_name=self._table_name, id_col=self._id_col, ) v = dict(id_val=self._element_id) def cb(cursor): """ :type cursor: psycopg2._psycopg.cursor """ cursor.execute(q, v) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r: return None elif len(r) > 1: raise RuntimeError("Somehow found multiple entries for the same" "element ID (there should only be one).") return r[0][0] def is_empty(self): """ Check if this element contains no bytes. The intent of this method is to quickly check if there is any data behind this element, ideally without having to read all/any of the underlying data. :return: If this element contains 0 bytes. :rtype: bool """ q = self.CommandTemplates.SELECT.format( col="octet_length(%s)" % self._byte_col, table_name=self._table_name, id_col=self._id_col, ) v = dict(id_val=self._element_id) def cb(cursor): """ :type cursor: psycopg2._psycopg.cursor """ cursor.execute(q, v) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r: # No rows returned, meaning not entry for our element ID and no # bytes stored. return True elif len(r) > 1: raise RuntimeError("Somehow found multiple entries for the same" "element ID (there should only be one).") num_bytes = int(r[0][0]) if num_bytes == 0: # There was an entry, but the number of bytes stored was zero. return True else: # Non-zero number of bytes stored. return False def sha1(self): """ Get the SHA1 checksum of this element's binary content. :return: SHA1 hex checksum of the data content. :rtype: str """ q = self.CommandTemplates.SELECT.format( col=self._sha1_col, table_name=self._table_name, id_col=self._id_col, ) v = dict(id_val=self._element_id, ) def cb(cursor): """ :type cursor: psycopg2._psycopg.cursor """ cursor.execute(q, v) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r: # no rows for element ID, so no bytes. Return SHA1 of empty string return self.EMPTY_SHA return r[0][0] def get_bytes(self): """ :return: Get the bytes for this data element. :rtype: bytes """ q = self.CommandTemplates.SELECT.format( col=self._byte_col, table_name=self._table_name, id_col=self._id_col, ) v = dict(id_val=self._element_id) def cb(cursor): """ :type cursor: psycopg2._psycopg.cursor """ cursor.execute(q, v) r = list(self._psql_helper.single_execute(cb, yield_result_rows=True)) if not r or len(r[0][0]) == 0: # No returned rows for element ID or if no bytes are stored. return bytes() else: return bytes(r[0][0]) def writable(self): """ :return: if this instance supports setting bytes. :rtype: bool """ return not self._read_only def set_bytes(self, b): """ Set bytes to this data element. Not all implementations may support setting bytes (check ``writable`` method return). This base abstract method should be called by sub-class implementations first. We check for mutability based on ``writable()`` method return. :param b: bytes to set. :type b: byte :raises ReadOnlyError: This data element can only be read from / does not support writing. """ super(PostgresDataElement, self).set_bytes(b) b_sha1 = hashlib.sha1(b).hexdigest() # TODO: Fallback to ``content_type()`` return if none provided in self. if self._content_type: # We have a content/mime type override as specified at element # construction. b_mimetype = self._content_type q_tmpl = self.CommandTemplates.UPSERT_DATA else: # Leave the mimetype alone or set an empty mimetype (none specified # at construction). b_mimetype = "" q_tmpl = self.CommandTemplates.UPSERT_DATA_NO_MIME q = q_tmpl.format( table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=self._byte_col, ) v = dict(id_val=self._element_id, sha1_val=b_sha1, mime_val=b_mimetype, byte_val=psycopg2.Binary(b)) def cb(cursor): """ :type cursor: psycopg2._psycopg.cursor """ # TODO: Could be smart here and only update if content-type/byte # data differs while keeping a row-lock between queries. cursor.execute(q, v) list(self._psql_helper.single_execute(cb))
def __init__(self, element_id, content_type=None, table_name="psql_data_elements", id_col="id", sha1_col="sha1", mime_col="mime", byte_col="bytes", db_name="postgres", db_host="/tmp", db_port=5433, db_user=None, db_pass=None, read_only=False, create_table=True): """ Create a new PostgreSQL-based data element. If the tabled mapped to the provided ``table_name`` already exists, we expect the provided columns to match the following types: - ``id_col`` is expected to be TEXT - ``sha1_col`` is expected to be TEXT - ``type_col`` is expected to be TEXT - ``byte_col`` is expected to be BYTEA Default database connection parameters are assuming the use of a non-default, non-postgres-user cluster where the current user's name is equivalent to a valid role in the database. :param element_id: ID to reference a specific data element row in the table. This is required in the same way that a path is required to point to a file on a filesystem. :type element_id: str :param content_type: Expected mime-type of byte data set to this element. This only affects setting the mime-type field when setting new bytes. ``content_type()`` will always reflect what is stored in the backend, or lack there-of. If this mime-type differs from an existing stored value, this mime-type will overwrite the stored value on the next call to ``set_bytes``. If this is None and there is no mime-type already set in the database, no mime-type will be set on the next ``set_bytes`` call. :type content_type: str | None :param table_name: String label of the table in the database to interact with. :type table_name: str :param id_col: Name of the element ID column in ``table_name``. :type id_col: str :param sha1_col: Name of the SHA1 column in ``table_name``. :type sha1_col: str :param mime_col: Name of the MIMETYPE column in ``table_name``. :type mime_col: str :param byte_col: Name of the column storing byte data in ``table_name``. :type byte_col: str :param db_host: Host address of the PostgreSQL server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume a default port (5433). :type db_port: int | None :param db_name: The name of the database to connect to. :type db_name: str :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param read_only: Only allow reading of this data. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresDataElement, self).__init__() if not isinstance(element_id, six.string_types): raise ValueError("Element ID should be a string type.") self._element_id = element_id self._content_type = content_type self._table_name = table_name self._id_col = id_col self._sha1_col = sha1_col self._mime_col = mime_col self._byte_col = byte_col self._read_only = read_only self._create_table = create_table # itersize is hard-coded because a single-element perspective should # only be retrieving one row at a time. self._psql_helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, 10, GLOBAL_PSQL_TABLE_CREATE_RLOCK) # Set table creation SQL in helper if not self._read_only: self._psql_helper.set_table_upsert_sql( self.CommandTemplates.UPSERT_TABLE.format( table_name=self._table_name, id_col=self._id_col, sha1_col=self._sha1_col, mime_col=self._mime_col, byte_col=byte_col, ))
class PostgresDescriptorSet(DescriptorSet): """ DescriptorSet implementation that stored DescriptorElement references in a PostgreSQL database. A ``PostgresDescriptorSet`` effectively controls the entire table. Thus a ``clear()`` call will remove everything from the table. PostgreSQL version support: - 9.4 Table format: <uuid col> TEXT NOT NULL <element col> BYTEA NOT NULL <uuid_col> should be the primary key (we assume unique). We require that the no column labels not be 'true' for the use of a value return shortcut. """ # # The following are SQL query templates. The string formatting using {}'s # is used to fill in the query before using it in an execute with instance # specific values. The ``%()s`` formatting is special for the execute # where-by psycopg2 will fill in the values appropriately as specified in a # second dictionary argument to ``cursor.execute(query, value_dict)``. # UPSERT_TABLE_TMPL = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {uuid_col:s} TEXT NOT NULL, {element_col:s} BYTEA NOT NULL, PRIMARY KEY ({uuid_col:s}) ); """) SELECT_TMPL = norm_psql_cmd_string(""" SELECT {col:s} FROM {table_name:s} """) SELECT_LIKE_TMPL = norm_psql_cmd_string(""" SELECT {element_col:s} FROM {table_name:s} WHERE {uuid_col:s} like %(uuid_like)s """) # So we can ensure we get back elements in specified order # - reference [1] SELECT_MANY_ORDERED_TMPL = norm_psql_cmd_string(""" SELECT {table_name:s}.{element_col:s} FROM {table_name:s} JOIN ( SELECT * FROM unnest(%(uuid_list)s) with ordinality ) AS __ordering__ ({uuid_col:s}, {uuid_col:s}_order) ON {table_name:s}.{uuid_col:s} = __ordering__.{uuid_col:s} ORDER BY __ordering__.{uuid_col:s}_order """) UPSERT_TMPL = norm_psql_cmd_string(""" WITH upsert AS ( UPDATE {table_name:s} SET {element_col:s} = %(element_val)s WHERE {uuid_col:s} = %(uuid_val)s RETURNING * ) INSERT INTO {table_name:s} ({uuid_col:s}, {element_col:s}) SELECT %(uuid_val)s, %(element_val)s WHERE NOT EXISTS (SELECT * FROM upsert) """) DELETE_LIKE_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {uuid_col:s} like %(uuid_like)s """) DELETE_MANY_TMPL = norm_psql_cmd_string(""" DELETE FROM {table_name:s} WHERE {uuid_col:s} in %(uuid_tuple)s RETURNING uid """) @classmethod def is_usable(cls): return psycopg2 is not None def __init__(self, table_name='descriptor_set', uuid_col='uid', element_col='element', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, multiquery_batch_size=1000, pickle_protocol=-1, read_only=False, create_table=True): """ Initialize set instance. :param table_name: Name of the table to use. :type table_name: str :param uuid_col: Name of the column containing the UUID signatures. :type uuid_col: str :param element_col: Name of the table column that will contain serialized elements. :type element_col: str :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param multiquery_batch_size: For queries that handle sending or receiving many queries at a time, batch queries based on this size. If this is None, then no batching occurs. The advantage of batching is that it reduces the memory impact for queries dealing with a very large number of elements (don't have to store the full query for all elements in RAM), but the transaction will be some amount slower due to splitting the query into multiple transactions. :type multiquery_batch_size: int | None :param pickle_protocol: Pickling protocol to use. We will use -1 by default (latest version, probably binary). :type pickle_protocol: int :param read_only: Only allow read actions against this set. Modification actions will throw a ReadOnlyError exceptions. :type read_only: bool :param create_table: If this instance should try to create the storing table before actions are performed against it when not set to be read-only. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresDescriptorSet, self).__init__() self.table_name = table_name self.uuid_col = uuid_col self.element_col = element_col self.multiquery_batch_size = multiquery_batch_size self.pickle_protocol = pickle_protocol self.read_only = bool(read_only) self.create_table = create_table # Checking parameters where necessary if self.multiquery_batch_size is not None: self.multiquery_batch_size = int(self.multiquery_batch_size) assert self.multiquery_batch_size > 0, \ "A given batch size must be greater than 0 in size " \ "(given: %d)." % self.multiquery_batch_size assert -1 <= self.pickle_protocol <= 2, \ ("Given pickle protocol is not in the known valid range. Given: %s" % self.pickle_protocol) self.psql_helper = PsqlConnectionHelper(db_name, db_host, db_port, db_user, db_pass, self.multiquery_batch_size, PSQL_TABLE_CREATE_RLOCK) if not self.read_only and self.create_table: self.psql_helper.set_table_upsert_sql( self.UPSERT_TABLE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, )) def get_config(self): return { "table_name": self.table_name, "uuid_col": self.uuid_col, "element_col": self.element_col, "db_name": self.psql_helper.db_name, "db_host": self.psql_helper.db_host, "db_port": self.psql_helper.db_port, "db_user": self.psql_helper.db_user, "db_pass": self.psql_helper.db_pass, "multiquery_batch_size": self.multiquery_batch_size, "pickle_protocol": self.pickle_protocol, "read_only": self.read_only, "create_table": self.create_table, } def count(self): """ :return: Number of descriptor elements stored in this set. :rtype: int | long """ # Just count UUID column to limit data read. q = self.SELECT_TMPL.format( col='count(%s)' % self.uuid_col, table_name=self.table_name, ) def exec_hook(cur): cur.execute(q) # There's only going to be one row returned with one element in it. return list( self.psql_helper.single_execute(exec_hook, yield_result_rows=True))[0][0] def clear(self): """ Clear this descriptor set's entries. """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only set.") q = self.DELETE_LIKE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) def exec_hook(cur): cur.execute(q, {'uuid_like': '%'}) list(self.psql_helper.single_execute(exec_hook)) def has_descriptor(self, uuid): """ Check if a DescriptorElement with the given UUID exists in this set. :param uuid: UUID to query for :type uuid: collections.Hashable :return: True if a DescriptorElement with the given UUID exists in this set, or False if not. :rtype: bool """ q = self.SELECT_LIKE_TMPL.format( # hacking return value to something simple element_col='true', table_name=self.table_name, uuid_col=self.uuid_col, ) def exec_hook(cur): cur.execute(q, {'uuid_like': str(uuid)}) # Should either yield one or zero rows return bool( list( self.psql_helper.single_execute(exec_hook, yield_result_rows=True))) def add_descriptor(self, descriptor): """ Add a descriptor to this set. Adding the same descriptor multiple times should not add multiple copies of the descriptor in the set (based on UUID). Added descriptors overwrite set descriptors based on UUID. :param descriptor: Descriptor to set. :type descriptor: smqtk.representation.DescriptorElement """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only set.") q = self.UPSERT_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, ) v = { 'uuid_val': str(descriptor.uuid()), 'element_val': psycopg2.Binary(pickle.dumps(descriptor, self.pickle_protocol)) } def exec_hook(cur): cur.execute(q, v) list(self.psql_helper.single_execute(exec_hook)) def add_many_descriptors(self, descriptors): """ Add multiple descriptors at one time. Adding the same descriptor multiple times should not add multiple copies of the descriptor in the set (based on UUID). Added descriptors overwrite set descriptors based on UUID. :param descriptors: Iterable of descriptor instances to add to this set. :type descriptors: collections.Iterable[smqtk.representation.DescriptorElement] """ if self.read_only: raise ReadOnlyError("Cannot clear a read-only set.") q = self.UPSERT_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, element_col=self.element_col, ) # Transform input into def iter_elements(): for d in descriptors: yield { 'uuid_val': str(d.uuid()), 'element_val': psycopg2.Binary(pickle.dumps(d, self.pickle_protocol)) } def exec_hook(cur, batch): cur.executemany(q, batch) self._log.debug("Adding many descriptors") list( self.psql_helper.batch_execute(iter_elements(), exec_hook, self.multiquery_batch_size)) def get_descriptor(self, uuid): """ Get the descriptor in this set that is associated with the given UUID. :param uuid: UUID of the DescriptorElement to get. :type uuid: collections.Hashable :raises KeyError: The given UUID doesn't associate to a DescriptorElement in this set. :return: DescriptorElement associated with the queried UUID. :rtype: smqtk.representation.DescriptorElement """ q = self.SELECT_LIKE_TMPL.format( element_col=self.element_col, table_name=self.table_name, uuid_col=self.uuid_col, ) v = {'uuid_like': str(uuid)} def eh(c): c.execute(q, v) if c.rowcount == 0: raise KeyError(uuid) elif c.rowcount != 1: raise RuntimeError("Found more than one entry for the given " "uuid '%s' (got: %d)" % (uuid, c.rowcount)) r = list(self.psql_helper.single_execute(eh, yield_result_rows=True)) return pickle.loads(bytes(r[0][0])) def get_many_descriptors(self, uuids): """ Get an iterator over descriptors associated to given descriptor UUIDs. :param uuids: Iterable of descriptor UUIDs to query for. :type uuids: collections.Iterable[collections.Hashable] :raises KeyError: A given UUID doesn't associate with a DescriptorElement in this set. :return: Iterator of descriptors associated to given uuid values. :rtype: __generator[smqtk.representation.DescriptorElement] """ q = self.SELECT_MANY_ORDERED_TMPL.format( table_name=self.table_name, element_col=self.element_col, uuid_col=self.uuid_col, ) # Cache UUIDs received in order so we can check when we miss one in # order to raise a KeyError. uuid_order = [] def iterelems(): for uid in uuids: uuid_order.append(uid) yield str(uid) def exec_hook(cur, batch): v = {'uuid_list': batch} # self._log.debug('query: %s', cur.mogrify(q, v)) cur.execute(q, v) self._log.debug("Getting many descriptors") # The SELECT_MANY_ORDERED_TMPL query ensures that elements returned are # in the UUID order given to this method. Thus, if the iterated UUIDs # and iterated return rows do not exactly line up, the query join # failed to match a query UUID to something in the database. # - We also check that the number of rows we got back is the same # as elements yielded, else there were trailing UUIDs that did not # match anything in the database. g = self.psql_helper.batch_execute(iterelems(), exec_hook, self.multiquery_batch_size, yield_result_rows=True) i = 0 for r, expected_uuid in zip(g, uuid_order): d = pickle.loads(bytes(r[0])) if d.uuid() != expected_uuid: raise KeyError(expected_uuid) yield d i += 1 if len(uuid_order) != i: # just report the first one that's bad raise KeyError(uuid_order[i]) def remove_descriptor(self, uuid): """ Remove a descriptor from this set by the given UUID. :param uuid: UUID of the DescriptorElement to remove. :type uuid: collections.Hashable :raises KeyError: The given UUID doesn't associate to a DescriptorElement in this set. """ if self.read_only: raise ReadOnlyError("Cannot remove from a read-only set.") q = self.DELETE_LIKE_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) v = {'uuid_like': str(uuid)} def execute(c): c.execute(q, v) # Nothing deleted if rowcount == 0 # (otherwise 1 when deleted a thing) if c.rowcount == 0: raise KeyError(uuid) list(self.psql_helper.single_execute(execute)) def remove_many_descriptors(self, uuids): """ Remove descriptors associated to given descriptor UUIDs from this set. :param uuids: Iterable of descriptor UUIDs to remove. :type uuids: collections.Iterable[collections.Hashable] :raises KeyError: A given UUID doesn't associate with a DescriptorElement in this set. """ if self.read_only: raise ReadOnlyError("Cannot remove from a read-only set.") q = self.DELETE_MANY_TMPL.format( table_name=self.table_name, uuid_col=self.uuid_col, ) str_uuid_set = set(str(uid) for uid in uuids) v = {'uuid_tuple': tuple(str_uuid_set)} def execute(c): c.execute(q, v) # Check query UUIDs against rows that would actually be deleted. deleted_uuid_set = set(r[0] for r in c.fetchall()) for uid in str_uuid_set: if uid not in deleted_uuid_set: raise KeyError(uid) list(self.psql_helper.single_execute(execute)) def iterkeys(self): """ Return an iterator over set descriptor keys, which are their UUIDs. :rtype: collections.Iterator[collections.Hashable] """ # Getting UUID through the element because the UUID might not be a # string type, and the true type is encoded with the DescriptorElement # instance. for d in self.iterdescriptors(): yield d.uuid() def iterdescriptors(self): """ Return an iterator over set descriptor element instances. :rtype: collections.Iterator[smqtk.representation.DescriptorElement] """ def execute(c): c.execute( self.SELECT_TMPL.format(col=self.element_col, table_name=self.table_name)) #: :type: __generator execution_results = self.psql_helper.single_execute( execute, yield_result_rows=True, named=True) for r in execution_results: d = pickle.loads(bytes(r[0])) yield d def iteritems(self): """ Return an iterator over set descriptor key and instance pairs. :rtype: collections.Iterator[(collections.Hashable, smqtk.representation.DescriptorElement)] """ for d in self.iterdescriptors(): yield d.uuid(), d
class PostgresDescriptorElement(DescriptorElement): """ Descriptor element whose vector is stored in a Postgres database. We assume we will work with a Postgres version of at least 9.4 (due to versions tested). Efficient connection pooling may be achieved via external utilities like PGBounder. """ ARRAY_DTYPE = numpy.float64 UPSERT_TABLE_TMPL = norm_psql_cmd_string(""" CREATE TABLE IF NOT EXISTS {table_name:s} ( {type_col:s} TEXT NOT NULL, {uuid_col:s} TEXT NOT NULL, {binary_col:s} BYTEA NOT NULL, PRIMARY KEY ({type_col:s}, {uuid_col:s}) ); """) SELECT_TMPL = norm_psql_cmd_string(""" SELECT {binary_col:s} FROM {table_name:s} WHERE {type_col:s} = %(type_val)s AND {uuid_col:s} = %(uuid_val)s ; """) UPSERT_TMPL = norm_psql_cmd_string(""" WITH upsert AS ( UPDATE {table_name:s} SET {binary_col:s} = %(binary_val)s WHERE {type_col:s} = %(type_val)s AND {uuid_col:s} = %(uuid_val)s RETURNING * ) INSERT INTO {table_name:s} ({type_col:s}, {uuid_col:s}, {binary_col:s}) SELECT %(type_val)s, %(uuid_val)s, %(binary_val)s WHERE NOT EXISTS (SELECT * FROM upsert); """) @classmethod def is_usable(cls): if psycopg2 is None: cls.get_logger().warning("Not usable. Requires psycopg2 module") return False return True def __init__(self, type_str, uuid, table_name='descriptors', uuid_col='uid', type_col='type_str', binary_col='vector', db_name='postgres', db_host=None, db_port=None, db_user=None, db_pass=None, create_table=True): """ Initialize new PostgresDescriptorElement attached to some database credentials. We require that storage tables treat uuid AND type string columns as primary keys. The type and uuid columns should be of the 'text' type. The binary column should be of the 'bytea' type. Default argument values assume a local PostgreSQL database with a table created via the ``etc/smqtk/postgres/descriptor_element/example_table_init.sql`` file (relative to the SMQTK source tree or install root). NOTES: - Not all uuid types used here are necessarily of the ``uuid.UUID`` type, thus the recommendation to use a ``text`` type for the column. For certain specific use cases they may be proper ``uuid.UUID`` instances or strings, but this cannot be generally assumed. :param type_str: Type of descriptor. This is usually the name of the content descriptor that generated this vector. :type type_str: str :param uuid: Unique ID reference of the descriptor. :type uuid: collections.Hashable :param table_name: String label of the database table to use. :type table_name: str :param uuid_col: The column label for descriptor UUID storage :type uuid_col: str :param type_col: The column label for descriptor type string storage. :type type_col: str :param binary_col: The column label for descriptor vector binary storage. :type binary_col: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_name: The name of the database to connect to. :type db_name: str :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param create_table: If this instance should try to create the storing table before actions are performed against it. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool """ super(PostgresDescriptorElement, self).__init__(type_str, uuid) self.table_name = table_name self.uuid_col = uuid_col self.type_col = type_col self.binary_col = binary_col self.create_table = create_table self.db_name = db_name self.db_host = db_host self.db_port = db_port self.db_user = db_user self.db_pass = db_pass self._psql_helper = None def __getstate__(self): """ Construct serialization state. Due to the psql_helper containing a lock, it cannot be serialized. This is OK due to our creation of the helper on demand. The cost incurred by discarding the instance upon serialization is that once deserialized elsewhere the helper instance will have to be created. Since this creation post-deserialization only happens once, this is acceptable. """ state = super(PostgresDescriptorElement, self).__getstate__() state.update({ "table_name": self.table_name, "uuid_col": self.uuid_col, "type_col": self.type_col, "binary_col": self.binary_col, "create_table": self.create_table, "db_name": self.db_name, "db_host": self.db_host, "db_port": self.db_port, "db_user": self.db_user, "db_pass": self.db_pass, }) return state def __setstate__(self, state): # Base DescriptorElement parts super(PostgresDescriptorElement, self).__setstate__(state) # Our parts self.table_name = state['table_name'] self.uuid_col = state['uuid_col'] self.type_col = state['type_col'] self.binary_col = state['binary_col'] self.create_table = state['create_table'] self.db_name = state['db_name'] self.db_host = state['db_host'] self.db_port = state['db_port'] self.db_user = state['db_user'] self.db_pass = state['db_pass'] self._psql_helper = None def _get_psql_helper(self): """ Internal method to create on demand the PSQL connection helper class. :return: PsqlConnectionHelper utility. :rtype: PsqlConnectionHelper """ # `hasattr` check used for backwards compatibility when interacting with # databases containing elements serialized before the inclusion of this # helper class. if self._psql_helper is None: # Only using a transport iteration size of 1 since this element is # only meant to refer to a single entry in the associated table. self._psql_helper = PsqlConnectionHelper( self.db_name, self.db_host, self.db_port, self.db_user, self.db_pass, itersize=1, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK) # Register table upsert command if self.create_table: self._psql_helper.set_table_upsert_sql( self.UPSERT_TABLE_TMPL.format( table_name=self.table_name, type_col=self.type_col, uuid_col=self.uuid_col, binary_col=self.binary_col, )) return self._psql_helper def get_config(self): return { "table_name": self.table_name, "uuid_col": self.uuid_col, "type_col": self.type_col, "binary_col": self.binary_col, "create_table": self.create_table, "db_name": self.db_name, "db_host": self.db_host, "db_port": self.db_port, "db_user": self.db_user, "db_pass": self.db_pass, } def has_vector(self): """ Check if the target database has a vector for our keys. This also returns True if we have a cached vector since there must have been a source vector to draw from if there is a cache of it. If a vector is cached, this resets the cache expiry timeout. :return: Whether or not this container current has a descriptor vector stored. :rtype: bool """ # Very similar to vector query, but replacing vector binary return with # a true/null return. Save a little bit of time compared to testing # vector return. # OLD: return self.vector() is not None # Using static value 'true' for binary "column" to reduce data return # volume. q_select = self.SELECT_TMPL.format( **{ 'binary_col': 'true', 'table_name': self.table_name, 'type_col': self.type_col, 'uuid_col': self.uuid_col, }) q_select_values = { "type_val": self.type(), "uuid_val": str(self.uuid()) } def cb(cursor): cursor.execute(q_select, q_select_values) # Should either yield one or zero rows. psql_helper = self._get_psql_helper() return bool( list(psql_helper.single_execute(cb, yield_result_rows=True))) def vector(self): """ Return this element's vector, or None if we don't have one. :return: Get the stored descriptor vector as a numpy array. This returns None of there is no vector stored in this container. :rtype: numpy.ndarray or None """ q_select = self.SELECT_TMPL.format( **{ "binary_col": self.binary_col, "table_name": self.table_name, "type_col": self.type_col, "uuid_col": self.uuid_col, }) q_select_values = { "type_val": self.type(), "uuid_val": str(self.uuid()) } # query execution callback # noinspection PyProtectedMember def cb(cursor): # type: (psycopg2._psycopg.cursor) -> None cursor.execute(q_select, q_select_values) # This should only fetch a single row. Cannot yield more than one due # use of primary keys. psql_helper = self._get_psql_helper() r = list(psql_helper.single_execute(cb, yield_result_rows=True)) if not r: return None else: b = r[0][0] v = numpy.frombuffer(b, self.ARRAY_DTYPE) return v def set_vector(self, new_vec): """ Set the contained vector. If this container already stores a descriptor vector, this will overwrite it. If we are configured to use caching, and one has not been cached yet, then we cache the vector and start a thread to monitor access times and to remove the cache if the access timeout has expired. If a vector was already cached, this new vector replaces the old one, the vector database-side is replaced, and the cache expiry timeout is reset. :raises ValueError: ``new_vec`` was not a numpy ndarray. :param new_vec: New vector to contain. This must be a numpy array. :type new_vec: numpy.ndarray :returns: Self. :rtype: PostgresDescriptorElement """ if not isinstance(new_vec, numpy.ndarray): new_vec = numpy.copy(new_vec) if new_vec.dtype != self.ARRAY_DTYPE: try: new_vec = new_vec.astype(self.ARRAY_DTYPE) except TypeError: raise ValueError("Could not convert input to a vector of type " "%s." % self.ARRAY_DTYPE) q_upsert = self.UPSERT_TMPL.strip().format( **{ "table_name": self.table_name, "binary_col": self.binary_col, "type_col": self.type_col, "uuid_col": self.uuid_col, }) q_upsert_values = { "binary_val": psycopg2.Binary(new_vec), "type_val": self.type(), "uuid_val": str(self.uuid()), } # query execution callback # noinspection PyProtectedMember def cb(cursor): # type: (psycopg2._psycopg.cursor) -> None cursor.execute(q_upsert, q_upsert_values) # No return but need to force iteration. psql_helper = self._get_psql_helper() list(psql_helper.single_execute(cb, yield_result_rows=False)) return self
def _create_psql_helper(cls, db_name, db_host, db_port, db_user, db_pass, table_name, uuid_col, type_col, binary_col, itersize=1000, create_table=True): """ Internal helper function for creating PSQL connection helpers for class instances. :param db_name: The name of the database to connect to. :type db_name: str :param db_host: Host address of the Postgres server. If None, we assume the server is on the local machine and use the UNIX socket. This might be a required field on Windows machines (not tested yet). :type db_host: str | None :param db_port: Port the Postgres server is exposed on. If None, we assume the default port (5423). :type db_port: int | None :param db_user: Postgres user to connect as. If None, postgres defaults to using the current accessing user account name on the operating system. :type db_user: str | None :param db_pass: Password for the user we're connecting as. This may be None if no password is to be used. :type db_pass: str | None :param table_name: String label of the database table to use. :type table_name: str :param uuid_col: The column label for descriptor UUID storage :type uuid_col: str :param type_col: The column label for descriptor type string storage. :type type_col: str :param binary_col: The column label for descriptor vector binary storage. :type binary_col: str :param itersize: Number of records fetched per network round trip when iterating over a named cursor. This parameter only does anything if a named cursor is used. :type itersize: int :param create_table: Whether to try to create the storing table before returning the connection helper. If the configured user does not have sufficient permissions to create the table and it does not currently exist, an exception will be raised. :type create_table: bool :return: PsqlConnectionHelper utility. :rtype: PsqlConnectionHelper """ helper = PsqlConnectionHelper( db_name, db_host, db_port, db_user, db_pass, itersize=itersize, table_upsert_lock=PSQL_TABLE_CREATE_RLOCK) if create_table: helper.set_table_upsert_sql( cls.UPSERT_TABLE_TMPL.format(table_name=table_name, type_col=type_col, uuid_col=uuid_col, binary_col=binary_col)) return helper