def test_from_conn_not_registered():
    """ Tests helpful error message on attempt to choose unregistered conn type. """
    fake_conn = Mock()
    fake_conn.__class__ = "Not a real class"

    expected_msg = r'Unsupported connection type.*'
    with pytest.raises(ETLHelperHelperError, match=expected_msg):
        DB_HELPER_FACTORY.from_conn(fake_conn)
def test_from_db_params_not_registered():
    """ Tests helpful error message on attempt to choose unregistered db_params type. """
    fake_params = MagicMock(DbParams)
    fake_params.dbtype = 'Not a real type'

    expected_msg = r'Unsupported DbParams.dbtype.*'
    with pytest.raises(ETLHelperHelperError, match=expected_msg):
        DB_HELPER_FACTORY.from_db_params(fake_params)
def test_sqlalchemy_conn_string(monkeypatch, db_params, expected):
    """ Check the SQLAlchemy connection string built from db_params. """
    # Password is read from the environment, not stored in db_params
    monkeypatch.setenv('DB_PASSWORD', 'mypassword')

    helper = DB_HELPER_FACTORY.from_db_params(db_params)
    result = helper.get_sqlalchemy_connection_string(db_params, 'DB_PASSWORD')

    assert result == expected
def validate_params(self):
    """
    Validate database parameters.

    Should validate that a dbtype is a valid one and that the appropriate
    params have been passed for a particular db_type.

    :raises ETLHelperParamsError: Error if params are invalid
    """
    # Compare the attributes that were supplied against the required ones.
    given = set(self.keys())

    try:
        helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
    except ETLHelperHelperError:
        msg = f'{self.dbtype} not in valid types ({DB_HELPER_FACTORY.helpers.keys()})'
        # from None suppresses lower errors in the stack trace
        # Deeper error is recorded in ETLHelperDbParamsError.__context__
        raise ETLHelperDbParamsError(msg) from None
    required_params = helper.required_params

    # Required parameters that were not supplied
    unset_params = (given ^ required_params) & required_params
    if unset_params:
        msg = f'{unset_params} not set. Required parameters are {required_params}'
        raise ETLHelperDbParamsError(msg)

    # Anything beyond required_params plus dbtype is rejected
    valid_params = required_params | {'dbtype'}
    bad_params = given ^ valid_params
    if bad_params:
        msg = f"Invalid parameter(s): {bad_params}"
        raise ETLHelperDbParamsError(msg)
def execute(query, conn, parameters=()):
    """
    Run SQL query against connection.

    :param query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    """
    logger.info("Executing query")
    logger.debug(f"Executing:\n\n{query}\n\nwith parameters:\n\n"
                 f"{parameters}\n\nagainst\n\n{conn}")

    helper = DB_HELPER_FACTORY.from_conn(conn)

    with helper.cursor(conn) as cursor:
        try:
            # Run query and commit in one transaction
            cursor.execute(query, parameters)
            conn.commit()
        except helper.sql_exceptions as exc:
            # Even though we haven't modified data, we have to rollback to
            # clear the failed transaction before any others can be started.
            conn.rollback()
            msg = (f"SQL query raised an error.\n\n{query}\n\n"
                   f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
            raise ETLHelperQueryError(msg)
def from_environment(cls, prefix='ETLHelper_'):
    """
    Create DbParams object from parameters specified by environment
    variables e.g. ETLHelper_dbtype, ETLHelper_host, ETLHelper_port, etc.
    :param prefix: str, prefix to environment variable names
    :raises ETLHelperDbParamsError: if the dbtype variable is not set
    """
    dbparams_keys = [key for key in os.environ if key.startswith(prefix)]
    # Strip the prefix from the *start* of the name only.  str.replace
    # would also remove any accidental occurrence of the prefix elsewhere
    # in the variable name (e.g. 'ETLHelper_my_ETLHelper_host').
    dbparams_from_env = {key[len(prefix):].lower(): os.environ[key]
                         for key in dbparams_keys}

    # Ensure dbtype has been set
    dbtype_var = f'{prefix}dbtype'
    dbtype = dbparams_from_env.get('dbtype', None)
    if dbtype is None:
        msg = f"{dbtype_var} environment variable is not set"
        raise ETLHelperDbParamsError(msg)

    # Only include the required params
    # This prevents something like ETLHelper_password being added
    required_params = DB_HELPER_FACTORY.from_dbtype(
        dbtype).required_params | {'dbtype'}
    # NOTE(review): a required param absent from the environment raises a
    # bare KeyError here; consider wrapping in ETLHelperDbParamsError.
    dbparams_from_env = {key: dbparams_from_env[key]
                         for key in required_params}

    return cls(**dbparams_from_env)
def test_from_dbparams(dbtype_keyword, expected_helper):
    """ Tests correct helper produced given a db params object """
    mock_params = MagicMock(DbParams)
    mock_params.dbtype = dbtype_keyword

    result = DB_HELPER_FACTORY.from_db_params(mock_params)

    assert isinstance(result, expected_helper)
def executemany(query, rows, conn, commit_chunks=True):
    """
    Use query to insert/update data from rows to database at conn.  This
    method uses the executemany or execute_batch (PostgreSQL) commands to
    process the data in chunks and avoid creating a new database connection
    for each row.  Row data are passed as parameters into query.

    commit_chunks controls whether the transaction should be committed after
    each chunk has been inserted.  Committing chunks means that errors during
    a long-running insert do not require all data to be loaded again.  The
    disadvantage is that investigation may be required to determine exactly
    which records have been successfully transferred.

    :param query: str, SQL insert command with placeholders for data
    :param rows: List of tuples containing data to be inserted/updated
    :param conn: dbapi connection
    :param commit_chunks: bool, commit after each chunk has been inserted/updated
    :return row_count: int, number of rows inserted/updated
    :raises ETLHelperInsertError: if the query fails against the database
    """
    logger.info(f"Executing many (chunksize={CHUNKSIZE})")
    logger.debug(f"Executing:\n\n{query}\n\nagainst\n\n{conn}")

    helper = DB_HELPER_FACTORY.from_conn(conn)
    processed = 0

    with helper.cursor(conn) as cursor:
        for chunk in _chunker(rows, CHUNKSIZE):
            # Run query
            try:
                # Chunker pads to whole chunk with None; remove these
                chunk = [row for row in chunk if row is not None]

                # Show first row as example of data
                if processed == 0:
                    logger.debug(f"First row: {chunk[0]}")

                # Execute query
                helper.executemany(cursor, query, chunk)
                processed += len(chunk)

            except helper.sql_exceptions as exc:
                # Rollback to clear the failed transaction before any others
                # can be started.
                conn.rollback()
                msg = f"SQL query raised an error.\n\n{query}\n\n{exc}\n"
                raise ETLHelperInsertError(msg)

            logger.info(f'{processed} rows processed')

            # Commit changes so far
            if commit_chunks:
                conn.commit()

    # Commit changes where not already committed
    if not commit_chunks:
        conn.commit()

    logger.info(f'{processed} rows processed in total')
def test_from_conn(expected_helper, db_class):
    """ Tests correct helper produced given a conn object """
    fake_conn = Mock()
    fake_conn.__class__ = db_class

    result = DB_HELPER_FACTORY.from_conn(fake_conn)

    assert isinstance(result, expected_helper)
def get_connection_string(db_params, password_variable):
    """
    Get a connection string

    :param db_params: DbParams object or similar with appropriate attributes
    :param password_variable: str, name of environment variable with password
    :return: str, Connection string
    """
    # Delegate to the helper registered for this database type
    return DB_HELPER_FACTORY.from_db_params(db_params).get_connection_string(
        db_params, password_variable)
def __setattr__(self, item, value):
    """ Set an attribute, rejecting names that are not valid parameters. """
    # dbtype determines which other parameters are required, so it must
    # always be accepted as a name itself.
    valid_params = DB_HELPER_FACTORY.from_dbtype(self.dbtype).required_params | {'dbtype'}

    if item not in valid_params:
        raise AttributeError(
            f"'{item}' is not a valid DbParams attribute: {valid_params}")

    self[item] = value
def test_connect(monkeypatch, db_params, driver, expected):
    """ Check that the driver's connect function receives the expected string. """
    # Arrange
    monkeypatch.setenv('DB_PASSWORD', 'mypassword')
    fake_connect = Mock()
    monkeypatch.setattr(driver, 'connect', fake_connect)

    # Act
    DB_HELPER_FACTORY.from_db_params(db_params).connect(db_params, 'DB_PASSWORD')

    # Assert
    fake_connect.assert_called_with(expected)
def connect(db_params, password_variable=None, **kwargs):
    """
    Return database connection.

    :param db_params: DbParams object or similar with appropriate attributes
    :param password_variable: str, name of environment variable with password
    :param kwargs: connection specific keyword arguments e.g. row_factory
    :return: Connection object
    """
    helper = DB_HELPER_FACTORY.from_db_params(db_params)
    # Helpers will raise ETLHelperConnectionError if connection fails
    return helper.connect(db_params, password_variable, **kwargs)
def generate_insert_sql(table, row, conn):
    """Generate insert SQL for table, getting column names from row and the
    placeholder style from the connection.  `row` is either a namedtuple or
    a dictionary."""
    helper = DB_HELPER_FACTORY.from_conn(conn)
    placeholder_templates = {"qmark": "?",
                             "numeric": ":{number}",
                             "named": ":{name}",
                             "format": "%s",
                             "pyformat": "%({name})s"}

    if hasattr(row, 'keys'):
        # Dictionaries use a query with named placeholders
        paramstyle = helper.named_paramstyle
        if not paramstyle:
            msg = (
                f"Database connection ({str(conn.__class__)}) doesn't support named parameters.  "
                "Pass data as namedtuples instead.")
            raise ETLHelperInsertError(msg)

        columns = row.keys()
        placeholders = [placeholder_templates[paramstyle].format(name=c)
                        for c in columns]
    else:
        # Namedtuples use a query with positional placeholders
        paramstyle = helper.positional_paramstyle

        # Convert namedtuple to dictionary to easily access keys
        try:
            row = row._asdict()
        except AttributeError:
            msg = f"Row is not dictionary or namedtuple ({type(row)})"
            raise ETLHelperInsertError(msg)

        columns = row.keys()
        if paramstyle == "numeric":
            placeholders = [placeholder_templates[paramstyle].format(number=i + 1)
                            for i in range(len(columns))]
        else:
            placeholders = [placeholder_templates[paramstyle]] * len(columns)

    # NOTE: table and column names are interpolated directly into the SQL;
    # they must come from trusted sources.
    sql = f"INSERT INTO {table} ({', '.join(columns)}) VALUES ({', '.join(placeholders)})"
    return sql
def validate_params(self):
    """
    Validate database parameters.

    Should validate that a dbtype is a valid one and that the appropriate
    params have been passed for a particular db_type.

    :raises ETLHelperParamsError: Error if params are invalid
    """
    # Get a set of the attributes to compare against required attributes.
    given = set(self.keys())
    try:
        required_params = DB_HELPER_FACTORY.from_dbtype(self.dbtype).required_params
    except ETLHelperHelperError:
        msg = f'{self.dbtype} not in valid types ({DB_HELPER_FACTORY.helpers.keys()})'
        # from None suppresses lower errors in the stack trace, consistent
        # with the other validate_params implementation.  The deeper error
        # is still recorded in ETLHelperDbParamsError.__context__
        raise ETLHelperDbParamsError(msg) from None

    # Required parameters that were not supplied
    unset_params = (given ^ required_params) & required_params
    if unset_params:
        msg = f'{unset_params} not set. Required parameters are {required_params}'
        raise ETLHelperDbParamsError(msg)
def executemany(query, conn, rows, on_error=None, commit_chunks=True,
                chunk_size=CHUNKSIZE):
    """
    Use query to insert/update data from rows to database at conn.  This
    method uses the executemany or execute_batch (PostgreSQL) commands to
    process the data in chunks and avoid creating a new database connection
    for each row.  Row data are passed as parameters into query.

    Default behaviour is to raise an exception in the case of SQL errors such
    as primary key violations.  If the on_error parameter is specified, the
    exception will be caught then the rows of each chunk re-tried
    individually.  Further errors will be caught and appended to a list of
    (row, exception) tuples.  on_error is a function that is called at the
    end of each chunk, with the list as the only argument.

    commit_chunks controls whether the transaction should be committed after
    each chunk has been inserted.  Committing chunks means that errors during
    a long-running insert do not require all data to be loaded again.  The
    disadvantage is that investigation may be required to determine exactly
    which records have been successfully transferred.

    :param query: str, SQL insert command with placeholders for data
    :param conn: dbapi connection
    :param rows: List of tuples containing data to be inserted/updated
    :param on_error: Function to be applied to failed rows in each chunk
    :param commit_chunks: bool, commit after each chunk has been inserted/updated
    :param chunk_size: int, size of chunks to group data by
    :return row_count: int, number of rows inserted/updated
    :raises ETLHelperInsertError: if the query fails and no on_error is given
    """
    logger.info("Executing many (chunk_size=%s)", chunk_size)
    logger.debug("Executing:\n\n%s\n\nagainst\n\n%s", query, conn)

    helper = DB_HELPER_FACTORY.from_conn(conn)
    processed = 0
    failed = 0

    with helper.cursor(conn) as cursor:
        for chunk in _chunker(rows, chunk_size):
            # Run query
            try:
                # Chunker pads to whole chunk with None; remove these
                chunk = [row for row in chunk if row is not None]

                # Show first row as example of data
                if processed == 0:
                    # Lazy %-formatting for consistency with the other log
                    # calls in this function
                    logger.debug("First row: %s", chunk[0])

                # Execute query
                helper.executemany(cursor, query, chunk)

            except helper.sql_exceptions as exc:
                # Rollback to clear the failed transaction before any others
                # can be started.
                conn.rollback()

                # Collect and process failed rows if on_error function provided
                if on_error:
                    # Temporarily disable logging
                    old_level = logger.level
                    logger.setLevel(logging.ERROR)
                    try:
                        failed_rows = _execute_by_row(query, conn, chunk)
                    finally:
                        # Restore logging
                        logger.setLevel(old_level)

                    failed += len(failed_rows)
                    logger.debug("Calling on_error function on %s failed rows",
                                 failed)
                    on_error(failed_rows)
                else:
                    msg = (f"SQL query raised an error.\n\n{query}\n\n"
                           f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
                    raise ETLHelperInsertError(msg)

            processed += len(chunk)
            logger.info('%s rows processed (%s failed)', processed, failed)

            # Commit changes so far
            if commit_chunks:
                conn.commit()

    # Commit changes where not already committed
    if not commit_chunks:
        conn.commit()

    logger.info('%s rows processed in total', processed)
def iter_chunks(select_query, conn, parameters=(),
                row_factory=namedtuple_row_factory,
                transform=None, read_lob=False, chunk_size=CHUNKSIZE):
    """
    Run SQL query against connection and return iterator object to loop over
    results in batches of chunksize (default 5000).

    The row_factory changes the output format of the results.  Other row
    factories e.g. dict_row_factory are available.

    The transform function is applied to chunks of data as they are extracted
    from the database.

    The read_lob parameter will convert Oracle LOB objects to strings. It is
    required to access results of some Oracle Spatial functions.

    :param select_query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    :param row_factory: function that accepts a cursor and returns a function
        for parsing each row
    :param transform: function that accepts an iterable (e.g. list) of rows
        and returns an iterable of rows (possibly of different shape)
    :param read_lob: bool, convert Oracle LOB objects to strings
    :param chunk_size: int, size of chunks to group data by
    :raises ETLHelperExtractError: if the query fails against the database
    """
    logger.info("Fetching rows (chunk_size=%s)", chunk_size)
    logger.debug(f"Fetching:\n\n{select_query}\n\nwith parameters:\n\n"
                 f"{parameters}\n\nagainst\n\n{conn}")

    helper = DB_HELPER_FACTORY.from_conn(conn)
    with helper.cursor(conn) as cursor:
        # Run query
        try:
            cursor.execute(select_query, parameters)
        except helper.sql_exceptions as exc:
            # Even though we haven't modified data, we have to rollback to
            # clear the failed transaction before any others can be started.
            conn.rollback()
            msg = (f"SQL query raised an error.\n\n{select_query}\n\n"
                   f"Required paramstyle: {helper.paramstyle}\n\n{exc}\n")
            raise ETLHelperExtractError(msg)

        # Set row factory
        create_row = row_factory(cursor)

        # Parse results
        # first_pass distinguishes "query matched nothing" from "all rows
        # already yielded" when fetchmany returns an empty batch
        first_pass = True
        while True:
            rows = cursor.fetchmany(chunk_size)

            # No more rows to process
            if not rows:
                if first_pass:
                    msg = "No rows returned"
                else:
                    if cursor.rowcount == -1:
                        # SQLite3 driver doesn't support row count (always -1)
                        msg = "All rows returned"
                    else:
                        msg = f"{cursor.rowcount} rows returned"
                logger.info(msg)

                # Close the active transaction
                conn.commit()
                return

            # Convert Oracle LOBs to strings if required
            if read_lob:
                rows = _read_lob(rows)

            # Apply row_factory
            rows = (create_row(row) for row in rows)

            # Apply transform
            if transform:
                rows = transform(rows)

            # Return data
            yield rows

            first_pass = False
def iter_chunks(select_query, conn, parameters=(),
                row_factory=namedtuple_rowfactory,
                transform=None, read_lob=False):
    """
    Run SQL query against connection and return iterator object to loop over
    results in batches of etlhelper.etl.CHUNKSIZE (default 5000).

    The row_factory changes the output format of the results.  Other row
    factories e.g. dict_rowfactory are available.

    The transform function is applied to chunks of data as they are extracted
    from the database.

    The read_lob parameter will convert Oracle LOB objects to strings. It is
    required to access results of some Oracle Spatial functions.

    :param select_query: str, SQL query to execute
    :param conn: dbapi connection
    :param parameters: sequence or dict of bind variables to insert in the query
    :param row_factory: function that accepts a cursor and returns a function
        for parsing each row
    :param transform: function that accepts an iterable (e.g. list) of rows
        and returns an iterable of rows (possibly of different shape)
    :param read_lob: bool, convert Oracle LOB objects to strings
    :raises ETLHelperExtractError: if the query fails against the database
    """
    helper = DB_HELPER_FACTORY.from_conn(conn)
    with helper.cursor(conn) as cursor:
        # Run query
        try:
            cursor.execute(select_query, parameters)
        except helper.sql_exceptions as exc:
            # Even though we haven't modified data, we have to rollback to
            # clear the failed transaction before any others can be started.
            conn.rollback()
            msg = f"SQL query raised an error.\n\n{select_query}\n\n{exc}\n"
            raise ETLHelperExtractError(msg)

        # Set row factory
        create_row = row_factory(cursor)

        # Parse results
        while True:
            rows = cursor.fetchmany(CHUNKSIZE)

            # cursor.rowcount is number of records transferred from the server
            if cursor.rowcount == 0:
                # Use the module-level logger (not the root logger via
                # logging.debug) so library logging configuration applies
                logger.debug("iter_chunks: No records returned")
                return

            # No more rows to process
            if not rows:
                logger.debug(
                    f"iter_chunks: {cursor.rowcount} records returned")
                return

            # Convert Oracle LOBs to strings if required
            if read_lob:
                rows = _read_lob(rows)

            # Apply row_factory
            rows = (create_row(row) for row in rows)

            # Apply transform
            if transform:
                rows = transform(rows)

            # Return data
            yield rows
def test_from_db_params_bad_type():
    """ Non-DbParams objects are rejected with a helpful message. """
    expected_msg = r'Expected DbParams-like object.*'
    with pytest.raises(ETLHelperHelperError, match=expected_msg):
        DB_HELPER_FACTORY.from_db_params('some string')
def test_from_conn_bad_type():
    """ Non-connection objects are rejected with a helpful message. """
    expected_msg = r'Expected connection-like object.*'
    with pytest.raises(ETLHelperHelperError, match=expected_msg):
        DB_HELPER_FACTORY.from_conn('some string')
def paramstyle(self):
    """The DBAPI2 paramstyle attribute for database type"""
    helper = DB_HELPER_FACTORY.from_dbtype(self.dbtype)
    return helper.paramstyle