class s3connector(object):
    """Thin wrapper around a boto3 S3 client for simple writes, ranged
    reads, and deletes against the configured bucket. Methods return
    False/None on failure rather than raising."""

    def __init__(self):
        self.client = Session(
            config.s3_config['access_key'],
            config.s3_config['secret_key'],
            region_name=config.s3_config['region'],
        ).client('s3')

    def write(self, key, data):
        try:
            self.client.put_object(
                Bucket=config.s3_config['bucket'], Key=key, Body=data
            )
            return True
        except Exception:
            return False

    def read(self, key, start=None, end=None):
        try:
            if start is not None:
                _response = self.client.get_object(
                    Bucket=config.s3_config['bucket'],
                    Key=key,
                    Range="bytes=%d-%d" % (start, end),
                )
            else:
                _response = self.client.get_object(
                    Bucket=config.s3_config['bucket'], Key=key
                )
            return _response['Body'].read()
        except Exception:
            return None

    def delete(self, key):
        try:
            self.client.delete_object(Bucket=config.s3_config['bucket'], Key=key)
            return True
        except Exception:
            return False
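
# A minimal usage sketch (hypothetical key and payload), assuming
# config.s3_config is populated:
#
#     conn = s3connector()
#     conn.write("tmp/example.txt", b"hello")
#     head = conn.read("tmp/example.txt", start=0, end=4)  # first five bytes
#     conn.delete("tmp/example.txt")
#
# Each call returns False/None on failure rather than raising.
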
class TableConfig(object):
    """Base class for a specific table.

    This class will have all methods that are engine agnostic -- methods that
    only act with the host server or the data warehouse. All methods that
    interact with a specific engine should raise a NotImplementedError here
    and be overwritten in a subclass.

    Parameters
    ----------
    database_alias : str
        Config file name of the database
    db_connection_params : db.ConnectionParams
        DB connection parameters derived from a parsed connection string
    destination_table_name : str
        The name of the table where data should be loaded in the data
        warehouse
    destination_schema_name : str
        The name of the schema where data should be loaded in the data
        warehouse. Note: this should be "public" unless there's a good reason
        to segregate data.
    source_table_name : str
        name of the table in the source database. If a query_file is
        provided, this is purely for logging and monitoring
    query_file : str, optional
        path of a fully qualified SQL file that contains all the logic needed
        to extract a table.
    columns_to_drop : list of str, optional
        Defined by the YAML file, a list of columns that should not be
        imported into the data warehouse
    distribution_key : str, optional
        destination column name to use as the Redshift distkey
    sort_keys : list of str, optional
        destination column names to use as the Redshift sortkeys
    append_only : bool, optional
        Indicates that rows should only be inserted to this table and never
        updated or deleted. If True, primary_key has no effect.
        Default: False
    primary_key : str or list(str), optional
        Columns used to match records when updating the destination table.
        If not provided, primary keys are inferred from the source table.
        Has no effect if append_only is True.
        Default: None
    index_column : str, optional
        Column used to identify new or updated rows in the source table.
        Persisted in the index table.
    index_sql : str, optional
        Custom SQL to be run against the source DB to find the current max
        index. Standard templating for the table applies and Druzhba expects
        the index to be returned in a column called `index_value`. Overrides
        index_column.
    truncate_file : str, optional
        Path to a fully qualified SQL file that contains logic to truncate
        the table when full-refreshing. Required to pass --full-refresh if a
        table is defined by a query_file.
    full_refresh : boolean, optional
        flag that forces a full deletion of a table prior to loading new
        data, rather than deleting only on matched PKs. Setting True will
        conflict with index_column.
    rebuild : boolean, optional
        flag to rebuild the table completely. Implies full_refresh.
        Incompatible with query_file and certain conditions
        per-database-driver.
    type_map : dict, optional
        override type conversion from the source DB to Redshift. This is set
        by YAML, and is an empty dict if no configuration is provided.
        ex: {'tinyint(1)': 'smallint',
             'char(35)': 'varchar(70)',
             'bigint(20) unsigned': 'bigint'}
    include_comments : boolean, optional
        flag to specify whether or not to ingest table and column comments
        when building or rebuilding the target table.

    Attributes
    ----------
    columns : list of str
        All columns read from the source table
    foreign_keys : list of str
        generated from the `create table` syntax, a list of foreign key
        relationships to create for a table after all tables are created
    pks : list of str
        generated from the `create table` syntax, a list of source column
        names that define the PK
    key_name : str
        Once the dump has happened, the file is uploaded to s3 at this key
        location
    old_index_value : str
        Used in the `where` clause, this is the most recent index value for a
        given table currently in the data warehouse
    new_index_value : str
        Used in the `where` clause, this is the max index value for a given
        table currently in the source database.
    data : dict
        Read from the table definition in the yaml file to supply data to the
        Jinja templating
    db_template_data : dict
        Read from the db definition in the yaml file to supply data to the
        Jinja templating

    Notes
    -----
    All parameters are also set as attributes
    """

    DESTINATION_TABLE_OK = "ok"
    DESTINATION_TABLE_REBUILD = "rebuild"
    DESTINATION_TABLE_DNE = "non-existent"
    DESTINATION_TABLE_INCORRECT = "incorrect"

    max_file_size = 100 * 1024 ** 2

    def __init__(
        self,
        database_alias,
        db_connection_params,
        destination_table_name,
        destination_schema_name,
        source_table_name,
        index_schema,
        index_table,
        query_file=None,
        distribution_key=None,
        sort_keys=None,
        index_column=None,
        index_sql=None,
        truncate_file=None,
        columns_to_drop=None,
        type_map=None,
        primary_key=None,
        not_null_date=False,
        full_refresh=False,
        rebuild=False,
        schema_file=None,
        data=None,
        append_only=False,
        db_template_data=None,
        include_comments=True,
    ):
        self.database_alias = database_alias
        self.db_host = db_connection_params.host
        self.db_port = db_connection_params.port
        self.db_name = db_connection_params.name
        self.db_user = db_connection_params.user
        self.db_password = db_connection_params.password
        self.db_additional_parameters = db_connection_params.additional
        self._columns = None
        self.columns_to_drop = columns_to_drop or []
        self.destination_table_name = destination_table_name
        self.destination_schema_name = destination_schema_name
        self.query_file = query_file
        self.schema_file = schema_file
        self.distribution_key = distribution_key
        self.sort_keys = sort_keys
        self.source_table_name = source_table_name
        self.index_column = index_column
        self.index_sql = index_sql
        self.truncate_file = truncate_file
        self.append_only = append_only
        self.full_refresh = full_refresh
        self.rebuild = rebuild
        self.type_map = self._clean_type_map(type_map)
        self.primary_key = (
            [primary_key] if isinstance(primary_key, str) else primary_key
        )
        self.not_null_date = not_null_date
        self.foreign_keys = []
        self.pks = []
        self._old_index_value = "notset"
        self._new_index_value = "notset"
        self._destination_table_status = None
        self.table_template_data = data
        self.db_template_data = db_template_data
        self.index_schema = index_schema
        self.index_table = index_table
        self.include_comments = include_comments

        self.date_key = datetime.datetime.strftime(
            datetime.datetime.utcnow(), "%Y%m%dT%H%M%S"
        )

        self.row_count = None
        self.upload_size = 0
        self.starttime = None
        self.endtime = None
        self.rows_inserted = None
        self.rows_deleted = None
        self.num_data_files = 0
        self.manifest_mode = False

        self.logger = logging.getLogger(
            f"druzhba.{database_alias}.{source_table_name}"
        )
        self.s3 = Session().client("s3")

    @classmethod
    def _clean_type_map(cls, type_map):
        if not type_map:
            return {}
        # Build a new dict rather than mutating the one being iterated over,
        # which would raise a RuntimeError when a lower-cased key is new.
        return {k.lower(): v for k, v in type_map.items()}
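
    # type_map sketch (hypothetical values): a YAML mapping such as
    #
    #     type_map:
    #       "TINYINT(1)": smallint
    #       "char(35)": varchar(70)
    #
    # is normalized by _clean_type_map to lower-cased source types, e.g.
    # {"tinyint(1)": "smallint", "char(35)": "varchar(70)"}, so lookups
    # against source column types are case-insensitive.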

    @classmethod
    def validate_yaml_configuration(cls, yaml_config):
        """Validate YAML configuration. Note that this can differ slightly
        from runtime config, since full_refresh may be forced even when it
        would fail these checks.
        """
        table = yaml_config["source_table_name"]
        index_column = yaml_config.get("index_column")
        index_sql = yaml_config.get("index_sql")
        append_only = yaml_config.get("append_only")
        full_refresh = yaml_config.get("full_refresh")
        primary_key = yaml_config.get("primary_key")
        query_file = yaml_config.get("query_file")
        schema_file = yaml_config.get("schema_file")

        has_incremental_index = index_column or index_sql

        if not has_incremental_index and append_only:
            raise ConfigurationError("Append_only without incremental index", table)
        if full_refresh and append_only:
            raise ConfigurationError("Append_only with full_refresh", table)
        elif not has_incremental_index and not full_refresh:
            raise ConfigurationError(
                "Incremental update with no specified index", table
            )
        elif index_column and full_refresh:
            raise ConfigurationError("Full refresh with index_column", table)
        elif index_sql and full_refresh:
            raise ConfigurationError("Full refresh with index_sql", table)
        elif index_sql and index_column:
            raise ConfigurationError("index_sql and index_column", table)
        elif query_file and not primary_key and not append_only and not full_refresh:
            raise ConfigurationError(
                "incremental query_file without primary_key", table
            )
        elif query_file and not os.path.isfile(os.path.join(CONFIG_DIR, query_file)):
            raise ConfigurationError("nonexistent query_file", table)
        elif schema_file and not os.path.isfile(os.path.join(CONFIG_DIR, schema_file)):
            raise ConfigurationError("nonexistent schema_file", table)

    def validate_runtime_configuration(self):
        """Validate this instance's configuration state, which can differ
        from yaml configurations allowed by validate_yaml_configuration and
        runs after connecting to the database.
        """
        if self.rebuild and self.truncate_file:
            msg = (
                "Cannot rebuild a table with a truncate_file "
                "because it would not be correct to drop the table."
            )
            raise ConfigurationError(msg, self.source_table_name)
        elif self.rebuild and self.schema_file:
            msg = (
                "Cannot rebuild a table with a schema file, need "
                "support for passing in the table name to create."
            )
            raise ConfigurationError(msg, self.source_table_name)
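
    # A minimal sketch (hypothetical names) of a table entry that passes
    # validate_yaml_configuration -- an incremental load keyed on an index
    # column:
    #
    #     source_table_name: orders
    #     destination_table_name: orders
    #     index_column: updated_at
    #     primary_key:
    #       - id
    #
    # Append-only and full-refresh tables relax or replace the index /
    # primary key requirements per the checks above.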

    @property
    def s3_key_prefix(self):
        return "{}/{}.{}.{}".format(
            get_redshift().s3_config.prefix,
            self.database_alias,
            self.source_table_name,
            self.date_key,
        )

    def single_s3_data_key(self):
        """Returns the S3 path to upload a single avro file to"""
        if self.manifest_mode:
            raise TableStateError(
                "Attempted to treat a manifest upload as a single file"
            )
        return "{}.avro".format(self.s3_key_prefix)

    def manifest_s3_data_key(self):
        if not self.manifest_mode:
            raise TableStateError(
                "Attempted to treat a single file upload as a manifest"
            )
        return "{}.manifest".format(self.s3_key_prefix)

    def numbered_s3_data_key(self, file_num):
        return "{}/{:05d}.avro".format(self.s3_key_prefix, file_num)

    def next_s3_data_file_key(self):
        if self.manifest_mode:
            return self.numbered_s3_data_key(self.num_data_files)
        else:
            return self.single_s3_data_key()

    @property
    def copy_target_key(self):
        if self.manifest_mode:
            return self.manifest_s3_data_key()
        else:
            return self.single_s3_data_key()

    @property
    def copy_target_url(self):
        return _s3_url(get_redshift().s3_config.bucket, self.copy_target_key)

    def data_file_keys(self):
        if self.manifest_mode:
            for fn in range(self.num_data_files):
                yield self.numbered_s3_data_key(fn)
        else:
            yield self.single_s3_data_key()
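
    # Key layout sketch (hypothetical prefix/alias/table/date_key values):
    # with prefix "druzhba", alias "mydb", table "orders", and date_key
    # "20240101T000000", the helpers above produce keys like
    #     single file:   druzhba/mydb.orders.20240101T000000.avro
    #     manifest:      druzhba/mydb.orders.20240101T000000.manifest
    #     data file #3:  druzhba/mydb.orders.20240101T000000/00003.avro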

    @property
    def connection_vars(self):
        raise NotImplementedError

    @property
    def avro_type_map(self):
        raise NotImplementedError

    def get_query_from_file(self):
        env = Environment(
            loader=FileSystemLoader(os.path.join(CONFIG_DIR)),
            autoescape=select_autoescape(["sql"]),
            undefined=StrictUndefined,
        )
        template = env.get_template(self.query_file)
        return template.render(
            db=self.db_template_data,
            table=self.table_template_data,
            run=self.run_template_data,
        )

    def get_sql_description(self, sql):
        raise NotImplementedError

    def get_query_sql(self):
        if self.query_file:
            return self.get_query_from_file()
        else:
            return self._get_query_sql() + self.where_clause()

    def _get_query_sql(self):
        raise NotImplementedError

    @property
    def columns(self):
        if not self._columns:
            _table_attributes, columns = self.get_sql_description(self.get_query_sql())
            self._columns = [column[0] for column in columns]
        return self._columns

    def query(self, sql):
        raise NotImplementedError

    def query_fetchone(self, sql):
        results = self.query(sql)
        return next(results)

    def _check_index_values(self):
        """Returns true if new index is greater than old index and defined"""
        if self.full_refresh:
            if self.index_column or self.index_sql:
                msg = (
                    "Index was found, but %s was forced. "
                    "Old index value will be ignored, but new "
                    "index value will be recorded."
                )
                self.logger.info(msg, "rebuild" if self.rebuild else "full-refresh")
            return True

        if self.append_only:
            return True

        # If there is no previous index, we're fine
        if self.old_index_value is None:
            return True

        # There's an old index but can't load a new value.
        if self.new_index_value is None and self.old_index_value is not None:
            msg = "Index expected but not found. Last value was %s. Dumping full table"
            self.logger.warning(msg, self.old_index_value)
            return False

        try:
            # old_index_value comes in as a unicode, new_index_value as the
            # sql type
            if isinstance(self.new_index_value, int):
                is_inverted = int(self.old_index_value) > self.new_index_value
            elif isinstance(self.new_index_value, datetime.datetime):
                old_index_dt = datetime.datetime.strptime(
                    self.old_index_value, "%Y-%m-%d %H:%M:%S.%f"
                )
                is_inverted = old_index_dt > self.new_index_value
            else:
                self.logger.warning(
                    "Unknown type %s for index %s",
                    type(self.new_index_value),
                    self.old_index_value,
                )
                return False
        except (ValueError, TypeError) as ex:
            self.logger.warning("Could not check index: %s", str(ex))
            return False

        if is_inverted:
            self.logger.warning(
                "Index value has decreased for table %s.%s. "
                "May need to do full refresh",
                self.db_name,
                self.source_table_name,
            )

        return not is_inverted

    def row_generator(self):
        sql = self.get_query_sql()
        self._check_index_values()

        self.logger.info(
            "Extracting %s table %s", self.db_name, self.source_table_name
        )
        self.logger.debug("Running SQL: %s", sql)
        return self.query(sql)

    @property
    def run_template_data(self):
        return {
            "destination_schema_name": self.destination_schema_name,
            "destination_table_name": self.destination_table_name,
            "db_name": self.db_name,
            "source_table_name": self.source_table_name,
            "index_column": self.index_column,
            "new_index_value": self.new_index_value,
            "old_index_value": self.old_index_value,
        }

    def _load_old_index_value(self):
        """Sets and gets the index_value property, retrieved from Redshift

        Returns
        -------
        index_value : variable
            Since index_value can vary from table to table, this can be many
            different types. Most common will be a datetime or int, but could
            also be a date or string. Returns None if no previous index value
            found
        """
        query = f"""
        SELECT index_value
        FROM "{self.index_schema}"."{self.index_table}"
        WHERE datastore_name = %s
          AND database_name = %s
          AND table_name = %s
        ORDER BY created_ts DESC
        LIMIT 1;
        """

        self.logger.debug("Querying Redshift for last updated index")
        with get_redshift().cursor() as cur:
            cur.execute(
                query, (self.database_alias, self.db_name, self.source_table_name)
            )
            index_value = cur.fetchone()

        if index_value is None:
            self.logger.info(
                "No index found. Dumping entire table: %s.", self.source_table_name
            )
            return index_value
        else:
            self.logger.info("Index found: %s", index_value[0])
            return index_value[0]
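
    # The index table queried above is assumed to expose at least the columns
    # referenced here: datastore_name, database_name, table_name, index_value,
    # created_ts. A row recorded after a pull of "orders" (hypothetical
    # values) might look like:
    #     ('mydb', 'mydb_prod', 'orders', '2024-01-01 00:00:00.000000', <created_ts>)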

    @property
    def old_index_value(self):
        if self.full_refresh:
            return None

        # we use 'notset' rather than None because None is a valid output
        if self._old_index_value == "notset":
            self._old_index_value = self._load_old_index_value()

        return self._old_index_value

    def _load_new_index_value(self):
        # Abstract to support DB-specific quoting
        raise NotImplementedError

    @property
    def new_index_value(self):
        # we use 'notset' rather than None because None is a valid output
        if self._new_index_value == "notset":
            if self.index_sql:
                env = Environment(
                    autoescape=select_autoescape(["sql"]), undefined=StrictUndefined
                )
                template = env.from_string(self.index_sql)
                query = template.render(
                    db=self.db_template_data,
                    table=self.table_template_data,
                    # Cannot include the run template data
                    # here because we do not know the index values yet
                )
                self._new_index_value = self.query_fetchone(query)["index_value"]
            elif self.index_column:
                self._new_index_value = self._load_new_index_value()
            else:
                self._new_index_value = None

        if self.query_file and self.index_sql and self._new_index_value is None:
            # Handles a special case where the index_sql query returns no rows
            # but the custom sql file is expecting both old and new index values
            return 0

        return self._new_index_value

    def where_clause(self):
        """Method for filtering get_data, if tables are able to be sliced on
        some index value.

        Returns
        -------
        str
            valid SQL featuring just the WHERE clause
        """
        where_clause = "\nWHERE "
        if not self.index_column or self.full_refresh:
            # If no index_column, there is no where clause. The whole
            # source table is dumped.
            return ""

        if self.new_index_value is None:
            # Either the table is empty or the index_column is all NULL
            return ""

        if self.old_index_value:
            # This should always happen except on the initial load
            where_clause += "{} > '{}' AND ".format(
                self.index_column, self.old_index_value
            )

        where_clause += "{} <= '{}'".format(self.index_column, self.new_index_value)
        return where_clause
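
    # where_clause() sketch (hypothetical column and index values): with
    # index_column "updated_at", an old index of '2024-01-01 00:00:00.000000'
    # and a new index of '2024-01-02 00:00:00.000000', the extract query is
    # suffixed with
    #     WHERE updated_at > '2024-01-01 00:00:00.000000'
    #       AND updated_at <= '2024-01-02 00:00:00.000000'
    # On the initial load (no old index) only the upper bound is applied.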

    def get_destination_table_columns(self):
        query = """
        SELECT "column"
        FROM pg_table_def
        WHERE schemaname = %s
          AND tablename = %s;
        """
        with get_redshift().cursor() as cur:
            self.set_search_path(cur)
            cur.execute(
                query, (self.destination_schema_name, self.destination_table_name)
            )
            results = cur.fetchall()

        return [x[0] for x in results]

    def get_destination_table_status(self):
        """Queries the data warehouse to determine if the desired destination
        table exists and if so, if it matches the expected configuration.

        Returns
        -------
        str
            Representing our plan for what to do with the destination table.
            Includes:
            - DESTINATION_TABLE_DNE -> build it if possible
            - DESTINATION_TABLE_REBUILD -> rebuild it
            - DESTINATION_TABLE_OK -> leave it
            - DESTINATION_TABLE_INCORRECT -> error
        """
        dw_columns = set(self.get_destination_table_columns())
        source_columns = set(self.columns)
        expected = source_columns - set(self.columns_to_drop)

        unexpected_dw_columns = dw_columns - expected
        unexpected_source_columns = expected - dw_columns

        if len(dw_columns) == 0:
            self.logger.info("Destination table does not exist.")
            return self.DESTINATION_TABLE_DNE
        elif self.rebuild:
            # We're rebuilding it so we don't care if it's right,
            # so exit before we log any errors
            self.logger.info("Attempting to rebuild destination table.")
            return self.DESTINATION_TABLE_REBUILD
        elif dw_columns == expected:
            return self.DESTINATION_TABLE_OK
        elif len(unexpected_dw_columns) > 0:
            msg = (
                "Columns exist in the warehouse table that are not in "
                "the source: `%s`"
            )
            self.logger.warning(msg, "`, `".join(unexpected_dw_columns))
            return self.DESTINATION_TABLE_INCORRECT
        elif len(unexpected_source_columns) > 0:
            msg = (
                "Columns exist in the source table that are not in the "
                "warehouse. Skipping column(s): `%s`"
            )
            self.logger.warning(msg, "`, `".join(unexpected_source_columns))
            # Copy from avro will just ignore the extra columns so we can proceed
            return self.DESTINATION_TABLE_OK
        else:
            raise RuntimeError("Unhandled case in get_destination_table_status")

    def query_description_to_avro(self, sql):
        _table_attributes, columns = self.get_sql_description(sql)

        fields = []
        for col_desc in columns:
            col_name = col_desc[0]
            schema = {"name": col_name}

            try:
                col_type = col_desc[1].split("(")[0]
            except AttributeError:
                col_type = col_desc[1]

            if col_type in self.avro_type_map["string"]:
                schema["type"] = ["null", "string"]
            elif col_type in self.avro_type_map["int"]:
                schema["type"] = ["null", "int"]
            elif col_type in self.avro_type_map["double"]:
                schema["type"] = ["null", "double"]
            elif col_type in self.avro_type_map["long"]:
                schema["type"] = ["null", "long"]
            elif col_type in self.avro_type_map["boolean"]:
                schema["type"] = ["null", "boolean"]
            elif col_type in self.avro_type_map["decimal"]:
                # fastavro now supports decimal types, but Redshift does not
                schema["type"] = ["null", "string"]
            else:
                self.logger.warning(
                    "unmatched data type for column %s in %s table %s",
                    col_desc[0],
                    self.db_name,
                    self.source_table_name,
                )
                schema["type"] = ["null", "string"]

            fields.append(schema)

        return fields
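
    # query_description_to_avro() sketch (hypothetical source types, assuming
    # a driver subclass maps them via avro_type_map): a description like
    #     [("id", "bigint(20)"), ("note", "varchar(255)")]
    # becomes the Avro fields
    #     [{"name": "id", "type": ["null", "long"]},
    #      {"name": "note", "type": ["null", "string"]}]
    # with any unmatched type falling back to ["null", "string"].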

    def set_last_updated_index(self):
        """Adds a new index to the pipeline_table_index table for updated
        tables
        """
        if self.new_index_value is None:
            return

        query = f"""
        INSERT INTO "{self.index_schema}"."{self.index_table}"
        VALUES (%s, %s, %s, %s)
        """

        if isinstance(self.new_index_value, int):
            new_index_value = str(self.new_index_value)
        elif isinstance(self.new_index_value, datetime.datetime):
            new_index_value = self.new_index_value.strftime("%Y-%m-%d %H:%M:%S.%f")
        else:
            msg = "Don't know how to handle index {} of type {}".format(
                self.new_index_value, str(type(self.new_index_value))
            )
            raise TypeError(msg)

        self.logger.info("Updating index table")
        with get_redshift().cursor() as cur:
            args = (
                self.database_alias,
                self.db_name,
                self.source_table_name,
                new_index_value,
            )
            self.logger.debug(cur.mogrify(query, args))
            cur.execute(query, args)

    def create_table_keys(self, distkey=None, sortkeys=None):
        output = ""
        distkey = distkey or self.distribution_key
        if distkey:
            output += "distkey({})\n".format(distkey)

        sortkeys = sortkeys or self.sort_keys
        if sortkeys:
            output += "compound " if len(sortkeys) > 1 else ""
            output += "sortkey({})\n".format(",".join(sortkeys))

        return output
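
    # create_table_keys() sketch (hypothetical columns): with
    # distribution_key "user_id" and sort_keys ["created_at", "id"], the
    # fragment appended to the CREATE TABLE statement is
    #     distkey(user_id)
    #     compound sortkey(created_at,id)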

    def query_to_redshift_create_table(self, sql, table_name):
        raise NotImplementedError

    def check_destination_table_status(self):
        """Get the source table schema, convert it to Redshift compatibility,
        and ensure the table exists as expected in the data warehouse.

        Sets self._destination_table_status if we can proceed, or raises if
        not.

        Raises
        ------
        InvalidSchemaError
            Raised when the target table has columns not recognized in the
            source table, and we're not rebuilding the table. (Unrecognized
            source columns will log but still proceed)
        MigrationError
            Raised when the target table needs to be created or rebuilt but
            cannot be done automatically.
        """
        self.logger.info("Getting CREATE TABLE command")

        self._destination_table_status = self.get_destination_table_status()
        if self._destination_table_status in (
            self.DESTINATION_TABLE_DNE,
            self.DESTINATION_TABLE_REBUILD,
        ):
            self.logger.info("Verifying that the table can be created.")
            try:
                # Only called to see if it raises, the actual table
                # will be created later
                self.query_to_redshift_create_table(
                    self.get_query_sql(), self.destination_table_name
                )
            except NotImplementedError:
                raise MigrationError(
                    "Automatic table creation was not implemented for "
                    "this database, manual migration needed."
                )
        elif self._destination_table_status == self.DESTINATION_TABLE_INCORRECT:
            raise InvalidSchemaError(
                "Extra columns exist in redshift table. Migration needed"
            )

    def register_extract_monitor(self, starttime, endtime):
        """Adds an entry into the extract monitor for a given extract task

        Parameters
        ----------
        starttime : datetime.datetime
            datetime object generated at the beginning of the data extraction
        endtime : datetime.datetime
            datetime object generated at the end of the data extraction
        """
        query = """
        INSERT INTO "public"."table_extract_detail" VALUES (
            %(task_id)s,
            %(class_name)s,
            %(task_date_params)s,
            %(task_other_params)s,
            %(start_dt)s,
            %(end_dt)s,
            %(run_time_sec)s,
            %(manifest_path)s,
            %(data_path)s,
            %(output_exists)s,
            %(row_count)s,
            %(upload_size)s,
            %(exception)s
        );
        """
        args = {
            "task_id": "{}(alias={}, database={}, table={})".format(
                self.__class__.__name__,
                self.database_alias,
                self.db_name,
                self.source_table_name,
            ),
            "class_name": self.__class__.__name__,
            "task_date_params": None,
            "task_other_params": None,
            "start_dt": starttime.replace(microsecond=0),
            "end_dt": endtime.replace(microsecond=0),
            "run_time_sec": (endtime - starttime).total_seconds(),
            "manifest_path": self.copy_target_url,
            "data_path": "s3://{}/{}".format(
                get_redshift().s3_config.bucket, self.s3_key_prefix
            ),
            "output_exists": self.row_count > 0,
            "row_count": self.row_count,
            "upload_size": self.upload_size,
            "exception": None,
        }

        self.logger.info("Inserting record into table_extract_detail")
        with get_redshift().cursor() as cur:
            cur.execute(query, args)

    def register_load_monitor(self):
        """Adds an entry into the load monitor for a given load task.

        The monitor row is built from instance attributes set during the run:
        starttime and endtime (datetimes bracketing the load), rows_inserted
        (rows created by the Redshift COPY command), rows_deleted (rows
        deleted by primary key in the destination table), and upload_size
        (size in bytes of the S3 data used by the COPY command).
        """
        query = """
        INSERT INTO "public"."table_load_detail" VALUES (
            %(task_id)s,
            %(class_name)s,
            %(task_date_params)s,
            %(task_other_params)s,
            %(target_table)s,
            %(start_dt)s,
            %(end_dt)s,
            %(run_time_sec)s,
            %(extract_task_update_id)s,
            %(data_path)s,
            %(manifest_cleaned)s,
            %(rows_inserted)s,
            %(rows_deleted)s,
            %(load_size)s,
            %(exception)s
        );
        """
        task_id = "{}(alias={}, database={}, table={})".format(
            self.__class__.__name__,
            self.database_alias,
            self.db_name,
            self.source_table_name,
        )
        target_table = "{}.{}".format(
            self.destination_schema_name, self.destination_table_name
        )
        args = {
            "task_id": task_id,
            "class_name": self.__class__.__name__,
            "task_date_params": None,
            "task_other_params": None,
            "target_table": target_table,
            "start_dt": self.starttime.replace(microsecond=0),
            "end_dt": self.endtime.replace(microsecond=0),
            "run_time_sec": (self.endtime - self.starttime).total_seconds(),
            "extract_task_update_id": task_id,
            "data_path": self.copy_target_url,
            "manifest_cleaned": False,
            "rows_inserted": self.rows_inserted,
            "rows_deleted": self.rows_deleted,
            "load_size": self.upload_size,
            "exception": None,
        }

        self.logger.info("Inserting record into table_load_detail")
        with get_redshift().cursor() as cur:
            cur.execute(query, args)

    def extract(self):
        """Serializes the full db result set and uploads it to s3.

        The data will be uploaded either as a single file or as a set of
        files with a manifest.
        """
        # TODO: Do we not currently execute the extract monitor?
        # starttime = datetime.datetime.utcnow()

        results_schema = self.query_description_to_avro(self.get_query_sql())
        results_iter = self.row_generator()

        done = False
        while not done:
            done = self.avro_to_s3(results_iter, results_schema)

        if self.num_data_files == 0:
            self.logger.info(
                "No data extracted; not uploading to s3 for %s table %s",
                self.db_name,
                self.source_table_name,
            )

        if self.manifest_mode:
            self.write_manifest_file()

        # endtime = datetime.datetime.utcnow()
        # self.register_extract_monitor(starttime, endtime)

    def avro_to_s3(self, results_iter, results_schema):
        """Attempts to serialize a result set to an AVRO file.

        Returns True if it completely writes the entire results_iter and
        False if there were records remaining when it hit the maximum file
        size.
        """
        with BytesIO() as f:
            complete, row_count = write_avro_file(
                f,
                results_iter,
                results_schema,
                self.destination_table_name,
                self.max_file_size,
            )

            if self.row_count is None:
                self.row_count = row_count
            else:
                self.row_count += row_count
            self.upload_size += f.tell()

            if not complete:
                self.manifest_mode = True

            if row_count > 0:
                self._upload_s3(
                    f, get_redshift().s3_config.bucket, self.next_s3_data_file_key()
                )
                self.num_data_files += 1

            return complete

    def write_manifest_file(self):
        if not self.manifest_mode:
            raise TableStateError("Cannot write manifest when not in manifest mode")

        entries = [
            {"url": _s3_url(get_redshift().s3_config.bucket, key), "mandatory": True}
            for key in self.data_file_keys()
        ]
        manifest = {"entries": entries}

        with BytesIO() as f:
            f.write(json.dumps(manifest).encode())
            self._upload_s3(
                f, get_redshift().s3_config.bucket, self.manifest_s3_data_key()
            )
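
    # Manifest sketch (hypothetical bucket/keys): for a two-file upload the
    # JSON written above looks like
    #     {"entries": [
    #         {"url": "s3://bucket/druzhba/mydb.orders.20240101T000000/00000.avro",
    #          "mandatory": true},
    #         {"url": "s3://bucket/druzhba/mydb.orders.20240101T000000/00001.avro",
    #          "mandatory": true}]}
    # which is the format the Redshift COPY command expects for MANIFEST loads.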

    def _upload_s3(self, f, bucket, key):
        """Upload a file to this table's s3 key.

        Parameters
        ----------
        f : file-like
            an open file handle to upload
        bucket : str
            the s3 bucket to write to
        key : str
            the s3 key to write to
        """
        MB = 1024 ** 2
        s3_config = TransferConfig(multipart_threshold=10 * MB)

        f.seek(0)
        self.logger.info("Writing s3 file %s", _s3_url(bucket, key))

        retries = 3
        retries_remaining = retries
        while retries_remaining > 0:
            try:
                self.s3.upload_fileobj(f, bucket, key, Config=s3_config)
                self.logger.info("Wrote s3 file %s", _s3_url(bucket, key))
                return
            except KeyError:
                # retry on intermittent credential error
                retries_remaining -= 1
                if retries_remaining > 0:
                    time.sleep(3 * (retries - retries_remaining) ** 2)
                else:
                    raise

    def set_search_path(self, cursor):
        """This sets the search_path for a Redshift session. The default
        search_path is "'$user', public"; this replaces it with just the
        destination_schema_name.

        Parameters
        ----------
        cursor : Redshift cursor
            A cursor is passed in rather than generated because we need to
            modify the existing cursor and not create a new one
        """
        query = "SET search_path TO {};".format(self.destination_schema_name)
        self.logger.debug(query)
        cursor.execute(query)

    def get_delete_sql(self):
        if self.full_refresh:
            if self._destination_table_status == self.DESTINATION_TABLE_REBUILD:
                # We'll just drop it
                return ""

            if self.truncate_file:
                env = Environment(
                    loader=FileSystemLoader(os.path.join("datacfg")),
                    autoescape=select_autoescape(["sql"]),
                    undefined=StrictUndefined,
                )
                template = env.get_template(self.truncate_file)
                return template.render(
                    db=self.db_template_data,
                    table=self.table_template_data,
                    run=self.run_template_data,
                )
            else:
                return 'DELETE FROM "{}";'.format(self.destination_table_name)
        elif not self.append_only:
            if self.primary_key:
                # override from db yaml file
                pks = self.primary_key
            else:
                # pk column discovered from the source table
                pks = self.pks

            if not pks and self.index_column:
                raise InvalidSchemaError(
                    "Specifying an index column without primary key would "
                    "result in all records in the existing table being "
                    "deleted. If this is the desired behavior, run with "
                    "--full-refresh instead. If not, check if primary keys "
                    "can be inferred from the upstream database."
                )

            constraints = [
                '"{0}"."{2}" = "{1}"."{2}"'.format(
                    self.staging_table_name, self.destination_table_name, pk
                )
                for pk in pks
            ]
            constraint_string = " AND ".join(constraints)

            return 'DELETE FROM "{}" USING "{}" WHERE {};'.format(
                self.destination_table_name,
                self.staging_table_name,
                constraint_string,
            )
        else:
            # Should only land here when append_only
            # in which case we're not deleting
            return None
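
    # get_delete_sql() sketch (hypothetical table names): with primary key
    # ["id"], the incremental delete issued before the insert is
    #     DELETE FROM "orders" USING "mydb_orders_staging"
    #     WHERE "mydb_orders_staging"."id" = "orders"."id";
    # Full refreshes instead delete (or truncate) everything, and append-only
    # tables skip the delete entirely.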

    def get_grant_sql(self, cursor):
        """Get SQL statements to restore permissions to the staging table
        after a rebuild."""
        get_permissions_sql = """
        SELECT use.usename = CURRENT_USER AS "owned"
             , c.relacl AS "permissions"
        FROM pg_class c
        LEFT JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
        LEFT JOIN pg_user use ON c.relowner = use.usesysid
        WHERE c.relkind = 'r'
          AND nsp.nspname = '{schema}'
          AND c.relname = '{table}'
        """.format(
            schema=self.destination_schema_name, table=self.destination_table_name
        )

        cursor.execute(get_permissions_sql)
        permissions_result = cursor.fetchall()

        if len(permissions_result) == 0:
            self.logger.info(
                "No existing permissions found for %s.%s",
                self.destination_schema_name,
                self.destination_table_name,
            )
            return None
        elif len(permissions_result) > 1:
            raise MigrationError("Got multiple permissions rows for table")

        is_owner, permissions_str = permissions_result[0]
        if not is_owner:
            raise MigrationError(
                "Can't rebuild target table because it has another owner"
            )

        self.logger.info(
            "Got existing permissions for table to add to %s: %s",
            self.staging_table_name,
            permissions_str,
        )

        permissions = Permissions.parse(permissions_str)
        if permissions is None:
            raise MigrationError(
                "Couldn't parse permissions {} to rebuild target table".format(
                    permissions_str
                )
            )

        grant_template = "GRANT {grant} ON {table} TO {group}{name};"
        grant_sqls = [
            grant_template.format(
                grant=g,
                table=self.staging_table_name,
                # Should we not restore users?
                group="GROUP " if p.is_group else "",
                name=p.name,
            )
            for p in permissions
            for g in p.grants
        ]

        return "\n".join(grant_sqls)

    @property
    def staging_table_name(self):
        return "{}_{}_staging".format(self.database_alias, self.destination_table_name)

    def load(self):
        """The Load phase of the pipeline.

        Takes a file in S3 and issues a Redshift COPY command to import the
        data into a staging table. It then upserts the data by deleting rows
        in the destination table that match on PK, inserts all rows from the
        staging table into the destination table, and then deletes the
        staging table.
        """
        self.starttime = datetime.datetime.utcnow()

        # Initializing Data
        delete_clause = self.get_delete_sql()
        staging_table = self.staging_table_name
        destination_table = self.destination_table_name
        is_normal_load = self._destination_table_status == self.DESTINATION_TABLE_OK
        is_rebuild = self._destination_table_status == self.DESTINATION_TABLE_REBUILD
        is_dne = self._destination_table_status == self.DESTINATION_TABLE_DNE

        with get_redshift().cursor() as cur:
            self.set_search_path(cur)

            # If the table does not exist, create it
            if is_dne:
                create_table = self.query_to_redshift_create_table(
                    self.get_query_sql(), self.destination_table_name
                )
                cur.execute(create_table)
            elif not is_normal_load and not is_rebuild:
                raise RuntimeError(
                    "Invalid table status in redshift_copy: {}".format(
                        self._destination_table_status
                    )
                )

            # If there are no row updates, just skip the copy and return
            if self.row_count == 0:
                return

            cur.execute("BEGIN TRANSACTION;")

            # Lock the table early to avoid deadlocks in many-to-one pipelines.
            query = generate_lock_query(destination_table)
            cur.execute(query)

            query = generate_drop_exists_query(staging_table)
            cur.execute(query)

            if is_rebuild:
                # Build staging table anew and grant it appropriate permissions
                self.logger.info(
                    "Creating staging table to rebuild %s", destination_table
                )
                create_staging_table = self.query_to_redshift_create_table(
                    self.get_query_sql(), staging_table
                )
                permissions_sql = self.get_grant_sql(cur)
                cur.execute(create_staging_table)
                if permissions_sql:
                    self.logger.info(
                        "Copying permissions onto %s:\n%s",
                        staging_table,
                        permissions_sql,
                    )
                    cur.execute(permissions_sql)
            else:
                # If not rebuilding, create staging with LIKE
                self.logger.info("Creating staging table %s", staging_table)
                query = generate_create_table_like_query(
                    staging_table, destination_table
                )
                cur.execute(query)

            # Issuing Copy Command
            self.logger.info("Issuing copy command")
            query = generate_copy_query(
                staging_table,
                self.copy_target_url,
                get_redshift().iam_copy_role,
                self.manifest_mode,
            )
            self.logger.debug(query)
            cur.execute(query)

            # Row delete and count logic
            if is_rebuild or (self.append_only and not self.full_refresh):
                self.rows_deleted = 0
            else:
                cur.execute(delete_clause)
                self.rows_deleted = cur.rowcount

            # Row insert and count logic
            if is_rebuild:
                self.logger.info("Swapping staging table into %s", destination_table)

                # DNE overrides rebuild, so we can assume the table exists
                query = generate_drop_query(destination_table)
                self.logger.debug(query)
                cur.execute(query)

                query = generate_rename_query(staging_table, destination_table)
                self.logger.debug(query)
                cur.execute(query)

                query = generate_count_query(destination_table)
                self.logger.debug(query)
                cur.execute(query)
                self.rows_inserted = cur.fetchall()[0][0]
            else:
                query = generate_insert_all_query(staging_table, destination_table)
                self.logger.debug(query)
                cur.execute(query)
                self.rows_inserted = cur.rowcount

                query = generate_drop_query(staging_table)
                self.logger.debug(query)
                cur.execute(query)

            cur.execute("END TRANSACTION;")

        self.register_and_cleanup()

    def register_and_cleanup(self):
        # Register in index table.
        self.set_last_updated_index()

        # Register in monitor table
        # self.endtime = datetime.datetime.utcnow()
        # self.register_load_monitor()

        # Clean up S3
        for key in self.data_file_keys():
            self.s3.delete_object(Bucket=get_redshift().s3_config.bucket, Key=key)

        if self.manifest_mode:
            self.s3.delete_object(
                Bucket=get_redshift().s3_config.bucket, Key=self.manifest_s3_data_key()
            )
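
# Load-phase outline, as implemented above (the generate_* SQL helpers are
# defined elsewhere in the package; the statement shapes below are a sketch
# inferred from their names):
#
#     BEGIN TRANSACTION;
#     LOCK <destination>;
#     DROP TABLE IF EXISTS <staging>;
#     CREATE TABLE <staging> ...;          -- LIKE <destination>, or a rebuild
#     COPY <staging> FROM 's3://...' ...;  -- single .avro file or manifest
#     DELETE FROM <destination> ...;       -- by PK, unless append-only/rebuild
#     INSERT INTO <destination> SELECT * FROM <staging>;  -- or swap on rebuild
#     DROP TABLE <staging>;
#     END TRANSACTION;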