def register_extract_monitor(self, starttime, endtime):
    """Adds an entry into the extract monitor for a given extract task

    Parameters
    ----------
    starttime : datetime.datetime
        datetime object generated at the beginning of the data extraction
    endtime : datetime.datetime
        datetime object generated at the end of the data extraction
    """
    query = """
        INSERT INTO "public"."table_extract_detail"
        VALUES (
            %(task_id)s,
            %(class_name)s,
            %(task_date_params)s,
            %(task_other_params)s,
            %(start_dt)s,
            %(end_dt)s,
            %(run_time_sec)s,
            %(manifest_path)s,
            %(data_path)s,
            %(output_exists)s,
            %(row_count)s,
            %(upload_size)s,
            %(exception)s
        );
    """
    args = {
        "task_id": "{}(alias={}, database={}, table={})".format(
            self.__class__.__name__,
            self.database_alias,
            self.db_name,
            self.source_table_name,
        ),
        "class_name": self.__class__.__name__,
        "task_date_params": None,
        "task_other_params": None,
        "start_dt": starttime.replace(microsecond=0),
        "end_dt": endtime.replace(microsecond=0),
        "run_time_sec": (endtime - starttime).total_seconds(),
        "manifest_path": self.copy_target_url,
        "data_path": "s3://{}/{}".format(
            get_redshift().s3_config.bucket, self.s3_key_prefix
        ),
        "output_exists": self.row_count > 0,
        "row_count": self.row_count,
        "upload_size": self.upload_size,
        "exception": None,
    }

    self.logger.info("Inserting record into table_extract_detail")
    with get_redshift().cursor() as cur:
        cur.execute(query, args)

def write_manifest_file(self):
    """Writes a Redshift COPY manifest for the uploaded data files to S3."""
    if not self.manifest_mode:
        raise TableStateError("Cannot write manifest when not in manifest mode")

    entries = [
        {"url": _s3_url(get_redshift().s3_config.bucket, key), "mandatory": True}
        for key in self.data_file_keys()
    ]
    manifest = {"entries": entries}
    with BytesIO() as f:
        f.write(json.dumps(manifest).encode())
        self._upload_s3(
            f, get_redshift().s3_config.bucket, self.manifest_s3_data_key()
        )

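# The manifest written above follows Redshift's COPY manifest format: a JSON
# object with an "entries" list of {"url": ..., "mandatory": ...} pairs.
# _s3_url is defined elsewhere in this module; a minimal sketch consistent
# with its call sites (an assumption, not the actual implementation):
def _s3_url_sketch(bucket, key):
    return "s3://{}/{}".format(bucket, key)
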
def avro_to_s3(self, results_iter, results_schema):
    """Attempts to serialize a result set to an Avro file.

    Returns True if it completely writes the entire results_iter, and
    False if there were records remaining when it hit the maximum file
    size.
    """
    with BytesIO() as f:
        complete, row_count = write_avro_file(
            f,
            results_iter,
            results_schema,
            self.destination_table_name,
            self.max_file_size,
        )
        if self.row_count is None:
            self.row_count = row_count
        else:
            self.row_count += row_count
        self.upload_size += f.tell()

        if not complete:
            self.manifest_mode = True

        if row_count > 0:
            self._upload_s3(
                f, get_redshift().s3_config.bucket, self.next_s3_data_file_key()
            )
            self.num_data_files += 1

    return complete

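# write_avro_file is defined elsewhere; as used above, its contract is to
# stream rows into an Avro container until either the iterator is exhausted
# or the file passes max_file_size, returning (complete, row_count). A
# minimal sketch of that contract using fastavro (illustrative only, not
# the actual implementation; the size check is approximate because the
# writer buffers records in blocks):
def write_avro_file_sketch(f, results_iter, schema, table_name, max_file_size):
    from fastavro.write import Writer

    writer = Writer(f, schema)  # table_name unused in this sketch
    row_count = 0
    for row in results_iter:
        writer.write(row)
        row_count += 1
        if max_file_size is not None and f.tell() > max_file_size:
            writer.flush()
            return False, row_count  # rows remain in results_iter
    writer.flush()
    return True, row_count
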
def set_last_updated_index(self):
    """Adds a new index to the pipeline_table_index table for updated tables"""
    if self.new_index_value is None:
        return

    query = f"""
        INSERT INTO "{self.index_schema}"."{self.index_table}"
        VALUES (%s, %s, %s, %s)
    """

    if isinstance(self.new_index_value, int):
        new_index_value = str(self.new_index_value)
    elif isinstance(self.new_index_value, datetime.datetime):
        new_index_value = self.new_index_value.strftime("%Y-%m-%d %H:%M:%S.%f")
    else:
        msg = "Don't know how to handle index {} of type {}".format(
            self.new_index_value, str(type(self.new_index_value))
        )
        raise TypeError(msg)

    self.logger.info("Updating index table")
    with get_redshift().cursor() as cur:
        args = (
            self.database_alias,
            self.db_name,
            self.source_table_name,
            new_index_value,
        )
        self.logger.debug(cur.mogrify(query, args))
        cur.execute(query, args)

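# Illustrative only: an int index of 42 is stored as the string "42", and
# datetime.datetime(2021, 1, 1, 12, 0, 0) is stored as
# "2021-01-01 12:00:00.000000".
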
@property
def s3_key_prefix(self):
    # Accessed as an attribute elsewhere (e.g. in register_extract_monitor),
    # so this must be a property.
    return "{}/{}.{}.{}".format(
        get_redshift().s3_config.prefix,
        self.database_alias,
        self.source_table_name,
        self.date_key,
    )

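# Illustrative only (hypothetical values): with an S3 prefix of "extract",
# database_alias "mydb", source_table_name "users", and date_key
# "20210101T120000", self.s3_key_prefix evaluates to
# "extract/mydb.users.20210101T120000".
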
def register_and_cleanup(self):
    # Register in index table.
    self.set_last_updated_index()

    # Register in monitor table
    # self.endtime = datetime.datetime.utcnow()
    # self.register_load_monitor()

    # Clean up S3
    for key in self.data_file_keys():
        self.s3.delete_object(Bucket=get_redshift().s3_config.bucket, Key=key)

    if self.manifest_mode:
        self.s3.delete_object(
            Bucket=get_redshift().s3_config.bucket, Key=self.manifest_s3_data_key()
        )

def _load_old_index_value(self):
    """Gets the most recent index value for this table from Redshift

    Returns
    -------
    index_value : variable
        Since index_value can vary from table to table, this can be many
        different types. Most common will be a datetime or int, but could
        also be a date or string. Returns None if no previous index value
        is found.
    """
    query = f"""
        SELECT index_value
        FROM "{self.index_schema}"."{self.index_table}"
        WHERE datastore_name = %s
          AND database_name = %s
          AND table_name = %s
        ORDER BY created_ts DESC
        LIMIT 1;
    """

    self.logger.debug("Querying Redshift for last updated index")
    with get_redshift().cursor() as cur:
        cur.execute(
            query, (self.database_alias, self.db_name, self.source_table_name)
        )
        index_value = cur.fetchone()

        if index_value is None:
            self.logger.info(
                "No index found. Dumping entire table: %s.", self.source_table_name
            )
            return index_value
        else:
            self.logger.info("Index found: %s", index_value[0])
            return index_value[0]

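# A sketch of how the returned value is typically consumed (not code from
# this module): the old index bounds an incremental extraction query, e.g.
#
#     old_index = self._load_old_index_value()
#     if old_index is not None:
#         sql += " WHERE updated_at > %(old_index)s"
#
# The column name "updated_at" is hypothetical; the real predicate depends
# on the table's configured index column.
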
def register_load_monitor(self):
    """Adds an entry into the load monitor for a given load task

    Reads the following instance attributes rather than taking parameters:

    starttime : datetime
        datetime object generated at the beginning of the load
    endtime : datetime
        datetime object generated at the end of the load
    rows_inserted : int
        Total number of rows inserted by the Redshift COPY command
    rows_deleted : int
        Count of rows deleted by primary key in the destination table
    upload_size : int
        Size in bytes of the file in S3 used in the Redshift COPY command
    """
    query = """
        INSERT INTO "public"."table_load_detail"
        VALUES (
            %(task_id)s,
            %(class_name)s,
            %(task_date_params)s,
            %(task_other_params)s,
            %(target_table)s,
            %(start_dt)s,
            %(end_dt)s,
            %(run_time_sec)s,
            %(extract_task_update_id)s,
            %(data_path)s,
            %(manifest_cleaned)s,
            %(rows_inserted)s,
            %(rows_deleted)s,
            %(load_size)s,
            %(exception)s
        );
    """
    task_id = "{}(alias={}, database={}, table={})".format(
        self.__class__.__name__,
        self.database_alias,
        self.db_name,
        self.source_table_name,
    )
    target_table = "{}.{}".format(
        self.destination_schema_name, self.destination_table_name
    )
    args = {
        "task_id": task_id,
        "class_name": self.__class__.__name__,
        "task_date_params": None,
        "task_other_params": None,
        "target_table": target_table,
        "start_dt": self.starttime.replace(microsecond=0),
        "end_dt": self.endtime.replace(microsecond=0),
        "run_time_sec": (self.endtime - self.starttime).total_seconds(),
        "extract_task_update_id": task_id,
        "data_path": self.copy_target_url,
        "manifest_cleaned": False,
        "rows_inserted": self.rows_inserted,
        "rows_deleted": self.rows_deleted,
        "load_size": self.upload_size,
        "exception": None,
    }

    self.logger.info("Inserting record into table_load_detail")
    with get_redshift().cursor() as cur:
        cur.execute(query, args)

def get_destination_table_columns(self):
    query = """
        SELECT "column"
        FROM pg_table_def
        WHERE schemaname = %s
          AND tablename = %s;
    """
    with get_redshift().cursor() as cur:
        # pg_table_def only returns rows for schemas on the search_path,
        # so it must be set before querying.
        self.set_search_path(cur)
        cur.execute(
            query, (self.destination_schema_name, self.destination_table_name)
        )
        results = cur.fetchall()
    return [x[0] for x in results]

@property
def copy_target_url(self):
    # Accessed as an attribute elsewhere (e.g. in load and the monitor
    # methods), so this must be a property.
    return _s3_url(get_redshift().s3_config.bucket, self.copy_target_key)

def load(self):
    """The Load phase of the pipeline.

    Takes a file in S3 and issues a Redshift COPY command to import the
    data into a staging table. It then upserts the data by deleting rows
    in the destination table that match on PK, inserts all rows from the
    staging table into the destination table, and then deletes the
    staging table.
    """
    self.starttime = datetime.datetime.utcnow()

    # Initializing data
    delete_clause = self.get_delete_sql()
    staging_table = self.staging_table_name
    destination_table = self.destination_table_name
    is_normal_load = self._destination_table_status == self.DESTINATION_TABLE_OK
    is_rebuild = self._destination_table_status == self.DESTINATION_TABLE_REBUILD
    is_dne = self._destination_table_status == self.DESTINATION_TABLE_DNE

    with get_redshift().cursor() as cur:
        self.set_search_path(cur)

        # If the table does not exist, create it
        if is_dne:
            create_table = self.query_to_redshift_create_table(
                self.get_query_sql(), self.destination_table_name
            )
            cur.execute(create_table)
        elif not is_normal_load and not is_rebuild:
            raise RuntimeError(
                "Invalid table status in redshift_copy: {}".format(
                    self._destination_table_status
                )
            )

        # If there are no row updates, just skip the copy and return
        if self.row_count == 0:
            return

        cur.execute("BEGIN TRANSACTION;")

        # Lock the table early to avoid deadlocks in many-to-one pipelines.
        query = generate_lock_query(destination_table)
        cur.execute(query)

        query = generate_drop_exists_query(staging_table)
        cur.execute(query)

        if is_rebuild:
            # Build the staging table anew and grant it appropriate permissions
            self.logger.info(
                "Creating staging table to rebuild %s", destination_table
            )
            create_staging_table = self.query_to_redshift_create_table(
                self.get_query_sql(), staging_table
            )
            permissions_sql = self.get_grant_sql(cur)
            cur.execute(create_staging_table)
            if permissions_sql:
                self.logger.info(
                    "Copying permissions onto %s:\n%s",
                    staging_table,
                    permissions_sql,
                )
                cur.execute(permissions_sql)
        else:
            # If not rebuilding, create the staging table with LIKE
            self.logger.info("Creating staging table %s", staging_table)
            query = generate_create_table_like_query(
                staging_table, destination_table
            )
            cur.execute(query)

        # Issuing copy command
        self.logger.info("Issuing copy command")
        query = generate_copy_query(
            staging_table,
            self.copy_target_url,
            get_redshift().iam_copy_role,
            self.manifest_mode,
        )
        self.logger.debug(query)
        cur.execute(query)

        # Row delete and count logic
        if is_rebuild or (self.append_only and not self.full_refresh):
            self.rows_deleted = 0
        else:
            cur.execute(delete_clause)
            self.rows_deleted = cur.rowcount

        # Row insert and count logic
        if is_rebuild:
            self.logger.info("Swapping staging table into %s", destination_table)
            # DNE overrides rebuild, so we can assume the table exists
            query = generate_drop_query(destination_table)
            self.logger.debug(query)
            cur.execute(query)

            query = generate_rename_query(staging_table, destination_table)
            self.logger.debug(query)
            cur.execute(query)

            query = generate_count_query(destination_table)
            self.logger.debug(query)
            cur.execute(query)
            # fetchall() returns a list of row tuples; take the scalar count
            self.rows_inserted = cur.fetchall()[0][0]
        else:
            query = generate_insert_all_query(staging_table, destination_table)
            self.logger.debug(query)
            cur.execute(query)
            self.rows_inserted = cur.rowcount

            query = generate_drop_query(staging_table)
            self.logger.debug(query)
            cur.execute(query)

        cur.execute("END TRANSACTION;")

    self.register_and_cleanup()
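
# The generate_* helpers used in load() are defined elsewhere in this
# module. Minimal sketches consistent with how they are called above
# (assumptions, not the actual implementations):
def generate_lock_query_sketch(table):
    return 'LOCK "{}";'.format(table)


def generate_create_table_like_query_sketch(staging_table, destination_table):
    return 'CREATE TABLE "{}" (LIKE "{}");'.format(staging_table, destination_table)


def generate_copy_query_sketch(table, copy_target_url, iam_copy_role, manifest_mode):
    manifest = "\nMANIFEST" if manifest_mode else ""
    return (
        "COPY \"{}\"\nFROM '{}'\nIAM_ROLE '{}'\nFORMAT AS AVRO 'auto'{};"
    ).format(table, copy_target_url, iam_copy_role, manifest)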