def IncrementalLoad(self, dbCommon, tables):
    self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
    try:
        # This is where we last ended.  Start at 1 + this end
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                   tables["incrementalconditions"]["keyfield"], self.logger))
    except ValueError:
        chunkEnd = 0  # Table does not exist yet
    except:
        raise

    maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
    chunkSize = tables["incrementalconditions"]["chunksize"]
    chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    fieldTerminator = self.job["fieldTerminator"]
    rowTerminator = None  # Not used; stick with the default of CR/LF rather than self.job["rowTerminator"]

    chunkStartData = chunkStart
    # Each ETL run gets the same date so that we can do a smart insert based on the ETL date and chunkStartData
    partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
    while chunkStart <= maxValue:
        sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                          self.logger, self.fileUtilities, self.location)
        # Construct a file name that is meaningful.  That is, it has the start and end IDs
        fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
        outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldTerminator,
                                      rowTerminator, self.job["bcpUtilityDirOnLinux"],
                                      self.fileUtilities, self.logger)

        # Process the data using Spark and save as Parquet
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
        tables["new"] = "N"  # Do not recreate

        if chunkSize < 0:
            break  # Done with the single load
        chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    # Load only the data that we processed into Redshift.  We cannot use the ETL run date partition value
    # since we are loading the data based on record IDs
    customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
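# The chunking loops in IncrementalLoad above and ProcessTables below rely on self.UpdateChunkStartEnd,
# which is defined elsewhere in this class.  The helper below is only an illustrative sketch (an
# assumption, not the actual implementation) of the contract those loops depend on: the next chunk
# starts right after the previous end, and a negative chunk size means one pull covering the rest.
def _UpdateChunkStartEndSketch(chunkEnd, chunkSize, maxValue):
    chunkStart = chunkEnd + 1  # Start at 1 + where we last ended
    if chunkSize < 0:
        return chunkStart, maxValue  # Single full load of everything that is left
    return chunkStart, min(chunkStart + chunkSize - 1, maxValue)  # Bounded chunk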
def GetLatestValuationDateInAthena(self, table):
    '''
    Get the last year month (based on valuation date) that has been processed in Athena
    '''
    try:
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(table["schemaName"])
        latestValuationDateInAthena = AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                                  table["table"], "etl_valuationdate",
                                                                  self.logger)
    except ValueError:
        latestValuationDateInAthena = None  # Table has not been created yet
    except:
        raise
    return latestValuationDateInAthena
def LoadDataFromAthenaIntoRedShiftS3Scripts(self, table):
    '''
    If at a later time we decide to drop the Redshift table and re-load the data from Athena,
    we need a utility to do that
    '''
    # Download the scripts from S3 to the local folder
    AthenaUtilities.DownloadScriptsForRedShift(self.awsParams, table, self.fileUtilities.sqlFolder)
    self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
def GetMaxPublishDateInAthena(self, catalog):
    '''
    Get the last published date that has been processed in Athena
    '''
    try:
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(catalog["schemaName"])
        maxDate = AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                              catalog["paramTable"], "publisheddate", self.logger)
        if maxDate == 'max_val':
            maxDate = None
    except ValueError:
        maxDate = None  # Table has not been created yet
    except:
        raise
    return maxDate
def BuildSqlSubStatement(tableSettings, schema, table, validation, whereClause, columnsToExclude):
    if AthenaUtilities.ContainsSpecialFields(tableSettings) or len(columnsToExclude) > 0:
        fields = HandleSpecialFields(tableSettings, validation, columnsToExclude)
    else:
        fields = " * "  # Perform a straight compare

    sqlScript = "\n SELECT" + fields + " FROM " + schema + '.' + table
    if whereClause is not None:
        sqlScript = sqlScript + "\n WHERE " + whereClause
    return sqlScript
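# Example of BuildSqlSubStatement (illustrative only; the schema, table, and condition are made up):
# with no special fields and nothing to exclude,
#     BuildSqlSubStatement(tableSettings, "dbo", "position", None, "recordid >= 1000", [])
# returns the straight-compare query:
#     SELECT * FROM dbo.position
#     WHERE recordid >= 1000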
def GetParameters(self, table):
    '''
    Get the value of the last ETL run date (etl_rundate) recorded in Athena
    '''
    try:
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(table["schemaName"])
        try:
            maxval = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                     table["paramTable"], "etl_rundate", self.logger))
            prevPartition = maxval
        except:
            prevPartition = 20000101  # Default starting period of Jan 1, 2000 in case the table does not exist yet
    except:
        self.logger.exception(self.moduleName + " - we had an error in GetParameters")
        raise
    return str(prevPartition)
def GetLastUpdateDate(self, table):
    '''
    Get the last update date registered in Athena
    '''
    try:
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(table["schemaName"])
        last_update_date = AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                       table["table"], "etl_last_update_date",
                                                       self.logger)
        if last_update_date is None:
            last_update_date = datetime.date.today() - datetime.timedelta(days=2)  # Process from two days ago by default
    except StandardError as err:
        self.logger.info(self.moduleName + " - GetLastUpdateDate() Error: " + err.message)
    return datetime.datetime.strptime(str(last_update_date), "%Y-%m-%d").date()
def UploadFilesCreateAthenaTablesAndSqlScripts(self, table, localParquetFolderName, partitionValue=None):
    '''
    Upload Parquet files into S3
    Create Athena Table/Partition
    Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
    Create script to insert data into Redshift and save to S3 (note that the ETL may not necessarily load data into Redshift)
    '''
    if not FileUtilities.FilesExistInFolder(localParquetFolderName + "*.parquet"):
        # Nothing was created.  We have a problem
        self.logger.info(self.moduleName + " - No parquet files were created for the current partition in: " +
                         localParquetFolderName + ". Nothing was processed on Athena.")
        return False

    self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

    scriptPartitionValue = partitionValue
    if AthenaUtilities.IsTablePartitioned(table):
        # For partitioned tables, the script will insert a where clause by default.  However, if we are doing a new
        # load, skip the where clause so that we have a SQL script capable of loading all the data from Athena
        # into RedShift in the future
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(table["schemaName"], table["table"])
        if not S3Utilities.KeyExist(self.awsParams, s3FolderLocation):
            # Do not update the scripts if data has been previously loaded
            scriptPartitionValue = None
    AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table, self.fileUtilities.sqlFolder, scriptPartitionValue)

    AthenaUtilities.UploadFilesAndCreateAthenaTables(self.awsParams, localParquetFolderName, table,
                                                     self.fileUtilities.sqlFolder, self.logger, partitionValue)
    return True
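# Illustrative note (an assumption about the generated script, not taken from it): for a partitioned
# table the saved insert script is assumed to end with something like
#     WHERE etl_partition_value = '<partitionValue>';
# On the very first load (the S3 data key does not exist yet) scriptPartitionValue is cleared above,
# so the script stored in S3 stays capable of reloading the full table from Athena later.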
def LoadDataFromAthenaIntoRedShiftLocalScripts(self, table, customWhereCondition=None):
    '''
    If at a later time we decide to drop the Redshift table and re-load the data from Athena,
    we need a utility to do that
    '''
    # Under the hood the table will be recreated if the "new" flag is on or if the table does not exist.
    # The data is then loaded from Athena into RedShift.  The load query only loads what is needed from Athena
    scriptToCreateRedshiftTable = FileUtilities.ComposeCreateTableSqlFilename(table, self.fileUtilities.sqlFolder)
    RedshiftUtilities.PSqlExecute(scriptToCreateRedshiftTable, self.logger)

    scriptToLoadDataFromAthena = AthenaUtilities.ComposeInsertIntoSqlFilename(table, self.fileUtilities.sqlFolder)
    if customWhereCondition:
        # Append the custom condition to the existing where clause
        customWhereCondition = " AND " + customWhereCondition + ";"
        replacements = {';': customWhereCondition}
        scriptToLoadDataFromAthenaCustom = scriptToLoadDataFromAthena + "_custom.sql"
        self.fileUtilities.ReplaceStringInFile(scriptToLoadDataFromAthena,
                                               scriptToLoadDataFromAthenaCustom, replacements)
        scriptToLoadDataFromAthena = scriptToLoadDataFromAthenaCustom
    RedshiftUtilities.PSqlExecute(scriptToLoadDataFromAthena, self.logger)
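# Example of the custom-condition rewrite above (illustrative only; the script text and values are
# assumed, and a single trailing semicolon in the generated script is assumed).  If the insert
# script ends with
#     ... WHERE etl_partition_value = '2018-01-01';
# and customWhereCondition is "recordid >= 1000" (as passed by IncrementalLoad), the *_custom.sql
# copy ends with
#     ... WHERE etl_partition_value = '2018-01-01' AND recordid >= 1000;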
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up
    '''
    try:
        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
        # Cleanup first (TODO - Need a more generic way to do this)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)

        # Variables used for handling chunks.  -1 means a full load
        chunkStart = chunkEnd = maxValue = chunkSize = -1
        if "incrementalconditions" in tables:
            incrementalConditions = tables["incrementalconditions"]
            if "startID" in incrementalConditions:
                chunkEnd = incrementalConditions["startID"] - 1
            else:
                athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                try:
                    # This is where we last ended.  Start at 1 + this end
                    chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                               tables["distkey"], self.logger))
                except ValueError:
                    chunkEnd = 0  # Table does not exist yet
                except:
                    raise

            if "endID" in incrementalConditions:
                maxValue = incrementalConditions["endID"]
            else:
                # TODO - Fix this.  Also, we should start at the source min value, not 0.
                maxValue = 2000000000  # BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)

            chunkSize = tables["incrementalconditions"]["chunksize"]
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        fieldDelimiter = self.job["delimiter"]
        if "delimiter" in tables:
            fieldDelimiter = tables["delimiter"]

        while chunkStart <= maxValue:
            partitionValue = self.GetPartitionValue(tables, chunkStart)
            sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                              self.logger, self.fileUtilities, self.location)
            # Construct a file name that is meaningful.  That is, it has the start and end IDs
            outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                          self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)

            # Process the data using Spark and save as Parquet
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header='false', delimiter=fieldDelimiter)
                  .schema(schema)
                  .load(self.fileUtilities.csvFolder)
                  )
            df.printSchema()
            df.show()
            df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)
            self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) +
                             " ROWS.  Now saving as parquet file...")
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            SparkUtilities.SaveParquet(df, self.fileUtilities)

            # Need to load the data and clear the local space
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            tables["new"] = "N"  # Do not recreate

            if chunkSize < 0:
                break  # Done with the single load
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        # TODO - Need to make sure we don't end up with duplicate data if we run the code
        # twice on the same day
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables for " + tables["table"])
        raise