def IncrementalLoad(self, dbCommon, tables):
     self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
     try:
         # This is where we last ended.  Start at 1 + this end
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
         chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                tables["incrementalconditions"]["keyfield"], self.logger))
     except ValueError:
         chunkEnd = 0 # Table does not exist yet
     except:
         raise
             
     #chunkEnd = 2960000000
     #maxValue = 3708000000 # 2249000000 3708000000
     maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
         
     chunkSize = tables["incrementalconditions"]["chunksize"]
     chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
 
     fieldTerminator = self.job["fieldTerminator"]
     rowTerminator = None  # Not used; stick with the default of CR/LF (self.job["rowTerminator"])
 
     chunkStartData = chunkStart
     # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
     partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
     while chunkStart <= maxValue:
         sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                           self.logger, self.fileUtilities, self.location)
         # Construct a meaningful file name that includes the start and end IDs
         fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
         outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
         self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
         self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                       self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
     
         # Process the data using Spark and save as Parquet
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                         self.fileUtilities.csvFolder, self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
         
         tables["new"] = "N" # Do not recreate
         if chunkSize < 0:
             break  # Done with the single load
         chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
     
     # Load only the data that we processed into Redshift.  We cannot use the run ETL date partition value
     # since we are loading the data based on record IDs
     customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
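A minimal standalone sketch of the window arithmetic that UpdateChunkStartEnd is assumed to implement (the helper itself is not shown in these examples): the next chunk starts at one past the previous end, and a negative chunk size signals a single full load.

def update_chunk_start_end_sketch(previousEnd, chunkSize, maxValue):
    # Hypothetical stand-in for self.UpdateChunkStartEnd, not the original implementation
    chunkStart = previousEnd + 1  # resume right after where the last run ended
    if chunkSize < 0:
        return chunkStart, maxValue  # single full load
    return chunkStart, min(chunkStart + chunkSize - 1, maxValue)  # cap the window at maxValue

Under that assumption, a previous end of 2960000000 and a chunk size of 1000000 would give a first window of 2960000001 through 2961000000.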
Example #2
    def GetLatestValuationDateInAthena(self, table):
        '''
        Get the last year-month (based on valuation date) that has been processed in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                table["schemaName"])
            latestValuationDateInAthena = AthenaUtilities.GetMaxValue(
                self.awsParams, athenaSchemaName, table["table"],
                "etl_valuationdate", self.logger)
        except ValueError:
            latestValuationDateInAthena = None  # Table has not been created in Athena yet
        except:
            raise

        return latestValuationDateInAthena
Example #3
 def LoadDataFromAthenaIntoRedShiftS3Scripts(self, table):
     '''
     If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that
     '''
     # Download scripts from S3 to local folder
     AthenaUtilities.DownloadScriptsForRedShift(
         self.awsParams, table, self.fileUtilities.sqlFolder)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
Example #4
    def GetMaxPublishDateInAthena(self, catalog):
        '''
        Get the last published date that has been processed in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                catalog["schemaName"])
            maxDate = AthenaUtilities.GetMaxValue(self.awsParams,
                                                  athenaSchemaName,
                                                  catalog["paramTable"],
                                                  "publisheddate", self.logger)
            if maxDate == 'max_val':
                maxDate = None
        except ValueError:
            maxDate = None  # Table has not been created in Athena yet
        except:
            raise

        return maxDate
Example #5
def BuildSqlSubStatement(tableSettings, schema, table, validation, whereClause,
                         columnsToExclude):
    if AthenaUtilities.ContainsSpecialFields(
            tableSettings) or len(columnsToExclude) > 0:
        fields = HandleSpecialFields(tableSettings, validation,
                                     columnsToExclude)
    else:
        fields = " * "  # Perform straight compare
    sqlScript = "\n  SELECT" + fields + "  FROM " + schema + '.' + table
    if whereClause is not None:
        sqlScript = sqlScript + "\n  WHERE " + whereClause
    return sqlScript
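A hedged illustration (schema, table, and filter values are made up) of the statement the function above builds when the table has no special fields and no columns are excluded, so fields falls through to the straight-compare " * ":

#   BuildSqlSubStatement(tableSettings, "dbo", "positions", validation=None,
#                        whereClause="position_id >= 100", columnsToExclude=[])
# returns, with the whitespace produced by the concatenation above:
#
#   SELECT *   FROM dbo.positions
#   WHERE position_id >= 100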
Example #6
 def GetParameters(self, table):
     '''
     Get the value of the last ETL run date (etl_rundate) recorded in Athena
     '''
     try:
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
             table["schemaName"])
         try:
             maxval = int(
                 AthenaUtilities.GetMaxValue(self.awsParams,
                                             athenaSchemaName,
                                             table["paramTable"],
                                             "etl_rundate", self.logger))
             prevPartition = maxval
         except:
             prevPartition = 20000101  # Default starting period (Jan 1, 2000) in case the table does not exist yet
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in GetParameters")
         raise
     return str(prevPartition)
Example #7
    def GetLastUpdateDate(self, table):
        '''
        Get the last Update Date registered in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                table["schemaName"])

            last_update_date = AthenaUtilities.GetMaxValue(
                self.awsParams, athenaSchemaName, table["table"],
                "etl_last_update_date", self.logger)

            if last_update_date is None:
                last_update_date = datetime.date.today() - datetime.timedelta(
                    days=2)  #To process from yesterday by default
        except Exception as err:
            self.logger.info(self.moduleName +
                             " - GetLastUpdateDate() Error: " + str(err))
            # Fall back to the same default so the return below does not fail
            last_update_date = datetime.date.today() - datetime.timedelta(days=2)

        return datetime.datetime.strptime(str(last_update_date),
                                          "%Y-%m-%d").date()
Example #8
    def UploadFilesCreateAthenaTablesAndSqlScripts(self,
                                                   table,
                                                   localParquetFolderName,
                                                   partitionValue=None):
        '''
        Upload Parquet files into S3
        Create Athena Table/Partition
        Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
        Create script to insert data into Redshift and save to S3  (note that the ETL may not necessarily load data into Redshift)
        '''
        if not FileUtilities.FilesExistInFolder(localParquetFolderName +
                                                "*.parquet"):
            # Nothing was created.  We have a problem
            self.logger.info(
                self.moduleName +
                " - No parquet files were created for current partition in: " +
                localParquetFolderName + ".  Nothing was processed on Athena.")
            return False

        self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

        scriptPartitionValue = partitionValue
        if AthenaUtilities.IsTablePartitioned(table):
            # For partitioned tables, the script inserts a WHERE clause by default.  However, if we are doing a new load,
            # skip the WHERE clause so that we have a SQL script capable of loading all the data from Athena
            # into RedShift in the future
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                table["schemaName"], table["table"])
            if not S3Utilities.KeyExist(
                    self.awsParams, s3FolderLocation
            ):  # Do not update scripts if data has been previously loaded
                scriptPartitionValue = None
        AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table,
                                                self.fileUtilities.sqlFolder,
                                                scriptPartitionValue)

        AthenaUtilities.UploadFilesAndCreateAthenaTables(
            self.awsParams, localParquetFolderName, table,
            self.fileUtilities.sqlFolder, self.logger, partitionValue)
        return True
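A minimal sketch, assuming FileUtilities.FilesExistInFolder is a simple glob-style check against the pattern passed above (the real helper is not included in these examples):

import glob

def files_exist_in_folder_sketch(filePattern):
    # Hypothetical stand-in for FileUtilities.FilesExistInFolder
    return len(glob.glob(filePattern)) > 0

# e.g. files_exist_in_folder_sketch(localParquetFolderName + "*.parquet")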
Example #9
    def LoadDataFromAthenaIntoRedShiftLocalScripts(self,
                                                   table,
                                                   customWhereCondition=None):
        '''
        If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that
        '''
        # Under the hood the table will be recreated if the new flag is on or if the table does not exist.
        # Load the data from Athena into RedShift after that.  The load query only loads what is needed from Athena
        scriptToCreateRedshiftTable = FileUtilities.ComposeCreateTableSqlFilename(
            table, self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(scriptToCreateRedshiftTable, self.logger)

        scriptToLoadDataFromAthena = AthenaUtilities.ComposeInsertIntoSqlFilename(
            table, self.fileUtilities.sqlFolder)
        if customWhereCondition:
            # Replace the existing where clause with the custom clause
            customWhereCondition = " AND " + customWhereCondition + ";"
            replacements = {';': customWhereCondition}
            scriptToLoadDataFromAthenaCustom = scriptToLoadDataFromAthena + "_custom.sql"
            self.fileUtilities.ReplaceStringInFile(
                scriptToLoadDataFromAthena, scriptToLoadDataFromAthenaCustom,
                replacements)
            scriptToLoadDataFromAthena = scriptToLoadDataFromAthenaCustom
        RedshiftUtilities.PSqlExecute(scriptToLoadDataFromAthena, self.logger)
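A standalone illustration of the substitution performed above: the trailing semicolon of the generated insert script is replaced with " AND <customWhereCondition>;", which appends the extra filter to the script's existing WHERE clause.  The SQL text here is made up; the real script is produced by the Athena/Redshift script generators.

insertSql = ("INSERT INTO prod_schema.my_table "
             "SELECT * FROM athena_schema.my_table WHERE etl_partition_value = '2018-01-01';")
customWhereCondition = " AND record_id >= 2960000001;"    # illustrative key-range filter
customSql = insertSql.replace(";", customWhereCondition)  # same effect as ReplaceStringInFile here
# -> ... WHERE etl_partition_value = '2018-01-01' AND record_id >= 2960000001;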
Example #10
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the current table to load it up
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " starting")
            
            # Cleanup first (TODO - Need a more generic way to do this)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
            
            # Variables used for handling chunks.  -1 for full load
            chunkStart =  chunkEnd = maxValue = chunkSize = -1
            
            if "incrementalconditions" in tables:
                incrementalConditions = tables["incrementalconditions"]
                if "startID" in incrementalConditions:
                    chunkEnd = incrementalConditions["startID"] - 1
                else:
                    athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                    try:
                        # This is where we last ended.  Start at 1 + this end
                        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"], tables["distkey"], self.logger))
                    except ValueError:
                        chunkEnd = 0 # Table does not exist yet
                    except:
                        raise

                if "endID" in incrementalConditions:
                    maxValue = incrementalConditions["endID"]
                else:
                    # TODO - Fix this.  Also, we should start at the source min value not 0.
                    maxValue = 2000000000 #BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)
                    
                chunkSize = tables["incrementalconditions"]["chunksize"]
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
                    
            fieldDelimiter = self.job["delimiter"]
            if "delimiter" in tables:
                fieldDelimiter = tables["delimiter"]
            
            while chunkStart <= maxValue:
                partitionValue = self.GetPartitionValue(tables, chunkStart)
                sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                                  self.logger, self.fileUtilities, self.location)
                # Construct a meaningful file name that includes the start and end IDs
                outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
                self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                              self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
                # Process the data using Spark and save as Parquet
                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                schema = SparkUtilities.BuildSparkSchema(tables)
                df = (spark.read
                         .format("com.databricks.spark.csv")
                         .options(header='false', delimiter=fieldDelimiter)
                         .schema(schema)
                         .load(self.fileUtilities.csvFolder)
                         )
                df.printSchema()
                df.show()
                df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)
            
                self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) + " ROWS.  Now saving as parquet file...")
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
                SparkUtilities.SaveParquet(df, self.fileUtilities)
            
                # Need to load the data and clear the local space
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
                
                tables["new"] = "N" # Do not recreate
                if chunkSize < 0:
                    break  # Done with the single load
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
            
            # TODO - Need to make sure we don't end up with duplicate data if we run the code
            # twice on the same day
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " finished")
        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables for " + tables["table"])
            raise
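A hedged sketch of what SparkUtilities.BuildSparkSchema is assumed to do in ProcessTables above: map the table's column definitions onto a Spark StructType so the delimited extract can be read without header or type inference.  The shape of the column metadata ("fields" as a list of name/type dicts) is an assumption, not taken from these examples.

from pyspark.sql.types import StructType, StructField, StringType, LongType

def build_spark_schema_sketch(tableSettings):
    # Hypothetical: assumes tableSettings["fields"] is a list of {"name": ..., "type": ...} dicts
    typeMap = {"bigint": LongType()}  # anything unmapped falls back to StringType
    return StructType([
        StructField(column["name"], typeMap.get(column["type"], StringType()), True)
        for column in tableSettings["fields"]
    ])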