Example no. 1
 def IncrementalLoad(self, dbCommon, tables):
     self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
     try:
         # This is where we last ended.  Start at 1 + this end
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
         chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"],
                                                tables["incrementalconditions"]["keyfield"], self.logger))
     except ValueError:
         chunkEnd = 0 # Table does not exist yet
     except:
         raise
             
     #chunkEnd = 2960000000
     #maxValue = 3708000000 # 2249000000 3708000000
     maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
         
     chunkSize = tables["incrementalconditions"]["chunksize"]
     chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
 
     fieldTerminator = self.job["fieldTerminator"]
     rowTerminator = None # Not using this. Stick with the default of CR/LF.  self.job["rowTerminator"]
 
     chunkStartData = chunkStart
     # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
     partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
     while chunkStart <= maxValue:
         sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                           self.logger, self.fileUtilities, self.location)
         # Construct a meaningful file name, i.e. one that carries the start and end IDs
         fileBaseName = tables["incrementalconditions"]["keyfield"] + "-" + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd)
         outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
         self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
         self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldTerminator, rowTerminator,
                                       self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
     
         # Process the data using Spark and save as Parquet
         spark = SparkUtilities.GetCreateSparkSession(self.logger)
         df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                         self.fileUtilities.csvFolder, self.logger)
         SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
         self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
         
         tables["new"] = "N" # Do not recreate
         if chunkSize < 0:
             break  # Done with the single load
         chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
     
     # Load only the data that we processed into Redshift.  We cannot use the ETL run date partition value
     # since we are loading the data based on record IDs
     customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
     self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
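
UpdateChunkStartEnd itself is not shown in these examples. A minimal sketch that is consistent with how it is called here (the next chunk starts right after the previous end, a negative chunkSize means a single full load, and the caller's loop stops once chunkStart exceeds maxValue) could look like the following; treat it as an assumption, not the actual implementation:

 def UpdateChunkStartEnd(self, previousChunkEnd, chunkSize, maxValue):
     # Hypothetical sketch - the real method is not part of these examples
     chunkStart = previousChunkEnd + 1
     if chunkSize < 0:
         chunkEnd = maxValue  # Single full load covers everything up to maxValue
     else:
         chunkEnd = min(chunkStart + chunkSize - 1, maxValue)
     return chunkStart, chunkEnd
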
Example no. 2
    def GetLatestValuationDateInAthena(self, table):
        '''
        Get the last year-month (based on valuation date) that has been processed in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                table["schemaName"])
            latestValuationDateInAthena = AthenaUtilities.GetMaxValue(
                self.awsParams, athenaSchemaName, table["table"],
                "etl_valuationdate", self.logger)
        except ValueError:
            latestValuationDateInAthena = None  # Table has not been created yet
        except:
            raise

        return latestValuationDateInAthena
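
A caller would typically compare this value against the valuation dates available at the source and only process the newer ones. The loop below is only an illustration of that use; availableValuationDates and ProcessValuationDate are assumed names, not part of the original module:

        latestValuationDateInAthena = self.GetLatestValuationDateInAthena(table)
        for valuationDate in availableValuationDates:  # Assumed to come from the source system
            # None means nothing has been loaded yet, so every date is new
            if latestValuationDateInAthena is None or valuationDate > latestValuationDateInAthena:
                self.ProcessValuationDate(table, valuationDate)  # Hypothetical processing step
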
Example no. 3
    def GetMaxPublishDateInAthena(self, catalog):
        '''
        Get the last publish date that has been processed in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                catalog["schemaName"])
            maxDate = AthenaUtilities.GetMaxValue(self.awsParams,
                                                  athenaSchemaName,
                                                  catalog["paramTable"],
                                                  "publisheddate", self.logger)
            if maxDate == 'max_val':
                maxDate = None
        except ValueError:
            maxDate = None  # Table has not been created yet
        except:
            raise

        return maxDate
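
Examples 2 and 3, like the lookups in Examples 1 and 6, repeat the same pattern: ask AthenaUtilities.GetMaxValue for the maximum of a column and fall back to a default when the table does not exist yet, which GetMaxValue signals with a ValueError. A hypothetical helper could factor that out; GetMaxValueOrDefault does not exist in the original code and is only a sketch of the shared pattern:

    def GetMaxValueOrDefault(self, schemaName, tableName, fieldName, default):
        '''
        Hypothetical helper: max value of fieldName in Athena, or the given
        default if the table has not been created yet
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(schemaName)
            return AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                               tableName, fieldName, self.logger)
        except ValueError:
            return default  # Table does not exist yet
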
Example no. 4
 def GetParameters(self, table):
     '''
     Get the value of the last ETL run date
     '''
     try:
         athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
             table["schemaName"])
         try:
             maxval = int(
                 AthenaUtilities.GetMaxValue(self.awsParams,
                                             athenaSchemaName,
                                             table["paramTable"],
                                             "etl_rundate", self.logger))
             prevPartition = maxval
         except:
             prevPartition = 20000101  # Default to Jan 1, 2000 so we always have a starting period
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in GetParameters")
         raise
     return str(prevPartition)
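
The string returned by GetParameters lends itself to an incremental filter in the same spirit as the customWhereCondition of Example no. 1. The lines below are a hedged illustration; the column name and the call site are assumptions:

     prevPartition = self.GetParameters(table)
     # Hypothetical usage: only pull rows recorded after the last ETL run date
     customWhereCondition = "etl_rundate > " + prevPartition
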
Example no. 5
    def GetLastUpdateDate(self, table):
        '''
        Get the last Update Date registered in Athena
        '''
        try:
            athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(
                table["schemaName"])

            last_update_date = AthenaUtilities.GetMaxValue(
                self.awsParams, athenaSchemaName, table["table"],
                "etl_last_update_date", self.logger)

            if last_update_date is None:
                last_update_date = datetime.date.today() - datetime.timedelta(
                    days=2)  # Process from yesterday by default
        except StandardError as err:
            self.logger.info(self.moduleName +
                             " - GetLastUpdateDate() Error: " + err.message)
            # Fall back to the same default so the conversion below never sees an unbound name
            last_update_date = datetime.date.today() - datetime.timedelta(days=2)

        return datetime.datetime.strptime(str(last_update_date),
                                          "%Y-%m-%d").date()
Example no. 6
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the current table to load it up
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " starting")
            
            # Cleanup first (TODO - Need a more generic way to do this)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
            
            # Variables used for handling chunks.  -1 for full load
            chunkStart = chunkEnd = maxValue = chunkSize = -1
            
            if "incrementalconditions" in tables:
                incrementalConditions = tables["incrementalconditions"]
                if "startID" in incrementalConditions:
                    chunkEnd = incrementalConditions["startID"] - 1
                else:
                    athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                    try:
                        # This is where we last ended.  Start at 1 + this end
                        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName, tables["table"], tables["distkey"], self.logger))
                    except ValueError:
                        chunkEnd = 0 # Table does not exist yet
                    except:
                        raise

                if "endID" in incrementalConditions:
                    maxValue = incrementalConditions["endID"]
                else:
                    # TODO - Fix this.  Also, we should start at the source min value not 0.
                    maxValue = 2000000000 #BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)
                    
                chunkSize = tables["incrementalconditions"]["chunksize"]
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
                    
            fieldDelimiter = self.job["delimiter"]
            if "delimiter" in tables:
                fieldDelimiter = tables["delimiter"]
            
            while chunkStart <= maxValue:
                partitionValue = self.GetPartitionValue(tables, chunkStart)
                sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                                  self.logger, self.fileUtilities, self.location)
                # Construct a meaningful file name, i.e. one that carries the start and end IDs
                outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
                self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                              self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)
                # Process the data using Spark and save as Parquet
                spark = SparkUtilities.GetCreateSparkSession(self.logger)
                schema = SparkUtilities.BuildSparkSchema(tables)
                df = (spark.read
                         .format("com.databricks.spark.csv")
                         .options(header='false', delimiter=fieldDelimiter)
                         .schema(schema)
                         .load(self.fileUtilities.csvFolder)
                         )
                df.printSchema()
                df.show()
                df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)
            
                self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) + " ROWS.  Now saving as parquet file...")
                self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
                SparkUtilities.SaveParquet(df, self.fileUtilities)
            
                # Need to load the data and clear the local space
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
                
                tables["new"] = "N" # Do not recreate
                if chunkSize < 0:
                    break  # Done with the single load
                chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)
            
            # TODO - Need to make sure we don't end up with duplicate data if we run the code
            # twice on the same day
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

            self.logger.debug(self.moduleName + " -- ProcessTables for  " + tables["table"] + " finished")
        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessTables for " + tables["table"])
            raise
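
One side note on the Spark read in Example no. 6: it goes through the external com.databricks.spark.csv package, which was needed on Spark 1.x. On Spark 2.0 and later the CSV source is built in, so the same read can be expressed without the external package. A minimal sketch, assuming the same schema, header and delimiter settings as above:

                df = (spark.read
                          .schema(schema)
                          .option("header", "false")
                          .option("sep", fieldDelimiter)
                          .csv(self.fileUtilities.csvFolder))
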