def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder.
    '''
    try:
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "srcS3DataFolder" in tables:
            self.DownloadFilesFromS3(tables)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        # Remove records where every column is null
        df = df.dropna(how='all')
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, None)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
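# The ProcessTables variants in this section key their behavior off optional entries in the
# "tables" job configuration. Below is a minimal sketch of such an entry, limited to the keys the
# method above actually reads; real job definitions carry additional fields (schemaName, table,
# column lists, etc.) and the values shown here are illustrative only.
sample_table_config = {
    "srcS3DataFolder": "some/s3/prefix/",        # optional: triggers DownloadFilesFromS3
    "delimiter": ",",                            # passed to SparkUtilities.ReadCSVFile
    "adjustFormat": [                            # optional: per-column reformatting
        {"name": "report_date", "inputFormat": "MM/dd/yyyy"}
    ],
    "loadToRedshift": "Y"                        # "Y" enables the final Redshift load step
}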
def ProcessTables(self, dbCommon, tables):
    '''
    Get the last partition value, use that as the date to pull data,
    then put that data into Athena.
    '''
    try:
        outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
        fieldTerminator = self.job["fieldTerminator"]
        if "fieldTerminator" in tables:
            fieldTerminator = tables["fieldTerminator"]
        rowTerminator = None  # Not using self.job["rowTerminator"]; stick with the default of CR/LF

        if "pullTemplate" in tables:
            sqlPullDataScript = self.CreatePullScript(tables)
            self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                          outputCSV, dbCommon, tables, fieldTerminator,
                                          rowTerminator, self.job["bcpUtilityDirOnLinux"],
                                          self.fileUtilities, self.logger)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        # Read the CSV back with the same (possibly table-specific) field terminator used for the extract
        df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder.
    '''
    try:
        self.DownloadFilesFromS3(tables)
        for fileName in self.fileUtilities.ScanFolder(self.localTempDirectory + '/raw/'):
            outPutFileName = self.GenerateCSVFromSpreadSheet(tables,
                                                             self.localTempDirectory + '/raw/',
                                                             fileName)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder.
    '''
    try:
        self.rawFolder = self.localTempDirectory + "/" + "Raw"
        self.BulkDownload()
        self.ProcessFiles()
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Pull data from the different sheets and put that information into a CSV file.
    '''
    try:
        xl = ExcelUtilities(self.logger)
        localFilepath = self.fileUtilities.FindMostCurrentFile(self.job["foldertoscan"])
        csvfile = self.CreateCsvFile(tables)
        csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        if localFilepath is not None:
            self.ProcessFile(xl, localFilepath, csvWriter)
        csvfile.close()

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def IncrementalLoad(self, dbCommon, tables):
    self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
    try:
        # This is where we last ended.  Start at 1 + this end
        athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
        chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                   tables["table"],
                                                   tables["incrementalconditions"]["keyfield"],
                                                   self.logger))
    except ValueError:
        chunkEnd = 0  # Table does not exist yet
    except:
        raise

    #chunkEnd = 2960000000
    #maxValue = 3708000000 # 2249000000 3708000000
    maxValue = BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, self.logger)
    chunkSize = tables["incrementalconditions"]["chunksize"]
    chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    fieldTerminator = self.job["fieldTerminator"]
    rowTerminator = None  # Not using self.job["rowTerminator"]; stick with the default of CR/LF

    chunkStartData = chunkStart
    # Each ETL gets the same date so that we can do a smart insert based on ETL and chunkStartData
    partitionValue = datetime.datetime.strftime(datetime.date.today(), '%Y-%m-%d')
    while chunkStart <= maxValue:
        sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                          self.logger, self.fileUtilities,
                                                          self.location)
        # Construct a meaningful file name that carries the start and end IDs
        fileBaseName = (tables["incrementalconditions"]["keyfield"] + "-" +
                        BCPUtilities.ComponseRangeString(chunkStart, chunkEnd))
        outputCSV = self.fileUtilities.csvFolder + fileBaseName + ".csv"
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables,
                                      fieldTerminator, rowTerminator,
                                      self.job["bcpUtilityDirOnLinux"],
                                      self.fileUtilities, self.logger)

        # Process the data using Spark and save as Parquet
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, fieldTerminator, False,
                                        self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities, fileBaseName)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet,
                                                        partitionValue)
        tables["new"] = "N"  # Do not recreate

        if chunkSize < 0:
            break  # Done with the single load
        chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

    # Load only the data that we processed into Redshift.  We cannot use the run ETL date
    # partition value since we are loading the data based on record IDs
    customWhereCondition = tables["incrementalconditions"]["keyfield"] + " >= " + str(chunkStartData)
    self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables, customWhereCondition)
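# IncrementalLoad above walks the key range in windows of "chunksize" records. The helper
# UpdateChunkStartEnd is not shown in this section; the sketch below only assumes the behavior its
# call sites imply (start one past the previous end, advance by chunkSize, clamp to maxValue, and
# treat a negative chunkSize as a single full pull). It is an illustration, not the actual helper.
def update_chunk_start_end(chunk_end, chunk_size, max_value):
    chunk_start = chunk_end + 1
    if chunk_size < 0:                       # single full load
        return chunk_start, max_value
    return chunk_start, min(chunk_end + chunk_size, max_value)

chunk_end, chunk_size, max_value = 0, 1000, 3500
chunk_start, chunk_end = update_chunk_start_end(chunk_end, chunk_size, max_value)
while chunk_start <= max_value:
    print("pulling IDs %d-%d" % (chunk_start, chunk_end))   # 1-1000, 1001-2000, ..., 3001-3500
    if chunk_size < 0:
        break                                # done with the single load
    chunk_start, chunk_end = update_chunk_start_end(chunk_end, chunk_size, max_value)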
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up.
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    self.ProcessFiles(dbCommon)
    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, dbCommon["delimiter"], False,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " Done.")
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the liquids balance catalog.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "Processing data for catalog: " + catalog["name"])
        self.GetLastLiquidsBalanceFileInfo(dbCommon)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)

        dfConsolidated = None
        for tableDDL in catalog["tables"]:
            if tableDDL["type"] == "raw":
                csvInfo = self.GetSheetDataToCsv(dbCommon, tableDDL)
                df = SparkUtilities.ReadCSVFile(spark, csvInfo["tableDDL"], self.job["delimiter"],
                                                True, csvInfo["csvFileName"], self.logger)
                if dfConsolidated is None:
                    dfConsolidated = self.MeltDataFrame(catalog["applyCategoryCol"], tableDDL, df)
                else:
                    # unionAll returns a new DataFrame, so the result must be kept
                    dfConsolidated = dfConsolidated.unionAll(
                        self.MeltDataFrame(catalog["applyCategoryCol"], tableDDL, df))

        for tableDDL in catalog["tables"]:
            if tableDDL["type"] == "destination":
                SparkUtilities.SaveParquet(dfConsolidated, self.fileUtilities)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(tableDDL, self.fileUtilities.parquet)
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tableDDL)
                break
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + err.message)
        raise
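# Worth noting for the consolidation step above: PySpark's unionAll (an alias for union on
# Spark 2.x) returns a new DataFrame instead of mutating the receiver, which is why the result
# has to be reassigned to dfConsolidated. A small self-contained illustration:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("union-demo").getOrCreate()
df1 = spark.createDataFrame([(1, "a")], ["id", "val"])
df2 = spark.createDataFrame([(2, "b")], ["id", "val"])

df1.unionAll(df2)                 # result discarded; df1 is unchanged
combined = df1.unionAll(df2)      # correct: keep the returned DataFrame
print(combined.count())           # 2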
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder.
    '''
    try:
        rawFolder = self.localTempDirectory + '/raw/'
        if "srcS3DataFolder" in tables:
            self.DownloadFilesFromS3(tables)
            xl = ExcelUtilities(self.logger)
            outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
            xl.Excel2CSV(rawFolder + tables["srcFile"],
                         None,
                         outPutFileName,
                         self.fileUtilities.csvFolder,
                         defDateFormat='%Y-%m-%d',
                         skiprows=tables["skipRows"])
        else:
            self.ProcessWebCall(tables, rawFolder)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, tables["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        self.fileUtilities.EmptyFolderContents(rawFolder)
        # Remove records where every column is null
        df = df.dropna(how='all')
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process the data for the table.
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Unzip the file rather than reading the gzip as Spark is faster with csv
    localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
    self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
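# The gunzip-before-read step above exists because a single .gz file is not splittable, so Spark
# would read it with one task; decompressing first lets the CSV be split across tasks. The real
# FileUtilities.GunzipFile is not shown in this section; below is a minimal sketch of the same
# operation, assuming a straightforward stream copy.
import gzip
import shutil

def gunzip_file(gzip_path, csv_path):
    # Decompress the downloaded file so Spark can parallelize the CSV read.
    with gzip.open(gzip_path, "rb") as src, open(csv_path, "wb") as dst:
        shutil.copyfileobj(src, dst)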
def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share and place it in the raw folder.
    '''
    try:
        rawFolder = self.localTempDirectory + '/raw/'
        flist = self.DownloadFile(rawFolder)
        xl = ExcelUtilities(self.logger)
        outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
        for fl in flist:
            xl.Excel2CSV(rawFolder + fl,
                         'Sheet1',
                         outPutFileName,
                         self.fileUtilities.csvFolder)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                        self.fileUtilities.csvFolder, self.logger)
        if "adjustFormat" in tables:
            for fld in tables["adjustFormat"]:
                df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
def LoadClassRefDF(self, spark):
    '''
    Loads the class reference data.
    '''
    xReferencesDF = {}
    for catalog in self.job["catalogs"]:
        if catalog["name"] == "xReferences":
            for xrefTable in catalog["tables"]:
                if self.xRefPulled is False:
                    S3Utilities.CopyItemsAWSCli(
                        "s3://" + self.job["bucketName"] + xrefTable["s3SourceFolder"] + xrefTable["sourceFileName"],
                        self.fileUtilities.csvFolder,
                        "--quiet")
                xReferencesDF[xrefTable["table"]] = SparkUtilities.ReadCSVFile(
                    spark, xrefTable, self.job["delimiter"], False,
                    self.fileUtilities.csvFolder + "/" + xrefTable["sourceFileName"],
                    self.logger)
            self.xRefPulled = True
    return xReferencesDF
def ProcessTables(self, dbCommonNotUsed, table):
    '''
    The actual process starts here.
    '''
    try:
        strDateTodayMinus1 = datetime.datetime.strftime(
            datetime.date.today() - datetime.timedelta(days=1), "%Y-%m-%d")
        latestValuationDateInAthena = self.GetLatestValuationDateInAthena(table)
        if latestValuationDateInAthena == strDateTodayMinus1:
            self.logger.debug(self.moduleName + " -- " +
                              "*** Totem data is already up-to-date as of: " +
                              latestValuationDateInAthena + " ***")
            return

        self.SetGlobalVariables()
        yearMonthsToProcess = self.GetYearMonthsToProcess(latestValuationDateInAthena)
        #yearMonthsToProcess = self.GetYearMonthsToProcess("2017-11-10") # For debugging
        for yearMonth in yearMonthsToProcess:
            self.logger.debug(self.moduleName + " -- " + "Processing Year-Month: " + yearMonth)
            strDateToday = datetime.datetime.strftime(datetime.date.today(), "%Y-%m-%d")
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            fileListForYearMonth = self.GetFileListForYearMonth(yearMonth)
            for fileName in fileListForYearMonth:
                self.GetFile(yearMonth, fileName)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, table, self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder, self.logger)

            # The data frame contains a number of valuation dates.  Get the distinct valuation
            # dates and create a partition for each valuation date
            distinctValuationDates = sorted(df.select(df.valuationdate).distinct().collect())
            for item in distinctValuationDates:
                # Process new days only.  Skip today so that we don't get partials.  Otherwise we
                # would have to delete data from Athena/RedShift to avoid duplicates
                if item.valuationdate <= latestValuationDateInAthena or item.valuationdate == strDateToday:
                    continue

                self.logger.debug(self.moduleName + " - Processing Valuation Date: " + item.valuationdate)
                dfValuationDate = df.filter(df.valuationdate == item.valuationdate)
                fileBaseName = "ValuationDate-" + item.valuationdate
                SparkUtilities.SaveParquet(dfValuationDate, self.fileUtilities, fileBaseName)
                self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet,
                                                                item.valuationdate)
                if "loadToRedshift" in table and table["loadToRedshift"] == "Y":
                    self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

        self.logger.info(self.moduleName + " - Finished processing.")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables for " +
                              table["table"])
        raise
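# The per-valuation-date loop above follows a collect-distinct-then-filter pattern: gather the
# distinct values of the partition column, then write one Parquet slice per value. Below is a
# self-contained sketch of that pattern; the column name, sample rows, and output path are
# illustrative, not the module's actual schema.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("partition-demo").getOrCreate()
df = spark.createDataFrame(
    [("2017-11-09", 1.0), ("2017-11-10", 2.0), ("2017-11-10", 3.0)],
    ["valuationdate", "price"])

# One Parquet folder per distinct valuation date, mirroring the loop above.
for row in sorted(df.select(df.valuationdate).distinct().collect()):
    df.filter(df.valuationdate == row.valuationdate) \
      .write.mode("overwrite") \
      .parquet("/tmp/demo/ValuationDate-" + row.valuationdate)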
def ProcessDataSource(self, srcCategory, tables):
    '''
    Process each category.
    '''
    self.logger.debug(self.moduleName + " -- " + "ProcessDataSource " + " starting ")
    try:
        for year in srcCategory["years"]:
            url = srcCategory["urlPrefix"] + year + "." + srcCategory["urlExt"]
            self.logger.info(self.moduleName + " - Processing url: " + url)
            localFilePath = (self.localTempDirectory + "/raw/" +
                             ntpath.basename(srcCategory["urlPrefix"]) +
                             year + "." + srcCategory["urlExt"])
            scrubbedFilepath = (self.localTempDirectory + "/scrub/" +
                                ntpath.basename(srcCategory["urlPrefix"]) +
                                year + "." + srcCategory["urlExt"])
            FileUtilities.DownloadFromURL(url, localFilePath)

            if srcCategory["urlExt"] == "zip":  # Unzip the file if we receive a zip format
                unzipFilelocation = self.localTempDirectory + "/raw/"
                self.fileUtilities.UnzipFile(localFilePath, unzipFilelocation)
                localFilePath = unzipFilelocation + srcCategory["unzipFilename"]
                scrubbedFilepath = (self.localTempDirectory + "/scrub/" + year + "_" +
                                    srcCategory["unzipFilename"])

            # Clean up the file and add the category tags
            tag = srcCategory["srcCategory"] + "," + srcCategory["srcDescription"] + ","
            replacements = {'^': tag, '" ': '"', '#VALUE!': '', r'\.,': ' ,'}
            self.fileUtilities.ReplaceStringInFile(localFilePath, scrubbedFilepath, replacements)

            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True,
                                            self.localTempDirectory + "/scrub/", self.logger)
            if "adjustFormat" in srcCategory:
                for fld in srcCategory["adjustFormat"]:
                    df = SparkUtilities.FormatColumn(df, fld["name"], fld["inputFormat"])
            df.write.parquet(self.fileUtilities.parquet, mode="append")

            self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/scrub/")
            self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/raw/")
        self.logger.debug(self.moduleName + " -- " + "ProcessDataSource for " +
                          srcCategory["srcCategory"] + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessDataSource")
        raise
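# The scrub step above prepends a category/description tag to every record and strips known
# artifacts through a replacements map. The r'\.,' entry suggests the patterns are treated as
# regular expressions (so '^' acts as "prepend the tag at the start of each line"). The real
# FileUtilities.ReplaceStringInFile is not shown in this section; below is a minimal standalone
# sketch under that assumption, with hypothetical file paths and tag values.
import re

def replace_patterns_in_file(src_path, dst_path, replacements):
    # Apply each regex substitution to every line of the source file.
    with open(src_path) as src, open(dst_path, "w") as dst:
        for line in src:
            for pattern, repl in replacements.items():
                line = re.sub(pattern, repl, line)
            dst.write(line)

replacements = {'^': 'category,description,', '" ': '"', '#VALUE!': '', r'\.,': ' ,'}
replace_patterns_in_file("raw/source_2016.csv", "scrub/2016_source.csv", replacements)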