def UploadScriptsToDesignatedS3Location(localScriptsFilepath, tableSettings):
    '''
    Upload the script files (typically the table-creation and insert scripts) to the designated S3 location
    '''
    s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(tableSettings["schemaName"],
                                                                tableSettings["table"])
    # Clear out any scripts left behind by a previous run before uploading the new ones
    S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FolderLocation, "--recursive")

    # Upload only the scripts that we plan to keep for later reuse
    scriptToCreateRedshift = FileUtilities.ComposeCreateTableSqlFilename(tableSettings, localScriptsFilepath)
    scriptToInsertIntoRedshift = AthenaUtilities.ComposeInsertIntoSqlFilename(tableSettings, localScriptsFilepath)
    S3Utilities.S3Copy(scriptToCreateRedshift, s3FolderLocation)
    S3Utilities.S3Copy(scriptToInsertIntoRedshift, s3FolderLocation)
    return s3FolderLocation
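
# The S3 helpers used above (S3Utilities.S3Copy, S3Utilities.DeleteFileFromS3TempUsingAWSCLi)
# are project utilities whose implementations are not shown in this section. As a rough,
# hedged illustration of what such a copy helper might wrap -- the "UsingAWSCLi" naming
# suggests the AWS CLI -- the hypothetical function below shells out to "aws s3 cp".
# The function name and behaviour are assumptions for illustration only, not part of
# this codebase.
def ExampleS3CopyViaAwsCli(sourcePath, destinationS3Location):
    '''
    Hypothetical sketch of a local-to-S3 copy via the AWS CLI
    '''
    import subprocess  # local import to keep the example self-contained
    # Requires the AWS CLI on the PATH and credentials configured in the environment
    subprocess.check_call(["aws", "s3", "cp", sourcePath, destinationS3Location])
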
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process a single catalog file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    # Clear the working folders from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Strip the .gz extension; the unzipped file is the raw Excel workbook
    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # Spark has no raw Excel reader, so read the workbook with Pandas first
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath, catalog["excelSheetName"],
                             index_col=None, na_values=['NaN'], skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- ProcessCatalogs for file: " + s3Key + " finished.\n\n")
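
# PandasUtilities and SparkUtilities are project helpers not shown here. The hypothetical
# sketch below illustrates the same Excel-to-Parquet idea using only plain pandas and
# pyspark calls: read one sheet into a pandas DataFrame, hand it to Spark, and write
# Parquet. The function name, paths, and sheet name are assumptions for illustration;
# the real code above also builds an explicit schema (SparkUtilities.BuildSparkSchema)
# and normalises NaN values before saving.
def ExampleExcelToParquet(excelFilepath, sheetName, parquetFolder):
    '''
    Hypothetical sketch: load an Excel sheet with pandas and persist it as Parquet via Spark
    '''
    import pandas as pd                      # local imports to keep the example self-contained
    from pyspark.sql import SparkSession

    pandasDf = pd.read_excel(excelFilepath, sheet_name=sheetName, index_col=None, na_values=['NaN'])
    spark = SparkSession.builder.appName("ExampleExcelToParquet").getOrCreate()
    sparkDf = spark.createDataFrame(pandasDf)            # schema inferred here; the real code supplies one
    sparkDf.write.mode("overwrite").parquet(parquetFolder)
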
def ProcessTables(self, dbCommon, tables):
    '''
    Process the data for the table
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Unzip the file rather than reading the gzip directly, since Spark is faster with plain CSV
    localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
    self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.info(self.moduleName + " -- ProcessTables finished")
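
# SparkUtilities.ReadCSVFile and SparkUtilities.SaveParquet are likewise project helpers
# whose implementations are not shown in this section. The hypothetical sketch below
# illustrates the underlying pattern with Spark's built-in CSV reader: read a delimited
# file and write it back out as Parquet. The delimiter, header flag, and paths are
# assumptions for illustration; the real helpers also apply an explicit schema derived
# from the table settings.
def ExampleCsvToParquet(csvFolder, delimiter, parquetFolder):
    '''
    Hypothetical sketch: read a delimited file with Spark and persist it as Parquet
    '''
    from pyspark.sql import SparkSession     # local import to keep the example self-contained

    spark = SparkSession.builder.appName("ExampleCsvToParquet").getOrCreate()
    df = (spark.read
          .option("delimiter", delimiter)
          .option("header", "false")
          .csv(csvFolder))
    df.write.mode("overwrite").parquet(parquetFolder)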