def UploadFilesCreateAthenaTablesAndSqlScripts(self, table, localParquetFolderName, partitionValue=None):
    '''
    Upload Parquet files into S3.
    Create the Athena table/partition.
    Create a script to create a RedShift table and save it to S3 (note that the ETL may not necessarily load data into Redshift).
    Create a script to insert data into Redshift and save it to S3 (note that the ETL may not necessarily load data into Redshift).
    '''
    if not FileUtilities.FilesExistInFolder(localParquetFolderName + "*.parquet"):
        # Nothing was created.  We have a problem
        self.logger.info(self.moduleName + " - No parquet files were created for current partition in: " +
                         localParquetFolderName + ".  Nothing was processed on Athena.")
        return False

    # Generate the RedShift CREATE TABLE script for this table
    self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

    scriptPartitionValue = partitionValue
    if AthenaUtilities.IsTablePartitioned(table):
        # For partitioned tables, the script will insert a where clause by default.  However, if we are doing a new load,
        # skip the where clause so that we have a SQL script capable of loading all the data from Athena
        # into RedShift in the future.
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(table["schemaName"], table["table"])
        if not S3Utilities.KeyExist(self.awsParams, s3FolderLocation):  # Do not update scripts if data has been previously loaded
            scriptPartitionValue = None
    AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table, self.fileUtilities.sqlFolder, scriptPartitionValue)

    AthenaUtilities.UploadFilesAndCreateAthenaTables(self.awsParams, localParquetFolderName, table,
                                                     self.fileUtilities.sqlFolder, self.logger, partitionValue)
    return True
def UploadFilesAndCreateAthenaTables(awsParams, localParquetFilepath, tableSettings, localScriptsFilepath, logger, partitionValue):
    '''
    Upload the file(s) to the designated S3 Athena passive lake location and create the Athena tables.
    Do this using the Athena credentials.
    '''
    # Need the proper credentials to write to the Athena lake
    old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()

    # For partitioned tables, the creation scripts in S3 will be built once to insert ALL the data from Athena into Redshift.
    # Incremental runs will not update the S3 scripts since they are designed to incrementally update the RedShift tables.
    updateScriptsInS3 = True
    if AthenaUtilities.IsTablePartitioned(tableSettings):
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(tableSettings["schemaName"], tableSettings["table"])
        # Do not update scripts if data has been previously loaded
        updateScriptsInS3 = not S3Utilities.KeyExist(awsParams, s3FolderLocation)

    # Save the Parquet file(s) in the designated S3 location and create the corresponding Athena tables
    s3FolderLocation = AthenaUtilities.UploadDataFilesToDesignatedS3Location(localParquetFilepath, tableSettings, partitionValue)
    AthenaUtilities.CreateAthenaTablesUsingAthenaCLI(tableSettings, s3FolderLocation, partitionValue, logger)

    # Save the SQL script files in the designated S3 location in case we need to delete the data from RedShift to save space.
    # The scripts in S3 will reload ALL the data to make sure the table is fully rebuilt.
    if updateScriptsInS3:
        AthenaUtilities.UploadScriptsToDesignatedS3Location(localScriptsFilepath, tableSettings)

    logger.info("AthenaUtilities -- Done uploading data to S3: " + s3FolderLocation)

    # Restore the original S3 credentials
    awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)
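# --- Usage sketch (illustrative only) -------------------------------------------------
# A minimal sketch of how a derived ETL process might call
# UploadFilesCreateAthenaTablesAndSqlScripts after staging its Parquet output.
# The base class name (ApplicationBase), the subclass name, the staging folder,
# and the partition value are assumptions for illustration; only the method name,
# the tableSettings dict keys, and the self.* attributes come from the code above.

class SampleAthenaEtl(ApplicationBase):  # hypothetical subclass name
    def ProcessTable(self, tableSettings):
        # Assumed staging folder where the source data was already converted to Parquet (not shown)
        localParquetFolder = self.localTempDirectory + "/parquet/"

        # Publish the Parquet files to the Athena lake and emit the RedShift create/insert scripts
        loaded = self.UploadFilesCreateAthenaTablesAndSqlScripts(tableSettings,
                                                                 localParquetFolder,
                                                                 partitionValue="2017-06-30")
        if not loaded:
            self.logger.info(self.moduleName + " - nothing was staged for table: " + tableSettings["table"])
# ---------------------------------------------------------------------------------------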