Example #1
    def UploadFilesCreateAthenaTablesAndSqlScripts(self,
                                                   table,
                                                   localParquetFolderName,
                                                   partitionValue=None):
        '''
        Upload the Parquet files into S3.
        Create the Athena table/partition.
        Create a script to create the corresponding Redshift table and save it to S3 (the ETL may not necessarily load data into Redshift).
        Create a script to insert the data into Redshift and save it to S3 (the ETL may not necessarily load data into Redshift).
        '''
        if not FileUtilities.FilesExistInFolder(localParquetFolderName +
                                                "*.parquet"):
            # Nothing was created.  We have a problem
            self.logger.info(
                self.moduleName +
                " - No parquet files were created for current partition in: " +
                localParquetFolderName + ".  Nothing was processed on Athena.")
            return False

        self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

        scriptPartitionValue = partitionValue
        if AthenaUtilities.IsTablePartitioned(table):
            # For partitioned tables, the script inserts a WHERE clause by default.  However, if we are doing a new load,
            # skip the WHERE clause so that the SQL script is capable of loading all of the data from Athena
            # into Redshift in the future
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                table["schemaName"], table["table"])
            if not S3Utilities.KeyExist(
                    self.awsParams, s3FolderLocation
            ):  # Do not update scripts if data has been previously loaded
                scriptPartitionValue = None
        AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table,
                                                self.fileUtilities.sqlFolder,
                                                scriptPartitionValue)

        AthenaUtilities.UploadFilesAndCreateAthenaTables(
            self.awsParams, localParquetFolderName, table,
            self.fileUtilities.sqlFolder, self.logger, partitionValue)
        return True
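
The guard at the top of this example relies on FileUtilities.FilesExistInFolder, which is not shown in the snippet. A minimal sketch of what it might look like, assuming it simply globs for the given pattern (the name and signature are taken from the call site; the real implementation may differ):

import glob

def FilesExistInFolder(filePattern):
    # Return True if at least one file matches the glob pattern, e.g.
    # localParquetFolderName + "*.parquet" as in the call above.
    return len(glob.glob(filePattern)) > 0
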
Example #2
    @staticmethod
    def UploadFilesAndCreateAthenaTables(awsParams, localParquetFilepath,
                                         tableSettings, localScriptsFilepath,
                                         logger, partitionValue):
        '''
        Upload the file(s) to the designated S3 Athena passive-lake location and create the Athena tables.
        Do this using the Athena credentials.
        '''
        # Need the proper credentials to write to the Athena lake
        old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()

        # For partitioned tables, the creation scripts in S3 are built once to insert ALL the data from Athena into Redshift.
        # Incremental runs do not update the S3 scripts, since incremental scripts are designed to update the Redshift tables incrementally
        updateScriptsInS3 = True
        if AthenaUtilities.IsTablePartitioned(tableSettings):
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                tableSettings["schemaName"], tableSettings["table"])
            updateScriptsInS3 = not S3Utilities.KeyExist(
                awsParams, s3FolderLocation
            )  # Do not update scripts if data has been previously loaded

        # Save the Parquet file(s) in the designated S3 location and create the corresponding Athena tables
        s3FolderLocation = AthenaUtilities.UploadDataFilesToDesignatedS3Location(
            localParquetFilepath, tableSettings, partitionValue)
        AthenaUtilities.CreateAthenaTablesUsingAthenaCLI(
            tableSettings, s3FolderLocation, partitionValue, logger)

        # Save the SQL script files in the designated S3 location in case we ever need to delete the data from Redshift to save space.
        # The scripts in S3 will reload ALL the data to make sure the table is fully rebuilt
        if updateScriptsInS3:
            AthenaUtilities.UploadScriptsToDesignatedS3Location(
                localScriptsFilepath, tableSettings)

        logger.info("AthenaUtilities -- Done uploading data to S3: " +
                    s3FolderLocation)

        awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)
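
Both examples call S3Utilities.KeyExist to decide whether the load scripts should be (re)written. A hypothetical boto3-based sketch of such a check, assuming s3FolderLocation is an "s3://bucket/prefix" string (the real S3Utilities implementation, including how awsParams carries credentials, may differ):

import boto3

def KeyExist(awsParams, s3FolderLocation):
    # Hypothetical sketch: return True if any object exists under the given
    # s3://bucket/prefix location.  Credential handling via awsParams is
    # project-specific and omitted here.
    bucket, _, prefix = s3FolderLocation.replace("s3://", "").partition("/")
    response = boto3.client("s3").list_objects_v2(
        Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return response.get("KeyCount", 0) > 0

As a design note, the credential switch at the top of this example would be safer in a try/finally block, so that SwitchS3CredentialsTo always restores the original keys even if an upload step raises.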