def DownloadFile(self, s3Key, localGzipFilepath):
    '''
    Wrapper to download file
    '''
    self.logger.info(self.moduleName + " Downloading file: " + s3Key)
    try:
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, localGzipFilepath)
    except Exception as e:
        self.logger.exception("Error while downloading file: {}".format(s3Key))
        self.logger.exception("{}".format(str(e)))
        raise
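# S3Utilities is an in-house helper whose implementation is not shown here. As an illustration
# only, a minimal boto3 sketch of the single-object download such a wrapper would typically
# perform (the bucket, key and local path below are placeholders, not values from the job config):
import boto3

def download_file_sketch(bucket="example-bucket", key="path/to/file.gz",
                         local_path="/tmp/file.gz"):
    """Download a single S3 object to a local file using boto3."""
    s3 = boto3.client("s3")
    s3.download_file(bucket, key, local_path)  # streams the object straight to disk
    return local_path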
def BulkUploadToS3(self, s3subFolder):
    '''
    Uploads all GZIP files created into S3 to be uploaded later...
    '''
    self.logger.info(self.moduleName + " - Uploading GZIP files to s3 folder...")
    s3Sub = ''
    if s3subFolder is not None:
        s3Sub = '/' + s3subFolder
    S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder,
                                 "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] + s3Sub,
                                 args='''--quiet --include "*.gz"''', dbug="Y")
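# SyncFolderAWSCli shells out to the AWS CLI. A hedged sketch of an equivalent call using
# subprocess (the local folder and S3 prefix below are placeholders; note that the CLI only
# honours an --include filter once everything else is excluded with --exclude "*"):
import subprocess

def sync_gzips_sketch(local_folder="/tmp/gzips",
                      s3_uri="s3://example-bucket/gzips/subfolder"):
    """Mirror local .gz files to an S3 prefix via `aws s3 sync`."""
    subprocess.check_call([
        "aws", "s3", "sync", local_folder, s3_uri,
        "--exclude", "*", "--include", "*.gz", "--quiet",
    ])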
def DownloadFilesFromS3(self, tablesJson):
    '''
    Download all files from the s3 data folder.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "DownloadFilesFromS3" + " starting ")
        rawFolder = self.localTempDirectory + '/raw/'
        S3Utilities.CopyItemsAWSCli("s3://" + tablesJson["srcBucketName"] + tablesJson["srcS3DataFolder"],
                                    rawFolder, "--recursive --quiet")
        self.logger.debug(self.moduleName + " -- " + "DownloadFilesFromS3" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to download files from s3. Error: " + str(err))
        raise
def DownloadScriptsForRedShift(awsParams, tableSettings, localScriptsFilepath):
    '''
    Download the script files, typically table creation and upload, from the designated S3 location
    '''
    # Need the proper credentials to read from the Athena lake
    old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()
    s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(tableSettings["schemaName"],
                                                                tableSettings["table"])
    S3Utilities.S3RecursvieCopy(s3FolderLocation, localScriptsFilepath)
    awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)
    return s3FolderLocation
def BulkDownload(self):
    '''
    Download the entire bucket of EIA 860
    '''
    for path in self.job["s3SrcDirectory"]:
        try:
            sourcePath = "s3://" + self.job["bucketName"] + "/" + path
            outputPath = self.localTempDirectory + "/"
            S3Utilities.CopyItemsAWSCli(sourcePath, outputPath, "--recursive --quiet")
        except:
            self.logger.exception("Exception in PGCREIA860.BulkDownload. Location {}".format(sourcePath))
            raise
def GetListOfFilesOnS3(self):
    '''
    Get the list of files on S3 under the given bucket and source directory
    '''
    try:
        return S3Utilities.GetListOfFiles(self.awsParams.s3, self.job["bucketName"],
                                          self.job["s3SrcDirectory"][1:])
    except Exception:
        self.logger.exception("Exception in PGCRFERCFilings.GetListOfFilesOnS3")
        self.logger.exception("Exception while fetching the list of files from S3 bucket: {}, path: {}".format(
            self.job["bucketName"], self.job["s3SrcDirectory"][1:]))
        raise
def DownloadFile(self, s3Key, outputLocation):
    '''
    Worker function to download the file
    '''
    self.logger.info(" Downloading file: " + s3Key)
    try:
        s3Key = "/" + s3Key
        unzippedFile = s3Key.split("/")[-1]
        localGzipFilepath = outputLocation + unzippedFile
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, localGzipFilepath)
    except Exception:
        self.logger.exception("Error while downloading file: {}".format(s3Key))
        raise
def LoadAllFromS3(self):
    '''
    Load all CSVs from Vantage's S3 bucket into Redshift
    '''
    rsConnect = None
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])
        for dsScript in self.job["extractingScripts"]:
            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                                                 "s3Filename": s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ",
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")
        self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
    except Exception:
        self.logger.error(self.moduleName + " - Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()
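# RedshiftUtilities.LoadDataFromS3 is an internal helper. Loading a gzipped CSV from S3 into
# Redshift is ultimately done with a COPY statement; a minimal psycopg2-based sketch, assuming
# IAM-role authorization (the schema, table, S3 path and role ARN are placeholders):
import psycopg2

def copy_gzip_csv_sketch(conn_params,
                         iam_role="arn:aws:iam::123456789012:role/example-redshift-role"):
    """Issue a Redshift COPY for a gzipped CSV sitting in S3."""
    copy_sql = """
        COPY example_schema.example_table
        FROM 's3://example-bucket/data/example.CSV.GZ'
        IAM_ROLE '{role}'
        CSV GZIP DELIMITER ',' DATEFORMAT 'auto';
    """.format(role=iam_role)
    with psycopg2.connect(**conn_params) as conn:
        with conn.cursor() as cur:
            cur.execute(copy_sql)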
def LoadAllData(self):
    '''
    Process:
    1) push Attribute and data gz files to S3
    2) load data into Redshift from S3
    '''
    self.CreateFolders("N")  # this just sets the variable we will need
    self.fileUtilities = FileUtilities(self.logger)
    rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                          host=self.awsParams.redshift['Hostname'],
                                          port=self.awsParams.redshift['Port'],
                                          user=self.awsParams.redshiftCredential['Username'],
                                          password=self.awsParams.redshiftCredential['Password'])
    for table in self.job["tables"]:
        ###
        #  first create zip files for all we want to send to S3
        ###
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
        if table["type"] == "attributes":
            sourceFolder = self.gzipFolder + "attr"
            destFolder = s3folder + "/attribute"
        else:  # data types
            sourceFolder = self.gzipFolder + "data"
            destFolder = s3folder + "/data"
        S3Utilities.CopyItemsAWSCli(sourceFolder, destFolder,
                                    '''--recursive --quiet --include "*.gz"''')
        RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                         {
                                             "destinationSchema": self.job["destinationSchema"],
                                             "tableName": table["name"],
                                             "s3Filename": destFolder,
                                             "fileFormat": self.job["fileFormat"],
                                             "dateFormat": self.job["dateFormat"],
                                             "delimiter": self.job["delimiter"]
                                         },
                                         self.logger, "N")
        # S3Utilities.DeleteFileFromS3TempUsingAWSCLi(destFolder,
        #                                             '''--recursive --quiet --include "*.gz"''')
    rsConnect.close()
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # Don't have a raw Excel reader for Spark so use Pandas
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath, catalog["excelSheetName"],
                             index_col=None, na_values=['NaN'], skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished.\n\n")
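# The pandas-to-Spark hand-off above goes through in-house SparkUtilities helpers. A
# self-contained sketch of the same idea with plain pandas and PySpark (the file name, sheet
# name and output folder below are placeholders):
import pandas as pd
from pyspark.sql import SparkSession

def excel_to_parquet_sketch(excel_path="/tmp/catalog.xlsx", sheet_name="Sheet1",
                            parquet_folder="/tmp/parquet"):
    """Read an Excel sheet with pandas, convert it to a Spark DataFrame and save as Parquet."""
    pdf = pd.read_excel(excel_path, sheet_name=sheet_name, na_values=["NaN"])
    spark = SparkSession.builder.appName("excel_to_parquet_sketch").getOrCreate()
    df = spark.createDataFrame(pdf)  # schema inferred from the pandas dtypes
    df.write.mode("overwrite").parquet(parquet_folder)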
def ProcessTable(self, table):
    '''
    Process data for the table
    :param table:
    :return:
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.fileUtilities.moduleName = self.moduleName
    self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
    self.fileUtilities.CreateFolders(self.job["folders"])

    fileName = ntpath.basename(s3Key)
    local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, local7zipFilePath)

    localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
    self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
    fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
          .schema(schema)
          .load(fileToBeloaded))
    # df.show()
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")
def PushFilesToS3(self):
    '''
    push files to s3 server
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "PushFilesToS3" + " starting ")
        S3Utilities.SyncFolderAWSCli(self.localTempDirectory + '/zips',
                                     "s3://" + self.job["bucketName"] + '/' + self.job["s3SrcDirectory"] + '/zips',
                                     args='''--quiet --include "*.zip"''', dbug="Y")
        self.logger.debug(self.moduleName + " -- " + "PushFilesToS3" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in PushFilesToS3")
        raise
def DownloadFilesFromS3(self, tablesJson):
    '''
    Download files from the s3 data folder.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "DownloadFileFromS3" + " starting " + tablesJson["srcFile"])
        S3Utilities.CopyItemsAWSCli("s3://" + tablesJson["srcBucketName"] + tablesJson["srcS3DataFolder"] + tablesJson["srcFile"],
                                    self.fileUtilities.csvFolder, "--quiet")
        self.logger.debug(self.moduleName + " -- " + "DownloadFileFromS3" + " finished " + tablesJson["srcFile"])
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to download file from s3. Error: " + str(err))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process the data for the table
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Unzip the file rather than reading the gzip as Spark is faster with csv
    localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
    self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True,
                                    self.fileUtilities.csvFolder, self.logger)
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
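# GunzipFile and ReadCSVFile are in-house helpers. A hedged sketch of the same two steps using
# only the standard library and plain PySpark (paths and delimiter are placeholders):
import gzip
import shutil
from pyspark.sql import SparkSession

def gunzip_and_read_csv_sketch(gzip_path="/tmp/data.csv.gz", csv_path="/tmp/data.csv",
                               delimiter=","):
    """Decompress a gzipped CSV, then read it into a Spark DataFrame."""
    with gzip.open(gzip_path, "rb") as fin, open(csv_path, "wb") as fout:
        shutil.copyfileobj(fin, fout)  # stream-decompress to a plain CSV on disk
    spark = SparkSession.builder.appName("gunzip_and_read_csv_sketch").getOrCreate()
    return spark.read.csv(csv_path, header=True, sep=delimiter)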
def UploadFilesCreateAthenaTablesAndSqlScripts(self, table, localParquetFolderName, partitionValue=None):
    '''
    Upload Parquet files into S3
    Create Athena Table/Partition
    Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
    Create script to insert data into Redshift and save to S3 (note that the ETL may not necessarily load data into Redshift)
    '''
    if not FileUtilities.FilesExistInFolder(localParquetFolderName + "*.parquet"):
        # Nothing was created. We have a problem
        self.logger.info(self.moduleName + " - No parquet files were created for current partition in: " +
                         localParquetFolderName + ". Nothing was processed on Athena.")
        return False

    self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

    scriptPartitionValue = partitionValue
    if AthenaUtilities.IsTablePartitioned(table):
        # For partitioned tables, the script will insert a where clause by default. However, if we are doing a new load,
        # skip the where clause so that we have a SQL script that is capable of loading all the data from Athena
        # into RedShift in the future
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(table["schemaName"], table["table"])
        if not S3Utilities.KeyExist(self.awsParams, s3FolderLocation):
            # Do not update scripts if data has been previously loaded
            scriptPartitionValue = None
    AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table, self.fileUtilities.sqlFolder, scriptPartitionValue)

    AthenaUtilities.UploadFilesAndCreateAthenaTables(self.awsParams, localParquetFolderName,
                                                     table, self.fileUtilities.sqlFolder,
                                                     self.logger, partitionValue)
    return True
def LoadAirMarketsTables(self):
    '''
    Performs the final step to insert multiple files located in s3 into the final table in Redshift.
    '''
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                         {
                                             "destinationSchema": self.job["destinationSchema"],
                                             "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                                             "s3Filename": s3DataFolder,
                                             "fileFormat": self.job["fileFormat"],
                                             "dateFormat": self.job["dateFormat"],
                                             "delimiter": self.job["delimiter"]
                                         },
                                         self.logger, "N")
        self.logger.info("Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
    except Exception:
        self.logger.error("Error while trying to save into Redshift from s3 folder.")
        raise
def GetFilesFromS3(self):
    '''
    pull down files from s3
    '''
    localFilepath = None
    try:
        self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" + " starting ")
        s3Key = self.job["s3SrcDirectory"] + "/" + self.job["filetoscan"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        localFilepath = self.localTempDirectory + "/" + ntpath.basename(s3Key)
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, localFilepath)
        self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in : GetFilesFromS3")
        raise
    return localFilepath
def MoveToS3(self):
    '''
    move gzip files to s3 and clean local instance
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + " starting ")
        ###
        #  move any gzip files to the s3 server
        ###
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
        S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder, s3folder,
                                     args='''--quiet --include "*.gz"''', dbug="N")
        # Cleanup local files
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in MoveToS3")
        raise
def CleanupArea(self):
    '''
    1) clean up the local area on app server
    2) clean up files in the temp folder on S3
    '''
    for fld in self.processParams.configdata["folders"]:
        if fld["name"] == 'sql':
            self.CreateFolder(fld, "N")
        elif fld["name"] == 'gzips':
            pass
        else:
            self.CreateFolder(fld, "Y")

    user = os.environ.get("USER", "")
    if not user:
        user = os.environ.get("USERNAME", "")

    # Compose the temporary S3 location used for staging and clean it up
    bucketName = "ihs-temp"
    s3TempKey = "eaa/src/temp/" + user + "/"
    s3FullPath = "s3://" + bucketName + "/" + s3TempKey
    S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FullPath,
                                                '''--recursive --quiet --include "*.zip"''')
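# DeleteFileFromS3TempUsingAWSCLi shells out to the AWS CLI. A hedged sketch of an equivalent
# cleanup call (the temp prefix below is a placeholder; as with sync, --include only takes
# effect once everything else is excluded with --exclude "*"):
import subprocess

def cleanup_s3_temp_sketch(s3_temp_uri="s3://example-temp-bucket/src/temp/someuser/"):
    """Recursively delete the .zip files under a temporary S3 prefix."""
    subprocess.check_call([
        "aws", "s3", "rm", s3_temp_uri,
        "--recursive", "--exclude", "*", "--include", "*.zip", "--quiet",
    ])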
def UploadFilesAndCreateAthenaTables(awsParams, localParquetFilepath, tableSettings,
                                     localScriptsFilepath, logger, partitionValue):
    '''
    Upload file to designated S3 Athena passive lake location and create Athena tables
    Do this using Athena credentials
    '''
    # Need the proper credentials to write to the Athena lake
    old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()

    # For partitioned tables, the creation scripts in S3 will be built once to insert ALL the data from Athena to Redshift.
    # Incremental runs will not update the S3 scripts since they are designed to incrementally update the RedShift tables
    updateScriptsInS3 = True
    if AthenaUtilities.IsTablePartitioned(tableSettings):
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(tableSettings["schemaName"],
                                                                      tableSettings["table"])
        updateScriptsInS3 = not S3Utilities.KeyExist(awsParams, s3FolderLocation)  # Do not update scripts if data has been previously loaded

    # Save the Parquet file(s) in the designated S3 location and create the corresponding Athena tables
    s3FolderLocation = AthenaUtilities.UploadDataFilesToDesignatedS3Location(localParquetFilepath,
                                                                             tableSettings, partitionValue)
    AthenaUtilities.CreateAthenaTablesUsingAthenaCLI(tableSettings, s3FolderLocation, partitionValue, logger)

    # Save the SQL script files in the designated S3 location in case we need to delete the data from RedShift to save space.
    # The scripts in S3 will reload ALL the data to make sure the table is fully re-built
    if updateScriptsInS3:
        AthenaUtilities.UploadScriptsToDesignatedS3Location(localScriptsFilepath, tableSettings)

    logger.info("AthenaUtilities -- " + "Done uploading data to S3:" + s3FolderLocation)
    awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)
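# CreateAthenaTablesUsingAthenaCLI is an internal helper. A minimal boto3 sketch of registering
# an external Parquet table in Athena over an uploaded S3 folder (the database, table, columns,
# data location and query-result bucket below are placeholders):
import boto3

def create_athena_table_sketch(database="example_db",
                               data_location="s3://example-lake/example_schema/example_table/",
                               output_location="s3://example-athena-results/"):
    """Run a CREATE EXTERNAL TABLE statement through the Athena API."""
    ddl = """
        CREATE EXTERNAL TABLE IF NOT EXISTS {db}.example_table (
            id bigint,
            value double
        )
        STORED AS PARQUET
        LOCATION '{loc}'
    """.format(db=database, loc=data_location)
    athena = boto3.client("athena")
    athena.start_query_execution(QueryString=ddl,
                                 ResultConfiguration={"OutputLocation": output_location})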
def LoadClassRefDF(self, spark):
    '''
    Loads the class reference data
    '''
    xReferencesDF = {}
    for catalog in self.job["catalogs"]:
        if catalog["name"] == "xReferences":
            for xrefTable in catalog["tables"]:
                if self.xRefPulled is False:
                    S3Utilities.CopyItemsAWSCli("s3://" + self.job["bucketName"] +
                                                xrefTable["s3SourceFolder"] + xrefTable["sourceFileName"],
                                                self.fileUtilities.csvFolder,
                                                "--quiet")
                xReferencesDF[xrefTable["table"]] = SparkUtilities.ReadCSVFile(spark, xrefTable,
                                                                               self.job["delimiter"], False,
                                                                               self.fileUtilities.csvFolder + "/" +
                                                                               xrefTable["sourceFileName"],
                                                                               self.logger)
    self.xRefPulled = True
    return xReferencesDF