def LoadErcotTables(self):
    '''
    Performs the final step to insert multiple files located in S3 into the final table in Redshift.
    '''
    rsConnect = None  # initialized up front so the finally block can always reference it
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"] + "DAM",
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")
        self.logger.info("Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error("Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()
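# Illustrative sketch only: RedshiftUtilities.LoadDataFromS3 is assumed to turn the
# settings dictionary above into a Redshift COPY statement roughly like the one
# composed below. build_copy_command is a hypothetical helper, not part of the
# project, and credential handling is simplified for the sketch.
def build_copy_command(settings, access_key, secret_key):
    '''Compose a Redshift COPY statement from the load settings (sketch).'''
    return (
        "COPY {schema}.{table} FROM '{s3}' "
        "CREDENTIALS 'aws_access_key_id={ak};aws_secret_access_key={sk}' "
        "DATEFORMAT '{datefmt}' DELIMITER '{delim}' {fmt}"
    ).format(schema=settings["destinationSchema"],
             table=settings["tableName"],
             s3=settings["s3Filename"],
             ak=access_key,
             sk=secret_key,
             datefmt=settings["dateFormat"],
             delim=settings["delimiter"],
             fmt="GZIP" if settings["fileFormat"].upper() == "GZIP" else "")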
def LoadEIANetGen(self, tableSuffix):
    '''
    Performs the final step to insert the converted .csv.gz file located in S3 into the final table in Redshift.
    '''
    rsConnect = None
    try:
        file_name = self.job["ExcelSheets"][0]["Name"].split('.')[0] + '.csv.gz'
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"] + file_name
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"] + tableSuffix,
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")
        self.logger.info("Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error("Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        # Always release the Redshift connection, even if the load fails
        if rsConnect is not None:
            rsConnect.close()
def LoadAllFromS3(self):
    '''
    Loads the files staged in S3 into the final table in Redshift.
    '''
    rsConnect = None
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"],
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")
        self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error(self.moduleName +
                          " - Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        # Always release the Redshift connection, even if the load fails
        if rsConnect is not None:
            rsConnect.close()
def DeleteFilesFromAWS(self, table):
    '''
    Deletes the staged files for the given table from S3.
    '''
    s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"][table]
    S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                "--recursive --quiet")
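# For reference, S3Utilities.DeleteFileFromS3TempUsingAWSCLi is assumed to shell out
# to "aws s3 rm" with the given arguments; the wrapper below is a hypothetical,
# minimal equivalent shown only to make the behaviour relied on here explicit.
import subprocess

def delete_s3_prefix(s3_path, extra_args="--recursive --quiet"):
    '''Remove every object under the given S3 prefix via the AWS CLI (sketch).'''
    subprocess.check_call("aws s3 rm " + s3_path + " " + extra_args, shell=True)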
def LoadAllFromS3(self):
    '''
    Load all CSVs from the Vantage S3 bucket into Redshift.
    '''
    rsConnect = None
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])
        # Load one gzip-compressed CSV per extracting script into its own suffixed table
        for dsScript in self.job["extractingScripts"]:
            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                    "s3Filename": s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ",
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")
        self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error(self.moduleName +
                          " - Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()
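# Illustrative only: with a hypothetical job configuration such as the one below,
# the loop above loads one gzip-compressed CSV per extracting script into its own
# suffixed Redshift table. All values here are assumptions, not the real config.
example_job = {
    "bucketName": "vantage-bucket",
    "s3ToDirectory": "/vantage/csvs",
    "tableName": "vantage_",
    "extractingScripts": [{"tableSuffix": "Projects"}, {"tableSuffix": "Assets"}],
}
example_folder = "s3://" + example_job["bucketName"] + example_job["s3ToDirectory"]
for script in example_job["extractingScripts"]:
    print(example_job["tableName"] + script["tableSuffix"], "<-",
          example_folder + "/" + script["tableSuffix"] + ".CSV.GZ")
# vantage_Projects <- s3://vantage-bucket/vantage/csvs/Projects.CSV.GZ
# vantage_Assets   <- s3://vantage-bucket/vantage/csvs/Assets.CSV.GZ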
def UploadDataFilesToDesignatedS3Location(localParquetFilepath, tableSettings, partitionValue):
    '''
    Upload the data files, typically Parquet files, to the designated S3 location.
    '''
    s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
        tableSettings["schemaName"], tableSettings["table"])

    partitionKeyValueFolder = ""
    if AthenaUtilities.IsTablePartitioned(tableSettings):
        partitionKey = AthenaUtilities.GetPartitionKey(tableSettings)
        if not partitionKey:
            raise ValueError('Partition key cannot be null for partitioned tables.')
        if not partitionValue:
            raise ValueError('Partition value cannot be null for partitioned tables.')
        partitionKeyValueFolder = partitionKey + "=" + partitionValue + "/"

    s3FolderLocationData = s3FolderLocation + partitionKeyValueFolder

    # Only delete the specific partition. There is no simple option to delete the whole
    # S3 folder, and it would be easy to make a mistake and delete the entire data set
    # in the passive lake. Do the FULL deletion MANUALLY.
    if tableSettings["new"] == "Y" or ("clearPartition" in tableSettings
                                       and tableSettings["clearPartition"] == "Y"):
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FolderLocationData, "--recursive")

    # Only copy the *.parquet files
    S3Utilities.S3RecursvieCopy(localParquetFilepath, s3FolderLocationData,
                                "--exclude \"*\" --include \"*.parquet\" ")

    #=======================================================================
    # For testing purposes - copy the files to a holding directory
    # import glob
    # import shutil
    # dst = "/s3/" + tableSettings["table"]
    # src = localParquetFilepath + "*.parquet"
    # for fileName in glob.glob(src):
    #     print(fileName)
    #     shutil.move(fileName, dst)
    #=======================================================================
    return s3FolderLocation
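# Illustration, with assumed values, of the Hive-style partition folder composed above.
# AthenaUtilities.ComposeAthenaS3DataFileKey is assumed to return a prefix ending in "/";
# the appended "<key>=<value>/" suffix is what lets Athena prune data by partition folder.
example_prefix = "s3://athena-lake/myschema/mytable/"  # assumed return value
example_partition = "reportdate=2018-06-30/"           # partitionKey + "=" + partitionValue + "/"
print(example_prefix + example_partition)
# s3://athena-lake/myschema/mytable/reportdate=2018-06-30/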
def UploadScriptsToDesignatedS3Location(localScriptsFilepath, tableSettings):
    '''
    Upload the script files, typically table creation and upload, to the designated S3 location.
    '''
    s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(
        tableSettings["schemaName"], tableSettings["table"])
    S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FolderLocation, "--recursive")

    # Upload only scripts that we plan to keep for later reuse
    scriptToCreateRedshift = FileUtilities.ComposeCreateTableSqlFilename(
        tableSettings, localScriptsFilepath)
    scriptToInsertIntoRedshift = AthenaUtilities.ComposeInsertIntoSqlFilename(
        tableSettings, localScriptsFilepath)

    S3Utilities.S3Copy(scriptToCreateRedshift, s3FolderLocation)
    S3Utilities.S3Copy(scriptToInsertIntoRedshift, s3FolderLocation)
    return s3FolderLocation
def CleanupArea(self):
    '''
    1) Clean up the local area on the app server.
    2) Clean up files in the temp folder on S3.
    '''
    for fld in self.processParams.configdata["folders"]:
        if fld["name"] == 'sql':
            self.CreateFolder(fld, "N")
        elif fld["name"] == 'gzips':
            pass
        else:
            self.CreateFolder(fld, "Y")

    user = os.environ.get("USER", "")
    if not user:
        user = os.environ.get("USERNAME", "")

    # Compose the temporary S3 location used for this user's staged files
    bucketName = "ihs-temp"
    s3TempKey = "eaa/src/temp/" + user + "/"
    s3FullPath = "s3://" + bucketName + "/" + s3TempKey

    S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
        s3FullPath, '''--recursive --quiet --include "*.zip"''')
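# Note on the filter arguments above: with "aws s3 rm", every object is included by
# default, so a lone --include "*.zip" does not narrow the deletion and the whole
# temp prefix is removed. If the intent is to delete only the zip archives, the CLI
# needs an exclude-then-include pair; a hedged sketch (user folder is illustrative):
#
#   aws s3 rm s3://ihs-temp/eaa/src/temp/jdoe/ \
#       --recursive --quiet --exclude "*" --include "*.zip"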