def LoadFileIntoRedshift(rsConnect, s3, logger, fileUtilities, localFilepath, destinationSchema,
                         redshiftDestTable, fileFormat, dateFormat, delimiter, isManifest='N'):
    '''
    Load file from local drive to RedShift
    Zip the file, upload to S3 and then load into RedShift
    '''
    if isManifest == 'Y':
        zipLocalFilepath = localFilepath
    else:
        # Zip the file
        zipLocalFilepath = localFilepath + ".gz"
        fileUtilities.GzipFile(localFilepath, zipLocalFilepath)

    bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(s3, zipLocalFilepath)

    # Build the job definition file
    job = {}
    job["destinationSchema"] = destinationSchema
    job["tableName"] = redshiftDestTable
    job["s3Filename"] = S3Utilities.GetS3FileName(bucketName, s3TempKey)
    job["fileFormat"] = fileFormat
    job["dateFormat"] = dateFormat
    job["delimiter"] = delimiter

    RedshiftUtilities.LoadDataFromS3(rsConnect, s3, job, logger, isManifest)
    S3Utilities.DeleteFile(s3, bucketName, s3TempKey)
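# Hedged usage sketch for LoadFileIntoRedshift. The connection parameters, the
# awsParams/fileUtilities attributes and the job settings ('sample_data_staging',
# 'CSV', 'auto', '|') are assumptions drawn from the calling conventions elsewhere
# in this module, not a call taken from the original code.
def LoadSampleCsvIntoRedshift(self):
    '''illustrative only: load a local CSV into a Redshift staging table'''
    rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                          host=self.awsParams.redshift['Hostname'],
                                          port=self.awsParams.redshift['Port'],
                                          user=self.awsParams.redshiftCredential['Username'],
                                          password=self.awsParams.redshiftCredential['Password'])
    try:
        LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                             self.localTempDirectory + '/SampleData.csv',
                             self.job["destinationSchema"], 'sample_data_staging',
                             'CSV', 'auto', '|')
    finally:
        rsConnect.close()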
def Start(self, logger, moduleName, filelocs): ''' main routine ''' try: ApplicationBase.Start(self, logger, moduleName, filelocs) self.logger.info(self.moduleName + " - Processing: ") outputCSVfileName = self.localTempDirectory + '/PheonixDocuments.csv' self.logger.info(self.moduleName + " - Pull documents from Phoenix: ") jsonDocuments = self.PullDataFromPhoenix() self.logger.info(self.moduleName + " - save contents to CSV file from Phoenix: ") self.ExportToCSV(outputCSVfileName, jsonDocuments) self.logger.info(self.moduleName + " - push documents csv file to S3: ") bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(self.awsParams.s3, outputCSVfileName) self.logger.info(self.moduleName + " - Create document table: ") psConnect = self.GetPSConnection() self.CreatePostgresTables(psConnect) self.logger.info(self.moduleName + " - pull document s3 to database server temp: ") postgresTempFile = self.DownloadFromS3ToPSTempDir(psConnect, bucketName, s3TempKey) self.logger.info(self.moduleName + " - load documents csv file: ") self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile) self.logger.info(self.moduleName + " - clean up temp file: ") S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey) except: logger.exception(moduleName + " - Exception in start!") raise
def Start(self, logger, moduleName, filelocs):
    '''
    main routine
    '''
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)

        s3Key = self.job["s3SrcDirectory"] + "/" + self.job["fileToLoad"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        localFilepath = self.localTempDirectory + "/" + ntpath.basename(s3Key)
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, localFilepath)

        df = pd.read_excel(localFilepath, "Major Variables", index_col=None,
                           na_values=['NaN'], skiprows=1, parse_cols="C:E,G:I",
                           header=None)

        # Save the data as CSV
        outputCSVfileName = self.localTempDirectory + '/SampleData.csv'
        df.to_csv(outputCSVfileName, sep=str(self.job["delimiter"]),
                  encoding='utf-8', index=False)

        # Upload the CSV file to a temporary S3 location. Postgres will download it
        # from there to its local directory
        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(self.awsParams.s3, outputCSVfileName)

        psConnect = self.GetPSConnection()
        # Postgres tables are created using a connection (rather than psql)
        self.CreatePostgresTables(psConnect)

        postgresTempFile = self.DownloadFromS3ToPSTempDir(psConnect, bucketName, s3TempKey)
        self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile)
        S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

        self.LoadBaseAttributes(psConnect)
        self.LoadBaseData(psConnect, '1000', 'glm_value')
        self.LoadBaseData(psConnect, '2000', 'arima_value')
        self.LoadBaseData(psConnect, '3000', 'lasso_value')
        # self.LoadBaseData(psConnect, '4000', 'nn_value')
        # self.LoadBaseData(psConnect, '5000', 'spectre_value')
        psConnect.close()
        self.logger.debug(" SampleData CSV loaded to PostgreSQL")
    except:
        logger.exception(moduleName + " - Exception in start!")
        raise
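# Note: parse_cols was removed from pandas.read_excel in later pandas releases in
# favour of usecols. A hedged equivalent of the read above for a current pandas
# version would look like this (sheet name, skiprows and column ranges unchanged):
#
#     df = pd.read_excel(localFilepath, sheet_name="Major Variables", index_col=None,
#                        na_values=['NaN'], skiprows=1, usecols="C:E,G:I", header=None)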
def testDeleteFile(self):
    testFile = self.createTestingFile("testDeleteFile.txt",
                                      "Testing DeleteFile from S3Utilities...")
    testFileReturned = testFile.replace(".txt", "_returned.txt")

    bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(self.awsParams.s3, testFile)
    S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

    try:
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName, s3TempKey, testFileReturned)
        self.assertFalse(os.path.isfile(testFileReturned),
                         "File was not deleted from the cloud.")
    except Exception as err:
        if err.status != 404:
            self.fail("Error registered while trying to delete a file from the cloud. Error:" +
                      err.message)
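# Hedged sketch of the createTestingFile helper used by the test above. The real
# helper is defined elsewhere in the test class; this version assumes it writes the
# given content into a local temp directory (localTempDirectory is an assumption)
# and returns the full path.
def createTestingFile(self, fileName, content):
    '''illustrative only: create a small local file for S3 round-trip tests'''
    testFilePath = os.path.join(self.localTempDirectory, fileName)
    with open(testFilePath, 'w') as testFile:
        testFile.write(content)
    return testFilePath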
def ProcessLiquidBalanceFile(self):
    '''
    Extract the configured worksheets, upload them to S3 and load them into Redshift
    '''
    try:
        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])

        for sheetConfig in self.job["sheetsToExtract"]:
            self.ExtractSheet(sheetConfig)
            s3key = self.job["s3SrcDirectory"] + "/" + sheetConfig["outputName"] + \
                "." + self.job["sheetsOutputFormat"] + ".gz"

            self.logger.info(self.moduleName +
                             " Uploading information to redshift for worksheet: " +
                             sheetConfig["name"])

            # Build the job definition for the Redshift load
            job = {}
            job["destinationSchema"] = self.job["destinationSchema"]
            job["tableName"] = sheetConfig["tempTableName"]
            job["s3Filename"] = S3Utilities.GetS3FileName(self.job["bucketName"], s3key)
            job["fileFormat"] = self.job["fileFormat"]
            job["dateFormat"] = self.job["dateFormat"]
            job["delimiter"] = sheetConfig["delimiter"]

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3, job, self.logger)
            S3Utilities.DeleteFile(self.awsParams.s3, self.job["bucketName"], s3key)
    except:
        self.logger.exception(self.moduleName +
                              " [ProcessLiquidBalanceFile] - Error in LiquidsBalance during processBlock")
        raise
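# The per-sheet job dictionary above mirrors the one built in LoadFileIntoRedshift.
# A hedged refactor could share that logic through a small helper; the name
# BuildRedshiftLoadJob and its placement are assumptions, not part of the original code.
def BuildRedshiftLoadJob(destinationSchema, tableName, s3Filename, fileFormat, dateFormat, delimiter):
    '''illustrative only: assemble the job definition expected by RedshiftUtilities.LoadDataFromS3'''
    return {
        "destinationSchema": destinationSchema,
        "tableName": tableName,
        "s3Filename": s3Filename,
        "fileFormat": fileFormat,
        "dateFormat": dateFormat,
        "delimiter": delimiter
    }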