def LoadCategory(self, srcCategory):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
        processingFile = self.DownloadFile(srcCategory)
        fileOut = processingFile.replace(".dbf", ".txt")

        dbfUtils = DBFUtilities(self.logger)
        dbfUtils.ConvertToCSV(processingFile, fileOut, srcCategory["delimiter"], False)

        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        self.logger.info(self.moduleName + " - Loading file " + fileOut + "...")
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, fileOut,
                                               self.job["destinationSchema"],
                                               self.job["tableName"] + "_" + srcCategory["srcCategory"],
                                               srcCategory["fileFormat"], srcCategory["dateFormat"],
                                               srcCategory["delimiter"])
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load category...")
        raise Exception(err.message)
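# Illustrative only: a minimal sketch of the srcCategory entry and the job keys this
# DBF variant of LoadCategory reads. The key names come from the code above; the
# variable names and example values below are assumptions, not taken from the jobConfig.
exampleSrcCategory = {
    "srcCategory": "stations",     # appended to job["tableName"] as the destination table suffix
    "delimiter": "|",              # used for both the DBF-to-text conversion and the Redshift load
    "fileFormat": "CSV",
    "dateFormat": "YYYY-MM-DD"
}
exampleJob = {
    "destinationSchema": "my_schema",
    "tableName": "stage_dbf"
}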
def LoadCSVFile(self, localFilePath, loadName):
    '''
    For each file we need to process, provide the data loader the s3 key and destination table name
    '''
    self.logger.info("Loading data into Redshift")
    rsConnect = None  # bind the name up front so the finally block is safe if Connect() fails
    try:
        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, localFilePath,
                                               self.job["destinationSchema"],
                                               self.job["tableName"] + loadName["redshiftTableSuffix"],
                                               self.job["fileFormat"], self.job["dateFormat"],
                                               self.job["delimiter"])
    except Exception:
        self.logger.exception("Exception in PGCREIA860.LoadCSVFile")
        self.logger.exception("Error while uploading to table:{}, filePath:{}".format(
            self.job["tableName"] + loadName["redshiftTableSuffix"], localFilePath))
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()
        self.logger.info(self.moduleName + " - Finished Processing S3 file: " +
                         loadName["redshiftTableSuffix"])
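# Illustrative only: LoadCSVFile expects a loadName entry carrying the table suffix and
# otherwise reuses the shared job-level load settings; the value below is an assumption.
exampleLoadName = {"redshiftTableSuffix": "_plant"}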
def LoadCategory(self, srcCategory):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
        processingFile = self.DownloadFile(srcCategory)
        processingCSV = self.GetCSVFile(processingFile, srcCategory)

        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        self.logger.debug(self.moduleName + " - Loading file " + processingCSV + "...")
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, processingCSV,
                                               self.job["destinationSchema"],
                                               self.job["tableName"] + "_" + srcCategory["srcCategory"],
                                               srcCategory["fileFormat"], srcCategory["dateFormat"],
                                               srcCategory["delimiter"])
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load category...")
        raise Exception(err.message)
def LoadIntoRedshift(self, job):
    '''
    Does the actual loading of data into Redshift
    '''
    self.logger.info("Loading {} into Redshift".format(job["Name"]))
    try:
        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities,
                                               self.localTempDirectory + "/cleaned/" + job["Name"] + ".CSV",
                                               self.job["destinationSchema"],
                                               self.job["tableName"].lower().replace("f1_", "") + job["Name"],
                                               self.job["fileFormat"], self.job["dateFormat"],
                                               self.job["outputDelimiter"])
        rsConnect.close()
    except Exception as e:
        self.logger.exception("Error in FoxPro.LoadIntoRedshift() while loading data into Redshift")
        self.logger.exception("{}".format(str(e)))
def ProcessCategory(self, rsConnect, srcCategory):
    '''
    Executes the processing for a single category configured...
    '''
    url = srcCategory["url"]
    self.logger.info(self.moduleName + " - Processing url: " + url)

    localFilepath = self.localTempDirectory + "/" + ntpath.basename(srcCategory["url"])
    fileDownload = urllib.URLopener()
    fileDownload.retrieve(url, localFilepath)

    self.fileUtilities.UnzipFile(localFilepath, self.localTempDirectory)
    localFilepath = self.localTempDirectory + "/" + srcCategory["unzipFilename"]

    redshiftDestTable = self.job["tableName"] + srcCategory["redshiftTableSuffixOrigin"]
    RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                           self.fileUtilities, localFilepath,
                                           self.job["destinationSchema"], redshiftDestTable,
                                           self.job["fileFormat"], srcCategory["dateFormat"],
                                           self.job["delimiter"])
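# Illustrative only: a sketch of the srcCategory entry ProcessCategory expects, inferred
# from the keys it reads; the URL, file name, and other values are assumptions.
exampleSrcCategory = {
    "url": "http://example.com/data/archive.zip",   # source archive to download
    "unzipFilename": "archive_contents.txt",        # file produced by UnzipFile in the temp directory
    "redshiftTableSuffixOrigin": "_origin",         # appended to job["tableName"]
    "dateFormat": "YYYY-MM-DD"
}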
def AsyncLoadFilesToRedShift(self, proc):
    '''
    load files into RedShift
    '''
    try:
        for pFile in proc["processfile"]:
            pFileNoPath = pFile.replace('/', '_')
            self.logger.debug(self.moduleName + " -- " + "AsyncLoadFilesToRedShift for " +
                              pFile + " starting ")
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
            outputfileName = self.localTempDirectory + '/scrubbed/' + 'scrub_' + pFileNoPath
            rsTable = 'working_' + proc["processname"]
            RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                                   self.fileUtilities, outputfileName,
                                                   self.job["destinationSchema"], rsTable,
                                                   self.job["fileFormat"], self.job["dateFormat"],
                                                   self.job["delimiter"])
            rsConnect.close()
            self.logger.debug(self.moduleName + " -- " + "AsyncLoadFilesToRedShift for " +
                              pFile + " finished ")
    except:
        self.logger.exception(self.moduleName + " - Error in AsyncLoadFilesToRedShift")
        raise
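# Illustrative only: a sketch of the proc dictionary consumed by AsyncLoadFilesToRedShift,
# inferred from the keys it reads; the process name and file paths are assumptions.
exampleProc = {
    "processname": "prices",                 # yields the working_prices staging table
    "processfile": ["2017/01/file1.txt",     # each entry maps to
                    "2017/02/file2.txt"]     # <temp>/scrubbed/scrub_<path with / replaced by _>
}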
def ProcessS3File(self, srcFileParameter):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File" + " starting ")
    s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.localTempDirectory + "/raw/" + fileName
    #----------------------------------------------------------------------
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localGzipFilepath)

    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    self.fileUtilities.DeleteFile(localGzipFilepath)

    fileNameNoExt = fileName.split('.', 1)[0]
    outPutFileName = self.fileUtilities.csvFolder + fileNameNoExt + '.csv'

    xl = ExcelUtilities(self.logger)
    xl.Excel2CSV(localExcelFilepath,
                 srcFileParameter["excelSheetName"],
                 outPutFileName,
                 self.fileUtilities.csvFolder,
                 skiprows=srcFileParameter["skipRows"])

    self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/raw/")
    for tables in srcFileParameter["tables"]:
        fname = self.fileUtilities.CreateTableSql(tables, self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(fname, self.logger)
        # -----------------------------------------------------------------------------
        self.logger.info(self.moduleName + " - Loading data into Redshift...")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, outPutFileName,
                                               tables["schemaName"], tables["table"],
                                               self.job["fileFormat"], self.job["dateFormat"],
                                               self.job["delimiter"])
        # Cleanup
        rsConnect.close()

    self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished ")
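# Illustrative only: a sketch of the srcFileParameter entry ProcessS3File expects,
# inferred from the keys it reads; file, sheet, schema, and table names are assumptions,
# and each "tables" entry would also carry whatever column metadata CreateTableSql needs.
exampleSrcFileParameter = {
    "s3Filename": "report_2017.xlsx.gz",     # gzipped Excel file under job["s3SrcDirectory"]
    "excelSheetName": "Sheet1",
    "skipRows": 2,
    "tables": [
        {"schemaName": "my_schema", "table": "my_table"}   # one CREATE TABLE + load per entry
    ]
}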
def LoadSeriesIntoRedshift(self, fileNameNoExt, rsConnect, suffix):
    '''
    Load series data into redshift
    '''
    localFilename = self.ComposeFileName(fileNameNoExt, suffix)
    fullRedshiftDestTable = self.job["tableName"] + fileNameNoExt + "_" + suffix
    RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                           self.fileUtilities, localFilename,
                                           self.job["destinationSchema"], fullRedshiftDestTable,
                                           self.job["fileFormat"], self.job["dateFormat"],
                                           self.job["delimiter"])
def Start(self, logger, moduleName, filelocs):
    '''
    main routine
    '''
    currProcId = None
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        self.logger.debug(self.moduleName + " -- " + " starting ")
        currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)
        ###
        #  establish connection to Access database
        ###
        conn = self.EstablishConnection()
        cur = conn.cursor()
        sqlline = self.FixSQLStatement()
        cur.execute(sqlline)

        outputfileName = self.localTempDirectory + '/ENPdata.csv'
        self.ConvertToCSV(cur, outputfileName)
        ###
        #  load the CSV to RedShift
        ###
        self.logger.debug(self.moduleName + " - ENP load CSV to RedShift")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, outputfileName,
                                               self.job["destinationSchema"], self.job["tableName"],
                                               self.job["fileFormat"], self.job["dateFormat"],
                                               self.job["delimiter"])
        self.logger.debug(self.moduleName + " - ENP CSV loaded to RedShift")

        # Cleanup
        rsConnect.close()
        cur.close()
        conn.close()
        if self.job["cleanlocal"] == "Y":
            self.fileUtilities.RemoveFolder(self.localTempDirectory)

        self.logger.debug(self.moduleName + " -- " + " finished ")
    except Exception as err:
        self.logger.exception(moduleName + " - Exception! Error: " + err.message)
        if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
            self.logger.info(self.moduleName + " - we could not Complete Instance.")
        raise Exception(err.message)
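# Illustrative only: a sketch of the job settings this Start routine relies on, inferred
# from the keys it reads; the schema, table, and format values are assumptions.
exampleJob = {
    "destinationSchema": "my_schema",
    "tableName": "enp_data",
    "fileFormat": "CSV",
    "dateFormat": "YYYY-MM-DD",
    "delimiter": ",",
    "cleanlocal": "Y"      # "Y" removes the local temp folder after the load completes
}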
def LoadData(self, iso, localFilePath, fp):
    '''
    Method to load ISO data into Redshift
    '''
    self.logger.info("Loading ISO data into Redshift")
    rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                          host=self.awsParams.redshift['Hostname'],
                                          port=self.awsParams.redshift['Port'],
                                          user=self.awsParams.redshiftCredential['Username'],
                                          password=self.awsParams.redshiftCredential['Password'])
    RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                           self.fileUtilities, localFilePath + fp,
                                           self.job["destinationSchema"],
                                           self.job["tableName"] + iso["Name"],
                                           self.job["fileFormat"], self.job["dateFormat"],
                                           self.job["delimiter"])
def ProcessSubJob(self, subJob):
    self.logger.info("Start the bcpUtilities.RunBCPJob for table " + subJob.get("destination"))
    #TODO: set time capture to log file
    self.bcpUtilities.RunBCPJob(self.job["bcpParameters"]["sqlServerloginInfo"],
                                self.job["bcpParameters"]["bcpUtilityDirOnLinux"],
                                self.fileUtilities.LoadSQLQuery(
                                    self.fileUtilities.GetApplicationDirectory("Hindsight") +
                                    subJob.get("inputQuery")),
                                self.localTempDirectory + "/" + subJob.get("destination"),
                                subJob.get("fieldTerminator"),
                                subJob.get("rowTerminator"))

    self.logger.info("Start of the cleaning process for table " + subJob.get("destination"))
    if (subJob.get("charsToBeReplaced") is not None) and (len(subJob.get("charsToBeReplaced")) != 0):
        self.fileUtilities.ReplaceIterativelyInFile(
            self.bcpUtilities.GetFullFilePath(subJob.get("destination")),
            self.bcpUtilities.GetFullFilePath(
                self.bcpUtilities.GetFileToBeUploaded(subJob.get("destination"),
                                                      subJob.get("charsToBeReplaced"))),
            subJob.get("charsToBeReplaced"))

    self.logger.info("Start of the uploading process to Redshift for table " + subJob.get("destination"))
    rsConnect = psycopg2.connect(dbname=self.awsParams.redshift['Database'],
                                 host=self.awsParams.redshift['Hostname'],
                                 port=self.awsParams.redshift['Port'],
                                 user=self.awsParams.redshiftCredential['Username'],
                                 password=self.awsParams.redshiftCredential['Password'])
    RedshiftUtilities.LoadFileIntoRedshift(
        rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
        self.bcpUtilities.GetFullFilePath(
            self.bcpUtilities.GetFileToBeUploaded(subJob.get("destination"),
                                                  subJob.get("charsToBeReplaced"))),
        subJob["destinationSchema"], subJob.get("destination"),
        self.job["bcpParameters"]["fileFormat"], self.job["bcpParameters"]["dateFormat"],
        self.job["bcpParameters"]["delimiter"])
    rsConnect.close()
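# Illustrative only: a sketch of a subJob entry and the bcpParameters block ProcessSubJob
# reads; every name and value shown here is an assumption made for illustration.
exampleSubJob = {
    "destination": "my_table",            # also used as the local BCP output file name
    "destinationSchema": "my_schema",
    "inputQuery": "sql/my_table.sql",     # resolved under the Hindsight application directory
    "fieldTerminator": "|",
    "rowTerminator": "\\n",
    "charsToBeReplaced": ["|", "\\r"]     # optional; triggers ReplaceIterativelyInFile
}
exampleBcpParameters = {
    "sqlServerloginInfo": "...",          # connection details passed straight to RunBCPJob
    "bcpUtilityDirOnLinux": "/opt/mssql-tools/bin",
    "fileFormat": "CSV",
    "dateFormat": "YYYY-MM-DD",
    "delimiter": "|"
}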
def LoadXLSToRedshift(self, fileInStage, reportConfig):
    '''
    Load a rigpoint data excel file into redshift
    '''
    rsConnect = None
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadXLSToRedshift" + fileInStage + " starting ")
        fileNameCSV = os.path.splitext(fileInStage)[0] + ".csv"
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

        dataFrame = pandas.read_excel(fileInStage,
                                      sheetname=reportConfig["excelSheetName"],
                                      index_col=None,
                                      na_values=['NaN'],
                                      skiprows=reportConfig["skipRows"],
                                      skip_footer=reportConfig["skipFooter"])
        dataFrame.to_csv(fileNameCSV,
                         header=False,
                         sep=str(reportConfig["delimiter"]),
                         encoding='utf-8',
                         index=False)

        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, fileNameCSV,
                                               self.job["destinationSchema"],
                                               self.job["tableName"] + "_" + reportConfig["name"],
                                               reportConfig["fileFormat"], reportConfig["dateFormat"],
                                               reportConfig["delimiter"])
        self.logger.debug(self.moduleName + " -- " + "LoadXLSToRedshift" + fileInStage + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName +
                          " Error while trying to load file into Redshift: " + err.message)
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()
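# Illustrative only: a sketch of the reportConfig entry LoadXLSToRedshift expects,
# inferred from the keys it reads; the report name and values are assumptions.
exampleReportConfig = {
    "name": "rigcounts",          # appended to job["tableName"] with an underscore
    "excelSheetName": "Sheet1",
    "skipRows": 3,
    "skipFooter": 1,
    "delimiter": ",",
    "fileFormat": "CSV",
    "dateFormat": "YYYY-MM-DD"
}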
def LoadData(self, outPutFileName):
    '''
    load data into Redshift
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadData for " + outPutFileName + " starting ")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3, self.logger,
                                               self.fileUtilities, outPutFileName,
                                               self.job["destinationSchema"], self.job["tableName"],
                                               self.job["fileFormat"], self.job["dateFormat"],
                                               self.job["delimiter"])
        rsConnect.close()
        self.logger.debug(self.moduleName + " -- " + "LoadData for " + outPutFileName + " finished ")
    except:
        self.logger.exception(self.moduleName + " - Error in LoadData")
        raise