def Start(self, logger, moduleName, filelocs):
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        # delete and recreate the QC folder
        self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/QC/")

        hqc = HindsightQC(self.logger, self.fileUtilities,
                          self.bcpUtilities, self.localTempDirectory)
        hqc.Get_sql_server_rowcounts("pre")  # Get pre-ETL rowcounts

        # Execute the pre-ETL queries
        for sqlFile in self.job["bcpParameters"].get("preETLQueries"):
            RedshiftUtilities.PSqlExecute(
                self.fileUtilities.GetApplicationDirectory("Hindsight") + sqlFile,
                logger)

        for subJob in self.job["bcpParameters"]["subJobs"]:
            if subJob.get("destinationSchema") is None:
                subJob["destinationSchema"] = self.job["bcpParameters"]["destinationSchema"]
            self.ProcessSubJob(subJob)

        # Get SQL Server rowcounts
        hqc.Get_sql_server_rowcounts("post")

        # Execute the post-ETL queries to prepare the data prior to loading
        # it into the production tables
        for sqlFile in self.job["bcpParameters"].get("postETLQueries"):
            RedshiftUtilities.PSqlExecute(
                self.fileUtilities.GetApplicationDirectory("Hindsight") + sqlFile,
                logger)

        # Get Redshift rowcounts
        hqc.Get_redshift_rowcounts("post")

        # Execute the post-ETL QC queries. The QC applies two checks:
        #   Check 1: inter-version counts - are the differences beyond a particular threshold?
        #   Check 2: pre-ETL SQL Server vs. post-ETL Redshift - are the differences beyond a particular threshold?
        # If the ETL fails QC, do not update/insert the prod tables.
        # If it passes, insert into the production tables (data, attributes, history).
        status = hqc.ValidateETL()
        if status:
            self.logger.info("ETL good to go")
            for sqlFile in self.job["bcpParameters"].get("FinalLoadQueries"):
                # ===========================================================
                # add a process to backup data/attributes history tables
                # Download to S3
                # ===========================================================
                RedshiftUtilities.PSqlExecute(
                    self.fileUtilities.GetApplicationDirectory("Hindsight") + sqlFile,
                    logger)
        else:
            self.logger.warning("Bad ETL. No go!")

        print(hqc.TimeElaspsed())
    except:
        logger.exception(moduleName + " - Exception!")
        raise

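# Illustrative sketch only (an assumption, not taken from any real job file): the
# minimal "bcpParameters" fragment that Start() above reads. The key names come from
# the code; every value below is a hypothetical placeholder.
_EXAMPLE_BCP_PARAMETERS = {
    "destinationSchema": "some_schema",
    "preETLQueries": ["pre_etl.sql"],
    "subJobs": [{}],  # a sub job inherits destinationSchema when it does not set one
    "postETLQueries": ["post_etl.sql"],
    "FinalLoadQueries": ["final_load.sql"]
}
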
def LoadEIANetGen(self, tableSuffix):
    '''
    Performs the final step to insert multiple files located in s3 into the final table in Redshift.
    '''
    rsConnect = None
    try:
        file_name = self.job["ExcelSheets"][0]["Name"].split('.')[0] + '.csv.gz'
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"] + file_name

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"] + tableSuffix,
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")

        self.logger.info("Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error("Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        # close the connection even when the load fails
        if rsConnect is not None:
            rsConnect.close()

def LoadErcotTables(self):
    '''
    Performs the final step to insert multiple files located in s3 into the final table in Redshift.
    '''
    rsConnect = None
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"] + "DAM",
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")

        self.logger.info("Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error("Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        # rsConnect is initialized to None above so this check is safe even if Connect() fails
        if rsConnect is not None:
            rsConnect.close()

def ProcessCatalogs(self, dbCommon, catalog):
    '''
    pulls data from each table in the catalog
    '''
    try:
        self.logger.debug(self.moduleName + " -- ProcessCatalogs for " +
                          catalog["name"] + " starting")
        for tables in catalog["tables"]:
            fname = self.fileUtilities.CreateTableSql(tables,
                                                      self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)

        for tables in catalog["tables"]:
            if tables["type"] == 'working':
                self.ProcessTables(dbCommon, tables)

        ###
        #  now go back and update any tables that have special characters in them
        ###
        for tables in catalog["tables"]:
            if "specialCharacterScript" in tables:
                self.ResetSpecialChars(dbCommon, tables)

        for tables in catalog["tables"]:
            if "updateSection" in tables:
                updScriptName = self.CreateUpdateScript(dbCommon, tables)
                RedshiftUtilities.PSqlExecute(updScriptName, self.logger)

        self.logger.debug(self.moduleName + " -- ProcessCatalogs for " +
                          catalog["name"] + " finished")
    except:
        self.logger.exception(self.moduleName +
                              " - we had an error in ProcessCatalogs for " +
                              catalog["name"])
        raise

def LoadIntoRedshift(self, job):
    '''
    Does the actual loading of data into Redshift
    '''
    self.logger.info("Loading {} into Redshift".format(job["Name"]))
    try:
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            self.localTempDirectory + "/cleaned/" + job["Name"] + ".CSV",
            self.job["destinationSchema"],
            self.job["tableName"].lower().replace("f1_", "") + job["Name"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["outputDelimiter"])

        rsConnect.close()
    except Exception as e:
        self.logger.exception(
            "we had an error in FoxPro.LoadIntoRedshift() while loading data into Redshift")
        self.logger.exception("{}".format(str(e)))

def LoadCSVFile(self, localFilePath, loadName):
    '''
    For each file we need to process, provide the data loader
    the s3 key and destination table name
    '''
    self.logger.info("Loading data into Redshift")
    rsConnect = None
    try:
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilePath, self.job["destinationSchema"],
            self.job["tableName"] + loadName["redshiftTableSuffix"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["delimiter"])
    except Exception:
        self.logger.exception("Exception in PGCREIA860.LoadCSVFile")
        self.logger.exception(
            "Error while uploading to table:{}, filePath:{}".format(
                self.job["tableName"] + loadName["redshiftTableSuffix"],
                localFilePath))
        raise
    finally:
        # rsConnect is initialized to None above so this check is safe even if Connect() fails
        if rsConnect is not None:
            rsConnect.close()

    self.logger.info(self.moduleName + " - Finished Processing S3 file: " +
                     loadName["redshiftTableSuffix"])

def LoadAllFromS3(self, s3Source, tableName):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        s3DataSource = "s3://" + self.job["bucketName"] + s3Source

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": tableName,
                "s3Filename": s3DataSource,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")
    except Exception:
        self.logger.error(
            self.moduleName +
            " - Error while trying to save into Redshift from s3 folder.")
        raise

def ProcessS3File(self, srcFileParameter):
    '''
    For each file we need to process, provide the data loader
    the s3 key and destination table name
    '''
    jobParams = dict(self.job)
    jobParams["s3Filename"] = "s3://" + self.job["bucketName"] + "/" + \
        self.job["s3SrcDirectory"] + "/" + srcFileParameter["s3Filename"]
    jobParams["tableName"] = self.job["tableName"] + \
        srcFileParameter["redshiftTableSuffix"]
    self.logger.info(self.moduleName + " - Processing S3 file: " +
                     jobParams["s3Filename"])

    rsConnect = RedshiftUtilities.Connect(
        dbname=self.awsParams.redshift['Database'],
        host=self.awsParams.redshift['Hostname'],
        port=self.awsParams.redshift['Port'],
        user=self.awsParams.redshiftCredential['Username'],
        password=self.awsParams.redshiftCredential['Password'])

    RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3, jobParams,
                                     self.logger)
    rsConnect.close()

    self.logger.info(self.moduleName + " - Finished Processing S3 file: " +
                     jobParams["s3Filename"])

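# Illustrative sketch only: the fields ProcessS3File() above consumes. Key names are
# taken from the code; the values are hypothetical placeholders. Combined with the
# job-level "bucketName", "s3SrcDirectory" and "tableName" keys, this would produce
# s3Filename = "s3://<bucketName>/<s3SrcDirectory>/plants.csv.gz" and a destination
# table of self.job["tableName"] + "_plants".
_EXAMPLE_SRC_FILE_PARAMETER = {
    "s3Filename": "plants.csv.gz",
    "redshiftTableSuffix": "_plants"
}
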
def LoadAllFromS3(self):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": self.job["tableName"],
                "s3Filename": s3DataFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")

        self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error(
            self.moduleName +
            " - Error while trying to save into Redshift from s3 folder.")
        raise

def ProcessS3File(self, srcFileParameter):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File" + " starting ")
    s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.localTempDirectory + "/raw/" + fileName
    #----------------------------------------------------------------------
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localGzipFilepath)

    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    self.logger.info(self.moduleName + " - Processing Excel file: " +
                     localExcelFilepath)
    self.fileUtilities.DeleteFile(localGzipFilepath)

    fileNameNoExt = fileName.split('.', 1)[0]
    outPutFileName = self.fileUtilities.csvFolder + fileNameNoExt + '.csv'

    xl = ExcelUtilities(self.logger)
    xl.Excel2CSV(localExcelFilepath,
                 srcFileParameter["excelSheetName"],
                 outPutFileName,
                 self.fileUtilities.csvFolder,
                 skiprows=srcFileParameter["skipRows"])

    self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/raw/")

    for tables in srcFileParameter["tables"]:
        fname = self.fileUtilities.CreateTableSql(tables,
                                                  self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(fname, self.logger)

    # -----------------------------------------------------------------------------
    self.logger.info(self.moduleName + " - Loading data into Redshift...")
    rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
    RedshiftUtilities.LoadFileIntoRedshift(
        rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
        outPutFileName, tables["schemaName"], tables["table"],
        self.job["fileFormat"], self.job["dateFormat"], self.job["delimiter"])

    # Cleanup
    rsConnect.close()
    self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " +
                      s3Key + " finished ")

def LoadCategory(self, srcCategory):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
        processingFile = self.DownloadFile(srcCategory)
        fileOut = processingFile.replace(".dbf", ".txt")

        dbfUtils = DBFUtilities(self.logger)
        dbfUtils.ConvertToCSV(processingFile, fileOut, srcCategory["delimiter"],
                              False)

        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        self.logger.info(self.moduleName + " - Loading file " + fileOut + "...")
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            fileOut, self.job["destinationSchema"],
            self.job["tableName"] + "_" + srcCategory["srcCategory"],
            srcCategory["fileFormat"], srcCategory["dateFormat"],
            srcCategory["delimiter"])

        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName +
                          " - Error while trying to load category...")
        raise Exception(err.message)

def ProcessRequest(self):
    '''
    process steps:
    pulls file from share and place in raw folder
    '''
    try:
        rawFolder = self.localTempDirectory + '/raw/'
        flist = self.DownloadFile(rawFolder)

        xl = ExcelUtilities(self.logger)
        outPutFileName = self.fileUtilities.csvFolder + self.moduleName + '.csv'
        for fl in flist:
            xl.Excel2CSV(rawFolder + fl,
                         'Sheet1',
                         outPutFileName,
                         self.fileUtilities.csvFolder)

        outputGZ = self.fileUtilities.gzipFolder + self.moduleName + '.csv.gz'
        self.fileUtilities.GzipFile(outPutFileName, outputGZ)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        for tables in self.job["tables"]:
            fname = self.fileUtilities.CreateTableSql(tables,
                                                      self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)

        self.BulkUploadToS3()
        self.LoadData(tables)
    except:
        self.logger.exception(self.moduleName +
                              " - we had an error in ProcessRequest")
        raise

def AsyncBuildTables(self, proc):
    '''
    build the tables in RedShift
    '''
    try:
        ###
        #  first create the file so we can use it
        ###
        self.logger.info(self.moduleName + " - " + proc["processname"] +
                         " - SQL tables starting.")
        sqlTableCreationScript = ApplicationBase.BuildTableCreationScriptTable(
            self, proc["sqlscript"], proc["processname"],
            templateFolder="sql", sqlFolder="sql")
        ###
        #  now we create the table from the file created
        ###
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - " + proc["processname"] +
                         " - SQL tables creation finished.")
    except:
        self.logger.exception(self.moduleName +
                              "- we had an error in AsyncBuildTables")
        raise

def CompareResults():
    '''
    Compare results between stage and prod
    '''
    logger = None
    schemas = ["eaa_dev.cfl_", "eaa_prod."]
    # NOTE: the next assignment overrides the list above with a dev-only comparison
    schemas = ["eaa_dev.cfl_", "eaa_dev."]
    print("Processing. Compare results between schemas: " + schemas[0] + " and " + schemas[1])

    tables = ["auto_lv_production", "auto_lv_sales", "auto_parc", "cftc",
              "eia_pet_imports_series_attributes", "eia_pet_imports_series_data",
              "eia_pet_series_attributes", "eia_pet_series_data",
              "eia_steo_series_attributes", "eia_steo_series_data",
              "totem", "opis_retail_price", "opis_retail_volume",
              "chemicals", "enp", "rigcount"]
    # NOTE: the next assignment narrows the comparison to the EIA tables only
    tables = ["eia_pet_imports_series_attributes", "eia_pet_imports_series_data",
              "eia_pet_series_attributes", "eia_pet_series_data",
              "eia_steo_series_attributes", "eia_steo_series_data"]

    awsParams = ConfigureAWS.ConfigureAWS()
    awsParams.LoadAWSConfiguration(logger)
    rsConnect = psycopg2.connect(dbname=awsParams.redshift['Database'],
                                 host=awsParams.redshift['Hostname'],
                                 port=awsParams.redshift['Port'],
                                 user=awsParams.redshiftCredential['Username'],
                                 password=awsParams.redshiftCredential['Password'])

    for table in tables:
        recCount = {}
        for schema in schemas:
            redshiftTableName = schema + table
            try:
                recCount[schema] = RedshiftUtilities.GetRecordCount(
                    rsConnect, redshiftTableName)
            except psycopg2.ProgrammingError:
                recCount[schema] = -1  # Skip this table and mark count as -1
        diff = recCount[schemas[0]] - recCount[schemas[1]]
        flag = " *** Count Different! ***" if diff != 0 else ""
        print("Table: " + table + ", " + schemas[0] + ": " + str(recCount[schemas[0]]) +
              ", " + schemas[1] + ": " + str(recCount[schemas[1]]) +
              ", diff: " + str(diff) + flag)

    rsConnect.close()

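# CompareResults() reads like a stand-alone QC script; a minimal way to run it from
# the command line (an assumption, not shown in the source) would be:
if __name__ == '__main__':
    CompareResults()
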
def PrepODATACat(self, paramsList, filelocs, currProcId):
    '''
    gets a list of the type of data that we are going to pull and the fields for each
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "PrepODATACat " + " started ")
        processTableArray = []
        endpoint = self.job["ODATA"]["endpoint"]

        muHelper = MagellanUtilities()
        muHelper.logger = self.logger
        commonParams = {}
        commonParams["sqlFolder"] = self.fileUtilities.sqlFolder
        commonParams["csvFolder"] = self.fileUtilities.csvFolder
        commonParams["gzipFolder"] = self.fileUtilities.gzipFolder
        muHelper.commonParams = commonParams

        for catalog in self.job["ODATA"]["Catalogs"]:
            if catalog["execute"] == "Y":
                passparms = {
                    "endpoint": endpoint,
                    "name": catalog["name"],
                    "tables": catalog["tables"]
                }
                processTableArray.append(passparms)

                for tblJson in catalog["tables"]:
                    fname = self.fileUtilities.CreateTableSql(
                        tblJson, self.fileUtilities.sqlFolder)
                    RedshiftUtilities.PSqlExecute(fname, self.logger)

                self.GatherFields(catalog["tables"])
                self.filter, fromDate = self.CreateFilter(catalog, paramsList)
                self.logger.debug(fromDate)
                self.PullOdata(passparms, fromDate)

                for tblJson in catalog["tables"]:
                    if "destName" in tblJson:
                        self.MoveToS3(tblJson["s3subfolder"][:4], fromDate,
                                      tblJson["s3subfolder"])
                        self.LoadData(fromDate, tblJson["s3subfolder"], tblJson)

                if self.etlUtilities.SetInstanceParameters(
                        filelocs["tblEtl"]["table"],
                        currProcId,
                        json.dumps({"lastrun": fromDate,
                                    "daysback": self.job["daysback"]})) is not True:
                    self.logger.info(self.moduleName +
                                     " - we could not set the instance.")

                self.UpdateTable(filelocs["tblEtl"]["schemaName"],
                                 filelocs["tblEtl"]["table"],
                                 catalog["tables"], currProcId)

        self.logger.debug(self.moduleName + " -- " + "PrepODATACat " + " finished ")
    except:
        self.logger.exception(self.moduleName +
                              "- we had an error in PrepODATACat")
        raise

def LoadCategory(self, srcCategory):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
        processingFile = self.DownloadFile(srcCategory)
        processingCSV = self.GetCSVFile(processingFile, srcCategory)

        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        self.logger.debug(self.moduleName + " - Loading file " + processingCSV + "...")
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            processingCSV, self.job["destinationSchema"],
            self.job["tableName"] + "_" + srcCategory["srcCategory"],
            srcCategory["fileFormat"], srcCategory["dateFormat"],
            srcCategory["delimiter"])

        self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
    except Exception as err:
        self.logger.error(self.moduleName +
                          " - Error while trying to load category...")
        raise Exception(err.message)

def ProcessCategory(self, rsConnect, srcCategory):
    '''
    Executes the processing for a single category configured...
    '''
    url = srcCategory["url"]
    self.logger.info(self.moduleName + " - Processing url: " + url)

    localFilepath = self.localTempDirectory + "/" + ntpath.basename(srcCategory["url"])
    fileDownload = urllib.URLopener()
    fileDownload.retrieve(url, localFilepath)

    self.fileUtilities.UnzipFile(localFilepath, self.localTempDirectory)
    localFilepath = self.localTempDirectory + "/" + srcCategory["unzipFilename"]

    redshiftDestTable = self.job["tableName"] + srcCategory["redshiftTableSuffixOrigin"]

    RedshiftUtilities.LoadFileIntoRedshift(
        rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
        localFilepath, self.job["destinationSchema"], redshiftDestTable,
        self.job["fileFormat"], srcCategory["dateFormat"], self.job["delimiter"])

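# Illustrative sketch only: one "srcCategory" entry with the keys ProcessCategory()
# above reads. Key names come from the code; the values are hypothetical.
_EXAMPLE_SRC_CATEGORY = {
    "url": "http://example.com/source_data.zip",
    "unzipFilename": "source_data.csv",
    "redshiftTableSuffixOrigin": "_origin",
    "dateFormat": "auto"
}
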
def AsyncLoadFilesToRedShift(self, proc):
    '''
    load files into RedShift
    '''
    try:
        for pFile in proc["processfile"]:
            pFileNoPath = pFile.replace('/', '_')
            self.logger.debug(self.moduleName + " -- " +
                              "AsyncLoadFilesToRedShift for " + pFile + " starting ")
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
            outputfileName = self.localTempDirectory + '/scrubbed/' + 'scrub_' + pFileNoPath
            rsTable = 'working_' + proc["processname"]
            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                outputfileName, self.job["destinationSchema"], rsTable,
                self.job["fileFormat"], self.job["dateFormat"],
                self.job["delimiter"])
            rsConnect.close()
            self.logger.debug(self.moduleName + " -- " +
                              "AsyncLoadFilesToRedShift for " + pFile + " finished ")
    except:
        self.logger.exception(
            self.moduleName +
            "- we had an error in AsyncLoadFilesToRedShift ")
        raise

def LoadData(self, folderName, subFolder, tblJson):
    '''
    load the data from s3 into RedShift
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "LoadData" + " starting ")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
            "/" + folderName + '/' + subFolder

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": tblJson["table"],
                "s3Filename": s3folder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")

        self.logger.info(self.moduleName + " - Finished loading s3 data to " +
                         self.job["destinationSchema"] + '.' + tblJson["table"])
        rsConnect.close()
        self.logger.debug(self.moduleName + " -- " + "LoadData" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in LoadData")
        raise

def ExecuteCreateTableScript(self, serieT, tableT):
    '''
    Executes the federated timeseries tables script
    '''
    tableName = self.job['federatedTables'][serieT][tableT]['name']
    tb = {}
    tb['fields'] = []
    tb['schemaName'] = self.job['destinationSchema']
    tb['table'] = tableName
    tb["new"] = self.job['federatedTables'][serieT][tableT]['new']
    tb['sortkey'] = self.job['federatedTables'][serieT]['sortkey']
    if 'distkey' in self.job['federatedTables'][serieT]:
        if self.job['federatedTables'][serieT]['distkey'] != '':
            tb['distkey'] = self.job['federatedTables'][serieT]['distkey']

    for field in self.job['federatedTables'][serieT]['fields']:
        if 'exclusiveTo' in field:
            if tableT in field['exclusiveTo']:
                tb['fields'].append(field)
        else:
            tb['fields'].append(field)

    fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
    RedshiftUtilities.PSqlExecute(fname, self.logger)

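# Illustrative sketch only: the shape of one self.job['federatedTables'][serieT]
# entry that ExecuteCreateTableScript() above walks. The keys sortkey, distkey,
# fields, exclusiveTo and the per-table name/new come from the code; the tableT keys
# shown ("data", "history") and the per-field sub-keys ("name", "type") are
# assumptions about the job file, and all values are hypothetical.
_EXAMPLE_FEDERATED_SERIES = {
    "sortkey": "name",
    "distkey": "name",
    "fields": [
        {"name": "name", "type": "VARCHAR(100)"},
        {"name": "value", "type": "FLOAT", "exclusiveTo": ["data"]}
    ],
    "data": {"name": "series_data", "new": "Y"},
    "history": {"name": "series_data_history", "new": "Y"}
}
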
def LoadData(self, iso, localFilePath, fp):
    '''
    Method to load ISO data into Redshift
    '''
    self.logger.info("Loading ISO data into Redshift")
    rsConnect = RedshiftUtilities.Connect(
        dbname=self.awsParams.redshift['Database'],
        host=self.awsParams.redshift['Hostname'],
        port=self.awsParams.redshift['Port'],
        user=self.awsParams.redshiftCredential['Username'],
        password=self.awsParams.redshiftCredential['Password'])

    RedshiftUtilities.LoadFileIntoRedshift(
        rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
        localFilePath + fp, self.job["destinationSchema"],
        self.job["tableName"] + iso["Name"], self.job["fileFormat"],
        self.job["dateFormat"], self.job["delimiter"])

def AsyncBuildCXR(self, proc, basesql):
    '''
    actually build the cross reference tables
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "AsyncBuildCXR for " +
                          proc["name"] + " starting ")
        ###
        #  first create the file so we can use it
        ###
        sqlTableCreationScript = ApplicationBase.BuildTableCreationScriptTable(
            self, basesql, proc["name"], templateFolder="sql", sqlFolder="sql")
        ###
        #  execute DDL so we now have the blank table
        ###
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.debug(self.moduleName + " -- " + "AsyncBuildCXR for " +
                          proc["name"] + " finished ")
    except:
        self.logger.exception(self.moduleName +
                              "- we had an error in AsyncBuildCXR")
        raise

def LoadAllFromS3(self):
    '''
    Load all CSVs from the Vantage's S3 bucket into Redshift
    '''
    rsConnect = None
    try:
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        for dsScript in self.job["extractingScripts"]:
            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                    "s3Filename": s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ",
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

        self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
    except Exception:
        self.logger.error(
            self.moduleName +
            " - Error while trying to save into Redshift from s3 folder.")
        raise
    finally:
        if rsConnect is not None:
            rsConnect.close()

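# Illustrative sketch only: the "extractingScripts" job fragment that LoadAllFromS3()
# above iterates. Only "tableSuffix" is read here (it selects both the target table
# and the <suffix>.CSV.GZ object under s3ToDirectory); any other per-script keys
# would be used elsewhere. Values are hypothetical.
_EXAMPLE_EXTRACTING_SCRIPTS = [
    {"tableSuffix": "COSTS"},
    {"tableSuffix": "ASSUMPTIONS"}
]
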
def LoadAllData(self):
    '''
    Process:
    1) push Attribute and data gz files to S3
    2) load data into Redshift from S3
    '''
    self.CreateFolders("N")  # this just sets the variables we will need
    self.fileUtilities = FileUtilities(self.logger)

    rsConnect = RedshiftUtilities.Connect(
        dbname=self.awsParams.redshift['Database'],
        host=self.awsParams.redshift['Hostname'],
        port=self.awsParams.redshift['Port'],
        user=self.awsParams.redshiftCredential['Username'],
        password=self.awsParams.redshiftCredential['Password'])

    for table in self.job["tables"]:
        ###
        # first create zip files for all we want to send to S3
        ###
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
        if table["type"] == "attributes":
            sourceFolder = self.gzipFolder + "attr"
            destFolder = s3folder + "/attribute"
        else:  # data types
            sourceFolder = self.gzipFolder + "data"
            destFolder = s3folder + "/data"

        S3Utilities.CopyItemsAWSCli(sourceFolder, destFolder,
                                    '''--recursive --quiet --include "*.gz"''')

        RedshiftUtilities.LoadDataFromS3(
            rsConnect, self.awsParams.s3, {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": table["name"],
                "s3Filename": destFolder,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }, self.logger, "N")

        # S3Utilities.DeleteFileFromS3TempUsingAWSCLi(destFolder,
        #                                             '''--recursive --quiet --include "*.gz"''')

    rsConnect.close()

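# Illustrative sketch only: the per-table entries LoadAllData() above expects. The
# "type" value routes gzip files from the attr/data folders to the matching S3
# prefix, and "name" is the Redshift destination table. Values are hypothetical.
_EXAMPLE_TABLES = [
    {"name": "series_attributes", "type": "attributes"},
    {"name": "series_data", "type": "data"}
]
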
def Start(self, logger, moduleName, filelocs):
    '''
    main routine
    '''
    currProcId = None
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        self.logger.debug(self.moduleName + " -- " + " starting ")
        currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"],
                                                self.moduleName)
        ###
        # set up to run create folder
        ###
        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory
        self.fileUtilities.CreateFolders(self.job["folders"])
        self.fromDate = self.GetFromDate()
        ###
        localFilepath = self.GetMostRecentFile(self.job["foldertoscan"])
        # localFilepath = r'C:\tmp\IHS Markit Outlook for Global Oil Market Fundamentals - September 2017.xlsx'

        for tables in self.job["tables"]:
            fname = self.fileUtilities.CreateTableSql(tables,
                                                      self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)

            outPutFileName = self.fileUtilities.csvFolder +\
                self.fromDate +\
                "_" + tables["table"] + '.csv'
            outputGZ = self.fileUtilities.gzipFolder + self.fromDate +\
                "_" + tables["table"] + '.csv.gz'
            tableJson = tables

            xl = ExcelUtilities(logger)
            if sys.version[0] == '3':
                csvfile = open(outPutFileName, 'w', newline='')
            elif sys.version[0] == '2':
                csvfile = open(outPutFileName, 'wb')
            csvWriter = csv.writer(csvfile, quoting=csv.QUOTE_ALL)

            if localFilepath is not None:
                self.ProcessFile(xl, localFilepath, csvWriter)
            csvfile.close()

            self.fileUtilities.GzipFile(outPutFileName, outputGZ)
            self.BulkUploadToS3()
            self.LoadData(tableJson)

        if self.job["cleanlocal"] == "Y":
            self.fileUtilities.RemoveFolder(self.localTempDirectory)

        self.logger.debug(self.moduleName + " -- " + " finished ")
    except Exception as err:
        self.logger.exception(moduleName + " - Exception! Error: " + err.message)
        if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],
                                              currProcId, 'F') is not True:
            self.logger.info(self.moduleName + " - we could not Complete Instance.")
        raise Exception(err.message)

def CreatePostgresTables(self, psConnect):
    '''
    Create table in local PS database
    Data will be removed
    '''
    sqlTableCreationScript = self.BuildTableCreationScript(self.job['psSqlScript'])

    # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
    RedshiftUtilities.ExecuteSQLScript(psConnect, sqlTableCreationScript, self.logger)
    self.logger.info(self.moduleName + " - SQL tables created.")

def ExecuteCreateTable(self):
    '''
    Checks if the tables need to be created
    '''
    tb = self.job['ddl']
    tb['schemaName'] = self.job['destinationSchema']
    fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
    RedshiftUtilities.PSqlExecute(fname, self.logger)

def UnloadData(self, s3TempFolder):
    '''
    Unloads only the names or series keys to s3
    '''
    unloadQuery = "select distinct name from " + self.job['destinationSchema'] + "." + \
        self.job['federatedTables']['seriesAttributes']['history']['name']

    rsConnect = RedshiftUtilities.Connect(
        dbname=self.awsParams.redshift['Database'],
        host=self.awsParams.redshift['Hostname'],
        port=self.awsParams.redshift['Port'],
        user=self.awsParams.redshiftCredential['Username'],
        password=self.awsParams.redshiftCredential['Password'])

    RedshiftUtilities.UnloadDataToS3(rsConnect, self.awsParams.s3, {
        "query": unloadQuery,
        "s3Folder": s3TempFolder
    }, self.logger)

def PostLoadETL(self):
    '''
    Create the L2 tables post data load
    '''
    postLoadScriptTemplate = self.job.get("PostLoadScript")
    if postLoadScriptTemplate is not None:
        sqlTableCreationScript = super(PGCRERCOT, self).BuildTableCreationScript(
            postLoadScriptTemplate)
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - SQL tables created.")

def CreateTables(self, sqlTemplateFilename):
    '''
    Create the actual tables
    '''
    sqlTableCreationScript = self.BuildTableCreationScript(sqlTemplateFilename)

    # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
    RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
    self.logger.info(self.moduleName + " - SQL tables created.")