def ProcessFiles(self, lastModifiedDatetime):
    '''
    Start processing the ERCOT files
    '''
    maxModifiedDatetime = None
    try:
        filesOnS3, maxModifiedDatetime = self.GetNewFiles(lastModifiedDatetime)
        filesOnS3 = [fl.replace(self.job["s3SrcDirectory"][1:], "") for fl in filesOnS3]
        for fileConfig in self.job["files"]:
            zipFiles = list(filter(re.compile(fileConfig["FileRegex"]).match, filesOnS3))
            for zipFileName in zipFiles:  # e.g. DAM_Hr_LMP_2011.zip
                self.DownloadFile(self.job["s3SrcDirectory"][1:] + zipFileName,
                                  self.localTempDirectory + "/raw/")
                self.fileUtilities.UnzipUsing7z(self.localTempDirectory + "/raw/" + zipFileName,
                                                self.localTempDirectory + "/output/")
                level2Files = self.fileUtilities.ScanFolder(self.localTempDirectory + "/output/")
                # Keep only the CSV archives; exclude everything else
                level2Files = [l2File for l2File in level2Files if l2File.lower().endswith("_csv.zip")]
                for l2File in level2Files:  # e.g. cdr.00012328.0000000000000000.20110101.131852.DAMHRLMPNP4183_csv.zip
                    l2zip = self.localTempDirectory + "/output/" + l2File
                    self.fileUtilities.UnzipUsing7z(l2zip, self.localTempDirectory + "/csvs/")
                    FileUtilities.RemoveFileIfItExists(l2zip)  # delete the archive after unzipping
                FileUtilities.RemoveFileIfItExists(self.localTempDirectory + "/raw/" + zipFileName)  # delete the parent file
        self.AddColumnSkipHeader(self.localTempDirectory + "/csvs/")  # add column DSTFlag if it doesn't exist and skip the header
        self.PackFiles(self.localTempDirectory + "/csvs/", self.localTempDirectory + "/packed/")
    except Exception:
        self.logger.exception("Error while processing ERCOT files")
        raise
    return maxModifiedDatetime
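# --- Illustrative sketch (not part of the module) ---
# A minimal standalone demo of the regex-driven file selection used above.
# The file names and pattern are hypothetical; the real values come from the
# job config's "files" entries (fileConfig["FileRegex"]).
import re

filesOnS3 = ["DAM_Hr_LMP_2011.zip", "DAM_Hr_LMP_2012.zip", "readme.txt"]
fileRegex = r"DAM_Hr_LMP_\d{4}\.zip"   # hypothetical FileRegex value

# re.match anchors at the start of the string, so the pattern only needs to
# describe the beginning of the file name.
zipFiles = list(filter(re.compile(fileRegex).match, filesOnS3))
print(zipFiles)  # ['DAM_Hr_LMP_2011.zip', 'DAM_Hr_LMP_2012.zip']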
def CreateUpdateScript(self, pSchema, pTable):
    '''
    Takes the template for the update SQL script and customizes it.
    '''
    sqlUpdateScript = None
    try:
        self.logger.debug(self.moduleName + " -- Update Table Script starting")
        sqlUpdateTemplate = self.location + '/' + self.job["sqlUpdateScript"]
        sqlUpdateScript = self.localTempDirectory + "/" + re.sub('Template.sql$', '.sql',
                                                                 self.job["sqlUpdateScript"])
        FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
        with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaName}', self.job["destinationSchema"])
                line = line.replace('{tbname}', 'Totem')
                line = line.replace('{tbtotem}', self.sourceTableName)
                line = line.replace('{tbstats}', pSchema + "." + pTable)
                line = line.replace('{procid}', str(self.currProcId))
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- Update Table Script finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in Update Table Script")
        raise
    return sqlUpdateScript
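# --- Illustrative sketch (not part of the module) ---
# A minimal demo of the placeholder substitution CreateUpdateScript performs,
# using a made-up template line and made-up values; the real placeholders and
# values come from the sqlUpdateScript template and the job config.
line = "UPDATE {schemaName}.{tbname} SET procid = {procid};"
line = line.replace('{schemaName}', 'eaa_dev')  # assumed destinationSchema
line = line.replace('{tbname}', 'Totem')
line = line.replace('{procid}', str(42))        # assumed current process id
print(line)  # UPDATE eaa_dev.Totem SET procid = 42;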
def CreatePullScript(self):
    '''
    Takes the template for the pull script and customizes it for the data we need.
    '''
    sqlPullDataScript = None
    try:
        self.logger.debug(self.moduleName + " -- CreatePullScript starting")
        sqlPullDataTemplate = self.location + '/sql/' + self.job["sqlPullDataScriptTemplate"]
        sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
            'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
        FileUtilities.RemoveFileIfItExists(sqlPullDataScript)
        with open(sqlPullDataTemplate) as infile, open(sqlPullDataScript, 'w') as outfile:
            for line in infile:
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreatePullScript finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreatePullScript")
        raise
    return sqlPullDataScript
def CreateMigrationScript(self):
    '''
    Takes the template for the migration SQL script and customizes it.
    '''
    sqlMigrateScript = None
    try:
        # Make sure that we have a place to put the sql script
        sqlScript = 'PopulateHistoryTemplate.sql'
        self.logger.debug(self.moduleName + " -- CreateMigrationScript starting")
        sqlMigrateTemplate = self.location + '/sql/' + sqlScript
        sqlMigrateScript = self.commonParams["sqlFolder"] + re.sub('Template.sql$', '.sql', sqlScript)
        FileUtilities.CreateFolder(self.commonParams["sqlFolder"])
        FileUtilities.RemoveFileIfItExists(sqlMigrateScript)
        # Gather the variables needed from the catalog in commonParams
        schemaName = None
        attrSrc = None
        dataSrc = None
        attrDest = None
        dataDest = None
        orderByFields = None
        partByFields = None
        for table in self.commonParams["cat"]["tables"]:
            schemaName = table["schemaName"]
            if table["type"] == "attributes":
                attrSrc = table["srctable"]
                attrDest = table["table"]
                if "partition" in table:
                    orderByFields = table["partition"]["order"]
                    partByFields = table["partition"]["over"]
            elif table["type"] == "series":
                dataSrc = table["srctable"]
                dataDest = table["table"]
        with open(sqlMigrateTemplate) as infile, open(sqlMigrateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaName}', schemaName)
                line = line.replace('{attrSrc}', attrSrc)
                line = line.replace('{dataSrc}', dataSrc)
                line = line.replace('{attrDest}', attrDest)
                line = line.replace('{dataDest}', dataDest)
                # Guard against a missing "partition" section; str.replace(None) would raise
                line = line.replace('{orderByFields}', orderByFields or '')
                line = line.replace('{partByFields}', partByFields or '')
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreateMigrationScript finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreateMigrationScript")
        raise
    return sqlMigrateScript
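# --- Illustrative sketch (not part of the module) ---
# The shape of commonParams["cat"]["tables"] that CreateMigrationScript
# expects, inferred from the loop above; every name and value here is
# illustrative only.
cat = {
    "tables": [
        {
            "schemaName": "eaa_dev",
            "type": "attributes",
            "srctable": "working_attributes",
            "table": "attributes_history",
            "partition": {"order": "object_id", "over": "object_id, date"},
        },
        {
            "schemaName": "eaa_dev",
            "type": "series",
            "srctable": "working_data",
            "table": "data_history",
        },
    ]
}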
def CreateUpdateScript(self, pEtlSchema, pEtlTable, tblJson, currProcId):
    '''
    Takes the template for the update SQL script and customizes it.
    '''
    sqlUpdateScript = None
    try:
        self.logger.debug(self.moduleName + " -- Update Table Script starting")
        sqlUpdateTemplate = self.location + '/' + self.job["sqlUpdateScript"]
        sqlUpdateScript = self.localTempDirectory + "/" + re.sub('Template.sql$', '.sql',
                                                                 self.job["sqlUpdateScript"])
        FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
        # Gather the source/destination table names from the table config
        tbattributesourceName = None
        tbattributedestinationName = None
        tbdatasourceName = None
        tbdatadestinationName = None
        for table in tblJson:
            if "destName" in table:
                if table["type"] == "attributes":
                    tbattributesourceName = table["table"]
                    tbattributedestinationName = table["destName"]
                elif table["type"] == "series":
                    tbdatasourceName = table["table"]
                    tbdatadestinationName = table["destName"]
        with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaName}', self.job["destinationSchema"])
                line = line.replace('{tbattributesourceName}', tbattributesourceName)
                line = line.replace('{tbattributedestinationName}', tbattributedestinationName)
                line = line.replace('{tbdatasourceName}', tbdatasourceName)
                line = line.replace('{tbdatadestinationName}', tbdatadestinationName)
                line = line.replace('{tbstats}', pEtlSchema + "." + pEtlTable)
                line = line.replace('{procid}', str(currProcId))
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- Update Table Script finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in Update Table Script")
        raise
    return sqlUpdateScript
def ClassifyFiles(self):
    '''
    Classifies the files as transactions, ident, contracts or indexPub.
    '''
    self.logger.info("Inside PGCRFERCFilings.ClassifyFiles")
    # Get the list of all CSV files. FileUtilities.ScanFolder doesn't do a
    # recursive listing, hence a dedicated recursive method is used.
    searchPath = self.localTempDirectory + self.job["folderPath"]["raw"]
    fileNames = self.fileUtilities.GetListOfFilesRecursively(searchPath, filetype="*.CSV")
    self.logger.info("{} files found".format(len(fileNames)))
    for fileName in fileNames:
        try:
            # transactions, contracts, ident or indexPub
            fileType = PGCRFERCQuarterlyFilings.GetFileType(os.path.basename(fileName))
            outputFileName = os.path.basename(fileName)
            # Special handling for transactions to speed up the file processing:
            # use native file processing for transactions files
            if fileType == "transactions":
                self.SaveTransactions(fileType, fileName, outputFileName)
            else:
                self.SaveAsCSV(fileType, fileName, outputFileName)
            FileUtilities.RemoveFileIfItExists(fileName)  # delete the input file after processing
        except Exception:
            self.logger.exception(
                "Exception in PGCRFERCFilings.ClassifyFiles while handling the file: {}".format(fileName))
            raise
def CreateUpdSpcCharScript(self, dbCommon, tblJson):
    '''
    Takes the template for the special-character update SQL script and customizes it.
    '''
    specialCharacterScript = None
    try:
        self.logger.debug(self.moduleName + " -- CreateUpdSpcCharScript starting")
        specialCharacterScriptTemplate = self.location + '/sql/' + dbCommon[tblJson["specialCharacterScript"]]
        outName = re.sub('Template.sql$', '.sql', dbCommon[tblJson["specialCharacterScript"]])
        outName = re.sub("TableName", tblJson["table"], outName)
        specialCharacterScript = self.localTempDirectory + "/sql/" + outName
        FileUtilities.RemoveFileIfItExists(specialCharacterScript)
        # Build the SET clause, one assignment per field that declares special characters
        fields = "set "
        cmaNdx = 0
        for fldDesc in tblJson["fields"]:
            if "specialcharacters" in fldDesc:
                fldName = fldDesc["name"]
                numConversion = len(fldDesc["specialcharacters"])
                repString = self.BuildReplaceString(fldName, fldDesc["specialcharacters"], numConversion)
                if cmaNdx > 0:
                    fields = fields + ", "
                cmaNdx = 1
                fields = fields + fldName + " = " + str(repString)
        with open(specialCharacterScriptTemplate) as infile, open(specialCharacterScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaname}', tblJson["schemaName"])
                line = line.replace('{workingtable}', tblJson["workingtable"])
                line = line.replace('{desttable}', tblJson["table"])
                line = line.replace('{fieldnames}', fields)
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreateUpdSpcCharScript finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreateUpdSpcCharScript")
        raise
    return specialCharacterScript
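# --- Illustrative sketch (not part of the module) ---
# How the comma-separated SET clause accumulates, with a stand-in for
# BuildReplaceString (the real implementation is not shown here; this version
# assumes it emits one nested SQL replace(...) per special character).
def BuildReplaceString(fldName, specialCharacters, numConversion):
    expr = fldName
    for conv in specialCharacters:
        expr = "replace(%s, '%s', '%s')" % (expr, conv["find"], conv["replace"])
    return expr

fieldConfigs = [  # hypothetical "fields" entries
    {"name": "city", "specialcharacters": [{"find": "&", "replace": "and"}]},
    {"name": "state"},  # no special characters: skipped
    {"name": "notes", "specialcharacters": [{"find": "|", "replace": "/"}]},
]
fields = "set "
cmaNdx = 0
for fldDesc in fieldConfigs:
    if "specialcharacters" in fldDesc:
        if cmaNdx > 0:
            fields = fields + ", "
        cmaNdx = 1
        fields = fields + fldDesc["name"] + " = " + BuildReplaceString(
            fldDesc["name"], fldDesc["specialcharacters"], len(fldDesc["specialcharacters"]))
print(fields)
# set city = replace(city, '&', 'and'), notes = replace(notes, '|', '/')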
def CreatePullScript(self, paramsList):
    '''
    Takes the template for the pull script and customizes it for the data we need.
    '''
    sqlPullDataScript = None
    try:
        self.logger.debug(self.moduleName + " -- CreatePullScript starting")
        lastDate = None
        fromDate = ''
        if len(paramsList) > 0 and "lastrun" in paramsList:
            lastDate = paramsList["lastrun"]
        if lastDate is not None:
            # Never pull past yesterday: clamp a future last-run date back
            fromDate = datetime.datetime.strptime(lastDate, '%m/%d/%Y')
            if fromDate > datetime.datetime.today() - datetime.timedelta(days=1):
                fromDate = datetime.date.today() - datetime.timedelta(days=1)
            fromDate = datetime.datetime.strftime(fromDate, '%m/%d/%Y')
        sqlPullDataTemplate = self.location + '/sql/' + self.job["sqlPullDataScriptTemplate"]
        sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
            'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
        FileUtilities.RemoveFileIfItExists(sqlPullDataScript)
        with open(sqlPullDataTemplate) as infile, open(sqlPullDataScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{lastrundate}', fromDate)
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreatePullScript finished")
        if fromDate != self.cBlank:  # "!=", not "is not": compare values, not identities
            fromDate = datetime.datetime.strptime(fromDate, '%m/%d/%Y')
            fromDate = fromDate.strftime('%Y%m%d')
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreatePullScript")
        raise
    return sqlPullDataScript, fromDate
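# --- Illustrative sketch (not part of the module) ---
# A standalone demo of the date-clamping logic above: the pull never starts
# later than yesterday, even if the recorded "lastrun" value is in the future.
import datetime

lastDate = '12/31/2099'  # hypothetical "lastrun" value
fromDate = datetime.datetime.strptime(lastDate, '%m/%d/%Y')
if fromDate > datetime.datetime.today() - datetime.timedelta(days=1):
    fromDate = datetime.date.today() - datetime.timedelta(days=1)
print(fromDate.strftime('%m/%d/%Y'))  # yesterday, formatted mm/dd/yyyy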
def CleanAndPack(self):
    '''
    Removes blank lines & empty strings ("") from the files, then gzips them.
    '''
    self.logger.info("Inside PGCRFERCFilings.CleanAndPack")
    for folder in list(self.job["folderPath"].keys()):
        files = self.fileUtilities.ScanFolder(self.localTempDirectory + "/" + folder + "/")
        for iFile in files:
            inputFile = self.localTempDirectory + "/" + folder + "/" + iFile
            cleanedFile = self.localTempDirectory + "/" + folder + "/cleaned_" + iFile
            self.fileUtilities.RemoveBlankLines(inputFile, cleanedFile)
            FileUtilities.RemoveFileIfItExists(inputFile)
            self.fileUtilities.GzipFile(cleanedFile, cleanedFile + ".gz")  # gzip to a distinct .gz target
            self.fileUtilities.DeleteFile(cleanedFile)  # delete the cleaned CSV once packed
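# --- Illustrative sketch (not part of the module) ---
# What a GzipFile helper presumably does; the real FileUtilities
# implementation is not shown here, so this is a stdlib-only stand-in.
import gzip
import shutil

def gzip_file(src_path, dst_path):
    # Stream-copy the source file into a gzip archive at dst_path.
    with open(src_path, 'rb') as src, gzip.open(dst_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)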
def CreatePullScript(self, dbCommon, table, currVal, incVal, mxValue):
    '''
    Takes the template for the pull script and customizes it for the data we
    need, based on the fields in the config file.
    '''
    sqlPullDataScript = None
    try:
        self.logger.debug(self.moduleName + " -- CreatePullScript for " + table["table"] + " starting")
        sqlTemplate = dbCommon["sqlPullDataScriptTemplate"]
        if "pullTemplate" in table:
            templateType = table["pullTemplate"]
            sqlTemplate = dbCommon[str(templateType)]
        sqlPullDataTemplate = self.location + '/sql/' + sqlTemplate
        # Fix the name of the output script
        outName = re.sub('Template.sql$', '.sql', sqlTemplate)
        outName = re.sub(dbCommon["name"], table["table"], outName)
        sqlPullDataScript = self.localTempDirectory + "/sql/" + str(currVal) + "_" + outName
        FileUtilities.RemoveFileIfItExists(sqlPullDataScript)
        fields = self.GetInnerFields(table["fields"])
        # Advance the key window, clamping the final chunk at the max value
        self.fromKey = currVal + incVal
        if self.fromKey > mxValue:
            self.fromKey = mxValue
        whereClause = self.GetWhereClause(table, currVal)
        with open(sqlPullDataTemplate) as infile, open(sqlPullDataScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{intable}', table["sourcetable"])
                line = line.replace('{infields}', fields)
                line = line.replace('{whereclause}', whereClause)
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreatePullScript finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreatePullScript")
        raise
    return sqlPullDataScript
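# --- Illustrative sketch (not part of the module) ---
# How currVal/incVal/mxValue walk the key space in fixed-size chunks, with the
# final chunk clamped at the maximum value; names here are hypothetical.
def key_ranges(min_key, max_key, increment):
    current = min_key
    while current < max_key:
        high = min(current + increment, max_key)  # clamp the last chunk
        yield current, high
        current = high

print(list(key_ranges(0, 2500, 1000)))  # [(0, 1000), (1000, 2000), (2000, 2500)]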
def ProcessFiles(self):
    '''
    Controls the workflow for the conversion, clean up and pack of the input files.
    '''
    self.logger.info(self.moduleName + " - Processing file: " + self.processingFile)
    rawFileName = self.localTempDirectory + "/" + self.processingFile
    csvFilename = self.localTempDirectory + "/" + self.processingFile.split(".")[0] + ".csv"
    try:
        columnNames = []
        df = pandas.read_excel(rawFileName,
                               sheet_name=self.job["worksheetName"],  # "sheetname" was removed in newer pandas
                               index_col=None,
                               na_values=None,
                               skiprows=self.job["skipRows"],
                               skipfooter=self.job["skipFooter"])
        for colName in df.head(0):  # iterating a DataFrame yields its column labels
            if colName not in self.job["columns_no_melt"]:
                columnNames.append(self.FormatColNameDate(colName))
            else:
                columnNames.append(colName)
        df.columns = columnNames
        df = df.melt(id_vars=self.job["columns_no_melt"])
        df.to_csv(csvFilename, header=False, sep=str(self.job["delimiter"]),
                  encoding='utf-8', index=False)
        self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
        self.fileUtilities.DeleteFile(csvFilename)
    except XLRDError:
        self.logger.info(self.moduleName + " - No tab named '" + self.job["worksheetName"] +
                         "' in " + self.processingFile)
    except Exception:
        self.logger.error(self.moduleName + " - Error while trying to process file " + self.processingFile)
        raise
    finally:
        FileUtilities.RemoveFileIfItExists(rawFileName)
def ProcessFiles(self, dbCommon):
    '''
    Controls the workflow for the conversion, clean up and pack of the input files.
    '''
    srcFileName = dbCommon["srcSharedFolder"] + dbCommon["fileName"]
    self.logger.info(self.moduleName + " - Processing file: " + srcFileName)
    dstFileName = self.fileUtilities.csvFolder + dbCommon["fileName"]
    shutil.copyfile(srcFileName, dstFileName)
    csvFilename = dstFileName + ".csv"
    try:
        columnNames = []
        df = pandas.read_excel(dstFileName,
                               sheet_name=dbCommon["worksheetName"],
                               index_col=None,
                               na_values=None,
                               skiprows=dbCommon["skipRows"],
                               skipfooter=dbCommon["skipFooter"])  # "skip_footer" is deprecated
        for colName in df.head(0):
            if colName not in dbCommon["columns_no_melt"]:
                columnNames.append(self.FormatColNameDate(colName))
            else:
                columnNames.append(colName)
        df.columns = columnNames
        df = df.melt(id_vars=dbCommon["columns_no_melt"])
        df.to_csv(csvFilename, header=False, sep=str(dbCommon["delimiter"]),
                  encoding='utf-8', index=False)
    except XLRDError:
        self.logger.info(self.moduleName + " - No tab named '" + dbCommon["worksheetName"] +
                         "' in " + dstFileName)
    except Exception:
        self.logger.error(self.moduleName + " - Error while trying to process file " + dstFileName)
        raise
    finally:
        FileUtilities.RemoveFileIfItExists(dstFileName)
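# --- Illustrative sketch (not part of the module) ---
# A self-contained demo of the wide-to-long melt performed above; the column
# names are hypothetical, and id_vars plays the role of
# dbCommon["columns_no_melt"].
import pandas

df = pandas.DataFrame({"Country": ["US", "DE"],
                       "2016": [1.0, 2.0],
                       "2017": [1.5, 2.5]})
df = df.melt(id_vars=["Country"])
print(df)
#   Country variable  value
# 0      US     2016    1.0
# 1      DE     2016    2.0
# 2      US     2017    1.5
# 3      DE     2017    2.5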
def CreateUpdateScript(self, dbCommon, tblJson):
    '''
    Takes the template for the update SQL script and customizes it.
    '''
    sqlUpdateScript = None
    try:
        self.logger.debug(self.moduleName + " -- Update Table Script starting")
        sqlUpdateTemplate = self.location + '/sql/' + dbCommon["sqlUpdateScript"]
        outName = re.sub('Template.sql$', tblJson["table"] + '.sql', dbCommon["sqlUpdateScript"])
        sqlUpdateScript = self.localTempDirectory + "/sql/" + outName
        FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
        with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{destschemaname}', tblJson["schemaName"])
                line = line.replace('{workingschemaname}', tblJson["updateSection"]["workingschemaname"])
                line = line.replace('{workingtable}', tblJson["updateSection"]["workingtable"])
                line = line.replace('{desttable}', tblJson["table"])
                line = line.replace('{keys}', tblJson["updateSection"]["keyfields"])
                line = line.replace('{join}', tblJson["updateSection"]["join"])
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- Update Table Script finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in Update Table Script")
        raise
    return sqlUpdateScript
def ProcessFiles(self):
    '''
    Controls the workflow for the conversion, clean up and pack of the input files.
    '''
    filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)
    for emFile in filesToProcess:
        self.logger.info(self.moduleName + " - Processing file: " + emFile)
        rawFileName = self.rawFolder + "/" + emFile
        csvFilename = self.fileUtilities.csvFolder + os.path.splitext(emFile)[0] + ".csv"
        try:
            surveyDatedt = self.GetData(rawFileName, "getSurveyDate")
            if isinstance(surveyDatedt, float):
                surveyDatedt = self.FormatSurveyDate(emFile)
            elif isinstance(surveyDatedt, basestring):
                if "," in surveyDatedt:
                    tmpDatedt = datetime.strptime(surveyDatedt, '%B %d, %Y')
                    surveyDatedt = datetime.strftime(tmpDatedt, "%Y-%m-%d")
            df = self.GetData(rawFileName)
            df = self.DfCleanUp(df, surveyDatedt)
            df.to_csv(csvFilename, header=False, sep=str(self.job["delimiter"]),
                      encoding='utf-8', index=False)
        except XLRDError:
            self.logger.info(self.moduleName + " - No tab named '" + self.job["worksheetName"] +
                             "' in " + emFile)
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to process " + emFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)
def RecursivelyUnzipFiles(self, srcDirectory):
    '''
    Recursively unzips the files.
    '''
    srcDirectory = srcDirectory.strip()  # trim trailing spaces, if any
    if srcDirectory[-1] != "/":  # make sure the path ends with a forward slash
        srcDirectory = srcDirectory + "/"
    self.logger.debug("Unzipping folder: " + srcDirectory)
    # Get the list of files in the given path and unzip them
    files = self.fileUtilities.ScanFolder(srcDirectory)
    for unzippedFile in files:
        try:
            if unzippedFile.lower().endswith(".zip"):  # we are looking for zip files only
                inputFilename = srcDirectory + "/" + unzippedFile  # full path to the zip file
                outputFolder = unzippedFile.split(".")[0]  # the file name without the zip part
                outputDirectory = srcDirectory + "/" + outputFolder  # directory the file is unzipped into
                FileUtilities.CreateFolder(outputDirectory)  # create the folder to be unzipped into
                self.fileUtilities.UnzipUsing7z(inputFilename, outputDirectory)  # unzip using the 7z utility
                FileUtilities.RemoveFileIfItExists(inputFilename)  # delete the zip file after unzipping it
                self.RecursivelyUnzipFiles(srcDirectory + "/" + outputFolder)  # recurse into the new folder
        except Exception:
            self.logger.exception(
                "Exception in PGCRFERCFilings.RecursivelyUnzipFiles while unzipping file: {}".format(unzippedFile))
            raise
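# --- Illustrative sketch (not part of the module) ---
# The same recursive pattern using only the stdlib zipfile module (the
# production code shells out to 7z via FileUtilities). Recursion terminates
# once a folder contains no further .zip files.
import os
import zipfile

def recursively_unzip(src_directory):
    for name in os.listdir(src_directory):
        if not name.lower().endswith(".zip"):
            continue
        zip_path = os.path.join(src_directory, name)
        out_dir = os.path.join(src_directory, name.split(".")[0])
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with zipfile.ZipFile(zip_path) as archive:
            archive.extractall(out_dir)
        os.remove(zip_path)         # drop the archive once extracted
        recursively_unzip(out_dir)  # descend into the newly created folder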
def CreatePullScript(self, tables, lastDate):
    '''
    Takes the template for the pull script and customizes it for the data we need.
    '''
    sqlPullDataScript = None
    try:
        self.logger.debug(self.moduleName + " -- CreatePullScript starting")
        fromDate = ''  # initialized so the template substitution below is always defined
        if lastDate is not None:
            # Never pull past yesterday: clamp a future last-run date back
            fromDate = datetime.datetime.strptime(lastDate, '%m/%d/%Y')
            if fromDate > datetime.datetime.today() - datetime.timedelta(days=1):
                fromDate = datetime.date.today() - datetime.timedelta(days=1)
            fromDate = datetime.datetime.strftime(fromDate, '%m/%d/%Y')
        sqlPullDataTemplate = self.location + '/SQL/' + tables["pullTemplate"]
        sqlPullDataScript = self.fileUtilities.sqlFolder + re.sub('Template.sql$', '.sql',
                                                                  tables["pullTemplate"])
        FileUtilities.RemoveFileIfItExists(sqlPullDataScript)
        with open(sqlPullDataTemplate) as infile, open(sqlPullDataScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{lastrundate}', fromDate)
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- CreatePullScript finished")
    except Exception:
        self.logger.exception(self.moduleName + " - we had an error in CreatePullScript")
        raise
    return sqlPullDataScript
def ProcessFiles(self, lastModifiedDatetime):
    '''
    Start processing the ERCOT files
    '''
    maxModifiedDatetime = None
    try:
        filesOnS3, maxModifiedDatetime = self.GetNewFiles(lastModifiedDatetime)
        filesOnS3 = [fl.replace(self.job["s3SrcDirectory"][1:], "") for fl in filesOnS3]
        for fileConfig in self.job["files"]:
            zipFiles = list(filter(re.compile(fileConfig["FileRegex"]).match, filesOnS3))
            for zipFileName in zipFiles:  # e.g. cdr.00012328.0000000000000000.20170619.123601002.DAMHRLMPNP4183_csv.zip
                self.DownloadFile(self.job["s3SrcDirectory"][1:] + zipFileName,
                                  self.localTempDirectory + "/raw/")
                self.fileUtilities.UnzipUsing7z(
                    self.localTempDirectory + "/raw/" + zipFileName.split("/")[1],
                    self.localTempDirectory + "/output/")
                FileUtilities.RemoveFileIfItExists(self.localTempDirectory + "/raw/" + zipFileName)  # delete the parent file
        self.AddColumnSkipHeader(self.localTempDirectory + "/output/")  # add column DSTFlag if it doesn't exist and skip the header
        self.PackFiles(self.localTempDirectory + "/output/", self.localTempDirectory + "/packed/")
    except Exception:
        self.logger.exception("Error while processing ERCOT files")
        raise
    return (maxModifiedDatetime, len(filesOnS3))
class ECRConnect(ApplicationBase):
    '''
    This class is used to get the Risk data from IHS Connect,
    transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(ECRConnect, self).__init__()
        self.awsParams = ""
        self.csvFile = None
        self.csvFileHistory = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def TransformToCsv(self, jData):
        '''
        Transforms the json payload into csv files.
        '''
        try:
            # Gets the latest version of each risk
            df = json_normalize(jData, 'Risks', ['Country'])
            df['ClassName'] = ''
            df['ClassAvg'] = ''
            df = df[['Country', 'Name', 'Value', 'Description', 'ClassName', 'ClassAvg', 'UpdatedOn']]
            df.to_csv(self.csvFile, header=False, sep=str(self.job["delimiter"]),
                      encoding='utf-8', index=False)
            self.fileUtilities.GzipFile(self.csvFile, self.csvFile + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFile)

            # Gets the history
            df = json_normalize(jData, ['Risks', 'History'], ['Country', ['Risks', 'Name']])
            df = df[['Country', 'Risks.Name', 'Value', 'UpdatedOn']]
            df.to_csv(self.csvFileHistory, header=False, sep=str(self.job["delimiter"]),
                      encoding='utf-8', index=False)
            self.fileUtilities.GzipFile(self.csvFileHistory, self.csvFileHistory + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFileHistory)
        except Exception as err:
            self.logger.error("Error while trying to transform json to csv. Error: " + err.message)
            raise

    def GetAndTransform(self):
        '''
        Downloads and transforms all files.
        '''
        try:
            request = urllib2.Request(self.job["connectAPI"]["baseurl"] +
                                      self.job["connectAPI"]["riskService"])
            base64string = base64.b64encode('%s:%s' % (self.job["connectAPI"]["username"],
                                                       self.job["connectAPI"]["password"]))
            request.add_header("Authorization", "Basic %s" % base64string)
            response = urllib2.urlopen(request)
            jData = json.load(response)
            self.TransformToCsv(jData)
        except Exception as err:
            self.logger.error("Error while trying to get and transform from IHS Connect API service. Error: " + err.message)
            raise

    def LoadAllFromS3(self, s3Source, tableName):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            s3DataSource = "s3://" + self.job["bucketName"] + s3Source
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])
            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3,
                {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": tableName,
                    "s3Filename": s3DataSource,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                },
                self.logger, "N")
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to save into Redshift from s3 folder.")
            raise

    def UploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be loaded later.
        '''
        self.logger.info(self.moduleName + " - Uploading GZIP files to s3 folder...")
        fileName = self.job["fileNameOut"] + ".gz"
        fileNameHistory = self.job["fileNameOutHistory"] + ".gz"
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileName,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] + '/' + fileName)
        S3Utilities.CopyItemsAWSCli(
            self.localTempDirectory + "/" + fileNameHistory,
            's3://' + self.job["bucketName"] + self.job["s3ToDirectory"] + '/' + fileNameHistory)

    def ExecutePostETL(self):
        '''
        Executes the post-load sql script.
        '''
        try:
            sqlTemplate = self.location + "/" + self.job["postSQLScript"]
            sqlScript = self.localTempDirectory + "/" + self.job["postSQLScript"]
            self.fileUtilities.CreateActualFileFromTemplate(sqlTemplate, sqlScript,
                                                            self.job["destinationSchema"],
                                                            self.job["tableName"])
            RedshiftUtilities.PSqlExecute(sqlScript, self.logger)
        except Exception as err:
            self.logger.error(self.moduleName + " - Error while updating the countries codes. Message: " + err.message)
            raise

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.csvFile = self.localTempDirectory + "/" + self.job["fileNameOut"]
            self.csvFileHistory = self.localTempDirectory + "/" + self.job["fileNameOutHistory"]
            self.GetAndTransform()
            self.UploadToS3()
            self.LoadAllFromS3(self.job["s3ToDirectory"] + '/' + self.job["fileNameOut"] + '.gz',
                               self.job["tableName"])
            self.LoadAllFromS3(self.job["s3ToDirectory"] + '/' + self.job["fileNameOutHistory"] + '.gz',
                               self.job["tableName"] + '_history')
            self.LoadAllFromS3(self.job["xReference"]["s3DataDirectory"],
                               self.job["tableName"] + self.job["xReference"]["tableNameSfx"])
            self.ExecutePostETL()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)
            raise Exception(err.message)
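# --- Illustrative sketch (not part of the module) ---
# The JSON shape TransformToCsv expects, inferred from the json_normalize
# calls above; all field values are made up. On older pandas, import
# json_normalize from pandas.io.json instead.
from pandas import json_normalize

jData = [{
    "Country": "Mexico",
    "Risks": [{
        "Name": "Currency", "Value": 2.5,
        "Description": "FX risk", "UpdatedOn": "2017-01-01",
        "History": [{"Value": 2.4, "UpdatedOn": "2016-07-01"}],
    }],
}]
latest = json_normalize(jData, 'Risks', ['Country'])
history = json_normalize(jData, ['Risks', 'History'], ['Country', ['Risks', 'Name']])
print(history.columns.tolist())  # ['Value', 'UpdatedOn', 'Country', 'Risks.Name']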
class Vantage(ApplicationBase):
    '''
    This class is used to get the Vantage data from the IHS Vantage database,
    transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()
        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow through the different data sets coming from the Vantage DB.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")
                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location + dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" + dsScript["tableSuffix"] + ".CSV",
                    self.job["delimiter"])
        except Exception as err:
            self.logger.error("Error while trying to Bulk Extract all. Message: " + err.message)
            raise

    def TransformAndPackAll(self):
        '''
        Compresses the csv files created.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")
        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile
                self.logger.info(self.moduleName + " started compressing file: " + rFile)
                self.fileUtilities.GzipFile(rFileFull, self.packedFolder + "/" + rFile + ".GZ")
                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(self.moduleName + " Error while compressing raw files. Message: " + err.message)
            raise

    def LoadAllFromS3(self):
        '''
        Loads all CSVs from Vantage's S3 bucket into Redshift.
        '''
        rsConnect = None
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])
            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3,
                    {
                        "destinationSchema": self.job["destinationSchema"],
                        "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename": s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ",
                        "fileFormat": self.job["fileFormat"],
                        "dateFormat": self.job["dateFormat"],
                        "delimiter": self.job["delimiter"]
                    },
                    self.logger, "N")
            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be loaded later.
        '''
        self.logger.info(self.moduleName + " - Uploading GZIP files to s3 folder...")
        S3Utilities.CopyItemsAWSCli(self.packedFolder,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
                                    "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"
            # Start from clean folders on every run
            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)
            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)
            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)
            raise Exception(err.message)