def CreateMigrationScript(self):
    '''
    Takes the template for the migration SQL script and customizes it
    '''
    sqlMigrateScript = None
    try:
        ###
        #  make sure that we have a place to put the sql script
        ###
        sqlScript = 'PopulateHistoryTemplate.sql'
        self.logger.debug(self.moduleName + " -- " + "CreateMigrationScript" + " starting ")
        sqlMigrateTemplate = self.location + '/sql/' + sqlScript
        sqlMigrateScript = self.commonParams["sqlFolder"] + re.sub('Template.sql$', '.sql', sqlScript)
        FileUtilities.CreateFolder(self.commonParams["sqlFolder"])
        FileUtilities.RemoveFileIfItExists(sqlMigrateScript)
        ###
        #  gather the substitution values from the table catalog in commonParams["cat"]
        ###
        schemaName = None
        attrSrc = None
        dataSrc = None
        attrDest = None
        dataDest = None
        orderByFields = None
        partByFields = None
        for table in self.commonParams["cat"]["tables"]:
            schemaName = table["schemaName"]
            if table["type"] == "attributes":
                attrSrc = table["srctable"]
                attrDest = table["table"]
                if "partition" in table:
                    orderByFields = table["partition"]["order"]
                    partByFields = table["partition"]["over"]
            elif table["type"] == "series":
                dataSrc = table["srctable"]
                dataDest = table["table"]
        with open(sqlMigrateTemplate) as infile, open(sqlMigrateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaName}', schemaName)
                line = line.replace('{attrSrc}', attrSrc)
                line = line.replace('{dataSrc}', dataSrc)
                line = line.replace('{attrDest}', attrDest)
                line = line.replace('{dataDest}', dataDest)
                line = line.replace('{orderByFields}', orderByFields)
                line = line.replace('{partByFields}', partByFields)
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- " + "CreateMigrationScript" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in CreateMigrationScript")
        raise
    return sqlMigrateScript
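# Hedged illustration (not part of the module): the template is assumed to use
# {schemaName}, {attrSrc}, {attrDest}, {dataSrc}, {dataDest}, {orderByFields}
# and {partByFields} placeholders. The snippet below replays the substitution
# on one hypothetical template line, with names taken from the example "cat"
# nugget documented in the commonParams constructor in this repo.
templateLine = "INSERT INTO {schemaName}.{attrDest} SELECT * FROM {schemaName}.{attrSrc};"
substitutions = {
    '{schemaName}': 'eaa_dev',
    '{attrSrc}': 'GEForcast_giif_attributes_working',
    '{attrDest}': 'GEForcastHistory_giif_attributes',
}
for placeholder, value in substitutions.items():
    templateLine = templateLine.replace(placeholder, value)
# templateLine now reads:
#   INSERT INTO eaa_dev.GEForcastHistory_giif_attributes
#   SELECT * FROM eaa_dev.GEForcast_giif_attributes_working;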
def CreateSQLFiles(self, proc, dest):
    '''
    Routine to create the SQL files used to create tables in Redshift
    '''
    try:
        for table in proc["tables"]:
            fname = self.commonParams["sqlFolder"] + "Create_" + table["name"] + ".sql"
            self.logger.info(fname)
            outfile = open(fname, "w")
            outLine = "DROP TABLE IF EXISTS {}.{};".format(dest, table["name"])
            outLine = FileUtilities.PutLine(outLine, outfile)
            outLine = "CREATE TABLE {}.{} (".format(dest, table["name"])
            outLine = FileUtilities.PutLine(outLine, outfile)
            ndx = 0
            for fld in table["fields"]:
                if ndx > 0:
                    outLine = ','  # every column after the first is prefixed with a comma
                ndx = ndx + 1
                outLine = outLine + fld["name"] + " " + fld["type"]
                if fld["type"] == "VARCHAR":
                    outLine = outLine + "(" + fld["size"] + ") ENCODE LZO"
                outLine = FileUtilities.PutLine(outLine, outfile)
            outfile.write(");")
            outfile.close()
    except:
        self.logger.exception(self.moduleName + "- we had an error in CreateSQLFiles ")
        raise
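# Hedged usage sketch: the catalog entry below mirrors the "cat" example
# documented in the commonParams constructor in this repo; the expected file
# content in the trailing comment assumes FileUtilities.PutLine writes its
# line and returns an empty string.
exampleProc = {
    "tables": [{
        "name": "GEForcastHistory_giif_data",
        "fields": [
            {"name": "object_id", "type": "VARCHAR", "size": "30"},
            {"name": "date", "type": "DATE"},
            {"name": "value", "type": "FLOAT8"},
        ],
    }]
}
# self.CreateSQLFiles(exampleProc, "eaa_dev") would write
# Create_GEForcastHistory_giif_data.sql roughly as:
#   DROP TABLE IF EXISTS eaa_dev.GEForcastHistory_giif_data;
#   CREATE TABLE eaa_dev.GEForcastHistory_giif_data (
#   object_id VARCHAR(30) ENCODE LZO
#   ,date DATE
#   ,value FLOAT8
#   );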
def ProcessFiles(self, lastModifiedDatetime):
    '''
    Start processing the ERCOT files
    '''
    maxModifiedDatetime = None
    try:
        filesOnS3, maxModifiedDatetime = self.GetNewFiles(lastModifiedDatetime)
        filesOnS3 = [fl.replace(self.job["s3SrcDirectory"][1:], "") for fl in filesOnS3]
        for fileConfig in self.job["files"]:
            zipFiles = list(filter(re.compile(fileConfig["FileRegex"]).match, filesOnS3))
            for zipFileName in zipFiles:  # eg DAM_Hr_LMP_2011.zip
                self.DownloadFile(self.job["s3SrcDirectory"][1:] + zipFileName,
                                  self.localTempDirectory + "/raw/")
                self.fileUtilities.UnzipUsing7z(self.localTempDirectory + "/raw/" + zipFileName,
                                                self.localTempDirectory + "/output/")
                level2Files = self.fileUtilities.ScanFolder(self.localTempDirectory + "/output/")
                level2Files = [l2File for l2File in level2Files
                               if l2File.lower().endswith("_csv.zip")]  # exclude all non-csv files
                for l2File in level2Files:  # eg cdr.00012328.0000000000000000.20110101.131852.DAMHRLMPNP4183_csv.zip
                    l2zip = self.localTempDirectory + "/output/" + l2File
                    self.fileUtilities.UnzipUsing7z(l2zip, self.localTempDirectory + "/csvs/")
                    FileUtilities.RemoveFileIfItExists(l2zip)  # delete the file after unzipping
                FileUtilities.RemoveFileIfItExists(self.localTempDirectory + "/raw/" + zipFileName)  # delete the parent file
        self.AddColumnSkipHeader(self.localTempDirectory + "/csvs/")  # Add column DSTFlag if it doesn't exist and skip header
        self.PackFiles(self.localTempDirectory + "/csvs/", self.localTempDirectory + "/packed/")
    except:
        self.logger.exception("Error while processing ERCOT files")
        raise
    return maxModifiedDatetime
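# Hedged illustration: one entry of self.job["files"] as ProcessFiles reads it.
# The regex is a guess that would match the example archive name mentioned
# above (DAM_Hr_LMP_2011.zip), not the real job configuration.
exampleFileConfig = {"FileRegex": r"DAM_Hr_LMP_\d{4}\.zip"}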
def __init__(self):
    '''
    Initial settings
    '''
    super(VantageAthenaSpark, self).__init__()
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CheckWorkingFolders(self):
    '''
    Removes and re-creates the working folders so each run starts clean
    '''
    self.logger.info(self.moduleName + " -- Checking on working folders...")
    FileUtilities.RemoveFolder(self.rawFolder)
    FileUtilities.RemoveFolder(self.csvFolder)
    FileUtilities.CreateFolder(self.rawFolder)
    FileUtilities.CreateFolder(self.csvFolder)
def __init__(self):
    '''
    Initial settings
    '''
    super(AutoInsight, self).__init__()
    self.awsParams = ""
    self.processingFile = None
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Initial settings
    '''
    super(ECRConnectAthenaSpark, self).__init__()
    self.jDataPulled = False
    self.xRefPulled = False
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CleanWorkingFolders(self):
    '''
    Ensures the folders are cleaned and ready before the process execution.
    '''
    self.logger.info("Cleaning local working folders...")
    FileUtilities.RemoveFolder(self.tempFolder)
    FileUtilities.RemoveFolder(self.packedFolder)
    FileUtilities.CreateFolder(self.tempFolder)
    FileUtilities.CreateFolder(self.packedFolder)
def __init__(self):
    '''
    Initial settings
    '''
    super(ECRConnect, self).__init__()
    self.awsParams = ""
    self.csvFile = None
    self.csvFileHistory = None
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Initial settings
    '''
    super(Vantage, self).__init__()
    self.awsParams = ""
    self.packedFolder = None
    self.rawFolder = None
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CreateFolders(self):
    '''
    Creates each working folder if it doesn't exist;
    if it already exists, empties the folder contents
    '''
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["downloadPath"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + "/cleaned/")
    FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
    for fp in self.job["foxpro_files"]:
        FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/" + fp["Name"] + "/")
def __init__(self):
    '''
    Initial settings
    '''
    super(PGCRAirMarketsAthenaSpark, self).__init__()
    self.awsParams = ""
    self.tempFolder = None
    self.packedFolder = None
    self.rawDataFolder = None
    self.toPackFiles = []
    self.fileUtilities = FileUtilities(self.logger)
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CreateFolders(self):
    '''
    Empties the working folders defined under job["folderPath"],
    creating any that don't already exist
    '''
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["raw"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["ident"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["transactions"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["contracts"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["indexPub"])
def __init__(self):
    '''
    Initial settings
    '''
    super(HistoricalBrentAthenaSpark, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CleanFiles(self, iso):
    '''
    Generic cleaning wrapper
    '''
    self.logger.info("Cleaning data")
    ignoreLines = None
    columnCount = None
    inputPath = self.localTempDirectory + "/" + iso["Name"] + "/"
    outputPath = self.localTempDirectory + "/" + iso["Name"] + "/cleaned/"
    FileUtilities.CreateFolder(outputPath)  # creates the cleaned folder if it doesn't already exist
    if iso.get("IgnoreLines") is not None:
        ignoreLines = iso.get("IgnoreLines")
    if iso.get("column_count") is not None:
        columnCount = iso.get("column_count")
    listOfFiles = self.fileUtilities.GetListOfFiles(
        inputPath, self.job["input_file_type"])  # get all the CSV files
    self.logger.info("Files found: {}".format(str(len(listOfFiles))))
    for fp in listOfFiles:
        try:
            self.fileUtilities.CleanFile(inputPath + fp,
                                         outputPath + fp,
                                         IgnoreLines=ignoreLines,
                                         ColumnCount=columnCount,
                                         Delimiter=self.job["delimiter"])
        except Exception as ex:
            self.logger.exception("Error while cleaning the MISO file: {}".format(fp))
            self.logger.exception("{}".format(str(ex)))
            raise
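# Hedged illustration: the "iso" argument is one entry of the job configuration.
# The keys below are exactly the ones CleanFiles reads; the values are made-up
# placeholders, and the value formats are assumptions.
exampleIso = {
    "Name": "MISO",       # sub-folder under the local temp directory holding the raw files
    "IgnoreLines": 1,     # optional: leading lines to drop from each file
    "column_count": 12,   # optional: expected number of columns per row
}
# self.CleanFiles(exampleIso)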
def __init__(self, logger):
    '''
    Constructor
    '''
    self.logger = logger
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Initial settings
    '''
    super(AthenaSparkIncrementalExample, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def CreatePullScript(self):
    '''
    Takes the template for the pull script and produces the working copy used
    to pull the data (the template is currently copied as-is, with no substitutions)
    '''
    sqlPullDataScript = None
    try:
        self.logger.debug(self.moduleName + " -- " + "CreatePullScript" + " starting ")
        sqlPullDataTemplate = self.location + '/sql/' + self.job["sqlPullDataScriptTemplate"]
        sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
            'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
        FileUtilities.RemoveFileIfItExists(sqlPullDataScript)
        with open(sqlPullDataTemplate) as infile, open(sqlPullDataScript, 'w') as outfile:
            for line in infile:
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- " + "CreatePullScript" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in CreatePullScript")
        raise
    return sqlPullDataScript
def __init__(self):
    '''
    Constructor
    '''
    super(MagellanODATAAthenaSpark, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Initial settings
    '''
    super(FHWA, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Constructor for this class
    '''
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
    self.awsParams = ""
def CreateUpdateScript(self, pSchema, pTable):
    '''
    Takes the template for the update SQL script and customizes it
    '''
    sqlUpdateScript = None
    try:
        self.logger.debug(self.moduleName + " -- " + "UpDate Table Script" + " starting ")
        sqlUpdateTemplate = self.location + '/' + self.job["sqlUpdateScript"]
        sqlUpdateScript = self.localTempDirectory + "/" + re.sub(
            'Template.sql$', '.sql', self.job["sqlUpdateScript"])
        FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
        with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript, 'w') as outfile:
            for line in infile:
                line = line.replace('{schemaName}', self.job["destinationSchema"])
                line = line.replace('{tbname}', 'Totem')
                line = line.replace('{tbtotem}', self.sourceTableName)
                line = line.replace('{tbstats}', pSchema + "." + pTable)
                line = line.replace('{procid}', str(self.currProcId))
                outfile.write(line)
        self.logger.debug(self.moduleName + " -- " + "UpDate Table Script" + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in UpDate Table Script")
        raise
    return sqlUpdateScript
def ProcessApps(logger, processArray, folderlocs):
    '''
    ProcessApps processes all the applications that are turned on.
    '''
    FileUtilities.CreateFolder(folderlocs["relativeOutputfolder"])
    try:
        ab = ApplicationBase()
        ev = ab.LoadEnvironmentVariables(logger)
        if "tblEtl" in folderlocs:
            etlUtilities = EtlLoggingUtilities(logger)
            etlUtilities.awsParams = ev.awsParams
            etlUtilities.appschema = folderlocs["tblEtl"]["appschema"]
            etlUtilities.etlSchema = folderlocs["tblEtl"]["schemaName"]
        for proc in processArray:
            module = proc["module"]
            baseName = module.rsplit('.', 1)[1]
            logger.info(baseName + " - Starting module.")
            moduleName = importlib.import_module(module)
            className = getattr(moduleName, baseName)()
            className.Start(logger, baseName, folderlocs)  # For single threading
            if "tblEtl" in folderlocs:
                procid = etlUtilities.GetRunID(folderlocs["tblEtl"]["table"], str(baseName))
                if procid > -1:
                    etlUtilities.CompleteInstance(folderlocs["tblEtl"]["table"], procid, 'C')
    except:
        logger.exception("Exception processing application modules!")
        raise
    logger.info(baseName + " - module COMPLETED.")
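# Hedged illustration of the inputs ProcessApps expects; the package paths,
# output folder, and schema/table names below are placeholders, not real
# configuration. Note the last path component of each "module" must match the
# class name, since it is used for both the import and the getattr lookup.
exampleProcessArray = [
    {"module": "Applications.Vantage.Vantage"},
    {"module": "Applications.ECRConnect.ECRConnect"},
]
exampleFolderlocs = {
    "relativeOutputfolder": "output/",
    "tblEtl": {                      # optional block: enables ETL run logging
        "appschema": "app_schema_placeholder",
        "schemaName": "etl_schema_placeholder",
        "table": "etl_process_log",
    },
}
# ProcessApps(logger, exampleProcessArray, exampleFolderlocs)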
def __init__(self):
    '''
    Initial settings
    '''
    super(AutoLightVehiclesAthena, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def __init__(self):
    '''
    Constructor

    @summary: Explanation of the SQL
        The primary table is PeriodValues; all the other tables are used as supporting tables:
        Product --> to get the definition of the product if needed
        Location --> obtain the description of the location. Although I am against hardcoding,
            because of the layout of the data:
            1) LocationTypeID of 2 is regional and this data could be double dipped, so we are omitting those
            2) Exceptions to the rule are USR and DDR, which also have the chance to be double dipped
        Category --> obtain the description of the category ID
            1) We are only interested in specific categories and, based on our Chemical expert, they are:
                a. 10 --> Production
                b. 12 --> Total Supply
                c. 15 --> Total Demand
                d. 19 --> Domestic Demand
        SubCategory --> although we are not using this table it is important to note it, since we are
            only looking at the summary level (where the value = 0) rather than each broken-down level
            from the PeriodValues table

        A last comment: we are storing the name of the database in the config file since it could change
        periodically depending on what is provided; for instance, currently the last good database is
        WASP_2015t. There are two pulls of the database, each one 6 months apart. The first one will be
        in the form of WASP2015 and the next one, six months later, will have a "t" at the end, such as
        WASP2015t.
    '''
    super(ChemicalsAthenaSpark, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))
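# Hedged sketch only: the real query lives in this module's SQL template and
# the actual table/column names come from the WASP database (the column names
# below are guesses). This just restates the filters described in the
# constructor docstring above.
EXAMPLE_PULL_SQL = """
SELECT pv.*
FROM   PeriodValues pv
JOIN   Location  l ON l.LocationID = pv.LocationID
JOIN   Category  c ON c.CategoryID = pv.CategoryID
WHERE  c.CategoryID IN (10, 12, 15, 19)   -- Production, Total Supply, Total Demand, Domestic Demand
  AND  l.LocationTypeID <> 2              -- drop regional rows that could be double dipped
  AND  l.Location NOT IN ('USR', 'DDR')   -- exceptions that could also be double dipped
  AND  pv.SubCategory = 0                 -- summary level only
"""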
def SkipHeader(self, inputFolderPath):
    '''
    Skips the header
    '''
    csvFiles = self.fileUtilities.ScanFolder(inputFolderPath)
    for csvFile in csvFiles:
        FileUtilities.RemoveLines(inputFolderPath + csvFile, [1])
def __init__(self):
    '''
    Constructor
    '''
    super(Counter, self).__init__()
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))
def ProcessFiles(self, downloadedFiles):
    '''
    Process the XML files
    '''
    counter = 1
    for xmlFile in downloadedFiles:
        collection = xml.dom.minidom.parse(xmlFile).documentElement
        fileFullPath = self.localTempDirectory + "/kml" + str(counter) + ".csv"
        self.fileUtilities.CreateFile(fileFullPath)
        for folder in collection.getElementsByTagName("Folder"):
            if folder.getElementsByTagName("name")[0].childNodes[0].data == "LMP Points":  # name tag
                for placemark in folder.getElementsByTagName("Placemark"):
                    plantDetails = self.GetPlantDetails(placemark)
                    line = "{}|{}|{}|{}|{}|{}|{}|{}\n".format(plantDetails["name"],
                                                              plantDetails["settlementPoint"],
                                                              plantDetails["plantName"],
                                                              plantDetails["plantAddress"],
                                                              plantDetails["county"],
                                                              plantDetails["utility"],
                                                              plantDetails["latitude"],
                                                              plantDetails["longitude"])
                    FileUtilities.WriteToFile(fileFullPath, line)
        self.PackFile(fileFullPath)
        self.fileUtilities.DeleteFile(xmlFile)
        counter = counter + 1
def MoveToS3(self, localFolderName, folderName, subFolder):
    '''
    Move gzip files to S3 and clean up the local instance
        localFolderName --> local folder name
        folderName      --> folder name on S3
        subFolder       --> date
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + localFolderName + " starting ")
        ###
        #  move any gzip files to the s3 server
        ###
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
                   "/" + folderName + '/' + subFolder
        localFolder = self.fileUtilities.gzipFolder + localFolderName
        S3Utilities.SyncFolderAWSCli(localFolder,
                                     s3folder,
                                     args='''--quiet --include "*.gz"''',
                                     dbug="Y")
        # Cleanup local files
        FileUtilities.EmptyFolderContents(localFolder)
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + localFolderName + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in MoveToS3")
        raise
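# Hedged usage sketch: the names below are placeholders showing how the S3
# target is composed as "s3://<bucketName><s3GzipFolderBase>/<folderName>/<subFolder>".
# With bucketName="my-bucket" and s3GzipFolderBase="/gzips":
#
#   self.MoveToS3("/attributes", "giif", "20240101")
#
# would sync <gzipFolder>/attributes to s3://my-bucket/gzips/giif/20240101
# and then empty the local folder.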
def __init__(self):
    '''
    Initial settings

    commonParams = {}
    commonParams["cat"] = cat  { see below }
    commonParams["moduleName"] = self.moduleName
    commonParams["loggerParams"] = "log"
    commonParams["sqlFolder"] = self.sqlFolder

    cat is a json nugget that defines the tables required:

    cat = {
        "name": "giif",
        "tables": [
            {
                "schemaName": "eaa_dev",
                "srctable": "GEForcast_giif_attributes_working",
                "table": "GEForcastHistory_giif_attributes",
                "new": "Y",
                "type": "attributes",
                "partition": {
                    "over": "mnemonic, frequencychar",
                    "order": "publisheddate"
                },
                "fields": [
                    { "name": "object_id", "type": "VARCHAR", "size": "30" },
                    { "name": "name", "type": "VARCHAR", "size": "100" },
                    { "name": "mnemonic", "type": "VARCHAR", "size": "100" },
                    { "name": "frequencyChar", "type": "VARCHAR", "size": "2" },
                    { "name": "geo", "type": "VARCHAR", "size": "20" },
                    { "name": "startDate", "type": "Date" },
                    { "name": "endDate", "type": "Date" },
                    { "name": "updatedDate", "type": "Date" },
                    { "name": "publishedDate", "type": "Date" },
                    { "name": "longLabel", "type": "VARCHAR", "size": "2000" },
                    { "name": "dataEdge", "type": "VARCHAR", "size": "100" }
                ],
                "sortkey": "object_id"
            },
            {
                "schemaName": "eaa_dev",
                "srctable": "GEForcast_giif_data_working",
                "table": "GEForcastHistory_giif_data",
                "new": "Y",
                "type": "series",
                "fields": [
                    { "name": "object_id", "type": "VARCHAR", "size": "30" },
                    { "name": "date", "type": "DATE" },
                    { "name": "value", "type": "FLOAT8" }
                ],
                "sortkey": "object_id, date"
            }
        ]
    }
    '''
    self.commonParams = {}
    self.fileUtilities = None
    self.logger = None
    self.moduleName = None
    self.location = FileUtilities.PathToForwardSlash(
        os.path.dirname(os.path.abspath(__file__)))