Example #1
0
 def ProcessFiles(self, lastModifiedDatetime):
     '''
     Start processing the ERCOT files.

     Downloads every new zip file from S3 that matches one of the configured
     file regexes, unzips it (including the nested *_csv.zip archives it
     contains), then post-processes and packs the resulting CSV files.

     :param lastModifiedDatetime: only files modified after this moment are
                                  pulled from S3
     :return: the most recent modification datetime seen on S3
     '''
     maxModifiedDatetime = None
     try:
         filesOnS3, maxModifiedDatetime = self.GetNewFiles(lastModifiedDatetime)
         # strip the source-directory prefix (minus its leading slash) so the
         # regexes below match on the bare file names
         filesOnS3 = [fl.replace(self.job["s3SrcDirectory"][1:], "") for fl in filesOnS3]
         for fileConfig in self.job["files"]:
             regex = re.compile(fileConfig["FileRegex"])  # compile once per config entry
             zipFiles = list(filter(regex.match, filesOnS3))
             for zipFileName in zipFiles:  # eg DAM_Hr_LMP_2011.zip
                 self.DownloadFile(self.job["s3SrcDirectory"][1:] + zipFileName, self.localTempDirectory + "/raw/")
                 self.fileUtilities.UnzipUsing7z(self.localTempDirectory + "/raw/" + zipFileName, self.localTempDirectory + "/output/")
                 level2Files = self.fileUtilities.ScanFolder(self.localTempDirectory + "/output/")
                 # keep only the nested csv archives; all other files are ignored
                 level2Files = [l2File for l2File in level2Files if l2File.lower().endswith("_csv.zip")]
                 for l2File in level2Files:  # eg cdr.00012328.0000000000000000.20110101.131852.DAMHRLMPNP4183_csv.zip
                     l2zip = self.localTempDirectory + "/output/" + l2File
                     self.fileUtilities.UnzipUsing7z(l2zip, self.localTempDirectory + "/csvs/")
                     FileUtilities.RemoveFileIfItExists(l2zip)  # delete the file after unzipping
                 FileUtilities.RemoveFileIfItExists(self.localTempDirectory + "/raw/" + zipFileName)  # delete the parent file
                 self.AddColumnSkipHeader(self.localTempDirectory + "/csvs/")  # Add column DSTFlag if it doesn't exist and skip header
                 self.PackFiles(self.localTempDirectory + "/csvs/", self.localTempDirectory + "/packed/")
     except Exception:  # narrowed from a bare except (which also traps SystemExit/KeyboardInterrupt)
         self.logger.exception("Error while processing ERCOT files")
         raise
     return maxModifiedDatetime
Example #2
0
 def CreateUpdateScript(self, pSchema, pTable):
     '''
     Takes the template for the update SQL script and customizes it.

     :param pSchema: schema holding the ETL stats table (substituted for {tbstats})
     :param pTable:  name of the ETL stats table
     :return: full path of the generated SQL script
     '''
     sqlUpdateScript = None
     try:
         self.logger.debug(self.moduleName + " -- " +
                           "UpDate Table Script" + " starting ")
         sqlUpdateTemplate = self.location + '/' + self.job[
             "sqlUpdateScript"]
         # the generated file drops the "Template" suffix from the template name
         sqlUpdateScript = self.localTempDirectory + "/" + re.sub(
             'Template.sql$', '.sql', self.job["sqlUpdateScript"])
         FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
         with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript,
                                                      'w') as outfile:
             for line in infile:
                 line = line.replace('{schemaName}',
                                     self.job["destinationSchema"])
                 line = line.replace('{tbname}', 'Totem')
                 line = line.replace('{tbtotem}', self.sourceTableName)
                 line = line.replace('{tbstats}', pSchema + "." + pTable)
                 line = line.replace('{procid}', str(self.currProcId))
                 outfile.write(line)
         self.logger.debug(self.moduleName + " -- " +
                           "UpDate Table Script" + " finished ")
     except Exception:  # narrowed from a bare except
         self.logger.exception(self.moduleName +
                               " - we had an error in UpDate Table Script")
         raise
     return sqlUpdateScript
Example #3
0
    def CreatePullScript(self):
        '''
        Takes the template for the pull script and copies it verbatim to the
        temp sql folder (this variant substitutes no placeholders).

        :return: full path of the generated SQL script
        '''
        sqlPullDataScript = None
        try:
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " starting ")
            sqlPullDataTemplate = self.location + '/sql/' + self.job[
                "sqlPullDataScriptTemplate"]
            # the generated file drops the "Template" suffix from the template name
            sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
                'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
            FileUtilities.RemoveFileIfItExists(sqlPullDataScript)

            # straight line-by-line copy
            with open(sqlPullDataTemplate) as infile, open(
                    sqlPullDataScript, 'w') as outfile:
                for line in infile:
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in CreatePullScript")
            raise
        return sqlPullDataScript
Example #4
0
    def CreateMigrationScript(self):
        '''
        Takes the PopulateHistory template SQL script and customizes it.

        The schema/table names are gathered from the "cat" catalog in
        commonParams and substituted into the template.

        :return: full path of the generated SQL script
        :raises ValueError: if the catalog does not supply every placeholder value
        '''
        sqlMigrateScript = None
        try:
            sqlScript = 'PopulateHistoryTemplate.sql'
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " starting ")
            sqlMigrateTemplate = self.location + '/sql/' + sqlScript
            sqlMigrateScript = self.commonParams["sqlFolder"] + re.sub(
                'Template.sql$', '.sql', sqlScript)
            # make sure that we have a place to put the sql script
            FileUtilities.CreateFolder(self.commonParams["sqlFolder"])
            FileUtilities.RemoveFileIfItExists(sqlMigrateScript)
            ###
            #  gather variables needed
            ###
            schemaName = None
            attrSrc = None
            dataSrc = None
            attrDest = None
            dataDest = None
            orderByFields = None
            partByFields = None

            for table in self.commonParams["cat"]["tables"]:
                schemaName = table["schemaName"]
                if table["type"] == "attributes":
                    attrSrc = table["srctable"]
                    attrDest = table["table"]
                    if "partition" in table:
                        orderByFields = table["partition"]["order"]
                        partByFields = table["partition"]["over"]
                elif table["type"] == "series":
                    dataSrc = table["srctable"]
                    dataDest = table["table"]

            replacements = {
                '{schemaName}': schemaName,
                '{attrSrc}': attrSrc,
                '{dataSrc}': dataSrc,
                '{attrDest}': attrDest,
                '{dataDest}': dataDest,
                '{orderByFields}': orderByFields,
                '{partByFields}': partByFields
            }
            # fail fast with a clear message instead of an opaque TypeError
            # from str.replace(..., None) inside the write loop below
            missing = [key for key, val in replacements.items() if val is None]
            if missing:
                raise ValueError("CreateMigrationScript - catalog did not "
                                 "provide values for: " + ", ".join(missing))

            with open(sqlMigrateTemplate) as infile, open(
                    sqlMigrateScript, 'w') as outfile:
                for line in infile:
                    for key, val in replacements.items():
                        line = line.replace(key, val)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(
                self.moduleName +
                " - we had an error in CreateMigrationScript")
            raise
        return sqlMigrateScript
Example #5
0
    def CreateUpdateScript(self, pEtlSchema, pEtlTable, tblJson, currProcId):
        '''
        Takes the template for the update SQL script and customizes it.

        :param pEtlSchema: schema holding the ETL stats table
        :param pEtlTable:  name of the ETL stats table
        :param tblJson:    table catalog; entries with a "destName" supply the
                           attribute/series source and destination table names
        :param currProcId: current process id substituted for {procid}
        :return: full path of the generated SQL script
        '''
        sqlUpdateScript = None
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "UpDate Table Script" + " starting ")
            sqlUpdateTemplate = self.location + '/' + self.job[
                "sqlUpdateScript"]
            # the generated file drops the "Template" suffix from the template name
            sqlUpdateScript = self.localTempDirectory + "/" + re.sub(
                'Template.sql$', '.sql', self.job["sqlUpdateScript"])
            FileUtilities.RemoveFileIfItExists(sqlUpdateScript)
            ###
            #  gather variables needed
            ###
            tbattributesourceName = None
            tbattributedestinationName = None
            tbdatasourceName = None
            tbdatadestinationName = None

            for table in tblJson:
                if "destName" in table:
                    if table["type"] == "attributes":
                        tbattributesourceName = table["table"]
                        tbattributedestinationName = table["destName"]
                    elif table["type"] == "series":
                        tbdatasourceName = table["table"]
                        tbdatadestinationName = table["destName"]

            with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript,
                                                         'w') as outfile:
                for line in infile:
                    line = line.replace('{schemaName}',
                                        self.job["destinationSchema"])
                    line = line.replace('{tbattributesourceName}',
                                        tbattributesourceName)
                    line = line.replace('{tbattributedestinationName}',
                                        tbattributedestinationName)
                    line = line.replace('{tbdatasourceName}', tbdatasourceName)
                    line = line.replace('{tbdatadestinationName}',
                                        tbdatadestinationName)
                    line = line.replace('{tbstats}',
                                        pEtlSchema + "." + pEtlTable)
                    line = line.replace('{procid}', str(currProcId))
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "UpDate Table Script" + " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in UpDate Table Script")
            raise
        return sqlUpdateScript
 def ClassifyFiles(self):
     '''
     Classifies the files as Transactions, ident, contracts, indexPub.

     Walks every *.CSV under the raw folder, routes each file to the handler
     for its type, and deletes the input file once it has been processed.
     '''
     self.logger.info("Inside PGCRFERCFilings.ClassifyFiles")
     #=======================================================================
     # get the list of all csv files
     # FileUtilities.ScanFolder doesn't do a recursive listing, hence wrote a new method
     #=======================================================================
     searchPath = self.localTempDirectory + self.job["folderPath"]["raw"]
     fileNames = self.fileUtilities.GetListOfFilesRecursively(
         searchPath, filetype="*.CSV")
     self.logger.info("{} files found".format(len(fileNames)))
     for fileName in fileNames:
         # derive the path pieces BEFORE the try: folderPath used to be
         # assigned inside the try after calls that may raise, so the except
         # handler below could hit a NameError instead of logging
         outputFileName = os.path.basename(fileName)  # bare file name
         folderPath = os.path.dirname(fileName)  # containing directory
         try:
             fileType = PGCRFERCQuarterlyFilings.GetFileType(
                 outputFileName)  #transactions, contracts, ident, indexPub
             #===============================================================
             # Special handling for transactions to speed up the file processing
             # Use native file processing for transactions files
             #===============================================================
             if fileType == "transactions":
                 self.SaveTransactions(fileType, fileName, outputFileName)
             else:
                 self.SaveAsCSV(fileType, fileName, outputFileName)
             FileUtilities.RemoveFileIfItExists(
                 fileName)  #deletes the input file after processing
         except Exception:
             self.logger.exception(
                 "Exception in PGCRFERCFilings.ClassifyFiles while handling a file in the path: {}"
                 .format(folderPath))
             raise
Example #7
0
    def CreateUpdSpcCharScript(self, dbCommon, tblJson):
        '''
        Takes the template for the special-character update SQL script and
        customizes it.

        Builds a SET clause applying the configured special-character
        replacements to every field that declares them, then substitutes
        schema/table/field names into the template.

        :param dbCommon: common DB config; maps the template key to a file name
        :param tblJson:  table config (schemaName, workingtable, table, fields)
        :return: full path of the generated SQL script
        '''
        specialCharacterScript = None
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "CreateUpdSpcCharScript " + " starting ")

            specialCharacterScriptTemplate = self.location + '/sql/' + dbCommon[
                tblJson["specialCharacterScript"]]

            # output name: drop the "Template" suffix and inject the table name
            outName = re.sub('Template.sql$', '.sql',
                             dbCommon[tblJson["specialCharacterScript"]])
            outName = re.sub("TableName", tblJson["table"], outName)
            specialCharacterScript = self.localTempDirectory + "/sql/" + outName

            FileUtilities.RemoveFileIfItExists(specialCharacterScript)

            # build "set f1 = <expr>, f2 = <expr>, ..." for every field that
            # declares special characters (join replaces the manual comma flag)
            assignments = []
            for fldDesc in tblJson["fields"]:
                if "specialcharacters" in fldDesc:
                    fldName = fldDesc["name"]
                    numConversion = len(fldDesc["specialcharacters"])
                    repString = self.BuildReplaceString(
                        fldName, fldDesc["specialcharacters"], numConversion)
                    assignments.append(fldName + " = " + str(repString))
            fields = "set " + ", ".join(assignments)

            with open(specialCharacterScriptTemplate) as infile, open(
                    specialCharacterScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{schemaname}', tblJson["schemaName"])
                    line = line.replace('{workingtable}',
                                        tblJson["workingtable"])
                    line = line.replace('{desttable}', tblJson["table"])
                    line = line.replace('{fieldnames}', fields)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "CreateUpdSpcCharScript " + " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(
                self.moduleName +
                " - we had an error in CreateUpdSpcCharScript")
            raise
        return specialCharacterScript
Example #8
0
    def CreatePullScript(self, paramsList):
        '''
        Takes the template for the pull script and customizes it for the data
        we need.

        If paramsList carries a "lastrun" date (m/d/Y), that date (capped at
        yesterday) is substituted for {lastrundate}; otherwise an empty string
        is used.

        :param paramsList: parameter mapping that may contain "lastrun"
        :return: (path of the generated SQL script, lastrun date as YYYYMMDD,
                 or '' when no lastrun was supplied)
        '''
        sqlPullDataScript = None
        try:
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " starting ")
            lastDate = None
            fromDate = ''
            if len(paramsList) > 0:
                if "lastrun" in paramsList:
                    lastDate = paramsList["lastrun"]
            if lastDate is not None:
                fromDate = datetime.datetime.strptime(lastDate, '%m/%d/%Y')
                # never pull from later than yesterday
                if fromDate > datetime.datetime.today() - datetime.timedelta(
                        days=1):
                    fromDate = datetime.date.today() - datetime.timedelta(
                        days=1)
                fromDate = datetime.datetime.strftime(fromDate, '%m/%d/%Y')

            sqlPullDataTemplate = self.location + '/sql/' + self.job[
                "sqlPullDataScriptTemplate"]
            sqlPullDataScript = self.localTempDirectory + "/sql/" + re.sub(
                'Template.sql$', '.sql', self.job["sqlPullDataScriptTemplate"])
            FileUtilities.RemoveFileIfItExists(sqlPullDataScript)

            with open(sqlPullDataTemplate) as infile, open(
                    sqlPullDataScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{lastrundate}', fromDate)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " finished ")
            # value comparison, not identity: "is not self.cBlank" relied on
            # CPython string interning and is fragile
            if fromDate != self.cBlank:
                fromDate = datetime.datetime.strptime(fromDate, '%m/%d/%Y')
                fromDate = fromDate.strftime('%Y%m%d')
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in CreatePullScript")
            raise
        return sqlPullDataScript, fromDate
 def CleanAndPack(self):
     '''
     Removes blank lines & empty strings ("") from files, then gzips each
     cleaned file and drops the intermediates.
     '''
     self.logger.info("Inside PGCRFERCFilings.RemoveBlankLines")
     for folderName in list(self.job["folderPath"].keys()):
         folderFullPath = self.localTempDirectory + "/" + folderName + "/"
         for fileName in self.fileUtilities.ScanFolder(folderFullPath):
             sourcePath = folderFullPath + fileName
             cleanedPath = folderFullPath + "cleaned_" + fileName
             # strip blank lines into "cleaned_*", drop the dirty original,
             # gzip the cleaned file, then remove the uncompressed copy
             # NOTE(review): GzipFile is called with identical src/dst paths;
             # presumably the helper appends ".gz" itself -- confirm
             self.fileUtilities.RemoveBlankLines(sourcePath, cleanedPath)
             FileUtilities.RemoveFileIfItExists(sourcePath)
             self.fileUtilities.GzipFile(cleanedPath, cleanedPath)
             self.fileUtilities.DeleteFile(cleanedPath)
Example #10
0
    def CreatePullScript(self, dbCommon, table, currVal, incVal, mxValue):
        '''
        Takes the template for the pull script and customizes it for the data
        we need, based on the fields in the config file.

        Also advances self.fromKey by incVal (capped at mxValue) so the next
        pull covers the following key range.

        :param dbCommon: common DB config (template names, source name)
        :param table:    table config (table, sourcetable, fields, ...)
        :param currVal:  current key value; also prefixes the output file name
        :param incVal:   key increment for this pull
        :param mxValue:  maximum key value; upper bound for self.fromKey
        :return: full path of the generated SQL script
        '''
        sqlPullDataScript = None
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "CreatePullScript for " + table["table"] +
                              " starting ")

            # a table entry may override the default pull template
            sqlTemplate = dbCommon["sqlPullDataScriptTemplate"]
            if "pullTemplate" in table:
                templateType = table["pullTemplate"]
                sqlTemplate = dbCommon[str(templateType)]
            sqlPullDataTemplate = self.location + '/sql/' + sqlTemplate
            ###
            #  fix name of output script
            ###
            outName = re.sub('Template.sql$', '.sql', sqlTemplate)
            outName = re.sub(dbCommon["name"], table["table"], outName)
            sqlPullDataScript = self.localTempDirectory + "/sql/" + str(
                currVal) + "_" + outName
            FileUtilities.RemoveFileIfItExists(sqlPullDataScript)

            fields = self.GetInnerFields(table["fields"])

            # advance the pull window, never past the max key
            self.fromKey = currVal + incVal
            if self.fromKey > mxValue:
                self.fromKey = mxValue
            whereClause = self.GetWhereClause(table, currVal)

            with open(sqlPullDataTemplate) as infile, open(
                    sqlPullDataScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{intable}', table["sourcetable"])
                    line = line.replace('{infields}', fields)
                    line = line.replace('{whereclause}', whereClause)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in CreatePullScript")
            raise
        return sqlPullDataScript
Example #11
0
    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the input files.

        Reads the configured worksheet from the raw Excel file, melts the
        date columns into rows, writes the result as a gzipped CSV and
        removes the intermediates.
        '''
        self.logger.info(self.moduleName + " - Processing file: " + self.processingFile)

        rawFileName = self.localTempDirectory + "/" + self.processingFile
        csvFilename = self.localTempDirectory + "/" + self.processingFile.split(".")[0] + ".csv"

        try:
            columnNames = []

            # "sheet_name" keeps this call consistent with the sibling
            # ProcessFiles variant in this file and with current pandas
            # (the old "sheetname" keyword was deprecated and later removed)
            df = pandas.read_excel(rawFileName,
                                   sheet_name=self.job["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=self.job["skipRows"],
                                   skip_footer=self.job["skipFooter"])

            # date columns get normalized; the no-melt key columns keep
            # their original names
            for colName in df.head(0):
                if colName not in self.job["columns_no_melt"]:
                    columnNames.append(self.FormatColNameDate(colName))
                else:
                    columnNames.append(colName)

            df.columns = columnNames
            df = df.melt(id_vars=self.job["columns_no_melt"])

            df.to_csv(csvFilename,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
            self.fileUtilities.DeleteFile(csvFilename)
        except XLRDError:
            # the workbook simply lacks the expected tab; not fatal
            self.logger.info(self.moduleName + " - No tab named '" + self.job["worksheetName"] + "' in " + self.processingFile)
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to process file " +  self.processingFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)
    def ProcessFiles(self, dbCommon):
        '''
        Controls the workflow for the conversion, clean up and pack of the input files.
        '''
        srcFileName = dbCommon["srcSharedFolder"] + dbCommon["fileName"]
        self.logger.info(self.moduleName + " - Processing file: " +
                         srcFileName)

        # work on a local copy of the shared source file
        dstFileName = self.fileUtilities.csvFolder + dbCommon["fileName"]
        shutil.copyfile(srcFileName, dstFileName)
        csvFilename = dstFileName + ".csv"

        try:
            df = pandas.read_excel(dstFileName,
                                   sheet_name=dbCommon["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=dbCommon["skipRows"],
                                   skip_footer=dbCommon["skipFooter"])
            # normalize every column name except the no-melt key columns
            noMelt = dbCommon["columns_no_melt"]
            df.columns = [
                col if col in noMelt else self.FormatColNameDate(col)
                for col in df.head(0)
            ]
            df = df.melt(id_vars=noMelt)
            df.to_csv(csvFilename,
                      header=False,
                      sep=str(dbCommon["delimiter"]),
                      encoding='utf-8',
                      index=False)
        except XLRDError:
            self.logger.info(self.moduleName + " - No tab named '" +
                             dbCommon["worksheetName"] + "' in " + dstFileName)
        except Exception:
            self.logger.error(self.moduleName +
                              " - Error while trying to process file " +
                              dstFileName)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(dstFileName)
Example #13
0
    def CreateUpdateScript(self, dbCommon, tblJson):
        '''
        Takes the template for the update SQL script and customizes it.

        :param dbCommon: common DB config; supplies "sqlUpdateScript"
        :param tblJson:  table config with schemaName, table and an
                         "updateSection" (workingschemaname, workingtable,
                         keyfields, join)
        :return: full path of the generated SQL script
        '''
        sqlUpdateScript = None
        try:
            self.logger.debug(self.moduleName + " -- " +
                              "UpDate Table Script" + " starting ")
            sqlUpdateTemplate = self.location + '/sql/' + dbCommon[
                "sqlUpdateScript"]

            # per-table output name: "...Template.sql" -> "...<table>.sql"
            outName = re.sub('Template.sql$', tblJson["table"] + '.sql',
                             dbCommon["sqlUpdateScript"])

            sqlUpdateScript = self.localTempDirectory + "/sql/" + outName
            FileUtilities.RemoveFileIfItExists(sqlUpdateScript)

            with open(sqlUpdateTemplate) as infile, open(sqlUpdateScript,
                                                         'w') as outfile:
                for line in infile:
                    line = line.replace('{destschemaname}',
                                        tblJson["schemaName"])
                    line = line.replace(
                        '{workingschemaname}',
                        tblJson["updateSection"]["workingschemaname"])
                    line = line.replace(
                        '{workingtable}',
                        tblJson["updateSection"]["workingtable"])
                    line = line.replace('{desttable}', tblJson["table"])
                    line = line.replace('{keys}',
                                        tblJson["updateSection"]["keyfields"])
                    line = line.replace('{join}',
                                        tblJson["updateSection"]["join"])

                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "UpDate Table Script" + " finished ")
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in UpDate Table Script")
            raise
        return sqlUpdateScript
Example #14
0
    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the input files.

        For every raw file: resolve the survey date, load and clean the data,
        and write it out as CSV; the raw file is removed afterwards.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)

        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)

            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.fileUtilities.csvFolder + os.path.splitext(
                emFile)[0] + ".csv"

            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")

                if isinstance(surveyDatedt, float):
                    # numeric cell value; derive the date from the file name
                    surveyDatedt = self.FormatSurveyDate(emFile)
                elif isinstance(surveyDatedt, (str, u"".__class__)):
                    # string check that works on Python 3 (str) AND
                    # Python 2 (str/unicode); replaces the Py2-only
                    # "basestring", which raises NameError on Python 3
                    if "," in surveyDatedt:
                        tmpDatedt = datetime.strptime(surveyDatedt,
                                                      '%B %d, %Y')
                        surveyDatedt = datetime.strftime(tmpDatedt, "%Y-%m-%d")

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)

                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)

            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)
 def RecursivelyUnzipFiles(self, srcDirectory):
     '''
     Recursively unzips the files.

     Every *.zip directly under srcDirectory is extracted into a folder
     named after the file (text before the first dot), the archive is then
     deleted, and the new folder is scanned recursively for more archives.

     :param srcDirectory: directory to scan; a trailing slash is appended
                          if missing
     '''
     srcDirectory = srcDirectory.strip()  #trim trailing spaces if any
     if srcDirectory[
             -1] != "/":  #if the path doesn't end with forward slash, append one
         srcDirectory = srcDirectory + "/"
     self.logger.info(srcDirectory)  # was a leftover debug print(); route through the logger
     #=======================================================================
     # get the list of files in the given path and unzip them
     #=======================================================================
     files = self.fileUtilities.ScanFolder(srcDirectory)
     for unzippedFile in files:
         try:
             if not unzippedFile.lower().endswith(".zip"):
                 continue  # only zip files are of interest
             inputFilename = srcDirectory + "/" + unzippedFile  #full path to the zip file
             # NOTE: split(".")[0] truncates at the FIRST dot, so
             # "a.b.zip" extracts into folder "a" -- preserved behavior
             outputFolder = unzippedFile.split(".")[0]
             outputDirectory = srcDirectory + "/" + outputFolder
             FileUtilities.CreateFolder(
                 outputDirectory)  #Create the folder to be unzipped into
             self.fileUtilities.UnzipUsing7z(
                 inputFilename,
                 outputDirectory)  #unzip using the 7z utility
             FileUtilities.RemoveFileIfItExists(
                 inputFilename)  #deletes the zip file after unzipping it
             self.RecursivelyUnzipFiles(
                 srcDirectory + "/" +
                 outputFolder)  #recursive call to this method
         except Exception:  # narrowed from a bare except
             self.logger.exception(
                 "Exception in PGCRFERCFilings.RecursivelyUnzipFiles while unzipping file: {}"
                 .format(unzippedFile))
             raise
Example #16
0
    def CreatePullScript(self, tables, lastDate):
        '''
        Takes the template for the pull script and customizes it for the data
        we need.

        :param tables:   table config; "pullTemplate" names the template file
        :param lastDate: last run date as m/d/Y, or None for a full pull
        :return: (path of the generated SQL script, lastrun date as YYYYMMDD,
                 or '' when lastDate was None)
        '''
        sqlPullDataScript = None
        try:
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " starting ")
            # default to an empty substitution: previously a None lastDate
            # left fromDate unassigned and the replace below raised NameError
            fromDate = ''
            if lastDate is not None:
                fromDate = datetime.datetime.strptime(lastDate, '%m/%d/%Y')
                # never pull from later than yesterday
                if fromDate > datetime.datetime.today() - datetime.timedelta(
                        days=1):
                    fromDate = datetime.date.today() - datetime.timedelta(
                        days=1)
                fromDate = datetime.datetime.strftime(fromDate, '%m/%d/%Y')

            sqlPullDataTemplate = self.location + '/SQL/' + tables[
                "pullTemplate"]
            sqlPullDataScript = self.fileUtilities.sqlFolder + re.sub(
                'Template.sql$', '.sql', tables["pullTemplate"])
            FileUtilities.RemoveFileIfItExists(sqlPullDataScript)

            with open(sqlPullDataTemplate) as infile, open(
                    sqlPullDataScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{lastrundate}', fromDate)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " + "CreatePullScript" +
                              " finished ")
            # value comparison instead of the fragile identity test
            # "is not self.cBlank"
            if fromDate != self.cBlank:
                fromDate = datetime.datetime.strptime(fromDate, '%m/%d/%Y')
                fromDate = fromDate.strftime('%Y%m%d')
        except Exception:  # narrowed from a bare except
            self.logger.exception(self.moduleName +
                                  " - we had an error in CreatePullScript")
            raise
        return sqlPullDataScript, fromDate
 def ProcessFiles(self, lastModifiedDatetime):
     '''
     Start processing the ERCOT files.

     Downloads every new zip from S3 matching a configured regex, unzips it,
     post-processes the CSVs and packs them.

     :param lastModifiedDatetime: only files modified after this moment are
                                  pulled from S3
     :return: tuple of (most recent modification datetime seen on S3,
              number of new files found)
     '''
     maxModifiedDatetime = None
     try:
         filesOnS3, maxModifiedDatetime = self.GetNewFiles(
             lastModifiedDatetime)
         # strip the source-directory prefix (minus its leading slash)
         filesOnS3 = [
             fl.replace(self.job["s3SrcDirectory"][1:], "")
             for fl in filesOnS3
         ]
         for fileConfig in self.job["files"]:
             zipFiles = list(
                 filter(
                     re.compile(fileConfig["FileRegex"]).match, filesOnS3))
             for zipFileName in zipFiles:  # eg cdr.00012328.0000000000000000.20170619.123601002.DAMHRLMPNP4183_csv.zip
                 self.DownloadFile(
                     self.job["s3SrcDirectory"][1:] + zipFileName,
                     self.localTempDirectory + "/raw/")
                 # NOTE(review): split("/")[1] assumes the key has exactly one
                 # folder level -- confirm against the S3 layout
                 self.fileUtilities.UnzipUsing7z(
                     self.localTempDirectory + "/raw/" +
                     zipFileName.split("/")[1],
                     self.localTempDirectory + "/output/")
                 FileUtilities.RemoveFileIfItExists(
                     self.localTempDirectory + "/raw/" +
                     zipFileName)  # delete the parent file
                 self.AddColumnSkipHeader(
                     self.localTempDirectory + "/output/"
                 )  # Add column DSTFlag if it doesn't exist and skip header
                 self.PackFiles(self.localTempDirectory + "/output/",
                                self.localTempDirectory + "/packed/")
     except Exception:  # narrowed from a bare except
         self.logger.exception("Error while processing ERCOT files")
         raise
     return (maxModifiedDatetime, len(filesOnS3))
Example #18
0
class ECRConnect(ApplicationBase):
    '''
    Pulls the country Risk data from the IHS Connect API, transforms it to
    delimited CSV and loads it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings.
        '''
        super(ECRConnect, self).__init__()

        self.awsParams = ""
        # Output CSV paths; resolved in Start() once the temp folder exists.
        self.csvFile = None
        self.csvFileHistory = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def TransformToCsv(self, jData):
        '''
        Transforms from json to csv file.

        Writes two gzipped CSVs: the latest risk values per country and the
        full risk history; the intermediate plain CSVs are deleted.
        '''
        try:
            delimiter = str(self.job["delimiter"])

            # Latest version: one row per country/risk.
            latest = json_normalize(jData, 'Risks', ['Country'])
            latest['ClassName'] = ''
            latest['ClassAvg'] = ''
            latestColumns = ['Country', 'Name', 'Value', 'Description',
                             'ClassName', 'ClassAvg', 'UpdatedOn']
            latest[latestColumns].to_csv(self.csvFile,
                                         sep=delimiter,
                                         header=False,
                                         index=False,
                                         encoding='utf-8')
            self.fileUtilities.GzipFile(self.csvFile, self.csvFile + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFile)

            # History: every recorded value per country/risk.
            history = json_normalize(jData, ['Risks', 'History'],
                                     ['Country', ['Risks', 'Name']])
            history = history[['Country', 'Risks.Name', 'Value', 'UpdatedOn']]
            history.to_csv(self.csvFileHistory,
                           sep=delimiter,
                           header=False,
                           index=False,
                           encoding='utf-8')
            self.fileUtilities.GzipFile(self.csvFileHistory,
                                        self.csvFileHistory + ".gz")
            self.fileUtilities.RemoveFileIfItExists(self.csvFileHistory)
        except Exception as err:
            self.logger.error(
                "Error while trying to transform json to csv. Error:" +
                err.message)
            raise

    def GetAndTransform(self):
        '''
        Download all files.

        Calls the Connect risk service with HTTP basic auth and hands the
        parsed JSON to TransformToCsv().
        '''
        try:
            apiConfig = self.job["connectAPI"]
            request = urllib2.Request(apiConfig["baseurl"] +
                                      apiConfig["riskService"])
            credentials = '%s:%s' % (apiConfig["username"],
                                     apiConfig["password"])
            request.add_header("Authorization",
                               "Basic %s" % base64.b64encode(credentials))

            jData = json.load(urllib2.urlopen(request))
            self.TransformToCsv(jData)
        except Exception as err:
            self.logger.error(
                "Error while trying to get and transform from IHS Connect API service. Error:"
                + err.message)
            raise

    def LoadAllFromS3(self, s3Source, tableName):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.

        Copies the file at s3Source into the given Redshift table.
        '''
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            loadOptions = {
                "destinationSchema": self.job["destinationSchema"],
                "tableName": tableName,
                "s3Filename": "s3://" + self.job["bucketName"] + s3Source,
                "fileFormat": self.job["fileFormat"],
                "dateFormat": self.job["dateFormat"],
                "delimiter": self.job["delimiter"]
            }
            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             loadOptions, self.logger, "N")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def UploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be uploaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")
        s3Prefix = ('s3://' + self.job["bucketName"] +
                    self.job["s3ToDirectory"])

        # Both the latest and the history extracts get pushed to the same
        # S3 folder.
        for baseName in (self.job["fileNameOut"],
                         self.job["fileNameOutHistory"]):
            gzName = baseName + ".gz"
            S3Utilities.CopyItemsAWSCli(
                self.localTempDirectory + "/" + gzName,
                s3Prefix + '/' + gzName)

    def ExecutePostETL(self):
        '''
        Will execute the post load sql script...
        '''
        try:
            scriptName = self.job["postSQLScript"]
            sqlScript = self.localTempDirectory + "/" + scriptName

            # Fill the schema/table placeholders in the template, then run it.
            self.fileUtilities.CreateActualFileFromTemplate(
                self.location + "/" + scriptName, sqlScript,
                self.job["destinationSchema"], self.job["tableName"])
            RedshiftUtilities.PSqlExecute(sqlScript, self.logger)
        except Exception as err:
            self.logger.error(
                self.moduleName +
                " - Error while updating the countries codes. Message: " +
                err.message)
            raise

    def Start(self, logger, moduleName, filelocs):
        '''
        Entry point: extract from the API, upload to S3, load the three
        tables (latest, history, cross-reference) and run the post-ETL SQL.
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.csvFile = (self.localTempDirectory + "/" +
                            self.job["fileNameOut"])
            self.csvFileHistory = (self.localTempDirectory + "/" +
                                   self.job["fileNameOutHistory"])

            self.GetAndTransform()
            self.UploadToS3()
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' + self.job["fileNameOut"] +
                '.gz', self.job["tableName"])
            self.LoadAllFromS3(
                self.job["s3ToDirectory"] + '/' +
                self.job["fileNameOutHistory"] + '.gz',
                self.job["tableName"] + '_history')
            self.LoadAllFromS3(
                self.job["xReference"]["s3DataDirectory"],
                self.job["tableName"] + self.job["xReference"]["tableNameSfx"])
            self.ExecutePostETL()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)
# Exemple #19
# 0
class Vantage(ApplicationBase):
    '''
    Extracts data sets from the IHS Vantage database via BCP, compresses
    them and loads them into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings.
        '''
        super(Vantage, self).__init__()

        self.awsParams = ""
        # Working folders; resolved in Start() under the temp directory.
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow thru the different data sets coming from Vantage DB.

        Runs one BCP export per configured extracting script, writing
        <tableSuffix>.CSV into the Raw folder.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                suffix = dsScript["tableSuffix"]
                self.logger.info(self.moduleName + " Starts extracting " +
                                 suffix + " data...")

                query = self.fileUtilities.LoadSQLQuery(
                    self.location + dsScript["scriptFile"])
                outputCsv = (self.localTempDirectory + "/Raw/" +
                             suffix + ".CSV")
                self.bcpUtilities.RunBCPJob(self.job["mssqlLoginInfo"],
                                            self.job["bcpUtilityDirOnLinux"],
                                            query, outputCsv,
                                            self.job["delimiter"])
        except Exception as err:
            self.logger.error(
                "Error while trying to Bulk Extract all. Message: " +
                err.message)
            raise

    def TransformAndPackAll(self):
        '''
        Compress the csv files created.

        Gzips every CSV from the Raw folder into the Packed folder and
        removes the uncompressed original.
        '''
        csvNames = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")

        try:
            for csvName in csvNames:
                sourcePath = self.rawFolder + "/" + csvName

                self.logger.info(self.moduleName +
                                 " started compressing file: " + csvName)

                self.fileUtilities.GzipFile(
                    sourcePath, self.packedFolder + "/" + csvName + ".GZ")

                self.fileUtilities.RemoveFileIfItExists(sourcePath)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " Error while compressing raw files. Message: " +
                              err.message)
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage's S3 bucket into Redshift.

        One COPY per extracting script, then the S3 staging folder is
        cleaned up; the Redshift connection is always closed.
        '''
        rsConnect = None

        try:
            s3DataFolder = ("s3://" + self.job["bucketName"] +
                            self.job["s3ToDirectory"])

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                suffix = dsScript["tableSuffix"]
                loadOptions = {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + suffix,
                    "s3Filename": s3DataFolder + "/" + suffix + ".CSV.GZ",
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }
                RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                                 loadOptions, self.logger,
                                                 "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all GZIP files created into S3 to be uploaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        '''
        Entry point: reset the working folders, then extract, compress,
        upload and load every configured data set.
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            # Start from empty folders so stale files never get shipped.
            for folder in (self.packedFolder, self.rawFolder):
                self.fileUtilities.RemoveFolder(folder)
            for folder in (self.packedFolder, self.rawFolder):
                self.fileUtilities.CreateFolder(folder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)