Example no. 1
0
    def SkipHeader(self, inputFolderPath):
        '''
        Skips the header line of every CSV file in the given folder
        '''
        csvFiles = self.fileUtilities.ScanFolder(inputFolderPath)
        for csvFile in csvFiles:
            # ScanFolder returns bare file names, so inputFolderPath is expected to end with a separator
            FileUtilities.RemoveLines(inputFolderPath + csvFile, [1])
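
A minimal usage sketch for the helper above; the calling object and the folder path are placeholders, and the folder path has to end with a separator because SkipHeader concatenates it directly with each file name:

# Hypothetical call; processor stands for any instance that exposes SkipHeader
# and holds a FileUtilities instance in self.fileUtilities.
processor.SkipHeader("/data/airmarkets/csv/")
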
Example no. 2
0
class PGCRAirMarkets(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarkets, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli(
            "s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
            self.rawDataFolder, True)

    def CleanUpAndPack(self):
        '''
        Main control loop: iterates through the folders, cleaning the files and packing them for upload to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compresses the files for a given folder; currently only the emissions file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(
                csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up of the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] in fileName
        ]
        fileListToDel = [
            fileName for fileName in allFiles
            if self.job["srcFileNamePrefix"] not in fileName
        ]

        self.logger.info("Cleaning up files for folder " + rawFolderName +
                         "...")

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job[
                "srcFileNamePrefix"] + "_" + rawFolderName + ".csv"

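            # Strip characters outside the \x00-\x76 range and replace single
            # quotes with backticks before the file is packed and loaded.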
            self.fileUtilities.ReplaceIterativelyInFile(
                fullFileName, toPackFileName, [{
                    r"[^\x00-\x76]+": ""
                }, {
                    "'": "`"
                }])

            self.logger.info("toPackFileName: " + toPackFileName)
            fc = self.fileUtilities.GetFileContents(toPackFileName)
            if fc is not None and len(fc) > 2:
                self.logger.info("2nd line from the file")
                self.logger.info(fc[1].strip())
            else:
                self.logger.info("File is empty or has fewer than 3 lines")

            lineCount = self.fileUtilities.GetLineCount(toPackFileName)
            if lineCount > 0:
                self.fileUtilities.RemoveLines(toPackFileName,
                                               self.job["removeLines"])
                self.toPackFiles.append(toPackFileName)
            else:
                self.logger.warning("Empty file: " + toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" +
                                          airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        try:
            filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job[
                "inputZipFileName"]

            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.error("Unable to decompress file: " + filePath +
                              " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all packed files to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step: loads the files staged in s3 into the target table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

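            # Load the packed files under s3DataFolder into the Redshift table
            # destinationSchema.(tableName + srcFileNamePrefix).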
            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.tempFolder = self.localTempDirectory + "/Temp"
            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawDataFolder = self.localTempDirectory + "/RawData"

            self.CleanWorkingFolders()
            self.SynchronizeSourceFolder()
            self.CleanUpAndPack()
            self.UploadPackedToS3()
            self.LoadAirMarketsTables()
        except:
            logger.exception(moduleName + " - Exception!")
            raise
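
For reference, a minimal sketch of the job settings that PGCRAirMarkets reads through self.job. Every key name below appears in the code above, but the values are illustrative placeholders rather than the real configuration:

# Hypothetical job settings; only the key names are taken from the code above.
SAMPLE_AIR_MARKETS_JOB = {
    "bucketName": "my-etl-bucket",            # placeholder bucket
    "s3SrcDirectory": "/airmarkets/raw/",     # synced locally into rawDataFolder
    "s3ToDirectory": "/airmarkets/packed/",   # destination for the packed .gz files
    "inputZipFileName": "EPADownload.zip",    # archive opened by DecompressFromRaw
    "srcFileNamePrefix": "emission",          # filter for the emissions CSV files
    "removeLines": [1],                       # lines stripped from each cleaned CSV
    "destinationSchema": "pgcr",              # placeholder Redshift schema
    "tableName": "air_markets_",              # combined with srcFileNamePrefix at load time
    "fileFormat": "CSV",                      # placeholder load options
    "dateFormat": "auto",
    "delimiter": ","
}
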
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli("s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
                                     self.rawDataFolder,
                                     True)

    def CleanUpAndPack(self):
        '''
        Main control loop: iterates through the folders, cleaning the files and packing them for upload to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compresses the files for a given folder; currently only the emissions file is packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up of the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] in fileName]
        fileListToDel = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] not in fileName]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job["srcFileNamePrefix"] + "_" + rawFolderName + ".csv"

            self.fileUtilities.ReplaceIterativelyInFile(fullFileName,
                                                        toPackFileName,
                                                        [{r"[^\x00-\x76]+":""}, {"'":"`"}])

            self.fileUtilities.RemoveLines(toPackFileName, self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" + airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        try:
            filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job["inputZipFileName"]

            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except Exception as err:
            self.logger.error("Unable to decompress file: " + filePath + " Error: " + str(err))

    def UploadPackedToS3(self):
        '''
        Uploads all packed files to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(self.packedFolder,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
                                    "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step: loads the files staged in s3 into the target table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error("Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Process data for the table
        :param table:
        :return:
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)

        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
        fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

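        # Read the extracted CSV into a Spark DataFrame using the schema built
        # from the table definition, then persist it as parquet for Athena.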
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table,self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        # Initialized before the try so the except block can always reference it
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")

        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + str(err))
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], \
                                                  currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")
            raise Exception(str(err))
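
Finally, a minimal driver sketch for the Spark/Athena variant. It assumes that ApplicationBase.Start wires up self.job, self.etlUtilities and the working directories from the project's own configuration; the logger setup and the ETL tracking table name below are placeholders, and filelocs may need more keys in the real project:

# Hypothetical entry point; only the Start(logger, moduleName, filelocs) signature
# and the filelocs["tblEtl"]["table"] lookup are taken from the code above.
import logging

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("PGCRAirMarketsAthenaSpark")
    filelocs = {"tblEtl": {"table": "etl_run_log"}}  # placeholder tracking table
    PGCRAirMarketsAthenaSpark().Start(logger, "PGCRAirMarketsAthenaSpark", filelocs)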