Example #1
    def CheckWorkingFolders(self):
        '''
        Ensure the working folders exist and are empty by re-creating them
        '''
        self.logger.info(self.moduleName + " - Checking on working folders...")

        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)
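
FileUtilities.RemoveFolder and FileUtilities.CreateFolder are not shown in this snippet; a minimal standard-library sketch of the same remove-then-recreate pattern (the reset_folder name and paths are made up):

import os
import shutil

def reset_folder(path):
    '''Delete the folder if it exists, then recreate it empty.'''
    if os.path.isdir(path):
        shutil.rmtree(path)    # drop the folder and everything under it
    os.makedirs(path)          # recreate it, including any missing parents

reset_folder("/tmp/etl/Raw")
reset_folder("/tmp/etl/CSV")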
Example #2
    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)
Example #3
    def CreateFolder(self, folder, createIt):
        '''
        Creates the folder if requested; if a fresh folder is wanted, any existing one is removed first
        '''
        try:
            fName = folder["name"]
            tfoldername = self.localTempDirectory + "/" + folder["folder"] + "/"
            if fName == "sql":
                self.sqlFolder = tfoldername
            elif fName == "csv":
                self.csvFolder = tfoldername
            elif fName == "zips":
                self.zipFolder = tfoldername
            elif fName == "gzips":
                self.gzipFolder = tfoldername
            if createIt == "Y":
                if folder["new"] == "Y":
                    FileUtilities.RemoveFolder(tfoldername)
                FileUtilities.CreateFolder(tfoldername)
        except:
            self.logger.exception(self.moduleName + " had an issue in CreateFolder for " + str(folder))
            raise

    def EmptyPackedFolder(self):
        '''
        Empties the packed folder
        '''
        FileUtilities.RemoveFolder(self.packedFolder)
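
# For context, CreateFolder above is driven by a small per-folder dictionary; the sketch
# below shows how such an entry resolves to a path (the base directory and values are
# hypothetical, the keys mirror what the method reads).
folder = {"name": "csv", "folder": "csv", "new": "Y"}
localTempDirectory = "/tmp/etl/MyModule"
tfoldername = localTempDirectory + "/" + folder["folder"] + "/"
print(tfoldername)   # /tmp/etl/MyModule/csv/
# With createIt == "Y" and folder["new"] == "Y", CreateFolder removes and recreates this path.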
class MagellanUtilities(object):
    '''
    Utility class with methods to convert json files to csv files
    '''
    def __init__(self):
        self.commonParams = {}
        self.fl = None
        self.moduleName = 'MagellanUtilities'
        self.localTempDirectory = None
        self.fileUtilities = None
        self.logger = None

    def BuildTables(self, tables):
        '''
        Builds the tables
        '''
        try:
            for table in tables:
                fname = self.commonParams["sqlFolder"] + "Create_" + table[
                    "name"] + ".sql"
                RedshiftUtilities.PSqlExecute(fname, self.logger)
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in BuildTables")
            raise

    def CreateSQLFiles(self, proc, dest):
        '''
        Routine to create sql files to use to create tables in RedShift
        '''
        try:
            for table in proc["tables"]:
                fname = self.commonParams["sqlFolder"] + "Create_" + table[
                    "name"] + ".sql"
                self.logger.info(fname)
                outfile = open(fname, "w")
                outLine = "DROP TABLE IF EXISTS {}.{};".format(
                    dest, table["name"])
                outLine = FileUtilities.PutLine(outLine, outfile)
                outLine = "CREATE TABLE {}.{} (".format(dest, table["name"])
                outLine = FileUtilities.PutLine(outLine, outfile)
                ndx = 0
                for fld in table["fields"]:
                    if ndx > 0:
                        outLine = ','
                    ndx = ndx + 1
                    outLine = outLine + fld["name"] + " " + fld["type"]
                    if fld["type"] == "VARCHAR":
                        outLine = outLine + "(" + fld["size"] + ")  ENCODE LZO"
                    outLine = FileUtilities.PutLine(outLine, outfile)
                outfile.write(");")
                outfile.close()
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CreateSQLFiles ")
            raise

    def InitiateCsvFile(self, batchName, preFx, subFolder):
        '''
        opens the CSV file for the batch in append mode, creating it if it does not exist
        '''
        try:
            csvFileName = self.commonParams[
                "csvFolder"] + subFolder + "/" + preFx + batchName + ".csv"
            outFile = open(csvFileName, "ab")
            return outFile
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in InitiateCsvFile ")
            raise

    def CreateCsvFile(self, batchName, preFx, outputHolding, subFolder):
        '''
        create the csv file from the array and name the file with the prefix at the front
        '''
        try:
            ###
            #  initiate the CSV files for each type
            ###
            outFile = self.InitiateCsvFile(batchName, preFx, subFolder)
            csvWriter = csv.writer(outFile, quoting=csv.QUOTE_ALL)
            for oArray in outputHolding:
                outLine = []
                for item in oArray:
                    if isinstance(item, basestring):
                        outLine.append(item.encode('utf-8'))
                    else:
                        outLine.append(item)
                csvWriter.writerow(outLine)
            outFile.close()
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CreateCsvFile ")
            raise

    def ContinueProcessJson(self, rec):
        '''
        just a simple test to see if we want to continue processing record
        '''
        try:
            retVal = False
            if "observations" in rec:
                if rec["observations"] is not None:
                    if len(rec["observations"]) > 0:
                        retVal = True
            return retVal
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in ContinueProcessJson ")
            raise

    def FindVal(self, inRec, inItem):
        '''
        recursively runs through the json object searching for the key and, if found,
        returns the associated value
        '''
        rtnFound = False
        rtnValue = ''
        try:
            for key in inRec.keys():
                if str(key) == str(inItem):
                    rtnFound = True
                    rtnValue = inRec[key]
                    return rtnFound, rtnValue
                if isinstance(inRec[key], dict):
                    rtnFound, rtnValue = self.FindVal(inRec[key], inItem)
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in FindVal ")
            raise
        return rtnFound, rtnValue

    def CheckValue(self, inRec, inItem, fType):
        '''
        Helper method that fetches a field value, removing CR/LF and other special characters from strings and normalizing dates
        '''
        try:
            found = False
            found, val = self.FindVal(inRec, inItem)
            if found is False:
                return ''
            if fType == 'VARCHAR':
                if isinstance(val, str):
                    val = self.fileUtilities.RemoveSpecialCharsFromString(val)
                    val = re.sub(r'\\', r'\\\\', val)
                elif isinstance(val, unicode):
                    val = self.fileUtilities.RemoveSpecialCharsFromString(val)
                    val = re.sub(r'\\', r'\\\\', val)
            if fType == "DATE":
                if isinstance(val, str):
                    tval = datetime.strptime(val, '%Y-%m-%d')
                    if tval.year < 1900:
                        tval = datetime.strptime('1900-01-01', '%Y-%m-%d')
                    val = tval.strftime('%Y-%m-%d')
                elif isinstance(val, datetime):
                    val = val.strftime('%Y-%m-%d')
            return val
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CheckValue - " +
                                  inItem)
            raise

    def LoadJData(self, src, jFile):
        '''
        try to load the file contents into a JSON object, falling back to an alternate encoding if needed
        '''
        jdata = open(src + "/" + jFile).read()
        dataFile = None
        encodeList = [None, "cp1252"]
        for enc in encodeList:
            try:
                if enc is not None:
                    dataFile = json.loads(jdata, encoding=enc)
                else:
                    dataFile = json.loads(jdata)
                break
            except ValueError:
                continue
        if dataFile is None:
            self.logger.exception(self.moduleName +
                                  "- we had an error in LoadJData - " + jFile)
            raise Exception('could not load json file')
        else:
            return dataFile

    def ProcessJsonFile(self, src, jfl, batchName):
        '''
        loads the json and then calls the process routine
        '''
        try:
            dataFile = self.LoadJData(src, jfl)
            self.ProcessJson(dataFile, batchName)
        except:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessJsonFile with file " + jfl)
            raise

    def LoadAttrArray(self, rec, srcId, keyField='source_id'):
        '''
        return array of attribute values
        '''
        outAttrRecArray = []
        try:
            for fld in self.commonParams["attrFields"]:
                if fld["name"] == keyField:
                    outAttrRecArray.append(srcId)
                else:
                    outAttrRecArray.append(
                        self.CheckValue(rec, fld["name"], fld["type"]))
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in LoadAttrArray ")
            raise
        return outAttrRecArray

    def ProcessJson(self, dataFile, batchName):
        '''
        process one json file and create two csv files from it
        '''
        try:  # pylint: disable=too-many-nested-blocks
            outputAttrHolding = []
            outputDataHolding = []
            ###
            #  gets all the attributes
            ###
            if 'value' in dataFile:
                for rec in dataFile["value"]:
                    if self.ContinueProcessJson(rec) is True:
                        srcId = self.CheckValue(rec, "source_id", "VARCHAR")
                        for obsRec in rec["observations"]:
                            outputDataHolding.append(
                                [srcId, obsRec["date"], obsRec["value"]])

                        outAttrRecArray = self.LoadAttrArray(rec, srcId)
                        outputAttrHolding.append(outAttrRecArray)
            else:
                rec = dataFile
                if self.ContinueProcessJson(rec) is True:
                    srcId = self.CheckValue(rec, "source_id", "VARCHAR")
                    for obsRec in rec["observations"]:
                        outputDataHolding.append(
                            [srcId, obsRec["date"], obsRec["value"]])

                    outAttrRecArray = self.LoadAttrArray(rec, srcId)
                    outputAttrHolding.append(outAttrRecArray)

            ###
            #  now create and fill the csv files
            ###
            self.CreateCsvFile(batchName, "attr_", outputAttrHolding,
                               "attribute")
            self.CreateCsvFile(batchName, "data_", outputDataHolding, "data")

        except:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessJsonFile with batch " + batchName)
            raise

    def ProcessZipContents(self, zFileFolder, batchName):
        '''
        process all the files that were in the zip file
        '''
        try:
            onlyFiles = [
                fl for fl in listdir(zFileFolder)
                if isfile(join(zFileFolder, fl))
            ]
            for jfl in onlyFiles:
                self.ProcessJsonFile(zFileFolder, jfl, batchName)
        except:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessZipContents with batch " +
                batchName)
            raise

    def GZipItUp(self, batchFolderName):
        '''
        routine to gzip the csv files and put them in a gzip folder
        '''
        try:
            ###
            #  since there is one csv per batch we can just gzip the combined csv
            ###

            FileUtilities.CreateFolder(self.commonParams["gzipFolder"] +
                                       "attr/")
            FileUtilities.CreateFolder(self.commonParams["gzipFolder"] +
                                       "data/")
            if self.commonParams["csvFolder"].endswith("/"):
                attrFileNamecsv = self.commonParams[
                    "csvFolder"] + 'attribute/' + 'attr_' + batchFolderName + '.csv'
                dataFileNamecsv = self.commonParams[
                    "csvFolder"] + 'data/' + 'data_' + batchFolderName + '.csv'
            else:
                attrFileNamecsv = self.commonParams[
                    "csvFolder"] + '/attribute/' + 'attr_' + batchFolderName + '.csv'
                dataFileNamecsv = self.commonParams[
                    "csvFolder"] + '/data/' + 'data_' + batchFolderName + '.csv'
            if self.commonParams["gzipFolder"].endswith("/"):
                attrFileNameGz = self.commonParams[
                    "gzipFolder"] + "attr/" + 'attr_' + batchFolderName + '.csv.gz'
                dataFileNameGz = self.commonParams[
                    "gzipFolder"] + "data/" + 'data_' + batchFolderName + '.csv.gz'
            else:
                attrFileNameGz = self.commonParams[
                    "gzipFolder"] + "/attr/" + 'attr_' + batchFolderName + '.csv.gz'
                dataFileNameGz = self.commonParams[
                    "gzipFolder"] + "/data/" + 'data_' + batchFolderName + '.csv.gz'

            self.fileUtilities.GzipFile(attrFileNamecsv, attrFileNameGz)
            self.fileUtilities.GzipFile(dataFileNamecsv, dataFileNameGz)
        except:
            self.logger.exception(self.moduleName +
                                  "- we had an error in ZipItUp with " +
                                  batchFolderName)
            raise

    def StartHere(self):
        '''
        initial starting routine
        '''
        try:
            self.moduleName = self.commonParams["moduleName"]
            self.logger = FileUtilities.CreateLogger(
                self.commonParams["loggerParams"])
            self.logger.info("zipfile = " + self.fl + " started " +
                             datetime.now().strftime('%Y-%m-%d %I:%M:%S'))
            ###
            #  pull this file to local instance
            ###
            fileName = ntpath.basename(self.fl)
            batchFolderName = re.sub(r'\.zip$', '', fileName)
            ###
            #  make sure we have this folder
            ###
            self.fileUtilities = FileUtilities(self.logger)
            segZipFolder = self.commonParams[
                "zipFolder"] + batchFolderName + "/"
            self.fileUtilities.RemoveFolder(segZipFolder)
            self.fileUtilities.CreateFolder(segZipFolder)
            localGzipFilepath = self.commonParams[
                "localTempDirectory"] + "/" + fileName
            self.fileUtilities.UnzipFile(localGzipFilepath, segZipFolder)
            zipContentFolder = re.sub(r'\/$', '', segZipFolder)
            directories = [
                fName for fName in os.listdir(segZipFolder)
                if os.path.isdir(os.path.join(segZipFolder, fName))
            ]
            for dirs in directories:
                zipContentFolder = os.path.join(segZipFolder, dirs)
            self.ProcessZipContents(zipContentFolder, batchFolderName)
            self.GZipItUp(batchFolderName)
            self.logger.info("zipfile = " + self.fl + " finished " +
                             datetime.now().strftime('%Y-%m-%d %I:%M:%S'))
        except:
            self.logger.exception(self.commonParams["moduleName"] +
                                  "- we had an error in StartHere")
            raise
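
The JSON records handled by ProcessJson above carry a source_id plus a list of observations; a standalone sketch of that flattening step with a hypothetical record (the real attribute columns come from commonParams["attrFields"] and are omitted here):

import csv

record = {
    "source_id": "SRC123",
    "observations": [
        {"date": "2017-01-01", "value": 1.5},
        {"date": "2017-02-01", "value": 1.7},
    ],
}

dataRows = [[record["source_id"], obs["date"], obs["value"]]
            for obs in record["observations"]]

with open("data_batch001.csv", "ab") as outFile:   # append mode, as in InitiateCsvFile above (Python 2)
    csv.writer(outFile, quoting=csv.QUOTE_ALL).writerows(dataRows)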
Example #6
class Vantage(ApplicationBase):
    '''
    This class is used to get the Vantage data from the IHS Vantage database, transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()

        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow through the different data sets coming from the Vantage DB.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")

                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location +
                                                    dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" +
                    dsScript["tableSuffix"] + ".CSV", self.job["delimiter"])
        except Exception as err:
            self.logger.error(
                "Error while trying to Bulk Extract all. Message: " +
                err.message)
            raise

    def TransformAndPackAll(self):
        '''
        Compress the csv files created.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")

        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile

                self.logger.info(self.moduleName +
                                 " started compressing file: " + rFile)

                self.fileUtilities.GzipFile(
                    rFileFull, self.packedFolder + "/" + rFile + ".GZ")

                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " Error while compressing raw files. Message: " +
                              err.message)
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage's S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema":
                        self.job["destinationSchema"],
                        "tableName":
                        self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename":
                        s3DataFolder + "/" + dsScript["tableSuffix"] +
                        ".CSV.GZ",
                        "fileFormat":
                        self.job["fileFormat"],
                        "dateFormat":
                        self.job["dateFormat"],
                        "delimiter":
                        self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created into S3 to be loaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)

            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)
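
FileUtilities.GzipFile, used by TransformAndPackAll above, belongs to a helper module that is not shown here; a minimal standard-library sketch of what such a compression step could look like (the function name is made up):

import gzip
import shutil

def gzip_file(srcPath, dstPath):
    '''Compress srcPath into dstPath, e.g. "table.CSV" -> "table.CSV.GZ".'''
    with open(srcPath, "rb") as src, gzip.open(dstPath, "wb") as dst:
        shutil.copyfileobj(src, dst)   # stream in chunks so large extracts need not fit in memory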
Example #7
class ApplicationBase(object):
    '''
    Application Base class to perform many of the basic ETL processes
    '''
    __metaclass__ = ABCMeta
    def __init__(self):
        '''
        Define the class attributes
        '''
        self.logger = None
        self.moduleName = None
        self.awsParams = None
        self.fileUtilities = None
        self.bcpUtilities = None
        self.job = None
        self.localTempDirectory = None
        self.location = None
        self.localTempDataDirectory = None
        self.etlUtilities = None

    def BuildTableCreationScript(self, sqlTemplateFilename):
        '''
        Construct the actual DDL script from the template by replacing the appropriate tokens
        '''
        sqlTableCreationTemplate = self.location + '/' + sqlTemplateFilename
        sqlTableCreationScript = self.localTempDirectory + "/" + re.sub(
            'Template.sql$', '.sql', sqlTemplateFilename)
        self.fileUtilities.CreateActualFileFromTemplate(sqlTableCreationTemplate, sqlTableCreationScript,\
                                                        self.job["destinationSchema"], self.job["tableName"])
        self.logger.info(self.moduleName + " - SQL files created.")
        return sqlTableCreationScript

    def BuildTableCreationScriptTable(self,
                                      sqlTemplateFilename,
                                      tableName,
                                      templateFolder=None,
                                      sqlFolder=None):
        '''
        Construct the actual DDL script from the template for the specific table by replacing the appropriate tokens
        '''
        sqlTableCreationTemplate = self.location + '/'
        if templateFolder is None:
            sqlTableCreationTemplate = sqlTableCreationTemplate + sqlTemplateFilename
        else:
            sqlTableCreationTemplate = sqlTableCreationTemplate + templateFolder + '/' + sqlTemplateFilename

        sqlTableCreationScript = self.localTempDirectory + "/"
        if sqlFolder is not None:
            sqlTableCreationScript = sqlTableCreationScript + sqlFolder + "/"
        sqlTableCreationScript = sqlTableCreationScript + tableName + re.sub(
            'Template.sql$', '.sql', sqlTemplateFilename)
        #        sqlTableCreationScript = self.localTempDirectory + "/" + tableName + re.sub('Template.sql$', '.sql', sqlTemplateFilename)
        self.fileUtilities.CreateActualFileFromTemplate(
            sqlTableCreationTemplate, sqlTableCreationScript,
            self.job["destinationSchema"], tableName)
        self.logger.info(self.moduleName + " - " + tableName +
                         " - SQL files created.")
        return sqlTableCreationScript

    def CreateTables(self, sqlTemplateFilename):
        '''
        Create the actual tables
        '''
        sqlTableCreationScript = self.BuildTableCreationScript(
            sqlTemplateFilename)

        # The following code will recreate all the tables.  EXISTING DATA WILL BE DELETED
        RedshiftUtilities.PSqlExecute(sqlTableCreationScript, self.logger)
        self.logger.info(self.moduleName + " - SQL tables created.")

    def LoadEnvironmentVariables(self, logger):
        '''
        sub method to just load in all environment variables
        '''
        self.logger = logger
        # Load the AWS configuration parameters for S3 and Redshift
        self.awsParams = ConfigureAWS.ConfigureAWS()
        self.awsParams.LoadAWSConfiguration(self.logger)
        return self

    def Start(self, logger, moduleName, filelocs):
        '''
        Start the process.  Do the common operations.
        '''
        self = self.LoadEnvironmentVariables(logger)
        #        self.logger = logger
        self.logger.info(moduleName + " - Getting configuration information.")

        self.moduleName = moduleName

        # Load the job parameters
        self.fileUtilities = FileUtilities(logger)
        jobConfigFile = self.location + '/jobConfig.json'
        self.job = self.fileUtilities.LoadJobConfiguration(jobConfigFile)

        # This is where all the work files will be created
        self.localTempDirectory = FileUtilities.PathToForwardSlash(
            filelocs["relativeOutputfolder"] + "/" + moduleName)
        FileUtilities.CreateFolder(self.localTempDirectory)

        # This is where all the local data will be located
        if "relativeInputfolder" in filelocs:
            self.localTempDataDirectory = FileUtilities.PathToForwardSlash(
                filelocs["relativeInputfolder"] + "/" + moduleName)
            FileUtilities.CreateFolder(self.localTempDataDirectory)

        self.bcpUtilities = BCPUtilities(logger, self.fileUtilities,
                                         self.awsParams,
                                         self.localTempDirectory)

        # Create tables if we have a valid script
        if "sqlScript" in self.job:
            self.CreateTables(self.job["sqlScript"])

        #  Create etlprocess log table if it does not already exist
        if "tblEtl" in filelocs:
            self.etlUtilities = EtlLoggingUtilities(self.logger)
            self.etlUtilities.awsParams = self.awsParams
            self.etlUtilities.filelocs = filelocs
            self.etlUtilities.moduleName = self.moduleName
            self.etlUtilities.appschema = filelocs["tblEtl"]["appschema"]
            self.etlUtilities.StartEtlLogging()

        if "folders" in self.job:
            self.fileUtilities.moduleName = self.moduleName
            self.fileUtilities.localBaseDirectory = self.localTempDirectory
            self.fileUtilities.CreateFolders(self.job["folders"])

    def CreateFolders(self, subFolder):
        '''
        Create the various subfolders defined in jobConfig.json for the table being processed
        '''
        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + subFolder
        self.fileUtilities.CreateFolders(self.job["folders"])

    def UploadFilesCreateAthenaTablesAndSqlScripts(self,
                                                   table,
                                                   localParquetFolderName,
                                                   partitionValue=None):
        '''
        Upload Parquet files into S3
        Create Athena Table/Partition
        Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
        Create script to insert data into Redshift and save to S3  (note that the ETL may not necessarily load data into Redshift)
        '''
        if not FileUtilities.FilesExistInFolder(localParquetFolderName +
                                                "*.parquet"):
            # Nothing was created.  We have a problem
            self.logger.info(
                self.moduleName +
                " - No parquet files were created for current partition in: " +
                localParquetFolderName + ".  Nothing was processed on Athena.")
            return False

        self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

        scriptPartitionValue = partitionValue
        if AthenaUtilities.IsTablePartitioned(table):
            # For partitioned tables, the script will insert a where clause by default.  However, if we are doing a new load
            # skip the where clause so that we can have SQL script that is capable of loading all the data from Athena
            # into RedShift in the future
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                table["schemaName"], table["table"])
            if not S3Utilities.KeyExist(
                    self.awsParams, s3FolderLocation
            ):  # Do not update scripts if data has been previously loaded
                scriptPartitionValue = None
        AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table,
                                                self.fileUtilities.sqlFolder,
                                                scriptPartitionValue)

        AthenaUtilities.UploadFilesAndCreateAthenaTables(
            self.awsParams, localParquetFolderName, table,
            self.fileUtilities.sqlFolder, self.logger, partitionValue)
        return True

    def LoadDataFromAthenaIntoRedShiftLocalScripts(self,
                                                   table,
                                                   customWhereCondition=None):
        '''
        If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that
        '''
        # Under the hood the table will be recreated if the new flag is on or if the table does not exist
        # Load the data from Athena into RedShift after that.  The load query only loads what needed from Athena
        scriptToCreateRedshiftTable = FileUtilities.ComposeCreateTableSqlFilename(
            table, self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(scriptToCreateRedshiftTable, self.logger)

        scriptToLoadDataFromAthena = AthenaUtilities.ComposeInsertIntoSqlFilename(
            table, self.fileUtilities.sqlFolder)
        if customWhereCondition:
            # Replace the existing where clause with the custom clause
            customWhereCondition = " AND " + customWhereCondition + ";"
            replacements = {';': customWhereCondition}
            scriptToLoadDataFromAthenaCustom = scriptToLoadDataFromAthena + "_custom.sql"
            self.fileUtilities.ReplaceStringInFile(
                scriptToLoadDataFromAthena, scriptToLoadDataFromAthenaCustom,
                replacements)
            scriptToLoadDataFromAthena = scriptToLoadDataFromAthenaCustom
        RedshiftUtilities.PSqlExecute(scriptToLoadDataFromAthena, self.logger)

    def LoadDataFromAthenaIntoRedShiftS3Scripts(self, table):
        '''
        If at a later time we decide to drop the Redshift table and re-load the data from Athena, we need a utility to do that
        '''
        # Download scripts from S3 to local folder
        AthenaUtilities.DownloadScriptsForRedShift(
            self.awsParams, table, self.fileUtilities.sqlFolder)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

    def ProcessTables(self, dbCommon, tables):
        """ Process Tables in the actual derived class """
        # YOU MUST IMPLEMENT THIS METHOD IN THE DERIVED CLASS
        raise NotImplementedError()

    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        pulls data from each table in the catalog
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessCatalogs for  " +
                              catalog["name"] + " starting")
            for tables in catalog["tables"]:
                self.ProcessTables(dbCommon, tables)
            self.logger.debug(self.moduleName + " -- ProcessCatalogs for  " +
                              catalog["name"] + " finished ----------.")
        except:
            self.logger.exception(
                self.moduleName +
                " - we had an error in ProcessCatalogs for " + catalog["name"])
            raise

    def ProcessDatabase(self, databaseSettings):
        '''
        takes the database settings and tries to process them
        '''
        try:
            self.logger.debug(self.moduleName + " -- ProcessDatabase for " +
                              databaseSettings["common"]["name"] + " starting")
            for catalog in databaseSettings["catalogs"]:
                if "execute" not in catalog or catalog["execute"] == 'Y':
                    self.ProcessCatalogs(databaseSettings["common"], catalog)
                else:
                    self.logger.debug(self.moduleName +
                                      " -- ProcessDatabase skip for " +
                                      catalog["name"])

            if "cleanlocal" in self.job and self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- ProcessDatabase for " +
                              databaseSettings["common"]["name"] + " finished")
        except:
            self.logger.exception(
                self.moduleName +
                " - we had an error in ProcessDatabase for " +
                databaseSettings["common"]["name"])
            raise

    def ProcessInput(self, logger, moduleName, filelocs):
        '''
        Bootstrap code that processes all the databases, catalogs and tables
        '''
        currProcId = None
        try:
            self.logger.debug(self.moduleName + " -- " + "Starting...")
            if "tblEtl" in filelocs:
                currProcId = self.etlUtilities.GetRunID(
                    filelocs["tblEtl"]["table"], self.moduleName)

            if "Databases" in self.job:
                for databaseSettings in self.job["Databases"]:
                    if databaseSettings["execute"] == 'Y':
                        self.ProcessDatabase(databaseSettings)
                    else:
                        self.logger.debug(self.moduleName +
                                          " -- Skipping database: " +
                                          databaseSettings["common"]["name"])
            elif "catalogs" in self.job:
                self.ProcessDatabase(self.job)
            elif "tables" in self.job:
                dbCommon = None
                if "common" in self.job:
                    dbCommon = self.job["common"]
                self.ProcessCatalogs(dbCommon, self.job)
            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished.")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            if "tblEtl" in filelocs and self.etlUtilities.CompleteInstance(\
                filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise Exception(err.message)
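
ApplicationBase deliberately leaves ProcessTables unimplemented; a hedged sketch of a derived module building on the base class just shown (the class name and table logic are hypothetical, and jobConfig.json is assumed to define a "tables" section):

class MyFeed(ApplicationBase):
    '''
    Hypothetical derived module: Start performs the common setup from the base
    class, then ProcessInput walks the configured catalogs/tables and ends up
    calling ProcessTables once per table definition.
    '''
    def ProcessTables(self, dbCommon, tables):
        self.logger.info(self.moduleName + " - processing table " + tables["table"])
        # pull the data, write CSV/parquet into the working folders, then e.g.
        # self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)

    def Start(self, logger, moduleName, filelocs):
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        self.ProcessInput(logger, moduleName, filelocs)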
Example #8
class Consensus(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC file sources.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Consensus, self).__init__()

        self.awsParams = ""
        self.rawFolder = None
        self.csvFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(
            self.job["srcSharedFolder"])

        self.logger.info(self.moduleName +
                         " - Downloading files from shared folder...")

        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]
                ) and os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(
                    os.path.join(self.job["srcSharedFolder"], fileName),
                    self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Cleans the data frame: drops the configured columns and rows, adds the survey date column and melts the result into long format.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"

        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, 1)

        df = df.drop(self.job["dropAfterHeader"], 0)

        for colName in df.head(0):
            dtTest = colName

            if not isinstance(dtTest,
                              datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, 1)

        df = df.assign(surveyDate=surveyDateVal)

        newOrder = [surveyDateColName]

        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)

        df = df[newOrder]
        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])

        return df

    def LoadAllFromS3(self):
        '''
        Loads the CSV data from the S3 folder into Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or survey date
        '''

        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]

        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])

        if mode == "getSurveyDate":
            valReturn = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            valReturn = df

        return valReturn

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name
        '''

        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:len(surveyDateColValue)]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(surveyDateColValue[:3])) + "-01"

        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)

        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)

            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.csvFolder + "/" + os.path.splitext(
                emFile)[0] + ".csv"

            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")

                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)

                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)

                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)

                self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
                self.fileUtilities.DeleteFile(csvFilename)
            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName +
                                  " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def CheckWorkingFolders(self):
        '''
        Ensure the working folders exist and are empty by re-creating them
        '''
        self.logger.info(self.moduleName + " - Checking on working folders...")

        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created into S3 to be loaded later...
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.csvFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)

            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.csvFolder = self.localTempDirectory + "/" + "CSV"

            self.CheckWorkingFolders()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],\
                                             currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise Exception(err.message)
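
A worked example of FormatSurveyDate above, assuming a hypothetical file name made of a two-character prefix, "CF", a month abbreviation and a year:

import calendar
import os

emFile = "EMCFJan2017.xlsx"                # hypothetical input file name
stem = os.path.splitext(emFile)[0][2:]     # "CFJan2017" after dropping the prefix
stem = stem.replace("CF", "")              # "Jan2017"
surveyDate = stem[3:] + "-" + str(list(calendar.month_abbr).index(stem[:3])) + "-01"
print(surveyDate)                          # 2017-1-01 (note the month is not zero-padded)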
Example #9
class AutoInsight(ApplicationBase):
    '''
    This class is used to control the data load process for Auto Insight.
    '''

    def __init__(self):
        '''
        Initial settings
        '''
        super(AutoInsight, self).__init__()

        self.awsParams = ""
        self.processingFile = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(self.job["srcSharedFolder"])

        self.logger.info(self.moduleName + " - Downloading files from shared folder...")

        for fileName in sharedFiles:
            if fileName == self.job["fileName"]:
                self.processingFile = fileName
                shutil.copyfile(os.path.join(self.job["srcSharedFolder"], fileName), self.localTempDirectory + "/" + self.processingFile)

    def LoadAllFromS3(self):
        '''
        Loads the CSV data from the S3 folder into Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["ddl"]["table"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to save into Redshift from s3 folder.")
            raise

    @staticmethod
    def FormatColNameDate(dtText):
        '''
        Converts the abbreviated date to YYYY-MM-01 format
        '''
        textFixed = None
        if sys.version[0] == '2':
            customException = StandardError   # except needs the exception class, not an instance
        else:
            customException = Exception

        try:
            textFixed = str(dtText[4:]) + "-" + str(list(calendar.month_abbr).index(dtText[:3])) + "-01"
        except customException:
            textFixed = dtText

        return textFixed

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean up and pack of the input files.
        '''
        self.logger.info(self.moduleName + " - Processing file: " + self.processingFile)

        rawFileName = self.localTempDirectory + "/" + self.processingFile
        csvFilename = self.localTempDirectory + "/" + self.processingFile.split(".")[0] + ".csv"

        try:
            columnNames = []

            df = pandas.read_excel(rawFileName,
                                   sheetname=self.job["worksheetName"],
                                   index_col=None,
                                   na_values=None,
                                   skiprows=self.job["skipRows"],
                                   skip_footer=self.job["skipFooter"])

            for colName in df.head(0):
                if colName not in self.job["columns_no_melt"]:
                    columnNames.append(self.FormatColNameDate(colName))
                else:
                    columnNames.append(colName)

            df.columns = columnNames
            df = df.melt(id_vars=self.job["columns_no_melt"])

            df.to_csv(csvFilename,
                      header=False,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            self.fileUtilities.GzipFile(csvFilename, csvFilename + ".gz")
            self.fileUtilities.DeleteFile(csvFilename)
        except XLRDError:
            self.logger.info(self.moduleName + " - No tab named '" + self.job["worksheetName"] + "' in " + self.processingFile)
        except Exception:
            self.logger.error(self.moduleName + " - Error while trying to process file " +  self.processingFile)
            raise
        finally:
            FileUtilities.RemoveFileIfItExists(rawFileName)

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created into S3 to be loaded later...
        '''
        self.logger.info(self.moduleName + " - Uploading GZIP files to s3 folder...")

        fileName = self.processingFile.split(".")[0] + ".csv.gz"
        S3Utilities.CopyItemsAWSCli(self.localTempDirectory + "/" + fileName,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"], "--quiet")

    def ExecuteCreateTable(self):
        '''
        Checks if the tables needs to be created
        '''
        tb = self.job['ddl']
        tb['schemaName'] = self.job['destinationSchema']

        fname = self.fileUtilities.CreateTableSql(tb, self.localTempDirectory)
        RedshiftUtilities.PSqlExecute(fname, self.logger)

    def Start(self, logger, moduleName, filelocs):
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            self.ExecuteCreateTable()
            self.BulkDownload()
            self.ProcessFiles()
            self.BulkUploadToS3()
            self.LoadAllFromS3()

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)

            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")

            raise Exception()
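
# ProcessFiles above relies on pandas' melt to turn the monthly columns into rows; a small
# illustration with a hypothetical frame (the "Series" id column name is made up):
import pandas

wide = pandas.DataFrame({
    "Series": ["Sales", "Production"],
    "2017-1-01": [10, 20],
    "2017-2-01": [11, 21],
})
longDf = wide.melt(id_vars=["Series"])   # yields one (Series, variable, value) row per cell
print(longDf)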
class PGCRAirMarketsAthenaSpark(ApplicationBase):
    '''
    Code to process the PGCR Air Markets data
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(PGCRAirMarketsAthenaSpark, self).__init__()

        self.awsParams = ""
        self.tempFolder = None
        self.packedFolder = None
        self.rawDataFolder = None
        self.toPackFiles = []

        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(os.path.dirname(os.path.abspath(__file__)))

    def SynchronizeSourceFolder(self):
        '''
        Synchronize the source folder from the AirMarkets bucket in s3
        '''
        self.logger.info("Synchronizing ZIP files from s3 folder...")

        S3Utilities.SyncFolderAWSCli("s3://" + self.job["bucketName"] + self.job["s3SrcDirectory"],
                                     self.rawDataFolder,
                                     True)

    def CleanUpAndPack(self):
        '''
        Main control to iterate through the folders, cleaning the files and packing them to be uploaded to s3.
        '''
        rawFolders = self.fileUtilities.ScanFolder(self.rawDataFolder)

        for rawFolderName in rawFolders:
            self.toPackFiles = []

            self.DecompressFromRaw(rawFolderName)
            self.CleanUpRawCSV(rawFolderName)
            self.PackFiles(rawFolderName)

    def PackFiles(self, rawFolderName):
        '''
        Compress the files for a given folder; right now only the emissions file is being packed.
        '''
        self.logger.info("Packing files for folder " + rawFolderName + "...")

        for csvFile in self.toPackFiles:
            airMarketGzFile = self.packedFolder + "/" + ntpath.basename(csvFile) + ".gz"

            self.fileUtilities.GzipFile(csvFile, airMarketGzFile)
            self.fileUtilities.DeleteFile(csvFile)

    def CleanUpRawCSV(self, rawFolderName):
        '''
        Performs the clean-up for the emissions files, replacing bad characters.
        '''
        allFiles = self.fileUtilities.ScanFolder(self.tempFolder, None, "csv")
        fileList = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] in fileName]
        fileListToDel = [fileName for fileName in allFiles if self.job["srcFileNamePrefix"] not in fileName]

        self.logger.info("Cleaning up files for folder " + rawFolderName + "...")

        for airMarketFile in fileList:
            fullFileName = self.tempFolder + "/" + airMarketFile
            toPackFileName = self.tempFolder + "/" + self.job["srcFileNamePrefix"] + "_" + rawFolderName + ".csv"

            self.fileUtilities.ReplaceIterativelyInFile(fullFileName,
                                                        toPackFileName,
                                                        [{r"[^\x00-\x76]+":""}, {"'":"`"}])

            self.fileUtilities.RemoveLines(toPackFileName, self.job["removeLines"])
            self.toPackFiles.append(toPackFileName)
            self.fileUtilities.DeleteFile(fullFileName)

        for airMarketFile in fileListToDel:
            self.fileUtilities.DeleteFile(self.tempFolder + "/" + airMarketFile)

    def DecompressFromRaw(self, rawFolderName):
        '''
        Extracts the files from the EPADownload.zip file...
        '''
        try:
            filePath = self.rawDataFolder + "/" + rawFolderName + "/" + self.job["inputZipFileName"]

            self.logger.info("Unpacking file: " + filePath)
            self.fileUtilities.UnzipUsing7z(filePath, self.tempFolder)
        except StandardError as err:
            self.logger.info("Unable to decompress file: " + filePath + " Error: " + err.message)

    def UploadPackedToS3(self):
        '''
        Uploads all files packed to s3.
        '''
        self.logger.info("Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(self.packedFolder,
                                    "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
                                    "--recursive --quiet")

    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into the final table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                                  host=self.awsParams.redshift['Hostname'],
                                                  port=self.awsParams.redshift['Port'],
                                                  user=self.awsParams.redshiftCredential['Username'],
                                                  password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": self.job["tableName"] + self.job["srcFileNamePrefix"],
                                                 "s3Filename": s3DataFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error("Error while trying to save into Redshift from s3 folder.")
            raise

    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)

    def ProcessTable(self, table):
        '''
        Process data for the table
        :param table:
        :return:
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)

        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
        fileToBeloaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table,self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)

            for table in self.job["tables"]:
                self.ProcessTable(table)

            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)

            self.logger.debug(self.moduleName + " -- " + " finished ")

        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " + err.message)
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"], \
                                                  currProcId, 'F') is not True:
                self.logger.info(self.moduleName + " - we could not Complete Instance.")
            raise Exception(err.message)
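
SparkUtilities is another helper that is not shown; a minimal pyspark sketch of the read-CSV-with-schema and save-as-parquet steps it appears to wrap (paths, field names and types are hypothetical):

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.appName("PGCRAirMarkets").getOrCreate()

# Hypothetical schema standing in for SparkUtilities.BuildSparkSchema(table)
schema = StructType([
    StructField("facility_name", StringType(), True),
    StructField("so2_tons", DoubleType(), True),
])

df = (spark.read
      .options(header="true", delimiter=",", ignoreTrailingWhiteSpace="true")
      .schema(schema)
      .csv("/tmp/etl/csv/emission_05-11-2017.csv"))

df.write.mode("overwrite").parquet("/tmp/etl/parquet/")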