Example 1
    def CheckWorkingFolders(self):
        '''
        Remove the working folders if they exist and re-create them
        '''
        self.logger.info(self.moduleName + " - Checking on working folders...")

        FileUtilities.RemoveFolder(self.rawFolder)
        FileUtilities.RemoveFolder(self.csvFolder)
        FileUtilities.CreateFolder(self.rawFolder)
        FileUtilities.CreateFolder(self.csvFolder)
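
The FileUtilities helpers themselves are not shown on this page. As a minimal sketch, RemoveFolder and CreateFolder presumably wrap the standard library along these lines (an assumption, not the library's actual code):

import os
import shutil

class FileUtilities(object):

    @staticmethod
    def RemoveFolder(folder):
        # delete the folder tree if it exists; a missing folder is not an error
        if os.path.isdir(folder):
            shutil.rmtree(folder)

    @staticmethod
    def CreateFolder(folder):
        # create the folder (and any missing parents) if it does not already exist
        if not os.path.isdir(folder):
            os.makedirs(folder)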
Example 2
    def CleanWorkingFolders(self):
        '''
        Ensures the folders are cleaned and ready before the process execution.
        '''
        self.logger.info("Cleaning local working folders...")

        FileUtilities.RemoveFolder(self.tempFolder)
        FileUtilities.RemoveFolder(self.packedFolder)

        FileUtilities.CreateFolder(self.tempFolder)
        FileUtilities.CreateFolder(self.packedFolder)
Example 3
 def CleanFiles(self, iso):
     '''
     Generic cleaning wrapper
     '''
     self.logger.info("Cleaning data")
     ignoreLines = iso.get("IgnoreLines")
     columnCount = iso.get("column_count")
     inputPath = self.localTempDirectory + "/" + iso["Name"] + "/"
     outputPath = self.localTempDirectory + "/" + iso["Name"] + "/cleaned/"
     FileUtilities.CreateFolder(outputPath)  # create the cleaned folder if it doesn't already exist
     listOfFiles = self.fileUtilities.GetListOfFiles(
         inputPath, self.job["input_file_type"])  # get all the CSV files
     self.logger.info("Files found: {}".format(str(len(listOfFiles))))
     for fp in listOfFiles:
         try:
             self.fileUtilities.CleanFile(inputPath + fp,
                                          outputPath + fp,
                                          IgnoreLines=ignoreLines,
                                          ColumnCount=columnCount,
                                          Delimiter=self.job["delimiter"])
         except Exception as ex:
             self.logger.exception(
                 "Error while cleaning the MISO file {}: {}".format(fp, str(ex)))
             raise
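
CleanFiles is driven by one iso entry from the job configuration. A hypothetical entry illustrating the keys the method reads (the values are illustrative):

iso = {
    "Name": "MISO",        # subfolder under localTempDirectory
    "IgnoreLines": 1,      # optional: leading lines to skip in each file
    "column_count": 12     # optional: expected number of columns
}
# job-level settings referenced above
job = {"input_file_type": "csv", "delimiter": ","}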
Example 4
def ProcessApps(logger, processArray, folderlocs):
    '''
    ProcessApps process all the applications that are turned on.
    '''
    FileUtilities.CreateFolder(folderlocs["relativeOutputfolder"])
    try:
        ab = ApplicationBase()
        ev = ab.LoadEnvironmentVariables(logger)
        if "tblEtl" in folderlocs:
            etlUtilities = EtlLoggingUtilities(logger)
            etlUtilities.awsParams = ev.awsParams
            etlUtilities.appschema = folderlocs["tblEtl"]["appschema"]
            etlUtilities.etlSchema = folderlocs["tblEtl"]["schemaName"]

        for proc in processArray:
            module = proc["module"]
            baseName = module.rsplit('.', 1)[1]

            logger.info(baseName + " - Starting module.")
            moduleName = importlib.import_module(module)
            className = getattr(moduleName, baseName)()
            className.Start(logger, baseName, folderlocs) # For single threading

            if "tblEtl" in folderlocs:
                procid = etlUtilities.GetRunID(folderlocs["tblEtl"]["table"], str(baseName))
                if procid > -1:
                    etlUtilities.CompleteInstance(folderlocs["tblEtl"]["table"], procid, 'C')

    except Exception:
        logger.exception("Exception processing application modules!")
        raise
    logger.info("All modules COMPLETED.")
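
The loop above resolves each class from its dotted module path at runtime. A minimal standalone sketch of the same importlib pattern (the module path is hypothetical):

import importlib

def LoadClass(modulePath):
    # "applications.common.MyJob" -> import the module, then instantiate class MyJob
    baseName = modulePath.rsplit('.', 1)[1]
    module = importlib.import_module(modulePath)
    return getattr(module, baseName)()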
Example 5
    def CreateMigrationScript(self):
        '''
        takes the template for the update SQL script and customizes it
        '''
        sqlMigrateScript = None
        try:
            ###
            #  make sure that we have a place to put the sql script
            ###
            sqlScript = 'PopulateHistoryTemplate.sql'
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " starting ")
            sqlMigrateTemplate = self.location + '/sql/' + sqlScript
            sqlMigrateScript = self.commonParams["sqlFolder"] + re.sub(
                'Template.sql$', '.sql', sqlScript)
            FileUtilities.CreateFolder(self.commonParams["sqlFolder"])
            FileUtilities.RemoveFileIfItExists(sqlMigrateScript)
            ###
            #  gather variables needed
            ###
            schemaName = None
            attrSrc = None
            dataSrc = None
            attrDest = None
            dataDest = None
            orderByFields = None
            partByFields = None

            for table in self.commonParams["cat"]["tables"]:
                schemaName = table["schemaName"]
                if table["type"] == "attributes":
                    attrSrc = table["srctable"]
                    attrDest = table["table"]
                    if "partition" in table:
                        orderByFields = table["partition"]["order"]
                        partByFields = table["partition"]["over"]
                elif table["type"] == "series":
                    dataSrc = table["srctable"]
                    dataDest = table["table"]

            with open(sqlMigrateTemplate) as infile, open(
                    sqlMigrateScript, 'w') as outfile:
                for line in infile:
                    line = line.replace('{schemaName}', schemaName)
                    line = line.replace('{attrSrc}', attrSrc)
                    line = line.replace('{dataSrc}', dataSrc)
                    line = line.replace('{attrDest}', attrDest)
                    line = line.replace('{dataDest}', dataDest)
                    line = line.replace('{orderByFields}', orderByFields)
                    line = line.replace('{partByFields}', partByFields)
                    outfile.write(line)
            self.logger.debug(self.moduleName + " -- " +
                              "CreateMigrationScript" + " finished ")
        except Exception:
            self.logger.exception(
                self.moduleName +
                " - we had an error in CreateMigrationScript")
            raise
        return sqlMigrateScript
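
Note that str.replace raises a TypeError if any of the gathered values is still None, for example when the catalog has no table of type "attributes" or no partition block. A defensive variant of the substitution loop, sketched under that assumption:

replacements = {
    '{schemaName}': schemaName, '{attrSrc}': attrSrc, '{dataSrc}': dataSrc,
    '{attrDest}': attrDest, '{dataDest}': dataDest,
    '{orderByFields}': orderByFields, '{partByFields}': partByFields
}
for line in infile:
    for token, value in replacements.items():
        if value is not None:  # skip placeholders we could not resolve
            line = line.replace(token, value)
    outfile.write(line)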
Example 6
    def Start(self, logger, moduleName, filelocs):
        '''
        Start the process.  Do the common operations.
        '''
        self = self.LoadEnvironmentVariables(logger)
        self.logger.info(moduleName + " - Getting configuration information.")

        self.moduleName = moduleName

        # Load the job parameters
        self.fileUtilities = FileUtilities(logger)
        jobConfigFile = self.location + '/jobConfig.json'
        self.job = self.fileUtilities.LoadJobConfiguration(jobConfigFile)

        # This is where all the work files will be created
        self.localTempDirectory = FileUtilities.PathToForwardSlash(
            filelocs["relativeOutputfolder"] + "/" + moduleName)
        FileUtilities.CreateFolder(self.localTempDirectory)

        # This is where all the local data will be located
        if "relativeInputfolder" in filelocs:
            self.localTempDataDirectory = FileUtilities.PathToForwardSlash(
                filelocs["relativeInputfolder"] + "/" + moduleName)
            FileUtilities.CreateFolder(self.localTempDataDirectory)

        self.bcpUtilities = BCPUtilities(logger, self.fileUtilities,
                                         self.awsParams,
                                         self.localTempDirectory)

        # Create tables if we have a valid script
        if "sqlScript" in self.job:
            self.CreateTables(self.job["sqlScript"])

        #  Create etlprocess log table if it does not already exist
        if "tblEtl" in filelocs:
            self.etlUtilities = EtlLoggingUtilities(self.logger)
            self.etlUtilities.awsParams = self.awsParams
            self.etlUtilities.filelocs = filelocs
            self.etlUtilities.moduleName = self.moduleName
            self.etlUtilities.appschema = filelocs["tblEtl"]["appschema"]
            self.etlUtilities.StartEtlLogging()

        if "folders" in self.job:
            self.fileUtilities.moduleName = self.moduleName
            self.fileUtilities.localBaseDirectory = self.localTempDirectory
            self.fileUtilities.CreateFolders(self.job["folders"])
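
Start reads everything else from jobConfig.json. A hypothetical configuration, shown as the dict LoadJobConfiguration presumably returns (the keys are the ones the methods on this page reference; the values are illustrative):

job = {
    "sqlScript": "CreateTables.sql",          # optional: table-creation script
    "delimiter": ",",
    "folders": [                              # consumed by CreateFolders/CreateFolder
        {"name": "csv", "folder": "csv", "new": "Y"},
        {"name": "gzips", "folder": "gzips", "new": "Y"}
    ]
}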
Example 7
 def DownloadFilesFromS3(self):
     '''
     Downloads all files from S3
     '''
     for iso in self.job["iso_files"]:
         keys = S3Utilities.GetListOfFiles(self.awsParams.s3, self.job["bucketName"], self.job["s3SrcDirectory"][1:] + iso["Name"] + "/")
         for key in keys:
             s3Key = "/" + key
             FileUtilities.CreateFolder(self.localTempDirectory + "/" + iso["Name"] + "/")
             localGzipFilepath = self.localTempDirectory + "/" + iso["Name"] + "/" + key.split("/")[-1]
             self.DownloadFile(s3Key, localGzipFilepath)
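
S3Utilities.GetListOfFiles and DownloadFile are not shown here. Assuming they wrap boto3, the listing-and-download loop would look roughly like this (bucket name and prefix are illustrative; pagination is omitted for brevity):

import boto3

s3 = boto3.client("s3")
response = s3.list_objects_v2(Bucket="my-bucket", Prefix="data/MISO/")
for obj in response.get("Contents", []):
    key = obj["Key"]
    # download each object under the prefix to the local temp folder
    s3.download_file("my-bucket", key, "/tmp/" + key.split("/")[-1])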
Example 8
    def GZipItUp(self, batchFolderName):
        '''
        routine to gzip the csv files and put them in a gzip folder
        '''
        try:
            ###
            #  since there is one csv per batch we can just gzip the combined csv
            ###
            csvFolder = self.commonParams["csvFolder"].rstrip("/") + "/"
            gzipFolder = self.commonParams["gzipFolder"].rstrip("/") + "/"

            FileUtilities.CreateFolder(gzipFolder + "attr/")
            FileUtilities.CreateFolder(gzipFolder + "data/")

            attrFileNamecsv = csvFolder + "attribute/attr_" + batchFolderName + ".csv"
            dataFileNamecsv = csvFolder + "data/data_" + batchFolderName + ".csv"
            attrFileNameGz = gzipFolder + "attr/attr_" + batchFolderName + ".csv.gz"
            dataFileNameGz = gzipFolder + "data/data_" + batchFolderName + ".csv.gz"

            self.fileUtilities.GzipFile(attrFileNamecsv, attrFileNameGz)
            self.fileUtilities.GzipFile(dataFileNamecsv, dataFileNameGz)
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in GZipItUp with " +
                                  batchFolderName)
            raise
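
GzipFile is presumably a thin wrapper over the gzip module; a minimal sketch of such a helper:

import gzip
import shutil

def GzipFile(srcPath, gzPath):
    # stream-copy the source file into a gzip-compressed target
    with open(srcPath, 'rb') as fin, gzip.open(gzPath, 'wb') as fout:
        shutil.copyfileobj(fin, fout)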
Example 9
def ProcessApps(logger, processArray, folderlocs):
    '''
    ProcessApps process all the applications that are turned on.
    '''

    FileUtilities.CreateFolder(folderlocs["relativeOutputfolder"])
##
#  runs holds the list of threads we want to run
##
    runs = []
###
#  if you have a new process make sure you add it here
###
    try:
        ab = ApplicationBase()
        ev = ab.LoadEnvironmentVariables(logger)
        if "tblEtl" in folderlocs:
            etlUtilities = EtlLoggingUtilities(logger)
            etlUtilities.awsParams = ev.awsParams
            etlUtilities.appschema = folderlocs["tblEtl"]["appschema"]
            etlUtilities.etlSchema = folderlocs["tblEtl"]["schemaName"]
            
        for proc in processArray:
            module = proc["module"]
            baseName = module.rsplit('.', 1)[1]

            logger.info(baseName + " - Starting module.")
            moduleName = importlib.import_module(module)
            className = getattr(moduleName, baseName)()

            #className.Start(logger, baseName, folderlocs) # For single threading
            # For multi-threading
            runs.append(Thread(name=baseName, target=className.Start, args=(logger, baseName, folderlocs)))

        for rn in runs:
            rn.start()

        for rn in runs:
            rn.join()
            if not rn.is_alive() and "tblEtl" in folderlocs:
                procid = etlUtilities.GetRunID(folderlocs["tblEtl"]["table"], str(rn.name))
                if procid > -1:
                    etlUtilities.CompleteInstance(folderlocs["tblEtl"]["table"], procid, 'C')

    except Exception:
        logger.exception("Exception processing application modules!")
        raise
    logger.info("All threads complete.")
Example 10
 def DownloadFiles(self, listOfFiles):
     '''
     Downloads the file from the S3 bucket to the data folder
     '''
     path = self.localTempDirectory + self.job["downloadPath"]
     for fp in listOfFiles:
         try:
             year = re.findall(r"\d{4}",
                               fp)[-1]  #gets the year from the file path
             fileName = fp.split("/")[-1]
             FileUtilities.CreateFolder(path + year + "/")
             self.DownloadFile("/" + fp, path + year + "/" + fileName)
             #unzip to the folder path as filename without the file extension
             self.fileUtilities.UnzipUsing7z(
                 path + year + "/" + fileName,
                 path + year + "/" + fileName[:-4])
             self.ProcessFiles(path + year + "/" + fileName[:-4], year)
         except Exception:
             self.logger.exception(
                 "Error while downloading file: {}".format(fp))
             raise
Example 11
 def RecursivelyUnzipFiles(self, srcDirectory):
     '''
     Recursively unzips the files
     '''
     srcDirectory = srcDirectory.strip()  # trim trailing spaces if any
     if not srcDirectory.endswith("/"):  # make sure the path ends with a forward slash
         srcDirectory = srcDirectory + "/"
     self.logger.info(srcDirectory)
     #=======================================================================
     # get the list of files in the given path and unzip them
     #=======================================================================
     files = self.fileUtilities.ScanFolder(srcDirectory)
     for unzippedFile in files:
         try:
             if unzippedFile.lower().endswith(".zip"):  # we only care about zip files
                 inputFilename = srcDirectory + unzippedFile  # full path to the zip file
                 outputFolder = unzippedFile.split(".")[0]  # the filename without the extension
                 outputDirectory = srcDirectory + outputFolder  # directory to unzip into
                 FileUtilities.CreateFolder(outputDirectory)  # create the folder to unzip into
                 self.fileUtilities.UnzipUsing7z(inputFilename, outputDirectory)  # unzip using the 7z utility
                 FileUtilities.RemoveFileIfItExists(inputFilename)  # delete the zip file after unzipping it
                 self.RecursivelyUnzipFiles(srcDirectory + outputFolder)  # recurse into the new folder
         except Exception:
             self.logger.exception(
                 "Exception in PGCRFERCFilings.RecursivelyUnzipFiles while unzipping file: {}"
                 .format(unzippedFile))
             raise
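
UnzipUsing7z shells out to the 7z utility. Where plain zip archives are enough, the standard library can do the same recursive extraction; a sketch under that assumption:

import os
import zipfile

def RecursivelyUnzip(directory):
    for name in os.listdir(directory):
        if name.lower().endswith(".zip"):
            archivePath = os.path.join(directory, name)
            targetDir = os.path.join(directory, name[:-4])
            with zipfile.ZipFile(archivePath) as archive:
                archive.extractall(targetDir)
            os.remove(archivePath)       # drop the archive once extracted
            RecursivelyUnzip(targetDir)  # recurse into the new folder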
Example 12
 def CreateFolder(self, folder, createIt):
     '''
      Creates the folder if needed; if a fresh folder is requested, removes and re-creates it
     '''
     try:
         fName = folder["name"]
         tfoldername = self.localTempDirectory + "/" + folder["folder"] + "/"
         if fName == "sql":
             self.sqlFolder = tfoldername
         elif fName == "csv":
             self.csvFolder = tfoldername
         elif fName == "zips":
             self.zipFolder = tfoldername
         elif fName == "gzips":
             self.gzipFolder = tfoldername
         if createIt == "Y":
             if folder["new"] == "Y":
                 FileUtilities.RemoveFolder(tfoldername)
             FileUtilities.CreateFolder(tfoldername)
      except Exception:
          self.logger.exception(self.moduleName + " had an issue in CreateFolder for " + str(folder))
         raise
Example 13
class MagellanUtilities(object):
    '''
    Utilities to convert JSON files to CSV files
    '''
    def __init__(self):
        self.commonParams = {}
        self.fl = None
        self.moduleName = 'MagellanUtilities'
        self.localTempDirectory = None
        self.fileUtilities = None
        self.logger = None

    def BuildTables(self, tables):
        '''
        Builds the tables
        '''
        try:
            for table in tables:
                fname = self.commonParams["sqlFolder"] + "Create_" + table[
                    "name"] + ".sql"
                RedshiftUtilities.PSqlExecute(fname, self.logger)
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in BuildTables")
            raise

    def CreateSQLFiles(self, proc, dest):
        '''
        Routine to create sql files to use to create tables in RedShift
        '''
        try:
            for table in proc["tables"]:
                fname = self.commonParams["sqlFolder"] + "Create_" + table[
                    "name"] + ".sql"
                self.logger.info(fname)
                outfile = open(fname, "w")
                outLine = "DROP TABLE IF EXISTS {}.{};".format(
                    dest, table["name"])
                outLine = FileUtilities.PutLine(outLine, outfile)
                outLine = "CREATE TABLE {}.{} (".format(dest, table["name"])
                outLine = FileUtilities.PutLine(outLine, outfile)
                ndx = 0
                for fld in table["fields"]:
                    if ndx > 0:
                        outLine = ','
                    ndx = ndx + 1
                    outLine = outLine + fld["name"] + " " + fld["type"]
                    if fld["type"] == "VARCHAR":
                        outLine = outLine + "(" + fld["size"] + ")  ENCODE LZO"
                    outLine = FileUtilities.PutLine(outLine, outfile)
                outfile.write(");")
                outfile.close()
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CreateSQLFiles ")
            raise

    def InitiateCsvFile(self, batchName, preFx, subFolder):
        '''
        initial creation of CSV file
        '''
        try:
            csvFileName = self.commonParams[
                "csvFolder"] + subFolder + "/" + preFx + batchName + ".csv"
            outFile = open(csvFileName, "ab")
            return outFile
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in InitiateCsvFile ")
            raise

    def CreateCsvFile(self, batchName, preFx, outputHolding, subFolder):
        '''
        create the csv file from the array and name the file with the prefix at the front
        '''
        try:
            ###
            #  initiate the CSV files for each type
            ###
            outFile = self.InitiateCsvFile(batchName, preFx, subFolder)
            csvWriter = csv.writer(outFile, quoting=csv.QUOTE_ALL)
            for oArray in outputHolding:
                outLine = []
                for item in oArray:
                    if isinstance(item, basestring):
                        outLine.append(item.encode('utf-8'))
                    else:
                        outLine.append(item)
                csvWriter.writerow(outLine)
            outFile.close()
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CreateCsvFile ")
            raise

    def ContinueProcessJson(self, rec):
        '''
        just a simple test to see if we want to continue processing the record
        '''
        try:
            retVal = False
            if "observations" in rec:
                if rec["observations"] is not None:
                    if len(rec["observations"]) > 0:
                        retVal = True
            return retVal
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in ContinueProcessJson ")
            raise

    def FindVal(self, inRec, inItem):
        '''
        walks through the json object searching for the key and, if found,
        returns the associated value
        '''
        rtnFound = False
        rtnValue = ''
        try:
            for key in inRec.keys():
                if str(key) == str(inItem):
                    rtnFound = True
                    rtnValue = inRec[key]
                    return rtnFound, rtnValue
                if isinstance(inRec[key], dict):
                    rtnFound, rtnValue = self.FindVal(inRec[key], inItem)
                    if rtnFound:  # stop as soon as a nested match is found
                        return rtnFound, rtnValue
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in FindVal ")
            raise
        return rtnFound, rtnValue
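
FindVal walks nested dictionaries depth-first, so a lookup like the following (the record is hypothetical, utils a MagellanUtilities instance) returns the first match it encounters:

record = {"meta": {"source_id": "ABC123"}, "observations": []}
found, value = utils.FindVal(record, "source_id")
# found is True, value is "ABC123"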

    def CheckValue(self, inRec, inItem, fType):
        '''
        Helper method to clean up a value; strips special characters from strings and normalizes dates
        '''
        try:
            found = False
            found, val = self.FindVal(inRec, inItem)
            if found is False:
                return ''
            if fType == 'VARCHAR':
                if isinstance(val, str):
                    val = self.fileUtilities.RemoveSpecialCharsFromString(val)
                    val = re.sub(r'\\', r'\\\\', val)
                elif isinstance(val, unicode):
                    val = self.fileUtilities.RemoveSpecialCharsFromString(val)
                    val = re.sub(r'\\', r'\\\\', val)
            if fType == "DATE":
                if isinstance(val, str):
                    tval = datetime.strptime(val, '%Y-%m-%d')
                    if tval.year < 1900:
                        tval = datetime.strptime('1900-01-01', '%Y-%m-%d')
                    val = tval.strftime('%Y-%m-%d')
                elif isinstance(val, datetime):
                    val = val.strftime('%Y-%m-%d')
            return val
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in CheckValue - " +
                                  inItem)
            raise

    def LoadJData(self, src, jFile):
        '''
        try to load the data into a JSON object
        '''
        jdata = open(src + "/" + jFile).read()
        dataFile = None
        encodeList = [None, "cp1252"]
        for enc in encodeList:
            try:
                if enc is not None:
                    dataFile = json.loads(jdata, encoding=enc)
                else:
                    dataFile = json.loads(jdata)
                break
            except ValueError:
                continue
        if dataFile is None:
            self.logger.exception(self.moduleName +
                                  "- we had an error in LoadJData - " + jFile)
            raise Exception('could not load json file')
        else:
            return dataFile

    def ProcessJsonFile(self, src, jfl, batchName):
        '''
        loads the json and then calls the process routine
        '''
        try:
            dataFile = self.LoadJData(src, jfl)
            self.ProcessJson(dataFile, batchName)
        except Exception:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessJsonFile with file " + jfl)
            raise

    def LoadAttrArray(self, rec, srcId, keyField='source_id'):
        '''
        return array of attribute values
        '''
        outAttrRecArray = []
        try:
            for fld in self.commonParams["attrFields"]:
                if fld["name"] == keyField:
                    outAttrRecArray.append(srcId)
                else:
                    outAttrRecArray.append(
                        self.CheckValue(rec, fld["name"], fld["type"]))
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in LoadAttrArray ")
            raise
        return outAttrRecArray

    def ProcessJson(self, dataFile, batchName):
        '''
        process one json file and create two csv files from it
        '''
        try:  # pylint: disable=too-many-nested-blocks
            outputAttrHolding = []
            outputDataHolding = []
            ###
            #  gets all the attributes
            ###
            if 'value' in dataFile:
                for rec in dataFile["value"]:
                    if self.ContinueProcessJson(rec) is True:
                        srcId = self.CheckValue(rec, "source_id", "VARCHAR")
                        for obsRec in rec["observations"]:
                            outputDataHolding.append(
                                [srcId, obsRec["date"], obsRec["value"]])

                        outAttrRecArray = self.LoadAttrArray(rec, srcId)
                        outputAttrHolding.append(outAttrRecArray)
            else:
                rec = dataFile
                if self.ContinueProcessJson(rec) is True:
                    srcId = self.CheckValue(rec, "source_id", "VARCHAR")
                    for obsRec in rec["observations"]:
                        outputDataHolding.append(
                            [srcId, obsRec["date"], obsRec["value"]])

                    outAttrRecArray = self.LoadAttrArray(rec, srcId)
                    outputAttrHolding.append(outAttrRecArray)

            ###
            #  now create fill csv file
            ###
            self.CreateCsvFile(batchName, "attr_", outputAttrHolding,
                               "attribute")
            self.CreateCsvFile(batchName, "data_", outputDataHolding, "data")

        except Exception:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessJson with batch " + batchName)
            raise

    def ProcessZipContents(self, zFileFolder, batchName):
        '''
        process all the files that were in the zip file
        '''
        try:
            onlyFiles = [
                fl for fl in listdir(zFileFolder)
                if isfile(join(zFileFolder, fl))
            ]
            for jfl in onlyFiles:
                self.ProcessJsonFile(zFileFolder, jfl, batchName)
        except Exception:
            self.logger.exception(
                self.moduleName +
                "- we had an error in ProcessZipContents with batch " +
                batchName)
            raise

    def GZipItUp(self, batchFolderName):
        '''
        routine to gzip the csv files and put them in a gzip folder
        '''
        try:
            ###
            #  since there is one csv per batch we can just gzip the combined csv
            ###
            csvFolder = self.commonParams["csvFolder"].rstrip("/") + "/"
            gzipFolder = self.commonParams["gzipFolder"].rstrip("/") + "/"

            FileUtilities.CreateFolder(gzipFolder + "attr/")
            FileUtilities.CreateFolder(gzipFolder + "data/")

            attrFileNamecsv = csvFolder + "attribute/attr_" + batchFolderName + ".csv"
            dataFileNamecsv = csvFolder + "data/data_" + batchFolderName + ".csv"
            attrFileNameGz = gzipFolder + "attr/attr_" + batchFolderName + ".csv.gz"
            dataFileNameGz = gzipFolder + "data/data_" + batchFolderName + ".csv.gz"

            self.fileUtilities.GzipFile(attrFileNamecsv, attrFileNameGz)
            self.fileUtilities.GzipFile(dataFileNamecsv, dataFileNameGz)
        except Exception:
            self.logger.exception(self.moduleName +
                                  "- we had an error in GZipItUp with " +
                                  batchFolderName)
            raise

    def StartHere(self):
        '''
        initial starting routine
        '''
        try:
            self.moduleName = self.commonParams["moduleName"]
            self.logger = FileUtilities.CreateLogger(
                self.commonParams["loggerParams"])
            self.logger.info("zipfile = " + self.fl + " started " +
                             datetime.now().strftime('%Y-%m-%d %I:%M:%S'))
            ###
            #  pull this file to local instance
            ###
            fileName = ntpath.basename(self.fl)
            batchFolderName = re.sub(r'\.zip$', '', fileName)
            ###
            #  make sure we have this folder
            ###
            self.fileUtilities = FileUtilities(self.logger)
            segZipFolder = self.commonParams[
                "zipFolder"] + batchFolderName + "/"
            self.fileUtilities.RemoveFolder(segZipFolder)
            self.fileUtilities.CreateFolder(segZipFolder)
            localGzipFilepath = self.commonParams[
                "localTempDirectory"] + "/" + fileName
            self.fileUtilities.UnzipFile(localGzipFilepath, segZipFolder)
            zipContentFolder = re.sub(r'\/$', '', segZipFolder)
            directories = [
                fName for fName in os.listdir(segZipFolder)
                if os.path.isdir(os.path.join(segZipFolder, fName))
            ]
            for dirs in directories:
                zipContentFolder = os.path.join(segZipFolder, dirs)
            self.ProcessZipContents(zipContentFolder, batchFolderName)
            self.GZipItUp(batchFolderName)
            self.logger.info("zipfile = " + self.fl + " finished " +
                             datetime.now().strftime('%Y-%m-%d %I:%M:%S'))
        except Exception:
            self.logger.exception(self.commonParams["moduleName"] +
                                  "- we had an error in StartHere")
            raise
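
StartHere expects commonParams and fl to be populated before it runs. A hypothetical driver, sketched from the keys the class reads (paths and values are illustrative):

utils = MagellanUtilities()
utils.commonParams = {
    "moduleName": "MagellanUtilities",
    "loggerParams": "log",                    # whatever CreateLogger expects
    "localTempDirectory": "/tmp/magellan",
    "zipFolder": "/tmp/magellan/zips/",
    "csvFolder": "/tmp/magellan/csv/",
    "gzipFolder": "/tmp/magellan/gzips/",
    "sqlFolder": "/tmp/magellan/sql/",
    "attrFields": [{"name": "source_id", "type": "VARCHAR", "size": "50"}]
}
utils.fl = "/tmp/magellan/batch_0001.zip"     # the zip file to process
utils.StartHere()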
Example 14
class Vantage(ApplicationBase):
    '''
    This class is used to get the Vantage data from the IHS Vantage database, transform it and load it into Redshift.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(Vantage, self).__init__()

        self.awsParams = ""
        self.packedFolder = None
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkExtractAll(self):
        '''
        Controls the flow through the different data sets coming from the Vantage DB.
        '''
        try:
            for dsScript in self.job["extractingScripts"]:
                self.logger.info(self.moduleName + " Starts extracting " +
                                 dsScript["tableSuffix"] + " data...")

                self.bcpUtilities.RunBCPJob(
                    self.job["mssqlLoginInfo"],
                    self.job["bcpUtilityDirOnLinux"],
                    self.fileUtilities.LoadSQLQuery(self.location +
                                                    dsScript["scriptFile"]),
                    self.localTempDirectory + "/Raw/" +
                    dsScript["tableSuffix"] + ".CSV", self.job["delimiter"])
        except Exception as err:
            self.logger.error(
                "Error while trying to Bulk Extract all. Message: " +
                err.message)
            raise

    def TransformAndPackAll(self):
        '''
        Compress the csv files created.
        '''
        rawFiles = self.fileUtilities.ScanFolder(self.rawFolder, None, "CSV")

        try:
            for rFile in rawFiles:
                rFileFull = self.rawFolder + "/" + rFile

                self.logger.info(self.moduleName +
                                 " started compressing file: " + rFile)

                self.fileUtilities.GzipFile(
                    rFileFull, self.packedFolder + "/" + rFile + ".GZ")

                self.fileUtilities.RemoveFileIfItExists(rFileFull)
        except Exception as err:
            self.logger.error(self.moduleName +
                              " Error while compressing raw files. Message: " +
                              err.message)
            raise

    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage's S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema":
                        self.job["destinationSchema"],
                        "tableName":
                        self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename":
                        s3DataFolder + "/" + dsScript["tableSuffix"] +
                        ".CSV.GZ",
                        "fileFormat":
                        self.job["fileFormat"],
                        "dateFormat":
                        self.job["dateFormat"],
                        "delimiter":
                        self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()

    def BulkUploadToS3(self):
        '''
        Uploads all the GZIP files created to S3, to be loaded into Redshift later.
        '''
        self.logger.info(self.moduleName +
                         " - Uploading GZIP files to s3 folder...")

        S3Utilities.CopyItemsAWSCli(
            self.packedFolder,
            "s3://" + self.job["bucketName"] + self.job["s3ToDirectory"],
            "--recursive --quiet")

    def Start(self, logger, moduleName, filelocs):
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            self.packedFolder = self.localTempDirectory + "/Packed"
            self.rawFolder = self.localTempDirectory + "/Raw"

            self.fileUtilities.RemoveFolder(self.packedFolder)
            self.fileUtilities.RemoveFolder(self.rawFolder)

            self.fileUtilities.CreateFolder(self.packedFolder)
            self.fileUtilities.CreateFolder(self.rawFolder)

            self.BulkExtractAll()
            self.TransformAndPackAll()
            self.BulkUploadToS3()
            self.LoadAllFromS3()
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  err.message)
            raise Exception(err.message)
Example 15
 def CreateFolders(self):
     '''
     Creates the folders if they do not exist already
     '''
     FileUtilities.CreateFolder(self.localTempDirectory + "/processed/")
     FileUtilities.CreateFolder(self.localTempDirectory + "/cleaned/")