Code Example #1
    def LoadCategory(self, srcCategory):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " starting ")
            processingFile = self.DownloadFile(srcCategory)
            fileOut = processingFile.replace(".dbf", ".txt")

            dbfUtils = DBFUtilities(self.logger)
            dbfUtils.ConvertToCSV(processingFile, fileOut, srcCategory["delimiter"], False)

            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
            self.logger.info(self.moduleName + " - Loading file " + fileOut + "...")

            RedshiftUtilities.LoadFileIntoRedshift(rsConnect,
                                                   self.awsParams.s3,
                                                   self.logger,
                                                   self.fileUtilities,
                                                   fileOut,
                                                   self.job["destinationSchema"],
                                                   self.job["tableName"] + "_" + srcCategory["srcCategory"],
                                                   srcCategory["fileFormat"],
                                                   srcCategory["dateFormat"],
                                                   srcCategory["delimiter"])
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" + " finished ")
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load category: " +
                              str(err))
            raise
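
Every example on this page funnels into the same RedshiftUtilities.LoadFileIntoRedshift call. The utility itself is not part of these excerpts, so the stub below only sketches the signature implied by the call sites; the parameter names are inferred, not taken from the project:

    # Hypothetical stub, inferred from the call sites on this page; the real
    # implementation lives in the project's RedshiftUtilities module.
    def LoadFileIntoRedshift(rsConnect, s3, logger, fileUtilities,
                             localFilePath, destinationSchema, tableName,
                             fileFormat, dateFormat, delimiter):
        '''
        Upload localFilePath to S3, then COPY it into
        destinationSchema.tableName over the open rsConnect connection.
        '''
        raise NotImplementedError
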
Code Example #2
File: PGCREIA860.py Project: eulertech/backup
    def LoadCSVFile(self, localFilePath, loadName):
        '''
        For each file we need to process, provide the data loader the s3 key
        and destination table name
        '''
        self.logger.info("Loading data into Redshift")
        rsConnect = None
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                localFilePath, self.job["destinationSchema"],
                self.job["tableName"] + loadName["redshiftTableSuffix"],
                self.job["fileFormat"], self.job["dateFormat"],
                self.job["delimiter"])
        except Exception:
            self.logger.exception(
                "Exception in PGCREIA860.LoadCSVFile while uploading to "
                "table:{}, filePath:{}".format(
                    self.job["tableName"] + loadName["redshiftTableSuffix"],
                    localFilePath))
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
            self.logger.info(self.moduleName +
                             " - Finished Processing S3 file: " +
                             loadName["redshiftTableSuffix"])
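
Code Example #2 guards the close with try/finally. Since the connection is a DB-API object (Code Example #11 builds the same connection directly with psycopg2.connect), the cleanup can also be written with contextlib.closing. A minimal sketch of the same load, assuming RedshiftUtilities.Connect returns a DB-API connection:

    from contextlib import closing

    # closing() guarantees rsConnect.close() even if the load raises.
    with closing(RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])) as rsConnect:
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilePath, self.job["destinationSchema"],
            self.job["tableName"] + loadName["redshiftTableSuffix"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["delimiter"])
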
Code Example #3
File: OPEC.py Project: eulertech/backup
    def LoadCategory(self, srcCategory):
        '''
        Process a single category configured in the categories dictionary in the jobConfig.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " starting ")
            processingFile = self.DownloadFile(srcCategory)
            processingCSV = self.GetCSVFile(processingFile, srcCategory)
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

            self.logger.debug(self.moduleName + " - Loading file " +
                              processingCSV + "...")

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                processingCSV, self.job["destinationSchema"],
                self.job["tableName"] + "_" + srcCategory["srcCategory"],
                srcCategory["fileFormat"], srcCategory["dateFormat"],
                srcCategory["delimiter"])
            self.logger.debug(self.moduleName + " -- " + "LoadCategory" +
                              " finished ")
        except Exception as err:
            self.logger.error(self.moduleName +
                              " - Error while trying to load category: " +
                              str(err))
            raise
Code Example #4
    def LoadIntoRedshift(self, job):
        '''
        Does the actual loading of data into Redshift
        '''
        self.logger.info("Loading {} into Redshift".format(job["Name"]))
        rsConnect = None
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])
            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                self.localTempDirectory + "/cleaned/" + job["Name"] + ".CSV",
                self.job["destinationSchema"],
                self.job["tableName"].lower().replace("f1_", "") + job["Name"],
                self.job["fileFormat"], self.job["dateFormat"],
                self.job["outputDelimiter"])
        except Exception:
            self.logger.exception(
                "Error in FoxPro.LoadIntoRedshift() while loading data into Redshift")
        finally:
            if rsConnect is not None:
                rsConnect.close()
Code Example #5
File: JODI.py Project: eulertech/backup
    def ProcessCategory(self, rsConnect, srcCategory):
        '''
        Executes the processing for a single category configured...
        '''
        url = srcCategory["url"]
        self.logger.info(self.moduleName + " - Processing url: " + url)

        localFilepath = self.localTempDirectory + "/" + ntpath.basename(
            srcCategory["url"])

        fileDownload = urllib.URLopener()
        fileDownload.retrieve(url, localFilepath)

        self.fileUtilities.UnzipFile(localFilepath, self.localTempDirectory)
        localFilepath = self.localTempDirectory + "/" + srcCategory[
            "unzipFilename"]

        redshiftDestTable = self.job["tableName"] + srcCategory[
            "redshiftTableSuffixOrigin"]

        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilepath, self.job["destinationSchema"], redshiftDestTable,
            self.job["fileFormat"], srcCategory["dateFormat"],
            self.job["delimiter"])
Code Example #6
File: IEA.py Project: eulertech/backup
    def AsyncLoadFilesToRedShift(self, proc):
        '''
        load files into RedShift
        '''
        try:
            for pFile in proc["processfile"]:
                pFileNoPath = pFile.replace('/', '_')
                self.logger.debug(self.moduleName + " -- " +
                                  "AsyncLoadFilesToRedShift for " + pFile +
                                  " starting ")
                rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
                outputfileName = self.localTempDirectory + '/scrubbed/' + 'scrub_' + pFileNoPath
                rsTable = 'working_' + proc["processname"]

                RedshiftUtilities.LoadFileIntoRedshift(
                    rsConnect, self.awsParams.s3, self.logger,
                    self.fileUtilities, outputfileName,
                    self.job["destinationSchema"], rsTable,
                    self.job["fileFormat"], self.job["dateFormat"],
                    self.job["delimiter"])
                rsConnect.close()
                self.logger.debug(self.moduleName + " -- " +
                                  "AsyncLoadFilesToRedShift for " + pFile +
                                  " finished ")
        except Exception:
            self.logger.exception(
                self.moduleName +
                "- we had an error in AsyncLoadFilesToRedShift ")
            raise
Code Example #7
File: AutoLightVehicles.py Project: eulertech/backup
    def ProcessS3File(self, srcFileParameter):
        '''
        Process each file
        '''
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File" +
                          " starting ")
        s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter[
            "s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.localTempDirectory + "/raw/" + fileName

        #----------------------------------------------------------------------
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       localGzipFilepath)

        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        self.fileUtilities.DeleteFile(localGzipFilepath)
        fileNameNoExt = fileName.split('.', 1)[0]
        outPutFileName = self.fileUtilities.csvFolder + fileNameNoExt + '.csv'
        xl = ExcelUtilities(self.logger)
        xl.Excel2CSV(localExcelFilepath,
                     srcFileParameter["excelSheetName"],
                     outPutFileName,
                     self.fileUtilities.csvFolder,
                     skiprows=srcFileParameter["skipRows"])
        self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                               "/raw/")
        for tables in srcFileParameter["tables"]:
            fname = self.fileUtilities.CreateTableSql(
                tables, self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)
        # -----------------------------------------------------------------------------
        self.logger.info(self.moduleName + " - Loading data into Redshift...")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

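        # Note: this call reuses 'tables' left over from the loop above, so
        # the CSV is loaded into the last table listed under "tables".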
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            outPutFileName, tables["schemaName"], tables["table"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["delimiter"])
        # Cleanup
        rsConnect.close()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key + " finished ")
Code Example #8
File: EIA.py Project: eulertech/backup
    def LoadSeriesIntoRedshift(self, fileNameNoExt, rsConnect, suffix):
        '''
        Load series data into redshift
        '''
        localFilename = self.ComposeFileName(fileNameNoExt, suffix)
        fullRedshiftDestTable = self.job["tableName"] + \
            fileNameNoExt + "_" + suffix
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilename, self.job["destinationSchema"],
            fullRedshiftDestTable, self.job["fileFormat"],
            self.job["dateFormat"], self.job["delimiter"])
Code Example #9
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine
        '''
        currProcId = None
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.debug(self.moduleName + " -- " + " starting ")
            currProcId = self.etlUtilities.GetRunID(
                filelocs["tblEtl"]["table"], self.moduleName)
            ###
            #  establish connection to Access database
            ###
            conn = self.EstablishConnection()
            cur = conn.cursor()
            sqlline = self.FixSQLStatement()
            cur.execute(sqlline)

            outputfileName = self.localTempDirectory + '/ENPdata.csv'
            self.ConvertToCSV(cur, outputfileName)
            ###
            #  load the CSV to RedShift
            ###
            self.logger.debug(self.moduleName + " - ENP load CSV to RedShift")

            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                outputfileName, self.job["destinationSchema"],
                self.job["tableName"], self.job["fileFormat"],
                self.job["dateFormat"], self.job["delimiter"])

            self.logger.debug(self.moduleName +
                              " - ENP CSV loaded to RedShift")

            # Cleanup
            rsConnect.close()
            cur.close()
            conn.close()
            if self.job["cleanlocal"] == "Y":
                self.fileUtilities.RemoveFolder(self.localTempDirectory)
            self.logger.debug(self.moduleName + " -- " + " finished ")
        except Exception as err:
            self.logger.exception(moduleName + " - Exception! Error: " +
                                  str(err))
            if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],
                                                  currProcId, 'F') is not True:
                self.logger.info(self.moduleName +
                                 " - we could not Complete Instance.")
            raise
Code Example #10
    def LoadData(self, iso, localFilePath, fp):
        '''
        Method to load ISO data into Redshift
        '''
        self.logger.info("Loading ISO data into Redshift")
        rsConnect = RedshiftUtilities.Connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            localFilePath + fp, self.job["destinationSchema"],
            self.job["tableName"] + iso["Name"], self.job["fileFormat"],
            self.job["dateFormat"], self.job["delimiter"])
        # Close the connection once the load completes.
        rsConnect.close()
Code Example #11
File: Hindsight.py Project: eulertech/backup
    def ProcessSubJob(self, subJob):
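        '''
        Run the BCP extract, clean the extract file and load it into Redshift.
        '''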
        self.logger.info("Start the bcpUtilities.RunBCPJob for table " +
                         subJob.get("destination"))
        #TODO: set time capture to log file
        self.bcpUtilities.RunBCPJob(
            self.job["bcpParameters"]["sqlServerloginInfo"],
            self.job["bcpParameters"]["bcpUtilityDirOnLinux"],
            self.fileUtilities.LoadSQLQuery(
                self.fileUtilities.GetApplicationDirectory("Hindsight") +
                subJob.get("inputQuery")),
            self.localTempDirectory + "/" + subJob.get("destination"),
            subJob.get("fieldTerminator"), subJob.get("rowTerminator"))

        self.logger.info("Start of the cleaning process for table " +
                         subJob.get("destination"))
        if (subJob.get("charsToBeReplaced")
                is not None) and (len(subJob.get("charsToBeReplaced")) != 0):
            self.fileUtilities.ReplaceIterativelyInFile(
                self.bcpUtilities.GetFullFilePath(subJob.get("destination")),
                self.bcpUtilities.GetFullFilePath(
                    self.bcpUtilities.GetFileToBeUploaded(
                        subJob.get("destination"),
                        subJob.get("charsToBeReplaced"))),
                subJob.get("charsToBeReplaced"))

        self.logger.info(
            "Start of the uploading process to Redshift process for table " +
            subJob.get("destination"))
        rsConnect = psycopg2.connect(
            dbname=self.awsParams.redshift['Database'],
            host=self.awsParams.redshift['Hostname'],
            port=self.awsParams.redshift['Port'],
            user=self.awsParams.redshiftCredential['Username'],
            password=self.awsParams.redshiftCredential['Password'])

        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            self.bcpUtilities.GetFullFilePath(
                self.bcpUtilities.GetFileToBeUploaded(
                    subJob.get("destination"),
                    subJob.get("charsToBeReplaced"))),
            subJob["destinationSchema"], subJob.get("destination"),
            self.job["bcpParameters"]["fileFormat"],
            self.job["bcpParameters"]["dateFormat"],
            self.job["bcpParameters"]["delimiter"])

        rsConnect.close()
Code Example #12
    def LoadXLSToRedshift(self, fileInStage, reportConfig):
        '''
        Load a rigpoint data excel file into redshift
        '''
        rsConnect = None

        try:
            self.logger.debug(self.moduleName + " -- " + "LoadXLSToRedshift" +
                              fileInStage + " starting ")

            fileNameCSV = os.path.splitext(fileInStage)[0] + ".csv"
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
            dataFrame = pandas.read_excel(
                fileInStage,
                sheetname=reportConfig["excelSheetName"],
                index_col=None,
                na_values=['NaN'],
                skiprows=reportConfig["skipRows"],
                skip_footer=reportConfig["skipFooter"])

            dataFrame.to_csv(fileNameCSV,
                             header=False,
                             sep=str(reportConfig["delimiter"]),
                             encoding='utf-8',
                             index=False)

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                fileNameCSV, self.job["destinationSchema"],
                self.job["tableName"] + "_" + reportConfig["name"],
                reportConfig["fileFormat"], reportConfig["dateFormat"],
                reportConfig["delimiter"])
            self.logger.debug(self.moduleName + " -- " + "LoadXLSToRedshift" +
                              fileInStage + " finished ")

        except Exception as err:
            self.logger.error(
                self.moduleName +
                " Error while trying to load file into Redshift: " +
                str(err))
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
Code Example #13
    def LoadData(self, outPutFileName):
        '''
        load data into Redshift
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "LoadData for " +
                              outPutFileName + " starting ")
            rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
                outPutFileName, self.job["destinationSchema"],
                self.job["tableName"], self.job["fileFormat"],
                self.job["dateFormat"], self.job["delimiter"])
            rsConnect.close()

            self.logger.debug(self.moduleName + " -- " + "LoadData for " +
                              outPutFileName + " finished ")
        except Exception:
            self.logger.exception(self.moduleName +
                                  " - we had an error in : LoadData")
            raise
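
Taken together, the thirteen examples repeat one shape: open a Redshift connection, call RedshiftUtilities.LoadFileIntoRedshift with a local file plus job-config values, then close the connection. The helper below is a minimal sketch of that shared pattern, not project code; the helper name is hypothetical and it assumes the GetAWSConnection and LoadFileIntoRedshift utilities used throughout this page:

    def _LoadWithCleanup(self, localFilePath, schema, table,
                         fileFormat, dateFormat, delimiter):
        '''
        Hypothetical helper: the load-and-close pattern shared by every
        example above, with the close guaranteed even when the load fails.
        '''
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        try:
            RedshiftUtilities.LoadFileIntoRedshift(
                rsConnect, self.awsParams.s3, self.logger,
                self.fileUtilities, localFilePath, schema, table,
                fileFormat, dateFormat, delimiter)
        finally:
            rsConnect.close()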