Example #1
    def LoadFileIntoRedshift(rsConnect, s3, logger, fileUtilities, localFilepath, destinationSchema,
                             redshiftDestTable, fileFormat, dateFormat, delimiter, isManifest='N'):
        '''
        Load a file from the local drive into Redshift:
        gzip the file, upload it to S3, and then load it into Redshift
        '''
        if isManifest == 'Y':
            zipLocalFilepath = localFilepath
        else:
            # Gzip the file
            zipLocalFilepath = localFilepath + ".gz"
            fileUtilities.GzipFile(localFilepath, zipLocalFilepath)

        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(s3, zipLocalFilepath)

        # Build the job definition file
        job = {}
        job["destinationSchema"] = destinationSchema
        job["tableName"] = redshiftDestTable
        job["s3Filename"] = S3Utilities.GetS3FileName(bucketName, s3TempKey)
        job["fileFormat"] = fileFormat
        job["dateFormat"] = dateFormat
        job["delimiter"] = delimiter

        RedshiftUtilities.LoadDataFromS3(rsConnect, s3, job, logger, isManifest)

        S3Utilities.DeleteFile(s3, bucketName, s3TempKey)
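
A hedged usage sketch for the helper above; the connection, S3 handle, logger, and file-utility objects are placeholders supplied by the surrounding framework, and every literal value below is illustrative rather than taken from the source:

    # rsConnect, s3, logger and fileUtilities are assumed to come from the
    # caller's framework setup; the paths and table names are illustrative.
    LoadFileIntoRedshift(rsConnect, s3, logger, fileUtilities,
                         localFilepath="/tmp/extract.csv",
                         destinationSchema="staging",
                         redshiftDestTable="my_table",
                         fileFormat="CSV",
                         dateFormat="YYYY-MM-DD",
                         delimiter="|")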
Example #2
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)
            self.logger.info(self.moduleName + " - Processing: ")
            outputCSVfileName = self.localTempDirectory + '/PhoenixDocuments.csv'

            self.logger.info(self.moduleName + " - Pull documents from Phoenix: ")
            jsonDocuments = self.PullDataFromPhoenix()
            self.logger.info(self.moduleName + " - save contents to CSV file from Phoenix: ")
            self.ExportToCSV(outputCSVfileName, jsonDocuments)
            self.logger.info(self.moduleName + " - push documents csv file to S3: ")
            bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(self.awsParams.s3, outputCSVfileName)

            self.logger.info(self.moduleName + " - Create document table: ")
            psConnect = self.GetPSConnection()
            self.CreatePostgresTables(psConnect)

            self.logger.info(self.moduleName + " - pull document s3 to database server temp: ")
            postgresTempFile = self.DownloadFromS3ToPSTempDir(psConnect, bucketName, s3TempKey)
            self.logger.info(self.moduleName + " - load documents csv file: ")
            self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile)
            self.logger.info(self.moduleName + " - clean up temp file: ")
            S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)
        except Exception:
            logger.exception(moduleName + " - Exception in start!")
            raise
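
The ExportToCSV step above is not shown; here is a minimal sketch of what such a helper could look like, assuming each Phoenix document arrives as a flat dict (an assumption, since the real class is not visible here):

    import csv

    def export_to_csv(output_path, json_documents):
        # Assumes flat, uniformly keyed dicts; the real ExportToCSV may differ.
        if not json_documents:
            return
        fieldnames = sorted(json_documents[0].keys())
        with open(output_path, "w", newline="", encoding="utf-8") as fh:
            writer = csv.DictWriter(fh, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(json_documents)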
Example #3
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            s3Key = self.job["s3SrcDirectory"] + "/" + self.job["fileToLoad"]
            self.logger.info(self.moduleName + " - Processing file: " + s3Key)

            localFilepath = self.localTempDirectory + "/" + ntpath.basename(
                s3Key)
            S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                           self.job["bucketName"], s3Key,
                                           localFilepath)

            df = pd.read_excel(localFilepath,
                               sheet_name="Major Variables",
                               index_col=None,
                               na_values=['NaN'],
                               skiprows=1,
                               usecols="C:E,G:I",  # parse_cols in older pandas
                               header=None)

            #  Save the data as CSV
            outputCSVfileName = self.localTempDirectory + '/SampleData.csv'
            df.to_csv(outputCSVfileName,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            # Upload the CSV file to a temporary S3 location.  Postgres will download it from there to its local directory
            bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
                self.awsParams.s3, outputCSVfileName)

            psConnect = self.GetPSConnection()
            # Postgres tables are created using a connection (rather than psql)
            self.CreatePostgresTables(psConnect)

            postgresTempFile = self.DownloadFromS3ToPSTempDir(
                psConnect, bucketName, s3TempKey)
            self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile)

            S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

            self.LoadBaseAttributes(psConnect)
            self.LoadBaseData(psConnect, '1000', 'glm_value')
            self.LoadBaseData(psConnect, '2000', 'arima_value')
            self.LoadBaseData(psConnect, '3000', 'lasso_value')
            # self.LoadBaseData(psConnect, '4000', 'nn_value')
            # self.LoadBaseData(psConnect, '5000', 'spectre_value')

            psConnect.close()
            self.logger.debug(" SampleData CSV loaded to RedShift")

        except Exception:
            logger.exception(moduleName + " - Exception in start!")
            raise
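
The Excel-to-CSV step in isolation, as a minimal sketch against current pandas; the file name, sheet layout, and delimiter are illustrative, and `usecols` replaces the `parse_cols` argument that newer pandas versions removed:

    import pandas as pd

    # "input.xlsx" is a hypothetical local copy of the downloaded S3 file.
    df = pd.read_excel("input.xlsx",
                       sheet_name="Major Variables",
                       index_col=None,
                       na_values=["NaN"],
                       skiprows=1,
                       usecols="C:E,G:I",
                       header=None)
    df.to_csv("SampleData.csv", sep="|", encoding="utf-8", index=False)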
Example #4
    def testDeleteFile(self):
        testFile = self.createTestingFile(
            "testDeleteFile.txt", "Testing DeleteFile from S3Utilities...")
        testFileReturned = testFile.replace(".txt", "_returned.txt")

        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
            self.awsParams.s3, testFile)
        S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

        try:
            S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                           s3TempKey, testFileReturned)
            self.assertFalse(os.path.isfile(testFileReturned),
                             "File was not deleted from the cloud.")
        except Exception as err:
            # Anything other than a 404 (key not found) is a real failure;
            # getattr guards exceptions that carry no status attribute.
            if getattr(err, "status", None) != 404:
                self.fail(
                    "Error while verifying the file was deleted from the cloud. Error: "
                    + str(err))
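
The backing S3 client for these utilities is not visible here; as a hedged alternative, the same deletion check written directly against boto3, where a missing key makes `head_object` raise a ClientError carrying a 404 code (all names below are placeholders):

    import boto3
    from botocore.exceptions import ClientError

    def key_exists(bucket_name, key):
        # head_object raises ClientError with code "404" when the key is gone.
        try:
            boto3.client("s3").head_object(Bucket=bucket_name, Key=key)
            return True
        except ClientError as err:
            if err.response["Error"]["Code"] == "404":
                return False
            raise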
Example #5
    def ProcessLiquidBalanceFile(self):
        '''
        Extract each configured worksheet and load it into Redshift via S3
        '''
        try:
            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for sheetConfig in self.job["sheetsToExtract"]:
                self.ExtractSheet(sheetConfig)

                s3key = self.job["s3SrcDirectory"] + "/" + sheetConfig[
                    "outputName"] + "." + self.job["sheetsOutputFormat"] + ".gz"
                self.logger.info(
                    self.moduleName +
                    " Uploading information to redshift for worksheet: " +
                    sheetConfig["name"])

                job = {}
                job["destinationSchema"] = self.job["destinationSchema"]
                job["tableName"] = sheetConfig["tempTableName"]
                job["s3Filename"] = S3Utilities.GetS3FileName(
                    self.job["bucketName"], s3key)
                job["fileFormat"] = self.job["fileFormat"]
                job["dateFormat"] = self.job["dateFormat"]
                job["delimiter"] = sheetConfig["delimiter"]

                RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                                 job, self.logger)
                S3Utilities.DeleteFile(self.awsParams.s3,
                                       self.job["bucketName"], s3key)
        except Exception:
            self.logger.exception(
                self.moduleName +
                " [ProcessLiquidBalanceFile] - Error in LiquidsBalance during processBlock"
            )
            raise
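
The loop above reads a handful of job-configuration keys; here is a hedged sketch of the shape it expects, with key names taken from the accesses in the code and every value purely illustrative:

    # All values below are placeholders; only the key names are grounded
    # in the dictionary accesses made by ProcessLiquidBalanceFile.
    job = {
        "bucketName": "example-bucket",
        "s3SrcDirectory": "liquids-balance",
        "destinationSchema": "staging",
        "fileFormat": "CSV",
        "dateFormat": "YYYY-MM-DD",
        "sheetsOutputFormat": "csv",
        "sheetsToExtract": [
            {
                "name": "Liquids Balance",
                "outputName": "liquids_balance",
                "tempTableName": "liquids_balance_temp",
                "delimiter": "|",
            },
        ],
    }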