Example #1
    def LoadErcotTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into the final table in Redshift.
        '''
        rsConnect = None  # ensure rsConnect is bound for the finally block below

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + "DAM",
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
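
All of the examples on this page finish by calling S3Utilities.DeleteFileFromS3TempUsingAWSCLi to clear the staging area. Its implementation is not shown here; a minimal sketch, assuming the name means what it says (shelling out to the AWS CLI's `aws s3 rm`), could look like the following. The function name and behavior below are assumptions for illustration, not the library's actual code.

import subprocess

def delete_from_s3_via_cli_sketch(s3Path, options=""):
    # With "--recursive" the CLI removes every object under the prefix;
    # without it, only the single object named by s3Path is removed.
    command = "aws s3 rm " + s3Path + " " + options
    subprocess.check_call(command, shell=True)
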
Example #2
    def LoadEIANetGen(self, tableSuffix):
        '''
        Performs the final step to insert the file staged in S3 into the final table in Redshift.
        '''
        try:
            file_name = self.job["ExcelSheets"][0]["Name"].split(
                '.')[0] + '.csv.gz'
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"] + file_name

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"] + tableSuffix,
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
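
To make the path composition above concrete, here is a worked example with a hypothetical job configuration (the sheet name, bucket, and prefix are invented for illustration):

job = {
    "ExcelSheets": [{"Name": "NetGeneration.xlsx"}],  # hypothetical sheet name
    "bucketName": "example-bucket",                   # hypothetical bucket
    "s3ToDirectory": "/eia/netgen/",                  # hypothetical key prefix
}

file_name = job["ExcelSheets"][0]["Name"].split('.')[0] + '.csv.gz'
s3DataFolder = "s3://" + job["bucketName"] + job["s3ToDirectory"] + file_name
print(s3DataFolder)  # s3://example-bucket/eia/netgen/NetGeneration.csv.gz
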
Example #3
    def LoadAllFromS3(self):
        '''
        Load the files staged in S3 into the final table in Redshift and clean up the staging folder.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema": self.job["destinationSchema"],
                    "tableName": self.job["tableName"],
                    "s3Filename": s3DataFolder,
                    "fileFormat": self.job["fileFormat"],
                    "dateFormat": self.job["dateFormat"],
                    "delimiter": self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
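
The examples above all delegate the actual load to RedshiftUtilities.LoadDataFromS3, whose implementation is not shown on this page. A minimal sketch, assuming it wraps Redshift's COPY command and that the connection behaves like a DB-API connection (e.g. psycopg2), could look like the following. The function name, parameters, and the GZIP/delimiter handling are assumptions for illustration only.

def copy_from_s3_sketch(connection, params, access_key, secret_key):
    # Build a Redshift COPY statement from the same keys the job dictionaries
    # above carry: destinationSchema, tableName, s3Filename, delimiter, dateFormat.
    sql = (
        "COPY {schema}.{table} FROM '{s3file}' "
        "CREDENTIALS 'aws_access_key_id={key};aws_secret_access_key={secret}' "
        "DELIMITER '{delim}' DATEFORMAT '{datefmt}' GZIP".format(
            schema=params["destinationSchema"],
            table=params["tableName"],
            s3file=params["s3Filename"],
            key=access_key,
            secret=secret_key,
            delim=params["delimiter"],
            datefmt=params["dateFormat"]))
    cursor = connection.cursor()
    try:
        cursor.execute(sql)   # Redshift reads the staged files directly from S3
        connection.commit()
    finally:
        cursor.close()
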
Example #4
    def DeleteFilesFromAWS(self, table):
        '''
        Deletes the staged files for the given table from its S3 directory
        '''
        s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
            "s3ToDirectory"][table]
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3DataFolder,
                                                    "--recursive --quiet")
Example #5
    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema": self.job["destinationSchema"],
                        "tableName": self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename": s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ",
                        "fileFormat": self.job["fileFormat"],
                        "dateFormat": self.job["dateFormat"],
                        "delimiter": self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
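
For a concrete picture of the loop above, suppose extractingScripts lists two table suffixes (all values below are illustrative only); each iteration loads one gzipped CSV into its own suffixed table:

job = {
    "bucketName": "example-bucket",   # hypothetical
    "s3ToDirectory": "/vantage",      # hypothetical
    "tableName": "vantage_",          # hypothetical
    "extractingScripts": [{"tableSuffix": "projects"}, {"tableSuffix": "assets"}],
}

s3DataFolder = "s3://" + job["bucketName"] + job["s3ToDirectory"]
for dsScript in job["extractingScripts"]:
    print(job["tableName"] + dsScript["tableSuffix"], "<-",
          s3DataFolder + "/" + dsScript["tableSuffix"] + ".CSV.GZ")
# vantage_projects <- s3://example-bucket/vantage/projects.CSV.GZ
# vantage_assets <- s3://example-bucket/vantage/assets.CSV.GZ
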
Example #6
    def UploadDataFilesToDesignatedS3Location(localParquetFilepath,
                                              tableSettings, partitionValue):
        '''
        Upload the data files, typically Parquet files, to the designated S3 location
        '''
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
            tableSettings["schemaName"], tableSettings["table"])

        partitionKeyValueFolder = ""
        if AthenaUtilities.IsTablePartitioned(tableSettings):
            partitionKey = AthenaUtilities.GetPartitionKey(tableSettings)
            if not partitionKey:
                raise ValueError(
                    'Partition key cannot be null for partitioned tables.')
            if not partitionValue:
                raise ValueError(
                    'Partition value cannot be null for partitioned tables.')
            partitionKeyValueFolder = partitionKey + "=" + partitionValue + "/"

        s3FolderLocationData = s3FolderLocation + partitionKeyValueFolder

        # Only delete the specific partition. Deleting the whole S3 folder is
        # deliberately not automated here: it would be too easy to wipe the entire
        # data set in the passive lake, so a FULL deletion must be done MANUALLY.
        if tableSettings["new"] == "Y" or ("clearPartition" in tableSettings
                                           and tableSettings["clearPartition"]
                                           == "Y"):
            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3FolderLocationData, "--recursive")

        # Only copy the *.parquet files
        S3Utilities.S3RecursvieCopy(
            localParquetFilepath, s3FolderLocationData,
            "--exclude \"*\" --include \"*.parquet\" ")

        #=======================================================================
        # For testing purposes - Copy the file to a holding directory
        # import glob
        # import shutil
        # dst = "/s3/" + tableSettings["table"]
        # src = localParquetFilepath + "*.parquet"
        # for fileName in glob.glob(src):
        #     print(fileName)
        #     shutil.move(fileName, dst)
        #=======================================================================

        return s3FolderLocation
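
The partitionKeyValueFolder built above follows the key=value folder convention Athena uses for partitioned tables. As a hypothetical illustration (the prefix, key, and value are invented, standing in for what ComposeAthenaS3DataFileKey and the table settings would supply), the resulting data location looks like this:

s3FolderLocation = "s3://example-lake/energy/prices/"  # hypothetical prefix
partitionKey = "trade_date"                            # hypothetical partition key
partitionValue = "2017-06-01"                          # hypothetical partition value

partitionKeyValueFolder = partitionKey + "=" + partitionValue + "/"
s3FolderLocationData = s3FolderLocation + partitionKeyValueFolder
print(s3FolderLocationData)
# s3://example-lake/energy/prices/trade_date=2017-06-01/
# Only the *.parquet files for this partition are copied under (or cleared from) this prefix.
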
Example #7
    def UploadScriptsToDesignatedS3Location(localScriptsFilepath,
                                            tableSettings):
        '''
        Upload the script files (typically the table-creation and insert-into scripts) to the designated S3 location
        '''
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(
            tableSettings["schemaName"], tableSettings["table"])
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FolderLocation,
                                                    "--recursive")

        # Upload only scripts that we plan to keep for later reuse
        scriptToCreateRedshift = FileUtilities.ComposeCreateTableSqlFilename(
            tableSettings, localScriptsFilepath)
        scriptToInsertIntoRedshift = AthenaUtilities.ComposeInsertIntoSqlFilename(
            tableSettings, localScriptsFilepath)

        S3Utilities.S3Copy(scriptToCreateRedshift, s3FolderLocation)
        S3Utilities.S3Copy(scriptToInsertIntoRedshift, s3FolderLocation)
        return s3FolderLocation
Example #8
    def CleanupArea(self):
        '''
        1)  clean up the local area on app server
        2)  clean up files in the temp folder on S3
        '''
        for fld in self.processParams.configdata["folders"]:
            if fld["name"] == 'sql':
                self.CreateFolder(fld, "N")
            elif fld["name"] == 'gzips':
                pass
            else:
                self.CreateFolder(fld, "Y")

        user = os.environ.get("USER", "")
        if not user:
            user = os.environ.get("USERNAME", "")

        # Temporary S3 staging location used when loading files; clean it up here
        bucketName = "ihs-temp"
        s3TempKey = "eaa/src/temp/" + user + "/"
        s3FullPath = "s3://" + bucketName + "/" + s3TempKey

        # The AWS CLI includes every object by default, so "--include" only has an
        # effect after an "--exclude"; exclude everything first, then re-include
        # the .zip archives so only those are deleted.
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
            s3FullPath, '''--recursive --quiet --exclude "*" --include "*.zip"''')