Example #1
    def UploadScriptsToDesignatedS3Location(localScriptsFilepath,
                                            tableSettings):
        '''
        Upload the script files, typically the table-creation and insert scripts, to the designated S3 location for later reuse
        '''
        s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(
            tableSettings["schemaName"], tableSettings["table"])
        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FolderLocation,
                                                    "--recursive")

        # Upload only the scripts that we plan to keep for later reuse
        scriptToCreateRedshift = FileUtilities.ComposeCreateTableSqlFilename(
            tableSettings, localScriptsFilepath)
        scriptToInsertIntoRedshift = AthenaUtilities.ComposeInsertIntoSqlFilename(
            tableSettings, localScriptsFilepath)

        S3Utilities.S3Copy(scriptToCreateRedshift, s3FolderLocation)
        S3Utilities.S3Copy(scriptToInsertIntoRedshift, s3FolderLocation)
        return s3FolderLocation
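
A rough, self-contained sketch of the same flow using plain boto3 (the bucket name, key prefix, and script filenames below are assumptions for illustration, not part of the original helpers):

    import boto3

    def upload_scripts_sketch(localScriptsFilepath):
        # Hypothetical values standing in for AthenaUtilities.ComposeAthenaS3ScriptKey
        bucket = "my-data-lake"                        # assumed bucket
        prefix = "athena/scripts/myschema/mytable/"    # assumed key prefix

        s3 = boto3.client("s3")

        # Clear any scripts left over from a previous run (the recursive delete above)
        existing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
        for obj in existing.get("Contents", []):
            s3.delete_object(Bucket=bucket, Key=obj["Key"])

        # Upload the create-table and insert scripts for later reuse (assumed names)
        for name in ("create_mytable.sql", "insert_mytable.sql"):
            s3.upload_file(localScriptsFilepath + "/" + name, bucket, prefix + name)

        return "s3://" + bucket + "/" + prefix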
Example #2
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process a single catalog file: download it from S3, convert the Excel sheet to Parquet, and load it into Redshift via Athena
        '''
        # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
        s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        # Clear the working folders from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        # Spark has no native raw Excel reader, so read the file with pandas instead
        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        pandasDf = pd.read_excel(localExcelFilepath,
                                 catalog["excelSheetName"],
                                 index_col=None,
                                 na_values=['NaN'],
                                 skiprows=catalog["skipRows"])
        pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        table = catalog["tables"][0]  # There is only one table per catalog
        schema = SparkUtilities.BuildSparkSchema(table)
        df = spark.createDataFrame(pandasDf, schema)
        df = SparkUtilities.ConvertNanToNull(df)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
        self.logger.debug(self.moduleName + " -- " +
                          "ProcessCatalogs for file: " + s3Key +
                          " finished.\n\n")
Example #3
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the data for the table: download the gzipped CSV from S3, convert it to Parquet, and load it into Redshift via Athena
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        # Unzip the file rather than reading the gzip directly, since Spark reads plain CSV faster
        localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
        self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"],
                                        True, self.fileUtilities.csvFolder,
                                        self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.info(self.moduleName + " -- ProcessTables finished")
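
The gunzip-then-read pattern in Example #3 can be reproduced with the standard library and plain PySpark; the paths, delimiter, and schema here are assumptions for illustration:

    import gzip
    import shutil
    from pyspark.sql import SparkSession
    from pyspark.sql.types import StructType, StructField, IntegerType, StringType

    # Decompress first: Spark reads plain CSV faster than gzipped CSV
    with gzip.open("/tmp/gzip/data.csv.gz", "rb") as src, \
            open("/tmp/csv/data.csv", "wb") as dst:
        shutil.copyfileobj(src, dst)

    spark = SparkSession.builder.appName("csv-to-parquet-sketch").getOrCreate()

    # Explicit schema and delimiter stand in for the job settings used above
    schema = StructType([StructField("id", IntegerType(), True),
                         StructField("name", StringType(), True)])
    df = spark.read.csv("/tmp/csv", sep="|", header=True, schema=schema)

    df.write.mode("overwrite").parquet("/tmp/parquet")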