Example #1
 def DownloadFile(self, s3Key, localGzipFilepath):
     '''
     Wrapper to download file
     '''
     self.logger.info(self.moduleName + " Downloading file: " + s3Key)
     try:
         S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                        self.job["bucketName"], s3Key,
                                        localGzipFilepath)
     except Exception as e:
         self.logger.exception(
             "Error while downloading file: {}".format(s3Key))
         self.logger.exception("{}".format(str(e)))
         raise
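The S3Utilities.DownloadFileFromS3 helper itself is not shown in these examples. A minimal sketch of what such a wrapper might look like on top of boto3, assuming the s3 parameter is (or can fall back to) a boto3 S3 resource; this is an illustration of the pattern, not the project's actual implementation:

    import boto3

    def DownloadFileFromS3(s3Resource, bucketName, s3Key, localFilepath):
        '''
        Hypothetical sketch: download a single S3 object to a local path.
        '''
        if s3Resource is None:
            s3Resource = boto3.resource("s3")  # fall back to default credentials
        # S3 object keys do not start with "/", so strip a leading slash if present
        s3Resource.Bucket(bucketName).download_file(s3Key.lstrip("/"), localFilepath)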
Example #2
 def BulkUploadToS3(self, s3subFolder):
     '''
     Upload all the GZIP files that were created to S3, to be loaded later...
     '''
     self.logger.info(self.moduleName +
                      " - Uploading GZIP files to s3 folder...")
     s3Sub = None
     if s3subFolder is not None:
         s3Sub = '/' + s3subFolder
     S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder,
                                  "s3://" + self.job["bucketName"] +
                                  self.job["s3GzipFolderBase"] + s3Sub,
                                  args='''--quiet --include "*.gz"''',
                                  dbug="Y")
Example #3
    def DownloadFilesFromS3(self, tablesJson):
        '''
        Download all files from the s3 data folder.
        '''
        try:
            self.logger.debug(self.moduleName + " -- " + "DownloadFilesFromS3" + " starting ")
            rawFolder = self.localTempDirectory + '/raw/'
            S3Utilities.CopyItemsAWSCli("s3://" + tablesJson["srcBucketName"] + tablesJson["srcS3DataFolder"],
                                        rawFolder,
                                        "--recursive --quiet")

            self.logger.debug(self.moduleName + " -- " + "DownloadFilesFromS3" + " finished ")
        except Exception as err:
            self.logger.error(self.moduleName + " - Error while trying to download files from s3. Error: " + str(err))
            raise
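S3Utilities.CopyItemsAWSCli and SyncFolderAWSCli appear to shell out to the AWS CLI rather than call boto3 directly, given the "--recursive --quiet" style arguments. A rough sketch of that pattern, assuming the aws CLI is installed and configured; the function below is illustrative, not the library's actual code:

    import subprocess

    def CopyItemsAWSCli(source, destination, args=""):
        '''
        Hypothetical sketch: copy between a local path and an S3 URI via "aws s3 cp".
        '''
        command = "aws s3 cp " + source + " " + destination + " " + args
        # check=True raises CalledProcessError if the CLI exits with a non-zero status
        subprocess.run(command, shell=True, check=True)

    # Example: CopyItemsAWSCli("s3://bucket/data/", "/tmp/raw/", "--recursive --quiet")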
Example #4
    def DownloadScriptsForRedShift(awsParams, tableSettings,
                                   localScriptsFilepath):
        '''
        Download the script files, typically table creation and upload, from the designated S3 location
        '''
        # Need the proper credentials to write to the Athena lake
        old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()

        s3FolderLocation = AthenaUtilities.ComposeAthenaS3ScriptKey(
            tableSettings["schemaName"], tableSettings["table"])
        S3Utilities.S3RecursvieCopy(s3FolderLocation, localScriptsFilepath)

        awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)

        return s3FolderLocation
Example #5
 def BulkDownload(self):
     '''
     Download the entire bucket of EIA 860
     '''
     for path in self.job["s3SrcDirectory"]:
         try:
             sourcePath = "s3://" + self.job["bucketName"] + "/" + path
             outputPath = self.localTempDirectory + "/"
             S3Utilities.CopyItemsAWSCli(sourcePath, outputPath,
                                         "--recursive --quiet")
         except:
             self.logger.exception(
                 "Exception in PGCREIA860.BulkDownload. Location {}".format(
                     sourcePath))
             raise
Example #6
 def GetListOfFilesOnS3(self):
     '''
     Get the list of files on S3 under the given bucket & source directory
     '''
     try:
         return S3Utilities.GetListOfFiles(self.awsParams.s3,
                                           self.job["bucketName"],
                                           self.job["s3SrcDirectory"][1:])
     except Exception:
         self.logger.exception(
             "Exception in PGCRFERCFilings.GetListOfFilesOnS3")
         self.logger.exception(
             "Exception while fetching the list of files from S3 bucket: {}, path:{}"
             .format(self.job["bucketName"],
                     self.job["s3SrcDirectory"][1:]))
         raise
Example #7
 def DownloadFile(self, s3Key, outputLocation):
     '''
     Worker function to download the file
     '''
     self.logger.info(" Downloading file: " + s3Key)
     try:
         s3Key = "/" + s3Key
         unzippedFile = s3Key.split("/")[-1]
         localGzipFilepath = outputLocation + unzippedFile
         S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                        self.job["bucketName"], s3Key,
                                        localGzipFilepath)
     except Exception:
         self.logger.exception(
             "Error while downloading file: {}".format(s3Key))
         raise
Example #8
    def LoadAllFromS3(self):
        '''
        Load all CSVs from the Vantage's S3 bucket into Redshift
        '''
        rsConnect = None

        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            for dsScript in self.job["extractingScripts"]:
                RedshiftUtilities.LoadDataFromS3(
                    rsConnect, self.awsParams.s3, {
                        "destinationSchema":
                        self.job["destinationSchema"],
                        "tableName":
                        self.job["tableName"] + dsScript["tableSuffix"],
                        "s3Filename":
                        s3DataFolder + "/" + dsScript["tableSuffix"] +
                        ".CSV.GZ",
                        "fileFormat":
                        self.job["fileFormat"],
                        "dateFormat":
                        self.job["dateFormat"],
                        "delimiter":
                        self.job["delimiter"]
                    }, self.logger, "N")

            self.logger.info(self.moduleName + " - Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                self.moduleName +
                " - Error while trying to save into Redshift from s3 folder.")
            raise
        finally:
            if rsConnect is not None:
                rsConnect.close()
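RedshiftUtilities.LoadDataFromS3 is not shown in these examples; presumably it issues a Redshift COPY command built from the settings dictionary. A rough sketch of such a COPY for gzipped CSV data, assuming rsConnect is a DB-API connection (e.g. psycopg2), the fileFormat is CSV, and with a placeholder IAM role standing in for real credentials:

    def LoadDataFromS3Sketch(rsConnect, settings):
        '''
        Hypothetical sketch: COPY gzipped CSV data from S3 into a Redshift table.
        '''
        copySql = (
            "COPY " + settings["destinationSchema"] + "." + settings["tableName"] +
            " FROM '" + settings["s3Filename"] + "'" +
            " IAM_ROLE '<redshift-iam-role-arn>'" +  # placeholder authorization
            " GZIP CSV DELIMITER '" + settings["delimiter"] + "'" +
            " DATEFORMAT '" + settings["dateFormat"] + "'"
        )
        cursor = rsConnect.cursor()
        cursor.execute(copySql)  # Redshift pulls the files from S3 server-side
        rsConnect.commit()
        cursor.close()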
Example #9
    def LoadAllData(self):
        '''
        Process:
        1)  push Attribute and data gz files to S3
        2)  load data into Redshift from S3
        '''
        self.CreateFolders("N")  #  this just sets the variable we will need
        self.fileUtilities = FileUtilities(self.logger)

        rsConnect = RedshiftUtilities.Connect(dbname=self.awsParams.redshift['Database'],
                                              host=self.awsParams.redshift['Hostname'],
                                              port=self.awsParams.redshift['Port'],
                                              user=self.awsParams.redshiftCredential['Username'],
                                              password=self.awsParams.redshiftCredential['Password'])

        for table in self.job["tables"]:
            ###
            #  first create zip files for all we want to send to S3
            ###
            s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
            if table["type"] == "attributes":
                sourceFolder = self.gzipFolder + "attr"
                destFolder = s3folder + "/attribute"
            else:  # data types
                sourceFolder = self.gzipFolder + "data"
                destFolder = s3folder + "/data"

            S3Utilities.CopyItemsAWSCli(sourceFolder,
                                        destFolder,
                                        '''--recursive --quiet --include "*.gz"''')

            RedshiftUtilities.LoadDataFromS3(rsConnect, self.awsParams.s3,
                                             {
                                                 "destinationSchema": self.job["destinationSchema"],
                                                 "tableName": table["name"],
                                                 "s3Filename": destFolder,
                                                 "fileFormat": self.job["fileFormat"],
                                                 "dateFormat": self.job["dateFormat"],
                                                 "delimiter": self.job["delimiter"]
                                             },
                                             self.logger, "N")

#            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(destFolder,
#                                                        '''--recursive --quiet --include "*.gz"''')

        rsConnect.close()
Example #10
    def ProcessCatalogs(self, dbCommon, catalog):
        '''
        Process each file
        '''
        # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
        s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        FileUtilities.EmptyFolderContents(
            self.fileUtilities.gzipFolder
        )  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(
            self.fileUtilities.csvFolder
        )  # Clear the folder from the previous run
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        # Don't have a raw excel reader for Spark so use Pandas
        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        pandasDf = pd.read_excel(localExcelFilepath,
                                 catalog["excelSheetName"],
                                 index_col=None,
                                 na_values=['NaN'],
                                 skiprows=catalog["skipRows"])
        pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        table = catalog["tables"][0]  # There is only one table in a catalog
        schema = SparkUtilities.BuildSparkSchema(table)
        df = spark.createDataFrame(pandasDf, schema)
        df = SparkUtilities.ConvertNanToNull(df)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key +
                          " finished.\n\n")
Example #11
    def ProcessTable(self,table):
        '''
        Process data for the table
        :param table:
        :return:
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder+ "/" +fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3,self.job["bucketName"],
                                       s3Key,local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)


        self.fileUtilities.UnzipUsing7z(local7zipFilePath,localCsvFilepath)
        fileToBeloaded = localCsvFilepath+'/'+'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"],ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeloaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table,self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")
Example #12
 def PushFilesToS3(self):
     '''
     push files to s3 server
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "PushFilesToS3" +
                           " starting ")
         S3Utilities.SyncFolderAWSCli(self.localTempDirectory + '/zips',
                                      "s3://" + self.job["bucketName"] +
                                      '/' + self.job["s3SrcDirectory"] +
                                      '/zips',
                                      args='''--quiet --include "*.zip"''',
                                      dbug="Y")
         self.logger.debug(self.moduleName + " -- " + "PushFilesToS3" +
                           " finished ")
     except:
         self.logger.exception(self.moduleName +
                               "- we had an error in PushFilesToS3")
         raise
Example #13
 def DownloadFilesFromS3(self, tablesJson):
     '''
     Download files from the s3 data folder.
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "DownloadFileFromS3" +
                           " starting " + tablesJson["srcFile"])
         S3Utilities.CopyItemsAWSCli(
             "s3://" + tablesJson["srcBucketName"] +
             tablesJson["srcS3DataFolder"] + tablesJson["srcFile"],
             self.fileUtilities.csvFolder, "--quiet")
         self.logger.debug(self.moduleName + " -- " + "DownloadFileFromS3" +
                           " finished " + tablesJson["srcFile"])
     except Exception as err:
         self.logger.error(
             self.moduleName +
             " - Error while trying to download file from s3. Error: " +
             str(err))
         raise
Example #14
    def ProcessTables(self, dbCommon, tables):
        '''
        Process the data for the table
        '''
        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        
        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
        S3Utilities.S3Copy(s3Key, localGzipFilepath)

        # Unzip the file rather than reading the gzip as Spark is faster with csv
        localCSVFilepath = self.fileUtilities.csvFolder + "/" + fileName + ".csv"
        self.fileUtilities.GunzipFile(localGzipFilepath, localCSVFilepath)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], True, self.fileUtilities.csvFolder, self.logger)
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
        self.logger.info(self.moduleName + " -- " + "ProcessTable " + " finished ")
Example #15
    def UploadFilesCreateAthenaTablesAndSqlScripts(self,
                                                   table,
                                                   localParquetFolderName,
                                                   partitionValue=None):
        '''
        Upload Parquet files into S3
        Create Athena Table/Partition
        Create script to create a RedShift table and save to S3 (note that the ETL may not necessarily load data into Redshift)
        Create script to insert data into Redshift and save to S3  (note that the ETL may not necessarily load data into Redshift)
        '''
        if not FileUtilities.FilesExistInFolder(localParquetFolderName +
                                                "*.parquet"):
            # Nothing was created.  We have a problem
            self.logger.info(
                self.moduleName +
                " - No parquet files were created for current partition in: " +
                localParquetFolderName + ".  Nothing was processed on Athena.")
            return False

        self.fileUtilities.CreateTableSql(table, self.fileUtilities.sqlFolder)

        scriptPartitionValue = partitionValue
        if AthenaUtilities.IsTablePartitioned(table):
            # For partitioned tables, the script will insert a where clause by default.  However, if we are doing a new load
            # skip the where clause so that we can have SQL script that is capable of loading all the data from Athena
            # into RedShift in the future
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                table["schemaName"], table["table"])
            if not S3Utilities.KeyExist(
                    self.awsParams, s3FolderLocation
            ):  # Do not update scripts if data has been previously loaded
                scriptPartitionValue = None
        AthenaUtilities.SqlToLoadDataFromAthena(self.logger, table,
                                                self.fileUtilities.sqlFolder,
                                                scriptPartitionValue)

        AthenaUtilities.UploadFilesAndCreateAthenaTables(
            self.awsParams, localParquetFolderName, table,
            self.fileUtilities.sqlFolder, self.logger, partitionValue)
        return True
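FileUtilities.FilesExistInFolder is called with a glob pattern (the parquet folder name with "*.parquet" appended). A one-line sketch of such a check using the standard library, offered as an assumption about what the helper does rather than its actual code:

    import glob

    def FilesExistInFolder(globPattern):
        '''
        Hypothetical sketch: return True if at least one file matches the glob pattern.
        '''
        return len(glob.glob(globPattern)) > 0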
Example #16
    def LoadAirMarketsTables(self):
        '''
        Performs the final step to insert multiple files located in s3 into the final table in Redshift.
        '''
        try:
            s3DataFolder = "s3://" + self.job["bucketName"] + self.job[
                "s3ToDirectory"]

            rsConnect = RedshiftUtilities.Connect(
                dbname=self.awsParams.redshift['Database'],
                host=self.awsParams.redshift['Hostname'],
                port=self.awsParams.redshift['Port'],
                user=self.awsParams.redshiftCredential['Username'],
                password=self.awsParams.redshiftCredential['Password'])

            RedshiftUtilities.LoadDataFromS3(
                rsConnect, self.awsParams.s3, {
                    "destinationSchema":
                    self.job["destinationSchema"],
                    "tableName":
                    self.job["tableName"] + self.job["srcFileNamePrefix"],
                    "s3Filename":
                    s3DataFolder,
                    "fileFormat":
                    self.job["fileFormat"],
                    "dateFormat":
                    self.job["dateFormat"],
                    "delimiter":
                    self.job["delimiter"]
                }, self.logger, "N")

            self.logger.info("Cleaning s3 data folder...")

            S3Utilities.DeleteFileFromS3TempUsingAWSCLi(
                s3DataFolder, "--recursive --quiet")
        except Exception:
            self.logger.error(
                "Error while trying to save into Redshift from s3 folder.")
            raise
Example #17
 def GetFilesFromS3(self):
     '''
     pull down files from s3
     '''
     localFilepath = None
     try:
         self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" +
                           " starting ")
         s3Key = self.job["s3SrcDirectory"] + "/" + self.job["filetoscan"]
         self.logger.info(self.moduleName + " - Processing file: " + s3Key)
         localFilepath = self.localTempDirectory + "/" + ntpath.basename(
             s3Key)
         S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                        self.job["bucketName"], s3Key,
                                        localFilepath)
         self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" +
                           " finished ")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in : GetFilesFromS3")
         raise
     return localFilepath
Example #18
 def MoveToS3(self):
     '''
     move gzip files to s3 and clean local instance
     '''
     try:
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           " starting ")
         ###
         #  move any gzip files to the s3 server
         ###
         s3folder = "s3://" + self.job["bucketName"] + self.job[
             "s3GzipFolderBase"]
         S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder,
                                      s3folder,
                                      args='''--quiet --include "*.gz"''',
                                      dbug="N")
         # Cleanup local files
         FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
         self.logger.debug(self.moduleName + " -- " + "MoveToS3 " +
                           " finished ")
     except:
         self.logger.exception(self.moduleName +
                               " - we had an error in MoveToS3")
         raise
Example #19
    def CleanupArea(self):
        '''
        1)  clean up the local area on app server
        2)  clean up files in the temp folder on S3
        '''
        for fld in self.processParams.configdata["folders"]:
            if fld["name"] == 'sql':
                self.CreateFolder(fld, "N")
            elif fld["name"] == 'gzips':
                pass
            else:
                self.CreateFolder(fld, "Y")

        user = os.environ.get("USER", "")
        if not user:
            user = os.environ.get("USERNAME", "")

        # Load file to S3 at a temporary location
        bucketName = "ihs-temp"
        s3TempKey = "eaa/src/temp/" + user + "/"
        s3FullPath = "s3://" + bucketName + "/" + s3TempKey

        S3Utilities.DeleteFileFromS3TempUsingAWSCLi(s3FullPath,
                                                    '''--recursive --quiet --include "*.zip"''')
Example #20
    def UploadFilesAndCreateAthenaTables(awsParams, localParquetFilepath,
                                         tableSettings, localScriptsFilepath,
                                         logger, partitionValue):
        '''
        Upload file to Designated S3 Athena passive lake location and create Athena tables
        Do this using Athena credentials
        '''
        # Need the proper credentials to write to the Athena lake
        old_key, old_secret_key = awsParams.SwitchS3CredentialsToAthena()

        # For partitioned tables, the creation scripts in S3 will be built once to insert ALL the data from Athena to Redshift
        # Incremental runs will not update the S3 scripts since they are designed to incrementally update the RedShift tables
        updateScriptsInS3 = True
        if AthenaUtilities.IsTablePartitioned(tableSettings):
            s3FolderLocation = AthenaUtilities.ComposeAthenaS3DataFileKey(
                tableSettings["schemaName"], tableSettings["table"])
            updateScriptsInS3 = not S3Utilities.KeyExist(
                awsParams, s3FolderLocation
            )  # Do not update scripts if data has been previously loaded

        # Save  the Parquet file(s) in the designated S3 location and create the corresponding Athena tables
        s3FolderLocation = AthenaUtilities.UploadDataFilesToDesignatedS3Location(
            localParquetFilepath, tableSettings, partitionValue)
        AthenaUtilities.CreateAthenaTablesUsingAthenaCLI(
            tableSettings, s3FolderLocation, partitionValue, logger)

        # Save  the SQL Script files in the designated S3 location in case we need to delete the data from RedShift to save space
        # The scripts in S3 will reload ALL the data to make sure the table is fully re-built
        if updateScriptsInS3:
            AthenaUtilities.UploadScriptsToDesignatedS3Location(
                localScriptsFilepath, tableSettings)

        logger.info("AthenaUtilities -- " + "Done uploading data to S3:" +
                    s3FolderLocation)

        awsParams.SwitchS3CredentialsTo(old_key, old_secret_key)
Example #21
    def LoadClassRefDF(self, spark):
        '''
        Loads the class reference data
        '''
        xReferencesDF = {}

        for catalog in self.job["catalogs"]:
            if catalog["name"] == "xReferences":
                for xrefTable in catalog["tables"]:
                    if self.xRefPulled is False:
                        S3Utilities.CopyItemsAWSCli(
                            "s3://" + self.job["bucketName"] +
                            xrefTable["s3SourceFolder"] +
                            xrefTable["sourceFileName"],
                            self.fileUtilities.csvFolder, "--quiet")

                    xReferencesDF[
                        xrefTable["table"]] = SparkUtilities.ReadCSVFile(
                            spark, xrefTable, self.job["delimiter"], False,
                            self.fileUtilities.csvFolder + "/" +
                            xrefTable["sourceFileName"], self.logger)

        self.xRefPulled = True
        return xReferencesDF