Example #1
    def ProcessTable(self, table):
        '''
        Process the data for the table
        '''
        s3Key = table["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.CreateFolders(table["table"])

        fileName = ntpath.basename(s3Key)
        localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       localTxtFilepath)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(localTxtFilepath))
        self.logger.info(self.moduleName + " -- " + "Done reading " +
                         str(df.count()) +
                         " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(
            table, self.fileUtilities.parquet, table["partitionValue"])
        self.logger.info(self.moduleName + " -- " + "ProcessTable " +
                         " finished ")
Example #2
    def Start(self, logger, moduleName, filelocs):
        '''
        main routine
        '''
        try:
            ApplicationBase.Start(self, logger, moduleName, filelocs)

            s3Key = self.job["s3SrcDirectory"] + "/" + self.job["fileToLoad"]
            self.logger.info(self.moduleName + " - Processing file: " + s3Key)

            localFilepath = self.localTempDirectory + "/" + ntpath.basename(
                s3Key)
            S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                           self.job["bucketName"], s3Key,
                                           localFilepath)

            df = pd.read_excel(localFilepath,
                               "Major Variables",
                               index_col=None,
                               na_values=['NaN'],
                               skiprows=1,
                               parse_cols="C:E,G:I",
                               header=None)

            #  Save the data as CSV
            outputCSVfileName = self.localTempDirectory + '/SampleData.csv'
            df.to_csv(outputCSVfileName,
                      sep=str(self.job["delimiter"]),
                      encoding='utf-8',
                      index=False)

            # Upload the CSV file to a temporary S3 location; Postgres will download it from there to its local directory
            bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
                self.awsParams.s3, outputCSVfileName)

            psConnect = self.GetPSConnection()
            # Postgres tables are created using a connection (rather than psql)
            self.CreatePostgresTables(psConnect)

            postgresTempFile = self.DownloadFromS3ToPSTempDir(
                psConnect, bucketName, s3TempKey)
            self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile)

            S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

            self.LoadBaseAttributes(psConnect)
            self.LoadBaseData(psConnect, '1000', 'glm_value')
            self.LoadBaseData(psConnect, '2000', 'arima_value')
            self.LoadBaseData(psConnect, '3000', 'lasso_value')
            #           self.LoadBaseData(psConnect,'4000', 'nn_value')
            #            self.LoadBaseData(psConnect,'5000', 'spectre_value')

            psConnect.close()
            self.logger.debug("SampleData CSV loaded to Redshift")

        except Exception:
            logger.exception(moduleName + " - Exception in start!")
            raise
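A note on the pandas call above: the `parse_cols` argument has since been removed in favor of `usecols` (and the positional sheet argument is clearer as `sheet_name`). A hedged sketch of the same Excel-to-CSV step on a current pandas, with placeholder file names and delimiter:

    import pandas as pd

    # 'usecols' accepts the same Excel-style column ranges that 'parse_cols' did
    df = pd.read_excel("input.xlsx",             # placeholder file
                       sheet_name="Major Variables",
                       usecols="C:E,G:I",
                       skiprows=1,
                       header=None,
                       na_values=["NaN"])

    df.to_csv("SampleData.csv", sep="|", encoding="utf-8", index=False)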
Example #3
 def DownloadAllFiles(self, sheet):
     '''
     Worker function to download the file
     '''
     s3Key = "/" + self.job["s3SrcDirectory"] + "/" + sheet["Name"]
     self.logger.info(" Downloading file: " + s3Key)
     localFilePath = self.localTempDirectory + "/" + sheet["Name"]
     S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                    self.job["bucketName"], s3Key,
                                    localFilePath)
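Building local paths by concatenating strings with "/" as done here works but is easy to get wrong; the standard library's pathlib handles the separators. A small sketch, with placeholder values standing in for self.localTempDirectory and sheet["Name"]:

    from pathlib import Path

    temp_dir = Path("/tmp/etl")              # stands in for self.localTempDirectory
    sheet_name = "prices.xlsx"               # stands in for sheet["Name"]
    local_file_path = temp_dir / sheet_name  # -> /tmp/etl/prices.xlsx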
Example #4
    def ProcessS3File(self, srcFileParameter):
        '''
        Process each file
        '''
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File" +
                          " starting ")
        s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter[
            "s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        fileName = ntpath.basename(s3Key)
        localGzipFilepath = self.localTempDirectory + "/raw/" + fileName

        #----------------------------------------------------------------------
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"], s3Key,
                                       localGzipFilepath)

        # Remove the gz extension
        localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
        self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

        self.logger.info(self.moduleName + " - Processing Excel file: " +
                         localExcelFilepath)
        self.fileUtilities.DeleteFile(localGzipFilepath)
        fileNameNoExt = fileName.split('.', 1)[0]
        outPutFileName = self.fileUtilities.csvFolder + "/" + fileNameNoExt + '.csv'
        xl = ExcelUtilities(self.logger)
        xl.Excel2CSV(localExcelFilepath,
                     srcFileParameter["excelSheetName"],
                     outPutFileName,
                     self.fileUtilities.csvFolder,
                     skiprows=srcFileParameter["skipRows"])
        self.fileUtilities.EmptyFolderContents(self.localTempDirectory +
                                               "/raw/")
        for table in srcFileParameter["tables"]:
            fname = self.fileUtilities.CreateTableSql(
                table, self.fileUtilities.sqlFolder)
            RedshiftUtilities.PSqlExecute(fname, self.logger)
        # -----------------------------------------------------------------------------
        self.logger.info(self.moduleName + " - Loading data into Redshift...")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)

        # Note: this loads into the last table defined in the loop above
        RedshiftUtilities.LoadFileIntoRedshift(
            rsConnect, self.awsParams.s3, self.logger, self.fileUtilities,
            outPutFileName, table["schemaName"], table["table"],
            self.job["fileFormat"], self.job["dateFormat"],
            self.job["delimiter"])
        # Cleanup
        rsConnect.close()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        self.logger.debug(self.moduleName + " -- " +
                          "ProcessS3File for file: " + s3Key + " finished ")
Example #5
 def DownloadFile(self, s3Key, localGzipFilepath):
     '''
     Worker function to download the file
     '''
     self.logger.info(self.moduleName + " Downloading file: " + s3Key)
     try:
         S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"], s3Key, localGzipFilepath)
     except Exception as ex:
         self.logger.exception("Error while downloading file: {}".format(s3Key))
         self.logger.exception("{}".format(str(ex)))
         raise
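`S3Utilities.DownloadFileFromS3` is a project wrapper whose implementation is not shown here; assuming a plain boto3 client, the same download-with-logging pattern might be sketched as:

    import logging
    import boto3
    from botocore.exceptions import ClientError

    logger = logging.getLogger(__name__)
    s3 = boto3.client("s3")

    def download_file(bucket_name, s3_key, local_path):
        logger.info("Downloading file: %s", s3_key)
        try:
            s3.download_file(bucket_name, s3_key, local_path)
        except ClientError:
            logger.exception("Error while downloading file: %s", s3_key)
            raise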
Example #6
 def DownloadAllFiles(self, srcFileParameter):
     '''
     Worker function to download the file
     '''
     s3Key = "/" + self.job["s3SrcDirectory"] + "/" + srcFileParameter["SubFolder"] + "/" +\
             srcFileParameter["s3Filename"] + self.job["srcfileFormat"]
     self.logger.info(" Downloading file: " + s3Key)
     localFilePath = self.localTempDirectory + "/" + srcFileParameter[
         "s3Filename"] + self.job["srcfileFormat"]
     S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                    self.job["bucketName"], s3Key,
                                    localFilePath)
Example #7
    def testDownloadFileFromS3(self):
        testFile = self.createTestingFile(
            "testDownloadFileFromS3.txt",
            "Testing DownloadFileFromS3 from S3Utilities...")
        testFileReturned = testFile.replace(".txt", "_returned.txt")

        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
            self.awsParams.s3, testFile)
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                       s3TempKey, testFileReturned)
        self.assertTrue(os.path.isfile(testFileReturned),
                        "File could not be downloaded from the cloud.")
Example #8
 def DownloadFile(self, s3Key, outputLocation):
     '''
     Worker function to download the file
     '''
     self.logger.info(" Downloading file: " + s3Key)
     try:
         s3Key = "/" + s3Key
         unzippedFile = s3Key.split("/")[-1]
         localGzipFilepath = outputLocation + unzippedFile
         S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"], s3Key, localGzipFilepath)
     except Exception:
         self.logger.exception("Error while downloading file: {}".format(s3Key))
         raise
Example #9
    def testUploadFileToS3Temp(self):
        testFile = self.createTestingFile(
            "testUploadFileToS3Temp.txt",
            "Testing UploadFileToS3Temp from S3Utilities...")
        testFileReturned = testFile.replace(".txt", "_returned.txt")

        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
            self.awsParams.s3, testFile)

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                       s3TempKey, testFileReturned)
        self.assertTrue(os.path.isfile(testFileReturned),
                        "File was not found or uploaded at the cloud bucket.")
Example #10
 def DownloadFile(self, s3Key, localGzipFilepath):
     '''
     Wrapper to download file
     '''
     self.logger.info(self.moduleName + " Downloading file: " + s3Key)
     try:
         S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                        self.job["bucketName"], s3Key,
                                        localGzipFilepath)
     except Exception:
         self.logger.exception(
             "Error while downloading file: {}".format(s3Key))
         raise
Example #11
 def DownloadFiles(self):
     '''
     Download the XML files
     '''
     fileList = S3Utilities.GetListOfFiles(self.awsParams.s3, self.job["bucketName"], self.job["s3SrcDirectory"][1:])
     downloadedFiles = []
     for fl in fileList:
         fileName = fl.split("/")[-1]
         s3Key = "/" + fl
         outputPath = self.localTempDirectory + "/" + fileName
         S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"], s3Key, outputPath)
         downloadedFiles.append(outputPath)
     return downloadedFiles
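`S3Utilities.GetListOfFiles` is likewise a project wrapper; a hedged boto3 equivalent that lists all keys under a prefix (and follows pagination past the 1,000-key page limit) could look like:

    import boto3

    def get_list_of_files(bucket_name, prefix):
        '''Return all object keys under a prefix, following pagination.'''
        s3 = boto3.client("s3")
        paginator = s3.get_paginator("list_objects_v2")
        keys = []
        for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
            for obj in page.get("Contents", []):
                keys.append(obj["Key"])
        return keys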
Example #12
 def DownloadFiles(self):
     '''
     Download the entire bucket of IHSMarkitData
     '''
     fileList = S3Utilities.GetListOfFiles(self.awsParams.s3,
                                           self.job["bucketName"],
                                           self.job["s3SrcDirectory"][1:])
     for fileName in fileList:
         try:
             inputFileFullPath = self.localTempDirectory + "/" + fileName.split(
                 "/")[-1]
             S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                            self.job["bucketName"],
                                            fileName, inputFileFullPath)
         except Exception:
             self.logger.exception("Download Error for file " + fileName)
             raise
Example #13
 def DownDataFiles(self, dbCommon):
     '''
     Download all the files and unzip them
     '''
     s3SrcDirectory = dbCommon["s3SrcDirectory"]
     bucketName = s3SrcDirectory.replace("s3://","").split("/")[0].strip()
     directory = s3SrcDirectory.replace("s3://"+bucketName, "")
     fileList = S3Utilities.GetListOfFiles(self.awsParams.s3, bucketName, directory[1:])
     for fileName in fileList:
         try:
             inputFileFullPath = self.localTempDirectory + "/" + fileName.split("/")[-1]
             # use the bucket parsed from the s3:// URI above rather than self.job["bucketName"]
             S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName, fileName, inputFileFullPath)
             unzipFolder = self.fileUtilities.gzipFolder + inputFileFullPath.split('.')[0] + "/"
             self.fileUtilities.UnzipFile(inputFileFullPath, unzipFolder)
         except Exception:
             self.logger.exception("Download Error for file " + fileName)
             raise
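The replace/split chain that pulls the bucket and prefix out of the s3:// URI at the top of this example can be written more robustly with urllib.parse; a sketch with a placeholder URI:

    from urllib.parse import urlparse

    def split_s3_uri(s3_uri):
        '''Split "s3://bucket/some/prefix" into (bucket, prefix).'''
        parsed = urlparse(s3_uri)    # netloc is the bucket, path is "/some/prefix"
        return parsed.netloc, parsed.path.lstrip("/")

    bucket, prefix = split_s3_uri("s3://my-bucket/ihs/markit")  # placeholder URI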
Example #14
    def testUploadFileToS3(self):
        testFileName = "testUploadFileToS3.txt"
        testFile = self.createTestingFile(
            testFileName, "Testing testUploadFileToS3 from S3Utilities...")
        testFileReturned = testFile.replace(".txt", "_returned.txt")

        bucketName = self.config["S3Utilities"]["testBucketName"]
        s3TempKey = self.config["S3Utilities"][
            "s3TempKeyFolder"] + "/" + testFileName

        S3Utilities.UploadFileToS3(self.awsParams.s3, testFile, bucketName,
                                   s3TempKey)

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                       s3TempKey, testFileReturned)
        self.assertTrue(
            os.path.isfile(testFileReturned),
            "File was not uploaded correctly to the cloud bucket.")
Example #15
    def ProcessTable(self, table):
        '''
        Process data for the table
        :param table:
        :return:
        '''

        s3Key = self.job["s3Filename"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)

        self.fileUtilities.moduleName = self.moduleName
        self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
        self.fileUtilities.CreateFolders(self.job["folders"])

        fileName = ntpath.basename(s3Key)

        local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName

        S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                       s3Key, local7zipFilePath)

        localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
        localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)

        self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
        fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(table)

        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='true', delimiter=self.job["delimiter"], ignoreTrailingWhiteSpace='true')
              .schema(schema)
              .load(fileToBeLoaded)
              )

        #df.show()
        self.logger.info(
            self.moduleName + " -- " + "Done reading " + str(df.count()) + " rows.  Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
        self.logger.info(self.moduleName + " -- " + "UploadFilesCreateAthenaTablesAndSqlScripts " + " finished ")
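`UnzipUsing7z` is another project wrapper; assuming the 7z command-line tool is installed and on PATH, an equivalent extraction step might be sketched as:

    import subprocess

    def unzip_using_7z(archive_path, output_dir):
        '''Extract an archive with the 7z CLI (must be installed).'''
        # 'x' extracts with full paths; -o<dir> sets the output directory
        # (no space after -o); -y auto-confirms prompts.
        subprocess.check_call(["7z", "x", archive_path, "-o" + output_dir, "-y"])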
Example #16
    def testDeleteFile(self):
        testFile = self.createTestingFile(
            "testDeleteFile.txt", "Testing DeleteFile from S3Utilities...")
        testFileReturned = testFile.replace(".txt", "_returned.txt")

        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
            self.awsParams.s3, testFile)
        S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

        try:
            S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                           s3TempKey, testFileReturned)
            self.assertFalse(os.path.isfile(testFileReturned),
                             "File was not deleted from the cloud.")
        except Exception as err:
            if err.status != 404:
                self.fail(
                    "Error registered while trying to delete a file from the cloud. Error:"
                    + err.message)
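The `err.status` and `err.message` attributes checked here belong to the older boto 2 exception classes. With boto3, a missing key surfaces as a botocore ClientError, and the equivalent 404 check would read the error code from the response, roughly (bucket, key, and path are placeholders):

    import boto3
    from botocore.exceptions import ClientError

    s3 = boto3.client("s3")
    try:
        s3.download_file("my-bucket", "deleted/key.txt", "/tmp/returned.txt")
    except ClientError as err:
        if err.response["Error"]["Code"] != "404":
            raise
        # a 404 here confirms the object is gone, which is what the test expects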
Example #17
 def GetFilesFromS3(self):
     '''
     pull down files from s3
     '''
     localFilepath = None
     try:
         self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" +
                           " starting ")
         s3Key = self.job["s3SrcDirectory"] + "/" + self.job["filetoscan"]
         self.logger.info(self.moduleName + " - Processing file: " + s3Key)
         localFilepath = self.localTempDirectory + "/" + ntpath.basename(
             s3Key)
         S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                        self.job["bucketName"], s3Key,
                                        localFilepath)
         self.logger.debug(self.moduleName + " -- " + "GetFilesFromS3" +
                           " finished ")
     except Exception:
         self.logger.exception(self.moduleName +
                               " - error in GetFilesFromS3")
         raise
     return localFilepath