def ProcessTable(self, table):
    '''
    Process the data for the table
    '''
    s3Key = table["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.CreateFolders(table["table"])
    fileName = ntpath.basename(s3Key)
    localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localTxtFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='false', delimiter=self.job["delimiter"])
          .schema(schema)
          .load(localTxtFilepath))
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table,
                                                    self.fileUtilities.parquet,
                                                    table["partitionValue"])
    self.logger.info(self.moduleName + " -- ProcessTable finished")
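# Note: "com.databricks.spark.csv" is the external spark-csv package required on
# Spark 1.x; Spark 2.0+ ships a built-in CSV data source.  A minimal equivalent
# sketch against the built-in reader, reusing the names from the function above
# (the literal delimiter is illustrative; the job config supplies it above):
df = (spark.read
      .format("csv")                # built-in data source since Spark 2.0
      .option("header", "false")
      .option("delimiter", ",")     # illustrative stand-in for self.job["delimiter"]
      .schema(schema)
      .load(localTxtFilepath))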
def Start(self, logger, moduleName, filelocs):
    '''
    main routine
    '''
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)

        s3Key = self.job["s3SrcDirectory"] + "/" + self.job["fileToLoad"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        localFilepath = self.localTempDirectory + "/" + ntpath.basename(s3Key)
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, localFilepath)

        df = pd.read_excel(localFilepath, "Major Variables", index_col=None,
                           na_values=['NaN'], skiprows=1,
                           parse_cols="C:E,G:I",  # removed in newer pandas; see the usecols sketch below
                           header=None)

        # Save the data as CSV
        outputCSVfileName = self.localTempDirectory + '/SampleData.csv'
        df.to_csv(outputCSVfileName,
                  sep=str(self.job["delimiter"]),
                  encoding='utf-8',
                  index=False)

        # Upload the CSV file to a temporary S3 location.  Postgres will
        # download it from there to its local directory.
        bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
            self.awsParams.s3, outputCSVfileName)

        psConnect = self.GetPSConnection()
        # Postgres tables are created using a connection (rather than psql)
        self.CreatePostgresTables(psConnect)

        postgresTempFile = self.DownloadFromS3ToPSTempDir(
            psConnect, bucketName, s3TempKey)
        self.LoadDataFromPostgresTempDir(psConnect, postgresTempFile)
        S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)

        self.LoadBaseAttributes(psConnect)
        self.LoadBaseData(psConnect, '1000', 'glm_value')
        self.LoadBaseData(psConnect, '2000', 'arima_value')
        self.LoadBaseData(psConnect, '3000', 'lasso_value')
        # self.LoadBaseData(psConnect, '4000', 'nn_value')
        # self.LoadBaseData(psConnect, '5000', 'spectre_value')

        psConnect.close()
        self.logger.debug(" SampleData CSV loaded to Postgres")
    except Exception:
        logger.exception(moduleName + " - Exception in start!")
        raise
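# pandas deprecated read_excel's parse_cols parameter and later removed it in
# favor of usecols.  A minimal standalone sketch of the same read against a
# current pandas; the workbook path is hypothetical, everything else mirrors
# the call above:
import pandas as pd

df = pd.read_excel(
    "SampleData.xlsx",             # hypothetical path, for illustration only
    sheet_name="Major Variables",  # positional sheet argument above, keyword here
    index_col=None,
    na_values=['NaN'],
    skiprows=1,
    usecols="C:E,G:I",             # replaces parse_cols; same Excel-range syntax
    header=None)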
def DownloadAllFiles(self, sheet):
    '''
    Worker function to download the file
    '''
    s3Key = "/" + self.job["s3SrcDirectory"] + "/" + sheet["Name"]
    self.logger.info(" Downloading file: " + s3Key)
    localFilePath = self.localTempDirectory + "/" + sheet["Name"]
    S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                   self.job["bucketName"],
                                   s3Key, localFilePath)
def ProcessS3File(self, srcFileParameter):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- ProcessS3File starting")
    s3Key = self.job["s3SrcDirectory"] + "/" + srcFileParameter["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.localTempDirectory + "/raw/" + fileName
    # ----------------------------------------------------------------------
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localGzipFilepath)

    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localGzipFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    self.fileUtilities.DeleteFile(localGzipFilepath)

    fileNameNoExt = fileName.split('.', 1)[0]
    outPutFileName = self.fileUtilities.csvFolder + "/" + fileNameNoExt + '.csv'
    xl = ExcelUtilities(self.logger)
    xl.Excel2CSV(localExcelFilepath,
                 srcFileParameter["excelSheetName"],
                 outPutFileName,
                 self.fileUtilities.csvFolder,
                 skiprows=srcFileParameter["skipRows"])

    self.fileUtilities.EmptyFolderContents(self.localTempDirectory + "/raw/")
    for tables in srcFileParameter["tables"]:
        fname = self.fileUtilities.CreateTableSql(tables, self.fileUtilities.sqlFolder)
        RedshiftUtilities.PSqlExecute(fname, self.logger)
        # -----------------------------------------------------------------------------
        self.logger.info(self.moduleName + " - Loading data into Redshift...")
        rsConnect = self.etlUtilities.GetAWSConnection(self.awsParams)
        RedshiftUtilities.LoadFileIntoRedshift(rsConnect, self.awsParams.s3,
                                               self.logger, self.fileUtilities,
                                               outPutFileName,
                                               tables["schemaName"],
                                               tables["table"],
                                               self.job["fileFormat"],
                                               self.job["dateFormat"],
                                               self.job["delimiter"])
        # Cleanup
        rsConnect.close()

    self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
    self.logger.debug(self.moduleName + " -- ProcessS3File for file: " +
                      s3Key + " finished")
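# FileUtilities.GunzipFile is not shown in these snippets.  As context, a
# minimal sketch of what such a helper typically does using only the standard
# library (the function name mirrors the call above, but this body is an
# assumption, not the actual implementation):
import gzip
import shutil

def GunzipFile(gzipFilePath, outputFilePath):
    # Stream-decompress the .gz archive to the target path without loading
    # the whole file into memory.
    with gzip.open(gzipFilePath, 'rb') as fIn, open(outputFilePath, 'wb') as fOut:
        shutil.copyfileobj(fIn, fOut)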
def DownloadFile(self, s3Key, localGzipFilepath):
    '''
    Worker function to download the file
    '''
    self.logger.info(self.moduleName + " Downloading file: " + s3Key)
    try:
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, localGzipFilepath)
    except Exception:
        # logger.exception already records the exception and traceback,
        # so a single call is enough
        self.logger.exception("Error while downloading file: {}".format(s3Key))
        raise
def DownloadAllFiles(self, srcFileParameter):
    '''
    Worker function to download the file
    '''
    s3Key = "/" + self.job["s3SrcDirectory"] + "/" + srcFileParameter["SubFolder"] + \
            "/" + srcFileParameter["s3Filename"] + self.job["srcfileFormat"]
    self.logger.info(" Downloading file: " + s3Key)
    localFilePath = self.localTempDirectory + "/" + \
        srcFileParameter["s3Filename"] + self.job["srcfileFormat"]
    S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                   self.job["bucketName"],
                                   s3Key, localFilePath)
def testDownloadFileFromS3(self):
    testFile = self.createTestingFile(
        "testDownloadFileFromS3.txt",
        "Testing DownloadFileFromS3 from S3Utilities...")
    testFileReturned = testFile.replace(".txt", "_returned.txt")
    bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
        self.awsParams.s3, testFile)
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                   s3TempKey, testFileReturned)
    self.assertTrue(os.path.isfile(testFileReturned),
                    "File could not be downloaded from the cloud.")
def DownloadFile(self, s3Key, outputLocation):
    '''
    Worker function to download the file
    '''
    self.logger.info(" Downloading file: " + s3Key)
    try:
        s3Key = "/" + s3Key
        unzippedFile = s3Key.split("/")[-1]
        localGzipFilepath = outputLocation + unzippedFile
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, localGzipFilepath)
    except Exception:
        self.logger.exception("Error while downloading file: {}".format(s3Key))
        raise
def testUploadFileToS3Temp(self):
    testFile = self.createTestingFile(
        "testUploadFileToS3Temp.txt",
        "Testing UploadFileToS3Temp from S3Utilities...")
    testFileReturned = testFile.replace(".txt", "_returned.txt")
    bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
        self.awsParams.s3, testFile)
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                   s3TempKey, testFileReturned)
    self.assertTrue(os.path.isfile(testFileReturned),
                    "File was not uploaded to, or could not be read back "
                    "from, the cloud bucket.")
def DownloadFile(self, s3Key, localGzipFilepath):
    '''
    Wrapper to download file
    '''
    self.logger.info(self.moduleName + " Downloading file: " + s3Key)
    try:
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, localGzipFilepath)
    except Exception:  # avoid a bare except so KeyboardInterrupt/SystemExit propagate
        self.logger.exception("Error while downloading file: {}".format(s3Key))
        raise
def DownloadFiles(self):
    '''
    Download the XML files
    '''
    fileList = S3Utilities.GetListOfFiles(self.awsParams.s3,
                                          self.job["bucketName"],
                                          self.job["s3SrcDirectory"][1:])
    downloadedFiles = []
    for fl in fileList:
        fileName = fl.split("/")[-1]
        s3Key = "/" + fl
        outputPath = self.localTempDirectory + "/" + fileName
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, outputPath)
        downloadedFiles.append(outputPath)
    return downloadedFiles
def DownloadFiles(self):
    '''
    Download the entire bucket of IHSMarkitData
    '''
    fileList = S3Utilities.GetListOfFiles(self.awsParams.s3,
                                          self.job["bucketName"],
                                          self.job["s3SrcDirectory"][1:])
    for fileName in fileList:
        try:
            inputFileFullPath = self.localTempDirectory + "/" + fileName.split("/")[-1]
            S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                           self.job["bucketName"],
                                           fileName, inputFileFullPath)
        except Exception:
            self.logger.exception("Download Error for file " + fileName)
            raise
def DownDataFiles(self, dbCommon):
    '''
    Download all the files and unzip them
    '''
    # Derive the bucket name and key prefix from an s3://bucket/dir URI
    s3SrcDirectory = dbCommon["s3SrcDirectory"]
    bucketName = s3SrcDirectory.replace("s3://", "").split("/")[0].strip()
    directory = s3SrcDirectory.replace("s3://" + bucketName, "")

    fileList = S3Utilities.GetListOfFiles(self.awsParams.s3, bucketName,
                                          directory[1:])
    for fileName in fileList:
        try:
            inputFileFullPath = self.localTempDirectory + "/" + fileName.split("/")[-1]
            # Download from the bucket parsed out of the URI so it matches
            # the keys returned by the listing above
            S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                           fileName, inputFileFullPath)
            # Unzip into a per-archive folder named after the file (basename
            # without extension) so archives do not overwrite each other
            unzipFolder = self.fileUtilities.gzipFolder + "/" + \
                fileName.split("/")[-1].split('.')[0] + "/"
            self.fileUtilities.UnzipFile(inputFileFullPath, unzipFolder)
        except Exception:
            self.logger.exception("Download Error for file " + fileName)
            raise
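# A more robust alternative to the string surgery above is the standard
# library's URL parser (urllib.parse on Python 3, the urlparse module on
# Python 2).  A small sketch; the helper name is illustrative, not part of
# the codebase shown here:
from urllib.parse import urlparse

def SplitS3Uri(s3Uri):
    # "s3://my-bucket/some/prefix" -> ("my-bucket", "some/prefix")
    parsed = urlparse(s3Uri)
    return parsed.netloc, parsed.path.lstrip("/")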
def testUploadFileToS3(self):
    testFileName = "testUploadFileToS3.txt"
    testFile = self.createTestingFile(
        testFileName, "Testing testUploadFileToS3 from S3Utilities...")
    testFileReturned = testFile.replace(".txt", "_returned.txt")
    bucketName = self.config["S3Utilities"]["testBucketName"]
    s3TempKey = self.config["S3Utilities"]["s3TempKeyFolder"] + "/" + testFileName
    S3Utilities.UploadFileToS3(self.awsParams.s3, testFile, bucketName, s3TempKey)
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                   s3TempKey, testFileReturned)
    self.assertTrue(os.path.isfile(testFileReturned),
                    "File was not uploaded correctly to the cloud bucket.")
def ProcessTable(self, table):
    '''
    Process data for the table
    :param table:
    :return:
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.fileUtilities.moduleName = self.moduleName
    self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
    self.fileUtilities.CreateFolders(self.job["folders"])

    fileName = ntpath.basename(s3Key)
    local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, local7zipFilePath)

    # Extract the archive into a folder named after the file (minus .zip)
    localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
    self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
    fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='true', delimiter=self.job["delimiter"],
                   ignoreTrailingWhiteSpace='true')
          .schema(schema)
          .load(fileToBeLoaded))
    # df.show()
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.logger.info(self.moduleName + " -- UploadFilesCreateAthenaTablesAndSqlScripts finished")
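# UnzipUsing7z presumably shells out to the 7z binary.  For plain .zip
# archives like the one above, the standard library can do the same job
# without an external dependency — a minimal sketch (the helper name is
# illustrative):
import zipfile

def UnzipArchive(zipFilePath, destinationFolder):
    # Extract every member of the archive into the destination folder.
    with zipfile.ZipFile(zipFilePath, 'r') as archive:
        archive.extractall(destinationFolder)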
def testDeleteFile(self):
    testFile = self.createTestingFile(
        "testDeleteFile.txt", "Testing DeleteFile from S3Utilities...")
    testFileReturned = testFile.replace(".txt", "_returned.txt")
    bucketName, s3TempKey = S3Utilities.UploadFileToS3Temp(
        self.awsParams.s3, testFile)
    S3Utilities.DeleteFile(self.awsParams.s3, bucketName, s3TempKey)
    try:
        S3Utilities.DownloadFileFromS3(self.awsParams.s3, bucketName,
                                       s3TempKey, testFileReturned)
        self.assertFalse(os.path.isfile(testFileReturned),
                         "File was not deleted from the cloud.")
    except Exception as err:
        # A 404 on the download means the delete succeeded; anything else
        # fails the test
        if err.status != 404:
            self.fail("Error registered while trying to delete a file "
                      "from the cloud. Error: " + err.message)
def GetFilesFromS3(self):
    '''
    pull down files from s3
    '''
    localFilepath = None
    try:
        self.logger.debug(self.moduleName + " -- GetFilesFromS3 starting")
        s3Key = self.job["s3SrcDirectory"] + "/" + self.job["filetoscan"]
        self.logger.info(self.moduleName + " - Processing file: " + s3Key)
        localFilepath = self.localTempDirectory + "/" + ntpath.basename(s3Key)
        S3Utilities.DownloadFileFromS3(self.awsParams.s3,
                                       self.job["bucketName"],
                                       s3Key, localFilepath)
        self.logger.debug(self.moduleName + " -- GetFilesFromS3 finished")
    except Exception:
        self.logger.exception(self.moduleName + " - error in GetFilesFromS3")
        raise
    return localFilepath
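# The S3Utilities.DownloadFileFromS3 implementation itself is not shown in
# these snippets.  As context, a minimal sketch of what such a helper commonly
# wraps, assuming a boto3-style client; the signature (dropping the awsParams
# argument) and the leading-slash handling are assumptions, not the actual
# code:
import boto3

def DownloadFileFromS3(bucketName, s3Key, localFilepath):
    # boto3 object keys never start with "/", so strip the leading-slash
    # convention several callers above use.
    s3Client = boto3.client("s3")
    s3Client.download_file(bucketName, s3Key.lstrip("/"), localFilepath)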