def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the source file from the share, place it in the raw folder,
    convert it to parquet and register/load the resulting table
    '''
    try:
        # Download the source file and convert it to CSV in the working csv folder
        processingFile = self.DownloadFile(self.job["srcCategories"])
        self.CreateCSVFile(processingFile, self.job["srcCategories"])

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        # Read everything as strings first, then convert to the typed schema
        schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
              .option("ignoreTrailingWhiteSpace", "true")
              .option("ignoreLeadingWhiteSpace", "true")
              .schema(schemaAllString)
              .load(self.fileUtilities.csvFolder)
              )
        # Strip stray en-dash byte sequences before applying the typed schema
        df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
        df2 = SparkUtilities.ConvertTypesToSchema(df, schema)
        SparkUtilities.SaveParquet(df2, self.fileUtilities)

        # Publish the parquet output to S3/Athena and optionally load into Redshift
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
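# Illustrative sketch only: the shape below is inferred from how ProcessTables uses
# self.job["srcCategories"] and the tables dictionary; it is not the actual project config.
# Only "delimiter" and "loadToRedshift" are referenced directly here; any other keys
# (file locations, column definitions consumed by DownloadFile, CreateCSVFile and
# BuildSparkSchema) are assumptions and will differ in the real job definition.
#
#   job = {
#       "srcCategories": {
#           "delimiter": "|",          # used when reading the CSV
#           # ... keys consumed by DownloadFile/CreateCSVFile go here
#       }
#   }
#   tables = {
#       "loadToRedshift": "Y",         # "Y" triggers the Redshift load step
#       # ... column/field definitions consumed by BuildSparkSchema go here
#   }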
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the current catalog: download and unzip the source file, then load each of its tables
    '''
    try:
        # Clear the working folders from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        # Download and unzip the catalog file
        url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon["urlExt"]
        self.logger.info(self.moduleName + " - Processing url: " + url)
        localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
            catalog["name"] + "." + dbCommon["urlExt"]
        self.fileUtilities.DownloadFromURL(url, localZipFilepath)
        self.fileUtilities.UnzipFile(localZipFilepath, self.fileUtilities.csvFolder)

        # Read the unzipped JSON file and drop records without a series_id
        localFilepath = self.fileUtilities.csvFolder + "/" + catalog["name"] + ".txt"
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        dfMaster = spark.read.json(localFilepath)
        dfMaster = dfMaster.filter(dfMaster.series_id != "")

        for table in catalog["tables"]:
            self.logger.info(self.moduleName + " -- " + "Processing table: " + table["table"])
            # The column names used in the source may differ from the ones in the final
            # database. Select columns based on the source names, then rename to the destination schema.
            schemaSrc = SparkUtilities.BuildSparkSchema(table, useValidation=True)
            if table["dataSet"] == "attributes":
                df = dfMaster.select(schemaSrc.names)
            elif table["dataSet"] == "data":
                # There is some instability we need to monitor. Print seems to slow down and stabilize the run???
                print(dfMaster.rdd.take(5))
                df = dfMaster.rdd.flatMap(
                    lambda row: EIAAthenaSpark.ProcessDataRecords(row)).toDF(schemaSrc.names)
            else:
                raise ValueError("Undefined dataSet type")

            schemaDst = SparkUtilities.BuildSparkSchema(table)
            df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
            df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
            self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                             " rows. Now saving as parquet file...")

            # Clear the SQL folder from the previous run, then save and publish this table
            FileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)

        self.logger.debug(self.moduleName + " -- " + "ProcessCatalogs for: " + url + " finished.\n\n")
    except:
        self.logger.exception("we had an error in EIA on ProcessCatalogs")
        raise
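# Illustrative sketch only: the shape below is inferred from how ProcessCatalogs reads
# dbCommon and catalog; it is not the actual project configuration. Only the keys
# referenced above (urlPrefix, urlExt, name, tables, table, dataSet) are grounded in the
# code; everything else consumed by BuildSparkSchema is an assumption.
#
#   dbCommon = {
#       "urlPrefix": "https://example.invalid/bulk/",  # hypothetical value
#       "urlExt": "zip"
#   }
#   catalog = {
#       "name": "ELEC",                                # hypothetical catalog name
#       "tables": [
#           {"table": "elec_attributes", "dataSet": "attributes"},  # selected directly
#           {"table": "elec_data", "dataSet": "data"}               # flattened via ProcessDataRecords
#       ]
#   }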