def ProcessTables(self, dbCommon, tables):
    '''
    Process steps: pull the file from the share, place it in the raw folder and load it.
    '''
    try:
        processingFile = self.DownloadFile(self.job["srcCategories"])
        self.CreateCSVFile(processingFile, self.job["srcCategories"])

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schemaAllString = SparkUtilities.BuildSparkSchema(tables, True)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=self.job["srcCategories"]["delimiter"])
              .option("ignoreTrailingWhiteSpace", "true")
              .option("ignoreLeadingWhiteSpace", "true")
              .schema(schemaAllString)
              .load(self.fileUtilities.csvFolder)
              )
        df = SparkUtilities.ReplaceAll(df, "\xE2\x80\x93", "")
        df2 = SparkUtilities.ConvertTypesToSchema(df, schema)
        SparkUtilities.SaveParquet(df2, self.fileUtilities)

        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise
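# A minimal, self-contained sketch of the "read everything as string, then cast to the
# real schema" pattern used above.  The helper below is hypothetical -- it only
# illustrates what a ConvertTypesToSchema-style utility might do; the actual
# SparkUtilities implementation may differ.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

def cast_to_schema_sketch(df, targetSchema):
    '''Cast each column of an all-string DataFrame to the type declared in targetSchema.'''
    for field in targetSchema.fields:
        df = df.withColumn(field.name, F.col(field.name).cast(field.dataType))
    return df

if __name__ == "__main__":
    spark = SparkSession.builder.appName("castSketch").getOrCreate()
    allStrings = spark.createDataFrame([("a", "1"), ("b", "2")], ["name", "value"])
    target = StructType([StructField("name", StringType(), True),
                         StructField("value", IntegerType(), True)])
    cast_to_schema_sketch(allStrings, target).printSchema()  # value is now an int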
def CreateParquetFilesAndLoad(self, catalog, partitionValue):
    '''
    Creates the parquet files
    '''
    try:
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        for tables in catalog["tables"]:
            if tables["type"] == "attributes":
                srcFolder = self.fileUtilities.csvFolder + '/attribute/'
            else:
                srcFolder = self.fileUtilities.csvFolder + '/data/'
            tableSchema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header=False, delimiter=self.job["delimiter"])
                  .schema(tableSchema)
                  .load(srcFolder)
                  )
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(srcFolder)
    except Exception as ex:
        self.logger.exception(self.moduleName + " - we had an error in CreateParquetFilesAndLoad: " + str(ex))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process a single category configured in the categories dictionary in the jobConfig.
    '''
    try:
        self.logger.debug(self.moduleName + " -- LoadCategory starting")
        processingFile = self.DownloadFile()
        fileOut = processingFile.replace(".dbf", ".txt")
        dbfUtils = DBFUtilities(self.logger)
        dbfUtils.ConvertToCSV(processingFile, fileOut, self.job["delimiter"], False)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileOut)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        self.logger.debug(self.moduleName + " -- LoadCategory finished")
    except Exception:
        self.logger.error(self.moduleName + " - Error while trying to load category...")
        raise
def ProcessTable(self, table):
    '''
    Process the data for the table
    '''
    s3Key = table["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)
    self.CreateFolders(table["table"])

    fileName = ntpath.basename(s3Key)
    localTxtFilepath = self.fileUtilities.csvFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, localTxtFilepath)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='false', delimiter=self.job["delimiter"])
          .schema(schema)
          .load(localTxtFilepath)
          )
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet,
                                                    table["partitionValue"])
    self.logger.info(self.moduleName + " -- ProcessTable finished")
def ProcessTables(self, dbCommon, tables):
    '''
    Load the ENP Yearly Table
    '''
    try:
        outputfileName = self.fileUtilities.csvFolder + '/ENPdata.csv'
        conn = self.EstablishConnection(dbCommon)
        cur = conn.cursor()
        sqlline = self.FixSQLStatement(dbCommon)
        cur.execute(sqlline)
        self.ConvertToCSV(cur, outputfileName)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(outputfileName)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + str(err))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process each Vantage table.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "Processing data for table: " + tables["table"])
        fileName = self.BulkExtract(tables["table"], tables["scriptFile"], dbCommon)

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        schema = SparkUtilities.BuildSparkSchema(tables)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header='false', delimiter=self.job["delimiter"])
              .schema(schema)
              .load(fileName)
              )
        self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                         " rows. Now saving as parquet file...")
        SparkUtilities.SaveParquet(df, self.fileUtilities)
        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except Exception as err:
        self.logger.error(self.moduleName + " - Error while trying to load table. Error: " + str(err))
        raise
def ProcessTables(self, dbCommon, tables):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)  # Clear the folder from the previous run
    self.ProcessFiles(tables)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    # We will compute "period_type" later
    schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header=False, delimiter=self.job['delimiter'],
                   ignoreTrailingWhiteSpace=True, ignoreLeadingWhiteSpace=True)
          .schema(schemaWithoutPeriodType)
          .load(self.fileUtilities.csvFolder)
          )
    if "filterData" in tables:
        df = df.filter(tables["filterData"])

    # Replace "NEW" with blank.  E.g. DEC1990NEW becomes DEC1990
    from pyspark.sql import functions as F  # @UnresolvedImport
    # Rename the column since we cannot edit it in place
    df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")])
    df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

    # Compute "period_type".  The following simple rules are applied:
    #   MAY2013 - 7 characters, assumed to be 'M'
    #   Q12017  - 6 characters, assumed to be 'Q'
    #   2017    - 4 characters, assumed to be 'Y'
    df = df.withColumn("period_type",
                       F.when(F.length(df.period) == 7, "M")
                        .when(F.length(df.period) == 6, "Q")
                        .when(F.length(df.period) == 4, "Y")
                        .otherwise(""))

    # Reorder the columns based on the input column order
    schema = SparkUtilities.BuildSparkSchema(tables)
    df = df.select(schema.names)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
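# A small standalone check of the period handling above: strip a trailing "NEW"
# and derive period_type from the string length (7 -> 'M', 6 -> 'Q', 4 -> 'Y').
# Runs against an in-memory DataFrame; the sample values are illustrative only.
from pyspark.sql import SparkSession, functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.appName("periodTypeSketch").getOrCreate()
    df = spark.createDataFrame([("DEC1990NEW",), ("Q12017",), ("2017",)], ["period_old"])
    df = df.withColumn("period", F.regexp_replace("period_old", "NEW", ""))
    df = df.withColumn("period_type",
                       F.when(F.length("period") == 7, "M")
                        .when(F.length("period") == 6, "Q")
                        .when(F.length("period") == 4, "Y")
                        .otherwise(""))
    df.show()  # DEC1990 -> M, Q12017 -> Q, 2017 -> Y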
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run
    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    # Remove the gz extension
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # We don't have a raw Excel reader for Spark, so use Pandas
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath, catalog["excelSheetName"],
                             index_col=None, na_values=['NaN'],
                             skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)
    SparkUtilities.SaveParquet(df, self.fileUtilities)

    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished.\n\n")
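# Hedged sketch of the pandas -> Spark hand-off used above: since there is no raw Excel
# reader for Spark here, the sheet is read with pandas and converted with an explicit
# schema.  In the job the frame comes from pd.read_excel; an in-memory frame with made-up
# column names keeps this sketch runnable.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("pandasToSparkSketch").getOrCreate()
    pandasDf = pd.DataFrame({"series": ["a", "b"], "value": [1.0, 2.0]})
    schema = StructType([StructField("series", StringType(), True),
                         StructField("value", DoubleType(), True)])
    df = spark.createDataFrame(pandasDf, schema)  # column order must match the schema
    df.printSchema()
    df.show()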
def ProcessTable(self, table):
    '''
    Process data for the table
    :param table: the table configuration to process
    :return:
    '''
    s3Key = self.job["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    self.fileUtilities.moduleName = self.moduleName
    self.fileUtilities.localBaseDirectory = self.localTempDirectory + "/" + table["table"]
    self.fileUtilities.CreateFolders(self.job["folders"])

    fileName = ntpath.basename(s3Key)
    local7zipFilePath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.DownloadFileFromS3(self.awsParams.s3, self.job["bucketName"],
                                   s3Key, local7zipFilePath)

    localCsvFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localCsvFilepath = re.sub(r'\.zip$', '', localCsvFilepath)
    self.fileUtilities.UnzipUsing7z(local7zipFilePath, localCsvFilepath)
    fileToBeLoaded = localCsvFilepath + '/' + 'emission_05-11-2017.csv'

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    schema = SparkUtilities.BuildSparkSchema(table)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header='true', delimiter=self.job["delimiter"],
                   ignoreTrailingWhiteSpace='true')
          .schema(schema)
          .load(fileToBeLoaded)
          )
    #df.show()
    self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                     " rows. Now saving as parquet file...")
    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.logger.info(self.moduleName + " -- UploadFilesCreateAthenaTablesAndSqlScripts finished")
samplejson = '''{
    "fields": [
        {"metadata": {}, "nullable": true, "name": "sourceset", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "sourcesetdesc", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "market_and_exchange_names", "type": "string"},
        {"metadata": {}, "nullable": true, "name": "as_of_date_in_form_yymmdd", "type": "integer"},
        {"metadata": {}, "nullable": true, "name": "report_date_as_yyyy_mm_dd", "type": "string"}
    ],
    "type": "struct"
}'''

#schemaJson = json.loads(samplejson)
#from pyspark.sql.types import StructType  # @UnresolvedImport
#schema = StructType.fromJson(schemaJson)
schema = SparkUtilities.BuildSparkSchema(table)
#print(schema)

localTxtFilepath = "C:/tmp/testfiles/COTHist2011.csv"
df = (sqlContext.read
      .format("csv")
      .option("header", "true")
      .option("delimiter", ",")
      .option("ignoreTrailingWhiteSpace", "true")
      .option("ignoreLeadingWhiteSpace", "true")
      .schema(schema)
      .load(localTxtFilepath))
df.printSchema()
df.show()

sc.stop()
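# For reference: the samplejson string above can be converted into a Spark schema
# directly, which is what the commented-out StructType.fromJson lines illustrate.
# This check reuses the samplejson variable defined above.
import json
from pyspark.sql.types import StructType

schemaFromJson = StructType.fromJson(json.loads(samplejson))
print(schemaFromJson.fieldNames())  # ['sourceset', 'sourcesetdesc', ...]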
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the current table to load it up
    '''
    try:
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)  # Clear the folder from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)   # Clear the folder from the previous run

        url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon["urlExt"]
        self.logger.info(self.moduleName + " - Processing url: " + url)
        localZipFilepath = self.fileUtilities.gzipFolder + "/" + catalog["name"] + "." + dbCommon["urlExt"]
        self.fileUtilities.DownloadFromURL(url, localZipFilepath)
        self.fileUtilities.UnzipFile(localZipFilepath, self.fileUtilities.csvFolder)
        localFilepath = self.fileUtilities.csvFolder + "/" + catalog["name"] + ".txt"

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        dfMaster = spark.read.json(localFilepath)
        dfMaster = dfMaster.filter(dfMaster.series_id != "")
        for table in catalog["tables"]:
            self.logger.info(self.moduleName + " -- " + "Processing table: " + table["table"])
            # The column names used in the source may be different from the ones in the final
            # database.  Select columns based on the source and then rename to the destination.
            schemaSrc = SparkUtilities.BuildSparkSchema(table, useValidation=True)
            if table["dataSet"] == "attributes":
                df = dfMaster.select(schemaSrc.names)
            elif table["dataSet"] == "data":
                # There is some instability we need to monitor.  Print seems to slow down and stabilize the run???
                print(dfMaster.rdd.take(5))
                df = dfMaster.rdd.flatMap(lambda row: EIAAthenaSpark.ProcessDataRecords(row)).toDF(schemaSrc.names)
            else:
                raise ValueError("Undefined dataSet type")

            schemaDst = SparkUtilities.BuildSparkSchema(table)
            df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
            df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
            self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                             " rows. Now saving as parquet file...")

            FileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)  # Clear the folder from the previous run
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File for: " + url + " finished.\n\n")
    except:
        self.logger.exception("we had an error in EIA on ProcessS3File")
        raise
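# A minimal flatMap -> toDF sketch of the "data" branch above.  The record layout
# (a series_id plus a list of (period, value) pairs) is assumed here for illustration
# only; the real flattening logic lives in EIAAthenaSpark.ProcessDataRecords.
from pyspark.sql import Row, SparkSession

def explode_series_sketch(row):
    # Emit one (series_id, period, value) tuple per data point in the series row.
    return [(row.series_id, period, value) for period, value in row.data]

if __name__ == "__main__":
    spark = SparkSession.builder.appName("flatMapSketch").getOrCreate()
    master = spark.createDataFrame([Row(series_id="S1", data=[("2016", 1.0), ("2017", 2.0)])])
    df = master.rdd.flatMap(explode_series_sketch).toDF(["series_id", "period", "value"])
    df.show()  # two rows: (S1, 2016, 1.0) and (S1, 2017, 2.0)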
def ProcessTables(self, dbCommon, tables):
    '''
    Process the current table to load it up
    '''
    try:
        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
        # Cleanup first (TODO - Need a more generic way to do this)
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)

        # Variables used for handling chunks.  -1 for full load
        chunkStart = chunkEnd = maxValue = chunkSize = -1
        if "incrementalconditions" in tables:
            incrementalConditions = tables["incrementalconditions"]
            if "startID" in incrementalConditions:
                chunkEnd = incrementalConditions["startID"] - 1
            else:
                athenaSchemaName = AthenaUtilities.ComposeAthenaSchemaName(tables["schemaName"])
                try:
                    # This is where we last ended.  Start at 1 + this end
                    chunkEnd = int(AthenaUtilities.GetMaxValue(self.awsParams, athenaSchemaName,
                                                               tables["table"], tables["distkey"],
                                                               self.logger))
                except ValueError:
                    chunkEnd = 0  # Table does not exist yet
                except:
                    raise

            if "endID" in incrementalConditions:
                maxValue = incrementalConditions["endID"]
            else:
                # TODO - Fix this.  Also, we should start at the source min value not 0.
                maxValue = 2000000000  # BCPUtilities.GetMaxValueSQLServer(dbCommon, tables, chunkStart)

            chunkSize = tables["incrementalconditions"]["chunksize"]
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        fieldDelimiter = self.job["delimiter"]
        if "delimiter" in tables:
            fieldDelimiter = tables["delimiter"]

        while chunkStart <= maxValue:
            partitionValue = self.GetPartitionValue(tables, chunkStart)
            sqlPullDataScript = BCPUtilities.CreatePullScript(dbCommon, tables, chunkStart, chunkEnd,
                                                              self.logger, self.fileUtilities, self.location)
            # Construct a file name that is meaningful.  That is, it has the start and end IDs
            outputCSV = self.fileUtilities.csvFolder + BCPUtilities.ComponseRangeString(chunkStart, chunkEnd) + ".csv"
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            self.bcpUtilities.BulkExtract(sqlPullDataScript, outputCSV, dbCommon, tables, fieldDelimiter,
                                          self.job["bcpUtilityDirOnLinux"], self.fileUtilities, self.logger)

            # Process the data using Spark and save as Parquet
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            schema = SparkUtilities.BuildSparkSchema(tables)
            df = (spark.read
                  .format("com.databricks.spark.csv")
                  .options(header='false', delimiter=fieldDelimiter)
                  .schema(schema)
                  .load(self.fileUtilities.csvFolder)
                  )
            df.printSchema()
            df.show()
            df = SparkUtilities.ProcessSpecialCharsIfAny(df, tables)

            self.logger.info(self.moduleName + " -- " + "DONE READING " + str(df.count()) +
                             " ROWS. Now saving as parquet file...")
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.parquet)
            SparkUtilities.SaveParquet(df, self.fileUtilities)

            # Need to load the data and clear the local space
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
            tables["new"] = "N"  # Do not recreate

            if chunkSize < 0:
                break  # Done with the single load
            chunkStart, chunkEnd = self.UpdateChunkStartEnd(chunkEnd, chunkSize, maxValue)

        # TODO - Need to make sure we don't end up with duplicate data if we run the code
        # twice on the same day
        self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)

        self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessDatabase for " + tables["table"])
        raise
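# Hypothetical sketch of the chunk-window arithmetic driving the loop above.  The real
# UpdateChunkStartEnd may differ; this only illustrates the intended contract: advance
# the window by chunkSize without running past maxValue (chunkSize < 0 means one full load).
def update_chunk_start_end_sketch(chunkEnd, chunkSize, maxValue):
    chunkStart = chunkEnd + 1
    if chunkSize < 0:
        return chunkStart, maxValue  # single full-range pass
    return chunkStart, min(chunkEnd + chunkSize, maxValue)

# Example: starting from chunkEnd=0 with chunkSize=1000 and maxValue=2500, the windows are
# (1, 1000), (1001, 2000), (2001, 2500); once chunkStart exceeds maxValue the loop ends.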
def ProcessTables(self, dbCommon, tables):
    '''
    Get the last partition value, use that as the date to pull data, then put that data into Athena.
    '''
    try:
        outputCSV = self.fileUtilities.csvFolder + self.moduleName + ".CSV"
        fieldTerminator = self.job["fieldTerminator"]
        if "fieldTerminator" in tables:
            fieldTerminator = tables["fieldTerminator"]
        rawFolder = self.localTempDirectory + "/raw/"
        rowTerminator = None  # Not using this; stick with the default of CR/LF rather than self.job["rowTerminator"]

        if "pullTemplate" in tables:
            lastRunDate = self.GetParameters(tables)
            formattedLastRunDate = lastRunDate[4:6] + '/' + lastRunDate[6:8] + '/' + lastRunDate[:4]
            sqlPullDataScript = self.CreatePullScript(tables, formattedLastRunDate)
            self.bcpUtilities.BulkExtract(self.fileUtilities.LoadSQLQuery(sqlPullDataScript),
                                          outputCSV, dbCommon, tables, fieldTerminator,
                                          rowTerminator, self.job["bcpUtilityDirOnLinux"],
                                          self.fileUtilities, self.logger)
            self.masterSchema = SparkUtilities.BuildSparkSchema(tables)
            self.fileUtilities.MoveFilesFromOneFolderToAnother(self.fileUtilities.csvFolder,
                                                               rawFolder, '*.csv')
            return

        ###
        #  Load data frame from CSV file
        ###
        partitionValue = self.GetPartitionValue()
        self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        df = (spark.read
              .format("com.databricks.spark.csv")
              .options(header=False, delimiter=fieldTerminator)
              .schema(self.masterSchema)
              .load(rawFolder)
              )

        cols = []
        for field in tables["fields"]:
            if "athenaOnly" in field:
                if field["athenaOnly"] != "Y":
                    cols.append(field["name"])
            else:
                cols.append(field["name"])

        if tables["type"] == "attributes":
            dfAttributes = df.select(cols).distinct()
            if dfAttributes.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Attribute data")
                return
            SparkUtilities.SaveParquet(dfAttributes, self.fileUtilities)
        elif tables["type"] == "series":
            dfSeries = df.select(cols)
            if "adjustFormat" in tables:
                for fld in tables["adjustFormat"]:
                    dfSeries = SparkUtilities.FormatColumn(dfSeries, fld["name"], fld["inputFormat"])
            if dfSeries.count() == 0:
                self.logger.debug(self.moduleName + " - no records to process for Series data")
                return
            SparkUtilities.SaveParquet(dfSeries, self.fileUtilities)

        self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet, partitionValue)
        if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
    except:
        self.logger.exception(self.moduleName + " - we had an error in ProcessTables")
        raise