def CreateFolders(self):
    '''
    Creates folder if it doesn't exist.
    If it already exists, empties the folder contents.
    '''
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["downloadPath"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + "/cleaned/")
    FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
    for fp in self.job["foxpro_files"]:
        FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/" + fp["Name"] + "/")
def CreateFolders(self):
    '''
    Creates the working folders, emptying their contents if they already exist.
    '''
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["raw"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["ident"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["transactions"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["contracts"])
    FileUtilities.EmptyFolderContents(self.localTempDirectory + self.job["folderPath"]["indexPub"])
def MoveToS3(self, localFolderName, folderName, subFolder):
    '''
    Move gzip files to S3 and clean up the local instance.
    localFolderName --> local folder name
    folderName --> folder name on S3
    subFolder --> date
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + localFolderName + " starting ")
        # Move any gzip files to the S3 server
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"] +\
                   "/" + folderName + '/' + subFolder
        localFolder = self.fileUtilities.gzipFolder + localFolderName
        S3Utilities.SyncFolderAWSCli(localFolder,
                                     s3folder,
                                     args='''--quiet --include "*.gz"''',
                                     dbug="Y")
        # Cleanup local files
        FileUtilities.EmptyFolderContents(localFolder)
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + localFolderName + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in MoveToS3")
        raise
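# A minimal sketch (not from this codebase) of what S3Utilities.SyncFolderAWSCli is assumed
# to wrap: an "aws s3 sync" call built from the same local folder, S3 folder and args values
# used in MoveToS3 above. The helper's real implementation is only known from its call site,
# so the subprocess form below is illustrative only.
import subprocess

def sync_folder_aws_cli_sketch(localFolder, s3Folder, args='--quiet --include "*.gz"', dbug="N"):
    command = "aws s3 sync " + localFolder + " " + s3Folder + " " + args
    if dbug == "Y":
        print(command)
    subprocess.check_call(command, shell=True)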
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process each file
    '''
    # Load the data from the S3 data lake into Redshift using Athena/Redshift Spectrum
    s3Key = dbCommon["s3SrcDirectory"] + "/" + catalog["s3Filename"]
    self.logger.info(self.moduleName + " - Processing file: " + s3Key)

    # Clear the folders from the previous run
    FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

    fileName = ntpath.basename(s3Key)
    localGzipFilepath = self.fileUtilities.gzipFolder + "/" + fileName
    S3Utilities.S3Copy(s3Key, localGzipFilepath)

    # Remove the .gz extension to get the local Excel file path
    localExcelFilepath = self.fileUtilities.csvFolder + "/" + fileName
    localExcelFilepath = re.sub(r'\.gz$', '', localExcelFilepath)
    self.fileUtilities.GunzipFile(localGzipFilepath, localExcelFilepath)

    # There is no raw Excel reader for Spark, so use Pandas
    self.logger.info(self.moduleName + " - Processing Excel file: " + localExcelFilepath)
    pandasDf = pd.read_excel(localExcelFilepath,
                             catalog["excelSheetName"],
                             index_col=None,
                             na_values=['NaN'],
                             skiprows=catalog["skipRows"])
    pandasDf = PandasUtilities.ConvertDateTimeToObject(pandasDf)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    table = catalog["tables"][0]  # There is only one table in a catalog
    schema = SparkUtilities.BuildSparkSchema(table)
    df = spark.createDataFrame(pandasDf, schema)
    df = SparkUtilities.ConvertNanToNull(df)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(table)
    self.logger.debug(self.moduleName + " -- " + "ProcessS3File for file: " + s3Key + " finished.\n\n")
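# Hypothetical sketch of the NaN handling delegated to SparkUtilities.ConvertNanToNull above:
# pandas represents missing Excel cells as float NaN, which the Parquet/Redshift load should
# see as proper NULLs. The helper's actual implementation is not shown in this section; this
# is one plausible form for float columns only.
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, FloatType

def convert_nan_to_null_sketch(df):
    for field in df.schema.fields:
        if isinstance(field.dataType, (DoubleType, FloatType)):
            df = df.withColumn(field.name,
                               F.when(F.isnan(F.col(field.name)), None)
                               .otherwise(F.col(field.name)))
    return df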
def ProcessTables(self, dbCommon, tables):
    '''
    Process each file
    '''
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " starting")
    FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)  # Clear the folder from the previous run
    self.ProcessFiles(tables)

    spark = SparkUtilities.GetCreateSparkSession(self.logger)
    # We will compute "period_type" later
    schemaWithoutPeriodType = SparkUtilities.BuildSparkSchema(tables, excludeComputed=True)
    df = (spark.read
          .format("com.databricks.spark.csv")
          .options(header=False,
                   delimiter=self.job['delimiter'],
                   ignoreTrailingWhiteSpace=True,
                   ignoreLeadingWhiteSpace=True)
          .schema(schemaWithoutPeriodType)
          .load(self.fileUtilities.csvFolder))

    if "filterData" in tables:
        df = df.filter(tables["filterData"])

    # Replace "NEW" with blank, e.g. DEC1990NEW becomes DEC1990
    from pyspark.sql import functions as F  # @UnresolvedImport
    # Rename the column since we cannot edit it in place
    df = SparkUtilities.RenameColumnsInList(df, [("period", "period_old")])
    df = df.withColumn("period", F.regexp_replace(df["period_old"], "NEW", ""))

    # Compute "period_type" using the following simple rules:
    #   MAY2013 - 7 characters, so assumed to be 'M'
    #   Q12017  - 6 characters, so assumed to be 'Q'
    #   2017    - 4 characters, so assumed to be 'Y'
    df = df.withColumn("period_type",
                       F.when(F.length(df.period) == 7, "M")
                       .when(F.length(df.period) == 6, "Q")
                       .when(F.length(df.period) == 4, "Y")
                       .otherwise(""))

    # Reorder the columns based on the input column order
    schema = SparkUtilities.BuildSparkSchema(tables)
    df = df.select(schema.names)

    SparkUtilities.SaveParquet(df, self.fileUtilities)
    self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
    self.LoadDataFromAthenaIntoRedShiftS3Scripts(tables)
    self.logger.debug(self.moduleName + " -- ProcessTables for " + tables["table"] + " finished")
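# The period/period_type rules used above, restated as a small standalone example for
# clarity (same logic, outside Spark): the trailing "NEW" marker is stripped first, then
# the string length decides monthly/quarterly/yearly.
import re

def period_type_sketch(period):
    period = re.sub("NEW", "", period)                    # "DEC1990NEW" -> "DEC1990"
    return {7: "M", 6: "Q", 4: "Y"}.get(len(period), "")  # MAY2013 -> M, Q12017 -> Q, 2017 -> Y

assert period_type_sketch("DEC1990NEW") == "M"
assert period_type_sketch("Q12017") == "Q"
assert period_type_sketch("2017") == "Y"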
def MoveToS3(self):
    '''
    Move gzip files to S3 and clean up the local instance.
    '''
    try:
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + " starting ")
        # Move any gzip files to the S3 server
        s3folder = "s3://" + self.job["bucketName"] + self.job["s3GzipFolderBase"]
        S3Utilities.SyncFolderAWSCli(self.fileUtilities.gzipFolder,
                                     s3folder,
                                     args='''--quiet --include "*.gz"''',
                                     dbug="N")
        # Cleanup local files
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
        self.logger.debug(self.moduleName + " -- " + "MoveToS3 " + " finished ")
    except:
        self.logger.exception(self.moduleName + " - we had an error in MoveToS3")
        raise
def ProcessCatalogs(self, dbCommon, catalog):
    '''
    Process the current table to load it up
    '''
    try:
        # Clear the folders from the previous run
        FileUtilities.EmptyFolderContents(self.fileUtilities.gzipFolder)
        FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)

        url = dbCommon["urlPrefix"] + catalog["name"] + "." + dbCommon["urlExt"]
        self.logger.info(self.moduleName + " - Processing url: " + url)

        localZipFilepath = self.fileUtilities.gzipFolder + "/" + \
            catalog["name"] + "." + dbCommon["urlExt"]
        self.fileUtilities.DownloadFromURL(url, localZipFilepath)
        self.fileUtilities.UnzipFile(localZipFilepath, self.fileUtilities.csvFolder)
        localFilepath = self.fileUtilities.csvFolder + "/" + catalog["name"] + ".txt"

        spark = SparkUtilities.GetCreateSparkSession(self.logger)
        dfMaster = spark.read.json(localFilepath)
        dfMaster = dfMaster.filter(dfMaster.series_id != "")
        for table in catalog["tables"]:
            self.logger.info(self.moduleName + " -- " + "Processing table: " + table["table"])
            # The column names used in the source may be different from the ones in the final
            # database.  Select columns based on the source and then rename to the destination.
            schemaSrc = SparkUtilities.BuildSparkSchema(table, useValidation=True)
            if table["dataSet"] == "attributes":
                df = dfMaster.select(schemaSrc.names)
            elif table["dataSet"] == "data":
                # There is some instability we need to monitor.  Print seems to slow down
                # and stabilize the run???
                print(dfMaster.rdd.take(5))
                df = dfMaster.rdd.flatMap(
                    lambda row: EIAAthenaSpark.ProcessDataRecords(row)).toDF(schemaSrc.names)
            else:
                raise ValueError("Undefined dataSet type")

            schemaDst = SparkUtilities.BuildSparkSchema(table)
            df = SparkUtilities.RenameColumnsToSchema(df, schemaDst)
            df = SparkUtilities.ConvertTypesToSchema(df, schemaDst)
            self.logger.info(self.moduleName + " -- " + "Done reading " + str(df.count()) +
                             " rows.  Now saving as parquet file...")

            FileUtilities.EmptyFolderContents(self.fileUtilities.sqlFolder)  # Clear the folder from the previous run
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(table, self.fileUtilities.parquet)
            self.LoadDataFromAthenaIntoRedShiftLocalScripts(table)
        self.logger.debug(self.moduleName + " -- " + "ProcessS3File for: " + url + " finished.\n\n")
    except:
        self.logger.exception("we had an error in EIA on ProcessS3File")
        raise
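# Hypothetical sketch of what EIAAthenaSpark.ProcessDataRecords is assumed to do for the
# "data" dataSet branch above: each series row from the bulk JSON carries a list of
# [period, value] pairs, and the flatMap needs one flat record per pair. The field layout of
# row.data and the output tuple order are assumptions here, not taken from this section.
def process_data_records_sketch(row):
    records = []
    for period, value in row.data:  # row.data assumed to be a list of [period, value] pairs
        records.append((row.series_id, period, value))
    return records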
def EmptyPackedFolder(self):
    '''
    Empties the packed folder
    '''
    FileUtilities.EmptyFolderContents(self.localTempDirectory + "/packed/")
def Start(self, logger, moduleName, filelocs):
    '''
    Main routine for Totem
    '''
    try:
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        self.SetUpLocalEnvironment()
        self.currProcId = self.etlUtilities.GetRunID(filelocs["tblEtl"]["table"], self.moduleName)
        lastRunRecJson = self.etlUtilities.GetLastGoodRun(filelocs["tblEtl"]["table"], self.moduleName)
        paramsList = []
        if lastRunRecJson is not None:
            paramsList = json.loads(lastRunRecJson["params"])

        # If we have run this before, get the previous parameters so that we can use them
        prevmonth = []
        currmonth = []
        lenDatesArray = len(paramsList)
        if lenDatesArray > 0:
            if "currmonth" in paramsList:
                prevmonth = paramsList["currmonth"]

        # Check and make sure we at least process the current month
        prevmonth, currmonth, tdArray = self.CleantdArray(prevmonth)
        for dte in tdArray:
            # Run through the dates and find the files associated with each date
            # if dte > '2010-05':
            #     continue
            self.logger.debug(self.moduleName + " -- " + "date processing " + dte)
            tflArray = self.GetFileList(dte)
            for fls in tflArray:
                self.GetFile(dte, fls)
            FileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            FileUtilities.EmptyFolderContents(self.localTempDirectory + '/working')
            self.MoveToS3()

        # Now load the S3 files into Redshift
        self.LoadData()
        if self.etlUtilities.SetInstanceParameters(filelocs["tblEtl"]["table"],
                                                   self.currProcId,
                                                   json.dumps({"lastrun": prevmonth,
                                                               "currmonth": currmonth})) is not True:
            self.logger.info(self.moduleName + " - we could not set the instance.")
        self.UpdateTable(filelocs["tblEtl"]["schemaName"], filelocs["tblEtl"]["table"])
        if self.job["cleanlocal"] == "Y":
            for fld in self.job["folders"]:
                self.fileUtilities.CreateLocalFolder(fld)
        self.logger.info(self.moduleName + " - Finished processing.")
    except:
        self.logger.exception(moduleName + " - Exception!")
        if self.etlUtilities.CompleteInstance(filelocs["tblEtl"]["table"],
                                              self.currProcId, 'F') is not True:
            self.logger.info(self.moduleName + " - we could not Complete Instance.")
        raise
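# Small illustration of the run-parameter round trip used in Start above: the current run
# persists its months via json.dumps through SetInstanceParameters, and the next run reads
# them back from the last good run record via json.loads before calling CleantdArray. The
# month values below are illustrative only.
import json

params = json.dumps({"lastrun": ["2018-01"], "currmonth": ["2018-02"]})
paramsList = json.loads(params)
if "currmonth" in paramsList:
    prevmonth = paramsList["currmonth"]   # becomes ["2018-02"] on the next run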
class ConsensusAthenaSpark(ApplicationBase):
    '''
    This class is used to control the data load process from different OPEC file sources.
    '''
    def __init__(self):
        '''
        Initial settings
        '''
        super(ConsensusAthenaSpark, self).__init__()
        self.rawFolder = None
        self.fileUtilities = FileUtilities(self.logger)
        self.location = FileUtilities.PathToForwardSlash(
            os.path.dirname(os.path.abspath(__file__)))

    def BulkDownload(self):
        '''
        Download all files.
        '''
        sharedFiles = self.fileUtilities.ScanFolder(self.job["srcSharedFolder"])
        self.logger.info(self.moduleName + " - Downloading files from shared folder...")
        for fileName in sharedFiles:
            if (fileName[:2] == self.job["fileNamePrefix"]) and \
                    os.path.splitext(fileName)[1] in self.job["validExts"]:
                shutil.copyfile(os.path.join(self.job["srcSharedFolder"], fileName),
                                self.rawFolder + "/" + fileName)

    def DfCleanUp(self, df, surveyDateVal):
        '''
        Converts the actual Excel file into csv for the worksheet configured.
        '''
        bankNameColumnIn = "Unnamed: 0"
        surveyDateColName = "surveyDate"
        for colName in self.job["columnsToDrop"]:
            df = df.drop(colName, 1)
        df = df.drop(self.job["dropAfterHeader"], 0)
        # Keep only the bank name column and the columns whose headers are dates
        for colName in df.head(0):
            dtTest = colName
            if not isinstance(dtTest, datetime) and colName != bankNameColumnIn:
                df = df.drop(colName, 1)
        df = df.assign(surveyDate=surveyDateVal)
        # Move the survey date column to the front
        newOrder = [surveyDateColName]
        for colName in df.head(0):
            if colName != surveyDateColName:
                newOrder.append(colName)
        df = df[newOrder]
        df = df.melt(id_vars=[surveyDateColName, bankNameColumnIn])
        return df

    def GetData(self, rawFileName, mode=None):
        '''
        Returns the data frame or survey date
        '''
        if mode == "getSurveyDate":
            skipRows = 0
        else:
            skipRows = self.job["skipRows"]
        df = pandas.read_excel(rawFileName,
                               sheetname=self.job["worksheetName"],
                               index_col=None,
                               na_values=["na"],
                               skiprows=skipRows,
                               skip_footer=self.job["skipFooter"])
        if mode == "getSurveyDate":
            valReturn = df.iloc[self.job["surveyDateRow"] - 2][0]
        else:
            valReturn = df
        return valReturn

    @staticmethod
    def FormatSurveyDate(emFile):
        '''
        Returns the date based on the file's name
        '''
        surveyDateColValue = os.path.splitext(emFile)[0]
        surveyDateColValue = surveyDateColValue[2:len(surveyDateColValue)]
        surveyDateColValue = surveyDateColValue.replace("CF", "")
        surveyDateColValue = str(surveyDateColValue[3:]) + "-" + str(
            list(calendar.month_abbr).index(surveyDateColValue[:3])) + "-01"
        return surveyDateColValue

    def ProcessFiles(self):
        '''
        Controls the workflow for the conversion, clean-up and packing of the input files.
        '''
        filesToProcess = self.fileUtilities.ScanFolder(self.rawFolder)
        for emFile in filesToProcess:
            self.logger.info(self.moduleName + " - Processing file: " + emFile)
            rawFileName = self.rawFolder + "/" + emFile
            csvFilename = self.fileUtilities.csvFolder + os.path.splitext(emFile)[0] + ".csv"
            try:
                surveyDatedt = self.GetData(rawFileName, "getSurveyDate")
                if isinstance(surveyDatedt, float):
                    surveyDatedt = self.FormatSurveyDate(emFile)
                elif isinstance(surveyDatedt, basestring):
                    if "," in surveyDatedt:
                        tmpDatedt = datetime.strptime(surveyDatedt, '%B %d, %Y')
                        surveyDatedt = datetime.strftime(tmpDatedt, "%Y-%m-%d")
                df = self.GetData(rawFileName)
                df = self.DfCleanUp(df, surveyDatedt)
                df.to_csv(csvFilename,
                          header=False,
                          sep=str(self.job["delimiter"]),
                          encoding='utf-8',
                          index=False)
            except XLRDError:
                self.logger.info(self.moduleName + " - No tab named '" +
                                 self.job["worksheetName"] + "' in " + emFile)
            except Exception:
                self.logger.error(self.moduleName + " - Error while trying to process " + emFile)
                raise
            finally:
                FileUtilities.RemoveFileIfItExists(rawFileName)

    def ProcessTables(self, dbCommon, tables):
        '''
        Process steps: pull files from the share and place them in the raw folder
        '''
        try:
            self.rawFolder = self.localTempDirectory + "/" + "Raw"
            self.BulkDownload()
            self.ProcessFiles()
            spark = SparkUtilities.GetCreateSparkSession(self.logger)
            df = SparkUtilities.ReadCSVFile(spark, tables, self.job["delimiter"], False,
                                            self.fileUtilities.csvFolder, self.logger)
            SparkUtilities.SaveParquet(df, self.fileUtilities)
            self.UploadFilesCreateAthenaTablesAndSqlScripts(tables, self.fileUtilities.parquet)
            self.fileUtilities.EmptyFolderContents(self.fileUtilities.csvFolder)
            if "loadToRedshift" in tables and tables["loadToRedshift"] == "Y":
                self.LoadDataFromAthenaIntoRedShiftLocalScripts(tables)
        except:
            self.logger.exception(self.moduleName + " - we had an error in ProcessRequest")
            raise

    def Start(self, logger, moduleName, filelocs):
        '''
        Start of routine
        '''
        ApplicationBase.Start(self, logger, moduleName, filelocs)
        # At some point this will be part of Start
        ApplicationBase.ProcessInput(self, logger, moduleName, filelocs)
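# Worked example of the FormatSurveyDate logic above, restated standalone: drop the
# two-character file prefix, remove the optional "CF" marker, then build "YYYY-M-01" from
# the trailing year and the leading month abbreviation. The file name below is illustrative
# only; real input names are defined by the job's fileNamePrefix configuration.
import calendar
import os

name = os.path.splitext("EMJan2017CF.xlsx")[0]        # "EMJan2017CF"
name = name[2:].replace("CF", "")                     # "Jan2017"
surveyDate = name[3:] + "-" + str(list(calendar.month_abbr).index(name[:3])) + "-01"
assert surveyDate == "2017-1-01"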